NLP Lab Manual

Practical: 02
Date of Performance:
Date of Completion:
Title: Implement a program to count the number of lines and words in a file.

import nltk

nltk.download('punkt')  # Download the Punkt tokenizer models for word tokenization

def count_lines(file_path):
    """
    Count the number of lines in the given text file.
    """
    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()
        return len(lines)
    except FileNotFoundError:
        print("File not found. Please check the file path.")
        return 0

def count_words(file_path):
    """
    Count the number of words in the given text file.
    Uses NLTK's word_tokenize method for accurate word tokenization.
    """
    try:
        with open(file_path, 'r') as file:
            text = file.read()
        words = nltk.word_tokenize(text)  # Tokenize the text into words
        return len(words)
    except FileNotFoundError:
        print("File not found. Please check the file path.")
        return 0

def main():
    file_path = input("Enter the path of the text file: ")

    # Counting lines
    line_count = count_lines(file_path)
    print(f"Number of lines in the file: {line_count}")

    # Counting words
    word_count = count_words(file_path)
    print(f"Number of words in the file: {word_count}")

if __name__ == "__main__":
    main()
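
Note: on newer NLTK releases the Punkt data used by word_tokenize is shipped as 'punkt_tab' rather than 'punkt'. If the download above still leaves word_tokenize raising a LookupError on your installation, a hedged fallback is to request both resources:

import nltk

# Assumption: NLTK >= 3.8.2 may look up 'punkt_tab'; older versions only know 'punkt'.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)  # quietly skips resources your version lacks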
OUTPUT

Save the following as nlp2.txt:
Hello, how are you?
I am fine.
Thanks for asking.

How to Use:
Save your text file (for example, sample.txt).
Run the Python script.
When prompted, enter the file path (e.g., sample.txt).

Enter the path of the text file: nlp2.txt
Number of lines in the file: 3
Number of words in the file: 14
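
For comparison, the same file can be measured without NLTK by splitting on whitespace; punctuation then stays attached to words, so the word count drops from 14 tokens to 10 words. A minimal sketch, assuming the same nlp2.txt:

def count_lines_and_words(file_path):
    # Whitespace splitting keeps punctuation attached ("Hello," is one word),
    # unlike nltk.word_tokenize, which splits it off as a separate token.
    with open(file_path, 'r') as file:
        lines = file.readlines()
    word_total = sum(len(line.split()) for line in lines)
    return len(lines), word_total

print(count_lines_and_words('nlp2.txt'))  # (3, 10) for the sample file above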
Practical: 03
Date of Performance:
Date of Completion:
Title: Implement a program to count the number of articles (a, an, the) in a file.
import nltk
from nltk.tokenize import word_tokenize

# Ensure you have the necessary NLTK data
nltk.download('punkt')

def count_articles_in_file(file_path):
    # Define the list of articles we are looking for
    articles = ['a', 'an', 'the']

    # Initialize a counter for each article
    article_count = {'a': 0, 'an': 0, 'the': 0}

    try:
        # Open the file in read mode
        with open(file_path, 'r', encoding='utf-8') as file:
            # Process the file line by line
            for line in file:
                # Tokenize the line into words; lowercase to ensure case insensitivity
                words = word_tokenize(line.lower())

                # Count the articles in the line
                for word in words:
                    if word in articles:
                        article_count[word] += 1

        # Return the count of each article
        return article_count
    except FileNotFoundError:
        return f"Error: The file at {file_path} was not found."

# Example usage
file_path = 'nlp3.txt'  # Specify the path to your text file
article_counts = count_articles_in_file(file_path)

if isinstance(article_counts, dict):
    print(f"Count of 'a': {article_counts['a']}")
    print(f"Count of 'an': {article_counts['an']}")
    print(f"Count of 'the': {article_counts['the']}")
else:
    print(article_counts)  # In case of file not found or error
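
As a cross-check, the same counting can be written more compactly with collections.Counter. This is a sketch of an alternative, not part of the original practical:

from collections import Counter
from nltk.tokenize import word_tokenize

def count_articles(text):
    # Tally every token, then keep only the three articles
    counts = Counter(word_tokenize(text.lower()))
    return {article: counts[article] for article in ('a', 'an', 'the')}

print(count_articles("An apple a day keeps the doctor away."))
# {'a': 1, 'an': 1, 'the': 1}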
OUTPUT

Save the following as nlp3.txt:
The quick brown fox jumps over a lazy dog.
An apple a day keeps the doctor away.
The sun is shining brightly.

Count of 'a': 2
Count of 'an': 1
Count of 'the': 3
Practical: 04
Date of Performance:
Date of Completion:
Title: Implement a program to perform tokenization and stopword filtering on a file.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK resources if not already downloaded
nltk.download('punkt')      # For tokenization
nltk.download('stopwords')  # For stopwords

def tokenize_and_filter(text):
    # Step 1: Tokenize the text into words
    tokens = word_tokenize(text)

    # Step 2: Convert tokens to lowercase (optional but recommended)
    tokens = [token.lower() for token in tokens]

    # Step 3: Load the set of stopwords
    stop_words = set(stopwords.words('english'))

    # Step 4: Filter out stopwords from tokens
    filtered_tokens = [token for token in tokens if token not in stop_words]

    return filtered_tokens

def read_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

def main():
    # Path to the text file (replace with your file path)
    file_path = 'nlp2.txt'

    # Step 1: Read content from file
    text = read_file(file_path)

    # Step 2: Tokenize and filter stopwords
    filtered_tokens = tokenize_and_filter(text)

    # Step 3: Print the result
    print("Filtered Tokens:", filtered_tokens)

if __name__ == "__main__":
    main()
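
To see what is being removed, the stopword list itself can be inspected (the exact size and ordering depend on the NLTK data you have installed):

from nltk.corpus import stopwords

stop_words = stopwords.words('english')
print(len(stop_words))  # around 179 entries in recent NLTK data
print(stop_words[:8])   # e.g. ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves']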
OUTPUT

Save the following as nlp2.txt:
Hello, how are you?
I am fine.
Thanks for asking.

Filtered Tokens: ['hello', ',', '?', 'fine', '.', 'thanks', 'asking', '.']
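
Note that the punctuation tokens (',', '?', '.') survive because they are not in the stopword list. If you also want to drop them, one hedged extension keeps only alphabetic tokens:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def tokenize_and_filter_alpha(text):
    stop_words = set(stopwords.words('english'))
    tokens = [token.lower() for token in word_tokenize(text)]
    # str.isalpha() drops punctuation tokens such as ',' and '?'
    return [t for t in tokens if t.isalpha() and t not in stop_words]

print(tokenize_and_filter_alpha("Hello, how are you? I am fine. Thanks for asking."))
# ['hello', 'fine', 'thanks', 'asking']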
Practical: 05
Date of Performance:
Date of Completion:
Title: Implement a program which makes use of regular expression basics like /a*/, /a+/, /a?/, /[^A-Z]/, /[^Ss]/, etc.
import re

import nltk
from nltk.tokenize import word_tokenize

# Ensure you have the necessary NLTK data
nltk.download('punkt')

def regex_patterns_in_text(file_path):
    try:
        # Open the file and read its content
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().lower()  # Convert to lowercase for case insensitivity

        # Tokenize the text using NLTK
        tokens = word_tokenize(text)

        # Define regular expressions for various patterns
        patterns = {
            'a*': r'a*',                    # Matches zero or more 'a's
            'a+': r'a+',                    # Matches one or more 'a's
            'a?': r'a?',                    # Matches zero or one 'a'
            'uppercase_letters': r'[A-Z]',  # Matches any uppercase letter
            's_or_S': r'[Ss]',              # Matches 'S' or 's'
        }

        # Match patterns in the tokenized text and store the results
        results = {key: [] for key in patterns}  # Initialize results for each pattern

        # Search for each pattern in the tokens
        for word in tokens:
            for pattern_name, pattern in patterns.items():
                if re.search(pattern, word):  # If a match is found
                    results[pattern_name].append(word)

        # Return the results for each regex pattern
        return results
    except FileNotFoundError:
        return f"Error: The file at {file_path} was not found."

# Example usage
file_path = 'nlp5.txt'  # Specify the path to your text file
regex_results = regex_patterns_in_text(file_path)

if isinstance(regex_results, dict):
    for pattern_name, matched_words in regex_results.items():
        print(f"Words matching '{pattern_name}': {matched_words}")
else:
    print(regex_results)  # In case of file not found or error
OUTPUT

Save the following as nlp5.txt:
An example sentence. Super Simple sentences, like this one, have many S words. Always, do your best.

Words matching 'a*': ['an', 'example', 'sentence', '.', 'super', 'simple', 'sentences', ',', 'like', 'this', 'one', ',', 'have', 'many', 's', 'words', '.', 'always', ',', 'do', 'your', 'best', '.']
Words matching 'a+': ['an', 'example', 'have', 'many', 'always']
Words matching 'a?': ['an', 'example', 'sentence', '.', 'super', 'simple', 'sentences', ',', 'like', 'this', 'one', ',', 'have', 'many', 's', 'words', '.', 'always', ',', 'do', 'your', 'best', '.']
Words matching 'uppercase_letters': []
Words matching 's_or_S': ['sentence', 'super', 'simple', 'sentences', 'this', 's', 'words', 'always', 'best']
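
Notice that the 'a*' and 'a?' lists contain every token: re.search succeeds on any string with these patterns because both can match the empty string. To require that at least one character actually be consumed, one hedged refinement (words_matching is a hypothetical helper, not part of the practical) checks that the match is non-empty:

import re

def words_matching(pattern, words):
    # Keep a word only if the pattern matches a non-empty substring
    return [w for w in words if (m := re.search(pattern, w)) and m.group()]

tokens = ['an', 'do', 'best', 'always']
print(words_matching(r'a*', tokens))  # ['an', 'always'], only words containing 'a'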