NLP Lab Manual

Practical: 02
Date of Performance:
Date of Completion:
Title: Implement a program to count the number of lines and words in a file.

import nltk

nltk.download('punkt')  # Download the Punkt tokenizer models for word tokenization

def count_lines(file_path):
    """
    Count the number of lines in the given text file.
    """
    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()
        return len(lines)
    except FileNotFoundError:
        print("File not found. Please check the file path.")
        return 0

def count_words(file_path):
    """
    Count the number of words in the given text file.
    Uses NLTK's word_tokenize method for accurate word tokenization.
    """
    try:
        with open(file_path, 'r') as file:
            text = file.read()
        words = nltk.word_tokenize(text)  # Tokenize the text into words
        return len(words)
    except FileNotFoundError:
        print("File not found. Please check the file path.")
        return 0

def main():
    file_path = input("Enter the path of the text file: ")

    # Counting lines
    line_count = count_lines(file_path)
    print(f"Number of lines in the file: {line_count}")

    # Counting words
    word_count = count_words(file_path)
    print(f"Number of words in the file: {word_count}")

if __name__ == "__main__":
    main()
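
Note: on newer NLTK releases the Punkt data used by word_tokenize is shipped as 'punkt_tab' rather than 'punkt'. If the download above still leaves word_tokenize raising a LookupError on your installation, a hedged fallback is to request both resources:

import nltk

# Assumption: NLTK >= 3.8.2 may look up 'punkt_tab'; older versions only know 'punkt'.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)  # quietly skips resources your version lacks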
OUTPUT

Save the following as nlp2.txt:
Hello, how are you?
I am fine.
Thanks for asking.

How to Use:
Save your text file (for example, sample.txt).
Run the Python script.
When prompted, enter the file path (e.g., sample.txt).

Enter the path of the text file: nlp2.txt
Number of lines in the file: 3
Number of words in the file: 14
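
For comparison, the same file can be measured without NLTK by splitting on whitespace; punctuation then stays attached to words, so the word count drops from 14 tokens to 10 words. A minimal sketch, assuming the same nlp2.txt:

def count_lines_and_words(file_path):
    # Whitespace splitting keeps punctuation attached ("Hello," is one word),
    # unlike nltk.word_tokenize, which splits it off as a separate token.
    with open(file_path, 'r') as file:
        lines = file.readlines()
    word_total = sum(len(line.split()) for line in lines)
    return len(lines), word_total

print(count_lines_and_words('nlp2.txt'))  # (3, 10) for the sample file above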
Practical: 03
Date of Performance:
Date of Completion:
Title: Implement a program to count the number of articles (a, an, the) in a file.
import nltk
from nltk.tokenize import word_tokenize

# Ensure you have the necessary NLTK data
nltk.download('punkt')

def count_articles_in_file(file_path):
    # Define the list of articles we are looking for
    articles = ['a', 'an', 'the']

    # Initialize a counter for each article
    article_count = {'a': 0, 'an': 0, 'the': 0}

    try:
        # Open the file in read mode
        with open(file_path, 'r', encoding='utf-8') as file:
            # Process the file line by line
            for line in file:
                # Tokenize the line into words; lowercase to ensure case insensitivity
                words = word_tokenize(line.lower())

                # Count the articles in the line
                for word in words:
                    if word in articles:
                        article_count[word] += 1

        # Return the count of each article
        return article_count
    except FileNotFoundError:
        return f"Error: The file at {file_path} was not found."

# Example usage
file_path = 'nlp3.txt'  # Specify the path to your text file
article_counts = count_articles_in_file(file_path)

if isinstance(article_counts, dict):
    print(f"Count of 'a': {article_counts['a']}")
    print(f"Count of 'an': {article_counts['an']}")
    print(f"Count of 'the': {article_counts['the']}")
else:
    print(article_counts)  # In case of file not found or error
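
As a cross-check, the same counting can be written more compactly with collections.Counter. This is a sketch of an alternative, not part of the original practical:

from collections import Counter
from nltk.tokenize import word_tokenize

def count_articles(text):
    # Tally every token, then keep only the three articles
    counts = Counter(word_tokenize(text.lower()))
    return {article: counts[article] for article in ('a', 'an', 'the')}

print(count_articles("An apple a day keeps the doctor away."))
# {'a': 1, 'an': 1, 'the': 1}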
OUTPUT

Save the following as nlp3.txt:
The quick brown fox jumps over a lazy dog.
An apple a day keeps the doctor away.
The sun is shining brightly.

Count of 'a': 2
Count of 'an': 1
Count of 'the': 3
Practical: 04
Date of Performance:
Date of Completion:
Title: Implement a program to perform tokenization and stopword filtering on a file.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK resources if not already downloaded
nltk.download('punkt')      # For tokenization
nltk.download('stopwords')  # For stopwords

def tokenize_and_filter(text):
    # Step 1: Tokenize the text into words
    tokens = word_tokenize(text)

    # Step 2: Convert tokens to lowercase (optional but recommended)
    tokens = [token.lower() for token in tokens]

    # Step 3: Load the set of stopwords
    stop_words = set(stopwords.words('english'))

    # Step 4: Filter out stopwords from tokens
    filtered_tokens = [token for token in tokens if token not in stop_words]

    return filtered_tokens

def read_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

def main():
    # Path to the text file (replace with your file path)
    file_path = 'nlp2.txt'

    # Step 1: Read content from file
    text = read_file(file_path)

    # Step 2: Tokenize and filter stopwords
    filtered_tokens = tokenize_and_filter(text)

    # Step 3: Print the result
    print("Filtered Tokens:", filtered_tokens)

if __name__ == "__main__":
    main()
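
To see what is being removed, the stopword list itself can be inspected (the exact size and ordering depend on the NLTK data you have installed):

from nltk.corpus import stopwords

stop_words = stopwords.words('english')
print(len(stop_words))  # around 179 entries in recent NLTK data
print(stop_words[:8])   # e.g. ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves']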
OUTPUT

Save the following as nlp2.txt:
Hello, how are you?
I am fine.
Thanks for asking.

Filtered Tokens: ['hello', ',', '?', 'fine', '.', 'thanks', 'asking', '.']
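
Note that the punctuation tokens (',', '?', '.') survive because they are not in the stopword list. If you also want to drop them, one hedged extension keeps only alphabetic tokens:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def tokenize_and_filter_alpha(text):
    stop_words = set(stopwords.words('english'))
    tokens = [token.lower() for token in word_tokenize(text)]
    # str.isalpha() drops punctuation tokens such as ',' and '?'
    return [t for t in tokens if t.isalpha() and t not in stop_words]

print(tokenize_and_filter_alpha("Hello, how are you? I am fine. Thanks for asking."))
# ['hello', 'fine', 'thanks', 'asking']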
Practical: 05
Date of Performance:
Date of Completion:
Title: Implement a program which makes use of regular expression basics like /a*/, /a+/, /a?/, /[^A-Z]/, /[^Ss]/, etc.
import re

import nltk
from nltk.tokenize import word_tokenize

# Ensure you have the necessary NLTK data
nltk.download('punkt')

def regex_patterns_in_text(file_path):
    try:
        # Open the file and read its content
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().lower()  # Convert to lowercase for case insensitivity

        # Tokenize the text using NLTK
        tokens = word_tokenize(text)

        # Define regular expressions for various patterns
        patterns = {
            'a*': r'a*',                    # Matches zero or more 'a's
            'a+': r'a+',                    # Matches one or more 'a's
            'a?': r'a?',                    # Matches zero or one 'a'
            'uppercase_letters': r'[A-Z]',  # Matches any uppercase letter
            's_or_S': r'[Ss]',              # Matches 'S' or 's'
        }

        # Match patterns in the tokenized text and store the results
        results = {key: [] for key in patterns}  # Initialize results for each pattern

        # Search for each pattern in the tokens
        for word in tokens:
            for pattern_name, pattern in patterns.items():
                if re.search(pattern, word):  # If a match is found
                    results[pattern_name].append(word)

        # Return the results for each regex pattern
        return results
    except FileNotFoundError:
        return f"Error: The file at {file_path} was not found."

# Example usage
file_path = 'nlp5.txt'  # Specify the path to your text file
regex_results = regex_patterns_in_text(file_path)

if isinstance(regex_results, dict):
    for pattern_name, matched_words in regex_results.items():
        print(f"Words matching '{pattern_name}': {matched_words}")
else:
    print(regex_results)  # In case of file not found or error
OUTPUT

Save the following as nlp5.txt:
An example sentence. Super Simple sentences, like this one, have many S words. Always, do your best.

Words matching 'a*': ['an', 'example', 'sentence', '.', 'super', 'simple', 'sentences', ',', 'like', 'this', 'one', ',', 'have', 'many', 's', 'words', '.', 'always', ',', 'do', 'your', 'best', '.']
Words matching 'a+': ['an', 'example', 'have', 'many', 'always']
Words matching 'a?': ['an', 'example', 'sentence', '.', 'super', 'simple', 'sentences', ',', 'like', 'this', 'one', ',', 'have', 'many', 's', 'words', '.', 'always', ',', 'do', 'your', 'best', '.']
Words matching 'uppercase_letters': []
Words matching 's_or_S': ['sentence', 'super', 'simple', 'sentences', 'this', 's', 'words', 'always', 'best']
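
Notice that the 'a*' and 'a?' lists contain every token: re.search succeeds on any string with these patterns because both can match the empty string. To require that at least one character actually be consumed, one hedged refinement (words_matching is a hypothetical helper, not part of the practical) checks that the match is non-empty:

import re

def words_matching(pattern, words):
    # Keep a word only if the pattern matches a non-empty substring
    return [w for w in words if (m := re.search(pattern, w)) and m.group()]

tokens = ['an', 'do', 'best', 'always']
print(words_matching(r'a*', tokens))  # ['an', 'always'], only words containing 'a'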