ChatGS / location.py
sharmamohit8624's picture
Upload 2395 files
829f2ca verified
import string
import nltk
import json
import re
from nltk import pos_tag, word_tokenize
import pandas as pd
from nltk.corpus import stopwords
from difflib import SequenceMatcher
# from googletrans import Translator
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# translator = Translator()
# Module-level regular expressions, compiled once at import time and reused by
# LocationExtractor.extract_nouns for every token it inspects.

# Time-unit words ("year", "week", "month", "day").
date_pattern = re.compile(r'\b(year|week|month|day)\b', flags=re.IGNORECASE)

# Relative-time qualifiers ("last", "previous").
preposition_pattern = re.compile(r'\b(last|previous)\b', flags=re.IGNORECASE)

# Alternation of every word in NLTK's English stop-word list. Building it calls
# stopwords.words("english") at import time, so the NLTK stop-word data must
# already be available when this module is imported.
stopword_pattern = re.compile(
    rf'\b({"|".join(stopwords.words("english"))})\b',
    flags=re.IGNORECASE,
)
class LocationExtractor:
    """Resolve a free-text query to a location name listed in a CSV file.

    The CSV must contain a ``ROI_Name`` column. Its lower-cased values are
    kept in ``self.column_to_check`` and are the names queries are matched
    against.
    """

    # Part-of-speech tags whose tokens are dropped by ``extract_nouns``.
    _EXCLUDED_POS_TAGS = frozenset({
        'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT', 'POS', 'PRP', 'PRP$',
        'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
        'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB',
    })
    # Tokens containing any of these fragments (case-insensitive) are dropped.
    _EXCLUDED_SUBSTRINGS = ('spread', 'water', 'level')
    # Returned when nothing in the name column is similar enough to the query.
    _DEFAULT_LOCATION = "himayatsagar"
    # Minimum SequenceMatcher ratio for a fuzzy match to be accepted.
    _MIN_SIMILARITY = 0.5

    def __init__(self, csv_file_path='ISROP.csv'):
        """Load the CSV at *csv_file_path* and cache its lower-cased names.

        Args:
            csv_file_path: Path to a CSV file with a ``ROI_Name`` column.
        """
        self.df = pd.read_csv(csv_file_path)
        self.column_to_check = self.df['ROI_Name'].str.lower()

    def extract_nouns(self, text):
        """Return *text* reduced to the tokens that may name a location.

        Despite the name, no noun tag is required: a token is kept unless its
        part-of-speech tag is in the exclusion set, it starts with a stop
        word, a relative-time qualifier or a time unit, or it contains
        "spread", "water" or "level". The surviving tokens are joined with
        single spaces and all ASCII punctuation is then removed.

        Args:
            text: The query to filter.

        Returns:
            The filtered text. Removing a punctuation-only token leaves its
            surrounding spaces behind, so the result may contain repeated or
            trailing spaces.
        """
        kept = []
        for word, pos in pos_tag(word_tokenize(text)):
            if pos in self._EXCLUDED_POS_TAGS:
                continue
            # ``match`` anchors at the start of the token, so these patterns
            # reject tokens that *begin* with one of their words.
            if (stopword_pattern.match(word)
                    or preposition_pattern.match(word)
                    or date_pattern.match(word)):
                continue
            lowered = word.lower()
            if any(fragment in lowered for fragment in self._EXCLUDED_SUBSTRINGS):
                continue
            kept.append(word)

        filtered_text = ' '.join(kept)
        return filtered_text.translate(str.maketrans('', '', string.punctuation))

    def extract_entities(self, input_text=""):
        """Find the known location that best matches *input_text*.

        Matching is attempted in order:

        1. the whole filtered phrase equals a known name;
        2. a single word of the phrase equals a known name;
        3. the known name with the highest ``SequenceMatcher`` ratio against
           the phrase, provided the ratio is at least 0.5.

        Progress is printed to standard output at each step.

        Args:
            input_text: The query to resolve.

        Returns:
            A ``(name, kind)`` tuple where *kind* is ``"exact"``,
            ``"closest"``, or ``""`` when the default location
            ``"himayatsagar"`` is returned because nothing matched.
        """
        # Collapse runs of whitespace as well as trimming the ends:
        # extract_nouns removes punctuation after joining, so "a , b" arrives
        # here as "a  b" and would otherwise never equal the name "a b".
        given_noun = ' '.join(self.extract_nouns(input_text).lower().split())
        print("Given Noun:", given_noun)

        # Ignore missing values so a NaN cell can never be returned as a match.
        candidates = [name for name in self.column_to_check if isinstance(name, str)]
        # Normalised once and reused for the phrase and per-word lookups.
        known_names = {' '.join(name.lower().split()) for name in candidates}

        phrase_is_known = given_noun in known_names
        print("Exact: ", phrase_is_known)
        if phrase_is_known:
            return given_noun, "exact"

        # Reduce the phrase to its word characters separated by single spaces.
        preprocessed_noun = ' '.join(
            part for part in re.split(r'\W+', given_noun) if part
        )
        print("Preprocessed Noun:", preprocessed_noun)

        words = preprocessed_noun.split()
        print("Words:", words)

        for word in words:
            print(f"Checking word: {word}")
            word_is_known = word in known_names
            print(f"Exact Match for '{word}': ", word_is_known)
            if word_is_known:
                return word, "exact"

        # With no usable names there is nothing to rank; max() on an empty
        # list would raise ValueError.
        if not candidates:
            return self._DEFAULT_LOCATION, ""

        similarity_scores = [
            SequenceMatcher(None, preprocessed_noun, name.lower()).ratio()
            for name in candidates
        ]
        best_score = max(similarity_scores)
        # list.index returns the first occurrence, so ties go to the earliest name.
        closest_match = candidates[similarity_scores.index(best_score)]
        print("Closest: ", closest_match, best_score)

        if best_score >= self._MIN_SIMILARITY:
            return closest_match, "closest"
        return self._DEFAULT_LOCATION, ""
def convert_text_to_text(text, s_lang, d_lang):
    """Convert Indic-script digits in *text* to ASCII digits, then translate it.

    Telugu, Tamil, Malayalam, Gujarati, Bengali and Kannada digits are
    replaced with "0"-"9" before the text is handed to the translator.

    Args:
        text: The text to translate.
        s_lang: Source language code, passed to the translator as ``src``.
        d_lang: Destination language code, passed to the translator as ``dest``.

    Returns:
        The ``.text`` attribute of the translator's result.
    """
    indic_to_ascii_digits = {
        # Telugu numerals
        '౧': '1', '౨': '2', '౩': '3', '౪': '4', '౫': '5',
        '౬': '6', '౭': '7', '౮': '8', '౯': '9', '౦': '0',
        # Tamil numerals
        '௧': '1', '௨': '2', '௩': '3', '௪': '4', '௫': '5',
        '௬': '6', '௭': '7', '௮': '8', '௯': '9', '௦': '0',
        # Malayalam numerals
        '൧': '1', '൨': '2', '൩': '3', '൪': '4', '൫': '5',
        '൬': '6', '൭': '7', '൮': '8', '൯': '9', '൦': '0',
        # Gujarati numerals
        '૧': '1', '૨': '2', '૩': '3', '૪': '4', '૫': '5',
        '૬': '6', '૭': '7', '૮': '8', '૯': '9', '૦': '0',
        # Bengali numerals
        '১': '1', '২': '2', '৩': '3', '৪': '4', '৫': '5',
        '৬': '6', '৭': '7', '৮': '8', '৯': '9', '০': '0',
        # Kannada numerals
        '೧': '1', '೨': '2', '೩': '3', '೪': '4', '೫': '5',
        '೬': '6', '೭': '7', '೮': '8', '೯': '9', '೦': '0',
    }
    # Every key is a single character and no replacement is itself a key, so
    # one translate() pass gives the same result as replacing them one by one.
    normalized_text = text.translate(str.maketrans(indic_to_ascii_digits))

    # NOTE(review): `translator` is read from module scope, but the lines that
    # would import and create it are commented out near the top of this file.
    # Unless it is defined elsewhere, this call raises NameError.
    translation = translator.translate(text=normalized_text, src=s_lang, dest=d_lang)
    return translation.text
# location_extractor = LocationExtractor()
# user_text = convert_text_to_text("గత నెలలో ఉస్మాన్‌సాగర్ నీటి వ్యాప్తి ఏమిటి", "te", "en")
# print(user_text)
# location_extractor.extract_entities(user_text)