"""Extract a location (ROI) name from free text and match it against a CSV.

The module tokenises and POS-tags a query with NLTK, drops everything that is
not a candidate location word, and then looks the remainder up in the
``ROI_Name`` column of a CSV file (exact match first, fuzzy match second).
"""

import json
import re
import string
from difflib import SequenceMatcher
from typing import Tuple

import nltk
import pandas as pd
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords

# Optional translation backend (third-party, disabled by default):
# from googletrans import Translator

# One-time NLTK resource downloads needed by word_tokenize / pos_tag / stopwords:
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

# translator = Translator()

# Pre-compile regular expressions for efficiency
date_pattern = re.compile(r'\b(year|week|month|day)\b', re.IGNORECASE)
preposition_pattern = re.compile(r'\b(last|previous)\b', re.IGNORECASE)
# Each stop word is escaped so that it is always matched literally.
stopword_pattern = re.compile(
    r'\b(' + '|'.join(re.escape(w) for w in stopwords.words("english")) + r')\b',
    re.IGNORECASE,
)

# Penn Treebank tags whose tokens are never kept by extract_nouns.
_EXCLUDED_POS_TAGS = frozenset({
    'WP', 'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT', 'POS', 'PRP', 'PRP$',
    'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
    'VBZ', 'WDT', 'WP$', 'WRB',
})

# Tokens containing any of these substrings (case-insensitive) are dropped.
_DOMAIN_TERMS = ('spread', 'water', 'level')

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Native-script digits mapped to their ASCII equivalents.
_NUMERAL_MAPPINGS = {
    # Telugu numerals
    '౧': '1', '౨': '2', '౩': '3', '౪': '4', '౫': '5',
    '౬': '6', '౭': '7', '౮': '8', '౯': '9', '౦': '0',
    # Tamil numerals
    '௧': '1', '௨': '2', '௩': '3', '௪': '4', '௫': '5',
    '௬': '6', '௭': '7', '௮': '8', '௯': '9', '௦': '0',
    # Malayalam numerals
    '൧': '1', '൨': '2', '൩': '3', '൪': '4', '൫': '5',
    '൬': '6', '൭': '7', '൮': '8', '൯': '9', '൦': '0',
    # Gujarati numerals
    '૧': '1', '૨': '2', '૩': '3', '૪': '4', '૫': '5',
    '૬': '6', '૭': '7', '૮': '8', '૯': '9', '૦': '0',
    # Bengali numerals
    '১': '1', '২': '2', '৩': '3', '৪': '4', '৫': '5',
    '৬': '6', '৭': '7', '৮': '8', '৯': '9', '০': '0',
    # Kannada numerals
    '೧': '1', '೨': '2', '೩': '3', '೪': '4', '೫': '5',
    '೬': '6', '೭': '7', '೮': '8', '೯': '9', '೦': '0',
}
# Built once so numeral conversion is a single pass over the text.
_NUMERAL_TABLE = str.maketrans(_NUMERAL_MAPPINGS)


class LocationExtractor:
    """Match location words found in a text against the ``ROI_Name`` column."""

    # Returned (with an empty match kind) when nothing matches well enough.
    DEFAULT_LOCATION = "himayatsagar"
    # Minimum SequenceMatcher ratio for a fuzzy match to be accepted.
    MATCH_CUTOFF = 0.5

    def __init__(self, csv_file_path='ISROP.csv'):
        """Load the CSV and keep a lower-cased copy of its ``ROI_Name`` column.

        Args:
            csv_file_path: Path of the CSV file; it must contain a
                ``ROI_Name`` column.
        """
        # Load the CSV file into a DataFrame
        self.df = pd.read_csv(csv_file_path)
        self.column_to_check = self.df['ROI_Name'].str.lower()

    def extract_nouns(self, text: str) -> str:
        """Return the tokens of *text* that survive all filters, space-joined.

        A token is dropped when its POS tag is in the excluded set, when it
        starts with a stop word, a ``last``/``previous`` marker or a date unit
        (as matched by the module-level patterns), or when it contains one of
        the domain terms. ASCII punctuation is removed from the joined result.

        Args:
            text: Raw input sentence.

        Returns:
            The remaining words joined by single spaces (possibly empty).
        """
        kept = []
        for word, tag in pos_tag(word_tokenize(text)):
            if tag in _EXCLUDED_POS_TAGS:
                continue
            if (stopword_pattern.match(word)
                    or preposition_pattern.match(word)
                    or date_pattern.match(word)):
                continue
            lowered = word.lower()
            if any(term in lowered for term in _DOMAIN_TERMS):
                continue
            kept.append(word)
        return ' '.join(kept).translate(_PUNCTUATION_TABLE)

    def extract_entities(self, input_text: str = "") -> Tuple[str, str]:
        """Find the ROI name that best matches the location words in a text.

        Matching order: the whole extracted phrase (exact), then each word of
        the phrase (exact), then the most similar ROI name by
        ``SequenceMatcher`` ratio if it reaches ``MATCH_CUTOFF``.

        Args:
            input_text: Raw input sentence.

        Returns:
            A ``(name, kind)`` pair where *kind* is ``"exact"``,
            ``"closest"``, or ``""`` when ``DEFAULT_LOCATION`` is returned
            because nothing matched.
        """
        given_noun = self.extract_nouns(input_text).lower().strip()
        print("Given Noun:", given_noun)

        # Rows without a name can never match; dropping them also keeps a
        # missing value from being offered as the "closest" location.
        candidates = self.column_to_check.dropna()
        # Normalise once instead of once per lookup.
        known_names = set(candidates.str.lower().str.strip())

        # Check for an exact match
        is_exact = given_noun in known_names
        print("Exact: ", is_exact)
        if is_exact:
            return given_noun, "exact"

        # Preprocess the given noun to improve matching: keep only the
        # non-empty alphanumeric runs.
        words = [piece for piece in re.split(r'\W+', given_noun) if piece]
        preprocessed_noun = ' '.join(words)
        print("Preprocessed Noun:", preprocessed_noun)
        print("Words:", words)

        # Check for an exact match of each individual word
        for word in words:
            print(f"Checking word: {word}")
            is_word_exact = word in known_names
            print(f"Exact Match for '{word}': ", is_word_exact)
            if is_word_exact:
                return word, "exact"

        # No exact match: fall back to the closest match for the full phrase.
        # The strict '>' keeps the first of several equally good candidates.
        best_name, best_score = None, -1.0
        for name in candidates:
            score = SequenceMatcher(
                None, preprocessed_noun, str(name).lower()
            ).ratio()
            if score > best_score:
                best_name, best_score = name, score

        if best_name is None:
            # Nothing to compare against (empty column).
            return self.DEFAULT_LOCATION, ""

        print("Closest: ", best_name, best_score)
        # Adjust MATCH_CUTOFF according to your needs
        if best_score >= self.MATCH_CUTOFF:
            return best_name, "closest"
        # If no match is found, return the default value
        return self.DEFAULT_LOCATION, ""


def convert_text_to_text(text, s_lang, d_lang, translator=None):
    """Convert native-script digits to ASCII, then translate the text.

    Args:
        text: Text to translate.
        s_lang: Source language code passed to the translator as ``src``.
        d_lang: Destination language code passed to the translator as ``dest``.
        translator: Object exposing ``translate(text=..., src=..., dest=...)``
            whose result has a ``.text`` attribute. When omitted, a
            module-level ``translator`` is used if one has been defined.

    Returns:
        The ``.text`` attribute of the translator's result.

    Raises:
        NameError: If no translator is passed and none is defined at module
            level.
    """
    text = text.translate(_NUMERAL_TABLE)

    if translator is None:
        # Backward compatibility: honour a module-level ``translator``.
        translator = globals().get('translator')
    if translator is None:
        raise NameError(
            "no translator available: pass translator=... or define a "
            "module-level 'translator'"
        )
    # NOTE(review): assumes translate() is synchronous and returns an object
    # with ``.text`` — confirm against the translator library version in use.
    return translator.translate(text=text, src=s_lang, dest=d_lang).text


# Example usage:
# location_extractor = LocationExtractor()
# user_text = convert_text_to_text("గత నెలలో ఉస్మాన్‌సాగర్ నీటి వ్యాప్తి ఏమిటి", "te", "en")
# print(user_text)
# location_extractor.extract_entities(user_text)