# Spaces:
# Sleeping
# Sleeping
| import string | |
| import nltk | |
| import json | |
| import re | |
| from nltk import pos_tag, word_tokenize | |
| import pandas as pd | |
| from nltk.corpus import stopwords | |
| from difflib import SequenceMatcher | |
| # from googletrans import Translator | |
| # nltk.download('punkt') | |
| # nltk.download('averaged_perceptron_tagger') | |
| # nltk.download('stopwords') | |
| # translator = Translator() | |
# Patterns used by LocationExtractor.extract_nouns, compiled once at import
# time so the per-token filtering loop never recompiles them.
_CI = re.IGNORECASE
date_pattern = re.compile(r'\b(year|week|month|day)\b', _CI)
preposition_pattern = re.compile(r'\b(last|previous)\b', _CI)
stopword_pattern = re.compile(r'\b(' + '|'.join(stopwords.words("english")) + r')\b', _CI)
class LocationExtractor:
    """Extract a location (ROI) name from free text.

    Candidate nouns are pulled out of the text with NLTK POS tagging and
    matched — exactly, then fuzzily — against the ``ROI_Name`` column of a
    CSV file.
    """

    def __init__(self, csv_file_path='ISROP.csv'):
        # Load the CSV file into a DataFrame.
        self.df = pd.read_csv(csv_file_path)
        # Normalize ROI names once here so every lookup in extract_entities
        # can compare directly instead of re-lowering/stripping per call.
        self.column_to_check = self.df['ROI_Name'].str.lower().str.strip()

    def extract_nouns(self, text):
        """Return *text* reduced to candidate location words.

        Drops tokens whose POS tag is clearly not noun-like, stopwords,
        date/preposition words, and the domain terms 'spread'/'water'/'level',
        then strips punctuation from the joined result.
        """
        words = word_tokenize(text)
        pos_tags = pos_tag(words)
        # Set membership is O(1); the original list also contained 'WP' twice.
        excluded_tags = {
            'WP', 'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT', 'POS', 'PRP',
            'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG',
            'VBN', 'VBP', 'VBZ', 'WDT', 'WP$', 'WRB',
        }
        # Domain words that never name a location in this application.
        excluded_substrings = ('spread', 'water', 'level')
        filtered_words = [
            word for word, pos in pos_tags
            if pos not in excluded_tags
            and not stopword_pattern.match(word)
            and not preposition_pattern.match(word)
            and not date_pattern.match(word)
            and not any(sub in word.lower() for sub in excluded_substrings)
        ]
        filtered_text = ' '.join(filtered_words)
        # Remove all punctuation in a single C-level pass.
        return filtered_text.translate(str.maketrans('', '', string.punctuation))

    def extract_entities(self, input_text=""):
        """Return ``(location, match_kind)`` for *input_text*.

        ``match_kind`` is ``"exact"`` for an exact ROI-name match (of the
        whole noun phrase or any single word in it), ``"closest"`` for a
        fuzzy match with similarity >= 0.5, or ``""`` when falling back to
        the default location ``"himayatsagar"``.
        """
        given_noun = self.extract_nouns(input_text).lower().strip()
        print("Given Noun:", given_noun)
        # column_to_check is already lowercased and stripped in __init__.
        exact_match = self.column_to_check.eq(given_noun)
        print("Exact: ", exact_match.any())
        if exact_match.any():
            return given_noun, "exact"
        # Collapse non-word characters, e.g. "osman-sagar" -> "osman sagar".
        preprocessed_noun = ' '.join(filter(lambda x: x.strip(), re.split(r'\W+', given_noun)))
        print("Preprocessed Noun:", preprocessed_noun)
        words = preprocessed_noun.split()
        print("Words:", words)
        # Try each individual word for an exact match.
        for word in words:
            print(f"Checking word: {word}")
            exact_match_word = self.column_to_check.eq(word)
            print(f"Exact Match for '{word}': ", exact_match_word.any())
            if exact_match_word.any():
                return word, "exact"
        # No exact match: fuzzy-match the full preprocessed noun against
        # every ROI name.
        similarity_scores = [
            SequenceMatcher(None, preprocessed_noun, str(target).lower()).ratio()
            for target in self.column_to_check
        ]
        # Guard: an empty CSV would make max() raise ValueError.
        if not similarity_scores:
            return "himayatsagar", ""
        closest_match_index = similarity_scores.index(max(similarity_scores))
        closest_match = self.column_to_check.iloc[closest_match_index]
        print("Closest: ", closest_match, similarity_scores[closest_match_index])
        # Accept the fuzzy match only above this cutoff (tune as needed).
        if similarity_scores[closest_match_index] >= 0.5:
            return closest_match, "closest"
        # Fall back to the default location when nothing matched.
        return "himayatsagar", ""
def convert_text_to_text(text, s_lang, d_lang):
    """Translate *text* from *s_lang* to *d_lang*, first normalizing
    regional Indic numerals to ASCII digits.

    NOTE(review): this relies on a module-level ``translator`` instance,
    whose googletrans setup is commented out at the top of the file —
    calling this function raises NameError until it is re-enabled.
    """
    numeral_mappings = {
        # Telugu numerals
        '౧': '1', '౨': '2', '౩': '3', '౪': '4', '౫': '5',
        '౬': '6', '౭': '7', '౮': '8', '౯': '9', '౦': '0',
        # Tamil numerals
        '௧': '1', '௨': '2', '௩': '3', '௪': '4', '௫': '5',
        '௬': '6', '௭': '7', '௮': '8', '௯': '9', '௦': '0',
        # Malayalam numerals
        '൧': '1', '൨': '2', '൩': '3', '൪': '4', '൫': '5',
        '൬': '6', '൭': '7', '൮': '8', '൯': '9', '൦': '0',
        # Gujarati numerals
        '૧': '1', '૨': '2', '૩': '3', '૪': '4', '૫': '5',
        '૬': '6', '૭': '7', '૮': '8', '૯': '9', '૦': '0',
        # Bengali numerals
        '১': '1', '২': '2', '৩': '3', '৪': '4', '৫': '5',
        '৬': '6', '৭': '7', '৮': '8', '৯': '9', '০': '0',
        # Kannada numerals
        '೧': '1', '೨': '2', '೩': '3', '೪': '4', '೫': '5',
        '೬': '6', '೭': '7', '೮': '8', '೯': '9', '೦': '0',
    }
    # One C-level pass instead of 60 chained .replace() scans.
    text = text.translate(str.maketrans(numeral_mappings))
    return translator.translate(text=text, src=s_lang, dest=d_lang).text
| # location_extractor = LocationExtractor() | |
| # user_text = convert_text_to_text("గత నెలలో ఉస్మాన్సాగర్ నీటి వ్యాప్తి ఏమిటి", "te", "en") | |
| # print(user_text) | |
| # location_extractor.extract_entities(user_text) |