# Spaces:
# Sleeping
# Sleeping
| import string | |
| import nltk | |
| import json | |
| import re | |
| from nltk import pos_tag, word_tokenize | |
| import pandas as pd | |
| from nltk.corpus import stopwords | |
| from difflib import SequenceMatcher | |
| # from googletrans import Translator | |
| # nltk.download('punkt') | |
| # nltk.download('averaged_perceptron_tagger') | |
| # nltk.download('stopwords') | |
| # translator = Translator() | |
# Patterns used by LocationExtractor.extract_nouns, compiled once at import
# time so the per-token filtering loop never recompiles them.
_CI = re.IGNORECASE
date_pattern = re.compile(r'\b(year|week|month|day)\b', _CI)
preposition_pattern = re.compile(r'\b(last|previous)\b', _CI)
stopword_pattern = re.compile(r'\b(' + '|'.join(stopwords.words("english")) + r')\b', _CI)
class LocationExtractor:
    """Extract a location (ROI) name from free text.

    Candidate nouns are pulled out of the text with NLTK POS tagging and
    matched — exactly, then fuzzily — against the ``ROI_Name`` column of a
    CSV file.
    """

    def __init__(self, csv_file_path='ISROP.csv'):
        # Load the CSV file into a DataFrame.
        self.df = pd.read_csv(csv_file_path)
        # Normalize ROI names once here so every lookup in extract_entities
        # can compare directly instead of re-lowering/stripping per call.
        self.column_to_check = self.df['ROI_Name'].str.lower().str.strip()

    def extract_nouns(self, text):
        """Return *text* reduced to candidate location words.

        Drops tokens whose POS tag is clearly not noun-like, stopwords,
        date/preposition words, and the domain terms 'spread'/'water'/'level',
        then strips punctuation from the joined result.
        """
        words = word_tokenize(text)
        pos_tags = pos_tag(words)
        # Set membership is O(1); the original list also contained 'WP' twice.
        excluded_tags = {
            'WP', 'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT', 'POS', 'PRP',
            'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG',
            'VBN', 'VBP', 'VBZ', 'WDT', 'WP$', 'WRB',
        }
        # Domain words that never name a location in this application.
        excluded_substrings = ('spread', 'water', 'level')
        filtered_words = [
            word for word, pos in pos_tags
            if pos not in excluded_tags
            and not stopword_pattern.match(word)
            and not preposition_pattern.match(word)
            and not date_pattern.match(word)
            and not any(sub in word.lower() for sub in excluded_substrings)
        ]
        filtered_text = ' '.join(filtered_words)
        # Remove all punctuation in a single C-level pass.
        return filtered_text.translate(str.maketrans('', '', string.punctuation))

    def extract_entities(self, input_text=""):
        """Return ``(location, match_kind)`` for *input_text*.

        ``match_kind`` is ``"exact"`` for an exact ROI-name match (of the
        whole noun phrase or any single word in it), ``"closest"`` for a
        fuzzy match with similarity >= 0.5, or ``""`` when falling back to
        the default location ``"himayatsagar"``.
        """
        given_noun = self.extract_nouns(input_text).lower().strip()
        print("Given Noun:", given_noun)
        # column_to_check is already lowercased and stripped in __init__.
        exact_match = self.column_to_check.eq(given_noun)
        print("Exact: ", exact_match.any())
        if exact_match.any():
            return given_noun, "exact"
        # Collapse non-word characters, e.g. "osman-sagar" -> "osman sagar".
        preprocessed_noun = ' '.join(filter(lambda x: x.strip(), re.split(r'\W+', given_noun)))
        print("Preprocessed Noun:", preprocessed_noun)
        words = preprocessed_noun.split()
        print("Words:", words)
        # Try each individual word for an exact match.
        for word in words:
            print(f"Checking word: {word}")
            exact_match_word = self.column_to_check.eq(word)
            print(f"Exact Match for '{word}': ", exact_match_word.any())
            if exact_match_word.any():
                return word, "exact"
        # No exact match: fuzzy-match the full preprocessed noun against
        # every ROI name.
        similarity_scores = [
            SequenceMatcher(None, preprocessed_noun, str(target).lower()).ratio()
            for target in self.column_to_check
        ]
        # Guard: an empty CSV would make max() raise ValueError.
        if not similarity_scores:
            return "himayatsagar", ""
        closest_match_index = similarity_scores.index(max(similarity_scores))
        closest_match = self.column_to_check.iloc[closest_match_index]
        print("Closest: ", closest_match, similarity_scores[closest_match_index])
        # Accept the fuzzy match only above this cutoff (tune as needed).
        if similarity_scores[closest_match_index] >= 0.5:
            return closest_match, "closest"
        # Fall back to the default location when nothing matched.
        return "himayatsagar", ""
def convert_text_to_text(text, s_lang, d_lang):
    """Translate *text* from *s_lang* to *d_lang*, first normalizing
    regional Indic numerals to ASCII digits.

    NOTE(review): this relies on a module-level ``translator`` instance,
    whose googletrans setup is commented out at the top of the file —
    calling this function raises NameError until it is re-enabled.
    """
    numeral_mappings = {
        # Telugu numerals
        '౧': '1', '౨': '2', '౩': '3', '౪': '4', '౫': '5',
        '౬': '6', '౭': '7', '౮': '8', '౯': '9', '౦': '0',
        # Tamil numerals
        '௧': '1', '௨': '2', '௩': '3', '௪': '4', '௫': '5',
        '௬': '6', '௭': '7', '௮': '8', '௯': '9', '௦': '0',
        # Malayalam numerals
        '൧': '1', '൨': '2', '൩': '3', '൪': '4', '൫': '5',
        '൬': '6', '൭': '7', '൮': '8', '൯': '9', '൦': '0',
        # Gujarati numerals
        '૧': '1', '૨': '2', '૩': '3', '૪': '4', '૫': '5',
        '૬': '6', '૭': '7', '૮': '8', '૯': '9', '૦': '0',
        # Bengali numerals
        '১': '1', '২': '2', '৩': '3', '৪': '4', '৫': '5',
        '৬': '6', '৭': '7', '৮': '8', '৯': '9', '০': '0',
        # Kannada numerals
        '೧': '1', '೨': '2', '೩': '3', '೪': '4', '೫': '5',
        '೬': '6', '೭': '7', '೮': '8', '೯': '9', '೦': '0',
    }
    # One C-level pass instead of 60 chained .replace() scans.
    text = text.translate(str.maketrans(numeral_mappings))
    return translator.translate(text=text, src=s_lang, dest=d_lang).text
| # location_extractor = LocationExtractor() | |
| # user_text = convert_text_to_text("గత నెలలో ఉస్మాన్సాగర్ నీటి వ్యాప్తి ఏమిటి", "te", "en") | |
| # print(user_text) | |
| # location_extractor.extract_entities(user_text) |