import string
import nltk
import json
import re
from nltk import pos_tag, word_tokenize
import pandas as pd
from nltk.corpus import stopwords
from difflib import SequenceMatcher
# from googletrans import Translator

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# translator = Translator()

# Regular expressions shared by the token filter, compiled once at import time.
# Each one matches a whole word (delimited by word boundaries), ignoring case.
date_pattern = re.compile(r'\b(year|week|month|day)\b', flags=re.IGNORECASE)
preposition_pattern = re.compile(r'\b(last|previous)\b', flags=re.IGNORECASE)
# Alternation over every entry of NLTK's English stop-word list.
_stopword_alternatives = '|'.join(stopwords.words("english"))
stopword_pattern = re.compile(
    r'\b(' + _stopword_alternatives + r')\b',
    flags=re.IGNORECASE,
)

class LocationExtractor:
    """Pull a location name out of free text and match it to a known name.

    The known names are read from the ``ROI_Name`` column of a CSV file and
    kept, lower-cased, in ``self.column_to_check``.
    """

    # Part-of-speech tags (as returned by ``pos_tag``) whose tokens are dropped.
    _EXCLUDED_POS_TAGS = frozenset({
        'WP', 'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT', 'POS', 'PRP', 'PRP$',
        'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
        'VBZ', 'WDT', 'WP$', 'WRB',
    })
    # Tokens containing any of these substrings (case-insensitive) are dropped.
    _DOMAIN_KEYWORDS = ('spread', 'water', 'level')
    # Translation table that deletes every ASCII punctuation character.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    # Name returned when nothing matches well enough.
    DEFAULT_LOCATION = "himayatsagar"
    # Minimum SequenceMatcher ratio for a fuzzy match to be accepted.
    SIMILARITY_CUTOFF = 0.5

    def __init__(self, csv_file_path='ISROP.csv'):
        """Load the CSV at *csv_file_path* and cache its lower-cased names.

        Args:
            csv_file_path: Path to a CSV file that has a ``ROI_Name`` column.
        """
        self.df = pd.read_csv(csv_file_path)
        self.column_to_check = self.df['ROI_Name'].str.lower()

    def extract_nouns(self, text):
        """Return *text* reduced to the tokens that may name a location.

        A token is dropped when its part-of-speech tag is in
        ``_EXCLUDED_POS_TAGS``, when it starts with a match of the module-level
        stop-word, preposition or date patterns, or when it contains one of
        ``_DOMAIN_KEYWORDS``. The surviving tokens are joined with single
        spaces and ASCII punctuation is removed.

        Args:
            text: The sentence to filter.

        Returns:
            The filtered text (original casing preserved).
        """
        kept = []
        for word, pos in pos_tag(word_tokenize(text)):
            if pos in self._EXCLUDED_POS_TAGS:
                continue
            # NOTE(review): ``match`` anchors only at the start of the token,
            # so a token that merely begins with a listed word followed by a
            # non-word character is dropped as well -- confirm this is wanted.
            if (stopword_pattern.match(word)
                    or preposition_pattern.match(word)
                    or date_pattern.match(word)):
                continue
            lowered = word.lower()
            if any(keyword in lowered for keyword in self._DOMAIN_KEYWORDS):
                continue
            kept.append(word)

        return ' '.join(kept).translate(self._PUNCTUATION_TABLE)

    def extract_entities(self, input_text=""):
        """Find the known location that best matches *input_text*.

        Args:
            input_text: Free-text query.

        Returns:
            A ``(name, kind)`` tuple where *kind* is ``"exact"`` when the
            filtered text (or one of its words) equals a known name,
            ``"closest"`` when the best fuzzy match reaches
            ``SIMILARITY_CUTOFF``, and ``""`` when ``DEFAULT_LOCATION`` is
            returned because nothing matched.
        """
        given_noun = self.extract_nouns(input_text).lower().strip()
        print("Given Noun:", given_noun)
        return self._match_location(given_noun)

    def _match_location(self, given_noun):
        """Match an already filtered, lower-cased phrase against known names.

        Known names are lower-cased and stripped once per call; missing
        entries are ignored so they can never be returned as a match.
        """
        normalized = self.column_to_check.str.lower().str.strip().dropna()
        known_names = normalized.tolist()
        known_set = set(known_names)

        # 1. The whole phrase is a known name.
        is_exact = given_noun in known_set
        print("Exact: ", is_exact)
        if is_exact:
            return given_noun, "exact"

        # 2. Any single word of the phrase is a known name.
        words = re.findall(r'\w+', given_noun)
        preprocessed_noun = ' '.join(words)
        print("Preprocessed Noun:", preprocessed_noun)
        print("Words:", words)

        for word in words:
            print(f"Checking word: {word}")
            word_is_known = word in known_set
            print(f"Exact Match for '{word}': ", word_is_known)
            if word_is_known:
                return word, "exact"

        # 3. Fuzzy match of the whole phrase. With no usable names there is
        #    nothing to score, so fall straight through to the default.
        if not known_names:
            return self.DEFAULT_LOCATION, ""

        similarity_scores = [
            SequenceMatcher(None, preprocessed_noun, name).ratio()
            for name in known_names
        ]
        # ``max`` keeps the first of several equally good candidates.
        best_index = max(range(len(similarity_scores)),
                         key=similarity_scores.__getitem__)
        closest_match = known_names[best_index]
        best_score = similarity_scores[best_index]
        print("Closest: ", closest_match, best_score)

        if best_score >= self.SIMILARITY_CUTOFF:
            return closest_match, "closest"
        return self.DEFAULT_LOCATION, ""

def convert_text_to_text(text, s_lang, d_lang):
    """Translate *text* from *s_lang* to *d_lang* after normalising digits.

    Telugu, Tamil, Malayalam, Gujarati, Bengali and Kannada numerals in
    *text* are first replaced by their ASCII equivalents; the result is then
    passed to the module-level ``translator`` object.

    NOTE(review): ``translator`` is not defined in the visible part of this
    module (its assignment is commented out), so it must be provided at module
    level before this function is called.

    Args:
        text: The text to translate.
        s_lang: Source language code handed to ``translator.translate``.
        d_lang: Destination language code handed to ``translator.translate``.

    Returns:
        The ``text`` attribute of the translation result.
    """
    ascii_digits = '1234567890'
    native_digit_runs = (
        '౧౨౩౪౫౬౭౮౯౦',  # Telugu
        '௧௨௩௪௫௬௭௮௯௦',  # Tamil
        '൧൨൩൪൫൬൭൮൯൦',  # Malayalam
        '૧૨૩૪૫૬૭૮૯૦',  # Gujarati
        '১২৩৪৫৬৭৮৯০',  # Bengali
        '೧೨೩೪೫೬೭೮೯೦',  # Kannada
    )
    numeral_mappings = {
        native: latin
        for run in native_digit_runs
        for native, latin in zip(run, ascii_digits)
    }

    # Every key is a single character, so one translate pass is equivalent to
    # replacing the numerals one at a time.
    normalised = text.translate(str.maketrans(numeral_mappings))

    result = translator.translate(text=normalised, src=s_lang, dest=d_lang)
    return result.text

# location_extractor = LocationExtractor()
# user_text = convert_text_to_text("గత నెలలో ఉస్మాన్‌సాగర్ నీటి వ్యాప్తి ఏమిటి", "te", "en")
# print(user_text)

# location_extractor.extract_entities(user_text)