import re
import math
from collections import Counter


class LexicalModule:
    """Extract lexical features (counts and ratios) from a tokenised text.

    The instance holds English and Italian lexicons (demonstratives,
    deictics, emphatic particles, first-person forms, articles) plus the
    external lookup resources passed to the constructor. ``analyze`` picks
    the lexicons for the requested language and returns a flat feature dict.
    """

    def __init__(self, r1, r2, r3, mrc):
        """Store the lookup resources and build the per-language lexicons.

        Args:
            r1, r2, r3: Containers of lemmas. ``analyze`` only tests lemma
                membership in them (``lemma in r1``).
            mrc: Mapping-like object with a ``get`` method, from lemma to a
                numeric score. Scores <= 0 are treated as "no rating".
        """
        self.r1, self.r2, self.r3, self.mrc = r1, r2, r3, mrc

        # NOTE(review): all lexicons below are matched against the lower-cased
        # text of ONE token at a time. Entries containing a space ("kind of",
        # "last week", "senza dubbio", ...) can therefore only match if the
        # tokenizer emits them as a single token -- confirm whether phrase
        # matching was intended.

        # --- English sets ---
        self.demons_set_en = {"this", "that", "these", "those"}
        self.deictics_set_en = {
            "i", "me", "my", "mine", "we", "us", "our", "ours",
            "you", "your", "yours", "he", "him", "his", "she",
            "her", "hers", "it", "its", "they", "them", "their", "theirs",
            "here", "there", "this", "these", "that", "those",
            "yonder", "above", "below", "in front", "behind",
            "now", "then", "today", "yesterday", "tomorrow", "tonight",
            "last week", "last month", "last year", "next week",
            "next month", "next year", "ago", "soon", "later", "currently",
            "sir", "madam", "ma'am", "your honor", "professor", "doctor",
            "the former", "the latter", "above-mentioned", "as follows", "hereby",
            "this way", "that time", "these days", "those years",
            "thus", "therefore", "henceforth"
        }
        self.emphatics_en = {
            "just", "even", "so", "really", "absolutely",
            "quite", "indeed", "definitely", "certainly",
            "rather", "pretty", "precisely", "actually",
            "totally", "completely", "utterly", "extremely",
            "basically", "simply", "literally", "honestly",
            "frankly", "truly", "surely", "clearly", "obviously",
            "perfectly", "exactly", "merely", "only", "still",
            "yet", "already", "else", "ever", "never", "always",
            "very", "much", "too", "enough", "well", "oh",
            "wow", "hey", "like", "such", "right", "now",
            "then", "further", "more", "less", "super",
            "incredibly", "remarkably", "genuinely",
            "positively", "unbelievably",
            "extraordinarily", "immensely", "vastly",
            "somewhat", "kind of", "sort of",
            "pretty much", "practically", "essentially",
            "virtually", "nearly", "almost", "about",
            "roughly", "approximately", "potentially",
            "arguably", "seemingly", "apparently",
            "theoretically", "conceptually", "fundamentally",
            "relatively", "comparatively", "distinctly",
            "considerably", "significantly", "substantially",
            "surprisingly", "unexpectedly", "oddly",
            "strangely", "curiously", "ironically",
            "paradoxically", "undoubtedly", "indisputably", "categorically"
        }
        self.first_pers_en = {"i", "me", "myself"}
        self.articles_en = {"a", "an", "the", "some"}
        self.definite_en = {"the"}

        # --- Italian sets ---
        self.demons_set_it = {
            "questo", "questa", "questi", "queste",
            "quello", "quella", "quelli", "quelle",
            "codesto", "codesta", "codesti", "codeste",
            "ciò", "tal", "tale", "tali"
        }
        self.deictics_set_it = {
            # Personal
            "io", "me", "mi", "mio", "mia", "miei", "mie",
            "noi", "ci", "nostro", "nostra", "nostri", "nostre",
            "tu", "te", "ti", "tuo", "tua", "tuoi", "tue",
            "voi", "vi", "vostro", "vostra", "vostri", "vostre",
            "lui", "lo", "gli", "suo", "sua", "suoi", "sue",
            "lei", "la", "loro", "li", "le",
            "essi", "esse", "sé", "si",
            # Spatial
            "qui", "qua", "là", "lì", "lassù", "laggiù",
            "davanti", "dietro", "sopra", "sotto", "accanto",
            "vicino", "lontano", "oltre", "presso",
            # Temporal
            "ora", "adesso", "oggi", "ieri", "domani", "stanotte",
            "settimana scorsa", "mese scorso", "anno scorso",
            "prossima settimana", "prossimo mese", "prossimo anno",
            "presto", "tardi", "attualmente", "nel frattempo",
            # Social
            "signore", "signora", "professore", "dottore", "eccellenza",
            # Discourse
            "il suddetto", "il sottoscritto", "quanto sopra", "come segue",
            # Adverbial demonstratives
            "così", "pertanto", "quindi", "allora", "dunque"
        }
        self.emphatics_it = {
            "molto", "tanto", "assai", "parecchio", "alquanto",
            "enormemente", "immensamente", "incredibilmente", "straordinariamente",
            "oltremodo", "sommamente", "estremamente", "tremendamente",
            "proprio", "davvero", "veramente", "realmente", "effettivamente",
            "concretamente", "precisamente", "esattamente", "letteralmente",
            "certamente", "sicuramente", "assolutamente", "decisamente",
            "indubbiamente", "senz'altro", "senza dubbio", "ovviamente",
            "naturalmente", "chiaramente", "evidentemente",
            "abbastanza", "piuttosto", "relativamente",
            "praticamente", "quasi", "circa", "pressappoco", "sostanzialmente",
            "fondamentalmente", "essenzialmente", "in sostanza", "in fondo",
            "tutto sommato", "grosso modo", "più o meno",
            "solo", "soltanto", "solamente", "appena", "persino", "perfino",
            "addirittura", "almeno", "quanto meno", "perlomeno",
            "già", "ancora", "mai", "sempre", "pure", "anche",
            "insomma", "dopotutto", "in effetti", "anzi", "ecco",
            "beh", "bene", "cioè"
        }
        self.first_pers_it = {"io", "me", "mi", "me stesso", "me stessa"}
        # Italian articles (definite + common indefinite)
        self.articles_it = {"il", "lo", "la", "i", "gli", "le", "un", "uno", "una", "un'"}
        self.definite_it = {"il", "lo", "la", "i", "gli", "le"}

    def _lexicons_for(self, lang):
        """Return (demonstratives, deictics, emphatics, first-person, articles, definite) for *lang*."""
        if lang == "it":
            return (
                self.demons_set_it,
                self.deictics_set_it,
                self.emphatics_it,
                self.first_pers_it,
                self.articles_it,
                self.definite_it,
            )
        # Any language other than "it" falls back to the English lexicons.
        return (
            self.demons_set_en,
            self.deictics_set_en,
            self.emphatics_en,
            self.first_pers_en,
            self.articles_en,
            self.definite_en,
        )

    @staticmethod
    def _count_members(words, lexicon):
        """Count how many items of *words* belong to *lexicon*."""
        return sum(1 for word in words if word in lexicon)

    @staticmethod
    def _count_adjacent_repeats(words, size):
        """Count positions where a run of *size* words is immediately repeated."""
        return sum(
            1
            for i in range(len(words) - 2 * size + 1)
            if words[i:i + size] == words[i + size:i + 2 * size]
        )

    @staticmethod
    def _share(count, total, digits=4):
        """Return ``count / total`` rounded to *digits*, or 0 when *total* is not positive."""
        return round(count / total, digits) if total > 0 else 0

    def analyze(self, doc, lemmas, tokens_text, token_text_SP, raw_text, sentences, paragraphs, lang="en"):
        """Compute the lexical feature dictionary for one text.

        Relies on the module-level functions ``d_value``,
        ``d_value_sballato`` and ``calculate_hdd``.

        Args:
            doc: Iterable of token objects exposing ``.text`` and ``.pos_``
                (presumably a spaCy ``Doc`` -- confirm against the caller).
            lemmas: Sequence of lemma strings.
            tokens_text: Sequence of token strings; its length is the
                denominator of the r1/r2/r3 and emphatic ratios.
            token_text_SP: Sized token collection used only as the
                denominator of ``hapax_ratio``.
            raw_text: The original text string.
            sentences: Sized collection of sentences (may be empty/None).
            paragraphs: Numeric divisor for ``indice_d_testo``.
            lang: ``"it"`` selects the Italian lexicons; anything else
                selects the English ones.

        Returns:
            dict of features. Note that the ``"TTR"`` key holds the Guiraud
            index (distinct lemmas / sqrt(lemma count)) and ``"gunning_fog"``
            holds the score returned by ``d_value``. Every ratio falls back
            to 0 when its denominator is empty or zero.
        """
        demons, deictics, emphatics, first_pers, articles, definite = self._lexicons_for(lang)

        # One pass over the tokens: collect lower-cased text and POS tallies.
        lowered = []
        noun_count = 0
        pron_count = 0
        for tok in doc:
            lowered.append(tok.text.lower())
            if tok.pos_ in ("NOUN", "PROPN"):
                noun_count += 1
            elif tok.pos_ == "PRON":
                pron_count += 1

        article_count = self._count_members(lowered, articles)
        demonstrative_count = self._count_members(lowered, demons)
        definite_count = self._count_members(lowered, definite)
        deictic_count = self._count_members(lowered, deictics)
        first_p_count = self._count_members(lowered, first_pers)
        emph_count = self._count_members(lowered, emphatics)

        n_tokens = len(tokens_text) if tokens_text else 0
        n_tokens_sp = len(token_text_SP) if token_text_SP else 0
        n_sentences = len(sentences) if sentences else 0
        # Per-sentence ratios treat a text without sentences as one sentence.
        num_sents = n_sentences or 1

        # Hapax legomena: token forms occurring exactly once.
        counts = Counter(tokens_text)
        hapax_total = sum(1.0 for freq in counts.values() if freq == 1)

        # Repetitions (language-agnostic: works on the raw, punctuation-free words).
        clean = re.sub(r'[^\w\s]', '', raw_text, flags=re.UNICODE).lower().split()
        reps = sum(self._count_adjacent_repeats(clean, size) for size in (1, 2, 3))

        guiraud = len(set(lemmas)) / math.sqrt(len(lemmas)) if lemmas else 0
        dvalue = d_value(raw_text, num_sents)[0]
        dvalue_sballato = d_value_sballato(raw_text, num_sents)[0]

        # Membership is tested per container instead of rebuilding the union
        # of r1/r2/r3 for every lemma.
        in_any_band = sum(
            1 for lemma in lemmas
            if lemma in self.r1 or lemma in self.r2 or lemma in self.r3
        )

        # Mean of the positive scores only; lemmas without a rating are skipped.
        rated = [
            score
            for score in (self.mrc.get(lemma, 0) for lemma in lemmas)
            if score > 0
        ]

        return {
            "r1": self._share(self._count_members(lemmas, self.r1), n_tokens),
            "r2": self._share(self._count_members(lemmas, self.r2), n_tokens),
            "r3": self._share(self._count_members(lemmas, self.r3), n_tokens),
            "other": max(0, len(lemmas) - in_any_band),
            "concreteness": sum(rated) / len(rated) if rated else 0,
            "hapax": hapax_total,
            "hapax_ratio": self._share(hapax_total, n_tokens_sp),
            "pron_noun_ratio": self._share(pron_count, noun_count),
            "reps_total": self._share(reps, n_sentences),
            "deictics": deictic_count,
            "articles": article_count,
            "pronouns": pron_count,
            "nouns": noun_count,
            "demonstratives": demonstrative_count,
            "definite_articles": definite_count,
            "demonstratives_ratio": self._share(demonstrative_count, num_sents),
            "definite_articles_ratio": self._share(definite_count, num_sents),
            "TTR": round(guiraud, 2),
            "gunning_fog": round(dvalue, 2),
            # Guarded like every other ratio: zero paragraphs used to raise
            # ZeroDivisionError here.
            "indice_d_testo": round(dvalue_sballato / paragraphs, 2) if paragraphs else 0,
            "HD-D": round(calculate_hdd(tokens_text), 2),
            "emphatic_particles": self._share(emph_count, n_tokens),
            "first_person_ratio": self._share(first_p_count, pron_count),
            "deictic_Frequency": self._share(deictic_count, article_count + deictic_count),
        }


# ---------------------------------------------------------------------------
# Module-level functions (carried over from the original version)
# ---------------------------------------------------------------------------

def d_value(text, num_sents):
    """Compute a Gunning-fog-style score for *text*.

    Punctuation is removed and the text lower-cased before splitting on
    whitespace. A word counts as polysyllabic when it contains at least
    three separate vowel groups.

    Letters of any alphabet are kept and accented vowels are counted, so
    words such as "è" or "verità" are no longer truncated or dropped.
    Results for pure-ASCII input are unchanged.

    Args:
        text: Raw text to score.
        num_sents: Number of sentences in *text*.

    Returns:
        Tuple ``(score, asl, pw)`` where ``asl`` is words per sentence
        (0 when ``num_sents`` is not positive), ``pw`` is the percentage of
        polysyllabic words (0 for empty text) and
        ``score = 0.4 * (asl + pw)``.
    """
    # \w is Unicode-aware for str patterns; underscores are stripped
    # explicitly because \w would otherwise keep them.
    words = re.sub(r'[^\w\s]|_', '', text).lower().split()
    num_words = len(words)
    asl = num_words / num_sents if num_sents > 0 else 0

    vowel_groups = re.compile(r'[aeiouyàáâãäåèéêëìíîïòóôõöùúûüýÿ]+')
    polysyllable_words = sum(
        1 for word in words if len(vowel_groups.findall(word)) >= 3
    )

    pw = (polysyllable_words / num_words * 100) if num_words > 0 else 0
    return 0.4 * (asl + pw), asl, pw


def d_value_sballato(text, num_sents):
    """Compute a variant of ``d_value`` that uses the raw word count.

    The score adds the total number of words (not the average sentence
    length) to the percentage of polysyllabic words. A word counts as
    polysyllabic when it contains at least three separate vowel groups.

    Letters of any alphabet are kept and accented vowels are counted, so
    accented words are no longer truncated or dropped. Results for
    pure-ASCII input are unchanged.

    Args:
        text: Raw text to score.
        num_sents: Accepted for signature parity with ``d_value``; not used.

    Returns:
        Tuple ``(score, num_words, pw)`` where ``pw`` is the percentage of
        polysyllabic words (0 for empty text) and
        ``score = 0.4 * (num_words + pw)``.
    """
    # \w is Unicode-aware for str patterns; underscores are stripped
    # explicitly because \w would otherwise keep them.
    words = re.sub(r'[^\w\s]|_', '', text).lower().split()
    num_words = len(words)

    vowel_groups = re.compile(r'[aeiouyàáâãäåèéêëìíîïòóôõöùúûüýÿ]+')
    polysyllable_words = sum(
        1 for word in words if len(vowel_groups.findall(word)) >= 3
    )

    pw = (polysyllable_words / num_words * 100) if num_words > 0 else 0
    return 0.4 * (num_words + pw), num_words, pw


def calculate_hdd(tokens, n=42):
    """Return the expected number of distinct types in a sample of *n* tokens.

    For every distinct token the hypergeometric probability of drawing it at
    least once in a sample of ``n`` tokens (without replacement) is computed,
    and these probabilities are summed. The sum is not divided by ``n``.

    Args:
        tokens: Sequence of hashable tokens.
        n: Sample size.

    Returns:
        The summed probabilities as a float. When there are fewer than ``n``
        tokens, the number of distinct tokens is returned instead.
    """
    total = len(tokens)
    if total < n:
        return float(len(set(tokens)))

    all_samples = math.comb(total, n)
    expected_types = 0.0
    for freq in Counter(tokens).values():
        # math.comb is 0 when fewer than n other tokens remain, which makes
        # the "never drawn" probability 0 for that type.
        never_drawn = math.comb(total - freq, n) / all_samples
        expected_types += 1.0 - never_drawn
    return expected_types