| from collections import Counter |
|
|
|
|
class SyntaxModule:
    """Compute syntactic statistics over a dependency-parsed document.

    Tokens are expected to expose ``i``, ``text``, ``pos_``, ``dep_``,
    ``tag_``, ``lemma_``, ``morph``, ``head``, ``children`` and ``sent``.
    NOTE(review): these attribute names match spaCy's ``Token`` API, so the
    inputs are presumably spaCy ``Doc``/``Span`` objects -- confirm with the
    caller.
    """

    # Coarse POS tags treated as verbs.
    _VERB_POS = frozenset({"VERB", "AUX"})

    # Fine-grained tags used to bucket verbs when ``lang`` is not "it".
    _EN_PRESENT_TAGS = frozenset({"VB", "VBP", "VBZ"})
    _EN_PAST_TAGS = frozenset({"VBD"})
    _EN_PARTICIPLE_TAGS = frozenset({"VBN", "VBG"})

    # Subject-like dependents; both the "nsubjpass" and the "nsubj:pass"
    # spellings are listed so passive subjects are caught in either scheme.
    _SUBJECT_DEPS = frozenset({
        "nsubj", "nsubjpass", "nsubj:pass",
        "csubj", "csubjpass", "csubj:pass",
        "expl",
    })
    # Object/complement-like dependents.
    _OBJECT_DEPS = frozenset({
        "obj", "dobj", "pobj", "dative", "ccomp", "xcomp", "acomp",
    })

    # Adverbial / adnominal / relative clause labels.
    _NON_COMP_SUB_DEPS = frozenset({
        "advcl", "acl", "relcl", "acl:relcl", "advcl:relcl",
    })
    # Labels counted for ``sub_ratio``.
    _SUBORDINATE_DEPS = frozenset({
        "csubj", "ccomp", "parataxis", "advcl", "acl", "xcomp",
        "advcl:relcl", "csubj:pass", "csubj:outer", "acl:relcl",
    })
    _REL_CLAUSE_DEPS = frozenset({"relcl", "acl:relcl", "advcl:relcl"})

    # Labels counted as noun modifiers for ``mod_per_noun``.
    _NOUN_MOD_DEPS = frozenset({
        "amod", "nmod", "poss", "nummod", "prep", "case", "mark",
    })
    _NOMINAL_HEAD_POS = frozenset({"NOUN", "PROPN"})

    def __init__(self):
        """Build the per-language adjective lemma sets used by ``analyze``."""
        # English adjective lemmas (lower-case).
        self.adj_attrr_en = {
            "good", "new", "first", "last", "long", "great", "little", "own", "other", "old",
            "right", "big", "high", "different", "small", "large", "next", "early", "young", "important",
            "few", "public", "bad", "same", "able", "black", "white", "green", "blue", "red",
            "serious", "happy", "strong", "special", "weak", "wide", "beautiful", "nice", "smart", "tall",
            "major", "real", "basic", "full", "free", "perfect", "dark", "light", "hard", "clear",
            "simple", "cold", "warm", "rich", "strange", "sure", "deep", "local", "quick", "cheap",
            "direct", "tight", "open", "empty", "fine", "short", "dry", "loud", "safe", "calm",
            "dead", "mad", "sharp", "hot", "raw", "narrow", "round", "smooth", "soft", "heavy",
            "solid", "pure", "honest", "brave", "gentle", "wild", "shy", "bold", "bright", "fresh",
            "sweet", "bitter", "sour", "thick", "thin", "flat", "low", "poor", "quiet", "dangerous",
            "sad", "glad", "angry", "kind", "cruel", "stupid", "wise", "foolish", "proud", "humble",
            "funny", "normal", "familiar", "common", "rare", "possible", "impossible", "necessary",
            "uncertain", "confident", "hidden", "visible", "vague", "excited", "bored",
            "interested", "worried", "nervous", "relaxed", "tense", "helpful", "harmful", "useful",
            "useless", "valuable", "worthless", "trivial", "actual", "potential", "specific",
            "general", "particular", "universal", "individual", "collective", "personal",
            "main", "primary", "secondary", "unique", "typical", "standard",
            "advanced", "modern", "traditional", "contemporary", "recent", "current", "past", "future",
            "temporary", "permanent", "brief", "intermediate", "initial", "final",
            "external", "internal", "global", "national", "international", "central",
            "minor", "significant", "essential", "critical", "fundamental", "key", "core",
            "active", "passive", "indirect", "positive", "negative", "neutral",
            "absolute", "relative", "total", "partial", "complete", "incomplete", "complex",
            "theoretical", "practical", "empirical", "systematic",
            "random", "broad", "limited",
            "extensive", "intensive", "substantial", "minimal", "maximum", "minimum", "average",
            "natural", "artificial", "original", "conventional",
            "innovative", "experimental", "alternative", "tertiary",
            "preliminary", "ultimate",
            "concrete", "abstract", "mental", "physical", "emotional",
            "intellectual", "cultural", "social", "economic", "political", "scientific", "technical",
            "professional", "commercial", "industrial", "agricultural", "medical", "legal", "financial",
            "educational", "environmental", "historical", "geographical", "mathematical", "linguistic",
            "psychological", "philosophical", "artistic", "literary", "musical", "digital", "analog",
            "electronic", "mechanical", "nuclear", "biological", "chemical", "organic", "inorganic",
            "static", "dynamic", "stable", "unstable", "consistent", "inconsistent", "reliable",
            "unreliable", "valid", "invalid", "accurate", "inaccurate", "precise", "imprecise",
            "certain", "definite", "indefinite", "probable",
            "improbable", "likely", "unlikely", "unnecessary", "nonessential",
            "mandatory", "optional", "compulsory", "voluntary", "peripheral",
            "marginal", "insignificant",
            "strategic", "tactical", "operational", "administrative", "executive", "legislative",
            "judicial", "regulatory", "preventive", "corrective", "proactive", "reactive", "adaptive",
            "responsive", "interactive", "collaborative", "competitive", "cooperative", "constructive",
            "destructive", "productive", "unproductive", "effective", "ineffective", "efficient",
            "inefficient", "sustainable", "unsustainable", "viable", "unviable", "progressive",
            "regressive", "radical", "conservative", "liberal", "moderate", "extreme", "mainstream",
        }

        # Italian adjective lemmas (lower-case).
        self.adj_attrr_it = {
            "grande", "piccolo", "lungo", "corto", "alto", "basso",
            "largo", "stretto", "grosso", "sottile", "spesso", "piatto",
            "rotondo", "quadrato", "dritto", "curvo", "pesante", "leggero",
            "duro", "morbido", "liscio", "ruvido", "caldo", "freddo",
            "bollente", "gelido", "tiepido",

            "nero", "bianco", "rosso", "verde", "blu", "giallo",
            "arancione", "viola", "rosa", "grigio", "marrone", "azzurro",
            "dorato", "argentato", "scuro", "chiaro", "pallido", "vivace",

            "buono", "cattivo", "onesto", "disonesto", "coraggioso", "vigliacco",
            "gentile", "crudele", "generoso", "avaro", "giusto", "ingiusto",
            "fedele", "leale", "sleale", "umile", "arrogante",
            "paziente", "impaziente", "saggio", "stolto", "prudente", "temerario",
            "sincero", "falso", "ingenuo", "furbo",

            "intelligente", "stupido", "brillante", "mediocre", "capace",
            "incapace", "abile", "inabile", "esperto", "inesperto",
            "colto", "ignorante", "istruito",

            "bello", "brutto", "elegante", "goffo", "raffinato", "rozzo",
            "attraente", "affascinante", "noioso", "interessante",
            "banale", "originale", "creativo",

            "nuovo", "vecchio", "antico", "moderno", "recente", "passato",
            "futuro", "attuale", "contemporaneo", "storico", "tradizionale",
            "obsoleto", "innovativo", "eterno", "temporaneo", "definitivo",

            "vicino", "lontano", "centrale", "periferico", "interno", "esterno",
            "locale", "globale", "nazionale", "internazionale", "regionale",
            "urbano", "rurale", "pubblico", "privato",

            "intero", "parziale", "completo", "incompleto", "totale",
            "principale", "secondario", "unico", "raro", "comune", "frequente",
            "scarso", "abbondante", "sufficiente", "insufficiente",
            "massimo", "minimo", "medio", "normale", "eccezionale",

            "simile", "diverso", "uguale", "opposto", "specifico", "generico",
            "particolare", "generale", "tipico", "atipico", "naturale",
            "artificiale", "reale", "virtuale", "concreto", "astratto",
            "positivo", "negativo", "neutro", "relativo", "assoluto",

            "felice", "triste", "arrabbiato", "calmo", "ansioso", "sereno",
            "agitato", "tranquillo", "nervoso", "rilassato", "allegro",
            "malinconico", "entusiasta", "deluso", "soddisfatto", "insoddisfatto",

            "scientifico", "tecnico", "culturale", "sociale", "economico",
            "politico", "giuridico", "medico", "biologico", "chimico",
            "fisico", "matematico", "linguistico", "letterario", "artistico",
            "filosofico", "psicologico", "geografico",
            "professionale", "accademico", "scolastico", "educativo",

            "certo", "incerto", "probabile", "improbabile", "possibile",
            "impossibile", "necessario", "inutile", "fondamentale", "marginale",
            "importante", "irrilevante", "significativo", "trascurabile",
            "evidente", "oscuro", "ambiguo", "definito", "vago",
        }

    def get_node_depth(self, token):
        """Return the number of nodes on the head chain from ``token`` upward.

        The walk stops at the first token that is its own head, so such a
        token has depth 1 and each step away from it adds one. The loop does
        not terminate if no token on the chain is its own head.
        """
        depth = 1
        curr = token
        while curr.head != curr:
            depth += 1
            curr = curr.head
        return depth

    def _count_inversions(self, verb):
        """Count dependents of ``verb`` on the opposite side from SVO order.

        A subject-like child positioned after the verb, or an object-like
        child positioned before it, counts as one inversion.
        """
        inversions = 0
        for child in verb.children:
            if child.dep_ in self._SUBJECT_DEPS and child.i > verb.i:
                inversions += 1
            elif child.dep_ in self._OBJECT_DEPS and child.i < verb.i:
                inversions += 1
        return inversions

    @staticmethod
    def _ratio(numerator, denominator, digits=4):
        """Return ``numerator / denominator`` rounded, or 0 if the denominator is 0."""
        if not denominator:
            return 0
        return round(numerator / denominator, digits)

    def analyze(self, doc, sentences, lang="en"):
        """Return a dict of syntactic features for ``doc``.

        Args:
            doc: Sized iterable of tokens for the whole text.
            sentences: Iterable of sentences, each an iterable of tokens.
                Any iterable is accepted (it is copied into a list once);
                ``None`` is treated as no sentences.
            lang: ``"it"`` selects the Italian adjective set and
                morphology-based tense counting; any other value selects the
                English adjective set and tag-based tense counting.

        Returns:
            A dict with these keys:

            * ``avg_sent_len``: tokens per sentence.
            * ``root_dist``: mean absolute index distance between a token and
              the root of its sentence (``token.sent.root``).
            * ``sub_ratio``: tokens carrying a subordinate-clause label, per
              sentence.
            * ``present_ratio`` / ``past_ratio`` / ``participle_ratio``:
              share of VERB/AUX tokens in each bucket.
            * ``rel_clauses_per_sent``: relative-clause labels per sentence.
            * ``attr_adjs_freq`` / ``attr_adjs_ratio``: ADJ tokens whose
              lower-cased lemma is in the active adjective set, relative to
              all tokens / all ADJ tokens.
            * ``adj_x_sent`` / ``adj_count``: ADJ tokens per sentence / total.
            * ``mod_per_noun``: noun-modifier labels per NOUN token.
            * ``punct_pairs_per_sent``: ``count(",", ";") // 2`` per sentence,
              averaged.
            * ``svo_inversions_per_sent``: inverted subject/object dependents
              per sentence.
            * ``non_comp_sub_per_sent``: adverbial/adnominal/relative clause
              labels per sentence.

            Per-sentence values divide by 1 when there are no sentences, and
            token-based values divide by 1 when ``doc`` is empty.
        """
        active_adj = self.adj_attrr_it if lang == "it" else self.adj_attrr_en

        # Materialise once: the sentences are both counted and iterated, and
        # a generator supports neither len() nor a second pass.
        sentences = list(sentences) if sentences is not None else []
        num_sents = len(sentences) or 1
        num_tokens = len(doc) or 1

        total_punct_pairs = 0
        total_svo_inversions = 0
        total_non_comp_subs = 0
        for sent in sentences:
            punct_count = sum(1 for t in sent if t.text in (",", ";"))
            total_punct_pairs += punct_count // 2

            for token in sent:
                if token.pos_ in self._VERB_POS:
                    total_svo_inversions += self._count_inversions(token)
                if token.dep_ in self._NON_COMP_SUB_DEPS:
                    total_non_comp_subs += 1

        verbs = [t for t in doc if t.pos_ in self._VERB_POS]
        if lang == "it":
            # Italian buckets come from morphological features, not tags.
            present = sum(1 for v in verbs if v.morph.get("Tense") == ["Pres"])
            past = sum(1 for v in verbs if v.morph.get("Tense") == ["Past"])
            parts = sum(1 for v in verbs if v.morph.get("VerbForm") == ["Part"])
        else:
            present = sum(1 for v in verbs if v.tag_ in self._EN_PRESENT_TAGS)
            past = sum(1 for v in verbs if v.tag_ in self._EN_PAST_TAGS)
            parts = sum(1 for v in verbs if v.tag_ in self._EN_PARTICIPLE_TAGS)
        num_verbs = len(verbs)

        nsub = sum(1 for t in doc if t.dep_ in self._SUBORDINATE_DEPS)
        total_root_dist = sum(abs(t.i - t.sent.root.i) for t in doc)
        rel_clauses = sum(1 for t in doc if t.dep_ in self._REL_CLAUSE_DEPS)

        adj_count = sum(1 for t in doc if t.pos_ == "ADJ")
        attr_adjs = sum(
            1 for t in doc
            if t.pos_ == "ADJ" and t.lemma_.lower() in active_adj
        )

        # NOTE(review): modifiers attached to PROPN heads are counted, but
        # the denominator counts NOUN tokens only -- confirm this is intended.
        nouns = sum(1 for t in doc if t.pos_ == "NOUN")
        mods = sum(
            1 for t in doc
            if t.dep_ in self._NOUN_MOD_DEPS
            and t.head.pos_ in self._NOMINAL_HEAD_POS
        )

        return {
            "avg_sent_len": round(num_tokens / num_sents, 2),
            "root_dist": round(total_root_dist / num_tokens, 2),
            "sub_ratio": round(nsub / num_sents, 4),
            "present_ratio": self._ratio(present, num_verbs),
            "past_ratio": self._ratio(past, num_verbs),
            "participle_ratio": self._ratio(parts, num_verbs),
            "rel_clauses_per_sent": round(rel_clauses / num_sents, 4),
            "attr_adjs_freq": round(attr_adjs / num_tokens, 4),
            "attr_adjs_ratio": self._ratio(attr_adjs, adj_count),
            "adj_x_sent": round(adj_count / num_sents, 4),
            "adj_count": adj_count,
            "mod_per_noun": self._ratio(mods, nouns),
            "punct_pairs_per_sent": round(total_punct_pairs / num_sents, 4),
            "svo_inversions_per_sent": round(total_svo_inversions / num_sents, 4),
            "non_comp_sub_per_sent": round(total_non_comp_subs / num_sents, 4),
        }
|
|