import logging
import math

from spacy.tokens import Doc, Token
|
|
class ConsecutioAnalyzer:
    """Compute tense-agreement ("consecutio") and tree-depth metrics for a parsed spaCy document.

    The analyzer rebuilds a lightweight dependency tree per sentence, groups
    each main verb with its auxiliaries to infer a coarse tense label, and
    accumulates counters that ``analyze`` turns into ratios.
    """

    def __init__(self):
        self.reset_metrics()

    def reset_metrics(self):
        """Reset every accumulator so the instance can process a new document."""
        self.total_sentences = 0
        self.sum_max_depth = 0
        self.total_verb_pairs = 0
        self.valid_consecutio_pairs = 0
        self.total_words = 0
        self.total_verb_groups = 0
        self.root_tenses_list = []

    class TokenNode:
        """Tree node holding the fields of one spaCy token that the analysis reads."""

        def __init__(self, spacy_token):
            self.id = str(spacy_token.i)
            self.form = spacy_token.text
            self.pos_tag = spacy_token.tag_
            self.dep_rel = spacy_token.dep_
            # A token that is its own head is the sentence root.  Keep this as
            # an explicit flag: the legacy "0" sentinel stored in ``head_id``
            # is also the genuine id of the first token of the document, so
            # ``head_id == "0"`` alone cannot tell the two cases apart.
            self.is_root = spacy_token.head.i == spacy_token.i
            self.head_id = "0" if self.is_root else str(spacy_token.head.i)
            self.children = []
            self.compound_tense = ""
            self.is_compound_head = False

        def is_verb(self):
            """Return True when the tag starts with "V" or "MD", or equals "AUX"."""
            return self.pos_tag.startswith(("V", "MD")) or self.pos_tag == "AUX"

        def is_auxiliary_rel(self):
            """Return True when the dependency label contains "aux" (case-insensitive)."""
            return "aux" in self.dep_rel.lower()

        def get_simple_tense(self):
            """Map the node's own tag to a coarse tense label, or "N/A" if unmapped."""
            tag = self.pos_tag
            if tag in ("VBD", "VBN"):
                return "Past"
            if tag in ("VBP", "VBZ"):
                return "Pres"
            if tag == "VBG":
                return "Ger"
            if tag == "MD":
                return "Mod"
            if tag == "VB":
                return "Inf"
            return "N/A"

    def analyze(self, doc):
        """Analyze a spaCy ``doc`` and return the aggregated metrics.

        Implements analyzeSingleDocument from consecutio.java.

        Returns a dict with the keys ``tense_stability``, ``avg_depth``,
        ``consecutio_index``, ``verb_density`` and ``sentence_depths``, each
        rounded to four decimals.  Counters are reset at the start of every
        call.
        """
        self.reset_metrics()

        for sent in doc.sents:
            self.total_sentences += 1
            self.total_words += sum(1 for tok in sent if not tok.is_punct)

            try:
                root = self._build_sentence_tree(sent)
                if root is None:
                    self.root_tenses_list.append("N/A")
                    continue

                self._process_compound_tenses(root)
                # NOTE(review): a non-verb root keeps compound_tense == "" and
                # is therefore counted as a valid root by the stability
                # metric -- confirm whether it should be recorded as "N/A".
                self.root_tenses_list.append(root.compound_tense)
                self.sum_max_depth += self._calculate_tree_depth(root)
                self._check_consecutio_recursively(root)
            except Exception:
                # Best-effort: one unparsable sentence must not abort the
                # document, but the failure is reported instead of vanishing.
                logging.getLogger(__name__).warning(
                    "Skipping sentence: consecutio tree analysis failed",
                    exc_info=True,
                )
                continue

        stability = self._calculate_dominant_stability()
        avg_depth = (
            self.sum_max_depth / self.total_sentences
            if self.total_sentences > 0
            else 0.0
        )
        # With no verb pairs there is nothing to violate, hence the 1.0 default.
        consecutio_score = (
            self.valid_consecutio_pairs / self.total_verb_pairs
            if self.total_verb_pairs > 0
            else 1.0
        )
        verb_density = (
            self.total_verb_groups / self.total_words
            if self.total_words > 0
            else 0.0
        )
        sentence_depths = self.calculate_average_graph_depth(doc)

        return {
            "tense_stability": round(stability, 4),
            "avg_depth": round(avg_depth, 4),
            "consecutio_index": round(consecutio_score, 4),
            "verb_density": round(verb_density, 4),
            "sentence_depths": round(sentence_depths, 4),
        }

    def _build_sentence_tree(self, sent):
        """Link the sentence's tokens into a TokenNode tree; return its root or None."""
        node_map = {str(tok.i): self.TokenNode(tok) for tok in sent}
        root = None
        for node in node_map.values():
            if node.is_root or node.dep_rel.lower() == "root":
                root = node
            elif node.head_id in node_map:
                node_map[node.head_id].children.append(node)
        return root

    def _process_compound_tenses(self, node):
        """Walk the subtree and label every non-auxiliary verb with its compound tense.

        Each labelled verb is flagged as a compound head and counted in
        ``total_verb_groups``.
        """
        aux_children = []
        for child in node.children:
            if child.is_auxiliary_rel():
                aux_children.append(child)
            self._process_compound_tenses(child)

        if node.is_verb() and not node.is_auxiliary_rel():
            node.is_compound_head = True
            self.total_verb_groups += 1
            group = [node] + aux_children
            node.compound_tense = self._infer_compound_tense(group, node)

    def _infer_compound_tense(self, group, head):
        """Derive a tense label for ``head`` from the auxiliaries in ``group``.

        Precedence: a modal auxiliary other than will/have gives "Mod", a
        "will"/"'ll" auxiliary gives "Fut", a have-auxiliary with a VBN head
        gives "Perf"; otherwise the head's own simple tense is returned.
        """
        has_will = has_have = has_modal = False
        for member in group:
            if member is head:
                continue
            form = member.form.lower()
            if "will" in form or "'ll" in form:
                has_will = True
            elif any(stem in form for stem in ("have", "has", "had")):
                has_have = True
            elif member.pos_tag == "MD":
                has_modal = True

        if has_modal:
            return "Mod"
        if has_will:
            return "Fut"
        if has_have and head.pos_tag == "VBN":
            return "Perf"
        return head.get_simple_tense()

    def _calculate_tree_depth(self, node):
        """Return the depth of the subtree at ``node``, ignoring auxiliary children.

        A node without (non-auxiliary) children has depth 1.
        """
        return 1 + max(
            (
                self._calculate_tree_depth(child)
                for child in node.children
                if not child.is_auxiliary_rel()
            ),
            default=0,
        )

    def _check_consecutio_recursively(self, parent):
        """Count parent/child verb-group pairs and how many of them agree in tense."""
        for child in parent.children:
            if child.is_auxiliary_rel():
                continue
            if parent.is_compound_head and child.is_compound_head:
                self.total_verb_pairs += 1
                if self._is_consecutio_valid(
                    parent.compound_tense, child.compound_tense
                ):
                    self.valid_consecutio_pairs += 1
            self._check_consecutio_recursively(child)

    def _is_consecutio_valid(self, p, c):
        """Return False only for a past-group parent with a "Pres" or "Ger" child."""
        if self._is_pres_group(p):
            return True
        if self._is_past_group(p) and c in ("Pres", "Ger"):
            return False
        return True

    def _is_past_group(self, t):
        """Return True when the label contains "Past", "Perf" or "Mod"."""
        return "Past" in t or "Perf" in t or "Mod" in t

    def _is_pres_group(self, t):
        """Return True when the label contains "Pres", "Fut" or "Ger"."""
        return "Pres" in t or "Fut" in t or "Ger" in t

    def _calculate_dominant_stability(self):
        """Return the share of root tenses that belong to the dominant tense group.

        Roots recorded as "N/A" are ignored; ties favour the past group.
        Returns 0.0 when no usable root tense was recorded.
        """
        valid_roots = [t for t in self.root_tenses_list if t != "N/A"]
        if not valid_roots:
            return 0.0

        past_count = sum(1 for t in valid_roots if self._is_past_group(t))
        pres_count = sum(1 for t in valid_roots if self._is_pres_group(t))
        in_dominant = (
            self._is_past_group if past_count >= pres_count else self._is_pres_group
        )
        aligned = sum(1 for t in valid_roots if in_dominant(t))
        return aligned / len(valid_roots)

    @staticmethod
    def calculate_average_graph_depth(doc: "Doc") -> float:
        """Compute the mean token depth over all sentences of the spaCy document.

        The root of a sentence has depth 1.  Returns 0.0 when the document
        contains no tokens.
        """
        total_depth = 0
        total_nodes = 0

        for sent in doc.sents:
            # Per-sentence memo so shared ancestors are resolved only once.
            depths = {}
            for token in sent:
                total_depth += ConsecutioAnalyzer._get_token_depth(token, depths)
            total_nodes += len(sent)

        return total_depth / total_nodes if total_nodes > 0 else 0.0

    @staticmethod
    def _get_token_depth(token: "Token", depths: dict) -> int:
        """Recursively compute the depth of a spaCy token, memoized in ``depths``."""
        if token.i in depths:
            return depths[token.i]

        if token.head.i == token.i:
            # A token that is its own head is the root.
            depth = 1
        else:
            depth = ConsecutioAnalyzer._get_token_depth(token.head, depths) + 1

        depths[token.i] = depth
        return depth