# File size: 8,114 Bytes
import logging
import math

from spacy.tokens import Doc, Token
class ConsecutioAnalyzer:
    """Compute tense-sequence ("consecutio") metrics over a parsed spaCy doc.

    The source comments describe this class as a port of ``consecutio.java``.
    For every sentence it rebuilds a small dependency tree of ``TokenNode``
    objects, assigns a compound tense label to each non-auxiliary verb,
    and accumulates counters that ``analyze`` turns into ratios.
    """

    _logger = logging.getLogger(__name__)

    def __init__(self):
        self.reset_metrics()

    def reset_metrics(self):
        """Reset every accumulator to its initial (empty) state."""
        self.total_sentences = 0
        self.sum_max_depth = 0
        self.total_verb_pairs = 0
        self.valid_consecutio_pairs = 0
        self.total_words = 0
        self.total_verb_groups = 0
        self.root_tenses_list = []

    class TokenNode:
        """Tree node wrapping the attributes of one spaCy token.

        Attributes:
            id: Document index of the token (``token.i``) as a string.
            form: Surface text of the token.
            pos_tag: Fine-grained tag (``token.tag_``, e.g. VBD, VBP); the
                tense rules are keyed on these tags.
            dep_rel: Dependency label (``token.dep_``).
            head_id: Document index of the head as a string, or ``"0"`` for
                the sentence root (the convention used by the Java original).
            is_root: True when the token is its own head.
            children: Child nodes, filled in by ``analyze``.
            compound_tense: Tense label assigned to verb-group heads.
            is_compound_head: True for a verb that is not an auxiliary.
        """

        def __init__(self, spacy_token):
            self.id = str(spacy_token.i)
            self.form = spacy_token.text
            self.pos_tag = spacy_token.tag_
            self.dep_rel = spacy_token.dep_
            # In spaCy the root points at itself; the Java code marks the root
            # with head "0".  spaCy indices are 0-based, so "0" is ALSO the id
            # of the first token of the document: head_id alone cannot tell
            # "I am the root" from "my head is token 0".  is_root removes the
            # ambiguity without changing the id/head_id values.
            self.is_root = spacy_token.head.i == spacy_token.i
            self.head_id = "0" if self.is_root else str(spacy_token.head.i)
            self.children = []
            self.compound_tense = ""
            self.is_compound_head = False

        def is_verb(self):
            """Return True when the tag starts with "V" or "MD", or is "AUX"."""
            return self.pos_tag.startswith(("V", "MD")) or self.pos_tag == "AUX"

        def is_auxiliary_rel(self):
            """Return True when the dependency label contains "aux"."""
            return "aux" in self.dep_rel.lower()

        def get_simple_tense(self):
            """Map the fine-grained tag to a tense label, "N/A" if unmapped."""
            tag = self.pos_tag
            if tag in ("VBD", "VBN"):
                return "Past"
            if tag in ("VBP", "VBZ"):
                return "Pres"
            if tag == "VBG":
                return "Ger"
            if tag == "MD":
                return "Mod"
            if tag == "VB":
                return "Inf"
            return "N/A"

    def analyze(self, doc):
        """Analyze a spaCy ``doc`` and return the aggregated metrics.

        All counters are reset first, so results never mix between calls.
        A sentence whose tree analysis raises is logged and skipped; it is
        still counted in ``total_sentences`` and ``total_words``.

        Args:
            doc: Object exposing ``sents``; each sentence yields tokens with
                ``i``, ``text``, ``tag_``, ``dep_``, ``head`` and ``is_punct``.

        Returns:
            dict with keys ``tense_stability``, ``avg_depth``,
            ``consecutio_index``, ``verb_density`` and ``sentence_depths``,
            each rounded to 4 decimals.
        """
        self.reset_metrics()

        for sent in doc.sents:
            self.total_sentences += 1
            # Word count excludes punctuation.
            self.total_words += sum(1 for tok in sent if not tok.is_punct)
            try:
                self._analyze_sentence(sent)
            except Exception:
                # Best-effort: one bad sentence must not abort the document,
                # but the failure is reported instead of silently dropped.
                self._logger.warning(
                    "Skipping sentence that could not be analyzed",
                    exc_info=True,
                )

        stability = self._calculate_dominant_stability()
        avg_depth = (
            self.sum_max_depth / self.total_sentences
            if self.total_sentences > 0 else 0.0
        )
        # With no verb pairs there is nothing to violate, hence 1.0.
        consecutio_score = (
            self.valid_consecutio_pairs / self.total_verb_pairs
            if self.total_verb_pairs > 0 else 1.0
        )
        verb_density = (
            self.total_verb_groups / self.total_words
            if self.total_words > 0 else 0.0
        )
        sentence_depths = self.calculate_average_graph_depth(doc)

        return {
            "tense_stability": round(stability, 4),
            "avg_depth": round(avg_depth, 4),
            "consecutio_index": round(consecutio_score, 4),
            "verb_density": round(verb_density, 4),
            "sentence_depths": round(sentence_depths, 4)
        }

    def _analyze_sentence(self, sent):
        """Build the node tree for one sentence and update the counters."""
        node_map = {}
        for tok in sent:
            node = self.TokenNode(tok)
            node_map[node.id] = node

        # Rebuild the parent -> children hierarchy.
        root = None
        for node in node_map.values():
            if node.is_root or node.dep_rel.lower() == "root":
                root = node
            elif node.head_id in node_map:
                node_map[node.head_id].children.append(node)

        if root is None:
            self.root_tenses_list.append("N/A")
            return

        self._process_compound_tenses(root)
        self.root_tenses_list.append(root.compound_tense)
        self.sum_max_depth += self._calculate_tree_depth(root)
        self._check_consecutio_recursively(root)

    def _process_compound_tenses(self, node):
        """Label every non-auxiliary verb under ``node`` with its tense.

        Children are processed first; each verb head is grouped with its
        direct auxiliary children and counted in ``total_verb_groups``.
        """
        aux_children = []
        for child in node.children:
            if child.is_auxiliary_rel():
                aux_children.append(child)
            self._process_compound_tenses(child)

        if node.is_verb() and not node.is_auxiliary_rel():
            node.is_compound_head = True
            self.total_verb_groups += 1
            group = [node] + aux_children
            node.compound_tense = self._infer_compound_tense(group, node)

    def _infer_compound_tense(self, group, head):
        """Return the tense label of a verb group (head plus auxiliaries).

        Priority: any other modal -> "Mod"; will/'ll -> "Fut"; have/has/had
        with a VBN head -> "Perf"; otherwise the head's simple tense.
        """
        has_will, has_have, has_modal = False, False, False
        for member in group:
            if member is head:
                continue
            form = member.form.lower()
            if "will" in form or "'ll" in form:
                has_will = True
            elif any(stem in form for stem in ("have", "has", "had")):
                has_have = True
            elif member.pos_tag == "MD":
                has_modal = True

        if has_modal:
            return "Mod"
        if has_will:
            return "Fut"
        if has_have and head.pos_tag == "VBN":
            return "Perf"
        return head.get_simple_tense()

    def _calculate_tree_depth(self, node):
        """Return the depth of the subtree at ``node`` (a leaf counts as 1).

        Auxiliary children are ignored when descending.
        """
        if not node.children:
            return 1
        deepest = max(
            (
                self._calculate_tree_depth(child)
                for child in node.children
                if not child.is_auxiliary_rel()
            ),
            default=0,
        )
        return 1 + deepest

    def _check_consecutio_recursively(self, parent):
        """Count parent/child verb-head pairs and how many are valid."""
        for child in parent.children:
            if child.is_auxiliary_rel():
                continue
            if parent.is_compound_head and child.is_compound_head:
                self.total_verb_pairs += 1
                if self._is_consecutio_valid(
                    parent.compound_tense, child.compound_tense
                ):
                    self.valid_consecutio_pairs += 1
            self._check_consecutio_recursively(child)

    def _is_consecutio_valid(self, p, c):
        """Return whether child tense ``c`` is accepted under parent ``p``.

        A present-group parent accepts anything; a past-group parent rejects
        "Pres" and "Ger" children; any other parent label accepts anything.
        """
        if self._is_pres_group(p):
            return True
        if self._is_past_group(p):
            if c == "Pres" or c == "Ger":
                return False
        return True

    def _is_past_group(self, t):
        """Return True when ``t`` contains "Past", "Perf" or "Mod"."""
        return "Past" in t or "Perf" in t or "Mod" in t

    def _is_pres_group(self, t):
        """Return True when ``t`` contains "Pres", "Fut" or "Ger"."""
        return "Pres" in t or "Fut" in t or "Ger" in t

    def _calculate_dominant_stability(self):
        """Return the share of root tenses in the dominant tense group.

        Roots labelled "N/A" are ignored; ties go to the past group.
        Returns 0.0 when no root tense was recorded.
        """
        valid_roots = [t for t in self.root_tenses_list if t != "N/A"]
        if not valid_roots:
            return 0.0

        past_count = sum(1 for t in valid_roots if self._is_past_group(t))
        pres_count = sum(1 for t in valid_roots if self._is_pres_group(t))
        in_dominant = (
            self._is_past_group if past_count >= pres_count
            else self._is_pres_group
        )
        aligned = sum(1 for t in valid_roots if in_dominant(t))
        return aligned / len(valid_roots)

    @staticmethod
    def calculate_average_graph_depth(doc: "Doc") -> float:
        """Return the mean token depth over all sentences of ``doc``.

        The root of a sentence has depth 1 and every other token has its
        head's depth plus one.  Returns 0.0 when there are no tokens.
        """
        total_depth = 0
        total_nodes = 0

        for sent in doc.sents:
            # Per-sentence memo keyed by the token's document index.
            depths = {}
            for token in sent:
                total_depth += ConsecutioAnalyzer._get_token_depth(token, depths)
                total_nodes += 1

        return total_depth / total_nodes if total_nodes > 0 else 0.0

    @staticmethod
    def _get_token_depth(token: "Token", depths: dict) -> int:
        """Return the depth of ``token``, caching results in ``depths``.

        Walks up the head chain iteratively (no recursion limit involved)
        until it reaches a cached ancestor or the root, then fills in the
        depth of every token visited on the way.
        """
        pending = []
        current = token
        # In spaCy the root is its own head.
        while current.i not in depths and current.head.i != current.i:
            pending.append(current)
            current = current.head

        if current.i not in depths:
            depths[current.i] = 1

        depth = depths[current.i]
        for tok in reversed(pending):
            depth += 1
            depths[tok.i] = depth
        return depths[token.i]