fix math term detection
Browse files- handler.py +18 -12
handler.py
CHANGED
|
@@ -265,23 +265,29 @@ def load_math_terms():
|
|
| 265 |
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
| 266 |
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
| 267 |
else:
|
| 268 |
-
|
| 269 |
-
|
| 270 |
return math_terms, math_terms_dict
|
| 271 |
|
| 272 |
def run_math_density(transcript):
|
| 273 |
math_terms, math_terms_dict = load_math_terms()
|
| 274 |
-
|
| 275 |
-
|
| 276 |
text = utt.get_clean_text(remove_punct=False)
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
|
| 286 |
class EndpointHandler():
|
| 287 |
def __init__(self, path="."):
|
|
|
|
| 265 |
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
| 266 |
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
| 267 |
else:
|
| 268 |
+
math_terms.append(term)
|
| 269 |
+
math_terms_dict[term] = term
|
| 270 |
return math_terms, math_terms_dict
|
| 271 |
|
| 272 |
def run_math_density(transcript):
    """Annotate each utterance of ``transcript`` with the math terms it contains.

    The regex patterns returned by ``load_math_terms()`` are tried in order of
    decreasing pattern-string length. A match is discarded when its start
    offset falls inside a span already claimed by an earlier (longer) pattern,
    so that a short term embedded in a longer one is not counted twice.

    Args:
        transcript: object exposing an ``utterances`` iterable. Each utterance
            must provide ``get_clean_text(remove_punct=False)`` returning the
            text to search.

    Returns:
        None. Two attributes are set on every utterance:
        ``num_math_terms`` -- the number of accepted regex matches, and
        ``math_terms`` -- the ``math_terms_dict`` values of every pattern that
        had at least one accepted match, in longest-pattern-first order.
    """
    math_terms, math_terms_dict = load_math_terms()

    # Compile every pattern once, longest pattern string first, instead of
    # handing the raw string to ``re`` again for every utterance.
    compiled_terms = [
        (term, re.compile(term, re.IGNORECASE))
        for term in sorted(math_terms, key=len, reverse=True)
    ]

    for utt in transcript.utterances:
        text = utt.get_clean_text(remove_punct=False)
        num_matches = 0
        matched_positions = set()  # (start, end) spans already claimed
        match_list = []

        for term, pattern in compiled_terms:
            # Keep only matches whose start is outside every claimed span.
            # NOTE(review): the word-boundary patterns built in
            # load_math_terms() consume the neighbouring non-letter
            # characters, so these spans include them; a term separated by a
            # single character from a preceding longer match starts inside
            # that span and is dropped -- confirm this is intended.
            matches = [
                match
                for match in pattern.finditer(text)
                if not any(
                    start <= match.start() < end
                    for start, end in matched_positions
                )
            ]
            if not matches:
                continue

            match_list.append(math_terms_dict[term])
            # Claim the new spans only after filtering, so matches of the
            # same pattern never exclude one another.
            matched_positions.update(
                (match.start(), match.end()) for match in matches
            )
            num_matches += len(matches)

        utt.num_math_terms = num_matches
        utt.math_terms = match_list
|
| 291 |
|
| 292 |
class EndpointHandler():
|
| 293 |
def __init__(self, path="."):
|