hylee
committed on
Commit
·
c3bae84
1
Parent(s):
454b944
revise math term checking
Browse files- handler.py +69 -21
handler.py
CHANGED
|
@@ -361,35 +361,83 @@ def load_math_terms():
|
|
| 361 |
math_terms = []
|
| 362 |
math_terms_dict = {}
|
| 363 |
for term in MATH_WORDS:
|
| 364 |
-
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
| 365 |
-
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
|
|
|
|
|
|
|
|
|
| 369 |
# else:
|
| 370 |
# math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
|
| 371 |
# math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
|
|
|
| 375 |
return math_terms, math_terms_dict
|
| 376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
def run_math_density(transcript):
|
| 378 |
math_terms, math_terms_dict = load_math_terms()
|
| 379 |
-
|
| 380 |
-
|
| 381 |
text = utt.get_clean_text(remove_punct=False)
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
class EndpointHandler():
|
| 395 |
def __init__(self, path="."):
|
|
|
|
| 361 |
math_terms = []
|
| 362 |
math_terms_dict = {}
|
| 363 |
for term in MATH_WORDS:
|
| 364 |
+
# math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
| 365 |
+
# math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
| 366 |
+
if term in MATH_PREFIXES:
|
| 367 |
+
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
| 368 |
+
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
| 369 |
+
else:
|
| 370 |
+
math_terms.append(term)
|
| 371 |
+
math_terms_dict[term] = term
|
| 372 |
# else:
|
| 373 |
# math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
|
| 374 |
# math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
|
| 375 |
+
# logging.set_verbosity_info()
|
| 376 |
+
# logger = logging.get_logger("transformers")
|
| 377 |
+
# logger.info(f"maths terms values: {math_terms_dict.values()}")
|
| 378 |
+
# return math_terms, math_terms_dict
|
| 379 |
return math_terms, math_terms_dict
|
| 380 |
|
| 381 |
+
# def run_math_density(transcript):
|
| 382 |
+
# math_terms, math_terms_dict = load_math_terms()
|
| 383 |
+
# for i, utt in enumerate(transcript.utterances):
|
| 384 |
+
# found_math_terms = set()
|
| 385 |
+
# text = utt.get_clean_text(remove_punct=False)
|
| 386 |
+
# logging.set_verbosity_info()
|
| 387 |
+
# logger = logging.get_logger("transformers")
|
| 388 |
+
# # logger.info(f"clean text in math density: {text}")
|
| 389 |
+
# num_math_terms = 0
|
| 390 |
+
# for term in math_terms:
|
| 391 |
+
# count = len(re.findall(term, text))
|
| 392 |
+
# if count > 0:
|
| 393 |
+
# found_math_terms.add(math_terms_dict[term])
|
| 394 |
+
# num_math_terms += count
|
| 395 |
+
# utt.num_math_terms = num_math_terms
|
| 396 |
+
# utt.math_terms = list(found_math_terms)
|
| 397 |
+
|
| 398 |
def run_math_density(transcript):
    """Annotate every utterance of ``transcript`` with its math-term matches.

    The patterns returned by ``load_math_terms()`` are tried in order of
    decreasing pattern-string length. Within one utterance, a match is
    discarded when its start offset lies inside a span already claimed by a
    pattern tried earlier. Matching is case-insensitive.

    Two attributes are set on each utterance:

    * ``num_math_terms`` -- total number of matches that were kept.
    * ``math_terms`` -- the ``math_terms_dict`` value of every pattern that
      kept at least one match, in the order the patterns were tried.

    Args:
        transcript: object exposing ``utterances``; each utterance must
            provide ``get_clean_text(remove_punct=False)``.
    """
    patterns, label_for = load_math_terms()
    # Longest pattern strings go first so that their spans are claimed first.
    ordered_patterns = sorted(patterns, key=len, reverse=True)

    for utt in transcript.utterances:
        text = utt.get_clean_text(remove_punct=False)
        claimed_spans = set()
        labels = []
        total = 0

        for pattern in ordered_patterns:
            # Keep only hits whose start is outside every span claimed so far.
            # Spans from this pattern are registered after the filtering, so
            # hits of the same pattern never suppress each other.
            kept = []
            for hit in re.finditer(pattern, text, re.IGNORECASE):
                start = hit.start()
                if any(lo <= start < hi for lo, hi in claimed_spans):
                    continue
                kept.append(hit)

            if not kept:
                continue

            labels.append(label_for[pattern])
            for hit in kept:
                claimed_spans.add((hit.start(), hit.end()))
            total += len(kept)

        utt.num_math_terms = total
        utt.math_terms = labels
|
| 422 |
+
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
# def gloss_check_vec(s):
|
| 426 |
+
# gloss =
|
| 427 |
+
# # Sort glossary terms by length in descending order
|
| 428 |
+
# sorted_gloss = sorted(gloss, key=len, reverse=True)
|
| 429 |
+
|
| 430 |
+
# # Create a logical vector indicating whether each term in 'gloss' is found in 's'
|
| 431 |
+
# gloss_found_dict = {}
|
| 432 |
+
# for g in sorted_gloss:
|
| 433 |
+
# if re.search(re.escape(g), s, re.IGNORECASE):
|
| 434 |
+
# gloss_found_dict[g] = True
|
| 435 |
+
# else:
|
| 436 |
+
# gloss_found_dict[g] = False
|
| 437 |
+
|
| 438 |
+
# # Return the resulting logical vector
|
| 439 |
+
# return gloss_found_dict
|
| 440 |
+
|
| 441 |
|
| 442 |
class EndpointHandler():
|
| 443 |
def __init__(self, path="."):
|