hylee719
/

transcript-analysis-testing

Model card Files Files and versions

xet

Community

hylee committed on Nov 16, 2023

Commit

c3bae84

1 Parent(s): 454b944

revise math term checking

Browse files

Files changed (1) hide show

handler.py +69 -21

handler.py CHANGED Viewed

@@ -361,35 +361,83 @@ def load_math_terms():
     math_terms = []
     math_terms_dict = {}
     for term in MATH_WORDS:
-        math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
-        math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
-        # if term in MATH_PREFIXES:
-        #     math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
-        #     math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
         # else:
         #     math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
         #     math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
-        logging.set_verbosity_info()
-        logger = logging.get_logger("transformers")
-        logger.info(f"maths terms values: {math_terms_dict.values()}")
     return math_terms, math_terms_dict
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
-    for i, utt in enumerate(transcript.utterances):
-        found_math_terms = set()
         text = utt.get_clean_text(remove_punct=False)
-        logging.set_verbosity_info()
-        logger = logging.get_logger("transformers")
-        logger.info(f"clean text in math density: {text}")
-        num_math_terms = 0
-        for term in math_terms:
-            count = len(re.findall(term, text))
-            if count > 0:
-                found_math_terms.add(math_terms_dict[term])
-            num_math_terms += count
-        utt.num_math_terms = num_math_terms
-        utt.math_terms = list(found_math_terms)
 class EndpointHandler():
     def __init__(self, path="."):

     math_terms = []
     math_terms_dict = {}
     for term in MATH_WORDS:
+        # math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
+        # math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
+        if term in MATH_PREFIXES:
+            math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
+            math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
+        else:
+            math_terms.append(term)
+            math_terms_dict[term] = term
         # else:
         #     math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
         #     math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
+    #     logging.set_verbosity_info()
+    #     logger = logging.get_logger("transformers")
+    #     logger.info(f"maths terms values: {math_terms_dict.values()}")
+    # return math_terms, math_terms_dict
     return math_terms, math_terms_dict
+# def run_math_density(transcript):
+#     math_terms, math_terms_dict = load_math_terms()
+#     for i, utt in enumerate(transcript.utterances):
+#         found_math_terms = set()
+#         text = utt.get_clean_text(remove_punct=False)
+#         logging.set_verbosity_info()
+#         logger = logging.get_logger("transformers")
+#         # logger.info(f"clean text in math density: {text}")
+#         num_math_terms = 0
+#         for term in math_terms:
+#             count = len(re.findall(term, text))
+#             if count > 0:
+#                 found_math_terms.add(math_terms_dict[term])
+#             num_math_terms += count
+#         utt.num_math_terms = num_math_terms
+#         utt.math_terms = list(found_math_terms)
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
+    sorted_terms = sorted(math_terms, key=len, reverse=True)
+    for i, utt in enumerate(transcript.utterances):
         text = utt.get_clean_text(remove_punct=False)
+        num_matches = 0
+        matched_positions = set()
+        match_list = []
+        for term in sorted_terms:
+            # Use re.finditer to find all non-overlapping match objects
+            matches = list(re.finditer(term, text, re.IGNORECASE))
+            # count = len(re.findall(term, input_string))
+            # print('term: ', term)
+            # print("count with findall: ", count)
+            # Filter out matches that share positions with longer terms
+            matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
+            if len(matches) > 0:
+                match_list.append(math_terms_dict[term])
+            # Update matched positions
+            matched_positions.update((match.start(), match.end()) for match in matches)
+            # Count the number of matches
+            num_matches += len(matches)
+        utt.num_math_terms = num_matches
+        utt.math_terms = match_list
+# def gloss_check_vec(s):
+#     gloss =
+#     # Sort glossary terms by length in descending order
+#     sorted_gloss = sorted(gloss, key=len, reverse=True)
+#     # Create a logical vector indicating whether each term in 'gloss' is found in 's'
+#     gloss_found_dict = {}
+#     for g in sorted_gloss:
+#         if re.search(re.escape(g), s, re.IGNORECASE):
+#             gloss_found_dict[g] = True
+#         else:
+#             gloss_found_dict[g] = False
+#     # Return the resulting logical vector
+#     return gloss_found_dict
 class EndpointHandler():
     def __init__(self, path="."):