stanford-nlpxed
/

transcript-analysis

Model card Files Files and versions

xet

Community

hylee719 committed on Nov 10, 2023

Commit

4c48e8e

1 Parent(s): 0da9196

add math terms

Browse files

Files changed (1) hide show

handler.py +32 -2

handler.py CHANGED Viewed

@@ -2,10 +2,11 @@ from typing import Dict, List, Any
 from scipy.special import softmax
 import numpy as np
 import weakref
 from utils import clean_str, clean_str_nopunct
 import torch
-from utils import MultiHeadModel, BertInputBuilder, get_num_words
 import transformers
 from transformers import BertTokenizer, BertForSequenceClassification
@@ -29,6 +30,8 @@ class Utterance:
         self.endtime = endtime
         self.transcript = weakref.ref(transcript) if transcript else None
         self.props = kwargs
         self.uptake = None
         self.reasoning = None
@@ -53,7 +56,9 @@ class Utterance:
             'uptake': self.uptake,
             'reasoning': self.reasoning,
             'question':  self.question,
-            'focusingquestion': self.focusing_question,
             **self.props
         }
@@ -252,6 +257,29 @@ class FocusingQuestionModel:
         return output
 class EndpointHandler():
     def __init__(self, path="."):
         print("Loading models...")
@@ -304,4 +332,6 @@ class EndpointHandler():
             self.device, self.tokenizer, self.input_builder)
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
         return transcript.to_dict()

 from scipy.special import softmax
 import numpy as np
 import weakref
+import re
 from utils import clean_str, clean_str_nopunct
 import torch
+from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES, MATH_WORDS
 import transformers
 from transformers import BertTokenizer, BertForSequenceClassification
         self.endtime = endtime
         self.transcript = weakref.ref(transcript) if transcript else None
         self.props = kwargs
+        self.num_math_terms = None
+        self.math_terms = None
         self.uptake = None
         self.reasoning = None
             'uptake': self.uptake,
             'reasoning': self.reasoning,
             'question':  self.question,
+            'focusingQuestion': self.focusing_question,
+            'numMathTerms': self.num_math_terms,
+            'mathTerms': self.math_terms,
             **self.props
         }
         return output
def _build_math_term_patterns(math_words=None, math_prefixes=None):
    """Pair every lexicon entry with the regex source used to find it.

    Args:
        math_words: Iterable of literal terms to search for. Defaults to the
            module-level ``MATH_WORDS``.
        math_prefixes: Collection of terms that may additionally be followed
            by an ``s`` or ``es`` suffix. Defaults to the module-level
            ``MATH_PREFIXES``.

    Returns:
        A list of ``(term, pattern_source)`` tuples, in lexicon order.
    """
    words = MATH_WORDS if math_words is None else math_words
    prefixes = MATH_PREFIXES if math_prefixes is None else math_prefixes

    patterns = []
    for term in words:
        # Entries listed in the prefix collection also match "<term>s" and
        # "<term>es". The group is non-capturing so findall yields one plain
        # string per match.
        suffix = "(?:s|es)?" if term in prefixes else ""
        # Zero-width lookarounds are used for the letter boundaries instead of
        # consuming groups: a consumed separator cannot serve as the left
        # boundary of the next match, which made "add add" count only once.
        # re.escape keeps entries such as "f(x)" literal.
        source = f"(?<![a-zA-Z]){re.escape(term)}{suffix}(?![a-zA-Z])"
        patterns.append((term, source))
    return patterns


def load_math_terms(math_words=None, math_prefixes=None):
    """Return one regex pattern string per math lexicon entry.

    Each pattern matches the entry only when it is not directly preceded or
    followed by an ASCII letter. Entries present in ``math_prefixes`` also
    match with a trailing ``s`` or ``es``.

    Args:
        math_words: Optional iterable of terms; defaults to ``MATH_WORDS``.
        math_prefixes: Optional collection of terms that accept the plural
            suffix; defaults to ``MATH_PREFIXES``.

    Returns:
        A list of regex source strings, in lexicon order.
    """
    return [
        source
        for _, source in _build_math_term_patterns(math_words, math_prefixes)
    ]


def run_math_density(transcript, math_words=None, math_prefixes=None):
    """Annotate every utterance of ``transcript`` with its math-term usage.

    For each utterance in ``transcript.utterances`` the text returned by
    ``get_clean_text(remove_punct=False)`` is scanned for every lexicon
    entry, and two attributes are set on the utterance:

    * ``num_math_terms``: total number of matches over all entries.
    * ``math_terms``: the distinct entries that matched, in lexicon order.
      These are the terms themselves, not the regex patterns used to find
      them.

    Args:
        transcript: Object exposing an ``utterances`` iterable.
        math_words: Optional iterable of terms; defaults to ``MATH_WORDS``.
        math_prefixes: Optional collection of terms that accept the plural
            suffix; defaults to ``MATH_PREFIXES``.

    Returns:
        None. The utterances are updated in place.
    """
    # Compile once per call rather than once per utterance.
    compiled = [
        (term, re.compile(source))
        for term, source in _build_math_term_patterns(math_words, math_prefixes)
    ]

    for utt in transcript.utterances:
        text = utt.get_clean_text(remove_punct=False)
        total = 0
        found = {}  # dict keys give an ordered, de-duplicated list of terms
        for term, pattern in compiled:
            count = len(pattern.findall(text))
            if count:
                found[term] = None
                total += count
        utt.num_math_terms = total
        utt.math_terms = list(found)
 class EndpointHandler():
     def __init__(self, path="."):
         print("Loading models...")
             self.device, self.tokenizer, self.input_builder)
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
+        run_math_density(transcript)
         return transcript.to_dict()