add math terms
Browse files- handler.py +32 -2
handler.py
CHANGED
|
@@ -2,10 +2,11 @@ from typing import Dict, List, Any
|
|
| 2 |
from scipy.special import softmax
|
| 3 |
import numpy as np
|
| 4 |
import weakref
|
|
|
|
| 5 |
|
| 6 |
from utils import clean_str, clean_str_nopunct
|
| 7 |
import torch
|
| 8 |
-
from utils import MultiHeadModel, BertInputBuilder, get_num_words
|
| 9 |
|
| 10 |
import transformers
|
| 11 |
from transformers import BertTokenizer, BertForSequenceClassification
|
|
@@ -29,6 +30,8 @@ class Utterance:
|
|
| 29 |
self.endtime = endtime
|
| 30 |
self.transcript = weakref.ref(transcript) if transcript else None
|
| 31 |
self.props = kwargs
|
|
|
|
|
|
|
| 32 |
|
| 33 |
self.uptake = None
|
| 34 |
self.reasoning = None
|
|
@@ -53,7 +56,9 @@ class Utterance:
|
|
| 53 |
'uptake': self.uptake,
|
| 54 |
'reasoning': self.reasoning,
|
| 55 |
'question': self.question,
|
| 56 |
-
'
|
|
|
|
|
|
|
| 57 |
**self.props
|
| 58 |
}
|
| 59 |
|
|
@@ -252,6 +257,29 @@ class FocusingQuestionModel:
|
|
| 252 |
return output
|
| 253 |
|
| 254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
class EndpointHandler():
|
| 256 |
def __init__(self, path="."):
|
| 257 |
print("Loading models...")
|
|
@@ -304,4 +332,6 @@ class EndpointHandler():
|
|
| 304 |
self.device, self.tokenizer, self.input_builder)
|
| 305 |
focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
|
| 306 |
|
|
|
|
|
|
|
| 307 |
return transcript.to_dict()
|
|
|
|
| 2 |
from scipy.special import softmax
|
| 3 |
import numpy as np
|
| 4 |
import weakref
|
| 5 |
+
import re
|
| 6 |
|
| 7 |
from utils import clean_str, clean_str_nopunct
|
| 8 |
import torch
|
| 9 |
+
from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES, MATH_WORDS
|
| 10 |
|
| 11 |
import transformers
|
| 12 |
from transformers import BertTokenizer, BertForSequenceClassification
|
|
|
|
| 30 |
self.endtime = endtime
|
| 31 |
self.transcript = weakref.ref(transcript) if transcript else None
|
| 32 |
self.props = kwargs
|
| 33 |
+
self.num_math_terms = None
|
| 34 |
+
self.math_terms = None
|
| 35 |
|
| 36 |
self.uptake = None
|
| 37 |
self.reasoning = None
|
|
|
|
| 56 |
'uptake': self.uptake,
|
| 57 |
'reasoning': self.reasoning,
|
| 58 |
'question': self.question,
|
| 59 |
+
'focusingQuestion': self.focusing_question,
|
| 60 |
+
'numMathTerms': self.num_math_terms,
|
| 61 |
+
'mathTerms': self.math_terms,
|
| 62 |
**self.props
|
| 63 |
}
|
| 64 |
|
|
|
|
| 257 |
return output
|
| 258 |
|
| 259 |
|
| 260 |
+
def _build_math_patterns(words=None, prefixes=None):
    """Pair every math word with the compiled regex used to locate it.

    Args:
        words: Iterable of math terms to search for. Defaults to the
            module-level ``MATH_WORDS``.
        prefixes: Collection of terms that may additionally be followed by
            a plural ending ("s" or "es"). Defaults to the module-level
            ``MATH_PREFIXES``.

    Returns:
        A list of ``(term, compiled_pattern)`` tuples, in the order of
        ``words``.
    """
    if words is None:
        words = MATH_WORDS
    if prefixes is None:
        prefixes = MATH_PREFIXES
    plural_ok = frozenset(prefixes)

    patterns = []
    for term in words:
        plural = "(?:s|es)?" if term in plural_ok else ""
        # Zero-width lookarounds keep the "not surrounded by letters" rule
        # without consuming the delimiter, so two occurrences separated by a
        # single character ("sum sum") are both counted.
        # NOTE(review): re.escape assumes the entries are literal words, not
        # hand-written regexes -- confirm against the definition of MATH_WORDS.
        regex = f"(?<![a-zA-Z]){re.escape(term)}{plural}(?![a-zA-Z])"
        patterns.append((term, re.compile(regex)))
    return patterns


def load_math_terms(words=None, prefixes=None):
    """Return the regex pattern strings used to detect math terms.

    Args:
        words: Optional override for ``MATH_WORDS``.
        prefixes: Optional override for ``MATH_PREFIXES``.

    Returns:
        A list of regex source strings, one per term, in the order of
        ``words``. Each one matches the term only when it is not directly
        preceded or followed by an ASCII letter; terms listed in
        ``prefixes`` also match with a trailing "s" or "es".
    """
    return [compiled.pattern
            for _, compiled in _build_math_patterns(words, prefixes)]


def run_math_density(transcript, words=None, prefixes=None):
    """Annotate every utterance of ``transcript`` with its math-term usage.

    For each object in ``transcript.utterances`` this sets:

    * ``num_math_terms`` -- total number of math-term occurrences found in
      the text returned by ``utt.get_clean_text(remove_punct=False)``;
    * ``math_terms`` -- alphabetically sorted list of the distinct terms
      (the words themselves, not their regexes) that occurred at least once.

    Args:
        transcript: Object exposing an iterable ``utterances`` attribute.
        words: Optional override for ``MATH_WORDS``.
        prefixes: Optional override for ``MATH_PREFIXES``.

    Returns:
        None. The utterances are modified in place.
    """
    # Compile once per call rather than once per (utterance, term) pair.
    patterns = _build_math_patterns(words, prefixes)

    for utt in transcript.utterances:
        text = utt.get_clean_text(remove_punct=False)
        found_terms = set()
        total = 0
        for term, compiled in patterns:
            hits = len(compiled.findall(text))
            if hits:
                found_terms.add(term)
                total += hits
        utt.num_math_terms = total
        # Sorted so the serialized output is deterministic across runs.
        utt.math_terms = sorted(found_terms)
|
| 282 |
+
|
| 283 |
class EndpointHandler():
|
| 284 |
def __init__(self, path="."):
|
| 285 |
print("Loading models...")
|
|
|
|
| 332 |
self.device, self.tokenizer, self.input_builder)
|
| 333 |
focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
|
| 334 |
|
| 335 |
+
run_math_density(transcript)
|
| 336 |
+
|
| 337 |
return transcript.to_dict()
|