hylee719
/

transcript-analysis-testing

Model card Files Files and versions

xet

Community

hylee committed on Nov 15, 2023

Commit

3f1e400

1 Parent(s): 8e1280d

integrate focusing question and math terms

Browse files

Files changed (2) hide show

handler.py +81 -4
utils.py +282 -0

handler.py CHANGED Viewed

@@ -19,6 +19,7 @@ transformers.logging.set_verbosity_debug()
 UPTAKE_MODEL = 'ddemszky/uptake-model'
 REASONING_MODEL = 'ddemszky/student-reasoning'
 QUESTION_MODEL = 'ddemszky/question-detection'
 class Utterance:
@@ -36,11 +37,14 @@ class Utterance:
         self.timestamp = [starttime, endtime]
         self.unit_measure = None
         self.aggregate_unit_measure = endtime
         # moments
         self.uptake = None
         self.reasoning = None
         self.question = None
     def get_clean_text(self, remove_punct=False):
         if remove_punct:
@@ -60,6 +64,9 @@ class Utterance:
             'uptake': self.uptake,
             'reasoning': self.reasoning,
             'question':  self.question,
             **self.props
         }
@@ -69,10 +76,12 @@ class Utterance:
             'text': self.text,
             'role': self.role,
             'timestamp': self.timestamp,
-            'moments': {'reasoning': True if self.reasoning else False, 'questioning': True if self.question else False, 'uptake': True if self.uptake else False},
             'unitMeasure': self.unit_measure,
             'aggregateUnitMeasure': self.aggregate_unit_measure,
-            'wordCount': self.word_count
         }
     def __repr__(self):
@@ -311,6 +320,67 @@ class UptakeModel:
                             return_pooler_output=False)
         return output
 class EndpointHandler():
     def __init__(self, path="."):
@@ -358,14 +428,21 @@ class EndpointHandler():
         question_model = QuestionModel(
             self.device, self.tokenizer, self.input_builder)
         question_model.run_inference(transcript)
         transcript.update_utterance_roles(uptake_speaker)
         transcript.calculate_aggregate_word_count()
         return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
-        talk_timeline = transcript.get_talk_timeline()
-        talk_moments = talk_timeline
         return_dict['talkMoments'] = talk_moments
         word_cloud, uptake_word_cloud = transcript.get_word_cloud_dicts()
         return_dict['commonTopWords'] = word_cloud

 UPTAKE_MODEL = 'ddemszky/uptake-model'
 REASONING_MODEL = 'ddemszky/student-reasoning'
 QUESTION_MODEL = 'ddemszky/question-detection'
+FOCUSING_QUESTION_MODEL = 'ddemszky/focusing-questions'  # default `path` that FocusingQuestionModel hands to BertForSequenceClassification.from_pretrained
 class Utterance:
         self.timestamp = [starttime, endtime]
         self.unit_measure = None
         self.aggregate_unit_measure = endtime
+        self.num_math_terms = None
+        self.math_terms = None
         # moments
         self.uptake = None
         self.reasoning = None
         self.question = None
+        self.focusing_question = None
     def get_clean_text(self, remove_punct=False):
         if remove_punct:
             'uptake': self.uptake,
             'reasoning': self.reasoning,
             'question':  self.question,
+            'focusingQuestion': self.focusing_question,
+            'numMathTerms': self.num_math_terms,
+            'mathTerms': self.math_terms,
             **self.props
         }
             'text': self.text,
             'role': self.role,
             'timestamp': self.timestamp,
+            'moments': {'reasoning': True if self.reasoning else False, 'questioning': True if self.question else False, 'uptake': True if self.uptake else False, 'focusingQuestion': True if self.focusing_question else False},
             'unitMeasure': self.unit_measure,
             'aggregateUnitMeasure': self.aggregate_unit_measure,
+            'wordCount': self.word_count,
+            'numMathTerms': self.num_math_terms,
+            'mathTerms': self.math_terms
         }
     def __repr__(self):
                             return_pooler_output=False)
         return output
+class FocusingQuestionModel:
+    def __init__(self, device, tokenizer, input_builder, max_length=128, path=FOCUSING_QUESTION_MODEL):
+        print("Loading models...")
+        self.device = device
+        self.tokenizer = tokenizer
+        self.input_builder = input_builder
+        self.model = BertForSequenceClassification.from_pretrained(path)
+        self.model.to(self.device)
+        self.max_length = max_length
+    def run_inference(self, transcript, min_focusing_words=0, uptake_speaker=None):
+        self.model.eval()
+        with torch.no_grad():
+            for i, utt in enumerate(transcript.utterances):
+                if utt.speaker != uptake_speaker or uptake_speaker is None:
+                    utt.focusing_question = None
+                    continue
+                if utt.get_num_words() < min_focusing_words:
+                    utt.focusing_question = None
+                    continue
+                instance = self.input_builder.build_inputs([], utt.text, max_length=self.max_length, input_str=True)
+                output = self.get_prediction(instance)
+                utt.focusing_question = np.argmax(output["logits"][0].tolist())
+    def get_prediction(self, instance):
+        instance["attention_mask"] = [[1] * len(instance["input_ids"])]
+        for key in ["input_ids", "token_type_ids", "attention_mask"]:
+            instance[key] = torch.tensor(
+                instance[key]).unsqueeze(0)  # Batch size = 1
+            instance[key].to(self.device)
+        output = self.model(input_ids=instance["input_ids"],
+                            attention_mask=instance["attention_mask"],
+                            token_type_ids=instance["token_type_ids"])
+        return output
+def load_math_terms():
+    math_terms = []
+    math_terms_dict = {}
+    for term in MATH_WORDS:
+        if term in MATH_PREFIXES:
+            math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
+            math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
+        else:
+            math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
+            math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
+    return math_terms, math_terms_dict
+def run_math_density(transcript):
+    math_terms, math_terms_dict = load_math_terms()
+    for i, utt in enumerate(transcript.utterances):
+        found_math_terms = set()
+        text = utt.get_clean_text(remove_punct=False)
+        num_math_terms = 0
+        for term in math_terms:
+            count = len(re.findall(term, text))
+            if count > 0:
+                found_math_terms.add(math_terms_dict[term])
+            num_math_terms += count
+        utt.num_math_terms = num_math_terms
+        utt.math_terms = list(found_math_terms)
 class EndpointHandler():
     def __init__(self, path="."):
         question_model = QuestionModel(
             self.device, self.tokenizer, self.input_builder)
         question_model.run_inference(transcript)
+        # Focusing Question
+        focusing_question_model = FocusingQuestionModel(
+            self.device, self.tokenizer, self.input_builder)
+        focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
+        run_math_density(transcript)
         transcript.update_utterance_roles(uptake_speaker)
         transcript.calculate_aggregate_word_count()
         return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
+        talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
         word_cloud, uptake_word_cloud = transcript.get_word_cloud_dicts()
         return_dict['commonTopWords'] = word_cloud

utils.py CHANGED Viewed

@@ -13,6 +13,288 @@ punct_chars.sort()
 punctuation = ''.join(punct_chars)
 replace = re.compile('[%s]' % re.escape(punctuation))
 def get_num_words(text):
     if not isinstance(text, str):
         print("%s is not a string" % text)

 punctuation = ''.join(punct_chars)
 replace = re.compile('[%s]' % re.escape(punctuation))
+MATH_PREFIXES = [  # terms that load_math_terms (handler.py) also matches with a trailing "s" or "es"
+    "sum",
+    "arc",
+    "mass",
+    "digit",
+    "graph",
+    "liter",
+    "gram",
+    "add",
+    "angle",
+    "scale",
+    "data",
+    "array",
+    "ruler",
+    "meter",
+    "total",
+    "unit",
+    "prism",
+    "median",
+    "ratio",
+    "area",
+]
+MATH_WORDS = [  # math vocabulary; load_math_terms (handler.py) builds one regex per entry
+    "absolute value",
+    "area",
+    "average",
+    "base of",
+    "box plot",
+    "categorical",
+    "coefficient",
+    "common factor",
+    "common multiple",
+    "compose",
+    "coordinate",
+    "cubed",
+    "decompose",
+    "dependent variable",
+    "distribution",
+    "dot plot",
+    "double number line diagram",
+    "equivalent",
+    "equivalent expression",
+    "ratio",
+    "exponent",
+    "frequency",
+    "greatest common factor",
+    "gcd",
+    "height of",
+    "histogram",
+    "independent variable",
+    "interquartile range",
+    "iqr",
+    "least common multiple",
+    "long division",
+    "mean absolute deviation",
+    "median",
+    "negative number",
+    "opposite vertex",
+    "parallelogram",
+    "percent",
+    "polygon",
+    "polyhedron",
+    "positive number",
+    "prism",
+    "pyramid",
+    "quadrant",
+    "quadrilateral",
+    "quartile",
+    "rational number",
+    "reciprocal",
+    "equality",
+    "inequality",
+    "squared",
+    "statistic",
+    "surface area",
+    "identity property",
+    "addend",
+    "unit",
+    "number sentence",
+    "make ten",
+    "take from ten",
+    "number bond",
+    "total",
+    "estimate",
+    "hashmark",
+    "meter",
+    "number line",
+    "ruler",
+    "centimeter",
+    "base ten",
+    "expanded form",
+    "hundred",
+    "thousand",
+    "place value",
+    "number disk",
+    "standard form",
+    "unit form",
+    "word form",
+    "tens place",
+    "algorithm",
+    "equation",
+    "simplif",  # NOTE(review): stem-like entries such as this one are matched only as standalone words by the patterns in load_math_terms - confirm intent
+    "addition",
+    "subtract",
+    "array",
+    "even number",
+    "odd number",
+    "repeated addition",
+    "tessellat",
+    "whole number",
+    "number path",
+    "rectangle",
+    "square",
+    "bar graph",
+    "data",
+    "degree",
+    "line plot",
+    "picture graph",
+    "scale",
+    "survey",
+    "thermometer",
+    "estimat",
+    "tape diagram",
+    "value",
+    "analog",
+    "angle",
+    "parallel",
+    "partition",
+    "pentagon",
+    "right angle",
+    "cube",
+    "digital",
+    "quarter of",
+    "tangram",
+    "circle",
+    "hexagon",
+    "half circle",
+    "half-circle",
+    "quarter circle",
+    "quarter-circle",
+    "semicircle",
+    "semi-circle",
+    "rectang",
+    "rhombus",
+    "trapezoid",
+    "triangle",
+    "commutative",
+    "equal group",
+    "distributive",
+    "divide",
+    "division",
+    "multipl",
+    "parentheses",
+    "quotient",
+    "rotate",
+    "unknown",
+    "add",
+    "capacity",
+    "continuous",
+    "endpoint",
+    "gram",
+    "interval",
+    "kilogram",
+    "volume",
+    "liter",
+    "milliliter",
+    "approximate",
+    "area model",
+    "square unit",
+    "unit square",
+    "geometr",
+    "equivalent fraction",
+    "fraction form",
+    "fractional unit",
+    "unit fraction",
+    "unit interval",
+    "measur",
+    "graph",
+    "scaled graph",
+    "diagonal",
+    "perimeter",
+    "regular polygon",
+    "tessellate",
+    "tetromino",
+    "heptagon",
+    "octagon",
+    "digit",
+    "expression",
+    "sum",
+    "kilometer",
+    "mass",
+    "mixed unit",
+    "length",
+    "measure",
+    "simplify",
+    "associative",
+    "composite",
+    "divisible",
+    "divisor",
+    "partial product",
+    "prime number",
+    "remainder",
+    "acute",
+    "arc",
+    "collinear",
+    "equilateral",
+    "intersect",
+    "isosceles",
+    "symmetry",
+    "line segment",
+    "line",
+    "obtuse",
+    "perpendicular",
+    "protractor",
+    "scalene",
+    "straight angle",
+    "supplementary angle",
+    "vertex",
+    "common denominator",
+    "denominator",
+    "fraction",
+    "mixed number",
+    "numerator",
+    "whole",
+    "decimal expanded form",
+    "decimal",
+    "hundredth",
+    "tenth",
+    "customary system of measurement",
+    "customary unit",
+    "gallon",
+    "metric",
+    "metric unit",
+    "ounce",
+    "pint",
+    "quart",
+    "convert",
+    "distance",
+    "millimeter",
+    "thousandth",
+    "hundredths",
+    "conversion factor",
+    "decimal fraction",
+    "multiplier",
+    "equivalence",
+    "multiple",
+    "product",
+    "benchmark fraction",
+    "cup",
+    "pound",
+    "yard",
+    "whole unit",
+    "decimal divisor",
+    "factors",
+    "bisect",
+    "cubic units",
+    "hierarchy",
+    "unit cube",
+    "attribute",
+    "kite",
+    "bisector",
+    "solid figure",
+    "square units",
+    "dimension",
+    "axis",
+    "ordered pair",
+    "angle measure",
+    "horizontal",
+    "vertical",
+    "categorical data",
+    "lcm",
+    "measure of center",
+    "meters per second",
+    "numerical",
+    "solution",
+    "unit price",
+    "unit rate",
+    "variability",
+    "variable",
+]
 def get_num_words(text):
     if not isinstance(text, str):
         print("%s is not a string" % text)