hylee719
/

transcript-analysis-testing

Model card Files Files and versions

xet

Community

hylee committed on Jan 30, 2024

Commit

4a18cf3

1 Parent(s): c2f7754

update with del

Browse files

Files changed (1) hide show

handler.py +42 -27

handler.py CHANGED Viewed

@@ -36,8 +36,11 @@ class Utterance:
         self.role = None
         self.word_count = self.get_num_words()
         self.timestamp = [starttime, endtime]
-        self.unit_measure = None
-        self.aggregate_unit_measure = endtime
         self.num_math_terms = None
         self.math_terms = None
@@ -75,6 +78,7 @@ class Utterance:
         return{
             'speaker': self.speaker,
             'text': self.text,
             'role': self.role,
             'timestamp': self.timestamp,
             'moments': {'reasoning': True if self.reasoning else False, 'questioning': True if self.question else False, 'uptake': True if self.uptake else False, 'focusingQuestion': True if self.focusing_question else False},
@@ -137,20 +141,21 @@ class Transcript:
                 utt.role = 'student'
                 student_words += utt.get_num_words()
                 student_utt_count += 1
-        teacher_percentage = round(
-            (teacher_words / (teacher_words + student_words)) * 100)
-        student_percentage = 100 - teacher_percentage
-        avg_teacher_length = teacher_words / teacher_utt_count
-        avg_student_length = student_words / student_utt_count
         return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
-    def get_word_cloud_dicts(self):
         teacher_dict = {}
         student_dict = {}
         uptake_teacher_dict = {}
         stop_words = stopwords.words('english')
-        # stopwords = nltk.corpus.stopwords.word('english')
-        # print("stopwords: ", stopwords)
         for utt in self.utterances:
             words = (utt.get_clean_text(remove_punct=True)).split(' ')
             for word in words:
@@ -227,7 +232,7 @@ class QuestionModel:
                                                                max_length=self.max_length,
                                                                input_str=True)
                     output = self.get_prediction(instance)
-                    print(output)
                     utt.question = np.argmax(
                         output["is_question_logits"][0].tolist())
@@ -255,11 +260,11 @@ class ReasoningModel:
         self.model = BertForSequenceClassification.from_pretrained(path)
         self.model.to(self.device)
-    def run_inference(self, transcript, min_num_words=8):
         self.model.eval()
         with torch.no_grad():
             for i, utt in enumerate(transcript.utterances):
-                if utt.get_num_words() >= min_num_words:
                     instance = self.input_builder.build_inputs([], utt.text,
                                                                max_length=self.max_length,
                                                                input_str=True)
@@ -372,6 +377,7 @@ def load_math_terms():
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
     for i, utt in enumerate(transcript.utterances):
         text = utt.get_clean_text(remove_punct=False)
         num_matches = 0
@@ -382,12 +388,21 @@ def run_math_density(transcript):
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
                 match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
 class EndpointHandler():
     def __init__(self, path="."):
@@ -410,10 +425,6 @@ class EndpointHandler():
         utterances = data.pop("inputs", data)
         params = data.pop("parameters", None)
-        print("EXAMPLES")
-        for utt in utterances[:3]:
-            print("speaker %s: %s" % (utt["speaker"], utt["text"]))
         transcript = Transcript(filename=params.pop("filename", None))
         for utt in utterances:
             transcript.add_utterance(Utterance(**utt))
@@ -426,34 +437,38 @@ class EndpointHandler():
         uptake_speaker = params.pop("uptake_speaker", None)
         uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
                                    uptake_speaker=uptake_speaker)
         # Reasoning
         reasoning_model = ReasoningModel(
             self.device, self.tokenizer, self.input_builder)
-        reasoning_model.run_inference(transcript)
         # Question
         question_model = QuestionModel(
             self.device, self.tokenizer, self.input_builder)
         question_model.run_inference(transcript)
         # Focusing Question
         focusing_question_model = FocusingQuestionModel(
             self.device, self.tokenizer, self.input_builder)
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
-        run_math_density(transcript)
         transcript.update_utterance_roles(uptake_speaker)
         transcript.calculate_aggregate_word_count()
-        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
         talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
-        word_cloud, uptake_word_cloud = transcript.get_word_cloud_dicts()
         return_dict['commonTopWords'] = word_cloud
-        return_dict['uptakeTopwords'] = uptake_word_cloud
         return return_dict

         self.role = None
         self.word_count = self.get_num_words()
         self.timestamp = [starttime, endtime]
+        if starttime is not None and endtime is not None:
+            self.unit_measure = endtime - starttime
+        else:
+            self.unit_measure = None
+        self.aggregate_unit_measure = endtime
         self.num_math_terms = None
         self.math_terms = None
         return{
             'speaker': self.speaker,
             'text': self.text,
+            'uid': self.uid,
             'role': self.role,
             'timestamp': self.timestamp,
             'moments': {'reasoning': True if self.reasoning else False, 'questioning': True if self.question else False, 'uptake': True if self.uptake else False, 'focusingQuestion': True if self.focusing_question else False},
                 utt.role = 'student'
                 student_words += utt.get_num_words()
                 student_utt_count += 1
+        if teacher_words + student_words > 0:
+            teacher_percentage = round(
+                (teacher_words / (teacher_words + student_words)) * 100)
+            student_percentage = 100 - teacher_percentage
+        else:
+            teacher_percentage = student_percentage = 0
+        avg_teacher_length = teacher_words / teacher_utt_count if teacher_utt_count > 0 else 0
+        avg_student_length = student_words / student_utt_count if student_utt_count > 0 else 0
         return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
+    def get_word_clouds(self):
         teacher_dict = {}
         student_dict = {}
         uptake_teacher_dict = {}
         stop_words = stopwords.words('english')
         for utt in self.utterances:
             words = (utt.get_clean_text(remove_punct=True)).split(' ')
             for word in words:
                                                                max_length=self.max_length,
                                                                input_str=True)
                     output = self.get_prediction(instance)
+                    # print(output)
                     utt.question = np.argmax(
                         output["is_question_logits"][0].tolist())
         self.model = BertForSequenceClassification.from_pretrained(path)
         self.model.to(self.device)
+    def run_inference(self, transcript, min_num_words=8, uptake_speaker=None):
         self.model.eval()
         with torch.no_grad():
             for i, utt in enumerate(transcript.utterances):
+                if utt.get_num_words() >= min_num_words and utt.speaker != uptake_speaker:
                     instance = self.input_builder.build_inputs([], utt.text,
                                                                max_length=self.max_length,
                                                                input_str=True)
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
+    math_word_cloud = {}
     for i, utt in enumerate(transcript.utterances):
         text = utt.get_clean_text(remove_punct=False)
         num_matches = 0
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
+                if math_terms_dict[term] not in math_word_cloud:
+                    math_word_cloud[math_terms_dict[term]] = 0
+                math_word_cloud[math_terms_dict[term]] += len(matches)
                 match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
+    dict_list = []
+    for word in math_word_cloud.keys():
+        dict_list.append(
+            {'text': word, 'value': math_word_cloud[word]})
+    sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
+    return sorted_dict_list[:50]
 class EndpointHandler():
     def __init__(self, path="."):
         utterances = data.pop("inputs", data)
         params = data.pop("parameters", None)
         transcript = Transcript(filename=params.pop("filename", None))
         for utt in utterances:
             transcript.add_utterance(Utterance(**utt))
         uptake_speaker = params.pop("uptake_speaker", None)
         uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
                                    uptake_speaker=uptake_speaker)
+        del uptake_model
         # Reasoning
         reasoning_model = ReasoningModel(
             self.device, self.tokenizer, self.input_builder)
+        reasoning_model.run_inference(transcript, uptake_speaker=uptake_speaker)
+        del reasoning_model
         # Question
         question_model = QuestionModel(
             self.device, self.tokenizer, self.input_builder)
         question_model.run_inference(transcript)
+        del question_model
         # Focusing Question
         focusing_question_model = FocusingQuestionModel(
             self.device, self.tokenizer, self.input_builder)
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
+        del focusing_question_model
+        math_cloud = run_math_density(transcript)
         transcript.update_utterance_roles(uptake_speaker)
         transcript.calculate_aggregate_word_count()
+        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None, 'mathTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
         talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
+        word_cloud, uptake_word_cloud = transcript.get_word_clouds()
         return_dict['commonTopWords'] = word_cloud
+        return_dict['uptakeTopWords'] = uptake_word_cloud
+        return_dict['mathTopWords'] = math_cloud
         return return_dict