sparshmehta committed on
Commit
582a495
·
verified ·
1 Parent(s): b8058cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -785
app.py CHANGED
@@ -662,7 +662,7 @@ Important:
662
 
663
  def _evaluate_speech_metrics(self, transcript: str, audio_features: Dict[str, float],
664
  progress_callback=None) -> Dict[str, Any]:
665
- """Evaluate speech metrics with improved accuracy and stricter checks"""
666
  try:
667
  if progress_callback:
668
  progress_callback(0.2, "Calculating speech metrics...")
@@ -670,86 +670,62 @@ Important:
670
  # Calculate words and duration
671
  words = len(transcript.split())
672
  duration_minutes = float(audio_features.get('duration', 0)) / 60
 
673
 
674
- # Enhanced grammatical error detection with stricter patterns
675
- grammatical_errors = []
676
-
677
- # Subject-verb agreement errors
678
- sv_errors = re.findall(r'\b(they is|he are|she are|it are|there are \w+s|there is \w+s)\b', transcript.lower())
679
- grammatical_errors.extend([("Subject-Verb Agreement", err) for err in sv_errors])
680
-
681
- # Article misuse
682
- article_errors = re.findall(r'\b(a [aeiou]\w+|an [^aeiou\s]\w+)\b', transcript.lower())
683
- grammatical_errors.extend([("Article Misuse", err) for err in article_errors])
684
-
685
- # Double negatives
686
- double_neg = re.findall(r'\b(don\'t.*no|doesn\'t.*no|didn\'t.*no|never.*no)\b', transcript.lower())
687
- grammatical_errors.extend([("Double Negative", err) for err in double_neg])
688
-
689
- # Preposition errors
690
- prep_errors = re.findall(r'\b(depend of|different than|identical than)\b', transcript.lower())
691
- grammatical_errors.extend([("Preposition Error", err) for err in prep_errors])
692
-
693
- # Incomplete sentences (stricter detection)
694
- incomplete = re.findall(r'[a-zA-Z]+\s*[.!?]\s*(?![A-Z])|[a-zA-Z]+\s*-\s+|[a-zA-Z]+\s*\.\.\.', transcript)
695
- grammatical_errors.extend([("Incomplete Sentence", err) for err in incomplete])
696
-
697
- # Calculate errors per minute with stricter threshold
698
- errors_count = len(grammatical_errors)
699
- errors_per_minute = float(errors_count / duration_minutes if duration_minutes > 0 else 0)
700
 
701
- # Stricter threshold for errors (max 1 error per minute)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
702
  max_errors = 1.0
703
-
704
- # Calculate monotone score with stricter thresholds
705
- pitch_mean = float(audio_features.get("pitch_mean", 0))
706
- pitch_std = float(audio_features.get("pitch_std", 0))
707
- pitch_variation_coeff = (pitch_std / pitch_mean * 100) if pitch_mean > 0 else 0
708
- direction_changes = float(audio_features.get("direction_changes_per_min", 0))
709
- pitch_range = float(audio_features.get("pitch_range", 0))
710
-
711
- # Recalibrated scoring factors with stricter ranges
712
- # Variation factor: needs wider variation (20-40% is good)
713
- variation_factor = min(1.0, max(0.0,
714
- 1.0 if 20 <= pitch_variation_coeff <= 40
715
- else 0.5 if 15 <= pitch_variation_coeff <= 45
716
- else 0.0
717
- ))
718
-
719
- # Range factor: needs wider range (200-300% is good)
720
- range_ratio = (pitch_range / pitch_mean * 100) if pitch_mean > 0 else 0
721
- range_factor = min(1.0, max(0.0,
722
- 1.0 if 200 <= range_ratio <= 300
723
- else 0.5 if 150 <= range_ratio <= 350
724
- else 0.0
725
- ))
726
-
727
- # Changes factor: needs more frequent changes (450-650 changes/min is good)
728
- changes_factor = min(1.0, max(0.0,
729
- 1.0 if 450 <= direction_changes <= 650
730
- else 0.5 if 350 <= direction_changes <= 750
731
- else 0.0
732
- ))
733
-
734
- # Calculate final monotone score (0-1, higher means more monotonous)
735
- # Using weighted average to emphasize variation importance
736
- weights = [0.4, 0.3, 0.3] # More weight on pitch variation
737
- monotone_score = 1.0 - (
738
- (variation_factor * weights[0] +
739
- range_factor * weights[1] +
740
- changes_factor * weights[2])
741
- )
742
-
743
- # Add debug logging
744
- logger.info(f"""Monotone score calculation:
745
- Pitch variation coeff: {pitch_variation_coeff:.2f}
746
- Pitch range ratio: {range_ratio:.2f}%
747
- Changes per minute: {direction_changes:.2f}
748
- Variation factor: {variation_factor:.2f}
749
- Range factor: {range_factor:.2f}
750
- Changes factor: {changes_factor:.2f}
751
- Final score: {monotone_score:.2f}
752
- """)
753
 
754
  return {
755
  "speed": {
@@ -759,41 +735,49 @@ Important:
759
  "duration_minutes": duration_minutes
760
  },
761
  "fluency": {
762
- "score": 1 if errors_per_minute <= max_errors else 0,
763
  "errorsPerMin": errors_per_minute,
764
- "maxErrorsThreshold": max_errors,
765
- "detectedErrors": [
766
- {
767
- "type": error_type,
768
- "context": error_text
769
- } for error_type, error_text in grammatical_errors
770
- ]
771
- },
772
- "flow": {
773
- "score": 1 if audio_features.get("pauses_per_minute", 0) <= 12 else 0,
774
- "pausesPerMin": audio_features.get("pauses_per_minute", 0)
775
- },
776
- "intonation": {
777
- "pitch": pitch_mean,
778
- "pitchScore": 1 if not any(monotone_indicators.values()) else 0,
779
- "pitchVariation": pitch_variation_coeff,
780
- "monotoneScore": monotone_score,
781
- "monotoneIndicators": monotone_indicators,
782
- "directionChanges": direction_changes,
783
- "variationsPerMin": audio_features.get("variations_per_minute", 0)
784
- },
785
- "energy": {
786
- "score": 1 if 60 <= audio_features.get("mean_amplitude", 0) <= 75 else 0,
787
- "meanAmplitude": audio_features.get("mean_amplitude", 0),
788
- "amplitudeDeviation": audio_features.get("amplitude_deviation", 0),
789
- "variationScore": 1 if 0.05 <= audio_features.get("amplitude_deviation", 0) <= 0.15 else 0
790
  }
791
- }
 
 
 
 
792
 
793
  except Exception as e:
794
  logger.error(f"Error in speech metrics evaluation: {e}")
795
  raise
796
 
 
 
 
 
797
  def generate_suggestions(self, category: str, citations: List[str]) -> List[str]:
798
  """Generate contextual suggestions based on category and citations"""
799
  try:
@@ -812,685 +796,6 @@ Important:
812
  Format as a JSON array with a single string."""}
813
  ],
814
  response_format={"type": "json_object"},
815
- temperature=0.7
816
- )
817
-
818
- result = json.loads(response.choices[0].message.content)
819
- return result.get("suggestions", [])
820
-
821
- except Exception as e:
822
- logger.error(f"Error generating suggestions: {e}")
823
- return [f"Unable to generate specific suggestions: {str(e)}"]
824
-
825
- class RecommendationGenerator:
826
- """Generates teaching recommendations using OpenAI API"""
827
- def __init__(self, api_key: str):
828
- self.client = OpenAI(api_key=api_key)
829
- self.retry_count = 3
830
- self.retry_delay = 1
831
-
832
- def generate_recommendations(self,
833
- metrics: Dict[str, Any],
834
- content_analysis: Dict[str, Any],
835
- progress_callback=None) -> Dict[str, Any]:
836
- """Generate recommendations with robust JSON handling"""
837
- for attempt in range(self.retry_count):
838
- try:
839
- if progress_callback:
840
- progress_callback(0.2, "Preparing recommendation analysis...")
841
-
842
- prompt = self._create_recommendation_prompt(metrics, content_analysis)
843
-
844
- if progress_callback:
845
- progress_callback(0.5, "Generating recommendations...")
846
-
847
- response = self.client.chat.completions.create(
848
- model="gpt-4o-mini",
849
- messages=[
850
- {"role": "system", "content": """You are a teaching expert providing actionable recommendations.
851
- Each improvement must be categorized as one of:
852
- - COMMUNICATION: Related to speaking, pace, tone, clarity, delivery
853
- - TEACHING: Related to explanation, examples, engagement, structure
854
- - TECHNICAL: Related to code, implementation, technical concepts
855
-
856
- Always respond with a valid JSON object containing categorized improvements."""},
857
- {"role": "user", "content": prompt}
858
- ],
859
- response_format={"type": "json_object"}
860
- )
861
-
862
- if progress_callback:
863
- progress_callback(0.8, "Formatting recommendations...")
864
-
865
- result_text = response.choices[0].message.content.strip()
866
-
867
- try:
868
- result = json.loads(result_text)
869
- # Ensure improvements are properly formatted
870
- if "improvements" in result:
871
- formatted_improvements = []
872
- for imp in result["improvements"]:
873
- if isinstance(imp, str):
874
- # Default categorization for legacy format
875
- formatted_improvements.append({
876
- "category": "TECHNICAL",
877
- "message": imp
878
- })
879
- elif isinstance(imp, dict):
880
- # Ensure proper structure for dict format
881
- formatted_improvements.append({
882
- "category": imp.get("category", "TECHNICAL"),
883
- "message": imp.get("message", str(imp))
884
- })
885
- result["improvements"] = formatted_improvements
886
- except json.JSONDecodeError:
887
- result = {
888
- "geographyFit": "Unknown",
889
- "improvements": [
890
- {
891
- "category": "TECHNICAL",
892
- "message": "Unable to generate specific recommendations"
893
- }
894
- ],
895
- "rigor": "Undetermined",
896
- "profileMatches": []
897
- }
898
-
899
- if progress_callback:
900
- progress_callback(1.0, "Recommendations complete!")
901
-
902
- return result
903
-
904
- except Exception as e:
905
- logger.error(f"Recommendation generation attempt {attempt + 1} failed: {e}")
906
- if attempt == self.retry_count - 1:
907
- return {
908
- "geographyFit": "Unknown",
909
- "improvements": [
910
- {
911
- "category": "TECHNICAL",
912
- "message": f"Unable to generate specific recommendations: {str(e)}"
913
- }
914
- ],
915
- "rigor": "Undetermined",
916
- "profileMatches": []
917
- }
918
- time.sleep(self.retry_delay * (2 ** attempt))
919
-
920
- def _create_recommendation_prompt(self, metrics: Dict[str, Any], content_analysis: Dict[str, Any]) -> str:
921
- """Create the recommendation prompt"""
922
- return f"""Based on the following metrics and analysis, provide recommendations:
923
- Metrics: {json.dumps(metrics)}
924
- Content Analysis: {json.dumps(content_analysis)}
925
-
926
- Analyze the teaching style and provide:
927
- 1. A concise performance summary (2-3 paragraphs highlighting key strengths and areas for improvement)
928
- 2. Geography fit assessment
929
- 3. Specific improvements needed (each must be categorized as COMMUNICATION, TEACHING, or TECHNICAL)
930
- 4. Profile matching for different learner types (choose ONLY ONE best match)
931
- 5. Overall teaching rigor assessment
932
-
933
- Required JSON structure:
934
- {{
935
- "summary": "Comprehensive summary of teaching performance, strengths, and areas for improvement",
936
- "geographyFit": "String describing geographical market fit",
937
- "improvements": [
938
- {{
939
- "category": "COMMUNICATION",
940
- "message": "Specific improvement recommendation"
941
- }},
942
- {{
943
- "category": "TEACHING",
944
- "message": "Specific improvement recommendation"
945
- }},
946
- {{
947
- "category": "TECHNICAL",
948
- "message": "Specific improvement recommendation"
949
- }}
950
- ],
951
- "rigor": "Assessment of teaching rigor",
952
- "profileMatches": [
953
- {{
954
- "profile": "junior_technical",
955
- "match": false,
956
- "reason": "Detailed explanation why this profile is not the best match"
957
- }},
958
- {{
959
- "profile": "senior_non_technical",
960
- "match": false,
961
- "reason": "Detailed explanation why this profile is not the best match"
962
- }},
963
- {{
964
- "profile": "junior_expert",
965
- "match": false,
966
- "reason": "Detailed explanation why this profile is not the best match"
967
- }},
968
- {{
969
- "profile": "senior_expert",
970
- "match": false,
971
- "reason": "Detailed explanation why this profile is not the best match"
972
- }}
973
- ]
974
- }}
975
-
976
- Consider:
977
- - Teaching pace and complexity level
978
- - Balance of technical vs business context
979
- - Depth of code explanations
980
- - Use of examples and analogies
981
- - Engagement style
982
- - Communication metrics
983
- - Teaching assessment scores"""
984
-
985
- class CostCalculator:
986
- """Calculates API and processing costs"""
987
- def __init__(self):
988
- self.GPT4_INPUT_COST = 0.15 / 1_000_000 # $0.15 per 1M tokens input
989
- self.GPT4_OUTPUT_COST = 0.60 / 1_000_000 # $0.60 per 1M tokens output
990
- self.WHISPER_COST = 0.006 / 60 # $0.006 per minute
991
- self.costs = {
992
- 'transcription': 0.0,
993
- 'content_analysis': 0.0,
994
- 'recommendations': 0.0,
995
- 'total': 0.0
996
- }
997
-
998
- def estimate_tokens(self, text: str) -> int:
999
- """Rough estimation of token count based on words"""
1000
- return len(text.split()) * 1.3 # Approximate tokens per word
1001
-
1002
- def add_transcription_cost(self, duration_seconds: float):
1003
- """Calculate Whisper transcription cost"""
1004
- cost = (duration_seconds / 60) * self.WHISPER_COST
1005
- self.costs['transcription'] = cost
1006
- self.costs['total'] += cost
1007
- print(f"\nTranscription Cost: ${cost:.4f}")
1008
-
1009
- def add_gpt4_cost(self, input_text: str, output_text: str, operation: str):
1010
- """Calculate GPT-4 API cost for a single operation"""
1011
- input_tokens = self.estimate_tokens(input_text)
1012
- output_tokens = self.estimate_tokens(output_text)
1013
-
1014
- input_cost = input_tokens * self.GPT4_INPUT_COST
1015
- output_cost = output_tokens * self.GPT4_OUTPUT_COST
1016
- total_cost = input_cost + output_cost
1017
-
1018
- self.costs[operation] = total_cost
1019
- self.costs['total'] += total_cost
1020
-
1021
- print(f"\n{operation.replace('_', ' ').title()} Cost:")
1022
- print(f"Input tokens: {input_tokens:.0f} (${input_cost:.4f})")
1023
- print(f"Output tokens: {output_tokens:.0f} (${output_cost:.4f})")
1024
- print(f"Operation total: ${total_cost:.4f}")
1025
-
1026
- def print_total_cost(self):
1027
- """Print total cost breakdown"""
1028
- print("\n=== Cost Breakdown ===")
1029
- for key, cost in self.costs.items():
1030
- if key != 'total':
1031
- print(f"{key.replace('_', ' ').title()}: ${cost:.4f}")
1032
- print(f"\nTotal Cost: ${self.costs['total']:.4f}")
1033
-
1034
- class MentorEvaluator:
1035
- """Main class for video evaluation"""
1036
- def __init__(self, model_cache_dir: Optional[str] = None):
1037
- # Fix potential API key issue
1038
- self.api_key = st.secrets.get("OPENAI_API_KEY") # Use get() method
1039
- if not self.api_key:
1040
- raise ValueError("OpenAI API key not found in secrets")
1041
-
1042
- # Add error handling for model cache directory
1043
- try:
1044
- if model_cache_dir:
1045
- self.model_cache_dir = Path(model_cache_dir)
1046
- else:
1047
- self.model_cache_dir = Path.home() / ".cache" / "whisper"
1048
- self.model_cache_dir.mkdir(parents=True, exist_ok=True)
1049
- except Exception as e:
1050
- raise RuntimeError(f"Failed to create model cache directory: {e}")
1051
-
1052
- # Initialize components with proper error handling
1053
- try:
1054
- self.feature_extractor = AudioFeatureExtractor()
1055
- self.content_analyzer = ContentAnalyzer(self.api_key)
1056
- self.recommendation_generator = RecommendationGenerator(self.api_key)
1057
- self.cost_calculator = CostCalculator()
1058
- except Exception as e:
1059
- raise RuntimeError(f"Failed to initialize components: {e}")
1060
-
1061
- def _get_cached_result(self, key: str) -> Optional[Any]:
1062
- """Get cached result if available and not expired"""
1063
- if key in self._cache:
1064
- timestamp, value = self._cache[key]
1065
- if time.time() - timestamp < self.cache_ttl:
1066
- return value
1067
- return None
1068
-
1069
- def _set_cached_result(self, key: str, value: Any):
1070
- """Cache result with timestamp"""
1071
- self._cache[key] = (time.time(), value)
1072
-
1073
- def _extract_audio(self, video_path: str, output_path: str, progress_callback=None) -> str:
1074
- """Extract audio from video with optimized settings"""
1075
- try:
1076
- if progress_callback:
1077
- progress_callback(0.1, "Checking dependencies...")
1078
-
1079
- # Add optimized ffmpeg settings
1080
- ffmpeg_cmd = [
1081
- 'ffmpeg',
1082
- '-i', video_path,
1083
- '-ar', '16000', # Set sample rate to 16kHz
1084
- '-ac', '1', # Convert to mono
1085
- '-f', 'wav', # Output format
1086
- '-v', 'warning', # Reduce verbosity
1087
- '-y', # Overwrite output file
1088
- # Add these optimizations:
1089
- '-c:a', 'pcm_s16le', # Use simple audio codec
1090
- '-movflags', 'faststart', # Optimize for streaming
1091
- '-threads', str(max(1, multiprocessing.cpu_count() - 1)), # Use multiple threads
1092
- output_path
1093
- ]
1094
-
1095
- # Use subprocess with optimized buffer size
1096
- result = subprocess.run(
1097
- ffmpeg_cmd,
1098
- capture_output=True,
1099
- text=True,
1100
- bufsize=10*1024*1024 # 10MB buffer
1101
- )
1102
-
1103
- if result.returncode != 0:
1104
- raise AudioProcessingError(f"FFmpeg Error: {result.stderr}")
1105
-
1106
- if not os.path.exists(output_path):
1107
- raise AudioProcessingError("Audio extraction failed: output file not created")
1108
-
1109
- if progress_callback:
1110
- progress_callback(1.0, "Audio extraction complete!")
1111
-
1112
- return output_path
1113
-
1114
- except Exception as e:
1115
- logger.error(f"Error in audio extraction: {e}")
1116
- raise AudioProcessingError(f"Audio extraction failed: {str(e)}")
1117
-
1118
- def _preprocess_audio(self, input_path: str, output_path: Optional[str] = None) -> str:
1119
- """Preprocess audio for analysis"""
1120
- try:
1121
- if not os.path.exists(input_path):
1122
- raise FileNotFoundError(f"Input audio file not found: {input_path}")
1123
-
1124
- # If no output path specified, use the input path
1125
- if output_path is None:
1126
- output_path = input_path
1127
-
1128
- # Load audio
1129
- audio, sr = librosa.load(input_path, sr=16000)
1130
-
1131
- # Apply preprocessing steps
1132
- # 1. Normalize audio
1133
- audio = librosa.util.normalize(audio)
1134
-
1135
- # 2. Remove silence
1136
- non_silent = librosa.effects.trim(audio, top_db=20)[0]
1137
-
1138
- # 3. Save processed audio
1139
- sf.write(output_path, non_silent, sr)
1140
-
1141
- return output_path
1142
-
1143
- except Exception as e:
1144
- logger.error(f"Error in audio preprocessing: {e}")
1145
- raise AudioProcessingError(f"Audio preprocessing failed: {str(e)}")
1146
-
1147
- def evaluate_video(self, video_path: str, transcript_file: Optional[str] = None) -> Dict[str, Any]:
1148
- try:
1149
- # Add input validation
1150
- if not os.path.exists(video_path):
1151
- raise FileNotFoundError(f"Video file not found: {video_path}")
1152
-
1153
- # Validate video file format
1154
- valid_extensions = {'.mp4', '.avi', '.mov'}
1155
- if not any(video_path.lower().endswith(ext) for ext in valid_extensions):
1156
- raise ValueError("Unsupported video format. Use MP4, AVI, or MOV")
1157
-
1158
- # Create progress tracking containers with error handling
1159
- try:
1160
- status = st.empty()
1161
- progress = st.progress(0)
1162
- tracker = ProgressTracker(status, progress)
1163
- except Exception as e:
1164
- logger.error(f"Failed to create progress trackers: {e}")
1165
- raise
1166
-
1167
- # Add cleanup for temporary files
1168
- temp_files = []
1169
- try:
1170
- with temporary_file(suffix=".wav") as temp_audio, \
1171
- temporary_file(suffix=".wav") as processed_audio:
1172
- temp_files.extend([temp_audio, processed_audio])
1173
-
1174
- # Step 1: Extract audio from video
1175
- tracker.update(0.1, "Extracting audio from video")
1176
- self._extract_audio(video_path, temp_audio)
1177
- tracker.next_step()
1178
-
1179
- # Step 2: Preprocess audio
1180
- tracker.update(0.2, "Preprocessing audio")
1181
- self._preprocess_audio(temp_audio, processed_audio)
1182
- tracker.next_step()
1183
-
1184
- # Step 3: Extract features
1185
- tracker.update(0.4, "Extracting audio features")
1186
- audio_features = self.feature_extractor.extract_features(processed_audio)
1187
- tracker.next_step()
1188
-
1189
- # Step 4: Get transcript - Modified to handle 3-argument progress callback
1190
- tracker.update(0.6, "Processing transcript")
1191
- if transcript_file:
1192
- transcript = transcript_file.getvalue().decode('utf-8')
1193
- else:
1194
- # Update progress callback to handle 3 arguments
1195
- tracker.update(0.6, "Transcribing audio")
1196
- transcript = self._transcribe_audio(
1197
- processed_audio,
1198
- lambda p, m, extra=None: tracker.update(0.6 + p * 0.2, m)
1199
- )
1200
- tracker.next_step()
1201
-
1202
- # Step 5: Analyze content
1203
- tracker.update(0.8, "Analyzing teaching content")
1204
- content_analysis = self.content_analyzer.analyze_content(transcript)
1205
-
1206
- # Step 6: Generate recommendations
1207
- tracker.update(0.9, "Generating recommendations")
1208
- recommendations = self.recommendation_generator.generate_recommendations(
1209
- audio_features,
1210
- content_analysis
1211
- )
1212
- tracker.next_step()
1213
-
1214
- # Add speech metrics evaluation
1215
- speech_metrics = self._evaluate_speech_metrics(transcript, audio_features)
1216
-
1217
- # Clear progress indicators
1218
- status.empty()
1219
- progress.empty()
1220
-
1221
- return {
1222
- "audio_features": audio_features,
1223
- "transcript": transcript,
1224
- "teaching": content_analysis,
1225
- "recommendations": recommendations,
1226
- "speech_metrics": speech_metrics
1227
- }
1228
-
1229
- finally:
1230
- # Clean up any remaining temporary files
1231
- for temp_file in temp_files:
1232
- try:
1233
- if os.path.exists(temp_file):
1234
- os.remove(temp_file)
1235
- except Exception as e:
1236
- logger.warning(f"Failed to remove temporary file {temp_file}: {e}")
1237
-
1238
- except Exception as e:
1239
- logger.error(f"Error in video evaluation: {e}")
1240
- # Clean up UI elements on error
1241
- if 'status' in locals():
1242
- status.empty()
1243
- if 'progress' in locals():
1244
- progress.empty()
1245
- raise RuntimeError(f"Analysis failed: {str(e)}")
1246
-
1247
- def _transcribe_audio(self, audio_path: str, progress_callback=None) -> str:
1248
- """Transcribe audio with optimized segment detection and detailed progress tracking"""
1249
- try:
1250
- if progress_callback:
1251
- progress_callback(0.1, "Loading transcription model...")
1252
-
1253
- # Check if GPU is available and set device accordingly
1254
- device = "cuda" if torch.cuda.is_available() else "cpu"
1255
- compute_type = "float16" if device == "cuda" else "int8"
1256
-
1257
- # Generate cache key based on file content
1258
- cache_key = f"transcript_{hashlib.md5(open(audio_path, 'rb').read()).hexdigest()}"
1259
-
1260
- # Check cache first
1261
- if cache_key in st.session_state:
1262
- logger.info("Using cached transcription")
1263
- if progress_callback:
1264
- progress_callback(1.0, "Retrieved from cache")
1265
- return st.session_state[cache_key]
1266
-
1267
- # Add validation for audio file
1268
- if not os.path.exists(audio_path):
1269
- raise FileNotFoundError(f"Audio file not found: {audio_path}")
1270
-
1271
- try:
1272
- audio_info = sf.info(audio_path)
1273
- if audio_info.samplerate != 16000:
1274
- logger.warning(f"Audio sample rate is {audio_info.samplerate}Hz, expected 16000Hz")
1275
- except Exception as e:
1276
- logger.error(f"Error checking audio file: {e}")
1277
- raise ValueError(f"Invalid audio file: {str(e)}")
1278
-
1279
- if progress_callback:
1280
- progress_callback(0.2, "Initializing model...")
1281
-
1282
- # Initialize model with optimized settings and proper error handling
1283
- try:
1284
- model = WhisperModel(
1285
- "medium",
1286
- device=device,
1287
- compute_type=compute_type,
1288
- download_root=self.model_cache_dir,
1289
- local_files_only=False,
1290
- cpu_threads=4,
1291
- num_workers=2
1292
- )
1293
- except Exception as e:
1294
- logger.error(f"Error initializing Whisper model: {e}")
1295
- raise RuntimeError(f"Failed to initialize transcription model: {str(e)}")
1296
-
1297
- if progress_callback:
1298
- progress_callback(0.3, "Starting transcription...")
1299
-
1300
- # Get audio duration for progress calculation
1301
- total_duration = audio_info.duration
1302
-
1303
- # Transcribe with optimized VAD settings and error handling
1304
- try:
1305
- segments, _ = model.transcribe(
1306
- audio_path,
1307
- beam_size=5,
1308
- word_timestamps=True,
1309
- vad_filter=True,
1310
- vad_parameters=dict(
1311
- min_silence_duration_ms=500,
1312
- speech_pad_ms=100,
1313
- threshold=0.3,
1314
- min_speech_duration_ms=250
1315
- ),
1316
- language='en'
1317
- )
1318
- except Exception as e:
1319
- logger.error(f"Error during transcription: {e}")
1320
- raise RuntimeError(f"Transcription failed: {str(e)}")
1321
-
1322
- # Process segments with better error handling and validation
1323
- transcript_parts = []
1324
- segments = list(segments) # Convert generator to list
1325
- total_segments = len(segments)
1326
- batch_size = 10
1327
-
1328
- if total_segments == 0:
1329
- logger.warning("No speech segments detected")
1330
- raise ValueError("No speech detected in audio file")
1331
-
1332
- for i, segment in enumerate(segments, 1):
1333
- if segment.text: # Only add non-empty segments
1334
- # Validate segment text
1335
- cleaned_text = segment.text.strip()
1336
- if cleaned_text:
1337
- transcript_parts.append(cleaned_text)
1338
-
1339
- # Update progress less frequently for better performance
1340
- if i % 5 == 0 or i == total_segments:
1341
- progress = min(i / total_segments, 1.0)
1342
- progress = 0.3 + (progress * 0.6)
1343
-
1344
- current_batch = (i - 1) // batch_size + 1
1345
- total_batches = (total_segments + batch_size - 1) // batch_size
1346
-
1347
- if progress_callback:
1348
- progress_callback(
1349
- progress,
1350
- f"Transcribing Batch {current_batch}/{total_batches}",
1351
- f"Processing segment {i} of {total_segments}"
1352
- )
1353
-
1354
- # Validate final transcript
1355
- transcript = ' '.join(transcript_parts)
1356
- if not transcript.strip():
1357
- raise ValueError("Transcription produced empty result")
1358
-
1359
- # Cache the result
1360
- st.session_state[cache_key] = transcript
1361
-
1362
- if progress_callback:
1363
- progress_callback(1.0, "Transcription complete!")
1364
-
1365
- return transcript
1366
-
1367
- except Exception as e:
1368
- logger.error(f"Error in transcription: {e}")
1369
- if progress_callback:
1370
- progress_callback(1.0, "Error in transcription", str(e))
1371
- raise
1372
-
1373
- def _merge_transcripts(self, transcripts: List[str]) -> str:
1374
- """Merge transcripts with overlap deduplication"""
1375
- if not transcripts:
1376
- return ""
1377
-
1378
- def clean_text(text):
1379
- # Remove extra spaces and normalize punctuation
1380
- return ' '.join(text.split())
1381
-
1382
- def find_overlap(text1, text2):
1383
- # Find overlapping text between consecutive chunks
1384
- words1 = text1.split()
1385
- words2 = text2.split()
1386
-
1387
- for i in range(min(len(words1), 20), 0, -1): # Check up to 20 words
1388
- if ' '.join(words1[-i:]) == ' '.join(words2[:i]):
1389
- return i
1390
- return 0
1391
-
1392
- merged = clean_text(transcripts[0])
1393
-
1394
- for i in range(1, len(transcripts)):
1395
- current = clean_text(transcripts[i])
1396
- overlap_size = find_overlap(merged, current)
1397
- merged += ' ' + current.split(' ', overlap_size)[-1]
1398
-
1399
- return merged
1400
-
1401
- def calculate_speech_metrics(self, transcript: str, audio_duration: float) -> Dict[str, float]:
1402
- """Calculate words per minute and other speech metrics."""
1403
- words = len(transcript.split())
1404
- minutes = audio_duration / 60
1405
- return {
1406
- 'words_per_minute': words / minutes if minutes > 0 else 0,
1407
- 'total_words': words,
1408
- 'duration_minutes': minutes
1409
- }
1410
-
1411
- def _evaluate_speech_metrics(self, transcript: str, audio_features: Dict[str, float],
1412
- progress_callback=None) -> Dict[str, Any]:
1413
- """Evaluate speech metrics with improved accuracy"""
1414
- try:
1415
- if progress_callback:
1416
- progress_callback(0.2, "Calculating speech metrics...")
1417
-
1418
- # Calculate words and duration
1419
- words = len(transcript.split())
1420
- duration_minutes = float(audio_features.get('duration', 0)) / 60
1421
-
1422
- # Calculate words per minute with updated range (130-160 WPM is ideal for teaching)
1423
- words_per_minute = float(words / duration_minutes if duration_minutes > 0 else 0)
1424
-
1425
- # Improved filler word detection (2-3 per minute is acceptable)
1426
- filler_words = re.findall(r'\b(um|uh|like|you\s+know|basically|actually|literally)\b',
1427
- transcript.lower())
1428
- fillers_count = len(filler_words)
1429
- fillers_per_minute = float(fillers_count / duration_minutes if duration_minutes > 0 else 0)
1430
-
1431
- # Improved error detection (1-2 per minute is acceptable)
1432
- repeated_words = len(re.findall(r'\b(\w+)\s+\1\b', transcript.lower()))
1433
- incomplete_sentences = len(re.findall(r'[a-zA-Z]+\s*\.\.\.|\b[a-zA-Z]+\s*-\s+', transcript))
1434
- errors_count = repeated_words + incomplete_sentences
1435
- errors_per_minute = float(errors_count / duration_minutes if duration_minutes > 0 else 0)
1436
-
1437
- # Set default thresholds if analysis fails
1438
- max_errors = 1.0
1439
- max_fillers = 3.0
1440
- threshold_explanation = "Using standard thresholds"
1441
- grammatical_errors = []
1442
-
1443
- # Calculate fluency score based on both errors and fillers
1444
- fluency_score = 1 if (errors_per_minute <= max_errors and fillers_per_minute <= max_fillers) else 0
1445
-
1446
- return {
1447
- "speed": {
1448
- "score": 1 if 120 <= words_per_minute <= 180 else 0,
1449
- "wpm": words_per_minute,
1450
- "total_words": words,
1451
- "duration_minutes": duration_minutes
1452
- },
1453
- "fluency": {
1454
- "score": fluency_score, # Add explicit fluency score
1455
- "errorsPerMin": errors_per_minute,
1456
- "fillersPerMin": fillers_per_minute,
1457
- "maxErrorsThreshold": max_errors,
1458
- "maxFillersThreshold": max_fillers,
1459
- "thresholdExplanation": threshold_explanation,
1460
- "detectedErrors": [
1461
- {
1462
- "type": "Grammar",
1463
- "context": error,
1464
- } for error in grammatical_errors
1465
- ],
1466
- "detectedFillers": filler_words
1467
- },
1468
- "flow": {
1469
- "score": 1 if audio_features.get("pauses_per_minute", 0) <= 12 else 0,
1470
- "pausesPerMin": audio_features.get("pauses_per_minute", 0)
1471
- },
1472
- "intonation": {
1473
- "pitch": audio_features.get("pitch_mean", 0),
1474
- "pitchScore": 1 if 20 <= (audio_features.get("pitch_std", 0) / audio_features.get("pitch_mean", 0) * 100 if audio_features.get("pitch_mean", 0) > 0 else 0) <= 40 else 0,
1475
- "pitchVariation": audio_features.get("pitch_std", 0),
1476
- "patternScore": 1 if audio_features.get("variations_per_minute", 0) >= 120 else 0,
1477
- "risingPatterns": audio_features.get("rising_patterns", 0),
1478
- "fallingPatterns": audio_features.get("falling_patterns", 0),
1479
- "variationsPerMin": audio_features.get("variations_per_minute", 0),
1480
- "mu": audio_features.get("pitch_mean", 0)
1481
- },
1482
- "energy": {
1483
- "score": 1 if 60 <= audio_features.get("mean_amplitude", 0) <= 75 else 0,
1484
- "meanAmplitude": audio_features.get("mean_amplitude", 0),
1485
- "amplitudeDeviation": audio_features.get("amplitude_deviation", 0),
1486
- "variationScore": 1 if 0.05 <= audio_features.get("amplitude_deviation", 0) <= 0.15 else 0
1487
- }
1488
- }
1489
-
1490
- except Exception as e:
1491
- logger.error(f"Error in speech metrics evaluation: {e}")
1492
- raise
1493
-
1494
  def validate_video_file(file_path: str):
1495
  """Validate video file before processing"""
1496
  MAX_SIZE = 1024 * 1024 * 1024 # 500MB limit
 
662
 
663
  def _evaluate_speech_metrics(self, transcript: str, audio_features: Dict[str, float],
664
  progress_callback=None) -> Dict[str, Any]:
665
+ """Evaluate speech metrics with improved accuracy and AI-powered error detection"""
666
  try:
667
  if progress_callback:
668
  progress_callback(0.2, "Calculating speech metrics...")
 
670
  # Calculate words and duration
671
  words = len(transcript.split())
672
  duration_minutes = float(audio_features.get('duration', 0)) / 60
673
+ words_per_minute = float(words / duration_minutes if duration_minutes > 0 else 0)
674
 
675
+ # Use OpenAI to analyze filler words and speech errors
676
+ analysis_prompt = f"""Analyze this teaching transcript for filler words and speech errors.
677
+ Identify:
678
+ 1. Filler words (um, uh, like, you know, etc.)
679
+ 2. Speech errors (stutters, repeated words, incomplete sentences)
680
+ 3. Grammar errors
681
+
682
+ Format response as JSON:
683
+ {{
684
+ "filler_words": [
685
+ {{"word": "word", "count": number, "timestamps": ["MM:SS"]}}
686
+ ],
687
+ "speech_errors": [
688
+ {{"type": "error_type", "context": "error in context", "timestamps": ["MM:SS"]}}
689
+ ],
690
+ "grammar_errors": [
691
+ {{"type": "error_type", "context": "error in context", "timestamps": ["MM:SS"]}}
692
+ ]
693
+ }}
694
+
695
+ Transcript:
696
+ {transcript}
697
+ """
 
 
 
698
 
699
+ try:
700
+ response = self.content_analyzer.client.chat.completions.create(
701
+ model="gpt-4o-mini",
702
+ messages=[
703
+ {"role": "system", "content": "You are a speech analysis expert focusing on identifying speech patterns and errors."},
704
+ {"role": "user", "content": analysis_prompt}
705
+ ],
706
+ response_format={"type": "json_object"},
707
+ temperature=0.3
708
+ )
709
+
710
+ analysis = json.loads(response.choices[0].message.content)
711
+
712
+ # Calculate metrics from AI analysis
713
+ filler_words = analysis.get("filler_words", [])
714
+ speech_errors = analysis.get("speech_errors", [])
715
+ grammar_errors = analysis.get("grammar_errors", [])
716
+
717
+ total_fillers = sum(fw["count"] for fw in filler_words)
718
+ fillers_per_minute = float(total_fillers / duration_minutes if duration_minutes > 0 else 0)
719
+
720
+ total_errors = len(speech_errors) + len(grammar_errors)
721
+ errors_per_minute = float(total_errors / duration_minutes if duration_minutes > 0 else 0)
722
+
723
+ # Set thresholds
724
  max_errors = 1.0
725
+ max_fillers = 3.0
726
+
727
+ # Calculate fluency score
728
+ fluency_score = 1 if (errors_per_minute <= max_errors and fillers_per_minute <= max_fillers) else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
729
 
730
  return {
731
  "speed": {
 
735
  "duration_minutes": duration_minutes
736
  },
737
  "fluency": {
738
+ "score": fluency_score,
739
  "errorsPerMin": errors_per_minute,
740
+ "fillersPerMin": fillers_per_minute,
741
+ "maxErrorsThreshold": max_errors,
742
+ "maxFillersThreshold": max_fillers,
743
+ "detectedFillers": filler_words,
744
+ "detectedSpeechErrors": speech_errors,
745
+ "detectedGrammarErrors": grammar_errors
746
+ },
747
+ "flow": {
748
+ "score": 1 if audio_features.get("pauses_per_minute", 0) <= 12 else 0,
749
+ "pausesPerMin": audio_features.get("pauses_per_minute", 0)
750
+ },
751
+ "intonation": {
752
+ "pitch": audio_features.get("pitch_mean", 0),
753
+ "pitchScore": 1 if 20 <= (audio_features.get("pitch_std", 0) / audio_features.get("pitch_mean", 0) * 100 if audio_features.get("pitch_mean", 0) > 0 else 0) <= 40 else 0,
754
+ "pitchVariation": audio_features.get("pitch_std", 0),
755
+ "patternScore": 1 if audio_features.get("variations_per_minute", 0) >= 120 else 0,
756
+ "risingPatterns": audio_features.get("rising_patterns", 0),
757
+ "fallingPatterns": audio_features.get("falling_patterns", 0),
758
+ "variationsPerMin": audio_features.get("variations_per_minute", 0)
759
+ },
760
+ "energy": {
761
+ "score": 1 if 60 <= audio_features.get("mean_amplitude", 0) <= 75 else 0,
762
+ "meanAmplitude": audio_features.get("mean_amplitude", 0),
763
+ "amplitudeDeviation": audio_features.get("amplitude_deviation", 0),
764
+ "variationScore": 1 if 0.05 <= audio_features.get("amplitude_deviation", 0) <= 0.15 else 0
765
+ }
766
  }
767
+
768
+ except Exception as api_error:
769
+ logger.error(f"Error in AI analysis: {api_error}")
770
+ # Fall back to basic analysis if AI fails
771
+ return self._basic_speech_metrics(transcript, audio_features)
772
 
773
  except Exception as e:
774
  logger.error(f"Error in speech metrics evaluation: {e}")
775
  raise
776
 
777
+ def _basic_speech_metrics(self, transcript: str, audio_features: Dict[str, float]) -> Dict[str, Any]:
778
+ """Fallback method for basic speech metrics when AI analysis fails"""
779
+ # ... (keep the original regex-based analysis as fallback) ...
780
+
781
  def generate_suggestions(self, category: str, citations: List[str]) -> List[str]:
782
  """Generate contextual suggestions based on category and citations"""
783
  try:
 
796
  Format as a JSON array with a single string."""}
797
  ],
798
  response_format={"type": "json_object"},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
799
  def validate_video_file(file_path: str):
800
  """Validate video file before processing"""
801
  MAX_SIZE = 1024 * 1024 * 1024  # 1GB limit (NOTE: comment previously said 500MB — confirm the intended cap)