sparshmehta committed on
Commit
571e8a0
·
verified ·
1 Parent(s): 754e3eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +373 -291
app.py CHANGED
@@ -25,7 +25,6 @@ import sys
25
  import multiprocessing
26
  import concurrent.futures
27
  import hashlib
28
- import plotly.express as px
29
 
30
  # Set up logging
31
  logging.basicConfig(
@@ -292,12 +291,9 @@ class ContentAnalyzer:
292
  if progress_callback:
293
  progress_callback(0.2, "Preparing content analysis...")
294
 
295
- # Extract transcript text from transcript data structure
296
- transcript_text = transcript.get("transcript", "") if isinstance(transcript, dict) else transcript
297
-
298
  # Remove any truncation of transcript - pass full text to API
299
- prompt = self._create_analysis_prompt(transcript_text)
300
- logger.info(f"Sending full transcript of length: {len(transcript_text)} characters")
301
 
302
  if progress_callback:
303
  progress_callback(0.5, "Processing with AI model...")
@@ -535,40 +531,37 @@ Important:
535
 
536
  def _evaluate_speech_metrics(self, transcript: str, audio_features: Dict[str, float],
537
  progress_callback=None) -> Dict[str, Any]:
538
- """Evaluate speech metrics with stricter error thresholds"""
539
  try:
540
  if progress_callback:
541
  progress_callback(0.2, "Calculating speech metrics...")
542
 
543
- # Extract transcript text from transcript data structure
544
- transcript_text = transcript.get("transcript", "") if isinstance(transcript, dict) else transcript
545
-
546
  # Calculate words and duration
547
- words = len(transcript_text.split())
548
  duration_minutes = float(audio_features.get('duration', 0)) / 60
549
 
550
- # Calculate words per minute (130-160 WPM is ideal for teaching)
551
  words_per_minute = float(words / duration_minutes if duration_minutes > 0 else 0)
552
 
553
- # Stricter filler word detection (max 1-2 per minute is acceptable)
554
  filler_words = re.findall(r'\b(um|uh|like|you\s+know|basically|actually|literally)\b',
555
- transcript_text.lower())
556
  fillers_count = len(filler_words)
557
  fillers_per_minute = float(fillers_count / duration_minutes if duration_minutes > 0 else 0)
558
 
559
- # Stricter error detection (max 1 per minute is acceptable)
560
- repeated_words = len(re.findall(r'\b(\w+)\s+\1\b', transcript_text.lower()))
561
- incomplete_sentences = len(re.findall(r'[a-zA-Z]+\s*\.\.\.|\b[a-zA-Z]+\s*-\s+', transcript_text))
562
  errors_count = repeated_words + incomplete_sentences
563
  errors_per_minute = float(errors_count / duration_minutes if duration_minutes > 0 else 0)
564
 
565
- # Updated stricter thresholds
566
- max_errors = 1.0 # Reduced from 2.0
567
- max_fillers = 2.0 # Reduced from 3.0
568
- threshold_explanation = "Using strict thresholds for professional teaching"
569
  grammatical_errors = []
570
 
571
- # Calculate fluency score based on stricter thresholds
572
  fluency_score = 1 if (errors_per_minute <= max_errors and fillers_per_minute <= max_fillers) else 0
573
 
574
  return {
@@ -579,7 +572,7 @@ Important:
579
  "duration_minutes": duration_minutes
580
  },
581
  "fluency": {
582
- "score": fluency_score,
583
  "errorsPerMin": errors_per_minute,
584
  "fillersPerMin": fillers_per_minute,
585
  "maxErrorsThreshold": max_errors,
@@ -1069,8 +1062,8 @@ class MentorEvaluator:
1069
  progress.empty()
1070
  raise RuntimeError(f"Analysis failed: {str(e)}")
1071
 
1072
- def _transcribe_audio(self, audio_path: str, progress_callback=None) -> Dict[str, Any]:
1073
- """Transcribe audio with accent detection and detailed progress tracking"""
1074
  try:
1075
  if progress_callback:
1076
  progress_callback(0.1, "Loading transcription model...")
@@ -1107,7 +1100,7 @@ class MentorEvaluator:
1107
  # Initialize model with optimized settings and proper error handling
1108
  try:
1109
  model = WhisperModel(
1110
- "small", # Using larger model for better accent detection
1111
  device=device,
1112
  compute_type=compute_type,
1113
  download_root=self.model_cache_dir,
@@ -1120,14 +1113,14 @@ class MentorEvaluator:
1120
  raise RuntimeError(f"Failed to initialize transcription model: {str(e)}")
1121
 
1122
  if progress_callback:
1123
- progress_callback(0.3, "Starting transcription and accent analysis...")
1124
 
1125
  # Get audio duration for progress calculation
1126
  total_duration = audio_info.duration
1127
 
1128
- # Transcribe with accent detection
1129
  try:
1130
- segments, info = model.transcribe(
1131
  audio_path,
1132
  beam_size=5,
1133
  word_timestamps=True,
@@ -1138,14 +1131,8 @@ class MentorEvaluator:
1138
  threshold=0.3,
1139
  min_speech_duration_ms=250
1140
  ),
1141
- language='en',
1142
- task='transcribe'
1143
  )
1144
-
1145
- # Perform accent classification
1146
- accent_probs = self._classify_accent(model, audio_path)
1147
- detected_accent = max(accent_probs.items(), key=lambda x: x[1])
1148
-
1149
  except Exception as e:
1150
  logger.error(f"Error during transcription: {e}")
1151
  raise RuntimeError(f"Transcription failed: {str(e)}")
@@ -1187,23 +1174,13 @@ class MentorEvaluator:
1187
  if not transcript.strip():
1188
  raise ValueError("Transcription produced empty result")
1189
 
1190
- # Prepare result with accent information
1191
- result = {
1192
- "transcript": transcript,
1193
- "accent_analysis": {
1194
- "detected_accent": detected_accent[0],
1195
- "confidence": detected_accent[1],
1196
- "accent_probabilities": accent_probs
1197
- }
1198
- }
1199
-
1200
  # Cache the result
1201
- st.session_state[cache_key] = result
1202
 
1203
  if progress_callback:
1204
- progress_callback(1.0, "Transcription and accent analysis complete!")
1205
 
1206
- return result
1207
 
1208
  except Exception as e:
1209
  logger.error(f"Error in transcription: {e}")
@@ -1211,152 +1188,6 @@ class MentorEvaluator:
1211
  progress_callback(1.0, "Error in transcription", str(e))
1212
  raise
1213
 
1214
- def _classify_accent(self, model: WhisperModel, audio_path: str) -> Dict[str, float]:
1215
- """Classify accent using both acoustic features and Whisper ASR confidence"""
1216
- try:
1217
- # Load audio once and limit duration for faster processing
1218
- audio, sr = librosa.load(audio_path, sr=16000, duration=30)
1219
-
1220
- # 1. Get Whisper ASR confidence scores for different English accents
1221
- whisper_scores = {}
1222
- accent_langs = {
1223
- "Indian": "en",
1224
- "American": "en",
1225
- "British": "en-GB",
1226
- "Australian": "en-AU"
1227
- }
1228
-
1229
- # Run Whisper inference once with language detection
1230
- segments, info = model.transcribe(
1231
- audio_path,
1232
- language=None, # Let Whisper detect language
1233
- beam_size=3,
1234
- word_timestamps=False, # Disable for speed
1235
- condition_on_previous_text=False,
1236
- vad_filter=True,
1237
- vad_parameters=dict(
1238
- min_silence_duration_ms=500,
1239
- threshold=0.3
1240
- )
1241
- )
1242
-
1243
- # Get language confidence from Whisper
1244
- detected_language = info.language
1245
- language_probability = info.language_probability
1246
-
1247
- # Assign scores based on detected language
1248
- for accent, lang_code in accent_langs.items():
1249
- if lang_code == detected_language:
1250
- whisper_scores[accent] = float(language_probability)
1251
- else:
1252
- # Give other English variants a smaller baseline score
1253
- whisper_scores[accent] = float(language_probability * 0.5)
1254
-
1255
- # 2. Extract key acoustic features
1256
- features = {}
1257
-
1258
- # Pitch features using faster implementation
1259
- hop_length = 512
1260
- f0, voiced_flag, _ = librosa.pyin(
1261
- audio,
1262
- sr=sr,
1263
- fmin=70,
1264
- fmax=400,
1265
- frame_length=2048,
1266
- hop_length=hop_length,
1267
- fill_na=0.0
1268
- )
1269
-
1270
- valid_f0 = f0[voiced_flag == 1]
1271
- features['pitch_mean'] = float(np.mean(valid_f0)) if len(valid_f0) > 0 else 0.0
1272
- features['pitch_std'] = float(np.std(valid_f0)) if len(valid_f0) > 0 else 0.0
1273
-
1274
- # Rhythm features using window='hann' instead of hamming
1275
- onset_env = librosa.onset.onset_strength(
1276
- y=audio,
1277
- sr=sr,
1278
- hop_length=hop_length,
1279
- window='hann' # Changed from 'hamming' to 'hann'
1280
- )
1281
- tempo, _ = librosa.beat.beat_track(
1282
- onset_envelope=onset_env,
1283
- sr=sr,
1284
- hop_length=hop_length
1285
- )
1286
- features['rhythm_regularity'] = float(tempo)
1287
-
1288
- # Efficient spectral feature extraction with hann window
1289
- mfcc = librosa.feature.mfcc(
1290
- y=audio,
1291
- sr=sr,
1292
- n_mfcc=13,
1293
- hop_length=hop_length,
1294
- window='hann' # Changed from 'hamming' to 'hann'
1295
- )
1296
- features['spectral_variance'] = float(np.mean(np.std(mfcc, axis=1)))
1297
-
1298
- # 3. Combine acoustic and ASR features for classification
1299
- accent_scores = {}
1300
-
1301
- for accent in accent_langs.keys():
1302
- score = 0.0
1303
-
1304
- # Weight from Whisper ASR confidence (40%)
1305
- score += 0.4 * whisper_scores.get(accent, 0.0)
1306
-
1307
- # Weight from acoustic features (60%)
1308
- pitch_range = features['pitch_std'] / features['pitch_mean'] if features['pitch_mean'] > 0 else 0
1309
-
1310
- if accent == "Indian":
1311
- score += 0.6 * (
1312
- 0.3 * (0.1 <= pitch_range <= 0.2) +
1313
- 0.3 * (features['rhythm_regularity'] > 120) +
1314
- 0.4 * (features['spectral_variance'] > 2.0)
1315
- )
1316
-
1317
- elif accent == "American":
1318
- score += 0.6 * (
1319
- 0.3 * (0.15 <= pitch_range <= 0.25) +
1320
- 0.3 * (90 <= features['rhythm_regularity'] <= 110) +
1321
- 0.4 * (1.5 <= features['spectral_variance'] <= 2.0)
1322
- )
1323
-
1324
- elif accent == "British":
1325
- score += 0.6 * (
1326
- 0.3 * (0.12 <= pitch_range <= 0.22) +
1327
- 0.3 * (100 <= features['rhythm_regularity'] <= 120) +
1328
- 0.4 * (1.8 <= features['spectral_variance'] <= 2.2)
1329
- )
1330
-
1331
- elif accent == "Australian":
1332
- score += 0.6 * (
1333
- 0.3 * (0.14 <= pitch_range <= 0.24) +
1334
- 0.3 * (95 <= features['rhythm_regularity'] <= 115) +
1335
- 0.4 * (1.7 <= features['spectral_variance'] <= 2.1)
1336
- )
1337
-
1338
- accent_scores[accent] = float(score)
1339
-
1340
- # Normalize scores
1341
- total_score = sum(accent_scores.values())
1342
- if total_score > 0:
1343
- accent_scores = {k: v/total_score for k, v in accent_scores.items()}
1344
-
1345
- # Apply confidence threshold
1346
- threshold = 0.25
1347
- accent_scores = {k: v for k, v in accent_scores.items() if v > threshold}
1348
-
1349
- if not accent_scores:
1350
- # Return highest scoring accent if none pass threshold
1351
- max_score = max(accent_scores.values())
1352
- accent_scores = {k: v for k, v in accent_scores.items() if v == max_score}
1353
-
1354
- return accent_scores
1355
-
1356
- except Exception as e:
1357
- logger.error(f"Error in hybrid accent classification: {e}")
1358
- raise RuntimeError(f"Accent classification failed: {str(e)}")
1359
-
1360
  def _merge_transcripts(self, transcripts: List[str]) -> str:
1361
  """Merge transcripts with overlap deduplication"""
1362
  if not transcripts:
@@ -1397,40 +1228,37 @@ class MentorEvaluator:
1397
 
1398
  def _evaluate_speech_metrics(self, transcript: str, audio_features: Dict[str, float],
1399
  progress_callback=None) -> Dict[str, Any]:
1400
- """Evaluate speech metrics with stricter error thresholds"""
1401
  try:
1402
  if progress_callback:
1403
  progress_callback(0.2, "Calculating speech metrics...")
1404
 
1405
- # Extract transcript text from transcript data structure
1406
- transcript_text = transcript.get("transcript", "") if isinstance(transcript, dict) else transcript
1407
-
1408
  # Calculate words and duration
1409
- words = len(transcript_text.split())
1410
  duration_minutes = float(audio_features.get('duration', 0)) / 60
1411
 
1412
- # Calculate words per minute (130-160 WPM is ideal for teaching)
1413
  words_per_minute = float(words / duration_minutes if duration_minutes > 0 else 0)
1414
 
1415
- # Stricter filler word detection (max 1-2 per minute is acceptable)
1416
  filler_words = re.findall(r'\b(um|uh|like|you\s+know|basically|actually|literally)\b',
1417
- transcript_text.lower())
1418
  fillers_count = len(filler_words)
1419
  fillers_per_minute = float(fillers_count / duration_minutes if duration_minutes > 0 else 0)
1420
 
1421
- # Stricter error detection (max 1 per minute is acceptable)
1422
- repeated_words = len(re.findall(r'\b(\w+)\s+\1\b', transcript_text.lower()))
1423
- incomplete_sentences = len(re.findall(r'[a-zA-Z]+\s*\.\.\.|\b[a-zA-Z]+\s*-\s+', transcript_text))
1424
  errors_count = repeated_words + incomplete_sentences
1425
  errors_per_minute = float(errors_count / duration_minutes if duration_minutes > 0 else 0)
1426
 
1427
- # Updated stricter thresholds
1428
- max_errors = 1.0 # Reduced from 2.0
1429
- max_fillers = 2.0 # Reduced from 3.0
1430
- threshold_explanation = "Using strict thresholds for professional teaching"
1431
  grammatical_errors = []
1432
 
1433
- # Calculate fluency score based on stricter thresholds
1434
  fluency_score = 1 if (errors_per_minute <= max_errors and fillers_per_minute <= max_fillers) else 0
1435
 
1436
  return {
@@ -1441,7 +1269,7 @@ class MentorEvaluator:
1441
  "duration_minutes": duration_minutes
1442
  },
1443
  "fluency": {
1444
- "score": fluency_score,
1445
  "errorsPerMin": errors_per_minute,
1446
  "fillersPerMin": fillers_per_minute,
1447
  "maxErrorsThreshold": max_errors,
@@ -1515,79 +1343,15 @@ def display_evaluation(evaluation: Dict[str, Any]):
1515
  with tabs[0]:
1516
  st.header("Communication Metrics")
1517
 
1518
- # Add Accent Analysis section
1519
- with st.expander("🗣️ Accent Analysis", expanded=True):
1520
- # Safely handle transcript data structure
1521
- transcript_data = evaluation.get("transcript", {})
1522
- if isinstance(transcript_data, (str, bytes)):
1523
- # Handle case where transcript is direct text
1524
- accent_data = {}
1525
- transcript_text = str(transcript_data)
1526
- elif isinstance(transcript_data, dict):
1527
- # Handle case where transcript is a dictionary
1528
- accent_data = transcript_data.get("accent_analysis", {})
1529
- transcript_text = transcript_data.get("transcript", "")
1530
- else:
1531
- # Handle unexpected data type
1532
- accent_data = {}
1533
- transcript_text = ""
1534
- logger.warning(f"Unexpected transcript data type: {type(transcript_data)}")
1535
-
1536
- if accent_data:
1537
- col1, col2 = st.columns(2)
1538
- with col1:
1539
- detected_accent = accent_data.get("detected_accent", "Unknown")
1540
- confidence = accent_data.get("confidence", 0.0)
1541
-
1542
- st.metric("Detected Accent", str(detected_accent))
1543
- st.metric("Confidence", f"{float(confidence)*100:.1f}%")
1544
-
1545
- with col2:
1546
- # Display accent probability distribution
1547
- accent_probs = accent_data.get("accent_probabilities", {})
1548
- if accent_probs:
1549
- # Ensure all values are properly converted to float
1550
- df = pd.DataFrame(
1551
- [(k, float(v)) for k, v in accent_probs.items()],
1552
- columns=['Accent', 'Probability']
1553
- ).sort_values('Probability', ascending=False)
1554
-
1555
- fig = px.bar(
1556
- df,
1557
- x='Accent',
1558
- y='Probability',
1559
- title='Accent Probability Distribution'
1560
- )
1561
- fig.update_layout(
1562
- xaxis_tickangle=-45,
1563
- showlegend=False
1564
- )
1565
- st.plotly_chart(fig, use_container_width=True)
1566
-
1567
- # Add explanation card
1568
- st.markdown("""
1569
- <div class="metric-explanation-card">
1570
- <h4>🌍 Understanding Accent Analysis</h4>
1571
- <ul>
1572
- <li><strong>Detected Accent:</strong> The most probable accent based on speech patterns</li>
1573
- <li><strong>Confidence:</strong> Model's confidence in the accent classification</li>
1574
- <li><strong>Distribution:</strong> Probability scores across different accent possibilities</li>
1575
- </ul>
1576
- </div>
1577
- """, unsafe_allow_html=True)
1578
- else:
1579
- st.warning("Accent analysis data not available")
1580
-
1581
  # Get audio features and ensure we have the required metrics
1582
  audio_features = evaluation.get("audio_features", {})
1583
 
1584
- # Ensure all metrics are properly converted to appropriate types
1585
- speech_metrics = evaluation.get("speech_metrics", {})
1586
-
1587
  # Speed Metrics
1588
  with st.expander("🏃 Speed", expanded=True):
 
 
1589
  speed_data = speech_metrics.get("speed", {})
1590
- words_per_minute = float(speed_data.get("wpm", 0))
1591
 
1592
  col1, col2 = st.columns(2)
1593
  with col1:
@@ -1608,14 +1372,14 @@ def display_evaluation(evaluation: Dict[str, Any]):
1608
 
1609
  col1, col2 = st.columns(2)
1610
  with col1:
1611
- st.metric("Score", "✅ Pass" if fillers_per_minute <= 2 and errors_per_minute <= 1 else "❌ Needs Improvement")
1612
  st.metric("Fillers per Minute", f"{fillers_per_minute:.1f}")
1613
  st.metric("Errors per Minute", f"{errors_per_minute:.1f}")
1614
  with col2:
1615
  st.info("""
1616
  **Acceptable Ranges:**
1617
- - Fillers per Minute: <2
1618
- - Errors per Minute: <1
1619
  """)
1620
 
1621
  # Flow Metrics
@@ -1977,18 +1741,336 @@ def display_evaluation(evaluation: Dict[str, Any]):
1977
  """, unsafe_allow_html=True)
1978
 
1979
  with tabs[3]:
1980
- st.header("Transcript")
1981
- # Safely display transcript
1982
- if isinstance(transcript_text, (str, bytes)):
1983
- st.text_area("Full Transcript", str(transcript_text), height=300)
1984
- else:
1985
- st.warning("Transcript data not available in expected format")
 
 
 
 
 
 
 
 
 
 
1986
 
1987
  except Exception as e:
1988
- logger.error(f"Error displaying evaluation results: {e}")
1989
  st.error(f"Error displaying results: {str(e)}")
1990
- # Log the evaluation data structure for debugging
1991
- logger.debug(f"Evaluation data structure: {evaluation}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1992
 
1993
  def check_dependencies() -> List[str]:
1994
  """Check if required dependencies are installed"""
 
25
  import multiprocessing
26
  import concurrent.futures
27
  import hashlib
 
28
 
29
  # Set up logging
30
  logging.basicConfig(
 
291
  if progress_callback:
292
  progress_callback(0.2, "Preparing content analysis...")
293
 
 
 
 
294
  # Remove any truncation of transcript - pass full text to API
295
+ prompt = self._create_analysis_prompt(transcript)
296
+ logger.info(f"Sending full transcript of length: {len(transcript)} characters")
297
 
298
  if progress_callback:
299
  progress_callback(0.5, "Processing with AI model...")
 
531
 
532
  def _evaluate_speech_metrics(self, transcript: str, audio_features: Dict[str, float],
533
  progress_callback=None) -> Dict[str, Any]:
534
+ """Evaluate speech metrics with improved accuracy"""
535
  try:
536
  if progress_callback:
537
  progress_callback(0.2, "Calculating speech metrics...")
538
 
 
 
 
539
  # Calculate words and duration
540
+ words = len(transcript.split())
541
  duration_minutes = float(audio_features.get('duration', 0)) / 60
542
 
543
+ # Calculate words per minute with updated range (130-160 WPM is ideal for teaching)
544
  words_per_minute = float(words / duration_minutes if duration_minutes > 0 else 0)
545
 
546
+ # Improved filler word detection (2-3 per minute is acceptable)
547
  filler_words = re.findall(r'\b(um|uh|like|you\s+know|basically|actually|literally)\b',
548
+ transcript.lower())
549
  fillers_count = len(filler_words)
550
  fillers_per_minute = float(fillers_count / duration_minutes if duration_minutes > 0 else 0)
551
 
552
+ # Improved error detection (1-2 per minute is acceptable)
553
+ repeated_words = len(re.findall(r'\b(\w+)\s+\1\b', transcript.lower()))
554
+ incomplete_sentences = len(re.findall(r'[a-zA-Z]+\s*\.\.\.|\b[a-zA-Z]+\s*-\s+', transcript))
555
  errors_count = repeated_words + incomplete_sentences
556
  errors_per_minute = float(errors_count / duration_minutes if duration_minutes > 0 else 0)
557
 
558
+ # Thresholds (per minute) for the fluency pass/fail check; set unconditionally
559
+ max_errors = 2.0
560
+ max_fillers = 3.0
561
+ threshold_explanation = "Using standard thresholds"
562
  grammatical_errors = []
563
 
564
+ # Calculate fluency score based on both errors and fillers
565
  fluency_score = 1 if (errors_per_minute <= max_errors and fillers_per_minute <= max_fillers) else 0
566
 
567
  return {
 
572
  "duration_minutes": duration_minutes
573
  },
574
  "fluency": {
575
+ "score": fluency_score, # Add explicit fluency score
576
  "errorsPerMin": errors_per_minute,
577
  "fillersPerMin": fillers_per_minute,
578
  "maxErrorsThreshold": max_errors,
 
1062
  progress.empty()
1063
  raise RuntimeError(f"Analysis failed: {str(e)}")
1064
 
1065
+ def _transcribe_audio(self, audio_path: str, progress_callback=None) -> str:
1066
+ """Transcribe audio with optimized segment detection and detailed progress tracking"""
1067
  try:
1068
  if progress_callback:
1069
  progress_callback(0.1, "Loading transcription model...")
 
1100
  # Initialize model with optimized settings and proper error handling
1101
  try:
1102
  model = WhisperModel(
1103
+ "small",
1104
  device=device,
1105
  compute_type=compute_type,
1106
  download_root=self.model_cache_dir,
 
1113
  raise RuntimeError(f"Failed to initialize transcription model: {str(e)}")
1114
 
1115
  if progress_callback:
1116
+ progress_callback(0.3, "Starting transcription...")
1117
 
1118
  # Get audio duration for progress calculation
1119
  total_duration = audio_info.duration
1120
 
1121
+ # Transcribe with optimized VAD settings and error handling
1122
  try:
1123
+ segments, _ = model.transcribe(
1124
  audio_path,
1125
  beam_size=5,
1126
  word_timestamps=True,
 
1131
  threshold=0.3,
1132
  min_speech_duration_ms=250
1133
  ),
1134
+ language='en'
 
1135
  )
 
 
 
 
 
1136
  except Exception as e:
1137
  logger.error(f"Error during transcription: {e}")
1138
  raise RuntimeError(f"Transcription failed: {str(e)}")
 
1174
  if not transcript.strip():
1175
  raise ValueError("Transcription produced empty result")
1176
 
 
 
 
 
 
 
 
 
 
 
1177
  # Cache the result
1178
+ st.session_state[cache_key] = transcript
1179
 
1180
  if progress_callback:
1181
+ progress_callback(1.0, "Transcription complete!")
1182
 
1183
+ return transcript
1184
 
1185
  except Exception as e:
1186
  logger.error(f"Error in transcription: {e}")
 
1188
  progress_callback(1.0, "Error in transcription", str(e))
1189
  raise
1190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1191
  def _merge_transcripts(self, transcripts: List[str]) -> str:
1192
  """Merge transcripts with overlap deduplication"""
1193
  if not transcripts:
 
1228
 
1229
  def _evaluate_speech_metrics(self, transcript: str, audio_features: Dict[str, float],
1230
  progress_callback=None) -> Dict[str, Any]:
1231
+ """Evaluate speech metrics with improved accuracy"""
1232
  try:
1233
  if progress_callback:
1234
  progress_callback(0.2, "Calculating speech metrics...")
1235
 
 
 
 
1236
  # Calculate words and duration
1237
+ words = len(transcript.split())
1238
  duration_minutes = float(audio_features.get('duration', 0)) / 60
1239
 
1240
+ # Calculate words per minute with updated range (130-160 WPM is ideal for teaching)
1241
  words_per_minute = float(words / duration_minutes if duration_minutes > 0 else 0)
1242
 
1243
+ # Improved filler word detection (2-3 per minute is acceptable)
1244
  filler_words = re.findall(r'\b(um|uh|like|you\s+know|basically|actually|literally)\b',
1245
+ transcript.lower())
1246
  fillers_count = len(filler_words)
1247
  fillers_per_minute = float(fillers_count / duration_minutes if duration_minutes > 0 else 0)
1248
 
1249
+ # Improved error detection (1-2 per minute is acceptable)
1250
+ repeated_words = len(re.findall(r'\b(\w+)\s+\1\b', transcript.lower()))
1251
+ incomplete_sentences = len(re.findall(r'[a-zA-Z]+\s*\.\.\.|\b[a-zA-Z]+\s*-\s+', transcript))
1252
  errors_count = repeated_words + incomplete_sentences
1253
  errors_per_minute = float(errors_count / duration_minutes if duration_minutes > 0 else 0)
1254
 
1255
+ # Thresholds (per minute) for the fluency pass/fail check; set unconditionally
1256
+ max_errors = 2.0
1257
+ max_fillers = 3.0
1258
+ threshold_explanation = "Using standard thresholds"
1259
  grammatical_errors = []
1260
 
1261
+ # Calculate fluency score based on both errors and fillers
1262
  fluency_score = 1 if (errors_per_minute <= max_errors and fillers_per_minute <= max_fillers) else 0
1263
 
1264
  return {
 
1269
  "duration_minutes": duration_minutes
1270
  },
1271
  "fluency": {
1272
+ "score": fluency_score, # Add explicit fluency score
1273
  "errorsPerMin": errors_per_minute,
1274
  "fillersPerMin": fillers_per_minute,
1275
  "maxErrorsThreshold": max_errors,
 
1343
  with tabs[0]:
1344
  st.header("Communication Metrics")
1345
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1346
  # Get audio features and ensure we have the required metrics
1347
  audio_features = evaluation.get("audio_features", {})
1348
 
 
 
 
1349
  # Speed Metrics
1350
  with st.expander("🏃 Speed", expanded=True):
1351
+ # Read the stored speech metrics; WPM is taken from them, not recalculated here
1352
+ speech_metrics = evaluation.get("speech_metrics", {})
1353
  speed_data = speech_metrics.get("speed", {})
1354
+ words_per_minute = speed_data.get("wpm", 0) # Get WPM from speech metrics
1355
 
1356
  col1, col2 = st.columns(2)
1357
  with col1:
 
1372
 
1373
  col1, col2 = st.columns(2)
1374
  with col1:
1375
+ st.metric("Score", "✅ Pass" if fillers_per_minute <= 3 and errors_per_minute <= 2 else "❌ Needs Improvement")
1376
  st.metric("Fillers per Minute", f"{fillers_per_minute:.1f}")
1377
  st.metric("Errors per Minute", f"{errors_per_minute:.1f}")
1378
  with col2:
1379
  st.info("""
1380
  **Acceptable Ranges:**
1381
+ - Fillers per Minute: <3
1382
+ - Errors per Minute: <2
1383
  """)
1384
 
1385
  # Flow Metrics
 
1741
  """, unsafe_allow_html=True)
1742
 
1743
  with tabs[3]:
1744
+ st.header("Transcript with Timestamps")
1745
+ transcript = evaluation.get("transcript", "")
1746
+
1747
+ # Split transcript into sentences and add timestamps
1748
+ sentences = re.split(r'(?<=[.!?])\s+', transcript)
1749
+ for i, sentence in enumerate(sentences):
1750
+ # Calculate approximate timestamp based on words and average speaking rate
1751
+ words_before = len(' '.join(sentences[:i]).split())
1752
+ timestamp = words_before / 150 # Assuming 150 words per minute
1753
+ minutes = int(timestamp)
1754
+ seconds = int((timestamp - minutes) * 60)
1755
+
1756
+ st.markdown(f"**[{minutes:02d}:{seconds:02d}]** {sentence}")
1757
+
1758
+ # Comment out original transcript display
1759
+ # st.text(evaluation.get("transcript", "Transcript not available"))
1760
 
1761
  except Exception as e:
1762
+ logger.error(f"Error displaying evaluation: {e}")
1763
  st.error(f"Error displaying results: {str(e)}")
1764
+ st.error("Please check the evaluation data structure and try again.")
1765
+
1766
+ # Add these styles to the existing CSS in the main function
1767
+ st.markdown("""
1768
+ <style>
1769
+ /* ... existing styles ... */
1770
+
1771
+ .citation-box {
1772
+ background-color: #f8f9fa;
1773
+ border-left: 3px solid #6c757d;
1774
+ padding: 10px;
1775
+ margin: 5px 0;
1776
+ border-radius: 0 4px 4px 0;
1777
+ }
1778
+
1779
+ .recommendation-card {
1780
+ background-color: #ffffff;
1781
+ border-left: 4px solid #1f77b4;
1782
+ padding: 15px;
1783
+ margin: 10px 0;
1784
+ border-radius: 4px;
1785
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
1786
+ }
1787
+
1788
+ .recommendation-card h4 {
1789
+ color: #1f77b4;
1790
+ margin: 0 0 10px 0;
1791
+ }
1792
+
1793
+ .rigor-card {
1794
+ background-color: #ffffff;
1795
+ border: 1px solid #e0e0e0;
1796
+ padding: 20px;
1797
+ margin: 10px 0;
1798
+ border-radius: 8px;
1799
+ box-shadow: 0 2px 4px rgba(0,0,0,0.05);
1800
+ }
1801
+
1802
+ .score-badge {
1803
+ display: inline-block;
1804
+ padding: 4px 12px;
1805
+ border-radius: 15px;
1806
+ font-weight: bold;
1807
+ margin: 10px 0;
1808
+ }
1809
+
1810
+ .green-score {
1811
+ background-color: #28a745;
1812
+ color: white;
1813
+ }
1814
+
1815
+ .orange-score {
1816
+ background-color: #fd7e14;
1817
+ color: white;
1818
+ }
1819
+
1820
+ .metric-container {
1821
+ background-color: #f8f9fa;
1822
+ padding: 15px;
1823
+ border-radius: 8px;
1824
+ margin: 10px 0;
1825
+ }
1826
+
1827
+ .profile-guide {
1828
+ background-color: #f8f9fa;
1829
+ padding: 15px;
1830
+ border-radius: 8px;
1831
+ margin-bottom: 20px;
1832
+ border-left: 4px solid #1f77b4;
1833
+ }
1834
+
1835
+ .profile-card {
1836
+ background-color: #ffffff;
1837
+ border: 1px solid #e0e0e0;
1838
+ border-radius: 8px;
1839
+ padding: 20px;
1840
+ margin: 10px 0;
1841
+ box-shadow: 0 2px 4px rgba(0,0,0,0.05);
1842
+ transition: all 0.3s ease;
1843
+ }
1844
+
1845
+ .profile-card.recommended {
1846
+ border-left: 4px solid #28a745;
1847
+ }
1848
+
1849
+ .profile-header {
1850
+ margin-bottom: 15px;
1851
+ }
1852
+
1853
+ .profile-badge {
1854
+ display: inline-block;
1855
+ padding: 4px 12px;
1856
+ border-radius: 15px;
1857
+ font-size: 0.9em;
1858
+ margin-top: 5px;
1859
+ background-color: #f8f9fa;
1860
+ }
1861
+
1862
+ .profile-content ul {
1863
+ margin: 10px 0;
1864
+ padding-left: 20px;
1865
+ }
1866
+
1867
+ .recommendation-status {
1868
+ margin-top: 15px;
1869
+ padding: 10px;
1870
+ border-radius: 4px;
1871
+ background-color: #f8f9fa;
1872
+ font-weight: bold;
1873
+ }
1874
+
1875
+ .recommendation-status small {
1876
+ display: block;
1877
+ margin-top: 5px;
1878
+ font-weight: normal;
1879
+ color: #666;
1880
+ }
1881
+
1882
+ .recommendation-status.recommended {
1883
+ background-color: #d4edda;
1884
+ border-color: #c3e6cb;
1885
+ color: #155724;
1886
+ }
1887
+
1888
+ .recommendation-status:not(.recommended) {
1889
+ background-color: #fff3cd;
1890
+ border-color: #ffeeba;
1891
+ color: #856404;
1892
+ }
1893
+
1894
+ .profile-card.recommended {
1895
+ border-left: 4px solid #28a745;
1896
+ box-shadow: 0 2px 8px rgba(40, 167, 69, 0.1);
1897
+ }
1898
+
1899
+ .profile-card:not(.recommended) {
1900
+ border-left: 4px solid #ffc107;
1901
+ opacity: 0.8;
1902
+ }
1903
+
1904
+ .profile-card:hover {
1905
+ transform: translateY(-2px);
1906
+ box-shadow: 0 4px 12px rgba(0,0,0,0.1);
1907
+ }
1908
+
1909
+ .progress-metric {
1910
+ background: linear-gradient(135deg, #f6f8fa 0%, #ffffff 100%);
1911
+ padding: 10px 15px;
1912
+ border-radius: 8px;
1913
+ border-left: 4px solid #1f77b4;
1914
+ margin: 5px 0;
1915
+ box-shadow: 0 2px 4px rgba(0,0,0,0.05);
1916
+ transition: transform 0.2s ease;
1917
+ }
1918
+
1919
+ .progress-metric:hover {
1920
+ transform: translateX(5px);
1921
+ }
1922
+
1923
+ .progress-metric b {
1924
+ color: #1f77b4;
1925
+ }
1926
+
1927
+ /* Enhanced status messages */
1928
+ .status-message {
1929
+ padding: 10px;
1930
+ border-radius: 8px;
1931
+ margin: 5px 0;
1932
+ animation: fadeIn 0.5s ease;
1933
+ }
1934
+
1935
+ .status-processing {
1936
+ background: linear-gradient(135deg, #f0f7ff 0%, #e5f0ff 100%);
1937
+ border-left: 4px solid #1f77b4;
1938
+ }
1939
+
1940
+ .status-complete {
1941
+ background: linear-gradient(135deg, #f0fff0 0%, #e5ffe5 100%);
1942
+ border-left: 4px solid #28a745;
1943
+ }
1944
+
1945
+ .status-error {
1946
+ background: linear-gradient(135deg, #fff0f0 0%, #ffe5e5 100%);
1947
+ border-left: 4px solid #dc3545;
1948
+ }
1949
+
1950
+ /* Progress bar enhancement */
1951
+ .stProgress > div > div {
1952
+ background-image: linear-gradient(
1953
+ to right,
1954
+ rgba(31, 119, 180, 0.8),
1955
+ rgba(31, 119, 180, 1)
1956
+ );
1957
+ transition: width 0.3s ease;
1958
+ }
1959
+
1960
+ /* Batch indicator animation */
1961
+ @keyframes pulse {
1962
+ 0% { transform: scale(1); }
1963
+ 50% { transform: scale(1.05); }
1964
+ 100% { transform: scale(1); }
1965
+ }
1966
+
1967
+ .batch-indicator {
1968
+ display: inline-block;
1969
+ padding: 4px 8px;
1970
+ background: #1f77b4;
1971
+ color: white;
1972
+ border-radius: 4px;
1973
+ animation: pulse 1s infinite;
1974
+ }
1975
+
1976
+ .metric-box {
1977
+ background: linear-gradient(135deg, #f8f9fa 0%, #ffffff 100%);
1978
+ padding: 10px;
1979
+ border-radius: 8px;
1980
+ margin: 5px;
1981
+ border-left: 4px solid #1f77b4;
1982
+ box-shadow: 0 2px 4px rgba(0,0,0,0.05);
1983
+ transition: transform 0.2s ease;
1984
+ }
1985
+
1986
+ .metric-box:hover {
1987
+ transform: translateX(5px);
1988
+ }
1989
+
1990
+ .metric-box.batch {
1991
+ border-left-color: #28a745;
1992
+ }
1993
+
1994
+ .metric-box.time {
1995
+ border-left-color: #dc3545;
1996
+ }
1997
+
1998
+ .metric-box.progress {
1999
+ border-left-color: #ffc107;
2000
+ }
2001
+
2002
+ .metric-box.segment {
2003
+ border-left-color: #17a2b8;
2004
+ }
2005
+
2006
+ .metric-box b {
2007
+ color: #1f77b4;
2008
+ }
2009
+
2010
+ <style>
2011
+ .metric-explanation-card {
2012
+ background: linear-gradient(135deg, #f8f9fa 0%, #ffffff 100%);
2013
+ padding: 15px;
2014
+ border-radius: 8px;
2015
+ margin-top: 15px;
2016
+ border-left: 4px solid #17a2b8;
2017
+ box-shadow: 0 2px 4px rgba(0,0,0,0.05);
2018
+ }
2019
+
2020
+ .metric-explanation-card h4 {
2021
+ color: #17a2b8;
2022
+ margin-bottom: 10px;
2023
+ }
2024
+
2025
+ .metric-explanation-card ul {
2026
+ list-style-type: none;
2027
+ padding-left: 0;
2028
+ }
2029
+
2030
+ .metric-explanation-card li {
2031
+ margin-bottom: 12px;
2032
+ padding-left: 15px;
2033
+ border-left: 2px solid #e9ecef;
2034
+ }
2035
+
2036
+ .metric-explanation-card li:hover {
2037
+ border-left: 2px solid #17a2b8;
2038
+ }
2039
+ </style>
2040
+
2041
+ <style>
2042
+ /* ... existing styles ... */
2043
+
2044
+ .suggestions-box {
2045
+ background-color: #f8f9fa;
2046
+ padding: 10px 15px;
2047
+ margin-top: 15px;
2048
+ border-radius: 8px;
2049
+ border-left: 4px solid #ffc107;
2050
+ }
2051
+
2052
+ .suggestions-box h4 {
2053
+ color: #856404;
2054
+ margin: 0;
2055
+ padding: 5px 0;
2056
+ }
2057
+
2058
+ .suggestion-item {
2059
+ padding: 5px 15px;
2060
+ color: #666;
2061
+ border-left: 2px solid #ffc107;
2062
+ margin: 5px 0;
2063
+ background-color: #fff;
2064
+ border-radius: 0 4px 4px 0;
2065
+ }
2066
+
2067
+ .suggestion-item:hover {
2068
+ background-color: #fff9e6;
2069
+ transform: translateX(5px);
2070
+ transition: all 0.2s ease;
2071
+ }
2072
+ </style>
2073
+ """, unsafe_allow_html=True)
2074
 
2075
  def check_dependencies() -> List[str]:
2076
  """Check if required dependencies are installed"""