norhan12 commited on
Commit
a9ef52b
·
verified ·
1 Parent(s): 7f5bb69

Update process_interview.py

Browse files
Files changed (1) hide show
  1. process_interview.py +308 -388
process_interview.py CHANGED
@@ -17,7 +17,6 @@ from sklearn.feature_extraction.text import TfidfVectorizer
17
  import re
18
  from typing import Dict, List, Tuple
19
  import logging
20
- # --- Imports for enhanced PDF ---
21
  from reportlab.lib.pagesizes import letter
22
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
23
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
@@ -27,13 +26,13 @@ import matplotlib.pyplot as plt
27
  import matplotlib
28
  matplotlib.use('Agg')
29
  from reportlab.platypus import Image
30
- import io # --- FIX: إضافة import io لـ BytesIO ---
31
- # --- End Imports for enhanced PDF ---
32
  from transformers import AutoTokenizer, AutoModel
33
  import spacy
34
  import google.generativeai as genai
35
  import joblib
36
  from concurrent.futures import ThreadPoolExecutor
 
37
 
38
  # Setup logging
39
  logging.basicConfig(level=logging.INFO)
@@ -42,7 +41,7 @@ logging.getLogger("nemo_logging").setLevel(logging.ERROR)
42
  logging.getLogger("nemo").setLevel(logging.ERROR)
43
 
44
  # Configuration
45
- AUDIO_DIR = "./uploads"
46
  OUTPUT_DIR = "./processed_audio"
47
  os.makedirs(OUTPUT_DIR, exist_ok=True)
48
 
@@ -51,7 +50,6 @@ PINECONE_KEY = os.getenv("PINECONE_KEY")
51
  ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
52
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
53
 
54
-
55
  # Initialize services
56
  def initialize_services():
57
  try:
@@ -74,21 +72,18 @@ def initialize_services():
74
  logger.error(f"Error initializing services: {str(e)}")
75
  raise
76
 
77
-
78
  index, gemini_model = initialize_services()
79
 
80
  # Device setup
81
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
82
  logger.info(f"Using device: {device}")
83
 
84
-
85
  def load_speaker_model():
86
  try:
87
- import torch
88
  torch.set_num_threads(5)
89
  model = EncDecSpeakerLabelModel.from_pretrained(
90
  "nvidia/speakerverification_en_titanet_large",
91
- map_location=torch.device('cpu')
92
  )
93
  model.eval()
94
  return model
@@ -96,49 +91,64 @@ def load_speaker_model():
96
  logger.error(f"Model loading failed: {str(e)}")
97
  raise RuntimeError("Could not load speaker verification model")
98
 
99
-
100
  # Load ML models
101
  def load_models():
102
  speaker_model = load_speaker_model()
103
  nlp = spacy.load("en_core_web_sm")
104
-
105
  tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
106
  llm_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
107
  llm_model.eval()
108
-
109
  return speaker_model, nlp, tokenizer, llm_model
110
 
111
-
112
  speaker_model, nlp, tokenizer, llm_model = load_models()
113
 
114
-
115
  # Audio processing functions
116
def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
    """Convert an arbitrary audio file to a mono 16 kHz WAV file.

    Args:
        audio_path: Path to the source audio (any format ffmpeg/pydub reads).
        output_dir: Directory that receives the converted file.

    Returns:
        Path of the newly written WAV file (random UUID filename).

    Raises:
        Exception: re-raised after logging when decoding or export fails.
    """
    try:
        audio = AudioSegment.from_file(audio_path)
        # Downstream speech models expect a single channel at 16 kHz.
        if audio.channels > 1:
            audio = audio.set_channels(1)
        audio = audio.set_frame_rate(16000)
        wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
        audio.export(wav_file, format="wav")
        return wav_file
    except Exception as e:
        logger.error(f"Audio conversion failed: {str(e)}")
        raise
129
 
130
-
131
  def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
132
  try:
 
 
 
 
 
 
133
  audio = AudioSegment.from_file(audio_path)
134
  segment = audio[start_ms:end_ms]
135
  temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
136
  segment.export(temp_path, format="wav")
137
-
138
  y, sr = librosa.load(temp_path, sr=16000)
139
  pitches = librosa.piptrack(y=y, sr=sr)[0]
140
  pitches = pitches[pitches > 0]
141
-
142
  features = {
143
  'duration': (end_ms - start_ms) / 1000,
144
  'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
@@ -150,24 +160,17 @@ def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Di
150
  'intensityMax': float(np.max(librosa.feature.rms(y=y)[0])),
151
  'intensitySD': float(np.std(librosa.feature.rms(y=y)[0])),
152
  }
153
-
154
  os.remove(temp_path)
155
  return features
156
  except Exception as e:
157
  logger.error(f"Feature extraction failed: {str(e)}")
 
 
158
  return {
159
- 'duration': 0.0,
160
- 'mean_pitch': 0.0,
161
- 'min_pitch': 0.0,
162
- 'max_pitch': 0.0,
163
- 'pitch_sd': 0.0,
164
- 'intensityMean': 0.0,
165
- 'intensityMin': 0.0,
166
- 'intensityMax': 0.0,
167
- 'intensitySD': 0.0,
168
  }
169
 
170
-
171
  def transcribe(audio_path: str) -> Dict:
172
  try:
173
  with open(audio_path, 'rb') as f:
@@ -177,7 +180,6 @@ def transcribe(audio_path: str) -> Dict:
177
  data=f
178
  )
179
  audio_url = upload_response.json()['upload_url']
180
-
181
  transcript_response = requests.post(
182
  "https://api.assemblyai.com/v2/transcript",
183
  headers={"authorization": ASSEMBLYAI_KEY},
@@ -188,63 +190,61 @@ def transcribe(audio_path: str) -> Dict:
188
  }
189
  )
190
  transcript_id = transcript_response.json()['id']
191
-
192
  while True:
193
  result = requests.get(
194
  f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
195
  headers={"authorization": ASSEMBLYAI_KEY}
196
  ).json()
197
-
198
  if result['status'] == 'completed':
199
  return result
200
  elif result['status'] == 'error':
201
  raise Exception(result['error'])
202
-
203
  time.sleep(5)
204
  except Exception as e:
205
  logger.error(f"Transcription failed: {str(e)}")
206
  raise
207
 
208
-
209
  def process_utterance(utterance, full_audio, wav_file):
210
  try:
211
  start = utterance['start']
212
  end = utterance['end']
 
 
 
 
 
 
 
 
213
  segment = full_audio[start:end]
214
  temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
215
  segment.export(temp_path, format="wav")
216
-
217
  with torch.no_grad():
218
- embedding = speaker_model.get_embedding(temp_path).cpu().numpy() # Ensure numpy array
219
-
220
- # --- FIX: Convert embedding to a flat list for Pinecone query ---
221
  embedding_list = embedding.flatten().tolist()
222
- # --- End FIX ---
223
-
224
  query_result = index.query(
225
- vector=embedding_list, # Use the corrected flat list
226
  top_k=1,
227
  include_metadata=True
228
  )
229
-
230
  if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
231
  speaker_id = query_result['matches'][0]['id']
232
  speaker_name = query_result['matches'][0]['metadata']['speaker_name']
233
  else:
234
  speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
235
  speaker_name = f"Speaker_{speaker_id[-4:]}"
236
- index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_name})]) # Use corrected list
237
-
238
  os.remove(temp_path)
239
-
240
  return {
241
  **utterance,
242
  'speaker': speaker_name,
243
  'speaker_id': speaker_id,
244
- 'embedding': embedding_list # Store the corrected list
245
  }
246
  except Exception as e:
247
- logger.error(f"Utterance processing failed: {str(e)}", exc_info=True)
 
 
248
  return {
249
  **utterance,
250
  'speaker': 'Unknown',
@@ -252,65 +252,101 @@ def process_utterance(utterance, full_audio, wav_file):
252
  'embedding': None
253
  }
254
 
255
-
256
def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
    """Attach a speaker identity to every utterance in the transcript.

    Utterances are processed concurrently in a small thread pool, since
    each one involves embedding extraction plus a vector-index lookup.

    Args:
        transcript: Transcription result dict containing an 'utterances' list.
        wav_file: Path to the full-interview WAV used for slicing segments.

    Returns:
        The utterances in original order, each augmented with speaker fields
        by process_utterance.

    Raises:
        Exception: re-raised after logging on any failure.
    """
    try:
        full_audio = AudioSegment.from_wav(wav_file)
        utterances = transcript['utterances']
        with ThreadPoolExecutor(max_workers=5) as executor:
            # executor.map keeps input order, matching the submit/result loop.
            results = list(
                executor.map(
                    lambda utt: process_utterance(utt, full_audio, wav_file),
                    utterances,
                )
            )
        return results
    except Exception as e:
        logger.error(f"Speaker identification failed: {str(e)}")
        raise
 
 
 
 
 
 
 
 
 
273
 
274
- def train_role_classifier(utterances: List[Dict]):
275
  try:
276
- texts = [u['text'] for u in utterances] # تم حذف الـ 'u' الزائدة
 
 
 
277
  vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
278
  X_text = vectorizer.fit_transform(texts)
279
-
280
  features = []
281
  labels = []
282
-
 
 
 
 
 
 
 
283
  for i, utterance in enumerate(utterances):
284
- prosodic = utterance['prosodic_features']
 
 
 
 
 
 
 
 
285
  feat = [
286
- prosodic['duration'],
287
- prosodic['mean_pitch'],
288
- prosodic['min_pitch'],
289
- prosodic['max_pitch'],
290
- prosodic['pitch_sd'],
291
- prosodic['intensityMean'],
292
- prosodic['intensityMin'],
293
- prosodic['intensityMax'],
294
- prosodic['intensitySD'],
 
 
 
 
295
  ]
296
-
297
  feat.extend(X_text[i].toarray()[0].tolist())
298
-
299
- doc = nlp(utterance['text'])
300
  feat.extend([
301
- int(utterance['text'].endswith('?')),
302
- len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower())),
303
- len(utterance['text'].split()),
304
  sum(1 for token in doc if token.pos_ == 'VERB'),
305
- sum(1 for token in doc if token.pos_ == 'NOUN')
 
 
 
 
306
  ])
307
-
 
 
 
 
 
 
308
  features.append(feat)
309
- labels.append(0 if i % 2 == 0 else 1)
310
-
 
311
  scaler = StandardScaler()
312
  X = scaler.fit_transform(features)
313
-
314
  clf = RandomForestClassifier(
315
  n_estimators=150,
316
  max_depth=10,
@@ -318,122 +354,154 @@ def train_role_classifier(utterances: List[Dict]):
318
  class_weight='balanced'
319
  )
320
  clf.fit(X, labels)
321
-
322
  joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
323
  joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
324
  joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
325
-
326
  return clf, vectorizer, scaler
327
  except Exception as e:
328
  logger.error(f"Classifier training failed: {str(e)}")
329
  raise
330
 
 
 
 
 
 
 
 
 
 
 
 
 
331
 
332
def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
    """Label each utterance as 'Interviewer' or 'Interviewee'.

    Rebuilds the same feature vector used at training time (9 prosodic
    features, TF-IDF text features, then question/length/POS cues), scales
    it, and applies the trained classifier. Class 0 maps to 'Interviewer',
    anything else to 'Interviewee'.

    Args:
        utterances: Dicts carrying at least 'text' and 'prosodic_features'.
        clf: Trained classifier (predicts 0/1).
        vectorizer: Fitted TfidfVectorizer.
        scaler: Fitted StandardScaler.

    Returns:
        A new list of utterance dicts, each with an added 'role' key.

    Raises:
        Exception: re-raised after logging on any failure.
    """
    try:
        # Guard the batched transform/predict below: with no utterances the
        # original per-row loop simply produced [], so keep that contract.
        if not utterances:
            return []
        texts = [u['text'] for u in utterances]
        X_text = vectorizer.transform(texts)
        # Hoisted out of the loop: compile the question-word pattern once.
        question_words = re.compile(r'\b(why|how|what|when|where|who|which)\b')
        features = []
        for i, utterance in enumerate(utterances):
            prosodic = utterance['prosodic_features']
            text = utterance['text']
            feat = [
                prosodic['duration'],
                prosodic['mean_pitch'],
                prosodic['min_pitch'],
                prosodic['max_pitch'],
                prosodic['pitch_sd'],
                prosodic['intensityMean'],
                prosodic['intensityMin'],
                prosodic['intensityMax'],
                prosodic['intensitySD'],
            ]
            feat.extend(X_text[i].toarray()[0].tolist())
            doc = nlp(text)
            feat.extend([
                int(text.endswith('?')),
                len(question_words.findall(text.lower())),
                len(text.split()),
                sum(1 for token in doc if token.pos_ == 'VERB'),
                sum(1 for token in doc if token.pos_ == 'NOUN'),
            ])
            features.append(feat)
        # One vectorized transform + predict instead of a model call per row;
        # numerically identical but avoids O(n) sklearn invocations.
        X = scaler.transform(features)
        predictions = clf.predict(X)
        return [
            {**utterance, 'role': 'Interviewer' if pred == 0 else 'Interviewee'}
            for utterance, pred in zip(utterances, predictions)
        ]
    except Exception as e:
        logger.error(f"Role classification failed: {str(e)}")
        raise
372
 
373
-
374
  def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
375
  try:
376
  y, sr = librosa.load(audio_path, sr=16000)
377
-
378
- interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
379
  if not interviewee_utterances:
 
380
  return {'error': 'No interviewee utterances found'}
381
-
382
  segments = []
383
  for u in interviewee_utterances:
384
  start = int(u['start'] * sr / 1000)
385
  end = int(u['end'] * sr / 1000)
386
- segments.append(y[start:end])
387
-
 
 
 
388
  combined_audio = np.concatenate(segments)
389
-
390
  total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
391
  total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
392
  speaking_rate = total_words / total_duration if total_duration > 0 else 0
393
-
394
  filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
395
  filler_count = sum(
396
  sum(u['text'].lower().count(fw) for fw in filler_words)
397
  for u in interviewee_utterances
398
  )
399
  filler_ratio = filler_count / total_words if total_words > 0 else 0
400
-
401
  all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
402
  word_counts = {}
403
  for i in range(len(all_words) - 1):
404
  bigram = (all_words[i], all_words[i + 1])
405
  word_counts[bigram] = word_counts.get(bigram, 0) + 1
406
- repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(
407
- word_counts) if word_counts else 0
408
-
409
  pitches = []
410
  for segment in segments:
411
  f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
412
  pitches.extend(f0[voiced_flag])
413
-
414
  pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
415
  pitch_std = np.std(pitches) if len(pitches) > 0 else 0
416
  jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0
417
-
418
  intensities = []
419
  for segment in segments:
420
  rms = librosa.feature.rms(y=segment)[0]
421
  intensities.extend(rms)
422
-
423
  intensity_mean = np.mean(intensities) if intensities else 0
424
  intensity_std = np.std(intensities) if intensities else 0
425
- shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(
426
- intensities) > 1 and intensity_mean > 0 else 0
427
-
428
  anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
429
  confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
430
  hesitation_score = filler_ratio + repetition_score
431
-
432
  anxiety_level = 'high' if anxiety_score > 0.15 else 'moderate' if anxiety_score > 0.07 else 'low'
433
  confidence_level = 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.5 else 'low'
434
  fluency_level = 'fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'moderate' if (
435
  filler_ratio < 0.1 and repetition_score < 0.2) else 'disfluent'
436
-
437
  return {
438
  'speaking_rate': float(round(speaking_rate, 2)),
439
  'filler_ratio': float(round(filler_ratio, 4)),
@@ -463,12 +531,9 @@ def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
463
  logger.error(f"Voice analysis failed: {str(e)}")
464
  return {'error': str(e)}
465
 
466
-
467
  def generate_voice_interpretation(analysis: Dict) -> str:
468
- # This function is used to provide the text interpretation for Gemini's prompt.
469
  if 'error' in analysis:
470
  return "Voice analysis not available."
471
-
472
  interpretation_lines = []
473
  interpretation_lines.append("Voice Analysis Summary:")
474
  interpretation_lines.append(f"- Speaking Rate: {analysis['speaking_rate']} words/sec (average)")
@@ -487,124 +552,74 @@ def generate_voice_interpretation(analysis: Dict) -> str:
487
  interpretation_lines.append("3. Anxiety is measured through pitch variability and voice instability.")
488
  interpretation_lines.append("4. Confidence is assessed through voice intensity and stability.")
489
  interpretation_lines.append("5. Fluency combines filler words and repetition metrics.")
490
-
491
  return "\n".join(interpretation_lines)
492
 
493
-
494
- # --- Chart Generation Function ---
495
def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path: str):
    """Render a small anxiety-vs-confidence bar chart for the PDF report.

    Args:
        composite_scores: Dict with 'anxiety' and 'confidence' scores
            (assumed normalized to 0-1); missing keys default to 0.
        chart_path: Destination passed straight to Figure.savefig — a file
            path or a writable binary buffer (e.g. io.BytesIO) both work.

    Errors are logged and swallowed: the report remains usable without the chart.
    """
    fig = None
    try:
        labels = ['Anxiety', 'Confidence']
        scores = [composite_scores.get('anxiety', 0), composite_scores.get('confidence', 0)]
        fig, ax = plt.subplots(figsize=(4, 2.5))  # small enough to embed in the PDF
        ax.bar(labels, scores, color=['lightcoral', 'lightskyblue'])
        ax.set_ylabel('Score')
        ax.set_title('Anxiety vs. Confidence Scores')
        ax.set_ylim(0, 1.0)  # scores are assumed normalized to 0-1
        for i, v in enumerate(scores):
            ax.text(i, v + 0.05, f"{v:.2f}", color='black', ha='center', fontweight='bold')
        # Figure-level calls avoid depending on pyplot's "current figure" state.
        fig.tight_layout()
        fig.savefig(chart_path)
    except Exception as e:
        logger.error(f"Error generating chart: {str(e)}")
    finally:
        # Always release the figure: previously an exception raised before
        # plt.close(fig) leaked the figure on every failed render.
        if fig is not None:
            plt.close(fig)
 
515
 
516
-
517
- # --- Acceptance Probability Calculation ---
518
def calculate_acceptance_probability(analysis_data: Dict) -> float:
    """Estimate a hypothetical acceptance probability as a percentage.

    Heuristic weighted combination of voice metrics — confidence, (low)
    anxiety, fluency, closeness to an ideal speaking rate, (few) fillers and
    repetitions — plus a crude content-strength placeholder. Weights are
    hand-tuned, not learned, and should be calibrated against real data.

    Args:
        analysis_data: Pipeline output; reads the 'voice_analysis' and
            'text_analysis' sub-dicts. Missing fields default to 0.

    Returns:
        Acceptance probability in [0, 100], rounded to 2 decimals; 0.0 when
        voice analysis failed.
    """
    # `or {}` also guards against an explicit None value, which previously
    # crashed on the membership test below with a TypeError.
    voice = analysis_data.get('voice_analysis') or {}
    if 'error' in voice:
        return 0.0  # cannot score without voice analysis

    # Factor weights (hand-tuned). Negative weights mark "lower is better"
    # factors; their scores are inverted before weighting below.
    w_confidence = 0.4
    w_anxiety = -0.3
    w_fluency = 0.2
    w_speaking_rate = 0.1
    w_filler_repetition = -0.1
    w_content_strengths = 0.2

    confidence_score = voice.get('composite_scores', {}).get('confidence', 0.0)
    anxiety_score = voice.get('composite_scores', {}).get('anxiety', 0.0)
    fluency_level = voice.get('interpretation', {}).get('fluency_level', 'disfluent')
    speaking_rate = voice.get('speaking_rate', 0.0)
    filler_ratio = voice.get('filler_ratio', 0.0)
    repetition_score = voice.get('repetition_score', 0.0)

    # Map the fluency label to a numeric score (more fluent -> higher).
    fluency_map = {'fluent': 1.0, 'moderate': 0.5, 'disfluent': 0.0}
    fluency_val = fluency_map.get(fluency_level, 0.0)

    # Speaking rate: 1.0 at the ideal rate, decaying linearly with deviation.
    ideal_speaking_rate = 2.5
    speaking_rate_deviation = abs(speaking_rate - ideal_speaking_rate)
    speaking_rate_score = max(0, 1 - (speaking_rate_deviation / ideal_speaking_rate))

    # Fillers/repetitions: lower is better, so invert the averaged ratio.
    filler_repetition_composite = (filler_ratio + repetition_score) / 2
    filler_repetition_score = max(0, 1 - filler_repetition_composite)

    # Content-strength placeholder: assume moderate strength whenever the
    # interview produced any analyzed speech at all.
    # TODO(review): derive this from structured content analysis instead.
    content_strength_val = 0.0
    if analysis_data.get('text_analysis', {}).get('total_duration', 0) > 0:
        content_strength_val = 0.8

    raw_score = (
        confidence_score * w_confidence +
        (1 - anxiety_score) * abs(w_anxiety) +  # lower anxiety is better
        fluency_val * w_fluency +
        speaking_rate_score * w_speaking_rate +
        filler_repetition_score * abs(w_filler_repetition) +
        content_strength_val * w_content_strengths
    )

    # Normalize by the theoretical score range. Every factor score lies in
    # [0, 1], so the minimum is 0 and the maximum is the sum of |weights|.
    min_possible_score = 0.0
    max_possible_score = (1 * w_confidence) + (1 * abs(w_anxiety)) + (1 * w_fluency) + (1 * w_speaking_rate) + (
        1 * abs(w_filler_repetition)) + (1 * w_content_strengths)

    if max_possible_score == min_possible_score:
        normalized_score = 0.5  # degenerate weighting; fall back to neutral
    else:
        normalized_score = (raw_score - min_possible_score) / (max_possible_score - min_possible_score)

    acceptance_probability = max(0.0, min(1.0, normalized_score))  # clamp to [0, 1]

    # round() replaces the former float(f"{...:.2f}") string round-trip;
    # same 2-decimal result without formatting overhead.
    return round(acceptance_probability * 100, 2)
595
-
596
 
597
  def generate_report(analysis_data: Dict) -> str:
598
  try:
599
  voice = analysis_data.get('voice_analysis', {})
600
  voice_interpretation = generate_voice_interpretation(voice)
601
-
602
  interviewee_responses = [
603
- f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
604
- for u in analysis_data['transcript']
605
- if u['role'] == 'Interviewee'
606
- ][:5] # Limit to first 5 for prompt brevity
607
-
608
  acceptance_prob = analysis_data.get('acceptance_probability', None)
609
  acceptance_line = ""
610
  if acceptance_prob is not None:
@@ -615,29 +630,23 @@ def generate_report(analysis_data: Dict) -> str:
615
  acceptance_line += "This indicates a solid candidate with potential for improvement."
616
  else:
617
  acceptance_line += "This candidate may require significant development or may not be a strong fit."
618
-
619
  prompt = f"""
620
  As EvalBot, an AI interview analysis system, generate a highly professional, well-structured, and concise interview analysis report.
621
  The report should be suitable for a professional setting and clearly highlight key findings and actionable recommendations.
622
  Use clear headings and subheadings. For bullet points, use '- '.
623
-
624
  {acceptance_line}
625
-
626
  **1. Executive Summary**
627
  Provide a brief, high-level overview of the interview.
628
  - Overall interview duration: {analysis_data['text_analysis']['total_duration']:.2f} seconds
629
  - Number of speaker turns: {analysis_data['text_analysis']['speaker_turns']}
630
  - Main participants: {', '.join(analysis_data['speakers'])}
631
-
632
  **2. Voice Analysis Insights**
633
  Analyze key voice metrics and provide a detailed interpretation.
634
  {voice_interpretation}
635
-
636
  **3. Content Analysis & Strengths/Areas for Development**
637
  Analyze the key themes and identify both strengths and areas for development in the interviewee's responses.
638
  Key responses from interviewee (for context):
639
  {chr(10).join(interviewee_responses)}
640
-
641
  **4. Actionable Recommendations**
642
  Offer specific, actionable suggestions for improvement.
643
  Focus on:
@@ -645,81 +654,50 @@ def generate_report(analysis_data: Dict) -> str:
645
  - Content Delivery (e.g., quantifying achievements, structuring answers)
646
  - Professional Presentation (e.g., research, specific examples, mock interviews)
647
  """
648
-
649
  response = gemini_model.generate_content(prompt)
650
  return response.text
651
  except Exception as e:
652
  logger.error(f"Report generation failed: {str(e)}")
653
  return f"Error generating report: {str(e)}"
654
 
655
-
656
- # --- ENHANCED PDF GENERATION FUNCTION ---
657
- def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
658
  try:
659
  doc = SimpleDocTemplate(output_path, pagesize=letter)
660
  styles = getSampleStyleSheet()
661
-
662
- # Define custom styles
663
- h1 = ParagraphStyle(name='Heading1', parent=styles['h1'], fontSize=16, spaceAfter=14, alignment=1,
664
- textColor=colors.HexColor('#003366'))
665
- h2 = ParagraphStyle(name='Heading2', parent=styles['h2'], fontSize=12, spaceBefore=10, spaceAfter=8,
666
- textColor=colors.HexColor('#336699'))
667
- h3 = ParagraphStyle(name='Heading3', parent=styles['h3'], fontSize=10, spaceBefore=8, spaceAfter=4,
668
- textColor=colors.HexColor('#0055AA'))
669
  body_text = ParagraphStyle(name='BodyText', parent=styles['Normal'], fontSize=9, leading=12, spaceAfter=4)
670
- bullet_style = ParagraphStyle(name='Bullet', parent=styles['Normal'], fontSize=9, leading=12, leftIndent=18,
671
- bulletIndent=9)
672
-
673
  story = []
674
-
675
- # Title and Date
676
  story.append(Paragraph(f"<b>EvalBot Interview Analysis Report</b>", h1))
677
  story.append(Spacer(1, 0.2 * inch))
678
  story.append(Paragraph(f"<b>Date:</b> {time.strftime('%Y-%m-%d')}", body_text))
679
  story.append(Spacer(1, 0.3 * inch))
680
-
681
- # --- Acceptance Probability (New Section) ---
682
  acceptance_prob = analysis_data.get('acceptance_probability', None)
683
  if acceptance_prob is not None:
684
  story.append(Paragraph("<b>Candidate Evaluation Summary</b>", h2))
685
  story.append(Spacer(1, 0.1 * inch))
686
-
687
- prob_color = colors.green if acceptance_prob >= 70 else (
688
- colors.orange if acceptance_prob >= 40 else colors.red)
689
-
690
- # --- FIX: Call .hexval() as a method ---
691
  story.append(Paragraph(
692
- f"<font size='12' color='{prob_color.hexval()}'><b>Estimated Acceptance Probability: {acceptance_prob:.2f}%</b></font>",
693
- ParagraphStyle(name='AcceptanceProbability', parent=styles['Normal'], fontSize=12, spaceAfter=10,
694
- alignment=1)
695
  ))
696
- # --- End FIX ---
697
-
698
  if acceptance_prob >= 80:
699
- story.append(
700
- Paragraph("This indicates a very strong candidate with high potential. Well done!", body_text))
701
  elif acceptance_prob >= 50:
702
- story.append(Paragraph(
703
- "This candidate shows solid potential but has areas for improvement to become an even stronger fit.",
704
- body_text))
705
  else:
706
- story.append(Paragraph(
707
- "This candidate may require significant development or may not be the ideal fit at this time.",
708
- body_text))
709
  story.append(Spacer(1, 0.3 * inch))
710
- # --- End Acceptance Probability ---
711
-
712
- # Parse Gemini's report into sections for better PDF structuring
713
  sections = {}
714
  current_section = None
715
- # Use regex to robustly identify sections, especially with varied bullet points
716
  section_patterns = {
717
  r'^\s*\*\*\s*1\.\s*Executive Summary\s*\*\*': 'Executive Summary',
718
  r'^\s*\*\*\s*2\.\s*Voice Analysis Insights\s*\*\*': 'Voice Analysis Insights',
719
  r'^\s*\*\*\s*3\.\s*Content Analysis & Strengths/Areas for Development\s*\*\*': 'Content Analysis & Strengths/Areas for Development',
720
  r'^\s*\*\*\s*4\.\s*Actionable Recommendations\s*\*\*': 'Actionable Recommendations'
721
  }
722
-
723
  for line in gemini_report_text.split('\n'):
724
  matched_section = False
725
  for pattern, section_name in section_patterns.items():
@@ -730,35 +708,29 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
730
  break
731
  if not matched_section and current_section:
732
  sections[current_section].append(line)
733
-
734
- # 1. Executive Summary
735
  story.append(Paragraph("1. Executive Summary", h2))
736
  story.append(Spacer(1, 0.1 * inch))
737
  if 'Executive Summary' in sections:
738
  for line in sections['Executive Summary']:
739
  if line.strip():
740
  story.append(Paragraph(line.strip(), body_text))
741
- story.append(Spacer(1, 0.2 * inch))
742
-
743
- # 2. Voice Analysis (Detailed - using Table for summary)
744
  story.append(Paragraph("2. Voice Analysis", h2))
745
  voice_analysis = analysis_data.get('voice_analysis', {})
746
-
747
  if voice_analysis and 'error' not in voice_analysis:
748
- # Voice Analysis Summary Table
749
  table_data = [
750
  ['Metric', 'Value', 'Interpretation'],
751
- ['Speaking Rate', f"{voice_analysis['speaking_rate']:.2f} words/sec", 'Average rate'],
752
- ['Filler Words', f"{voice_analysis['filler_ratio'] * 100:.1f}%", 'Percentage of total words'],
753
- ['Repetition Score', f"{voice_analysis['repetition_score']:.3f}", 'Lower is better articulation'],
754
- ['Anxiety Level', voice_analysis['interpretation']['anxiety_level'].upper(),
755
- f"Score: {voice_analysis['composite_scores']['anxiety']:.3f}"],
756
- ['Confidence Level', voice_analysis['interpretation']['confidence_level'].upper(),
757
- f"Score: {voice_analysis['composite_scores']['confidence']:.3f}"],
758
- ['Fluency', voice_analysis['interpretation']['fluency_level'].upper(), 'Overall speech flow']
759
  ]
760
-
761
- table_style = TableStyle([
762
  ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#6699CC')),
763
  ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
764
  ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
@@ -770,96 +742,68 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
770
  ('RIGHTPADDING', (0, 0), (-1, -1), 6),
771
  ('TOPPADDING', (0, 0), (-1, -1), 6),
772
  ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
773
- ])
774
-
775
- table = Table(table_data)
776
- table.setStyle(table_style)
777
  story.append(table)
778
  story.append(Spacer(1, 0.2 * inch))
779
-
780
- # --- Charts ---
781
  story.append(Paragraph("Score Visualization:", h3))
782
- # chart_path = os.path.join(OUTPUT_DIR, f"anxiety_confidence_{uuid.uuid4().hex[:8]}.png") # Removed from here
783
- # --- FIX: Generate chart in memory (BytesIO) ---
784
- chart_buffer = io.BytesIO() # Create in-memory buffer
785
  try:
786
- generate_anxiety_confidence_chart(voice_analysis['composite_scores'], chart_buffer) # Pass buffer instead of path
787
- chart_buffer.seek(0) # Rewind the buffer to the beginning
788
- img = Image(chart_buffer, width=3.5*inch, height=2.0*inch) # Load image from buffer
789
  story.append(img)
790
  story.append(Spacer(1, 0.1 * inch))
791
- except NameError:
792
- logger.warning("Chart generation function 'generate_anxiety_confidence_chart' is not defined. Skipping chart.")
793
- except Exception as chart_e:
794
- logger.warning(f"Could not add chart image to PDF: {chart_e}. Please check matplotlib installation.")
795
- # --- End FIX ---
796
- # --- End Charts ---
797
-
798
- # Detailed Interpretation from Gemini (if present)
799
  if 'Voice Analysis Insights' in sections:
800
  story.append(Paragraph("Detailed Interpretation:", h3))
801
  for line in sections['Voice Analysis Insights']:
802
  if line.strip():
803
- # Handle numbered lists from Gemini
804
  if re.match(r'^\d+\.\s', line.strip()):
805
- story.append(
806
- Paragraph(line.strip(), bullet_style))
807
  else:
808
  story.append(Paragraph(line.strip(), body_text))
809
  story.append(Spacer(1, 0.2 * inch))
810
-
811
  else:
812
- story.append(Paragraph("Voice analysis not available or encountered an error.", body_text))
813
  story.append(Spacer(1, 0.3 * inch))
814
-
815
- # 3. Content Analysis
816
  story.append(Paragraph("3. Content Analysis", h2))
817
  if 'Content Analysis & Strengths/Areas for Development' in sections:
818
  for line in sections['Content Analysis & Strengths/Areas for Development']:
819
  if line.strip():
820
- # Handle bullet points from Gemini
821
  if line.strip().startswith('-'):
822
- story.append(Paragraph(line.strip()[1:].strip(), bullet_style)) # Remove the '-' and strip
823
  else:
824
  story.append(Paragraph(line.strip(), body_text))
825
  story.append(Spacer(1, 0.2 * inch))
826
-
827
- # Add some interviewee responses to the report (can be formatted as a list)
828
- story.append(Paragraph("Key Interviewee Responses (Contextual):", h3))
829
  interviewee_responses = [
830
- f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
831
- for u in analysis_data['transcript']
832
- if u['role'] == 'Interviewee'
833
- ][:5]
834
  for res in interviewee_responses:
835
  story.append(Paragraph(res, bullet_style))
836
  story.append(Spacer(1, 0.3 * inch))
837
-
838
- # 4. Recommendations
839
  story.append(Paragraph("4. Recommendations", h2))
840
  if 'Actionable Recommendations' in sections:
841
  for line in sections['Actionable Recommendations']:
842
  if line.strip():
843
- # Handle bullet points from Gemini
844
  if line.strip().startswith('-'):
845
- story.append(Paragraph(line.strip()[1:].strip(), bullet_style)) # Remove the '-' and strip
846
  else:
847
  story.append(Paragraph(line.strip(), body_text))
848
  story.append(Spacer(1, 0.2 * inch))
849
-
850
- # Footer Text
851
  story.append(Spacer(1, 0.5 * inch))
852
  story.append(Paragraph("--- Analysis by EvalBot ---", ParagraphStyle(
853
- name='FooterText', parent=styles['Normal'], fontSize=8, alignment=1, textColor=colors.HexColor('#666666')
854
  )))
855
-
856
  doc.build(story)
857
  return True
858
  except Exception as e:
859
- logger.error(f"PDF creation failed: {str(e)}", exc_info=True)
860
  return False
861
 
862
-
863
  def convert_to_serializable(obj):
864
  if isinstance(obj, np.generic):
865
  return obj.item()
@@ -871,80 +815,56 @@ def convert_to_serializable(obj):
871
  return obj.tolist()
872
  return obj
873
 
874
-
875
def process_interview(audio_path: str):
    """Run the full interview-analysis pipeline on one audio file.

    Steps: convert to WAV, transcribe, extract prosodic features per
    utterance, identify speakers, classify Interviewer/Interviewee roles,
    analyze the interviewee's voice, score acceptance probability, then
    write a PDF report and a JSON dump of the analysis.

    Args:
        audio_path: Path to the uploaded audio file (any pydub-readable format).

    Returns:
        dict with 'pdf_path' and 'json_path' of the generated artifacts.

    Raises:
        Re-raises any pipeline failure after logging. The intermediate WAV
        file is always removed (previously cleanup relied on a fragile
        ``'wav_file' in locals()`` check in the except branch only).
    """
    wav_file = None
    try:
        logger.info(f"Starting processing for {audio_path}")

        wav_file = convert_to_wav(audio_path)

        logger.info("Starting transcription")
        transcript = transcribe(wav_file)

        logger.info("Extracting prosodic features")
        for utterance in transcript['utterances']:
            utterance['prosodic_features'] = extract_prosodic_features(
                wav_file,
                utterance['start'],
                utterance['end']
            )

        logger.info("Identifying speakers")
        utterances_with_speakers = identify_speakers(transcript, wav_file)

        logger.info("Classifying roles")
        # Reuse persisted classifier artifacts when available so repeated runs
        # don't retrain; otherwise train from this interview's utterances.
        clf_path = os.path.join(OUTPUT_DIR, 'role_classifier.pkl')
        if os.path.exists(clf_path):
            clf = joblib.load(clf_path)
            vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
            scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
        else:
            clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)

        classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)

        logger.info("Analyzing interviewee voice")
        voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)

        analysis_data = {
            'transcript': classified_utterances,
            'speakers': list(set(u['speaker'] for u in classified_utterances)),
            'voice_analysis': voice_analysis,
            'text_analysis': {
                'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
                'speaker_turns': len(classified_utterances)
            }
        }

        # Composite 0-100 score derived from the voice/content metrics above.
        analysis_data['acceptance_probability'] = calculate_acceptance_probability(analysis_data)

        logger.info("Generating report text using Gemini")
        gemini_report_text = generate_report(analysis_data)

        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
        create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text)

        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
        with open(json_path, 'w') as f:
            json.dump(convert_to_serializable(analysis_data), f, indent=2)

        logger.info(f"Processing completed for {audio_path}")
        return {
            'pdf_path': pdf_path,
            'json_path': json_path
        }
    except Exception as e:
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        raise
    finally:
        # Always remove the intermediate WAV, on success and failure alike.
        if wav_file and os.path.exists(wav_file):
            os.remove(wav_file)
 
17
  import re
18
  from typing import Dict, List, Tuple
19
  import logging
 
20
  from reportlab.lib.pagesizes import letter
21
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
22
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 
26
  import matplotlib
27
  matplotlib.use('Agg')
28
  from reportlab.platypus import Image
29
+ import io
 
30
  from transformers import AutoTokenizer, AutoModel
31
  import spacy
32
  import google.generativeai as genai
33
  import joblib
34
  from concurrent.futures import ThreadPoolExecutor
35
+ from textblob import TextBlob # Added for sentiment analysis
36
 
37
  # Setup logging
38
  logging.basicConfig(level=logging.INFO)
 
41
  logging.getLogger("nemo").setLevel(logging.ERROR)
42
 
43
  # Configuration
44
+ AUDIO_DIR = "./Uploads"
45
  OUTPUT_DIR = "./processed_audio"
46
  os.makedirs(OUTPUT_DIR, exist_ok=True)
47
 
 
50
  ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
51
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
52
 
 
53
  # Initialize services
54
  def initialize_services():
55
  try:
 
72
  logger.error(f"Error initializing services: {str(e)}")
73
  raise
74
 
 
75
  index, gemini_model = initialize_services()
76
 
77
  # Device setup
78
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
79
  logger.info(f"Using device: {device}")
80
 
 
81
  def load_speaker_model():
82
  try:
 
83
  torch.set_num_threads(5)
84
  model = EncDecSpeakerLabelModel.from_pretrained(
85
  "nvidia/speakerverification_en_titanet_large",
86
+ map_location=device
87
  )
88
  model.eval()
89
  return model
 
91
  logger.error(f"Model loading failed: {str(e)}")
92
  raise RuntimeError("Could not load speaker verification model")
93
 
 
94
  # Load ML models
95
def load_models():
    """Load every ML model the pipeline depends on.

    Returns:
        Tuple of (speaker_model, nlp, tokenizer, llm_model): the speaker
        verification model, the spaCy English pipeline, and the DistilBERT
        tokenizer/encoder (moved to the active device, set to eval mode).
    """
    verification_model = load_speaker_model()
    spacy_pipeline = spacy.load("en_core_web_sm")
    bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    bert_encoder = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
    bert_encoder.eval()
    return verification_model, spacy_pipeline, bert_tokenizer, bert_encoder
102
 
 
103
  speaker_model, nlp, tokenizer, llm_model = load_models()
104
 
 
105
  # Audio processing functions
106
def preprocess_audio(audio_path: str, output_path: str) -> str:
    """Clean up an audio file ahead of transcription.

    Downmixes to mono at 16 kHz, normalizes levels, applies a small gain
    boost, and writes the result to ``output_path`` as WAV.

    Returns:
        The ``output_path`` that was written.

    Raises:
        Re-raises any pydub/export failure after logging it.
    """
    try:
        cleaned = (
            AudioSegment.from_file(audio_path)
            .set_channels(1)
            .set_frame_rate(16000)
            .normalize()
            .apply_gain(5)  # slight volume boost
        )
        cleaned.export(output_path, format="wav")
        return output_path
    except Exception as e:
        logger.error(f"Audio preprocessing failed: {str(e)}")
        raise
118
+
119
def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
    """Convert any input audio to a mono 16 kHz WAV file.

    The input is first run through preprocess_audio() (normalize + gain)
    into a temporary WAV, then re-checked for channel count / sample rate
    and written out under a fresh UUID filename.

    Args:
        audio_path: Source audio file in any pydub-readable format.
        output_dir: Directory for the resulting WAV (defaults to OUTPUT_DIR).

    Returns:
        Path to the converted WAV file.

    Raises:
        Re-raises any conversion failure after logging; the temporary file
        is removed in all cases.
    """
    # Pre-initialize so the cleanup path can never hit an unbound name
    # (previously the except branch referenced temp_path unconditionally).
    temp_path = None
    try:
        temp_path = os.path.join(output_dir, f"temp_{uuid.uuid4()}.wav")
        preprocessed_path = preprocess_audio(audio_path, temp_path)
        audio = AudioSegment.from_file(preprocessed_path)
        if audio.channels > 1:
            audio = audio.set_channels(1)
        audio = audio.set_frame_rate(16000)

        wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
        audio.export(wav_file, format="wav")
        return wav_file
    except Exception as e:
        logger.error(f"Audio conversion failed: {str(e)}")
        raise
    finally:
        # Remove the intermediate preprocessed file on success and failure alike.
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
 
 
137
  def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
138
  try:
139
+ if start_ms >= end_ms or end_ms <= 0:
140
+ logger.warning("Invalid audio segment times, returning default features")
141
+ return {
142
+ 'duration': 0.0, 'mean_pitch': 0.0, 'min_pitch': 0.0, 'max_pitch': 0.0, 'pitch_sd': 0.0,
143
+ 'intensityMean': 0.0, 'intensityMin': 0.0, 'intensityMax': 0.0, 'intensitySD': 0.0
144
+ }
145
  audio = AudioSegment.from_file(audio_path)
146
  segment = audio[start_ms:end_ms]
147
  temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
148
  segment.export(temp_path, format="wav")
 
149
  y, sr = librosa.load(temp_path, sr=16000)
150
  pitches = librosa.piptrack(y=y, sr=sr)[0]
151
  pitches = pitches[pitches > 0]
 
152
  features = {
153
  'duration': (end_ms - start_ms) / 1000,
154
  'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
 
160
  'intensityMax': float(np.max(librosa.feature.rms(y=y)[0])),
161
  'intensitySD': float(np.std(librosa.feature.rms(y=y)[0])),
162
  }
 
163
  os.remove(temp_path)
164
  return features
165
  except Exception as e:
166
  logger.error(f"Feature extraction failed: {str(e)}")
167
+ if os.path.exists(temp_path):
168
+ os.remove(temp_path)
169
  return {
170
+ 'duration': 0.0, 'mean_pitch': 0.0, 'min_pitch': 0.0, 'max_pitch': 0.0, 'pitch_sd': 0.0,
171
+ 'intensityMean': 0.0, 'intensityMin': 0.0, 'intensityMax': 0.0, 'intensitySD': 0.0
 
 
 
 
 
 
 
172
  }
173
 
 
174
  def transcribe(audio_path: str) -> Dict:
175
  try:
176
  with open(audio_path, 'rb') as f:
 
180
  data=f
181
  )
182
  audio_url = upload_response.json()['upload_url']
 
183
  transcript_response = requests.post(
184
  "https://api.assemblyai.com/v2/transcript",
185
  headers={"authorization": ASSEMBLYAI_KEY},
 
190
  }
191
  )
192
  transcript_id = transcript_response.json()['id']
 
193
  while True:
194
  result = requests.get(
195
  f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
196
  headers={"authorization": ASSEMBLYAI_KEY}
197
  ).json()
 
198
  if result['status'] == 'completed':
199
  return result
200
  elif result['status'] == 'error':
201
  raise Exception(result['error'])
 
202
  time.sleep(5)
203
  except Exception as e:
204
  logger.error(f"Transcription failed: {str(e)}")
205
  raise
206
 
 
207
  def process_utterance(utterance, full_audio, wav_file):
208
  try:
209
  start = utterance['start']
210
  end = utterance['end']
211
+ if start >= end or end <= 0:
212
+ logger.warning(f"Invalid utterance times: start={start}, end={end}")
213
+ return {
214
+ **utterance,
215
+ 'speaker': 'Unknown',
216
+ 'speaker_id': 'unknown',
217
+ 'embedding': None
218
+ }
219
  segment = full_audio[start:end]
220
  temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
221
  segment.export(temp_path, format="wav")
 
222
  with torch.no_grad():
223
+ embedding = speaker_model.get_embedding(temp_path).cpu().numpy()
 
 
224
  embedding_list = embedding.flatten().tolist()
 
 
225
  query_result = index.query(
226
+ vector=embedding_list,
227
  top_k=1,
228
  include_metadata=True
229
  )
 
230
  if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
231
  speaker_id = query_result['matches'][0]['id']
232
  speaker_name = query_result['matches'][0]['metadata']['speaker_name']
233
  else:
234
  speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
235
  speaker_name = f"Speaker_{speaker_id[-4:]}"
236
+ index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_name})])
 
237
  os.remove(temp_path)
 
238
  return {
239
  **utterance,
240
  'speaker': speaker_name,
241
  'speaker_id': speaker_id,
242
+ 'embedding': embedding_list
243
  }
244
  except Exception as e:
245
+ logger.error(f"Utterance processing failed: {str(e)}")
246
+ if os.path.exists(temp_path):
247
+ os.remove(temp_path)
248
  return {
249
  **utterance,
250
  'speaker': 'Unknown',
 
252
  'embedding': None
253
  }
254
 
 
255
def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
    """Attach speaker identities to every utterance in the transcript.

    Fans the per-utterance work out over a small thread pool; each task
    embeds the utterance audio and matches it against known speakers.

    Raises:
        ValueError: If the transcript contains no utterances.
    """
    try:
        full_audio = AudioSegment.from_wav(wav_file)
        utterances = transcript.get('utterances', [])
        if not utterances:
            logger.error("No utterances found in transcript")
            raise ValueError("Empty transcript")
        with ThreadPoolExecutor(max_workers=5) as pool:
            pending = [
                pool.submit(process_utterance, utt, full_audio, wav_file)
                for utt in utterances
            ]
            labeled = [job.result() for job in pending]
        return labeled
    except Exception as e:
        logger.error(f"Speaker identification failed: {str(e)}")
        raise
272
 
273
def get_sentiment_score(text: str) -> float:
    """Return the sentiment polarity of *text* as computed by TextBlob.

    Falls back to a neutral 0.0 (with a warning) when analysis fails.
    """
    try:
        return TextBlob(text).sentiment.polarity
    except Exception as e:
        logger.warning(f"Sentiment analysis failed for text '{text}': {str(e)}")
        return 0.0
281
 
282
+ def train_role_classifier(utterances: List[Dict]) -> Tuple[RandomForestClassifier, TfidfVectorizer, StandardScaler]:
283
  try:
284
+ texts = [u['text'] for u in utterances if u.get('text', '').strip()]
285
+ if not texts:
286
+ logger.error("No valid texts found for role classifier training")
287
+ raise ValueError("Empty text data for training")
288
  vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
289
  X_text = vectorizer.fit_transform(texts)
 
290
  features = []
291
  labels = []
292
+ open_question_starters = [
293
+ 'tell me', 'describe', 'walk me through', 'explain', 'share',
294
+ 'give me', 'talk about', 'discuss'
295
+ ]
296
+ exploratory_question_indicators = [
297
+ 'can i ask', 'what about', 'could you tell', 'is it', 'are there',
298
+ 'what’s the', 'how does', 'may i'
299
+ ]
300
  for i, utterance in enumerate(utterances):
301
+ text = utterance.get('text', '').lower().strip()
302
+ if not text:
303
+ continue
304
+ prosodic = utterance.get('prosodic_features', {})
305
+ word_count = len(text.split())
306
+ is_question = text.endswith('?')
307
+ prev_is_question = utterances[i-1]['text'].endswith('?') if i > 0 else False
308
+ speaker_frequency = sum(1 for u in utterances[:i+1] if u.get('speaker_id') == utterance.get('speaker_id')) / (i+1 or 1)
309
+ sentiment_score = get_sentiment_score(text)
310
  feat = [
311
+ prosodic.get('duration', 0.0),
312
+ prosodic.get('mean_pitch', 0.0),
313
+ prosodic.get('min_pitch', 0.0),
314
+ prosodic.get('max_pitch', 0.0),
315
+ prosodic.get('pitch_sd', 0.0),
316
+ prosodic.get('intensityMean', 0.0),
317
+ prosodic.get('intensityMin', 0.0),
318
+ prosodic.get('intensityMax', 0.0),
319
+ prosodic.get('intensitySD', 0.0),
320
+ word_count,
321
+ speaker_frequency,
322
+ int(prev_is_question),
323
+ sentiment_score
324
  ]
 
325
  feat.extend(X_text[i].toarray()[0].tolist())
326
+ doc = nlp(text)
 
327
  feat.extend([
328
+ int(is_question),
329
+ len(re.findall(r'\b(why|how|what|when|where|who|which)\b', text)),
 
330
  sum(1 for token in doc if token.pos_ == 'VERB'),
331
+ sum(1 for token in doc if token.pos_ == 'NOUN'),
332
+ int(i < 2),
333
+ int(any(text.startswith(starter) for starter in open_question_starters)),
334
+ int(any(text.startswith(ind) for ind in exploratory_question_indicators)),
335
+ prosodic.get('duration', 0.0) / word_count if word_count > 0 else 0.0
336
  ])
337
+ is_interviewer = (
338
+ (is_question and not any(text.startswith(ind) for ind in exploratory_question_indicators)) or
339
+ any(text.startswith(starter) for starter in open_question_starters) or
340
+ (i < 2 and word_count < 10) or
341
+ (prev_is_question and word_count < 5 and is_question)
342
+ )
343
+ labels.append(0 if is_interviewer else 1) # 0: Interviewer, 1: Interviewee
344
  features.append(feat)
345
+ if not features or not labels:
346
+ logger.error("No features or labels generated for training")
347
+ raise ValueError("No valid training data")
348
  scaler = StandardScaler()
349
  X = scaler.fit_transform(features)
 
350
  clf = RandomForestClassifier(
351
  n_estimators=150,
352
  max_depth=10,
 
354
  class_weight='balanced'
355
  )
356
  clf.fit(X, labels)
 
357
  joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
358
  joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
359
  joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
 
360
  return clf, vectorizer, scaler
361
  except Exception as e:
362
  logger.error(f"Classifier training failed: {str(e)}")
363
  raise
364
 
365
def enforce_conversation_flow(results: List[Dict]) -> List[Dict]:
    """Post-process predicted roles so turn-taking stays plausible.

    Whenever the previous turn is an Interviewer turn, the current turn is
    relabeled in place: questions stay 'Interviewer', statements become
    'Interviewee'. Updated labels feed into the next comparison, so the
    correction cascades through the conversation. On any error the list is
    returned unchanged from that point on.
    """
    try:
        for prev_turn, turn in zip(results, results[1:]):
            if prev_turn['role'] != 'Interviewer':
                continue
            turn['role'] = 'Interviewer' if turn['text'].endswith('?') else 'Interviewee'
        return results
    except Exception as e:
        logger.error(f"Conversation flow enforcement failed: {str(e)}")
        return results
377
 
378
def classify_roles(utterances: List[Dict], clf, vectorizer, scaler) -> List[Dict]:
    """Assign an 'Interviewer'/'Interviewee' role to every utterance.

    Builds the same prosodic + TF-IDF + linguistic feature vector used at
    training time, predicts with the fitted classifier, applies
    conversation-flow smoothing, and finally forces one Interviewee if the
    classifier labeled every turn as Interviewer.

    Args:
        utterances: Utterance dicts with 'text', 'prosodic_features', etc.
        clf: Fitted role classifier (0 = Interviewer, 1 = Interviewee).
        vectorizer: Fitted TfidfVectorizer.
        scaler: Fitted StandardScaler.

    Returns:
        The utterances with a 'role' key added ('Unknown' for empty text).

    Raises:
        ValueError: If no utterance contains any text.
    """
    try:
        texts = [u['text'] for u in utterances if u.get('text', '').strip()]
        if not texts:
            logger.error("No valid texts found for role classification")
            raise ValueError("Empty text data for classification")
        X_text = vectorizer.transform(texts)
        open_question_starters = [
            'tell me', 'describe', 'walk me through', 'explain', 'share',
            'give me', 'talk about', 'discuss'
        ]
        exploratory_question_indicators = [
            'can i ask', 'what about', 'could you tell', 'is it', 'are there',
            'what’s the', 'how does', 'may i'
        ]
        results = []
        text_row = 0  # next row of X_text; advances only for non-empty utterances
        for i, utterance in enumerate(utterances):
            text = utterance.get('text', '').lower().strip()
            if not text:
                results.append({**utterance, 'role': 'Unknown'})
                continue
            prosodic = utterance.get('prosodic_features', {})
            word_count = len(text.split())
            is_question = text.endswith('?')
            # Robustness: the previous utterance may also lack a 'text' key.
            prev_is_question = utterances[i-1].get('text', '').endswith('?') if i > 0 else False
            speaker_frequency = sum(
                1 for u in utterances[:i+1]
                if u.get('speaker_id') == utterance.get('speaker_id')
            ) / (i+1 or 1)
            sentiment_score = get_sentiment_score(text)
            feat = [
                prosodic.get('duration', 0.0),
                prosodic.get('mean_pitch', 0.0),
                prosodic.get('min_pitch', 0.0),
                prosodic.get('max_pitch', 0.0),
                prosodic.get('pitch_sd', 0.0),
                prosodic.get('intensityMean', 0.0),
                prosodic.get('intensityMin', 0.0),
                prosodic.get('intensityMax', 0.0),
                prosodic.get('intensitySD', 0.0),
                word_count,
                speaker_frequency,
                int(prev_is_question),
                sentiment_score
            ]
            # BUG FIX: X_text only has rows for non-empty texts, so indexing it
            # with the raw utterance index `i` drifted out of alignment (and
            # could go out of range) once any empty-text utterance was skipped.
            # Use the filtered row counter instead.
            feat.extend(X_text[text_row].toarray()[0].tolist())
            text_row += 1
            doc = nlp(text)
            feat.extend([
                int(is_question),
                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', text)),
                sum(1 for token in doc if token.pos_ == 'VERB'),
                sum(1 for token in doc if token.pos_ == 'NOUN'),
                int(i < 2),
                int(any(text.startswith(starter) for starter in open_question_starters)),
                int(any(text.startswith(ind) for ind in exploratory_question_indicators)),
                prosodic.get('duration', 0.0) / word_count if word_count > 0 else 0.0
            ])

            X = scaler.transform([feat])
            role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
            prob = clf.predict_proba(X)[0]
            logger.debug(f"Utterance {i}: Text='{text}', Role={role}, Prob={prob}")
            results.append({**utterance, 'role': role})

        results = enforce_conversation_flow(results)
        # Fallback: an interview with no Interviewee is implausible; promote
        # the longest non-question (or exploratory-question) utterance.
        if all(r['role'] == 'Interviewer' for r in results):
            logger.warning("No Interviewee detected. Forcing longest non-question or exploratory utterance as Interviewee.")
            candidates = [
                i for i, r in enumerate(results)
                if not r['text'].endswith('?') or any(r['text'].lower().startswith(ind) for ind in exploratory_question_indicators)
            ]
            if candidates:
                max_duration_idx = max(candidates, key=lambda x: results[x]['prosodic_features']['duration'])
                results[max_duration_idx]['role'] = 'Interviewee'
        return results
    except Exception as e:
        logger.error(f"Role classification failed: {str(e)}")
        raise
451
 
 
452
  def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
453
  try:
454
  y, sr = librosa.load(audio_path, sr=16000)
455
+ interviewee_utterances = [u for u in utterances if u.get('role') == 'Interviewee']
 
456
  if not interviewee_utterances:
457
+ logger.warning("No interviewee utterances found")
458
  return {'error': 'No interviewee utterances found'}
 
459
  segments = []
460
  for u in interviewee_utterances:
461
  start = int(u['start'] * sr / 1000)
462
  end = int(u['end'] * sr / 1000)
463
+ if start < end and end <= len(y):
464
+ segments.append(y[start:end])
465
+ if not segments:
466
+ logger.warning("No valid audio segments for interviewee")
467
+ return {'error': 'No valid audio segments found'}
468
  combined_audio = np.concatenate(segments)
 
469
  total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
470
  total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
471
  speaking_rate = total_words / total_duration if total_duration > 0 else 0
 
472
  filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
473
  filler_count = sum(
474
  sum(u['text'].lower().count(fw) for fw in filler_words)
475
  for u in interviewee_utterances
476
  )
477
  filler_ratio = filler_count / total_words if total_words > 0 else 0
 
478
  all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
479
  word_counts = {}
480
  for i in range(len(all_words) - 1):
481
  bigram = (all_words[i], all_words[i + 1])
482
  word_counts[bigram] = word_counts.get(bigram, 0) + 1
483
+ repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(word_counts) if word_counts else 0
 
 
484
  pitches = []
485
  for segment in segments:
486
  f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
487
  pitches.extend(f0[voiced_flag])
 
488
  pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
489
  pitch_std = np.std(pitches) if len(pitches) > 0 else 0
490
  jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0
 
491
  intensities = []
492
  for segment in segments:
493
  rms = librosa.feature.rms(y=segment)[0]
494
  intensities.extend(rms)
 
495
  intensity_mean = np.mean(intensities) if intensities else 0
496
  intensity_std = np.std(intensities) if intensities else 0
497
+ shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(intensities) > 1 and intensity_mean > 0 else 0
 
 
498
  anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
499
  confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
500
  hesitation_score = filler_ratio + repetition_score
 
501
  anxiety_level = 'high' if anxiety_score > 0.15 else 'moderate' if anxiety_score > 0.07 else 'low'
502
  confidence_level = 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.5 else 'low'
503
  fluency_level = 'fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'moderate' if (
504
  filler_ratio < 0.1 and repetition_score < 0.2) else 'disfluent'
 
505
  return {
506
  'speaking_rate': float(round(speaking_rate, 2)),
507
  'filler_ratio': float(round(filler_ratio, 4)),
 
531
  logger.error(f"Voice analysis failed: {str(e)}")
532
  return {'error': str(e)}
533
 
 
534
  def generate_voice_interpretation(analysis: Dict) -> str:
 
535
  if 'error' in analysis:
536
  return "Voice analysis not available."
 
537
  interpretation_lines = []
538
  interpretation_lines.append("Voice Analysis Summary:")
539
  interpretation_lines.append(f"- Speaking Rate: {analysis['speaking_rate']} words/sec (average)")
 
552
  interpretation_lines.append("3. Anxiety is measured through pitch variability and voice instability.")
553
  interpretation_lines.append("4. Confidence is assessed through voice intensity and stability.")
554
  interpretation_lines.append("5. Fluency combines filler words and repetition metrics.")
 
555
  return "\n".join(interpretation_lines)
556
 
557
def generate_anxiety_confidence_chart(composite_scores: Dict, output: io.BytesIO):
    """Render a small anxiety-vs-confidence bar chart as PNG into *output*.

    The buffer is rewound to position 0 afterwards so callers can read the
    image straight into a report. Errors are logged and re-raised.
    """
    try:
        bar_labels = ['Anxiety', 'Confidence']
        bar_values = [
            composite_scores.get('anxiety', 0),
            composite_scores.get('confidence', 0),
        ]
        fig, axis = plt.subplots(figsize=(4, 2.5))
        axis.bar(bar_labels, bar_values, color=['lightcoral', 'lightskyblue'])
        axis.set_ylabel('Score')
        axis.set_title('Anxiety vs. Confidence Scores')
        axis.set_ylim(0, 1.0)
        # Annotate each bar with its numeric value just above its top edge.
        for position, value in enumerate(bar_values):
            axis.text(position, value + 0.05, f"{value:.2f}",
                      color='black', ha='center', fontweight='bold')
        plt.tight_layout()
        plt.savefig(output, format='png')
        plt.close(fig)
        output.seek(0)
    except Exception as e:
        logger.error(f"Error generating chart: {str(e)}")
        raise
575
 
 
 
576
def calculate_acceptance_probability(analysis_data: Dict) -> float:
    """Score the candidate on a 0-100 acceptance scale.

    Combines voice-derived metrics (confidence, anxiety, fluency, speaking
    rate, filler/repetition) with a coarse content signal into a weighted
    sum, normalizes by the maximum attainable total, and rounds to two
    decimals. Returns 0.0 when voice analysis reported an error.
    """
    voice = analysis_data.get('voice_analysis', {})
    if 'error' in voice:
        return 0.0

    composites = voice.get('composite_scores', {})
    interpretation = voice.get('interpretation', {})

    # Map the fluency label onto [0, 1]; unknown labels count as disfluent.
    fluency_component = {'fluent': 1.0, 'moderate': 0.5, 'disfluent': 0.0}.get(
        interpretation.get('fluency_level', 'disfluent'), 0.0
    )

    # Penalize relative deviation from an assumed ideal pace of 2.5 words/sec.
    ideal_rate = 2.5
    rate_component = max(0, 1 - abs(voice.get('speaking_rate', 0.0) - ideal_rate) / ideal_rate)

    # Fewer fillers and repetitions -> higher score.
    disfluency = (voice.get('filler_ratio', 0.0) + voice.get('repetition_score', 0.0)) / 2
    clean_speech_component = max(0, 1 - disfluency)

    # Crude content proxy: any recorded speech at all earns a fixed 0.8.
    has_content = analysis_data.get('text_analysis', {}).get('total_duration', 0) > 0
    content_component = 0.8 if has_content else 0.0

    # (component value, effective positive weight) pairs; anxiety and
    # filler/repetition are penalties, so they enter inverted.
    weighted_parts = [
        (composites.get('confidence', 0.0), 0.4),
        (1 - composites.get('anxiety', 0.0), 0.3),
        (fluency_component, 0.2),
        (rate_component, 0.1),
        (clean_speech_component, 0.1),
        (content_component, 0.2),
    ]
    raw = sum(value * weight for value, weight in weighted_parts)
    ceiling = sum(weight for _, weight in weighted_parts)

    normalized = raw / ceiling if ceiling > 0 else 0.5
    bounded = max(0.0, min(1.0, normalized))
    return float(f"{bounded * 100:.2f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
613
 
614
  def generate_report(analysis_data: Dict) -> str:
615
  try:
616
  voice = analysis_data.get('voice_analysis', {})
617
  voice_interpretation = generate_voice_interpretation(voice)
 
618
  interviewee_responses = [
619
+ f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
620
+ for u in analysis_data['transcript']
621
+ if u['role'] == 'Interviewee'
622
+ ][:5]
 
623
  acceptance_prob = analysis_data.get('acceptance_probability', None)
624
  acceptance_line = ""
625
  if acceptance_prob is not None:
 
630
  acceptance_line += "This indicates a solid candidate with potential for improvement."
631
  else:
632
  acceptance_line += "This candidate may require significant development or may not be a strong fit."
 
633
  prompt = f"""
634
  As EvalBot, an AI interview analysis system, generate a highly professional, well-structured, and concise interview analysis report.
635
  The report should be suitable for a professional setting and clearly highlight key findings and actionable recommendations.
636
  Use clear headings and subheadings. For bullet points, use '- '.
 
637
  {acceptance_line}
 
638
  **1. Executive Summary**
639
  Provide a brief, high-level overview of the interview.
640
  - Overall interview duration: {analysis_data['text_analysis']['total_duration']:.2f} seconds
641
  - Number of speaker turns: {analysis_data['text_analysis']['speaker_turns']}
642
  - Main participants: {', '.join(analysis_data['speakers'])}
 
643
  **2. Voice Analysis Insights**
644
  Analyze key voice metrics and provide a detailed interpretation.
645
  {voice_interpretation}
 
646
  **3. Content Analysis & Strengths/Areas for Development**
647
  Analyze the key themes and identify both strengths and areas for development in the interviewee's responses.
648
  Key responses from interviewee (for context):
649
  {chr(10).join(interviewee_responses)}
 
650
  **4. Actionable Recommendations**
651
  Offer specific, actionable suggestions for improvement.
652
  Focus on:
 
654
  - Content Delivery (e.g., quantifying achievements, structuring answers)
655
  - Professional Presentation (e.g., research, specific examples, mock interviews)
656
  """
 
657
  response = gemini_model.generate_content(prompt)
658
  return response.text
659
  except Exception as e:
660
  logger.error(f"Report generation failed: {str(e)}")
661
  return f"Error generating report: {str(e)}"
662
 
663
+ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str) -> bool:
 
 
664
  try:
665
  doc = SimpleDocTemplate(output_path, pagesize=letter)
666
  styles = getSampleStyleSheet()
667
+ h1 = ParagraphStyle(name='Heading1', parent=styles['h1'], fontSize=16, spaceAfter=14, alignment=1, textColor=colors.HexColor('#003366'))
668
+ h2 = ParagraphStyle(name='Heading2', parent=styles['h2'], fontSize=12, spaceBefore=10, spaceAfter=8, textColor=colors.HexColor('#336699'))
669
+ h3 = ParagraphStyle(name='Heading3', parent=styles['h3'], fontSize=10, spaceBefore=8, spaceAfter=4, textColor=colors.HexColor('#0055AA'))
 
 
 
 
 
670
  body_text = ParagraphStyle(name='BodyText', parent=styles['Normal'], fontSize=9, leading=12, spaceAfter=4)
671
+ bullet_style = ParagraphStyle(name='Bullet', parent=styles['Normal'], fontSize=9, leading=12, leftIndent=18, bulletIndent=9)
 
 
672
  story = []
 
 
673
  story.append(Paragraph(f"<b>EvalBot Interview Analysis Report</b>", h1))
674
  story.append(Spacer(1, 0.2 * inch))
675
  story.append(Paragraph(f"<b>Date:</b> {time.strftime('%Y-%m-%d')}", body_text))
676
  story.append(Spacer(1, 0.3 * inch))
 
 
677
  acceptance_prob = analysis_data.get('acceptance_probability', None)
678
  if acceptance_prob is not None:
679
  story.append(Paragraph("<b>Candidate Evaluation Summary</b>", h2))
680
  story.append(Spacer(1, 0.1 * inch))
681
+ prob_color = colors.green if acceptance_prob >= 70 else (colors.orange if acceptance_prob >= 40 else colors.red)
 
 
 
 
682
  story.append(Paragraph(
683
+ f"<font size='12' color='{prob_color.hex()}'><b>Estimated Acceptance Probability: {acceptance_prob:.2f}%</b></font>",
684
+ ParagraphStyle(name='AcceptanceProbability', parent=styles['Normal'], fontSize=12, spaceAfter=10, alignment=1)
 
685
  ))
 
 
686
  if acceptance_prob >= 80:
687
+ story.append(Paragraph("This indicates a very strong candidate with high potential. Well done!", body_text))
 
688
  elif acceptance_prob >= 50:
689
+ story.append(Paragraph("This candidate shows solid potential but has areas for improvement.", body_text))
 
 
690
  else:
691
+ story.append(Paragraph("This candidate may require significant development.", body_text))
 
 
692
  story.append(Spacer(1, 0.3 * inch))
 
 
 
693
  sections = {}
694
  current_section = None
 
695
  section_patterns = {
696
  r'^\s*\*\*\s*1\.\s*Executive Summary\s*\*\*': 'Executive Summary',
697
  r'^\s*\*\*\s*2\.\s*Voice Analysis Insights\s*\*\*': 'Voice Analysis Insights',
698
  r'^\s*\*\*\s*3\.\s*Content Analysis & Strengths/Areas for Development\s*\*\*': 'Content Analysis & Strengths/Areas for Development',
699
  r'^\s*\*\*\s*4\.\s*Actionable Recommendations\s*\*\*': 'Actionable Recommendations'
700
  }
 
701
  for line in gemini_report_text.split('\n'):
702
  matched_section = False
703
  for pattern, section_name in section_patterns.items():
 
708
  break
709
  if not matched_section and current_section:
710
  sections[current_section].append(line)
 
 
711
  story.append(Paragraph("1. Executive Summary", h2))
712
  story.append(Spacer(1, 0.1 * inch))
713
  if 'Executive Summary' in sections:
714
  for line in sections['Executive Summary']:
715
  if line.strip():
716
  story.append(Paragraph(line.strip(), body_text))
717
+ story.append(Spacer(1, 0.2 * inch))
 
 
718
  story.append(Paragraph("2. Voice Analysis", h2))
719
  voice_analysis = analysis_data.get('voice_analysis', {})
 
720
  if voice_analysis and 'error' not in voice_analysis:
 
721
  table_data = [
722
  ['Metric', 'Value', 'Interpretation'],
723
+ ['Speaking Rate', f"{voice.get('speaking_rate', 0.0):.2f} words/sec", 'Average rate'],
724
+ ['Filler Words', f"{voice.get('filler_ratio', 0.0) * 100:.1f}%", 'Percentage of words'],
725
+ ['Repetition Score', f"{voice.get('repetition_score', 0.0):.3f}", 'Lower is better'],
726
+ ['Anxiety Level', voice_analysis['interpretation'].get('anxiety_level', '').upper(),
727
+ f"Score: {voice_analysis['composite_scores'].get('anxiety', 0.0):.3f}"],
728
+ ['Confidence Level', voice_analysis['interpretation'].get('confidence_level', '').upper(),
729
+ f"Score: {voice_analysis['composite_scores'].get('confidence', 0.0):.3f}"],
730
+ ['Fluency', voice_analysis['interpretation'].get('fluency_level', '').upper(), 'Speech flow']
731
  ]
732
+ table = Table(table_data)
733
+ table.setStyle(TableStyle([
734
  ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#6699CC')),
735
  ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
736
  ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
 
742
  ('RIGHTPADDING', (0, 0), (-1, -1), 6),
743
  ('TOPPADDING', (0, 0), (-1, -1), 6),
744
  ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
745
+ ]))
 
 
 
746
  story.append(table)
747
  story.append(Spacer(1, 0.2 * inch))
 
 
748
  story.append(Paragraph("Score Visualization:", h3))
749
+ chart_buffer = io.BytesIO()
 
 
750
  try:
751
+ generate_anxiety_confidence_chart(voice_analysis['composite_scores'], chart_buffer)
752
+ chart_buffer.seek(0)
753
+ img = Image(chart_buffer, width=3.0 * inch, height=1.8 * inch)
754
  story.append(img)
755
  story.append(Spacer(1, 0.1 * inch))
756
+ except Exception as e:
757
+ logger.warning(f"Could not add chart image to PDF: {str(e)}")
 
 
 
 
 
 
758
  if 'Voice Analysis Insights' in sections:
759
  story.append(Paragraph("Detailed Interpretation:", h3))
760
  for line in sections['Voice Analysis Insights']:
761
  if line.strip():
 
762
  if re.match(r'^\d+\.\s', line.strip()):
763
+ story.append(Paragraph(line.strip(), bullet_style))
 
764
  else:
765
  story.append(Paragraph(line.strip(), body_text))
766
  story.append(Spacer(1, 0.2 * inch))
 
767
  else:
768
+ story.append(Paragraph("Voice analysis not available.", body_text))
769
  story.append(Spacer(1, 0.3 * inch))
 
 
770
  story.append(Paragraph("3. Content Analysis", h2))
771
  if 'Content Analysis & Strengths/Areas for Development' in sections:
772
  for line in sections['Content Analysis & Strengths/Areas for Development']:
773
  if line.strip():
 
774
  if line.strip().startswith('-'):
775
+ story.append(Paragraph(line.strip()[1:].strip(), bullet_style))
776
  else:
777
  story.append(Paragraph(line.strip(), body_text))
778
  story.append(Spacer(1, 0.2 * inch))
779
+ story.append(Paragraph("Key Interviewee Responses:", h3))
 
 
780
  interviewee_responses = [
781
+ f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
782
+ for u in analysis_data['transcript']
783
+ if u['role'] == 'Interviewee'
784
+ ][:5]
785
  for res in interviewee_responses:
786
  story.append(Paragraph(res, bullet_style))
787
  story.append(Spacer(1, 0.3 * inch))
 
 
788
  story.append(Paragraph("4. Recommendations", h2))
789
  if 'Actionable Recommendations' in sections:
790
  for line in sections['Actionable Recommendations']:
791
  if line.strip():
 
792
  if line.strip().startswith('-'):
793
+ story.append(Paragraph(line.strip()[1:].strip(), bullet_style))
794
  else:
795
  story.append(Paragraph(line.strip(), body_text))
796
  story.append(Spacer(1, 0.2 * inch))
 
 
797
  story.append(Spacer(1, 0.5 * inch))
798
  story.append(Paragraph("--- Analysis by EvalBot ---", ParagraphStyle(
799
+ name='FooterText', fontSize=8, alignment=1, textColor=colors.HexColor('#666666')
800
  )))
 
801
  doc.build(story)
802
  return True
803
  except Exception as e:
804
+ logger.error(f"PDF creation failed: {str(e)}")
805
  return False
806
 
 
807
  def convert_to_serializable(obj):
808
  if isinstance(obj, np.generic):
809
  return obj.item()
 
815
  return obj.tolist()
816
  return obj
817
 
818
def process_interview(audio_path: str) -> Dict:
    """Run the full interview-analysis pipeline on one audio file.

    Pipeline: convert to WAV -> transcribe -> extract prosodic features per
    utterance -> identify speakers -> classify speaker roles (reusing cached
    classifier artifacts from OUTPUT_DIR when present) -> analyze the
    interviewee's voice -> compute acceptance probability -> write a PDF
    report and a JSON dump of the analysis into OUTPUT_DIR.

    Args:
        audio_path: Path to the uploaded interview recording.

    Returns:
        Dict with keys 'pdf_path' and 'json_path' pointing to the
        generated report files.

    Raises:
        Exception: any pipeline failure is logged and re-raised; the
        temporary WAV file is cleaned up either way.
    """
    try:
        logger.info("Starting processing for %s", audio_path)
        wav_file = convert_to_wav(audio_path)
        try:
            logger.info("Starting transcription")
            transcript = transcribe(wav_file)

            logger.info("Extracting prosodic features")
            for utterance in transcript.get('utterances', []):
                utterance['prosodic_features'] = extract_prosodic_features(
                    wav_file, utterance['start'], utterance['end']
                )

            logger.info("Identifying speakers")
            utterances_with_speakers = identify_speakers(transcript, wav_file)

            logger.info("Classifying roles")
            classifier_path = os.path.join(OUTPUT_DIR, 'role_classifier.pkl')
            if os.path.exists(classifier_path):
                # Reuse previously trained artifacts instead of retraining.
                clf = joblib.load(classifier_path)
                vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
                scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
            else:
                clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)
            classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)

            logger.info("Analyzing interviewee voice")
            voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)

            analysis_data = {
                'transcript': classified_utterances,
                'speakers': list(set(u['speaker'] for u in classified_utterances if u.get('speaker'))),
                'voice_analysis': voice_analysis,
                'text_analysis': {
                    'total_duration': sum(
                        u['prosodic_features'].get('duration', 0.0) for u in classified_utterances
                    ),
                    'speaker_turns': len(classified_utterances)
                }
            }
            analysis_data['acceptance_probability'] = calculate_acceptance_probability(analysis_data)

            logger.info("Generating report text")
            gemini_report_text = generate_report(analysis_data)

            base_name = os.path.splitext(os.path.basename(audio_path))[0]
            pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
            create_pdf_report(analysis_data, pdf_path, gemini_report_text)

            json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(convert_to_serializable(analysis_data), f, indent=2)

            return {
                'pdf_path': pdf_path,
                'json_path': json_path
            }
        finally:
            # Remove the temporary WAV, but never delete the caller's
            # original upload if convert_to_wav returned it unchanged
            # (e.g. the input was already a WAV file).
            if wav_file != audio_path and os.path.exists(wav_file):
                os.remove(wav_file)
    except Exception as e:
        # Lazy %-style args avoid building the message when the level is off.
        logger.error("Processing failed: %s", e)
        raise