norhan12 committed
Commit 52e76bd · verified · parent: ebf10a8

Update process_interview.py

Files changed (1):
  1. process_interview.py +91 -99
process_interview.py CHANGED
@@ -1,6 +1,3 @@
-# ==============================================================================
-# 1. IMPORTS
-# ==============================================================================
 import os
 import torch
 import numpy as np
@@ -42,6 +39,7 @@ matplotlib.use('Agg')
 
 # Concurrency
 from concurrent.futures import ThreadPoolExecutor
+import joblib  # Added import
 
 # ==============================================================================
 # 2. CONFIGURATION AND INITIALIZATION
@@ -52,8 +50,11 @@ logging.getLogger("nemo_logging").setLevel(logging.ERROR)
 logging.getLogger("nemo").setLevel(logging.ERROR)
 logging.getLogger("transformers").setLevel(logging.ERROR)
 
-OUTPUT_DIR = "./processed_audio"
-os.makedirs(OUTPUT_DIR, exist_ok=True)
+OUTPUT_DIR = "./static/outputs"
+JSON_DIR = os.path.join(OUTPUT_DIR, "json")
+PDF_DIR = os.path.join(OUTPUT_DIR, "pdf")
+os.makedirs(JSON_DIR, exist_ok=True)
+os.makedirs(PDF_DIR, exist_ok=True)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 PINECONE_KEY = os.getenv("PINECONE_KEY")
@@ -65,7 +66,6 @@ if not all([PINECONE_KEY, ASSEMBLYAI_KEY, GEMINI_API_KEY]):
 # Global variables for models and services
 index, gemini_model, speaker_model, nlp, tokenizer, text_embedding_model = (None,) * 6
 
-
 def initialize_all_services_and_models():
     """Initializes all external services and loads all AI models into memory."""
     global index, gemini_model, speaker_model, nlp, tokenizer, text_embedding_model
@@ -85,10 +85,8 @@ def initialize_all_services_and_models():
     text_embedding_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device).eval()
     logger.info("All services and models are ready.")
 
-
 initialize_all_services_and_models()
 
-
 # ==============================================================================
 # 3. HELPER AND UTILITY FUNCTIONS
 # ==============================================================================
@@ -97,11 +95,11 @@ def temp_audio_file(suffix='.wav'):
     temp_file_path = None
     try:
         fd, temp_file_path = tempfile.mkstemp(suffix=suffix)
-        os.close(fd);
+        os.close(fd)
         yield temp_file_path
     finally:
-        if temp_file_path and os.path.exists(temp_file_path): os.remove(temp_file_path)
-
+        if temp_file_path and os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
 
 def convert_to_wav(input_path: str) -> str:
     temp_wav_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False).name
@@ -111,11 +109,11 @@ def convert_to_wav(input_path: str) -> str:
         subprocess.run(command, check=True, capture_output=True, text=True)
         return temp_wav_file
     except Exception as e:
-        if os.path.exists(temp_wav_file): os.remove(temp_wav_file)
-        logger.error(f"Audio conversion failed: {e}", exc_info=True);
+        if os.path.exists(temp_wav_file):
+            os.remove(temp_wav_file)
+        logger.error(f"Audio conversion failed: {e}", exc_info=True)
         raise
 
-
 def transcribe(audio_path: str) -> Dict:
     try:
         headers = {"authorization": ASSEMBLYAI_KEY}
@@ -131,21 +129,23 @@ def transcribe(audio_path: str) -> Dict:
         logger.info(f"Transcription submitted. Polling for results (ID: {transcript_id})...")
         while True:
             result = requests.get(f"https://api.assemblyai.com/v2/transcript/{transcript_id}", headers=headers).json()
-            if result['status'] == 'completed': return result
-            if result['status'] == 'error': raise Exception(f"Transcription failed: {result['error']}")
+            if result['status'] == 'completed':
+                return result
+            if result['status'] == 'error':
+                raise Exception(f"Transcription failed: {result['error']}")
             time.sleep(5)
     except Exception as e:
-        logger.error(f"Transcription failed: {e}", exc_info=True);
+        logger.error(f"Transcription failed: {e}", exc_info=True)
         raise
 
-
 def identify_speakers(transcript: Dict, wav_file_path: str) -> List[Dict]:
     try:
         full_audio = AudioSegment.from_wav(wav_file_path)
 
         def process_utterance(utterance):
             start_ms, end_ms = utterance['start'], utterance['end']
-            if end_ms - start_ms < 1000: return {**utterance, 'speaker_id': 'unknown_short_utterance'}
+            if end_ms - start_ms < 1000:
+                return {**utterance, 'speaker_id': 'unknown_short_utterance'}
             with temp_audio_file() as temp_path:
                 full_audio[start_ms:end_ms].export(temp_path, format="wav")
                 with torch.no_grad():
@@ -164,44 +164,44 @@ def identify_speakers(transcript: Dict, wav_file_path: str) -> List[Dict]:
         with ThreadPoolExecutor() as executor:
             return list(executor.map(process_utterance, transcript.get('utterances', [])))
     except Exception as e:
-        logger.error(f"Speaker identification failed: {e}", exc_info=True);
+        logger.error(f"Speaker identification failed: {e}", exc_info=True)
         raise
 
-
 def get_text_embedding(text: str) -> np.ndarray:
     with torch.no_grad():
         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True).to(device)
         outputs = text_embedding_model(**inputs)
         return outputs.last_hidden_state[0, 0, :].cpu().numpy()
 
-
 def extract_detailed_prosodic_features(audio_segment: AudioSegment) -> Dict:
     try:
         with temp_audio_file() as temp_path:
             audio_segment.export(temp_path, format="wav")
             y, sr = librosa.load(temp_path, sr=16000)
-            if len(y) == 0: return {'pitch_std': 0}
+            if len(y) == 0:
+                return {'pitch_std': 0}
             f0, _, _ = librosa.pyin(y, fmin=80, fmax=400, sr=sr)
             f0_values = f0[~np.isnan(f0)]
             return {'pitch_std': float(np.std(f0_values)) if len(f0_values) > 1 else 0}
     except Exception:
         return {'pitch_std': 0}
 
-
 def extract_duration_feature(utterances: List[Dict]) -> List[Dict]:
     for u in utterances:
         u['prosodic_features'] = {'duration': (u['end'] - u['start']) / 1000.0}
     return utterances
 
-
 def convert_to_serializable(obj):
-    if isinstance(obj, (np.integer, np.floating)): return obj.item()
-    if isinstance(obj, np.ndarray): return obj.tolist()
-    if isinstance(obj, dict): return {k: convert_to_serializable(v) for k, v in obj.items()}
-    if isinstance(obj, list): return [convert_to_serializable(item) for item in obj]
+    if isinstance(obj, (np.integer, np.floating)):
+        return obj.item()
+    if isinstance(obj, np.ndarray):
+        return obj.tolist()
+    if isinstance(obj, dict):
+        return {k: convert_to_serializable(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [convert_to_serializable(item) for item in obj]
     return obj
 
-
 # ==============================================================================
 # 4. CORE LOGIC - ULTIMATE ROLE CLASSIFIER
 # ==============================================================================
@@ -209,42 +209,45 @@ def classify_roles_ultimate(utterances: List[Dict], audio_path: str) -> List[Dict]:
     logger.info("Starting ULTIMATE role classification with prosodic analysis...")
     full_audio = AudioSegment.from_wav(audio_path)
     speakers = {u['speaker_id'] for u in utterances if 'speaker_id' in u and not u['speaker_id'].startswith('unknown')}
-    if len(speakers) < 2: return utterances
-    speaker_data = {sid: {'rule_score': 0, 'prosodic_score': 0, 'utterance_count': 0, 'embeddings': []} for sid in
-                    speakers}
+    if len(speakers) < 2:
+        return utterances
+    speaker_data = {sid: {'rule_score': 0, 'prosodic_score': 0, 'utterance_count': 0, 'embeddings': []} for sid in speakers}
     interviewer_keywords = r'\b(what|why|how|when|where|who|which|tell me about|can you explain|describe|give me an example)\b'
     for u in utterances:
         sid, text = u.get('speaker_id'), u.get('text', '').lower()
-        if sid not in speaker_data or not text: continue
-        rule_score = 10 if text.endswith('?') else 0;
+        if sid not in speaker_data or not text:
+            continue
+        rule_score = 10 if text.endswith('?') else 0
         rule_score += 5 * len(re.findall(interviewer_keywords, text))
         rule_score += 2 if len(text.split()) < 10 else -5 if len(text.split()) > 30 else 0
         speaker_data[sid]['rule_score'] += rule_score
-        segment = full_audio[u['start']:u['end']];
+        segment = full_audio[u['start']:u['end']]
         prosodic_features = extract_detailed_prosodic_features(segment)
         speaker_data[sid]['prosodic_score'] += -5 if prosodic_features['pitch_std'] > 40 else 2
-        speaker_data[sid]['embeddings'].append(get_text_embedding(u['text']));
+        speaker_data[sid]['embeddings'].append(get_text_embedding(u['text']))
         speaker_data[sid]['utterance_count'] += 1
     canonical_question_embedding = get_text_embedding("Tell me about your experience and skills.")
     for sid, data in speaker_data.items():
-        if not data['embeddings']: data['semantic_score'] = 0; continue
+        if not data['embeddings']:
+            data['semantic_score'] = 0
+            continue
         avg_embedding = np.mean(data['embeddings'], axis=0).reshape(1, -1)
         data['semantic_score'] = cosine_similarity(avg_embedding, canonical_question_embedding.reshape(1, -1))[0][0]
     final_scores = {}
     for sid, data in speaker_data.items():
-        if data['utterance_count'] == 0: final_scores[sid] = -999; continue
-        avg_rule_score = data['rule_score'] / data['utterance_count'];
+        if data['utterance_count'] == 0:
+            final_scores[sid] = -999
+            continue
+        avg_rule_score = data['rule_score'] / data['utterance_count']
         avg_prosodic_score = data['prosodic_score'] / data['utterance_count']
         final_scores[sid] = (avg_rule_score * 0.5) + (data['semantic_score'] * 0.3) + (avg_prosodic_score * 0.2)
     sorted_speakers = sorted(final_scores.items(), key=lambda item: item[1], reverse=True)
     interviewer_id, interviewee_id = sorted_speakers[0][0], sorted_speakers[1][0]
     logger.info(f"Ultimate Role Classification: Interviewer -> {interviewer_id}, Interviewee -> {interviewee_id}")
     for u in utterances:
-        u['role'] = 'Interviewer' if u.get('speaker_id') == interviewer_id else 'Interviewee' if u.get(
-            'speaker_id') == interviewee_id else 'Unknown'
+        u['role'] = 'Interviewer' if u.get('speaker_id') == interviewer_id else 'Interviewee' if u.get('speaker_id') == interviewee_id else 'Unknown'
     return utterances
 
-
 # ==============================================================================
 # 5. YOUR CUSTOM ANALYSIS & REPORTING FUNCTIONS
 # ==============================================================================
@@ -253,29 +256,29 @@ def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
    try:
        y, sr = librosa.load(audio_path, sr=16000)
        interviewee_utterances = [u for u in utterances if u.get('role') == 'Interviewee']
-        if not interviewee_utterances: return {'error': 'No interviewee utterances found'}
+        if not interviewee_utterances:
+            return {'error': 'No interviewee utterances found'}
        segments = [y[int(u['start'] * sr / 1000):int(u['end'] * sr / 1000)] for u in interviewee_utterances]
-        if not segments: return {'error': 'No valid interviewee segments to analyze.'}
+        if not segments:
+            return {'error': 'No valid interviewee segments to analyze.'}
        combined_audio = np.concatenate(segments)
        total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
        total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
        speaking_rate = total_words / total_duration if total_duration > 0 else 0
-        filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean'];
+        filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
        filler_count = sum(sum(u['text'].lower().count(fw) for fw in filler_words) for u in interviewee_utterances)
        filler_ratio = filler_count / total_words if total_words > 0 else 0
        all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
-        word_counts = {tuple(all_words[i:i + 2]): all_words.count(tuple(all_words[i:i + 2])) for i in
-                       range(len(all_words) - 1)}
-        repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(
-            word_counts) if word_counts else 0
+        word_counts = {tuple(all_words[i:i + 2]): all_words.count(tuple(all_words[i:i + 2])) for i in range(len(all_words) - 1)}
+        repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(word_counts) if word_counts else 0
        f0, voiced_flag, _ = librosa.pyin(combined_audio, fmin=80, fmax=300, sr=sr)
-        f0_values = f0[voiced_flag & ~np.isnan(f0)];
+        f0_values = f0[voiced_flag & ~np.isnan(f0)]
        pitch_mean = np.mean(f0_values) if len(f0_values) > 0 else 0
-        pitch_std = np.std(f0_values) if len(f0_values) > 0 else 0;
+        pitch_std = np.std(f0_values) if len(f0_values) > 0 else 0
        jitter = np.mean(np.abs(np.diff(f0_values))) / pitch_mean if len(f0_values) > 1 and pitch_mean > 0 else 0
-        rms = librosa.feature.rms(y=combined_audio)[0];
+        rms = librosa.feature.rms(y=combined_audio)[0]
        intensity_mean = np.mean(rms) if len(rms) > 0 else 0
-        intensity_std = np.std(rms) if len(rms) > 0 else 0;
+        intensity_std = np.std(rms) if len(rms) > 0 else 0
        shimmer = np.mean(np.abs(np.diff(rms))) / intensity_mean if len(rms) > 1 and intensity_mean > 0 else 0
        anxiety_score = 0.6 * (pitch_std / pitch_mean if pitch_mean > 0 else 0) + 0.4 * (jitter + shimmer)
        confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
@@ -288,14 +291,14 @@ def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
                'composite_scores': {'anxiety': float(anxiety_score), 'confidence': float(confidence_score),
                                     'hesitation': float(hesitation_score)}}
    except Exception as e:
-        logger.error(f"Error in detailed voice analysis: {e}", exc_info=True);
+        logger.error(f"Error in detailed voice analysis: {e}", exc_info=True)
        return {'error': str(e)}
 
-
 def generate_voice_interpretation(analysis: Dict) -> str:
-    if 'error' in analysis: return "<b>Detailed Vocal Metrics:</b><br/>Analysis not available."
-    scores = analysis.get('composite_scores', {});
-    pitch = analysis.get('pitch_analysis', {});
+    if 'error' in analysis:
+        return "<b>Detailed Vocal Metrics:</b><br/>Analysis not available."
+    scores = analysis.get('composite_scores', {})
+    pitch = analysis.get('pitch_analysis', {})
    intensity = analysis.get('intensity_analysis', {})
    return (f"<b>Detailed Vocal Metrics Interpretation:</b><br/>"
            f"- Speaking Rate: {analysis.get('speaking_rate', 0):.2f} words/sec<br/>"
@@ -309,38 +312,37 @@ def generate_voice_interpretation(analysis: Dict) -> str:
            f"- <b>Confidence Score:</b> {scores.get('confidence', 0):.3f}<br/>"
            f"- <b>Hesitation Score:</b> {scores.get('hesitation', 0):.3f}")
 
-
 def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
    try:
-        labels = ['Anxiety', 'Confidence', 'Hesitation'];
+        labels = ['Anxiety', 'Confidence', 'Hesitation']
        scores = [composite_scores.get(k.lower(), 0) for k in labels]
-        fig, ax = plt.subplots(figsize=(6, 4));
+        fig, ax = plt.subplots(figsize=(6, 4))
        ax.bar(labels, scores, color=['#FF6B6B', '#4ECDC4', '#FFA500'], edgecolor='black', width=0.5)
-        ax.set_ylabel('Score');
-        ax.set_title('Candidate Vocal Dynamics');
+        ax.set_ylabel('Score')
+        ax.set_title('Candidate Vocal Dynamics')
        ax.set_ylim(0, max(scores) * 1.2 if scores and max(scores) > 0 else 1)
-        for bar in ax.patches: ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
-                                       f"{bar.get_height():.2f}", ha='center', color='black')
-        plt.tight_layout();
-        plt.savefig(chart_path_or_buffer, format='png', dpi=150);
+        for bar in ax.patches:
+            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
+                    f"{bar.get_height():.2f}", ha='center', color='black')
+        plt.tight_layout()
+        plt.savefig(chart_path_or_buffer, format='png', dpi=150)
        plt.close(fig)
    except Exception as e:
        logger.error(f"Error generating chart: {e}")
 
-
 def calculate_acceptance_probability(analysis_data: Dict) -> float:
    logger.info("Calculating final acceptance probability...")
    voice_metrics = analysis_data.get('voice_analysis_metrics', {})
-    if 'error' in voice_metrics or not voice_metrics.get('composite_scores'): return 30.0
-    scores = voice_metrics['composite_scores'];
-    confidence = scores.get('confidence', 0.5);
-    anxiety = scores.get('anxiety', 0.5);
+    if 'error' in voice_metrics or not voice_metrics.get('composite_scores'):
+        return 30.0
+    scores = voice_metrics['composite_scores']
+    confidence = scores.get('confidence', 0.5)
+    anxiety = scores.get('anxiety', 0.5)
    hesitation = scores.get('hesitation', 0.5)
    raw_score = (confidence * 0.6) + ((1 - anxiety) * 0.2) + ((1 - hesitation) * 0.2)
    max_score = 0.6 + 0.2 + 0.2
    return round(max(10.0, min(99.0, (raw_score / max_score if max_score > 0 else 0) * 100)), 2)
 
-
 # ==============================================================================
 # 6. AI-POWERED NARRATIVE AND PDF REPORTING
 # ==============================================================================
@@ -348,25 +350,20 @@ def generate_gemini_report_text(analysis_data: Dict) -> str:
    """Generates a comprehensive narrative report using the Gemini model, based on your prompt structure."""
    logger.info("Generating AI-powered narrative report with Gemini...")
    voice = analysis_data.get('voice_analysis_metrics', {})
-    interviewee_text = "\n".join(
-        [f"- {u['text']}" for u in analysis_data['transcript_with_roles'] if u.get('role') == 'Interviewee'])
+    interviewee_text = "\n".join([f"- {u['text']}" for u in analysis_data['transcript_with_roles'] if u.get('role') == 'Interviewee'])
    acceptance_prob = analysis_data.get('acceptance_probability', 50.0)
 
    prompt = f"""
    You are EvalBot, a highly experienced senior HR analyst generating a comprehensive interview evaluation report.
    Analyze deeply based on actual responses provided below. Avoid generic analysis.
    Maintain professional, HR-standard language with clear structure and bullet points.
-
    **Suitability Score: {acceptance_prob:.2f}%**
-
    ### Interviewee Full Responses:
    {interviewee_text if interviewee_text else "No responses recorded."}
-
    ### Key Metrics:
    - Confidence Score: {voice.get('composite_scores', {}).get('confidence', 'N/A'):.2f}
    - Anxiety Score: {voice.get('composite_scores', {}).get('anxiety', 'N/A'):.2f}
    - Speaking Rate: {voice.get('speaking_rate', 'N/A')} words/sec
-
    ### Report Sections to Generate (Follow this structure exactly):
    **1. Executive Summary:**
    - 3 bullets summarizing performance, key strengths, and hiring recommendation.
@@ -381,13 +378,12 @@ def generate_gemini_report_text(analysis_data: Dict) -> str:
    - Provide 5 actionable recommendations and 5 clear next steps.
    """
    try:
-        response = gemini_model.generate_content(prompt);
+        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
-        logger.error(f"Gemini report generation failed: {e}");
+        logger.error(f"Gemini report generation failed: {e}")
        return "Error: Could not generate AI analysis report."
 
-
 def create_pdf_report(analysis_data: Dict, output_path: str):
    """Generates a detailed, professional PDF report including all analysis sections, based on your structure."""
    logger.info(f"Generating comprehensive PDF report at {output_path}...")
@@ -397,6 +393,8 @@ def create_pdf_report(analysis_data: Dict, output_path: str):
                              fontName='Helvetica-Bold', alignment=TA_CENTER))
    styles.add(ParagraphStyle(name='H2', fontSize=14, leading=18, spaceBefore=12, spaceAfter=8,
                              textColor=colors.HexColor('#0050BC'), fontName='Helvetica-Bold'))
+    styles.add(ParagraphStyle(name='H3', fontSize=12, leading=16, spaceBefore=10, spaceAfter=6,
+                              textColor=colors.HexColor('#333333'), fontName='Helvetica-Bold'))
    styles.add(ParagraphStyle(name='Body', fontSize=10, leading=14, spaceAfter=6, alignment=TA_JUSTIFY))
    story = []
 
@@ -405,10 +403,9 @@ def create_pdf_report(analysis_data: Dict, output_path: str):
    story.append(Spacer(1, 0.2 * inch))
    story.append(Paragraph(f"Candidate ID: {analysis_data.get('user_id', 'N/A')}", styles['Body']))
    story.append(Paragraph(f"Date of Analysis: {time.strftime('%B %d, %Y')}", styles['Body']))
-    prob = analysis_data.get('acceptance_probability', 0);
+    prob = analysis_data.get('acceptance_probability', 0)
    prob_color = 'green' if prob >= 75 else 'orange' if prob >= 50 else 'red'
-    story.append(
-        Paragraph(f"<b>Overall Suitability Score:</b> <font size=16 color='{prob_color}'>{prob}%</font>", styles['H2']))
+    story.append(Paragraph(f"<b>Overall Suitability Score:</b> <font size=16 color='{prob_color}'>{prob}%</font>", styles['H2']))
    story.append(PageBreak())
 
    # Quantitative Analysis Page
@@ -426,7 +423,8 @@ def create_pdf_report(analysis_data: Dict, output_path: str):
    gemini_text = analysis_data.get('gemini_report_text', 'Not available.')
    for line in gemini_text.split('\n'):
        line = line.strip()
-        if not line: continue
+        if not line:
+            continue
        if line.startswith('**') and line.endswith('**'):
            story.append(Paragraph(line.strip('*'), styles['H3']))
        elif line.startswith('- ') or line.startswith('* '):
@@ -437,13 +435,9 @@ def create_pdf_report(analysis_data: Dict, output_path: str):
    doc.build(story)
    logger.info("PDF report generated successfully.")
 
-
 # ==============================================================================
 # 7. MAIN PROCESSING PIPELINE
 # ==============================================================================
-import joblib  # Added import
-import io
-
 def process_interview(audio_path: str, user_id: str = "candidate-123") -> Dict:
    try:
        logger.info(f"Starting processing for {audio_path} (User ID: {user_id})")
@@ -492,14 +486,13 @@ def process_interview(audio_path: str, user_id: str = "candidate-123") -> Dict:
 
        logger.info("Generating report text using Gemini")
        gemini_report_text = generate_gemini_report_text(analysis_data)
+        analysis_data['gemini_report_text'] = gemini_report_text  # Add to analysis_data
 
-        base_name = f"{user_id}_{os.path.splitext(os.path.basename(audio_path))[0].split('_', 1)[1]}"
-        pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
-        if not create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text):
-            logger.error(f"Failed to create PDF report: {pdf_path}")
-            raise RuntimeError("PDF report generation failed")
+        base_name = f"{user_id}_{os.path.splitext(os.path.basename(audio_path))[0].rsplit('_', 1)[-1]}"
+        pdf_path = os.path.join(PDF_DIR, f"{base_name}_report.pdf")
+        create_pdf_report(analysis_data, pdf_path)
 
-        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
+        json_path = os.path.join(JSON_DIR, f"{base_name}_analysis.json")
        with open(json_path, 'w') as f:
            logger.debug(f"Serializing analysis_data with keys: {list(analysis_data.keys())}")
            serializable_data = convert_to_serializable(analysis_data)
@@ -516,5 +509,4 @@ def process_interview(audio_path: str, user_id: str = "candidate-123") -> Dict:
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        if 'wav_file' in locals() and os.path.exists(wav_file):
            os.remove(wav_file)
-        raise
-
+        raise
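
The most visible behavioral change above is in output naming and layout. A minimal sketch of the new logic, assuming a hypothetical upload named candidate-123_recording_17.wav (real filenames depend on the calling app):

import os

# Mirrors the new base_name logic in process_interview():
# rsplit('_', 1)[-1] keeps only the part after the LAST underscore,
# whereas the old split('_', 1)[1] kept everything after the first one
# (and raised IndexError when the stem contained no underscore at all).
audio_path = "/tmp/candidate-123_recording_17.wav"  # hypothetical upload
user_id = "candidate-123"

stem = os.path.splitext(os.path.basename(audio_path))[0]  # "candidate-123_recording_17"
base_name = f"{user_id}_{stem.rsplit('_', 1)[-1]}"        # "candidate-123_17"

# New layout introduced by this commit: JSON and PDF go to separate subfolders.
pdf_path = os.path.join("./static/outputs/pdf", f"{base_name}_report.pdf")
json_path = os.path.join("./static/outputs/json", f"{base_name}_analysis.json")
print(pdf_path)   # ./static/outputs/pdf/candidate-123_17_report.pdf
print(json_path)  # ./static/outputs/json/candidate-123_17_analysis.json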
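
For reference, a minimal driver sketch for the updated pipeline. Importing the file runs initialize_all_services_and_models() at import time, so the three API keys must be set and the model downloads must succeed first; the module name, key placeholders, and audio path below are assumptions, not part of this commit:

import os

# Set real credentials before importing; placeholders shown only for shape.
os.environ.setdefault("PINECONE_KEY", "<pinecone-key>")
os.environ.setdefault("ASSEMBLYAI_KEY", "<assemblyai-key>")
os.environ.setdefault("GEMINI_API_KEY", "<gemini-key>")

from process_interview import process_interview

# Writes ./static/outputs/pdf/<base>_report.pdf and
# ./static/outputs/json/<base>_analysis.json as side effects.
result = process_interview("uploads/candidate-123_demo_01.wav", user_id="candidate-123")
print(result)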