norhan12 committed on
Commit
285f925
·
verified ·
1 Parent(s): 8474847

Update process_interview.py

Browse files
Files changed (1) hide show
  1. process_interview.py +261 -423
process_interview.py CHANGED
@@ -17,60 +17,56 @@ from sklearn.feature_extraction.text import TfidfVectorizer
17
  import re
18
  from typing import Dict, List, Tuple
19
  import logging
20
- import tempfile
 
21
  from reportlab.lib.pagesizes import letter
22
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, Image
23
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
24
  from reportlab.lib.units import inch
25
  from reportlab.lib import colors
26
- import matplotlib.pyplot as plt
27
- import matplotlib
28
- matplotlib.use('Agg')
29
- from reportlab.platypus import Image
30
- import io
31
  from transformers import AutoTokenizer, AutoModel
32
  import spacy
33
  import google.generativeai as genai
34
  import joblib
35
  from concurrent.futures import ThreadPoolExecutor
36
 
 
 
 
 
37
  # Setup logging
38
- logging.basicConfig(level=logging.INFO)
39
- logger = logging.getLogger(_name_)
40
  logging.getLogger("nemo_logging").setLevel(logging.ERROR)
41
- logging.getLogger("nemo").setLevel(logging.ERROR)
42
 
43
  # Configuration
44
  AUDIO_DIR = "./uploads"
45
  OUTPUT_DIR = "./processed_audio"
 
46
  os.makedirs(OUTPUT_DIR, exist_ok=True)
47
 
48
- # API Keys
49
  PINECONE_KEY = os.getenv("PINECONE_KEY")
50
  ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
51
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
52
 
53
def download_audio_from_url(url: str) -> str:
    """Download an audio file from *url* into the system temp directory.

    Returns the local path of the downloaded file. Raises on any network or
    filesystem error; a partially written file is removed before re-raising
    so failed downloads do not leak temp files.
    """
    temp_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.tmp_audio")
    logger.info(f"Downloading audio from {url} to {temp_path}")
    try:
        # Stream to disk in chunks so large files never sit fully in memory;
        # the timeout keeps a dead server from hanging the pipeline forever.
        with requests.get(url, stream=True, timeout=(10, 300)) as r:
            r.raise_for_status()
            with open(temp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return temp_path
    except Exception as e:
        logger.error(f"Failed to download audio from URL {url}: {e}")
        # Don't leave a half-written file behind on failure.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
68
 
69
  def initialize_services():
70
  try:
 
71
  pc = Pinecone(api_key=PINECONE_KEY)
72
  index_name = "interview-speaker-embeddings"
73
  if index_name not in pc.list_indexes().names():
 
74
  pc.create_index(
75
  name=index_name,
76
  dimension=192,
@@ -80,9 +76,10 @@ def initialize_services():
80
  index = pc.Index(index_name)
81
  genai.configure(api_key=GEMINI_API_KEY)
82
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
 
83
  return index, gemini_model
84
  except Exception as e:
85
- logger.error(f"Error initializing services: {str(e)}")
86
  raise
87
 
88
  index, gemini_model = initialize_services()
@@ -90,29 +87,31 @@ index, gemini_model = initialize_services()
90
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
91
  logger.info(f"Using device: {device}")
92
 
93
- def load_speaker_model():
94
  try:
95
- import torch
96
- torch.set_num_threads(5)
97
- model = EncDecSpeakerLabelModel.from_pretrained(
98
  "nvidia/speakerverification_en_titanet_large",
99
  map_location=torch.device('cpu')
100
  )
101
- model.eval()
102
- return model
 
 
 
 
 
103
  except Exception as e:
104
- logger.error(f"Model loading failed: {str(e)}")
105
- raise RuntimeError("Could not load speaker verification model")
 
 
106
 
107
- def load_models():
108
- speaker_model = load_speaker_model()
109
- nlp = spacy.load("en_core_web_sm")
110
- tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
111
- llm_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
112
- llm_model.eval()
113
- return speaker_model, nlp, tokenizer, llm_model
114
 
115
- speaker_model, nlp, tokenizer, llm_model = load_models()
 
 
116
 
117
  def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
118
  try:
@@ -124,7 +123,7 @@ def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
124
  audio.export(wav_file, format="wav")
125
  return wav_file
126
  except Exception as e:
127
- logger.error(f"Audio conversion failed: {str(e)}")
128
  raise
129
 
130
  def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
@@ -150,11 +149,10 @@ def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Di
150
  os.remove(temp_path)
151
  return features
152
  except Exception as e:
153
- logger.error(f"Feature extraction failed: {str(e)}")
154
  return {
155
- 'duration': 0.0, 'mean_pitch': 0.0, 'min_pitch': 0.0, 'max_pitch': 0.0,
156
- 'pitch_sd': 0.0, 'intensityMean': 0.0, 'intensityMin': 0.0,
157
- 'intensityMax': 0.0, 'intensitySD': 0.0
158
  }
159
 
160
  def transcribe(audio_path: str) -> Dict:
@@ -162,127 +160,138 @@ def transcribe(audio_path: str) -> Dict:
162
  with open(audio_path, 'rb') as f:
163
  upload_response = requests.post(
164
  "https://api.assemblyai.com/v2/upload",
165
- headers={"authorization": ASSEMBLYAI_KEY},
166
- data=f
167
  )
168
- audio_url = upload_response.json()['upload_url']
 
 
169
  transcript_response = requests.post(
170
  "https://api.assemblyai.com/v2/transcript",
171
  headers={"authorization": ASSEMBLYAI_KEY},
172
- json={
173
- "audio_url": audio_url,
174
- "speaker_labels": True,
175
- "filter_profanity": True
176
- }
177
  )
 
178
  transcript_id = transcript_response.json()['id']
 
179
  while True:
180
- result = requests.get(
181
  f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
182
  headers={"authorization": ASSEMBLYAI_KEY}
183
- ).json()
 
 
 
184
  if result['status'] == 'completed':
 
 
 
185
  return result
186
  elif result['status'] == 'error':
187
- raise Exception(result['error'])
188
  time.sleep(5)
189
  except Exception as e:
190
- logger.error(f"Transcription failed: {str(e)}")
191
  raise
192
 
193
- def process_utterance(utterance, full_audio, wav_file):
194
  try:
195
  start = utterance['start']
196
  end = utterance['end']
197
  segment = full_audio[start:end]
198
- temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
199
  segment.export(temp_path, format="wav")
 
200
  with torch.no_grad():
201
- embedding = speaker_model.get_embedding(temp_path).cpu().numpy()
202
- embedding_list = embedding.flatten().tolist()
203
  query_result = index.query(
204
- vector=embedding_list,
205
- top_k=1,
206
- include_metadata=True
207
  )
 
208
  if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
209
  speaker_id = query_result['matches'][0]['id']
210
  speaker_name = query_result['matches'][0]['metadata']['speaker_name']
211
  else:
212
  speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
213
  speaker_name = f"Speaker_{speaker_id[-4:]}"
214
- index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_name})])
 
215
  os.remove(temp_path)
216
  return {
217
- **utterance,
218
- 'speaker': speaker_name,
219
- 'speaker_id': speaker_id,
220
- 'embedding': embedding_list
221
  }
222
  except Exception as e:
223
- logger.error(f"Utterance processing failed: {str(e)}", exc_info=True)
224
- return {
225
- **utterance,
226
- 'speaker': 'Unknown',
227
- 'speaker_id': 'unknown',
228
- 'embedding': None
229
- }
230
 
231
  def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
232
  try:
 
 
233
  full_audio = AudioSegment.from_wav(wav_file)
234
  utterances = transcript['utterances']
235
- with ThreadPoolExecutor(max_workers=5) as executor:
236
- futures = [
237
- executor.submit(process_utterance, utterance, full_audio, wav_file)
238
- for utterance in utterances
239
- ]
240
  results = [f.result() for f in futures]
241
  return results
242
  except Exception as e:
243
- logger.error(f"Speaker identification failed: {str(e)}")
244
  raise
245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  def train_role_classifier(utterances: List[Dict]):
 
247
  try:
248
  texts = [u['text'] for u in utterances]
249
  vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
250
  X_text = vectorizer.fit_transform(texts)
251
- features = []
252
- labels = []
253
  for i, utterance in enumerate(utterances):
254
  prosodic = utterance['prosodic_features']
255
  feat = [
256
  prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
257
- prosodic['max_pitch'], prosodic['pitch_sd'], prosodic['intensityMean'],
258
- prosodic['intensityMin'], prosodic['intensityMax'], prosodic['intensitySD'],
259
  ]
260
  feat.extend(X_text[i].toarray()[0].tolist())
261
  doc = nlp(utterance['text'])
262
  feat.extend([
263
  int(utterance['text'].endswith('?')),
264
- len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower())),
265
  len(utterance['text'].split()),
266
  sum(1 for token in doc if token.pos_ == 'VERB'),
267
- sum(1 for token in doc if token.pos_ == 'NOUN')
268
  ])
269
  features.append(feat)
270
- labels.append(0 if i % 2 == 0 else 1)
 
271
  scaler = StandardScaler()
272
  X = scaler.fit_transform(features)
273
- clf = RandomForestClassifier(
274
- n_estimators=150, max_depth=10, random_state=42, class_weight='balanced'
275
- )
276
  clf.fit(X, labels)
 
277
  joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
278
  joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
279
  joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
280
  return clf, vectorizer, scaler
281
  except Exception as e:
282
- logger.error(f"Classifier training failed: {str(e)}")
283
  raise
284
 
285
  def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
 
286
  try:
287
  texts = [u['text'] for u in utterances]
288
  X_text = vectorizer.transform(texts)
@@ -291,405 +300,234 @@ def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
291
  prosodic = utterance['prosodic_features']
292
  feat = [
293
  prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
294
- prosodic['max_pitch'], prosodic['pitch_sd'], prosodic['intensityMean'],
295
- prosodic['intensityMin'], prosodic['intensityMax'], prosodic['intensitySD'],
296
  ]
297
  feat.extend(X_text[i].toarray()[0].tolist())
298
  doc = nlp(utterance['text'])
299
  feat.extend([
300
  int(utterance['text'].endswith('?')),
301
- len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower())),
302
  len(utterance['text'].split()),
303
  sum(1 for token in doc if token.pos_ == 'VERB'),
304
- sum(1 for token in doc if token.pos_ == 'NOUN')
305
  ])
306
  X = scaler.transform([feat])
307
  role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
308
  results.append({**utterance, 'role': role})
309
  return results
310
  except Exception as e:
311
- logger.error(f"Role classification failed: {str(e)}")
312
- raise
 
313
 
314
- def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
 
315
  try:
316
- y, sr = librosa.load(audio_path, sr=16000)
317
- interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
318
  if not interviewee_utterances:
319
- return {'error': 'No interviewee utterances found'}
320
- segments = []
321
- for u in interviewee_utterances:
322
- start = int(u['start'] * sr / 1000)
323
- end = int(u['end'] * sr / 1000)
324
- segments.append(y[start:end])
325
  total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
326
  total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
327
  speaking_rate = total_words / total_duration if total_duration > 0 else 0
 
328
  filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
329
- filler_count = sum(sum(u['text'].lower().count(fw) for fw in filler_words) for u in interviewee_utterances)
330
  filler_ratio = filler_count / total_words if total_words > 0 else 0
331
- all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
332
- word_counts = {}
333
- for i in range(len(all_words) - 1):
334
- bigram = (all_words[i], all_words[i + 1])
335
- word_counts[bigram] = word_counts.get(bigram, 0) + 1
336
- repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(word_counts) if word_counts else 0
337
- pitches = []
338
- for segment in segments:
339
- f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
340
- pitches.extend(f0[voiced_flag])
341
- pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
342
- pitch_std = np.std(pitches) if len(pitches) > 0 else 0
343
- jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0
344
- intensities = []
345
- for segment in segments:
346
- rms = librosa.feature.rms(y=segment)[0]
347
- intensities.extend(rms)
348
- intensity_mean = np.mean(intensities) if intensities else 0
349
- intensity_std = np.std(intensities) if intensities else 0
350
- shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(intensities) > 1 and intensity_mean > 0 else 0
351
- anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
352
- confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
353
- hesitation_score = filler_ratio + repetition_score
354
- anxiety_level = 'High' if anxiety_score > 0.15 else 'Moderate' if anxiety_score > 0.07 else 'Low'
355
- confidence_level = 'High' if confidence_score > 0.7 else 'Moderate' if confidence_score > 0.5 else 'Low'
356
- fluency_level = 'Fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'Moderate' if (filler_ratio < 0.1 and repetition_score < 0.2) else 'Disfluent'
357
  return {
358
  'speaking_rate': float(round(speaking_rate, 2)),
359
  'filler_ratio': float(round(filler_ratio, 4)),
360
- 'repetition_score': float(round(repetition_score, 4)),
361
- 'pitch_analysis': {'mean': float(round(pitch_mean, 2)), 'std_dev': float(round(pitch_std, 2)), 'jitter': float(round(jitter, 4))},
362
- 'intensity_analysis': {'mean': float(round(intensity_mean, 2)), 'std_dev': float(round(intensity_std, 2)), 'shimmer': float(round(shimmer, 4))},
363
- 'composite_scores': {'anxiety': float(round(anxiety_score, 4)), 'confidence': float(round(confidence_score, 4)), 'hesitation': float(round(hesitation_score, 4))},
364
- 'interpretation': {'anxiety_level': anxiety_level, 'confidence_level': confidence_level, 'fluency_level': fluency_level}
 
365
  }
366
  except Exception as e:
367
- logger.error(f"Voice analysis failed: {str(e)}")
368
  return {'error': str(e)}
369
 
370
def generate_voice_interpretation(analysis: Dict) -> str:
    """Render a human-readable voice/speech profile from an analysis dict.

    Returns a fixed fallback sentence when the analysis carries an 'error'
    key; otherwise formats the metrics plus a static block of HR insights.
    """
    if 'error' in analysis:
        return "Voice analysis not available due to processing error."
    scores = analysis['composite_scores']
    labels = analysis['interpretation']
    profile = (
        "Voice and Speech Profile:",
        f"- Speaking Rate: {analysis['speaking_rate']} words/sec - Compared to optimal range (2.0-3.0 words/sec)",
        f"- Filler Word Usage: {analysis['filler_ratio'] * 100:.1f}% - Frequency of non-content words (e.g., 'um', 'like')",
        f"- Repetition Tendency: {analysis['repetition_score']:.3f} - Measure of repeated phrases",
        f"- Anxiety Indicator: {labels['anxiety_level']} (Score: {scores['anxiety']:.3f}) - Based on pitch and voice stability",
        f"- Confidence Indicator: {labels['confidence_level']} (Score: {scores['confidence']:.3f}) - Derived from vocal consistency",
        f"- Fluency Assessment: {labels['fluency_level']} - Reflects speech flow and coherence",
        "",
        "HR Insights:",
        "- Faster speaking rates may indicate confidence but can suggest nervousness if excessive.",
        "- High filler word usage often reduces perceived professionalism and clarity.",
        "- Elevated anxiety indicators (pitch variability, jitter) may reflect interview pressure.",
        "- Strong confidence scores suggest effective vocal presence and control.",
        "- Fluency impacts listener engagement; disfluency may hinder communication effectiveness.",
    )
    return "\n".join(profile)
390
-
391
def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
    """Render a small bar chart comparing anxiety vs. confidence scores.

    Writes a PNG to the given file path or binary buffer. Errors are logged
    rather than raised so report generation can continue without the chart.
    """
    try:
        categories = ['Anxiety', 'Confidence']
        values = [composite_scores.get('anxiety', 0), composite_scores.get('confidence', 0)]
        fig, axis = plt.subplots(figsize=(4, 2.5))
        drawn = axis.bar(categories, values, color=['#FF6B6B', '#4ECDC4'], edgecolor='black')
        axis.set_ylabel('Score (Normalized)')
        axis.set_title('Vocal Dynamics: Anxiety vs. Confidence')
        axis.set_ylim(0, 1.2)
        # Annotate each bar with its numeric value just above the top edge.
        for rect in drawn:
            top = rect.get_height()
            axis.text(rect.get_x() + rect.get_width() / 2, top + 0.05, f"{top:.2f}",
                      ha='center', color='black', fontweight='bold', fontsize=10)
        plt.tight_layout()
        plt.savefig(chart_path_or_buffer, format='png', bbox_inches='tight', dpi=150)
        plt.close(fig)
    except Exception as e:
        logger.error(f"Error generating chart: {str(e)}")
409
-
410
def calculate_acceptance_probability(analysis_data: Dict) -> float:
    """Estimate a 0-100 hiring-potential score from the voice analysis.

    Weighted blend of confidence, inverted anxiety, fluency, speaking-rate
    proximity to an ideal of 2.5 words/sec, filler/repetition penalties, and
    a crude content signal. Returns 0.0 when the voice analysis itself
    failed, and a value rounded to two decimals otherwise.
    """
    voice = analysis_data.get('voice_analysis', {})
    if 'error' in voice:
        return 0.0

    # Component weights; negative weights mark penalty components, which are
    # applied via abs() to already-inverted component scores below.
    w_conf = 0.4
    w_anx = -0.3
    w_flu = 0.2
    w_rate = 0.1
    w_fill = -0.1
    w_content = 0.2

    composite = voice.get('composite_scores', {})
    confidence = composite.get('confidence', 0.0)
    anxiety = composite.get('anxiety', 0.0)
    fluency_label = voice.get('interpretation', {}).get('fluency_level', 'Disfluent')
    rate = voice.get('speaking_rate', 0.0)
    fillers = voice.get('filler_ratio', 0.0)
    repeats = voice.get('repetition_score', 0.0)

    fluency = {'Fluent': 1.0, 'Moderate': 0.5, 'Disfluent': 0.0}.get(fluency_label, 0.0)

    ideal_rate = 2.5
    # Score 1.0 at the ideal rate, decaying linearly to 0 at twice the deviation.
    rate_score = max(0, 1 - (abs(rate - ideal_rate) / ideal_rate))
    filler_score = max(0, 1 - (fillers + repeats) / 2)
    content_score = 0.8 if analysis_data.get('text_analysis', {}).get('total_duration', 0) > 0 else 0.0

    raw = (confidence * w_conf
           + (1 - anxiety) * abs(w_anx)
           + fluency * w_flu
           + rate_score * w_rate
           + filler_score * abs(w_fill)
           + content_score * w_content)
    ceiling = w_conf + abs(w_anx) + w_flu + w_rate + abs(w_fill) + w_content
    if ceiling == 0:
        return 50.0
    bounded = max(0.0, min(1.0, raw / ceiling))
    # Two-decimal rounding via string formatting, matching report display.
    return float(f"{bounded * 100:.2f}")
434
-
435
- def generate_report(analysis_data: Dict) -> str:
436
  try:
437
  voice = analysis_data.get('voice_analysis', {})
438
- voice_interpretation = generate_voice_interpretation(voice)
439
- interviewee_responses = [f"Speaker {u['speaker']} ({u['role']}): {u['text']}" for u in analysis_data['transcript'] if u['role'] == 'Interviewee'][:5]
440
- acceptance_prob = analysis_data.get('acceptance_probability', None)
441
- acceptance_line = ""
442
- if acceptance_prob is not None:
443
- acceptance_line = f"\n*Hiring Potential Score: {acceptance_prob:.2f}%*\n"
444
- if acceptance_prob >= 80: acceptance_line += "Assessment: Exceptional candidate, strongly recommended for advancement."
445
- elif acceptance_prob >= 50: acceptance_line += "Assessment: Promising candidate with moderate strengths; consider for further evaluation."
446
- else: acceptance_line += "Assessment: Limited alignment with role expectations; significant development needed."
447
  prompt = f"""
448
- You are an expert HR consultant, EvalBot, tasked with producing a professional, concise, and actionable interview analysis report. Structure the report with clear headings, subheadings, and bullet points (use '- ' for bullets). Adopt a formal, HR-professional tone, focusing on candidate evaluation, fit for role, and development insights.
449
- {acceptance_line}
450
- *1. Executive Summary*
451
- - Provide a concise overview of the interview, highlighting key metrics and overall candidate performance.
452
- - Interview duration: {analysis_data['text_analysis']['total_duration']:.2f} seconds
453
- - Total speaker turns: {analysis_data['text_analysis']['speaker_turns']}
454
- - Participants: {', '.join(analysis_data['speakers'])}
455
- *2. Communication and Vocal Analysis*
456
- - Evaluate the candidate's vocal delivery, including speaking rate, fluency, and confidence indicators.
457
- - Provide HR-relevant insights into how these metrics impact perceived professionalism and role suitability.
458
- {voice_interpretation}
459
- *3. Content Analysis and Competency Assessment*
460
- - Analyze key themes in the candidate's responses to assess alignment with job competencies (e.g., problem-solving, communication, leadership).
461
- - Identify strengths and areas for improvement, supported by specific examples.
462
- - Sample responses for context:
463
- {chr(10).join(interviewee_responses)}
464
- *4. Fit and Potential Evaluation*
465
- - Assess the candidate's overall fit for a typical professional role based on communication, content, and vocal dynamics.
466
- - Consider cultural fit, adaptability, and readiness for the role.
467
- *5. Actionable HR Recommendations*
468
- - Provide specific, prioritized recommendations for the candidate’s development.
469
- - Focus areas: Effective Communication, Content Clarity and Depth, Professional Presence.
470
- - Suggest next steps for hiring managers (e.g., advance to next round, additional assessments, training focus).
471
  """
472
  response = gemini_model.generate_content(prompt)
473
  return response.text
474
  except Exception as e:
475
- logger.error(f"Report generation failed: {str(e)}")
476
  return f"Error generating report: {str(e)}"
477
 
478
  def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
 
479
  try:
480
- doc = SimpleDocTemplate(output_path, pagesize=letter,
481
- rightMargin=0.75*inch, leftMargin=0.75*inch,
482
- topMargin=1*inch, bottomMargin=1*inch)
483
  styles = getSampleStyleSheet()
484
- h1 = ParagraphStyle(name='Heading1', fontSize=22, leading=26, spaceAfter=20, alignment=1, textColor=colors.HexColor('#1A3C5E'))
485
- h2 = ParagraphStyle(name='Heading2', fontSize=14, leading=18, spaceBefore=14, spaceAfter=8, textColor=colors.HexColor('#2E5A87'))
486
- body_text = ParagraphStyle(name='BodyText', parent=styles['Normal'], fontSize=10, leading=14, spaceAfter=8, fontName='Helvetica')
487
- bullet_style = ParagraphStyle(name='Bullet', parent=body_text, leftIndent=20, bulletIndent=10, fontName='Helvetica')
488
-
489
  story = []
490
-
491
- def header_footer(canvas, doc):
492
- canvas.saveState()
493
- canvas.setFont('Helvetica', 9)
494
- canvas.setFillColor(colors.grey)
495
- canvas.drawString(doc.leftMargin, 0.5 * inch, f"Page {doc.page} | EvalBot HR Interview Report | Confidential")
496
- canvas.setStrokeColor(colors.HexColor('#2E5A87'))
497
- canvas.setLineWidth(1)
498
- canvas.line(doc.leftMargin, doc.height + 0.85*inch, doc.width + doc.leftMargin, doc.height + 0.85*inch)
499
- canvas.setFont('Helvetica-Bold', 10)
500
- canvas.drawString(doc.leftMargin, doc.height + 0.9*inch, "Candidate Interview Analysis Report")
501
- canvas.restoreState()
502
-
503
- # Title Page
504
- story.append(Paragraph("Candidate Interview Analysis Report", h1))
505
- story.append(Paragraph(f"Generated on: {time.strftime('%B %d, %Y')}", ParagraphStyle(name='Date', alignment=1, fontSize=10, textColor=colors.grey)))
506
- story.append(Spacer(1, 0.5 * inch))
507
- acceptance_prob = analysis_data.get('acceptance_probability')
508
- if acceptance_prob is not None:
509
- story.append(Paragraph("Hiring Potential Snapshot", h2))
510
- prob_color = colors.HexColor('#2E7D32') if acceptance_prob >= 70 else (colors.HexColor('#F57C00') if acceptance_prob >= 40 else colors.HexColor('#D32F2F'))
511
- story.append(Paragraph(f"Hiring Potential Score: <font size=16 color='{prob_color.hexval()}'><b>{acceptance_prob:.2f}%</b></font>",
512
- ParagraphStyle(name='Prob', fontSize=12, spaceAfter=12, alignment=1)))
513
- if acceptance_prob >= 80:
514
- story.append(Paragraph("<b>HR Assessment:</b> Exceptional candidate, strongly recommended for advancement to the next stage.", body_text))
515
- elif acceptance_prob >= 50:
516
- story.append(Paragraph("<b>HR Assessment:</b> Promising candidate with moderate strengths; consider for further evaluation.", body_text))
517
- else:
518
- story.append(Paragraph("<b>HR Assessment:</b> Limited alignment with role expectations; significant development needed.", body_text))
519
- story.append(Spacer(1, 0.3 * inch))
520
- story.append(Paragraph("Prepared by: EvalBot - AI-Powered HR Interview Analysis System", body_text))
521
- story.append(PageBreak())
522
-
523
- # Detailed Analysis
524
- story.append(Paragraph("Detailed Candidate Evaluation", h1))
525
 
526
- story.append(Paragraph("1. Communication and Vocal Profile", h2))
527
- voice_analysis = analysis_data.get('voice_analysis', {})
528
- if voice_analysis and 'error' not in voice_analysis:
529
- table_data = [
530
- ['Metric', 'Value', 'HR Insight'],
531
- ['Speaking Rate', f"{voice_analysis.get('speaking_rate', 0):.2f} words/sec", 'Optimal: 2.0-3.0 wps; impacts clarity and confidence'],
532
- ['Filler Word Usage', f"{voice_analysis.get('filler_ratio', 0) * 100:.1f}%", 'High usage may reduce perceived professionalism'],
533
- ['Anxiety Indicator', voice_analysis.get('interpretation', {}).get('anxiety_level', 'N/A'), f"Score: {voice_analysis.get('composite_scores', {}).get('anxiety', 0):.3f}; reflects pressure response"],
534
- ['Confidence Indicator', voice_analysis.get('interpretation', {}).get('confidence_level', 'N/A'), f"Score: {voice_analysis.get('composite_scores', {}).get('confidence', 0):.3f}; indicates vocal authority"],
535
- ['Fluency Assessment', voice_analysis.get('interpretation', {}).get('fluency_level', 'N/A'), 'Affects engagement and message delivery']
536
- ]
537
- table = Table(table_data, colWidths=[1.8*inch, 1.2*inch, 3.5*inch])
538
- table.setStyle(TableStyle([
539
- ('BACKGROUND', (0,0), (-1,0), colors.HexColor('#2E5A87')),
540
- ('TEXTCOLOR', (0,0), (-1,0), colors.whitesmoke),
541
- ('ALIGN', (0,0), (-1,-1), 'LEFT'),
542
- ('VALIGN', (0,0), (-1,-1), 'MIDDLE'),
543
- ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
544
- ('FONTSIZE', (0, 0), (-1, -1), 9),
545
- ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
546
- ('TOPPADDING', (0, 0), (-1, 0), 12),
547
- ('BACKGROUND', (0, 1), (-1, -1), colors.HexColor('#F5F7FA')),
548
- ('GRID', (0,0), (-1,-1), 1, colors.HexColor('#DDE4EB'))
549
- ]))
550
- story.append(table)
551
- story.append(Spacer(1, 0.25 * inch))
552
- chart_buffer = io.BytesIO()
553
- generate_anxiety_confidence_chart(voice_analysis.get('composite_scores', {}), chart_buffer)
554
- chart_buffer.seek(0)
555
- img = Image(chart_buffer, width=4.5*inch, height=2.8*inch)
556
- img.hAlign = 'CENTER'
557
- story.append(img)
558
- else:
559
- story.append(Paragraph("Voice analysis unavailable due to processing limitations.", body_text))
560
- story.append(Spacer(1, 0.3 * inch))
561
-
562
- # Parse Gemini Report
563
- sections = {}
564
- section_titles = ["Executive Summary", "Communication and Vocal Analysis",
565
- "Content Analysis and Competency Assessment",
566
- "Fit and Potential Evaluation", "Actionable HR Recommendations"]
567
- for title in section_titles:
568
- sections[title] = []
569
- report_parts = re.split(r'(\s*\\\s*\d\.\s*.?\s\\)', gemini_report_text)
570
- current_section = None
571
  for part in report_parts:
572
- if not part.strip(): continue
573
- is_heading = False
574
- for title in section_titles:
575
- if title.lower() in part.lower():
576
- current_section = title
577
- is_heading = True
578
- break
579
- if not is_heading and current_section:
580
- sections[current_section].append(part.strip())
581
-
582
- # Executive Summary
583
- story.append(Paragraph("2. Executive Summary", h2))
584
- if sections['Executive Summary']:
585
- for line in sections['Executive Summary']:
586
- if line.startswith(('-', '•', '*')):
587
- story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
588
- else:
589
- story.append(Paragraph(line, body_text))
590
- else:
591
- story.append(Paragraph("Summary not available from analysis.", body_text))
592
- story.append(Spacer(1, 0.3 * inch))
593
-
594
- # Content and Competency
595
- story.append(Paragraph("3. Content and Competency Assessment", h2))
596
- if sections['Content Analysis and Competency Assessment']:
597
- for line in sections['Content Analysis and Competency Assessment']:
598
- if line.startswith(('-', '•', '*')):
599
- story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
600
- else:
601
- story.append(Paragraph(line, body_text))
602
- else:
603
- story.append(Paragraph("Content and competency analysis not provided.", body_text))
604
- story.append(PageBreak())
605
-
606
- # Fit and Potential
607
- story.append(Paragraph("4. Fit and Potential Evaluation", h2))
608
- if sections['Fit and Potential Evaluation']:
609
- for line in sections['Fit and Potential Evaluation']:
610
- if line.startswith(('-', '•', '*')):
611
- story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
612
  else:
613
- story.append(Paragraph(line, body_text))
614
- else:
615
- story.append(Paragraph("Fit and potential evaluation not available.", body_text))
616
- story.append(Spacer(1, 0.3 * inch))
617
-
618
- # HR Recommendations
619
- story.append(Paragraph("5. Actionable HR Recommendations", h2))
620
- if sections['Actionable HR Recommendations']:
621
- for line in sections['Actionable HR Recommendations']:
622
- if line.startswith(('-', '•', '*')):
623
- story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
624
- else:
625
- story.append(Paragraph(line, body_text))
626
- else:
627
- story.append(Paragraph("HR recommendations not provided.", body_text))
628
-
629
- doc.build(story, onFirstPage=header_footer, onLaterPages=header_footer)
630
- return True
631
  except Exception as e:
632
- logger.error(f"Enhanced PDF creation failed: {str(e)}", exc_info=True)
633
- return False
 
 
634
 
635
def convert_to_serializable(obj):
    """Recursively convert numpy values inside *obj* to plain Python types
    so the structure can be written with json.dump.

    Numpy scalars become native scalars, ndarrays become nested lists, and
    dicts/lists/tuples are walked recursively (tuples come back as lists,
    which is how JSON represents them anyway — previously numpy scalars
    nested inside tuples were left unconverted and broke serialization).
    Anything else is returned unchanged.
    """
    if isinstance(obj, np.generic): return obj.item()
    if isinstance(obj, np.ndarray): return obj.tolist()
    if isinstance(obj, dict): return {k: convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)): return [convert_to_serializable(i) for i in obj]
    return obj
641
 
642
- def process_interview(audio_path_or_url: str):
643
- local_audio_path = None
 
 
 
 
 
 
 
 
644
  wav_file = None
645
- is_downloaded = False
646
  try:
647
- logger.info(f"Starting processing for {audio_path_or_url}")
648
- if audio_path_or_url.startswith(('http://', 'https://')):
649
- local_audio_path = download_audio_from_url(audio_path_or_url)
650
- is_downloaded = True
651
- else:
652
- local_audio_path = audio_path_or_url
653
- wav_file = convert_to_wav(local_audio_path)
654
  transcript = transcribe(wav_file)
 
 
655
  for utterance in transcript['utterances']:
656
- utterance['prosodic_features'] = extract_prosodic_features(wav_file, utterance['start'], utterance['end'])
 
 
 
 
657
  utterances_with_speakers = identify_speakers(transcript, wav_file)
658
- clf, vectorizer, scaler = None, None, None
659
- if os.path.exists(os.path.join(OUTPUT_DIR, 'role_classifier.pkl')):
660
- clf = joblib.load(os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
661
- vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
662
- scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
663
- else:
664
  clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)
665
  classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)
666
- voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)
 
 
 
667
  analysis_data = {
668
  'transcript': classified_utterances,
669
  'speakers': list(set(u['speaker'] for u in classified_utterances)),
670
  'voice_analysis': voice_analysis,
671
  'text_analysis': {
672
- 'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
673
  'speaker_turns': len(classified_utterances)
674
  }
675
  }
676
- analysis_data['acceptance_probability'] = calculate_acceptance_probability(analysis_data)
677
- gemini_report_text = generate_report(analysis_data)
678
- base_name = str(uuid.uuid4())
 
679
  pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
680
  json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
681
- create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text)
 
 
 
682
  with open(json_path, 'w') as f:
683
  serializable_data = convert_to_serializable(analysis_data)
684
  json.dump(serializable_data, f, indent=2)
685
- logger.info(f"Processing completed for {audio_path_or_url}")
 
686
  return {'pdf_path': pdf_path, 'json_path': json_path}
687
- except Exception as e:
688
- logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}", exc_info=True)
689
- raise
690
  finally:
691
  if wav_file and os.path.exists(wav_file):
692
  os.remove(wav_file)
693
- if is_downloaded and local_audio_path and os.path.exists(local_audio_path):
694
- os.remove(local_audio_path)
695
- logger.info(f"Cleaned up temporary downloaded file: {local_audio_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  import re
18
  from typing import Dict, List, Tuple
19
  import logging
20
+
21
+ # --- Imports for enhanced PDF ---
22
  from reportlab.lib.pagesizes import letter
23
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
24
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
25
  from reportlab.lib.units import inch
26
  from reportlab.lib import colors
27
+
28
+ # --- Imports for NLP and models ---
 
 
 
29
  from transformers import AutoTokenizer, AutoModel
30
  import spacy
31
  import google.generativeai as genai
32
  import joblib
33
  from concurrent.futures import ThreadPoolExecutor
34
 
35
+ # ==============================================================================
36
+ # 1. SETUP & CONFIGURATION
37
+ # ==============================================================================
38
+
39
  # Setup logging
40
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
41
+ logger = logging.getLogger(__name__)
42
  logging.getLogger("nemo_logging").setLevel(logging.ERROR)
 
43
 
44
  # Configuration
45
  AUDIO_DIR = "./uploads"
46
  OUTPUT_DIR = "./processed_audio"
47
+ os.makedirs(AUDIO_DIR, exist_ok=True)
48
  os.makedirs(OUTPUT_DIR, exist_ok=True)
49
 
50
+ # API Keys from environment variables
51
  PINECONE_KEY = os.getenv("PINECONE_KEY")
52
  ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
53
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
54
 
55
+ if not all([PINECONE_KEY, ASSEMBLYAI_KEY, GEMINI_API_KEY]):
56
+ logger.error("CRITICAL: API keys (PINECONE_KEY, ASSEMBLYAI_KEY, GEMINI_API_KEY) must be set as environment variables.")
57
+ raise EnvironmentError("API keys must be set for the application to run.")
58
+
59
+ # ==============================================================================
60
+ # 2. INITIALIZE MODELS AND SERVICES (Executed once on import)
61
+ # ==============================================================================
 
 
 
 
 
 
 
 
62
 
63
  def initialize_services():
64
  try:
65
+ logger.info("Initializing Pinecone and Gemini services...")
66
  pc = Pinecone(api_key=PINECONE_KEY)
67
  index_name = "interview-speaker-embeddings"
68
  if index_name not in pc.list_indexes().names():
69
+ logger.info(f"Creating Pinecone index: {index_name}")
70
  pc.create_index(
71
  name=index_name,
72
  dimension=192,
 
76
  index = pc.Index(index_name)
77
  genai.configure(api_key=GEMINI_API_KEY)
78
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
79
+ logger.info("Services initialized successfully.")
80
  return index, gemini_model
81
  except Exception as e:
82
+ logger.error(f"Error initializing services: {str(e)}", exc_info=True)
83
  raise
84
 
85
  index, gemini_model = initialize_services()
 
87
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
88
  logger.info(f"Using device: {device}")
89
 
90
def load_models():
    """Load the speaker-verification and spaCy NLP models once at import time.

    Returns:
        (speaker_model, nlp): the NVIDIA TitaNet speaker-embedding model
        (loaded on CPU, switched to eval mode) and the spaCy
        "en_core_web_sm" pipeline.

    Raises:
        RuntimeError: when either model fails to load; the original
        exception is chained as the cause so the root failure stays visible.
    """
    try:
        logger.info("Loading ML models...")
        # Speaker model
        speaker_model = EncDecSpeakerLabelModel.from_pretrained(
            "nvidia/speakerverification_en_titanet_large",
            map_location=torch.device('cpu')
        )
        speaker_model.eval()

        # NLP model
        nlp = spacy.load("en_core_web_sm")

        logger.info("All models loaded successfully.")
        return speaker_model, nlp
    except Exception as e:
        logger.error(f"Model loading failed: {str(e)}", exc_info=True)
        # Chain the cause explicitly instead of discarding it.
        raise RuntimeError("Could not load machine learning models.") from e

speaker_model, nlp = load_models()
110
 
 
 
 
 
 
 
 
111
 
112
+ # ==============================================================================
113
+ # 3. HELPER FUNCTIONS (The core logic for each step of the pipeline)
114
+ # ==============================================================================
115
 
116
  def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
117
  try:
 
123
  audio.export(wav_file, format="wav")
124
  return wav_file
125
  except Exception as e:
126
+ logger.error(f"Audio conversion failed for {audio_path}: {str(e)}")
127
  raise
128
 
129
  def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
 
149
  os.remove(temp_path)
150
  return features
151
  except Exception as e:
152
+ logger.warning(f"Feature extraction failed, returning zeros: {str(e)}")
153
  return {
154
+ 'duration': (end_ms - start_ms) / 1000, 'mean_pitch': 0.0, 'min_pitch': 0.0, 'max_pitch': 0.0,
155
+ 'pitch_sd': 0.0, 'intensityMean': 0.0, 'intensityMin': 0.0, 'intensityMax': 0.0, 'intensitySD': 0.0,
 
156
  }
157
 
158
  def transcribe(audio_path: str) -> Dict:
 
160
  with open(audio_path, 'rb') as f:
161
  upload_response = requests.post(
162
  "https://api.assemblyai.com/v2/upload",
163
+ headers={"authorization": ASSEMBLYAI_KEY}, data=f
 
164
  )
165
+ upload_response.raise_for_status()
166
+ audio_url = upload_response.json()['upload_url']
167
+
168
  transcript_response = requests.post(
169
  "https://api.assemblyai.com/v2/transcript",
170
  headers={"authorization": ASSEMBLYAI_KEY},
171
+ json={"audio_url": audio_url, "speaker_labels": True, "filter_profanity": True}
 
 
 
 
172
  )
173
+ transcript_response.raise_for_status()
174
  transcript_id = transcript_response.json()['id']
175
+
176
  while True:
177
+ result_response = requests.get(
178
  f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
179
  headers={"authorization": ASSEMBLYAI_KEY}
180
+ )
181
+ result_response.raise_for_status()
182
+ result = result_response.json()
183
+
184
  if result['status'] == 'completed':
185
+ if 'utterances' not in result or result['utterances'] is None:
186
+ result['utterances'] = []
187
+ logger.warning("Transcription completed but no utterances found.")
188
  return result
189
  elif result['status'] == 'error':
190
+ raise Exception(f"Transcription failed: {result['error']}")
191
  time.sleep(5)
192
  except Exception as e:
193
+ logger.error(f"Transcription process failed: {str(e)}", exc_info=True)
194
  raise
195
 
196
def process_utterance(utterance, full_audio):
    """Identify the speaker of a single utterance via voice embeddings.

    Slices the utterance's audio out of ``full_audio``, computes a speaker
    embedding with the global ``speaker_model``, and queries the Pinecone
    ``index``. A match scoring > 0.7 reuses the stored speaker; otherwise a
    new speaker entry is upserted so later utterances can match it.

    Args:
        utterance: Dict with at least 'start' and 'end' (milliseconds).
        full_audio: pydub AudioSegment of the whole recording.

    Returns:
        The utterance dict extended with 'speaker' and 'speaker_id'. On any
        failure, placeholder values are returned instead of raising, so one
        bad segment cannot abort the whole pipeline.
    """
    temp_path = None
    try:
        segment = full_audio[utterance['start']:utterance['end']]
        temp_path = os.path.join(OUTPUT_DIR, f"temp_utterance_{uuid.uuid4()}.wav")
        segment.export(temp_path, format="wav")

        with torch.no_grad():
            embedding = speaker_model.get_embedding(temp_path).to(device)
        # Compute the list form once; it is reused for query and upsert.
        embedding_vector = embedding.cpu().numpy().tolist()

        query_result = index.query(
            vector=embedding_vector, top_k=1, include_metadata=True
        )

        if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
            speaker_id = query_result['matches'][0]['id']
            speaker_name = query_result['matches'][0]['metadata']['speaker_name']
        else:
            # Unknown voice: register it for future matching.
            speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
            speaker_name = f"Speaker_{speaker_id[-4:]}"
            index.upsert([(speaker_id, embedding_vector, {"speaker_name": speaker_name})])

        return {
            **utterance, 'speaker': speaker_name, 'speaker_id': speaker_id
        }
    except Exception as e:
        logger.warning(f"Utterance processing failed: {str(e)}")
        return {**utterance, 'speaker': 'Unknown', 'speaker_id': 'unknown'}
    finally:
        # Always remove the temp clip — the original only deleted it on the
        # success path, leaking a WAV per failed utterance.
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
 
 
 
 
 
226
 
227
def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
    """Attach a speaker identity to every utterance in the transcript.

    Loads the full WAV once, then fans the per-utterance speaker matching
    out over a small thread pool. Results keep the original utterance
    order. Returns an empty list when the transcript has no utterances.
    """
    try:
        utterances = transcript.get('utterances')
        if not utterances:
            return []
        full_audio = AudioSegment.from_wav(wav_file)

        # executor.map preserves input order, matching the original
        # submit-then-collect pattern.
        with ThreadPoolExecutor(max_workers=4) as executor:
            identified = list(
                executor.map(lambda u: process_utterance(u, full_audio), utterances)
            )
        return identified
    except Exception as e:
        logger.error(f"Speaker identification failed: {str(e)}", exc_info=True)
        raise
241
 
242
def get_role_classification_models():
    """Load persisted role-classification artifacts from OUTPUT_DIR.

    Returns a (classifier, vectorizer, scaler) tuple when all three pickle
    files are present, and (None, None, None) otherwise.
    """
    artifact_names = ('role_classifier.pkl', 'text_vectorizer.pkl', 'feature_scaler.pkl')
    artifact_paths = [os.path.join(OUTPUT_DIR, name) for name in artifact_names]

    # Any missing file means the set is unusable — signal "not trained yet".
    if any(not os.path.exists(path) for path in artifact_paths):
        return None, None, None
    return tuple(joblib.load(path) for path in artifact_paths)
254
+
255
def train_role_classifier(utterances: List[Dict]):
    """Trains and saves a role classifier based on utterance features.

    Builds one feature vector per utterance from prosody (duration, pitch,
    intensity), TF-IDF text features, and simple linguistic cues (trailing
    question mark, wh-word count, word count, verb count), then fits a
    RandomForest and persists classifier/vectorizer/scaler to OUTPUT_DIR.

    NOTE(review): training labels come from turn parity (i % 2), i.e. this
    assumes interviewer and interviewee strictly alternate — confirm that
    holds for the target recordings.

    Args:
        utterances: Utterance dicts with 'text' and 'prosodic_features'.

    Returns:
        (clf, vectorizer, scaler): the fitted artifacts.

    Raises:
        Exception: any training failure is logged and re-raised.
    """
    try:
        texts = [u['text'] for u in utterances]
        vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
        X_text = vectorizer.fit_transform(texts)
        features, labels = [], []
        # Simple heuristic: assume alternating speakers are interviewer/interviewee
        for i, utterance in enumerate(utterances):
            prosodic = utterance['prosodic_features']
            # Prosodic features first, in a fixed order — classify_roles must
            # assemble its vectors in exactly the same order.
            feat = [
                prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
                prosodic['pitch_sd'], prosodic['intensityMean'],
            ]
            feat.extend(X_text[i].toarray()[0].tolist())
            doc = nlp(utterance['text'])
            # Linguistic cues that separate short questions (interviewer)
            # from longer narrative answers (interviewee).
            feat.extend([
                int(utterance['text'].endswith('?')),
                len(re.findall(r'\b(why|how|what|when|where)\b', utterance['text'].lower())),
                len(utterance['text'].split()),
                sum(1 for token in doc if token.pos_ == 'VERB'),
            ])
            features.append(feat)
            labels.append(i % 2)  # 0 for interviewer, 1 for interviewee
        scaler = StandardScaler()
        X = scaler.fit_transform(features)
        clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
        clf.fit(X, labels)

        # Persist all three artifacts so later runs can skip retraining.
        joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
        joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
        joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
        return clf, vectorizer, scaler
    except Exception as e:
        logger.error(f"Classifier training failed: {str(e)}", exc_info=True)
        raise
292
 
293
  def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
294
+ """Classifies roles for each utterance using a pre-trained model."""
295
  try:
296
  texts = [u['text'] for u in utterances]
297
  X_text = vectorizer.transform(texts)
 
300
  prosodic = utterance['prosodic_features']
301
  feat = [
302
  prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
303
+ prosodic['pitch_sd'], prosodic['intensityMean'],
 
304
  ]
305
  feat.extend(X_text[i].toarray()[0].tolist())
306
  doc = nlp(utterance['text'])
307
  feat.extend([
308
  int(utterance['text'].endswith('?')),
309
+ len(re.findall(r'\b(why|how|what|when|where)\b', utterance['text'].lower())),
310
  len(utterance['text'].split()),
311
  sum(1 for token in doc if token.pos_ == 'VERB'),
 
312
  ])
313
  X = scaler.transform([feat])
314
  role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
315
  results.append({**utterance, 'role': role})
316
  return results
317
  except Exception as e:
318
+ logger.error(f"Role classification failed: {str(e)}", exc_info=True)
319
+ # Fallback if classification fails
320
+ return [dict(u, role='Unknown') for u in utterances]
321
 
322
def analyze_interviewee_voice(utterances: List[Dict]) -> Dict:
    """Compute voice/delivery metrics for the interviewee's utterances.

    Aggregates speaking rate, filler-word usage and pitch statistics over
    all utterances tagged role == 'Interviewee', and derives simple
    anxiety/confidence composite scores from them.

    Args:
        utterances: Utterance dicts with 'role', 'text' and
            'prosodic_features' containing 'duration' and 'mean_pitch'.

    Returns:
        Metrics dict (all values native floats, rounded), or
        {'error': ...} when there is nothing to analyze or a failure
        occurs — this function never raises.
    """
    try:
        interviewee_utterances = [u for u in utterances if u.get('role') == 'Interviewee']
        if not interviewee_utterances:
            return {'error': 'No interviewee utterances found to analyze.'}

        total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
        total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
        speaking_rate = total_words / total_duration if total_duration > 0 else 0

        # Count fillers as whole words/phrases. The previous substring
        # .count() inflated the tally ("um" in "summer", "so" in "also").
        filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
        filler_patterns = [re.compile(r'\b' + re.escape(fw) + r'\b') for fw in filler_words]
        filler_count = sum(
            len(pattern.findall(u['text'].lower()))
            for u in interviewee_utterances
            for pattern in filler_patterns
        )
        filler_ratio = filler_count / total_words if total_words > 0 else 0

        # mean_pitch of 0 signals failed extraction for that utterance; skip it.
        all_pitches = [u['prosodic_features']['mean_pitch'] for u in interviewee_utterances if u['prosodic_features']['mean_pitch'] > 0]
        pitch_mean = np.mean(all_pitches) if all_pitches else 0
        pitch_std = np.std(all_pitches) if all_pitches else 0

        # Heuristic composites: high pitch variability and heavy filler use
        # read as anxiety; confidence is its complement, floored at 0.
        anxiety_score = (pitch_std / 100) + (filler_ratio * 2)
        confidence_score = 1 - anxiety_score if anxiety_score < 1 else 0

        return {
            'speaking_rate': float(round(speaking_rate, 2)),
            'filler_ratio': float(round(filler_ratio, 4)),
            'pitch_mean': float(round(pitch_mean, 2)),
            'pitch_std_dev': float(round(pitch_std, 2)),
            'composite_scores': {
                'anxiety': float(round(anxiety_score, 4)),
                'confidence': float(round(confidence_score, 4)),
            }
        }
    except Exception as e:
        logger.error(f"Voice analysis failed: {str(e)}", exc_info=True)
        return {'error': str(e)}
357
 
358
def generate_report_text(analysis_data: Dict) -> str:
    """Generates the text for the final report using Gemini.

    Builds a prompt from interview metadata, the interviewee's voice
    metrics, and their first three responses, then asks the global
    ``gemini_model`` for a structured report (executive summary, strengths,
    areas for improvement).

    Args:
        analysis_data: Dict with 'transcript', 'speakers', 'voice_analysis'
            and 'text_analysis' keys as assembled by the pipeline.

    Returns:
        The generated report text, or an "Error generating report: ..."
        string on failure — this function never raises, so report
        generation cannot abort the run.
    """
    try:
        voice = analysis_data.get('voice_analysis', {})
        # Only the interviewee's answers are fed to the model.
        interviewee_responses = [u['text'] for u in analysis_data['transcript'] if u.get('role') == 'Interviewee']

        prompt = f"""
        Analyze the following interview data and generate a concise, professional report.

        **Interview Data:**
        - Total Duration: {analysis_data['text_analysis']['total_duration']:.2f} seconds
        - Speaker Turns: {analysis_data['text_analysis']['speaker_turns']}
        - Speakers: {', '.join(analysis_data['speakers'])}

        **Voice Analysis of Interviewee:**
        - Speaking Rate: {voice.get('speaking_rate', 'N/A')} words/sec
        - Filler Word Ratio: {voice.get('filler_ratio', 'N/A')}
        - Anxiety Score (lower is better): {voice.get('composite_scores', {}).get('anxiety', 'N/A')}
        - Confidence Score (higher is better): {voice.get('composite_scores', {}).get('confidence', 'N/A')}

        **Interviewee's Key Responses:**
        - {"- ".join(interviewee_responses[:3])}

        **Task:**
        Based on all the data above, provide:
        1. **Executive Summary:** A brief paragraph summarizing the candidate's performance.
        2. **Strengths:** 2-3 bullet points on what the candidate did well (e.g., clear articulation, confidence).
        3. **Areas for Improvement:** 2-3 bullet points on specific, actionable feedback (e.g., reduce filler words, elaborate on answers).
        """
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Report generation with Gemini failed: {str(e)}", exc_info=True)
        return f"Error generating report: {str(e)}"
392
 
393
def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
    """Render the Gemini report text into a simple PDF at ``output_path``.

    Lines starting with '**' are rendered as section headings, everything
    else as body text. On any ReportLab failure the raw report text is
    written next to ``output_path`` as a .txt fallback instead of raising.

    Args:
        analysis_data: Full analysis dict (currently unused here; kept for
            interface stability with callers).
        output_path: Destination .pdf path.
        gemini_report_text: Markdown-ish text produced by Gemini.
    """
    try:
        # stdlib; ReportLab Paragraphs parse mini-XML, so raw &, <, > from
        # the LLM output must be escaped or doc.build() raises.
        from xml.sax.saxutils import escape

        doc = SimpleDocTemplate(output_path, pagesize=letter)
        styles = getSampleStyleSheet()
        story = []

        story.append(Paragraph("Interview Analysis Report", styles['h1']))
        story.append(Spacer(1, 0.2 * inch))

        # Split Gemini text into paragraphs for cleaner formatting
        for part in gemini_report_text.split('\n'):
            if not part.strip():
                continue
            if part.startswith('**'):
                story.append(Paragraph(escape(part.replace('**', '')), styles['h2']))
            else:
                story.append(Paragraph(escape(part), styles['BodyText']))

        doc.build(story)
    except Exception as e:
        logger.error(f"PDF creation failed: {str(e)}", exc_info=True)
        # Fallback: write the raw text next to the intended PDF. splitext
        # avoids corrupting paths that contain '.pdf' elsewhere (the old
        # str.replace touched every occurrence), and UTF-8 is forced so
        # non-ASCII report text cannot make the fallback itself fail.
        fallback_path = os.path.splitext(output_path)[0] + '.txt'
        with open(fallback_path, 'w', encoding='utf-8') as f:
            f.write(gemini_report_text)
418
 
419
def convert_to_serializable(obj):
    """Recursively convert numpy values to JSON-serializable Python types.

    Numpy scalars become native scalars, ndarrays become nested lists, and
    dict/list/tuple containers are converted elementwise (tuples come back
    as lists, which matches how ``json.dump`` would encode them anyway).
    Everything else is returned unchanged.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        # Tuples previously passed through untouched, letting numpy values
        # inside them leak through and break json.dump downstream.
        return [convert_to_serializable(item) for item in obj]
    return obj
426
 
427
+
428
+ # ==============================================================================
429
+ # 4. ORCHESTRATION FUNCTIONS
430
+ # ==============================================================================
431
+
432
def _process_local_audio_file(local_audio_path: str, base_name: str) -> dict:
    """
    Internal function to process a local audio file.
    This contains the main pipeline logic.

    Pipeline: WAV conversion -> transcription -> prosodic features ->
    speaker identification -> role classification -> voice analysis ->
    Gemini report -> PDF + JSON output. The intermediate WAV is always
    deleted, even on failure.

    Args:
        local_audio_path: Path to the audio file on local disk.
        base_name: Basename (without extension) for the output files.

    Returns:
        Dict with 'pdf_path' and 'json_path' of the generated files.
    """
    wav_file = None
    try:
        logger.info(f"Step 1/8: Converting to WAV: {local_audio_path}")
        wav_file = convert_to_wav(local_audio_path, OUTPUT_DIR)

        logger.info("Step 2/8: Transcribing audio...")
        transcript = transcribe(wav_file)

        logger.info("Step 3/8: Extracting prosodic features...")
        # Annotate every utterance in place with pitch/intensity/duration stats.
        for utterance in transcript['utterances']:
            utterance['prosodic_features'] = extract_prosodic_features(
                wav_file, utterance['start'], utterance['end']
            )

        logger.info("Step 4/8: Identifying speakers...")
        utterances_with_speakers = identify_speakers(transcript, wav_file)

        logger.info("Step 5/8: Classifying speaker roles...")
        # Reuse persisted classifier artifacts when available; otherwise
        # train (and persist) a fresh one from this interview.
        clf, vectorizer, scaler = get_role_classification_models()
        if not clf:
            logger.info("No role classifier found, training a new one...")
            clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)
        classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)

        logger.info("Step 6/8: Analyzing interviewee voice...")
        voice_analysis = analyze_interviewee_voice(classified_utterances)

        analysis_data = {
            'transcript': classified_utterances,
            'speakers': list(set(u['speaker'] for u in classified_utterances)),
            'voice_analysis': voice_analysis,
            'text_analysis': {
                # presumably AssemblyAI's audio_duration is in seconds — TODO confirm
                'total_duration': transcript.get('audio_duration', 0),
                'speaker_turns': len(classified_utterances)
            }
        }

        logger.info("Step 7/8: Generating report text with Gemini...")
        gemini_report_text = generate_report_text(analysis_data)

        pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")

        logger.info(f"Step 8/8: Creating output files (PDF and JSON)...")
        create_pdf_report(analysis_data, pdf_path, gemini_report_text)

        with open(json_path, 'w') as f:
            # Numpy scalars/arrays are not JSON-serializable; convert first.
            serializable_data = convert_to_serializable(analysis_data)
            json.dump(serializable_data, f, indent=2)

        logger.info("Processing completed successfully.")
        return {'pdf_path': pdf_path, 'json_path': json_path}

    finally:
        # Remove the intermediate WAV regardless of success or failure.
        if wav_file and os.path.exists(wav_file):
            os.remove(wav_file)
            logger.info(f"Cleaned up temporary WAV file: {wav_file}")
494
+
495
def process_interview(audio_url: str) -> dict:
    """
    Main public function called by the API. It downloads a file from a URL,
    processes it using the internal pipeline, and returns the output file paths.

    Args:
        audio_url: Publicly reachable URL of the interview recording.

    Returns:
        Dict with 'pdf_path' and 'json_path' of the generated artifacts.

    Raises:
        RuntimeError: if the file cannot be downloaded.
        Exception: any pipeline failure is logged and re-raised.
    """
    from urllib.parse import urlparse  # stdlib; extract a clean path component

    temp_audio_path = None
    try:
        # 1. Download the audio file from the URL
        logger.info(f"Downloading audio from URL: {audio_url}")
        response = requests.get(audio_url, stream=True, timeout=60)  # 60 second timeout
        response.raise_for_status()  # Raise an exception for bad status codes

        # Generate a unique name for the temporary file. Derive the
        # extension from the URL *path* only — splitting the raw URL would
        # drag any query string/fragment (e.g. "?token=...") into the
        # filename and produce an invalid/odd extension.
        original_filename = os.path.basename(urlparse(audio_url).path)
        file_extension = os.path.splitext(original_filename)[1] or '.tmp'
        base_name = f"{uuid.uuid4()}"
        temp_audio_path = os.path.join(AUDIO_DIR, f"{base_name}{file_extension}")

        with open(temp_audio_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        logger.info(f"Audio downloaded and saved to: {temp_audio_path}")

        # 2. Process the downloaded local file using the main pipeline
        result = _process_local_audio_file(temp_audio_path, base_name)
        return result

    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to download or access URL {audio_url}: {e}")
        raise RuntimeError(f"Could not download file from URL: {audio_url}") from e
    except Exception as e:
        logger.error(f"An unexpected error occurred during processing for URL {audio_url}: {e}", exc_info=True)
        raise
    finally:
        # 3. Clean up the downloaded audio file
        if temp_audio_path and os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)
            logger.info(f"Cleaned up temporary downloaded file: {temp_audio_path}")