norhan12 commited on
Commit
bfc9b9d
·
verified ·
1 Parent(s): 8e1b49d

Update process_interview.py

Browse files
Files changed (1) hide show
  1. process_interview.py +507 -303
process_interview.py CHANGED
@@ -17,56 +17,60 @@ from sklearn.feature_extraction.text import TfidfVectorizer
17
  import re
18
  from typing import Dict, List, Tuple
19
  import logging
20
-
21
- # --- Imports for enhanced PDF ---
22
  from reportlab.lib.pagesizes import letter
23
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
24
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
25
  from reportlab.lib.units import inch
26
  from reportlab.lib import colors
27
-
28
- # --- Imports for NLP and models ---
 
 
 
29
  from transformers import AutoTokenizer, AutoModel
30
  import spacy
31
  import google.generativeai as genai
32
  import joblib
33
  from concurrent.futures import ThreadPoolExecutor
34
 
35
- # ==============================================================================
36
- # 1. SETUP & CONFIGURATION
37
- # ==============================================================================
38
-
39
  # Setup logging
40
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
41
  logger = logging.getLogger(__name__)
42
- logging.getLogger("nemo_logging").setLevel(logging.ERROR)
 
43
 
44
  # Configuration
45
- AUDIO_DIR = "./uploads"
46
  OUTPUT_DIR = "./processed_audio"
47
- os.makedirs(AUDIO_DIR, exist_ok=True)
48
  os.makedirs(OUTPUT_DIR, exist_ok=True)
49
 
50
- # API Keys from environment variables
51
- PINECONE_KEY = os.getenv("PINECONE_KEY")
52
- ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
53
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
54
 
55
- if not all([PINECONE_KEY, ASSEMBLYAI_KEY, GEMINI_API_KEY]):
56
- logger.error("CRITICAL: API keys (PINECONE_KEY, ASSEMBLYAI_KEY, GEMINI_API_KEY) must be set as environment variables.")
57
- raise EnvironmentError("API keys must be set for the application to run.")
58
-
59
- # ==============================================================================
60
- # 2. INITIALIZE MODELS AND SERVICES (Executed once on import)
61
- # ==============================================================================
 
 
 
 
 
 
 
 
62
 
63
  def initialize_services():
64
  try:
65
- logger.info("Initializing Pinecone and Gemini services...")
66
  pc = Pinecone(api_key=PINECONE_KEY)
67
  index_name = "interview-speaker-embeddings"
68
  if index_name not in pc.list_indexes().names():
69
- logger.info(f"Creating Pinecone index: {index_name}")
70
  pc.create_index(
71
  name=index_name,
72
  dimension=192,
@@ -76,10 +80,9 @@ def initialize_services():
76
  index = pc.Index(index_name)
77
  genai.configure(api_key=GEMINI_API_KEY)
78
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
79
- logger.info("Services initialized successfully.")
80
  return index, gemini_model
81
  except Exception as e:
82
- logger.error(f"Error initializing services: {str(e)}", exc_info=True)
83
  raise
84
 
85
  index, gemini_model = initialize_services()
@@ -87,31 +90,29 @@ index, gemini_model = initialize_services()
87
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
88
  logger.info(f"Using device: {device}")
89
 
90
- def load_models():
91
  try:
92
- logger.info("Loading ML models...")
93
- # Speaker model
94
- speaker_model = EncDecSpeakerLabelModel.from_pretrained(
95
  "nvidia/speakerverification_en_titanet_large",
96
  map_location=torch.device('cpu')
97
  )
98
- speaker_model.eval()
99
-
100
- # NLP model
101
- nlp = spacy.load("en_core_web_sm")
102
-
103
- logger.info("All models loaded successfully.")
104
- return speaker_model, nlp
105
  except Exception as e:
106
- logger.error(f"Model loading failed: {str(e)}", exc_info=True)
107
- raise RuntimeError("Could not load machine learning models.")
108
-
109
- speaker_model, nlp = load_models()
110
 
 
 
 
 
 
 
 
111
 
112
- # ==============================================================================
113
- # 3. HELPER FUNCTIONS (The core logic for each step of the pipeline)
114
- # ==============================================================================
115
 
116
  def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
117
  try:
@@ -123,7 +124,7 @@ def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
123
  audio.export(wav_file, format="wav")
124
  return wav_file
125
  except Exception as e:
126
- logger.error(f"Audio conversion failed for {audio_path}: {str(e)}")
127
  raise
128
 
129
  def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
@@ -149,397 +150,600 @@ def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Di
149
  os.remove(temp_path)
150
  return features
151
  except Exception as e:
152
- logger.warning(f"Feature extraction failed, returning zeros: {str(e)}")
153
  return {
154
- 'duration': (end_ms - start_ms) / 1000, 'mean_pitch': 0.0, 'min_pitch': 0.0, 'max_pitch': 0.0,
155
- 'pitch_sd': 0.0, 'intensityMean': 0.0, 'intensityMin': 0.0, 'intensityMax': 0.0, 'intensitySD': 0.0,
 
156
  }
157
 
158
  def transcribe(audio_path: str) -> Dict:
159
- """
160
- Transcribes the audio file using AssemblyAI with enhanced speaker diarization.
161
- """
162
  try:
163
  with open(audio_path, 'rb') as f:
164
  upload_response = requests.post(
165
  "https://api.assemblyai.com/v2/upload",
166
- headers={"authorization": ASSEMBLYAI_KEY}, data=f
 
167
  )
168
- upload_response.raise_for_status()
169
- audio_url = upload_response.json()['upload_url']
170
-
171
- json_payload = {
172
- "audio_url": audio_url,
173
- "speaker_labels": True,
174
- "speakers_expected": 2 }
175
- # --------------------
176
-
177
  transcript_response = requests.post(
178
  "https://api.assemblyai.com/v2/transcript",
179
  headers={"authorization": ASSEMBLYAI_KEY},
180
- json=json_payload
 
 
 
 
181
  )
182
- transcript_response.raise_for_status()
183
  transcript_id = transcript_response.json()['id']
184
-
185
  while True:
186
- result_response = requests.get(
187
  f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
188
  headers={"authorization": ASSEMBLYAI_KEY}
189
- )
190
- result_response.raise_for_status()
191
- result = result_response.json()
192
-
193
  if result['status'] == 'completed':
194
- if 'utterances' not in result or result['utterances'] is None:
195
- result['utterances'] = []
196
- logger.warning("Transcription completed but no utterances found.")
197
  return result
198
  elif result['status'] == 'error':
199
- raise Exception(f"Transcription failed: {result['error']}")
200
-
201
- logger.info(f"Transcription status: {result['status']}...")
202
  time.sleep(5)
203
-
204
  except Exception as e:
205
- logger.error(f"Transcription process failed: {str(e)}", exc_info=True)
206
  raise
207
 
208
- def process_utterance(utterance, full_audio):
209
  try:
210
  start = utterance['start']
211
  end = utterance['end']
212
  segment = full_audio[start:end]
213
- temp_path = os.path.join(OUTPUT_DIR, f"temp_utterance_{uuid.uuid4()}.wav")
214
  segment.export(temp_path, format="wav")
215
-
216
  with torch.no_grad():
217
- embedding = speaker_model.get_embedding(temp_path).to(device)
218
-
219
  query_result = index.query(
220
- vector=embedding.cpu().numpy().tolist(), top_k=1, include_metadata=True
 
 
221
  )
222
-
223
  if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
224
  speaker_id = query_result['matches'][0]['id']
225
  speaker_name = query_result['matches'][0]['metadata']['speaker_name']
226
  else:
227
  speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
228
  speaker_name = f"Speaker_{speaker_id[-4:]}"
229
- index.upsert([(speaker_id, embedding.cpu().numpy().tolist(), {"speaker_name": speaker_name})])
230
-
231
  os.remove(temp_path)
232
  return {
233
- **utterance, 'speaker': speaker_name, 'speaker_id': speaker_id
 
 
 
234
  }
235
  except Exception as e:
236
- logger.warning(f"Utterance processing failed: {str(e)}")
237
- return {**utterance, 'speaker': 'Unknown', 'speaker_id': 'unknown'}
 
 
 
 
 
238
 
239
- def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
240
  try:
241
- if not transcript.get('utterances'):
242
- return []
243
- full_audio = AudioSegment.from_wav(wav_file)
244
- utterances = transcript['utterances']
245
-
246
- with ThreadPoolExecutor(max_workers=4) as executor:
247
- futures = [executor.submit(process_utterance, utterance, full_audio) for utterance in utterances]
248
  results = [f.result() for f in futures]
249
  return results
250
  except Exception as e:
251
- logger.error(f"Speaker identification failed: {str(e)}", exc_info=True)
252
  raise
253
 
254
- def get_role_classification_models():
255
- """Loads role classification models if they exist, otherwise returns None."""
256
- clf_path = os.path.join(OUTPUT_DIR, 'role_classifier.pkl')
257
- vec_path = os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl')
258
- scl_path = os.path.join(OUTPUT_DIR, 'feature_scaler.pkl')
259
-
260
- if all(os.path.exists(p) for p in [clf_path, vec_path, scl_path]):
261
- clf = joblib.load(clf_path)
262
- vectorizer = joblib.load(vec_path)
263
- scaler = joblib.load(scl_path)
264
- return clf, vectorizer, scaler
265
- return None, None, None
266
-
267
- def train_role_classifier(utterances: List[Dict]):
268
- """Trains and saves a role classifier based on utterance features."""
269
  try:
270
- texts = [u['text'] for u in utterances]
271
- vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
272
- X_text = vectorizer.fit_transform(texts)
273
- features, labels = [], []
274
- # Simple heuristic: assume alternating speakers are interviewer/interviewee
275
- for i, utterance in enumerate(utterances):
276
- prosodic = utterance['prosodic_features']
277
  feat = [
278
- prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
279
- prosodic['pitch_sd'], prosodic['intensityMean'],
 
 
 
280
  ]
281
- feat.extend(X_text[i].toarray()[0].tolist())
282
- doc = nlp(utterance['text'])
283
- feat.extend([
284
- int(utterance['text'].endswith('?')),
285
- len(re.findall(r'\b(why|how|what|when|where)\b', utterance['text'].lower())),
286
- len(utterance['text'].split()),
287
- sum(1 for token in doc if token.pos_ == 'VERB'),
 
288
  ])
289
  features.append(feat)
290
- labels.append(i % 2) # 0 for interviewer, 1 for interviewee
291
-
292
  scaler = StandardScaler()
293
  X = scaler.fit_transform(features)
294
- clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
 
 
295
  clf.fit(X, labels)
296
-
297
  joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
298
  joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
299
  joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
300
  return clf, vectorizer, scaler
301
  except Exception as e:
302
- logger.error(f"Classifier training failed: {str(e)}", exc_info=True)
303
  raise
304
 
305
- def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
306
- """Classifies roles for each utterance using a pre-trained model."""
307
  try:
308
- texts = [u['text'] for u in utterances]
309
- X_text = vectorizer.transform(texts)
310
  results = []
311
- for i, utterance in enumerate(utterances):
312
- prosodic = utterance['prosodic_features']
313
  feat = [
314
  prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
315
- prosodic['pitch_sd'], prosodic['intensityMean'],
 
316
  ]
317
  feat.extend(X_text[i].toarray()[0].tolist())
318
- doc = nlp(utterance['text'])
319
  feat.extend([
320
- int(utterance['text'].endswith('?')),
321
- len(re.findall(r'\b(why|how|what|when|where)\b', utterance['text'].lower())),
322
- len(utterance['text'].split()),
323
  sum(1 for token in doc if token.pos_ == 'VERB'),
 
324
  ])
325
  X = scaler.transform([feat])
326
  role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
327
- results.append({**utterance, 'role': role})
328
  return results
329
  except Exception as e:
330
- logger.error(f"Role classification failed: {str(e)}", exc_info=True)
331
- # Fallback if classification fails
332
- return [dict(u, role='Unknown') for u in utterances]
333
 
334
- def analyze_interviewee_voice(utterances: List[Dict]) -> Dict:
335
- # (This function is complex, including it fully)
336
  try:
337
- interviewee_utterances = [u for u in utterances if u.get('role') == 'Interviewee']
338
- if not interviewee_utterances:
339
- return {'error': 'No interviewee utterances found to analyze.'}
340
-
341
- total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
342
- total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
 
 
 
 
 
343
  speaking_rate = total_words / total_duration if total_duration > 0 else 0
344
-
345
  filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
346
- filler_count = sum(u['text'].lower().count(fw) for u in interviewee_utterances for fw in filler_words)
347
  filler_ratio = filler_count / total_words if total_words > 0 else 0
348
-
349
- all_pitches = [u['prosodic_features']['mean_pitch'] for u in interviewee_utterances if u['prosodic_features']['mean_pitch'] > 0]
350
- pitch_mean = np.mean(all_pitches) if all_pitches else 0
351
- pitch_std = np.std(all_pitches) if all_pitches else 0
352
-
353
- anxiety_score = (pitch_std / 100) + (filler_ratio * 2)
354
- confidence_score = 1 - anxiety_score if anxiety_score < 1 else 0
355
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  return {
357
  'speaking_rate': float(round(speaking_rate, 2)),
358
  'filler_ratio': float(round(filler_ratio, 4)),
359
- 'pitch_mean': float(round(pitch_mean, 2)),
360
- 'pitch_std_dev': float(round(pitch_std, 2)),
361
- 'composite_scores': {
362
- 'anxiety': float(round(anxiety_score, 4)),
363
- 'confidence': float(round(confidence_score, 4)),
364
- }
365
  }
366
  except Exception as e:
367
- logger.error(f"Voice analysis failed: {str(e)}", exc_info=True)
368
  return {'error': str(e)}
369
 
370
- def generate_report_text(analysis_data: Dict) -> str:
371
- """Generates the text for the final report using Gemini."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  try:
373
  voice = analysis_data.get('voice_analysis', {})
374
- interviewee_responses = [u['text'] for u in analysis_data['transcript'] if u.get('role') == 'Interviewee']
375
-
 
 
 
 
 
 
 
 
376
  prompt = f"""
377
- Analyze the following interview data and generate a concise, professional report.
378
-
379
- **Interview Data:**
380
- - Total Duration: {analysis_data['text_analysis']['total_duration']:.2f} seconds
381
- - Speaker Turns: {analysis_data['text_analysis']['speaker_turns']}
382
- - Speakers: {', '.join(analysis_data['speakers'])}
383
-
384
- **Voice Analysis of Interviewee:**
385
- - Speaking Rate: {voice.get('speaking_rate', 'N/A')} words/sec
386
- - Filler Word Ratio: {voice.get('filler_ratio', 'N/A')}
387
- - Anxiety Score (lower is better): {voice.get('composite_scores', {}).get('anxiety', 'N/A')}
388
- - Confidence Score (higher is better): {voice.get('composite_scores', {}).get('confidence', 'N/A')}
389
-
390
- **Interviewee's Key Responses:**
391
- - {"- ".join(interviewee_responses[:3])}
392
-
393
- **Task:**
394
- Based on all the data above, provide:
395
- 1. **Executive Summary:** A brief paragraph summarizing the candidate's performance.
396
- 2. **Strengths:** 2-3 bullet points on what the candidate did well (e.g., clear articulation, confidence).
397
- 3. **Areas for Improvement:** 2-3 bullet points on specific, actionable feedback (e.g., reduce filler words, elaborate on answers).
 
 
398
  """
399
  response = gemini_model.generate_content(prompt)
400
  return response.text
401
  except Exception as e:
402
- logger.error(f"Report generation with Gemini failed: {str(e)}", exc_info=True)
403
  return f"Error generating report: {str(e)}"
404
 
405
  def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
406
- """Creates a PDF report from the analysis data."""
407
  try:
408
- doc = SimpleDocTemplate(output_path, pagesize=letter)
 
 
409
  styles = getSampleStyleSheet()
410
- story = []
 
 
 
 
411
 
412
- story.append(Paragraph("Interview Analysis Report", styles['h1']))
413
- story.append(Spacer(1, 0.2 * inch))
414
 
415
- # Split Gemini text into paragraphs for cleaner formatting
416
- report_parts = gemini_report_text.split('\n')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
417
  for part in report_parts:
418
- if part.strip():
419
- if part.startswith('**'):
420
- story.append(Paragraph(part.replace('**', ''), styles['h2']))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  else:
422
- story.append(Paragraph(part, styles['BodyText']))
423
-
424
- doc.build(story)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
  except Exception as e:
426
  logger.error(f"PDF creation failed: {str(e)}", exc_info=True)
427
- # Create a fallback text file if PDF fails
428
- with open(output_path.replace('.pdf', '.txt'), 'w') as f:
429
- f.write(gemini_report_text)
430
 
431
  def convert_to_serializable(obj):
432
- """Converts numpy types to native Python types for JSON serialization."""
433
  if isinstance(obj, np.generic): return obj.item()
434
- if isinstance(obj, dict): return {key: convert_to_serializable(value) for key, value in obj.items()}
435
- if isinstance(obj, list): return [convert_to_serializable(item) for item in obj]
436
  if isinstance(obj, np.ndarray): return obj.tolist()
437
  return obj
438
 
439
-
440
- # ==============================================================================
441
- # 4. ORCHESTRATION FUNCTIONS
442
- # ==============================================================================
443
-
444
- def _process_local_audio_file(local_audio_path: str, base_name: str) -> dict:
445
- """
446
- Internal function to process a local audio file.
447
- This contains the main pipeline logic.
448
- """
449
  wav_file = None
 
450
  try:
451
- logger.info(f"Step 1/8: Converting to WAV: {local_audio_path}")
452
- wav_file = convert_to_wav(local_audio_path, OUTPUT_DIR)
453
-
454
- logger.info("Step 2/8: Transcribing audio...")
 
 
 
455
  transcript = transcribe(wav_file)
456
-
457
- logger.info("Step 3/8: Extracting prosodic features...")
458
  for utterance in transcript['utterances']:
459
- utterance['prosodic_features'] = extract_prosodic_features(
460
- wav_file, utterance['start'], utterance['end']
461
- )
462
-
463
- logger.info("Step 4/8: Identifying speakers...")
464
  utterances_with_speakers = identify_speakers(transcript, wav_file)
465
-
466
- logger.info("Step 5/8: Classifying speaker roles...")
467
- clf, vectorizer, scaler = get_role_classification_models()
468
- if not clf:
469
- logger.info("No role classifier found, training a new one...")
 
470
  clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)
471
  classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)
472
-
473
- logger.info("Step 6/8: Analyzing interviewee voice...")
474
- voice_analysis = analyze_interviewee_voice(classified_utterances)
475
-
476
  analysis_data = {
477
  'transcript': classified_utterances,
478
  'speakers': list(set(u['speaker'] for u in classified_utterances)),
479
  'voice_analysis': voice_analysis,
480
  'text_analysis': {
481
- 'total_duration': transcript.get('audio_duration', 0),
482
  'speaker_turns': len(classified_utterances)
483
  }
484
  }
485
-
486
- logger.info("Step 7/8: Generating report text with Gemini...")
487
- gemini_report_text = generate_report_text(analysis_data)
488
-
489
  pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
490
  json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
491
-
492
- logger.info(f"Step 8/8: Creating output files (PDF and JSON)...")
493
- create_pdf_report(analysis_data, pdf_path, gemini_report_text)
494
-
495
  with open(json_path, 'w') as f:
496
  serializable_data = convert_to_serializable(analysis_data)
497
  json.dump(serializable_data, f, indent=2)
498
-
499
- logger.info("Processing completed successfully.")
500
  return {'pdf_path': pdf_path, 'json_path': json_path}
501
-
502
- finally:
503
- if wav_file and os.path.exists(wav_file):
504
- os.remove(wav_file)
505
- logger.info(f"Cleaned up temporary WAV file: {wav_file}")
506
-
507
- def process_interview(audio_url: str) -> dict:
508
- """
509
- Main public function called by the API. It downloads a file from a URL,
510
- processes it using the internal pipeline, and returns the output file paths.
511
- """
512
- temp_audio_path = None
513
- try:
514
- # 1. Download the audio file from the URL
515
- logger.info(f"Downloading audio from URL: {audio_url}")
516
- response = requests.get(audio_url, stream=True, timeout=60) # 60 second timeout
517
- response.raise_for_status() # Raise an exception for bad status codes
518
-
519
- # Generate a unique name for the temporary file
520
- original_filename = audio_url.split('/')[-1]
521
- file_extension = os.path.splitext(original_filename)[1] or '.tmp'
522
- base_name = f"{uuid.uuid4()}"
523
- temp_audio_path = os.path.join(AUDIO_DIR, f"{base_name}{file_extension}")
524
-
525
- with open(temp_audio_path, 'wb') as f:
526
- for chunk in response.iter_content(chunk_size=8192):
527
- f.write(chunk)
528
-
529
- logger.info(f"Audio downloaded and saved to: {temp_audio_path}")
530
-
531
- # 2. Process the downloaded local file using the main pipeline
532
- result = _process_local_audio_file(temp_audio_path, base_name)
533
- return result
534
-
535
- except requests.exceptions.RequestException as e:
536
- logger.error(f"Failed to download or access URL {audio_url}: {e}")
537
- raise RuntimeError(f"Could not download file from URL: {audio_url}") from e
538
  except Exception as e:
539
- logger.error(f"An unexpected error occurred during processing for URL {audio_url}: {e}", exc_info=True)
540
  raise
541
  finally:
542
- # 3. Clean up the downloaded audio file
543
- if temp_audio_path and os.path.exists(temp_audio_path):
544
- os.remove(temp_audio_path)
545
- logger.info(f"Cleaned up temporary downloaded file: {temp_audio_path}")
 
 
17
  import re
18
  from typing import Dict, List, Tuple
19
  import logging
20
+ import tempfile
 
21
  from reportlab.lib.pagesizes import letter
22
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, Image
23
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
24
  from reportlab.lib.units import inch
25
  from reportlab.lib import colors
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib
28
+ matplotlib.use('Agg')
29
+ from reportlab.platypus import Image
30
+ import io
31
  from transformers import AutoTokenizer, AutoModel
32
  import spacy
33
  import google.generativeai as genai
34
  import joblib
35
  from concurrent.futures import ThreadPoolExecutor
36
 
 
 
 
 
37
  # Setup logging
38
+ logging.basicConfig(level=logging.INFO)
39
  logger = logging.getLogger(__name__)
40
+ logging.getLogger("nemo_logging").setLevel(logging.INFO)
41
+ logging.getLogger("nemo").setLevel(logging.INFO)
42
 
43
  # Configuration
44
+ AUDIO_DIR = "./Uploads"
45
  OUTPUT_DIR = "./processed_audio"
 
46
  os.makedirs(OUTPUT_DIR, exist_ok=True)
47
 
48
# API keys are read from environment variables; never hard-code secrets.
PINECONE_KEY = os.getenv("PINECONE_KEY")
ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
52
 
53
def download_audio_from_url(url: str) -> str:
    """Download an audio file from *url* to a temporary local path.

    Streams the response in 8 KiB chunks so large files are never held
    fully in memory. Returns the path of the downloaded temp file; the
    caller is responsible for deleting it.

    Raises:
        Exception: re-raised after logging, on network failure or a
        non-2xx HTTP status.
    """
    try:
        temp_dir = tempfile.gettempdir()
        # Unique name avoids collisions between concurrent downloads.
        temp_path = os.path.join(temp_dir, f"{uuid.uuid4()}.tmp_audio")
        logger.info(f"Downloading audio from {url} to {temp_path}")
        # timeout guards against a hung connection stalling the whole pipeline
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(temp_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return temp_path
    except Exception as e:
        logger.error(f"Failed to download audio from URL {url}: {e}")
        raise
68
 
69
  def initialize_services():
70
  try:
 
71
  pc = Pinecone(api_key=PINECONE_KEY)
72
  index_name = "interview-speaker-embeddings"
73
  if index_name not in pc.list_indexes().names():
 
74
  pc.create_index(
75
  name=index_name,
76
  dimension=192,
 
80
  index = pc.Index(index_name)
81
  genai.configure(api_key=GEMINI_API_KEY)
82
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
 
83
  return index, gemini_model
84
  except Exception as e:
85
+ logger.error(f"Error initializing services: {str(e)}")
86
  raise
87
 
88
  index, gemini_model = initialize_services()
 
90
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
91
  logger.info(f"Using device: {device}")
92
 
93
def load_speaker_model():
    """Load the NVIDIA TitaNet speaker-verification model on CPU.

    Returns the model in eval mode.

    Raises:
        RuntimeError: chained to the original error when the download or
        initialisation fails.
    """
    try:
        # torch is already imported at module level (used for `device`);
        # cap CPU threads so inference doesn't starve the worker pool.
        torch.set_num_threads(5)
        model = EncDecSpeakerLabelModel.from_pretrained(
            "nvidia/speakerverification_en_titanet_large",
            map_location=torch.device('cpu')
        )
        model.eval()  # inference only — freeze dropout/batch-norm behaviour
        return model
    except Exception as e:
        logger.error(f"Model loading failed: {str(e)}")
        raise RuntimeError("Could not load speaker verification model") from e
 
 
106
 
107
def load_models():
    """Load every NLP/ML model the pipeline needs.

    Returns a 4-tuple: (speaker-verification model, spaCy pipeline,
    DistilBERT tokenizer, DistilBERT encoder moved to the active device).
    """
    verification_model = load_speaker_model()
    spacy_pipeline = spacy.load("en_core_web_sm")
    bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    bert_encoder = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
    bert_encoder.eval()  # encoder is used for inference only
    return verification_model, spacy_pipeline, bert_tokenizer, bert_encoder

# Loaded once at import time and shared by all pipeline functions.
speaker_model, nlp, tokenizer, llm_model = load_models()
 
 
116
 
117
  def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
118
  try:
 
124
  audio.export(wav_file, format="wav")
125
  return wav_file
126
  except Exception as e:
127
+ logger.error(f"Audio conversion failed: {str(e)}")
128
  raise
129
 
130
  def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
 
150
  os.remove(temp_path)
151
  return features
152
  except Exception as e:
153
+ logger.error(f"Feature extraction failed: {str(e)}")
154
  return {
155
+ 'duration': 0.0, 'mean_pitch': 0.0, 'min_pitch': 0.0, 'max_pitch': 0.0,
156
+ 'pitch_sd': 0.0, 'intensityMean': 0.0, 'intensityMin': 0.0,
157
+ 'intensityMax': 0.0, 'intensitySD': 0.0
158
  }
159
 
160
def transcribe(audio_path: str) -> Dict:
    """Transcribe *audio_path* with AssemblyAI, with speaker diarization.

    Uploads the file, creates a transcription job, then polls every 5 s
    until the job finishes.

    Returns:
        The full AssemblyAI result dict; its 'utterances' entry is
        normalised to a list (possibly empty) so callers can iterate it.

    Raises:
        Exception: if the job reports an error or any HTTP call fails.
    """
    try:
        with open(audio_path, 'rb') as f:
            upload_response = requests.post(
                "https://api.assemblyai.com/v2/upload",
                headers={"authorization": ASSEMBLYAI_KEY},
                data=f
            )
        upload_response.raise_for_status()  # fail fast on bad key/quota
        audio_url = upload_response.json()['upload_url']
        transcript_response = requests.post(
            "https://api.assemblyai.com/v2/transcript",
            headers={"authorization": ASSEMBLYAI_KEY},
            json={
                "audio_url": audio_url,
                "speaker_labels": True,
                "filter_profanity": True
            }
        )
        transcript_response.raise_for_status()
        transcript_id = transcript_response.json()['id']
        while True:
            result_response = requests.get(
                f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
                headers={"authorization": ASSEMBLYAI_KEY}
            )
            result_response.raise_for_status()
            result = result_response.json()
            if result['status'] == 'completed':
                # Downstream code iterates result['utterances']; guard
                # against it being absent or None.
                if not result.get('utterances'):
                    result['utterances'] = []
                    logger.warning("Transcription completed but no utterances found.")
                return result
            elif result['status'] == 'error':
                raise Exception(result['error'])
            time.sleep(5)  # poll interval while job is queued/processing
    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}")
        raise
192
 
193
def process_utterance(utterance, full_audio, wav_file):
    """Identify the speaker of one diarized utterance.

    Exports the utterance's audio span to a temp WAV, embeds it with the
    speaker-verification model, and queries Pinecone for the nearest
    stored embedding. A similarity score > 0.7 reuses the known speaker;
    otherwise a fresh speaker entry is upserted.

    Returns the utterance dict augmented with 'speaker', 'speaker_id'
    and 'embedding'. On any failure returns it with 'Unknown' fields so
    one bad segment doesn't abort the whole batch.
    """
    temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
    try:
        segment = full_audio[utterance['start']:utterance['end']]
        segment.export(temp_path, format="wav")
        with torch.no_grad():  # embedding extraction only — no gradients
            embedding = speaker_model.get_embedding(temp_path).cpu().numpy()
        embedding_list = embedding.flatten().tolist()
        query_result = index.query(
            vector=embedding_list,
            top_k=1,
            include_metadata=True
        )
        # 0.7 match threshold — TODO confirm against the verification model's docs
        if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
            speaker_id = query_result['matches'][0]['id']
            speaker_name = query_result['matches'][0]['metadata']['speaker_name']
        else:
            speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
            speaker_name = f"Speaker_{speaker_id[-4:]}"
            # Store the display name (not the id) so future lookups resolve it.
            index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_name})])
        return {
            **utterance,
            'speaker': speaker_name,
            'speaker_id': speaker_id,
            'embedding': embedding_list
        }
    except Exception as e:
        logger.error(f"Utterance processing failed: {str(e)}", exc_info=True)
        return {
            **utterance,
            'speaker': 'Unknown',
            'speaker_id': 'unknown',
            'embedding': None
        }
    finally:
        # Always remove the temp clip, even when embedding/query failed.
        if os.path.exists(temp_path):
            os.remove(temp_path)
230
 
231
def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
    """Attach a speaker identity to every utterance in *transcript*.

    Loads the full interview WAV once, then runs process_utterance over
    the utterances concurrently (5 worker threads).

    Returns the augmented utterance dicts, or an empty list when the
    transcript carries no utterances.

    Raises:
        Exception: re-raised after logging on any unexpected failure.
    """
    try:
        utterances = transcript.get('utterances') or []
        if not utterances:
            return []
        full_audio = AudioSegment.from_wav(wav_file)
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [
                executor.submit(process_utterance, utterance, full_audio, wav_file)
                for utterance in utterances
            ]
            results = [f.result() for f in futures]
        return results
    except Exception as e:
        logger.error(f"Speaker identification failed: {str(e)}")
        raise
245
 
246
def train_role_classifier(speakers: List[Dict]):
    """Train and persist an interviewer/interviewee role classifier.

    Features per utterance: 9 prosodic values, a TF-IDF vector of the
    text, and 5 lexical cues (trailing '?', wh-word count, word count,
    verb count, noun count) — the exact layout classify_roles() builds
    at predict time, so the dimensions must stay in sync.

    Labels come from the alternating-turns heuristic (even turn index =
    interviewer = 0, odd = interviewee = 1).

    Saves the classifier, vectorizer and scaler under OUTPUT_DIR and
    returns them as a (clf, vectorizer, scaler) tuple.
    """
    try:
        # NOTE(review): utterance text is stored under the 'speech' key
        # in this version of the pipeline — confirm against the producer.
        speech = [u['speech'] for u in speakers]
        vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
        X_text = vectorizer.fit_transform(speech)
        features = []
        labels = []
        for i, speaker in enumerate(speakers):
            prosodic = speaker['speech_features']
            # Same 9 prosodic features, in the same order, as classify_roles().
            feat = [
                prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
                prosodic['max_pitch'], prosodic['pitch_sd'], prosodic['intensityMean'],
                prosodic['intensityMin'], prosodic['intensityMax'], prosodic['intensitySD'],
            ]
            feat.extend(X_text[i].toarray()[0].tolist())
            doc = nlp(speaker['speech'])
            feat.extend([
                int(speaker['speech'].endswith('?')),
                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', speaker['speech'].lower())),
                len(speaker['speech'].split()),
                sum(1 for token in doc if token.pos_ == 'VERB'),
                sum(1 for token in doc if token.pos_ == 'NOUN')
            ])
            features.append(feat)
            # Alternating-turns heuristic — TODO confirm it holds for real interviews.
            labels.append(0 if i % 2 == 0 else 1)
        scaler = StandardScaler()
        X = scaler.fit_transform(features)
        clf = RandomForestClassifier(
            n_estimators=150, max_depth=10, random_state=42, class_weight='balanced'
        )
        clf.fit(X, labels)
        joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
        joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
        joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
        return clf, vectorizer, scaler
    except Exception as e:
        logger.error(f"Classifier training failed: {str(e)}")
        raise
286
 
287
def classify_roles(speakers: List[Dict], clf, vectorizer, scaler):
    """Label each utterance 'Interviewer' or 'Interviewee'.

    Rebuilds the feature layout train_role_classifier() was fitted on
    (9 prosodic values + TF-IDF vector + 5 lexical cues) and predicts
    with *clf* (class 0 = Interviewer, 1 = Interviewee).

    On any failure, falls back to role='Unknown' for every utterance so
    the rest of the pipeline can continue instead of aborting.
    """
    try:
        speech = [u['speech'] for u in speakers]
        X_text = vectorizer.transform(speech)
        results = []
        for i, speaker in enumerate(speakers):
            prosodic = speaker['speech_features']
            feat = [
                prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
                prosodic['max_pitch'], prosodic['pitch_sd'], prosodic['intensityMean'],
                prosodic['intensityMin'], prosodic['intensityMax'], prosodic['intensitySD'],
            ]
            feat.extend(X_text[i].toarray()[0].tolist())
            doc = nlp(speaker['speech'])
            feat.extend([
                int(speaker['speech'].endswith('?')),
                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', speaker['speech'].lower())),
                len(speaker['speech'].split()),
                sum(1 for token in doc if token.pos_ == 'VERB'),
                sum(1 for token in doc if token.pos_ == 'NOUN')
            ])
            X = scaler.transform([feat])
            role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
            results.append({**speaker, 'role': role})
        return results
    except Exception as e:
        logger.error(f"Role classification failed: {str(e)}")
        # Degrade gracefully — an 'Unknown' role is better than killing the run.
        return [dict(u, role='Unknown') for u in speakers]
 
315
 
316
def analyze_interviewee_voice(audio_path: str, speakers: List[Dict]) -> Dict:
    """Profile the interviewee's vocal delivery from their audio segments.

    Extracts pacing, filler-word usage, bigram repetition, pitch and intensity
    statistics, then folds them into heuristic anxiety/confidence/hesitation
    composites with coarse qualitative labels.

    Returns a metrics dict, or {'error': ...} when analysis is impossible.
    """
    try:
        signal, sample_rate = librosa.load(audio_path, sr=16000)
        candidate_turns = [turn for turn in speakers if turn['role'] == 'Interviewee']
        if not candidate_turns:
            return {'error': 'No interviewee speeches found'}

        # Slice out the interviewee's audio; 'start'/'end' are milliseconds.
        clips = [
            signal[int(turn['start'] * sample_rate / 1000):int(turn['end'] * sample_rate / 1000)]
            for turn in candidate_turns
        ]

        spoken_seconds = sum(turn['speech_features']['duration'] for turn in candidate_turns)
        word_total = sum(len(turn['speech'].split()) for turn in candidate_turns)
        speaking_rate = word_total / spoken_seconds if spoken_seconds > 0 else 0

        filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
        filler_count = sum(
            sum(turn['speech'].lower().count(fw) for fw in filler_words)
            for turn in candidate_turns
        )
        filler_ratio = filler_count / word_total if word_total > 0 else 0

        # Repetition: fraction of distinct word bigrams seen more than once.
        tokens = ' '.join(turn['speech'].lower() for turn in candidate_turns).split()
        bigram_counts = {}
        for pair in zip(tokens, tokens[1:]):
            bigram_counts[pair] = bigram_counts.get(pair, 0) + 1
        repetition_score = (
            sum(1 for n in bigram_counts.values() if n > 1) / len(bigram_counts)
            if bigram_counts else 0
        )

        # Pitch track over voiced frames only (pyin flags unvoiced frames).
        pitches = []
        for clip in clips:
            f0, voiced_flag, _ = librosa.pyin(clip, fmin=80, fmax=300, sr=sample_rate)
            pitches.extend(f0[voiced_flag])
        pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
        pitch_std = np.std(pitches) if len(pitches) > 0 else 0
        jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0

        # Frame-level RMS energy serves as the intensity proxy.
        intensities = []
        for clip in clips:
            intensities.extend(librosa.feature.rms(y=clip)[0])
        intensity_mean = np.mean(intensities) if intensities else 0
        intensity_std = np.std(intensities) if intensities else 0
        shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(intensities) > 1 and intensity_mean > 0 else 0

        # Heuristic composites; weights and cutoffs are hand-tuned, not calibrated.
        anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
        confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
        hesitation_score = filler_ratio + repetition_score
        anxiety_level = 'High' if anxiety_score > 0.15 else 'Moderate' if anxiety_score > 0.07 else 'Low'
        confidence_level = 'High' if confidence_score > 0.7 else 'Moderate' if confidence_score > 0.5 else 'Low'
        fluency_level = 'Fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'Moderate' if (filler_ratio < 0.1 and repetition_score < 0.2) else 'Disfluent'

        return {
            'speaking_rate': float(round(speaking_rate, 2)),
            'filler_ratio': float(round(filler_ratio, 4)),
            'repetition_score': float(round(repetition_score, 4)),
            'pitch_analysis': {'mean': float(round(pitch_mean, 2)), 'std_dev': float(round(pitch_std, 2)), 'jitter': float(round(jitter, 4))},
            'intensity_analysis': {'mean': float(round(intensity_mean, 2)), 'std_dev': float(round(intensity_std, 2)), 'shimmer': float(round(shimmer, 4))},
            'composite_scores': {'anxiety': float(round(anxiety_score, 4)), 'confidence': float(round(confidence_score, 4)), 'hesitation': float(round(hesitation_score, 4))},
            'interpretation': {'anxiety_level': anxiety_level, 'confidence_level': confidence_level, 'fluency_level': fluency_level}
        }
    except Exception as e:
        logger.error(f"Voice analysis failed: {str(e)}")
        return {'error': str(e)}
371
 
372
def generate_voice_interpretation(analysis: Dict) -> str:
    """Format the voice-analysis dict into a human-readable summary string.

    Returns a fixed fallback sentence when the analysis carries an 'error' key.
    """
    if 'error' in analysis:
        return "Voice analysis unavailable due to processing limitations."
    scores = analysis['composite_scores']
    levels = analysis['interpretation']
    # Metric lines first, then the static HR-guidance block.
    metric_lines = [
        "Vocal Performance Profile:",
        f"- Speaking Rate: {analysis['speaking_rate']} words/sec - Benchmark: 2.0-3.0 wps for clear delivery",
        f"- Filler Word Frequency: {analysis['filler_ratio'] * 100:.1f}% - Measures non-content words",
        f"- Repetition Index: {analysis['repetition_score']:.3f} - Frequency of repeated phrases",
        f"- Anxiety Indicator: {levels['anxiety_level']} (Score: {scores['anxiety']:.3f}) - Pitch and vocal stability",
        f"- Confidence Indicator: {levels['confidence_level']} (Score: {scores['confidence']:.3f}) - Vocal strength",
        f"- Fluency Rating: {levels['fluency_level']} - Speech flow and coherence",
    ]
    insight_lines = [
        "",
        "HR Insights:",
        "- Rapid speech (>3.0 wps) may signal enthusiasm but risks clarity.",
        "- High filler word use reduces perceived professionalism.",
        "- Elevated anxiety suggests pressure; training can build resilience.",
        "- Strong confidence aligns with leadership presence.",
        "- Fluent speech enhances engagement, critical for team roles.",
    ]
    return "\n".join(metric_lines + insight_lines)
392
+
393
def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
    """Render a two-bar anxiety-vs-confidence PNG to a path or file-like buffer.

    Failures are logged and swallowed (best-effort chart; the report can still
    be produced without it).
    """
    try:
        categories = ['Anxiety', 'Confidence']
        values = [composite_scores.get('anxiety', 0), composite_scores.get('confidence', 0)]
        figure, axes = plt.subplots(figsize=(5, 3.5))
        patches = axes.bar(categories, values, color=['#FF5252', '#26A69A'],
                           edgecolor='black', width=0.45)
        axes.set_ylabel('Score (Normalized)', fontsize=12)
        axes.set_title('Vocal Dynamics: Anxiety vs. Confidence', fontsize=14, pad=15)
        axes.set_ylim(0, 1.3)
        # Annotate each bar with its numeric value just above the top edge.
        for patch in patches:
            top = patch.get_height()
            axes.text(patch.get_x() + patch.get_width() / 2, top + 0.05, f"{top:.2f}",
                      ha='center', color='black', fontweight='bold', fontsize=11)
        axes.grid(True, axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(chart_path_or_buffer, format='png', bbox_inches='tight', dpi=300)
        plt.close(figure)
    except Exception as e:
        logger.error(f"Error generating chart: {str(e)}")
412
+
413
def calculate_acceptance_probability(analysis_data: Dict) -> float:
    """Score hiring suitability in [0, 100] from the voice/text analysis.

    A weighted linear blend of confidence, (inverse) anxiety, fluency,
    speaking-rate proximity to an ideal, disfluency, and a coarse content
    signal, normalized by the maximum attainable score. Returns 0.0 when the
    voice analysis failed.
    """
    voice = analysis_data.get('voice_analysis', {})
    if 'error' in voice:
        return 0.0

    # Weights; negative signs mark penalizing factors (applied via abs()).
    w_confidence = 0.35
    w_anxiety = -0.25
    w_fluency = 0.2
    w_speaking_rate = 0.15
    w_filler_repetition = -0.15
    w_content_strengths = 0.25

    composite = voice.get('composite_scores', {})
    confidence = composite.get('confidence', 0.0)
    anxiety = composite.get('anxiety', 0.0)

    fluency_label = voice.get('interpretation', {}).get('fluency_level', 'Disfluent')
    fluency_val = {'Fluent': 1.0, 'Moderate': 0.6, 'Disfluent': 0.2}.get(fluency_label, 0.2)

    # Penalize deviation from an ideal ~2.5 words/sec delivery.
    ideal_rate = 2.5
    rate = voice.get('speaking_rate', 0.0)
    rate_score = max(0, 1 - (abs(rate - ideal_rate) / ideal_rate))

    disfluency = (voice.get('filler_ratio', 0.0) + voice.get('repetition_score', 0.0)) / 2
    disfluency_score = max(0, 1 - disfluency)

    # Crude content proxy: interviews longer than a minute score higher.
    long_enough = analysis_data.get('text_analysis', {}).get('total_duration', 0) > 60
    content_val = 0.85 if long_enough else 0.4

    raw_score = (confidence * w_confidence
                 + (1 - anxiety) * abs(w_anxiety)
                 + fluency_val * w_fluency
                 + rate_score * w_speaking_rate
                 + disfluency_score * abs(w_filler_repetition)
                 + content_val * w_content_strengths)
    ceiling = (w_confidence + abs(w_anxiety) + w_fluency
               + w_speaking_rate + abs(w_filler_repetition) + w_content_strengths)
    if ceiling == 0:
        return 50.0
    probability = max(0.0, min(1.0, raw_score / ceiling))
    return float(f"{probability * 100:.2f}")
437
+
438
def generate_report(analysis_data: Dict) -> str:
    """Ask Gemini for a narrative HR report built from the analysis data.

    On any failure the error is logged and an "Error generating report: ..."
    string is returned instead of raising.
    """
    try:
        voice = analysis_data.get('voice_analysis', {})
        voice_interpretation = generate_voice_interpretation(voice)

        # NOTE(review): utterances elsewhere in this module carry a 'speech'
        # key; confirm transcript entries really expose 'text' here.
        responses = []
        for u in analysis_data['transcript']:
            if u['role'] == 'Interviewee':
                responses.append(f"Speaker {u['speaker']} ({u['role']}): {u['text']}")
        interviewee_responses = responses[:6]

        acceptance_prob = analysis_data.get('acceptance_probability', None)
        acceptance_line = ""
        if acceptance_prob is not None:
            acceptance_line = f"\n**Hiring Suitability Score: {acceptance_prob:.2f}%**\n"
            verdict_bands = (
                (80, "HR Verdict: Outstanding candidate, highly recommended for immediate advancement."),
                (60, "HR Verdict: Strong candidate, suitable for further evaluation with targeted development."),
                (40, "HR Verdict: Moderate potential, requires additional assessment and skill-building."),
            )
            for threshold, verdict in verdict_bands:
                if acceptance_prob >= threshold:
                    acceptance_line += verdict
                    break
            else:
                acceptance_line += "HR Verdict: Limited fit, significant improvement needed for role alignment."

        prompt = f"""
You are EvalBot, a senior HR consultant with 20+ years of experience, delivering a polished, concise, and engaging interview analysis report. Use a professional tone, clear headings, and bullet points ('- ') for readability. Avoid redundancy and ensure distinct sections for strengths, growth areas, and recommendations.
{acceptance_line}
**1. Executive Summary**
- Provide a concise overview of performance, key metrics, and hiring potential.
- Interview length: {analysis_data['text_analysis']['total_duration']:.2f} seconds
- Speaker turns: {analysis_data['text_analysis']['speaker_turns']}
- Participants: {', '.join(analysis_data['speakers'])}
**2. Communication and Vocal Dynamics**
- Evaluate vocal delivery (rate, fluency, confidence) and professional impact.
- Offer HR insights on workplace alignment.
{voice_interpretation}
**3. Competency and Content Evaluation**
- Assess competencies: leadership, problem-solving, communication, adaptability.
- List strengths and growth areas separately, with specific examples.
- Sample responses:
{chr(10).join(interviewee_responses)}
**4. Role Fit and Growth Potential**
- Analyze cultural fit, role readiness, and long-term potential.
- Highlight enthusiasm and scalability.
**5. Strategic HR Recommendations**
- Provide distinct, prioritized strategies for candidate growth.
- Target: Communication, Response Depth, Professional Presence.
- List clear next steps for hiring managers (e.g., advance, train, assess).
        """
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Report generation failed: {str(e)}")
        return f"Error generating report: {str(e)}"
481
 
482
def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
    """Render the interview analysis into a styled PDF at ``output_path``.

    Combines the structured metrics in ``analysis_data`` with the free-text
    Gemini report, which is split back into sections by its numbered
    '**N. ...**' headings and routed into the PDF layout heuristically.

    Args:
        analysis_data: Pipeline output (transcript, voice_analysis,
            text_analysis, speakers, acceptance_probability).
        output_path: Destination file path for the PDF.
        gemini_report_text: Narrative report text from generate_report().

    Returns:
        True on success, False on any failure (the error is logged, not raised).
    """
    try:
        # Fix: Image and PageBreak are used below but are missing from the
        # module-level ``from reportlab.platypus import ...`` line, which made
        # these names undefined at runtime. Import them locally.
        from reportlab.platypus import Image, PageBreak

        doc = SimpleDocTemplate(output_path, pagesize=letter,
                                rightMargin=0.7*inch, leftMargin=0.7*inch,
                                topMargin=0.9*inch, bottomMargin=0.9*inch)

        # Paragraph styles used throughout the report.
        h1 = ParagraphStyle(name='Heading1', fontSize=22, leading=26, spaceAfter=20, alignment=1, textColor=colors.HexColor('#003087'), fontName='Helvetica-Bold')
        h2 = ParagraphStyle(name='Heading2', fontSize=15, leading=18, spaceBefore=14, spaceAfter=8, textColor=colors.HexColor('#0050BC'), fontName='Helvetica-Bold')
        h3 = ParagraphStyle(name='Heading3', fontSize=11, leading=14, spaceBefore=10, spaceAfter=6, textColor=colors.HexColor('#3F7CFF'), fontName='Helvetica')
        body_text = ParagraphStyle(name='BodyText', fontSize=10, leading=13, spaceAfter=8, fontName='Helvetica', textColor=colors.HexColor('#333333'))
        bullet_style = ParagraphStyle(name='Bullet', parent=body_text, leftIndent=20, bulletIndent=10, fontName='Helvetica', bulletFontName='Helvetica', bulletFontSize=10)

        story = []

        def header_footer(canvas, doc):
            # Page decoration: confidential footer plus a rule and title/date header.
            canvas.saveState()
            canvas.setFont('Helvetica', 8)
            canvas.setFillColor(colors.HexColor('#666666'))
            canvas.drawString(doc.leftMargin, 0.4 * inch, f"Page {doc.page} | EvalBot HR Interview Report | Confidential")
            canvas.setStrokeColor(colors.HexColor('#0050BC'))
            canvas.setLineWidth(1)
            canvas.line(doc.leftMargin, doc.height + 0.85*inch, doc.width + doc.leftMargin, doc.height + 0.85*inch)
            canvas.setFont('Helvetica-Bold', 10)
            canvas.drawString(doc.leftMargin, doc.height + 0.9*inch, "Candidate Interview Analysis")
            canvas.drawRightString(doc.width + doc.leftMargin, doc.height + 0.9*inch, time.strftime('%B %d, %Y'))
            canvas.restoreState()

        def make_table(data, col_widths):
            # Shared look for the metric tables: blue header row, light body, grid.
            table = Table(data, colWidths=col_widths)
            table.setStyle(TableStyle([
                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#0050BC')),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
                ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                ('FONTSIZE', (0, 0), (-1, -1), 9),
                ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
                ('TOPPADDING', (0, 0), (-1, 0), 10),
                ('BACKGROUND', (0, 1), (-1, -1), colors.HexColor('#F5F6FA')),
                ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#DDE4EB'))
            ]))
            return table

        def append_mixed(lines, empty_message):
            # Bullet for '-'/'•'/'*' lines, plain paragraph otherwise.
            if lines:
                for line in lines:
                    if line.startswith(('-', '•', '*')):
                        story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
                    else:
                        story.append(Paragraph(line, body_text))
            else:
                story.append(Paragraph(empty_message, body_text))

        def append_bullets(lines, empty_message):
            # Every line becomes a bullet (used for the curated sub-lists).
            if lines:
                for line in lines:
                    story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
            else:
                story.append(Paragraph(empty_message, body_text))

        # --- Title page -----------------------------------------------------
        story.append(Paragraph("Candidate Interview Analysis", h1))
        story.append(Paragraph(f"Generated: {time.strftime('%B %d, %Y')}", ParagraphStyle(name='Date', alignment=1, fontSize=10, textColor=colors.HexColor('#666666'), fontName='Helvetica')))
        story.append(Spacer(1, 0.5 * inch))

        acceptance_prob = analysis_data.get('acceptance_probability')
        if acceptance_prob is not None:
            story.append(Paragraph("Hiring Suitability Snapshot", h2))
            # Color-code the score: green >= 80, orange >= 60, red below.
            prob_color = colors.HexColor('#2E7D32') if acceptance_prob >= 80 else (colors.HexColor('#F57C00') if acceptance_prob >= 60 else colors.HexColor('#D32F2F'))
            story.append(Paragraph(f"Suitability Score: <font size=16 color='{prob_color.hexval()}'><b>{acceptance_prob:.2f}%</b></font>",
                                   ParagraphStyle(name='Prob', fontSize=12, spaceAfter=12, alignment=1, fontName='Helvetica-Bold')))
            if acceptance_prob >= 80:
                story.append(Paragraph("<b>HR Verdict:</b> Outstanding candidate, highly recommended for immediate advancement.", body_text))
            elif acceptance_prob >= 60:
                story.append(Paragraph("<b>HR Verdict:</b> Strong candidate, suitable for further evaluation with targeted development.", body_text))
            elif acceptance_prob >= 40:
                story.append(Paragraph("<b>HR Verdict:</b> Moderate potential, requires additional assessment and skill-building.", body_text))
            else:
                story.append(Paragraph("<b>HR Verdict:</b> Limited fit, significant improvement needed for role alignment.", body_text))

        story.append(Spacer(1, 0.3 * inch))
        summary_rows = [
            ['Metric', 'Value'],
            ['Interview Duration', f"{analysis_data['text_analysis']['total_duration']:.2f} seconds"],
            ['Speaker Turns', f"{analysis_data['text_analysis']['speaker_turns']}"],
            ['Participants', ', '.join(sorted(analysis_data['speakers']))]
        ]
        story.append(make_table(summary_rows, [2.2*inch, 3.8*inch]))
        story.append(Spacer(1, 0.4 * inch))
        story.append(Paragraph("Prepared by: EvalBot - AI-Powered HR Analysis", body_text))
        story.append(PageBreak())

        # --- Detailed analysis ----------------------------------------------
        story.append(Paragraph("Detailed Candidate Evaluation", h1))

        # 1. Communication & vocal dynamics: metric table plus bar chart.
        story.append(Paragraph("1. Communication & Vocal Dynamics", h2))
        voice_analysis = analysis_data.get('voice_analysis', {})
        if voice_analysis and 'error' not in voice_analysis:
            vocal_rows = [
                ['Metric', 'Value', 'HR Insight'],
                ['Speaking Rate', f"{voice_analysis.get('speaking_rate', 0):.2f} words/sec", 'Benchmark: 2.0-3.0 wps; impacts clarity'],
                ['Filler Words', f"{voice_analysis.get('filler_ratio', 0) * 100:.1f}%", 'High usage reduces credibility'],
                ['Anxiety', voice_analysis.get('interpretation', {}).get('anxiety_level', 'N/A'), f"Score: {voice_analysis.get('composite_scores', {}).get('anxiety', 0):.3f}; stress response"],
                ['Confidence', voice_analysis.get('interpretation', {}).get('confidence_level', 'N/A'), f"Score: {voice_analysis.get('composite_scores', {}).get('confidence', 0):.3f}; vocal strength"],
                ['Fluency', voice_analysis.get('interpretation', {}).get('fluency_level', 'N/A'), 'Drives engagement']
            ]
            story.append(make_table(vocal_rows, [1.7*inch, 1.2*inch, 3.1*inch]))
            story.append(Spacer(1, 0.2 * inch))
            # Render the chart into memory and embed it centered.
            chart_buffer = io.BytesIO()
            generate_anxiety_confidence_chart(voice_analysis.get('composite_scores', {}), chart_buffer)
            chart_buffer.seek(0)
            img = Image(chart_buffer, width=4.8*inch, height=3.2*inch)
            img.hAlign = 'CENTER'
            story.append(img)
        else:
            story.append(Paragraph("Vocal analysis unavailable.", body_text))
        story.append(Spacer(1, 0.3 * inch))

        # Split the Gemini narrative back into its numbered sections.
        sections = {
            "Executive Summary": [],
            "Communication and Vocal Dynamics": [],
            "Competency and Content Evaluation": {"Strengths": [], "Growth Areas": []},
            "Role Fit and Growth Potential": [],
            "Strategic HR Recommendations": {"Development Priorities": [], "Next Steps": []}
        }
        # Capturing group keeps the '**N. ...**' headings in the split result.
        report_parts = re.split(r'(\s*\*\*\s*\d\.\s*.*?\s*\*\*)', gemini_report_text)
        current_section = None
        for part in report_parts:
            if not part.strip():
                continue
            is_heading = False
            for title in sections.keys():
                if title.lower() in part.lower():
                    current_section = title
                    is_heading = True
                    break
            if not is_heading and current_section:
                # Keyword routing is heuristic; competency/recommendation text
                # matching no keyword is intentionally dropped.
                if current_section == "Competency and Content Evaluation":
                    if 'strength' in part.lower() or any(k in part.lower() for k in ['leadership', 'problem-solving', 'communication', 'adaptability']):
                        sections[current_section]["Strengths"].append(part.strip())
                    elif 'improve' in part.lower() or 'grow' in part.lower() or 'challenge' in part.lower():
                        sections[current_section]["Growth Areas"].append(part.strip())
                elif current_section == "Strategic HR Recommendations":
                    if any(k in part.lower() for k in ['communication', 'depth', 'presence', 'improve']):
                        sections[current_section]["Development Priorities"].append(part.strip())
                    elif any(k in part.lower() for k in ['advance', 'train', 'assess', 'next step']):
                        sections[current_section]["Next Steps"].append(part.strip())
                else:
                    sections[current_section].append(part.strip())

        # 2. Executive summary.
        story.append(Paragraph("2. Executive Summary", h2))
        append_mixed(sections['Executive Summary'], "Summary unavailable.")
        story.append(Spacer(1, 0.3 * inch))

        # 3. Competency and content.
        story.append(Paragraph("3. Competency & Content", h2))
        story.append(Paragraph("Strengths", h3))
        append_bullets(sections['Competency and Content Evaluation']['Strengths'], "No strengths identified.")
        story.append(Spacer(1, 0.2 * inch))
        story.append(Paragraph("Growth Areas", h3))
        append_bullets(sections['Competency and Content Evaluation']['Growth Areas'], "No growth areas identified.")
        story.append(Spacer(1, 0.3 * inch))

        # 4. Role fit.
        story.append(Paragraph("4. Role Fit & Potential", h2))
        append_mixed(sections['Role Fit and Growth Potential'], "Fit and potential analysis unavailable.")
        story.append(Spacer(1, 0.3 * inch))

        # 5. Strategic recommendations.
        story.append(Paragraph("5. Strategic Recommendations", h2))
        story.append(Paragraph("Development Priorities", h3))
        append_bullets(sections['Strategic HR Recommendations']['Development Priorities'], "No development priorities specified.")
        story.append(Spacer(1, 0.2 * inch))
        story.append(Paragraph("Next Steps for Managers", h3))
        append_bullets(sections['Strategic HR Recommendations']['Next Steps'], "No next steps provided.")
        story.append(Spacer(1, 0.3 * inch))
        story.append(Paragraph("This report provides a data-driven evaluation to guide hiring and development decisions.", body_text))

        doc.build(story, onFirstPage=header_footer, onLaterPages=header_footer)
        return True
    except Exception as e:
        logger.error(f"PDF creation failed: {str(e)}", exc_info=True)
        return False
 
 
688
 
689
def convert_to_serializable(obj):
    """Recursively replace numpy scalars/arrays with plain Python values.

    Dicts and lists are rebuilt with converted contents; any other object is
    returned unchanged. Used to make the analysis dict JSON-serializable.
    """
    if isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj
695
 
696
def process_interview(audio_path_or_url: str):
    """End-to-end pipeline: audio -> transcript -> roles -> analysis -> PDF/JSON.

    Accepts either a local file path or an http(s) URL. Returns a dict with
    'pdf_path' and 'json_path'. The converted WAV and any downloaded source
    audio are removed in all cases (success or failure).
    """
    local_audio_path = None
    wav_file = None
    is_downloaded = False
    try:
        logger.info(f"Starting processing for {audio_path_or_url}")

        # Resolve the input to a local file, downloading when given a URL.
        if audio_path_or_url.startswith(('http://', 'https://')):
            local_audio_path = download_audio_from_url(audio_path_or_url)
            is_downloaded = True
        else:
            local_audio_path = audio_path_or_url
        wav_file = convert_to_wav(local_audio_path)

        # Transcribe, then attach prosodic features to every utterance.
        transcript = transcribe(wav_file)
        for utterance in transcript['utterances']:
            utterance['prosodic_features'] = extract_prosodic_features(
                wav_file, utterance['start'], utterance['end'])

        utterances_with_speakers = identify_speakers(transcript, wav_file)

        # Reuse cached classifier artifacts when present; otherwise train anew.
        clf_path = os.path.join(OUTPUT_DIR, 'role_classifier.pkl')
        if os.path.exists(clf_path):
            clf = joblib.load(clf_path)
            vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
            scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
        else:
            clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)

        classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)
        voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)

        analysis_data = {
            'transcript': classified_utterances,
            'speakers': list(set(u['speaker'] for u in classified_utterances)),
            'voice_analysis': voice_analysis,
            'text_analysis': {
                'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
                'speaker_turns': len(classified_utterances)
            }
        }
        analysis_data['acceptance_probability'] = calculate_acceptance_probability(analysis_data)

        # Generate outputs under fresh UUID-based names.
        gemini_report_text = generate_report(analysis_data)
        base_name = str(uuid.uuid4())
        pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
        create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text)
        with open(json_path, 'w') as f:
            json.dump(convert_to_serializable(analysis_data), f, indent=2)

        logger.info(f"Processing completed for {audio_path_or_url}")
        return {'pdf_path': pdf_path, 'json_path': json_path}
    except Exception as e:
        logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}", exc_info=True)
        raise
    finally:
        # NOTE(review): assumes convert_to_wav writes a NEW file; if it can
        # return its input path unchanged, this would delete the caller's file.
        if wav_file and os.path.exists(wav_file):
            os.remove(wav_file)
        if is_downloaded and local_audio_path and os.path.exists(local_audio_path):
            os.remove(local_audio_path)
            logger.info(f"Cleaned up temporary downloaded file: {local_audio_path}")