norhan12 commited on
Commit
4d417cb
·
verified ·
1 Parent(s): 71e2e34

Update process_interview.py

Browse files
Files changed (1) hide show
  1. process_interview.py +602 -279
process_interview.py CHANGED
@@ -1,77 +1,76 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- # ==============================================================================
4
- # EvalBot - AI Interview Analysis Pipeline
5
- # ==============================================================================
6
-
7
- # --- 1. Imports ---
8
  import os
9
- import logging
10
- import re
11
- import time
12
- import json
13
- import uuid
14
- import tempfile
15
- from typing import Dict, List
16
-
17
- # --- Third-party Libraries ---
18
  import torch
19
  import numpy as np
 
20
  import requests
21
- import urllib3
 
22
  from pydub import AudioSegment
23
- import librosa
24
- import spacy
25
- import google.generativeai as genai
26
- from concurrent.futures import ThreadPoolExecutor
27
-
28
- # --- Machine Learning & Models ---
29
  from nemo.collections.asr.models import EncDecSpeakerLabelModel
30
  from pinecone import Pinecone, ServerlessSpec
31
- import joblib
 
32
  from sklearn.ensemble import RandomForestClassifier
33
  from sklearn.preprocessing import StandardScaler
34
  from sklearn.feature_extraction.text import TfidfVectorizer
35
-
36
- # --- PDF Generation (Optional but included) ---
 
 
37
  from reportlab.lib.pagesizes import letter
38
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
39
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
40
  from reportlab.lib.units import inch
41
  from reportlab.lib import colors
 
 
 
 
 
 
 
 
 
 
42
 
43
- # --- 2. Configuration and Setup ---
44
-
45
- # إعدادات التسجيل (Logging)
46
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')
47
  logger = logging.getLogger(__name__)
 
 
48
 
49
- # تقليل verbosity من المكتبات الأخرى
50
- logging.getLogger("nemo_logging").setLevel(logging.ERROR)
51
- logging.getLogger("urllib3").setLevel(logging.WARNING)
52
-
53
- # الإعدادات العامة (Constants)
54
  OUTPUT_DIR = "./processed_audio"
55
  os.makedirs(OUTPUT_DIR, exist_ok=True)
56
 
57
- # مفاتيح API (يجب تعيينها كمتغيرات بيئة)
58
- PINECONE_KEY = os.getenv("PINECONE_KEY")
59
- ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
60
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
61
 
62
- if not all([PINECONE_KEY, ASSEMBLYAI_KEY, GEMINI_API_KEY]):
63
- logger.warning("One or more API keys are missing. Please set PINECONE_KEY, ASSEMBLYAI_KEY, and GEMINI_API_KEY environment variables.")
64
-
65
- # --- 3. Service and Model Initialization ---
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  def initialize_services():
68
- """Initializes external services like Pinecone and Gemini."""
69
  try:
70
- logger.info("Initializing Pinecone and Gemini services...")
71
  pc = Pinecone(api_key=PINECONE_KEY)
72
  index_name = "interview-speaker-embeddings"
73
  if index_name not in pc.list_indexes().names():
74
- logger.info(f"Creating new Pinecone index: {index_name}")
75
  pc.create_index(
76
  name=index_name,
77
  dimension=192,
@@ -79,348 +78,672 @@ def initialize_services():
79
  spec=ServerlessSpec(cloud="aws", region="us-east-1")
80
  )
81
  index = pc.Index(index_name)
82
-
83
  genai.configure(api_key=GEMINI_API_KEY)
84
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
85
- logger.info("Services initialized successfully.")
86
  return index, gemini_model
87
  except Exception as e:
88
  logger.error(f"Error initializing services: {str(e)}")
89
  raise
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  def load_models():
92
- """Loads all necessary machine learning models."""
93
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
94
- logger.info(f"Using device: {device}")
95
-
96
- logger.info("Loading speaker verification model (Titanet)...")
97
- speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large", map_location=device)
98
- speaker_model.eval()
99
-
100
- logger.info("Loading NLP model (spaCy)...")
101
  nlp = spacy.load("en_core_web_sm")
102
-
103
- return speaker_model, nlp, device
 
 
104
 
105
- # تحميل الخدمات والنماذج عند بدء التشغيل
106
- index, gemini_model = initialize_services()
107
- speaker_model, nlp, device = load_models()
108
-
109
- # --- 4. Core Processing Functions ---
110
-
111
- def download_audio_to_temp_file(url: str, retries=3) -> str:
112
- """Downloads an audio file from a URL to a temporary local path with retries."""
113
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".tmp_audio")
114
- temp_path = temp_file.name
115
- temp_file.close()
116
-
117
- logger.info(f"Downloading audio from {url} to {temp_path}")
118
-
119
- for attempt in range(retries):
120
- try:
121
- with requests.get(url, stream=True, timeout=60) as r:
122
- r.raise_for_status()
123
- with open(temp_path, 'wb') as f:
124
- for chunk in r.iter_content(chunk_size=8192):
125
- f.write(chunk)
126
- logger.info("Download completed successfully.")
127
- return temp_path
128
- except (requests.exceptions.RequestException, urllib3.exceptions.ProtocolError) as e:
129
- logger.warning(f"Download attempt {attempt + 1}/{retries} failed: {e}. Retrying...")
130
- if attempt < retries - 1:
131
- time.sleep(2 ** attempt)
132
- else:
133
- os.remove(temp_path)
134
- logger.error(f"Failed to download audio after {retries} attempts.")
135
- raise
136
- raise Exception(f"Failed to download audio from URL {url}")
137
 
138
  def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
139
- """Converts an audio file to a 16kHz mono WAV file."""
140
  try:
141
- logger.info(f"Converting {audio_path} to WAV format...")
142
  audio = AudioSegment.from_file(audio_path)
143
- audio = audio.set_frame_rate(16000).set_channels(1)
 
 
144
  wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
145
  audio.export(wav_file, format="wav")
146
- logger.info(f"Successfully converted to {wav_file}")
147
  return wav_file
148
  except Exception as e:
149
- logger.error(f"Audio conversion failed for {audio_path}: {str(e)}")
150
  raise
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  def transcribe(audio_path: str) -> Dict:
153
- """Transcribes audio using AssemblyAI with diarization."""
154
  try:
155
- logger.info("Uploading audio to AssemblyAI...")
156
- headers = {"authorization": ASSEMBLYAI_KEY}
157
  with open(audio_path, 'rb') as f:
158
- upload_response = requests.post("https://api.assemblyai.com/v2/upload", headers=headers, data=f)
159
-
 
 
 
160
  audio_url = upload_response.json()['upload_url']
161
-
162
- logger.info("Submitting transcription job with diarization...")
163
- transcript_request = {"audio_url": audio_url, "diarization": True}
164
- transcript_response = requests.post("https://api.assemblyai.com/v2/transcript", json=transcript_request, headers=headers)
 
 
 
 
 
165
  transcript_id = transcript_response.json()['id']
166
-
167
- logger.info(f"Waiting for transcription job (ID: {transcript_id}) to complete...")
168
  while True:
169
- result = requests.get(f"https://api.assemblyai.com/v2/transcript/{transcript_id}", headers=headers).json()
 
 
 
170
  if result['status'] == 'completed':
171
- logger.info("Transcription job completed.")
172
- if not result.get('utterances'):
173
- raise ValueError("Transcription succeeded but no utterances were found.")
174
  return result
175
  elif result['status'] == 'error':
176
- raise Exception(f"Transcription failed: {result['error']}")
177
  time.sleep(5)
178
  except Exception as e:
179
- logger.error(f"Transcription process failed: {str(e)}")
180
  raise
181
 
182
- def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
183
- """Extracts prosodic features from a specific audio segment."""
184
  try:
185
- y, sr = librosa.load(audio_path, sr=16000, offset=start_ms/1000.0, duration=(end_ms-start_ms)/1000.0)
186
-
187
- if len(y) == 0: return {'duration': 0, 'mean_pitch': 0, 'pitch_sd': 0, 'intensityMean': 0, 'intensitySD': 0}
188
-
189
- pitches, _ = librosa.piptrack(y=y, sr=sr)
190
- pitches = pitches[pitches > 0]
191
- rms = librosa.feature.rms(y=y)[0]
192
-
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  return {
194
- 'duration': (end_ms - start_ms) / 1000,
195
- 'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
196
- 'pitch_sd': float(np.std(pitches)) if len(pitches) > 0 else 0.0,
197
- 'intensityMean': float(np.mean(rms)),
198
- 'intensitySD': float(np.std(rms)),
199
  }
200
  except Exception as e:
201
- logger.error(f"Feature extraction failed for segment {start_ms}-{end_ms}: {str(e)}")
202
- return {}
203
-
 
 
 
 
204
 
205
- # --- 5. Role Classification Functions (As Requested) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
- def train_role_classifier(utterances: List[Dict]):
208
- """
209
- Trains a RandomForestClassifier based on utterance features.
210
- NOTE: Assumes an alternating turn-taking pattern for labeling.
211
- """
212
  try:
213
- logger.info("Training new role classifier model...")
214
- texts = [u['text'] for u in utterances]
215
- vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
216
- X_text = vectorizer.fit_transform(texts)
217
-
218
  features = []
219
- labels = [] # 0 for Interviewer, 1 for Interviewee
220
-
221
- for i, utterance in enumerate(utterances):
222
- prosodic = utterance.get('prosodic_features', {})
223
  feat = [
224
- prosodic.get('duration', 0), prosodic.get('mean_pitch', 0), prosodic.get('pitch_sd', 0),
225
- prosodic.get('intensityMean', 0), prosodic.get('intensitySD', 0)
 
 
 
226
  ]
227
- feat.extend(X_text[i].toarray()[0].tolist())
228
- doc = nlp(utterance['text'])
229
- feat.extend([
230
- int(utterance['text'].endswith('?')),
231
- len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower())),
232
- len(utterance['text'].split()),
233
- sum(1 for token in doc if token.pos_ == 'VERB'),
234
- sum(1 for token in doc if token.pos_ == 'NOUN')
235
  ])
236
  features.append(feat)
237
- labels.append(0 if i % 2 == 0 else 1) # Assumes alternating roles
238
-
239
  scaler = StandardScaler()
240
  X = scaler.fit_transform(features)
241
-
242
- clf = RandomForestClassifier(n_estimators=150, max_depth=10, random_state=42, class_weight='balanced')
 
243
  clf.fit(X, labels)
244
-
245
- logger.info("Saving trained models to disk...")
246
  joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
247
  joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
248
  joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
249
-
250
  return clf, vectorizer, scaler
251
  except Exception as e:
252
  logger.error(f"Classifier training failed: {str(e)}")
253
  raise
254
 
255
- def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
256
- """Classifies roles using the pre-trained RandomForest model."""
257
  try:
258
- logger.info("Classifying roles using trained model...")
259
- texts = [u['text'] for u in utterances]
260
- X_text = vectorizer.transform(texts)
261
  results = []
262
- for i, utterance in enumerate(utterances):
263
- prosodic = utterance.get('prosodic_features', {})
264
  feat = [
265
- prosodic.get('duration', 0), prosodic.get('mean_pitch', 0), prosodic.get('pitch_sd', 0),
266
- prosodic.get('intensityMean', 0), prosodic.get('intensitySD', 0)
 
267
  ]
268
  feat.extend(X_text[i].toarray()[0].tolist())
269
- doc = nlp(utterance['text'])
270
  feat.extend([
271
- int(utterance['text'].endswith('?')),
272
- len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower())),
273
- len(utterance['text'].split()),
274
  sum(1 for token in doc if token.pos_ == 'VERB'),
275
  sum(1 for token in doc if token.pos_ == 'NOUN')
276
  ])
277
  X = scaler.transform([feat])
278
  role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
279
- results.append({**utterance, 'role': role})
280
  return results
281
  except Exception as e:
282
- logger.error(f"Role classification execution failed: {str(e)}")
283
  raise
284
 
285
- # --- 6. Analysis and Reporting Functions ---
286
-
287
- def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
288
- """Analyzes voice characteristics of all utterances classified as 'Interviewee'."""
289
  try:
290
- interviewee_utterances = [u for u in utterances if u.get('role') == 'Interviewee']
291
- if not interviewee_utterances:
292
- logger.warning("No interviewee utterances found to analyze.")
293
- return {'error': 'No interviewee utterances found'}
294
-
295
- logger.info(f"Analyzing {len(interviewee_utterances)} interviewee utterances...")
296
  y, sr = librosa.load(audio_path, sr=16000)
297
-
298
- segments = [y[int(u['start']*sr/1000):int(u['end']*sr/1000)] for u in interviewee_utterances]
299
-
300
- total_duration = sum(u['prosodic_features'].get('duration', 0) for u in interviewee_utterances)
301
- total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
302
- speaking_rate = (total_words / total_duration) * 60 if total_duration > 0 else 0
303
-
304
- filler_words = {'um', 'uh', 'like', 'you know', 'so', 'i mean', 'actually'}
305
- filler_count = sum(1 for u in interviewee_utterances for word in u['text'].lower().split() if word in filler_words)
 
 
 
 
306
  filler_ratio = filler_count / total_words if total_words > 0 else 0
307
-
308
- pitches = np.concatenate([librosa.pyin(s, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))[0] for s in segments if len(s)>0])
309
- pitches = pitches[~np.isnan(pitches)]
310
-
311
- intensities = np.concatenate([librosa.feature.rms(y=s)[0] for s in segments if len(s)>0])
312
-
 
 
 
 
 
313
  pitch_std = np.std(pitches) if len(pitches) > 0 else 0
314
- intensity_std = np.std(intensities) if len(intensities) > 0 else 0
315
-
316
- anxiety_score = max(0, min(1, pitch_std / 50))
317
- confidence_score = max(0, min(1, 1 - (intensity_std * 10)))
318
- hesitation_score = max(0, min(1, (filler_ratio * 2) + (pitch_std / 100)))
319
-
 
 
 
 
 
 
 
 
320
  return {
321
  'speaking_rate': float(round(speaking_rate, 2)),
322
  'filler_ratio': float(round(filler_ratio, 4)),
323
- 'pitch_std_dev': float(round(pitch_std, 2)),
324
- 'intensity_std_dev': float(round(intensity_std, 4)),
325
- 'composite_scores': {
326
- 'anxiety': float(round(anxiety_score, 4)),
327
- 'confidence': float(round(confidence_score, 4)),
328
- 'hesitation': float(round(hesitation_score, 4))
329
- }
330
  }
331
  except Exception as e:
332
- logger.error(f"Voice analysis failed: {str(e)}", exc_info=True)
333
  return {'error': str(e)}
334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  def generate_report(analysis_data: Dict) -> str:
336
- """Generates a text-based summary report using Gemini AI."""
337
  try:
338
- logger.info("Generating final report text with Gemini...")
339
- # ... (Your generate_report function logic here)
340
- return "Gemini report text would be generated here."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  except Exception as e:
342
  logger.error(f"Report generation failed: {str(e)}")
343
- return f"Error in report generation: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
- # --- 7. Main Orchestration Function ---
 
 
 
 
 
346
 
347
- def process_interview_from_url(audio_url: str):
348
- """
349
- Main pipeline to download, process, and analyze an interview from a URL.
350
- """
351
  local_audio_path = None
352
  wav_file = None
353
-
354
  try:
355
- # Step 1: Download and Convert
356
- local_audio_path = download_audio_to_temp_file(audio_url)
 
 
 
 
357
  wav_file = convert_to_wav(local_audio_path)
358
-
359
- # Step 2: Transcribe and Diarize
360
  transcript = transcribe(wav_file)
361
-
362
- # Step 3: Extract Features
363
- logger.info("Extracting prosodic features for all utterances...")
364
- with ThreadPoolExecutor() as executor:
365
- futures = {executor.submit(extract_prosodic_features, wav_file, u['start'], u['end']): u for u in transcript['utterances']}
366
- for future in futures:
367
- utterance = futures[future]
368
- utterance['prosodic_features'] = future.result()
369
-
370
- # Step 4: Classify Roles
371
- classifier_path = os.path.join(OUTPUT_DIR, 'role_classifier.pkl')
372
- if os.path.exists(classifier_path):
373
- logger.info("Loading existing role classifier model.")
374
- clf = joblib.load(classifier_path)
375
  vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
376
  scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
377
  else:
378
- clf, vectorizer, scaler = train_role_classifier(transcript['utterances'])
379
-
380
- classified_utterances = classify_roles(transcript['utterances'], clf, vectorizer, scaler)
381
-
382
- # Step 5: Analyze Voice and Generate Report
383
  voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)
384
-
385
  analysis_data = {
386
  'transcript': classified_utterances,
387
- 'speakers': list(set(u['speaker'] for u in classified_utterances if u.get('speaker'))),
388
  'voice_analysis': voice_analysis,
389
  'text_analysis': {
390
- 'total_duration': transcript.get('audio_duration', 0),
391
  'speaker_turns': len(classified_utterances)
392
  }
393
  }
394
-
395
  gemini_report_text = generate_report(analysis_data)
396
-
397
- # Step 6: Save Results
398
  base_name = str(uuid.uuid4())
 
399
  json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
400
-
401
  with open(json_path, 'w') as f:
402
- # Use default=str to handle any non-serializable data types gracefully
403
- json.dump(analysis_data, f, indent=4, default=str)
404
-
405
- logger.info(f"Processing completed. Analysis saved to: {json_path}")
406
- return {'json_path': json_path, 'report_text': gemini_report_text}
407
-
408
  except Exception as e:
409
- logger.error(f"Main processing pipeline failed for URL {audio_url}: {str(e)}", exc_info=True)
410
  raise
411
-
412
  finally:
413
- # Step 7: Cleanup
414
- logger.info("Cleaning up temporary files...")
415
  if wav_file and os.path.exists(wav_file):
416
- try:
417
- os.remove(wav_file)
418
- logger.info(f"Removed temporary WAV file: {wav_file}")
419
- except OSError as e:
420
- logger.error(f"Error removing WAV file {wav_file}: {e}")
421
- if local_audio_path and os.path.exists(local_audio_path):
422
- try:
423
- os.remove(local_audio_path)
424
- logger.info(f"Removed temporary downloaded file: {local_audio_path}")
425
- except OSError as e:
426
- logger.error(f"Error removing downloaded file {local_audio_path}: {e}")
 
 
 
 
 
 
 
 
1
  import os
 
 
 
 
 
 
 
 
 
2
  import torch
3
  import numpy as np
4
+ import uuid
5
  import requests
6
+ import time
7
+ import json
8
  from pydub import AudioSegment
9
+ import wave
 
 
 
 
 
10
  from nemo.collections.asr.models import EncDecSpeakerLabelModel
11
  from pinecone import Pinecone, ServerlessSpec
12
+ import librosa
13
+ import pandas as pd
14
  from sklearn.ensemble import RandomForestClassifier
15
  from sklearn.preprocessing import StandardScaler
16
  from sklearn.feature_extraction.text import TfidfVectorizer
17
+ import re
18
+ from typing import Dict, List, Tuple
19
+ import logging
20
+ import tempfile
21
  from reportlab.lib.pagesizes import letter
22
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, Image
23
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
24
  from reportlab.lib.units import inch
25
  from reportlab.lib import colors
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib
28
+ matplotlib.use('Agg')
29
+ from reportlab.platypus import Image
30
+ import io
31
+ from transformers import AutoTokenizer, AutoModel
32
+ import spacy
33
+ import google.generativeai as genai
34
+ import joblib
35
+ from concurrent.futures import ThreadPoolExecutor
36
 
37
+ # Setup logging
38
+ logging.basicConfig(level=logging.INFO)
 
 
39
  logger = logging.getLogger(__name__)
40
+ logging.getLogger("nemo_logging").setLevel(logging.INFO)
41
+ logging.getLogger("nemo").setLevel(logging.INFO)
42
 
43
+ # Configuration
44
+ AUDIO_DIR = "./Uploads"
 
 
 
45
  OUTPUT_DIR = "./processed_audio"
46
  os.makedirs(OUTPUT_DIR, exist_ok=True)
47
 
48
+ # API Keys
49
+ PINECONE_KEY = os.getenv("PINECONE_KEY")'
50
+ ASSEMBLYAI_KEY = 'os.getenv("ASSEMBLYAI_KEY")'
51
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
52
 
53
+ def download_audio_from_url(url: str) -> str:
54
+ """Downloads an audio file from a URL to a temporary local path."""
55
+ try:
56
+ temp_dir = tempfile.gettempdir()
57
+ temp_path = os.path.join(temp_dir, f"{uuid.uuid4()}.tmp_audio")
58
+ logger.info(f"Downloading audio from {url} to {temp_path}")
59
+ with requests.get(url, stream=True) as r:
60
+ r.raise_for_status()
61
+ with open(temp_path, 'wb') as f:
62
+ for chunk in r.iter_content(chunk_size=8192):
63
+ f.write(chunk)
64
+ return temp_path
65
+ except Exception as e:
66
+ logger.error(f"Failed to download audio from URL {url}: {e}")
67
+ raise
68
 
69
  def initialize_services():
 
70
  try:
 
71
  pc = Pinecone(api_key=PINECONE_KEY)
72
  index_name = "interview-speaker-embeddings"
73
  if index_name not in pc.list_indexes().names():
 
74
  pc.create_index(
75
  name=index_name,
76
  dimension=192,
 
78
  spec=ServerlessSpec(cloud="aws", region="us-east-1")
79
  )
80
  index = pc.Index(index_name)
 
81
  genai.configure(api_key=GEMINI_API_KEY)
82
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
 
83
  return index, gemini_model
84
  except Exception as e:
85
  logger.error(f"Error initializing services: {str(e)}")
86
  raise
87
 
88
+ index, gemini_model = initialize_services()
89
+
90
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
91
+ logger.info(f"Using device: {device}")
92
+
93
+ def load_speaker_model():
94
+ try:
95
+ import torch
96
+ torch.set_num_threads(5)
97
+ model = EncDecSpeakerLabelModel.from_pretrained(
98
+ "nvidia/speakerverification_en_titanet_large",
99
+ map_location=torch.device('cpu')
100
+ )
101
+ model.eval()
102
+ return model
103
+ except Exception as e:
104
+ logger.error(f"Model loading failed: {str(e)}")
105
+ raise RuntimeError("Could not load speaker verification model")
106
+
107
  def load_models():
108
+ speaker_model = load_speaker_model()
 
 
 
 
 
 
 
 
109
  nlp = spacy.load("en_core_web_sm")
110
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
111
+ llm_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
112
+ llm_model.eval()
113
+ return speaker_model, nlp, tokenizer, llm_model
114
 
115
+ speaker_model, nlp, tokenizer, llm_model = load_models()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
 
118
  try:
 
119
  audio = AudioSegment.from_file(audio_path)
120
+ if audio.channels > 1:
121
+ audio = audio.set_channels(1)
122
+ audio = audio.set_frame_rate(16000)
123
  wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
124
  audio.export(wav_file, format="wav")
 
125
  return wav_file
126
  except Exception as e:
127
+ logger.error(f"Audio conversion failed: {str(e)}")
128
  raise
129
 
130
+ def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
131
+ try:
132
+ audio = AudioSegment.from_file(audio_path)
133
+ segment = audio[start_ms:end_ms]
134
+ temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
135
+ segment.export(temp_path, format="wav")
136
+ y, sr = librosa.load(temp_path, sr=16000)
137
+ pitches = librosa.piptrack(y=y, sr=sr)[0]
138
+ pitches = pitches[pitches > 0]
139
+ features = {
140
+ 'duration': (end_ms - start_ms) / 1000,
141
+ 'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
142
+ 'min_pitch': float(np.min(pitches)) if len(pitches) > 0 else 0.0,
143
+ 'max_pitch': float(np.max(pitches)) if len(pitches) > 0 else 0.0,
144
+ 'pitch_sd': float(np.std(pitches)) if len(pitches) > 0 else 0.0,
145
+ 'intensityMean': float(np.mean(librosa.feature.rms(y=y)[0])),
146
+ 'intensityMin': float(np.min(librosa.feature.rms(y=y)[0])),
147
+ 'intensityMax': float(np.max(librosa.feature.rms(y=y)[0])),
148
+ 'intensitySD': float(np.std(librosa.feature.rms(y=y)[0])),
149
+ }
150
+ os.remove(temp_path)
151
+ return features
152
+ except Exception as e:
153
+ logger.error(f"Feature extraction failed: {str(e)}")
154
+ return {
155
+ 'duration': 0.0, 'mean_pitch': 0.0, 'min_pitch': 0.0, 'max_pitch': 0.0,
156
+ 'pitch_sd': 0.0, 'intensityMean': 0.0, 'intensityMin': 0.0,
157
+ 'intensityMax': 0.0, 'intensitySD': 0.0
158
+ }
159
+
160
  def transcribe(audio_path: str) -> Dict:
 
161
  try:
 
 
162
  with open(audio_path, 'rb') as f:
163
+ upload_response = requests.post(
164
+ "https://api.assemblyai.com/v2/upload",
165
+ headers={"authorization": ASSEMBLYAI_KEY},
166
+ data=f
167
+ )
168
  audio_url = upload_response.json()['upload_url']
169
+ transcript_response = requests.post(
170
+ "https://api.assemblyai.com/v2/transcript",
171
+ headers={"authorization": ASSEMBLYAI_KEY},
172
+ json={
173
+ "audio_url": audio_url,
174
+ "speaker_labels": True,
175
+ "filter_profanity": True
176
+ }
177
+ )
178
  transcript_id = transcript_response.json()['id']
 
 
179
  while True:
180
+ result = requests.get(
181
+ f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
182
+ headers={"authorization": ASSEMBLYAI_KEY}
183
+ ).json()
184
  if result['status'] == 'completed':
 
 
 
185
  return result
186
  elif result['status'] == 'error':
187
+ raise Exception(result['error'])
188
  time.sleep(5)
189
  except Exception as e:
190
+ logger.error(f"Transcription failed: {str(e)}")
191
  raise
192
 
193
+ def process_utterance(utterance, full_audio, wav_file):
 
194
  try:
195
+ start = utterance['start']
196
+ end = utterance['end']
197
+ segment = full_audio[start:end]
198
+ temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
199
+ segment.export(temp_path, format="wav")
200
+ with torch.no_grad():
201
+ embedding = speaker_model.get_embedding(temp_path).cpu().numpy()
202
+ embedding_list = embedding.flatten().tolist()
203
+ query_result = index.query(
204
+ vector=embedding_list,
205
+ top_k=1,
206
+ include_metadata=True
207
+ )
208
+ if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
209
+ speaker_id = query_result['matches'][0]['id']
210
+ speaker_name = query_result['matches'][0]['metadata']['speaker_name']
211
+ else:
212
+ speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
213
+ speaker_name = f"Speaker_{speaker_id[-4:]}"
214
+ index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_id})])
215
+ os.remove(temp_path)
216
  return {
217
+ ...
218
+ **speech, 'speaker': speaker_name,
219
+ 'speaker_id': speaker_id,
220
+ 'embedding': embedding_list
 
221
  }
222
  except Exception as e:
223
+ logger.error(f"Utterance processing failed: {str(e)}", exc_info=True)
224
+ return {
225
+ ...
226
+ speech, 'speech': 'Unknown',
227
+ 'speaker_id': speaker_id,
228
+ 'embedding_id': None
229
+ }
230
 
231
+ def identify_speakers(audio: Dict, text: str) -> List[Dict]:
232
+ try:
233
+ audio = AudioSegment.from_wav(text)
234
+ speakers = audio['speech']
235
+ with ThreadPoolExecutor(max_workers=5) as executor:
236
+ futures = [
237
+ executor.submit(process_speech, speech, speakers, text)
238
+ for speech in speakers
239
+ ]
240
+ results = [f.result() for f in futures]
241
+ return results
242
+ except Exception as e:
243
+ logger.error(f"Speaker identification failed: {str(e)}")
244
+ raise
245
 
246
+ def train_role_classifier(speakers: List[Dict]):
 
 
 
 
247
  try:
248
+ speech = [u['speech'].split()]
249
+ vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1,2))
250
+ X_text = vectorizer.fit_transform(speech)
 
 
251
  features = []
252
+ labels = []
253
+ for i, speaker in enumerate(speakers):
254
+ utterance = speaker['speech_features']
 
255
  feat = [
256
+ utterance['duration'], utterance['speech_rate'], utterance['duration'], utterance['mean_pitch'],
257
+ utterance['min_pitch'], utterance['max_pitch'],
258
+ utterance['speech_sd'], utterance['intensityLevel'],
259
+ utterance['intensity_level'],
260
+ utterance['speechMax']], utterance['speechSD'],
261
  ]
262
+ feat.extend(X_text[i].toarray()[0])
263
+ doc = nlp(speaker['speech'])
264
+ speech.extend([
265
+ int(speaker['speech'].endswith('?'))),
266
+ len(re.findall(r'\b(why|how|what|when|where|who|which)\b', speaker['speech'].lower())),
267
+ len(speaker['speech'].split())),
268
+ sum(frequency for token in speech if token.pos_ == 'VERB'),
269
+ sum(frequency for token in speech if token.pos == 'NOUN')
270
  ])
271
  features.append(feat)
272
+ labels.append((0 if i % 2 == 0 else 1))
 
273
  scaler = StandardScaler()
274
  X = scaler.fit_transform(features)
275
+ clf = RandomForestClassifier(
276
+ n_estimators=150, max_depth=10, random_state=42, class_weight='balanced'
277
+ )
278
  clf.fit(X, labels)
 
 
279
  joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
280
  joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
281
  joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
 
282
  return clf, vectorizer, scaler
283
  except Exception as e:
284
  logger.error(f"Classifier training failed: {str(e)}")
285
  raise
286
 
287
+ def classify_roles(speakers: List[Dict], clf, vectorizer, scaler):
 
288
  try:
289
+ speech = [u['speech'] for u in speakers]
290
+ X_text = vectorizer.transform(speech)
 
291
  results = []
292
+ for i, speaker in enumerate(speakers):
293
+ prosodic = speaker['speech_features']
294
  feat = [
295
+ prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
296
+ prosodic['max_pitch'], prosodic['pitch_sd'], prosodic['intensityMean'],
297
+ prosodic['intensityMin'], prosodic['intensityMax'], prosodic['intensitySD'],
298
  ]
299
  feat.extend(X_text[i].toarray()[0].tolist())
300
+ doc = nlp(speaker['speech'])
301
  feat.extend([
302
+ int(speaker['speech'].endswith('?')),
303
+ len(re.findall(r'\b(why|how|what|when|where|who|which)\b', speaker['speech'].lower())),
304
+ len(speaker['speech'].split()),
305
  sum(1 for token in doc if token.pos_ == 'VERB'),
306
  sum(1 for token in doc if token.pos_ == 'NOUN')
307
  ])
308
  X = scaler.transform([feat])
309
  role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
310
+ results.append({**speaker, 'role': role})
311
  return results
312
  except Exception as e:
313
+ logger.error(f"Role classification failed: {str(e)}")
314
  raise
315
 
316
def analyze_interviewee_voice(audio_path: str, speakers: List[Dict]) -> Dict:
    """Analyze the interviewee's vocal delivery from the interview audio.

    Args:
        audio_path: Path to the interview WAV file.
        speakers: Classified utterance dicts carrying 'role', 'speech',
            'speech_features', and 'start'/'end' offsets (assumed to be in
            milliseconds — confirm against the transcription step).

    Returns:
        Dict of fluency/pitch/intensity metrics plus heuristic composite
        anxiety, confidence, and hesitation scores, or {'error': ...} on
        failure or when no interviewee turns exist.
    """
    try:
        y, sr = librosa.load(audio_path, sr=16000)
        interviewee_speakers = [u for u in speakers if u['role'] == 'Interviewee']
        if not interviewee_speakers:
            return {'error': 'No interviewee speeches found'}
        segments = []
        for u in interviewee_speakers:
            # start/end are treated as milliseconds -> sample indices.
            start = int(u['start'] * sr / 1000)
            end = int(u['end'] * sr / 1000)
            segments.append(y[start:end])
        total_duration = sum(u['speech_features']['duration'] for u in interviewee_speakers)
        total_words = sum(len(u['speech'].split()) for u in interviewee_speakers)
        speaking_rate = total_words / total_duration if total_duration > 0 else 0
        filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
        # FIX: count fillers on word boundaries. The previous str.count()
        # matched substrings inside other words ("also" -> "so",
        # "unlike" -> "like"), inflating the filler ratio.
        filler_patterns = [re.compile(r'\b' + re.escape(fw) + r'\b') for fw in filler_words]
        filler_count = sum(
            len(pattern.findall(u['speech'].lower()))
            for u in interviewee_speakers
            for pattern in filler_patterns
        )
        filler_ratio = filler_count / total_words if total_words > 0 else 0
        # Bigram repetition: share of distinct word pairs occurring > 1 time.
        all_words = ' '.join(u['speech'].lower() for u in interviewee_speakers).split()
        word_counts = {}
        for i in range(len(all_words) - 1):
            bigram = (all_words[i], all_words[i + 1])
            word_counts[bigram] = word_counts.get(bigram, 0) + 1
        repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(word_counts) if word_counts else 0
        # Pitch track: keep only voiced frames, and additionally drop NaNs
        # defensively (pyin marks unvoiced frames NaN; guard in case a
        # voiced-flagged frame is still NaN).
        pitches = []
        for segment in segments:
            f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
            voiced = f0[voiced_flag]
            pitches.extend(voiced[~np.isnan(voiced)])
        pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
        pitch_std = np.std(pitches) if len(pitches) > 0 else 0
        # NOTE(review): jitter/shimmer are computed over frames concatenated
        # across segments, so segment boundaries add spurious deltas.
        jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0
        intensities = []
        for segment in segments:
            rms = librosa.feature.rms(y=segment)[0]
            intensities.extend(rms)
        intensity_mean = np.mean(intensities) if intensities else 0
        intensity_std = np.std(intensities) if intensities else 0
        shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(intensities) > 1 and intensity_mean > 0 else 0
        # Composite scores are hand-tuned heuristics, not calibrated models.
        anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
        confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
        hesitation_score = filler_ratio + repetition_score
        anxiety_level = 'High' if anxiety_score > 0.15 else 'Moderate' if anxiety_score > 0.07 else 'Low'
        confidence_level = 'High' if confidence_score > 0.7 else 'Moderate' if confidence_score > 0.5 else 'Low'
        fluency_level = 'Fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'Moderate' if (filler_ratio < 0.1 and repetition_score < 0.2) else 'Disfluent'
        return {
            'speaking_rate': float(round(speaking_rate, 2)),
            'filler_ratio': float(round(filler_ratio, 4)),
            'repetition_score': float(round(repetition_score, 4)),
            'pitch_analysis': {'mean': float(round(pitch_mean, 2)), 'std_dev': float(round(pitch_std, 2)), 'jitter': float(round(jitter, 4))},
            'intensity_analysis': {'mean': float(round(intensity_mean, 2)), 'std_dev': float(round(intensity_std, 2)), 'shimmer': float(round(shimmer, 4))},
            'composite_scores': {'anxiety': float(round(anxiety_score, 4)), 'confidence': float(round(confidence_score, 4)), 'hesitation': float(round(hesitation_score, 4))},
            'interpretation': {'anxiety_level': anxiety_level, 'confidence_level': confidence_level, 'fluency_level': fluency_level}
        }
    except Exception as e:
        logger.error(f"Voice analysis failed: {str(e)}")
        return {'error': str(e)}
371
 
372
def generate_voice_interpretation(analysis: Dict) -> str:
    """Render the voice-analysis dict as a readable multi-line summary.

    Args:
        analysis: Output of analyze_interviewee_voice; if it carries an
            'error' key a fixed fallback sentence is returned instead.

    Returns:
        Newline-joined text: a metrics profile followed by HR guidance.
    """
    if 'error' in analysis:
        return "Voice analysis unavailable due to processing limitations."
    scores = analysis['composite_scores']
    levels = analysis['interpretation']
    lines = ["Vocal Performance Profile:"]
    lines.append(f"- Speaking Rate: {analysis['speaking_rate']} words/sec - Benchmark: 2.0-3.0 wps for clear delivery")
    lines.append(f"- Filler Word Frequency: {analysis['filler_ratio'] * 100:.1f}% - Measures non-content words")
    lines.append(f"- Repetition Index: {analysis['repetition_score']:.3f} - Frequency of repeated phrases")
    lines.append(f"- Anxiety Indicator: {levels['anxiety_level']} (Score: {scores['anxiety']:.3f}) - Pitch and vocal stability")
    lines.append(f"- Confidence Indicator: {levels['confidence_level']} (Score: {scores['confidence']:.3f}) - Vocal strength")
    lines.append(f"- Fluency Rating: {levels['fluency_level']} - Speech flow and coherence")
    lines.append("")
    lines.append("HR Insights:")
    lines.extend([
        "- Rapid speech (>3.0 wps) may signal enthusiasm but risks clarity.",
        "- High filler word use reduces perceived professionalism.",
        "- Elevated anxiety suggests pressure; training can build resilience.",
        "- Strong confidence aligns with leadership presence.",
        "- Fluent speech enhances engagement, critical for team roles.",
    ])
    return "\n".join(lines)
392
+
393
def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
    """Render a two-bar anxiety/confidence chart as a PNG.

    Args:
        composite_scores: Dict with optional 'anxiety'/'confidence' keys
            (missing keys plot as 0).
        chart_path_or_buffer: File path or file-like object to write to.

    Errors are logged and swallowed; nothing is returned.
    """
    try:
        categories = ['Anxiety', 'Confidence']
        values = [composite_scores.get('anxiety', 0), composite_scores.get('confidence', 0)]
        fig, ax = plt.subplots(figsize=(5, 3.5))
        rendered = ax.bar(categories, values, color=['#FF5252', '#26A69A'], edgecolor='black', width=0.45)
        ax.set_ylabel('Score (Normalized)', fontsize=12)
        ax.set_title('Vocal Dynamics: Anxiety vs. Confidence', fontsize=14, pad=15)
        # Headroom above 1.0 so the value labels never clip.
        ax.set_ylim(0, 1.3)
        for rect in rendered:
            top = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2, top + 0.05, f"{top:.2f}",
                    ha='center', color='black', fontweight='bold', fontsize=11)
        ax.grid(True, axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(chart_path_or_buffer, format='png', bbox_inches='tight', dpi=300)
        plt.close(fig)
    except Exception as e:
        logger.error(f"Error generating chart: {str(e)}")
412
+
413
def calculate_acceptance_probability(analysis_data: Dict) -> float:
    """Combine vocal metrics into a 0-100 hiring-suitability percentage.

    Returns 0.0 when voice analysis failed. Negative weights encode
    "penalty" factors; their absolute values are applied to the inverted
    metric so every component contributes positively to the raw score.
    """
    voice = analysis_data.get('voice_analysis', {})
    if 'error' in voice:
        return 0.0
    weights = {
        'confidence': 0.35, 'anxiety': -0.25, 'fluency': 0.2,
        'speaking_rate': 0.15, 'filler_repetition': -0.15, 'content': 0.25,
    }
    composite = voice.get('composite_scores', {})
    confidence = composite.get('confidence', 0.0)
    anxiety = composite.get('anxiety', 0.0)
    fluency_label = voice.get('interpretation', {}).get('fluency_level', 'Disfluent')
    fluency_component = {'Fluent': 1.0, 'Moderate': 0.6, 'Disfluent': 0.2}.get(fluency_label, 0.2)
    # Speaking rate scores 1.0 at the 2.5 wps ideal, decaying linearly to 0.
    ideal_rate = 2.5
    rate = voice.get('speaking_rate', 0.0)
    rate_component = max(0, 1 - (abs(rate - ideal_rate) / ideal_rate))
    disfluency = (voice.get('filler_ratio', 0.0) + voice.get('repetition_score', 0.0)) / 2
    disfluency_component = max(0, 1 - disfluency)
    # Crude content proxy: interviews longer than a minute score higher.
    long_enough = analysis_data.get('text_analysis', {}).get('total_duration', 0) > 60
    content_component = 0.85 if long_enough else 0.4
    raw = (confidence * weights['confidence']
           + (1 - anxiety) * abs(weights['anxiety'])
           + fluency_component * weights['fluency']
           + rate_component * weights['speaking_rate']
           + disfluency_component * abs(weights['filler_repetition'])
           + content_component * weights['content'])
    ceiling = (weights['confidence'] + abs(weights['anxiety']) + weights['fluency']
               + weights['speaking_rate'] + abs(weights['filler_repetition']) + weights['content'])
    if ceiling == 0:
        return 50.0
    probability = max(0.0, min(1.0, raw / ceiling))
    # Round via string formatting to keep two decimal places exactly.
    return float(f"{probability * 100:.2f}")
437
+
438
def generate_report(analysis_data: Dict) -> str:
    """Compose the HR evaluation prompt and ask Gemini for the report text.

    Args:
        analysis_data: Aggregated pipeline output with 'transcript',
            'speakers', 'voice_analysis', 'text_analysis', and optionally
            'acceptance_probability'.

    Returns:
        The model-generated report text, or an error message string if the
        API call or prompt assembly fails (errors are not re-raised).
    """
    try:
        voice = analysis_data.get('voice_analysis', {})
        voice_interpretation = generate_voice_interpretation(voice)
        # Up to six interviewee turns are quoted verbatim in the prompt.
        # NOTE(review): turns are read via u['text'] here, while other
        # functions in this module read u['speech'] — confirm classified
        # utterances carry both keys.
        interviewee_responses = [f"Speaker {u['speaker']} ({u['role']}): {u['text']}" for u in analysis_data['transcript'] if u['role'] == 'Interviewee'][:6]
        acceptance_prob = analysis_data.get('acceptance_probability', None)
        acceptance_line = ""
        if acceptance_prob is not None:
            # Map the 0-100 suitability score onto a canned HR verdict line.
            acceptance_line = f"\n**Hiring Suitability Score: {acceptance_prob:.2f}%**\n"
            if acceptance_prob >= 80: acceptance_line += "HR Verdict: Outstanding candidate, highly recommended for immediate advancement."
            elif acceptance_prob >= 60: acceptance_line += "HR Verdict: Strong candidate, suitable for further evaluation with targeted development."
            elif acceptance_prob >= 40: acceptance_line += "HR Verdict: Moderate potential, requires additional assessment and skill-building."
            else: acceptance_line += "HR Verdict: Limited fit, significant improvement needed for role alignment."
        # The numbered section headings here feed the regex-based section
        # parser in create_pdf_report — keep the two in sync.
        prompt = f"""
You are EvalBot, a senior HR consultant with 20+ years of experience, delivering a polished, concise, and engaging interview analysis report. Use a professional tone, clear headings, and bullet points ('- ') for readability. Avoid redundancy and ensure distinct sections for strengths, growth areas, and recommendations.
{acceptance_line}
**1. Executive Summary**
- Provide a concise overview of performance, key metrics, and hiring potential.
- Interview length: {analysis_data['text_analysis']['total_duration']:.2f} seconds
- Speaker turns: {analysis_data['text_analysis']['speaker_turns']}
- Participants: {', '.join(analysis_data['speakers'])}
**2. Communication and Vocal Dynamics**
- Evaluate vocal delivery (rate, fluency, confidence) and professional impact.
- Offer HR insights on workplace alignment.
{voice_interpretation}
**3. Competency and Content Evaluation**
- Assess competencies: leadership, problem-solving, communication, adaptability.
- List strengths and growth areas separately, with specific examples.
- Sample responses:
{chr(10).join(interviewee_responses)}
**4. Role Fit and Growth Potential**
- Analyze cultural fit, role readiness, and long-term potential.
- Highlight enthusiasm and scalability.
**5. Strategic HR Recommendations**
- Provide distinct, prioritized strategies for candidate growth.
- Target: Communication, Response Depth, Professional Presence.
- List clear next steps for hiring managers (e.g., advance, train, assess).
"""
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Report generation failed: {str(e)}")
        return f"Error generating report: {str(e)}"
481
+
482
def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
    """Build the styled PDF report from analysis data and the Gemini text.

    Layout: a title/snapshot page, then a detailed-evaluation page whose
    sections are filled by splitting gemini_report_text on its numbered
    '**N. ...**' headings.

    Args:
        analysis_data: Aggregated pipeline output (transcript, speakers,
            voice_analysis, text_analysis, acceptance_probability).
        output_path: Destination PDF path.
        gemini_report_text: Report text produced by generate_report.

    Returns:
        True on success, False on any failure (errors are logged, not raised).
    """
    # NOTE(review): relies on module-level io, Image, PageBreak, and the
    # matplotlib-backed chart helper — confirm those imports exist at file top.
    try:
        doc = SimpleDocTemplate(output_path, pagesize=letter,
                                rightMargin=0.7*inch, leftMargin=0.7*inch,
                                topMargin=0.9*inch, bottomMargin=0.9*inch)
        styles = getSampleStyleSheet()  # fetched but not used below
        # Custom paragraph styles (standalone; not registered in `styles`).
        h1 = ParagraphStyle(name='Heading1', fontSize=22, leading=26, spaceAfter=20, alignment=1, textColor=colors.HexColor('#003087'), fontName='Helvetica-Bold')
        h2 = ParagraphStyle(name='Heading2', fontSize=15, leading=18, spaceBefore=14, spaceAfter=8, textColor=colors.HexColor('#0050BC'), fontName='Helvetica-Bold')
        h3 = ParagraphStyle(name='Heading3', fontSize=11, leading=14, spaceBefore=10, spaceAfter=6, textColor=colors.HexColor('#3F7CFF'), fontName='Helvetica')
        body_text = ParagraphStyle(name='BodyText', fontSize=10, leading=13, spaceAfter=8, fontName='Helvetica', textColor=colors.HexColor('#333333'))
        bullet_style = ParagraphStyle(name='Bullet', parent=body_text, leftIndent=20, bulletIndent=10, fontName='Helvetica', bulletFontName='Helvetica', bulletFontSize=10)

        story = []

        def header_footer(canvas, doc):
            # Draw the per-page footer line and header rule/title/date.
            canvas.saveState()
            canvas.setFont('Helvetica', 8)
            canvas.setFillColor(colors.HexColor('#666666'))
            canvas.drawString(doc.leftMargin, 0.4 * inch, f"Page {doc.page} | EvalBot HR Interview Report | Confidential")
            canvas.setStrokeColor(colors.HexColor('#0050BC'))
            canvas.setLineWidth(1)
            canvas.line(doc.leftMargin, doc.height + 0.85*inch, doc.width + doc.leftMargin, doc.height + 0.85*inch)
            canvas.setFont('Helvetica-Bold', 10)
            canvas.drawString(doc.leftMargin, doc.height + 0.9*inch, "Candidate Interview Analysis")
            canvas.drawRightString(doc.width + doc.leftMargin, doc.height + 0.9*inch, time.strftime('%B %d, %Y'))
            canvas.restoreState()

        # Title Page
        story.append(Paragraph("Candidate Interview Analysis", h1))
        story.append(Paragraph(f"Generated: {time.strftime('%B %d, %Y')}", ParagraphStyle(name='Date', alignment=1, fontSize=10, textColor=colors.HexColor('#666666'), fontName='Helvetica')))
        story.append(Spacer(1, 0.5 * inch))
        acceptance_prob = analysis_data.get('acceptance_probability')
        if acceptance_prob is not None:
            story.append(Paragraph("Hiring Suitability Snapshot", h2))
            # Green >= 80, orange >= 60, red below.
            prob_color = colors.HexColor('#2E7D32') if acceptance_prob >= 80 else (colors.HexColor('#F57C00') if acceptance_prob >= 60 else colors.HexColor('#D32F2F'))
            story.append(Paragraph(f"Suitability Score: <font size=16 color='{prob_color.hexval()}'><b>{acceptance_prob:.2f}%</b></font>",
                                   ParagraphStyle(name='Prob', fontSize=12, spaceAfter=12, alignment=1, fontName='Helvetica-Bold')))
            # Verdict thresholds mirror generate_report's acceptance_line.
            if acceptance_prob >= 80:
                story.append(Paragraph("<b>HR Verdict:</b> Outstanding candidate, highly recommended for immediate advancement.", body_text))
            elif acceptance_prob >= 60:
                story.append(Paragraph("<b>HR Verdict:</b> Strong candidate, suitable for further evaluation with targeted development.", body_text))
            elif acceptance_prob >= 40:
                story.append(Paragraph("<b>HR Verdict:</b> Moderate potential, requires additional assessment and skill-building.", body_text))
            else:
                story.append(Paragraph("<b>HR Verdict:</b> Limited fit, significant improvement needed for role alignment.", body_text))
        story.append(Spacer(1, 0.3 * inch))
        # Interview metadata table.
        table_data = [
            ['Metric', 'Value'],
            ['Interview Duration', f"{analysis_data['text_analysis']['total_duration']:.2f} seconds"],
            ['Speaker Turns', f"{analysis_data['text_analysis']['speaker_turns']}"],
            ['Participants', ', '.join(sorted(analysis_data['speakers']))]
        ]
        table = Table(table_data, colWidths=[2.2*inch, 3.8*inch])
        table.setStyle(TableStyle([
            ('BACKGROUND', (0,0), (-1,0), colors.HexColor('#0050BC')),
            ('TEXTCOLOR', (0,0), (-1,0), colors.white),
            ('ALIGN', (0,0), (-1,-1), 'LEFT'),
            ('VALIGN', (0,0), (-1,-1), 'MIDDLE'),
            ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
            ('FONTSIZE', (0,0), (-1,-1), 9),
            ('BOTTOMPADDING', (0,0), (-1,0), 10),
            ('TOPPADDING', (0,0), (-1,0), 10),
            ('BACKGROUND', (0,1), (-1,-1), colors.HexColor('#F5F6FA')),
            ('GRID', (0,0), (-1,-1), 0.5, colors.HexColor('#DDE4EB'))
        ]))
        story.append(table)
        story.append(Spacer(1, 0.4 * inch))
        story.append(Paragraph("Prepared by: EvalBot - AI-Powered HR Analysis", body_text))
        story.append(PageBreak())

        # Detailed Analysis
        story.append(Paragraph("Detailed Candidate Evaluation", h1))

        # Communication and Vocal Dynamics: metrics table + bar chart image.
        story.append(Paragraph("1. Communication & Vocal Dynamics", h2))
        voice_analysis = analysis_data.get('voice_analysis', {})
        if voice_analysis and 'error' not in voice_analysis:
            table_data = [
                ['Metric', 'Value', 'HR Insight'],
                ['Speaking Rate', f"{voice_analysis.get('speaking_rate', 0):.2f} words/sec", 'Benchmark: 2.0-3.0 wps; impacts clarity'],
                ['Filler Words', f"{voice_analysis.get('filler_ratio', 0) * 100:.1f}%", 'High usage reduces credibility'],
                ['Anxiety', voice_analysis.get('interpretation', {}).get('anxiety_level', 'N/A'), f"Score: {voice_analysis.get('composite_scores', {}).get('anxiety', 0):.3f}; stress response"],
                ['Confidence', voice_analysis.get('interpretation', {}).get('confidence_level', 'N/A'), f"Score: {voice_analysis.get('composite_scores', {}).get('confidence', 0):.3f}; vocal strength"],
                ['Fluency', voice_analysis.get('interpretation', {}).get('fluency_level', 'N/A'), 'Drives engagement']
            ]
            table = Table(table_data, colWidths=[1.7*inch, 1.2*inch, 3.1*inch])
            table.setStyle(TableStyle([
                ('BACKGROUND', (0,0), (-1,0), colors.HexColor('#0050BC')),
                ('TEXTCOLOR', (0,0), (-1,0), colors.white),
                ('ALIGN', (0,0), (-1,-1), 'LEFT'),
                ('VALIGN', (0,0), (-1,-1), 'MIDDLE'),
                ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
                ('FONTSIZE', (0,0), (-1,-1), 9),
                ('BOTTOMPADDING', (0,0), (-1,0), 10),
                ('TOPPADDING', (0,0), (-1,0), 10),
                ('BACKGROUND', (0,1), (-1,-1), colors.HexColor('#F5F6FA')),
                ('GRID', (0,0), (-1,-1), 0.5, colors.HexColor('#DDE4EB'))
            ]))
            story.append(table)
            story.append(Spacer(1, 0.2 * inch))
            # Chart is rendered to an in-memory PNG, never touching disk.
            chart_buffer = io.BytesIO()
            generate_anxiety_confidence_chart(voice_analysis.get('composite_scores', {}), chart_buffer)
            chart_buffer.seek(0)
            img = Image(chart_buffer, width=4.8*inch, height=3.2*inch)
            img.hAlign = 'CENTER'
            story.append(img)
        else:
            story.append(Paragraph("Vocal analysis unavailable.", body_text))
        story.append(Spacer(1, 0.3 * inch))

        # Parse Gemini Report into named sections via keyword heuristics.
        sections = {
            "Executive Summary": [],
            "Communication and Vocal Dynamics": [],
            "Competency and Content Evaluation": {"Strengths": [], "Growth Areas": []},
            "Role Fit and Growth Potential": [],
            "Strategic HR Recommendations": {"Development Priorities": [], "Next Steps": []}
        }
        # Split on the numbered '**N. ...**' headings emitted by the prompt.
        report_parts = re.split(r'(\s*\*\*\s*\d\.\s*.*?\s*\*\*)', gemini_report_text)
        current_section = None
        for part in report_parts:
            if not part.strip(): continue
            is_heading = False
            for title in sections.keys():
                if title.lower() in part.lower():
                    current_section = title
                    is_heading = True
                    break
            if not is_heading and current_section:
                # Keyword routing into strengths/growth or priorities/steps.
                if current_section == "Competency and Content Evaluation":
                    if 'strength' in part.lower() or any(k in part.lower() for k in ['leadership', 'problem-solving', 'communication', 'adaptability']):
                        sections[current_section]["Strengths"].append(part.strip())
                    elif 'improve' in part.lower() or 'grow' in part.lower() or 'challenge' in part.lower():
                        sections[current_section]["Growth Areas"].append(part.strip())
                elif current_section == "Strategic HR Recommendations":
                    if any(k in part.lower() for k in ['communication', 'depth', 'presence', 'improve']):
                        sections[current_section]["Development Priorities"].append(part.strip())
                    elif any(k in part.lower() for k in ['advance', 'train', 'assess', 'next step']):
                        sections[current_section]["Next Steps"].append(part.strip())
                else:
                    sections[current_section].append(part.strip())

        # Executive Summary
        story.append(Paragraph("2. Executive Summary", h2))
        if sections['Executive Summary']:
            for line in sections['Executive Summary']:
                if line.startswith(('-', '•', '*')):
                    story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
                else:
                    story.append(Paragraph(line, body_text))
        else:
            story.append(Paragraph("Summary unavailable.", body_text))
        story.append(Spacer(1, 0.3 * inch))

        # Competency and Content
        story.append(Paragraph("3. Competency & Content", h2))
        story.append(Paragraph("Strengths", h3))
        if sections['Competency and Content Evaluation']['Strengths']:
            for line in sections['Competency and Content Evaluation']['Strengths']:
                story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
        else:
            story.append(Paragraph("No strengths identified.", body_text))
        story.append(Spacer(1, 0.2 * inch))
        story.append(Paragraph("Growth Areas", h3))
        if sections['Competency and Content Evaluation']['Growth Areas']:
            for line in sections['Competency and Content Evaluation']['Growth Areas']:
                story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
        else:
            story.append(Paragraph("No growth areas identified.", body_text))
        story.append(Spacer(1, 0.3 * inch))

        # Role Fit
        story.append(Paragraph("4. Role Fit & Potential", h2))
        if sections['Role Fit and Growth Potential']:
            for line in sections['Role Fit and Growth Potential']:
                if line.startswith(('-', '•', '*')):
                    story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
                else:
                    story.append(Paragraph(line, body_text))
        else:
            story.append(Paragraph("Fit and potential analysis unavailable.", body_text))
        story.append(Spacer(1, 0.3 * inch))

        # Strategic Recommendations
        story.append(Paragraph("5. Strategic Recommendations", h2))
        story.append(Paragraph("Development Priorities", h3))
        if sections['Strategic HR Recommendations']['Development Priorities']:
            for line in sections['Strategic HR Recommendations']['Development Priorities']:
                story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
        else:
            story.append(Paragraph("No development priorities specified.", body_text))
        story.append(Spacer(1, 0.2 * inch))
        story.append(Paragraph("Next Steps for Managers", h3))
        if sections['Strategic HR Recommendations']['Next Steps']:
            for line in sections['Strategic HR Recommendations']['Next Steps']:
                story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
        else:
            story.append(Paragraph("No next steps provided.", body_text))
        story.append(Spacer(1, 0.3 * inch))
        story.append(Paragraph("This report provides a data-driven evaluation to guide hiring and development decisions.", body_text))

        doc.build(story, onFirstPage=header_footer, onLaterPages=header_footer)
        return True
    except Exception as e:
        logger.error(f"PDF creation failed: {str(e)}", exc_info=True)
        return False
688
 
689
def convert_to_serializable(obj):
    """Recursively convert numpy scalars/arrays to plain Python for json.dump.

    Dicts and lists are walked recursively; any other type passes through
    unchanged.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    return obj
695
 
696
def process_interview(audio_path_or_url: str):
    """Run the full interview-analysis pipeline for a local path or URL.

    Steps: (optionally) download -> convert to WAV -> transcribe -> prosodic
    features -> speaker identification -> role classification -> voice
    analysis -> suitability scoring -> Gemini report -> PDF + JSON artifacts.

    Args:
        audio_path_or_url: Local audio file path, or an http(s) URL that will
            be downloaded first.

    Returns:
        {'pdf_path': ..., 'json_path': ...} for the generated artifacts.

    Raises:
        Re-raises any pipeline failure after logging it. Intermediate files
        are cleaned up in all cases.
    """
    local_audio_path = None
    wav_file = None
    is_downloaded = False
    try:
        logger.info(f"Starting processing for {audio_path_or_url}")
        if audio_path_or_url.startswith(('http://', 'https://')):
            local_audio_path = download_audio_from_url(audio_path_or_url)
            is_downloaded = True
        else:
            local_audio_path = audio_path_or_url
        wav_file = convert_to_wav(local_audio_path)
        transcript = transcribe(wav_file)
        for utterance in transcript['utterances']:
            utterance['prosodic_features'] = extract_prosodic_features(wav_file, utterance['start'], utterance['end'])
        utterances_with_speakers = identify_speakers(transcript, wav_file)
        # FIX: reuse the cached role-model artifacts only when ALL three
        # files exist. Previously only role_classifier.pkl was checked, so a
        # partial cache (classifier saved, vectorizer/scaler missing) crashed
        # on joblib.load instead of falling back to retraining.
        clf_path = os.path.join(OUTPUT_DIR, 'role_classifier.pkl')
        vec_path = os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl')
        scaler_path = os.path.join(OUTPUT_DIR, 'feature_scaler.pkl')
        if all(os.path.exists(p) for p in (clf_path, vec_path, scaler_path)):
            clf = joblib.load(clf_path)
            vectorizer = joblib.load(vec_path)
            scaler = joblib.load(scaler_path)
        else:
            clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)
        classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)
        voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)
        analysis_data = {
            'transcript': classified_utterances,
            'speakers': list(set(u['speaker'] for u in classified_utterances)),
            'voice_analysis': voice_analysis,
            'text_analysis': {
                'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
                'speaker_turns': len(classified_utterances)
            }
        }
        analysis_data['acceptance_probability'] = calculate_acceptance_probability(analysis_data)
        gemini_report_text = generate_report(analysis_data)
        # Persist artifacts under a random basename to avoid collisions.
        base_name = str(uuid.uuid4())
        pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
        create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text)
        with open(json_path, 'w') as f:
            json.dump(convert_to_serializable(analysis_data), f, indent=2)
        logger.info(f"Processing completed for {audio_path_or_url}")
        return {'pdf_path': pdf_path, 'json_path': json_path}
    except Exception as e:
        logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}", exc_info=True)
        raise
    finally:
        # Best-effort cleanup: remove the intermediate WAV always, but only
        # delete the source audio when we downloaded it ourselves — never a
        # caller-supplied local file.
        if wav_file and os.path.exists(wav_file):
            os.remove(wav_file)
        if is_downloaded and local_audio_path and os.path.exists(local_audio_path):
            os.remove(local_audio_path)
            logger.info(f"Cleaned up temporary downloaded file: {local_audio_path}")