norhan12 commited on
Commit
87066d1
·
verified ·
1 Parent(s): dda086c

Update process_interview.py

Browse files
Files changed (1) hide show
  1. process_interview.py +541 -535
process_interview.py CHANGED
@@ -10,34 +10,35 @@ import wave
10
  from nemo.collections.asr.models import EncDecSpeakerLabelModel
11
  from pinecone import Pinecone, ServerlessSpec
12
  import librosa
 
 
 
 
13
  import re
14
- from typing import Dict, List
15
  import logging
16
- import tempfile
17
  from reportlab.lib.pagesizes import letter
18
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
19
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
20
  from reportlab.lib.units import inch
21
  from reportlab.lib import colors
22
- import matplotlib.pyplot as plt
23
- import matplotlib
24
- matplotlib.use('Agg')
25
- from reportlab.platypus import Image
26
- import io
27
  import spacy
28
  import google.generativeai as genai
 
29
  from concurrent.futures import ThreadPoolExecutor
30
- import urllib3 # <-- تم الإصلاح: إضافة استيراد urllib3
31
 
32
- # إعدادات التسجيل (Logging)
33
  logging.basicConfig(level=logging.INFO)
34
  logger = logging.getLogger(__name__)
35
- # تقليل verbosity من مكتبة NeMo
36
- logging.getLogger("nemo_logging").setLevel(logging.WARNING)
37
- logging.getLogger("nemo").setLevel(logging.WARNING)
38
-
39
 
40
  # Configuration
 
41
  OUTPUT_DIR = "./processed_audio"
42
  os.makedirs(OUTPUT_DIR, exist_ok=True)
43
 
@@ -46,34 +47,9 @@ PINECONE_KEY = os.getenv("PINECONE_KEY")
46
  ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
47
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
48
 
49
# Resilient download helper with retry + exponential backoff.
def download_audio_from_url(url: str, retries=3) -> str:
    """Download an audio file from *url* to a temporary local path.

    Retries up to *retries* times on network errors, sleeping
    2**attempt seconds between attempts (exponential backoff).

    Returns:
        Path of the downloaded temporary file.

    Raises:
        requests.exceptions.RequestException / urllib3.exceptions.ProtocolError:
            re-raised after the final failed attempt.
    """
    temp_dir = tempfile.gettempdir()
    temp_path = os.path.join(temp_dir, f"{uuid.uuid4()}.tmp_audio")
    logger.info(f"Downloading audio from {url} to {temp_path}")

    for attempt in range(retries):
        try:
            # Stream the response to disk in chunks so large recordings
            # never have to fit in memory.
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(temp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            logger.info("Download completed successfully.")
            return temp_path
        except (requests.exceptions.RequestException, urllib3.exceptions.ProtocolError) as e:
            logger.warning(f"Attempt {attempt + 1}/{retries} failed: {e}. Retrying...")
            if attempt < retries - 1:
                time.sleep(2 ** attempt)  # backoff: 1s, 2s, 4s, ...
            else:
                logger.error(f"Failed to download audio after {retries} attempts.")
                # Fix: don't leak a partially written temp file on final failure.
                try:
                    os.remove(temp_path)
                except OSError:
                    pass
                raise
    # Defensive fallback; unreachable because the last attempt re-raises.
    raise Exception(f"Failed to download audio from URL {url}")
73
-
74
 
 
75
  def initialize_services():
76
- """Initializes Pinecone and Gemini services."""
77
  try:
78
  pc = Pinecone(api_key=PINECONE_KEY)
79
  index_name = "interview-speaker-embeddings"
@@ -85,23 +61,30 @@ def initialize_services():
85
  spec=ServerlessSpec(cloud="aws", region="us-east-1")
86
  )
87
  index = pc.Index(index_name)
88
-
89
  genai.configure(api_key=GEMINI_API_KEY)
90
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
 
91
  return index, gemini_model
92
  except Exception as e:
93
  logger.error(f"Error initializing services: {str(e)}")
94
  raise
95
 
 
96
  index, gemini_model = initialize_services()
 
 
97
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
98
  logger.info(f"Using device: {device}")
99
 
 
100
  def load_speaker_model():
101
- """Loads the speaker verification model."""
102
  try:
103
- # يضمن عدم استخدام عدد كبير جدًا من الخيوط
104
- torch.set_num_threads(1)
 
 
 
105
  model = EncDecSpeakerLabelModel.from_pretrained(
106
  "nvidia/speakerverification_en_titanet_large",
107
  map_location=torch.device('cpu')
@@ -112,19 +95,30 @@ def load_speaker_model():
112
  logger.error(f"Model loading failed: {str(e)}")
113
  raise RuntimeError("Could not load speaker verification model")
114
 
 
 
115
def load_models():
    """Load every model the pipeline needs.

    Returns a (speaker_model, nlp) tuple: the NeMo speaker-verification
    model and the small English spaCy pipeline.
    """
    return load_speaker_model(), spacy.load("en_core_web_sm")
120
 
121
- speaker_model, nlp = load_models()
 
 
 
 
122
 
 
 
 
 
 
123
def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
    """Converts any audio file to a 16kHz mono WAV file.

    Writes the result under *output_dir* with a random UUID name and
    returns its path; re-raises on any conversion failure.
    """
    try:
        audio = AudioSegment.from_file(audio_path)
        # Normalize to 16 kHz mono — the format the downstream speaker
        # model and librosa analysis expect.
        audio = audio.set_frame_rate(16000).set_channels(1)
        wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
        audio.export(wav_file, format="wav")
        return wav_file
    except Exception as e:
        logger.error(f"Audio conversion failed: {str(e)}")
        raise
134
 
 
135
def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
    """Extracts prosodic features from an audio segment.

    Loads only the [start_ms, end_ms] window of *audio_path* (times in
    milliseconds) at 16 kHz and returns duration plus pitch/intensity
    summary statistics.  On failure a dict of zeros with the same keys
    is returned so callers never see an exception.
    """
    try:
        y, sr = librosa.load(audio_path, sr=16000, offset=start_ms/1000.0, duration=(end_ms-start_ms)/1000.0)

        pitches, _ = librosa.piptrack(y=y, sr=sr)
        # piptrack reports 0 for unvoiced bins — keep voiced estimates only.
        pitches = pitches[pitches > 0]

        # Frame-wise RMS energy as an intensity proxy.
        rms = librosa.feature.rms(y=y)[0]

        return {
            'duration': (end_ms - start_ms) / 1000,
            'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
            'pitch_sd': float(np.std(pitches)) if len(pitches) > 0 else 0.0,
            'intensityMean': float(np.mean(rms)),
            'intensitySD': float(np.std(rms)),
        }
    except Exception as e:
        logger.error(f"Feature extraction failed: {str(e)}")
        return {'duration': 0, 'mean_pitch': 0, 'pitch_sd': 0, 'intensityMean': 0, 'intensitySD': 0}
 
 
 
 
 
 
 
 
 
 
 
155
 
156
def transcribe(audio_path: str) -> Dict:
    """Transcribes audio using AssemblyAI and enables speaker labels.

    Uploads the file, starts a transcription job with diarization
    ("speaker_labels"), then polls every 5 seconds until completion.

    Returns:
        The raw transcript JSON from AssemblyAI.

    Raises:
        ValueError: job completed but produced no utterances.
        Exception: job ended in 'error' status, or any network failure.
    """
    try:
        headers = {"authorization": ASSEMBLYAI_KEY}

        # Step 1: upload the raw audio bytes.
        with open(audio_path, 'rb') as f:
            upload_response = requests.post("https://api.assemblyai.com/v2/upload", headers=headers, data=f)
        audio_url = upload_response.json()['upload_url']

        # Step 2: create the transcription job with diarization enabled.
        transcript_request = {
            "audio_url": audio_url,
            "speaker_labels": True,
        }
        transcript_response = requests.post("https://api.assemblyai.com/v2/transcript", json=transcript_request, headers=headers)
        transcript_id = transcript_response.json()['id']

        # Step 3: poll until the job completes or errors.
        while True:
            result = requests.get(f"https://api.assemblyai.com/v2/transcript/{transcript_id}", headers=headers).json()
            if result['status'] == 'completed':
                # Guard: downstream code indexes result['utterances'].
                if not result.get('utterances'):
                    raise ValueError("Transcription completed but no utterances were returned. The audio may be too short or silent.")
                return result
            elif result['status'] == 'error':
                raise Exception(f"Transcription failed: {result['error']}")
            time.sleep(5)
    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}")
        raise
185
 
186
def process_utterance(utterance, full_audio):
    """Processes a single utterance to get a speaker embedding.

    Slices [start, end] (milliseconds) out of *full_audio* (a pydub
    AudioSegment), exports it to a temporary WAV and runs the speaker
    model on it under no_grad (inference only).

    Returns the utterance dict with an added 'embedding' key; on any
    failure a zero vector is substituted so downstream code keeps working.
    """
    try:
        start, end = utterance['start'], utterance['end']
        segment = full_audio[start:end]

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_f:
            segment.export(temp_f.name, format="wav")
            with torch.no_grad():
                embedding = speaker_model.get_embedding(temp_f.name).cpu().numpy().flatten()

        return {**utterance, 'embedding': embedding}
    except Exception as e:
        logger.error(f"Utterance processing failed: {str(e)}")
        # Fallback assumes the model's embedding dim is 192 (TitaNet-large)
        # — TODO confirm against the loaded model's output size.
        return {**utterance, 'embedding': np.zeros(192)}  # Return zero vector on failure
 
 
 
 
 
 
201
 
202
def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
    """Identifies unique speakers from utterances.

    Computes an embedding for every utterance in parallel, then maps
    AssemblyAI's letter labels (A, B, C...) to stable names
    'Speaker_1', 'Speaker_2', ... in order of first appearance.
    Re-raises on failure.
    """
    try:
        full_audio = AudioSegment.from_wav(wav_file)
        utterances = transcript['utterances']

        # Embed all utterances concurrently; futures list preserves
        # the original utterance order.
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = [executor.submit(process_utterance, u, full_audio) for u in utterances]
            processed_utterances = [f.result() for f in futures]

        # Map AssemblyAI speaker labels (A, B, C...) to unique speaker names
        speaker_map = {}
        unique_speaker_count = 0

        for u in processed_utterances:
            assembly_speaker = u['speaker']
            if assembly_speaker not in speaker_map:
                unique_speaker_count += 1
                speaker_map[assembly_speaker] = f"Speaker_{unique_speaker_count}"
            u['speaker_name'] = speaker_map[assembly_speaker]

        return processed_utterances
    except Exception as e:
        logger.error(f"Speaker identification failed: {str(e)}")
        raise
228
 
229
# Heuristic role classification: the interviewer is whoever asks the
# most questions.
def classify_roles(utterances: List[Dict]) -> List[Dict]:
    """Label each utterance as 'Interviewer' or 'Interviewee'.

    Each speaker accumulates a "question score": +1 for every utterance
    ending in '?' and +1 for every interrogative word it contains.  The
    speaker with the highest score is tagged 'Interviewer', everyone
    else 'Interviewee'.  If classification fails, every utterance gets
    the role 'Unknown'.
    """
    log = logging.getLogger(__name__)
    try:
        question_words = {'what', 'why', 'how', 'when', 'where', 'who', 'which', 'tell', 'describe', 'explain'}
        speaker_stats: Dict[str, Dict[str, int]] = {}

        for utt in utterances:
            name = utt['speaker_name']
            stats = speaker_stats.setdefault(name, {'question_score': 0, 'utterance_count': 0})
            stats['utterance_count'] += 1

            lowered = utt['text'].lower()
            # Ending in a question mark is the strongest signal.
            if lowered.endswith('?'):
                stats['question_score'] += 1
            # One point per interrogative word present.
            tokens = lowered.split()
            for qw in question_words:
                if qw in tokens:
                    stats['question_score'] += 1

        if not speaker_stats:
            # No speakers found — nothing to classify.
            return utterances

        interviewer_speaker = max(speaker_stats, key=lambda s: speaker_stats[s]['question_score'])

        log.info(f"Speaker stats for role classification: {speaker_stats}")
        log.info(f"Identified Interviewer: {interviewer_speaker}")

        for utt in utterances:
            utt['role'] = 'Interviewer' if utt['speaker_name'] == interviewer_speaker else 'Interviewee'

        return utterances
    except Exception as e:
        log.error(f"Role classification failed: {str(e)}")
        # Fall back to a default role so downstream code still runs.
        for utt in utterances:
            utt['role'] = 'Unknown'
        return utterances
279
 
280
 
281
def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
    """Analyzes the voice characteristics of the interviewee.

    Aggregates speaking rate, filler-word usage, word repetition, pitch
    and intensity over every utterance tagged role == 'Interviewee',
    then derives heuristic anxiety/confidence/hesitation scores and
    coarse interpretations.  Returns {'error': ...} when analysis is
    impossible instead of raising.
    """
    try:
        interviewee_utterances = [u for u in utterances if u.get('role') == 'Interviewee']
        if not interviewee_utterances:
            return {'error': 'No interviewee utterances found'}

        y, sr = librosa.load(audio_path, sr=16000)

        # Extract the candidate's audio segments (utterance times are ms).
        segments = [y[int(u['start']*sr/1000):int(u['end']*sr/1000)] for u in interviewee_utterances]

        total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
        total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
        speaking_rate = total_words / (total_duration / 60) if total_duration > 0 else 0  # Words per minute

        # Filler-word analysis.  NOTE(review): multi-word fillers like
        # 'you know' / 'i mean' can never match a single split() token.
        filler_words = {'um', 'uh', 'like', 'you know', 'so', 'i mean', 'actually'}
        filler_count = sum(1 for u in interviewee_utterances for word in u['text'].lower().split() if word in filler_words)
        filler_ratio = filler_count / total_words if total_words > 0 else 0

        # Word-repetition analysis: share of non-unique tokens.
        all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
        repetition_score = (len(all_words) - len(set(all_words))) / len(all_words) if all_words else 0

        # Pitch (pyin; NaN for unvoiced frames) and intensity (RMS)
        # pooled across all interviewee segments.
        pitches = np.concatenate([librosa.pyin(s, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))[0] for s in segments if len(s)>0])
        pitches = pitches[~np.isnan(pitches)]

        intensities = np.concatenate([librosa.feature.rms(y=s)[0] for s in segments if len(s)>0])

        pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
        pitch_std = np.std(pitches) if len(pitches) > 0 else 0
        intensity_mean = np.mean(intensities) if len(intensities) > 0 else 0
        intensity_std = np.std(intensities) if len(intensities) > 0 else 0

        # Composite scores (simple normalizations against ad-hoc constants).
        anxiety_score = (pitch_std / 150) if pitch_std > 0 else 0  # simple normalization
        confidence_score = 1 - (intensity_std * 5) if intensity_std > 0 else 1  # simple normalization
        hesitation_score = (filler_ratio + repetition_score) / 2

        # Clamp scores to [0, 1] (hesitation is left unclamped).
        anxiety_score = max(0, min(1, anxiety_score))
        confidence_score = max(0, min(1, confidence_score))

        return {
            'speaking_rate': float(round(speaking_rate, 2)),
            'filler_ratio': float(round(filler_ratio, 4)),
            'repetition_score': float(round(repetition_score, 4)),
            'pitch_analysis': {'mean': float(round(pitch_mean, 2)), 'std_dev': float(round(pitch_std, 2))},
            'intensity_analysis': {'mean': float(round(intensity_mean, 4)), 'std_dev': float(round(intensity_std, 4))},
            'composite_scores': {
                'anxiety': float(round(anxiety_score, 4)),
                'confidence': float(round(confidence_score, 4)),
                'hesitation': float(round(hesitation_score, 4))
            },
            'interpretation': {
                'anxiety_level': 'high' if anxiety_score > 0.6 else 'moderate' if anxiety_score > 0.3 else 'low',
                'confidence_level': 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.4 else 'low',
                'fluency_level': 'disfluent' if hesitation_score > 0.1 else 'moderate' if hesitation_score > 0.05 else 'fluent'
            }
        }
    except Exception as e:
        logger.error(f"Voice analysis failed: {str(e)}", exc_info=True)
        return {'error': str(e)}
346
 
347
def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
    """Generates a bar chart for anxiety and confidence scores.

    *chart_path_or_buffer* may be a filesystem path or any file-like
    object accepted by matplotlib's savefig (e.g. io.BytesIO).
    Errors are logged, never raised.
    """
    try:
        labels = ['Anxiety', 'Confidence']
        scores = [composite_scores.get('anxiety', 0), composite_scores.get('confidence', 0)]

        fig, ax = plt.subplots(figsize=(5, 3.5))
        bars = ax.bar(labels, scores, color=['#FF5252', '#26A69A'], edgecolor='black', width=0.45)

        ax.set_ylabel('Score (0 to 1)', fontsize=12)
        ax.set_title('Vocal Dynamics: Anxiety vs. Confidence', fontsize=14, pad=15)
        ax.set_ylim(0, 1.1)

        # Annotate each bar with its value just above the top edge.
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2, height + 0.02, f"{height:.2f}",
                    ha='center', va='bottom', color='black', fontweight='bold', fontsize=11)

        ax.grid(True, axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(chart_path_or_buffer, format='png', bbox_inches='tight', dpi=300)
        # Close explicitly so repeated calls don't accumulate figures.
        plt.close(fig)
    except Exception as e:
        logger.error(f"Error generating chart: {str(e)}")
371
-
372
def calculate_acceptance_probability(analysis_data: Dict) -> float:
    """Return a 0-100 hiring-suitability score from the voice analysis.

    Combines confidence, inverted anxiety, fluency (1 - hesitation) and
    a banded speaking-rate score using fixed weights, normalizes the
    weighted sum, clamps it to [0, 1] and scales to a percentage with
    two decimals.  Returns 0.0 when the voice analysis carries an error.
    """
    voice = analysis_data.get('voice_analysis', {})
    if 'error' in voice:
        return 0.0

    # Fixed weights; anxiety contributes negatively, hence abs() below.
    w_confidence, w_anxiety, w_fluency, w_speaking_rate = 0.4, -0.2, 0.2, 0.2

    composite = voice.get('composite_scores', {})
    confidence_score = composite.get('confidence', 0.5)
    anxiety_score = composite.get('anxiety', 0.5)
    fluency_score = 1 - composite.get('hesitation', 0.5)

    # Speaking-rate banding: 120-180 WPM ideal, 100-200 acceptable, else poor.
    rate = voice.get('speaking_rate', 150)
    if 120 <= rate <= 180:
        speaking_rate_score = 1.0
    elif 100 <= rate < 120 or 180 < rate <= 200:
        speaking_rate_score = 0.7
    else:
        speaking_rate_score = 0.4

    raw_score = (confidence_score * w_confidence
                 + (1 - anxiety_score) * abs(w_anxiety)
                 + fluency_score * w_fluency
                 + speaking_rate_score * w_speaking_rate)

    max_possible_score = w_confidence + abs(w_anxiety) + w_fluency + w_speaking_rate
    normalized_score = raw_score / max_possible_score if max_possible_score != 0 else 0

    acceptance_probability = max(0.0, min(1.0, normalized_score))
    return float(f"{acceptance_probability * 100:.2f}")
405
-
406
# Turns the numeric voice analysis into HR-readable markdown bullets.
def generate_voice_interpretation(voice: Dict) -> str:
    """Render the voice-analysis dict as human-readable markdown bullets.

    Returns a single fallback line when *voice* is empty/falsy or
    carries an 'error' key.
    """
    if not voice or 'error' in voice:
        return "- Vocal analysis could not be performed as no interviewee was identified."

    interp = voice.get('interpretation', {})
    scores = voice.get('composite_scores', {})

    confidence = interp.get('confidence_level', 'N/A').capitalize()
    anxiety = interp.get('anxiety_level', 'N/A').capitalize()
    fluency = interp.get('fluency_level', 'N/A').capitalize()
    rate = voice.get('speaking_rate', 0)

    return "\n".join([
        f"- **Confidence:** {confidence} (Score: {scores.get('confidence', 0):.2f}). The candidate's vocal tone suggests their level of assurance.",
        f"- **Anxiety:** {anxiety} (Score: {scores.get('anxiety', 0):.2f}). Vocal stress indicators point to their comfort level during the interview.",
        f"- **Fluency & Hesitation:** {fluency} (Hesitation Score: {scores.get('hesitation', 0):.2f}). Reflects the smoothness of speech and use of filler words.",
        f"- **Speaking Rate:** {rate:.0f} words per minute. A normal conversational pace is typically between 120-180 WPM."
    ])
427
 
428
 
429
def generate_report(analysis_data: Dict) -> str:
    """Generates a comprehensive report using Gemini AI.

    Builds a structured HR prompt from the analysis results (suitability
    score, voice interpretation, sample candidate responses) and asks the
    Gemini model for the narrative.  Returns the model text, or an error
    string — this function never raises.

    NOTE(review): the exact indentation of the prompt literal was lost in
    the diff rendering; whitespace below is best-effort.
    """
    try:
        voice_interpretation = generate_voice_interpretation(analysis_data.get('voice_analysis', {}))

        # At most four sample answers keep the prompt short.
        interviewee_responses = [f"- {u['text']}" for u in analysis_data['transcript'] if u.get('role') == 'Interviewee'][:4]

        acceptance_prob = analysis_data.get('acceptance_probability')
        acceptance_line = ""
        if acceptance_prob is not None:
            acceptance_line = f"\n**Hiring Suitability Score: {acceptance_prob:.2f}%**\n"
            if acceptance_prob >= 80: acceptance_line += "HR Verdict: Outstanding candidate. Highly recommended for advancement."
            elif acceptance_prob >= 60: acceptance_line += "HR Verdict: Strong candidate. Suitable for further evaluation."
            elif acceptance_prob >= 40: acceptance_line += "HR Verdict: Moderate potential. Requires additional assessment."
            else: acceptance_line += "HR Verdict: Limited fit for the role at this time."

        prompt = f"""
You are EvalBot, a senior HR consultant. Generate a polished, concise, and engaging interview analysis report. Use a professional tone, clear headings, and bullet points.

{acceptance_line}

**1. Executive Summary**
- Provide a concise overview of the candidate's performance, key metrics, and hiring potential.
- Interview length: {analysis_data['text_analysis']['total_duration']:.2f} seconds
- Participants: {', '.join(analysis_data['speakers'])}

**2. Communication and Vocal Dynamics**
- Evaluate vocal delivery based on the following analysis. Offer HR insights on its impact.
{voice_interpretation}

**3. Competency and Content Evaluation**
- Based on the sample responses below, assess competencies like leadership, problem-solving, and self-awareness.
- List strengths and growth areas separately, with specific examples.
- Sample Responses from Candidate:
{' '.join(interviewee_responses) if interviewee_responses else "No responses from interviewee were identified."}

**4. Strategic HR Recommendations**
- Provide prioritized strategies for the candidate's growth.
- List clear next steps for hiring managers (e.g., advance, further technical assessment, reject).
"""
        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Report generation failed: {str(e)}")
        return f"Error generating report: {str(e)}"
474
 
 
 
475
  def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
476
  try:
477
- doc = SimpleDocTemplate(output_path, pagesize=letter,
478
- rightMargin=0.7*inch, leftMargin=0.7*inch,
479
- topMargin=0.9*inch, bottomMargin=0.9*inch)
480
  styles = getSampleStyleSheet()
481
- h1 = ParagraphStyle(name='Heading1', fontSize=22, leading=26, spaceAfter=20, alignment=1, textColor=colors.HexColor('#003087'), fontName='Helvetica-Bold')
482
- h2 = ParagraphStyle(name='Heading2', fontSize=15, leading=18, spaceBefore=14, spaceAfter=8, textColor=colors.HexColor('#0050BC'), fontName='Helvetica-Bold')
483
- h3 = ParagraphStyle(name='Heading3', fontSize=11, leading=14, spaceBefore=10, spaceAfter=6, textColor=colors.HexColor('#3F7CFF'), fontName='Helvetica')
484
- body_text = ParagraphStyle(name='BodyText', fontSize=10, leading=13, spaceAfter=8, fontName='Helvetica', textColor=colors.HexColor('#333333'))
485
- bullet_style = ParagraphStyle(name='Bullet', parent=body_text, leftIndent=20, bulletIndent=10, fontName='Helvetica', bulletFontName='Helvetica', bulletFontSize=10)
486
-
 
 
 
 
 
487
  story = []
488
 
489
- def header_footer(canvas, doc):
490
- canvas.saveState()
491
- canvas.setFont('Helvetica', 8)
492
- canvas.setFillColor(colors.HexColor('#666666'))
493
- canvas.drawString(doc.leftMargin, 0.4 * inch, f"Page {doc.page} | EvalBot HR Interview Report | Confidential")
494
- canvas.setStrokeColor(colors.HexColor('#0050BC'))
495
- canvas.setLineWidth(1)
496
- canvas.line(doc.leftMargin, doc.height + 0.85*inch, doc.width + doc.leftMargin, doc.height + 0.85*inch)
497
- canvas.setFont('Helvetica-Bold', 10)
498
- canvas.drawString(doc.leftMargin, doc.height + 0.9*inch, "Candidate Interview Analysis")
499
- canvas.drawRightString(doc.width + doc.leftMargin, doc.height + 0.9*inch, time.strftime('%B %d, %Y'))
500
- canvas.restoreState()
501
-
502
- # Title Page
503
- story.append(Paragraph("Candidate Interview Analysis", h1))
504
- story.append(Paragraph(f"Generated: {time.strftime('%B %d, %Y')}", ParagraphStyle(name='Date', alignment=1, fontSize=10, textColor=colors.HexColor('#666666'), fontName='Helvetica')))
505
- story.append(Spacer(1, 0.5 * inch))
506
- acceptance_prob = analysis_data.get('acceptance_probability')
507
- if acceptance_prob is not None:
508
- story.append(Paragraph("Hiring Suitability Snapshot", h2))
509
- prob_color = colors.HexColor('#2E7D32') if acceptance_prob >= 80 else (colors.HexColor('#F57C00') if acceptance_prob >= 60 else colors.HexColor('#D32F2F'))
510
- story.append(Paragraph(f"Suitability Score: <font size=16 color='{prob_color.hexval()}'><b>{acceptance_prob:.2f}%</b></font>",
511
- ParagraphStyle(name='Prob', fontSize=12, spaceAfter=12, alignment=1, fontName='Helvetica-Bold')))
512
- if acceptance_prob >= 80:
513
- story.append(Paragraph("<b>HR Verdict:</b> Outstanding candidate, highly recommended for immediate advancement.", body_text))
514
- elif acceptance_prob >= 60:
515
- story.append(Paragraph("<b>HR Verdict:</b> Strong candidate, suitable for further evaluation with targeted development.", body_text))
516
- elif acceptance_prob >= 40:
517
- story.append(Paragraph("<b>HR Verdict:</b> Moderate potential, requires additional assessment and skill-building.", body_text))
518
- else:
519
- story.append(Paragraph("<b>HR Verdict:</b> Limited fit, significant improvement needed for role alignment.", body_text))
520
- story.append(Spacer(1, 0.3 * inch))
521
- table_data = [
522
- ['Metric', 'Value'],
523
- ['Interview Duration', f"{analysis_data['text_analysis']['total_duration']:.2f} seconds"],
524
- ['Speaker Turns', f"{analysis_data['text_analysis']['speaker_turns']}"],
525
- ['Participants', ', '.join(sorted(analysis_data['speakers']))]
526
- ]
527
- table = Table(table_data, colWidths=[2.2*inch, 3.8*inch])
528
- table.setStyle(TableStyle([
529
- ('BACKGROUND', (0,0), (-1,0), colors.HexColor('#0050BC')),
530
- ('TEXTCOLOR', (0,0), (-1,0), colors.white),
531
- ('ALIGN', (0,0), (-1,-1), 'LEFT'),
532
- ('VALIGN', (0,0), (-1,-1), 'MIDDLE'),
533
- ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
534
- ('FONTSIZE', (0,0), (-1,-1), 9),
535
- ('BOTTOMPADDING', (0,0), (-1,0), 10),
536
- ('TOPPADDING', (0,0), (-1,0), 10),
537
- ('BACKGROUND', (0,1), (-1,-1), colors.HexColor('#F5F6FA')),
538
- ('GRID', (0,0), (-1,-1), 0.5, colors.HexColor('#DDE4EB'))
539
- ]))
540
- story.append(table)
541
- story.append(Spacer(1, 0.4 * inch))
542
- story.append(Paragraph("Prepared by: EvalBot - AI-Powered HR Analysis", body_text))
543
- story.append(PageBreak())
544
-
545
- # Detailed Analysis
546
- story.append(Paragraph("Detailed Candidate Evaluation", h1))
547
-
548
- # Communication and Vocal Dynamics
549
- story.append(Paragraph("1. Communication & Vocal Dynamics", h2))
550
  voice_analysis = analysis_data.get('voice_analysis', {})
 
551
  if voice_analysis and 'error' not in voice_analysis:
 
552
  table_data = [
553
- ['Metric', 'Value', 'HR Insight'],
554
- ['Speaking Rate', f"{voice_analysis.get('speaking_rate', 0):.2f} words/sec", 'Benchmark: 2.0-3.0 wps; impacts clarity'],
555
- ['Filler Words', f"{voice_analysis.get('filler_ratio', 0) * 100:.1f}%", 'High usage reduces credibility'],
556
- ['Anxiety', voice_analysis.get('interpretation', {}).get('anxiety_level', 'N/A'), f"Score: {voice_analysis.get('composite_scores', {}).get('anxiety', 0):.3f}; stress response"],
557
- ['Confidence', voice_analysis.get('interpretation', {}).get('confidence_level', 'N/A'), f"Score: {voice_analysis.get('composite_scores', {}).get('confidence', 0):.3f}; vocal strength"],
558
- ['Fluency', voice_analysis.get('interpretation', {}).get('fluency_level', 'N/A'), 'Drives engagement']
 
 
 
559
  ]
560
- table = Table(table_data, colWidths=[1.7*inch, 1.2*inch, 3.1*inch])
561
- table.setStyle(TableStyle([
562
- ('BACKGROUND', (0,0), (-1,0), colors.HexColor('#0050BC')),
563
- ('TEXTCOLOR', (0,0), (-1,0), colors.white),
564
- ('ALIGN', (0,0), (-1,-1), 'LEFT'),
565
- ('VALIGN', (0,0), (-1,-1), 'MIDDLE'),
566
- ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
567
- ('FONTSIZE', (0,0), (-1,-1), 9),
568
- ('BOTTOMPADDING', (0,0), (-1,0), 10),
569
- ('TOPPADDING', (0,0), (-1,0), 10),
570
- ('BACKGROUND', (0,1), (-1,-1), colors.HexColor('#F5F6FA')),
571
- ('GRID', (0,0), (-1,-1), 0.5, colors.HexColor('#DDE4EB'))
572
- ]))
 
 
 
 
573
  story.append(table)
574
  story.append(Spacer(1, 0.2 * inch))
575
- chart_buffer = io.BytesIO()
576
- generate_anxiety_confidence_chart(voice_analysis.get('composite_scores', {}), chart_buffer)
577
- chart_buffer.seek(0)
578
- img = Image(chart_buffer, width=4.8*inch, height=3.2*inch)
579
- img.hAlign = 'CENTER'
580
- story.append(img)
581
- else:
582
- story.append(Paragraph("Vocal analysis unavailable.", body_text))
583
- story.append(Spacer(1, 0.3 * inch))
584
 
585
- # Parse Gemini Report
586
- sections = {
587
- "Executive Summary": [],
588
- "Communication and Vocal Dynamics": [],
589
- "Competency and Content Evaluation": {"Strengths": [], "Growth Areas": []},
590
- "Role Fit and Growth Potential": [],
591
- "Strategic HR Recommendations": {"Development Priorities": [], "Next Steps": []}
592
- }
593
- report_parts = re.split(r'(\s*\\\s*\d\.\s*.?\s\\)', gemini_report_text)
594
- current_section = None
595
- for part in report_parts:
596
- if not part.strip(): continue
597
- is_heading = False
598
- for title in sections.keys():
599
- if title.lower() in part.lower():
600
- current_section = title
601
- is_heading = True
602
- break
603
- if not is_heading and current_section:
604
- if current_section == "Competency and Content Evaluation":
605
- if 'strength' in part.lower() or any(k in part.lower() for k in ['leadership', 'problem-solving', 'communication', 'adaptability']):
606
- sections[current_section]["Strengths"].append(part.strip())
607
- elif 'improve' in part.lower() or 'grow' in part.lower() or 'challenge' in part.lower():
608
- sections[current_section]["Growth Areas"].append(part.strip())
609
- elif current_section == "Strategic HR Recommendations":
610
- if any(k in part.lower() for k in ['communication', 'depth', 'presence', 'improve']):
611
- sections[current_section]["Development Priorities"].append(part.strip())
612
- elif any(k in part.lower() for k in ['advance', 'train', 'assess', 'next step']):
613
- sections[current_section]["Next Steps"].append(part.strip())
614
- else:
615
- sections[current_section].append(part.strip())
616
-
617
- # Executive Summary
618
- story.append(Paragraph("2. Executive Summary", h2))
619
- if sections['Executive Summary']:
620
- for line in sections['Executive Summary']:
621
- if line.startswith(('-', '•', '*')):
622
- story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
623
- else:
624
- story.append(Paragraph(line, body_text))
625
- else:
626
- story.append(Paragraph("Summary unavailable.", body_text))
627
- story.append(Spacer(1, 0.3 * inch))
628
 
629
- # Competency and Content
630
- story.append(Paragraph("3. Competency & Content", h2))
631
- story.append(Paragraph("Strengths", h3))
632
- if sections['Competency and Content Evaluation']['Strengths']:
633
- for line in sections['Competency and Content Evaluation']['Strengths']:
634
- story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
635
  else:
636
- story.append(Paragraph("No strengths identified.", body_text))
637
- story.append(Spacer(1, 0.2 * inch))
638
- story.append(Paragraph("Growth Areas", h3))
639
- if sections['Competency and Content Evaluation']['Growth Areas']:
640
- for line in sections['Competency and Content Evaluation']['Growth Areas']:
641
- story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
642
- else:
643
- story.append(Paragraph("No growth areas identified.", body_text))
644
  story.append(Spacer(1, 0.3 * inch))
645
 
646
- # Role Fit
647
- story.append(Paragraph("4. Role Fit & Potential", h2))
648
- if sections['Role Fit and Growth Potential']:
649
- for line in sections['Role Fit and Growth Potential']:
650
- if line.startswith(('-', '•', '*')):
651
- story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
652
- else:
653
- story.append(Paragraph(line, body_text))
654
- else:
655
- story.append(Paragraph("Fit and potential analysis unavailable.", body_text))
656
- story.append(Spacer(1, 0.3 * inch))
657
 
658
- # Strategic Recommendations
659
- story.append(Paragraph("5. Strategic Recommendations", h2))
660
- story.append(Paragraph("Development Priorities", h3))
661
- if sections['Strategic HR Recommendations']['Development Priorities']:
662
- for line in sections['Strategic HR Recommendations']['Development Priorities']:
663
- story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
664
- else:
665
- story.append(Paragraph("No development priorities specified.", body_text))
666
- story.append(Spacer(1, 0.2 * inch))
667
- story.append(Paragraph("Next Steps for Managers", h3))
668
- if sections['Strategic HR Recommendations']['Next Steps']:
669
- for line in sections['Strategic HR Recommendations']['Next Steps']:
670
- story.append(Paragraph(line.lstrip('-•* ').strip(), bullet_style))
671
- else:
672
- story.append(Paragraph("No next steps provided.", body_text))
673
  story.append(Spacer(1, 0.3 * inch))
674
- story.append(Paragraph("This report provides a data-driven evaluation to guide hiring and development decisions.", body_text))
675
 
676
- doc.build(story, onFirstPage=header_footer, onLaterPages=header_footer)
 
 
 
 
 
 
 
 
 
 
 
677
  return True
678
  except Exception as e:
679
  logger.error(f"PDF creation failed: {str(e)}", exc_info=True)
680
  return False
681
 
682
- def convert_to_serializable(obj):
683
- if isinstance(obj, np.generic): return obj.item()
684
- if isinstance(obj, dict): return {k: convert_to_serializable(v) for k, v in obj.items()}
685
- if isinstance(obj, list): return [convert_to_serializable(i) for i in obj]
686
- if isinstance(obj, np.ndarray): return obj.tolist()
687
- return obj
688
 
689
  def convert_to_serializable(obj):
690
- """Converts numpy types to native Python types for JSON serialization."""
691
- if isinstance(obj, np.generic): return obj.item()
692
- if isinstance(obj, dict): return {k: convert_to_serializable(v) for k, v in obj.items()}
693
- if isinstance(obj, list): return [convert_to_serializable(i) for i in obj]
694
- if isinstance(obj, np.ndarray): return obj.tolist()
 
 
 
695
  return obj
696
 
697
- def process_interview(audio_path_or_url: str):
698
- """Main function to process an interview from an audio file or URL."""
699
- local_audio_path, wav_file = None, None
700
- is_downloaded = False
701
-
702
  try:
703
- logger.info(f"Starting processing for {audio_path_or_url}")
704
- if audio_path_or_url.startswith(('http://', 'https://')):
705
- local_audio_path = download_audio_from_url(audio_path_or_url)
706
- is_downloaded = True
707
- else:
708
- local_audio_path = audio_path_or_url
709
 
710
- wav_file = convert_to_wav(local_audio_path)
711
  transcript = transcribe(wav_file)
712
-
713
- for u in transcript['utterances']:
714
- u['prosodic_features'] = extract_prosodic_features(wav_file, u['start'], u['end'])
715
-
 
 
 
 
 
 
716
  utterances_with_speakers = identify_speakers(transcript, wav_file)
717
-
718
- # التصنيف باستخدام المنهجية الإرشادية
719
- classified_utterances = classify_roles(utterances_with_speakers)
720
-
 
 
 
 
 
 
 
 
 
 
 
721
  voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)
722
-
723
  analysis_data = {
724
  'transcript': classified_utterances,
725
- 'speakers': list(set(u['speaker_name'] for u in classified_utterances)),
726
  'voice_analysis': voice_analysis,
727
  'text_analysis': {
728
  'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
729
  'speaker_turns': len(classified_utterances)
730
  }
731
  }
732
-
733
- analysis_data['acceptance_probability'] = calculate_acceptance_probability(analysis_data)
734
-
735
  gemini_report_text = generate_report(analysis_data)
736
-
737
- base_name = str(uuid.uuid4())
738
  pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
 
 
 
739
  json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
740
-
741
- # create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text)
742
-
743
  with open(json_path, 'w') as f:
744
  serializable_data = convert_to_serializable(analysis_data)
745
  json.dump(serializable_data, f, indent=2)
746
-
747
- logger.info(f"Processing completed. JSON report at: {json_path}")
748
- return {'pdf_path': pdf_path, 'json_path': json_path, 'report_text': gemini_report_text}
749
 
 
 
 
 
 
 
 
750
  except Exception as e:
751
- logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}", exc_info=True)
752
- raise
753
- finally:
754
- # تنظيف الملفات المؤقتة
755
- if wav_file and os.path.exists(wav_file):
756
  os.remove(wav_file)
757
- if is_downloaded and local_audio_path and os.path.exists(local_audio_path):
758
- os.remove(local_audio_path)
759
- logger.info(f"Cleaned up temporary downloaded file: {local_audio_path}")
 
10
  from nemo.collections.asr.models import EncDecSpeakerLabelModel
11
  from pinecone import Pinecone, ServerlessSpec
12
  import librosa
13
+ import pandas as pd
14
+ from sklearn.ensemble import RandomForestClassifier
15
+ from sklearn.preprocessing import StandardScaler
16
+ from sklearn.feature_extraction.text import TfidfVectorizer
17
  import re
18
+ from typing import Dict, List, Tuple
19
  import logging
20
+ # --- Imports for enhanced PDF ---
21
  from reportlab.lib.pagesizes import letter
22
  from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
23
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
24
  from reportlab.lib.units import inch
25
  from reportlab.lib import colors
26
+ import matplotlib.pyplot as plt # Uncomment if you want to add charts and have matplotlib installed
27
+ from reportlab.platypus import Image # Uncomment if you want to add charts and have reportlab.platypus.Image installed
28
+ # --- End Imports for enhanced PDF ---
29
+ from transformers import AutoTokenizer, AutoModel
 
30
  import spacy
31
  import google.generativeai as genai
32
+ import joblib
33
  from concurrent.futures import ThreadPoolExecutor
 
34
 
35
+ # Setup logging
36
  logging.basicConfig(level=logging.INFO)
37
  logger = logging.getLogger(__name__)
38
+ logging.getLogger("nemo_logging").setLevel(logging.ERROR)
 
 
 
39
 
40
  # Configuration
41
+ AUDIO_DIR = "./uploads"
42
  OUTPUT_DIR = "./processed_audio"
43
  os.makedirs(OUTPUT_DIR, exist_ok=True)
44
 
 
47
  ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
48
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
+ # Initialize services
52
  def initialize_services():
 
53
  try:
54
  pc = Pinecone(api_key=PINECONE_KEY)
55
  index_name = "interview-speaker-embeddings"
 
61
  spec=ServerlessSpec(cloud="aws", region="us-east-1")
62
  )
63
  index = pc.Index(index_name)
64
+
65
  genai.configure(api_key=GEMINI_API_KEY)
66
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
67
+
68
  return index, gemini_model
69
  except Exception as e:
70
  logger.error(f"Error initializing services: {str(e)}")
71
  raise
72
 
73
+
74
  index, gemini_model = initialize_services()
75
+
76
+ # Device setup
77
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
78
  logger.info(f"Using device: {device}")
79
 
80
+
81
  def load_speaker_model():
 
82
  try:
83
+ import torch
84
+ torch.set_num_threads(5)
85
+ # -----------------------------------------------------------
86
+ # التعديل هنا: تحميل الموديل مباشرة من Hugging Face Hub
87
+ # -----------------------------------------------------------
88
  model = EncDecSpeakerLabelModel.from_pretrained(
89
  "nvidia/speakerverification_en_titanet_large",
90
  map_location=torch.device('cpu')
 
95
  logger.error(f"Model loading failed: {str(e)}")
96
  raise RuntimeError("Could not load speaker verification model")
97
 
98
+
99
+ # Load ML models
100
def load_models():
    """Load all ML components the pipeline depends on.

    Returns a 4-tuple: (speaker-verification model, spaCy pipeline,
    DistilBERT tokenizer, DistilBERT encoder in eval mode on `device`).
    """
    verifier = load_speaker_model()
    spacy_pipeline = spacy.load("en_core_web_sm")

    bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    bert_encoder = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
    bert_encoder.eval()  # inference only — disable dropout etc.

    return verifier, spacy_pipeline, bert_tokenizer, bert_encoder
109
 
110
+
111
+ speaker_model, nlp, tokenizer, llm_model = load_models()
112
+
113
+
114
+ # Audio processing functions
115
def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
    """Convert any supported audio file to a mono 16 kHz WAV.

    Returns the path of the newly written WAV file (named with a fresh UUID
    inside `output_dir`). Errors are logged and re-raised.
    """
    try:
        clip = AudioSegment.from_file(audio_path)
        # The speaker/ASR models expect single-channel 16 kHz input.
        if clip.channels > 1:
            clip = clip.set_channels(1)
        clip = clip.set_frame_rate(16000)

        target = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
        clip.export(target, format="wav")
        return target
    except Exception as e:
        logger.error(f"Audio conversion failed: {str(e)}")
        raise
128
 
129
+
130
def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
    """Extract duration, pitch and intensity statistics for one utterance.

    The [start_ms, end_ms) slice of `audio_path` is exported to a temp WAV,
    analysed with librosa, and the temp file removed. On any failure the same
    dict keys are returned zeroed (duration is still derived from the
    timestamps) so downstream feature vectors keep a fixed layout.
    """
    duration = (end_ms - start_ms) / 1000
    temp_path = None
    try:
        audio = AudioSegment.from_file(audio_path)
        segment = audio[start_ms:end_ms]
        temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
        segment.export(temp_path, format="wav")

        y, sr = librosa.load(temp_path, sr=16000)
        pitches = librosa.piptrack(y=y, sr=sr)[0]
        pitches = pitches[pitches > 0]  # keep only voiced pitch estimates
        # Hoisted: the original recomputed the RMS envelope for every stat.
        rms = librosa.feature.rms(y=y)[0]

        return {
            'duration': duration,
            'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
            'min_pitch': float(np.min(pitches)) if len(pitches) > 0 else 0.0,
            'max_pitch': float(np.max(pitches)) if len(pitches) > 0 else 0.0,
            'pitch_sd': float(np.std(pitches)) if len(pitches) > 0 else 0.0,
            'intensityMean': float(np.mean(rms)),
            'intensityMin': float(np.min(rms)),
            'intensityMax': float(np.max(rms)),
            'intensitySD': float(np.std(rms)),
        }
    except Exception as e:
        logger.error(f"Feature extraction failed: {str(e)}")
        return {
            'duration': duration,
            'mean_pitch': 0.0,
            'min_pitch': 0.0,
            'max_pitch': 0.0,
            'pitch_sd': 0.0,
            'intensityMean': 0.0,
            'intensityMin': 0.0,
            'intensityMax': 0.0,
            'intensitySD': 0.0,
        }
    finally:
        # Fix: the original leaked the temp WAV whenever an exception fired
        # between export() and os.remove().
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
168
+
169
 
170
def transcribe(audio_path: str) -> Dict:
    """Upload audio to AssemblyAI and poll until the diarized transcript is ready.

    Returns the completed transcript JSON (with `speaker_labels` enabled).
    Raises on API error, HTTP failure, or if the job does not complete within
    30 minutes.
    """
    try:
        with open(audio_path, 'rb') as f:
            upload_response = requests.post(
                "https://api.assemblyai.com/v2/upload",
                headers={"authorization": ASSEMBLYAI_KEY},
                data=f
            )
        upload_response.raise_for_status()  # fail fast on bad key / HTTP error
        audio_url = upload_response.json()['upload_url']

        transcript_response = requests.post(
            "https://api.assemblyai.com/v2/transcript",
            headers={"authorization": ASSEMBLYAI_KEY},
            json={
                "audio_url": audio_url,
                "speaker_labels": True,
                "filter_profanity": True
            }
        )
        transcript_response.raise_for_status()
        transcript_id = transcript_response.json()['id']

        # Fix: the original `while True` polled forever; bound the wait so a
        # stuck job cannot hang the whole pipeline.
        deadline = time.time() + 1800  # 30 minutes
        while time.time() < deadline:
            result = requests.get(
                f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
                headers={"authorization": ASSEMBLYAI_KEY}
            ).json()

            if result['status'] == 'completed':
                return result
            elif result['status'] == 'error':
                raise Exception(result['error'])

            time.sleep(5)
        raise TimeoutError(f"Transcription {transcript_id} did not finish within 30 minutes")
    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}")
        raise
206
 
207
+
208
def process_utterance(utterance, full_audio, wav_file):
    """Embed one utterance's audio and resolve it to a known or new speaker.

    The utterance slice is exported to a temp WAV, embedded with the speaker
    model, and matched against Pinecone; similarity > 0.7 reuses the stored
    speaker, otherwise a new speaker id is registered. Returns the utterance
    dict extended with 'speaker', 'speaker_id' and 'embedding'; on any failure
    a best-effort 'Unknown' record is returned so the pipeline keeps going.
    """
    temp_path = None
    try:
        start = utterance['start']
        end = utterance['end']
        segment = full_audio[start:end]
        temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
        segment.export(temp_path, format="wav")

        with torch.no_grad():
            embedding = speaker_model.get_embedding(temp_path).to(device)
        # Fix: move to CPU once and reuse — the original called .tolist()
        # directly on a possibly-CUDA tensor in the upsert path.
        embedding_list = embedding.cpu().numpy().tolist()

        query_result = index.query(
            vector=embedding_list,
            top_k=1,
            include_metadata=True
        )

        if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
            speaker_id = query_result['matches'][0]['id']
            speaker_name = query_result['matches'][0]['metadata']['speaker_name']
        else:
            speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
            speaker_name = f"Speaker_{speaker_id[-4:]}"
            index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_name})])

        return {
            **utterance,
            'speaker': speaker_name,
            'speaker_id': speaker_id,
            'embedding': embedding_list
        }
    except Exception as e:
        logger.error(f"Utterance processing failed: {str(e)}")
        return {
            **utterance,
            'speaker': 'Unknown',
            'speaker_id': 'unknown',
            'embedding': None
        }
    finally:
        # Fix: the temp WAV leaked when embedding/query raised before os.remove.
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
249
+
250
 
251
def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
    """Run speaker identification over every utterance, five at a time.

    Each utterance is dispatched to `process_utterance` on a small thread
    pool; results come back in the original utterance order.
    """
    try:
        full_audio = AudioSegment.from_wav(wav_file)
        utterances = transcript['utterances']

        with ThreadPoolExecutor(max_workers=5) as pool:
            pending = [
                pool.submit(process_utterance, utt, full_audio, wav_file)
                for utt in utterances
            ]
            return [task.result() for task in pending]
    except Exception as e:
        logger.error(f"Speaker identification failed: {str(e)}")
        raise
267
 
268
+
269
def train_role_classifier(utterances: List[Dict]):
    """Fit and persist a RandomForest that separates the two speaking roles.

    Features per utterance: 9 prosodic stats + TF-IDF over the text +
    5 shallow linguistic cues. NOTE(review): training labels are a bootstrap
    heuristic (alternating turns: even -> 0, odd -> 1), not ground truth —
    confirm before trusting the classifier. Models are dumped to OUTPUT_DIR.
    Returns (classifier, vectorizer, scaler).
    """
    try:
        texts = [u['text'] for u in utterances]
        vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
        tfidf_matrix = vectorizer.fit_transform(texts)

        def featurize(utt, tfidf_row):
            # Prosody block.
            p = utt['prosodic_features']
            vec = [
                p['duration'], p['mean_pitch'], p['min_pitch'], p['max_pitch'],
                p['pitch_sd'], p['intensityMean'], p['intensityMin'],
                p['intensityMax'], p['intensitySD'],
            ]
            # Text block: TF-IDF weights.
            vec.extend(tfidf_row.toarray()[0].tolist())
            # Shallow linguistic cues (question-like turns tend to be the interviewer).
            parsed = nlp(utt['text'])
            vec.extend([
                int(utt['text'].endswith('?')),
                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utt['text'].lower())),
                len(utt['text'].split()),
                sum(1 for token in parsed if token.pos_ == 'VERB'),
                sum(1 for token in parsed if token.pos_ == 'NOUN'),
            ])
            return vec

        features = [featurize(utt, tfidf_matrix[i]) for i, utt in enumerate(utterances)]
        labels = [0 if i % 2 == 0 else 1 for i in range(len(utterances))]

        scaler = StandardScaler()
        X = scaler.fit_transform(features)

        clf = RandomForestClassifier(
            n_estimators=150,
            max_depth=10,
            random_state=42,
            class_weight='balanced'
        )
        clf.fit(X, labels)

        # Persist the whole pipeline so later runs can skip retraining.
        joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
        joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
        joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))

        return clf, vectorizer, scaler
    except Exception as e:
        logger.error(f"Classifier training failed: {str(e)}")
        raise
324
+ raise
325
+
326
+
327
def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
    """Tag each utterance as 'Interviewer' or 'Interviewee'.

    Rebuilds the same feature vector used at training time (prosody +
    TF-IDF + linguistic cues), scales it, and asks the classifier;
    prediction 0 maps to 'Interviewer', 1 to 'Interviewee'. Returns new
    utterance dicts extended with a 'role' key.
    """
    try:
        tfidf_matrix = vectorizer.transform([u['text'] for u in utterances])

        labelled = []
        for idx, utt in enumerate(utterances):
            p = utt['prosodic_features']
            vec = [
                p['duration'], p['mean_pitch'], p['min_pitch'], p['max_pitch'],
                p['pitch_sd'], p['intensityMean'], p['intensityMin'],
                p['intensityMax'], p['intensitySD'],
            ]
            vec.extend(tfidf_matrix[idx].toarray()[0].tolist())

            parsed = nlp(utt['text'])
            vec.extend([
                int(utt['text'].endswith('?')),
                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utt['text'].lower())),
                len(utt['text'].split()),
                sum(1 for tok in parsed if tok.pos_ == 'VERB'),
                sum(1 for tok in parsed if tok.pos_ == 'NOUN'),
            ])

            scaled = scaler.transform([vec])
            predicted = clf.predict(scaled)[0]
            labelled.append({**utt, 'role': 'Interviewer' if predicted == 0 else 'Interviewee'})

        return labelled
    except Exception as e:
        logger.error(f"Role classification failed: {str(e)}")
        raise
 
 
 
367
 
368
 
369
def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
    """Compute delivery metrics for the interviewee's turns.

    Measures speaking rate, filler-word ratio, bigram repetition, pitch
    stability (pyin f0, jitter) and intensity stability (RMS, shimmer), then
    maps them onto anxiety/confidence/fluency levels via hand-tuned
    thresholds. Returns {'error': ...} instead of raising so the pipeline
    can continue without voice metrics.
    """
    try:
        y, sr = librosa.load(audio_path, sr=16000)

        interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
        if not interviewee_utterances:
            return {'error': 'No interviewee utterances found'}

        # Slice out the interviewee's audio (timestamps are in milliseconds).
        segments = []
        for u in interviewee_utterances:
            start = int(u['start'] * sr / 1000)
            end = int(u['end'] * sr / 1000)
            segments.append(y[start:end])
        # Fix: dropped the unused np.concatenate(segments) copy the original
        # built — it duplicated all interviewee audio in memory for nothing.

        total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
        total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
        speaking_rate = total_words / total_duration if total_duration > 0 else 0

        filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
        filler_count = sum(
            sum(u['text'].lower().count(fw) for fw in filler_words)
            for u in interviewee_utterances
        )
        filler_ratio = filler_count / total_words if total_words > 0 else 0

        # Repetition: fraction of distinct word bigrams that occur more than once.
        all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
        word_counts = {}
        for i in range(len(all_words) - 1):
            bigram = (all_words[i], all_words[i + 1])
            word_counts[bigram] = word_counts.get(bigram, 0) + 1
        repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(
            word_counts) if word_counts else 0

        # Pitch track over voiced frames only.
        pitches = []
        for segment in segments:
            f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
            pitches.extend(f0[voiced_flag])

        pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
        pitch_std = np.std(pitches) if len(pitches) > 0 else 0
        jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0

        intensities = []
        for segment in segments:
            rms = librosa.feature.rms(y=segment)[0]
            intensities.extend(rms)

        intensity_mean = np.mean(intensities) if intensities else 0
        intensity_std = np.std(intensities) if intensities else 0
        shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(
            intensities) > 1 and intensity_mean > 0 else 0

        # Composite heuristics; thresholds below are hand-tuned, not calibrated.
        anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
        confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
        hesitation_score = filler_ratio + repetition_score

        anxiety_level = 'high' if anxiety_score > 0.15 else 'moderate' if anxiety_score > 0.07 else 'low'
        confidence_level = 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.5 else 'low'
        fluency_level = 'fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'moderate' if (
                filler_ratio < 0.1 and repetition_score < 0.2) else 'disfluent'

        return {
            'speaking_rate': float(round(speaking_rate, 2)),
            'filler_ratio': float(round(filler_ratio, 4)),
            'repetition_score': float(round(repetition_score, 4)),
            'pitch_analysis': {
                'mean': float(round(pitch_mean, 2)),
                'std_dev': float(round(pitch_std, 2)),
                'jitter': float(round(jitter, 4))
            },
            'intensity_analysis': {
                'mean': float(round(intensity_mean, 2)),
                'std_dev': float(round(intensity_std, 2)),
                'shimmer': float(round(shimmer, 4))
            },
            'composite_scores': {
                'anxiety': float(round(anxiety_score, 4)),
                'confidence': float(round(confidence_score, 4)),
                'hesitation': float(round(hesitation_score, 4))
            },
            'interpretation': {
                'anxiety_level': anxiety_level,
                'confidence_level': confidence_level,
                'fluency_level': fluency_level
            }
        }
    except Exception as e:
        logger.error(f"Voice analysis failed: {str(e)}")
        return {'error': str(e)}
460
 
461
+
462
def generate_voice_interpretation(analysis: Dict) -> str:
    """Render the voice-analysis dict as the plain-text summary that is
    embedded in Gemini's prompt. Returns a fallback line if the analysis
    carries an 'error' key."""
    if 'error' in analysis:
        return "Voice analysis not available."

    scores = analysis['composite_scores']
    levels = analysis['interpretation']
    lines = [
        "Voice Analysis Summary:",
        f"- Speaking Rate: {analysis['speaking_rate']} words/sec (average)",
        f"- Filler Words: {analysis['filler_ratio'] * 100:.1f}% of words",
        f"- Repetition Score: {analysis['repetition_score']:.3f}",
        f"- Anxiety Level: {levels['anxiety_level'].upper()} (score: {scores['anxiety']:.3f})",
        f"- Confidence Level: {levels['confidence_level'].upper()} (score: {scores['confidence']:.3f})",
        f"- Fluency: {levels['fluency_level'].upper()}",
        "",
        "Detailed Interpretation:",
        "1. A higher speaking rate indicates faster speech, which can suggest nervousness or enthusiasm.",
        "2. Filler words and repetitions reduce speech clarity and professionalism.",
        "3. Anxiety is measured through pitch variability and voice instability.",
        "4. Confidence is assessed through voice intensity and stability.",
        "5. Fluency combines filler words and repetition metrics.",
    ]
    return "\n".join(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
 
489
def generate_report(analysis_data: Dict) -> str:
    """Build the Gemini prompt from the analysis data and return the
    generated narrative report (or an error string on failure)."""
    try:
        voice_metrics = analysis_data.get('voice_analysis', {})
        voice_interpretation = generate_voice_interpretation(voice_metrics)

        # Limit to the first 5 interviewee turns to keep the prompt short.
        interviewee_responses = []
        for u in analysis_data['transcript']:
            if u['role'] == 'Interviewee':
                interviewee_responses.append(f"Speaker {u['speaker']} ({u['role']}): {u['text']}")
        interviewee_responses = interviewee_responses[:5]

        prompt = f"""
        Generate a comprehensive interview analysis report based on the provided data.
        The report should be structured with clear headings and concise summaries.
        **1. Executive Summary**
        Provide a brief overview of the interview, its duration, number of speaker turns, and main participants.
        - Overall interview duration: {analysis_data['text_analysis']['total_duration']:.2f} seconds
        - Number of speaker turns: {analysis_data['text_analysis']['speaker_turns']}
        - Main participants: {', '.join(analysis_data['speakers'])}
        **2. Voice Analysis**
        Summarize key voice metrics and provide a detailed interpretation.
        {voice_interpretation}
        **3. Content Analysis**
        Analyze the key themes and strengths/weaknesses in the interviewee's responses.
        Key responses from interviewee:
        {chr(10).join(interviewee_responses)}
        **4. Recommendations**
        Offer specific, actionable suggestions for improvement focusing on communication skills, content delivery, and professional presentation.
        """

        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Report generation failed: {str(e)}")
        return f"Error generating report: {str(e)}"
524
 
525
+
526
+ # --- ENHANCED PDF GENERATION FUNCTION ---
527
def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
    """Render the analysis plus Gemini's narrative into a structured PDF.

    Writes the document to `output_path`. Returns True on success and False
    on any failure (errors are logged with traceback, never raised).
    """

    def _split_into_sections(report_text: str) -> Dict:
        """Group the Gemini report's lines under its four '**N. ...**' headings."""
        headers = {
            '**1. Executive Summary**': 'Executive Summary',
            '**2. Voice Analysis**': 'Voice Analysis (Gemini Interpretation)',
            '**3. Content Analysis**': 'Content Analysis',
            '**4. Recommendations**': 'Recommendations',
        }
        sections, current = {}, None
        for line in report_text.split('\n'):
            matched = next((name for marker, name in headers.items() if line.startswith(marker)), None)
            if matched:
                current = matched
                sections[current] = []
            elif current:
                sections[current].append(line)
        return sections

    try:
        doc = SimpleDocTemplate(output_path, pagesize=letter)
        styles = getSampleStyleSheet()

        # Custom paragraph styles (standalone objects; not registered in the sheet).
        h1 = ParagraphStyle(name='Heading1', parent=styles['h1'], fontSize=16, spaceAfter=14, alignment=1)
        h2 = ParagraphStyle(name='Heading2', parent=styles['h2'], fontSize=12, spaceBefore=10, spaceAfter=8,
                            textColor=colors.HexColor('#333366'))
        h3 = ParagraphStyle(name='Heading3', parent=styles['h3'], fontSize=10, spaceBefore=8, spaceAfter=4,
                            textColor=colors.HexColor('#0055AA'))
        body_text = ParagraphStyle(name='BodyText', parent=styles['Normal'], fontSize=9, leading=12, spaceAfter=4)
        bullet_style = ParagraphStyle(name='Bullet', parent=styles['Normal'], fontSize=9, leading=12, leftIndent=18,
                                      bulletIndent=9)

        story = []

        # Title / header.
        story.append(Paragraph("<b>Interview Analysis Report</b>", h1))
        story.append(Spacer(1, 0.2 * inch))
        story.append(Paragraph(f"<b>Date:</b> {time.strftime('%Y-%m-%d')}", body_text))
        story.append(Spacer(1, 0.3 * inch))

        sections = _split_into_sections(gemini_report_text)

        # 1. Executive Summary.
        story.append(Paragraph("1. Executive Summary", h2))
        story.append(Spacer(1, 0.1 * inch))
        for line in sections.get('Executive Summary', []):
            if line.strip():
                story.append(Paragraph(line.strip(), body_text))
        story.append(Spacer(1, 0.2 * inch))

        # 2. Voice Analysis: metric table plus Gemini's interpretation.
        story.append(Paragraph("2. Voice Analysis", h2))
        voice_analysis = analysis_data.get('voice_analysis', {})

        if voice_analysis and 'error' not in voice_analysis:
            table_data = [
                ['Metric', 'Value', 'Interpretation'],
                ['Speaking Rate', f"{voice_analysis['speaking_rate']:.2f} words/sec", 'Average rate'],
                ['Filler Words', f"{voice_analysis['filler_ratio'] * 100:.1f}%", 'Percentage of total words'],
                ['Repetition Score', f"{voice_analysis['repetition_score']:.3f}", 'Lower is better articulation'],
                ['Anxiety Level', voice_analysis['interpretation']['anxiety_level'].upper(),
                 f"Score: {voice_analysis['composite_scores']['anxiety']:.3f}"],
                ['Confidence Level', voice_analysis['interpretation']['confidence_level'].upper(),
                 f"Score: {voice_analysis['composite_scores']['confidence']:.3f}"],
                ['Fluency', voice_analysis['interpretation']['fluency_level'].upper(), 'Overall speech flow']
            ]

            table = Table(table_data)
            # Fix: dropped the duplicate header BOTTOMPADDING directive the
            # original carried (it was overridden by the whole-table rule).
            table.setStyle(TableStyle([
                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#6699CC')),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                ('BACKGROUND', (0, 1), (-1, -1), colors.HexColor('#EFEFEF')),
                ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#CCCCCC')),
                ('LEFTPADDING', (0, 0), (-1, -1), 6),
                ('RIGHTPADDING', (0, 0), (-1, -1), 6),
                ('TOPPADDING', (0, 0), (-1, -1), 6),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
            ]))
            story.append(table)
            story.append(Spacer(1, 0.2 * inch))

            if 'Voice Analysis (Gemini Interpretation)' in sections:
                story.append(Paragraph("Detailed Interpretation:", h3))
                for line in sections['Voice Analysis (Gemini Interpretation)']:
                    if line.strip():
                        story.append(Paragraph(line.strip(), body_text))
                story.append(Spacer(1, 0.2 * inch))
        else:
            story.append(Paragraph("Voice analysis not available or encountered an error.", body_text))
        story.append(Spacer(1, 0.3 * inch))

        # 3. Content Analysis.
        story.append(Paragraph("3. Content Analysis", h2))
        for line in sections.get('Content Analysis', []):
            text = line.strip()
            if not text:
                continue
            # Fix: the original only recognised '-' bullets; Gemini commonly
            # emits '*' and '•' bullets as well.
            style = bullet_style if text.startswith(('-', '*', '•')) else body_text
            story.append(Paragraph(text, style))
        story.append(Spacer(1, 0.2 * inch))

        story.append(Paragraph("Key Interviewee Responses:", h3))
        interviewee_responses = [
            f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
            for u in analysis_data['transcript']
            if u['role'] == 'Interviewee'
        ][:5]  # keep the report short
        for res in interviewee_responses:
            story.append(Paragraph(res, bullet_style))
        story.append(Spacer(1, 0.3 * inch))

        # 4. Recommendations.
        story.append(Paragraph("4. Recommendations", h2))
        for line in sections.get('Recommendations', []):
            text = line.strip()
            if not text:
                continue
            style = bullet_style if text.startswith(('-', '*', '•')) else body_text
            story.append(Paragraph(text, style))
        story.append(Spacer(1, 0.2 * inch))

        doc.build(story)
        return True
    except Exception as e:
        logger.error(f"PDF creation failed: {str(e)}", exc_info=True)
        return False
680
 
 
 
 
 
 
 
681
 
682
def convert_to_serializable(obj):
    """Recursively convert numpy scalars/arrays into plain Python types.

    Used before json.dump; dicts and sequences are walked, numpy scalars
    become native numbers, ndarrays become nested lists, anything else is
    returned unchanged.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    # Fix: tuples previously fell through unconverted, so a tuple holding
    # numpy values still broke json.dump.
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj
692
 
693
+
694
def process_interview(audio_path: str):
    """Run the full interview-analysis pipeline on one audio recording.

    Steps: convert to WAV, transcribe, extract per-utterance prosodic
    features, identify speakers, classify interviewer/interviewee roles,
    analyze the interviewee's voice, generate a narrative report via
    Gemini, and write PDF + JSON artifacts to ``OUTPUT_DIR``.

    Args:
        audio_path: Path to the source audio file.

    Returns:
        dict with keys ``'pdf_path'`` and ``'json_path'`` pointing at the
        generated artifacts.

    Raises:
        Re-raises any exception from the pipeline after logging it; the
        intermediate WAV file is always removed (see ``finally``).
    """
    wav_file = None  # sentinel so cleanup is safe even if conversion fails
    try:
        logger.info(f"Starting processing for {audio_path}")

        wav_file = convert_to_wav(audio_path)

        logger.info("Starting transcription")
        transcript = transcribe(wav_file)

        logger.info("Extracting prosodic features")
        for utterance in transcript['utterances']:
            utterance['prosodic_features'] = extract_prosodic_features(
                wav_file,
                utterance['start'],
                utterance['end']
            )

        logger.info("Identifying speakers")
        utterances_with_speakers = identify_speakers(transcript, wav_file)

        logger.info("Classifying roles")
        # Reuse a previously trained role classifier when available;
        # otherwise train one from this interview's utterances.
        # NOTE(review): joblib.load unpickles arbitrary code — acceptable
        # only because OUTPUT_DIR is written by this app; confirm it is
        # not user-writable.
        clf_path = os.path.join(OUTPUT_DIR, 'role_classifier.pkl')
        if os.path.exists(clf_path):
            clf = joblib.load(clf_path)
            vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
            scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
        else:
            clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)

        classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)

        logger.info("Analyzing interviewee voice")
        voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)

        analysis_data = {
            'transcript': classified_utterances,
            'speakers': list(set(u['speaker'] for u in classified_utterances)),
            'voice_analysis': voice_analysis,
            'text_analysis': {
                'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
                'speaker_turns': len(classified_utterances)
            }
        }

        logger.info("Generating report text using Gemini")
        gemini_report_text = generate_report(analysis_data)

        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
        # Pass both the structured analysis and the Gemini narrative to the PDF
        create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text)

        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
        with open(json_path, 'w') as f:
            serializable_data = convert_to_serializable(analysis_data)
            json.dump(serializable_data, f, indent=2)

        logger.info(f"Processing completed for {audio_path}")
        return {
            'pdf_path': pdf_path,
            'json_path': json_path
        }
    except Exception as e:
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        raise
    finally:
        # Single cleanup point for the intermediate WAV; the original
        # duplicated this in both the success and error paths and relied
        # on the fragile `'wav_file' in locals()` check.
        if wav_file and os.path.exists(wav_file):
            os.remove(wav_file)