norhan12 committed on
Commit
71e2e34
·
verified ·
1 Parent(s): 87066d1

Update process_interview.py

Browse files
Files changed (1) hide show
  1. process_interview.py +241 -580
process_interview.py CHANGED
@@ -1,59 +1,77 @@
 
 
 
 
 
 
 
1
  import os
 
 
 
 
 
 
 
 
 
2
  import torch
3
  import numpy as np
4
- import uuid
5
  import requests
6
- import time
7
- import json
8
  from pydub import AudioSegment
9
- import wave
 
 
 
 
 
10
  from nemo.collections.asr.models import EncDecSpeakerLabelModel
11
  from pinecone import Pinecone, ServerlessSpec
12
- import librosa
13
- import pandas as pd
14
  from sklearn.ensemble import RandomForestClassifier
15
  from sklearn.preprocessing import StandardScaler
16
  from sklearn.feature_extraction.text import TfidfVectorizer
17
- import re
18
- from typing import Dict, List, Tuple
19
- import logging
20
- # --- Imports for enhanced PDF ---
21
  from reportlab.lib.pagesizes import letter
22
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
23
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
24
  from reportlab.lib.units import inch
25
  from reportlab.lib import colors
26
- import matplotlib.pyplot as plt # Uncomment if you want to add charts and have matplotlib installed
27
- from reportlab.platypus import Image # Uncomment if you want to add charts and have reportlab.platypus.Image installed
28
- # --- End Imports for enhanced PDF ---
29
- from transformers import AutoTokenizer, AutoModel
30
- import spacy
31
- import google.generativeai as genai
32
- import joblib
33
- from concurrent.futures import ThreadPoolExecutor
34
 
35
- # Setup logging
36
- logging.basicConfig(level=logging.INFO)
 
 
37
  logger = logging.getLogger(__name__)
 
 
38
  logging.getLogger("nemo_logging").setLevel(logging.ERROR)
 
39
 
40
- # Configuration
41
- AUDIO_DIR = "./uploads"
42
  OUTPUT_DIR = "./processed_audio"
43
  os.makedirs(OUTPUT_DIR, exist_ok=True)
44
 
45
- # API Keys
46
  PINECONE_KEY = os.getenv("PINECONE_KEY")
47
  ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
48
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
49
 
 
 
 
 
50
 
51
- # Initialize services
52
  def initialize_services():
 
53
  try:
 
54
  pc = Pinecone(api_key=PINECONE_KEY)
55
  index_name = "interview-speaker-embeddings"
56
  if index_name not in pc.list_indexes().names():
 
57
  pc.create_index(
58
  name=index_name,
59
  dimension=192,
@@ -61,236 +79,152 @@ def initialize_services():
61
  spec=ServerlessSpec(cloud="aws", region="us-east-1")
62
  )
63
  index = pc.Index(index_name)
64
-
65
  genai.configure(api_key=GEMINI_API_KEY)
66
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
67
-
68
  return index, gemini_model
69
  except Exception as e:
70
  logger.error(f"Error initializing services: {str(e)}")
71
  raise
72
 
73
-
74
- index, gemini_model = initialize_services()
75
-
76
- # Device setup
77
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
78
- logger.info(f"Using device: {device}")
79
-
80
-
81
def load_speaker_model():
    """Load the pretrained TitaNet speaker-verification model onto the CPU.

    The checkpoint is fetched by name through
    ``EncDecSpeakerLabelModel.from_pretrained`` and switched to eval mode
    before being returned.

    Returns:
        The loaded ``EncDecSpeakerLabelModel`` instance.

    Raises:
        RuntimeError: If loading fails for any reason. The underlying
            exception is chained as ``__cause__`` so the root cause is
            preserved in tracebacks.
    """
    try:
        # Cap the intra-op thread pool used by torch for CPU inference.
        torch.set_num_threads(5)
        # Load the model directly from the Hugging Face Hub.
        model = EncDecSpeakerLabelModel.from_pretrained(
            "nvidia/speakerverification_en_titanet_large",
            map_location=torch.device('cpu')
        )
        model.eval()
        return model
    except Exception as e:
        logger.error(f"Model loading failed: {str(e)}")
        raise RuntimeError("Could not load speaker verification model") from e
97
-
98
-
99
- # Load ML models
100
def load_models():
    """Load every model used by the pipeline.

    Returns:
        A 4-tuple ``(speaker_model, nlp, tokenizer, llm_model)``: the
        speaker-verification model, the spaCy English pipeline, and the
        DistilBERT tokenizer and encoder (the encoder is moved to the
        module-level ``device`` and put in eval mode).
    """
    speaker_net = load_speaker_model()
    spacy_pipeline = spacy.load("en_core_web_sm")

    bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    bert_encoder = AutoModel.from_pretrained("distilbert-base-uncased")
    bert_encoder = bert_encoder.to(device)
    bert_encoder.eval()

    return speaker_net, spacy_pipeline, bert_tokenizer, bert_encoder
109
-
110
-
111
- speaker_model, nlp, tokenizer, llm_model = load_models()
112
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
- # Audio processing functions
115
def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
    """Convert an audio file to a mono, 16 kHz WAV file.

    Args:
        audio_path: Path to the input audio in any format pydub can read.
        output_dir: Directory that receives the converted file. It is
            created if it does not exist yet.

    Returns:
        Path to the newly written WAV file (randomly named via ``uuid4``).

    Raises:
        Exception: Any decoding or export error is logged and re-raised.
    """
    try:
        audio = AudioSegment.from_file(audio_path)
        if audio.channels > 1:
            audio = audio.set_channels(1)
        audio = audio.set_frame_rate(16000)

        # Only the default OUTPUT_DIR is created at import time; a
        # caller-supplied directory may not exist yet.
        os.makedirs(output_dir, exist_ok=True)
        wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
        audio.export(wav_file, format="wav")
        return wav_file
    except Exception as e:
        logger.error(f"Audio conversion failed: {str(e)}")
        raise
128
 
129
-
130
def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
    """Compute pitch and intensity statistics for one slice of an audio file.

    The slice ``[start_ms, end_ms)`` is exported to a temporary WAV file,
    reloaded with librosa at 16 kHz, and summarised.

    Args:
        audio_path: Path to the source audio file.
        start_ms: Slice start, in milliseconds.
        end_ms: Slice end, in milliseconds.

    Returns:
        Dict with keys ``duration`` (seconds), ``mean_pitch``, ``min_pitch``,
        ``max_pitch``, ``pitch_sd``, ``intensityMean``, ``intensityMin``,
        ``intensityMax`` and ``intensitySD``. If anything fails, the error
        is logged and every value except ``duration`` is ``0.0``.
    """
    temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
    try:
        audio = AudioSegment.from_file(audio_path)
        audio[start_ms:end_ms].export(temp_path, format="wav")

        y, sr = librosa.load(temp_path, sr=16000)
        pitches = librosa.piptrack(y=y, sr=sr)[0]
        pitches = pitches[pitches > 0]  # keep only bins with a detected pitch
        rms = librosa.feature.rms(y=y)[0]  # computed once, reused four times
        has_pitch = len(pitches) > 0

        return {
            'duration': (end_ms - start_ms) / 1000,
            'mean_pitch': float(np.mean(pitches)) if has_pitch else 0.0,
            'min_pitch': float(np.min(pitches)) if has_pitch else 0.0,
            'max_pitch': float(np.max(pitches)) if has_pitch else 0.0,
            'pitch_sd': float(np.std(pitches)) if has_pitch else 0.0,
            'intensityMean': float(np.mean(rms)),
            'intensityMin': float(np.min(rms)),
            'intensityMax': float(np.max(rms)),
            'intensitySD': float(np.std(rms)),
        }
    except Exception as e:
        logger.error(f"Feature extraction failed: {str(e)}")
        return {
            'duration': (end_ms - start_ms) / 1000,
            'mean_pitch': 0.0,
            'min_pitch': 0.0,
            'max_pitch': 0.0,
            'pitch_sd': 0.0,
            'intensityMean': 0.0,
            'intensityMin': 0.0,
            'intensityMax': 0.0,
            'intensitySD': 0.0,
        }
    finally:
        # Remove the temp slice on success and on failure alike, so a
        # failed analysis no longer leaves files behind in OUTPUT_DIR.
        if os.path.exists(temp_path):
            os.remove(temp_path)
168
-
169
-
170
def transcribe(audio_path: str) -> Dict:
    """Upload an audio file to AssemblyAI and block until it is transcribed.

    The file is uploaded, a transcript job is created with speaker labels
    and profanity filtering enabled, and the job is polled every 5 seconds
    until it reports ``completed`` or ``error``.

    Args:
        audio_path: Path to the audio file to upload.

    Returns:
        The JSON body of the completed transcript as a dict.

    Raises:
        requests.HTTPError: If any API call returns a non-2xx status.
        RuntimeError: If the transcript job ends with status ``error``.
        Exception: Any other failure is logged and re-raised.
    """
    headers = {"authorization": ASSEMBLYAI_KEY}
    try:
        with open(audio_path, 'rb') as f:
            upload_response = requests.post(
                "https://api.assemblyai.com/v2/upload",
                headers=headers,
                data=f
            )
        # Fail on the HTTP status itself rather than on a later KeyError
        # when the error body lacks the expected field.
        upload_response.raise_for_status()
        audio_url = upload_response.json()['upload_url']

        transcript_response = requests.post(
            "https://api.assemblyai.com/v2/transcript",
            headers=headers,
            json={
                "audio_url": audio_url,
                "speaker_labels": True,
                "filter_profanity": True
            }
        )
        transcript_response.raise_for_status()
        transcript_id = transcript_response.json()['id']

        while True:
            try:
                poll_response = requests.get(
                    f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
                    headers=headers,
                    timeout=30
                )
            except requests.exceptions.Timeout:
                # A stalled poll should not abort the job; try again.
                logger.warning("Transcript status poll timed out; retrying")
                time.sleep(5)
                continue
            poll_response.raise_for_status()
            result = poll_response.json()

            if result['status'] == 'completed':
                return result
            elif result['status'] == 'error':
                raise RuntimeError(result['error'])

            time.sleep(5)
    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}")
        raise
206
 
207
-
208
def process_utterance(utterance, full_audio, wav_file):
    """Attach a speaker identity to a single transcript utterance.

    The utterance's audio slice is exported to a temporary WAV file and
    embedded with the speaker model. The nearest stored embedding is then
    looked up in the Pinecone index: a match scoring above 0.7 reuses the
    stored speaker, otherwise a new "unknown" speaker is registered.

    Args:
        utterance: Dict with at least ``start`` and ``end`` (used as slice
            bounds on ``full_audio``).
        full_audio: The whole recording as a sliceable pydub segment.
        wav_file: Path of the source WAV; currently unused, kept for
            interface compatibility.

    Returns:
        A copy of ``utterance`` extended with ``speaker``, ``speaker_id``
        and ``embedding``. On any failure the error is logged and these
        are ``'Unknown'``, ``'unknown'`` and ``None``.
    """
    temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
    try:
        start = utterance['start']
        end = utterance['end']
        full_audio[start:end].export(temp_path, format="wav")

        with torch.no_grad():
            embedding = speaker_model.get_embedding(temp_path).to(device)

        # Convert once and reuse the same list for query, upsert and the
        # returned record, so all three always agree.
        # NOTE(review): if get_embedding returns a (1, dim) tensor this is a
        # nested list — confirm that is the shape Pinecone expects.
        vector = embedding.cpu().numpy().tolist()

        query_result = index.query(
            vector=vector,
            top_k=1,
            include_metadata=True
        )

        matches = query_result['matches']
        if matches and matches[0]['score'] > 0.7:
            speaker_id = matches[0]['id']
            speaker_name = matches[0]['metadata']['speaker_name']
        else:
            speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
            speaker_name = f"Speaker_{speaker_id[-4:]}"
            index.upsert([(speaker_id, vector, {"speaker_name": speaker_name})])

        return {
            **utterance,
            'speaker': speaker_name,
            'speaker_id': speaker_id,
            'embedding': vector
        }
    except Exception as e:
        logger.error(f"Utterance processing failed: {str(e)}")
        return {
            **utterance,
            'speaker': 'Unknown',
            'speaker_id': 'unknown',
            'embedding': None
        }
    finally:
        # Clean up the temp slice even when embedding or lookup failed.
        if os.path.exists(temp_path):
            os.remove(temp_path)
249
-
250
-
251
def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
    """Run speaker identification over every utterance of a transcript.

    Args:
        transcript: Transcript dict containing an ``utterances`` list.
        wav_file: Path to the WAV recording the transcript was made from.

    Returns:
        One enriched utterance dict per input utterance, in input order.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        recording = AudioSegment.from_wav(wav_file)
        pending = []
        with ThreadPoolExecutor(max_workers=5) as pool:
            # Submit everything first, then collect in submission order.
            for item in transcript['utterances']:
                pending.append(pool.submit(process_utterance, item, recording, wav_file))
            identified = []
            for job in pending:
                identified.append(job.result())
        return identified
    except Exception as e:
        logger.error(f"Speaker identification failed: {str(e)}")
        raise
267
 
 
268
 
269
  def train_role_classifier(utterances: List[Dict]):
 
 
 
 
270
  try:
 
271
  texts = [u['text'] for u in utterances]
272
  vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
273
  X_text = vectorizer.fit_transform(texts)
274
-
275
  features = []
276
- labels = []
277
-
278
  for i, utterance in enumerate(utterances):
279
- prosodic = utterance['prosodic_features']
280
  feat = [
281
- prosodic['duration'],
282
- prosodic['mean_pitch'],
283
- prosodic['min_pitch'],
284
- prosodic['max_pitch'],
285
- prosodic['pitch_sd'],
286
- prosodic['intensityMean'],
287
- prosodic['intensityMin'],
288
- prosodic['intensityMax'],
289
- prosodic['intensitySD'],
290
  ]
291
-
292
  feat.extend(X_text[i].toarray()[0].tolist())
293
-
294
  doc = nlp(utterance['text'])
295
  feat.extend([
296
  int(utterance['text'].endswith('?')),
@@ -299,53 +233,39 @@ def train_role_classifier(utterances: List[Dict]):
299
  sum(1 for token in doc if token.pos_ == 'VERB'),
300
  sum(1 for token in doc if token.pos_ == 'NOUN')
301
  ])
302
-
303
  features.append(feat)
304
- labels.append(0 if i % 2 == 0 else 1)
305
 
306
  scaler = StandardScaler()
307
  X = scaler.fit_transform(features)
308
-
309
- clf = RandomForestClassifier(
310
- n_estimators=150,
311
- max_depth=10,
312
- random_state=42,
313
- class_weight='balanced'
314
- )
315
  clf.fit(X, labels)
316
-
 
317
  joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
318
  joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
319
  joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
320
-
321
  return clf, vectorizer, scaler
322
  except Exception as e:
323
  logger.error(f"Classifier training failed: {str(e)}")
324
  raise
325
 
326
-
327
  def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
 
328
  try:
 
329
  texts = [u['text'] for u in utterances]
330
  X_text = vectorizer.transform(texts)
331
-
332
  results = []
333
  for i, utterance in enumerate(utterances):
334
- prosodic = utterance['prosodic_features']
335
  feat = [
336
- prosodic['duration'],
337
- prosodic['mean_pitch'],
338
- prosodic['min_pitch'],
339
- prosodic['max_pitch'],
340
- prosodic['pitch_sd'],
341
- prosodic['intensityMean'],
342
- prosodic['intensityMin'],
343
- prosodic['intensityMax'],
344
- prosodic['intensitySD'],
345
  ]
346
-
347
  feat.extend(X_text[i].toarray()[0].tolist())
348
-
349
  doc = nlp(utterance['text'])
350
  feat.extend([
351
  int(utterance['text'].endswith('?')),
@@ -354,412 +274,153 @@ def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
354
  sum(1 for token in doc if token.pos_ == 'VERB'),
355
  sum(1 for token in doc if token.pos_ == 'NOUN')
356
  ])
357
-
358
  X = scaler.transform([feat])
359
  role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
360
-
361
  results.append({**utterance, 'role': role})
362
-
363
  return results
364
  except Exception as e:
365
- logger.error(f"Role classification failed: {str(e)}")
366
  raise
367
 
 
368
 
369
def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
    """Derive speech-quality metrics for the interviewee's utterances.

    Args:
        audio_path: Path to the recording; loaded with librosa at 16 kHz.
        utterances: Utterance dicts carrying ``role``, ``text``, ``start``
            and ``end`` (milliseconds) and ``prosodic_features['duration']``.

    Returns:
        Dict with ``speaking_rate``, ``filler_ratio``, ``repetition_score``,
        ``pitch_analysis``, ``intensity_analysis``, ``composite_scores`` and
        ``interpretation``. If there are no interviewee utterances, or any
        step fails, a dict with a single ``error`` key is returned instead.
    """
    try:
        y, sr = librosa.load(audio_path, sr=16000)

        interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
        if not interviewee_utterances:
            return {'error': 'No interviewee utterances found'}

        # Convert millisecond bounds to sample indices and slice the signal.
        segments = []
        for u in interviewee_utterances:
            start = int(u['start'] * sr / 1000)
            end = int(u['end'] * sr / 1000)
            segments.append(y[start:end])

        total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
        total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
        speaking_rate = total_words / total_duration if total_duration > 0 else 0

        # Match fillers on word boundaries. A plain substring count would
        # also hit "so" in "also" or "um" in "number" and inflate the ratio.
        filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
        filler_pattern = re.compile(
            r"\b(?:" + "|".join(re.escape(fw) for fw in filler_words) + r")\b"
        )
        filler_count = sum(
            len(filler_pattern.findall(u['text'].lower()))
            for u in interviewee_utterances
        )
        filler_ratio = filler_count / total_words if total_words > 0 else 0

        # Repetition: share of distinct bigrams that occur more than once.
        all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
        word_counts = {}
        for bigram in zip(all_words, all_words[1:]):
            word_counts[bigram] = word_counts.get(bigram, 0) + 1
        repetition_score = (
            sum(1 for count in word_counts.values() if count > 1) / len(word_counts)
            if word_counts else 0
        )

        pitches = []
        intensities = []
        for segment in segments:
            if len(segment) == 0:
                # Bounds past the end of the audio give an empty slice;
                # there is nothing to analyse in it.
                continue
            f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
            pitches.extend(f0[voiced_flag])
            intensities.extend(librosa.feature.rms(y=segment)[0])

        pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
        pitch_std = np.std(pitches) if len(pitches) > 0 else 0
        jitter = (
            np.mean(np.abs(np.diff(pitches))) / pitch_mean
            if len(pitches) > 1 and pitch_mean > 0 else 0
        )

        intensity_mean = np.mean(intensities) if intensities else 0
        intensity_std = np.std(intensities) if intensities else 0
        shimmer = (
            np.mean(np.abs(np.diff(intensities))) / intensity_mean
            if len(intensities) > 1 and intensity_mean > 0 else 0
        )

        anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
        confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
        hesitation_score = filler_ratio + repetition_score

        anxiety_level = 'high' if anxiety_score > 0.15 else 'moderate' if anxiety_score > 0.07 else 'low'
        confidence_level = 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.5 else 'low'
        if filler_ratio < 0.05 and repetition_score < 0.1:
            fluency_level = 'fluent'
        elif filler_ratio < 0.1 and repetition_score < 0.2:
            fluency_level = 'moderate'
        else:
            fluency_level = 'disfluent'

        return {
            'speaking_rate': float(round(speaking_rate, 2)),
            'filler_ratio': float(round(filler_ratio, 4)),
            'repetition_score': float(round(repetition_score, 4)),
            'pitch_analysis': {
                'mean': float(round(pitch_mean, 2)),
                'std_dev': float(round(pitch_std, 2)),
                'jitter': float(round(jitter, 4))
            },
            'intensity_analysis': {
                'mean': float(round(intensity_mean, 2)),
                'std_dev': float(round(intensity_std, 2)),
                'shimmer': float(round(shimmer, 4))
            },
            'composite_scores': {
                'anxiety': float(round(anxiety_score, 4)),
                'confidence': float(round(confidence_score, 4)),
                'hesitation': float(round(hesitation_score, 4))
            },
            'interpretation': {
                'anxiety_level': anxiety_level,
                'confidence_level': confidence_level,
                'fluency_level': fluency_level
            }
        }
    except Exception as e:
        logger.error(f"Voice analysis failed: {str(e)}")
        return {'error': str(e)}
460
 
461
-
462
def generate_voice_interpretation(analysis: Dict) -> str:
    """Format a voice-analysis result as plain text for the Gemini prompt.

    Args:
        analysis: Result dict with ``speaking_rate``, ``filler_ratio``,
            ``repetition_score``, ``composite_scores`` and
            ``interpretation``, or a dict carrying an ``error`` key.

    Returns:
        A multi-line summary followed by a fixed explanatory legend, or
        ``"Voice analysis not available."`` when ``analysis`` is empty or
        reports an error.
    """
    # An empty dict is what callers pass when no voice analysis exists;
    # treat it like an error result instead of failing on a missing key.
    if not analysis or 'error' in analysis:
        return "Voice analysis not available."

    levels = analysis['interpretation']
    scores = analysis['composite_scores']

    interpretation_lines = [
        "Voice Analysis Summary:",
        f"- Speaking Rate: {analysis['speaking_rate']} words/sec (average)",
        f"- Filler Words: {analysis['filler_ratio'] * 100:.1f}% of words",
        f"- Repetition Score: {analysis['repetition_score']:.3f}",
        f"- Anxiety Level: {levels['anxiety_level'].upper()} (score: {scores['anxiety']:.3f})",
        f"- Confidence Level: {levels['confidence_level'].upper()} (score: {scores['confidence']:.3f})",
        f"- Fluency: {levels['fluency_level'].upper()}",
        "",
        "Detailed Interpretation:",
        "1. A higher speaking rate indicates faster speech, which can suggest nervousness or enthusiasm.",
        "2. Filler words and repetitions reduce speech clarity and professionalism.",
        "3. Anxiety is measured through pitch variability and voice instability.",
        "4. Confidence is assessed through voice intensity and stability.",
        "5. Fluency combines filler words and repetition metrics.",
    ]

    return "\n".join(interpretation_lines)
487
-
488
-
489
def generate_report(analysis_data: Dict) -> str:
    """Ask Gemini for a written interview report based on the analysis.

    Args:
        analysis_data: Dict with ``transcript`` (utterances carrying
            ``speaker``, ``role`` and ``text``), ``speakers``,
            ``text_analysis`` (``total_duration``, ``speaker_turns``) and
            optionally ``voice_analysis``.

    Returns:
        The text of the model's response. If anything fails, the error is
        logged and an ``"Error generating report: ..."`` string is returned
        instead of raising.
    """
    try:
        voice_interpretation = generate_voice_interpretation(
            analysis_data.get('voice_analysis', {})
        )

        # Format every interviewee turn, then keep the first five so the
        # prompt stays short.
        formatted_turns = []
        for u in analysis_data['transcript']:
            if u['role'] == 'Interviewee':
                formatted_turns.append(f"Speaker {u['speaker']} ({u['role']}): {u['text']}")
        interviewee_responses = formatted_turns[:5]

        prompt = f"""
Generate a comprehensive interview analysis report based on the provided data.
The report should be structured with clear headings and concise summaries.
**1. Executive Summary**
Provide a brief overview of the interview, its duration, number of speaker turns, and main participants.
- Overall interview duration: {analysis_data['text_analysis']['total_duration']:.2f} seconds
- Number of speaker turns: {analysis_data['text_analysis']['speaker_turns']}
- Main participants: {', '.join(analysis_data['speakers'])}
**2. Voice Analysis**
Summarize key voice metrics and provide a detailed interpretation.
{voice_interpretation}
**3. Content Analysis**
Analyze the key themes and strengths/weaknesses in the interviewee's responses.
Key responses from interviewee:
{chr(10).join(interviewee_responses)}
**4. Recommendations**
Offer specific, actionable suggestions for improvement focusing on communication skills, content delivery, and professional presentation.
"""

        return gemini_model.generate_content(prompt).text
    except Exception as e:
        logger.error(f"Report generation failed: {str(e)}")
        return f"Error generating report: {str(e)}"
524
 
 
525
 
526
- # --- ENHANCED PDF GENERATION FUNCTION ---
527
def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
    """Build the PDF interview report.

    The Gemini text is split into four sections by its ``**N. Title**``
    heading lines. Each section is rendered under a matching heading, with
    a metrics table for the voice analysis and up to five interviewee
    responses under the content analysis.

    Args:
        analysis_data: Dict with ``transcript`` and, optionally,
            ``voice_analysis`` (see ``analyze_interviewee_voice``).
        output_path: Destination path of the PDF file.
        gemini_report_text: Report text produced by ``generate_report``.

    Returns:
        ``True`` if the PDF was written, ``False`` if anything failed (the
        error is logged with traceback, not raised).
    """
    # Paragraph parses its text as XML-like markup, so model output and
    # transcript text must be escaped or a stray "<" / "&" can break the
    # whole build.
    from xml.sax.saxutils import escape

    section_markers = {
        '**1. Executive Summary**': 'Executive Summary',
        '**2. Voice Analysis**': 'Voice Analysis (Gemini Interpretation)',
        '**3. Content Analysis**': 'Content Analysis',
        '**4. Recommendations**': 'Recommendations',
    }

    def _append_lines(story, lines, body_style, bullet_style=None):
        """Append each non-blank line as an escaped Paragraph."""
        for raw in lines:
            text = raw.strip()
            if not text:
                continue
            # Lines starting with "-" are bullets when a bullet style is given.
            use_bullet = bullet_style is not None and text.startswith('-')
            story.append(Paragraph(escape(text), bullet_style if use_bullet else body_style))

    try:
        doc = SimpleDocTemplate(output_path, pagesize=letter)
        styles = getSampleStyleSheet()

        # Custom styles
        h1 = ParagraphStyle(name='Heading1', parent=styles['h1'], fontSize=16, spaceAfter=14, alignment=1)
        h2 = ParagraphStyle(name='Heading2', parent=styles['h2'], fontSize=12, spaceBefore=10, spaceAfter=8,
                            textColor=colors.HexColor('#333366'))
        h3 = ParagraphStyle(name='Heading3', parent=styles['h3'], fontSize=10, spaceBefore=8, spaceAfter=4,
                            textColor=colors.HexColor('#0055AA'))
        body_text = ParagraphStyle(name='BodyText', parent=styles['Normal'], fontSize=9, leading=12, spaceAfter=4)
        bullet_style = ParagraphStyle(name='Bullet', parent=styles['Normal'], fontSize=9, leading=12, leftIndent=18,
                                      bulletIndent=9)

        story = []

        # Title / header
        story.append(Paragraph("<b>Interview Analysis Report</b>", h1))
        story.append(Spacer(1, 0.2 * inch))
        story.append(Paragraph(f"<b>Date:</b> {time.strftime('%Y-%m-%d')}", body_text))
        story.append(Spacer(1, 0.3 * inch))

        # Split the Gemini text into sections keyed by heading.
        sections = {}
        current_section = None
        for line in gemini_report_text.split('\n'):
            heading = next(
                (name for marker, name in section_markers.items() if line.startswith(marker)),
                None,
            )
            if heading is not None:
                current_section = heading
                sections[current_section] = []
            elif current_section:
                sections[current_section].append(line)

        # 1. Executive Summary
        story.append(Paragraph("1. Executive Summary", h2))
        story.append(Spacer(1, 0.1 * inch))
        _append_lines(story, sections.get('Executive Summary', []), body_text)
        story.append(Spacer(1, 0.2 * inch))

        # 2. Voice Analysis (summary table plus Gemini interpretation)
        story.append(Paragraph("2. Voice Analysis", h2))
        voice_analysis = analysis_data.get('voice_analysis', {})

        if voice_analysis and 'error' not in voice_analysis:
            table_data = [
                ['Metric', 'Value', 'Interpretation'],
                ['Speaking Rate', f"{voice_analysis['speaking_rate']:.2f} words/sec", 'Average rate'],
                ['Filler Words', f"{voice_analysis['filler_ratio'] * 100:.1f}%", 'Percentage of total words'],
                ['Repetition Score', f"{voice_analysis['repetition_score']:.3f}", 'Lower is better articulation'],
                ['Anxiety Level', voice_analysis['interpretation']['anxiety_level'].upper(),
                 f"Score: {voice_analysis['composite_scores']['anxiety']:.3f}"],
                ['Confidence Level', voice_analysis['interpretation']['confidence_level'].upper(),
                 f"Score: {voice_analysis['composite_scores']['confidence']:.3f}"],
                ['Fluency', voice_analysis['interpretation']['fluency_level'].upper(), 'Overall speech flow']
            ]

            table_style = TableStyle([
                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#6699CC')),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
                ('BACKGROUND', (0, 1), (-1, -1), colors.HexColor('#EFEFEF')),
                ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#CCCCCC')),
                ('LEFTPADDING', (0, 0), (-1, -1), 6),
                ('RIGHTPADDING', (0, 0), (-1, -1), 6),
                ('TOPPADDING', (0, 0), (-1, -1), 6),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
            ])

            table = Table(table_data)
            table.setStyle(table_style)
            story.append(table)
            story.append(Spacer(1, 0.2 * inch))

            # Detailed interpretation from Gemini, if it produced one.
            if 'Voice Analysis (Gemini Interpretation)' in sections:
                story.append(Paragraph("Detailed Interpretation:", h3))
                _append_lines(story, sections['Voice Analysis (Gemini Interpretation)'], body_text)
                story.append(Spacer(1, 0.2 * inch))

            # TODO: optionally embed charts of the composite scores here.
        else:
            story.append(Paragraph("Voice analysis not available or encountered an error.", body_text))
        story.append(Spacer(1, 0.3 * inch))

        # 3. Content Analysis
        story.append(Paragraph("3. Content Analysis", h2))
        _append_lines(story, sections.get('Content Analysis', []), body_text, bullet_style)
        story.append(Spacer(1, 0.2 * inch))

        # First five interviewee responses, shown as a bullet list.
        story.append(Paragraph("Key Interviewee Responses:", h3))
        interviewee_responses = [
            f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
            for u in analysis_data['transcript']
            if u['role'] == 'Interviewee'
        ][:5]
        for res in interviewee_responses:
            story.append(Paragraph(escape(res), bullet_style))
        story.append(Spacer(1, 0.3 * inch))

        # 4. Recommendations
        story.append(Paragraph("4. Recommendations", h2))
        _append_lines(story, sections.get('Recommendations', []), body_text, bullet_style)
        story.append(Spacer(1, 0.2 * inch))

        doc.build(story)
        return True
    except Exception as e:
        logger.error(f"PDF creation failed: {str(e)}", exc_info=True)
        return False
680
-
681
-
682
def convert_to_serializable(obj):
    """Recursively convert NumPy values into JSON-serializable Python types.

    Args:
        obj: Any value; dicts, lists and tuples are walked recursively.

    Returns:
        NumPy scalars become Python scalars, NumPy arrays become (nested)
        lists, dicts keep their keys with converted values, and lists and
        tuples become lists of converted items. Anything else is returned
        unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    # Tuples are walked too: JSON has no tuple type, and a NumPy scalar
    # hidden inside one would otherwise make json.dump fail.
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    return obj
692
-
693
-
694
def process_interview(audio_path: str):
    """Run the full interview-analysis pipeline on one audio file.

    Steps: convert to WAV, transcribe, extract prosodic features, identify
    speakers, classify roles (loading the saved classifier or training a
    new one), analyse the interviewee's voice, generate the Gemini report,
    then write the PDF and JSON outputs.

    Args:
        audio_path: Path to the interview recording.

    Returns:
        Dict with ``pdf_path`` and ``json_path`` of the generated files.

    Raises:
        Exception: Any pipeline failure is logged with traceback and
            re-raised. The intermediate WAV file is removed either way.
    """
    wav_file = None
    try:
        logger.info(f"Starting processing for {audio_path}")

        wav_file = convert_to_wav(audio_path)

        logger.info("Starting transcription")
        transcript = transcribe(wav_file)

        logger.info("Extracting prosodic features")
        for utterance in transcript['utterances']:
            utterance['prosodic_features'] = extract_prosodic_features(
                wav_file,
                utterance['start'],
                utterance['end']
            )

        logger.info("Identifying speakers")
        utterances_with_speakers = identify_speakers(transcript, wav_file)

        logger.info("Classifying roles")
        # Reuse the saved classifier only when all three artifacts exist;
        # a partial set would fail on load, so retrain instead.
        artifact_paths = [
            os.path.join(OUTPUT_DIR, name)
            for name in ('role_classifier.pkl', 'text_vectorizer.pkl', 'feature_scaler.pkl')
        ]
        if all(os.path.exists(path) for path in artifact_paths):
            clf, vectorizer, scaler = (joblib.load(path) for path in artifact_paths)
        else:
            clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)

        classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)

        logger.info("Analyzing interviewee voice")
        voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)

        analysis_data = {
            'transcript': classified_utterances,
            'speakers': list(set(u['speaker'] for u in classified_utterances)),
            'voice_analysis': voice_analysis,
            'text_analysis': {
                'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
                'speaker_turns': len(classified_utterances)
            }
        }

        logger.info("Generating report text using Gemini")
        gemini_report_text = generate_report(analysis_data)

        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
        # create_pdf_report signals failure by returning False; surface it
        # instead of silently returning a path that may not exist.
        if not create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text):
            logger.warning(f"PDF report could not be created at {pdf_path}")

        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
        with open(json_path, 'w') as f:
            serializable_data = convert_to_serializable(analysis_data)
            json.dump(serializable_data, f, indent=2)

        logger.info(f"Processing completed for {audio_path}")
        return {
            'pdf_path': pdf_path,
            'json_path': json_path
        }
    except Exception as e:
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        raise
    finally:
        # Single cleanup point for the intermediate WAV on success and on error.
        if wav_file and os.path.exists(wav_file):
            os.remove(wav_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # ==============================================================================
4
+ # EvalBot - AI Interview Analysis Pipeline
5
+ # ==============================================================================
6
+
7
+ # --- 1. Imports ---
8
  import os
9
+ import logging
10
+ import re
11
+ import time
12
+ import json
13
+ import uuid
14
+ import tempfile
15
+ from typing import Dict, List
16
+
17
+ # --- Third-party Libraries ---
18
  import torch
19
  import numpy as np
 
20
  import requests
21
+ import urllib3
 
22
  from pydub import AudioSegment
23
+ import librosa
24
+ import spacy
25
+ import google.generativeai as genai
26
+ from concurrent.futures import ThreadPoolExecutor
27
+
28
+ # --- Machine Learning & Models ---
29
  from nemo.collections.asr.models import EncDecSpeakerLabelModel
30
  from pinecone import Pinecone, ServerlessSpec
31
+ import joblib
 
32
  from sklearn.ensemble import RandomForestClassifier
33
  from sklearn.preprocessing import StandardScaler
34
  from sklearn.feature_extraction.text import TfidfVectorizer
35
+
36
+ # --- PDF Generation (Optional but included) ---
 
 
37
  from reportlab.lib.pagesizes import letter
38
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
39
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
40
  from reportlab.lib.units import inch
41
  from reportlab.lib import colors
 
 
 
 
 
 
 
 
42
 
43
# --- 2. Configuration and Setup ---

# Logging configuration.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')
logger = logging.getLogger(__name__)

# Reduce the verbosity of third-party libraries.
logging.getLogger("nemo_logging").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.WARNING)

# General settings (constants).
OUTPUT_DIR = "./processed_audio"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# API keys (must be set as environment variables).
PINECONE_KEY = os.getenv("PINECONE_KEY")
ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Only warn here: the import continues and the services fail later, when they are first used.
if not all([PINECONE_KEY, ASSEMBLYAI_KEY, GEMINI_API_KEY]):
    logger.warning("One or more API keys are missing. Please set PINECONE_KEY, ASSEMBLYAI_KEY, and GEMINI_API_KEY environment variables.")
64
+
65
+ # --- 3. Service and Model Initialization ---
66
 
 
67
  def initialize_services():
68
+ """Initializes external services like Pinecone and Gemini."""
69
  try:
70
+ logger.info("Initializing Pinecone and Gemini services...")
71
  pc = Pinecone(api_key=PINECONE_KEY)
72
  index_name = "interview-speaker-embeddings"
73
  if index_name not in pc.list_indexes().names():
74
+ logger.info(f"Creating new Pinecone index: {index_name}")
75
  pc.create_index(
76
  name=index_name,
77
  dimension=192,
 
79
  spec=ServerlessSpec(cloud="aws", region="us-east-1")
80
  )
81
  index = pc.Index(index_name)
82
+
83
  genai.configure(api_key=GEMINI_API_KEY)
84
  gemini_model = genai.GenerativeModel('gemini-1.5-flash')
85
+ logger.info("Services initialized successfully.")
86
  return index, gemini_model
87
  except Exception as e:
88
  logger.error(f"Error initializing services: {str(e)}")
89
  raise
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
def load_models():
    """Load the machine-learning models used by the pipeline.

    Returns:
        A ``(speaker_model, nlp, device)`` tuple: the pretrained NeMo
        speaker-verification model switched to eval mode, the spaCy
        ``en_core_web_sm`` pipeline, and the torch device that was selected.
    """
    # Prefer the GPU when one is available; otherwise run on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    logger.info("Loading speaker verification model (Titanet)...")
    speaker_model = EncDecSpeakerLabelModel.from_pretrained(
        "nvidia/speakerverification_en_titanet_large",
        map_location=device,
    )
    speaker_model.eval()  # inference only

    logger.info("Loading NLP model (spaCy)...")
    nlp = spacy.load("en_core_web_sm")

    return speaker_model, nlp, device
104
 
105
# Load services and models once, when the module is imported.
index, gemini_model = initialize_services()
speaker_model, nlp, device = load_models()
108
+
109
+ # --- 4. Core Processing Functions ---
110
+
111
def download_audio_to_temp_file(url: str, retries=3) -> str:
    """Download an audio file from a URL to a temporary local file.

    Failed attempts are retried with exponential back-off (1s, 2s, 4s, ...).

    Args:
        url: Location of the audio file.
        retries: Maximum number of download attempts.

    Returns:
        Path of the downloaded temporary file. The caller is responsible for
        deleting it.

    Raises:
        requests.exceptions.RequestException, urllib3.exceptions.ProtocolError:
            The error of the last attempt once all attempts have failed.
        Exception: If ``retries`` is less than 1, so no attempt was made.
    """
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".tmp_audio")
    temp_path = temp_file.name
    temp_file.close()

    logger.info(f"Downloading audio from {url} to {temp_path}")

    try:
        for attempt in range(retries):
            try:
                with requests.get(url, stream=True, timeout=60) as r:
                    r.raise_for_status()
                    with open(temp_path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                logger.info("Download completed successfully.")
                return temp_path
            except (requests.exceptions.RequestException, urllib3.exceptions.ProtocolError) as e:
                if attempt >= retries - 1:
                    logger.warning(f"Download attempt {attempt + 1}/{retries} failed: {e}.")
                    logger.error(f"Failed to download audio after {retries} attempts.")
                    raise
                logger.warning(f"Download attempt {attempt + 1}/{retries} failed: {e}. Retrying...")
                time.sleep(2 ** attempt)
        # Only reachable when retries < 1 and the loop body never ran.
        raise Exception(f"Failed to download audio from URL {url}")
    except BaseException:
        # Never leave the temp file behind on failure, whatever the cause
        # (network error, disk error while writing, retries < 1, ...).
        try:
            os.remove(temp_path)
        except OSError:
            pass
        raise
137
 
 
138
def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
    """Convert an audio file to a 16 kHz mono WAV file.

    Args:
        audio_path: Path of the input audio file (any format pydub can read).
        output_dir: Directory that receives the WAV file; created if missing.

    Returns:
        Path of the new WAV file, named with a random UUID.

    Raises:
        Exception: Any decoding or export error is logged and re-raised.
    """
    try:
        logger.info(f"Converting {audio_path} to WAV format...")
        audio = AudioSegment.from_file(audio_path)
        audio = audio.set_frame_rate(16000).set_channels(1)

        # Only the default OUTPUT_DIR is created at import time; a
        # caller-supplied directory may not exist yet.
        os.makedirs(output_dir, exist_ok=True)
        wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
        audio.export(wav_file, format="wav")
        logger.info(f"Successfully converted to {wav_file}")
        return wav_file
    except Exception as e:
        logger.error(f"Audio conversion failed for {audio_path}: {str(e)}")
        raise
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
def transcribe(audio_path: str) -> Dict:
    """Transcribe an audio file with AssemblyAI, including speaker diarization.

    Uploads the file, submits a transcription job and polls every 5 seconds
    until the job completes or fails.

    Args:
        audio_path: Path of the local audio file to upload.

    Returns:
        The completed transcript JSON, with a non-empty ``utterances`` list.

    Raises:
        requests.exceptions.RequestException: On HTTP or network failure.
        ValueError: If the job completed but returned no utterances.
        Exception: If AssemblyAI reports the job as failed.
    """
    try:
        logger.info("Uploading audio to AssemblyAI...")
        headers = {"authorization": ASSEMBLYAI_KEY}
        with open(audio_path, 'rb') as f:
            upload_response = requests.post(
                "https://api.assemblyai.com/v2/upload",
                headers=headers,
                data=f,
                timeout=(10, 600),
            )
        # Surface auth and quota errors here, not as a KeyError on the JSON below.
        upload_response.raise_for_status()
        audio_url = upload_response.json()['upload_url']

        logger.info("Submitting transcription job with diarization...")
        # AssemblyAI enables diarization (and the `utterances` field in the
        # result) through the `speaker_labels` flag.
        transcript_request = {"audio_url": audio_url, "speaker_labels": True}
        transcript_response = requests.post(
            "https://api.assemblyai.com/v2/transcript",
            json=transcript_request,
            headers=headers,
            timeout=30,
        )
        transcript_response.raise_for_status()
        transcript_id = transcript_response.json()['id']

        logger.info(f"Waiting for transcription job (ID: {transcript_id}) to complete...")
        while True:
            poll_response = requests.get(
                f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
                headers=headers,
                timeout=30,
            )
            poll_response.raise_for_status()
            result = poll_response.json()

            if result['status'] == 'completed':
                logger.info("Transcription job completed.")
                if not result.get('utterances'):
                    raise ValueError("Transcription succeeded but no utterances were found.")
                return result
            elif result['status'] == 'error':
                raise Exception(f"Transcription failed: {result.get('error')}")
            time.sleep(5)
    except Exception as e:
        logger.error(f"Transcription process failed: {str(e)}")
        raise
181
 
182
def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
    """Extract prosodic features from one segment of an audio file.

    Args:
        audio_path: Path of the audio file.
        start_ms: Segment start, in milliseconds.
        end_ms: Segment end, in milliseconds.

    Returns:
        A dict with ``duration`` (seconds), ``mean_pitch``, ``pitch_sd``,
        ``intensityMean`` and ``intensitySD``. All values are zero when the
        segment is empty. An empty dict is returned if extraction fails; the
        error is logged instead of raised.
    """
    try:
        # An empty or inverted window has nothing to analyse; return before
        # the audio file is touched.
        if end_ms <= start_ms:
            return _zero_prosodic_features()

        y, sr = librosa.load(
            audio_path,
            sr=16000,
            offset=start_ms / 1000.0,
            duration=(end_ms - start_ms) / 1000.0,
        )

        if len(y) == 0:
            return _zero_prosodic_features()

        pitches, _ = librosa.piptrack(y=y, sr=sr)
        pitches = pitches[pitches > 0]  # keep only bins that carry a pitch estimate
        rms = librosa.feature.rms(y=y)[0]

        return {
            'duration': (end_ms - start_ms) / 1000,
            'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
            'pitch_sd': float(np.std(pitches)) if len(pitches) > 0 else 0.0,
            'intensityMean': float(np.mean(rms)),
            'intensitySD': float(np.std(rms)),
        }
    except Exception as e:
        logger.error(f"Feature extraction failed for segment {start_ms}-{end_ms}: {str(e)}")
        return {}


def _zero_prosodic_features() -> Dict:
    """Return a fresh all-zero feature dict for a segment without audio."""
    return {
        'duration': 0.0,
        'mean_pitch': 0.0,
        'pitch_sd': 0.0,
        'intensityMean': 0.0,
        'intensitySD': 0.0,
    }
 
 
 
 
 
 
 
 
 
 
 
203
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
+ # --- 5. Role Classification Functions (As Requested) ---
206
 
207
  def train_role_classifier(utterances: List[Dict]):
208
+ """
209
+ Trains a RandomForestClassifier based on utterance features.
210
+ NOTE: Assumes an alternating turn-taking pattern for labeling.
211
+ """
212
  try:
213
+ logger.info("Training new role classifier model...")
214
  texts = [u['text'] for u in utterances]
215
  vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
216
  X_text = vectorizer.fit_transform(texts)
217
+
218
  features = []
219
+ labels = [] # 0 for Interviewer, 1 for Interviewee
220
+
221
  for i, utterance in enumerate(utterances):
222
+ prosodic = utterance.get('prosodic_features', {})
223
  feat = [
224
+ prosodic.get('duration', 0), prosodic.get('mean_pitch', 0), prosodic.get('pitch_sd', 0),
225
+ prosodic.get('intensityMean', 0), prosodic.get('intensitySD', 0)
 
 
 
 
 
 
 
226
  ]
 
227
  feat.extend(X_text[i].toarray()[0].tolist())
 
228
  doc = nlp(utterance['text'])
229
  feat.extend([
230
  int(utterance['text'].endswith('?')),
 
233
  sum(1 for token in doc if token.pos_ == 'VERB'),
234
  sum(1 for token in doc if token.pos_ == 'NOUN')
235
  ])
 
236
  features.append(feat)
237
+ labels.append(0 if i % 2 == 0 else 1) # Assumes alternating roles
238
 
239
  scaler = StandardScaler()
240
  X = scaler.fit_transform(features)
241
+
242
+ clf = RandomForestClassifier(n_estimators=150, max_depth=10, random_state=42, class_weight='balanced')
 
 
 
 
 
243
  clf.fit(X, labels)
244
+
245
+ logger.info("Saving trained models to disk...")
246
  joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
247
  joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
248
  joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
249
+
250
  return clf, vectorizer, scaler
251
  except Exception as e:
252
  logger.error(f"Classifier training failed: {str(e)}")
253
  raise
254
 
 
255
  def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
256
+ """Classifies roles using the pre-trained RandomForest model."""
257
  try:
258
+ logger.info("Classifying roles using trained model...")
259
  texts = [u['text'] for u in utterances]
260
  X_text = vectorizer.transform(texts)
 
261
  results = []
262
  for i, utterance in enumerate(utterances):
263
+ prosodic = utterance.get('prosodic_features', {})
264
  feat = [
265
+ prosodic.get('duration', 0), prosodic.get('mean_pitch', 0), prosodic.get('pitch_sd', 0),
266
+ prosodic.get('intensityMean', 0), prosodic.get('intensitySD', 0)
 
 
 
 
 
 
 
267
  ]
 
268
  feat.extend(X_text[i].toarray()[0].tolist())
 
269
  doc = nlp(utterance['text'])
270
  feat.extend([
271
  int(utterance['text'].endswith('?')),
 
274
  sum(1 for token in doc if token.pos_ == 'VERB'),
275
  sum(1 for token in doc if token.pos_ == 'NOUN')
276
  ])
 
277
  X = scaler.transform([feat])
278
  role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
 
279
  results.append({**utterance, 'role': role})
 
280
  return results
281
  except Exception as e:
282
+ logger.error(f"Role classification execution failed: {str(e)}")
283
  raise
284
 
285
+ # --- 6. Analysis and Reporting Functions ---
286
 
287
# Filler words and phrases, matched on word boundaries so that multi-word
# fillers and fillers followed by punctuation ("um,") are counted.
_FILLER_PATTERN = re.compile(r"\b(?:you know|i mean|um|uh|like|so|actually)\b")


def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
    """Analyze the voice of all utterances whose role is 'Interviewee'.

    Args:
        audio_path: Path of the audio file the utterances refer to.
        utterances: Utterance dicts with ``role``, ``text``, ``start`` and
            ``end`` (milliseconds) and, optionally, ``prosodic_features``.

    Returns:
        A dict with ``speaking_rate`` (words per minute), ``filler_ratio``,
        ``pitch_std_dev``, ``intensity_std_dev`` and ``composite_scores``
        (``anxiety``, ``confidence``, ``hesitation``, each clamped to 0..1).
        On failure, or when there is nothing to analyze, a dict with a single
        ``error`` key is returned; no exception is raised.
    """
    try:
        interviewee_utterances = [u for u in utterances if u.get('role') == 'Interviewee']
        if not interviewee_utterances:
            logger.warning("No interviewee utterances found to analyze.")
            return {'error': 'No interviewee utterances found'}

        logger.info(f"Analyzing {len(interviewee_utterances)} interviewee utterances...")
        y, sr = librosa.load(audio_path, sr=16000)

        # Cut the interviewee's segments out of the signal (timestamps are in ms).
        segments = [
            y[int(u['start'] * sr / 1000):int(u['end'] * sr / 1000)]
            for u in interviewee_utterances
        ]
        segments = [s for s in segments if len(s) > 0]
        if not segments:
            # np.concatenate would fail on an empty list with an opaque message.
            logger.warning("Interviewee utterances contain no audio samples.")
            return {'error': 'Interviewee utterances contain no audio samples'}

        # prosodic_features may be absent, or {} when feature extraction failed.
        total_duration = sum(
            (u.get('prosodic_features') or {}).get('duration', 0)
            for u in interviewee_utterances
        )
        total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
        speaking_rate = (total_words / total_duration) * 60 if total_duration > 0 else 0

        filler_count = sum(
            len(_FILLER_PATTERN.findall(u['text'].lower()))
            for u in interviewee_utterances
        )
        filler_ratio = filler_count / total_words if total_words > 0 else 0

        # Pass the real sample rate: pyin otherwise assumes its default rate
        # and the frequency estimates come out scaled.
        pitches = np.concatenate([
            librosa.pyin(
                s,
                fmin=librosa.note_to_hz('C2'),
                fmax=librosa.note_to_hz('C7'),
                sr=sr,
            )[0]
            for s in segments
        ])
        pitches = pitches[~np.isnan(pitches)]  # unvoiced frames are reported as NaN

        intensities = np.concatenate([librosa.feature.rms(y=s)[0] for s in segments])

        pitch_std = np.std(pitches) if len(pitches) > 0 else 0
        intensity_std = np.std(intensities) if len(intensities) > 0 else 0

        # Heuristic composite scores, clamped to the range 0..1.
        anxiety_score = max(0, min(1, pitch_std / 50))
        confidence_score = max(0, min(1, 1 - (intensity_std * 10)))
        hesitation_score = max(0, min(1, (filler_ratio * 2) + (pitch_std / 100)))

        return {
            'speaking_rate': float(round(speaking_rate, 2)),
            'filler_ratio': float(round(filler_ratio, 4)),
            'pitch_std_dev': float(round(pitch_std, 2)),
            'intensity_std_dev': float(round(intensity_std, 4)),
            'composite_scores': {
                'anxiety': float(round(anxiety_score, 4)),
                'confidence': float(round(confidence_score, 4)),
                'hesitation': float(round(hesitation_score, 4)),
            },
        }
    except Exception as e:
        logger.error(f"Voice analysis failed: {str(e)}", exc_info=True)
        return {'error': str(e)}
334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
def generate_report(analysis_data: Dict) -> str:
    """Generate a text summary report for an analyzed interview.

    Args:
        analysis_data: Analysis results of the interview. Currently unused,
            because the body is still a placeholder.

    Returns:
        The report text, or an error description if generation failed.
    """
    try:
        logger.info("Generating final report text with Gemini...")
        # TODO: placeholder. The Gemini prompt and call are not implemented
        # in this version, so a fixed string is returned.
        return "Gemini report text would be generated here."
    except Exception as e:
        logger.error(f"Report generation failed: {str(e)}")
        return f"Error in report generation: {str(e)}"
344
 
345
+ # --- 7. Main Orchestration Function ---
346
 
347
def process_interview_from_url(audio_url: str):
    """Download, process and analyze an interview recording from a URL.

    Pipeline: download -> convert to WAV -> transcribe with diarization ->
    extract prosodic features -> classify speaker roles -> analyze the
    interviewee's voice -> generate the report text -> save the analysis JSON.

    Args:
        audio_url: URL of the interview audio file.

    Returns:
        A dict with ``json_path`` (the saved analysis file) and
        ``report_text`` (the generated report text).

    Raises:
        Exception: Any pipeline failure is logged and re-raised. Temporary
            audio files are removed in every case.
    """
    local_audio_path = None
    wav_file = None

    try:
        # Step 1: Download and convert
        local_audio_path = download_audio_to_temp_file(audio_url)
        wav_file = convert_to_wav(local_audio_path)

        # Step 2: Transcribe and diarize
        transcript = transcribe(wav_file)

        # Step 3: Extract features
        logger.info("Extracting prosodic features for all utterances...")
        with ThreadPoolExecutor() as executor:
            # Map each future to its utterance so the result is attached to
            # the right one, whatever order the futures finish in.
            futures = {
                executor.submit(extract_prosodic_features, wav_file, u['start'], u['end']): u
                for u in transcript['utterances']
            }
            for future, utterance in futures.items():
                utterance['prosodic_features'] = future.result()

        # Step 4: Classify roles
        classifier_path = os.path.join(OUTPUT_DIR, 'role_classifier.pkl')
        vectorizer_path = os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl')
        scaler_path = os.path.join(OUTPUT_DIR, 'feature_scaler.pkl')
        # The three artifacts belong together: reuse them only when all are
        # present, otherwise retrain so they stay consistent.
        if all(os.path.exists(p) for p in (classifier_path, vectorizer_path, scaler_path)):
            logger.info("Loading existing role classifier model.")
            # NOTE: joblib.load unpickles data; OUTPUT_DIR must only contain
            # files written by this application.
            clf = joblib.load(classifier_path)
            vectorizer = joblib.load(vectorizer_path)
            scaler = joblib.load(scaler_path)
        else:
            clf, vectorizer, scaler = train_role_classifier(transcript['utterances'])

        classified_utterances = classify_roles(transcript['utterances'], clf, vectorizer, scaler)

        # Step 5: Analyze voice and generate report
        voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)

        analysis_data = {
            'transcript': classified_utterances,
            # Sorted so the saved JSON is deterministic between runs.
            'speakers': sorted({u['speaker'] for u in classified_utterances if u.get('speaker')}),
            'voice_analysis': voice_analysis,
            'text_analysis': {
                'total_duration': transcript.get('audio_duration', 0),
                'speaker_turns': len(classified_utterances),
            },
        }

        gemini_report_text = generate_report(analysis_data)

        # Step 6: Save results
        base_name = str(uuid.uuid4())
        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")

        with open(json_path, 'w', encoding='utf-8') as f:
            # default=str is a fallback that stringifies any value json
            # cannot serialize natively (e.g. numpy scalars).
            json.dump(analysis_data, f, indent=4, default=str)

        logger.info(f"Processing completed. Analysis saved to: {json_path}")
        return {'json_path': json_path, 'report_text': gemini_report_text}

    except Exception as e:
        logger.error(f"Main processing pipeline failed for URL {audio_url}: {str(e)}", exc_info=True)
        raise

    finally:
        # Step 7: Cleanup. This is best effort: a failed removal is logged
        # and never hides the pipeline result or its error.
        logger.info("Cleaning up temporary files...")
        if wav_file and os.path.exists(wav_file):
            try:
                os.remove(wav_file)
                logger.info(f"Removed temporary WAV file: {wav_file}")
            except OSError as e:
                logger.error(f"Error removing WAV file {wav_file}: {e}")
        if local_audio_path and os.path.exists(local_audio_path):
            try:
                os.remove(local_audio_path)
                logger.info(f"Removed temporary downloaded file: {local_audio_path}")
            except OSError as e:
                logger.error(f"Error removing downloaded file {local_audio_path}: {e}")