norhan12 committed
Commit dbde83d · verified · 1 Parent(s): 0068d30

Update process_interview.py

Files changed (1):
  1. process_interview.py +181 -69
process_interview.py CHANGED
@@ -66,60 +66,58 @@ def download_audio_from_url(url: str) -> str:
         logger.error(f"Failed to download audio from URL {url}: {e}")
         raise
 
+
+
+
+# Initialize services
 def initialize_services():
-    try:
-        pc = Pinecone(api_key=PINECONE_KEY)
-        index_name = "interview-speaker-embeddings"
-        if index_name not in pc.list_indexes().names():
-            pc.create_index(
-                name=index_name,
-                dimension=192,
-                metric="cosine",
-                spec=ServerlessSpec(cloud="aws", region="us-east-1")
-            )
-        index = pc.Index(index_name)
-        genai.configure(api_key=GEMINI_API_KEY)
-        gemini_model = genai.GenerativeModel('gemini-1.5-flash')
-        return index, gemini_model
-    except Exception as e:
-        logger.error(f"Error initializing services: {str(e)}")
-        raise
+    # Pinecone
+    pc = Pinecone(api_key=PINECONE_KEY)
+    index_name = "interview-speaker-embeddings"
+    if index_name not in pc.list_indexes().names():
+        pc.create_index(
+            name=index_name,
+            dimension=192,
+            metric="cosine",
+            spec=ServerlessSpec(cloud="aws", region="us-east-1")
+        )
+    index = pc.Index(index_name)
+
+    # Gemini
+    genai.configure(api_key=GEMINI_API_KEY)
+    gemini_model = genai.GenerativeModel('gemini-1.5-flash')
+
+    return index, gemini_model
 
 index, gemini_model = initialize_services()
 
+# Device setup
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f"Using device: {device}")
 
-def load_speaker_model():
-    try:
-        import torch
-        torch.set_num_threads(5)
-        model = EncDecSpeakerLabelModel.from_pretrained(
-            "nvidia/speakerverification_en_titanet_large",
-            map_location=torch.device('cpu')
-        )
-        model.eval()
-        return model
-    except Exception as e:
-        logger.error(f"Model loading failed: {str(e)}")
-        raise RuntimeError("Could not load speaker verification model")
-
+# Load ML models
 def load_models():
-    speaker_model = load_speaker_model()
+    speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large").to(device)
+    speaker_model.eval()
+
     nlp = spacy.load("en_core_web_sm")
+
     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
     llm_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
     llm_model.eval()
+
     return speaker_model, nlp, tokenizer, llm_model
 
 speaker_model, nlp, tokenizer, llm_model = load_models()
 
+# Audio processing functions
 def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
     try:
         audio = AudioSegment.from_file(audio_path)
         if audio.channels > 1:
             audio = audio.set_channels(1)
         audio = audio.set_frame_rate(16000)
+
         wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
         audio.export(wav_file, format="wav")
         return wav_file
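
Two details in this hunk are coupled: the TitaNet speaker model now loads straight onto the shared `device`, and the Pinecone index is created with `dimension=192`, which must match the model's embedding size. A minimal startup check along these lines would catch a mismatch before any vectors are upserted (a sketch, not part of the commit; `check_embedding_dim` and the probe path are hypothetical names):

import numpy as np
import soundfile as sf
from nemo.collections.asr.models import EncDecSpeakerLabelModel

def check_embedding_dim(model: EncDecSpeakerLabelModel, expected_dim: int = 192) -> None:
    # One second of near-silence at 16 kHz is enough to produce an embedding.
    probe = np.zeros(16000, dtype=np.float32)
    sf.write("/tmp/dim_probe.wav", probe, 16000)
    emb = model.get_embedding("/tmp/dim_probe.wav").cpu().numpy().flatten()
    if emb.shape[0] != expected_dim:
        raise ValueError(f"Embedding dim {emb.shape[0]} != index dim {expected_dim}")
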
@@ -133,11 +131,11 @@ def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
         segment = audio[start_ms:end_ms]
         temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
         segment.export(temp_path, format="wav")
-
+
         y, sr = librosa.load(temp_path, sr=16000)
         pitches = librosa.piptrack(y=y, sr=sr)[0]
         pitches = pitches[pitches > 0]
-
+
         features = {
             'duration': (end_ms - start_ms) / 1000,
             'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
@@ -149,7 +147,7 @@ def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
             'intensityMax': float(np.max(librosa.feature.rms(y=y)[0])),
             'intensitySD': float(np.std(librosa.feature.rms(y=y)[0])),
         }
-
+
         os.remove(temp_path)
         return features
     except Exception as e:
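
For context on the unchanged lines around these whitespace fixes: `librosa.piptrack` returns a `(pitches, magnitudes)` pair whose pitch matrix holds zeros in unvoiced time–frequency bins, which is why the code filters with `pitches > 0` before taking statistics. A small self-contained illustration on a synthetic tone (a sketch; the printed values depend on librosa's pitch-candidate selection):

import numpy as np
import librosa

sr = 16000
t = np.arange(sr) / sr
y = 0.5 * np.sin(2 * np.pi * 220 * t).astype(np.float32)  # one second of a 220 Hz tone

pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
voiced = pitches[pitches > 0]  # unvoiced bins are zero, as filtered in extract_prosodic_features
print(float(np.mean(voiced)), float(np.std(voiced)))
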
@@ -166,9 +164,10 @@ def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
             'intensitySD': 0.0,
         }
 
-
+# Transcription
 def transcribe(audio_path: str) -> Dict:
     try:
+        # Upload audio
         with open(audio_path, 'rb') as f:
             upload_response = requests.post(
                 "https://api.assemblyai.com/v2/upload",
@@ -176,62 +175,78 @@ def transcribe(audio_path: str) -> Dict:
             data=f
         )
         audio_url = upload_response.json()['upload_url']
+
+        # Start transcription
         transcript_response = requests.post(
             "https://api.assemblyai.com/v2/transcript",
             headers={"authorization": ASSEMBLYAI_KEY},
             json={
                 "audio_url": audio_url,
                 "speaker_labels": True,
-                "filter_profanity": True,
-                "speakers_expected": 2
+                "filter_profanity": True
             }
         )
         transcript_id = transcript_response.json()['id']
+
+        # Poll for results
         while True:
             result = requests.get(
                 f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
                 headers={"authorization": ASSEMBLYAI_KEY}
             ).json()
+
             if result['status'] == 'completed':
                 return result
             elif result['status'] == 'error':
                 raise Exception(result['error'])
+
             time.sleep(5)
     except Exception as e:
         logger.error(f"Transcription failed: {str(e)}")
         raise
 
+
+# Speaker identification
 def process_utterance(utterance, full_audio, wav_file):
     try:
+        # Extract audio segment
         start = utterance['start']
         end = utterance['end']
         segment = full_audio[start:end]
         temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
         segment.export(temp_path, format="wav")
+
+        # Get speaker embedding
         with torch.no_grad():
-            embedding = speaker_model.get_embedding(temp_path).cpu().numpy()
-            embedding_list = embedding.flatten().tolist()
+            embedding = speaker_model.get_embedding(temp_path).to(device)
+
+        # Query speaker database
         query_result = index.query(
-            vector=embedding_list,
+            vector=embedding.cpu().numpy().flatten().tolist(),  # flatten() kept from the old code: Pinecone expects a flat 192-dim list
             top_k=1,
             include_metadata=True
        )
+
+        # Identify speaker
         if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
             speaker_id = query_result['matches'][0]['id']
             speaker_name = query_result['matches'][0]['metadata']['speaker_name']
         else:
             speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
             speaker_name = f"Speaker_{speaker_id[-4:]}"
-            index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_name})])
+            index.upsert([(speaker_id, embedding.cpu().numpy().flatten().tolist(), {"speaker_name": speaker_name})])
+
+        # Cleanup
         os.remove(temp_path)
+
         return {
             **utterance,
             'speaker': speaker_name,
             'speaker_id': speaker_id,
-            'embedding': embedding_list
+            'embedding': embedding.cpu().numpy().flatten().tolist()
         }
     except Exception as e:
-        logger.error(f"Utterance processing failed: {str(e)}", exc_info=True)
+        logger.error(f"Utterance processing failed: {str(e)}")
         return {
             **utterance,
             'speaker': 'Unknown',
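
The polling loop above blocks forever if a transcript job stalls. A bounded variant using the same AssemblyAI endpoints would fail loudly instead (a sketch, not in the commit; `wait_for_transcript` and `timeout_s` are illustrative names):

import time
import requests

def wait_for_transcript(transcript_id: str, api_key: str, timeout_s: int = 1800) -> dict:
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        result = requests.get(
            f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
            headers={"authorization": api_key},
        ).json()
        if result["status"] == "completed":
            return result
        if result["status"] == "error":
            raise RuntimeError(result["error"])
        time.sleep(5)  # same fixed poll interval as the commit
    raise TimeoutError(f"Transcript {transcript_id} not ready after {timeout_s}s")
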
@@ -239,36 +254,54 @@ def process_utterance(utterance, full_audio, wav_file):
             'embedding': None
         }
 
+
 def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
     try:
         full_audio = AudioSegment.from_wav(wav_file)
         utterances = transcript['utterances']
-        with ThreadPoolExecutor(max_workers=5) as executor:
+
+        # Process utterances in parallel
+        with ThreadPoolExecutor(max_workers=4) as executor:
             futures = [
                 executor.submit(process_utterance, utterance, full_audio, wav_file)
                 for utterance in utterances
             ]
             results = [f.result() for f in futures]
+
         return results
     except Exception as e:
         logger.error(f"Speaker identification failed: {str(e)}")
         raise
-
+# Role classification
 def train_role_classifier(utterances: List[Dict]):
     try:
+        # Prepare data
         texts = [u['text'] for u in utterances]
         vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
         X_text = vectorizer.fit_transform(texts)
+
         features = []
         labels = []
+
         for i, utterance in enumerate(utterances):
+            # Prosodic features
             prosodic = utterance['prosodic_features']
             feat = [
-                prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
-                prosodic['max_pitch'], prosodic['pitch_sd'], prosodic['intensityMean'],
-                prosodic['intensityMin'], prosodic['intensityMax'], prosodic['intensitySD'],
+                prosodic['duration'],
+                prosodic['mean_pitch'],
+                prosodic['min_pitch'],
+                prosodic['max_pitch'],
+                prosodic['pitch_sd'],
+                prosodic['intensityMean'],
+                prosodic['intensityMin'],
+                prosodic['intensityMax'],
+                prosodic['intensitySD'],
             ]
+
+            # Text features
             feat.extend(X_text[i].toarray()[0].tolist())
+
+            # Linguistic features
             doc = nlp(utterance['text'])
             feat.extend([
                 int(utterance['text'].endswith('?')),
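
One property worth keeping in mind with the executor change: collecting `f.result()` over `futures` in submission order preserves transcript order, unlike iterating with `as_completed`. A toy demonstration of the difference (a sketch; `slow_identity` is a stand-in for `process_utterance`):

import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def slow_identity(x: int) -> int:
    time.sleep(0.05 * (5 - x))  # later submissions finish sooner
    return x

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(slow_identity, n) for n in range(5)]
    in_order = [f.result() for f in futures]                  # always [0, 1, 2, 3, 4]
    by_completion = [f.result() for f in as_completed(futures)]  # completion order, not submission order

print(in_order)
print(by_completion)
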
@@ -277,35 +310,59 @@ def train_role_classifier(utterances: List[Dict]):
                 sum(1 for token in doc if token.pos_ == 'VERB'),
                 sum(1 for token in doc if token.pos_ == 'NOUN')
             ])
+
             features.append(feat)
-            labels.append(0 if i % 2 == 0 else 1)
+            labels.append(0 if i % 2 == 0 else 1)  # Temporary labeling
+
+        # Train classifier
         scaler = StandardScaler()
         X = scaler.fit_transform(features)
+
         clf = RandomForestClassifier(
-            n_estimators=150, max_depth=10, random_state=42, class_weight='balanced'
+            n_estimators=150,
+            max_depth=10,
+            random_state=42,
+            class_weight='balanced'
         )
         clf.fit(X, labels)
+
+        # Save models
         joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
         joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
         joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
+
         return clf, vectorizer, scaler
     except Exception as e:
         logger.error(f"Classifier training failed: {str(e)}")
         raise
 
+
 def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
     try:
+        # Prepare features for classification
         texts = [u['text'] for u in utterances]
         X_text = vectorizer.transform(texts)
+
         results = []
         for i, utterance in enumerate(utterances):
+            # Prosodic features
             prosodic = utterance['prosodic_features']
             feat = [
-                prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
-                prosodic['max_pitch'], prosodic['pitch_sd'], prosodic['intensityMean'],
-                prosodic['intensityMin'], prosodic['intensityMax'], prosodic['intensitySD'],
+                prosodic['duration'],
+                prosodic['mean_pitch'],
+                prosodic['min_pitch'],
+                prosodic['max_pitch'],
+                prosodic['pitch_sd'],
+                prosodic['intensityMean'],
+                prosodic['intensityMin'],
+                prosodic['intensityMax'],
+                prosodic['intensitySD'],
             ]
+
+            # Text features
             feat.extend(X_text[i].toarray()[0].tolist())
+
+            # Linguistic features
             doc = nlp(utterance['text'])
             feat.extend([
                 int(utterance['text'].endswith('?')),
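
After this reformatting, the prosodic-plus-text-plus-linguistic feature assembly is spelled out identically in both train_role_classifier and classify_roles. A shared helper would keep the two in sync; a sketch under the file's existing names (`nlp`, the prosodic keys), with the two linguistic-feature lines the diff context elides left as a comment:

from typing import Dict, List

PROSODIC_KEYS = [
    'duration', 'mean_pitch', 'min_pitch', 'max_pitch', 'pitch_sd',
    'intensityMean', 'intensityMin', 'intensityMax', 'intensitySD',
]

def build_feature_vector(utterance: Dict, text_row, nlp) -> List[float]:
    feat = [utterance['prosodic_features'][k] for k in PROSODIC_KEYS]
    feat.extend(text_row.toarray()[0].tolist())  # this utterance's row of the TF-IDF matrix
    doc = nlp(utterance['text'])
    feat.extend([
        int(utterance['text'].endswith('?')),
        # ...the two linguistic features elided from the diff context go here...
        sum(1 for token in doc if token.pos_ == 'VERB'),
        sum(1 for token in doc if token.pos_ == 'NOUN'),
    ])
    return feat
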
@@ -314,70 +371,120 @@ def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
                 sum(1 for token in doc if token.pos_ == 'VERB'),
                 sum(1 for token in doc if token.pos_ == 'NOUN')
             ])
+
+            # Predict
             X = scaler.transform([feat])
             role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
+
             results.append({**utterance, 'role': role})
+
         return results
     except Exception as e:
         logger.error(f"Role classification failed: {str(e)}")
         raise
 
+# Voice analysis for interviewee
 def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
     try:
+        # Load full audio
         y, sr = librosa.load(audio_path, sr=16000)
+
+        # Filter interviewee utterances
         interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
         if not interviewee_utterances:
             return {'error': 'No interviewee utterances found'}
+
+        # Extract all interviewee segments
         segments = []
         for u in interviewee_utterances:
             start = int(u['start'] * sr / 1000)
             end = int(u['end'] * sr / 1000)
             segments.append(y[start:end])
+
+        # Combine all segments
+        combined_audio = np.concatenate(segments)
+
+        # Speaking rate analysis
         total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
         total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
         speaking_rate = total_words / total_duration if total_duration > 0 else 0
+
+        # Filler words analysis
         filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
-        filler_count = sum(sum(u['text'].lower().count(fw) for fw in filler_words) for u in interviewee_utterances)
+        filler_count = sum(
+            sum(u['text'].lower().count(fw) for fw in filler_words)
+            for u in interviewee_utterances
+        )
         filler_ratio = filler_count / total_words if total_words > 0 else 0
+
+        # Repetition analysis
         all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
         word_counts = {}
         for i in range(len(all_words) - 1):
-            bigram = (all_words[i], all_words[i + 1])
+            bigram = (all_words[i], all_words[i+1])
             word_counts[bigram] = word_counts.get(bigram, 0) + 1
         repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(word_counts) if word_counts else 0
+
+        # Pitch analysis (anxiety)
         pitches = []
         for segment in segments:
             f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
             pitches.extend(f0[voiced_flag])
+
         pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
         pitch_std = np.std(pitches) if len(pitches) > 0 else 0
         jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0
+
+        # Intensity analysis (confidence)
         intensities = []
         for segment in segments:
             rms = librosa.feature.rms(y=segment)[0]
             intensities.extend(rms)
+
         intensity_mean = np.mean(intensities) if intensities else 0
         intensity_std = np.std(intensities) if intensities else 0
         shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(intensities) > 1 and intensity_mean > 0 else 0
+
+        # Composite scores
         anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
        confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
         hesitation_score = filler_ratio + repetition_score
-        anxiety_level = 'High' if anxiety_score > 0.15 else 'Moderate' if anxiety_score > 0.07 else 'Low'
-        confidence_level = 'High' if confidence_score > 0.7 else 'Moderate' if confidence_score > 0.5 else 'Low'
-        fluency_level = 'Fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'Moderate' if (filler_ratio < 0.1 and repetition_score < 0.2) else 'Disfluent'
+
+        # Interpretation
+        anxiety_level = 'high' if anxiety_score > 0.15 else 'moderate' if anxiety_score > 0.07 else 'low'
+        confidence_level = 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.5 else 'low'
+        fluency_level = 'fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'moderate' if (filler_ratio < 0.1 and repetition_score < 0.2) else 'disfluent'
+
         return {
-            'speaking_rate': float(round(speaking_rate, 2)),
+            'speaking_rate': float(round(speaking_rate, 2)),
             'filler_ratio': float(round(filler_ratio, 4)),
             'repetition_score': float(round(repetition_score, 4)),
-            'pitch_analysis': {'mean': float(round(pitch_mean, 2)), 'std_dev': float(round(pitch_std, 2)), 'jitter': float(round(jitter, 4))},
-            'intensity_analysis': {'mean': float(round(intensity_mean, 2)), 'std_dev': float(round(intensity_std, 2)), 'shimmer': float(round(shimmer, 4))},
-            'composite_scores': {'anxiety': float(round(anxiety_score, 4)), 'confidence': float(round(confidence_score, 4)), 'hesitation': float(round(hesitation_score, 4))},
-            'interpretation': {'anxiety_level': anxiety_level, 'confidence_level': confidence_level, 'fluency_level': fluency_level}
+            'pitch_analysis': {
+                'mean': float(round(pitch_mean, 2)),
+                'std_dev': float(round(pitch_std, 2)),
+                'jitter': float(round(jitter, 4))
+            },
+            'intensity_analysis': {
+                'mean': float(round(intensity_mean, 2)),
+                'std_dev': float(round(intensity_std, 2)),
+                'shimmer': float(round(shimmer, 4))
+            },
+            'composite_scores': {
+                'anxiety': float(round(anxiety_score, 4)),
+                'confidence': float(round(confidence_score, 4)),
+                'hesitation': float(round(hesitation_score, 4))
+            },
+            'interpretation': {
+                'anxiety_level': anxiety_level,
+                'confidence_level': confidence_level,
+                'fluency_level': fluency_level
+            }
         }
     except Exception as e:
         logger.error(f"Voice analysis failed: {str(e)}")
         return {'error': str(e)}
 
+
 def generate_voice_interpretation(analysis: Dict) -> str:
     if 'error' in analysis:
         return "Voice analysis unavailable due to processing limitations."
@@ -698,10 +805,15 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
         return False
 
 def convert_to_serializable(obj):
-    if isinstance(obj, np.generic): return obj.item()
-    if isinstance(obj, dict): return {k: convert_to_serializable(v) for k, v in obj.items()}
-    if isinstance(obj, list): return [convert_to_serializable(i) for i in obj]
-    if isinstance(obj, np.ndarray): return obj.tolist()
+    """Convert numpy data types to Python native types for JSON serialization"""
+    if isinstance(obj, np.generic):
+        return obj.item()
+    elif isinstance(obj, dict):
+        return {key: convert_to_serializable(value) for key, value in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_to_serializable(item) for item in obj]
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
     return obj
 
 def process_interview(audio_path_or_url: str):
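
Usage sketch for the expanded convert_to_serializable, showing numpy scalars, nested dicts, and arrays becoming JSON-safe (assumes the function above is in scope; the payload values are illustrative):

import json
import numpy as np

payload = {
    'speaking_rate': np.float64(2.41),          # np.generic -> .item() -> float
    'pitch_analysis': {'mean': np.float32(181.5)},  # dict values converted recursively
    'embedding': np.zeros(3),                   # ndarray -> .tolist()
}
print(json.dumps(convert_to_serializable(payload)))
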
@@ -720,7 +832,7 @@ def process_interview(audio_path_or_url: str):
     for utterance in transcript['utterances']:
         utterance['prosodic_features'] = extract_prosodic_features(wav_file, utterance['start'], utterance['end'])
     utterances_with_speakers = identify_speakers(transcript, wav_file)
-    clf, vectorizer, scaler = None, None, None
+
     if os.path.exists(os.path.join(OUTPUT_DIR, 'role_classifier.pkl')):
         clf = joblib.load(os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
         vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
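
This hunk drops the `clf, vectorizer, scaler = None, None, None` initialization; if the pickles are absent and no later assignment runs, the names would be unbound when first used. A defensive load-or-train guard (a sketch; whether process_interview falls back to train_role_classifier here is an assumption based on the pre-commit None defaults):

import os
import joblib

clf = vectorizer = scaler = None
clf_path = os.path.join(OUTPUT_DIR, 'role_classifier.pkl')
if os.path.exists(clf_path):
    clf = joblib.load(clf_path)
    vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
    scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
if clf is None:
    clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)
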
 