norhan12 committed
Commit 520d9b2 · verified · 1 Parent(s): 81affdc

Update process_interview.py

Files changed (1):
  1. process_interview.py +166 -94
process_interview.py CHANGED
@@ -113,13 +113,13 @@ def load_models():
     return speaker_model, nlp, tokenizer, llm_model

 speaker_model, nlp, tokenizer, llm_model = load_models()
-
 def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
     try:
         audio = AudioSegment.from_file(audio_path)
         if audio.channels > 1:
             audio = audio.set_channels(1)
         audio = audio.set_frame_rate(16000)
         wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
         audio.export(wav_file, format="wav")
         return wav_file
@@ -127,15 +127,18 @@ def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
         logger.error(f"Audio conversion failed: {str(e)}")
         raise

 def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
     try:
         audio = AudioSegment.from_file(audio_path)
         segment = audio[start_ms:end_ms]
         temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
         segment.export(temp_path, format="wav")
         y, sr = librosa.load(temp_path, sr=16000)
         pitches = librosa.piptrack(y=y, sr=sr)[0]
         pitches = pitches[pitches > 0]
         features = {
             'duration': (end_ms - start_ms) / 1000,
             'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
@@ -147,16 +150,24 @@ def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
             'intensityMax': float(np.max(librosa.feature.rms(y=y)[0])),
             'intensitySD': float(np.std(librosa.feature.rms(y=y)[0])),
         }
         os.remove(temp_path)
         return features
     except Exception as e:
         logger.error(f"Feature extraction failed: {str(e)}")
         return {
-            'duration': 0.0, 'mean_pitch': 0.0, 'min_pitch': 0.0, 'max_pitch': 0.0,
-            'pitch_sd': 0.0, 'intensityMean': 0.0, 'intensityMin': 0.0,
-            'intensityMax': 0.0, 'intensitySD': 0.0
         }

 def transcribe(audio_path: str) -> Dict:
     try:
         with open(audio_path, 'rb') as f:
@@ -166,6 +177,7 @@ def transcribe(audio_path: str) -> Dict:
             data=f
         )
         audio_url = upload_response.json()['upload_url']
         transcript_response = requests.post(
             "https://api.assemblyai.com/v2/transcript",
             headers={"authorization": ASSEMBLYAI_KEY},
@@ -176,20 +188,24 @@ def transcribe(audio_path: str) -> Dict:
             }
         )
         transcript_id = transcript_response.json()['id']
         while True:
             result = requests.get(
                 f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
                 headers={"authorization": ASSEMBLYAI_KEY}
             ).json()
             if result['status'] == 'completed':
                 return result
             elif result['status'] == 'error':
                 raise Exception(result['error'])
             time.sleep(5)
     except Exception as e:
         logger.error(f"Transcription failed: {str(e)}")
         raise

 def process_utterance(utterance, full_audio, wav_file):
     try:
         start = utterance['start']
@@ -197,198 +213,254 @@ def process_utterance(utterance, full_audio, wav_file):
         segment = full_audio[start:end]
         temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
         segment.export(temp_path, format="wav")
         with torch.no_grad():
-            embedding = speaker_model.get_embedding(temp_path).cpu().numpy()
-            embedding_list = embedding.flatten().tolist()
         query_result = index.query(
-            vector=embedding_list,
             top_k=1,
             include_metadata=True
         )
         if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
             speaker_id = query_result['matches'][0]['id']
             speaker_name = query_result['matches'][0]['metadata']['speaker_name']
         else:
             speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
             speaker_name = f"Speaker_{speaker_id[-4:]}"
-            index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_id})])
         os.remove(temp_path)
         return {
-            ...
-            **speech, 'speaker': speaker_name,
             'speaker_id': speaker_id,
-            'embedding': embedding_list
         }
     except Exception as e:
-        logger.error(f"Utterance processing failed: {str(e)}", exc_info=True)
         return {
-            ...
-            speech, 'speech': 'Unknown',
-            'speaker_id': speaker_id,
-            'embedding_id': None
         }
-def identify_speakers(audio: Dict, text: str) -> List[Dict]:
     try:
-        audio = AudioSegment.from_wav(text)
-        speakers = audio['speech']
-        with ThreadPoolExecutor(max_workers=5) as executor:
             futures = [
-                executor.submit(process_speech, speech, speakers, text)
-                for speech in speakers
             ]
             results = [f.result() for f in futures]
         return results
     except Exception as e:
         logger.error(f"Speaker identification failed: {str(e)}")
         raise
-def train_role_classifier(speakers: List[Dict]):
     try:
-        speech = [u['speech'].split()]
-        vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1,2))
-        X_text = vectorizer.fit_transform(speech)
         features = []
         labels = []
-        for i, speaker in enumerate(speakers):
-            utterance = speaker['speech_features']
             feat = [
-                utterance['duration'], utterance['speech_rate'], utterance['duration'], utterance['mean_pitch'],
-                utterance['min_pitch'], utterance['max_pitch'],
-                utterance['speech_sd'], utterance['intensityLevel'],
-                utterance['intensity_level'],
-                utterance['speechMax']], utterance['speechSD'],
             ]
-            feat.extend(X_text[i].toarray()[0])
-            doc = nlp(speaker['speech'])
-            speech.extend([
-                int(speaker['speech'].endswith('?'))),
-                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', speaker['speech'].lower())),
-                len(speaker['speech'].split())),
-                sum(frequency for token in speech if token.pos_ == 'VERB'),
-                sum(frequency for token in speech if token.pos == 'NOUN')
             ])
             features.append(feat)
-            labels.append((0 if i % 2 == 0 else 1))
         scaler = StandardScaler()
         X = scaler.fit_transform(features)
         clf = RandomForestClassifier(
-            n_estimators=150, max_depth=10, random_state=42, class_weight='balanced'
         )
         clf.fit(X, labels)
         joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
         joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
         joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
         return clf, vectorizer, scaler
     except Exception as e:
         logger.error(f"Classifier training failed: {str(e)}")
         raise
-def classify_roles(speakers: List[Dict], clf, vectorizer, scaler):
     try:
-        speech = [u['speech'] for u in speakers]
-        X_text = vectorizer.transform(speech)
         results = []
-        for i, speaker in enumerate(speakers):
-            prosodic = speaker['speech_features']
             feat = [
-                prosodic['duration'], prosodic['mean_pitch'], prosodic['min_pitch'],
-                prosodic['max_pitch'], prosodic['pitch_sd'], prosodic['intensityMean'],
-                prosodic['intensityMin'], prosodic['intensityMax'], prosodic['intensitySD'],
             ]
             feat.extend(X_text[i].toarray()[0].tolist())
-            doc = nlp(speaker['speech'])
             feat.extend([
-                int(speaker['speech'].endswith('?')),
-                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', speaker['speech'].lower())),
-                len(speaker['speech'].split()),
                 sum(1 for token in doc if token.pos_ == 'VERB'),
                 sum(1 for token in doc if token.pos_ == 'NOUN')
             ])
             X = scaler.transform([feat])
             role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
-            results.append({**speaker, 'role': role})
         return results
     except Exception as e:
         logger.error(f"Role classification failed: {str(e)}")
         raise
-def analyze_interviewee_voice(audio_path: str, speakers: List[Dict]) -> Dict:
     try:
         y, sr = librosa.load(audio_path, sr=16000)
-        interviewee_speakers = [u for u in speakers if u['role'] == 'Interviewee']
-        if not interviewee_speakers:
-            return {'error': 'No interviewee speeches found'}
         segments = []
-        for u in interviewee_speakers:
             start = int(u['start'] * sr / 1000)
             end = int(u['end'] * sr / 1000)
             segments.append(y[start:end])
-        total_duration = sum(u['speech_features']['duration'] for u in interviewee_speakers)
-        total_words = sum(len(u['speech'].split()) for u in interviewee_speakers)
         speaking_rate = total_words / total_duration if total_duration > 0 else 0
         filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
-        filler_count = sum(sum(u['speech'].lower().count(fw) for fw in filler_words) for u in interviewee_speakers)
         filler_ratio = filler_count / total_words if total_words > 0 else 0
-        all_words = ' '.join(u['speech'].lower() for u in interviewee_speakers).split()
         word_counts = {}
         for i in range(len(all_words) - 1):
             bigram = (all_words[i], all_words[i + 1])
             word_counts[bigram] = word_counts.get(bigram, 0) + 1
-        repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(word_counts) if word_counts else 0
         pitches = []
         for segment in segments:
             f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
             pitches.extend(f0[voiced_flag])
         pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
         pitch_std = np.std(pitches) if len(pitches) > 0 else 0
         jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0
         intensities = []
         for segment in segments:
             rms = librosa.feature.rms(y=segment)[0]
             intensities.extend(rms)
         intensity_mean = np.mean(intensities) if intensities else 0
         intensity_std = np.std(intensities) if intensities else 0
-        shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(intensities) > 1 and intensity_mean > 0 else 0
         anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
         confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
         hesitation_score = filler_ratio + repetition_score
-        anxiety_level = 'High' if anxiety_score > 0.15 else 'Moderate' if anxiety_score > 0.07 else 'Low'
-        confidence_level = 'High' if confidence_score > 0.7 else 'Moderate' if confidence_score > 0.5 else 'Low'
-        fluency_level = 'Fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'Moderate' if (filler_ratio < 0.1 and repetition_score < 0.2) else 'Disfluent'
         return {
             'speaking_rate': float(round(speaking_rate, 2)),
             'filler_ratio': float(round(filler_ratio, 4)),
             'repetition_score': float(round(repetition_score, 4)),
-            'pitch_analysis': {'mean': float(round(pitch_mean, 2)), 'std_dev': float(round(pitch_std, 2)), 'jitter': float(round(jitter, 4))},
-            'intensity_analysis': {'mean': float(round(intensity_mean, 2)), 'std_dev': float(round(intensity_std, 2)), 'shimmer': float(round(shimmer, 4))},
-            'composite_scores': {'anxiety': float(round(anxiety_score, 4)), 'confidence': float(round(confidence_score, 4)), 'hesitation': float(round(hesitation_score, 4))},
-            'interpretation': {'anxiety_level': anxiety_level, 'confidence_level': confidence_level, 'fluency_level': fluency_level}
         }
     except Exception as e:
         logger.error(f"Voice analysis failed: {str(e)}")
         return {'error': str(e)}
-def generate_voice_interpretation(analysis: Dict) -> str:
-    if 'error' in analysis:
-        return "Voice analysis unavailable due to processing limitations."
-    interpretation_lines = [
-        "Vocal Performance Profile:",
-        f"- Speaking Rate: {analysis['speaking_rate']} words/sec - Benchmark: 2.0-3.0 wps for clear delivery",
-        f"- Filler Word Frequency: {analysis['filler_ratio'] * 100:.1f}% - Measures non-content words",
-        f"- Repetition Index: {analysis['repetition_score']:.3f} - Frequency of repeated phrases",
-        f"- Anxiety Indicator: {analysis['interpretation']['anxiety_level']} (Score: {analysis['composite_scores']['anxiety']:.3f}) - Pitch and vocal stability",
-        f"- Confidence Indicator: {analysis['interpretation']['confidence_level']} (Score: {analysis['composite_scores']['confidence']:.3f}) - Vocal strength",
-        f"- Fluency Rating: {analysis['interpretation']['fluency_level']} - Speech flow and coherence",
-        "",
-        "HR Insights:",
-        "- Rapid speech (>3.0 wps) may signal enthusiasm but risks clarity.",
-        "- High filler word use reduces perceived professionalism.",
-        "- Elevated anxiety suggests pressure; training can build resilience.",
-        "- Strong confidence aligns with leadership presence.",
-        "- Fluent speech enhances engagement, critical for team roles."
-    ]
-    return "\n".join(interpretation_lines)

 def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
     try:
 
process_interview.py (updated file)

     return speaker_model, nlp, tokenizer, llm_model

 speaker_model, nlp, tokenizer, llm_model = load_models()
 def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
     try:
         audio = AudioSegment.from_file(audio_path)
         if audio.channels > 1:
             audio = audio.set_channels(1)
         audio = audio.set_frame_rate(16000)
+
         wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
         audio.export(wav_file, format="wav")
         return wav_file

         logger.error(f"Audio conversion failed: {str(e)}")
         raise
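The conversion relies on pydub, which shells out to ffmpeg for non-WAV inputs. A minimal standalone sketch of the same mono/16 kHz normalization, assuming ffmpeg is on PATH; the file names are illustrative, not from this commit:

from pydub import AudioSegment

# Mirror convert_to_wav above: mono channel, 16 kHz rate, WAV container.
audio = AudioSegment.from_file("interview.m4a")
audio = audio.set_channels(1).set_frame_rate(16000)
audio.export("interview_16k_mono.wav", format="wav")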
+
 def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
     try:
         audio = AudioSegment.from_file(audio_path)
         segment = audio[start_ms:end_ms]
         temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
         segment.export(temp_path, format="wav")
+
         y, sr = librosa.load(temp_path, sr=16000)
         pitches = librosa.piptrack(y=y, sr=sr)[0]
         pitches = pitches[pitches > 0]
+
         features = {
             'duration': (end_ms - start_ms) / 1000,
             'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,

             'intensityMax': float(np.max(librosa.feature.rms(y=y)[0])),
             'intensitySD': float(np.std(librosa.feature.rms(y=y)[0])),
         }
+
         os.remove(temp_path)
         return features
     except Exception as e:
         logger.error(f"Feature extraction failed: {str(e)}")
         return {
+            'duration': (end_ms - start_ms) / 1000,
+            'mean_pitch': 0.0,
+            'min_pitch': 0.0,
+            'max_pitch': 0.0,
+            'pitch_sd': 0.0,
+            'intensityMean': 0.0,
+            'intensityMin': 0.0,
+            'intensityMax': 0.0,
+            'intensitySD': 0.0,
         }
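The change makes the error fallback return the same nine keys as the success path (with a real duration), so downstream feature vectors keep a fixed width even when extraction fails. A sketch of a flattening helper that relies on that guarantee; the helper is illustrative, not part of the commit:

PROSODIC_KEYS = [
    'duration', 'mean_pitch', 'min_pitch', 'max_pitch', 'pitch_sd',
    'intensityMean', 'intensityMin', 'intensityMax', 'intensitySD',
]

def prosodic_vector(features: dict) -> list:
    # Missing keys default to 0.0, matching the fallback dict above.
    return [float(features.get(k, 0.0)) for k in PROSODIC_KEYS]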
+
 def transcribe(audio_path: str) -> Dict:
     try:
         with open(audio_path, 'rb') as f:

             data=f
         )
         audio_url = upload_response.json()['upload_url']
+
         transcript_response = requests.post(
             "https://api.assemblyai.com/v2/transcript",
             headers={"authorization": ASSEMBLYAI_KEY},

             }
         )
         transcript_id = transcript_response.json()['id']
+
         while True:
             result = requests.get(
                 f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
                 headers={"authorization": ASSEMBLYAI_KEY}
             ).json()
+
             if result['status'] == 'completed':
                 return result
             elif result['status'] == 'error':
                 raise Exception(result['error'])
+
             time.sleep(5)
     except Exception as e:
         logger.error(f"Transcription failed: {str(e)}")
         raise
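transcribe polls the transcript endpoint every five seconds with no upper bound, so a stuck job would spin forever. One way to cap the loop, sketched against the same endpoint; the wrapper name and timeout values are assumptions, not part of the commit:

import time
import requests

def poll_transcript(transcript_id: str, api_key: str,
                    interval: float = 5.0, timeout: float = 600.0) -> dict:
    # Poll AssemblyAI until the job completes, errors, or the deadline passes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        result = requests.get(
            f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
            headers={"authorization": api_key},
        ).json()
        if result['status'] == 'completed':
            return result
        if result['status'] == 'error':
            raise RuntimeError(result['error'])
        time.sleep(interval)
    raise TimeoutError(f"transcript {transcript_id} not finished after {timeout}s")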
+
 def process_utterance(utterance, full_audio, wav_file):
     try:
         start = utterance['start']

         segment = full_audio[start:end]
         temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
         segment.export(temp_path, format="wav")
+
         with torch.no_grad():
+            embedding = speaker_model.get_embedding(temp_path).to(device)
+
         query_result = index.query(
+            vector=embedding.cpu().numpy().tolist(),
             top_k=1,
             include_metadata=True
         )
+
         if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
             speaker_id = query_result['matches'][0]['id']
             speaker_name = query_result['matches'][0]['metadata']['speaker_name']
         else:
             speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
             speaker_name = f"Speaker_{speaker_id[-4:]}"
+            index.upsert([(speaker_id, embedding.tolist(), {"speaker_name": speaker_name})])
+
         os.remove(temp_path)
+
         return {
+            **utterance,
+            'speaker': speaker_name,
             'speaker_id': speaker_id,
+            'embedding': embedding.cpu().numpy().tolist()
         }
     except Exception as e:
+        logger.error(f"Utterance processing failed: {str(e)}")
         return {
+            **utterance,
+            'speaker': 'Unknown',
+            'speaker_id': 'unknown',
+            'embedding': None
         }
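The 0.7 cutoff on query_result['matches'][0]['score'] treats the Pinecone match score as a same-speaker test. Assuming the index was created with the cosine metric (not stated in this file), that score is ordinary cosine similarity between embeddings, as in this self-contained sketch:

import numpy as np

def cosine_similarity(a, b) -> float:
    # In [-1, 1]; process_utterance above accepts a match when the score > 0.7.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))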
+
+def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
     try:
+        full_audio = AudioSegment.from_wav(wav_file)
+        utterances = transcript['utterances']
+
+        with ThreadPoolExecutor(max_workers=5) as executor:  # changed to 5 workers
             futures = [
+                executor.submit(process_utterance, utterance, full_audio, wav_file)
+                for utterance in utterances
             ]
             results = [f.result() for f in futures]
+
         return results
     except Exception as e:
         logger.error(f"Speaker identification failed: {str(e)}")
         raise
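Because the futures list preserves submission order, identify_speakers returns results in transcript order even though utterances are processed concurrently. A minimal usage sketch chaining the functions in this file, with a hypothetical input path:

wav_file = convert_to_wav("interview.mp3")  # illustrative input file
transcript = transcribe(wav_file)
utterances = identify_speakers(transcript, wav_file)
for u in utterances:
    print(u['speaker_id'], u['speaker'])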
+
+def train_role_classifier(utterances: List[Dict]):
     try:
+        texts = [u['text'] for u in utterances]
+        vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
+        X_text = vectorizer.fit_transform(texts)
+
         features = []
         labels = []
+
+        for i, utterance in enumerate(utterances):
+            prosodic = utterance['prosodic_features']
             feat = [
+                prosodic['duration'],
+                prosodic['mean_pitch'],
+                prosodic['min_pitch'],
+                prosodic['max_pitch'],
+                prosodic['pitch_sd'],
+                prosodic['intensityMean'],
+                prosodic['intensityMin'],
+                prosodic['intensityMax'],
+                prosodic['intensitySD'],
             ]
+
+            feat.extend(X_text[i].toarray()[0].tolist())
+
+            doc = nlp(utterance['text'])
+            feat.extend([
+                int(utterance['text'].endswith('?')),
+                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower())),
+                len(utterance['text'].split()),
+                sum(1 for token in doc if token.pos_ == 'VERB'),
+                sum(1 for token in doc if token.pos_ == 'NOUN')
             ])
+
             features.append(feat)
+            labels.append(0 if i % 2 == 0 else 1)
+
         scaler = StandardScaler()
         X = scaler.fit_transform(features)
+
         clf = RandomForestClassifier(
+            n_estimators=150,
+            max_depth=10,
+            random_state=42,
+            class_weight='balanced'
         )
         clf.fit(X, labels)
+
         joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
         joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
         joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
+
         return clf, vectorizer, scaler
     except Exception as e:
         logger.error(f"Classifier training failed: {str(e)}")
         raise
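Note that labels.append(0 if i % 2 == 0 else 1) hard-codes a strict turn-taking assumption, so the training labels are only as good as that heuristic. The three joblib.dump calls persist the fitted artifacts, so a later run can reload them instead of retraining; a sketch, assuming the same OUTPUT_DIR used at training time:

import os
import joblib

# Paths match the joblib.dump calls in train_role_classifier above.
clf = joblib.load(os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))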
+
+def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
     try:
+        texts = [u['text'] for u in utterances]
+        X_text = vectorizer.transform(texts)
+
         results = []
+        for i, utterance in enumerate(utterances):
+            prosodic = utterance['prosodic_features']
             feat = [
+                prosodic['duration'],
+                prosodic['mean_pitch'],
+                prosodic['min_pitch'],
+                prosodic['max_pitch'],
+                prosodic['pitch_sd'],
+                prosodic['intensityMean'],
+                prosodic['intensityMin'],
+                prosodic['intensityMax'],
+                prosodic['intensitySD'],
             ]
+
             feat.extend(X_text[i].toarray()[0].tolist())
+
+            doc = nlp(utterance['text'])
             feat.extend([
+                int(utterance['text'].endswith('?')),
+                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower())),
+                len(utterance['text'].split()),
                 sum(1 for token in doc if token.pos_ == 'VERB'),
                 sum(1 for token in doc if token.pos_ == 'NOUN')
             ])
+
             X = scaler.transform([feat])
             role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
+
+            results.append({**utterance, 'role': role})
+
         return results
     except Exception as e:
         logger.error(f"Role classification failed: {str(e)}")
         raise
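classify_roles must rebuild the feature vector with exactly the layout the scaler saw at fit time: nine prosodic values, then the TF-IDF block, then five text-derived counts. An illustrative sanity check (not in the commit) that could sit just before scaler.transform, assuming scikit-learn 1.0+ for get_feature_names_out:

# 9 prosodic + TF-IDF width + 5 text features, per the training layout above.
expected_width = 9 + len(vectorizer.get_feature_names_out()) + 5
assert len(feat) == expected_width, (
    f"feature vector width {len(feat)} != expected {expected_width}"
)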
+
+
+
+def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
     try:
         y, sr = librosa.load(audio_path, sr=16000)
+
+        interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
+        if not interviewee_utterances:
+            return {'error': 'No interviewee utterances found'}
+
         segments = []
+        for u in interviewee_utterances:
             start = int(u['start'] * sr / 1000)
             end = int(u['end'] * sr / 1000)
             segments.append(y[start:end])
+
+        combined_audio = np.concatenate(segments)
+
+        total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
+        total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
         speaking_rate = total_words / total_duration if total_duration > 0 else 0
+
         filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
+        filler_count = sum(
+            sum(u['text'].lower().count(fw) for fw in filler_words)
+            for u in interviewee_utterances
+        )
         filler_ratio = filler_count / total_words if total_words > 0 else 0
+
+        all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
         word_counts = {}
         for i in range(len(all_words) - 1):
             bigram = (all_words[i], all_words[i + 1])
             word_counts[bigram] = word_counts.get(bigram, 0) + 1
+        repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(
+            word_counts) if word_counts else 0
+
         pitches = []
         for segment in segments:
             f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
             pitches.extend(f0[voiced_flag])
+
         pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
         pitch_std = np.std(pitches) if len(pitches) > 0 else 0
         jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0
+
         intensities = []
         for segment in segments:
             rms = librosa.feature.rms(y=segment)[0]
             intensities.extend(rms)
+
         intensity_mean = np.mean(intensities) if intensities else 0
         intensity_std = np.std(intensities) if intensities else 0
+        shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(
+            intensities) > 1 and intensity_mean > 0 else 0
+
         anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
         confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
         hesitation_score = filler_ratio + repetition_score
+
+        anxiety_level = 'high' if anxiety_score > 0.15 else 'moderate' if anxiety_score > 0.07 else 'low'
+        confidence_level = 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.5 else 'low'
+        fluency_level = 'fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'moderate' if (
+            filler_ratio < 0.1 and repetition_score < 0.2) else 'disfluent'
+
         return {
             'speaking_rate': float(round(speaking_rate, 2)),
             'filler_ratio': float(round(filler_ratio, 4)),
             'repetition_score': float(round(repetition_score, 4)),
+            'pitch_analysis': {
+                'mean': float(round(pitch_mean, 2)),
+                'std_dev': float(round(pitch_std, 2)),
+                'jitter': float(round(jitter, 4))
+            },
+            'intensity_analysis': {
+                'mean': float(round(intensity_mean, 2)),
+                'std_dev': float(round(intensity_std, 2)),
+                'shimmer': float(round(shimmer, 4))
+            },
+            'composite_scores': {
+                'anxiety': float(round(anxiety_score, 4)),
+                'confidence': float(round(confidence_score, 4)),
+                'hesitation': float(round(hesitation_score, 4))
+            },
+            'interpretation': {
+                'anxiety_level': anxiety_level,
+                'confidence_level': confidence_level,
+                'fluency_level': fluency_level
+            }
         }
     except Exception as e:
         logger.error(f"Voice analysis failed: {str(e)}")
         return {'error': str(e)}
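The jitter and shimmer here are frame-difference proxies (mean absolute frame-to-frame change divided by the mean), not the period-based measures from clinical voice analysis; librosa.pyin returns NaN for unvoiced frames, which the voiced_flag indexing filters out. A self-contained sketch of the shared approximation:

import numpy as np

def relative_variation(series) -> float:
    # Mean absolute frame-to-frame change over the mean, as used above for
    # jitter (on F0 values) and shimmer (on RMS intensities).
    x = np.asarray(series, dtype=float)
    x = x[~np.isnan(x)]
    if len(x) < 2 or np.mean(x) == 0:
        return 0.0
    return float(np.mean(np.abs(np.diff(x))) / np.mean(x))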

 def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
     try: