norhan12 committed on
Commit
97f53a0
·
verified ·
1 Parent(s): 0ef7f55

Update process_interview.py

Browse files
Files changed (1) hide show
  1. process_interview.py +22 -22
process_interview.py CHANGED
@@ -41,7 +41,7 @@ logging.getLogger("nemo_logging").setLevel(logging.INFO)
41
  logging.getLogger("nemo").setLevel(logging.INFO)
42
 
43
  # Configuration
44
- AUDIO_DIR = "./Uploads"
45
  OUTPUT_DIR = "./processed_audio"
46
  os.makedirs(OUTPUT_DIR, exist_ok=True)
47
 
@@ -211,31 +211,31 @@ def process_utterance(utterance, full_audio, wav_file):
211
  else:
212
  speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
213
  speaker_name = f"Speaker_{speaker_id[-4:]}"
214
- index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_id})])
215
  os.remove(temp_path)
216
  return {
217
- ...
218
- **speech, 'speaker': speaker_name,
219
  'speaker_id': speaker_id,
220
  'embedding': embedding_list
221
  }
222
  except Exception as e:
223
  logger.error(f"Utterance processing failed: {str(e)}", exc_info=True)
224
  return {
225
- ...
226
- speech, 'speech': 'Unknown',
227
- 'speaker_id': speaker_id,
228
- 'embedding_id': None
229
  }
230
 
231
- def identify_speakers(audio: Dict, text: str) -> List[Dict]:
232
  try:
233
- audio = AudioSegment.from_wav(text)
234
- speakers = audio['speech']
235
  with ThreadPoolExecutor(max_workers=5) as executor:
236
  futures = [
237
- executor.submit(process_speech, speech, speakers, text)
238
- for speech in speakers
239
  ]
240
  results = [f.result() for f in futures]
241
  return results
@@ -311,24 +311,24 @@ def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
311
  logger.error(f"Role classification failed: {str(e)}")
312
  raise
313
 
314
- def analyze_interviewee_voice(audio_path: str, speakers: List[Dict]) -> Dict:
315
  try:
316
  y, sr = librosa.load(audio_path, sr=16000)
317
- interviewee_speakers = [u for u in speakers if u['role'] == 'Interviewee']
318
- if not interviewee_speakers:
319
- return {'error': 'No interviewee speeches found'}
320
  segments = []
321
- for u in interviewee_speakers:
322
  start = int(u['start'] * sr / 1000)
323
  end = int(u['end'] * sr / 1000)
324
  segments.append(y[start:end])
325
- total_duration = sum(u['speech_features']['duration'] for u in interviewee_speakers)
326
- total_words = sum(len(u['speech'].split()) for u in interviewee_speakers)
327
  speaking_rate = total_words / total_duration if total_duration > 0 else 0
328
  filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
329
- filler_count = sum(sum(u['speech'].lower().count(fw) for fw in filler_words) for u in interviewee_speakers)
330
  filler_ratio = filler_count / total_words if total_words > 0 else 0
331
- all_words = ' '.join(u['speech'].lower() for u in interviewee_speakers).split()
332
  word_counts = {}
333
  for i in range(len(all_words) - 1):
334
  bigram = (all_words[i], all_words[i + 1])
 
41
  logging.getLogger("nemo").setLevel(logging.INFO)
42
 
43
  # Configuration
44
+ AUDIO_DIR = "./uploads"
45
  OUTPUT_DIR = "./processed_audio"
46
  os.makedirs(OUTPUT_DIR, exist_ok=True)
47
 
 
211
  else:
212
  speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
213
  speaker_name = f"Speaker_{speaker_id[-4:]}"
214
+ index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_name})])
215
  os.remove(temp_path)
216
  return {
217
+ **utterance,
218
+ 'speaker': speaker_name,
219
  'speaker_id': speaker_id,
220
  'embedding': embedding_list
221
  }
222
  except Exception as e:
223
  logger.error(f"Utterance processing failed: {str(e)}", exc_info=True)
224
  return {
225
+ **utterance,
226
+ 'speaker': 'Unknown',
227
+ 'speaker_id': 'unknown',
228
+ 'embedding': None
229
  }
230
 
231
+ def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
232
  try:
233
+ full_audio = AudioSegment.from_wav(wav_file)
234
+ utterances = transcript['utterances']
235
  with ThreadPoolExecutor(max_workers=5) as executor:
236
  futures = [
237
+ executor.submit(process_utterance, utterance, full_audio, wav_file)
238
+ for utterance in utterances
239
  ]
240
  results = [f.result() for f in futures]
241
  return results
 
311
  logger.error(f"Role classification failed: {str(e)}")
312
  raise
313
 
314
+ def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
315
  try:
316
  y, sr = librosa.load(audio_path, sr=16000)
317
+ interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
318
+ if not interviewee_utterances:
319
+ return {'error': 'No interviewee utterances found'}
320
  segments = []
321
+ for u in interviewee_utterances:
322
  start = int(u['start'] * sr / 1000)
323
  end = int(u['end'] * sr / 1000)
324
  segments.append(y[start:end])
325
+ total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
326
+ total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
327
  speaking_rate = total_words / total_duration if total_duration > 0 else 0
328
  filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
329
+ filler_count = sum(sum(u['text'].lower().count(fw) for fw in filler_words) for u in interviewee_utterances)
330
  filler_ratio = filler_count / total_words if total_words > 0 else 0
331
+ all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
332
  word_counts = {}
333
  for i in range(len(all_words) - 1):
334
  bigram = (all_words[i], all_words[i + 1])