Update process_interview.py
Browse files
- process_interview.py +22 -22
process_interview.py
CHANGED
|
@@ -41,7 +41,7 @@ logging.getLogger("nemo_logging").setLevel(logging.INFO)
|
|
| 41 |
logging.getLogger("nemo").setLevel(logging.INFO)
|
| 42 |
|
| 43 |
# Configuration
|
| 44 |
-
AUDIO_DIR = "./
|
| 45 |
OUTPUT_DIR = "./processed_audio"
|
| 46 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 47 |
|
|
@@ -211,31 +211,31 @@ def process_utterance(utterance, full_audio, wav_file):
|
|
| 211 |
else:
|
| 212 |
speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
|
| 213 |
speaker_name = f"Speaker_{speaker_id[-4:]}"
|
| 214 |
-
index.upsert([(speaker_id, embedding_list, {"speaker_name":
|
| 215 |
os.remove(temp_path)
|
| 216 |
return {
|
| 217 |
-
|
| 218 |
-
|
| 219 |
'speaker_id': speaker_id,
|
| 220 |
'embedding': embedding_list
|
| 221 |
}
|
| 222 |
except Exception as e:
|
| 223 |
logger.error(f"Utterance processing failed: {str(e)}", exc_info=True)
|
| 224 |
return {
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
'speaker_id':
|
| 228 |
-
'
|
| 229 |
}
|
| 230 |
|
| 231 |
-
def identify_speakers(
|
| 232 |
try:
|
| 233 |
-
|
| 234 |
-
|
| 235 |
with ThreadPoolExecutor(max_workers=5) as executor:
|
| 236 |
futures = [
|
| 237 |
-
executor.submit(
|
| 238 |
-
for
|
| 239 |
]
|
| 240 |
results = [f.result() for f in futures]
|
| 241 |
return results
|
|
@@ -311,24 +311,24 @@ def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
|
|
| 311 |
logger.error(f"Role classification failed: {str(e)}")
|
| 312 |
raise
|
| 313 |
|
| 314 |
-
def analyze_interviewee_voice(audio_path: str,
|
| 315 |
try:
|
| 316 |
y, sr = librosa.load(audio_path, sr=16000)
|
| 317 |
-
|
| 318 |
-
if not
|
| 319 |
-
return {'error': 'No interviewee
|
| 320 |
segments = []
|
| 321 |
-
for u in
|
| 322 |
start = int(u['start'] * sr / 1000)
|
| 323 |
end = int(u['end'] * sr / 1000)
|
| 324 |
segments.append(y[start:end])
|
| 325 |
-
total_duration = sum(u['
|
| 326 |
-
total_words = sum(len(u['
|
| 327 |
speaking_rate = total_words / total_duration if total_duration > 0 else 0
|
| 328 |
filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
|
| 329 |
-
filler_count = sum(sum(u['
|
| 330 |
filler_ratio = filler_count / total_words if total_words > 0 else 0
|
| 331 |
-
all_words = ' '.join(u['
|
| 332 |
word_counts = {}
|
| 333 |
for i in range(len(all_words) - 1):
|
| 334 |
bigram = (all_words[i], all_words[i + 1])
|
|
|
|
| 41 |
logging.getLogger("nemo").setLevel(logging.INFO)
|
| 42 |
|
| 43 |
# Configuration
|
| 44 |
+
AUDIO_DIR = "./uploads"
|
| 45 |
OUTPUT_DIR = "./processed_audio"
|
| 46 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 47 |
|
|
|
|
| 211 |
else:
|
| 212 |
speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
|
| 213 |
speaker_name = f"Speaker_{speaker_id[-4:]}"
|
| 214 |
+
index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_name})])
|
| 215 |
os.remove(temp_path)
|
| 216 |
return {
|
| 217 |
+
**utterance,
|
| 218 |
+
'speaker': speaker_name,
|
| 219 |
'speaker_id': speaker_id,
|
| 220 |
'embedding': embedding_list
|
| 221 |
}
|
| 222 |
except Exception as e:
|
| 223 |
logger.error(f"Utterance processing failed: {str(e)}", exc_info=True)
|
| 224 |
return {
|
| 225 |
+
**utterance,
|
| 226 |
+
'speaker': 'Unknown',
|
| 227 |
+
'speaker_id': 'unknown',
|
| 228 |
+
'embedding': None
|
| 229 |
}
|
| 230 |
|
| 231 |
+
def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
|
| 232 |
try:
|
| 233 |
+
full_audio = AudioSegment.from_wav(wav_file)
|
| 234 |
+
utterances = transcript['utterances']
|
| 235 |
with ThreadPoolExecutor(max_workers=5) as executor:
|
| 236 |
futures = [
|
| 237 |
+
executor.submit(process_utterance, utterance, full_audio, wav_file)
|
| 238 |
+
for utterance in utterances
|
| 239 |
]
|
| 240 |
results = [f.result() for f in futures]
|
| 241 |
return results
|
|
|
|
| 311 |
logger.error(f"Role classification failed: {str(e)}")
|
| 312 |
raise
|
| 313 |
|
| 314 |
+
def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
|
| 315 |
try:
|
| 316 |
y, sr = librosa.load(audio_path, sr=16000)
|
| 317 |
+
interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
|
| 318 |
+
if not interviewee_utterances:
|
| 319 |
+
return {'error': 'No interviewee utterances found'}
|
| 320 |
segments = []
|
| 321 |
+
for u in interviewee_utterances:
|
| 322 |
start = int(u['start'] * sr / 1000)
|
| 323 |
end = int(u['end'] * sr / 1000)
|
| 324 |
segments.append(y[start:end])
|
| 325 |
+
total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
|
| 326 |
+
total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
|
| 327 |
speaking_rate = total_words / total_duration if total_duration > 0 else 0
|
| 328 |
filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
|
| 329 |
+
filler_count = sum(sum(u['text'].lower().count(fw) for fw in filler_words) for u in interviewee_utterances)
|
| 330 |
filler_ratio = filler_count / total_words if total_words > 0 else 0
|
| 331 |
+
all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
|
| 332 |
word_counts = {}
|
| 333 |
for i in range(len(all_words) - 1):
|
| 334 |
bigram = (all_words[i], all_words[i + 1])
|