norhan12 commited on
Commit
75dc755
·
verified ·
1 Parent(s): f233230

Update process_interview.py

Browse files
Files changed (1) hide show
  1. process_interview.py +191 -934
process_interview.py CHANGED
@@ -1,950 +1,207 @@
1
  import os
2
- import torch
3
- import numpy as np
4
  import uuid
5
- import requests
6
- import time
7
  import json
8
- from pydub import AudioSegment
9
- import wave
10
- from nemo.collections.asr.models import EncDecSpeakerLabelModel
11
- from pinecone import Pinecone, ServerlessSpec
12
- import librosa
13
- import pandas as pd
14
- from sklearn.ensemble import RandomForestClassifier
15
- from sklearn.preprocessing import StandardScaler
16
- from sklearn.feature_extraction.text import TfidfVectorizer
17
- import re
18
- from typing import Dict, List, Tuple
19
  import logging
20
- # --- Imports for enhanced PDF ---
21
- from reportlab.lib.pagesizes import letter
22
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
23
- from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
24
- from reportlab.lib.units import inch
25
- from reportlab.lib import colors
26
- import matplotlib.pyplot as plt
27
- import matplotlib
28
- matplotlib.use('Agg')
29
- from reportlab.platypus import Image
30
- import io # --- FIX: إضافة import io لـ BytesIO ---
31
- # --- End Imports for enhanced PDF ---
32
- from transformers import AutoTokenizer, AutoModel
33
- import spacy
34
- import google.generativeai as genai
35
- import joblib
36
- from concurrent.futures import ThreadPoolExecutor
37
 
38
- # Setup logging
39
  logging.basicConfig(level=logging.INFO)
40
- logger = logging.getLogger(__name__)
41
- logging.getLogger("nemo_logging").setLevel(logging.ERROR)
42
- logging.getLogger("nemo").setLevel(logging.ERROR)
43
-
44
- # Configuration
45
- AUDIO_DIR = "./uploads"
46
- OUTPUT_DIR = "./processed_audio"
47
- os.makedirs(OUTPUT_DIR, exist_ok=True)
48
-
49
- # API Keys
50
- PINECONE_KEY = os.getenv("PINECONE_KEY")
51
- ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
52
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
53
-
54
-
55
- # Initialize services
56
- def initialize_services():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  try:
58
- pc = Pinecone(api_key=PINECONE_KEY)
59
- index_name = "interview-speaker-embeddings"
60
- if index_name not in pc.list_indexes().names():
61
- pc.create_index(
62
- name=index_name,
63
- dimension=192,
64
- metric="cosine",
65
- spec=ServerlessSpec(cloud="aws", region="us-east-1")
66
- )
67
- index = pc.Index(index_name)
68
-
69
- genai.configure(api_key=GEMINI_API_KEY)
70
- gemini_model = genai.GenerativeModel('gemini-1.5-flash')
71
-
72
- return index, gemini_model
73
- except Exception as e:
74
- logger.error(f"Error initializing services: {str(e)}")
75
- raise
76
-
77
-
78
- index, gemini_model = initialize_services()
79
-
80
- # Device setup
81
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
82
- logger.info(f"Using device: {device}")
83
-
84
-
85
- def load_speaker_model():
86
- try:
87
- import torch
88
- torch.set_num_threads(5)
89
- model = EncDecSpeakerLabelModel.from_pretrained(
90
- "nvidia/speakerverification_en_titanet_large",
91
- map_location=torch.device('cpu')
92
- )
93
- model.eval()
94
- return model
95
- except Exception as e:
96
- logger.error(f"Model loading failed: {str(e)}")
97
- raise RuntimeError("Could not load speaker verification model")
98
-
99
-
100
- # Load ML models
101
- def load_models():
102
- speaker_model = load_speaker_model()
103
- nlp = spacy.load("en_core_web_sm")
104
-
105
- tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
106
- llm_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
107
- llm_model.eval()
108
-
109
- return speaker_model, nlp, tokenizer, llm_model
110
-
111
-
112
- speaker_model, nlp, tokenizer, llm_model = load_models()
113
-
114
-
115
- # Audio processing functions
116
- def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
117
- try:
118
- audio = AudioSegment.from_file(audio_path)
119
- if audio.channels > 1:
120
- audio = audio.set_channels(1)
121
- audio = audio.set_frame_rate(16000)
122
-
123
- wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
124
- audio.export(wav_file, format="wav")
125
- return wav_file
126
- except Exception as e:
127
- logger.error(f"Audio conversion failed: {str(e)}")
128
- raise
129
-
130
-
131
- def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
132
- try:
133
- audio = AudioSegment.from_file(audio_path)
134
- segment = audio[start_ms:end_ms]
135
- temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
136
- segment.export(temp_path, format="wav")
137
-
138
- y, sr = librosa.load(temp_path, sr=16000)
139
- pitches = librosa.piptrack(y=y, sr=sr)[0]
140
- pitches = pitches[pitches > 0]
141
-
142
- features = {
143
- 'duration': (end_ms - start_ms) / 1000,
144
- 'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
145
- 'min_pitch': float(np.min(pitches)) if len(pitches) > 0 else 0.0,
146
- 'max_pitch': float(np.max(pitches)) if len(pitches) > 0 else 0.0,
147
- 'pitch_sd': float(np.std(pitches)) if len(pitches) > 0 else 0.0,
148
- 'intensityMean': float(np.mean(librosa.feature.rms(y=y)[0])),
149
- 'intensityMin': float(np.min(librosa.feature.rms(y=y)[0])),
150
- 'intensityMax': float(np.max(librosa.feature.rms(y=y)[0])),
151
- 'intensitySD': float(np.std(librosa.feature.rms(y=y)[0])),
152
- }
153
-
154
- os.remove(temp_path)
155
- return features
156
- except Exception as e:
157
- logger.error(f"Feature extraction failed: {str(e)}")
158
- return {
159
- 'duration': 0.0,
160
- 'mean_pitch': 0.0,
161
- 'min_pitch': 0.0,
162
- 'max_pitch': 0.0,
163
- 'pitch_sd': 0.0,
164
- 'intensityMean': 0.0,
165
- 'intensityMin': 0.0,
166
- 'intensityMax': 0.0,
167
- 'intensitySD': 0.0,
168
- }
169
-
170
-
171
- def transcribe(audio_path: str) -> Dict:
172
- try:
173
- with open(audio_path, 'rb') as f:
174
- upload_response = requests.post(
175
- "https://api.assemblyai.com/v2/upload",
176
- headers={"authorization": ASSEMBLYAI_KEY},
177
- data=f
178
- )
179
- audio_url = upload_response.json()['upload_url']
180
-
181
- transcript_response = requests.post(
182
- "https://api.assemblyai.com/v2/transcript",
183
- headers={"authorization": ASSEMBLYAI_KEY},
184
- json={
185
- "audio_url": audio_url,
186
- "speaker_labels": True,
187
- "filter_profanity": True
188
- }
189
- )
190
- transcript_id = transcript_response.json()['id']
191
-
192
- while True:
193
- result = requests.get(
194
- f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
195
- headers={"authorization": ASSEMBLYAI_KEY}
196
- ).json()
197
-
198
- if result['status'] == 'completed':
199
- return result
200
- elif result['status'] == 'error':
201
- raise Exception(result['error'])
202
-
203
- time.sleep(5)
204
- except Exception as e:
205
- logger.error(f"Transcription failed: {str(e)}")
206
- raise
207
-
208
-
209
- def process_utterance(utterance, full_audio, wav_file):
210
- try:
211
- start = utterance['start']
212
- end = utterance['end']
213
- segment = full_audio[start:end]
214
- temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
215
- segment.export(temp_path, format="wav")
216
-
217
- with torch.no_grad():
218
- embedding = speaker_model.get_embedding(temp_path).cpu().numpy() # Ensure numpy array
219
 
220
- # --- FIX: Convert embedding to a flat list for Pinecone query ---
221
- embedding_list = embedding.flatten().tolist()
222
- # --- End FIX ---
223
-
224
- query_result = index.query(
225
- vector=embedding_list, # Use the corrected flat list
226
- top_k=1,
227
- include_metadata=True
228
- )
229
-
230
- if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
231
- speaker_id = query_result['matches'][0]['id']
232
- speaker_name = query_result['matches'][0]['metadata']['speaker_name']
233
- else:
234
- speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
235
- speaker_name = f"Speaker_{speaker_id[-4:]}"
236
- index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_name})]) # Use corrected list
237
-
238
- os.remove(temp_path)
239
-
240
- return {
241
- **utterance,
242
- 'speaker': speaker_name,
243
- 'speaker_id': speaker_id,
244
- 'embedding': embedding_list # Store the corrected list
245
- }
246
- except Exception as e:
247
- logger.error(f"Utterance processing failed: {str(e)}", exc_info=True)
248
- return {
249
- **utterance,
250
- 'speaker': 'Unknown',
251
- 'speaker_id': 'unknown',
252
- 'embedding': None
253
- }
254
-
255
-
256
- def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
257
- try:
258
- full_audio = AudioSegment.from_wav(wav_file)
259
- utterances = transcript['utterances']
260
-
261
- with ThreadPoolExecutor(max_workers=5) as executor: # Changed to 5 workers
262
- futures = [
263
- executor.submit(process_utterance, utterance, full_audio, wav_file)
264
- for utterance in utterances
265
- ]
266
- results = [f.result() for f in futures]
267
-
268
- return results
269
- except Exception as e:
270
- logger.error(f"Speaker identification failed: {str(e)}")
271
- raise
272
-
273
-
274
- def train_role_classifier(utterances: List[Dict]):
275
- try:
276
- texts = [u['text'] for u in utterances] # تم حذف الـ 'u' الزائدة
277
- vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
278
- X_text = vectorizer.fit_transform(texts)
279
-
280
- features = []
281
- labels = []
282
-
283
- for i, utterance in enumerate(utterances):
284
- prosodic = utterance['prosodic_features']
285
- feat = [
286
- prosodic['duration'],
287
- prosodic['mean_pitch'],
288
- prosodic['min_pitch'],
289
- prosodic['max_pitch'],
290
- prosodic['pitch_sd'],
291
- prosodic['intensityMean'],
292
- prosodic['intensityMin'],
293
- prosodic['intensityMax'],
294
- prosodic['intensitySD'],
295
- ]
296
-
297
- feat.extend(X_text[i].toarray()[0].tolist())
298
-
299
- doc = nlp(utterance['text'])
300
- feat.extend([
301
- int(utterance['text'].endswith('?')),
302
- len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower())),
303
- len(utterance['text'].split()),
304
- sum(1 for token in doc if token.pos_ == 'VERB'),
305
- sum(1 for token in doc if token.pos_ == 'NOUN')
306
- ])
307
-
308
- features.append(feat)
309
- labels.append(0 if i % 2 == 0 else 1)
310
-
311
- scaler = StandardScaler()
312
- X = scaler.fit_transform(features)
313
-
314
- clf = RandomForestClassifier(
315
- n_estimators=150,
316
- max_depth=10,
317
- random_state=42,
318
- class_weight='balanced'
319
- )
320
- clf.fit(X, labels)
321
-
322
- joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
323
- joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
324
- joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
325
-
326
- return clf, vectorizer, scaler
327
- except Exception as e:
328
- logger.error(f"Classifier training failed: {str(e)}")
329
- raise
330
-
331
-
332
- def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
333
- try:
334
- texts = [u['text'] for u in utterances]
335
- X_text = vectorizer.transform(texts)
336
-
337
- results = []
338
- for i, utterance in enumerate(utterances):
339
- prosodic = utterance['prosodic_features']
340
- feat = [
341
- prosodic['duration'],
342
- prosodic['mean_pitch'],
343
- prosodic['min_pitch'],
344
- prosodic['max_pitch'],
345
- prosodic['pitch_sd'],
346
- prosodic['intensityMean'],
347
- prosodic['intensityMin'],
348
- prosodic['intensityMax'],
349
- prosodic['intensitySD'],
350
- ]
351
-
352
- feat.extend(X_text[i].toarray()[0].tolist())
353
-
354
- doc = nlp(utterance['text'])
355
- feat.extend([
356
- int(utterance['text'].endswith('?')),
357
- len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower())),
358
- len(utterance['text'].split()),
359
- sum(1 for token in doc if token.pos_ == 'VERB'),
360
- sum(1 for token in doc if token.pos_ == 'NOUN')
361
- ])
362
-
363
- X = scaler.transform([feat])
364
- role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
365
-
366
- results.append({**utterance, 'role': role})
367
-
368
- return results
369
- except Exception as e:
370
- logger.error(f"Role classification failed: {str(e)}")
371
- raise
372
-
373
-
374
- def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
375
- try:
376
- y, sr = librosa.load(audio_path, sr=16000)
377
-
378
- interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
379
- if not interviewee_utterances:
380
- return {'error': 'No interviewee utterances found'}
381
-
382
- segments = []
383
- for u in interviewee_utterances:
384
- start = int(u['start'] * sr / 1000)
385
- end = int(u['end'] * sr / 1000)
386
- segments.append(y[start:end])
387
-
388
- combined_audio = np.concatenate(segments)
389
-
390
- total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
391
- total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
392
- speaking_rate = total_words / total_duration if total_duration > 0 else 0
393
-
394
- filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
395
- filler_count = sum(
396
- sum(u['text'].lower().count(fw) for fw in filler_words)
397
- for u in interviewee_utterances
398
- )
399
- filler_ratio = filler_count / total_words if total_words > 0 else 0
400
-
401
- all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
402
- word_counts = {}
403
- for i in range(len(all_words) - 1):
404
- bigram = (all_words[i], all_words[i + 1])
405
- word_counts[bigram] = word_counts.get(bigram, 0) + 1
406
- repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(
407
- word_counts) if word_counts else 0
408
-
409
- pitches = []
410
- for segment in segments:
411
- f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
412
- pitches.extend(f0[voiced_flag])
413
-
414
- pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
415
- pitch_std = np.std(pitches) if len(pitches) > 0 else 0
416
- jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0
417
-
418
- intensities = []
419
- for segment in segments:
420
- rms = librosa.feature.rms(y=segment)[0]
421
- intensities.extend(rms)
422
-
423
- intensity_mean = np.mean(intensities) if intensities else 0
424
- intensity_std = np.std(intensities) if intensities else 0
425
- shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(
426
- intensities) > 1 and intensity_mean > 0 else 0
427
-
428
- anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
429
- confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
430
- hesitation_score = filler_ratio + repetition_score
431
-
432
- anxiety_level = 'high' if anxiety_score > 0.15 else 'moderate' if anxiety_score > 0.07 else 'low'
433
- confidence_level = 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.5 else 'low'
434
- fluency_level = 'fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'moderate' if (
435
- filler_ratio < 0.1 and repetition_score < 0.2) else 'disfluent'
436
-
437
- return {
438
- 'speaking_rate': float(round(speaking_rate, 2)),
439
- 'filler_ratio': float(round(filler_ratio, 4)),
440
- 'repetition_score': float(round(repetition_score, 4)),
441
- 'pitch_analysis': {
442
- 'mean': float(round(pitch_mean, 2)),
443
- 'std_dev': float(round(pitch_std, 2)),
444
- 'jitter': float(round(jitter, 4))
445
- },
446
- 'intensity_analysis': {
447
- 'mean': float(round(intensity_mean, 2)),
448
- 'std_dev': float(round(intensity_std, 2)),
449
- 'shimmer': float(round(shimmer, 4))
450
- },
451
- 'composite_scores': {
452
- 'anxiety': float(round(anxiety_score, 4)),
453
- 'confidence': float(round(confidence_score, 4)),
454
- 'hesitation': float(round(hesitation_score, 4))
455
- },
456
- 'interpretation': {
457
- 'anxiety_level': anxiety_level,
458
- 'confidence_level': confidence_level,
459
- 'fluency_level': fluency_level
460
- }
461
- }
462
- except Exception as e:
463
- logger.error(f"Voice analysis failed: {str(e)}")
464
- return {'error': str(e)}
465
-
466
-
467
- def generate_voice_interpretation(analysis: Dict) -> str:
468
- # This function is used to provide the text interpretation for Gemini's prompt.
469
- if 'error' in analysis:
470
- return "Voice analysis not available."
471
-
472
- interpretation_lines = []
473
- interpretation_lines.append("Voice Analysis Summary:")
474
- interpretation_lines.append(f"- Speaking Rate: {analysis['speaking_rate']} words/sec (average)")
475
- interpretation_lines.append(f"- Filler Words: {analysis['filler_ratio'] * 100:.1f}% of words")
476
- interpretation_lines.append(f"- Repetition Score: {analysis['repetition_score']:.3f}")
477
- interpretation_lines.append(
478
- f"- Anxiety Level: {analysis['interpretation']['anxiety_level'].upper()} (score: {analysis['composite_scores']['anxiety']:.3f})")
479
- interpretation_lines.append(
480
- f"- Confidence Level: {analysis['interpretation']['confidence_level'].upper()} (score: {analysis['composite_scores']['confidence']:.3f})")
481
- interpretation_lines.append(f"- Fluency: {analysis['interpretation']['fluency_level'].upper()}")
482
- interpretation_lines.append("")
483
- interpretation_lines.append("Detailed Interpretation:")
484
- interpretation_lines.append(
485
- "1. A higher speaking rate indicates faster speech, which can suggest nervousness or enthusiasm.")
486
- interpretation_lines.append("2. Filler words and repetitions reduce speech clarity and professionalism.")
487
- interpretation_lines.append("3. Anxiety is measured through pitch variability and voice instability.")
488
- interpretation_lines.append("4. Confidence is assessed through voice intensity and stability.")
489
- interpretation_lines.append("5. Fluency combines filler words and repetition metrics.")
490
-
491
- return "\n".join(interpretation_lines)
492
-
493
-
494
- # --- Chart Generation Function ---
495
- def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path: str):
496
- try:
497
- labels = ['Anxiety', 'Confidence']
498
- scores = [composite_scores.get('anxiety', 0), composite_scores.get('confidence', 0)]
499
-
500
- fig, ax = plt.subplots(figsize=(4, 2.5)) # Smaller size for embedding in PDF
501
- ax.bar(labels, scores, color=['lightcoral', 'lightskyblue'])
502
- ax.set_ylabel('Score')
503
- ax.set_title('Anxiety vs. Confidence Scores')
504
- ax.set_ylim(0, 1.0) # Assuming scores are normalized 0-1
505
-
506
- for i, v in enumerate(scores):
507
- ax.text(i, v + 0.05, f"{v:.2f}", color='black', ha='center', fontweight='bold')
508
-
509
- # هذه الأوامر يجب أن تكون خارج الـ loop عشان يتم تنفيذها مرة واحدة بعد رسم كل العناصر
510
- plt.tight_layout()
511
- plt.savefig(chart_path)
512
- plt.close(fig) # Close the figure to free up memory
513
  except Exception as e:
514
- logger.error(f"Error generating chart: {str(e)}")
515
-
516
-
517
- # --- Acceptance Probability Calculation ---
518
- def calculate_acceptance_probability(analysis_data: Dict) -> float:
 
 
 
 
 
 
 
519
  """
520
- Calculates a hypothetical acceptance probability based on voice and content analysis.
521
- This is a simplified, heuristic model and can be refined with more data/ML.
 
522
  """
523
- voice = analysis_data.get('voice_analysis', {})
524
-
525
- if 'error' in voice:
526
- return 0.0 # Cannot calculate if voice analysis failed
527
-
528
- # Weights for different factors (adjust these to fine-tune the model)
529
- w_confidence = 0.4
530
- w_anxiety = -0.3 # Negative weight for anxiety
531
- w_fluency = 0.2
532
- w_speaking_rate = 0.1 # Ideal rate gets higher score
533
- w_filler_repetition = -0.1 # Negative weight for filler/repetition
534
- w_content_strengths = 0.2 # Placeholder, ideally from deeper content analysis
535
-
536
- # Normalize/interpret scores
537
- confidence_score = voice.get('composite_scores', {}).get('confidence', 0.0)
538
- anxiety_score = voice.get('composite_scores', {}).get('anxiety', 0.0)
539
- fluency_level = voice.get('interpretation', {}).get('fluency_level', 'disfluent')
540
- speaking_rate = voice.get('speaking_rate', 0.0)
541
- filler_ratio = voice.get('filler_ratio', 0.0)
542
- repetition_score = voice.get('repetition_score', 0.0)
543
-
544
- # Fluency mapping (higher score for more fluent)
545
- fluency_map = {'fluent': 1.0, 'moderate': 0.5, 'disfluent': 0.0}
546
- fluency_val = fluency_map.get(fluency_level, 0.0)
547
-
548
- # Speaking rate scoring (e.g., ideal is around 2.5 words/sec, gets lower for too fast/slow)
549
- # This is a simple inverse of deviation from ideal
550
- ideal_speaking_rate = 2.5
551
- speaking_rate_deviation = abs(speaking_rate - ideal_speaking_rate)
552
- speaking_rate_score = max(0, 1 - (speaking_rate_deviation / ideal_speaking_rate)) # Max 1.0, min 0.0
553
-
554
- # Filler/Repetition score (lower is better, so 1 - score)
555
- filler_repetition_composite = (filler_ratio + repetition_score) / 2 # Average them
556
- filler_repetition_score = max(0, 1 - filler_repetition_composite)
557
-
558
- # Simplified content strength score (you might need a more sophisticated NLP method here)
559
- # For now, based on presence of strengths in Gemini's content analysis
560
- content_strength_val = 0.0
561
- # This part would ideally come from a structured output from Gemini's content analysis.
562
- # For now, we'll make a simplified assumption based on the analysis data:
563
- # If content analysis found "strengths" (which is likely if Gemini generates a full report)
564
- # This needs refinement if Gemini output is not structured for this.
565
- if analysis_data.get('text_analysis', {}).get('total_duration', 0) > 0: # Basic check if interview happened
566
- content_strength_val = 0.8 # Assume moderate strength if analysis went through
567
- # You could parse gemini_report_text for specific phrases like "Strengths:" and count items.
568
-
569
- # Calculate raw score
570
- raw_score = (
571
- confidence_score * w_confidence +
572
- (1 - anxiety_score) * abs(w_anxiety) + # (1 - anxiety) because lower anxiety is better
573
- fluency_val * w_fluency +
574
- speaking_rate_score * w_speaking_rate +
575
- filler_repetition_score * abs(w_filler_repetition) + # Use abs weight as score is already inverted
576
- content_strength_val * w_content_strengths
577
- )
578
-
579
- # Normalize to 0-1 and then to percentage
580
- # These max/min values are rough estimates and should be calibrated with real data
581
- min_possible_score = (0 * w_confidence) + (0 * abs(w_anxiety)) + (0 * w_fluency) + (0 * w_speaking_rate) + (
582
- 0 * abs(w_filler_repetition)) + (0 * w_content_strengths)
583
- max_possible_score = (1 * w_confidence) + (1 * abs(w_anxiety)) + (1 * w_fluency) + (1 * w_speaking_rate) + (
584
- 1 * abs(w_filler_repetition)) + (1 * w_content_strengths)
585
-
586
- # Prevent division by zero if all weights are zero or min/max are same
587
- if max_possible_score == min_possible_score:
588
- normalized_score = 0.5 # Default if no variation
589
- else:
590
- normalized_score = (raw_score - min_possible_score) / (max_possible_score - min_possible_score)
591
-
592
- acceptance_probability = max(0.0, min(1.0, normalized_score)) # Clamp between 0 and 1
593
-
594
- return float(f"{acceptance_probability * 100:.2f}") # Return as percentage
595
-
596
-
597
- def generate_report(analysis_data: Dict) -> str:
598
- try:
599
- voice = analysis_data.get('voice_analysis', {})
600
- voice_interpretation = generate_voice_interpretation(voice)
601
-
602
- interviewee_responses = [
603
- f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
604
- for u in analysis_data['transcript']
605
- if u['role'] == 'Interviewee'
606
- ][:5] # Limit to first 5 for prompt brevity
607
-
608
- acceptance_prob = analysis_data.get('acceptance_probability', None)
609
- acceptance_line = ""
610
- if acceptance_prob is not None:
611
- acceptance_line = f"\n**Estimated Acceptance Probability: {acceptance_prob:.2f}%**\n"
612
- if acceptance_prob >= 80:
613
- acceptance_line += "This indicates a very strong candidate. Well done!"
614
- elif acceptance_prob >= 50:
615
- acceptance_line += "This indicates a solid candidate with potential for improvement."
616
- else:
617
- acceptance_line += "This candidate may require significant development or may not be a strong fit."
618
-
619
- prompt = f"""
620
- As EvalBot, an AI interview analysis system, generate a highly professional, well-structured, and concise interview analysis report.
621
- The report should be suitable for a professional setting and clearly highlight key findings and actionable recommendations.
622
- Use clear headings and subheadings. For bullet points, use '- '.
623
-
624
- {acceptance_line}
625
-
626
- **1. Executive Summary**
627
- Provide a brief, high-level overview of the interview.
628
- - Overall interview duration: {analysis_data['text_analysis']['total_duration']:.2f} seconds
629
- - Number of speaker turns: {analysis_data['text_analysis']['speaker_turns']}
630
- - Main participants: {', '.join(analysis_data['speakers'])}
631
-
632
- **2. Voice Analysis Insights**
633
- Analyze key voice metrics and provide a detailed interpretation.
634
- {voice_interpretation}
635
-
636
- **3. Content Analysis & Strengths/Areas for Development**
637
- Analyze the key themes and identify both strengths and areas for development in the interviewee's responses.
638
- Key responses from interviewee (for context):
639
- {chr(10).join(interviewee_responses)}
640
-
641
- **4. Actionable Recommendations**
642
- Offer specific, actionable suggestions for improvement.
643
- Focus on:
644
- - Communication Skills (e.g., pacing, clarity, filler words)
645
- - Content Delivery (e.g., quantifying achievements, structuring answers)
646
- - Professional Presentation (e.g., research, specific examples, mock interviews)
647
- """
648
-
649
- response = gemini_model.generate_content(prompt)
650
- return response.text
651
- except Exception as e:
652
- logger.error(f"Report generation failed: {str(e)}")
653
- return f"Error generating report: {str(e)}"
654
-
655
-
656
- # --- ENHANCED PDF GENERATION FUNCTION ---
657
- def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
658
- try:
659
- doc = SimpleDocTemplate(output_path, pagesize=letter)
660
- styles = getSampleStyleSheet()
661
-
662
- # Define custom styles
663
- h1 = ParagraphStyle(name='Heading1', parent=styles['h1'], fontSize=16, spaceAfter=14, alignment=1,
664
- textColor=colors.HexColor('#003366'))
665
- h2 = ParagraphStyle(name='Heading2', parent=styles['h2'], fontSize=12, spaceBefore=10, spaceAfter=8,
666
- textColor=colors.HexColor('#336699'))
667
- h3 = ParagraphStyle(name='Heading3', parent=styles['h3'], fontSize=10, spaceBefore=8, spaceAfter=4,
668
- textColor=colors.HexColor('#0055AA'))
669
- body_text = ParagraphStyle(name='BodyText', parent=styles['Normal'], fontSize=9, leading=12, spaceAfter=4)
670
- bullet_style = ParagraphStyle(name='Bullet', parent=styles['Normal'], fontSize=9, leading=12, leftIndent=18,
671
- bulletIndent=9)
672
-
673
- story = []
674
-
675
- # Title and Date
676
- story.append(Paragraph(f"<b>EvalBot Interview Analysis Report</b>", h1))
677
- story.append(Spacer(1, 0.2 * inch))
678
- story.append(Paragraph(f"<b>Date:</b> {time.strftime('%Y-%m-%d')}", body_text))
679
- story.append(Spacer(1, 0.3 * inch))
680
-
681
- # --- Acceptance Probability (New Section) ---
682
- acceptance_prob = analysis_data.get('acceptance_probability', None)
683
- if acceptance_prob is not None:
684
- story.append(Paragraph("<b>Candidate Evaluation Summary</b>", h2))
685
- story.append(Spacer(1, 0.1 * inch))
686
-
687
- prob_color = colors.green if acceptance_prob >= 70 else (
688
- colors.orange if acceptance_prob >= 40 else colors.red)
689
-
690
- # --- FIX: Call .hexval() as a method ---
691
- story.append(Paragraph(
692
- f"<font size='12' color='{prob_color.hexval()}'><b>Estimated Acceptance Probability: {acceptance_prob:.2f}%</b></font>",
693
- ParagraphStyle(name='AcceptanceProbability', parent=styles['Normal'], fontSize=12, spaceAfter=10,
694
- alignment=1)
695
- ))
696
- # --- End FIX ---
697
-
698
- if acceptance_prob >= 80:
699
- story.append(
700
- Paragraph("This indicates a very strong candidate with high potential. Well done!", body_text))
701
- elif acceptance_prob >= 50:
702
- story.append(Paragraph(
703
- "This candidate shows solid potential but has areas for improvement to become an even stronger fit.",
704
- body_text))
705
- else:
706
- story.append(Paragraph(
707
- "This candidate may require significant development or may not be the ideal fit at this time.",
708
- body_text))
709
- story.append(Spacer(1, 0.3 * inch))
710
- # --- End Acceptance Probability ---
711
-
712
- # Parse Gemini's report into sections for better PDF structuring
713
- sections = {}
714
- current_section = None
715
- # Use regex to robustly identify sections, especially with varied bullet points
716
- section_patterns = {
717
- r'^\s*\*\*\s*1\.\s*Executive Summary\s*\*\*': 'Executive Summary',
718
- r'^\s*\*\*\s*2\.\s*Voice Analysis Insights\s*\*\*': 'Voice Analysis Insights',
719
- r'^\s*\*\*\s*3\.\s*Content Analysis & Strengths/Areas for Development\s*\*\*': 'Content Analysis & Strengths/Areas for Development',
720
- r'^\s*\*\*\s*4\.\s*Actionable Recommendations\s*\*\*': 'Actionable Recommendations'
721
- }
722
-
723
- for line in gemini_report_text.split('\n'):
724
- matched_section = False
725
- for pattern, section_name in section_patterns.items():
726
- if re.match(pattern, line):
727
- current_section = section_name
728
- sections[current_section] = []
729
- matched_section = True
730
- break
731
- if not matched_section and current_section:
732
- sections[current_section].append(line)
733
-
734
- # 1. Executive Summary
735
- story.append(Paragraph("1. Executive Summary", h2))
736
- story.append(Spacer(1, 0.1 * inch))
737
- if 'Executive Summary' in sections:
738
- for line in sections['Executive Summary']:
739
- if line.strip():
740
- story.append(Paragraph(line.strip(), body_text))
741
- story.append(Spacer(1, 0.2 * inch))
742
-
743
- # 2. Voice Analysis (Detailed - using Table for summary)
744
- story.append(Paragraph("2. Voice Analysis", h2))
745
- voice_analysis = analysis_data.get('voice_analysis', {})
746
-
747
- if voice_analysis and 'error' not in voice_analysis:
748
- # Voice Analysis Summary Table
749
- table_data = [
750
- ['Metric', 'Value', 'Interpretation'],
751
- ['Speaking Rate', f"{voice_analysis['speaking_rate']:.2f} words/sec", 'Average rate'],
752
- ['Filler Words', f"{voice_analysis['filler_ratio'] * 100:.1f}%", 'Percentage of total words'],
753
- ['Repetition Score', f"{voice_analysis['repetition_score']:.3f}", 'Lower is better articulation'],
754
- ['Anxiety Level', voice_analysis['interpretation']['anxiety_level'].upper(),
755
- f"Score: {voice_analysis['composite_scores']['anxiety']:.3f}"],
756
- ['Confidence Level', voice_analysis['interpretation']['confidence_level'].upper(),
757
- f"Score: {voice_analysis['composite_scores']['confidence']:.3f}"],
758
- ['Fluency', voice_analysis['interpretation']['fluency_level'].upper(), 'Overall speech flow']
759
- ]
760
-
761
- table_style = TableStyle([
762
- ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#6699CC')),
763
- ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
764
- ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
765
- ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
766
- ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
767
- ('BACKGROUND', (0, 1), (-1, -1), colors.HexColor('#EFEFEF')),
768
- ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#CCCCCC')),
769
- ('LEFTPADDING', (0, 0), (-1, -1), 6),
770
- ('RIGHTPADDING', (0, 0), (-1, -1), 6),
771
- ('TOPPADDING', (0, 0), (-1, -1), 6),
772
- ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
773
- ])
774
-
775
- table = Table(table_data)
776
- table.setStyle(table_style)
777
- story.append(table)
778
- story.append(Spacer(1, 0.2 * inch))
779
-
780
- # --- Charts ---
781
- story.append(Paragraph("Score Visualization:", h3))
782
- # chart_path = os.path.join(OUTPUT_DIR, f"anxiety_confidence_{uuid.uuid4().hex[:8]}.png") # Removed from here
783
- # --- FIX: Generate chart in memory (BytesIO) ---
784
- chart_buffer = io.BytesIO() # Create in-memory buffer
785
- try:
786
- generate_anxiety_confidence_chart(voice_analysis['composite_scores'], chart_buffer) # Pass buffer instead of path
787
- chart_buffer.seek(0) # Rewind the buffer to the beginning
788
- img = Image(chart_buffer, width=3.5*inch, height=2.0*inch) # Load image from buffer
789
- story.append(img)
790
- story.append(Spacer(1, 0.1 * inch))
791
- except NameError:
792
- logger.warning("Chart generation function 'generate_anxiety_confidence_chart' is not defined. Skipping chart.")
793
- except Exception as chart_e:
794
- logger.warning(f"Could not add chart image to PDF: {chart_e}. Please check matplotlib installation.")
795
- # --- End FIX ---
796
- # --- End Charts ---
797
-
798
- # Detailed Interpretation from Gemini (if present)
799
- if 'Voice Analysis Insights' in sections:
800
- story.append(Paragraph("Detailed Interpretation:", h3))
801
- for line in sections['Voice Analysis Insights']:
802
- if line.strip():
803
- # Handle numbered lists from Gemini
804
- if re.match(r'^\d+\.\s', line.strip()):
805
- story.append(
806
- Paragraph(line.strip(), bullet_style))
807
- else:
808
- story.append(Paragraph(line.strip(), body_text))
809
- story.append(Spacer(1, 0.2 * inch))
810
-
811
- else:
812
- story.append(Paragraph("Voice analysis not available or encountered an error.", body_text))
813
- story.append(Spacer(1, 0.3 * inch))
814
-
815
- # 3. Content Analysis
816
- story.append(Paragraph("3. Content Analysis", h2))
817
- if 'Content Analysis & Strengths/Areas for Development' in sections:
818
- for line in sections['Content Analysis & Strengths/Areas for Development']:
819
- if line.strip():
820
- # Handle bullet points from Gemini
821
- if line.strip().startswith('-'):
822
- story.append(Paragraph(line.strip()[1:].strip(), bullet_style)) # Remove the '-' and strip
823
- else:
824
- story.append(Paragraph(line.strip(), body_text))
825
- story.append(Spacer(1, 0.2 * inch))
826
-
827
- # Add some interviewee responses to the report (can be formatted as a list)
828
- story.append(Paragraph("Key Interviewee Responses (Contextual):", h3))
829
- interviewee_responses = [
830
- f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
831
- for u in analysis_data['transcript']
832
- if u['role'] == 'Interviewee'
833
- ][:5]
834
- for res in interviewee_responses:
835
- story.append(Paragraph(res, bullet_style))
836
- story.append(Spacer(1, 0.3 * inch))
837
-
838
- # 4. Recommendations
839
- story.append(Paragraph("4. Recommendations", h2))
840
- if 'Actionable Recommendations' in sections:
841
- for line in sections['Actionable Recommendations']:
842
- if line.strip():
843
- # Handle bullet points from Gemini
844
- if line.strip().startswith('-'):
845
- story.append(Paragraph(line.strip()[1:].strip(), bullet_style)) # Remove the '-' and strip
846
- else:
847
- story.append(Paragraph(line.strip(), body_text))
848
- story.append(Spacer(1, 0.2 * inch))
849
-
850
- # Footer Text
851
- story.append(Spacer(1, 0.5 * inch))
852
- story.append(Paragraph("--- Analysis by EvalBot ---", ParagraphStyle(
853
- name='FooterText', parent=styles['Normal'], fontSize=8, alignment=1, textColor=colors.HexColor('#666666')
854
- )))
855
-
856
- doc.build(story)
857
- return True
858
- except Exception as e:
859
- logger.error(f"PDF creation failed: {str(e)}", exc_info=True)
860
- return False
861
-
862
-
863
def convert_to_serializable(obj):
    """Recursively replace numpy scalars/arrays in *obj* with plain Python types.

    Dicts and lists are walked recursively; numpy scalars become their
    ``.item()`` value, arrays become nested lists, everything else is
    returned unchanged (so the result is json-serializable).
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    return obj
873
-
874
-
875
def process_interview(audio_path: str):
    """Run the end-to-end interview analysis pipeline on one audio file.

    Pipeline: convert to WAV -> transcribe -> per-utterance prosodic
    features -> speaker identification -> role classification ->
    interviewee voice analysis -> acceptance probability -> Gemini
    report text -> PDF report + JSON dump.

    Args:
        audio_path: Path to the uploaded audio file.

    Returns:
        dict with 'pdf_path' and 'json_path' of the generated artifacts
        (both written under OUTPUT_DIR).

    Raises:
        Re-raises any pipeline exception after logging it.
    """
    wav_file = None
    try:
        logger.info(f"Starting processing for {audio_path}")

        # NOTE(review): assumes convert_to_wav returns a *new* temp file —
        # confirm it never returns audio_path itself, since the finally
        # block below deletes wav_file.
        wav_file = convert_to_wav(audio_path)

        logger.info("Starting transcription")
        transcript = transcribe(wav_file)

        logger.info("Extracting prosodic features")
        for utterance in transcript['utterances']:
            utterance['prosodic_features'] = extract_prosodic_features(
                wav_file,
                utterance['start'],
                utterance['end']
            )

        logger.info("Identifying speakers")
        utterances_with_speakers = identify_speakers(transcript, wav_file)

        logger.info("Classifying roles")
        # Reuse persisted classifier artifacts when available; otherwise
        # train them from this interview's utterances.
        # NOTE(review): load/train here is not guarded against concurrent
        # calls — confirm single-worker usage.
        clf_path = os.path.join(OUTPUT_DIR, 'role_classifier.pkl')
        if os.path.exists(clf_path):
            clf = joblib.load(clf_path)
            vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
            scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
        else:
            clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)

        classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)

        logger.info("Analyzing interviewee voice")
        voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)

        analysis_data = {
            'transcript': classified_utterances,
            'speakers': list(set(u['speaker'] for u in classified_utterances)),
            'voice_analysis': voice_analysis,
            'text_analysis': {
                'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
                'speaker_turns': len(classified_utterances)
            }
        }

        # Overall acceptance probability derived from the full analysis.
        analysis_data['acceptance_probability'] = calculate_acceptance_probability(analysis_data)

        logger.info("Generating report text using Gemini")
        gemini_report_text = generate_report(analysis_data)

        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
        create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text)

        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
        with open(json_path, 'w') as f:
            json.dump(convert_to_serializable(analysis_data), f, indent=2)

        logger.info(f"Processing completed for {audio_path}")
        return {
            'pdf_path': pdf_path,
            'json_path': json_path
        }
    except Exception as e:
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        raise
    finally:
        # Single cleanup path for the intermediate WAV. The original
        # duplicated this removal in both the success path and the
        # except path (via a fragile "'wav_file' in locals()" check).
        if wav_file and os.path.exists(wav_file):
            os.remove(wav_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import uuid
import shutil
import json
import requests
import logging
from fastapi import FastAPI, HTTPException, Body
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from pydantic import BaseModel, HttpUrl
from process_interview import process_interview  # Assuming process_interview is in a separate file

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("EvalBot-Audio-Processor")

# Initialize FastAPI app
app = FastAPI()

# Directory layout — everything under STATIC_DIR is publicly servable.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEMP_DIR = os.path.join(BASE_DIR, "temp_files")
STATIC_DIR = os.path.join(BASE_DIR, "static")
OUTPUT_DIR = os.path.join(STATIC_DIR, "outputs")  # Outputs live inside static so they can be served
JSON_DIR = os.path.join(OUTPUT_DIR, "json")
PDF_DIR = os.path.join(OUTPUT_DIR, "pdf")

# Make sure all working directories exist up front.
for _dir in (TEMP_DIR, JSON_DIR, PDF_DIR):
    os.makedirs(_dir, exist_ok=True)

# Expose the static tree at the /static URL prefix.
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")

# Upload constraints.
VALID_EXTENSIONS = ('.wav', '.mp3', '.m4a', '.flac')
MAX_FILE_SIZE_MB = 300

# Public base URL of this deployment (no /static or other subpath),
# e.g. a Hugging Face Space or ngrok URL. Defaults to local testing.
BASE_URL = os.getenv("BASE_URL", "http://localhost:7860")
44
+
45
+ # Pydantic Models for Request/Response validation
46
class ProcessResponse(BaseModel):
    """Payload returned by /process-audio: a text summary plus artifact URLs."""
    summary: str
    json_url: str
    pdf_url: str


class ProcessAudioRequest(BaseModel):
    """Payload accepted by /process-audio."""
    file_url: HttpUrl  # URL of the audio file to process
    user_id: str  # Identifier for the user submitting the audio
56
+
57
+ # Helper Functions
58
def download_file(file_url: str, dest_path: str):
    """Download *file_url* to *dest_path*, streaming in 8 KiB chunks.

    Args:
        file_url: Source URL.
        dest_path: Local destination path; parent directories are created.

    Raises:
        HTTPException(400) on any network/HTTP failure.
        HTTPException(500) on unexpected local errors.
    """
    logger.info(f"Attempting to download file from {file_url}")
    try:
        resp = requests.get(file_url, stream=True, timeout=60)  # Increased timeout
        resp.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

        # Ensure the destination directory exists
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)

        with open(dest_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                if chunk:  # Filter out keep-alive new chunks
                    f.write(chunk)
        logger.info(f"File downloaded successfully to {dest_path}")
    except requests.exceptions.RequestException as e:
        # Don't leave a partially written file behind on failure.
        if os.path.exists(dest_path):
            os.remove(dest_path)
        logger.error(f"Error downloading file from {file_url}: {e}")
        raise HTTPException(status_code=400, detail=f"Failed to download file from URL: {e}")
    except Exception as e:
        if os.path.exists(dest_path):
            os.remove(dest_path)
        logger.error(f"Unexpected error during file download: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error during file download")
79
+
80
def validate_file_size(file_path: str):
    """Reject (and delete) a file whose size exceeds MAX_FILE_SIZE_MB.

    Raises:
        HTTPException(400) when the file is too large; the oversized
        file is removed first.
    """
    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    if file_size_mb <= MAX_FILE_SIZE_MB:
        return
    logger.warning(f"File too large: {file_size_mb:.2f} MB. Max allowed: {MAX_FILE_SIZE_MB} MB")
    os.remove(file_path)  # Clean up the oversized file
    raise HTTPException(status_code=400, detail=f"File too large: {file_size_mb:.2f} MB. Max size: {MAX_FILE_SIZE_MB} MB")
87
+
88
def generate_public_url(full_local_path: str) -> str:
    """Map a file stored under STATIC_DIR to its public URL.

    The result is BASE_URL + "/static/" + the path relative to
    STATIC_DIR, e.g. .../static/outputs/json/file.json.
    """
    # Path relative to the static root, then normalized to forward
    # slashes so the URL is valid regardless of host OS (Windows uses
    # backslashes in os.path.relpath output).
    rel = os.path.relpath(full_local_path, STATIC_DIR)
    return f"{BASE_URL}/static/{rel.replace(os.path.sep, '/')}"
104
+
105
+ # Main API Endpoint
106
@app.post("/process-audio", response_model=ProcessResponse)
async def process_audio(request: ProcessAudioRequest = Body(...)):
    """Process an audio file fetched from a URL.

    Downloads the audio, runs the interview-analysis pipeline, copies
    the resulting JSON/PDF into the servable static outputs tree, and
    returns a short text summary plus public URLs for both artifacts.
    """
    file_url = str(request.file_url)
    user_id = request.user_id

    # Validate the extension against the URL *path* only. The original
    # ran splitext on the raw URL, which mis-detects the extension when
    # the URL carries a query string or fragment (e.g. ...file.wav?token=x).
    from urllib.parse import urlparse
    file_ext = os.path.splitext(urlparse(file_url).path)[1].lower()
    if file_ext not in VALID_EXTENSIONS:
        logger.error(f"Invalid file extension: {file_ext}. Supported: {VALID_EXTENSIONS}")
        raise HTTPException(status_code=400, detail=f"Invalid file extension: {file_ext}. Supported formats: {', '.join(VALID_EXTENSIONS)}")

    # Unique temp path so concurrent requests never collide.
    temp_filename = f"{user_id}_{uuid.uuid4().hex}{file_ext}"
    temp_path = os.path.join(TEMP_DIR, temp_filename)

    try:
        # 1. Download the audio and validate its size.
        download_file(file_url, temp_path)
        validate_file_size(temp_path)

        logger.info(f"Starting interview processing for user: {user_id} from {temp_path}")

        # 2. Run the analysis pipeline; it returns local JSON/PDF paths.
        result = process_interview(temp_path)
        if not result:
            logger.error(f"process_interview returned no result for {user_id}")
            raise HTTPException(status_code=500, detail="Audio processing failed: No result from analysis pipeline.")

        # 3. Copy outputs into the servable static tree under unique names.
        json_dest_path = os.path.join(JSON_DIR, f"{user_id}_{uuid.uuid4().hex}.json")
        pdf_dest_path = os.path.join(PDF_DIR, f"{user_id}_{uuid.uuid4().hex}.pdf")
        shutil.copyfile(result['json_path'], json_dest_path)
        shutil.copyfile(result['pdf_path'], pdf_dest_path)
        logger.info(f"Analysis outputs copied to: {json_dest_path} and {pdf_dest_path}")

        # 4. Build the human-readable summary from the *copied* JSON.
        with open(json_dest_path, "r") as jf:
            analysis_data = json.load(jf)

        voice_interpretation = analysis_data.get('voice_analysis', {}).get('interpretation', {})
        speakers_list = analysis_data.get('speakers', [])
        total_duration = analysis_data.get('text_analysis', {}).get('total_duration', 0.0)

        summary_text = (
            f"User ID: {user_id}\n"
            f"Speakers: {', '.join(speakers_list)}\n"
            f"Duration: {total_duration:.2f} sec\n"
            f"Confidence: {voice_interpretation.get('confidence_level', 'N/A')}\n"
            f"Anxiety: {voice_interpretation.get('anxiety_level', 'N/A')}"
        )

        # 5. Public URLs for the copied artifacts.
        json_public_url = generate_public_url(json_dest_path)
        pdf_public_url = generate_public_url(pdf_dest_path)

        logger.info("Audio processing and URL generation completed successfully.")
        return ProcessResponse(summary=summary_text, json_url=json_public_url, pdf_url=pdf_public_url)

    except HTTPException:
        # Already carries the right status/detail — re-raise untouched.
        raise
    except Exception as e:
        logger.exception(f"Unexpected error during audio processing for user {user_id}: {e}")
        raise HTTPException(status_code=500, detail=f"Internal server error during processing: {e}")
    finally:
        # Always remove the temporary downloaded audio file.
        if os.path.exists(temp_path):
            os.remove(temp_path)
            logger.info(f"Cleaned up temporary file: {temp_path}")
189
+
190
+ # Routes to serve output files directly if needed (though /static mount handles this)
191
+ # These explicit routes are often redundant if /static mount is configured correctly,
192
+ # but can be useful for specific media types or debugging.
193
# The route path in SOURCE was the literal "/outputs/json/(unknown)" — the
# FastAPI path-parameter placeholder was mangled; restore "{filename}".
@app.get("/outputs/json/{filename}", response_class=FileResponse)
async def get_json_file(filename: str):
    """Serve a JSON analysis file from the outputs directory."""
    # Guard against path traversal — only bare filenames are valid.
    if os.path.basename(filename) != filename:
        raise HTTPException(status_code=404, detail="JSON file not found")
    file_path = os.path.join(JSON_DIR, filename)
    if not os.path.exists(file_path):
        raise HTTPException(status_code=404, detail="JSON file not found")
    return FileResponse(file_path, media_type="application/json", filename=filename)
200
+
201
# The route path in SOURCE was the literal "/outputs/pdf/(unknown)" — the
# FastAPI path-parameter placeholder was mangled; restore "{filename}".
@app.get("/outputs/pdf/{filename}", response_class=FileResponse)
async def get_pdf_file(filename: str):
    """Serve a PDF report file from the outputs directory."""
    # Guard against path traversal — only bare filenames are valid.
    if os.path.basename(filename) != filename:
        raise HTTPException(status_code=404, detail="PDF file not found")
    file_path = os.path.join(PDF_DIR, filename)
    if not os.path.exists(file_path):
        raise HTTPException(status_code=404, detail="PDF file not found")
    return FileResponse(file_path, media_type="application/pdf", filename=filename)