norhan12 committed
Commit 52e76bd · verified · parent: ebf10a8

Update process_interview.py

Files changed (1):
  1. process_interview.py +91 -99
process_interview.py CHANGED
@@ -1,6 +1,3 @@
-# ==============================================================================
-# 1. IMPORTS
-# ==============================================================================
 import os
 import torch
 import numpy as np
@@ -42,6 +39,7 @@ matplotlib.use('Agg')
 
 # Concurrency
 from concurrent.futures import ThreadPoolExecutor
+import joblib  # Added import
 
 # ==============================================================================
 # 2. CONFIGURATION AND INITIALIZATION
@@ -52,8 +50,11 @@ logging.getLogger("nemo_logging").setLevel(logging.ERROR)
 logging.getLogger("nemo").setLevel(logging.ERROR)
 logging.getLogger("transformers").setLevel(logging.ERROR)
 
-OUTPUT_DIR = "./processed_audio"
-os.makedirs(OUTPUT_DIR, exist_ok=True)
+OUTPUT_DIR = "./static/outputs"
+JSON_DIR = os.path.join(OUTPUT_DIR, "json")
+PDF_DIR = os.path.join(OUTPUT_DIR, "pdf")
+os.makedirs(JSON_DIR, exist_ok=True)
+os.makedirs(PDF_DIR, exist_ok=True)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 PINECONE_KEY = os.getenv("PINECONE_KEY")
@@ -65,7 +66,6 @@ if not all([PINECONE_KEY, ASSEMBLYAI_KEY, GEMINI_API_KEY]):
 # Global variables for models and services
 index, gemini_model, speaker_model, nlp, tokenizer, text_embedding_model = (None,) * 6
 
-
 def initialize_all_services_and_models():
     """Initializes all external services and loads all AI models into memory."""
     global index, gemini_model, speaker_model, nlp, tokenizer, text_embedding_model
@@ -85,10 +85,8 @@ def initialize_all_services_and_models():
     text_embedding_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device).eval()
     logger.info("All services and models are ready.")
 
-
 initialize_all_services_and_models()
 
-
 # ==============================================================================
 # 3. HELPER AND UTILITY FUNCTIONS
 # ==============================================================================
@@ -97,11 +95,11 @@ def temp_audio_file(suffix='.wav'):
     temp_file_path = None
     try:
         fd, temp_file_path = tempfile.mkstemp(suffix=suffix)
-        os.close(fd);
+        os.close(fd)
         yield temp_file_path
     finally:
-        if temp_file_path and os.path.exists(temp_file_path): os.remove(temp_file_path)
-
+        if temp_file_path and os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
 
 def convert_to_wav(input_path: str) -> str:
     temp_wav_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False).name
@@ -111,11 +109,11 @@ def convert_to_wav(input_path: str) -> str:
         subprocess.run(command, check=True, capture_output=True, text=True)
         return temp_wav_file
     except Exception as e:
-        if os.path.exists(temp_wav_file): os.remove(temp_wav_file)
-        logger.error(f"Audio conversion failed: {e}", exc_info=True);
+        if os.path.exists(temp_wav_file):
+            os.remove(temp_wav_file)
+        logger.error(f"Audio conversion failed: {e}", exc_info=True)
         raise
 
-
 def transcribe(audio_path: str) -> Dict:
     try:
         headers = {"authorization": ASSEMBLYAI_KEY}
@@ -131,21 +129,23 @@ def transcribe(audio_path: str) -> Dict:
         logger.info(f"Transcription submitted. Polling for results (ID: {transcript_id})...")
         while True:
             result = requests.get(f"https://api.assemblyai.com/v2/transcript/{transcript_id}", headers=headers).json()
-            if result['status'] == 'completed': return result
-            if result['status'] == 'error': raise Exception(f"Transcription failed: {result['error']}")
+            if result['status'] == 'completed':
+                return result
+            if result['status'] == 'error':
+                raise Exception(f"Transcription failed: {result['error']}")
             time.sleep(5)
     except Exception as e:
-        logger.error(f"Transcription failed: {e}", exc_info=True);
+        logger.error(f"Transcription failed: {e}", exc_info=True)
         raise
 
-
 def identify_speakers(transcript: Dict, wav_file_path: str) -> List[Dict]:
     try:
         full_audio = AudioSegment.from_wav(wav_file_path)
 
         def process_utterance(utterance):
             start_ms, end_ms = utterance['start'], utterance['end']
-            if end_ms - start_ms < 1000: return {**utterance, 'speaker_id': 'unknown_short_utterance'}
+            if end_ms - start_ms < 1000:
+                return {**utterance, 'speaker_id': 'unknown_short_utterance'}
             with temp_audio_file() as temp_path:
                 full_audio[start_ms:end_ms].export(temp_path, format="wav")
                 with torch.no_grad():
@@ -164,44 +164,44 @@ def identify_speakers(transcript: Dict, wav_file_path: str) -> List[Dict]:
         with ThreadPoolExecutor() as executor:
             return list(executor.map(process_utterance, transcript.get('utterances', [])))
     except Exception as e:
-        logger.error(f"Speaker identification failed: {e}", exc_info=True);
+        logger.error(f"Speaker identification failed: {e}", exc_info=True)
         raise
 
-
 def get_text_embedding(text: str) -> np.ndarray:
     with torch.no_grad():
         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True).to(device)
         outputs = text_embedding_model(**inputs)
         return outputs.last_hidden_state[0, 0, :].cpu().numpy()
 
-
 def extract_detailed_prosodic_features(audio_segment: AudioSegment) -> Dict:
     try:
         with temp_audio_file() as temp_path:
             audio_segment.export(temp_path, format="wav")
             y, sr = librosa.load(temp_path, sr=16000)
-            if len(y) == 0: return {'pitch_std': 0}
+            if len(y) == 0:
+                return {'pitch_std': 0}
             f0, _, _ = librosa.pyin(y, fmin=80, fmax=400, sr=sr)
             f0_values = f0[~np.isnan(f0)]
             return {'pitch_std': float(np.std(f0_values)) if len(f0_values) > 1 else 0}
     except Exception:
         return {'pitch_std': 0}
 
-
 def extract_duration_feature(utterances: List[Dict]) -> List[Dict]:
     for u in utterances:
         u['prosodic_features'] = {'duration': (u['end'] - u['start']) / 1000.0}
     return utterances
 
-
 def convert_to_serializable(obj):
-    if isinstance(obj, (np.integer, np.floating)): return obj.item()
-    if isinstance(obj, np.ndarray): return obj.tolist()
-    if isinstance(obj, dict): return {k: convert_to_serializable(v) for k, v in obj.items()}
-    if isinstance(obj, list): return [convert_to_serializable(item) for item in obj]
+    if isinstance(obj, (np.integer, np.floating)):
+        return obj.item()
+    if isinstance(obj, np.ndarray):
+        return obj.tolist()
+    if isinstance(obj, dict):
+        return {k: convert_to_serializable(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [convert_to_serializable(item) for item in obj]
     return obj
 
-
 # ==============================================================================
 # 4. CORE LOGIC - ULTIMATE ROLE CLASSIFIER
 # ==============================================================================
@@ -209,42 +209,45 @@ def classify_roles_ultimate(utterances: List[Dict], audio_path: str) -> List[Dict]:
     logger.info("Starting ULTIMATE role classification with prosodic analysis...")
     full_audio = AudioSegment.from_wav(audio_path)
     speakers = {u['speaker_id'] for u in utterances if 'speaker_id' in u and not u['speaker_id'].startswith('unknown')}
-    if len(speakers) < 2: return utterances
-    speaker_data = {sid: {'rule_score': 0, 'prosodic_score': 0, 'utterance_count': 0, 'embeddings': []} for sid in
-                    speakers}
+    if len(speakers) < 2:
+        return utterances
+    speaker_data = {sid: {'rule_score': 0, 'prosodic_score': 0, 'utterance_count': 0, 'embeddings': []} for sid in speakers}
     interviewer_keywords = r'\b(what|why|how|when|where|who|which|tell me about|can you explain|describe|give me an example)\b'
     for u in utterances:
         sid, text = u.get('speaker_id'), u.get('text', '').lower()
-        if sid not in speaker_data or not text: continue
-        rule_score = 10 if text.endswith('?') else 0;
+        if sid not in speaker_data or not text:
+            continue
+        rule_score = 10 if text.endswith('?') else 0
         rule_score += 5 * len(re.findall(interviewer_keywords, text))
         rule_score += 2 if len(text.split()) < 10 else -5 if len(text.split()) > 30 else 0
         speaker_data[sid]['rule_score'] += rule_score
-        segment = full_audio[u['start']:u['end']];
+        segment = full_audio[u['start']:u['end']]
         prosodic_features = extract_detailed_prosodic_features(segment)
         speaker_data[sid]['prosodic_score'] += -5 if prosodic_features['pitch_std'] > 40 else 2
-        speaker_data[sid]['embeddings'].append(get_text_embedding(u['text']));
+        speaker_data[sid]['embeddings'].append(get_text_embedding(u['text']))
         speaker_data[sid]['utterance_count'] += 1
     canonical_question_embedding = get_text_embedding("Tell me about your experience and skills.")
     for sid, data in speaker_data.items():
-        if not data['embeddings']: data['semantic_score'] = 0; continue
+        if not data['embeddings']:
+            data['semantic_score'] = 0
+            continue
         avg_embedding = np.mean(data['embeddings'], axis=0).reshape(1, -1)
         data['semantic_score'] = cosine_similarity(avg_embedding, canonical_question_embedding.reshape(1, -1))[0][0]
     final_scores = {}
     for sid, data in speaker_data.items():
-        if data['utterance_count'] == 0: final_scores[sid] = -999; continue
-        avg_rule_score = data['rule_score'] / data['utterance_count'];
+        if data['utterance_count'] == 0:
+            final_scores[sid] = -999
+            continue
+        avg_rule_score = data['rule_score'] / data['utterance_count']
         avg_prosodic_score = data['prosodic_score'] / data['utterance_count']
         final_scores[sid] = (avg_rule_score * 0.5) + (data['semantic_score'] * 0.3) + (avg_prosodic_score * 0.2)
     sorted_speakers = sorted(final_scores.items(), key=lambda item: item[1], reverse=True)
     interviewer_id, interviewee_id = sorted_speakers[0][0], sorted_speakers[1][0]
     logger.info(f"Ultimate Role Classification: Interviewer -> {interviewer_id}, Interviewee -> {interviewee_id}")
     for u in utterances:
-        u['role'] = 'Interviewer' if u.get('speaker_id') == interviewer_id else 'Interviewee' if u.get(
-            'speaker_id') == interviewee_id else 'Unknown'
+        u['role'] = 'Interviewer' if u.get('speaker_id') == interviewer_id else 'Interviewee' if u.get('speaker_id') == interviewee_id else 'Unknown'
     return utterances
 
-
 # ==============================================================================
 # 5. YOUR CUSTOM ANALYSIS & REPORTING FUNCTIONS
 # ==============================================================================
@@ -253,29 +256,29 @@ def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
    try:
        y, sr = librosa.load(audio_path, sr=16000)
        interviewee_utterances = [u for u in utterances if u.get('role') == 'Interviewee']
-        if not interviewee_utterances: return {'error': 'No interviewee utterances found'}
+        if not interviewee_utterances:
+            return {'error': 'No interviewee utterances found'}
        segments = [y[int(u['start'] * sr / 1000):int(u['end'] * sr / 1000)] for u in interviewee_utterances]
-        if not segments: return {'error': 'No valid interviewee segments to analyze.'}
+        if not segments:
+            return {'error': 'No valid interviewee segments to analyze.'}
        combined_audio = np.concatenate(segments)
        total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
        total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
        speaking_rate = total_words / total_duration if total_duration > 0 else 0
-        filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean'];
+        filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
        filler_count = sum(sum(u['text'].lower().count(fw) for fw in filler_words) for u in interviewee_utterances)
        filler_ratio = filler_count / total_words if total_words > 0 else 0
        all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
-        word_counts = {tuple(all_words[i:i + 2]): all_words.count(tuple(all_words[i:i + 2])) for i in
-                       range(len(all_words) - 1)}
-        repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(
-            word_counts) if word_counts else 0
+        word_counts = {tuple(all_words[i:i + 2]): all_words.count(tuple(all_words[i:i + 2])) for i in range(len(all_words) - 1)}
+        repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(word_counts) if word_counts else 0
        f0, voiced_flag, _ = librosa.pyin(combined_audio, fmin=80, fmax=300, sr=sr)
-        f0_values = f0[voiced_flag & ~np.isnan(f0)];
+        f0_values = f0[voiced_flag & ~np.isnan(f0)]
        pitch_mean = np.mean(f0_values) if len(f0_values) > 0 else 0
-        pitch_std = np.std(f0_values) if len(f0_values) > 0 else 0;
+        pitch_std = np.std(f0_values) if len(f0_values) > 0 else 0
        jitter = np.mean(np.abs(np.diff(f0_values))) / pitch_mean if len(f0_values) > 1 and pitch_mean > 0 else 0
-        rms = librosa.feature.rms(y=combined_audio)[0];
+        rms = librosa.feature.rms(y=combined_audio)[0]
        intensity_mean = np.mean(rms) if len(rms) > 0 else 0
-        intensity_std = np.std(rms) if len(rms) > 0 else 0;
+        intensity_std = np.std(rms) if len(rms) > 0 else 0
        shimmer = np.mean(np.abs(np.diff(rms))) / intensity_mean if len(rms) > 1 and intensity_mean > 0 else 0
        anxiety_score = 0.6 * (pitch_std / pitch_mean if pitch_mean > 0 else 0) + 0.4 * (jitter + shimmer)
        confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
@@ -288,14 +291,14 @@ def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
                'composite_scores': {'anxiety': float(anxiety_score), 'confidence': float(confidence_score),
                                     'hesitation': float(hesitation_score)}}
    except Exception as e:
-        logger.error(f"Error in detailed voice analysis: {e}", exc_info=True);
+        logger.error(f"Error in detailed voice analysis: {e}", exc_info=True)
        return {'error': str(e)}
 
-
 def generate_voice_interpretation(analysis: Dict) -> str:
-    if 'error' in analysis: return "<b>Detailed Vocal Metrics:</b><br/>Analysis not available."
-    scores = analysis.get('composite_scores', {});
-    pitch = analysis.get('pitch_analysis', {});
+    if 'error' in analysis:
+        return "<b>Detailed Vocal Metrics:</b><br/>Analysis not available."
+    scores = analysis.get('composite_scores', {})
+    pitch = analysis.get('pitch_analysis', {})
    intensity = analysis.get('intensity_analysis', {})
    return (f"<b>Detailed Vocal Metrics Interpretation:</b><br/>"
            f"- Speaking Rate: {analysis.get('speaking_rate', 0):.2f} words/sec<br/>"
@@ -309,38 +312,37 @@ def generate_voice_interpretation(analysis: Dict) -> str:
            f"- <b>Confidence Score:</b> {scores.get('confidence', 0):.3f}<br/>"
            f"- <b>Hesitation Score:</b> {scores.get('hesitation', 0):.3f}")
 
-
 def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
    try:
-        labels = ['Anxiety', 'Confidence', 'Hesitation'];
+        labels = ['Anxiety', 'Confidence', 'Hesitation']
        scores = [composite_scores.get(k.lower(), 0) for k in labels]
-        fig, ax = plt.subplots(figsize=(6, 4));
+        fig, ax = plt.subplots(figsize=(6, 4))
        ax.bar(labels, scores, color=['#FF6B6B', '#4ECDC4', '#FFA500'], edgecolor='black', width=0.5)
-        ax.set_ylabel('Score');
-        ax.set_title('Candidate Vocal Dynamics');
+        ax.set_ylabel('Score')
+        ax.set_title('Candidate Vocal Dynamics')
        ax.set_ylim(0, max(scores) * 1.2 if scores and max(scores) > 0 else 1)
-        for bar in ax.patches: ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
-                                       f"{bar.get_height():.2f}", ha='center', color='black')
-        plt.tight_layout();
-        plt.savefig(chart_path_or_buffer, format='png', dpi=150);
+        for bar in ax.patches:
+            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
+                    f"{bar.get_height():.2f}", ha='center', color='black')
+        plt.tight_layout()
+        plt.savefig(chart_path_or_buffer, format='png', dpi=150)
        plt.close(fig)
    except Exception as e:
        logger.error(f"Error generating chart: {e}")
 
-
 def calculate_acceptance_probability(analysis_data: Dict) -> float:
    logger.info("Calculating final acceptance probability...")
    voice_metrics = analysis_data.get('voice_analysis_metrics', {})
-    if 'error' in voice_metrics or not voice_metrics.get('composite_scores'): return 30.0
-    scores = voice_metrics['composite_scores'];
-    confidence = scores.get('confidence', 0.5);
-    anxiety = scores.get('anxiety', 0.5);
+    if 'error' in voice_metrics or not voice_metrics.get('composite_scores'):
+        return 30.0
+    scores = voice_metrics['composite_scores']
+    confidence = scores.get('confidence', 0.5)
+    anxiety = scores.get('anxiety', 0.5)
    hesitation = scores.get('hesitation', 0.5)
    raw_score = (confidence * 0.6) + ((1 - anxiety) * 0.2) + ((1 - hesitation) * 0.2)
    max_score = 0.6 + 0.2 + 0.2
    return round(max(10.0, min(99.0, (raw_score / max_score if max_score > 0 else 0) * 100)), 2)
 
-
 # ==============================================================================
 # 6. AI-POWERED NARRATIVE AND PDF REPORTING
 # ==============================================================================
@@ -348,25 +350,20 @@ def generate_gemini_report_text(analysis_data: Dict) -> str:
    """Generates a comprehensive narrative report using the Gemini model, based on your prompt structure."""
    logger.info("Generating AI-powered narrative report with Gemini...")
    voice = analysis_data.get('voice_analysis_metrics', {})
-    interviewee_text = "\n".join(
-        [f"- {u['text']}" for u in analysis_data['transcript_with_roles'] if u.get('role') == 'Interviewee'])
+    interviewee_text = "\n".join([f"- {u['text']}" for u in analysis_data['transcript_with_roles'] if u.get('role') == 'Interviewee'])
    acceptance_prob = analysis_data.get('acceptance_probability', 50.0)
 
    prompt = f"""
    You are EvalBot, a highly experienced senior HR analyst generating a comprehensive interview evaluation report.
    Analyze deeply based on actual responses provided below. Avoid generic analysis.
    Maintain professional, HR-standard language with clear structure and bullet points.
-
    **Suitability Score: {acceptance_prob:.2f}%**
-
    ### Interviewee Full Responses:
    {interviewee_text if interviewee_text else "No responses recorded."}
-
    ### Key Metrics:
    - Confidence Score: {voice.get('composite_scores', {}).get('confidence', 'N/A'):.2f}
    - Anxiety Score: {voice.get('composite_scores', {}).get('anxiety', 'N/A'):.2f}
    - Speaking Rate: {voice.get('speaking_rate', 'N/A')} words/sec
-
    ### Report Sections to Generate (Follow this structure exactly):
    **1. Executive Summary:**
    - 3 bullets summarizing performance, key strengths, and hiring recommendation.
@@ -381,13 +378,12 @@ def generate_gemini_report_text(analysis_data: Dict) -> str:
    - Provide 5 actionable recommendations and 5 clear next steps.
    """
    try:
-        response = gemini_model.generate_content(prompt);
+        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
-        logger.error(f"Gemini report generation failed: {e}");
+        logger.error(f"Gemini report generation failed: {e}")
        return "Error: Could not generate AI analysis report."
 
-
 def create_pdf_report(analysis_data: Dict, output_path: str):
    """Generates a detailed, professional PDF report including all analysis sections, based on your structure."""
    logger.info(f"Generating comprehensive PDF report at {output_path}...")
@@ -397,6 +393,8 @@ def create_pdf_report(analysis_data: Dict, output_path: str):
                              fontName='Helvetica-Bold', alignment=TA_CENTER))
    styles.add(ParagraphStyle(name='H2', fontSize=14, leading=18, spaceBefore=12, spaceAfter=8,
                              textColor=colors.HexColor('#0050BC'), fontName='Helvetica-Bold'))
+    styles.add(ParagraphStyle(name='H3', fontSize=12, leading=16, spaceBefore=10, spaceAfter=6,
+                              textColor=colors.HexColor('#333333'), fontName='Helvetica-Bold'))
    styles.add(ParagraphStyle(name='Body', fontSize=10, leading=14, spaceAfter=6, alignment=TA_JUSTIFY))
    story = []
 
@@ -405,10 +403,9 @@ def create_pdf_report(analysis_data: Dict, output_path: str):
    story.append(Spacer(1, 0.2 * inch))
    story.append(Paragraph(f"Candidate ID: {analysis_data.get('user_id', 'N/A')}", styles['Body']))
    story.append(Paragraph(f"Date of Analysis: {time.strftime('%B %d, %Y')}", styles['Body']))
-    prob = analysis_data.get('acceptance_probability', 0);
+    prob = analysis_data.get('acceptance_probability', 0)
    prob_color = 'green' if prob >= 75 else 'orange' if prob >= 50 else 'red'
-    story.append(
-        Paragraph(f"<b>Overall Suitability Score:</b> <font size=16 color='{prob_color}'>{prob}%</font>", styles['H2']))
+    story.append(Paragraph(f"<b>Overall Suitability Score:</b> <font size=16 color='{prob_color}'>{prob}%</font>", styles['H2']))
    story.append(PageBreak())
 
    # Quantitative Analysis Page
@@ -426,7 +423,8 @@ def create_pdf_report(analysis_data: Dict, output_path: str):
    gemini_text = analysis_data.get('gemini_report_text', 'Not available.')
    for line in gemini_text.split('\n'):
        line = line.strip()
-        if not line: continue
+        if not line:
+            continue
        if line.startswith('**') and line.endswith('**'):
            story.append(Paragraph(line.strip('*'), styles['H3']))
        elif line.startswith('- ') or line.startswith('* '):
@@ -437,13 +435,9 @@ def create_pdf_report(analysis_data: Dict, output_path: str):
    doc.build(story)
    logger.info("PDF report generated successfully.")
 
-
 # ==============================================================================
 # 7. MAIN PROCESSING PIPELINE
 # ==============================================================================
-import joblib  # Added import
-import io
-
 def process_interview(audio_path: str, user_id: str = "candidate-123") -> Dict:
    try:
        logger.info(f"Starting processing for {audio_path} (User ID: {user_id})")
@@ -492,14 +486,13 @@ def process_interview(audio_path: str, user_id: str = "candidate-123") -> Dict:
 
        logger.info("Generating report text using Gemini")
        gemini_report_text = generate_gemini_report_text(analysis_data)
+        analysis_data['gemini_report_text'] = gemini_report_text  # Add to analysis_data
 
-        base_name = f"{user_id}_{os.path.splitext(os.path.basename(audio_path))[0].split('_', 1)[1]}"
-        pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
-        if not create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text):
-            logger.error(f"Failed to create PDF report: {pdf_path}")
-            raise RuntimeError("PDF report generation failed")
+        base_name = f"{user_id}_{os.path.splitext(os.path.basename(audio_path))[0].rsplit('_', 1)[-1]}"
+        pdf_path = os.path.join(PDF_DIR, f"{base_name}_report.pdf")
+        create_pdf_report(analysis_data, pdf_path)
 
-        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
+        json_path = os.path.join(JSON_DIR, f"{base_name}_analysis.json")
        with open(json_path, 'w') as f:
            logger.debug(f"Serializing analysis_data with keys: {list(analysis_data.keys())}")
            serializable_data = convert_to_serializable(analysis_data)
@@ -516,5 +509,4 @@ def process_interview(audio_path: str, user_id: str = "candidate-123") -> Dict:
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        if 'wav_file' in locals() and os.path.exists(wav_file):
            os.remove(wav_file)
-        raise
-
+        raise
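
The most visible behavioral change above is in output naming and layout. A minimal sketch of the new logic, assuming a hypothetical upload named candidate-123_recording_17.wav (real filenames depend on the calling app):

import os

# Mirrors the new base_name logic in process_interview():
# rsplit('_', 1)[-1] keeps only the part after the LAST underscore,
# whereas the old split('_', 1)[1] kept everything after the first one
# (and raised IndexError when the stem contained no underscore at all).
audio_path = "/tmp/candidate-123_recording_17.wav"  # hypothetical upload
user_id = "candidate-123"

stem = os.path.splitext(os.path.basename(audio_path))[0]  # "candidate-123_recording_17"
base_name = f"{user_id}_{stem.rsplit('_', 1)[-1]}"        # "candidate-123_17"

# New layout introduced by this commit: JSON and PDF go to separate subfolders.
pdf_path = os.path.join("./static/outputs/pdf", f"{base_name}_report.pdf")
json_path = os.path.join("./static/outputs/json", f"{base_name}_analysis.json")
print(pdf_path)   # ./static/outputs/pdf/candidate-123_17_report.pdf
print(json_path)  # ./static/outputs/json/candidate-123_17_analysis.json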
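
For reference, a minimal driver sketch for the updated pipeline. Importing the file runs initialize_all_services_and_models() at import time, so the three API keys must be set and the model downloads must succeed first; the module name, key placeholders, and audio path below are assumptions, not part of this commit:

import os

# Set real credentials before importing; placeholders shown only for shape.
os.environ.setdefault("PINECONE_KEY", "<pinecone-key>")
os.environ.setdefault("ASSEMBLYAI_KEY", "<assemblyai-key>")
os.environ.setdefault("GEMINI_API_KEY", "<gemini-key>")

from process_interview import process_interview

# Writes ./static/outputs/pdf/<base>_report.pdf and
# ./static/outputs/json/<base>_analysis.json as side effects.
result = process_interview("uploads/candidate-123_demo_01.wav", user_id="candidate-123")
print(result)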