Spaces:

EvalBot
/

Audio

Sleeping

App Files Files Community

norhan12 committed on Jun 11, 2025

Commit

71e2e34

verified ·

1 Parent(s): 87066d1

Update process_interview.py

Browse files

Files changed (1) hide show

process_interview.py +241 -580

process_interview.py CHANGED Viewed

@@ -1,59 +1,77 @@
 import os
 import torch
 import numpy as np
-import uuid
 import requests
-import time
-import json
 from pydub import AudioSegment
-import wave
 from nemo.collections.asr.models import EncDecSpeakerLabelModel
 from pinecone import Pinecone, ServerlessSpec
-import librosa
-import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.preprocessing import StandardScaler
 from sklearn.feature_extraction.text import TfidfVectorizer
-import re
-from typing import Dict, List, Tuple
-import logging
-# --- Imports for enhanced PDF ---
 from reportlab.lib.pagesizes import letter
-from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 from reportlab.lib.units import inch
 from reportlab.lib import colors
-import matplotlib.pyplot as plt  # Uncomment if you want to add charts and have matplotlib installed
-from reportlab.platypus import Image  # Uncomment if you want to add charts and have reportlab.platypus.Image installed
-# --- End Imports for enhanced PDF ---
-from transformers import AutoTokenizer, AutoModel
-import spacy
-import google.generativeai as genai
-import joblib
-from concurrent.futures import ThreadPoolExecutor
-# Setup logging
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 logging.getLogger("nemo_logging").setLevel(logging.ERROR)
-# Configuration
-AUDIO_DIR = "./uploads"
 OUTPUT_DIR = "./processed_audio"
 os.makedirs(OUTPUT_DIR, exist_ok=True)
-# API Keys
 PINECONE_KEY = os.getenv("PINECONE_KEY")
 ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-# Initialize services
 def initialize_services():
     try:
         pc = Pinecone(api_key=PINECONE_KEY)
         index_name = "interview-speaker-embeddings"
         if index_name not in pc.list_indexes().names():
             pc.create_index(
                 name=index_name,
                 dimension=192,
@@ -61,236 +79,152 @@ def initialize_services():
                 spec=ServerlessSpec(cloud="aws", region="us-east-1")
             )
         index = pc.Index(index_name)
         genai.configure(api_key=GEMINI_API_KEY)
         gemini_model = genai.GenerativeModel('gemini-1.5-flash')
         return index, gemini_model
     except Exception as e:
         logger.error(f"Error initializing services: {str(e)}")
         raise
-index, gemini_model = initialize_services()
-# Device setup
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-logger.info(f"Using device: {device}")
-def load_speaker_model():
-    try:
-        import torch
-        torch.set_num_threads(5)
-        # -----------------------------------------------------------
-        # Change here: load the model directly from the Hugging Face Hub
-        # -----------------------------------------------------------
-        model = EncDecSpeakerLabelModel.from_pretrained(
-            "nvidia/speakerverification_en_titanet_large",
-            map_location=torch.device('cpu')
-        )
-        model.eval()
-        return model
-    except Exception as e:
-        logger.error(f"Model loading failed: {str(e)}")
-        raise RuntimeError("Could not load speaker verification model")
-# Load ML models
 def load_models():
-    speaker_model = load_speaker_model()
     nlp = spacy.load("en_core_web_sm")
-    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
-    llm_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
-    llm_model.eval()
-    return speaker_model, nlp, tokenizer, llm_model
-speaker_model, nlp, tokenizer, llm_model = load_models()
-# Audio processing functions
 def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
     try:
         audio = AudioSegment.from_file(audio_path)
-        if audio.channels > 1:
-            audio = audio.set_channels(1)
-        audio = audio.set_frame_rate(16000)
         wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
         audio.export(wav_file, format="wav")
         return wav_file
     except Exception as e:
-        logger.error(f"Audio conversion failed: {str(e)}")
         raise
-def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
-    try:
-        audio = AudioSegment.from_file(audio_path)
-        segment = audio[start_ms:end_ms]
-        temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
-        segment.export(temp_path, format="wav")
-        y, sr = librosa.load(temp_path, sr=16000)
-        pitches = librosa.piptrack(y=y, sr=sr)[0]
-        pitches = pitches[pitches > 0]
-        features = {
-            'duration': (end_ms - start_ms) / 1000,
-            'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
-            'min_pitch': float(np.min(pitches)) if len(pitches) > 0 else 0.0,
-            'max_pitch': float(np.max(pitches)) if len(pitches) > 0 else 0.0,
-            'pitch_sd': float(np.std(pitches)) if len(pitches) > 0 else 0.0,
-            'intensityMean': float(np.mean(librosa.feature.rms(y=y)[0])),
-            'intensityMin': float(np.min(librosa.feature.rms(y=y)[0])),
-            'intensityMax': float(np.max(librosa.feature.rms(y=y)[0])),
-            'intensitySD': float(np.std(librosa.feature.rms(y=y)[0])),
-        }
-        os.remove(temp_path)
-        return features
-    except Exception as e:
-        logger.error(f"Feature extraction failed: {str(e)}")
-        return {
-            'duration': (end_ms - start_ms) / 1000,
-            'mean_pitch': 0.0,
-            'min_pitch': 0.0,
-            'max_pitch': 0.0,
-            'pitch_sd': 0.0,
-            'intensityMean': 0.0,
-            'intensityMin': 0.0,
-            'intensityMax': 0.0,
-            'intensitySD': 0.0,
-        }
 def transcribe(audio_path: str) -> Dict:
     try:
         with open(audio_path, 'rb') as f:
-            upload_response = requests.post(
-                "https://api.assemblyai.com/v2/upload",
-                headers={"authorization": ASSEMBLYAI_KEY},
-                data=f
-            )
         audio_url = upload_response.json()['upload_url']
-        transcript_response = requests.post(
-            "https://api.assemblyai.com/v2/transcript",
-            headers={"authorization": ASSEMBLYAI_KEY},
-            json={
-                "audio_url": audio_url,
-                "speaker_labels": True,
-                "filter_profanity": True
-            }
-        )
         transcript_id = transcript_response.json()['id']
         while True:
-            result = requests.get(
-                f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
-                headers={"authorization": ASSEMBLYAI_KEY}
-            ).json()
             if result['status'] == 'completed':
                 return result
             elif result['status'] == 'error':
-                raise Exception(result['error'])
             time.sleep(5)
     except Exception as e:
-        logger.error(f"Transcription failed: {str(e)}")
         raise
-def process_utterance(utterance, full_audio, wav_file):
     try:
-        start = utterance['start']
-        end = utterance['end']
-        segment = full_audio[start:end]
-        temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
-        segment.export(temp_path, format="wav")
-        with torch.no_grad():
-            embedding = speaker_model.get_embedding(temp_path).to(device)
-        query_result = index.query(
-            vector=embedding.cpu().numpy().tolist(),
-            top_k=1,
-            include_metadata=True
-        )
-        if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
-            speaker_id = query_result['matches'][0]['id']
-            speaker_name = query_result['matches'][0]['metadata']['speaker_name']
-        else:
-            speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
-            speaker_name = f"Speaker_{speaker_id[-4:]}"
-            index.upsert([(speaker_id, embedding.tolist(), {"speaker_name": speaker_name})])
-        os.remove(temp_path)
         return {
-            **utterance,
-            'speaker': speaker_name,
-            'speaker_id': speaker_id,
-            'embedding': embedding.cpu().numpy().tolist()
         }
     except Exception as e:
-        logger.error(f"Utterance processing failed: {str(e)}")
-        return {
-            **utterance,
-            'speaker': 'Unknown',
-            'speaker_id': 'unknown',
-            'embedding': None
-        }
-def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
-    try:
-        full_audio = AudioSegment.from_wav(wav_file)
-        utterances = transcript['utterances']
-        with ThreadPoolExecutor(max_workers=5) as executor:  # Changed to 5 workers
-            futures = [
-                executor.submit(process_utterance, utterance, full_audio, wav_file)
-                for utterance in utterances
-            ]
-            results = [f.result() for f in futures]
-        return results
-    except Exception as e:
-        logger.error(f"Speaker identification failed: {str(e)}")
-        raise
 def train_role_classifier(utterances: List[Dict]):
     try:
         texts = [u['text'] for u in utterances]
         vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
         X_text = vectorizer.fit_transform(texts)
         features = []
-        labels = []
         for i, utterance in enumerate(utterances):
-            prosodic = utterance['prosodic_features']
             feat = [
-                prosodic['duration'],
-                prosodic['mean_pitch'],
-                prosodic['min_pitch'],
-                prosodic['max_pitch'],
-                prosodic['pitch_sd'],
-                prosodic['intensityMean'],
-                prosodic['intensityMin'],
-                prosodic['intensityMax'],
-                prosodic['intensitySD'],
             ]
             feat.extend(X_text[i].toarray()[0].tolist())
             doc = nlp(utterance['text'])
             feat.extend([
                 int(utterance['text'].endswith('?')),
@@ -299,53 +233,39 @@ def train_role_classifier(utterances: List[Dict]):
                 sum(1 for token in doc if token.pos_ == 'VERB'),
                 sum(1 for token in doc if token.pos_ == 'NOUN')
             ])
             features.append(feat)
-            labels.append(0 if i % 2 == 0 else 1)
         scaler = StandardScaler()
         X = scaler.fit_transform(features)
-        clf = RandomForestClassifier(
-            n_estimators=150,
-            max_depth=10,
-            random_state=42,
-            class_weight='balanced'
-        )
         clf.fit(X, labels)
         joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
         joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
         joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
         return clf, vectorizer, scaler
     except Exception as e:
         logger.error(f"Classifier training failed: {str(e)}")
         raise
 def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
     try:
         texts = [u['text'] for u in utterances]
         X_text = vectorizer.transform(texts)
         results = []
         for i, utterance in enumerate(utterances):
-            prosodic = utterance['prosodic_features']
             feat = [
-                prosodic['duration'],
-                prosodic['mean_pitch'],
-                prosodic['min_pitch'],
-                prosodic['max_pitch'],
-                prosodic['pitch_sd'],
-                prosodic['intensityMean'],
-                prosodic['intensityMin'],
-                prosodic['intensityMax'],
-                prosodic['intensitySD'],
             ]
             feat.extend(X_text[i].toarray()[0].tolist())
             doc = nlp(utterance['text'])
             feat.extend([
                 int(utterance['text'].endswith('?')),
@@ -354,412 +274,153 @@ def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
                 sum(1 for token in doc if token.pos_ == 'VERB'),
                 sum(1 for token in doc if token.pos_ == 'NOUN')
             ])
             X = scaler.transform([feat])
             role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
             results.append({**utterance, 'role': role})
         return results
     except Exception as e:
-        logger.error(f"Role classification failed: {str(e)}")
         raise
 def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
     try:
-        y, sr = librosa.load(audio_path, sr=16000)
-        interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
         if not interviewee_utterances:
             return {'error': 'No interviewee utterances found'}
-        segments = []
-        for u in interviewee_utterances:
-            start = int(u['start'] * sr / 1000)
-            end = int(u['end'] * sr / 1000)
-            segments.append(y[start:end])
-        combined_audio = np.concatenate(segments)
-        total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
         total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
-        speaking_rate = total_words / total_duration if total_duration > 0 else 0
-        filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
-        filler_count = sum(
-            sum(u['text'].lower().count(fw) for fw in filler_words)
-            for u in interviewee_utterances
-        )
         filler_ratio = filler_count / total_words if total_words > 0 else 0
-        all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
-        word_counts = {}
-        for i in range(len(all_words) - 1):
-            bigram = (all_words[i], all_words[i + 1])
-            word_counts[bigram] = word_counts.get(bigram, 0) + 1
-        repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(
-            word_counts) if word_counts else 0
-        pitches = []
-        for segment in segments:
-            f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
-            pitches.extend(f0[voiced_flag])
-        pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
         pitch_std = np.std(pitches) if len(pitches) > 0 else 0
-        jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0
-        intensities = []
-        for segment in segments:
-            rms = librosa.feature.rms(y=segment)[0]
-            intensities.extend(rms)
-        intensity_mean = np.mean(intensities) if intensities else 0
-        intensity_std = np.std(intensities) if intensities else 0
-        shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(
-            intensities) > 1 and intensity_mean > 0 else 0
-        anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
-        confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
-        hesitation_score = filler_ratio + repetition_score
-        anxiety_level = 'high' if anxiety_score > 0.15 else 'moderate' if anxiety_score > 0.07 else 'low'
-        confidence_level = 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.5 else 'low'
-        fluency_level = 'fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'moderate' if (
-                    filler_ratio < 0.1 and repetition_score < 0.2) else 'disfluent'
         return {
             'speaking_rate': float(round(speaking_rate, 2)),
             'filler_ratio': float(round(filler_ratio, 4)),
-            'repetition_score': float(round(repetition_score, 4)),
-            'pitch_analysis': {
-                'mean': float(round(pitch_mean, 2)),
-                'std_dev': float(round(pitch_std, 2)),
-                'jitter': float(round(jitter, 4))
-            },
-            'intensity_analysis': {
-                'mean': float(round(intensity_mean, 2)),
-                'std_dev': float(round(intensity_std, 2)),
-                'shimmer': float(round(shimmer, 4))
-            },
             'composite_scores': {
                 'anxiety': float(round(anxiety_score, 4)),
                 'confidence': float(round(confidence_score, 4)),
                 'hesitation': float(round(hesitation_score, 4))
-            },
-            'interpretation': {
-                'anxiety_level': anxiety_level,
-                'confidence_level': confidence_level,
-                'fluency_level': fluency_level
             }
         }
     except Exception as e:
-        logger.error(f"Voice analysis failed: {str(e)}")
         return {'error': str(e)}
-def generate_voice_interpretation(analysis: Dict) -> str:
-    # This function is used to provide the text interpretation for Gemini's prompt.
-    if 'error' in analysis:
-        return "Voice analysis not available."
-    interpretation_lines = []
-    interpretation_lines.append("Voice Analysis Summary:")
-    interpretation_lines.append(f"- Speaking Rate: {analysis['speaking_rate']} words/sec (average)")
-    interpretation_lines.append(f"- Filler Words: {analysis['filler_ratio'] * 100:.1f}% of words")
-    interpretation_lines.append(f"- Repetition Score: {analysis['repetition_score']:.3f}")
-    interpretation_lines.append(
-        f"- Anxiety Level: {analysis['interpretation']['anxiety_level'].upper()} (score: {analysis['composite_scores']['anxiety']:.3f})")
-    interpretation_lines.append(
-        f"- Confidence Level: {analysis['interpretation']['confidence_level'].upper()} (score: {analysis['composite_scores']['confidence']:.3f})")
-    interpretation_lines.append(f"- Fluency: {analysis['interpretation']['fluency_level'].upper()}")
-    interpretation_lines.append("")
-    interpretation_lines.append("Detailed Interpretation:")
-    interpretation_lines.append(
-        "1. A higher speaking rate indicates faster speech, which can suggest nervousness or enthusiasm.")
-    interpretation_lines.append("2. Filler words and repetitions reduce speech clarity and professionalism.")
-    interpretation_lines.append("3. Anxiety is measured through pitch variability and voice instability.")
-    interpretation_lines.append("4. Confidence is assessed through voice intensity and stability.")
-    interpretation_lines.append("5. Fluency combines filler words and repetition metrics.")
-    return "\n".join(interpretation_lines)
 def generate_report(analysis_data: Dict) -> str:
     try:
-        voice = analysis_data.get('voice_analysis', {})
-        voice_interpretation = generate_voice_interpretation(voice)
-        interviewee_responses = [
-                                    f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
-                                    for u in analysis_data['transcript']
-                                    if u['role'] == 'Interviewee'
-                                ][:5]  # Limit to first 5 for prompt brevity
-        prompt = f"""
-        Generate a comprehensive interview analysis report based on the provided data.
-        The report should be structured with clear headings and concise summaries.
-        **1. Executive Summary**
-        Provide a brief overview of the interview, its duration, number of speaker turns, and main participants.
-        - Overall interview duration: {analysis_data['text_analysis']['total_duration']:.2f} seconds
-        - Number of speaker turns: {analysis_data['text_analysis']['speaker_turns']}
-        - Main participants: {', '.join(analysis_data['speakers'])}
-        **2. Voice Analysis**
-        Summarize key voice metrics and provide a detailed interpretation.
-        {voice_interpretation}
-        **3. Content Analysis**
-        Analyze the key themes and strengths/weaknesses in the interviewee's responses.
-        Key responses from interviewee:
-        {chr(10).join(interviewee_responses)}
-        **4. Recommendations**
-        Offer specific, actionable suggestions for improvement focusing on communication skills, content delivery, and professional presentation.
-        """
-        response = gemini_model.generate_content(prompt)
-        return response.text
     except Exception as e:
         logger.error(f"Report generation failed: {str(e)}")
-        return f"Error generating report: {str(e)}"
-# --- ENHANCED PDF GENERATION FUNCTION ---
-def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
     try:
-        doc = SimpleDocTemplate(output_path, pagesize=letter)
-        styles = getSampleStyleSheet()
-        # Define custom styles
-        h1 = ParagraphStyle(name='Heading1', parent=styles['h1'], fontSize=16, spaceAfter=14, alignment=1)
-        h2 = ParagraphStyle(name='Heading2', parent=styles['h2'], fontSize=12, spaceBefore=10, spaceAfter=8,
-                            textColor=colors.HexColor('#333366'))
-        h3 = ParagraphStyle(name='Heading3', parent=styles['h3'], fontSize=10, spaceBefore=8, spaceAfter=4,
-                            textColor=colors.HexColor('#0055AA'))
-        body_text = ParagraphStyle(name='BodyText', parent=styles['Normal'], fontSize=9, leading=12, spaceAfter=4)
-        bullet_style = ParagraphStyle(name='Bullet', parent=styles['Normal'], fontSize=9, leading=12, leftIndent=18,
-                                      bulletIndent=9)
-        story = []
-        # Title Page / Header
-        story.append(Paragraph("<b>Interview Analysis Report</b>", h1))
-        story.append(Spacer(1, 0.2 * inch))
-        story.append(Paragraph(f"<b>Date:</b> {time.strftime('%Y-%m-%d')}", body_text))
-        story.append(Spacer(1, 0.3 * inch))
-        # Parse Gemini's report into sections for better PDF structuring
-        sections = {}
-        current_section = None
-        for line in gemini_report_text.split('\n'):
-            if line.startswith('**1. Executive Summary**'):
-                current_section = 'Executive Summary'
-                sections[current_section] = []
-            elif line.startswith('**2. Voice Analysis**'):
-                current_section = 'Voice Analysis (Gemini Interpretation)'
-                sections[current_section] = []
-            elif line.startswith('**3. Content Analysis**'):
-                current_section = 'Content Analysis'
-                sections[current_section] = []
-            elif line.startswith('**4. Recommendations**'):
-                current_section = 'Recommendations'
-                sections[current_section] = []
-            elif current_section:
-                sections[current_section].append(line)
-        # 1. Executive Summary
-        story.append(Paragraph("1. Executive Summary", h2))
-        story.append(Spacer(1, 0.1 * inch))
-        if 'Executive Summary' in sections:
-            for line in sections['Executive Summary']:
-                if line.strip():
-                    story.append(Paragraph(line.strip(), body_text))
-            story.append(Spacer(1, 0.2 * inch))
-        # 2. Voice Analysis (Detailed - using Table for summary)
-        story.append(Paragraph("2. Voice Analysis", h2))
-        voice_analysis = analysis_data.get('voice_analysis', {})
-        if voice_analysis and 'error' not in voice_analysis:
-            # Voice Analysis Summary Table
-            table_data = [
-                ['Metric', 'Value', 'Interpretation'],
-                ['Speaking Rate', f"{voice_analysis['speaking_rate']:.2f} words/sec", 'Average rate'],
-                ['Filler Words', f"{voice_analysis['filler_ratio'] * 100:.1f}%", 'Percentage of total words'],
-                ['Repetition Score', f"{voice_analysis['repetition_score']:.3f}", 'Lower is better articulation'],
-                ['Anxiety Level', voice_analysis['interpretation']['anxiety_level'].upper(),
-                 f"Score: {voice_analysis['composite_scores']['anxiety']:.3f}"],
-                ['Confidence Level', voice_analysis['interpretation']['confidence_level'].upper(),
-                 f"Score: {voice_analysis['composite_scores']['confidence']:.3f}"],
-                ['Fluency', voice_analysis['interpretation']['fluency_level'].upper(), 'Overall speech flow']
-            ]
-            table_style = TableStyle([
-                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#6699CC')),
-                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
-                ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
-                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
-                ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
-                ('BACKGROUND', (0, 1), (-1, -1), colors.HexColor('#EFEFEF')),
-                ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#CCCCCC')),
-                ('LEFTPADDING', (0, 0), (-1, -1), 6),
-                ('RIGHTPADDING', (0, 0), (-1, -1), 6),
-                ('TOPPADDING', (0, 0), (-1, -1), 6),
-                ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
-            ])
-            table = Table(table_data)
-            table.setStyle(table_style)
-            story.append(table)
-            story.append(Spacer(1, 0.2 * inch))
-            # Detailed Interpretation from Gemini (if present)
-            if 'Voice Analysis (Gemini Interpretation)' in sections:
-                story.append(Paragraph("Detailed Interpretation:", h3))
-                for line in sections['Voice Analysis (Gemini Interpretation)']:
-                    if line.strip():
-                        story.append(Paragraph(line.strip(), body_text))
-                story.append(Spacer(1, 0.2 * inch))
-            # --- Placeholder for Charts ---
-            # You would generate charts here using matplotlib/seaborn
-            # Example (uncomment and implement generate_anxiety_confidence_chart):
-            # chart_path = os.path.join(OUTPUT_DIR, f"anxiety_confidence_{uuid.uuid4().hex[:8]}.png")
-            # generate_anxiety_confidence_chart(voice_analysis['composite_scores'], chart_path) # Your function to generate chart
-            # try:
-            #     if os.path.exists(chart_path):
-            #         img = Image(chart_path, width=4*inch, height=2.5*inch)
-            #         story.append(img)
-            #         story.append(Spacer(1, 0.1 * inch))
-            #         os.remove(chart_path) # Clean up generated chart image
-            # except Exception as img_e:
-            #     logger.warning(f"Could not add chart image to PDF: {img_e}")
-            # --- End Placeholder for Charts ---
-        else:
-            story.append(Paragraph("Voice analysis not available or encountered an error.", body_text))
-        story.append(Spacer(1, 0.3 * inch))
-        # 3. Content Analysis
-        story.append(Paragraph("3. Content Analysis", h2))
-        if 'Content Analysis' in sections:
-            for line in sections['Content Analysis']:
-                if line.strip():
-                    if line.strip().startswith('-'):  # For bullet points from Gemini
-                        story.append(Paragraph(line.strip(), bullet_style))
-                    else:
-                        story.append(Paragraph(line.strip(), body_text))
-            story.append(Spacer(1, 0.2 * inch))
-        # Add some interviewee responses to the report (can be formatted as a list)
-        story.append(Paragraph("Key Interviewee Responses:", h3))
-        interviewee_responses = [
-                                    f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
-                                    for u in analysis_data['transcript']
-                                    if u['role'] == 'Interviewee'
-                                ][:5]  # Show only first 5
-        for res in interviewee_responses:
-            story.append(Paragraph(res, bullet_style))
-        story.append(Spacer(1, 0.3 * inch))
-        # 4. Recommendations
-        story.append(Paragraph("4. Recommendations", h2))
-        if 'Recommendations' in sections:
-            for line in sections['Recommendations']:
-                if line.strip():
-                    if line.strip().startswith('-'):  # For bullet points from Gemini
-                        story.append(Paragraph(line.strip(), bullet_style))
-                    else:
-                        story.append(Paragraph(line.strip(), body_text))
-            story.append(Spacer(1, 0.2 * inch))
-        doc.build(story)
-        return True
-    except Exception as e:
-        logger.error(f"PDF creation failed: {str(e)}", exc_info=True)
-        return False
-def convert_to_serializable(obj):
-    if isinstance(obj, np.generic):
-        return obj.item()
-    elif isinstance(obj, dict):
-        return {key: convert_to_serializable(value) for key, value in obj.items()}
-    elif isinstance(obj, list):
-        return [convert_to_serializable(item) for item in obj]
-    elif isinstance(obj, np.ndarray):
-        return obj.tolist()
-    return obj
-def process_interview(audio_path: str):
-    try:
-        logger.info(f"Starting processing for {audio_path}")
-        wav_file = convert_to_wav(audio_path)
-        logger.info("Starting transcription")
         transcript = transcribe(wav_file)
-        logger.info("Extracting prosodic features")
-        for utterance in transcript['utterances']:
-            utterance['prosodic_features'] = extract_prosodic_features(
-                wav_file,
-                utterance['start'],
-                utterance['end']
-            )
-        logger.info("Identifying speakers")
-        utterances_with_speakers = identify_speakers(transcript, wav_file)
-        logger.info("Classifying roles")
-        # Ensure role classifier models are loaded/trained only once if possible,
-        # or handled carefully in a multi-threaded context.
-        # For simplicity, keeping it inside process_interview for now.
-        if os.path.exists(os.path.join(OUTPUT_DIR, 'role_classifier.pkl')):
-            clf = joblib.load(os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
             vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
             scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
         else:
-            clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)
-        classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)
-        logger.info("Analyzing interviewee voice")
         voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)
         analysis_data = {
             'transcript': classified_utterances,
-            'speakers': list(set(u['speaker'] for u in classified_utterances)),
             'voice_analysis': voice_analysis,
             'text_analysis': {
-                'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
                 'speaker_turns': len(classified_utterances)
             }
         }
-        logger.info("Generating report text using Gemini")
         gemini_report_text = generate_report(analysis_data)
-        base_name = os.path.splitext(os.path.basename(audio_path))[0]
-        pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
-        # Pass the full analysis_data AND the gemini_report_text to the PDF function
-        create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text)
         json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
         with open(json_path, 'w') as f:
-            serializable_data = convert_to_serializable(analysis_data)
-            json.dump(serializable_data, f, indent=2)
-        os.remove(wav_file)  # Clean up WAV file after processing
-        logger.info(f"Processing completed for {audio_path}")
-        return {
-            'pdf_path': pdf_path,
-            'json_path': json_path
-        }
     except Exception as e:
-        logger.error(f"Processing failed: {str(e)}", exc_info=True)
-        # Clean up wav_file in case of error
-        if 'wav_file' in locals() and os.path.exists(wav_file):
-            os.remove(wav_file)
-        raise

+# -*- coding: utf-8 -*-
+# ==============================================================================
+#                      EvalBot - AI Interview Analysis Pipeline
+# ==============================================================================
+# --- 1. Imports ---
 import os
+import logging
+import re
+import time
+import json
+import uuid
+import tempfile
+from typing import Dict, List
+# --- Third-party Libraries ---
 import torch
 import numpy as np
 import requests
+import urllib3
 from pydub import AudioSegment
+import librosa
+import spacy
+import google.generativeai as genai
+from concurrent.futures import ThreadPoolExecutor
+# --- Machine Learning & Models ---
 from nemo.collections.asr.models import EncDecSpeakerLabelModel
 from pinecone import Pinecone, ServerlessSpec
+import joblib
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.preprocessing import StandardScaler
 from sklearn.feature_extraction.text import TfidfVectorizer
+# --- PDF Generation (Optional but included) ---
 from reportlab.lib.pagesizes import letter
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 from reportlab.lib.units import inch
 from reportlab.lib import colors
+# --- 2. Configuration and Setup ---
+# Logging configuration
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')
 logger = logging.getLogger(__name__)
+# Reduce log verbosity from third-party libraries
 logging.getLogger("nemo_logging").setLevel(logging.ERROR)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+# General settings (constants)
 OUTPUT_DIR = "./processed_audio"
 os.makedirs(OUTPUT_DIR, exist_ok=True)
+# API keys (must be set as environment variables)
 PINECONE_KEY = os.getenv("PINECONE_KEY")
 ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+# A missing key only produces a warning here; nothing is raised at this point.
+if not all([PINECONE_KEY, ASSEMBLYAI_KEY, GEMINI_API_KEY]):
+    logger.warning("One or more API keys are missing. Please set PINECONE_KEY, ASSEMBLYAI_KEY, and GEMINI_API_KEY environment variables.")
+# --- 3. Service and Model Initialization ---
 def initialize_services():
     """Connect to Pinecone and Gemini and hand back ready-to-use handles.

     The Pinecone index "interview-speaker-embeddings" is created (dimension
     192, serverless on aws/us-east-1) when it is not already listed.

     Returns:
         A ``(index, gemini_model)`` tuple: the Pinecone index handle and the
         ``gemini-1.5-flash`` generative model.

     Raises:
         Exception: any failure raised by either SDK is logged and re-raised
             unchanged.
     """
     index_name = "interview-speaker-embeddings"
     try:
         logger.info("Initializing Pinecone and Gemini services...")
         pinecone_client = Pinecone(api_key=PINECONE_KEY)
         known_indexes = pinecone_client.list_indexes().names()
         if index_name not in known_indexes:
             logger.info(f"Creating new Pinecone index: {index_name}")
             serverless = ServerlessSpec(cloud="aws", region="us-east-1")
             pinecone_client.create_index(name=index_name, dimension=192, spec=serverless)
         speaker_index = pinecone_client.Index(index_name)
         genai.configure(api_key=GEMINI_API_KEY)
         report_model = genai.GenerativeModel('gemini-1.5-flash')
         logger.info("Services initialized successfully.")
         return speaker_index, report_model
     except Exception as e:
         logger.error(f"Error initializing services: {str(e)}")
         raise
 def load_models():
     """Load the speaker-verification and NLP models used by the pipeline.

     Returns:
         A ``(speaker_model, nlp, device)`` tuple: the pretrained Titanet
         speaker model switched to eval mode, the spaCy ``en_core_web_sm``
         pipeline, and the torch device that was selected (CUDA when
         available, otherwise CPU).
     """
     if torch.cuda.is_available():
         device = torch.device("cuda")
     else:
         device = torch.device("cpu")
     logger.info(f"Using device: {device}")

     logger.info("Loading speaker verification model (Titanet)...")
     titanet = EncDecSpeakerLabelModel.from_pretrained(
         "nvidia/speakerverification_en_titanet_large",
         map_location=device,
     )
     titanet.eval()

     logger.info("Loading NLP model (spaCy)...")
     language_pipeline = spacy.load("en_core_web_sm")
     return titanet, language_pipeline, device
+# Load services and models at startup (these calls run when the module is imported)
+index, gemini_model = initialize_services()
+speaker_model, nlp, device = load_models()
+# --- 4. Core Processing Functions ---
def download_audio_to_temp_file(url: str, retries=3) -> str:
    """Download an audio file from *url* into a temporary local file.

    Failed attempts are retried with exponential back-off (1s, 2s, 4s, ...).
    The temporary file is removed whenever the function does not return
    successfully, so a failed download never leaves a stray file behind.

    Args:
        url: HTTP(S) location of the audio file.
        retries: Maximum number of download attempts.

    Returns:
        Path of the temporary file holding the downloaded bytes. The caller
        owns the file and is responsible for deleting it.

    Raises:
        requests.exceptions.RequestException: the last attempt failed with a
            request-level error.
        urllib3.exceptions.ProtocolError: the last attempt failed mid-stream.
        Exception: ``retries`` is below 1, so no attempt was made.
    """
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".tmp_audio")
    temp_path = temp_file.name
    temp_file.close()
    logger.info(f"Downloading audio from {url} to {temp_path}")
    try:
        for attempt in range(retries):
            try:
                with requests.get(url, stream=True, timeout=60) as r:
                    r.raise_for_status()
                    # 'wb' truncates, so a partial earlier attempt is overwritten.
                    with open(temp_path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                logger.info("Download completed successfully.")
                return temp_path
            except (requests.exceptions.RequestException, urllib3.exceptions.ProtocolError) as e:
                if attempt >= retries - 1:
                    # Last attempt: do not announce a retry that will not happen.
                    logger.warning(f"Download attempt {attempt + 1}/{retries} failed: {e}.")
                    logger.error(f"Failed to download audio after {retries} attempts.")
                    raise
                logger.warning(f"Download attempt {attempt + 1}/{retries} failed: {e}. Retrying...")
                time.sleep(2 ** attempt)
        # Only reachable when retries < 1 and the loop body never ran.
        raise Exception(f"Failed to download audio from URL {url}")
    except BaseException:
        # Covers request errors, disk errors while writing, and interrupts.
        try:
            os.remove(temp_path)
        except OSError:
            pass  # Best effort: the original error is the one worth reporting.
        raise
 def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
     """Re-encode an audio file as 16 kHz mono WAV.

     Args:
         audio_path: Source audio file in any format pydub can open.
         output_dir: Directory that receives the WAV file, which gets a
             random UUID-based name.

     Returns:
         Path of the WAV file that was written.

     Raises:
         Exception: any decoding or export failure is logged and re-raised.
     """
     try:
         logger.info(f"Converting {audio_path} to WAV format...")
         decoded = AudioSegment.from_file(audio_path)
         resampled = decoded.set_frame_rate(16000)
         mono = resampled.set_channels(1)
         wav_name = str(uuid.uuid4()) + ".wav"
         wav_file = os.path.join(output_dir, wav_name)
         mono.export(wav_file, format="wav")
         logger.info(f"Successfully converted to {wav_file}")
         return wav_file
     except Exception as e:
         logger.error(f"Audio conversion failed for {audio_path}: {str(e)}")
         raise
 def transcribe(audio_path: str) -> Dict:
     """Transcribe an audio file with AssemblyAI, including speaker diarization.

     The file is uploaded, a transcription job is submitted, and the job is
     polled every 5 seconds until it reports ``completed`` or ``error``.
     There is no overall deadline on the polling loop.

     Args:
         audio_path: Path of the local audio file to upload.

     Returns:
         The completed transcript JSON as a dict. Its ``utterances`` list is
         guaranteed to be non-empty.

     Raises:
         requests.exceptions.RequestException: network failure, timeout, or a
             non-2xx HTTP response from AssemblyAI.
         ValueError: the job completed but produced no utterances.
         Exception: AssemblyAI reported that the job failed.
     """
     base_url = "https://api.assemblyai.com/v2"
     upload_timeout = (10, 300)  # (connect, read) seconds; uploads can be slow
     request_timeout = 30        # seconds for the small JSON calls
     try:
         logger.info("Uploading audio to AssemblyAI...")
         headers = {"authorization": ASSEMBLYAI_KEY}
         with open(audio_path, 'rb') as f:
             upload_response = requests.post(
                 f"{base_url}/upload", headers=headers, data=f, timeout=upload_timeout
             )
         # Surface HTTP errors (bad key, quota) instead of a KeyError below.
         upload_response.raise_for_status()
         audio_url = upload_response.json()['upload_url']

         logger.info("Submitting transcription job with diarization...")
         # "speaker_labels" is the AssemblyAI flag that enables diarization and
         # populates the "utterances" field of the result.
         transcript_request = {"audio_url": audio_url, "speaker_labels": True}
         transcript_response = requests.post(
             f"{base_url}/transcript",
             json=transcript_request,
             headers=headers,
             timeout=request_timeout,
         )
         transcript_response.raise_for_status()
         transcript_id = transcript_response.json()['id']

         logger.info(f"Waiting for transcription job (ID: {transcript_id}) to complete...")
         while True:
             poll_response = requests.get(
                 f"{base_url}/transcript/{transcript_id}",
                 headers=headers,
                 timeout=request_timeout,
             )
             poll_response.raise_for_status()
             result = poll_response.json()
             if result['status'] == 'completed':
                 logger.info("Transcription job completed.")
                 if not result.get('utterances'):
                     raise ValueError("Transcription succeeded but no utterances were found.")
                 return result
             elif result['status'] == 'error':
                 raise Exception(f"Transcription failed: {result.get('error')}")
             time.sleep(5)
     except Exception as e:
         logger.error(f"Transcription process failed: {str(e)}")
         raise
def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
    """Measure pitch and loudness statistics for one time slice of an audio file.

    Args:
        audio_path: Audio file to read; the slice is loaded at 16 kHz.
        start_ms: Start of the slice, in milliseconds.
        end_ms: End of the slice, in milliseconds.

    Returns:
        A dict with the keys ``duration`` (seconds), ``mean_pitch``,
        ``pitch_sd``, ``intensityMean`` and ``intensitySD``. Every value is 0
        when the slice contains no samples. An empty dict is returned when
        anything raises; the error is logged rather than propagated.
    """
    feature_names = ('duration', 'mean_pitch', 'pitch_sd', 'intensityMean', 'intensitySD')
    try:
        samples, rate = librosa.load(
            audio_path,
            sr=16000,
            offset=start_ms / 1000.0,
            duration=(end_ms - start_ms) / 1000.0,
        )
        if len(samples) == 0:
            return dict.fromkeys(feature_names, 0)

        pitch_grid, _ = librosa.piptrack(y=samples, sr=rate)
        # Keep only the bins where piptrack reported a positive pitch value.
        voiced = pitch_grid[pitch_grid > 0]
        loudness = librosa.feature.rms(y=samples)[0]

        has_pitch = len(voiced) > 0
        features = {'duration': (end_ms - start_ms) / 1000}
        features['mean_pitch'] = float(np.mean(voiced)) if has_pitch else 0.0
        features['pitch_sd'] = float(np.std(voiced)) if has_pitch else 0.0
        features['intensityMean'] = float(np.mean(loudness))
        features['intensitySD'] = float(np.std(loudness))
        return features
    except Exception as e:
        logger.error(f"Feature extraction failed for segment {start_ms}-{end_ms}: {str(e)}")
        return {}
+# --- 5. Role Classification Functions (As Requested) ---
 def train_role_classifier(utterances: List[Dict]):
     """Fit and persist the interviewer/interviewee role classifier.

     Each utterance becomes one feature row: five prosodic values, its TF-IDF
     vector (uni- and bi-grams, at most 500 terms), a ends-with-question-mark
     flag, and verb and noun counts from spaCy.

     NOTE: training labels are not observed. They are assigned by position:
     even-indexed utterances are labelled 0 (Interviewer) and odd-indexed ones
     1 (Interviewee), i.e. strict turn alternation is assumed.

     Args:
         utterances: Dicts with a ``text`` key and, optionally, a
             ``prosodic_features`` dict.

     Returns:
         ``(clf, vectorizer, scaler)``, which are also written to OUTPUT_DIR
         as ``role_classifier.pkl``, ``text_vectorizer.pkl`` and
         ``feature_scaler.pkl``.

     Raises:
         Exception: any failure is logged and re-raised.
     """
     prosodic_keys = ('duration', 'mean_pitch', 'pitch_sd', 'intensityMean', 'intensitySD')
     try:
         logger.info("Training new role classifier model...")
         vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
         X_text = vectorizer.fit_transform([u['text'] for u in utterances])

         features, labels = [], []
         for position, utterance in enumerate(utterances):
             text = utterance['text']
             prosodic = utterance.get('prosodic_features', {})
             row = [prosodic.get(key, 0) for key in prosodic_keys]
             row += X_text[position].toarray()[0].tolist()
             pos_tags = [token.pos_ for token in nlp(text)]
             row += [
                 int(text.endswith('?')),
                 pos_tags.count('VERB'),
                 pos_tags.count('NOUN'),
             ]
             features.append(row)
             # 0 = Interviewer (even turns), 1 = Interviewee (odd turns).
             labels.append(position % 2)

         scaler = StandardScaler()
         X = scaler.fit_transform(features)
         clf = RandomForestClassifier(
             n_estimators=150, max_depth=10, random_state=42, class_weight='balanced'
         )
         clf.fit(X, labels)

         logger.info("Saving trained models to disk...")
         artifacts = (
             (clf, 'role_classifier.pkl'),
             (vectorizer, 'text_vectorizer.pkl'),
             (scaler, 'feature_scaler.pkl'),
         )
         for artifact, filename in artifacts:
             joblib.dump(artifact, os.path.join(OUTPUT_DIR, filename))
         return clf, vectorizer, scaler
     except Exception as e:
         logger.error(f"Classifier training failed: {str(e)}")
         raise
 def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
     """Label every utterance as 'Interviewer' or 'Interviewee'.

     Feature rows are built exactly as in training: five prosodic values, the
     TF-IDF vector, a ends-with-question-mark flag, and verb and noun counts.
     All rows are scaled and scored in a single batch, so the classifier is
     invoked once rather than once per utterance.

     Args:
         utterances: Dicts with a ``text`` key and, optionally, a
             ``prosodic_features`` dict.
         clf: Fitted classifier; a prediction of 0 means Interviewer and any
             other value means Interviewee.
         vectorizer: Fitted text vectorizer used for the TF-IDF features.
         scaler: Fitted scaler applied to the assembled feature rows.

     Returns:
         New dicts, one per input utterance and in the same order, each a
         copy of the input with a ``role`` key added. An empty input yields
         an empty list.

     Raises:
         Exception: any failure is logged and re-raised.
     """
     prosodic_keys = ('duration', 'mean_pitch', 'pitch_sd', 'intensityMean', 'intensitySD')
     try:
         logger.info("Classifying roles using trained model...")
         if not utterances:
             # Nothing to score; the sklearn transformers reject empty batches.
             return []

         texts = [u['text'] for u in utterances]
         tfidf_rows = vectorizer.transform(texts).toarray()

         feature_rows = []
         for utterance, text, tfidf_row in zip(utterances, texts, tfidf_rows):
             prosodic = utterance.get('prosodic_features', {})
             row = [prosodic.get(key, 0) for key in prosodic_keys]
             row.extend(tfidf_row.tolist())
             doc = nlp(text)
             row.extend([
                 int(text.endswith('?')),
                 sum(1 for token in doc if token.pos_ == 'VERB'),
                 sum(1 for token in doc if token.pos_ == 'NOUN'),
             ])
             feature_rows.append(row)

         # One transform/predict call for the whole batch.
         predictions = clf.predict(scaler.transform(feature_rows))
         return [
             {**utterance, 'role': 'Interviewer' if label == 0 else 'Interviewee'}
             for utterance, label in zip(utterances, predictions)
         ]
     except Exception as e:
         logger.error(f"Role classification execution failed: {str(e)}")
         raise
+# --- 6. Analysis and Reporting Functions ---
 def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
     """Summarise the voice characteristics of the interviewee's utterances.

     Only utterances whose ``role`` is 'Interviewee' are considered. Their
     audio slices are cut from the file (loaded at 16 kHz) and used for pitch
     and loudness statistics; their text is used for speaking rate and filler
     usage.

     Args:
         audio_path: Audio file containing the whole interview.
         utterances: Classified utterances with ``role``, ``text``, ``start``
             and ``end`` (milliseconds) and, optionally, ``prosodic_features``.

     Returns:
         A dict with ``speaking_rate`` (words per minute), ``filler_ratio``,
         ``pitch_std_dev``, ``intensity_std_dev`` and ``composite_scores``
         (``anxiety``, ``confidence``, ``hesitation``, each clamped to 0..1).
         On failure, or when there are no interviewee utterances, a dict with
         a single ``error`` key is returned instead of raising.
     """
     single_fillers = {'um', 'uh', 'like', 'so', 'actually'}
     # Multi-word fillers must be matched on adjacent word pairs; they can
     # never equal a single whitespace-separated token.
     phrase_fillers = {('you', 'know'), ('i', 'mean')}
     try:
         interviewee_utterances = [u for u in utterances if u.get('role') == 'Interviewee']
         if not interviewee_utterances:
             logger.warning("No interviewee utterances found to analyze.")
             return {'error': 'No interviewee utterances found'}
         logger.info(f"Analyzing {len(interviewee_utterances)} interviewee utterances...")

         y, sr = librosa.load(audio_path, sr=16000)
         segments = [
             y[int(u['start'] * sr / 1000):int(u['end'] * sr / 1000)]
             for u in interviewee_utterances
         ]
         segments = [s for s in segments if len(s) > 0]

         total_duration = 0.0
         total_words = 0
         filler_count = 0
         for u in interviewee_utterances:
             prosodic = u.get('prosodic_features') or {}
             duration = prosodic.get('duration')
             if duration is None:
                 # Feature extraction failed for this turn; use its timestamps.
                 duration = (u['end'] - u['start']) / 1000
             total_duration += duration
             total_words += len(u['text'].split())
             # Tokenise on letters so trailing punctuation ("um,") still matches.
             tokens = re.findall(r"[a-z']+", u['text'].lower())
             filler_count += sum(1 for token in tokens if token in single_fillers)
             filler_count += sum(1 for pair in zip(tokens, tokens[1:]) if pair in phrase_fillers)

         speaking_rate = (total_words / total_duration) * 60 if total_duration > 0 else 0
         filler_ratio = filler_count / total_words if total_words > 0 else 0

         if segments:
             fmin = librosa.note_to_hz('C2')
             fmax = librosa.note_to_hz('C7')
             # sr must be passed explicitly: pyin otherwise assumes 22050 Hz,
             # which mis-scales pitch estimates for this 16 kHz audio.
             pitches = np.concatenate(
                 [librosa.pyin(s, fmin=fmin, fmax=fmax, sr=sr)[0] for s in segments]
             )
             pitches = pitches[~np.isnan(pitches)]
             intensities = np.concatenate([librosa.feature.rms(y=s)[0] for s in segments])
         else:
             # No audible samples for any interviewee turn.
             pitches = np.array([])
             intensities = np.array([])

         pitch_std = np.std(pitches) if len(pitches) > 0 else 0
         intensity_std = np.std(intensities) if len(intensities) > 0 else 0

         # Heuristic scores, each clamped to the range 0..1.
         anxiety_score = max(0, min(1, pitch_std / 50))
         confidence_score = max(0, min(1, 1 - (intensity_std * 10)))
         hesitation_score = max(0, min(1, (filler_ratio * 2) + (pitch_std / 100)))
         return {
             'speaking_rate': float(round(speaking_rate, 2)),
             'filler_ratio': float(round(filler_ratio, 4)),
             'pitch_std_dev': float(round(pitch_std, 2)),
             'intensity_std_dev': float(round(intensity_std, 4)),
             'composite_scores': {
                 'anxiety': float(round(anxiety_score, 4)),
                 'confidence': float(round(confidence_score, 4)),
                 'hesitation': float(round(hesitation_score, 4))
             }
         }
     except Exception as e:
         logger.error(f"Voice analysis failed: {str(e)}", exc_info=True)
         return {'error': str(e)}
 def generate_report(analysis_data: Dict) -> str:
     """Return the summary report text for an analysed interview.

     NOTE: this is still a placeholder. It does not read ``analysis_data``
     and does not call Gemini; it returns a fixed string.

     Args:
         analysis_data: Analysis results intended as input for the report.

     Returns:
         The report text, or an "Error in report generation: ..." message if
         anything raises.
     """
     try:
         logger.info("Generating final report text with Gemini...")
         # ... (Your generate_report function logic here)
         report_text = "Gemini report text would be generated here."
     except Exception as e:
         logger.error(f"Report generation failed: {str(e)}")
         report_text = f"Error in report generation: {str(e)}"
     return report_text
+# --- 7. Main Orchestration Function ---
def process_interview_from_url(audio_url: str):
    """Download, transcribe and analyse an interview recording.

    Pipeline: download the audio, convert it to WAV, transcribe it with
    diarization, extract prosodic features per utterance, classify speaker
    roles, analyse the interviewee's voice, generate the report text and
    write the analysis to a JSON file in OUTPUT_DIR.

    Args:
        audio_url: URL of the interview audio.

    Returns:
        A dict with ``json_path`` (where the analysis was written) and
        ``report_text`` (the generated report).

    Raises:
        Exception: any pipeline failure is logged with its traceback and
            re-raised. Temporary audio files are removed either way.
    """
    local_audio_path = None
    wav_file = None
    try:
        # Step 1: Download and Convert
        local_audio_path = download_audio_to_temp_file(audio_url)
        wav_file = convert_to_wav(local_audio_path)

        # Step 2: Transcribe and Diarize
        transcript = transcribe(wav_file)
        utterances = transcript['utterances']

        # Step 3: Extract Features (one task per utterance, attached in place)
        logger.info("Extracting prosodic features for all utterances...")
        with ThreadPoolExecutor() as executor:
            pending = [
                (executor.submit(extract_prosodic_features, wav_file, u['start'], u['end']), u)
                for u in utterances
            ]
            for future, utterance in pending:
                utterance['prosodic_features'] = future.result()

        # Step 4: Classify Roles
        # The three artefacts are only usable together, so require all of
        # them; a partial set triggers retraining instead of a crash.
        model_paths = [
            os.path.join(OUTPUT_DIR, filename)
            for filename in ('role_classifier.pkl', 'text_vectorizer.pkl', 'feature_scaler.pkl')
        ]
        if all(os.path.exists(path) for path in model_paths):
            logger.info("Loading existing role classifier model.")
            # NOTE(security): joblib.load unpickles its input; these files must
            # only ever come from this application's own training step.
            clf, vectorizer, scaler = [joblib.load(path) for path in model_paths]
        else:
            clf, vectorizer, scaler = train_role_classifier(utterances)
        classified_utterances = classify_roles(utterances, clf, vectorizer, scaler)

        # Step 5: Analyze Voice and Generate Report
        voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)
        analysis_data = {
            'transcript': classified_utterances,
            'speakers': list(set(u['speaker'] for u in classified_utterances if u.get('speaker'))),
            'voice_analysis': voice_analysis,
            'text_analysis': {
                'total_duration': transcript.get('audio_duration', 0),
                'speaker_turns': len(classified_utterances)
            }
        }
        gemini_report_text = generate_report(analysis_data)

        # Step 6: Save Results
        base_name = str(uuid.uuid4())
        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            # default=str stringifies any value json cannot encode natively.
            json.dump(analysis_data, f, indent=4, default=str)
        logger.info(f"Processing completed. Analysis saved to: {json_path}")
        return {'json_path': json_path, 'report_text': gemini_report_text}
    except Exception as e:
        logger.error(f"Main processing pipeline failed for URL {audio_url}: {str(e)}", exc_info=True)
        raise
    finally:
        # Step 7: Cleanup (best effort; a failed removal is logged, not raised)
        logger.info("Cleaning up temporary files...")
        for label, path in (("WAV", wav_file), ("downloaded", local_audio_path)):
            if path and os.path.exists(path):
                try:
                    os.remove(path)
                    logger.info(f"Removed temporary {label} file: {path}")
                except OSError as e:
                    logger.error(f"Error removing {label} file {path}: {e}")