Update process_interview.py
process_interview.py +297 −365
process_interview.py
CHANGED
@@ -10,16 +10,12 @@ import wave
 from nemo.collections.asr.models import EncDecSpeakerLabelModel
 from pinecone import Pinecone, ServerlessSpec
 import librosa
-import pandas as pd
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.preprocessing import StandardScaler
-from sklearn.feature_extraction.text import TfidfVectorizer
 import re
-from typing import Dict, List
 import logging
 import tempfile
 from reportlab.lib.pagesizes import letter
-from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 from reportlab.lib.units import inch
 from reportlab.lib import colors
@@ -28,20 +24,20 @@ import matplotlib
 matplotlib.use('Agg')
 from reportlab.platypus import Image
 import io
-from transformers import AutoTokenizer, AutoModel
 import spacy
 import google.generativeai as genai
-import joblib
 from concurrent.futures import ThreadPoolExecutor
-import urllib3

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-
-logging.getLogger("

 # Configuration
-AUDIO_DIR = "./Uploads"
 OUTPUT_DIR = "./processed_audio"
 os.makedirs(OUTPUT_DIR, exist_ok=True)

@@ -50,29 +46,34 @@ PINECONE_KEY = os.getenv("PINECONE_KEY")
 ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

 def download_audio_from_url(url: str, retries=3) -> str:
     """Downloads an audio file from a URL to a temporary local path with retries."""
         return temp_path

 def initialize_services():
     try:
         pc = Pinecone(api_key=PINECONE_KEY)
         index_name = "interview-speaker-embeddings"
@@ -84,6 +85,7 @@ def initialize_services():
             spec=ServerlessSpec(cloud="aws", region="us-east-1")
         )
         index = pc.Index(index_name)
         genai.configure(api_key=GEMINI_API_KEY)
         gemini_model = genai.GenerativeModel('gemini-1.5-flash')
         return index, gemini_model
@@ -92,14 +94,14 @@ def initialize_services():
         raise

 index, gemini_model = initialize_services()
-
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f"Using device: {device}")

 def load_speaker_model():
     try:
-        torch.set_num_threads(
         model = EncDecSpeakerLabelModel.from_pretrained(
             "nvidia/speakerverification_en_titanet_large",
             map_location=torch.device('cpu')
@@ -111,21 +113,18 @@ def load_speaker_model():
         raise RuntimeError("Could not load speaker verification model")

 def load_models():
     speaker_model = load_speaker_model()
     nlp = spacy.load("en_core_web_sm")
-    return speaker_model, nlp, tokenizer, llm_model

-speaker_model, nlp, tokenizer, llm_model = load_models()
 def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
     try:
         audio = AudioSegment.from_file(audio_path)
-        audio = audio.set_channels(1)
-        audio = audio.set_frame_rate(16000)
         wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
         audio.export(wav_file, format="wav")
         return wav_file
@@ -133,354 +132,236 @@ def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
         logger.error(f"Audio conversion failed: {str(e)}")
         raise

 def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
     try:
-        segment.export(temp_path, format="wav")
-        y, sr = librosa.load(temp_path, sr=16000)
-        pitches = librosa.piptrack(y=y, sr=sr)[0]
         pitches = pitches[pitches > 0]
             'duration': (end_ms - start_ms) / 1000,
             'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
-            'min_pitch': float(np.min(pitches)) if len(pitches) > 0 else 0.0,
-            'max_pitch': float(np.max(pitches)) if len(pitches) > 0 else 0.0,
             'pitch_sd': float(np.std(pitches)) if len(pitches) > 0 else 0.0,
-            'intensityMean': float(np.mean(
-            'intensityMax': float(np.max(librosa.feature.rms(y=y)[0])),
-            'intensitySD': float(np.std(librosa.feature.rms(y=y)[0])),
         }
-        os.remove(temp_path)
-        return features
     except Exception as e:
         logger.error(f"Feature extraction failed: {str(e)}")
-        return {
-            'duration': (end_ms - start_ms) / 1000,
-            'mean_pitch': 0.0,
-            'min_pitch': 0.0,
-            'max_pitch': 0.0,
-            'pitch_sd': 0.0,
-            'intensityMean': 0.0,
-            'intensityMin': 0.0,
-            'intensityMax': 0.0,
-            'intensitySD': 0.0,
-        }

 def transcribe(audio_path: str) -> Dict:
     try:
         with open(audio_path, 'rb') as f:
-            upload_response = requests.post(
-                headers={"authorization": ASSEMBLYAI_KEY},
-                data=f
-            )
         audio_url = upload_response.json()['upload_url']
-            "filter_profanity": True
-        }
-        )
         transcript_id = transcript_response.json()['id']

         while True:
-            result = requests.get(
-                f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
-                headers={"authorization": ASSEMBLYAI_KEY}
-            ).json()
             if result['status'] == 'completed':
                 return result
             elif result['status'] == 'error':
-                raise Exception(result['error'])
             time.sleep(5)
     except Exception as e:
         logger.error(f"Transcription failed: {str(e)}")
         raise

     try:
-        start = utterance['start']
-        end = utterance['end']
         segment = full_audio[start:end]
-            vector=embedding.cpu().numpy().tolist(),
-            top_k=1,
-            include_metadata=True
-        )
-        if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
-            speaker_id = query_result['matches'][0]['id']
-            speaker_name = query_result['matches'][0]['metadata']['speaker_name']
-        else:
-            speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
-            speaker_name = f"Speaker_{speaker_id[-4:]}"
-            index.upsert([(speaker_id, embedding.tolist(), {"speaker_name": speaker_name})])
-        os.remove(temp_path)
-        return {
-            **utterance,
-            'speaker': speaker_name,
-            'speaker_id': speaker_id,
-            'embedding': embedding.cpu().numpy().tolist()
-        }
     except Exception as e:
         logger.error(f"Utterance processing failed: {str(e)}")
-        return {
-            **utterance,
-            'speaker': 'Unknown',
-            'speaker_id': 'unknown',
-            'embedding': None
-        }

 def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
     try:
         full_audio = AudioSegment.from_wav(wav_file)
         utterances = transcript['utterances']

-        with ThreadPoolExecutor(max_workers=5) as executor:  # Changed to 5 workers
-            futures = [
-                executor.submit(process_utterance, utterance, full_audio, wav_file)
-                for utterance in utterances
-            ]
-            results = [f.result() for f in futures]
-        return results
     except Exception as e:
         logger.error(f"Speaker identification failed: {str(e)}")
         raise

-def
-        features = []
-        labels = []
-        for i, utterance in enumerate(utterances):
-            prosodic = utterance['prosodic_features']
-            feat = [
-                prosodic['duration'],
-                prosodic['mean_pitch'],
-                prosodic['min_pitch'],
-                prosodic['max_pitch'],
-                prosodic['pitch_sd'],
-                prosodic['intensityMean'],
-                prosodic['intensityMin'],
-                prosodic['intensityMax'],
-                prosodic['intensitySD'],
-            ]
-            feat.extend(X_text[i].toarray()[0].tolist())
-            doc = nlp(utterance['text'])
-            feat.extend([
-                int(utterance['text'].endswith('?')),
-                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', utterance['text'].lower())),
-                len(utterance['text'].split()),
-                sum(1 for token in doc if token.pos_ == 'VERB'),
-                sum(1 for token in doc if token.pos_ == 'NOUN')
-            ])
-            features.append(feat)
-            labels.append(0 if i % 2 == 0 else 1)
-        scaler = StandardScaler()
-        X = scaler.fit_transform(features)
-        clf = RandomForestClassifier(
-            n_estimators=150,
-            max_depth=10,
-            random_state=42,
-            class_weight='balanced'
-        )
-        clf.fit(X, labels)
-        joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
-        joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
-        joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
-        return clf, vectorizer, scaler
-    except Exception as e:
-        logger.error(f"Classifier training failed: {str(e)}")
-        raise

-def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
     try:
-            X = scaler.transform([feat])
-            role = 'Interviewer' if clf.predict(X)[0] == 0 else 'Interviewee'
-            results.append({**utterance, 'role': role})
     except Exception as e:
         logger.error(f"Role classification failed: {str(e)}")

 def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
     try:
-        interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
         if not interviewee_utterances:
             return {'error': 'No interviewee utterances found'}
-        combined_audio = np.concatenate(segments)
         total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
         total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
-        speaking_rate = total_words / total_duration if total_duration > 0 else 0
-            for u in interviewee_utterances
-        )
         filler_ratio = filler_count / total_words if total_words > 0 else 0
         all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
-        pitches = []
-        for segment in segments:
-            f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
-            pitches.extend(f0[voiced_flag])
         pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
         pitch_std = np.std(pitches) if len(pitches) > 0 else 0
-        intensity_mean = np.mean(intensities) if intensities else 0
-        intensity_std = np.std(intensities) if intensities else 0
-        shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(
-            intensities) > 1 and intensity_mean > 0 else 0
-        anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
-        confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
-        hesitation_score = filler_ratio + repetition_score
-        anxiety_level = 'high' if anxiety_score > 0.15 else 'moderate' if anxiety_score > 0.07 else 'low'
-        confidence_level = 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.5 else 'low'
-        fluency_level = 'fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'moderate' if (
-            filler_ratio < 0.1 and repetition_score < 0.2) else 'disfluent'
         return {
             'speaking_rate': float(round(speaking_rate, 2)),
             'filler_ratio': float(round(filler_ratio, 4)),
             'repetition_score': float(round(repetition_score, 4)),
-            'pitch_analysis': {
-                'std_dev': float(round(pitch_std, 2)),
-                'jitter': float(round(jitter, 4))
-            },
-            'intensity_analysis': {
-                'mean': float(round(intensity_mean, 2)),
-                'std_dev': float(round(intensity_std, 2)),
-                'shimmer': float(round(shimmer, 4))
-            },
             'composite_scores': {
                 'anxiety': float(round(anxiety_score, 4)),
                 'confidence': float(round(confidence_score, 4)),
                 'hesitation': float(round(hesitation_score, 4))
             },
             'interpretation': {
-                'anxiety_level':
-                'confidence_level':
-                'fluency_level':
             }
         }
     except Exception as e:
-        logger.error(f"Voice analysis failed: {str(e)}")
         return {'error': str(e)}

 def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
     try:
         labels = ['Anxiety', 'Confidence']
         scores = [composite_scores.get('anxiety', 0), composite_scores.get('confidence', 0)]
         fig, ax = plt.subplots(figsize=(5, 3.5))
         bars = ax.bar(labels, scores, color=['#FF5252', '#26A69A'], edgecolor='black', width=0.45)
         ax.set_title('Vocal Dynamics: Anxiety vs. Confidence', fontsize=14, pad=15)
-        ax.set_ylim(0, 1.
         for bar in bars:
             height = bar.get_height()
-            ax.text(bar.get_x() + bar.get_width()/2, height + 0.
-                    ha='center', color='black', fontweight='bold', fontsize=11)
         ax.grid(True, axis='y', linestyle='--', alpha=0.7)
         plt.tight_layout()
         plt.savefig(chart_path_or_buffer, format='png', bbox_inches='tight', dpi=300)
@@ -489,67 +370,101 @@ def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
         logger.error(f"Error generating chart: {str(e)}")

 def calculate_acceptance_probability(analysis_data: Dict) -> float:
     voice = analysis_data.get('voice_analysis', {})
     if 'error' in voice: return 0.0
     acceptance_probability = max(0.0, min(1.0, normalized_score))
     return float(f"{acceptance_probability * 100:.2f}")

 def generate_report(analysis_data: Dict) -> str:
     try:
-        interviewee_responses = [f"
         acceptance_line = ""
         if acceptance_prob is not None:
             acceptance_line = f"\n**Hiring Suitability Score: {acceptance_prob:.2f}%**\n"
-            if acceptance_prob >= 80: acceptance_line += "HR Verdict: Outstanding candidate
-            elif acceptance_prob >= 60: acceptance_line += "HR Verdict: Strong candidate
-            elif acceptance_prob >= 40: acceptance_line += "HR Verdict: Moderate potential
-            else: acceptance_line += "HR Verdict: Limited fit
         prompt = f"""
-        You are EvalBot, a senior HR consultant
        {acceptance_line}
        **1. Executive Summary**
-        - Provide a concise overview of performance, key metrics, and hiring potential.
        - Interview length: {analysis_data['text_analysis']['total_duration']:.2f} seconds
-        - Speaker turns: {analysis_data['text_analysis']['speaker_turns']}
        - Participants: {', '.join(analysis_data['speakers'])}
        **2. Communication and Vocal Dynamics**
-        - Evaluate vocal delivery
-        - Offer HR insights on workplace alignment.
        {voice_interpretation}
        **3. Competency and Content Evaluation**
        - List strengths and growth areas separately, with specific examples.
-        - Sample
-        {
-        - Provide distinct, prioritized strategies for candidate growth.
-        - Target: Communication, Response Depth, Professional Presence.
-        - List clear next steps for hiring managers (e.g., advance, train, assess).
         """
         response = gemini_model.generate_content(prompt)
         return response.text
@@ -675,7 +590,7 @@ def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text:
         "Role Fit and Growth Potential": [],
         "Strategic HR Recommendations": {"Development Priorities": [], "Next Steps": []}
     }
-    report_parts = re.split(r'(\s
     current_section = None
     for part in report_parts:
         if not part.strip(): continue
@@ -771,10 +686,19 @@ def convert_to_serializable(obj):
     if isinstance(obj, np.ndarray): return obj.tolist()
     return obj

 def process_interview(audio_path_or_url: str):
-    wav_file = None
     is_downloaded = False
     try:
         logger.info(f"Starting processing for {audio_path_or_url}")
         if audio_path_or_url.startswith(('http://', 'https://')):
             is_downloaded = True
         else:
             local_audio_path = audio_path_or_url
         wav_file = convert_to_wav(local_audio_path)
         transcript = transcribe(wav_file)
         utterances_with_speakers = identify_speakers(transcript, wav_file)
-            scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
-        else:
-            clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)
-        classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)
         voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)
         analysis_data = {
             'transcript': classified_utterances,
-            'speakers': list(set(u['
             'voice_analysis': voice_analysis,
             'text_analysis': {
                 'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
                 'speaker_turns': len(classified_utterances)
             }
         }
         analysis_data['acceptance_probability'] = calculate_acceptance_probability(analysis_data)
         gemini_report_text = generate_report(analysis_data)
         base_name = str(uuid.uuid4())
         pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
         json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
         with open(json_path, 'w') as f:
             serializable_data = convert_to_serializable(analysis_data)
             json.dump(serializable_data, f, indent=2)
     except Exception as e:
         logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}", exc_info=True)
         raise
     finally:
         if wav_file and os.path.exists(wav_file):
             os.remove(wav_file)
         if is_downloaded and local_audio_path and os.path.exists(local_audio_path):

 from nemo.collections.asr.models import EncDecSpeakerLabelModel
 from pinecone import Pinecone, ServerlessSpec
 import librosa
 import re
+from typing import Dict, List
 import logging
 import tempfile
 from reportlab.lib.pagesizes import letter
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 from reportlab.lib.units import inch
 from reportlab.lib import colors

 matplotlib.use('Agg')
 from reportlab.platypus import Image
 import io
 import spacy
 import google.generativeai as genai
 from concurrent.futures import ThreadPoolExecutor
+import urllib3  # Fixed: add the missing urllib3 import
+
+# Logging configuration
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Reduce verbosity from the NeMo library
+logging.getLogger("nemo_logging").setLevel(logging.WARNING)
+logging.getLogger("nemo").setLevel(logging.WARNING)
+

 # Configuration
 OUTPUT_DIR = "./processed_audio"
 os.makedirs(OUTPUT_DIR, exist_ok=True)

 ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

+# --- Fixed: improved download helper with retries ---
 def download_audio_from_url(url: str, retries=3) -> str:
     """Downloads an audio file from a URL to a temporary local path with retries."""
+    temp_dir = tempfile.gettempdir()
+    temp_path = os.path.join(temp_dir, f"{uuid.uuid4()}.tmp_audio")
+    logger.info(f"Downloading audio from {url} to {temp_path}")
+
+    for attempt in range(retries):
+        try:
+            with requests.get(url, stream=True, timeout=60) as r:  # increased timeout
+                r.raise_for_status()
+                with open(temp_path, 'wb') as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+            logger.info("Download completed successfully.")
             return temp_path
+        except (requests.exceptions.RequestException, urllib3.exceptions.ProtocolError) as e:
+            logger.warning(f"Attempt {attempt + 1}/{retries} failed: {e}. Retrying...")
+            if attempt < retries - 1:
+                time.sleep(2 ** attempt)  # Exponential backoff
+            else:
+                logger.error(f"Failed to download audio after {retries} attempts.")
+                raise
+    raise Exception(f"Failed to download audio from URL {url}")
+

 def initialize_services():
+    """Initializes Pinecone and Gemini services."""
     try:
         pc = Pinecone(api_key=PINECONE_KEY)
         index_name = "interview-speaker-embeddings"
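The retry loop above follows a standard exponential-backoff pattern (sleep 1s, 2s, 4s, ... between attempts). As a minimal, self-contained sketch of the same idea, separate from this module (the URL below is hypothetical):

```python
import time
import requests

def fetch_with_backoff(url: str, retries: int = 3) -> bytes:
    """Fetch a URL, sleeping 1s, 2s, 4s... between failed attempts."""
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=60)
            r.raise_for_status()
            return r.content
        except requests.exceptions.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)  # 1s, 2s, 4s, ...

# data = fetch_with_backoff("https://example.com/audio.wav")  # hypothetical URL
```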
             spec=ServerlessSpec(cloud="aws", region="us-east-1")
         )
         index = pc.Index(index_name)
+
         genai.configure(api_key=GEMINI_API_KEY)
         gemini_model = genai.GenerativeModel('gemini-1.5-flash')
         return index, gemini_model
         raise

 index, gemini_model = initialize_services()
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f"Using device: {device}")

 def load_speaker_model():
+    """Loads the speaker verification model."""
     try:
+        # Ensure we do not use too many threads
+        torch.set_num_threads(1)
         model = EncDecSpeakerLabelModel.from_pretrained(
             "nvidia/speakerverification_en_titanet_large",
             map_location=torch.device('cpu')
         raise RuntimeError("Could not load speaker verification model")

 def load_models():
+    """Loads all necessary models."""
     speaker_model = load_speaker_model()
     nlp = spacy.load("en_core_web_sm")
+    return speaker_model, nlp
+
+speaker_model, nlp = load_models()

 def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
+    """Converts any audio file to a 16kHz mono WAV file."""
     try:
         audio = AudioSegment.from_file(audio_path)
+        audio = audio.set_frame_rate(16000).set_channels(1)
         wav_file = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
         audio.export(wav_file, format="wav")
         return wav_file
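pydub chains these setters, so the conversion step can be exercised on its own. A small sketch under the same assumptions as the function above (the input filename is hypothetical, and ffmpeg must be installed for non-WAV inputs):

```python
from pydub import AudioSegment

# Load any container format ffmpeg understands, then force 16 kHz mono,
# the sample rate and channel layout the speaker model expects.
audio = AudioSegment.from_file("interview.mp3")  # hypothetical input file
audio = audio.set_frame_rate(16000).set_channels(1)
audio.export("interview_16k_mono.wav", format="wav")
```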
         logger.error(f"Audio conversion failed: {str(e)}")
         raise

 def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
+    """Extracts prosodic features from an audio segment."""
     try:
+        y, sr = librosa.load(audio_path, sr=16000, offset=start_ms/1000.0, duration=(end_ms-start_ms)/1000.0)
+
+        pitches, _ = librosa.piptrack(y=y, sr=sr)
         pitches = pitches[pitches > 0]
+
+        rms = librosa.feature.rms(y=y)[0]

+        return {
             'duration': (end_ms - start_ms) / 1000,
             'mean_pitch': float(np.mean(pitches)) if len(pitches) > 0 else 0.0,
             'pitch_sd': float(np.std(pitches)) if len(pitches) > 0 else 0.0,
+            'intensityMean': float(np.mean(rms)),
+            'intensitySD': float(np.std(rms)),
         }
     except Exception as e:
         logger.error(f"Feature extraction failed: {str(e)}")
+        return {'duration': 0, 'mean_pitch': 0, 'pitch_sd': 0, 'intensityMean': 0, 'intensitySD': 0}

 def transcribe(audio_path: str) -> Dict:
+    """Transcribes audio using AssemblyAI and enables speaker labels."""
     try:
+        headers = {"authorization": ASSEMBLYAI_KEY}
         with open(audio_path, 'rb') as f:
+            upload_response = requests.post("https://api.assemblyai.com/v2/upload", headers=headers, data=f)
         audio_url = upload_response.json()['upload_url']
+
+        transcript_request = {
+            "audio_url": audio_url,
+            "speaker_labels": True,
+        }
+
+        transcript_response = requests.post("https://api.assemblyai.com/v2/transcript", json=transcript_request, headers=headers)
         transcript_id = transcript_response.json()['id']

         while True:
+            result = requests.get(f"https://api.assemblyai.com/v2/transcript/{transcript_id}", headers=headers).json()
             if result['status'] == 'completed':
+                if not result.get('utterances'):
+                    raise ValueError("Transcription completed but no utterances were returned. The audio may be too short or silent.")
                 return result
             elif result['status'] == 'error':
+                raise Exception(f"Transcription failed: {result['error']}")
             time.sleep(5)
     except Exception as e:
         logger.error(f"Transcription failed: {str(e)}")
         raise

+def process_utterance(utterance, full_audio):
+    """Processes a single utterance to get a speaker embedding."""
     try:
+        start, end = utterance['start'], utterance['end']
         segment = full_audio[start:end]
+
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_f:
+            segment.export(temp_f.name, format="wav")
+            with torch.no_grad():
+                embedding = speaker_model.get_embedding(temp_f.name).cpu().numpy().flatten()
+
+        return {**utterance, 'embedding': embedding}
     except Exception as e:
         logger.error(f"Utterance processing failed: {str(e)}")
+        return {**utterance, 'embedding': np.zeros(192)}  # Return zero vector on failure
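The embedding returned by NeMo's get_embedding is a fixed-length vector (TitaNet-Large produces 192 dimensions, matching the zero-vector fallback above), so two utterances can be compared with cosine similarity. A sketch assuming the speaker_model loaded above (file names are hypothetical):

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Cosine of the angle between two embedding vectors, in [-1, 1].
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# emb_a = speaker_model.get_embedding("utterance_a.wav").cpu().numpy().flatten()
# emb_b = speaker_model.get_embedding("utterance_b.wav").cpu().numpy().flatten()
# same_speaker = cosine_similarity(emb_a, emb_b) > 0.7  # threshold is illustrative
```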
 def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
+    """Identifies unique speakers from utterances."""
     try:
         full_audio = AudioSegment.from_wav(wav_file)
         utterances = transcript['utterances']
+
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            futures = [executor.submit(process_utterance, u, full_audio) for u in utterances]
+            processed_utterances = [f.result() for f in futures]
+
+        # Map AssemblyAI speaker labels (A, B, C...) to unique speaker names
+        speaker_map = {}
+        unique_speaker_count = 0
+
+        for u in processed_utterances:
+            assembly_speaker = u['speaker']
+            if assembly_speaker not in speaker_map:
+                unique_speaker_count += 1
+                speaker_map[assembly_speaker] = f"Speaker_{unique_speaker_count}"
+            u['speaker_name'] = speaker_map[assembly_speaker]
+
+        return processed_utterances
     except Exception as e:
         logger.error(f"Speaker identification failed: {str(e)}")
         raise

+# --- Fixed: replaced the trained classification model with a heuristic for role classification ---
+def classify_roles(utterances: List[Dict]) -> List[Dict]:
+    """
+    Classifies roles as 'Interviewer' or 'Interviewee' based on heuristics.
+    The 'Interviewer' is assumed to be the one who asks more questions.
+    """
     try:
+        speaker_stats = {}
+        question_words = {'what', 'why', 'how', 'when', 'where', 'who', 'which', 'tell', 'describe', 'explain'}
+
+        for u in utterances:
+            speaker = u['speaker_name']
+            if speaker not in speaker_stats:
+                speaker_stats[speaker] = {'question_score': 0, 'utterance_count': 0}
+
+            speaker_stats[speaker]['utterance_count'] += 1
+            text_lower = u['text'].lower()
+
+            # Increase the score if the text ends with a question mark
+            if text_lower.endswith('?'):
+                speaker_stats[speaker]['question_score'] += 1
+
+            # Increase the score for each question word
+            for word in question_words:
+                if word in text_lower.split():
+                    speaker_stats[speaker]['question_score'] += 1
+
+        if not speaker_stats:
+            # If no speakers were found, classification is not possible
+            return utterances
+
+        # Pick the interviewer as the speaker with the highest question_score
+        interviewer_speaker = max(speaker_stats, key=lambda s: speaker_stats[s]['question_score'])
+
+        logger.info(f"Speaker stats for role classification: {speaker_stats}")
+        logger.info(f"Identified Interviewer: {interviewer_speaker}")

+        for u in utterances:
+            if u['speaker_name'] == interviewer_speaker:
+                u['role'] = 'Interviewer'
+            else:
+                u['role'] = 'Interviewee'
+
+        return utterances
     except Exception as e:
         logger.error(f"Role classification failed: {str(e)}")
+        # Assign a default role on failure
+        for u in utterances:
+            u['role'] = 'Unknown'
+        return utterances

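To see the heuristic in action, here is a small self-contained sketch with toy utterances (the sample texts are invented for illustration):

```python
utterances = [
    {'speaker_name': 'Speaker_1', 'text': 'Tell me about a project you led. What was hard?'},
    {'speaker_name': 'Speaker_2', 'text': 'I led a data migration and owned the rollout plan.'},
]
# Score one point per trailing '?' and per question word, as in classify_roles.
question_words = {'what', 'why', 'how', 'tell', 'describe', 'explain'}
scores = {}
for u in utterances:
    words = u['text'].lower().split()
    scores.setdefault(u['speaker_name'], 0)
    scores[u['speaker_name']] += u['text'].endswith('?') + sum(w in words for w in question_words)
print(max(scores, key=scores.get))  # -> 'Speaker_1' would be labelled the Interviewer
```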
 def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
+    """Analyzes the voice characteristics of the interviewee."""
     try:
+        interviewee_utterances = [u for u in utterances if u.get('role') == 'Interviewee']
         if not interviewee_utterances:
             return {'error': 'No interviewee utterances found'}
+
+        y, sr = librosa.load(audio_path, sr=16000)
+
+        # Extract the interviewee's audio segments
+        segments = [y[int(u['start']*sr/1000):int(u['end']*sr/1000)] for u in interviewee_utterances]
+
         total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
         total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
+        speaking_rate = total_words / (total_duration / 60) if total_duration > 0 else 0  # Words per minute

+        # Filler-word analysis
+        filler_words = {'um', 'uh', 'like', 'you know', 'so', 'i mean', 'actually'}
+        filler_count = sum(1 for u in interviewee_utterances for word in u['text'].lower().split() if word in filler_words)
         filler_ratio = filler_count / total_words if total_words > 0 else 0

+        # Word-repetition analysis
         all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
+        repetition_score = (len(all_words) - len(set(all_words))) / len(all_words) if all_words else 0
+
+        # Pitch and intensity analysis
+        pitches = np.concatenate([librosa.pyin(s, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))[0] for s in segments if len(s)>0])
+        pitches = pitches[~np.isnan(pitches)]
+
+        intensities = np.concatenate([librosa.feature.rms(y=s)[0] for s in segments if len(s)>0])

         pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
         pitch_std = np.std(pitches) if len(pitches) > 0 else 0
+        intensity_mean = np.mean(intensities) if len(intensities) > 0 else 0
+        intensity_std = np.std(intensities) if len(intensities) > 0 else 0

+        # Composite scores
+        anxiety_score = (pitch_std / 150) if pitch_std > 0 else 0  # simple normalization
+        confidence_score = 1 - (intensity_std * 5) if intensity_std > 0 else 1  # simple normalization
+        hesitation_score = (filler_ratio + repetition_score) / 2

+        # Clamp scores to [0, 1]
+        anxiety_score = max(0, min(1, anxiety_score))
+        confidence_score = max(0, min(1, confidence_score))
+
         return {
             'speaking_rate': float(round(speaking_rate, 2)),
             'filler_ratio': float(round(filler_ratio, 4)),
             'repetition_score': float(round(repetition_score, 4)),
+            'pitch_analysis': {'mean': float(round(pitch_mean, 2)), 'std_dev': float(round(pitch_std, 2))},
+            'intensity_analysis': {'mean': float(round(intensity_mean, 4)), 'std_dev': float(round(intensity_std, 4))},
             'composite_scores': {
                 'anxiety': float(round(anxiety_score, 4)),
                 'confidence': float(round(confidence_score, 4)),
                 'hesitation': float(round(hesitation_score, 4))
             },
             'interpretation': {
+                'anxiety_level': 'high' if anxiety_score > 0.6 else 'moderate' if anxiety_score > 0.3 else 'low',
+                'confidence_level': 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.4 else 'low',
+                'fluency_level': 'disfluent' if hesitation_score > 0.1 else 'moderate' if hesitation_score > 0.05 else 'fluent'
             }
         }
     except Exception as e:
+        logger.error(f"Voice analysis failed: {str(e)}", exc_info=True)
         return {'error': str(e)}

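librosa.pyin returns a frame-wise f0 track with NaN for unvoiced frames, which is why the code above masks NaNs before taking statistics. A minimal sketch on a synthetic tone:

```python
import numpy as np
import librosa

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
y = 0.5 * np.sin(2 * np.pi * 220.0 * t)  # 1 s tone at 220 Hz

f0, voiced_flag, voiced_probs = librosa.pyin(
    y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr)
f0 = f0[~np.isnan(f0)]            # drop unvoiced frames
print(round(float(np.mean(f0))))  # approximately 220
```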
 def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
+    """Generates a bar chart for anxiety and confidence scores."""
     try:
         labels = ['Anxiety', 'Confidence']
         scores = [composite_scores.get('anxiety', 0), composite_scores.get('confidence', 0)]
+
         fig, ax = plt.subplots(figsize=(5, 3.5))
         bars = ax.bar(labels, scores, color=['#FF5252', '#26A69A'], edgecolor='black', width=0.45)
+
+        ax.set_ylabel('Score (0 to 1)', fontsize=12)
         ax.set_title('Vocal Dynamics: Anxiety vs. Confidence', fontsize=14, pad=15)
+        ax.set_ylim(0, 1.1)
+
         for bar in bars:
             height = bar.get_height()
+            ax.text(bar.get_x() + bar.get_width()/2, height + 0.02, f"{height:.2f}",
+                    ha='center', va='bottom', color='black', fontweight='bold', fontsize=11)
+
         ax.grid(True, axis='y', linestyle='--', alpha=0.7)
         plt.tight_layout()
         plt.savefig(chart_path_or_buffer, format='png', bbox_inches='tight', dpi=300)

         logger.error(f"Error generating chart: {str(e)}")

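Since matplotlib runs with the Agg backend here, the chart can be rendered straight into an in-memory buffer and handed to ReportLab. A sketch assuming the module context above (the dimensions are illustrative):

```python
import io

buf = io.BytesIO()
generate_anxiety_confidence_chart({'anxiety': 0.25, 'confidence': 0.8}, buf)
buf.seek(0)
# chart_image = Image(buf, width=4 * inch, height=2.8 * inch)  # reportlab Flowable for the PDF story
```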
 def calculate_acceptance_probability(analysis_data: Dict) -> float:
+    """Calculates a suitability score based on analysis data."""
     voice = analysis_data.get('voice_analysis', {})
     if 'error' in voice: return 0.0
+
+    # Define the weights
+    w_confidence, w_anxiety, w_fluency, w_speaking_rate = 0.4, -0.2, 0.2, 0.2
+
+    confidence_score = voice.get('composite_scores', {}).get('confidence', 0.5)
+    anxiety_score = voice.get('composite_scores', {}).get('anxiety', 0.5)
+    hesitation_score = voice.get('composite_scores', {}).get('hesitation', 0.5)
+    fluency_score = 1 - hesitation_score
+
+    # Score the speaking rate
+    rate = voice.get('speaking_rate', 150)
+    if 120 <= rate <= 180:
+        speaking_rate_score = 1.0
+    elif 100 <= rate < 120 or 180 < rate <= 200:
+        speaking_rate_score = 0.7
+    else:
+        speaking_rate_score = 0.4
+
+    raw_score = (confidence_score * w_confidence +
+                 (1 - anxiety_score) * abs(w_anxiety) +
+                 fluency_score * w_fluency +
+                 speaking_rate_score * w_speaking_rate)
+
+    max_possible_score = w_confidence + abs(w_anxiety) + w_fluency + w_speaking_rate
+
+    normalized_score = raw_score / max_possible_score if max_possible_score != 0 else 0
     acceptance_probability = max(0.0, min(1.0, normalized_score))
+
     return float(f"{acceptance_probability * 100:.2f}")

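As a worked example of the weighting above: with confidence 0.8, anxiety 0.3, hesitation 0.1, and a rate of 150 WPM, the raw score is 0.8*0.4 + 0.7*0.2 + 0.9*0.2 + 1.0*0.2 = 0.84 against a maximum of 1.0, giving 84.00%. The same check as a sketch, assuming the module context above:

```python
analysis_data = {'voice_analysis': {
    'composite_scores': {'confidence': 0.8, 'anxiety': 0.3, 'hesitation': 0.1},
    'speaking_rate': 150.0,
}}
print(calculate_acceptance_probability(analysis_data))  # -> 84.0
```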
+# --- Fixed: add the missing helper function ---
+def generate_voice_interpretation(voice: Dict) -> str:
+    """Generates a human-readable interpretation of voice analysis."""
+    if not voice or 'error' in voice:
+        return "- Vocal analysis could not be performed as no interviewee was identified."
+
+    interp = voice.get('interpretation', {})
+    scores = voice.get('composite_scores', {})
+
+    confidence = interp.get('confidence_level', 'N/A').capitalize()
+    anxiety = interp.get('anxiety_level', 'N/A').capitalize()
+    fluency = interp.get('fluency_level', 'N/A').capitalize()
+    rate = voice.get('speaking_rate', 0)
+
+    lines = [
+        f"- **Confidence:** {confidence} (Score: {scores.get('confidence', 0):.2f}). The candidate's vocal tone suggests their level of assurance.",
+        f"- **Anxiety:** {anxiety} (Score: {scores.get('anxiety', 0):.2f}). Vocal stress indicators point to their comfort level during the interview.",
+        f"- **Fluency & Hesitation:** {fluency} (Hesitation Score: {scores.get('hesitation', 0):.2f}). Reflects the smoothness of speech and use of filler words.",
+        f"- **Speaking Rate:** {rate:.0f} words per minute. A normal conversational pace is typically between 120-180 WPM."
+    ]
+    return "\n".join(lines)
+
+
 def generate_report(analysis_data: Dict) -> str:
+    """Generates a comprehensive report using Gemini AI."""
     try:
+        voice_interpretation = generate_voice_interpretation(analysis_data.get('voice_analysis', {}))
+
+        interviewee_responses = [f"- {u['text']}" for u in analysis_data['transcript'] if u.get('role') == 'Interviewee'][:4]
+
+        acceptance_prob = analysis_data.get('acceptance_probability')
         acceptance_line = ""
         if acceptance_prob is not None:
             acceptance_line = f"\n**Hiring Suitability Score: {acceptance_prob:.2f}%**\n"
+            if acceptance_prob >= 80: acceptance_line += "HR Verdict: Outstanding candidate. Highly recommended for advancement."
+            elif acceptance_prob >= 60: acceptance_line += "HR Verdict: Strong candidate. Suitable for further evaluation."
+            elif acceptance_prob >= 40: acceptance_line += "HR Verdict: Moderate potential. Requires additional assessment."
+            else: acceptance_line += "HR Verdict: Limited fit for the role at this time."
+
         prompt = f"""
+        You are EvalBot, a senior HR consultant. Generate a polished, concise, and engaging interview analysis report. Use a professional tone, clear headings, and bullet points.
+
        {acceptance_line}
+
        **1. Executive Summary**
+        - Provide a concise overview of the candidate's performance, key metrics, and hiring potential.
        - Interview length: {analysis_data['text_analysis']['total_duration']:.2f} seconds
        - Participants: {', '.join(analysis_data['speakers'])}
+
        **2. Communication and Vocal Dynamics**
+        - Evaluate vocal delivery based on the following analysis. Offer HR insights on its impact.
        {voice_interpretation}
+
        **3. Competency and Content Evaluation**
+        - Based on the sample responses below, assess competencies like leadership, problem-solving, and self-awareness.
        - List strengths and growth areas separately, with specific examples.
+        - Sample Responses from Candidate:
+        {' '.join(interviewee_responses) if interviewee_responses else "No responses from interviewee were identified."}
+
+        **4. Strategic HR Recommendations**
+        - Provide prioritized strategies for the candidate's growth.
+        - List clear next steps for hiring managers (e.g., advance, further technical assessment, reject).
         """
         response = gemini_model.generate_content(prompt)
         return response.text

         "Role Fit and Growth Potential": [],
         "Strategic HR Recommendations": {"Development Priorities": [], "Next Steps": []}
     }
+    report_parts = re.split(r'(\*\*\d\.\s*.*?\*\*)', gemini_report_text)
     current_section = None
     for part in report_parts:
         if not part.strip(): continue
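Because of the capture group, re.split keeps each bold numbered heading as its own list element. A quick sketch (the sample text is invented):

```python
import re

text = "**1. Executive Summary** Solid overall. **2. Communication and Vocal Dynamics** Clear delivery."
parts = re.split(r'(\*\*\d\.\s*.*?\*\*)', text)
print([p.strip() for p in parts if p.strip()])
# -> ['**1. Executive Summary**', 'Solid overall.',
#     '**2. Communication and Vocal Dynamics**', 'Clear delivery.']
```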
     if isinstance(obj, np.ndarray): return obj.tolist()
     return obj

+def convert_to_serializable(obj):
+    """Converts numpy types to native Python types for JSON serialization."""
+    if isinstance(obj, np.generic): return obj.item()
+    if isinstance(obj, dict): return {k: convert_to_serializable(v) for k, v in obj.items()}
+    if isinstance(obj, list): return [convert_to_serializable(i) for i in obj]
+    if isinstance(obj, np.ndarray): return obj.tolist()
+    return obj
+
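A quick check of the converter on a nested structure (toy data, assuming the function above is in scope):

```python
import json
import numpy as np

data = {'scores': np.array([0.1, 0.9]), 'count': np.int64(3), 'nested': [{'p': np.float32(0.5)}]}
print(json.dumps(convert_to_serializable(data)))
# -> {"scores": [0.1, 0.9], "count": 3, "nested": [{"p": 0.5}]}
```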
 def process_interview(audio_path_or_url: str):
+    """Main function to process an interview from an audio file or URL."""
+    local_audio_path, wav_file = None, None
     is_downloaded = False
+
     try:
         logger.info(f"Starting processing for {audio_path_or_url}")
         if audio_path_or_url.startswith(('http://', 'https://')):
             is_downloaded = True
         else:
             local_audio_path = audio_path_or_url
+
         wav_file = convert_to_wav(local_audio_path)
         transcript = transcribe(wav_file)
+
+        for u in transcript['utterances']:
+            u['prosodic_features'] = extract_prosodic_features(wav_file, u['start'], u['end'])
+
         utterances_with_speakers = identify_speakers(transcript, wav_file)
+
+        # Classify roles using the heuristic approach
+        classified_utterances = classify_roles(utterances_with_speakers)
+
         voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)
+
         analysis_data = {
             'transcript': classified_utterances,
+            'speakers': list(set(u['speaker_name'] for u in classified_utterances)),
             'voice_analysis': voice_analysis,
             'text_analysis': {
                 'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
                 'speaker_turns': len(classified_utterances)
             }
         }
+
         analysis_data['acceptance_probability'] = calculate_acceptance_probability(analysis_data)
+
         gemini_report_text = generate_report(analysis_data)
+
         base_name = str(uuid.uuid4())
         pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
         json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
+
+        # create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text)
+
         with open(json_path, 'w') as f:
             serializable_data = convert_to_serializable(analysis_data)
             json.dump(serializable_data, f, indent=2)
+
+        logger.info(f"Processing completed. JSON report at: {json_path}")
+        return {'pdf_path': pdf_path, 'json_path': json_path, 'report_text': gemini_report_text}
+
     except Exception as e:
         logger.error(f"Processing failed for {audio_path_or_url}: {str(e)}", exc_info=True)
         raise
     finally:
+        # Clean up temporary files
         if wav_file and os.path.exists(wav_file):
             os.remove(wav_file)
         if is_downloaded and local_audio_path and os.path.exists(local_audio_path):