norhan12 commited on
Commit
5327928
·
1 Parent(s): 6b6ec0a

Initial project setup with multi-URL API

Browse files
Dockerfile ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# 1. Install system dependencies (audio tooling + curl for the healthcheck).
#    --no-install-recommends keeps the image slim.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libsndfile1 \
    ffmpeg \
    sox \
    curl \
    git-lfs \
    && rm -rf /var/lib/apt/lists/*

# 2. Create non-root user early so ownership is correct from the start
RUN useradd -m appuser

# 3. Create directory structure with proper ownership
RUN mkdir -p \
    /tmp/matplotlib \
    /tmp/fontconfig \
    /tmp/lhotse \
    /app/uploads \
    /app/processed_audio \
    /app/assets && \
    chown -R appuser:appuser /app /tmp/matplotlib /tmp/fontconfig /tmp/lhotse

# 4. Set working directory
WORKDIR /app

# 5. Copy application files
COPY --chown=appuser:appuser . .

# 6. Environment: redirect cache/config dirs to writable /tmp locations
ENV MPLCONFIGDIR=/tmp/matplotlib \
    FONTCONFIG_PATH=/tmp/fontconfig \
    LHOTSE_CACHE_DIR=/tmp/lhotse \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PYTHONUNBUFFERED=1

# 7. Install Python dependencies as the non-root user
USER appuser
RUN pip install --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt && \
    python -m spacy download en_core_web_sm && \
    pip check

# 8. Health check; --start-period gives the heavy ML models time to load
#    before failed probes count against the container
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s \
    CMD curl -f http://localhost:7860/ || exit 1

# 9. Run the application
CMD ["python", "app.py"]
README.md DELETED
@@ -1,11 +0,0 @@
1
- ---
2
- title: EvalBot
3
- emoji: 🐠
4
- colorFrom: blue
5
- colorTo: yellow
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import logging
4
+ import json
5
+ import shutil
6
+ from pathlib import Path
7
+ import tempfile
8
+ import gradio as gr
9
+ from process_interview import process_interview
10
+ from typing import Tuple, Optional, List, Dict
11
+ from concurrent.futures import ThreadPoolExecutor # Import ThreadPoolExecutor for parallel processing
12
+
13
# Configure application-wide logging and silence NeMo's chatty loggers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
for noisy_logger in ("nemo_logging", "nemo"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)

# Configuration: all generated artifacts (WAV slices, PDFs, JSON) land here.
OUTPUT_DIR = "./processed_audio"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Constants
VALID_EXTENSIONS = ('.wav', '.mp3', '.m4a', '.flac')  # accepted upload suffixes
MAX_FILE_SIZE = 100 * 1024 * 1024  # 100MB per-file ceiling
29
+
30
+
31
def check_health() -> str:
    """Report whether required directories exist (mirrors a /health endpoint)."""
    try:
        for required_dir in (OUTPUT_DIR,):
            if os.path.exists(required_dir):
                continue
            raise Exception(f"Directory {required_dir} does not exist")
    except Exception as e:
        logger.error(f"Health check failed: {str(e)}")
        return f"System is unhealthy: {str(e)}"
    return "System is healthy"
41
+
42
+
43
# Helper: run the full analysis pipeline for one local audio file.
def process_single_audio(file_path_or_url: str) -> Dict:
    """Validate, analyze, and summarize a single audio file.

    Gradio downloads remote URLs before invoking us, so the argument is
    always a local filesystem path by the time it arrives here. Returns a
    dict containing either an 'error' key, or 'summary', 'json_data',
    'pdf_path', and 'original_input'.
    """
    try:
        if not file_path_or_url:
            return {"error": "No audio provided for processing."}

        audio_file = Path(file_path_or_url)

        # Reject unsupported extensions before doing any heavy work.
        suffix = audio_file.suffix.lower()
        if suffix not in VALID_EXTENSIONS:
            return {"error": f"Invalid file format: {suffix}. Supported formats: {', '.join(VALID_EXTENSIONS)}"}

        size_bytes = os.path.getsize(audio_file)
        if size_bytes > MAX_FILE_SIZE:
            return {
                "error": f"File too large: {size_bytes / (1024 * 1024):.2f}MB. Max size: {MAX_FILE_SIZE // (1024 * 1024)}MB"}

        logger.info(f"Processing audio from: {audio_file}")
        result = process_interview(str(audio_file))

        if not result or 'pdf_path' not in result or 'json_path' not in result:
            return {"error": "Processing failed - invalid result format."}

        pdf_file = Path(result['pdf_path'])
        json_file = Path(result['json_path'])
        if not (pdf_file.exists() and json_file.exists()):
            return {"error": "Processing failed - output files not found."}

        with json_file.open('r') as fh:
            analysis_data = json.load(fh)

        # Interpretation may be missing if voice analysis failed; fall back
        # to 'Unknown' rather than raising.
        interpretation = analysis_data.get('voice_analysis', {}).get('interpretation', {})
        summary = (
            f"Speakers: {', '.join(analysis_data['speakers'])}\n"
            f"Interview Duration: {analysis_data['text_analysis']['total_duration']:.2f} seconds\n"
            f"Confidence Level: {interpretation.get('confidence_level', 'Unknown')}\n"
            f"Anxiety Level: {interpretation.get('anxiety_level', 'Unknown')}"
        )

        return {
            "summary": summary,
            "json_data": json.dumps(analysis_data, indent=2),
            "pdf_path": str(pdf_file),
            "original_input": file_path_or_url  # lets callers map results back to inputs
        }

    except Exception as e:
        logger.error(f"Error processing single audio: {str(e)}", exc_info=True)
        return {"error": f"Error during processing: {str(e)}"}
98
+
99
+
100
# Entry point for the Gradio button: fan out over all provided audios.
def analyze_multiple_audios(file_paths_or_urls: List[str]) -> Tuple[str, str, List[str]]:
    """
    Analyzes multiple interview audio files/URLs in parallel.
    Returns combined summary, combined JSON, and a list of PDF paths.
    """
    if not file_paths_or_urls:
        return "No audio files/URLs provided.", "[]", []

    summaries: List[str] = []
    json_chunks: List[str] = []
    pdf_paths: List[str] = []

    # Small worker pool; tune max_workers to available resources and load.
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_item = {executor.submit(process_single_audio, item): item for item in file_paths_or_urls}

        for future, item in future_to_item.items():
            try:
                outcome = future.result()
            except Exception as exc:
                logger.error(f"Item {item} generated an unexpected exception: {exc}", exc_info=True)
                summaries.append(f"Error processing {item}: An unexpected error occurred.")
                json_chunks.append(json.dumps({"input": item, "error": str(exc)}, indent=2))
                continue

            if "error" in outcome:
                summaries.append(f"Error processing {item}: {outcome['error']}")
                # Keep the failure visible in the JSON output too.
                json_chunks.append(json.dumps({"input": item, "error": outcome['error']}, indent=2))
            else:
                summaries.append(f"Analysis for {os.path.basename(item)}:\n{outcome['summary']}")
                json_chunks.append(outcome['json_data'])
                pdf_paths.append(outcome['pdf_path'])

    combined_summary = "\n\n---\n\n".join(summaries)
    # Stitch per-file JSON strings into one valid JSON array string.
    combined_json_list = "[\n" + ",\n".join(json_chunks) + "\n]"
    return combined_summary, combined_json_list, pdf_paths
139
+
140
+
141
# Gradio UI: left column = health + inputs, right column = results.
with gr.Blocks(title="Interview Analysis System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 Interview Audio Analysis System
    Provide multiple audio file URLs or upload multiple audio files to analyze speaker performance.
    Supported formats: WAV, MP3, M4A, FLAC (max 100MB per file).
    """)

    with gr.Row():
        with gr.Column():
            health_status = gr.Textbox(label="System Status", value=check_health(), interactive=False)
            audio_inputs = gr.File(
                label="Provide Audio URLs or Upload Files (Multiple allowed)",
                type="filepath",
                file_count="multiple",  # accept a batch of files in one submission
            )
            submit_btn = gr.Button("Start Analysis", variant="primary")

        with gr.Column():
            output_summary = gr.Textbox(label="Combined Analysis Summary", interactive=False, lines=10)
            output_json = gr.Textbox(label="Detailed Analysis (JSON Array)", interactive=False, lines=20)
            pdf_outputs = gr.File(label="Download All Reports", type="filepath", file_count="multiple")

    # Wire the button to the batch-analysis entry point.
    submit_btn.click(
        fn=analyze_multiple_audios,
        inputs=audio_inputs,
        outputs=[output_summary, output_json, pdf_outputs]
    )

# Run the interface
if __name__ == "__main__":
    demo.launch(server_port=7860, server_name="0.0.0.0")
.gitattributes → gitattributes RENAMED
File without changes
interview_transcripts_by_turkers.csv ADDED
The diff for this file is too large to render. See raw diff
 
process_interview.py ADDED
@@ -0,0 +1,920 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ import uuid
5
+ import requests
6
+ import time
7
+ import json
8
+ from pydub import AudioSegment
9
+ import wave
10
+ from nemo.collections.asr.models import EncDecSpeakerLabelModel
11
+ from pinecone import Pinecone, ServerlessSpec
12
+ import librosa
13
+ import pandas as pd
14
+ from sklearn.ensemble import RandomForestClassifier
15
+ from sklearn.preprocessing import StandardScaler
16
+ from sklearn.feature_extraction.text import TfidfVectorizer
17
+ import re
18
+ from typing import Dict, List, Tuple
19
+ import logging
20
+ # --- Imports for enhanced PDF ---
21
+ from reportlab.lib.pagesizes import letter
22
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
23
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
24
+ from reportlab.lib.units import inch
25
+ from reportlab.lib import colors
26
+ # --- End Imports for enhanced PDF ---
27
+ from transformers import AutoTokenizer, AutoModel
28
+ import spacy
29
+ import google.generativeai as genai
30
+ import joblib
31
+ from concurrent.futures import ThreadPoolExecutor
32
+
33
# Module-level logging; NeMo's internal logger is noisy, keep it at ERROR.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logging.getLogger("nemo_logging").setLevel(logging.ERROR)

# Configuration: upload inputs and generated artifacts.
AUDIO_DIR = "./uploads"
OUTPUT_DIR = "./processed_audio"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# API Keys — read from the environment; any may be None when unset.
PINECONE_KEY = os.getenv("PINECONE_KEY")
ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
47
+
48
+
49
# Initialize external services (Pinecone vector index + Gemini).
def initialize_services():
    """Connect to Pinecone (creating the embeddings index if absent) and
    configure the Gemini model.

    Returns:
        (index, gemini_model): ready-to-use service handles.
    """
    try:
        pc = Pinecone(api_key=PINECONE_KEY)
        index_name = "interview-speaker-embeddings"
        if index_name not in pc.list_indexes().names():
            # 192-dim cosine index matches the TitaNet speaker embeddings.
            pc.create_index(
                name=index_name,
                dimension=192,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )
        index = pc.Index(index_name)

        genai.configure(api_key=GEMINI_API_KEY)
        gemini_model = genai.GenerativeModel('gemini-1.5-flash')
        return index, gemini_model
    except Exception as e:
        logger.error(f"Error initializing services: {str(e)}")
        raise
70
+
71
+
72
# Bootstrap shared service handles at import time so every helper below can
# use them without re-initialising per call.
index, gemini_model = initialize_services()

# Device setup: prefer GPU when available; models/tensors below follow this.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")
77
+
78
+
79
def load_speaker_model():
    """Load the TitaNet speaker-verification model onto CPU in eval mode.

    Returns:
        EncDecSpeakerLabelModel ready for inference.

    Raises:
        RuntimeError: if the pretrained checkpoint cannot be loaded.
    """
    try:
        # Cap intra-op threads so model inference does not starve the
        # ThreadPoolExecutor workers used elsewhere in this module.
        # (The redundant function-local `import torch` was removed; torch is
        # already imported at module level.)
        torch.set_num_threads(5)
        model = EncDecSpeakerLabelModel.from_pretrained(
            "nvidia/speakerverification_en_titanet_large",
            map_location=torch.device('cpu')
        )
        model.eval()
        return model
    except Exception as e:
        logger.error(f"Model loading failed: {str(e)}")
        # Chain the original cause for easier debugging.
        raise RuntimeError("Could not load speaker verification model") from e
92
+
93
+
94
# Load all ML models once at import time; reused by every processing call.
def load_models():
    """Load the speaker-verification, spaCy, and DistilBERT models.

    Returns:
        (speaker_model, nlp, tokenizer, llm_model) tuple.
    """
    verification_model = load_speaker_model()
    spacy_pipeline = spacy.load("en_core_web_sm")

    bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    bert_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
    bert_model.eval()

    return verification_model, spacy_pipeline, bert_tokenizer, bert_model


speaker_model, nlp, tokenizer, llm_model = load_models()
107
+
108
+
109
# Audio processing functions
def convert_to_wav(audio_path: str, output_dir: str = OUTPUT_DIR) -> str:
    """Normalise any supported audio file to 16 kHz mono WAV.

    Returns:
        Path of the newly written WAV file (random UUID name) in output_dir.
    """
    try:
        sound = AudioSegment.from_file(audio_path)
        # Downmix to mono and resample to the 16 kHz rate the models expect.
        if sound.channels > 1:
            sound = sound.set_channels(1)
        sound = sound.set_frame_rate(16000)

        destination = os.path.join(output_dir, f"{uuid.uuid4()}.wav")
        sound.export(destination, format="wav")
        return destination
    except Exception as e:
        logger.error(f"Audio conversion failed: {str(e)}")
        raise
123
+
124
+
125
def extract_prosodic_features(audio_path: str, start_ms: int, end_ms: int) -> Dict:
    """Extract pitch and intensity statistics for one [start_ms, end_ms) slice.

    The slice is exported to a temp WAV, loaded at 16 kHz, and analysed with
    librosa. On any failure a dict of zeroed features (correct duration) is
    returned instead of raising, so callers can proceed.

    Returns:
        Dict with duration, mean/min/max/std pitch, and mean/min/max/std RMS
        intensity (keys match the role-classifier feature order).
    """
    duration_s = (end_ms - start_ms) / 1000
    temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
    try:
        audio = AudioSegment.from_file(audio_path)
        audio[start_ms:end_ms].export(temp_path, format="wav")

        y, sr = librosa.load(temp_path, sr=16000)
        pitches = librosa.piptrack(y=y, sr=sr)[0]
        pitches = pitches[pitches > 0]  # keep only frames with detected pitch
        has_pitch = len(pitches) > 0

        # Compute RMS once and reuse — the original recomputed it four times.
        rms = librosa.feature.rms(y=y)[0]

        return {
            'duration': duration_s,
            'mean_pitch': float(np.mean(pitches)) if has_pitch else 0.0,
            'min_pitch': float(np.min(pitches)) if has_pitch else 0.0,
            'max_pitch': float(np.max(pitches)) if has_pitch else 0.0,
            'pitch_sd': float(np.std(pitches)) if has_pitch else 0.0,
            'intensityMean': float(np.mean(rms)),
            'intensityMin': float(np.min(rms)),
            'intensityMax': float(np.max(rms)),
            'intensitySD': float(np.std(rms)),
        }
    except Exception as e:
        logger.error(f"Feature extraction failed: {str(e)}")
        return {
            'duration': duration_s,
            'mean_pitch': 0.0,
            'min_pitch': 0.0,
            'max_pitch': 0.0,
            'pitch_sd': 0.0,
            'intensityMean': 0.0,
            'intensityMin': 0.0,
            'intensityMax': 0.0,
            'intensitySD': 0.0,
        }
    finally:
        # Always clean up the temp slice, even on failure — the original
        # leaked the file when an exception occurred.
        if os.path.exists(temp_path):
            os.remove(temp_path)
163
+
164
+
165
def transcribe(audio_path: str) -> Dict:
    """Upload audio to AssemblyAI and poll until the transcript is ready.

    Returns:
        The completed transcript JSON (includes speaker labels).

    Raises:
        Exception: on transcription errors, HTTP failures, or poll timeout.
    """
    headers = {"authorization": ASSEMBLYAI_KEY}
    try:
        with open(audio_path, 'rb') as f:
            upload_response = requests.post(
                "https://api.assemblyai.com/v2/upload",
                headers=headers,
                data=f,
                timeout=300,
            )
        # Fail fast on bad API key / HTTP errors instead of a KeyError below.
        upload_response.raise_for_status()
        audio_url = upload_response.json()['upload_url']

        transcript_response = requests.post(
            "https://api.assemblyai.com/v2/transcript",
            headers=headers,
            json={
                "audio_url": audio_url,
                "speaker_labels": True,
                "filter_profanity": True
            },
            timeout=30,
        )
        transcript_response.raise_for_status()
        transcript_id = transcript_response.json()['id']

        # Poll with a hard deadline — the original looped forever if the job
        # never reached a terminal state.
        deadline = time.time() + 30 * 60
        while time.time() < deadline:
            result = requests.get(
                f"https://api.assemblyai.com/v2/transcript/{transcript_id}",
                headers=headers,
                timeout=30,
            ).json()

            if result['status'] == 'completed':
                return result
            elif result['status'] == 'error':
                raise Exception(result['error'])

            time.sleep(5)
        raise Exception("Transcription timed out after 30 minutes")
    except Exception as e:
        logger.error(f"Transcription failed: {str(e)}")
        raise
201
+
202
+
203
def process_utterance(utterance, full_audio, wav_file):
    """Attach a speaker identity to one transcript utterance.

    Slices the utterance audio, embeds it with TitaNet, and looks the
    embedding up in Pinecone; unknown voices (similarity <= 0.7) are
    registered under a fresh Speaker_* name.

    Note: `wav_file` is currently unused but kept for interface stability.

    Returns:
        The utterance dict extended with 'speaker', 'speaker_id', and
        'embedding'; on failure those fields fall back to Unknown/None.
    """
    temp_path = os.path.join(OUTPUT_DIR, f"temp_{uuid.uuid4()}.wav")
    try:
        segment = full_audio[utterance['start']:utterance['end']]
        segment.export(temp_path, format="wav")

        with torch.no_grad():
            embedding = speaker_model.get_embedding(temp_path).to(device)
        # One CPU-side list reused for query, upsert, and the return value —
        # the original called .tolist() on a possibly-CUDA tensor for upsert.
        embedding_list = embedding.cpu().numpy().tolist()

        query_result = index.query(
            vector=embedding_list,
            top_k=1,
            include_metadata=True
        )

        if query_result['matches'] and query_result['matches'][0]['score'] > 0.7:
            speaker_id = query_result['matches'][0]['id']
            speaker_name = query_result['matches'][0]['metadata']['speaker_name']
        else:
            # New voice: register it so later utterances can match it.
            speaker_id = f"unknown_{uuid.uuid4().hex[:6]}"
            speaker_name = f"Speaker_{speaker_id[-4:]}"
            index.upsert([(speaker_id, embedding_list, {"speaker_name": speaker_name})])

        return {
            **utterance,
            'speaker': speaker_name,
            'speaker_id': speaker_id,
            'embedding': embedding_list
        }
    except Exception as e:
        logger.error(f"Utterance processing failed: {str(e)}")
        return {
            **utterance,
            'speaker': 'Unknown',
            'speaker_id': 'unknown',
            'embedding': None
        }
    finally:
        # Always remove the temp wav — the original leaked it on exceptions.
        if os.path.exists(temp_path):
            os.remove(temp_path)
244
+
245
+
246
def identify_speakers(transcript: Dict, wav_file: str) -> List[Dict]:
    """Run speaker identification over every utterance in the transcript.

    Utterances are processed concurrently by a 5-worker pool; the returned
    list preserves the input utterance order.
    """
    try:
        full_audio = AudioSegment.from_wav(wav_file)

        with ThreadPoolExecutor(max_workers=5) as pool:
            pending = [
                pool.submit(process_utterance, utt, full_audio, wav_file)
                for utt in transcript['utterances']
            ]
            return [task.result() for task in pending]
    except Exception as e:
        logger.error(f"Speaker identification failed: {str(e)}")
        raise
262
+
263
+
264
def train_role_classifier(utterances: List[Dict]):
    """Fit a RandomForest separating interviewer vs interviewee turns.

    Features per utterance: 9 prosodic stats + TF-IDF (uni/bi-grams, max 500)
    + 5 linguistic cues. Labels are a bootstrap heuristic: turns alternate
    (even index -> class 0, odd -> class 1). Persists the classifier,
    vectorizer, and scaler to OUTPUT_DIR and returns all three.
    """
    try:
        texts = [u['text'] for u in utterances]
        vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
        X_text = vectorizer.fit_transform(texts)

        def build_row(utt, tfidf_row):
            # One feature vector: prosody first, then TF-IDF, then text cues
            # (order must match classify_roles exactly).
            p = utt['prosodic_features']
            row = [
                p['duration'],
                p['mean_pitch'],
                p['min_pitch'],
                p['max_pitch'],
                p['pitch_sd'],
                p['intensityMean'],
                p['intensityMin'],
                p['intensityMax'],
                p['intensitySD'],
            ]
            row.extend(tfidf_row.toarray()[0].tolist())

            text = utt['text']
            doc = nlp(text)
            row.extend([
                int(text.endswith('?')),
                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', text.lower())),
                len(text.split()),
                sum(1 for token in doc if token.pos_ == 'VERB'),
                sum(1 for token in doc if token.pos_ == 'NOUN'),
            ])
            return row

        features = [build_row(u, X_text[i]) for i, u in enumerate(utterances)]
        labels = [i % 2 for i in range(len(utterances))]

        scaler = StandardScaler()
        X = scaler.fit_transform(features)

        clf = RandomForestClassifier(
            n_estimators=150,
            max_depth=10,
            random_state=42,
            class_weight='balanced'
        )
        clf.fit(X, labels)

        # Persist artifacts so later runs can classify without retraining.
        joblib.dump(clf, os.path.join(OUTPUT_DIR, 'role_classifier.pkl'))
        joblib.dump(vectorizer, os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
        joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))

        return clf, vectorizer, scaler
    except Exception as e:
        logger.error(f"Classifier training failed: {str(e)}")
        raise
320
+
321
+
322
def classify_roles(utterances: List[Dict], clf, vectorizer, scaler):
    """Predict Interviewer/Interviewee for each utterance with a trained model.

    Feature construction mirrors train_role_classifier exactly (prosody,
    TF-IDF, linguistic cues, in that order). Returns the utterances extended
    with a 'role' key.
    """
    try:
        texts = [u['text'] for u in utterances]
        X_text = vectorizer.transform(texts)

        labeled = []
        for i, utt in enumerate(utterances):
            p = utt['prosodic_features']
            row = [
                p['duration'],
                p['mean_pitch'],
                p['min_pitch'],
                p['max_pitch'],
                p['pitch_sd'],
                p['intensityMean'],
                p['intensityMin'],
                p['intensityMax'],
                p['intensitySD'],
            ]
            row.extend(X_text[i].toarray()[0].tolist())

            text = utt['text']
            doc = nlp(text)
            row.extend([
                int(text.endswith('?')),
                len(re.findall(r'\b(why|how|what|when|where|who|which)\b', text.lower())),
                len(text.split()),
                sum(1 for token in doc if token.pos_ == 'VERB'),
                sum(1 for token in doc if token.pos_ == 'NOUN'),
            ])

            scaled = scaler.transform([row])
            # Class 0 was assigned to even-indexed (interviewer-like) turns.
            role = 'Interviewer' if clf.predict(scaled)[0] == 0 else 'Interviewee'
            labeled.append({**utt, 'role': role})

        return labeled
    except Exception as e:
        logger.error(f"Role classification failed: {str(e)}")
        raise
362
+
363
+
364
def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
    """Compute prosodic and disfluency metrics for the interviewee's speech.

    Derives speaking rate, filler-word and repeated-bigram ratios, pitch
    stats (mean/std/jitter via pyin) and intensity stats (RMS mean/std/
    shimmer), then folds them into heuristic anxiety/confidence/hesitation
    scores with coarse level labels.

    Returns:
        Dict of metrics, or {'error': ...} when analysis cannot run.
    """
    try:
        y, sr = librosa.load(audio_path, sr=16000)

        interviewee_utterances = [u for u in utterances if u['role'] == 'Interviewee']
        if not interviewee_utterances:
            return {'error': 'No interviewee utterances found'}

        # Slice out only the interviewee's samples (start/end are in ms).
        segments = []
        for u in interviewee_utterances:
            start = int(u['start'] * sr / 1000)
            end = int(u['end'] * sr / 1000)
            segments.append(y[start:end])
        # NOTE: the original also built np.concatenate(segments) into an
        # unused variable; that dead allocation has been removed.

        total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
        total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
        speaking_rate = total_words / total_duration if total_duration > 0 else 0

        # Disfluency: filler-word ratio over all interviewee words.
        filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
        filler_count = sum(
            sum(u['text'].lower().count(fw) for fw in filler_words)
            for u in interviewee_utterances
        )
        filler_ratio = filler_count / total_words if total_words > 0 else 0

        # Repeated-bigram ratio across the whole interviewee text.
        all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
        word_counts = {}
        for i in range(len(all_words) - 1):
            bigram = (all_words[i], all_words[i + 1])
            word_counts[bigram] = word_counts.get(bigram, 0) + 1
        repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(
            word_counts) if word_counts else 0

        # Pitch track per segment (voiced frames only).
        pitches = []
        for segment in segments:
            f0, voiced_flag, _ = librosa.pyin(segment, fmin=80, fmax=300, sr=sr)
            pitches.extend(f0[voiced_flag])

        pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
        pitch_std = np.std(pitches) if len(pitches) > 0 else 0
        jitter = np.mean(np.abs(np.diff(pitches))) / pitch_mean if len(pitches) > 1 and pitch_mean > 0 else 0

        # RMS intensity per segment.
        intensities = []
        for segment in segments:
            rms = librosa.feature.rms(y=segment)[0]
            intensities.extend(rms)

        intensity_mean = np.mean(intensities) if intensities else 0
        intensity_std = np.std(intensities) if intensities else 0
        shimmer = np.mean(np.abs(np.diff(intensities))) / intensity_mean if len(
            intensities) > 1 and intensity_mean > 0 else 0

        # Heuristic composites; weights are hand-tuned, not learned.
        anxiety_score = 0.6 * (pitch_std / pitch_mean) + 0.4 * (jitter + shimmer) if pitch_mean > 0 else 0
        confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
        hesitation_score = filler_ratio + repetition_score

        anxiety_level = 'high' if anxiety_score > 0.15 else 'moderate' if anxiety_score > 0.07 else 'low'
        confidence_level = 'high' if confidence_score > 0.7 else 'moderate' if confidence_score > 0.5 else 'low'
        fluency_level = 'fluent' if (filler_ratio < 0.05 and repetition_score < 0.1) else 'moderate' if (
                filler_ratio < 0.1 and repetition_score < 0.2) else 'disfluent'

        return {
            'speaking_rate': float(round(speaking_rate, 2)),
            'filler_ratio': float(round(filler_ratio, 4)),
            'repetition_score': float(round(repetition_score, 4)),
            'pitch_analysis': {
                'mean': float(round(pitch_mean, 2)),
                'std_dev': float(round(pitch_std, 2)),
                'jitter': float(round(jitter, 4))
            },
            'intensity_analysis': {
                'mean': float(round(intensity_mean, 2)),
                'std_dev': float(round(intensity_std, 2)),
                'shimmer': float(round(shimmer, 4))
            },
            'composite_scores': {
                'anxiety': float(round(anxiety_score, 4)),
                'confidence': float(round(confidence_score, 4)),
                'hesitation': float(round(hesitation_score, 4))
            },
            'interpretation': {
                'anxiety_level': anxiety_level,
                'confidence_level': confidence_level,
                'fluency_level': fluency_level
            }
        }
    except Exception as e:
        logger.error(f"Voice analysis failed: {str(e)}")
        return {'error': str(e)}
455
+
456
+
457
def generate_voice_interpretation(analysis: Dict) -> str:
    """Render the voice-analysis dict as a human-readable multi-line summary.

    The text is embedded verbatim into Gemini's prompt. Returns a fallback
    sentence when the analysis carries an 'error' key.
    """
    if 'error' in analysis:
        return "Voice analysis not available."

    scores = analysis['composite_scores']
    levels = analysis['interpretation']
    lines = [
        "Voice Analysis Summary:",
        f"- Speaking Rate: {analysis['speaking_rate']} words/sec (average)",
        f"- Filler Words: {analysis['filler_ratio'] * 100:.1f}% of words",
        f"- Repetition Score: {analysis['repetition_score']:.3f}",
        f"- Anxiety Level: {levels['anxiety_level'].upper()} (score: {scores['anxiety']:.3f})",
        f"- Confidence Level: {levels['confidence_level'].upper()} (score: {scores['confidence']:.3f})",
        f"- Fluency: {levels['fluency_level'].upper()}",
        "",
        "Detailed Interpretation:",
        "1. A higher speaking rate indicates faster speech, which can suggest nervousness or enthusiasm.",
        "2. Filler words and repetitions reduce speech clarity and professionalism.",
        "3. Anxiety is measured through pitch variability and voice instability.",
        "4. Confidence is assessed through voice intensity and stability.",
        "5. Fluency combines filler words and repetition metrics.",
    ]
    return "\n".join(lines)
482
+
483
+
484
# --- Chart Generation Function ---
def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path: str):
    """Render a two-bar anxiety-vs-confidence chart to chart_path.

    Best-effort: any failure is logged and swallowed so report generation
    can continue without the chart.
    """
    try:
        # Imported lazily with a headless backend. The original body used
        # `plt` without matplotlib being imported anywhere in this module,
        # so it always raised NameError and the chart was silently skipped.
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        labels = ['Anxiety', 'Confidence']
        scores = [composite_scores.get('anxiety', 0), composite_scores.get('confidence', 0)]

        fig, ax = plt.subplots(figsize=(4, 2.5))
        ax.bar(labels, scores, color=['lightcoral', 'lightskyblue'])
        ax.set_ylabel('Score')
        ax.set_title('Anxiety vs. Confidence Scores')
        ax.set_ylim(0, 1.0)

        # Annotate each bar with its numeric value.
        for i, v in enumerate(scores):
            ax.text(i, v + 0.05, f"{v:.2f}", color='black', ha='center', fontweight='bold')

        plt.tight_layout()
        plt.savefig(chart_path)
        plt.close(fig)
    except Exception as e:
        logger.error(f"Error generating chart: {str(e)}")
507
+
508
+
509
+ # --- Acceptance Probability Calculation ---
510
+ def calculate_acceptance_probability(analysis_data: Dict) -> float:
511
+ """
512
+ Calculates a hypothetical acceptance probability based on voice and content analysis.
513
+ This is a simplified, heuristic model and can be refined with more data/ML.
514
+ """
515
+ voice = analysis_data.get('voice_analysis', {})
516
+
517
+ if 'error' in voice:
518
+ return 0.0 # Cannot calculate if voice analysis failed
519
+
520
+ # Weights for different factors (adjust these to fine-tune the model)
521
+ w_confidence = 0.4
522
+ w_anxiety = -0.3 # Negative weight for anxiety
523
+ w_fluency = 0.2
524
+ w_speaking_rate = 0.1 # Ideal rate gets higher score
525
+ w_filler_repetition = -0.1 # Negative weight for filler/repetition
526
+ w_content_strengths = 0.2 # Placeholder, ideally from deeper content analysis
527
+
528
+ # Normalize/interpret scores
529
+ confidence_score = voice.get('composite_scores', {}).get('confidence', 0.0)
530
+ anxiety_score = voice.get('composite_scores', {}).get('anxiety', 0.0)
531
+ fluency_level = voice.get('interpretation', {}).get('fluency_level', 'disfluent')
532
+ speaking_rate = voice.get('speaking_rate', 0.0)
533
+ filler_ratio = voice.get('filler_ratio', 0.0)
534
+ repetition_score = voice.get('repetition_score', 0.0)
535
+
536
+ # Fluency mapping (higher score for more fluent)
537
+ fluency_map = {'fluent': 1.0, 'moderate': 0.5, 'disfluent': 0.0}
538
+ fluency_val = fluency_map.get(fluency_level, 0.0)
539
+
540
+ # Speaking rate scoring (e.g., ideal is around 2.5 words/sec, gets lower for too fast/slow)
541
+ # This is a simple inverse of deviation from ideal
542
+ ideal_speaking_rate = 2.5
543
+ speaking_rate_deviation = abs(speaking_rate - ideal_speaking_rate)
544
+ speaking_rate_score = max(0, 1 - (speaking_rate_deviation / ideal_speaking_rate)) # Max 1.0, min 0.0
545
+
546
+ # Filler/Repetition score (lower is better, so 1 - score)
547
+ filler_repetition_composite = (filler_ratio + repetition_score) / 2 # Average them
548
+ filler_repetition_score = max(0, 1 - filler_repetition_composite)
549
+
550
+ # Simplified content strength score (you might need a more sophisticated NLP method here)
551
+ # For now, based on presence of strengths in Gemini's content analysis
552
+ content_strength_val = 0.0
553
+ # This part would ideally come from a structured output from Gemini's content analysis.
554
+ # For now, we'll make a simplified assumption based on the analysis data:
555
+ # If content analysis found "strengths" (which is likely if Gemini generates a full report)
556
+ # This needs refinement if Gemini output is not structured for this.
557
+ if analysis_data.get('text_analysis', {}).get('total_duration', 0) > 0: # Basic check if interview happened
558
+ content_strength_val = 0.8 # Assume moderate strength if analysis went through
559
+ # You could parse gemini_report_text for specific phrases like "Strengths:" and count items.
560
+
561
+ # Calculate raw score
562
+ raw_score = (
563
+ confidence_score * w_confidence +
564
+ (1 - anxiety_score) * abs(w_anxiety) + # (1 - anxiety) because lower anxiety is better
565
+ fluency_val * w_fluency +
566
+ speaking_rate_score * w_speaking_rate +
567
+ filler_repetition_score * abs(w_filler_repetition) + # Use abs weight as score is already inverted
568
+ content_strength_val * w_content_strengths
569
+ )
570
+
571
+ # Normalize to 0-1 and then to percentage
572
+ # These max/min values are rough estimates and should be calibrated with real data
573
+ min_possible_score = (0 * w_confidence) + (0 * abs(w_anxiety)) + (0 * w_fluency) + (0 * w_speaking_rate) + (
574
+ 0 * abs(w_filler_repetition)) + (0 * w_content_strengths)
575
+ max_possible_score = (1 * w_confidence) + (1 * abs(w_anxiety)) + (1 * w_fluency) + (1 * w_speaking_rate) + (
576
+ 1 * abs(w_filler_repetition)) + (1 * w_content_strengths)
577
+
578
+ # Prevent division by zero if all weights are zero or min/max are same
579
+ if max_possible_score == min_possible_score:
580
+ normalized_score = 0.5 # Default if no variation
581
+ else:
582
+ normalized_score = (raw_score - min_possible_score) / (max_possible_score - min_possible_score)
583
+
584
+ acceptance_probability = max(0.0, min(1.0, normalized_score)) # Clamp between 0 and 1
585
+
586
+ return float(f"{acceptance_probability * 100:.2f}") # Return as percentage
587
+
588
+
589
def generate_report(analysis_data: Dict) -> str:
    """Generate the narrative interview-analysis report text via Gemini.

    Builds a structured prompt from the transcript, the voice-metric
    interpretation and the pre-computed acceptance probability, then asks
    the module-level ``gemini_model`` for the report text.

    Args:
        analysis_data: Aggregated analysis dict. Keys read here:
            'voice_analysis', 'transcript', 'text_analysis', 'speakers'
            and optionally 'acceptance_probability'.

    Returns:
        The generated report text on success, or an error-message string
        on failure (this function never raises).
    """
    try:
        voice = analysis_data.get('voice_analysis', {})
        # Human-readable summary of the raw voice metrics for the prompt.
        voice_interpretation = generate_voice_interpretation(voice)

        # Interviewee turns only, capped at 5 to keep the prompt short.
        interviewee_responses = [
            f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
            for u in analysis_data['transcript']
            if u['role'] == 'Interviewee'
        ][:5]  # Limit to first 5 for prompt brevity

        acceptance_prob = analysis_data.get('acceptance_probability', None)
        acceptance_line = ""
        if acceptance_prob is not None:
            # Same thresholds as create_pdf_report: >=80 strong, >=50 solid.
            acceptance_line = f"\n**Estimated Acceptance Probability: {acceptance_prob:.2f}%**\n"
            if acceptance_prob >= 80:
                acceptance_line += "This indicates a very strong candidate. Well done!"
            elif acceptance_prob >= 50:
                acceptance_line += "This indicates a solid candidate with potential for improvement."
            else:
                acceptance_line += "This candidate may require significant development or may not be a strong fit."

        # NOTE: the numbered '**N. ...**' section headers below are parsed
        # back out of Gemini's response by create_pdf_report's regexes, so
        # their wording must stay in sync with those patterns.
        prompt = f"""
        As EvalBot, an AI interview analysis system, generate a highly professional, well-structured, and concise interview analysis report.
        The report should be suitable for a professional setting and clearly highlight key findings and actionable recommendations.
        Use clear headings and subheadings. For bullet points, use '- '.

        {acceptance_line}

        **1. Executive Summary**
        Provide a brief, high-level overview of the interview.
        - Overall interview duration: {analysis_data['text_analysis']['total_duration']:.2f} seconds
        - Number of speaker turns: {analysis_data['text_analysis']['speaker_turns']}
        - Main participants: {', '.join(analysis_data['speakers'])}

        **2. Voice Analysis Insights**
        Analyze key voice metrics and provide a detailed interpretation.
        {voice_interpretation}

        **3. Content Analysis & Strengths/Areas for Development**
        Analyze the key themes and identify both strengths and areas for development in the interviewee's responses.
        Key responses from interviewee (for context):
        {chr(10).join(interviewee_responses)}

        **4. Actionable Recommendations**
        Offer specific, actionable suggestions for improvement.
        Focus on:
        - Communication Skills (e.g., pacing, clarity, filler words)
        - Content Delivery (e.g., quantifying achievements, structuring answers)
        - Professional Presentation (e.g., research, specific examples, mock interviews)
        """

        response = gemini_model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Report generation failed: {str(e)}")
        # Deliberate best-effort: callers receive a message, not an exception.
        return f"Error generating report: {str(e)}"
646
+
647
+
648
+ # --- ENHANCED PDF GENERATION FUNCTION (without logo or charts) ---
649
def create_pdf_report(analysis_data: Dict, output_path: str, gemini_report_text: str):
    """Render the interview analysis into a styled PDF at *output_path*.

    Combines the structured metrics in ``analysis_data`` with the free-text
    report from Gemini (``gemini_report_text``). The Gemini text is split
    back into its numbered sections via regex so each section lands under
    the matching PDF heading.

    Args:
        analysis_data: Aggregated analysis dict ('voice_analysis',
            'transcript', 'acceptance_probability', ...).
        output_path: Destination file path for the PDF.
        gemini_report_text: Report text produced by generate_report().

    Returns:
        True on success, False if PDF generation failed (never raises).
    """
    try:
        doc = SimpleDocTemplate(output_path, pagesize=letter)
        styles = getSampleStyleSheet()

        # Custom paragraph styles (alignment=1 is centered in reportlab).
        h1 = ParagraphStyle(name='Heading1', parent=styles['h1'], fontSize=16, spaceAfter=14, alignment=1,
                            textColor=colors.HexColor('#003366'))
        h2 = ParagraphStyle(name='Heading2', parent=styles['h2'], fontSize=12, spaceBefore=10, spaceAfter=8,
                            textColor=colors.HexColor('#336699'))
        h3 = ParagraphStyle(name='Heading3', parent=styles['h3'], fontSize=10, spaceBefore=8, spaceAfter=4,
                            textColor=colors.HexColor('#0055AA'))
        body_text = ParagraphStyle(name='BodyText', parent=styles['Normal'], fontSize=9, leading=12, spaceAfter=4)
        bullet_style = ParagraphStyle(name='Bullet', parent=styles['Normal'], fontSize=9, leading=12, leftIndent=18,
                                      bulletIndent=9)

        story = []

        # Title and Date
        story.append(Paragraph(f"<b>EvalBot Interview Analysis Report</b>", h1))
        story.append(Spacer(1, 0.2 * inch))
        story.append(Paragraph(f"<b>Date:</b> {time.strftime('%Y-%m-%d')}", body_text))
        story.append(Spacer(1, 0.3 * inch))

        # --- Acceptance Probability ---
        acceptance_prob = analysis_data.get('acceptance_probability', None)
        if acceptance_prob is not None:
            story.append(Paragraph("<b>Candidate Evaluation Summary</b>", h2))
            story.append(Spacer(1, 0.1 * inch))

            # BUG FIX: the original interpolated `prob_color.hexval` — a
            # *bound method* object (Color.hexval is a method in reportlab) —
            # into the <font color=...> markup, producing an invalid color
            # attribute. Use explicit hex strings for the same three colors
            # (green / orange / red).
            prob_color = '#008000' if acceptance_prob >= 70 else (
                '#FFA500' if acceptance_prob >= 40 else '#FF0000')

            story.append(Paragraph(
                f"<font size='12' color='{prob_color}'><b>Estimated Acceptance Probability: {acceptance_prob:.2f}%</b></font>",
                ParagraphStyle(name='AcceptanceProbability', parent=styles['Normal'], fontSize=12, spaceAfter=10,
                               alignment=1)
            ))

            if acceptance_prob >= 80:
                story.append(
                    Paragraph("This indicates a very strong candidate with high potential. Well done!", body_text))
            elif acceptance_prob >= 50:
                story.append(Paragraph(
                    "This candidate shows solid potential but has areas for improvement to become an even stronger fit.",
                    body_text))
            else:
                story.append(Paragraph(
                    "This candidate may require significant development or may not be the ideal fit at this time.",
                    body_text))
            story.append(Spacer(1, 0.3 * inch))
        # --- End Acceptance Probability ---

        # Parse Gemini's report into sections for better PDF structuring.
        # Patterns must match the '**N. ...**' headers emitted by the prompt
        # in generate_report().
        sections = {}
        current_section = None
        section_patterns = {
            r'^\s*\*\*\s*1\.\s*Executive Summary\s*\*\*': 'Executive Summary',
            r'^\s*\*\*\s*2\.\s*Voice Analysis Insights\s*\*\*': 'Voice Analysis Insights',
            r'^\s*\*\*\s*3\.\s*Content Analysis & Strengths/Areas for Development\s*\*\*': 'Content Analysis & Strengths/Areas for Development',
            r'^\s*\*\*\s*4\.\s*Actionable Recommendations\s*\*\*': 'Actionable Recommendations'
        }

        for line in gemini_report_text.split('\n'):
            matched_section = False
            for pattern, section_name in section_patterns.items():
                if re.match(pattern, line):
                    current_section = section_name
                    sections[current_section] = []
                    matched_section = True
                    break
            if not matched_section and current_section:
                sections[current_section].append(line)

        # 1. Executive Summary
        story.append(Paragraph("1. Executive Summary", h2))
        story.append(Spacer(1, 0.1 * inch))
        if 'Executive Summary' in sections:
            for line in sections['Executive Summary']:
                if line.strip():
                    story.append(Paragraph(line.strip(), body_text))
        story.append(Spacer(1, 0.2 * inch))

        # 2. Voice Analysis (metric table + Gemini's interpretation)
        story.append(Paragraph("2. Voice Analysis", h2))
        voice_analysis = analysis_data.get('voice_analysis', {})

        if voice_analysis and 'error' not in voice_analysis:
            # Voice Analysis Summary Table
            table_data = [
                ['Metric', 'Value', 'Interpretation'],
                ['Speaking Rate', f"{voice_analysis['speaking_rate']:.2f} words/sec", 'Average rate'],
                ['Filler Words', f"{voice_analysis['filler_ratio'] * 100:.1f}%", 'Percentage of total words'],
                ['Repetition Score', f"{voice_analysis['repetition_score']:.3f}", 'Lower is better articulation'],
                ['Anxiety Level', voice_analysis['interpretation']['anxiety_level'].upper(),
                 f"Score: {voice_analysis['composite_scores']['anxiety']:.3f}"],
                ['Confidence Level', voice_analysis['interpretation']['confidence_level'].upper(),
                 f"Score: {voice_analysis['composite_scores']['confidence']:.3f}"],
                ['Fluency', voice_analysis['interpretation']['fluency_level'].upper(), 'Overall speech flow']
            ]

            table_style = TableStyle([
                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#6699CC')),
                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                ('BOTTOMPADDING', (0, 0), (-1, 0), 10),
                ('BACKGROUND', (0, 1), (-1, -1), colors.HexColor('#EFEFEF')),
                ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#CCCCCC')),
                ('LEFTPADDING', (0, 0), (-1, -1), 6),
                ('RIGHTPADDING', (0, 0), (-1, -1), 6),
                ('TOPPADDING', (0, 0), (-1, -1), 6),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
            ])

            table = Table(table_data)
            table.setStyle(table_style)
            story.append(table)
            story.append(Spacer(1, 0.2 * inch))

            # Detailed Interpretation from Gemini (if present)
            if 'Voice Analysis Insights' in sections:
                story.append(Paragraph("Detailed Interpretation:", h3))
                for line in sections['Voice Analysis Insights']:
                    if line.strip():
                        # Numbered list items from Gemini are rendered as bullets.
                        if re.match(r'^\d+\.\s', line.strip()):
                            story.append(
                                Paragraph(line.strip(), bullet_style))
                        else:
                            story.append(Paragraph(line.strip(), body_text))
                story.append(Spacer(1, 0.2 * inch))

        else:
            story.append(Paragraph("Voice analysis not available or encountered an error.", body_text))
        story.append(Spacer(1, 0.3 * inch))

        # 3. Content Analysis
        story.append(Paragraph("3. Content Analysis", h2))
        if 'Content Analysis & Strengths/Areas for Development' in sections:
            for line in sections['Content Analysis & Strengths/Areas for Development']:
                if line.strip():
                    if line.strip().startswith('-'):
                        story.append(Paragraph(line.strip(), bullet_style))
                    else:
                        story.append(Paragraph(line.strip(), body_text))
        story.append(Spacer(1, 0.2 * inch))

        # Contextual excerpt: first five interviewee turns.
        story.append(Paragraph("Key Interviewee Responses (Contextual):", h3))
        interviewee_responses = [
            f"Speaker {u['speaker']} ({u['role']}): {u['text']}"
            for u in analysis_data['transcript']
            if u['role'] == 'Interviewee'
        ][:5]
        for res in interviewee_responses:
            story.append(Paragraph(res, bullet_style))
        story.append(Spacer(1, 0.3 * inch))

        # 4. Recommendations
        story.append(Paragraph("4. Recommendations", h2))
        if 'Actionable Recommendations' in sections:
            for line in sections['Actionable Recommendations']:
                if line.strip():
                    if line.strip().startswith('-'):
                        story.append(Paragraph(line.strip(), bullet_style))
                    else:
                        story.append(Paragraph(line.strip(), body_text))
        story.append(Spacer(1, 0.2 * inch))

        # Footer Text
        story.append(Spacer(1, 0.5 * inch))
        story.append(Paragraph("--- Analysis by EvalBot ---", ParagraphStyle(
            name='FooterText', parent=styles['Normal'], fontSize=8, alignment=1, textColor=colors.HexColor('#666666')
        )))

        doc.build(story)
        return True
    except Exception as e:
        logger.error(f"PDF creation failed: {str(e)}", exc_info=True)
        return False
831
+
832
+
833
def convert_to_serializable(obj):
    """Recursively replace numpy values with plain JSON-friendly Python types.

    Numpy scalars become native Python scalars, numpy arrays become (nested)
    lists, and dicts/lists are walked recursively. Anything else — including
    strings, tuples and None — is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    return obj
843
+
844
+
845
def process_interview(audio_path: str):
    """Run the full analysis pipeline on one interview recording.

    Steps: convert to WAV, transcribe, extract prosodic features, diarize
    speakers, classify roles, analyze the interviewee's voice, compute the
    acceptance probability, generate the Gemini report, then write the PDF
    and JSON artifacts.

    Args:
        audio_path: Path to the uploaded interview audio (any format that
            convert_to_wav() accepts).

    Returns:
        Dict with 'pdf_path' and 'json_path' of the generated artifacts.

    Raises:
        Exception: Any pipeline failure is logged and re-raised; the
            intermediate WAV file is always cleaned up.
    """
    wav_file = None
    try:
        logger.info(f"Starting processing for {audio_path}")

        # Normalize input to WAV for the downstream audio tooling.
        wav_file = convert_to_wav(audio_path)

        logger.info("Starting transcription")
        transcript = transcribe(wav_file)

        logger.info("Extracting prosodic features")
        for utterance in transcript['utterances']:
            utterance['prosodic_features'] = extract_prosodic_features(
                wav_file,
                utterance['start'],
                utterance['end']
            )

        logger.info("Identifying speakers")
        utterances_with_speakers = identify_speakers(transcript, wav_file)

        logger.info("Classifying roles")
        # Reuse a previously persisted role classifier when available;
        # otherwise train one from this interview's utterances.
        classifier_path = os.path.join(OUTPUT_DIR, 'role_classifier.pkl')
        if os.path.exists(classifier_path):
            clf = joblib.load(classifier_path)
            vectorizer = joblib.load(os.path.join(OUTPUT_DIR, 'text_vectorizer.pkl'))
            scaler = joblib.load(os.path.join(OUTPUT_DIR, 'feature_scaler.pkl'))
        else:
            clf, vectorizer, scaler = train_role_classifier(utterances_with_speakers)

        classified_utterances = classify_roles(utterances_with_speakers, clf, vectorizer, scaler)

        logger.info("Analyzing interviewee voice")
        voice_analysis = analyze_interviewee_voice(wav_file, classified_utterances)

        analysis_data = {
            'transcript': classified_utterances,
            'speakers': list(set(u['speaker'] for u in classified_utterances)),
            'voice_analysis': voice_analysis,
            'text_analysis': {
                'total_duration': sum(u['prosodic_features']['duration'] for u in classified_utterances),
                'speaker_turns': len(classified_utterances)
            }
        }

        # Heuristic 0-100 score consumed by both the PDF and the report prompt.
        analysis_data['acceptance_probability'] = calculate_acceptance_probability(analysis_data)

        logger.info("Generating report text using Gemini")
        gemini_report_text = generate_report(analysis_data)

        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        pdf_path = os.path.join(OUTPUT_DIR, f"{base_name}_report.pdf")
        create_pdf_report(analysis_data, pdf_path, gemini_report_text=gemini_report_text)

        json_path = os.path.join(OUTPUT_DIR, f"{base_name}_analysis.json")
        with open(json_path, 'w') as f:
            json.dump(convert_to_serializable(analysis_data), f, indent=2)

        logger.info(f"Processing completed for {audio_path}")
        return {
            'pdf_path': pdf_path,
            'json_path': json_path
        }
    except Exception as e:
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        raise
    finally:
        # FIX: cleanup was duplicated (success-path os.remove plus a fragile
        # `'wav_file' in locals()` check in the except block). A single
        # finally covers both paths.
        if wav_file is not None and os.path.exists(wav_file):
            os.remove(wav_file)
requirements.txt ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Levenshtein
2
+ braceexpand
3
+ distance
4
+ docopt
5
+ fiddle
6
+ fsspec
7
+ g2p_en
8
+ hydra-core
9
+ intervaltree
10
+ jiwer
11
+ kaldi-python-io
12
+ kaldiio
13
+ lhotse
14
+ libcst
15
+ lightning
16
+ lilcom
17
+ llvmlite
18
+ loguru
19
+ mediapy
20
+ einops
21
+ nemo_toolkit
22
+ numba
23
+ nvidia-cublas-cu12
24
+ nvidia-cudnn-cu12
25
+ nvidia-cufft-cu12
26
+ nvidia-curand-cu12
27
+ nvidia-cusolver-cu12
28
+ nvidia-cusparse-cu12
29
+ nvidia-nvjitlink-cu12
30
+ pinecone
31
+ pinecone-plugin-interface
32
+ plac
33
+ pyannote.core
34
+ pyannote.database
35
+ pyannote.metrics
36
+ pyloudnorm
37
+ rapidfuzz
38
+ reportlab
39
+ resampy
40
+ ruamel.yaml
41
+ ruamel.yaml.clib
42
+ sacremoses
43
+ sox
44
+ texterrors
45
+ transformers
46
+ webdataset
47
+ wget
48
+ fastapi
49
+ uvicorn
50
+ pydub
51
+ librosa
52
+ spacy
53
+ google-generativeai
54
+ joblib
55
+ pandas
56
+ scikit-learn
57
+ numpy
58
+ torch
59
+ requests
60
+ sentencepiece
61
+ datasets
62
+ editdistance
63
+ python-multipart
64
+ hf_xet
65
+ huggingface_hub
66
+ gradio
67
+ hf_transfer
68
+ matplotlib
69
+ seaborn