norhan12 committed
Commit 48356cf · verified · 1 Parent(s): ac312f5

Upload 5 files

Files changed (5):
  1. Dockerfile +52 -0
  2. README.md +65 -6
  3. app.py +132 -0
  4. process_interview.py +486 -0
  5. requirements.txt +75 -0
Dockerfile ADDED
@@ -0,0 +1,52 @@
+ FROM python:3.10-slim
+
+ RUN apt-get update && apt-get install -y \
+     libsndfile1 \
+     ffmpeg \
+     sox \
+     curl \
+     git-lfs \
+     pkg-config \
+     libfreetype6-dev \
+     libpng-dev \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -m appuser
+
+ RUN mkdir -p \
+     /tmp/matplotlib \
+     /tmp/fontconfig \
+     /tmp/lhotse \
+     /app/uploads \
+     /app/processed_audio \
+     /app/assets \
+     /app/temp_files \
+     /app/static/outputs && \
+     chown -R appuser:appuser /app /tmp/matplotlib /tmp/fontconfig /tmp/lhotse
+
+ WORKDIR /app
+
+ COPY --chown=appuser:appuser . .
+
+ ENV MPLCONFIGDIR=/tmp/matplotlib \
+     FONTCONFIG_PATH=/tmp/fontconfig \
+     LHOTSE_CACHE_DIR=/tmp/lhotse \
+     HF_HUB_ENABLE_HF_TRANSFER=1 \
+     PYTHONUNBUFFERED=1
+
+ # Install Python dependencies as the non-root user
+ USER appuser
+ RUN pip install --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt && \
+     python -m spacy download en_core_web_sm && \
+     pip check
+
+ # Health check against the /health endpoint
+ HEALTHCHECK --interval=30s --timeout=10s \
+     CMD curl -f http://localhost:7860/health || exit 1
+
+ # Run the application with Uvicorn (FastAPI)
+ CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
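+
+ # A hedged local-run sketch (the -e values are placeholders for the env vars
+ # that process_interview.py requires, not real keys):
+ #   docker build -t evalbot .
+ #   docker run -p 7860:7860 \
+ #     -e PINECONE_KEY=... -e ASSEMBLYAI_KEY=... -e GEMINI_API_KEY=... evalbot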
README.md CHANGED
@@ -1,11 +1,70 @@
  ---
- title: TheEnd
- emoji: 🚀
- colorFrom: yellow
- colorTo: red
+ title: EvalBot - Interview Analysis System
+ emoji: 🤖
+ colorFrom: blue
+ colorTo: green
  sdk: docker
+ app_file: app.py
  pinned: false
- license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 🎤 EvalBot: Automated Interview Analysis System
+
+ Welcome to EvalBot, your AI-powered solution for comprehensive interview analysis!
+
+ EvalBot helps assess candidate performance in interviews by analyzing:
+ - **Voice Metrics:** Speaking rate, filler words, anxiety, confidence, and fluency.
+ - **Content Analysis:** Key themes, strengths, and areas for development in responses.
+ - **Speaker Identification:** Differentiating between interviewer and interviewee.
+ - **Acceptance Probability:** An estimated likelihood of acceptance based on key performance indicators.
+
+ ## Features:
+ - **Audio Analysis:** Upload audio files (WAV, MP3, M4A, FLAC) or provide URLs.
+ - **Detailed PDF Reports:** Get professional, structured reports with key insights and actionable recommendations.
+ - **API Access:** Integrate EvalBot's analysis capabilities into your own applications.
+
+ ## How to Use the API:
+
+ You can interact with EvalBot's API using the `gradio_client` library in Python.
+
+ 1. **Install the client:**
+    ```bash
+    pip install gradio_client
+    ```
+
+ 2. **Use the API to analyze audio (accepts multiple URLs):**
+
+    ```python
+    from gradio_client import Client, handle_file
+    import os
+
+    # Replace with your actual Space URL (e.g., https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME)
+    SPACE_URL = "https://huggingface.co/spaces/norhan12/YOUR_NEW_SPACE_NAME"  # REMEMBER TO UPDATE THIS
+
+    # Ensure your Hugging Face Access Token is set as an environment variable (for private spaces)
+    # HF_ACCESS_TOKEN = os.getenv("HF_TOKEN")
+
+    client = Client(SPACE_URL)  # , hf_token=HF_ACCESS_TOKEN  # Uncomment if your space is private
+
+    # List of audio URLs to analyze
+    audio_interview_urls = [
+        "https://www.soundhelix.com/examples/mp3/SoundHelix-Song-1.mp3",
+        "https://www.soundhelix.com/examples/mp3/SoundHelix-Song-2.mp3",
+        # Add more URLs as needed
+    ]
+
+    try:
+        # handle_file wraps a single file or URL, so wrap each URL individually
+        result = client.predict(
+            file=[handle_file(url) for url in audio_interview_urls],
+            api_name="/analyze_multiple_audios"  # Ensure this matches your function name in app.py
+        )
+
+        print("Combined Analysis Summary:", result[0])
+        print("Detailed Analysis (JSON Array):", result[1])
+        print("Downloadable PDF Paths:", result[2])
+
+    except Exception as e:
+        print(f"An error occurred while calling the API: {e}")
+    ```
+
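+ Alternatively, because the backend is a plain FastAPI app (see `app.py`), you can call the REST endpoint directly. A minimal sketch, assuming the Space is public at its default `*.hf.space` URL:
+
+ ```python
+ import requests
+
+ API_URL = "https://norhan12-evalbot-interview-analysis.hf.space/process-audio"
+ payload = {
+     "file_url": "https://www.soundhelix.com/examples/mp3/SoundHelix-Song-1.mp3",
+     "user_id": "candidate-123",
+ }
+
+ # The endpoint downloads the audio, runs the full analysis pipeline, and
+ # returns a text summary plus URLs for the JSON and PDF reports.
+ resp = requests.post(API_URL, json=payload, timeout=600)
+ resp.raise_for_status()
+ print(resp.json())  # {"summary": ..., "json_url": ..., "pdf_url": ...}
+ ```
+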
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,132 @@
+ from fastapi import FastAPI, HTTPException, Body
+ from pydantic import BaseModel, HttpUrl
+ import os
+ import uuid
+ import shutil
+ import json
+ import requests
+ import logging
+ from process_interview import process_interview
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.responses import FileResponse
+
+ app = FastAPI()
+
+ # Configuration
+ TEMP_DIR = "./temp_files"
+ OUTPUT_DIR = "./static/outputs"
+ JSON_DIR = os.path.join(OUTPUT_DIR, "json")
+ PDF_DIR = os.path.join(OUTPUT_DIR, "pdf")
+ PROCESSED_DIR = "./processed_audio"  # Matches process_interview.py output
+
+ os.makedirs(TEMP_DIR, exist_ok=True)
+ os.makedirs(JSON_DIR, exist_ok=True)
+ os.makedirs(PDF_DIR, exist_ok=True)
+ os.makedirs(PROCESSED_DIR, exist_ok=True)
+
+ app.mount("/static/outputs", StaticFiles(directory=OUTPUT_DIR), name="static_outputs")
+
+ # Logging setup
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ VALID_EXTENSIONS = ('.wav', '.mp3', '.m4a', '.flac', '.webm', '.ogg', '.aac')
+ MAX_FILE_SIZE_MB = 300
+
+ BASE_URL = os.getenv("BASE_URL", "https://norhan12-evalbot-interview-analysis.hf.space")
+
+ class ProcessResponse(BaseModel):
+     summary: str
+     json_url: str
+     pdf_url: str
+
+ class ProcessAudioRequest(BaseModel):
+     file_url: HttpUrl
+     user_id: str
+
+ @app.post("/process-audio", response_model=ProcessResponse)
+ async def process_audio(request: ProcessAudioRequest = Body(...)):
+     file_url = request.file_url
+     user_id = request.user_id
+     local_path = None  # Defined up front so cleanup in the except block is safe
+     try:
+         file_ext = os.path.splitext(str(file_url))[1].lower()
+         if file_ext not in VALID_EXTENSIONS:
+             raise HTTPException(status_code=400, detail=f"Invalid file extension: {file_ext}")
+
+         local_filename = f"{user_id}_{uuid.uuid4().hex}{file_ext}"
+         local_path = os.path.join(TEMP_DIR, local_filename)
+
+         logger.info(f"Downloading file from {file_url} to {local_path}")
+         resp = requests.get(str(file_url), stream=True, timeout=30)
+         if resp.status_code != 200:
+             raise HTTPException(status_code=400, detail=f"Failed to download file from {file_url}: Status {resp.status_code}")
+
+         with open(local_path, "wb") as f:
+             for chunk in resp.iter_content(chunk_size=8192):
+                 if chunk:
+                     f.write(chunk)
+
+         file_size_mb = os.path.getsize(local_path) / (1024 * 1024)
+         if file_size_mb > MAX_FILE_SIZE_MB:
+             os.remove(local_path)
+             raise HTTPException(status_code=400, detail=f"File too large: {file_size_mb:.2f} MB")
+
+         logger.info(f"Processing audio file: {local_path}")
+         result = process_interview(local_path, user_id=user_id)
+         if not result or 'json_path' not in result or 'pdf_path' not in result:
+             os.remove(local_path)
+             raise HTTPException(status_code=500, detail="Processing failed")
+
+         json_dest = os.path.basename(result['json_path'])
+         pdf_dest = os.path.basename(result['pdf_path'])
+
+         shutil.copyfile(result['json_path'], os.path.join(JSON_DIR, json_dest))
+         shutil.copyfile(result['pdf_path'], os.path.join(PDF_DIR, pdf_dest))
+
+         with open(result['json_path'], "r") as jf:
+             analysis_data = json.load(jf)
+
+         voice = analysis_data.get('voice_analysis', {})
+         interpretation = voice.get('interpretation', {})
+         speakers = analysis_data.get('speakers', [])
+         total_duration = analysis_data.get('text_analysis', {}).get('total_duration', 0.0)
+
+         summary = (
+             f"User ID: {user_id}\n"
+             f"Speakers: {', '.join(speakers)}\n"
+             f"Duration: {total_duration:.2f} sec\n"
+             f"Confidence: {interpretation.get('confidence_level', 'N/A')}\n"
+             f"Anxiety: {interpretation.get('anxiety_level', 'N/A')}"
+         )
+
+         json_url = f"{BASE_URL}/static/outputs/json/{json_dest}"
+         pdf_url = f"{BASE_URL}/static/outputs/pdf/{pdf_dest}"
+
+         # Clean up temporary and original processed files
+         os.remove(local_path)
+         os.remove(result['json_path'])
+         os.remove(result['pdf_path'])
+
+         return ProcessResponse(summary=summary, json_url=json_url, pdf_url=pdf_url)
+
+     except requests.RequestException as e:
+         raise HTTPException(status_code=400, detail=f"Download error: {str(e)}")
+     except HTTPException:
+         # Re-raise HTTP errors unchanged so 400s are not rewrapped as 500s
+         raise
+     except Exception as e:
+         if local_path and os.path.exists(local_path):
+             os.remove(local_path)
+         raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}")
+
+ @app.get("/static/outputs/json/{filename}")
+ async def get_json_file(filename: str):
+     file_path = os.path.join(JSON_DIR, filename)
+     if not os.path.exists(file_path):
+         raise HTTPException(status_code=404, detail="JSON file not found")
+     return FileResponse(file_path, media_type="application/json", filename=filename)
+
+ @app.get("/static/outputs/pdf/{filename}")
+ async def get_pdf_file(filename: str):
+     file_path = os.path.join(PDF_DIR, filename)
+     if not os.path.exists(file_path):
+         raise HTTPException(status_code=404, detail="PDF file not found")
+     return FileResponse(file_path, media_type="application/pdf", filename=filename)
+
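+ # The Dockerfile HEALTHCHECK curls /health, which the file as uploaded does
+ # not define; a minimal endpoint along these lines is assumed:
+ @app.get("/health")
+ async def health():
+     return {"status": "ok"}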
process_interview.py ADDED
@@ -0,0 +1,486 @@
+ # ==============================================================================
+ # 1. IMPORTS
+ # ==============================================================================
+ import os
+ import torch
+ import numpy as np
+ import uuid
+ import requests
+ import time
+ import json
+ import re
+ import logging
+ import io
+ import subprocess
+ from contextlib import contextmanager
+ import tempfile
+ from typing import Dict, List
+
+ # Core AI & Audio Processing Libraries
+ from pydub import AudioSegment
+ from nemo.collections.asr.models import EncDecSpeakerLabelModel
+ from pinecone import Pinecone, ServerlessSpec
+ import librosa
+ import parselmouth
+ from parselmouth.praat import call
+ from transformers import AutoTokenizer, AutoModel
+ import spacy
+ import google.generativeai as genai
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Reporting & Visualization
+ from reportlab.lib.pagesizes import letter
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+ from reportlab.lib import colors
+ from reportlab.lib.enums import TA_CENTER, TA_JUSTIFY
+ from reportlab.lib.units import inch
+ import matplotlib.pyplot as plt
+ import matplotlib
+
+ matplotlib.use('Agg')
+
+ # Concurrency
+ from concurrent.futures import ThreadPoolExecutor
+
+ # ==============================================================================
+ # 2. CONFIGURATION AND INITIALIZATION
+ # ==============================================================================
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+ logging.getLogger("nemo_logging").setLevel(logging.ERROR)
+ logging.getLogger("nemo").setLevel(logging.ERROR)
+ logging.getLogger("transformers").setLevel(logging.ERROR)
+
+ OUTPUT_DIR = "./processed_audio"
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ PINECONE_KEY = os.getenv("PINECONE_KEY")
+ ASSEMBLYAI_KEY = os.getenv("ASSEMBLYAI_KEY")
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+ if not all([PINECONE_KEY, ASSEMBLYAI_KEY, GEMINI_API_KEY]):
+     raise ValueError("One or more required environment variables are missing.")
+
+ # Global variables for models and services
+ index, gemini_model, speaker_model, nlp, tokenizer, text_embedding_model = (None,) * 6
+
+
+ def initialize_all_services_and_models():
+     """Initializes all external services and loads all AI models into memory."""
+     global index, gemini_model, speaker_model, nlp, tokenizer, text_embedding_model
+     logger.info("Initializing all services and loading all models...")
+     pc = Pinecone(api_key=PINECONE_KEY)
+     index_name = "interview-speaker-embeddings"
+     if index_name not in pc.list_indexes().names():
+         pc.create_index(name=index_name, dimension=192, metric="cosine",
+                         spec=ServerlessSpec(cloud="aws", region="us-east-1"))
+     index = pc.Index(index_name)
+     genai.configure(api_key=GEMINI_API_KEY)
+     gemini_model = genai.GenerativeModel('gemini-1.5-flash')
+     speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large",
+                                                             map_location=device).eval()
+     nlp = spacy.load("en_core_web_sm")
+     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+     text_embedding_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device).eval()
+     logger.info("All services and models are ready.")
+
+
+ initialize_all_services_and_models()
+
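+ # Note: the Pinecone index dimension (192) matches the embedding size of the
+ # TitaNet-Large speaker model; identify_speakers() below depends on this.
+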
+ # ==============================================================================
+ # 3. HELPER AND UTILITY FUNCTIONS
+ # ==============================================================================
+ @contextmanager
+ def temp_audio_file(suffix='.wav'):
+     temp_file_path = None
+     try:
+         fd, temp_file_path = tempfile.mkstemp(suffix=suffix)
+         os.close(fd)
+         yield temp_file_path
+     finally:
+         if temp_file_path and os.path.exists(temp_file_path):
+             os.remove(temp_file_path)
+
+
+ def convert_to_wav(input_path: str) -> str:
+     temp_wav_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False).name
+     try:
+         command = ['ffmpeg', '-y', '-i', input_path, '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1',
+                    temp_wav_file]
+         subprocess.run(command, check=True, capture_output=True, text=True)
+         return temp_wav_file
+     except Exception as e:
+         if os.path.exists(temp_wav_file):
+             os.remove(temp_wav_file)
+         logger.error(f"Audio conversion failed: {e}", exc_info=True)
+         raise
+
+
+ def transcribe(audio_path: str) -> Dict:
+     try:
+         headers = {"authorization": ASSEMBLYAI_KEY}
+         with open(audio_path, 'rb') as f:
+             upload_response = requests.post("https://api.assemblyai.com/v2/upload", headers=headers, data=f)
+         upload_response.raise_for_status()
+         audio_url = upload_response.json()['upload_url']
+         transcript_response = requests.post("https://api.assemblyai.com/v2/transcript", headers=headers,
+                                             json={"audio_url": audio_url, "speaker_labels": True,
+                                                   "filter_profanity": True})
+         transcript_response.raise_for_status()
+         transcript_id = transcript_response.json()['id']
+         logger.info(f"Transcription submitted. Polling for results (ID: {transcript_id})...")
+         while True:
+             result = requests.get(f"https://api.assemblyai.com/v2/transcript/{transcript_id}", headers=headers).json()
+             if result['status'] == 'completed':
+                 return result
+             if result['status'] == 'error':
+                 raise Exception(f"Transcription failed: {result['error']}")
+             time.sleep(5)
+     except Exception as e:
+         logger.error(f"Transcription failed: {e}", exc_info=True)
+         raise
+
+
+ def identify_speakers(transcript: Dict, wav_file_path: str) -> List[Dict]:
+     try:
+         full_audio = AudioSegment.from_wav(wav_file_path)
+
+         def process_utterance(utterance):
+             start_ms, end_ms = utterance['start'], utterance['end']
+             if end_ms - start_ms < 1000:
+                 return {**utterance, 'speaker_id': 'unknown_short_utterance'}
+             with temp_audio_file() as temp_path:
+                 full_audio[start_ms:end_ms].export(temp_path, format="wav")
+                 with torch.no_grad():
+                     embedding = speaker_model.get_embedding(temp_path).cpu().numpy().flatten().tolist()
+                 query_result = index.query(vector=embedding, top_k=1, include_metadata=True)
+                 if query_result.get('matches') and query_result['matches'][0]['score'] > 0.75:
+                     match = query_result['matches'][0]
+                     return {**utterance, 'speaker_id': match['id'],
+                             'speaker_name': match['metadata'].get('speaker_name', 'Unknown Speaker')}
+                 else:
+                     speaker_id = f"speaker_{uuid.uuid4().hex[:8]}"
+                     # Name new speakers after the running total of stored embeddings.
+                     # (The original wrapped the integer vector count in len(), which
+                     # raises a TypeError; total_vector_count is used instead.)
+                     vector_count = index.describe_index_stats().get('total_vector_count', 0)
+                     speaker_name = f"Speaker {vector_count + 1}"
+                     index.upsert(vectors=[(speaker_id, embedding, {"speaker_name": speaker_name})])
+                     return {**utterance, 'speaker_id': speaker_id, 'speaker_name': speaker_name}
+
+         with ThreadPoolExecutor() as executor:
+             return list(executor.map(process_utterance, transcript.get('utterances', [])))
+     except Exception as e:
+         logger.error(f"Speaker identification failed: {e}", exc_info=True)
+         raise
+
+
+ def get_text_embedding(text: str) -> np.ndarray:
+     with torch.no_grad():
+         inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True).to(device)
+         outputs = text_embedding_model(**inputs)
+     # Use the [CLS] token's final hidden state as the sentence embedding
+     return outputs.last_hidden_state[0, 0, :].cpu().numpy()
+
+
+ def extract_detailed_prosodic_features(audio_segment: AudioSegment) -> Dict:
+     try:
+         with temp_audio_file() as temp_path:
+             audio_segment.export(temp_path, format="wav")
+             y, sr = librosa.load(temp_path, sr=16000)
+         if len(y) == 0:
+             return {'pitch_std': 0}
+         f0, _, _ = librosa.pyin(y, fmin=80, fmax=400, sr=sr)
+         f0_values = f0[~np.isnan(f0)]
+         return {'pitch_std': float(np.std(f0_values)) if len(f0_values) > 1 else 0}
+     except Exception:
+         return {'pitch_std': 0}
+
+
+ def extract_duration_feature(utterances: List[Dict]) -> List[Dict]:
+     for u in utterances:
+         u['prosodic_features'] = {'duration': (u['end'] - u['start']) / 1000.0}
+     return utterances
+
+
+ def convert_to_serializable(obj):
+     if isinstance(obj, (np.integer, np.floating)):
+         return obj.item()
+     if isinstance(obj, np.ndarray):
+         return obj.tolist()
+     if isinstance(obj, dict):
+         return {k: convert_to_serializable(v) for k, v in obj.items()}
+     if isinstance(obj, list):
+         return [convert_to_serializable(item) for item in obj]
+     return obj
+
+
+ # ==============================================================================
+ # 4. CORE LOGIC - ULTIMATE ROLE CLASSIFIER
+ # ==============================================================================
+ def classify_roles_ultimate(utterances: List[Dict], audio_path: str) -> List[Dict]:
+     logger.info("Starting ULTIMATE role classification with prosodic analysis...")
+     full_audio = AudioSegment.from_wav(audio_path)
+     speakers = {u['speaker_id'] for u in utterances if 'speaker_id' in u and not u['speaker_id'].startswith('unknown')}
+     if len(speakers) < 2:
+         return utterances
+     speaker_data = {sid: {'rule_score': 0, 'prosodic_score': 0, 'utterance_count': 0, 'embeddings': []}
+                     for sid in speakers}
+     interviewer_keywords = r'\b(what|why|how|when|where|who|which|tell me about|can you explain|describe|give me an example)\b'
+     for u in utterances:
+         sid, text = u.get('speaker_id'), u.get('text', '').lower()
+         if sid not in speaker_data or not text:
+             continue
+         rule_score = 10 if text.endswith('?') else 0
+         rule_score += 5 * len(re.findall(interviewer_keywords, text))
+         rule_score += 2 if len(text.split()) < 10 else -5 if len(text.split()) > 30 else 0
+         speaker_data[sid]['rule_score'] += rule_score
+         segment = full_audio[u['start']:u['end']]
+         prosodic_features = extract_detailed_prosodic_features(segment)
+         speaker_data[sid]['prosodic_score'] += -5 if prosodic_features['pitch_std'] > 40 else 2
+         speaker_data[sid]['embeddings'].append(get_text_embedding(u['text']))
+         speaker_data[sid]['utterance_count'] += 1
+     canonical_question_embedding = get_text_embedding("Tell me about your experience and skills.")
+     for sid, data in speaker_data.items():
+         if not data['embeddings']:
+             data['semantic_score'] = 0
+             continue
+         avg_embedding = np.mean(data['embeddings'], axis=0).reshape(1, -1)
+         data['semantic_score'] = cosine_similarity(avg_embedding, canonical_question_embedding.reshape(1, -1))[0][0]
+     final_scores = {}
+     for sid, data in speaker_data.items():
+         if data['utterance_count'] == 0:
+             final_scores[sid] = -999
+             continue
+         avg_rule_score = data['rule_score'] / data['utterance_count']
+         avg_prosodic_score = data['prosodic_score'] / data['utterance_count']
+         final_scores[sid] = (avg_rule_score * 0.5) + (data['semantic_score'] * 0.3) + (avg_prosodic_score * 0.2)
+     sorted_speakers = sorted(final_scores.items(), key=lambda item: item[1], reverse=True)
+     interviewer_id, interviewee_id = sorted_speakers[0][0], sorted_speakers[1][0]
+     logger.info(f"Ultimate Role Classification: Interviewer -> {interviewer_id}, Interviewee -> {interviewee_id}")
+     for u in utterances:
+         u['role'] = ('Interviewer' if u.get('speaker_id') == interviewer_id
+                      else 'Interviewee' if u.get('speaker_id') == interviewee_id
+                      else 'Unknown')
+     return utterances
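+
+ # Worked example of the blend above: a speaker averaging rule_score 8,
+ # semantic similarity 0.4, and prosodic score 1.5 scores
+ # 8*0.5 + 0.4*0.3 + 1.5*0.2 = 4.42; the highest-scoring speaker is
+ # labelled Interviewer, the runner-up Interviewee.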
+
+
+ # ==============================================================================
+ # 5. YOUR CUSTOM ANALYSIS & REPORTING FUNCTIONS
+ # ==============================================================================
+ def analyze_interviewee_voice(audio_path: str, utterances: List[Dict]) -> Dict:
+     logger.info("Performing detailed voice analysis using your custom function...")
+     try:
+         y, sr = librosa.load(audio_path, sr=16000)
+         interviewee_utterances = [u for u in utterances if u.get('role') == 'Interviewee']
+         if not interviewee_utterances:
+             return {'error': 'No interviewee utterances found'}
+         segments = [y[int(u['start'] * sr / 1000):int(u['end'] * sr / 1000)] for u in interviewee_utterances]
+         if not segments:
+             return {'error': 'No valid interviewee segments to analyze.'}
+         combined_audio = np.concatenate(segments)
+         total_duration = sum(u['prosodic_features']['duration'] for u in interviewee_utterances)
+         total_words = sum(len(u['text'].split()) for u in interviewee_utterances)
+         speaking_rate = total_words / total_duration if total_duration > 0 else 0
+         filler_words = ['um', 'uh', 'like', 'you know', 'so', 'i mean']
+         filler_count = sum(sum(u['text'].lower().count(fw) for fw in filler_words) for u in interviewee_utterances)
+         filler_ratio = filler_count / total_words if total_words > 0 else 0
+         # Count repeated bigrams (the original looked tuples up in a list of
+         # strings, so every count was zero and repetition_score was always 0)
+         all_words = ' '.join(u['text'].lower() for u in interviewee_utterances).split()
+         word_counts = {}
+         for i in range(len(all_words) - 1):
+             bigram = tuple(all_words[i:i + 2])
+             word_counts[bigram] = word_counts.get(bigram, 0) + 1
+         repetition_score = sum(1 for count in word_counts.values() if count > 1) / len(word_counts) if word_counts else 0
+         f0, voiced_flag, _ = librosa.pyin(combined_audio, fmin=80, fmax=300, sr=sr)
+         f0_values = f0[voiced_flag & ~np.isnan(f0)]
+         pitch_mean = np.mean(f0_values) if len(f0_values) > 0 else 0
+         pitch_std = np.std(f0_values) if len(f0_values) > 0 else 0
+         jitter = np.mean(np.abs(np.diff(f0_values))) / pitch_mean if len(f0_values) > 1 and pitch_mean > 0 else 0
+         rms = librosa.feature.rms(y=combined_audio)[0]
+         intensity_mean = np.mean(rms) if len(rms) > 0 else 0
+         intensity_std = np.std(rms) if len(rms) > 0 else 0
+         shimmer = np.mean(np.abs(np.diff(rms))) / intensity_mean if len(rms) > 1 and intensity_mean > 0 else 0
+         anxiety_score = 0.6 * (pitch_std / pitch_mean if pitch_mean > 0 else 0) + 0.4 * (jitter + shimmer)
+         confidence_score = 0.7 * (1 / (1 + intensity_std)) + 0.3 * (1 / (1 + filler_ratio))
+         hesitation_score = filler_ratio + repetition_score
+         return {'speaking_rate': round(speaking_rate, 2), 'filler_ratio': round(filler_ratio, 4),
+                 'repetition_score': round(repetition_score, 4),
+                 'pitch_analysis': {'mean': float(pitch_mean), 'std_dev': float(pitch_std), 'jitter': float(jitter)},
+                 'intensity_analysis': {'mean': float(intensity_mean), 'std_dev': float(intensity_std),
+                                        'shimmer': float(shimmer)},
+                 'composite_scores': {'anxiety': float(anxiety_score), 'confidence': float(confidence_score),
+                                      'hesitation': float(hesitation_score)}}
+     except Exception as e:
+         logger.error(f"Error in detailed voice analysis: {e}", exc_info=True)
+         return {'error': str(e)}
+
+
+ def generate_voice_interpretation(analysis: Dict) -> str:
+     if 'error' in analysis:
+         return "<b>Detailed Vocal Metrics:</b><br/>Analysis not available."
+     scores = analysis.get('composite_scores', {})
+     pitch = analysis.get('pitch_analysis', {})
+     intensity = analysis.get('intensity_analysis', {})
+     return (f"<b>Detailed Vocal Metrics Interpretation:</b><br/>"
+             f"- Speaking Rate: {analysis.get('speaking_rate', 0):.2f} words/sec<br/>"
+             f"- Filler Word Ratio: {analysis.get('filler_ratio', 0) * 100:.1f}%<br/>"
+             f"-----------------------------------<br/>"
+             f"- Pitch Mean: {pitch.get('mean', 0):.2f} Hz (Std Dev: {pitch.get('std_dev', 0):.2f})<br/>"
+             f"- Jitter (Vocal Stability): {pitch.get('jitter', 0):.4f}<br/>"
+             f"- Intensity (Loudness) Std Dev: {intensity.get('std_dev', 0):.4f}<br/>"
+             f"-----------------------------------<br/>"
+             f"- <b>Anxiety Score:</b> {scores.get('anxiety', 0):.3f}<br/>"
+             f"- <b>Confidence Score:</b> {scores.get('confidence', 0):.3f}<br/>"
+             f"- <b>Hesitation Score:</b> {scores.get('hesitation', 0):.3f}")
+
+
+ def generate_anxiety_confidence_chart(composite_scores: Dict, chart_path_or_buffer):
+     try:
+         labels = ['Anxiety', 'Confidence', 'Hesitation']
+         scores = [composite_scores.get(k.lower(), 0) for k in labels]
+         fig, ax = plt.subplots(figsize=(6, 4))
+         ax.bar(labels, scores, color=['#FF6B6B', '#4ECDC4', '#FFA500'], edgecolor='black', width=0.5)
+         ax.set_ylabel('Score')
+         ax.set_title('Candidate Vocal Dynamics')
+         ax.set_ylim(0, max(scores) * 1.2 if scores and max(scores) > 0 else 1)
+         for bar in ax.patches:
+             ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
+                     f"{bar.get_height():.2f}", ha='center', color='black')
+         plt.tight_layout()
+         plt.savefig(chart_path_or_buffer, format='png', dpi=150)
+         plt.close(fig)
+     except Exception as e:
+         logger.error(f"Error generating chart: {e}")
+
+
+ def calculate_acceptance_probability(analysis_data: Dict) -> float:
+     logger.info("Calculating final acceptance probability...")
+     voice_metrics = analysis_data.get('voice_analysis_metrics', {})
+     if 'error' in voice_metrics or not voice_metrics.get('composite_scores'):
+         return 30.0
+     scores = voice_metrics['composite_scores']
+     confidence = scores.get('confidence', 0.5)
+     anxiety = scores.get('anxiety', 0.5)
+     hesitation = scores.get('hesitation', 0.5)
+     raw_score = (confidence * 0.6) + ((1 - anxiety) * 0.2) + ((1 - hesitation) * 0.2)
+     max_score = 0.6 + 0.2 + 0.2
+     return round(max(10.0, min(99.0, (raw_score / max_score if max_score > 0 else 0) * 100)), 2)
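+
+ # Worked example: confidence 0.8, anxiety 0.3, hesitation 0.2 gives
+ # raw_score = 0.8*0.6 + 0.7*0.2 + 0.8*0.2 = 0.78, i.e. 78.0 after
+ # scaling by 100 and applying the 10-99 clamp.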
+
+
+ # ==============================================================================
+ # 6. AI-POWERED NARRATIVE AND PDF REPORTING
+ # ==============================================================================
+ def generate_gemini_report_text(analysis_data: Dict) -> str:
+     """Generates a comprehensive narrative report using the Gemini model, based on your prompt structure."""
+     logger.info("Generating AI-powered narrative report with Gemini...")
+     voice = analysis_data.get('voice_analysis_metrics', {})
+     interviewee_text = "\n".join(
+         [f"- {u['text']}" for u in analysis_data['transcript_with_roles'] if u.get('role') == 'Interviewee'])
+     acceptance_prob = analysis_data.get('acceptance_probability', 50.0)
+
+     # Format the scores up front; applying :.2f to the 'N/A' fallback string
+     # (as the original did) raises a ValueError
+     comp = voice.get('composite_scores', {})
+     confidence_str = f"{comp['confidence']:.2f}" if 'confidence' in comp else 'N/A'
+     anxiety_str = f"{comp['anxiety']:.2f}" if 'anxiety' in comp else 'N/A'
+
+     prompt = f"""
+ You are EvalBot, a highly experienced senior HR analyst generating a comprehensive interview evaluation report.
+ Analyze deeply based on actual responses provided below. Avoid generic analysis.
+ Maintain professional, HR-standard language with clear structure and bullet points.
+
+ **Suitability Score: {acceptance_prob:.2f}%**
+
+ ### Interviewee Full Responses:
+ {interviewee_text if interviewee_text else "No responses recorded."}
+
+ ### Key Metrics:
+ - Confidence Score: {confidence_str}
+ - Anxiety Score: {anxiety_str}
+ - Speaking Rate: {voice.get('speaking_rate', 'N/A')} words/sec
+
+ ### Report Sections to Generate (Follow this structure exactly):
+ **1. Executive Summary:**
+ - 3 bullets summarizing performance, key strengths, and hiring recommendation.
+ **2. Communication and Vocal Dynamics:**
+ - Analyze delivery: speaking rate, filler words, confidence, anxiety. Provide 3-4 insightful bullets and 1 actionable recommendation.
+ **3. Competency and Content:**
+ - Identify 5-8 strengths (e.g., leadership, teamwork) with concrete examples from their responses.
+ - Identify 5-10 weaknesses or development areas with actionable feedback.
+ **4. Role Fit and Potential:**
+ - Analyze role fit, cultural fit, and growth potential in 3 bullets.
+ **5. Recommendations & Next Steps for Hiring Managers:**
+ - Provide 5 actionable recommendations and 5 clear next steps.
+ """
+     try:
+         response = gemini_model.generate_content(prompt)
+         return response.text
+     except Exception as e:
+         logger.error(f"Gemini report generation failed: {e}")
+         return "Error: Could not generate AI analysis report."
+
+
+ def create_pdf_report(analysis_data: Dict, output_path: str):
+     """Generates a detailed, professional PDF report including all analysis sections, based on your structure."""
+     logger.info(f"Generating comprehensive PDF report at {output_path}...")
+     doc = SimpleDocTemplate(output_path, pagesize=letter, topMargin=inch, bottomMargin=inch)
+     styles = getSampleStyleSheet()
+     styles.add(ParagraphStyle(name='H1', fontSize=18, leading=22, spaceAfter=12, textColor=colors.HexColor('#003087'),
+                               fontName='Helvetica-Bold', alignment=TA_CENTER))
+     styles.add(ParagraphStyle(name='H2', fontSize=14, leading=18, spaceBefore=12, spaceAfter=8,
+                               textColor=colors.HexColor('#0050BC'), fontName='Helvetica-Bold'))
+     # H3 and an indented bullet style are used below, so they must be registered
+     # here (the original omitted them, causing a KeyError and a TypeError)
+     styles.add(ParagraphStyle(name='H3', fontSize=12, leading=15, spaceBefore=8, spaceAfter=6,
+                               fontName='Helvetica-Bold'))
+     styles.add(ParagraphStyle(name='Body', fontSize=10, leading=14, spaceAfter=6, alignment=TA_JUSTIFY))
+     styles.add(ParagraphStyle(name='BulletItem', parent=styles['Body'], leftIndent=10))
+     story = []
+
+     # Cover Page
+     story.append(Paragraph("Candidate Interview Analysis Report", styles['H1']))
+     story.append(Spacer(1, 0.2 * inch))
+     story.append(Paragraph(f"Candidate ID: {analysis_data.get('user_id', 'N/A')}", styles['Body']))
+     story.append(Paragraph(f"Date of Analysis: {time.strftime('%B %d, %Y')}", styles['Body']))
+     prob = analysis_data.get('acceptance_probability', 0)
+     prob_color = 'green' if prob >= 75 else 'orange' if prob >= 50 else 'red'
+     story.append(
+         Paragraph(f"<b>Overall Suitability Score:</b> <font size=16 color='{prob_color}'>{prob}%</font>", styles['H2']))
+     story.append(PageBreak())
+
+     # Quantitative Analysis Page
+     story.append(Paragraph("Quantitative Vocal Analysis", styles['H2']))
+     if analysis_data.get('chart_image_bytes'):
+         story.append(Image(io.BytesIO(analysis_data['chart_image_bytes']), width=5.5 * inch, height=3.3 * inch))
+         story.append(Spacer(1, 0.2 * inch))
+
+     voice_text = analysis_data.get('voice_interpretation_text', 'Not available.').replace('\n', '<br/>')
+     story.append(Paragraph(voice_text, styles['Body']))
+     story.append(Spacer(1, 0.2 * inch))
+
+     # AI-Generated Narrative Page
+     story.append(Paragraph("Qualitative AI-Powered Report", styles['H2']))
+     gemini_text = analysis_data.get('gemini_report_text', 'Not available.')
+     for line in gemini_text.split('\n'):
+         line = line.strip()
+         if not line:
+             continue
+         if line.startswith('**') and line.endswith('**'):
+             story.append(Paragraph(line.strip('*'), styles['H3']))
+         elif line.startswith('- ') or line.startswith('* '):
+             story.append(Paragraph(f"• {line[2:]}", styles['BulletItem']))
+         else:
+             story.append(Paragraph(line, styles['Body']))
+
+     doc.build(story)
+     logger.info("PDF report generated successfully.")
+
+
+ # ==============================================================================
+ # 7. MAIN PROCESSING PIPELINE
+ # ==============================================================================
+ def process_interview(audio_path: str, user_id: str = "candidate-123") -> Dict:
+     """The main orchestrator function to process an interview from start to finish."""
+     wav_file = None
+     try:
+         logger.info(f"===== STARTING FULL ANALYSIS FOR USER: {user_id} | FILE: {audio_path} =====")
+         wav_file = convert_to_wav(audio_path)
+         transcript = transcribe(wav_file)
+         if not transcript or 'utterances' not in transcript:
+             raise ValueError("Transcription failed.")
+
+         utterances_with_speakers = identify_speakers(transcript, wav_file)
+         classified_utterances = classify_roles_ultimate(utterances_with_speakers, wav_file)
+         utterances_with_features = extract_duration_feature(classified_utterances)
+         voice_analysis = analyze_interviewee_voice(wav_file, utterances_with_features)
+
+         analysis_data = {'user_id': user_id, 'source_file': audio_path,
+                          'transcript_with_roles': utterances_with_features,
+                          'voice_analysis_metrics': voice_analysis}
+
+         analysis_data['acceptance_probability'] = calculate_acceptance_probability(analysis_data)
+         analysis_data['voice_interpretation_text'] = generate_voice_interpretation(voice_analysis)
+         analysis_data['gemini_report_text'] = generate_gemini_report_text(analysis_data)
+
+         with io.BytesIO() as chart_buffer:
+             generate_anxiety_confidence_chart(voice_analysis.get('composite_scores', {}), chart_buffer)
+             analysis_data['chart_image_bytes'] = chart_buffer.getvalue()
+
+         pdf_path = os.path.join(OUTPUT_DIR, f"{user_id}_{time.strftime('%Y%m%d')}_report.pdf")
+         create_pdf_report(analysis_data, pdf_path)
+
+         json_path = os.path.join(OUTPUT_DIR, f"{user_id}_{time.strftime('%Y%m%d')}_analysis.json")
+         with open(json_path, 'w') as f:
+             json_friendly_data = {k: v for k, v in analysis_data.items() if k != 'chart_image_bytes'}
+             json.dump(convert_to_serializable(json_friendly_data), f, indent=4)
+
+         logger.info(f"===== ANALYSIS COMPLETED. PDF: {pdf_path}, JSON: {json_path} =====")
+         return {'status': 'Success', 'pdf_path': pdf_path, 'json_path': json_path}
+
+     except Exception as e:
+         logger.critical(f"!!!!!! A CRITICAL ERROR OCCURRED IN THE PIPELINE for {user_id}: {e}", exc_info=True)
+         return {'status': 'Error', 'message': str(e)}
+     finally:
+         if wav_file and os.path.exists(wav_file):
+             os.remove(wav_file)
+
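+ # Minimal direct-usage sketch (assumes PINECONE_KEY, ASSEMBLYAI_KEY, and
+ # GEMINI_API_KEY are set in the environment):
+ #   result = process_interview("/path/to/interview.mp3", user_id="candidate-123")
+ #   print(result)  # {'status': 'Success', 'pdf_path': ..., 'json_path': ...}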
requirements.txt ADDED
@@ -0,0 +1,75 @@
+ Levenshtein
+ braceexpand
+ distance
+ docopt
+ fiddle
+ fsspec
+ g2p_en
+ hydra-core
+ intervaltree
+ jiwer
+ kaldi-python-io
+ kaldiio
+ lhotse
+ libcst
+ lightning
+ lilcom
+ llvmlite
+ loguru
+ mediapy
+ einops
+ nemo_toolkit
+ numba
+ nvidia-cublas-cu12
+ nvidia-cudnn-cu12
+ nvidia-cufft-cu12
+ nvidia-curand-cu12
+ nvidia-cusolver-cu12
+ nvidia-cusparse-cu12
+ nvidia-nvjitlink-cu12
+ pinecone
+ pinecone-plugin-interface
+ plac
+ pyannote.core
+ pyannote.database
+ pyannote.metrics
+ pyloudnorm
+ rapidfuzz
+ reportlab
+ resampy
+ ruamel.yaml
+ ruamel.yaml.clib
+ sacremoses
+ sox
+ texterrors
+ transformers
+ webdataset
+ wget
+ fastapi
+ uvicorn
+ pydub
+ librosa
+ spacy
+ google-generativeai
+ joblib
+ pandas
+ scikit-learn
+ numpy
+ torch
+ requests
+ sentencepiece
+ datasets
+ editdistance
+ python-multipart
+ hf_xet
+ huggingface_hub
+ gradio
+ hf_transfer
+ matplotlib
+ seaborn
+ Pillow
+ Flask
+ gunicorn
+ praat-parselmouth