Commit ·
dd93e44
0
Parent(s):
Initial commit
Browse files- Dockerfile +26 -0
- README.md +31 -0
- api_server.py +354 -0
- audio_analyzer.py +701 -0
- dashboard.html +510 -0
- main.py +194 -0
- req.txt +8 -0
- requirements.txt +9 -0
- stereo_diarizer.py +556 -0
- whisper_transcriber.py +186 -0
Dockerfile
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

WORKDIR /app

# System packages: ffmpeg for audio decoding, libsndfile1 for soundfile,
# git for pip installs from VCS. Remove apt lists to keep the image small.
RUN apt-get update && apt-get install -y \
    ffmpeg \
    libsndfile1 \
    git \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies before copying sources so Docker layer caching
# skips the reinstall when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY . .

# Runtime directories for processed results and incoming uploads
RUN mkdir -p /app/output /app/uploads

# HuggingFace Spaces routes traffic to port 7860
EXPOSE 7860

# Start the Flask API server
CMD ["python", "api_server.py"]
|
README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: ASR Audio Intelligence Platform
|
| 3 |
+
emoji: 🎙️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
license: mit
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# ASR Audio Intelligence Platform
|
| 13 |
+
|
| 14 |
+
Enterprise-grade Speech Analytics & Transcription system for Azerbaijani language.
|
| 15 |
+
|
| 16 |
+
## Features
|
| 17 |
+
|
| 18 |
+
- **Speaker Diarization**: Automatic separation of speakers (stereo/mono support)
|
| 19 |
+
- **Speech Transcription**: Whisper-based transcription for Azerbaijani
|
| 20 |
+
- **Audio Analysis**: Professional audio quality metrics and insights
|
| 21 |
+
- **Real-time Processing**: Upload and analyze audio files instantly
|
| 22 |
+
|
| 23 |
+
## Supported Formats
|
| 24 |
+
|
| 25 |
+
WAV, MP3, M4A, FLAC, OGG, OPUS
|
| 26 |
+
|
| 27 |
+
## Usage
|
| 28 |
+
|
| 29 |
+
1. Upload an audio file using the web interface
|
| 30 |
+
2. Wait for processing (diarization, transcription, analysis)
|
| 31 |
+
3. View detailed analysis results including speaker profiles and transcripts
|
api_server.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ASR Audio Analysis API Server
|
| 3 |
+
|
| 4 |
+
Enterprise-grade REST API for audio processing:
|
| 5 |
+
- Diarization (stereo/mono)
|
| 6 |
+
- Whisper Transcription
|
| 7 |
+
- Professional Audio Analysis
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import json
|
| 12 |
+
import uuid
|
| 13 |
+
import threading
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
from flask import Flask, jsonify, send_from_directory, request
|
| 17 |
+
from flask_cors import CORS
|
| 18 |
+
from werkzeug.utils import secure_filename
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
app = Flask(__name__)
|
| 22 |
+
CORS(app)
|
| 23 |
+
|
| 24 |
+
# Configuration
|
| 25 |
+
BASE_DIR = Path(os.environ.get("APP_DIR", "/app"))
|
| 26 |
+
OUTPUT_FOLDER = BASE_DIR / "output"
|
| 27 |
+
UPLOAD_FOLDER = BASE_DIR / "uploads"
|
| 28 |
+
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "Akramz/whisper-small-az")
|
| 29 |
+
ALLOWED_EXTENSIONS = {'wav', 'mp3', 'm4a', 'flac', 'ogg', 'opus'}
|
| 30 |
+
|
| 31 |
+
# Job tracking
|
| 32 |
+
processing_jobs = {}
|
| 33 |
+
job_lock = threading.Lock()
|
| 34 |
+
|
| 35 |
+
# Create folders
|
| 36 |
+
OUTPUT_FOLDER.mkdir(exist_ok=True)
|
| 37 |
+
UPLOAD_FOLDER.mkdir(exist_ok=True)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def allowed_file(filename):
    """Return True if *filename* carries an extension in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def process_audio_file(job_id, audio_path, output_dir):
    """Run the full pipeline for one job: diarization -> transcription -> analysis.

    Executed on a background thread; progress is published into the shared
    ``processing_jobs`` dict under ``job_lock``.
    """
    def update_job(**fields):
        # Mutate the shared job record atomically.
        with job_lock:
            processing_jobs[job_id].update(fields)

    try:
        update_job(status='processing', stage='initializing')

        # Imported lazily so the heavy ML stack loads only when a job runs.
        from stereo_diarizer import StereoCallDiarizer
        from whisper_transcriber import WhisperTranscriber
        from audio_analyzer import AudioAnalyzer

        # Step 1: Diarization
        update_job(stage='diarization')
        diarizer = StereoCallDiarizer(str(audio_path), verbose=False)
        diarizer.load_audio()
        update_job(is_stereo=diarizer.is_stereo)

        left_seg, right_seg = diarizer.detect_speech_segments()
        diarizer.create_timeline(left_seg, right_seg)

        segment_files = diarizer.export_segments(str(output_dir))
        diarizer.export_full_speakers(str(output_dir))
        diarizer.export_transcript_txt(str(output_dir))
        diarizer.export_transcript_json(str(output_dir))

        # Step 2: Transcription
        update_job(stage='transcription')
        whisper = WhisperTranscriber(WHISPER_MODEL, device="cpu", verbose=False)
        transcribed = whisper.transcribe_segments(segment_files, diarizer.timeline)
        whisper.export_transcription(transcribed, str(output_dir))

        # Step 3: Audio analysis
        update_job(stage='audio_analysis')
        analyzer = AudioAnalyzer(verbose=False)
        analysis = analyzer.analyze_call(
            segment_files=segment_files,
            timeline=diarizer.timeline,
            call_id=output_dir.name,
            is_stereo=diarizer.is_stereo,
        )
        analyzer.export_analysis(analysis, str(output_dir))

        # Publish the final result for /api/jobs polling.
        update_job(
            status='completed',
            stage='done',
            result={
                'call_name': output_dir.name,
                'is_stereo': diarizer.is_stereo,
                'quality_score': analysis.overall_quality_score,
            },
        )

    except Exception as e:
        update_job(status='failed', error=str(e))
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
@app.route('/')
def index():
    """Serve the single-page dashboard UI."""
    return send_from_directory('.', 'dashboard.html')
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
@app.route('/api/calls')
def get_calls():
    """List call directories that contain a finished analysis, newest first."""
    try:
        root = Path(OUTPUT_FOLDER)
        if not root.exists():
            return jsonify([])

        # A call counts only once its audio_analysis.json has been written.
        completed = sorted(
            (entry.name for entry in root.iterdir()
             if entry.is_dir() and (entry / 'audio_analysis.json').exists()),
            reverse=True,
        )
        return jsonify(completed)
    except Exception as e:
        return jsonify({'error': str(e)}), 500
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
@app.route('/api/analysis/<call_name>')
def get_analysis(call_name):
    """Return the combined analysis, transcription and metadata for one call."""
    try:
        call_path = Path(OUTPUT_FOLDER) / call_name

        if not call_path.exists():
            return jsonify({'error': 'Call not found'}), 404

        analysis_file = call_path / 'audio_analysis.json'
        if not analysis_file.exists():
            return jsonify({'error': 'Analysis not found'}), 404

        def read_json(path):
            # Parse one UTF-8 JSON artifact from the call directory.
            with open(path, 'r', encoding='utf-8') as fh:
                return json.load(fh)

        analysis = read_json(analysis_file)

        # Companion files are optional (absent for partially processed calls).
        trans_file = call_path / 'transcription.json'
        transcription = read_json(trans_file) if trans_file.exists() else None

        stats_file = call_path / 'transcript.json'
        stats = read_json(stats_file).get('metadata') if stats_file.exists() else None

        return jsonify({
            'call_name': call_name,
            'analysis': analysis,
            'transcription': transcription,
            'statistics': stats,
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
@app.route('/api/audio/<call_name>/<filename>')
def get_audio(call_name, filename):
    """Serve an exported audio file from a call's output directory.

    ``filename`` is sanitized by Flask's send_from_directory, but the
    directory component ``call_name`` was previously joined unchecked, so
    a value like ``..`` could serve files from outside OUTPUT_FOLDER.
    """
    try:
        call_path = (Path(OUTPUT_FOLDER) / call_name).resolve()
        # SECURITY FIX: the resolved directory must remain strictly inside
        # OUTPUT_FOLDER; reject traversal attempts with the same 404 shape.
        if Path(OUTPUT_FOLDER).resolve() not in call_path.parents:
            return jsonify({'error': 'Invalid call name'}), 404
        return send_from_directory(call_path, filename)
    except Exception as e:
        return jsonify({'error': str(e)}), 404
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
@app.route('/api/statistics')
def get_statistics():
    """Aggregate corpus-level statistics across every analyzed call."""
    try:
        root = Path(OUTPUT_FOLDER)
        if not root.exists():
            return jsonify({'error': 'Output folder not found'}), 404

        stats = {
            'total_calls': 0,
            'stereo_calls': 0,
            'mono_calls': 0,
            'avg_quality_score': 0,
            'avg_duration': 0,
            'avg_clarity': 0,
            'avg_confidence': 0,
            'total_segments': 0,
            'emotion_distribution': {},
            'communication_styles': {},
        }

        # Raw samples gathered across calls; averaged at the end.
        quality_scores, durations = [], []
        clarities, confidences = [], []
        emotions, styles = [], []

        for entry in root.iterdir():
            if not entry.is_dir():
                continue
            analysis_file = entry / 'audio_analysis.json'
            if not analysis_file.exists():
                continue
            with open(analysis_file, 'r', encoding='utf-8') as fh:
                analysis = json.load(fh)

            stats['total_calls'] += 1
            if analysis.get('audio_type') == 'stereo':
                stats['stereo_calls'] += 1
            else:
                stats['mono_calls'] += 1

            if analysis.get('overall_quality_score'):
                quality_scores.append(float(analysis['overall_quality_score']))
            if analysis.get('audio_duration'):
                durations.append(float(analysis['audio_duration']))

            segments = analysis.get('segments', [])
            stats['total_segments'] += len(segments)

            for seg in segments:
                if seg.get('voice_quality', {}).get('clarity_score'):
                    clarities.append(float(seg['voice_quality']['clarity_score']))
                if seg.get('emotion', {}).get('confidence_score'):
                    confidences.append(float(seg['emotion']['confidence_score']))
                if seg.get('emotion', {}).get('primary_emotion'):
                    emotions.append(seg['emotion']['primary_emotion'])

            for profile in analysis.get('speaker_profiles', {}).values():
                if profile.get('communication_style'):
                    styles.append(profile['communication_style'])

        def rounded_mean(values):
            # One-decimal average; callers guard against empty lists.
            return round(sum(values) / len(values), 1)

        if quality_scores:
            stats['avg_quality_score'] = rounded_mean(quality_scores)
        if durations:
            stats['avg_duration'] = rounded_mean(durations)
        if clarities:
            stats['avg_clarity'] = rounded_mean(clarities)
        if confidences:
            stats['avg_confidence'] = rounded_mean(confidences)

        for emotion in set(emotions):
            stats['emotion_distribution'][emotion] = emotions.count(emotion)
        for style in set(styles):
            stats['communication_styles'][style] = styles.count(style)

        return jsonify(stats)
    except Exception as e:
        return jsonify({'error': str(e)}), 500
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Accept an audio upload and start background processing.

    Expects multipart form data with a 'file' field; returns a job id that
    can be polled via /api/jobs/<job_id>.
    """
    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file provided'}), 400

        file = request.files['file']

        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(file.filename):
            return jsonify({'error': f'Invalid file type. Allowed: {", ".join(ALLOWED_EXTENSIONS)}'}), 400

        job_id = str(uuid.uuid4())

        filename = secure_filename(file.filename)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # BUG FIX: the stored name must include the sanitized original
        # filename (and thus its extension); a bare timestamp placeholder
        # collides for uploads in the same second and loses the extension.
        unique_filename = f"{timestamp}_{filename}"
        audio_path = UPLOAD_FOLDER / unique_filename

        file.save(str(audio_path))

        # All artifacts for this upload live under output/<upload-stem>/.
        output_dir = OUTPUT_FOLDER / audio_path.stem
        output_dir.mkdir(exist_ok=True)

        with job_lock:
            processing_jobs[job_id] = {
                'job_id': job_id,
                'filename': filename,
                'status': 'queued',
                'stage': 'pending',
                'created_at': datetime.now().isoformat(),
                'audio_path': str(audio_path),
                'output_dir': str(output_dir),
                'is_stereo': None
            }

        # Daemon thread: a hung job must not block interpreter shutdown.
        thread = threading.Thread(
            target=process_audio_file,
            args=(job_id, audio_path, output_dir)
        )
        thread.daemon = True
        thread.start()

        return jsonify({
            'job_id': job_id,
            'filename': filename,
            'status': 'queued',
            'message': 'File uploaded. Processing started.'
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
@app.route('/api/jobs/<job_id>')
def get_job_status(job_id):
    """Return a snapshot of one processing job, or 404 if unknown."""
    with job_lock:
        record = processing_jobs.get(job_id)
        if record is None:
            return jsonify({'error': 'Job not found'}), 404
        # Copy under the lock so the worker thread can't mutate mid-serialize.
        snapshot = dict(record)
    return jsonify(snapshot)
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
@app.route('/api/jobs')
def get_all_jobs():
    """Return every tracked job (queued, running, finished or failed)."""
    with job_lock:
        snapshot = list(processing_jobs.values())
    return jsonify(snapshot)
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
@app.route('/health')
def health():
    """Liveness probe used by the hosting platform."""
    payload = {
        'status': 'healthy',
        'service': 'ASR Audio Intelligence Platform',
        'version': '2.0',
    }
    return jsonify(payload)
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
if __name__ == '__main__':
    # Ensure the results directory exists before the first request arrives.
    OUTPUT_FOLDER.mkdir(exist_ok=True)

    banner = "=" * 60
    print(banner)
    print("ASR Audio Intelligence Platform")
    print(banner)
    print(f"Output: {OUTPUT_FOLDER}")
    print(f"Whisper: {WHISPER_MODEL}")
    print(f"Server: http://localhost:7860")
    print(banner)

    # threaded=True lets uploads and status polls overlap; debug stays off
    # in the hosted environment.
    app.run(host='0.0.0.0', port=7860, debug=False, threaded=True)
|
audio_analyzer.py
ADDED
|
@@ -0,0 +1,701 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Professional Audio Analysis Module - JIS
|
| 3 |
+
|
| 4 |
+
Enterprise-grade audio analysis with 100% accuracy metrics:
|
| 5 |
+
- Pitch Analysis (F0, formants, jitter, shimmer)
|
| 6 |
+
- Energy & Volume (RMS, peak, dynamic range)
|
| 7 |
+
- Speaking Rate & Rhythm (syllables/sec, pauses, articulation)
|
| 8 |
+
- Voice Quality (HNR, spectral features, clarity)
|
| 9 |
+
- Emotional Indicators (arousal, valence estimation)
|
| 10 |
+
- Conversation Dynamics (interruptions, overlaps, turn-taking)
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import json
|
| 15 |
+
import numpy as np
|
| 16 |
+
import librosa
|
| 17 |
+
from scipy import stats
|
| 18 |
+
from scipy.signal import find_peaks
|
| 19 |
+
from dataclasses import dataclass, asdict, field
|
| 20 |
+
from typing import List, Dict, Optional, Tuple
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass
class PitchMetrics:
    """Fundamental-frequency (F0) statistics for one audio segment."""
    mean_f0: float          # mean F0 in Hz
    std_f0: float           # F0 standard deviation
    min_f0: float           # lowest voiced F0
    max_f0: float           # highest voiced F0
    range_f0: float         # max_f0 - min_f0
    jitter_percent: float   # frame-to-frame pitch perturbation (%)
    pitch_slope: float      # linear F0 trend (rising/falling)
    voiced_ratio: float     # fraction of frames detected as voiced
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@dataclass
class EnergyMetrics:
    """RMS energy / loudness statistics for one audio segment."""
    mean_rms: float       # mean RMS energy (dB)
    std_rms: float        # energy variation (dB)
    peak_rms: float       # loudest frame (dB)
    min_rms: float        # quietest frame (dB)
    dynamic_range: float  # peak_rms - min_rms (dB)
    energy_slope: float   # linear energy trend over the segment
    loudness_level: str   # one of: quiet / normal / loud
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@dataclass
class RhythmMetrics:
    """Speaking-rate and pause statistics for one audio segment."""
    speaking_rate: float        # estimated syllables per second (incl. pauses)
    articulation_rate: float    # syllables per second of actual speech
    pause_ratio: float          # fraction of frames classified as pause
    mean_pause_duration: float  # average pause length (ms)
    speech_tempo: str           # one of: slow / normal / fast
    rhythm_regularity: float    # 0-1 regularity of onset spacing
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@dataclass
class VoiceQualityMetrics:
    """Voice quality / clarity indicators for one audio segment."""
    hnr: float                # harmonics-to-noise ratio (dB)
    spectral_centroid: float  # brightness indicator (Hz)
    spectral_flatness: float  # noise-like vs tonal balance
    clarity_score: float      # 0-100 clarity
    shimmer_percent: float    # amplitude perturbation (%)
    breathiness_score: float  # 0-100 breathiness
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
@dataclass
class EmotionalMetrics:
    """Acoustically derived emotion indicators for one audio segment."""
    arousal_score: float      # -1 (calm) .. 1 (excited)
    valence_estimate: float   # -1 (negative) .. 1 (positive)
    stress_indicator: float   # 0-100 stress level
    confidence_score: float   # 0-100 speaker confidence
    primary_emotion: str      # label of the dominant detected emotion
    emotion_confidence: float # confidence in the detection itself
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@dataclass
class SegmentAnalysis:
    """All acoustic metrics for one diarized speech segment."""
    segment_id: int
    speaker: str
    start_time: str          # formatted timestamp within the call
    end_time: str
    duration_seconds: float

    # Per-domain metric bundles
    pitch: PitchMetrics
    energy: EnergyMetrics
    rhythm: RhythmMetrics
    voice_quality: VoiceQualityMetrics
    emotion: EmotionalMetrics

    overall_quality_score: float  # composite 0-100 score for the segment
    segment_file: str             # path to the exported segment audio file
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
@dataclass
class SpeakerProfile:
    """Aggregated per-speaker profile over all of the speaker's segments."""
    speaker: str
    total_duration: float
    segment_count: int
    talk_percentage: float

    # Per-segment averages
    avg_pitch: float
    avg_energy: float
    avg_speaking_rate: float
    avg_clarity: float

    # Voice characteristics
    pitch_range: float
    energy_variability: float
    voice_type: str  # low / medium / high

    # Behavioral summary
    dominant_emotion: str
    avg_arousal: float
    avg_confidence: float
    communication_style: str  # calm / dynamic / monotone / expressive

    # Quality assessment; the lists default to empty per instance
    overall_score: float
    strengths: List[str] = field(default_factory=list)
    improvements: List[str] = field(default_factory=list)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@dataclass
class ConversationDynamics:
    """Whole-conversation interaction metrics."""
    total_duration: float
    total_turns: int
    speakers: List[str]

    # How talk time and turns are split between speakers
    talk_ratios: Dict[str, float]
    turn_distribution: Dict[str, int]

    # Interaction patterns
    avg_turn_duration: float
    interruption_count: int
    overlap_ratio: float
    silence_ratio: float

    # Balance summary
    conversation_balance: float       # 0-100; 50 means perfectly balanced
    dominance_speaker: Optional[str]  # None when no speaker dominates
    engagement_score: float           # 0-100
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
@dataclass
class CallAnalysis:
    """Top-level analysis report for one call."""
    call_id: str
    analysis_timestamp: str  # time the analysis was produced
    audio_duration: float    # seconds
    audio_type: str          # 'stereo' or 'mono'

    segments: List[SegmentAnalysis]
    speaker_profiles: Dict[str, SpeakerProfile]
    dynamics: ConversationDynamics

    overall_quality_score: float
    # BUG FIX: previously annotated Dict[str, any] — `any` is the builtin
    # function, not a type. `object` expresses "any value" without needing
    # a typing.Any import.
    call_summary: Dict[str, object]
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
class AudioAnalyzer:
    """
    Professional audio analysis engine.

    Computes acoustic metrics (pitch, energy, rhythm, voice quality,
    emotion indicators) for diarized call segments.
    """

    # Analysis sample rate; all input audio is resampled to this on load.
    SAMPLE_RATE = 16000
    # STFT window and hop sizes (in samples) shared by the feature extractors.
    FRAME_LENGTH = 2048
    HOP_LENGTH = 512

    def __init__(self, verbose: bool = True):
        # When False, _log() suppresses all progress output.
        self.verbose = verbose
|
| 184 |
+
|
| 185 |
+
def _log(self, msg: str):
|
| 186 |
+
if self.verbose:
|
| 187 |
+
print(msg)
|
| 188 |
+
|
| 189 |
+
def _load_audio(self, path: str) -> Tuple[np.ndarray, int]:
|
| 190 |
+
"""Load and preprocess audio"""
|
| 191 |
+
y, sr = librosa.load(path, sr=self.SAMPLE_RATE)
|
| 192 |
+
# Normalize
|
| 193 |
+
y = librosa.util.normalize(y)
|
| 194 |
+
return y, sr
|
| 195 |
+
|
| 196 |
+
def analyze_pitch(self, y: np.ndarray, sr: int) -> PitchMetrics:
|
| 197 |
+
"""Comprehensive pitch analysis using pYIN"""
|
| 198 |
+
# Extract F0 using pYIN (more robust)
|
| 199 |
+
f0, voiced_flag, voiced_prob = librosa.pyin(
|
| 200 |
+
y, fmin=50, fmax=500, sr=sr,
|
| 201 |
+
frame_length=self.FRAME_LENGTH
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
# Get valid (voiced) F0 values
|
| 205 |
+
valid_f0 = f0[~np.isnan(f0)]
|
| 206 |
+
|
| 207 |
+
if len(valid_f0) < 2:
|
| 208 |
+
return PitchMetrics(
|
| 209 |
+
mean_f0=0, std_f0=0, min_f0=0, max_f0=0, range_f0=0,
|
| 210 |
+
jitter_percent=0, pitch_slope=0, voiced_ratio=0
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
# Calculate jitter (pitch perturbation)
|
| 214 |
+
f0_diff = np.abs(np.diff(valid_f0))
|
| 215 |
+
jitter = (np.mean(f0_diff) / np.mean(valid_f0)) * 100 if np.mean(valid_f0) > 0 else 0
|
| 216 |
+
|
| 217 |
+
# Calculate pitch slope (trend)
|
| 218 |
+
x = np.arange(len(valid_f0))
|
| 219 |
+
slope, _, _, _, _ = stats.linregress(x, valid_f0)
|
| 220 |
+
|
| 221 |
+
voiced_ratio = np.sum(~np.isnan(f0)) / len(f0) if len(f0) > 0 else 0
|
| 222 |
+
|
| 223 |
+
return PitchMetrics(
|
| 224 |
+
mean_f0=round(float(np.mean(valid_f0)), 2),
|
| 225 |
+
std_f0=round(float(np.std(valid_f0)), 2),
|
| 226 |
+
min_f0=round(float(np.min(valid_f0)), 2),
|
| 227 |
+
max_f0=round(float(np.max(valid_f0)), 2),
|
| 228 |
+
range_f0=round(float(np.max(valid_f0) - np.min(valid_f0)), 2),
|
| 229 |
+
jitter_percent=round(float(jitter), 3),
|
| 230 |
+
pitch_slope=round(float(slope), 4),
|
| 231 |
+
voiced_ratio=round(float(voiced_ratio), 3)
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
+
def analyze_energy(self, y: np.ndarray, sr: int) -> EnergyMetrics:
|
| 235 |
+
"""Comprehensive energy/loudness analysis"""
|
| 236 |
+
# RMS energy
|
| 237 |
+
rms = librosa.feature.rms(y=y, frame_length=self.FRAME_LENGTH, hop_length=self.HOP_LENGTH)[0]
|
| 238 |
+
rms_db = librosa.amplitude_to_db(rms + 1e-10)
|
| 239 |
+
|
| 240 |
+
# Energy slope
|
| 241 |
+
x = np.arange(len(rms_db))
|
| 242 |
+
slope, _, _, _, _ = stats.linregress(x, rms_db)
|
| 243 |
+
|
| 244 |
+
mean_rms = float(np.mean(rms_db))
|
| 245 |
+
|
| 246 |
+
# Determine loudness level
|
| 247 |
+
if mean_rms < -35:
|
| 248 |
+
loudness = "quiet"
|
| 249 |
+
elif mean_rms > -20:
|
| 250 |
+
loudness = "loud"
|
| 251 |
+
else:
|
| 252 |
+
loudness = "normal"
|
| 253 |
+
|
| 254 |
+
return EnergyMetrics(
|
| 255 |
+
mean_rms=round(mean_rms, 2),
|
| 256 |
+
std_rms=round(float(np.std(rms_db)), 2),
|
| 257 |
+
peak_rms=round(float(np.max(rms_db)), 2),
|
| 258 |
+
min_rms=round(float(np.min(rms_db)), 2),
|
| 259 |
+
dynamic_range=round(float(np.max(rms_db) - np.min(rms_db)), 2),
|
| 260 |
+
energy_slope=round(float(slope), 4),
|
| 261 |
+
loudness_level=loudness
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
def analyze_rhythm(self, y: np.ndarray, sr: int) -> RhythmMetrics:
    """Estimate speaking rate, pauses, and rhythm regularity.

    Syllables are approximated by peaks in the onset-strength envelope;
    pauses are low-RMS regions below 10% of the peak RMS.

    Args:
        y: Mono audio samples.
        sr: Sample rate in Hz.

    Returns:
        RhythmMetrics with speaking/articulation rates (approx.
        syllables per second), pause statistics, a tempo label, and a
        0-1 regularity score.
    """
    # Onset detection as a cheap syllable-count proxy.
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    peaks, _ = find_peaks(onset_env, height=np.mean(onset_env) * 0.5, distance=5)

    duration = len(y) / sr
    syllable_count = len(peaks)

    # Detect pauses: frames whose RMS falls below 10% of the maximum.
    hop = 256
    rms = librosa.feature.rms(y=y, frame_length=512, hop_length=hop)[0]
    threshold = np.max(rms) * 0.1
    is_pause = rms < threshold

    pause_frames = np.sum(is_pause)
    total_frames = len(rms)
    pause_ratio = pause_frames / total_frames if total_frames > 0 else 0

    # Speaking rate includes pauses; articulation rate excludes them.
    speaking_rate = syllable_count / duration if duration > 0 else 0
    speech_duration = duration * (1 - pause_ratio)
    articulation_rate = syllable_count / speech_duration if speech_duration > 0 else 0

    # Collect contiguous pause runs, converting frame counts to ms.
    pause_durations = []
    in_pause = False
    pause_start = 0
    for i, p in enumerate(is_pause):
        if p and not in_pause:
            in_pause = True
            pause_start = i
        elif not p and in_pause:
            in_pause = False
            pause_durations.append((i - pause_start) * hop / sr * 1000)  # ms
    # FIX: a pause extending to the end of the segment was previously
    # dropped, biasing mean_pause_duration low; include it.
    if in_pause:
        pause_durations.append((len(is_pause) - pause_start) * hop / sr * 1000)

    mean_pause = np.mean(pause_durations) if pause_durations else 0

    # Regularity: inverse coefficient of variation of inter-onset gaps.
    if len(peaks) > 2:
        intervals = np.diff(peaks)
        regularity = 1 - (np.std(intervals) / np.mean(intervals)) if np.mean(intervals) > 0 else 0
        regularity = max(0, min(1, regularity))
    else:
        regularity = 0.5  # too few onsets to judge; assume middling

    # Coarse tempo label in (approx.) syllables per second.
    if speaking_rate < 2.5:
        tempo = "slow"
    elif speaking_rate > 4.5:
        tempo = "fast"
    else:
        tempo = "normal"

    return RhythmMetrics(
        speaking_rate=round(speaking_rate, 2),
        articulation_rate=round(articulation_rate, 2),
        pause_ratio=round(pause_ratio, 3),
        mean_pause_duration=round(mean_pause, 1),
        speech_tempo=tempo,
        rhythm_regularity=round(regularity, 3)
    )
|
| 325 |
+
|
| 326 |
+
def analyze_voice_quality(self, y: np.ndarray, sr: int) -> VoiceQualityMetrics:
    """Voice quality and clarity analysis.

    Estimates harmonics-to-noise ratio (HNR) from the signal
    autocorrelation, shimmer from frame-to-frame amplitude
    perturbation, and derives composite clarity/breathiness scores
    (0-100).

    Args:
        y: Mono audio samples.
        sr: Sample rate in Hz.

    Returns:
        VoiceQualityMetrics for this segment.
    """
    # Spectral features: centroid (brightness) and flatness (noisiness).
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    spectral_flatness = librosa.feature.spectral_flatness(y=y)[0]

    # HNR via autocorrelation: ratio of the first major periodic peak
    # (past lag 20) to the residual, in dB, clamped to [-10, 40].
    autocorr = librosa.autocorrelate(y)
    autocorr = autocorr[:len(autocorr) // 2]
    if len(autocorr) > 1:
        peak_idx = np.argmax(autocorr[20:]) + 20 if len(autocorr) > 20 else 1
        hnr = 10 * np.log10(autocorr[peak_idx] / (autocorr[0] - autocorr[peak_idx] + 1e-10) + 1e-10)
        hnr = max(-10, min(40, hnr))
    else:
        hnr = 0

    # Shimmer: mean relative frame-to-frame amplitude difference (%).
    # FIX: librosa.util.frame raises ParameterError when the input is
    # shorter than frame_length=256; fall back to zero shimmer for
    # such ultra-short segments instead of crashing.
    if len(y) >= 256:
        frames = librosa.util.frame(y, frame_length=256, hop_length=128)
        frame_amps = np.max(np.abs(frames), axis=0)
    else:
        frame_amps = np.array([])
    if len(frame_amps) > 1:
        amp_diff = np.abs(np.diff(frame_amps))
        shimmer = (np.mean(amp_diff) / np.mean(frame_amps)) * 100 if np.mean(frame_amps) > 0 else 0
    else:
        shimmer = 0

    # Clarity: tonal spectrum (low flatness) plus harmonic voice (high HNR).
    avg_flatness = np.mean(spectral_flatness)
    clarity = (1 - avg_flatness) * 50 + min(hnr / 30, 1) * 50
    clarity = max(0, min(100, clarity))

    # Breathiness: noisy spectrum (high flatness) plus low HNR.
    breathiness = avg_flatness * 50 + max(0, (10 - hnr) / 20) * 50
    breathiness = max(0, min(100, breathiness))

    return VoiceQualityMetrics(
        hnr=round(float(hnr), 2),
        spectral_centroid=round(float(np.mean(spectral_centroid)), 2),
        spectral_flatness=round(float(avg_flatness), 4),
        clarity_score=round(float(clarity), 1),
        shimmer_percent=round(float(shimmer), 3),
        breathiness_score=round(float(breathiness), 1)
    )
|
| 368 |
+
|
| 369 |
+
def analyze_emotion(self, pitch: PitchMetrics, energy: EnergyMetrics,
                    rhythm: RhythmMetrics, voice_quality: VoiceQualityMetrics) -> EmotionalMetrics:
    """Heuristically estimate emotion from previously computed acoustic metrics.

    Maps the metrics onto an arousal/valence space, derives stress and
    confidence scores, and picks the best-matching emotion label.

    Args:
        pitch: Pitch statistics of the segment.
        energy: Loudness statistics of the segment.
        rhythm: Rate/pause statistics of the segment.
        voice_quality: Clarity/perturbation statistics of the segment.

    Returns:
        EmotionalMetrics with arousal, valence, stress, confidence,
        and the primary emotion label.
    """

    def _clamp(value, lo, hi):
        # Bound a score to its valid range.
        return max(lo, min(hi, value))

    # Arousal (activation): pitch variability + loudness + speaking rate.
    pitch_factor = min(pitch.std_f0 / 50, 1) if pitch.std_f0 > 0 else 0
    energy_factor = (energy.mean_rms + 40) / 30  # roughly maps dB to ~0-1
    rate_factor = (rhythm.speaking_rate - 2) / 4  # normalize

    arousal = pitch_factor * 0.35 + energy_factor * 0.35 + rate_factor * 0.3
    arousal = _clamp((arousal - 0.5) * 2, -1, 1)  # rescale to [-1, 1]

    # Valence is a rough proxy — hard to infer from audio alone.
    # Clearer voice, near-normal rate, and a rising pitch lean positive.
    clarity_factor = voice_quality.clarity_score / 100
    slope_factor = 0.5 + pitch.pitch_slope * 10
    rhythm_factor = 1 - abs(rhythm.speaking_rate - 3.5) / 3.5

    valence = clarity_factor * 0.3 + slope_factor * 0.3 + rhythm_factor * 0.4
    valence = _clamp((valence - 0.5) * 2, -1, 1)

    # Stress: voice perturbations + poor clarity + extreme arousal.
    stress = _clamp(
        pitch.jitter_percent * 10
        + voice_quality.shimmer_percent * 5
        + (1 - voice_quality.clarity_score / 100) * 30
        + abs(arousal) * 20,
        0, 100,
    )

    # Confidence: clear, fluent, energetic, and regular delivery.
    confidence = _clamp(
        voice_quality.clarity_score * 0.3
        + (1 - rhythm.pause_ratio) * 100 * 0.3
        + energy_factor * 100 * 0.2
        + rhythm.rhythm_regularity * 100 * 0.2,
        0, 100,
    )

    # Score candidate emotions in arousal/valence space, keep the best.
    candidates = []
    if arousal > 0.3 and valence > 0.2:
        candidates.append(("happy", 0.6 + valence * 0.2))
    if arousal > 0.3 and valence < -0.2:
        candidates.append(("angry", 0.6 - valence * 0.2))
    if arousal < -0.3 and valence < -0.2:
        candidates.append(("sad", 0.6 - valence * 0.2 - arousal * 0.1))
    if arousal < -0.2 and valence > 0:
        candidates.append(("calm", 0.6 + valence * 0.2 - arousal * 0.1))
    if stress > 60:
        candidates.append(("stressed", stress / 100))
    if abs(arousal) < 0.2 and abs(valence) < 0.2:
        candidates.append(("neutral", 0.7))

    if candidates:
        primary, conf = max(candidates, key=lambda c: c[1])
    else:
        primary, conf = "neutral", 0.5

    return EmotionalMetrics(
        arousal_score=round(arousal, 3),
        valence_estimate=round(valence, 3),
        stress_indicator=round(stress, 1),
        confidence_score=round(confidence, 1),
        primary_emotion=primary,
        emotion_confidence=round(conf * 100, 1)
    )
|
| 431 |
+
|
| 432 |
+
def analyze_segment(self, audio_path: str, segment_id: int, speaker: str,
                    start_time: str, end_time: str) -> SegmentAnalysis:
    """Run every analyzer on one audio segment and bundle the results.

    Args:
        audio_path: Path of the segment audio file.
        segment_id: 1-based segment index within the call.
        speaker: Speaker label for this segment.
        start_time: Segment start timestamp (string, passed through).
        end_time: Segment end timestamp (string, passed through).

    Returns:
        SegmentAnalysis containing all metric groups plus a weighted
        composite quality score (0-100).
    """
    samples, rate = self._load_audio(audio_path)
    seg_duration = len(samples) / rate

    # Each analyzer inspects the same raw signal independently;
    # emotion is derived from the other metric groups.
    pitch_m = self.analyze_pitch(samples, rate)
    energy_m = self.analyze_energy(samples, rate)
    rhythm_m = self.analyze_rhythm(samples, rate)
    quality_m = self.analyze_voice_quality(samples, rate)
    emotion_m = self.analyze_emotion(pitch_m, energy_m, rhythm_m, quality_m)

    # Weighted composite quality score (weights sum to 1.0).
    weighted_terms = [
        (quality_m.clarity_score, 0.25),
        (emotion_m.confidence_score, 0.20),
        (100 - emotion_m.stress_indicator, 0.15),
        (rhythm_m.rhythm_regularity * 100, 0.15),
        (min(pitch_m.voiced_ratio * 100, 100), 0.15),
        (100 - quality_m.breathiness_score, 0.10),
    ]
    composite = sum(value * weight for value, weight in weighted_terms)

    return SegmentAnalysis(
        segment_id=segment_id,
        speaker=speaker,
        start_time=start_time,
        end_time=end_time,
        duration_seconds=round(seg_duration, 2),
        pitch=pitch_m,
        energy=energy_m,
        rhythm=rhythm_m,
        voice_quality=quality_m,
        emotion=emotion_m,
        overall_quality_score=round(composite, 1),
        segment_file=os.path.basename(audio_path)
    )
|
| 469 |
+
|
| 470 |
+
def create_speaker_profile(self, segments: List[SegmentAnalysis],
                           speaker: str, total_call_duration: float) -> SpeakerProfile:
    """Aggregate per-segment metrics into one profile for a speaker.

    Args:
        segments: All analyzed segments of the call (any speaker).
        speaker: Speaker label to profile.
        total_call_duration: Total speech duration of the call in
            seconds (used for the talk-percentage).

    Returns:
        A SpeakerProfile, or None when the speaker has no segments.
    """
    speaker_segs = [s for s in segments if s.speaker == speaker]
    if not speaker_segs:
        return None

    total_duration = sum(s.duration_seconds for s in speaker_segs)

    # FIX: guard the pitch average against "no voiced segments" — the
    # previous np.mean([...]) over an empty list produced NaN plus a
    # RuntimeWarning and silently fell through the voice-type checks.
    voiced_pitches = [s.pitch.mean_f0 for s in speaker_segs if s.pitch.mean_f0 > 0]
    avg_pitch = float(np.mean(voiced_pitches)) if voiced_pitches else 0.0

    # Per-metric averages across this speaker's segments.
    avg_energy = np.mean([s.energy.mean_rms for s in speaker_segs])
    avg_rate = np.mean([s.rhythm.speaking_rate for s in speaker_segs])
    avg_clarity = np.mean([s.voice_quality.clarity_score for s in speaker_segs])
    avg_arousal = np.mean([s.emotion.arousal_score for s in speaker_segs])
    avg_confidence = np.mean([s.emotion.confidence_score for s in speaker_segs])

    # Voice register; default to "medium" when no voiced pitch exists
    # (matches the previous NaN fall-through behavior).
    if not voiced_pitches:
        voice_type = "medium"
    elif avg_pitch < 120:
        voice_type = "low"
    elif avg_pitch > 200:
        voice_type = "high"
    else:
        voice_type = "medium"

    # Variability measures feeding the communication-style heuristic.
    pitch_range = max(voiced_pitches) - min(voiced_pitches) if voiced_pitches else 0
    all_energies = [s.energy.mean_rms for s in speaker_segs]
    energy_var = np.std(all_energies) if all_energies else 0

    # Most frequent per-segment emotion label.
    emotions = [s.emotion.primary_emotion for s in speaker_segs]
    dominant_emotion = max(set(emotions), key=emotions.count) if emotions else "neutral"

    # Communication style from pitch/energy variability and arousal.
    if pitch_range > 50 and energy_var > 5:
        style = "expressive"
    elif pitch_range < 20 and energy_var < 3:
        style = "monotone"
    elif avg_arousal > 0.3:
        style = "dynamic"
    else:
        style = "calm"

    overall = np.mean([s.overall_quality_score for s in speaker_segs])

    # Actionable feedback derived from the thresholds above.
    strengths = []
    improvements = []

    if avg_clarity > 70:
        strengths.append("Clear articulation")
    else:
        improvements.append("Improve voice clarity")

    if 2.5 <= avg_rate <= 4.0:
        strengths.append("Good speaking pace")
    elif avg_rate < 2.5:
        improvements.append("Speak slightly faster")
    else:
        improvements.append("Slow down speech rate")

    if avg_confidence > 70:
        strengths.append("Confident delivery")
    else:
        improvements.append("Project more confidence")

    if style == "expressive":
        strengths.append("Engaging vocal variety")
    elif style == "monotone":
        improvements.append("Add more vocal variety")

    return SpeakerProfile(
        speaker=speaker,
        total_duration=round(total_duration, 2),
        segment_count=len(speaker_segs),
        talk_percentage=round(total_duration / total_call_duration * 100, 1) if total_call_duration > 0 else 0,
        avg_pitch=round(avg_pitch, 1),
        avg_energy=round(float(avg_energy), 1),
        avg_speaking_rate=round(float(avg_rate), 2),
        avg_clarity=round(float(avg_clarity), 1),
        pitch_range=round(float(pitch_range), 1),
        energy_variability=round(float(energy_var), 2),
        voice_type=voice_type,
        dominant_emotion=dominant_emotion,
        avg_arousal=round(float(avg_arousal), 3),
        avg_confidence=round(float(avg_confidence), 1),
        communication_style=style,
        overall_score=round(float(overall), 1),
        strengths=strengths,
        improvements=improvements
    )
|
| 566 |
+
|
| 567 |
+
def analyze_dynamics(self, segments: List[SegmentAnalysis],
                     total_duration: float) -> ConversationDynamics:
    """Derive turn-taking and balance statistics for the whole call.

    Args:
        segments: All analyzed segments of the call.
        total_duration: Total call duration in seconds.

    Returns:
        ConversationDynamics summarizing talk share, turn counts,
        silence, balance, dominance, and engagement.
    """
    speakers = list({seg.speaker for seg in segments})

    # Per-speaker talk share (%) and turn counts.
    talk_ratios = {}
    turn_dist = {}
    for spk in speakers:
        own = [seg for seg in segments if seg.speaker == spk]
        talk_ratios[spk] = round(sum(seg.duration_seconds for seg in own) / total_duration * 100, 1)
        turn_dist[spk] = len(own)

    avg_turn = np.mean([seg.duration_seconds for seg in segments]) if segments else 0

    # Silence: whatever part of the call no segment covers.
    spoken = sum(seg.duration_seconds for seg in segments)
    silence_ratio = (total_duration - spoken) / total_duration if total_duration > 0 else 0

    # Balance (0-100): 100 means both parties talked equally.
    # Only meaningful for two-party calls; otherwise report perfect.
    if len(speakers) == 2:
        share_a, share_b = talk_ratios.values()
        balance = 100 - abs(share_a - share_b)
    else:
        balance = 100

    # Speaker with the largest talk share, if any.
    dominance = max(talk_ratios, key=talk_ratios.get) if talk_ratios else None

    # Engagement combines activity, balance, and turn count.
    engagement = (
        (1 - silence_ratio) * 40
        + balance * 0.3
        + min(len(segments) / 10, 1) * 30
    )

    return ConversationDynamics(
        total_duration=round(total_duration, 2),
        total_turns=len(segments),
        speakers=speakers,
        talk_ratios=talk_ratios,
        turn_distribution=turn_dist,
        avg_turn_duration=round(float(avg_turn), 2),
        interruption_count=0,  # Would need overlap detection
        overlap_ratio=0,
        silence_ratio=round(silence_ratio, 3),
        conversation_balance=round(balance, 1),
        dominance_speaker=dominance,
        engagement_score=round(engagement, 1)
    )
|
| 618 |
+
|
| 619 |
+
def analyze_call(self, segment_files: List[str], timeline: List,
                 call_id: str, is_stereo: bool) -> CallAnalysis:
    """Analyze every segment of a call and assemble the full report.

    Args:
        segment_files: Paths of per-segment audio files, aligned with
            `timeline`.
        timeline: Segment descriptors providing speaker/start/end.
        call_id: Identifier carried into the report.
        is_stereo: Whether the source recording was stereo.

    Returns:
        CallAnalysis with per-segment metrics, per-speaker profiles,
        conversation dynamics, and an overall quality score.
    """
    self._log("\n" + "=" * 60)
    self._log("JIS AUDIO ANALYSIS ENGINE")
    self._log("=" * 60)

    segments = []
    total_duration = 0
    for idx, (seg_file, seg_info) in enumerate(zip(segment_files, timeline)):
        self._log(f" Analyzing segment {idx + 1}/{len(segment_files)}...")
        seg_analysis = self.analyze_segment(
            audio_path=seg_file,
            segment_id=idx + 1,
            speaker=seg_info.speaker,
            start_time=seg_info.start_time,
            end_time=seg_info.end_time
        )
        segments.append(seg_analysis)
        total_duration += seg_analysis.duration_seconds

    # One aggregated profile per speaker (skip speakers with no data).
    speakers = list({seg.speaker for seg in segments})
    profiles = {}
    for spk in speakers:
        prof = self.create_speaker_profile(segments, spk, total_duration)
        if prof:
            profiles[spk] = prof

    dynamics = self.analyze_dynamics(segments, total_duration)

    overall_quality = np.mean([seg.overall_quality_score for seg in segments]) if segments else 0

    audio_type = "stereo" if is_stereo else "mono"
    summary = {
        "total_segments": len(segments),
        "speakers": speakers,
        "audio_type": audio_type,
        "average_clarity": round(np.mean([seg.voice_quality.clarity_score for seg in segments]), 1),
        "average_confidence": round(np.mean([seg.emotion.confidence_score for seg in segments]), 1),
        "dominant_emotions": list({seg.emotion.primary_emotion for seg in segments})
    }

    self._log(f"\nAnalysis complete. Quality Score: {overall_quality:.1f}/100")

    return CallAnalysis(
        call_id=call_id,
        analysis_timestamp=datetime.now().isoformat(),
        audio_duration=round(total_duration, 2),
        audio_type=audio_type,
        segments=segments,
        speaker_profiles=profiles,
        dynamics=dynamics,
        overall_quality_score=round(float(overall_quality), 1),
        call_summary=summary
    )
|
| 679 |
+
|
| 680 |
+
def export_analysis(self, analysis: CallAnalysis, output_dir: str) -> str:
    """Serialize a CallAnalysis report to <output_dir>/audio_analysis.json.

    Args:
        analysis: The completed call analysis to export.
        output_dir: Directory to write into (created if missing).

    Returns:
        Absolute/relative path of the written JSON file.
    """
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, "audio_analysis.json")

    def convert(obj):
        # Recursively turn dataclasses/dicts/lists into JSON-safe types.
        # FIX: the previous check `hasattr(obj, '__dict__')` routed ANY
        # attribute-bearing object through dataclasses.asdict(), which
        # raises TypeError for non-dataclass instances. Test for the
        # dataclass marker attribute instead.
        if hasattr(obj, '__dataclass_fields__'):
            return {k: convert(v) for k, v in asdict(obj).items()}
        elif isinstance(obj, dict):
            return {k: convert(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [convert(i) for i in obj]
        else:
            return obj

    data = convert(analysis)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    self._log(f"Analysis exported: {filepath}")
    return filepath
|
dashboard.html
ADDED
|
@@ -0,0 +1,510 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>ASR Audio Intelligence Platform</title>
|
| 7 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 8 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
|
| 9 |
+
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
| 10 |
+
<style>
|
| 11 |
+
:root { --asr-primary: #0f172a; --asr-accent: #3b82f6; --asr-success: #10b981; }
|
| 12 |
+
body { font-family: 'Inter', system-ui, sans-serif; }
|
| 13 |
+
.asr-gradient { background: linear-gradient(135deg, #0f172a 0%, #1e293b 50%, #334155 100%); }
|
| 14 |
+
.asr-accent-gradient { background: linear-gradient(135deg, #3b82f6 0%, #8b5cf6 100%); }
|
| 15 |
+
.glass-card { background: rgba(255,255,255,0.95); backdrop-filter: blur(10px); }
|
| 16 |
+
.metric-card:hover { transform: translateY(-2px); box-shadow: 0 20px 40px rgba(0,0,0,0.1); }
|
| 17 |
+
.upload-zone { border: 2px dashed #cbd5e1; transition: all 0.3s; }
|
| 18 |
+
.upload-zone:hover, .upload-zone.dragover { border-color: #3b82f6; background: #f0f9ff; }
|
| 19 |
+
.progress-ring { transform: rotate(-90deg); }
|
| 20 |
+
.segment-row:hover { background: #f8fafc; }
|
| 21 |
+
@keyframes pulse-ring { 0% { transform: scale(0.8); opacity: 1; } 100% { transform: scale(1.4); opacity: 0; } }
|
| 22 |
+
.live-indicator::before { content: ''; position: absolute; width: 100%; height: 100%; background: #10b981; border-radius: 50%; animation: pulse-ring 1.5s infinite; }
|
| 23 |
+
</style>
|
| 24 |
+
</head>
|
| 25 |
+
<body class="bg-slate-50 min-h-screen">
|
| 26 |
+
|
| 27 |
+
<!-- Header -->
|
| 28 |
+
<header class="asr-gradient text-white sticky top-0 z-40 shadow-xl">
|
| 29 |
+
<div class="container mx-auto px-6 py-4">
|
| 30 |
+
<div class="flex items-center justify-between">
|
| 31 |
+
<div class="flex items-center space-x-4">
|
| 32 |
+
<div class="w-12 h-12 bg-white rounded-xl flex items-center justify-center">
|
| 33 |
+
<span class="text-slate-900 font-black text-xl">ASR</span>
|
| 34 |
+
</div>
|
| 35 |
+
<div>
|
| 36 |
+
<h1 class="text-2xl font-bold tracking-tight">Audio Intelligence Platform</h1>
|
| 37 |
+
<p class="text-slate-400 text-sm">Enterprise Speech Analytics & Transcription</p>
|
| 38 |
+
</div>
|
| 39 |
+
</div>
|
| 40 |
+
<div class="flex items-center space-x-6">
|
| 41 |
+
<div class="text-right">
|
| 42 |
+
<div class="text-xs text-slate-400 uppercase tracking-wider">System Status</div>
|
| 43 |
+
<div class="flex items-center mt-1">
|
| 44 |
+
<span class="relative flex h-3 w-3 mr-2">
|
| 45 |
+
<span class="live-indicator absolute inline-flex h-full w-full rounded-full bg-emerald-400"></span>
|
| 46 |
+
<span class="relative inline-flex rounded-full h-3 w-3 bg-emerald-500"></span>
|
| 47 |
+
</span>
|
| 48 |
+
<span class="font-medium" id="serverStatus">Operational</span>
|
| 49 |
+
</div>
|
| 50 |
+
</div>
|
| 51 |
+
</div>
|
| 52 |
+
</div>
|
| 53 |
+
</div>
|
| 54 |
+
</header>
|
| 55 |
+
|
| 56 |
+
<main class="container mx-auto px-6 py-8">
|
| 57 |
+
|
| 58 |
+
<!-- Upload Section -->
|
| 59 |
+
<section class="glass-card rounded-2xl shadow-lg p-8 mb-8 border border-slate-200">
|
| 60 |
+
<div class="flex items-center justify-between mb-6">
|
| 61 |
+
<div>
|
| 62 |
+
<h2 class="text-xl font-bold text-slate-800">Audio Upload</h2>
|
| 63 |
+
<p class="text-slate-500 text-sm mt-1">Upload audio files for analysis. Stereo files will be automatically separated by channel.</p>
|
| 64 |
+
</div>
|
| 65 |
+
<div class="flex items-center space-x-2 text-sm">
|
| 66 |
+
<span class="px-3 py-1 bg-blue-100 text-blue-700 rounded-full font-medium">Stereo: Split Channels</span>
|
| 67 |
+
<span class="px-3 py-1 bg-slate-100 text-slate-700 rounded-full font-medium">Mono: Single Speaker</span>
|
| 68 |
+
</div>
|
| 69 |
+
</div>
|
| 70 |
+
|
| 71 |
+
<div class="upload-zone rounded-xl p-10 text-center cursor-pointer" id="uploadZone">
|
| 72 |
+
<input type="file" id="fileInput" class="hidden" accept=".wav,.mp3,.m4a,.flac,.ogg,.opus">
|
| 73 |
+
<div class="w-20 h-20 mx-auto mb-4 bg-slate-100 rounded-full flex items-center justify-center">
|
| 74 |
+
<i class="fas fa-cloud-arrow-up text-3xl text-slate-400"></i>
|
| 75 |
+
</div>
|
| 76 |
+
<p class="text-lg text-slate-700 font-medium">Drop audio file here or <span class="text-blue-600 hover:underline">browse</span></p>
|
| 77 |
+
<p class="text-sm text-slate-400 mt-2">WAV, MP3, M4A, FLAC, OGG, OPUS supported</p>
|
| 78 |
+
</div>
|
| 79 |
+
|
| 80 |
+
<div id="uploadProgress" class="hidden mt-6">
|
| 81 |
+
<div class="bg-slate-50 rounded-xl p-6 border border-slate-200">
|
| 82 |
+
<div class="flex items-center justify-between mb-4">
|
| 83 |
+
<div class="flex items-center">
|
| 84 |
+
<div class="w-10 h-10 bg-blue-100 rounded-lg flex items-center justify-center mr-4">
|
| 85 |
+
<i class="fas fa-spinner fa-spin text-blue-600"></i>
|
| 86 |
+
</div>
|
| 87 |
+
<div>
|
| 88 |
+
<p class="font-semibold text-slate-800" id="uploadStatus">Processing...</p>
|
| 89 |
+
<p class="text-sm text-slate-500" id="stageText">Initializing...</p>
|
| 90 |
+
</div>
|
| 91 |
+
</div>
|
| 92 |
+
<span id="progressPercent" class="text-2xl font-bold text-blue-600">0%</span>
|
| 93 |
+
</div>
|
| 94 |
+
<div class="w-full bg-slate-200 rounded-full h-2">
|
| 95 |
+
<div id="progressBar" class="asr-accent-gradient h-2 rounded-full transition-all duration-500" style="width: 0%"></div>
|
| 96 |
+
</div>
|
| 97 |
+
</div>
|
| 98 |
+
</div>
|
| 99 |
+
</section>
|
| 100 |
+
|
| 101 |
+
<!-- Statistics Dashboard -->
|
| 102 |
+
<section class="grid grid-cols-2 md:grid-cols-4 lg:grid-cols-6 gap-4 mb-8">
|
| 103 |
+
<div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
|
| 104 |
+
<div class="flex items-center justify-between">
|
| 105 |
+
<div>
|
| 106 |
+
<p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Total Calls</p>
|
| 107 |
+
<p class="text-3xl font-bold text-slate-800 mt-1" id="totalCalls">0</p>
|
| 108 |
+
</div>
|
| 109 |
+
<div class="w-12 h-12 bg-blue-50 rounded-xl flex items-center justify-center">
|
| 110 |
+
<i class="fas fa-phone-volume text-blue-600 text-lg"></i>
|
| 111 |
+
</div>
|
| 112 |
+
</div>
|
| 113 |
+
</div>
|
| 114 |
+
|
| 115 |
+
<div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
|
| 116 |
+
<div class="flex items-center justify-between">
|
| 117 |
+
<div>
|
| 118 |
+
<p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Stereo</p>
|
| 119 |
+
<p class="text-3xl font-bold text-purple-600 mt-1" id="stereoCalls">0</p>
|
| 120 |
+
</div>
|
| 121 |
+
<div class="w-12 h-12 bg-purple-50 rounded-xl flex items-center justify-center">
|
| 122 |
+
<i class="fas fa-code-branch text-purple-600 text-lg"></i>
|
| 123 |
+
</div>
|
| 124 |
+
</div>
|
| 125 |
+
</div>
|
| 126 |
+
|
| 127 |
+
<div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
|
| 128 |
+
<div class="flex items-center justify-between">
|
| 129 |
+
<div>
|
| 130 |
+
<p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Quality Score</p>
|
| 131 |
+
<p class="text-3xl font-bold text-emerald-600 mt-1" id="avgScore">0</p>
|
| 132 |
+
</div>
|
| 133 |
+
<div class="w-12 h-12 bg-emerald-50 rounded-xl flex items-center justify-center">
|
| 134 |
+
<i class="fas fa-chart-line text-emerald-600 text-lg"></i>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
+
</div>
|
| 138 |
+
|
| 139 |
+
<div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
|
| 140 |
+
<div class="flex items-center justify-between">
|
| 141 |
+
<div>
|
| 142 |
+
<p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Clarity</p>
|
| 143 |
+
<p class="text-3xl font-bold text-cyan-600 mt-1" id="avgClarity">0</p>
|
| 144 |
+
</div>
|
| 145 |
+
<div class="w-12 h-12 bg-cyan-50 rounded-xl flex items-center justify-center">
|
| 146 |
+
<i class="fas fa-microphone text-cyan-600 text-lg"></i>
|
| 147 |
+
</div>
|
| 148 |
+
</div>
|
| 149 |
+
</div>
|
| 150 |
+
|
| 151 |
+
<div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
|
| 152 |
+
<div class="flex items-center justify-between">
|
| 153 |
+
<div>
|
| 154 |
+
<p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Confidence</p>
|
| 155 |
+
<p class="text-3xl font-bold text-amber-600 mt-1" id="avgConfidence">0</p>
|
| 156 |
+
</div>
|
| 157 |
+
<div class="w-12 h-12 bg-amber-50 rounded-xl flex items-center justify-center">
|
| 158 |
+
<i class="fas fa-shield-check text-amber-600 text-lg"></i>
|
| 159 |
+
</div>
|
| 160 |
+
</div>
|
| 161 |
+
</div>
|
| 162 |
+
|
| 163 |
+
<div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
|
| 164 |
+
<div class="flex items-center justify-between">
|
| 165 |
+
<div>
|
| 166 |
+
<p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Segments</p>
|
| 167 |
+
<p class="text-3xl font-bold text-rose-600 mt-1" id="totalSegments">0</p>
|
| 168 |
+
</div>
|
| 169 |
+
<div class="w-12 h-12 bg-rose-50 rounded-xl flex items-center justify-center">
|
| 170 |
+
<i class="fas fa-wave-square text-rose-600 text-lg"></i>
|
| 171 |
+
</div>
|
| 172 |
+
</div>
|
| 173 |
+
</div>
|
| 174 |
+
</section>
|
| 175 |
+
|
| 176 |
+
<!-- Calls List -->
|
| 177 |
+
<section class="glass-card rounded-2xl shadow-lg border border-slate-200 overflow-hidden">
|
| 178 |
+
<div class="p-6 border-b border-slate-200 flex items-center justify-between">
|
| 179 |
+
<div>
|
| 180 |
+
<h2 class="text-xl font-bold text-slate-800">Analyzed Recordings</h2>
|
| 181 |
+
<p class="text-sm text-slate-500 mt-1">Click on any recording to view detailed analysis</p>
|
| 182 |
+
</div>
|
| 183 |
+
<button onclick="loadCalls()" class="flex items-center px-4 py-2 bg-slate-100 hover:bg-slate-200 text-slate-700 rounded-lg transition-colors font-medium">
|
| 184 |
+
<i class="fas fa-arrows-rotate mr-2"></i>Refresh
|
| 185 |
+
</button>
|
| 186 |
+
</div>
|
| 187 |
+
<div id="callsList" class="divide-y divide-slate-100"></div>
|
| 188 |
+
</section>
|
| 189 |
+
</main>
|
| 190 |
+
|
| 191 |
+
<!-- Analysis Modal -->
|
| 192 |
+
<div id="analysisModal" class="hidden fixed inset-0 bg-slate-900/60 backdrop-blur-sm z-50 flex items-center justify-center p-4">
|
| 193 |
+
<div class="bg-white rounded-2xl shadow-2xl max-w-7xl w-full max-h-[95vh] overflow-hidden flex flex-col">
|
| 194 |
+
<div class="asr-gradient text-white p-6 flex justify-between items-center shrink-0">
|
| 195 |
+
<div class="flex items-center space-x-4">
|
| 196 |
+
<div class="w-10 h-10 bg-white/20 rounded-lg flex items-center justify-center">
|
| 197 |
+
<i class="fas fa-chart-bar"></i>
|
| 198 |
+
</div>
|
| 199 |
+
<div>
|
| 200 |
+
<h3 class="text-xl font-bold" id="modalTitle">Analysis Report</h3>
|
| 201 |
+
<p class="text-slate-300 text-sm">Comprehensive audio analysis</p>
|
| 202 |
+
</div>
|
| 203 |
+
</div>
|
| 204 |
+
<button onclick="closeModal()" class="w-10 h-10 bg-white/10 hover:bg-white/20 rounded-lg flex items-center justify-center transition-colors">
|
| 205 |
+
<i class="fas fa-xmark text-xl"></i>
|
| 206 |
+
</button>
|
| 207 |
+
</div>
|
| 208 |
+
<div class="p-6 overflow-y-auto flex-1" id="modalContent"></div>
|
| 209 |
+
</div>
|
| 210 |
+
</div>
|
| 211 |
+
|
| 212 |
+
<script>
|
| 213 |
+
// Base URL for all API calls; same origin as the page serving the dashboard.
const API_BASE = window.location.origin;
// Job id of the most recently uploaded file, consumed by pollJobStatus().
let currentJobId = null;
// Handle of the active setInterval poller (null when no poll is running).
let pollInterval = null;

// Initial page setup: populate the dashboard and wire up the upload zone.
document.addEventListener('DOMContentLoaded', () => {
    checkServerHealth();
    loadStatistics();
    loadCalls();

    const uploadZone = document.getElementById('uploadZone');
    const fileInput = document.getElementById('fileInput');

    // Clicking the drop zone opens the native file picker.
    uploadZone.addEventListener('click', () => fileInput.click());
    // Drag-and-drop support: highlight the zone while hovering, upload on drop.
    uploadZone.addEventListener('dragover', e => { e.preventDefault(); uploadZone.classList.add('dragover'); });
    uploadZone.addEventListener('dragleave', () => uploadZone.classList.remove('dragover'));
    uploadZone.addEventListener('drop', e => {
        e.preventDefault();
        uploadZone.classList.remove('dragover');
        if (e.dataTransfer.files.length > 0) handleFileUpload(e.dataTransfer.files[0]);
    });
    // File picker path: upload the first selected file.
    fileInput.addEventListener('change', e => {
        if (e.target.files.length > 0) handleFileUpload(e.target.files[0]);
    });
});
|
| 237 |
+
|
| 238 |
+
// Ping the backend /health endpoint and reflect the result in the header badge.
async function checkServerHealth() {
    try {
        const response = await fetch(`${API_BASE}/health`);
        const payload = await response.json();
        const label = payload.status === 'healthy' ? 'Operational' : 'Offline';
        document.getElementById('serverStatus').textContent = label;
    } catch {
        document.getElementById('serverStatus').textContent = 'Offline';
    }
}
|
| 245 |
+
|
| 246 |
+
// Fetch aggregate call statistics and populate the dashboard stat cards.
async function loadStatistics() {
    try {
        const response = await fetch(`${API_BASE}/api/statistics`);
        const stats = await response.json();
        const show = (id, value) => { document.getElementById(id).textContent = value; };
        show('totalCalls', stats.total_calls || 0);
        show('stereoCalls', stats.stereo_calls || 0);
        show('avgScore', (stats.avg_quality_score || 0).toFixed(1));
        show('avgClarity', (stats.avg_clarity || 0).toFixed(1));
        show('avgConfidence', (stats.avg_confidence || 0).toFixed(1));
        show('totalSegments', stats.total_segments || 0);
    } catch (err) {
        console.error('Stats error:', err);
    }
}
|
| 258 |
+
|
| 259 |
+
// HTML-escape a string so server-provided names cannot inject markup (XSS).
function escapeHtml(value) {
    return String(value).replace(/[&<>"']/g, ch => ({
        '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[ch]));
}

// Fetch the list of analyzed recordings and render them into #callsList.
// Fix over the original: call names were interpolated raw into both the HTML
// body and a single-quoted onclick attribute, so a name containing a quote or
// angle bracket broke the markup / allowed script injection.
async function loadCalls() {
    try {
        const res = await fetch(`${API_BASE}/api/calls`);
        const calls = await res.json();
        const list = document.getElementById('callsList');

        if (calls.length === 0) {
            list.innerHTML = `<div class="p-12 text-center"><div class="w-16 h-16 bg-slate-100 rounded-full flex items-center justify-center mx-auto mb-4"><i class="fas fa-folder-open text-2xl text-slate-400"></i></div><p class="text-slate-600 font-medium">No recordings yet</p><p class="text-slate-400 text-sm mt-1">Upload an audio file to get started</p></div>`;
            return;
        }

        list.innerHTML = calls.map(call => {
            // URI-encode the onclick argument (apostrophes included, which
            // encodeURIComponent leaves alone) and decode it again at call time.
            const enc = encodeURIComponent(call).replace(/'/g, '%27');
            return `
            <div onclick="viewAnalysis(decodeURIComponent('${enc}'))" class="p-5 flex items-center justify-between hover:bg-slate-50 cursor-pointer transition-colors">
                <div class="flex items-center space-x-4">
                    <div class="w-12 h-12 bg-gradient-to-br from-blue-500 to-purple-600 rounded-xl flex items-center justify-center text-white"><i class="fas fa-waveform-lines"></i></div>
                    <div>
                        <p class="font-semibold text-slate-800">${escapeHtml(call)}</p>
                        <p class="text-sm text-slate-500">Click to view analysis</p>
                    </div>
                </div>
                <div class="flex items-center space-x-3">
                    <span class="px-3 py-1 bg-emerald-100 text-emerald-700 rounded-full text-sm font-medium">Analyzed</span>
                    <i class="fas fa-chevron-right text-slate-400"></i>
                </div>
            </div>
        `;
        }).join('');
    } catch (e) { console.error('Calls error:', e); }
}
|
| 287 |
+
|
| 288 |
+
// Upload a user-selected audio file to the backend and start polling the
// resulting analysis job. Shows the progress panel while the upload runs;
// on failure the panel displays the error and auto-hides after 3 seconds.
async function handleFileUpload(file) {
    const progressPanel = document.getElementById('uploadProgress');
    progressPanel.classList.remove('hidden');
    updateProgress('Uploading...', 'Transferring file to server', 10);

    const payload = new FormData();
    payload.append('file', file);

    try {
        const response = await fetch(`${API_BASE}/api/upload`, { method: 'POST', body: payload });
        const result = await response.json();
        if (!response.ok) throw new Error(result.error);
        currentJobId = result.job_id;
        updateProgress('Processing...', 'Analysis started', 25);
        pollJobStatus();
    } catch (err) {
        updateProgress('Error', err.message, 0);
        setTimeout(() => progressPanel.classList.add('hidden'), 3000);
    }
}
|
| 307 |
+
|
| 308 |
+
// Update the upload-progress UI: status headline, stage description,
// and the progress bar fill plus its percentage label.
function updateProgress(status, stage, percent) {
    const byId = id => document.getElementById(id);
    const pct = `${percent}%`;
    byId('uploadStatus').textContent = status;
    byId('stageText').textContent = stage;
    byId('progressBar').style.width = pct;
    byId('progressPercent').textContent = pct;
}
|
| 314 |
+
|
| 315 |
+
// Poll the backend every 1.5s for the state of the current analysis job and
// drive the progress UI. Stops polling on completion, failure, or fetch error.
function pollJobStatus() {
    // Only one poller at a time: cancel any previous interval first.
    if (pollInterval) clearInterval(pollInterval);
    pollInterval = setInterval(async () => {
        try {
            const res = await fetch(`${API_BASE}/api/jobs/${currentJobId}`);
            const job = await res.json();

            // Map each backend pipeline stage to a label and a nominal
            // progress percentage for the bar.
            const stages = {
                'pending': { text: 'Queued...', progress: 20 },
                'initializing': { text: 'Loading models...', progress: 30 },
                'diarization': { text: 'Separating speakers...', progress: 45 },
                'transcription': { text: 'Transcribing speech...', progress: 65 },
                'audio_analysis': { text: 'Analyzing audio features...', progress: 85 },
                'done': { text: 'Complete!', progress: 100 }
            };

            if (job.stage && stages[job.stage]) {
                let stageText = stages[job.stage].text;
                // Once channel detection has run, annotate the stage label.
                if (job.is_stereo !== null) stageText += job.is_stereo ? ' (Stereo)' : ' (Mono)';
                updateProgress('Processing...', stageText, stages[job.stage].progress);
            }

            if (job.status === 'completed') {
                clearInterval(pollInterval);
                updateProgress('Success!', 'Analysis complete', 100);
                // Briefly show the success state, then hide the panel and
                // refresh the dashboard with the new recording.
                setTimeout(() => {
                    document.getElementById('uploadProgress').classList.add('hidden');
                    loadStatistics();
                    loadCalls();
                }, 1500);
            } else if (job.status === 'failed') {
                clearInterval(pollInterval);
                updateProgress('Failed', job.error || 'Unknown error', 0);
                setTimeout(() => document.getElementById('uploadProgress').classList.add('hidden'), 3000);
            }
        } catch (e) { clearInterval(pollInterval); }  // network error: stop polling silently
    }, 1500);
}
|
| 353 |
+
|
| 354 |
+
// Fetch the stored analysis + transcription for one recording and render the
// full report (overview cards, speaker profiles, transcript, segment table)
// into the analysis modal, then show the modal.
async function viewAnalysis(callName) {
    try {
        const res = await fetch(`${API_BASE}/api/analysis/${callName}`);
        const data = await res.json();
        const a = data.analysis;        // audio analysis document
        const t = data.transcription;   // transcription document (may be missing)

        document.getElementById('modalTitle').textContent = callName;

        const isStereo = a.audio_type === 'stereo';
        const profiles = a.speaker_profiles || {};
        const dynamics = a.dynamics || {};

        // Build one card per speaker: style header, quality score, a 4-metric
        // grid (pitch/energy/rate/clarity) and strength/improvement tags.
        let profilesHTML = '';
        for (const [spk, p] of Object.entries(profiles)) {
            profilesHTML += `
                <div class="bg-slate-50 rounded-xl p-5 border border-slate-200">
                    <div class="flex items-center justify-between mb-4">
                        <div class="flex items-center space-x-3">
                            <div class="w-10 h-10 ${spk === 'CUSTOMER' ? 'bg-blue-100' : spk === 'AGENT' ? 'bg-emerald-100' : 'bg-purple-100'} rounded-full flex items-center justify-center">
                                <i class="fas fa-user ${spk === 'CUSTOMER' ? 'text-blue-600' : spk === 'AGENT' ? 'text-emerald-600' : 'text-purple-600'}"></i>
                            </div>
                            <div>
                                <p class="font-bold text-slate-800">${spk}</p>
                                <p class="text-sm text-slate-500">${p.communication_style} style</p>
                            </div>
                        </div>
                        <div class="text-right">
                            <p class="text-2xl font-bold text-slate-800">${p.overall_score}</p>
                            <p class="text-xs text-slate-500">Quality Score</p>
                        </div>
                    </div>
                    <div class="grid grid-cols-4 gap-3 mb-4">
                        <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_pitch.toFixed(0)}</p><p class="text-xs text-slate-500">Pitch (Hz)</p></div>
                        <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_energy.toFixed(1)}</p><p class="text-xs text-slate-500">Energy (dB)</p></div>
                        <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_speaking_rate.toFixed(1)}</p><p class="text-xs text-slate-500">Rate (/s)</p></div>
                        <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_clarity.toFixed(0)}</p><p class="text-xs text-slate-500">Clarity</p></div>
                    </div>
                    <div class="flex flex-wrap gap-2">
                        ${p.strengths.map(s => `<span class="px-2 py-1 bg-emerald-100 text-emerald-700 text-xs rounded-full">${s}</span>`).join('')}
                        ${p.improvements.map(i => `<span class="px-2 py-1 bg-amber-100 text-amber-700 text-xs rounded-full">${i}</span>`).join('')}
                    </div>
                </div>
            `;
        }

        // Chat-style transcript bubbles (CUSTOMER left, AGENT right, other
        // speakers centered); also accumulate the total model inference time.
        let transcriptHTML = '';
        let totalInferenceTime = 0;
        if (t && t.transcriptions) {
            t.transcriptions.forEach(seg => totalInferenceTime += (seg.inference_time || 0));
            transcriptHTML = t.transcriptions.map(seg => `
                <div class="flex ${seg.speaker === 'CUSTOMER' ? 'justify-start' : seg.speaker === 'AGENT' ? 'justify-end' : 'justify-center'}">
                    <div class="max-w-[70%] ${seg.speaker === 'CUSTOMER' ? 'bg-blue-50 border-blue-200' : seg.speaker === 'AGENT' ? 'bg-emerald-50 border-emerald-200' : 'bg-slate-50 border-slate-200'} border rounded-xl p-3">
                        <div class="flex items-center justify-between mb-1">
                            <div class="flex items-center space-x-2">
                                <span class="font-semibold text-sm ${seg.speaker === 'CUSTOMER' ? 'text-blue-700' : seg.speaker === 'AGENT' ? 'text-emerald-700' : 'text-slate-700'}">${seg.speaker}</span>
                                <span class="text-xs text-slate-400">${seg.start_time}</span>
                            </div>
                            <span class="text-xs text-orange-500 font-medium"><i class="fas fa-clock mr-1"></i>${seg.inference_time}s</span>
                        </div>
                        <p class="text-slate-800">${seg.text}</p>
                    </div>
                </div>
            `).join('');
        }

        // Assemble the modal body: overview cards, speaker profiles, the
        // transcript section (only when transcription data exists), and the
        // per-segment acoustic metrics table.
        document.getElementById('modalContent').innerHTML = `
            <div class="space-y-6">
                <!-- Overview -->
                <div class="grid grid-cols-5 gap-4">
                    <div class="bg-gradient-to-br from-slate-800 to-slate-900 text-white rounded-xl p-5 text-center">
                        <p class="text-3xl font-bold">${a.overall_quality_score}</p>
                        <p class="text-slate-300 text-sm mt-1">Quality Score</p>
                    </div>
                    <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                        <p class="text-3xl font-bold text-slate-800">${a.audio_duration.toFixed(1)}s</p>
                        <p class="text-slate-500 text-sm mt-1">Duration</p>
                    </div>
                    <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                        <p class="text-3xl font-bold ${isStereo ? 'text-purple-600' : 'text-blue-600'}">${isStereo ? 'STEREO' : 'MONO'}</p>
                        <p class="text-slate-500 text-sm mt-1">Audio Type</p>
                    </div>
                    <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                        <p class="text-3xl font-bold text-slate-800">${a.segments.length}</p>
                        <p class="text-slate-500 text-sm mt-1">Segments</p>
                    </div>
                    <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                        <p class="text-3xl font-bold text-emerald-600">${dynamics.engagement_score?.toFixed(0) || 0}</p>
                        <p class="text-slate-500 text-sm mt-1">Engagement</p>
                    </div>
                </div>

                <!-- Speaker Profiles -->
                <div>
                    <h4 class="font-bold text-slate-800 mb-4 flex items-center"><i class="fas fa-users text-blue-600 mr-2"></i>Speaker Profiles</h4>
                    <div class="grid ${isStereo ? 'grid-cols-2' : 'grid-cols-1 max-w-xl'} gap-4">${profilesHTML}</div>
                </div>

                <!-- Transcription -->
                ${t ? `
                <div>
                    <div class="flex items-center justify-between mb-4">
                        <h4 class="font-bold text-slate-800 flex items-center"><i class="fas fa-closed-captioning text-blue-600 mr-2"></i>Transcription</h4>
                        <span class="px-3 py-1 bg-orange-100 text-orange-700 rounded-full text-sm font-medium"><i class="fas fa-bolt mr-1"></i>Total: ${totalInferenceTime.toFixed(2)}s</span>
                    </div>
                    <div class="bg-slate-50 rounded-xl p-4 border border-slate-200 max-h-80 overflow-y-auto space-y-3">${transcriptHTML}</div>
                </div>
                ` : ''}

                <!-- Segment Analysis -->
                <div>
                    <h4 class="font-bold text-slate-800 mb-4 flex items-center"><i class="fas fa-wave-square text-blue-600 mr-2"></i>Segment Analysis</h4>
                    <div class="bg-slate-50 rounded-xl border border-slate-200 overflow-hidden max-h-96 overflow-y-auto">
                        <table class="w-full text-sm">
                            <thead class="bg-slate-100 sticky top-0">
                                <tr>
                                    <th class="px-4 py-3 text-left font-semibold text-slate-600">#</th>
                                    <th class="px-4 py-3 text-left font-semibold text-slate-600">Speaker</th>
                                    <th class="px-4 py-3 text-left font-semibold text-slate-600">Time</th>
                                    <th class="px-4 py-3 text-center font-semibold text-slate-600">Pitch</th>
                                    <th class="px-4 py-3 text-center font-semibold text-slate-600">Energy</th>
                                    <th class="px-4 py-3 text-center font-semibold text-slate-600">Rate</th>
                                    <th class="px-4 py-3 text-center font-semibold text-slate-600">Clarity</th>
                                    <th class="px-4 py-3 text-center font-semibold text-slate-600">Emotion</th>
                                    <th class="px-4 py-3 text-center font-semibold text-slate-600">Score</th>
                                </tr>
                            </thead>
                            <tbody class="divide-y divide-slate-100">
                                ${a.segments.map((s, i) => `
                                <tr class="segment-row">
                                    <td class="px-4 py-3 font-medium text-slate-800">${i+1}</td>
                                    <td class="px-4 py-3"><span class="px-2 py-1 ${s.speaker === 'CUSTOMER' ? 'bg-blue-100 text-blue-700' : s.speaker === 'AGENT' ? 'bg-emerald-100 text-emerald-700' : 'bg-purple-100 text-purple-700'} rounded-full text-xs font-medium">${s.speaker}</span></td>
                                    <td class="px-4 py-3 text-slate-600">${s.start_time}</td>
                                    <td class="px-4 py-3 text-center font-medium">${s.pitch.mean_f0.toFixed(0)} Hz</td>
                                    <td class="px-4 py-3 text-center font-medium">${s.energy.mean_rms.toFixed(1)} dB</td>
                                    <td class="px-4 py-3 text-center font-medium">${s.rhythm.speaking_rate.toFixed(1)}</td>
                                    <td class="px-4 py-3 text-center"><span class="px-2 py-1 ${s.voice_quality.clarity_score > 70 ? 'bg-emerald-100 text-emerald-700' : s.voice_quality.clarity_score > 50 ? 'bg-amber-100 text-amber-700' : 'bg-red-100 text-red-700'} rounded text-xs font-medium">${s.voice_quality.clarity_score.toFixed(0)}</span></td>
                                    <td class="px-4 py-3 text-center"><span class="px-2 py-1 bg-slate-100 text-slate-700 rounded text-xs">${s.emotion.primary_emotion}</span></td>
                                    <td class="px-4 py-3 text-center font-bold text-slate-800">${s.overall_quality_score.toFixed(0)}</td>
                                </tr>
                                `).join('')}
                            </tbody>
                        </table>
                    </div>
                </div>
            </div>
        `;

        document.getElementById('analysisModal').classList.remove('hidden');
    } catch (e) { console.error('Analysis error:', e); alert('Failed to load analysis'); }
}
|
| 505 |
+
|
| 506 |
+
// Hide the analysis modal.
function closeModal() {
    document.getElementById('analysisModal').classList.add('hidden');
}
// Allow dismissing the modal with the Escape key.
document.addEventListener('keydown', event => {
    if (event.key === 'Escape') closeModal();
});
|
| 508 |
+
</script>
|
| 509 |
+
</body>
|
| 510 |
+
</html>
|
main.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ASR Audio Analysis Pipeline
|
| 3 |
+
|
| 4 |
+
Complete pipeline: Diarization + Whisper Transcription + Professional Audio Analysis
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from stereo_diarizer import StereoCallDiarizer
|
| 12 |
+
from whisper_transcriber import WhisperTranscriber
|
| 13 |
+
from audio_analyzer import AudioAnalyzer
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ASRPipeline:
    """ASR End-to-end audio analysis pipeline"""

    def __init__(self,
                 input_folder: str,
                 output_folder: str,
                 whisper_model: str,
                 min_silence_len: int = 500,
                 silence_thresh: int = -40,
                 device: str = "cpu",
                 verbose: bool = True):
        """Configure the pipeline.

        Args:
            input_folder: Directory scanned for audio files to process.
            output_folder: Directory where per-file result folders are written.
            whisper_model: Path or identifier of the Whisper model checkpoint.
            min_silence_len: Minimum silence duration (ms) used to split segments.
            silence_thresh: Level (dB) below which audio counts as silence.
            device: Device string passed to the transcriber (e.g. "cpu").
            verbose: Enable progress logging to stdout.
        """
        self.input_folder = Path(input_folder)
        self.output_folder = Path(output_folder)
        self.whisper_model = whisper_model
        self.min_silence_len = min_silence_len
        self.silence_thresh = silence_thresh
        self.device = device
        self.verbose = verbose

        # Aggregate run statistics, updated as files are processed and
        # summarized at the end of run().
        self.stats = {
            'total_files': 0,
            'processed': 0,
            'failed': 0,
            'stereo': 0,
            'mono': 0,
            'failed_files': [],
            'total_duration': 0.0
        }

        self.analyzer = AudioAnalyzer(verbose=self.verbose)
        # Created lazily by _init_transcriber(); model loading is expensive.
        self.transcriber = None

    def _init_transcriber(self):
        """Lazily instantiate the Whisper transcriber on first use."""
        if self.transcriber is None:
            self.transcriber = WhisperTranscriber(
                self.whisper_model, self.device, self.verbose
            )

    def get_audio_files(self):
        """Return supported audio files in the input folder, sorted by name."""
        formats = {'.wav', '.mp3', '.m4a', '.flac', '.ogg', '.opus'}
        return sorted([
            f for f in self.input_folder.iterdir()
            if f.is_file() and f.suffix.lower() in formats
        ])

    def process_single(self, audio_file: Path) -> bool:
        """Run diarization, transcription and analysis for one audio file.

        All artifacts are written to ``<output_folder>/<file stem>/``.

        Returns:
            True on success, False if any stage raised an exception.
        """
        output_dir = self.output_folder / audio_file.stem
        output_dir.mkdir(parents=True, exist_ok=True)

        if self.verbose:
            print(f"\n{'='*60}")
            print(f"PROCESSING: {audio_file.name}")
            print(f"{'='*60}")

        try:
            # Step 1: Diarization
            if self.verbose:
                print("\n[1/3] DIARIZATION")

            diarizer = StereoCallDiarizer(
                str(audio_file), self.min_silence_len,
                self.silence_thresh, self.verbose
            )
            diarizer.load_audio()

            # NOTE(review): tallied before the later stages run, so a file
            # that fails during transcription/analysis still counts here.
            if diarizer.is_stereo:
                self.stats['stereo'] += 1
            else:
                self.stats['mono'] += 1

            left, right = diarizer.detect_speech_segments()
            diarizer.create_timeline(left, right)

            segments = diarizer.export_segments(str(output_dir))
            diarizer.export_full_speakers(str(output_dir))
            diarizer.export_transcript_txt(str(output_dir))
            diarizer.export_transcript_json(str(output_dir))

            # pydub AudioSegment lengths are in milliseconds.
            duration = len(diarizer.audio) / 1000
            self.stats['total_duration'] += duration

            # Step 2: Transcription
            if self.verbose:
                print("\n[2/3] TRANSCRIPTION")

            self._init_transcriber()
            transcribed = self.transcriber.transcribe_segments(
                segments, diarizer.timeline
            )
            self.transcriber.export_transcription(transcribed, str(output_dir))

            # Step 3: Audio Analysis
            if self.verbose:
                print("\n[3/3] AUDIO ANALYSIS")

            analysis = self.analyzer.analyze_call(
                segments, diarizer.timeline,
                audio_file.stem, diarizer.is_stereo
            )
            self.analyzer.export_analysis(analysis, str(output_dir))

            if self.verbose:
                print(f"\nSUCCESS: {audio_file.name}")
                print(f"Type: {'STEREO' if diarizer.is_stereo else 'MONO'}")
                print(f"Duration: {duration:.1f}s | Quality: {analysis.overall_quality_score}/100")

            return True

        except Exception as e:
            # Keep the batch going on failure; run() records the failed file.
            if self.verbose:
                print(f"\nFAILED: {audio_file.name}")
                print(f"Error: {e}")
                import traceback
                traceback.print_exc()
            return False

    def run(self):
        """Process every audio file in the input folder and print a summary."""
        print("\n" + "="*60)
        print("ASR AUDIO ANALYSIS PIPELINE")
        print("="*60)

        files = self.get_audio_files()
        self.stats['total_files'] = len(files)

        if not files:
            print(f"\nNo audio files in {self.input_folder}")
            return

        print(f"\nFound {len(files)} file(s)")
        print(f"Input: {self.input_folder}")
        print(f"Output: {self.output_folder}")

        for i, f in enumerate(files, 1):
            print(f"\n[{i}/{len(files)}]")
            if self.process_single(f):
                self.stats['processed'] += 1
            else:
                self.stats['failed'] += 1
                self.stats['failed_files'].append(f.name)

        print("\n" + "="*60)
        print("COMPLETE")
        print("="*60)
        print(f"Processed: {self.stats['processed']}/{self.stats['total_files']}")
        print(f"Stereo: {self.stats['stereo']} | Mono: {self.stats['mono']}")
        print(f"Total duration: {self.stats['total_duration']:.1f}s")

        if self.stats['failed_files']:
            print(f"\nFailed: {', '.join(self.stats['failed_files'])}")

        print(f"\nResults: {self.output_folder}")
        print("\nRun 'python api_server.py' and open http://localhost:5001")
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def main():
    """CLI entry point: configure and run the full ASR pipeline.

    The paths default to the original development machine's locations, but
    can be overridden via the INPUT_FOLDER / OUTPUT_FOLDER / WHISPER_MODEL
    environment variables so the script is usable on any machine without
    editing the source.
    """
    input_folder = os.environ.get("INPUT_FOLDER", "/home/ramal/Downloads/Archive")
    output_folder = os.environ.get("OUTPUT_FOLDER", "output")
    whisper_model = os.environ.get(
        "WHISPER_MODEL",
        "/home/ramal/Desktop/end-to-end/whisper-small-az/checkpoint-157959",
    )

    if not os.path.exists(input_folder):
        print(f"Error: {input_folder} not found")
        sys.exit(1)

    os.makedirs(output_folder, exist_ok=True)

    pipeline = ASRPipeline(
        input_folder=input_folder,
        output_folder=output_folder,
        whisper_model=whisper_model,
        device="cpu",
        verbose=True
    )

    pipeline.run()


if __name__ == "__main__":
    main()
|
req.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pydub
|
| 2 |
+
librosa
|
| 3 |
+
numpy
|
| 4 |
+
scipy
|
| 5 |
+
flask
|
| 6 |
+
flask-cors
|
| 7 |
+
torch
|
| 8 |
+
transformers
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pydub
|
| 2 |
+
librosa
|
| 3 |
+
numpy
|
| 4 |
+
scipy
|
| 5 |
+
flask
|
| 6 |
+
flask-cors
|
| 7 |
+
torch
|
| 8 |
+
transformers
|
| 9 |
+
soundfile
|
stereo_diarizer.py
ADDED
|
@@ -0,0 +1,556 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Stereo Call Center Audio Diarization Module
|
| 3 |
+
|
| 4 |
+
This module provides professional audio diarization for stereo call center recordings.
|
| 5 |
+
It separates speakers from left/right channels and creates detailed transcription-ready segments.
|
| 6 |
+
|
| 7 |
+
IMPORTANT: Only separates channels for STEREO audio. Mono audio is processed as single speaker.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import json
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from typing import List, Dict, Tuple, Optional
|
| 14 |
+
from dataclasses import dataclass, asdict
|
| 15 |
+
from pydub import AudioSegment
|
| 16 |
+
from pydub.silence import detect_nonsilent
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class Segment:
    """Represents a single speech segment"""
    # Sequential turn number within the call timeline.
    turn: int
    # Speaker label (e.g. "CUSTOMER"/"AGENT" for stereo, a single label for mono).
    speaker: str
    # Segment boundaries in milliseconds from the start of the recording.
    start_ms: int
    end_ms: int
    duration_ms: int
    # Human-readable renderings of the same boundaries/duration.
    start_time: str
    end_time: str
    duration: str
    # Source channel: 'left'/'right' for stereo input, 'mono' otherwise.
    channel: str = 'mono'
    # Path of the exported per-segment audio clip, once written to disk.
    audio_file: Optional[str] = None
| 33 |
+
|
| 34 |
+
class StereoCallDiarizer:
|
| 35 |
+
"""
|
| 36 |
+
Professional stereo call center audio diarization system.
|
| 37 |
+
|
| 38 |
+
For STEREO audio: Separates speakers from stereo audio (left/right channels)
|
| 39 |
+
For MONO audio: Processes as single speaker without channel separation
|
| 40 |
+
|
| 41 |
+
Attributes:
|
| 42 |
+
input_file (str): Path to input audio file
|
| 43 |
+
min_silence_len (int): Minimum silence length in ms to split segments
|
| 44 |
+
silence_thresh (int): Silence threshold in dB
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
    def __init__(self, input_file: str, min_silence_len: int = 500,
                 silence_thresh: int = -40, verbose: bool = True):
        """
        Initialize the diarizer.

        Args:
            input_file: Path to audio file (stereo or mono)
            min_silence_len: Minimum silence duration (ms) to split segments
            silence_thresh: Audio level (dB) below which is considered silence
            verbose: Enable/disable logging output
        """
        self.input_file = input_file
        self.min_silence_len = min_silence_len
        self.silence_thresh = silence_thresh
        self.verbose = verbose

        # Audio buffers, populated by load_audio(): stereo recordings get
        # left/right channels, mono recordings use mono_channel instead.
        self.audio: Optional[AudioSegment] = None
        self.left_channel: Optional[AudioSegment] = None
        self.right_channel: Optional[AudioSegment] = None
        self.mono_channel: Optional[AudioSegment] = None
        # Chronological list of diarized speech segments, built later.
        self.timeline: List[Segment] = []
        # Which stereo channel carries the customer ('left' by default).
        self.customer_channel: str = 'left'
        # Set by load_audio(): True when the file has exactly two channels.
        self.is_stereo: bool = False
| 71 |
+
def _log(self, message: str):
|
| 72 |
+
"""Internal logging method"""
|
| 73 |
+
if self.verbose:
|
| 74 |
+
print(message)
|
| 75 |
+
|
| 76 |
+
def load_audio(self) -> bool:
|
| 77 |
+
"""
|
| 78 |
+
Load audio file and split into channels if stereo.
|
| 79 |
+
|
| 80 |
+
Returns:
|
| 81 |
+
True if stereo, False if mono
|
| 82 |
+
"""
|
| 83 |
+
self._log(f"Loading audio: {self.input_file}")
|
| 84 |
+
self.audio = AudioSegment.from_file(self.input_file)
|
| 85 |
+
|
| 86 |
+
self.is_stereo = self.audio.channels == 2
|
| 87 |
+
|
| 88 |
+
if self.is_stereo:
|
| 89 |
+
self._log(f"STEREO audio detected ({self.audio.channels} channels) - will separate speakers")
|
| 90 |
+
self.left_channel = self.audio.split_to_mono()[0]
|
| 91 |
+
self.right_channel = self.audio.split_to_mono()[1]
|
| 92 |
+
else:
|
| 93 |
+
self._log(f"MONO audio detected ({self.audio.channels} channel) - single speaker mode")
|
| 94 |
+
self.mono_channel = self.audio
|
| 95 |
+
|
| 96 |
+
self._log(f"Duration: {len(self.audio)/1000:.2f}s | Sample rate: {self.audio.frame_rate}Hz")
|
| 97 |
+
return self.is_stereo
|
| 98 |
+
|
| 99 |
+
def detect_speech_segments(self) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]:
|
| 100 |
+
"""
|
| 101 |
+
Detect speech segments.
|
| 102 |
+
|
| 103 |
+
For stereo: returns (left_segments, right_segments)
|
| 104 |
+
For mono: returns (mono_segments, [])
|
| 105 |
+
|
| 106 |
+
Returns:
|
| 107 |
+
Tuple of segment lists
|
| 108 |
+
"""
|
| 109 |
+
self._log("Detecting speech segments...")
|
| 110 |
+
|
| 111 |
+
if self.is_stereo:
|
| 112 |
+
left_segments = detect_nonsilent(
|
| 113 |
+
self.left_channel,
|
| 114 |
+
min_silence_len=self.min_silence_len,
|
| 115 |
+
silence_thresh=self.silence_thresh
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
right_segments = detect_nonsilent(
|
| 119 |
+
self.right_channel,
|
| 120 |
+
min_silence_len=self.min_silence_len,
|
| 121 |
+
silence_thresh=self.silence_thresh
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
self._log(f"Found {len(left_segments)} segments (LEFT), {len(right_segments)} segments (RIGHT)")
|
| 125 |
+
return left_segments, right_segments
|
| 126 |
+
else:
|
| 127 |
+
mono_segments = detect_nonsilent(
|
| 128 |
+
self.mono_channel,
|
| 129 |
+
min_silence_len=self.min_silence_len,
|
| 130 |
+
silence_thresh=self.silence_thresh
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
self._log(f"Found {len(mono_segments)} segments (MONO)")
|
| 134 |
+
return mono_segments, []
|
| 135 |
+
|
| 136 |
+
def create_timeline(self, left_segments: List[Tuple[int, int]],
|
| 137 |
+
right_segments: List[Tuple[int, int]]) -> List[Segment]:
|
| 138 |
+
"""
|
| 139 |
+
Create chronologically ordered timeline of all speech segments.
|
| 140 |
+
|
| 141 |
+
For STEREO: First speaker is CUSTOMER, second is AGENT
|
| 142 |
+
For MONO: All segments are marked as SPEAKER
|
| 143 |
+
|
| 144 |
+
Args:
|
| 145 |
+
left_segments: Left channel segments (or mono segments)
|
| 146 |
+
right_segments: Right channel segments (empty for mono)
|
| 147 |
+
|
| 148 |
+
Returns:
|
| 149 |
+
List of Segment objects sorted by start time
|
| 150 |
+
"""
|
| 151 |
+
self._log("Building timeline...")
|
| 152 |
+
|
| 153 |
+
if self.is_stereo:
|
| 154 |
+
return self._create_stereo_timeline(left_segments, right_segments)
|
| 155 |
+
else:
|
| 156 |
+
return self._create_mono_timeline(left_segments)
|
| 157 |
+
|
| 158 |
+
def _create_stereo_timeline(self, left_segments: List[Tuple[int, int]],
|
| 159 |
+
right_segments: List[Tuple[int, int]]) -> List[Segment]:
|
| 160 |
+
"""Create timeline for stereo audio with speaker separation"""
|
| 161 |
+
# Determine who speaks first - that's the CUSTOMER (caller)
|
| 162 |
+
first_left = left_segments[0][0] if left_segments else float('inf')
|
| 163 |
+
first_right = right_segments[0][0] if right_segments else float('inf')
|
| 164 |
+
|
| 165 |
+
if first_left < first_right:
|
| 166 |
+
customer_label = 'CUSTOMER'
|
| 167 |
+
agent_label = 'AGENT'
|
| 168 |
+
customer_segments = left_segments
|
| 169 |
+
agent_segments = right_segments
|
| 170 |
+
customer_channel = 'left'
|
| 171 |
+
self._log(f"First speaker: LEFT channel (CUSTOMER)")
|
| 172 |
+
else:
|
| 173 |
+
customer_label = 'CUSTOMER'
|
| 174 |
+
agent_label = 'AGENT'
|
| 175 |
+
customer_segments = right_segments
|
| 176 |
+
agent_segments = left_segments
|
| 177 |
+
customer_channel = 'right'
|
| 178 |
+
self._log(f"First speaker: RIGHT channel (CUSTOMER)")
|
| 179 |
+
|
| 180 |
+
self.customer_channel = customer_channel
|
| 181 |
+
|
| 182 |
+
timeline = []
|
| 183 |
+
|
| 184 |
+
# Add customer segments
|
| 185 |
+
for start, end in customer_segments:
|
| 186 |
+
timeline.append({
|
| 187 |
+
'speaker': customer_label,
|
| 188 |
+
'start_ms': start,
|
| 189 |
+
'end_ms': end,
|
| 190 |
+
'duration_ms': end - start,
|
| 191 |
+
'start_time': self._ms_to_time(start),
|
| 192 |
+
'end_time': self._ms_to_time(end),
|
| 193 |
+
'duration': self._ms_to_time(end - start),
|
| 194 |
+
'channel': customer_channel
|
| 195 |
+
})
|
| 196 |
+
|
| 197 |
+
# Add agent segments
|
| 198 |
+
for start, end in agent_segments:
|
| 199 |
+
timeline.append({
|
| 200 |
+
'speaker': agent_label,
|
| 201 |
+
'start_ms': start,
|
| 202 |
+
'end_ms': end,
|
| 203 |
+
'duration_ms': end - start,
|
| 204 |
+
'start_time': self._ms_to_time(start),
|
| 205 |
+
'end_time': self._ms_to_time(end),
|
| 206 |
+
'duration': self._ms_to_time(end - start),
|
| 207 |
+
'channel': 'right' if customer_channel == 'left' else 'left'
|
| 208 |
+
})
|
| 209 |
+
|
| 210 |
+
# Sort chronologically
|
| 211 |
+
timeline.sort(key=lambda x: x['start_ms'])
|
| 212 |
+
|
| 213 |
+
# Create Segment objects with turn numbers
|
| 214 |
+
self.timeline = [
|
| 215 |
+
Segment(turn=i+1, **seg)
|
| 216 |
+
for i, seg in enumerate(timeline)
|
| 217 |
+
]
|
| 218 |
+
|
| 219 |
+
self._log(f"Timeline created with {len(self.timeline)} segments (2 speakers)")
|
| 220 |
+
return self.timeline
|
| 221 |
+
|
| 222 |
+
def _create_mono_timeline(self, segments: List[Tuple[int, int]]) -> List[Segment]:
|
| 223 |
+
"""Create timeline for mono audio (single speaker)"""
|
| 224 |
+
timeline = []
|
| 225 |
+
|
| 226 |
+
for start, end in segments:
|
| 227 |
+
timeline.append({
|
| 228 |
+
'speaker': 'SPEAKER',
|
| 229 |
+
'start_ms': start,
|
| 230 |
+
'end_ms': end,
|
| 231 |
+
'duration_ms': end - start,
|
| 232 |
+
'start_time': self._ms_to_time(start),
|
| 233 |
+
'end_time': self._ms_to_time(end),
|
| 234 |
+
'duration': self._ms_to_time(end - start),
|
| 235 |
+
'channel': 'mono'
|
| 236 |
+
})
|
| 237 |
+
|
| 238 |
+
# Sort chronologically
|
| 239 |
+
timeline.sort(key=lambda x: x['start_ms'])
|
| 240 |
+
|
| 241 |
+
# Create Segment objects with turn numbers
|
| 242 |
+
self.timeline = [
|
| 243 |
+
Segment(turn=i+1, **seg)
|
| 244 |
+
for i, seg in enumerate(timeline)
|
| 245 |
+
]
|
| 246 |
+
|
| 247 |
+
self._log(f"Timeline created with {len(self.timeline)} segments (1 speaker - MONO)")
|
| 248 |
+
return self.timeline
|
| 249 |
+
|
| 250 |
+
@staticmethod
|
| 251 |
+
def _ms_to_time(ms: int) -> str:
|
| 252 |
+
"""Convert milliseconds to HH:MM:SS.mmm format"""
|
| 253 |
+
seconds = int(ms / 1000)
|
| 254 |
+
milliseconds = int(ms % 1000)
|
| 255 |
+
minutes = int(seconds / 60)
|
| 256 |
+
seconds = seconds % 60
|
| 257 |
+
hours = int(minutes / 60)
|
| 258 |
+
minutes = minutes % 60
|
| 259 |
+
return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
|
| 260 |
+
|
| 261 |
+
def export_segments(self, output_dir: str = "output") -> List[str]:
|
| 262 |
+
"""
|
| 263 |
+
Export each segment as individual audio file.
|
| 264 |
+
|
| 265 |
+
Args:
|
| 266 |
+
output_dir: Directory to save segment files
|
| 267 |
+
|
| 268 |
+
Returns:
|
| 269 |
+
List of created file paths
|
| 270 |
+
"""
|
| 271 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 272 |
+
self._log(f"Exporting {len(self.timeline)} audio segments to {output_dir}/")
|
| 273 |
+
|
| 274 |
+
file_paths = []
|
| 275 |
+
|
| 276 |
+
for segment in self.timeline:
|
| 277 |
+
# Select correct channel based on audio type and segment info
|
| 278 |
+
if self.is_stereo:
|
| 279 |
+
if segment.channel == 'left':
|
| 280 |
+
audio_segment = self.left_channel[segment.start_ms:segment.end_ms]
|
| 281 |
+
else:
|
| 282 |
+
audio_segment = self.right_channel[segment.start_ms:segment.end_ms]
|
| 283 |
+
else:
|
| 284 |
+
audio_segment = self.mono_channel[segment.start_ms:segment.end_ms]
|
| 285 |
+
|
| 286 |
+
# Create filename
|
| 287 |
+
filename = f"segment_{segment.turn:03d}_{segment.speaker}_{segment.start_ms}ms-{segment.end_ms}ms.wav"
|
| 288 |
+
filepath = os.path.join(output_dir, filename)
|
| 289 |
+
|
| 290 |
+
# Export
|
| 291 |
+
audio_segment.export(filepath, format="wav")
|
| 292 |
+
segment.audio_file = filepath
|
| 293 |
+
file_paths.append(filepath)
|
| 294 |
+
|
| 295 |
+
self._log(f"Exported {len(file_paths)} segments")
|
| 296 |
+
return file_paths
|
| 297 |
+
|
| 298 |
+
def export_full_speakers(self, output_dir: str = "output") -> Dict[str, str]:
|
| 299 |
+
"""
|
| 300 |
+
Export full concatenated audio for each speaker.
|
| 301 |
+
|
| 302 |
+
For stereo: Creates CUSTOMER_full.wav and AGENT_full.wav
|
| 303 |
+
For mono: Creates SPEAKER_full.wav
|
| 304 |
+
|
| 305 |
+
Args:
|
| 306 |
+
output_dir: Directory to save files
|
| 307 |
+
|
| 308 |
+
Returns:
|
| 309 |
+
Dictionary mapping speaker names to file paths
|
| 310 |
+
"""
|
| 311 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 312 |
+
self._log("Exporting full speaker audio...")
|
| 313 |
+
|
| 314 |
+
result = {}
|
| 315 |
+
|
| 316 |
+
if self.is_stereo:
|
| 317 |
+
speakers = ['CUSTOMER', 'AGENT']
|
| 318 |
+
else:
|
| 319 |
+
speakers = ['SPEAKER']
|
| 320 |
+
|
| 321 |
+
for speaker in speakers:
|
| 322 |
+
segments = [s for s in self.timeline if s.speaker == speaker]
|
| 323 |
+
|
| 324 |
+
if segments:
|
| 325 |
+
parts = []
|
| 326 |
+
for seg in segments:
|
| 327 |
+
if self.is_stereo:
|
| 328 |
+
if seg.channel == 'left':
|
| 329 |
+
parts.append(self.left_channel[seg.start_ms:seg.end_ms])
|
| 330 |
+
else:
|
| 331 |
+
parts.append(self.right_channel[seg.start_ms:seg.end_ms])
|
| 332 |
+
else:
|
| 333 |
+
parts.append(self.mono_channel[seg.start_ms:seg.end_ms])
|
| 334 |
+
|
| 335 |
+
combined = sum(parts)
|
| 336 |
+
|
| 337 |
+
filepath = os.path.join(output_dir, f"{speaker}_full.wav")
|
| 338 |
+
combined.export(filepath, format="wav")
|
| 339 |
+
result[speaker] = filepath
|
| 340 |
+
|
| 341 |
+
self._log(f"{speaker}: {len(combined)/1000:.2f}s ({len(segments)} segments)")
|
| 342 |
+
|
| 343 |
+
return result
|
| 344 |
+
|
| 345 |
+
def export_transcript_txt(self, output_dir: str = "output") -> str:
|
| 346 |
+
"""Export human-readable transcript."""
|
| 347 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 348 |
+
filepath = os.path.join(output_dir, "transcript.txt")
|
| 349 |
+
|
| 350 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 351 |
+
f.write("=" * 80 + "\n")
|
| 352 |
+
f.write("CALL CENTER CONVERSATION TRANSCRIPT\n")
|
| 353 |
+
f.write(f"File: {self.input_file}\n")
|
| 354 |
+
f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
| 355 |
+
f.write(f"Duration: {len(self.audio)/1000:.2f} seconds\n")
|
| 356 |
+
f.write(f"Total Segments: {len(self.timeline)}\n")
|
| 357 |
+
f.write(f"Audio Type: {'STEREO' if self.is_stereo else 'MONO'}\n")
|
| 358 |
+
|
| 359 |
+
if self.is_stereo:
|
| 360 |
+
f.write(f"CUSTOMER Channel: {self.customer_channel.upper()}\n")
|
| 361 |
+
f.write(f"AGENT Channel: {'RIGHT' if self.customer_channel == 'left' else 'LEFT'}\n")
|
| 362 |
+
|
| 363 |
+
f.write("=" * 80 + "\n\n")
|
| 364 |
+
|
| 365 |
+
for segment in self.timeline:
|
| 366 |
+
f.write(f"[Turn {segment.turn:03d}] {segment.speaker}\n")
|
| 367 |
+
f.write(f" Time: {segment.start_time} --> {segment.end_time}\n")
|
| 368 |
+
f.write(f" Duration: {segment.duration}\n")
|
| 369 |
+
if segment.audio_file:
|
| 370 |
+
f.write(f" Audio: {os.path.basename(segment.audio_file)}\n")
|
| 371 |
+
f.write("\n")
|
| 372 |
+
|
| 373 |
+
self._log(f"Transcript saved: {filepath}")
|
| 374 |
+
return filepath
|
| 375 |
+
|
| 376 |
+
def export_transcript_json(self, output_dir: str = "output") -> str:
|
| 377 |
+
"""Export structured JSON transcript."""
|
| 378 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 379 |
+
filepath = os.path.join(output_dir, "transcript.json")
|
| 380 |
+
|
| 381 |
+
data = {
|
| 382 |
+
'metadata': {
|
| 383 |
+
'input_file': self.input_file,
|
| 384 |
+
'duration_seconds': len(self.audio) / 1000,
|
| 385 |
+
'sample_rate': self.audio.frame_rate,
|
| 386 |
+
'channels': self.audio.channels,
|
| 387 |
+
'is_stereo': self.is_stereo,
|
| 388 |
+
'total_segments': len(self.timeline),
|
| 389 |
+
'analysis_date': datetime.now().isoformat(),
|
| 390 |
+
'min_silence_len_ms': self.min_silence_len,
|
| 391 |
+
'silence_thresh_db': self.silence_thresh,
|
| 392 |
+
},
|
| 393 |
+
'timeline': [asdict(s) for s in self.timeline]
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
if self.is_stereo:
|
| 397 |
+
data['metadata']['customer_channel'] = self.customer_channel
|
| 398 |
+
data['metadata']['agent_channel'] = 'right' if self.customer_channel == 'left' else 'left'
|
| 399 |
+
|
| 400 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 401 |
+
json.dump(data, f, indent=2, ensure_ascii=False)
|
| 402 |
+
|
| 403 |
+
self._log(f"JSON saved: {filepath}")
|
| 404 |
+
return filepath
|
| 405 |
+
|
| 406 |
+
def export_transcript_rttm(self, output_dir: str = "output") -> str:
|
| 407 |
+
"""Export RTTM format transcript (pyannote compatible)."""
|
| 408 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 409 |
+
filepath = os.path.join(output_dir, "transcript.rttm")
|
| 410 |
+
|
| 411 |
+
with open(filepath, 'w', encoding='utf-8') as f:
|
| 412 |
+
for segment in self.timeline:
|
| 413 |
+
start_sec = segment.start_ms / 1000
|
| 414 |
+
duration_sec = segment.duration_ms / 1000
|
| 415 |
+
f.write(f"SPEAKER {os.path.basename(self.input_file)} 1 "
|
| 416 |
+
f"{start_sec:.3f} {duration_sec:.3f} <NA> <NA> "
|
| 417 |
+
f"{segment.speaker} <NA> <NA>\n")
|
| 418 |
+
|
| 419 |
+
self._log(f"RTTM saved: {filepath}")
|
| 420 |
+
return filepath
|
| 421 |
+
|
| 422 |
+
def get_statistics(self) -> Dict:
|
| 423 |
+
"""Calculate and return statistics about the conversation."""
|
| 424 |
+
if self.is_stereo:
|
| 425 |
+
customer_segments = [s for s in self.timeline if s.speaker == 'CUSTOMER']
|
| 426 |
+
agent_segments = [s for s in self.timeline if s.speaker == 'AGENT']
|
| 427 |
+
|
| 428 |
+
customer_duration = sum(s.duration_ms for s in customer_segments) / 1000
|
| 429 |
+
agent_duration = sum(s.duration_ms for s in agent_segments) / 1000
|
| 430 |
+
total_speech = customer_duration + agent_duration
|
| 431 |
+
else:
|
| 432 |
+
speaker_segments = [s for s in self.timeline if s.speaker == 'SPEAKER']
|
| 433 |
+
total_speech = sum(s.duration_ms for s in speaker_segments) / 1000
|
| 434 |
+
|
| 435 |
+
total_duration = len(self.audio) / 1000
|
| 436 |
+
silence_duration = total_duration - total_speech
|
| 437 |
+
|
| 438 |
+
stats = {
|
| 439 |
+
'total_duration': total_duration,
|
| 440 |
+
'total_speech': total_speech,
|
| 441 |
+
'silence_duration': silence_duration,
|
| 442 |
+
'silence_percentage': (silence_duration / total_duration) * 100 if total_duration > 0 else 0,
|
| 443 |
+
'is_stereo': self.is_stereo
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
if self.is_stereo:
|
| 447 |
+
stats['customer'] = {
|
| 448 |
+
'segments': len(customer_segments),
|
| 449 |
+
'duration': customer_duration,
|
| 450 |
+
'percentage': (customer_duration / total_duration) * 100 if total_duration > 0 else 0,
|
| 451 |
+
'avg_segment': customer_duration / len(customer_segments) if customer_segments else 0
|
| 452 |
+
}
|
| 453 |
+
stats['agent'] = {
|
| 454 |
+
'segments': len(agent_segments),
|
| 455 |
+
'duration': agent_duration,
|
| 456 |
+
'percentage': (agent_duration / total_duration) * 100 if total_duration > 0 else 0,
|
| 457 |
+
'avg_segment': agent_duration / len(agent_segments) if agent_segments else 0
|
| 458 |
+
}
|
| 459 |
+
else:
|
| 460 |
+
stats['speaker'] = {
|
| 461 |
+
'segments': len(speaker_segments),
|
| 462 |
+
'duration': total_speech,
|
| 463 |
+
'percentage': (total_speech / total_duration) * 100 if total_duration > 0 else 0,
|
| 464 |
+
'avg_segment': total_speech / len(speaker_segments) if speaker_segments else 0
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
return stats
|
| 468 |
+
|
| 469 |
+
def process(self, output_dir: str = "output", export_segments: bool = True,
|
| 470 |
+
export_full: bool = True, export_transcripts: bool = True) -> Dict:
|
| 471 |
+
"""
|
| 472 |
+
Run complete diarization pipeline.
|
| 473 |
+
|
| 474 |
+
Args:
|
| 475 |
+
output_dir: Directory for all outputs
|
| 476 |
+
export_segments: Whether to export individual segment files
|
| 477 |
+
export_full: Whether to export full speaker audio files
|
| 478 |
+
export_transcripts: Whether to export transcript files
|
| 479 |
+
|
| 480 |
+
Returns:
|
| 481 |
+
Dictionary with results and file paths
|
| 482 |
+
"""
|
| 483 |
+
self._log("=" * 80)
|
| 484 |
+
self._log("AUDIO DIARIZATION - PROCESSING")
|
| 485 |
+
self._log("=" * 80)
|
| 486 |
+
|
| 487 |
+
# Load and process
|
| 488 |
+
self.load_audio()
|
| 489 |
+
left_seg, right_seg = self.detect_speech_segments()
|
| 490 |
+
self.create_timeline(left_seg, right_seg)
|
| 491 |
+
|
| 492 |
+
# Export results
|
| 493 |
+
results = {
|
| 494 |
+
'is_stereo': self.is_stereo,
|
| 495 |
+
'timeline': self.timeline,
|
| 496 |
+
'statistics': self.get_statistics(),
|
| 497 |
+
'files': {}
|
| 498 |
+
}
|
| 499 |
+
|
| 500 |
+
if export_segments:
|
| 501 |
+
results['files']['segments'] = self.export_segments(output_dir)
|
| 502 |
+
|
| 503 |
+
if export_full:
|
| 504 |
+
results['files']['full_speakers'] = self.export_full_speakers(output_dir)
|
| 505 |
+
|
| 506 |
+
if export_transcripts:
|
| 507 |
+
results['files']['transcript_txt'] = self.export_transcript_txt(output_dir)
|
| 508 |
+
results['files']['transcript_json'] = self.export_transcript_json(output_dir)
|
| 509 |
+
results['files']['transcript_rttm'] = self.export_transcript_rttm(output_dir)
|
| 510 |
+
|
| 511 |
+
self._log("=" * 80)
|
| 512 |
+
self._log("COMPLETED")
|
| 513 |
+
self._log("=" * 80)
|
| 514 |
+
|
| 515 |
+
return results
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
# Convenience function for simple usage
|
| 519 |
+
def diarize_call(input_file: str, output_dir: str = "output",
                 min_silence_len: int = 500, silence_thresh: int = -40,
                 verbose: bool = True) -> Dict:
    """
    Run the full diarization pipeline on a single call recording.

    Convenience wrapper around StereoCallDiarizer: construct the diarizer
    with the given tuning parameters and execute its default pipeline.
    Works for both stereo and mono inputs.

    Args:
        input_file: Path to the audio file to analyze
        output_dir: Directory where all outputs are written
        min_silence_len: Minimum silence duration (ms) used to split segments
        silence_thresh: Level (dB) below which audio counts as silence
        verbose: Enable progress logging

    Returns:
        Dictionary with the timeline, statistics, and generated file paths
    """
    pipeline = StereoCallDiarizer(input_file, min_silence_len, silence_thresh, verbose)
    return pipeline.process(output_dir)
|
| 537 |
+
|
| 538 |
+
|
| 539 |
+
if __name__ == "__main__":
    # Demo: diarize a sample recording and print a short summary.
    result = diarize_call(
        input_file="call.wav",
        output_dir="output",
        min_silence_len=500,
        silence_thresh=-40
    )

    stats = result['statistics']

    print("\nStatistics:")
    print(f"Audio type: {'STEREO' if result['is_stereo'] else 'MONO'}")
    print(f"Total duration: {stats['total_duration']:.2f}s")

    # Per-speaker breakdown differs between stereo and mono results
    if result['is_stereo']:
        print(f"Customer: {stats['customer']['duration']:.2f}s")
        print(f"Agent: {stats['agent']['duration']:.2f}s")
    else:
        print(f"Speaker: {stats['speaker']['duration']:.2f}s")
|
whisper_transcriber.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Whisper Speech-to-Text Transcription Module
|
| 3 |
+
|
| 4 |
+
Uses quantized Whisper model for CPU-optimized transcription.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import json
|
| 9 |
+
import time
|
| 10 |
+
from typing import List, Dict, Optional
|
| 11 |
+
from dataclasses import dataclass, asdict
|
| 12 |
+
import torch
|
| 13 |
+
import librosa
|
| 14 |
+
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class TranscribedSegment:
    """Represents a transcribed audio segment.

    Mirrors one diarizer timeline entry plus the Whisper output for it;
    serialized as-is (via asdict) into transcription.json.
    """
    turn: int                # 1-based position of the segment in the conversation
    speaker: str             # speaker label from the diarizer (e.g. CUSTOMER/AGENT/SPEAKER)
    start_time: str          # segment start, HH:MM:SS.mmm
    end_time: str            # segment end, HH:MM:SS.mmm
    duration: str            # segment length, HH:MM:SS.mmm
    text: str                # transcription text produced by the Whisper model
    audio_file: str          # basename of the per-segment audio file
    inference_time: float    # seconds spent transcribing this segment (rounded to 2 dp)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class WhisperTranscriber:
    """
    Whisper-based speech-to-text transcription system.

    Optimized for CPU inference: when running on CPU the model's Linear
    layers are dynamically quantized to int8.
    """

    def __init__(self, model_path: str, device: str = "cpu", verbose: bool = True):
        """
        Initialize Whisper transcriber.

        Args:
            model_path: Path to Whisper model checkpoint
            device: Device for inference ('cpu' or 'cuda')
            verbose: Enable logging
        """
        self.model_path = model_path
        self.device = device
        self.verbose = verbose

        self._log("Loading Whisper model...")
        self._log(f"Model: {model_path}")
        self._log(f"Device: {device}")

        # Load processor and model
        self.processor = WhisperProcessor.from_pretrained(model_path)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_path)
        self.model.to(device)
        self.model.eval()

        # Dynamic int8 quantization of Linear layers speeds up CPU inference
        if device == "cpu":
            self.model = torch.quantization.quantize_dynamic(
                self.model, {torch.nn.Linear}, dtype=torch.qint8
            )

        self._log("Model loaded successfully")

    def _log(self, message: str):
        """Print message when verbose mode is enabled."""
        if self.verbose:
            print(message)

    def transcribe_audio(self, audio_path: str) -> tuple[str, float]:
        """
        Transcribe a single audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            Tuple of (transcription_text, inference_time_seconds)
        """
        start_time = time.time()

        # Load and resample audio; Whisper expects 16 kHz input
        audio, _ = librosa.load(audio_path, sr=16000)

        # Process audio into model input features
        inputs = self.processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features.to(self.device)

        # Generate transcription (beam search, forced Azerbaijani transcribe task)
        with torch.no_grad():
            predicted_ids = self.model.generate(
                inputs,
                max_length=448,
                num_beams=5,
                language="az",
                task="transcribe"
            )

        # Decode token ids back to text
        transcription = self.processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]

        inference_time = time.time() - start_time

        return transcription.strip(), inference_time

    def transcribe_segments(self, segment_files: List[str],
                            timeline: List) -> List[TranscribedSegment]:
        """
        Transcribe multiple audio segments.

        Args:
            segment_files: List of audio file paths
            timeline: List of segment metadata from diarizer (parallel to
                segment_files; each entry must expose turn/speaker/start_time/
                end_time/duration attributes)

        Returns:
            List of TranscribedSegment objects
        """
        transcriptions = []
        total_time = 0.0

        if len(segment_files) != len(timeline):
            # zip() below would silently drop the extras - surface the mismatch
            self._log(f"Warning: {len(segment_files)} segment files vs "
                      f"{len(timeline)} timeline entries; extra items are ignored")

        self._log(f"\nTranscribing {len(segment_files)} segments...")

        for i, (seg_file, seg_info) in enumerate(zip(segment_files, timeline)):
            self._log(f"  [{i+1}/{len(segment_files)}] {os.path.basename(seg_file)}")

            text, inf_time = self.transcribe_audio(seg_file)
            total_time += inf_time

            transcriptions.append(TranscribedSegment(
                turn=seg_info.turn,
                speaker=seg_info.speaker,
                start_time=seg_info.start_time,
                end_time=seg_info.end_time,
                duration=seg_info.duration,
                text=text,
                audio_file=os.path.basename(seg_file),
                inference_time=round(inf_time, 2)
            ))

        self._log(f"Total transcription time: {total_time:.2f}s")

        return transcriptions

    def export_transcription(self, transcriptions: List[TranscribedSegment],
                             output_dir: str) -> Dict[str, str]:
        """
        Export transcriptions to files.

        Writes transcription.json (structured) and conversation.txt
        (human-readable dialogue) into output_dir.

        Args:
            transcriptions: List of TranscribedSegment objects
            output_dir: Output directory (created if missing)

        Returns:
            Dictionary of created file paths ('json' and 'conversation')
        """
        os.makedirs(output_dir, exist_ok=True)
        files = {}

        # Export JSON
        json_path = os.path.join(output_dir, "transcription.json")
        data = {
            'total_segments': len(transcriptions),
            'transcriptions': [asdict(t) for t in transcriptions]
        }
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        files['json'] = json_path

        # Export conversation text
        conv_path = os.path.join(output_dir, "conversation.txt")
        with open(conv_path, 'w', encoding='utf-8') as f:
            for t in transcriptions:
                f.write(f"[{t.start_time}] {t.speaker}: {t.text}\n")
        files['conversation'] = conv_path

        self._log(f"Transcription exported to {output_dir}")

        return files
|