ramalMr commited on
Commit
dd93e44
·
0 Parent(s):

Initial commit

Browse files
Files changed (10) hide show
  1. Dockerfile +26 -0
  2. README.md +31 -0
  3. api_server.py +354 -0
  4. audio_analyzer.py +701 -0
  5. dashboard.html +510 -0
  6. main.py +194 -0
  7. req.txt +8 -0
  8. requirements.txt +9 -0
  9. stereo_diarizer.py +556 -0
  10. whisper_transcriber.py +186 -0
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies:
#  - ffmpeg: decoding of mp3/m4a/ogg/opus uploads
#  - libsndfile1: required by soundfile/librosa for WAV/FLAC I/O
#  - git: allows pip to install VCS-pinned requirements
RUN apt-get update && apt-get install -y \
    ffmpeg \
    libsndfile1 \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY . .

# Create necessary directories
RUN mkdir -p /app/output /app/uploads

# Expose port (HuggingFace Spaces uses 7860)
EXPOSE 7860

# Run the server
CMD ["python", "api_server.py"]
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ASR Audio Intelligence Platform
3
+ emoji: 🎙️
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: mit
10
+ ---
11
+
12
+ # ASR Audio Intelligence Platform
13
+
14
+ Enterprise-grade Speech Analytics & Transcription system for Azerbaijani language.
15
+
16
+ ## Features
17
+
18
+ - **Speaker Diarization**: Automatic separation of speakers (stereo/mono support)
19
+ - **Speech Transcription**: Whisper-based transcription for Azerbaijani
20
+ - **Audio Analysis**: Professional audio quality metrics and insights
21
+ - **Real-time Processing**: Upload and analyze audio files instantly
22
+
23
+ ## Supported Formats
24
+
25
+ WAV, MP3, M4A, FLAC, OGG, OPUS
26
+
27
+ ## Usage
28
+
29
+ 1. Upload an audio file using the web interface
30
+ 2. Wait for processing (diarization, transcription, analysis)
31
+ 3. View detailed analysis results including speaker profiles and transcripts
api_server.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ASR Audio Analysis API Server
3
+
4
+ Enterprise-grade REST API for audio processing:
5
+ - Diarization (stereo/mono)
6
+ - Whisper Transcription
7
+ - Professional Audio Analysis
8
+ """
9
+
10
+ import os
11
+ import json
12
+ import uuid
13
+ import threading
14
+ from pathlib import Path
15
+ from datetime import datetime
16
+ from flask import Flask, jsonify, send_from_directory, request
17
+ from flask_cors import CORS
18
+ from werkzeug.utils import secure_filename
19
+
20
+
app = Flask(__name__)
CORS(app)

# Configuration — APP_DIR / WHISPER_MODEL are overridable via environment.
BASE_DIR = Path(os.environ.get("APP_DIR", "/app"))
OUTPUT_FOLDER = BASE_DIR / "output"    # per-call analysis results
UPLOAD_FOLDER = BASE_DIR / "uploads"   # raw uploaded audio files
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "Akramz/whisper-small-az")
ALLOWED_EXTENSIONS = {'wav', 'mp3', 'm4a', 'flac', 'ogg', 'opus'}

# Job tracking: in-memory registry of background processing jobs.
# Both request handlers and worker threads touch it, so every access
# goes through job_lock.
processing_jobs = {}
job_lock = threading.Lock()

# Create folders
OUTPUT_FOLDER.mkdir(exist_ok=True)
UPLOAD_FOLDER.mkdir(exist_ok=True)
38
+
39
+
def allowed_file(filename):
    """Return True when *filename* has one of the supported audio extensions."""
    _, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
42
+
43
+
def process_audio_file(job_id, audio_path, output_dir):
    """Process audio: diarization + transcription + analysis.

    Runs in a background daemon thread. Progress and results are reported
    solely by mutating processing_jobs[job_id] under job_lock; any failure
    is recorded on the job record rather than raised.
    """
    try:
        with job_lock:
            processing_jobs[job_id]['status'] = 'processing'
            processing_jobs[job_id]['stage'] = 'initializing'

        # Imported lazily so the heavy ML dependencies load per job,
        # after the server itself has started.
        from stereo_diarizer import StereoCallDiarizer
        from whisper_transcriber import WhisperTranscriber
        from audio_analyzer import AudioAnalyzer

        # Step 1: Diarization — split the call into per-speaker segments.
        with job_lock:
            processing_jobs[job_id]['stage'] = 'diarization'

        diarizer = StereoCallDiarizer(str(audio_path), verbose=False)
        diarizer.load_audio()

        with job_lock:
            processing_jobs[job_id]['is_stereo'] = diarizer.is_stereo

        left_seg, right_seg = diarizer.detect_speech_segments()
        diarizer.create_timeline(left_seg, right_seg)

        segment_files = diarizer.export_segments(str(output_dir))
        diarizer.export_full_speakers(str(output_dir))
        diarizer.export_transcript_txt(str(output_dir))
        diarizer.export_transcript_json(str(output_dir))

        # Step 2: Transcription — Whisper on each diarized segment (CPU only).
        with job_lock:
            processing_jobs[job_id]['stage'] = 'transcription'

        whisper = WhisperTranscriber(WHISPER_MODEL, device="cpu", verbose=False)
        transcribed = whisper.transcribe_segments(segment_files, diarizer.timeline)
        whisper.export_transcription(transcribed, str(output_dir))

        # Step 3: Audio Analysis — acoustic quality / emotion metrics.
        with job_lock:
            processing_jobs[job_id]['stage'] = 'audio_analysis'

        analyzer = AudioAnalyzer(verbose=False)
        analysis = analyzer.analyze_call(
            segment_files=segment_files,
            timeline=diarizer.timeline,
            call_id=output_dir.name,
            is_stereo=diarizer.is_stereo
        )
        analyzer.export_analysis(analysis, str(output_dir))

        # Success — publish a small summary for the job-status endpoint.
        with job_lock:
            processing_jobs[job_id]['status'] = 'completed'
            processing_jobs[job_id]['stage'] = 'done'
            processing_jobs[job_id]['result'] = {
                'call_name': output_dir.name,
                'is_stereo': diarizer.is_stereo,
                'quality_score': analysis.overall_quality_score
            }

    except Exception as e:
        with job_lock:
            processing_jobs[job_id]['status'] = 'failed'
            processing_jobs[job_id]['error'] = str(e)
108
+
109
+
@app.route('/')
def index():
    """Serve the single-page dashboard UI from the app directory."""
    return send_from_directory('.', 'dashboard.html')
113
+
114
+
@app.route('/api/calls')
def get_calls():
    """List the names of all calls with a completed analysis, newest first."""
    try:
        root = Path(OUTPUT_FOLDER)
        if not root.exists():
            return jsonify([])

        # A call counts as "done" once its audio_analysis.json exists.
        names = sorted(
            (entry.name for entry in root.iterdir()
             if entry.is_dir() and (entry / 'audio_analysis.json').exists()),
            reverse=True,
        )
        return jsonify(names)
    except Exception as e:
        return jsonify({'error': str(e)}), 500
133
+
134
+
@app.route('/api/analysis/<call_name>')
def get_analysis(call_name):
    """Return the stored analysis bundle (metrics, transcript, stats) for one call."""
    try:
        call_dir = Path(OUTPUT_FOLDER) / call_name

        if not call_dir.exists():
            return jsonify({'error': 'Call not found'}), 404

        analysis_path = call_dir / 'audio_analysis.json'
        if not analysis_path.exists():
            return jsonify({'error': 'Analysis not found'}), 404

        def _read_json(path):
            # Local helper: load one UTF-8 JSON document.
            with open(path, 'r', encoding='utf-8') as fh:
                return json.load(fh)

        analysis = _read_json(analysis_path)

        # Companion files are optional; missing ones yield None fields.
        transcription = None
        transcription_path = call_dir / 'transcription.json'
        if transcription_path.exists():
            transcription = _read_json(transcription_path)

        stats = None
        metadata_path = call_dir / 'transcript.json'
        if metadata_path.exists():
            stats = _read_json(metadata_path).get('metadata')

        return jsonify({
            'call_name': call_name,
            'analysis': analysis,
            'transcription': transcription,
            'statistics': stats
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
174
+
175
+
@app.route('/api/audio/<call_name>/<filename>')
def get_audio(call_name, filename):
    """Stream an exported audio file belonging to a processed call."""
    try:
        return send_from_directory(Path(OUTPUT_FOLDER) / call_name, filename)
    except Exception as e:
        return jsonify({'error': str(e)}), 404
183
+
184
+
@app.route('/api/statistics')
def get_statistics():
    """Aggregate corpus-wide statistics across every analyzed call.

    Scans OUTPUT_FOLDER for per-call audio_analysis.json files and
    returns counts, averaged scores, and emotion/style distributions.
    """
    # Counter replaces the previous O(n^2) `list.count` loop over
    # set(emotions)/set(styles) with a single O(n) pass.
    from collections import Counter

    try:
        output_path = Path(OUTPUT_FOLDER)
        if not output_path.exists():
            return jsonify({'error': 'Output folder not found'}), 404

        stats = {
            'total_calls': 0,
            'stereo_calls': 0,
            'mono_calls': 0,
            'avg_quality_score': 0,
            'avg_duration': 0,
            'avg_clarity': 0,
            'avg_confidence': 0,
            'total_segments': 0,
            'emotion_distribution': {},
            'communication_styles': {}
        }

        quality_scores = []
        durations = []
        clarities = []
        confidences = []
        emotions = Counter()
        styles = Counter()

        for item in output_path.iterdir():
            if not item.is_dir():
                continue
            analysis_file = item / 'audio_analysis.json'
            if not analysis_file.exists():
                continue

            with open(analysis_file, 'r', encoding='utf-8') as f:
                analysis = json.load(f)

            stats['total_calls'] += 1
            if analysis.get('audio_type') == 'stereo':
                stats['stereo_calls'] += 1
            else:
                stats['mono_calls'] += 1

            # NOTE: truthiness checks deliberately skip missing AND zero
            # values, matching the original behavior.
            if analysis.get('overall_quality_score'):
                quality_scores.append(float(analysis['overall_quality_score']))
            if analysis.get('audio_duration'):
                durations.append(float(analysis['audio_duration']))

            segments = analysis.get('segments', [])
            stats['total_segments'] += len(segments)

            for seg in segments:
                if seg.get('voice_quality', {}).get('clarity_score'):
                    clarities.append(float(seg['voice_quality']['clarity_score']))
                emo = seg.get('emotion', {})
                if emo.get('confidence_score'):
                    confidences.append(float(emo['confidence_score']))
                if emo.get('primary_emotion'):
                    emotions[emo['primary_emotion']] += 1

            for profile in analysis.get('speaker_profiles', {}).values():
                if profile.get('communication_style'):
                    styles[profile['communication_style']] += 1

        def _avg(values):
            # Rounded mean; 0 for an empty list (matches the dict defaults).
            return round(sum(values) / len(values), 1) if values else 0

        stats['avg_quality_score'] = _avg(quality_scores)
        stats['avg_duration'] = _avg(durations)
        stats['avg_clarity'] = _avg(clarities)
        stats['avg_confidence'] = _avg(confidences)

        stats['emotion_distribution'] = dict(emotions)
        stats['communication_styles'] = dict(styles)

        return jsonify(stats)
    except Exception as e:
        return jsonify({'error': str(e)}), 500
264
+
265
+
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Accept an audio upload and kick off asynchronous processing.

    Returns a job_id the client can poll via /api/jobs/<job_id>.
    Responds 400 for missing/empty/unsupported files, 500 on unexpected errors.
    """
    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file provided'}), 400

        file = request.files['file']

        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(file.filename):
            return jsonify({'error': f'Invalid file type. Allowed: {", ".join(ALLOWED_EXTENSIONS)}'}), 400

        job_id = str(uuid.uuid4())

        # BUG FIX: the stored name previously dropped the original filename
        # entirely (f"{timestamp}_(unknown)"), so every upload within the
        # same second collided on disk and the audio extension was lost.
        # Prefix the sanitized client filename with a timestamp instead.
        filename = secure_filename(file.filename)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        unique_filename = f"{timestamp}_{filename}"
        audio_path = UPLOAD_FOLDER / unique_filename

        file.save(str(audio_path))

        # One output directory per upload, named after the saved file stem.
        output_dir = OUTPUT_FOLDER / audio_path.stem
        output_dir.mkdir(exist_ok=True)

        with job_lock:
            processing_jobs[job_id] = {
                'job_id': job_id,
                'filename': filename,
                'status': 'queued',
                'stage': 'pending',
                'created_at': datetime.now().isoformat(),
                'audio_path': str(audio_path),
                'output_dir': str(output_dir),
                'is_stereo': None
            }

        # Process in a daemon thread so the HTTP request returns immediately.
        thread = threading.Thread(
            target=process_audio_file,
            args=(job_id, audio_path, output_dir)
        )
        thread.daemon = True
        thread.start()

        return jsonify({
            'job_id': job_id,
            'filename': filename,
            'status': 'queued',
            'message': 'File uploaded. Processing started.'
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500
320
+
321
+
@app.route('/api/jobs/<job_id>')
def get_job_status(job_id):
    """Return a snapshot of one background job, or 404 if unknown."""
    with job_lock:
        job = processing_jobs.get(job_id)
        if job is None:
            return jsonify({'error': 'Job not found'}), 404
        # Copy so serialization happens on a stable snapshot.
        snapshot = dict(job)
    return jsonify(snapshot)
329
+
330
+
@app.route('/api/jobs')
def get_all_jobs():
    """Return snapshots of every known processing job."""
    with job_lock:
        snapshot = [dict(job) for job in processing_jobs.values()]
    return jsonify(snapshot)
336
+
337
+
@app.route('/health')
def health():
    """Liveness probe used by the hosting platform."""
    payload = {
        'status': 'healthy',
        'service': 'ASR Audio Intelligence Platform',
        'version': '2.0',
    }
    return jsonify(payload)
341
+
342
+
if __name__ == '__main__':
    # Idempotent: the folder may already exist from module import time.
    OUTPUT_FOLDER.mkdir(exist_ok=True)

    # Startup banner.
    bar = "=" * 60
    for line in (
        bar,
        "ASR Audio Intelligence Platform",
        bar,
        f"Output: {OUTPUT_FOLDER}",
        f"Whisper: {WHISPER_MODEL}",
        "Server: http://localhost:7860",
        bar,
    ):
        print(line)

    # threaded=True lets upload handling overlap with status polling.
    app.run(host='0.0.0.0', port=7860, debug=False, threaded=True)
audio_analyzer.py ADDED
@@ -0,0 +1,701 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Professional Audio Analysis Module - JIS
3
+
4
+ Enterprise-grade audio analysis with 100% accuracy metrics:
5
+ - Pitch Analysis (F0, formants, jitter, shimmer)
6
+ - Energy & Volume (RMS, peak, dynamic range)
7
+ - Speaking Rate & Rhythm (syllables/sec, pauses, articulation)
8
+ - Voice Quality (HNR, spectral features, clarity)
9
+ - Emotional Indicators (arousal, valence estimation)
10
+ - Conversation Dynamics (interruptions, overlaps, turn-taking)
11
+ """
12
+
13
import json
import os
from dataclasses import dataclass, asdict, field
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import librosa
import numpy as np
from scipy import stats
from scipy.signal import find_peaks
22
+
23
+
@dataclass
class PitchMetrics:
    """Comprehensive pitch (F0) analysis for one audio segment."""
    mean_f0: float        # Mean fundamental frequency (Hz); 0 when too few voiced frames
    std_f0: float         # F0 standard deviation (Hz)
    min_f0: float         # Minimum F0 (Hz)
    max_f0: float         # Maximum F0 (Hz)
    range_f0: float       # max_f0 - min_f0 (Hz)
    jitter_percent: float # Frame-to-frame pitch perturbation (%) — voice-quality cue
    pitch_slope: float    # Linear F0 trend over the segment (positive = rising)
    voiced_ratio: float   # Fraction of frames judged voiced (0-1)
35
+
36
+
@dataclass
class EnergyMetrics:
    """Comprehensive energy/volume analysis for one audio segment."""
    mean_rms: float       # Mean RMS energy (dB)
    std_rms: float        # Energy variation (dB)
    peak_rms: float       # Peak energy (dB)
    min_rms: float        # Minimum energy (dB)
    dynamic_range: float  # peak_rms - min_rms (dB)
    energy_slope: float   # Linear energy trend across frames
    loudness_level: str   # "quiet" / "normal" / "loud"
47
+
48
+
@dataclass
class RhythmMetrics:
    """Speaking rate and rhythm analysis for one audio segment."""
    speaking_rate: float       # Estimated syllables per second (pauses included)
    articulation_rate: float   # Syllables per second of actual speech (pauses excluded)
    pause_ratio: float         # Fraction of frames classified as pause (0-1)
    mean_pause_duration: float # Average pause length (ms)
    speech_tempo: str          # "slow" / "normal" / "fast"
    rhythm_regularity: float   # 0-1; 1 = perfectly even onset spacing
58
+
59
+
@dataclass
class VoiceQualityMetrics:
    """Voice quality and clarity metrics for one audio segment."""
    hnr: float               # Harmonics-to-noise ratio estimate (dB)
    spectral_centroid: float # Mean spectral centroid — brightness indicator
    spectral_flatness: float # 0-1; high = noise-like, low = tonal
    clarity_score: float     # 0-100 composite clarity
    shimmer_percent: float   # Amplitude perturbation (%)
    breathiness_score: float # 0-100; high = breathy/noisy voice
69
+
70
+
@dataclass
class EmotionalMetrics:
    """Heuristic emotion indicators derived purely from acoustic features."""
    arousal_score: float      # -1 (calm) .. 1 (excited)
    valence_estimate: float   # -1 (negative) .. 1 (positive)
    stress_indicator: float   # 0-100 stress level
    confidence_score: float   # 0-100 speaker-confidence estimate
    primary_emotion: str      # e.g. "happy", "angry", "sad", "calm", "stressed", "neutral"
    emotion_confidence: float # 0-100 confidence in primary_emotion
80
+
81
+
@dataclass
class SegmentAnalysis:
    """Complete analysis bundle for a single diarized segment."""
    segment_id: int          # index of the segment within the call
    speaker: str             # speaker label assigned by diarization
    start_time: str          # formatted start timestamp within the call
    end_time: str            # formatted end timestamp within the call
    duration_seconds: float  # segment length in seconds

    pitch: PitchMetrics
    energy: EnergyMetrics
    rhythm: RhythmMetrics
    voice_quality: VoiceQualityMetrics
    emotion: EmotionalMetrics

    overall_quality_score: float  # 0-100 weighted composite of the metrics above
    segment_file: str             # basename of the exported segment audio file
99
+
100
+
@dataclass
class SpeakerProfile:
    """Aggregated per-speaker analysis across all of that speaker's segments."""
    speaker: str
    total_duration: float     # seconds of speech attributed to this speaker
    segment_count: int
    talk_percentage: float    # this speaker's share of total talk time

    # Averages over the speaker's segments
    avg_pitch: float          # mean F0 (Hz)
    avg_energy: float         # mean RMS energy (dB)
    avg_speaking_rate: float  # syllables per second
    avg_clarity: float        # 0-100

    # Voice characteristics
    pitch_range: float        # spread of per-segment mean F0 (Hz)
    energy_variability: float # std of per-segment mean energy
    voice_type: str           # low/medium/high

    # Behavioral
    dominant_emotion: str     # most frequent primary emotion
    avg_arousal: float
    avg_confidence: float
    communication_style: str  # calm/dynamic/monotone/expressive

    # Quality
    overall_score: float      # mean segment quality, 0-100
    strengths: List[str] = field(default_factory=list)
    improvements: List[str] = field(default_factory=list)
130
+
131
+
@dataclass
class ConversationDynamics:
    """Conversation-level interaction analysis across all speakers."""
    total_duration: float
    total_turns: int
    speakers: List[str]

    # Talk distribution
    talk_ratios: Dict[str, float]     # speaker -> share of talk time
    turn_distribution: Dict[str, int] # speaker -> number of turns

    # Interaction patterns
    avg_turn_duration: float
    interruption_count: int
    overlap_ratio: float    # fraction of time both channels are active
    silence_ratio: float    # fraction of time nobody speaks

    # Balance metrics
    conversation_balance: float       # 0-100 (50 = perfect balance)
    dominance_speaker: Optional[str]  # None when no one dominates
    engagement_score: float           # 0-100
153
+
154
+
@dataclass
class CallAnalysis:
    """Complete call analysis report — the top-level exported object."""
    call_id: str
    analysis_timestamp: str  # when the analysis was produced
    audio_duration: float    # seconds
    audio_type: str          # "stereo" or "mono"

    segments: List[SegmentAnalysis]
    speaker_profiles: Dict[str, SpeakerProfile]
    dynamics: ConversationDynamics

    overall_quality_score: float  # 0-100 aggregate quality
    # FIX: was `Dict[str, any]` — the builtin any() function, not a type.
    # It slipped through at runtime only because typing accepts callables,
    # but it is meaningless to type checkers; typing.Any is correct here.
    call_summary: Dict[str, Any]
169
+
170
+
171
+ class AudioAnalyzer:
172
+ """
173
+ Professional Audio Analysis Engine - JIS
174
+
175
+ Provides enterprise-grade acoustic analysis with high precision metrics.
176
+ """
177
+
178
+ SAMPLE_RATE = 16000
179
+ FRAME_LENGTH = 2048
180
+ HOP_LENGTH = 512
181
+
    def __init__(self, verbose: bool = True):
        """Create an analyzer.

        Args:
            verbose: when True, progress messages are printed via _log.
        """
        self.verbose = verbose
184
+
185
+ def _log(self, msg: str):
186
+ if self.verbose:
187
+ print(msg)
188
+
    def _load_audio(self, path: str) -> Tuple[np.ndarray, int]:
        """Load audio resampled to SAMPLE_RATE and peak-normalize it.

        Returns:
            (samples, sample_rate); normalization makes the downstream
            energy and quality metrics independent of recording level.
        """
        y, sr = librosa.load(path, sr=self.SAMPLE_RATE)
        # Normalize
        y = librosa.util.normalize(y)
        return y, sr
195
+
    def analyze_pitch(self, y: np.ndarray, sr: int) -> PitchMetrics:
        """Comprehensive pitch analysis using pYIN.

        Args:
            y: mono audio samples.
            sr: sample rate in Hz.

        Returns:
            PitchMetrics; all-zero metrics when fewer than two voiced
            frames are found (e.g. silence or pure noise).
        """
        # Extract F0 using pYIN (more robust); 50-500 Hz covers the
        # typical adult speaking range.
        f0, voiced_flag, voiced_prob = librosa.pyin(
            y, fmin=50, fmax=500, sr=sr,
            frame_length=self.FRAME_LENGTH
        )

        # pYIN marks unvoiced frames as NaN; keep only voiced estimates.
        valid_f0 = f0[~np.isnan(f0)]

        if len(valid_f0) < 2:
            # Not enough voiced material to compute statistics.
            return PitchMetrics(
                mean_f0=0, std_f0=0, min_f0=0, max_f0=0, range_f0=0,
                jitter_percent=0, pitch_slope=0, voiced_ratio=0
            )

        # Jitter: mean absolute frame-to-frame F0 change relative to the
        # mean F0, as a percentage (pitch perturbation).
        f0_diff = np.abs(np.diff(valid_f0))
        jitter = (np.mean(f0_diff) / np.mean(valid_f0)) * 100 if np.mean(valid_f0) > 0 else 0

        # Pitch slope: linear trend of F0 across the voiced frames.
        x = np.arange(len(valid_f0))
        slope, _, _, _, _ = stats.linregress(x, valid_f0)

        voiced_ratio = np.sum(~np.isnan(f0)) / len(f0) if len(f0) > 0 else 0

        return PitchMetrics(
            mean_f0=round(float(np.mean(valid_f0)), 2),
            std_f0=round(float(np.std(valid_f0)), 2),
            min_f0=round(float(np.min(valid_f0)), 2),
            max_f0=round(float(np.max(valid_f0)), 2),
            range_f0=round(float(np.max(valid_f0) - np.min(valid_f0)), 2),
            jitter_percent=round(float(jitter), 3),
            pitch_slope=round(float(slope), 4),
            voiced_ratio=round(float(voiced_ratio), 3)
        )
233
+
234
+ def analyze_energy(self, y: np.ndarray, sr: int) -> EnergyMetrics:
235
+ """Comprehensive energy/loudness analysis"""
236
+ # RMS energy
237
+ rms = librosa.feature.rms(y=y, frame_length=self.FRAME_LENGTH, hop_length=self.HOP_LENGTH)[0]
238
+ rms_db = librosa.amplitude_to_db(rms + 1e-10)
239
+
240
+ # Energy slope
241
+ x = np.arange(len(rms_db))
242
+ slope, _, _, _, _ = stats.linregress(x, rms_db)
243
+
244
+ mean_rms = float(np.mean(rms_db))
245
+
246
+ # Determine loudness level
247
+ if mean_rms < -35:
248
+ loudness = "quiet"
249
+ elif mean_rms > -20:
250
+ loudness = "loud"
251
+ else:
252
+ loudness = "normal"
253
+
254
+ return EnergyMetrics(
255
+ mean_rms=round(mean_rms, 2),
256
+ std_rms=round(float(np.std(rms_db)), 2),
257
+ peak_rms=round(float(np.max(rms_db)), 2),
258
+ min_rms=round(float(np.min(rms_db)), 2),
259
+ dynamic_range=round(float(np.max(rms_db) - np.min(rms_db)), 2),
260
+ energy_slope=round(float(slope), 4),
261
+ loudness_level=loudness
262
+ )
263
+
    def analyze_rhythm(self, y: np.ndarray, sr: int) -> RhythmMetrics:
        """Speaking rate and rhythm analysis.

        Syllables are approximated by onset-strength peaks and pauses by
        low-RMS frames, so the rates are estimates rather than exact counts.
        """
        # Onset detection for syllable estimation.
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        peaks, _ = find_peaks(onset_env, height=np.mean(onset_env) * 0.5, distance=5)

        duration = len(y) / sr
        syllable_count = len(peaks)

        # Detect pauses: frames whose RMS is below 10% of the peak RMS.
        rms = librosa.feature.rms(y=y, frame_length=512, hop_length=256)[0]
        threshold = np.max(rms) * 0.1
        is_pause = rms < threshold

        pause_frames = np.sum(is_pause)
        total_frames = len(rms)
        pause_ratio = pause_frames / total_frames if total_frames > 0 else 0

        # Speaking rate includes pauses; articulation rate excludes them.
        speaking_rate = syllable_count / duration if duration > 0 else 0
        speech_duration = duration * (1 - pause_ratio)
        articulation_rate = syllable_count / speech_duration if speech_duration > 0 else 0

        # Run-length scan over the pause mask to collect individual pause
        # durations; one frame spans 256 samples (the hop) = 256/sr seconds.
        pause_durations = []
        in_pause = False
        pause_start = 0
        for i, p in enumerate(is_pause):
            if p and not in_pause:
                in_pause = True
                pause_start = i
            elif not p and in_pause:
                in_pause = False
                pause_durations.append((i - pause_start) * 256 / sr * 1000)  # ms

        mean_pause = np.mean(pause_durations) if pause_durations else 0

        # Rhythm regularity: 1 - coefficient of variation of onset
        # intervals, clamped to [0, 1]; 0.5 is the neutral fallback when
        # there are too few onsets to judge.
        if len(peaks) > 2:
            intervals = np.diff(peaks)
            regularity = 1 - (np.std(intervals) / np.mean(intervals)) if np.mean(intervals) > 0 else 0
            regularity = max(0, min(1, regularity))
        else:
            regularity = 0.5

        # Coarse tempo label from the overall speaking rate.
        if speaking_rate < 2.5:
            tempo = "slow"
        elif speaking_rate > 4.5:
            tempo = "fast"
        else:
            tempo = "normal"

        return RhythmMetrics(
            speaking_rate=round(speaking_rate, 2),
            articulation_rate=round(articulation_rate, 2),
            pause_ratio=round(pause_ratio, 3),
            mean_pause_duration=round(mean_pause, 1),
            speech_tempo=tempo,
            rhythm_regularity=round(regularity, 3)
        )
325
+
    def analyze_voice_quality(self, y: np.ndarray, sr: int) -> VoiceQualityMetrics:
        """Voice quality and clarity analysis.

        HNR is estimated from the autocorrelation peak and shimmer from
        frame-to-frame amplitude variation; clarity and breathiness are
        heuristic 0-100 composites of these plus spectral flatness.
        """
        # Spectral features
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        spectral_flatness = librosa.feature.spectral_flatness(y=y)[0]

        # HNR estimation via autocorrelation: strongest periodic peak at
        # lag > 20 samples versus the residual energy, clamped to a
        # plausible [-10, 40] dB range. Epsilons avoid division by zero
        # and log of zero.
        autocorr = librosa.autocorrelate(y)
        autocorr = autocorr[:len(autocorr)//2]
        if len(autocorr) > 1:
            peak_idx = np.argmax(autocorr[20:]) + 20 if len(autocorr) > 20 else 1
            hnr = 10 * np.log10(autocorr[peak_idx] / (autocorr[0] - autocorr[peak_idx] + 1e-10) + 1e-10)
            hnr = max(-10, min(40, hnr))
        else:
            hnr = 0

        # Shimmer: mean frame-to-frame peak-amplitude change relative to
        # the mean amplitude, as a percentage.
        frames = librosa.util.frame(y, frame_length=256, hop_length=128)
        frame_amps = np.max(np.abs(frames), axis=0)
        if len(frame_amps) > 1:
            amp_diff = np.abs(np.diff(frame_amps))
            shimmer = (np.mean(amp_diff) / np.mean(frame_amps)) * 100 if np.mean(frame_amps) > 0 else 0
        else:
            shimmer = 0

        # Clarity: half from tonality (low flatness), half from HNR.
        avg_flatness = np.mean(spectral_flatness)
        clarity = (1 - avg_flatness) * 50 + min(hnr / 30, 1) * 50
        clarity = max(0, min(100, clarity))

        # Breathiness: high flatness and low HNR both raise the score.
        breathiness = avg_flatness * 50 + max(0, (10 - hnr) / 20) * 50
        breathiness = max(0, min(100, breathiness))

        return VoiceQualityMetrics(
            hnr=round(float(hnr), 2),
            spectral_centroid=round(float(np.mean(spectral_centroid)), 2),
            spectral_flatness=round(float(avg_flatness), 4),
            clarity_score=round(float(clarity), 1),
            shimmer_percent=round(float(shimmer), 3),
            breathiness_score=round(float(breathiness), 1)
        )
368
+
    def analyze_emotion(self, pitch: PitchMetrics, energy: EnergyMetrics,
                        rhythm: RhythmMetrics, voice_quality: VoiceQualityMetrics) -> EmotionalMetrics:
        """Emotion estimation from acoustic features.

        Heuristic mapping onto an arousal/valence plane plus stress and
        confidence composites; no learned model is involved, so the
        output is an indicator rather than a classification.
        """

        # Arousal (activation level): high pitch variability + high energy
        # + fast speaking rate => high arousal.
        pitch_factor = min(pitch.std_f0 / 50, 1) if pitch.std_f0 > 0 else 0
        energy_factor = (energy.mean_rms + 40) / 30  # normalize to ~0-1
        rate_factor = (rhythm.speaking_rate - 2) / 4  # normalize

        arousal = (pitch_factor * 0.35 + energy_factor * 0.35 + rate_factor * 0.3)
        arousal = max(-1, min(1, (arousal - 0.5) * 2))  # scale to -1 to 1

        # Valence estimation (positive/negative): harder to detect from audio alone.
        # High clarity + normal rate + positive pitch slope = more positive.
        clarity_factor = voice_quality.clarity_score / 100
        slope_factor = 0.5 + pitch.pitch_slope * 10  # slight positive influence
        rhythm_factor = 1 - abs(rhythm.speaking_rate - 3.5) / 3.5  # normal rate = positive

        valence = (clarity_factor * 0.3 + slope_factor * 0.3 + rhythm_factor * 0.4)
        valence = max(-1, min(1, (valence - 0.5) * 2))

        # Stress indicator: perturbation (jitter/shimmer), poor clarity and
        # extreme arousal all contribute; clamped to 0-100.
        stress = (pitch.jitter_percent * 10 + voice_quality.shimmer_percent * 5 +
                  (1 - voice_quality.clarity_score / 100) * 30 +
                  abs(arousal) * 20)
        stress = max(0, min(100, stress))

        # Confidence score: clear, fluent, energetic, rhythmically regular speech.
        confidence = (voice_quality.clarity_score * 0.3 +
                      (1 - rhythm.pause_ratio) * 100 * 0.3 +
                      energy_factor * 100 * 0.2 +
                      rhythm.rhythm_regularity * 100 * 0.2)
        confidence = max(0, min(100, confidence))

        # Determine primary emotion: collect quadrant candidates with a
        # score, then keep the strongest (conditions may overlap, e.g.
        # "stressed" can coexist with "angry").
        emotions = []
        if arousal > 0.3 and valence > 0.2:
            emotions.append(("happy", 0.6 + valence * 0.2))
        if arousal > 0.3 and valence < -0.2:
            emotions.append(("angry", 0.6 - valence * 0.2))
        if arousal < -0.3 and valence < -0.2:
            emotions.append(("sad", 0.6 - valence * 0.2 - arousal * 0.1))
        if arousal < -0.2 and valence > 0:
            emotions.append(("calm", 0.6 + valence * 0.2 - arousal * 0.1))
        if stress > 60:
            emotions.append(("stressed", stress / 100))
        if abs(arousal) < 0.2 and abs(valence) < 0.2:
            emotions.append(("neutral", 0.7))

        if emotions:
            primary, conf = max(emotions, key=lambda x: x[1])
        else:
            # No quadrant matched — fall back to a low-confidence neutral.
            primary, conf = "neutral", 0.5

        return EmotionalMetrics(
            arousal_score=round(arousal, 3),
            valence_estimate=round(valence, 3),
            stress_indicator=round(stress, 1),
            confidence_score=round(confidence, 1),
            primary_emotion=primary,
            emotion_confidence=round(conf * 100, 1)
        )
431
+
432
+ def analyze_segment(self, audio_path: str, segment_id: int, speaker: str,
433
+ start_time: str, end_time: str) -> SegmentAnalysis:
434
+ """Complete analysis of a single audio segment"""
435
+ y, sr = self._load_audio(audio_path)
436
+ duration = len(y) / sr
437
+
438
+ # Run all analyses
439
+ pitch = self.analyze_pitch(y, sr)
440
+ energy = self.analyze_energy(y, sr)
441
+ rhythm = self.analyze_rhythm(y, sr)
442
+ voice_quality = self.analyze_voice_quality(y, sr)
443
+ emotion = self.analyze_emotion(pitch, energy, rhythm, voice_quality)
444
+
445
+ # Calculate overall quality score
446
+ quality = (
447
+ voice_quality.clarity_score * 0.25 +
448
+ emotion.confidence_score * 0.20 +
449
+ (100 - emotion.stress_indicator) * 0.15 +
450
+ rhythm.rhythm_regularity * 100 * 0.15 +
451
+ min(pitch.voiced_ratio * 100, 100) * 0.15 +
452
+ (100 - voice_quality.breathiness_score) * 0.10
453
+ )
454
+
455
+ return SegmentAnalysis(
456
+ segment_id=segment_id,
457
+ speaker=speaker,
458
+ start_time=start_time,
459
+ end_time=end_time,
460
+ duration_seconds=round(duration, 2),
461
+ pitch=pitch,
462
+ energy=energy,
463
+ rhythm=rhythm,
464
+ voice_quality=voice_quality,
465
+ emotion=emotion,
466
+ overall_quality_score=round(quality, 1),
467
+ segment_file=os.path.basename(audio_path)
468
+ )
469
+
470
def create_speaker_profile(self, segments: "List[SegmentAnalysis]",
                           speaker: str, total_call_duration: float) -> "SpeakerProfile":
    """Aggregate per-segment metrics into one profile for *speaker*.

    Args:
        segments: All analyzed segments of the call (any speaker).
        speaker: Label of the speaker to profile.
        total_call_duration: Total speech duration of the call, used to
            compute the talk percentage.

    Returns:
        A SpeakerProfile, or None when the speaker has no segments.
    """
    speaker_segs = [s for s in segments if s.speaker == speaker]
    if not speaker_segs:
        return None

    total_duration = sum(s.duration_seconds for s in speaker_segs)

    # Voiced pitch values only; unvoiced segments report mean_f0 == 0.
    # Guard the empty case explicitly: np.mean([]) emits a RuntimeWarning
    # and would propagate NaN into the thresholds below.
    voiced_pitches = [s.pitch.mean_f0 for s in speaker_segs if s.pitch.mean_f0 > 0]
    avg_pitch = float(np.mean(voiced_pitches)) if voiced_pitches else 0.0

    avg_energy = np.mean([s.energy.mean_rms for s in speaker_segs])
    avg_rate = np.mean([s.rhythm.speaking_rate for s in speaker_segs])
    avg_clarity = np.mean([s.voice_quality.clarity_score for s in speaker_segs])
    avg_arousal = np.mean([s.emotion.arousal_score for s in speaker_segs])
    avg_confidence = np.mean([s.emotion.confidence_score for s in speaker_segs])

    # Voice type from mean fundamental frequency. With no voiced pitch
    # data we stay neutral ("medium"), matching the NaN-comparison
    # behavior of the previous implementation.
    if not voiced_pitches:
        voice_type = "medium"
    elif avg_pitch < 120:
        voice_type = "low"
    elif avg_pitch > 200:
        voice_type = "high"
    else:
        voice_type = "medium"

    # Variability metrics reuse the already-filtered pitch list.
    pitch_range = max(voiced_pitches) - min(voiced_pitches) if voiced_pitches else 0

    all_energies = [s.energy.mean_rms for s in speaker_segs]
    energy_var = np.std(all_energies) if all_energies else 0

    # Most frequent segment-level emotion wins.
    emotions = [s.emotion.primary_emotion for s in speaker_segs]
    dominant_emotion = max(set(emotions), key=emotions.count) if emotions else "neutral"

    # Communication style derived from pitch/energy variability and arousal.
    if pitch_range > 50 and energy_var > 5:
        style = "expressive"
    elif pitch_range < 20 and energy_var < 3:
        style = "monotone"
    elif avg_arousal > 0.3:
        style = "dynamic"
    else:
        style = "calm"

    overall = np.mean([s.overall_quality_score for s in speaker_segs])

    # Human-readable coaching feedback derived from the averages.
    strengths = []
    improvements = []

    if avg_clarity > 70:
        strengths.append("Clear articulation")
    else:
        improvements.append("Improve voice clarity")

    if 2.5 <= avg_rate <= 4.0:
        strengths.append("Good speaking pace")
    elif avg_rate < 2.5:
        improvements.append("Speak slightly faster")
    else:
        improvements.append("Slow down speech rate")

    if avg_confidence > 70:
        strengths.append("Confident delivery")
    else:
        improvements.append("Project more confidence")

    if style == "expressive":
        strengths.append("Engaging vocal variety")
    elif style == "monotone":
        improvements.append("Add more vocal variety")

    return SpeakerProfile(
        speaker=speaker,
        total_duration=round(total_duration, 2),
        segment_count=len(speaker_segs),
        talk_percentage=round(total_duration / total_call_duration * 100, 1) if total_call_duration > 0 else 0,
        avg_pitch=round(avg_pitch, 1),
        avg_energy=round(float(avg_energy), 1),
        avg_speaking_rate=round(float(avg_rate), 2),
        avg_clarity=round(float(avg_clarity), 1),
        pitch_range=round(float(pitch_range), 1),
        energy_variability=round(float(energy_var), 2),
        voice_type=voice_type,
        dominant_emotion=dominant_emotion,
        avg_arousal=round(float(avg_arousal), 3),
        avg_confidence=round(float(avg_confidence), 1),
        communication_style=style,
        overall_score=round(float(overall), 1),
        strengths=strengths,
        improvements=improvements
    )
566
+
567
def analyze_dynamics(self, segments: "List[SegmentAnalysis]",
                     total_duration: float) -> "ConversationDynamics":
    """Derive call-level conversation metrics from the analyzed segments.

    Args:
        segments: All analyzed segments of the call.
        total_duration: Total speech duration of the call in seconds.

    Returns:
        A ConversationDynamics with talk ratios, balance, engagement, etc.
    """
    speakers = list(set(s.speaker for s in segments))

    # Group segments by speaker once, then derive the per-speaker maps.
    by_speaker = {spk: [s for s in segments if s.speaker == spk] for spk in speakers}
    talk_ratios = {
        spk: round(sum(s.duration_seconds for s in segs) / total_duration * 100, 1)
        for spk, segs in by_speaker.items()
    }
    turn_dist = {spk: len(segs) for spk, segs in by_speaker.items()}

    # Mean turn length across all segments.
    avg_turn = np.mean([s.duration_seconds for s in segments]) if segments else 0

    # Fraction of the call that is not covered by any segment.
    speech_time = sum(s.duration_seconds for s in segments)
    silence_ratio = (total_duration - speech_time) / total_duration if total_duration > 0 else 0

    # Conversation balance on a 0-100 scale where 100 means both sides
    # talked an equal share; only meaningful for two-party calls.
    if len(speakers) == 2:
        share_a, share_b = talk_ratios.values()
        balance = 100 - abs(share_a - share_b)
    else:
        balance = 100

    # Speaker with the largest talk share.
    dominance = max(talk_ratios, key=talk_ratios.get) if talk_ratios else None

    # Heuristic engagement score blending activity, balance and turn count.
    engagement = (
        (1 - silence_ratio) * 40 +
        balance * 0.3 +
        min(len(segments) / 10, 1) * 30
    )

    return ConversationDynamics(
        total_duration=round(total_duration, 2),
        total_turns=len(segments),
        speakers=speakers,
        talk_ratios=talk_ratios,
        turn_distribution=turn_dist,
        avg_turn_duration=round(float(avg_turn), 2),
        interruption_count=0,  # overlap detection not implemented yet
        overlap_ratio=0,
        silence_ratio=round(silence_ratio, 3),
        conversation_balance=round(balance, 1),
        dominance_speaker=dominance,
        engagement_score=round(engagement, 1)
    )
618
+
619
def analyze_call(self, segment_files: List[str], timeline: List,
                 call_id: str, is_stereo: bool) -> CallAnalysis:
    """Analyze a full call: each segment, per-speaker profiles, dynamics.

    Args:
        segment_files: Paths of the per-segment audio files.
        timeline: Segment metadata objects paired 1:1 with segment_files,
            carrying .speaker, .start_time and .end_time.
        call_id: Identifier stored on the resulting CallAnalysis.
        is_stereo: Whether the source recording was stereo.

    Returns:
        A CallAnalysis combining segment analyses, speaker profiles,
        conversation dynamics and summary statistics.
    """
    self._log("\n" + "="*60)
    self._log("JIS AUDIO ANALYSIS ENGINE")
    self._log("="*60)

    segments = []
    total_duration = 0  # sum of segment durations (speech only)

    # Analyze each segment file alongside its timeline metadata.
    for i, (seg_file, seg_info) in enumerate(zip(segment_files, timeline)):
        self._log(f" Analyzing segment {i+1}/{len(segment_files)}...")

        analysis = self.analyze_segment(
            audio_path=seg_file,
            segment_id=i+1,
            speaker=seg_info.speaker,
            start_time=seg_info.start_time,
            end_time=seg_info.end_time
        )
        segments.append(analysis)
        total_duration += analysis.duration_seconds

    # Create speaker profiles. create_speaker_profile returns None for a
    # speaker without segments, so filter those out.
    speakers = list(set(s.speaker for s in segments))
    profiles = {}
    for spk in speakers:
        profile = self.create_speaker_profile(segments, spk, total_duration)
        if profile:
            profiles[spk] = profile

    # Conversation-level metrics (talk ratios, balance, engagement).
    dynamics = self.analyze_dynamics(segments, total_duration)

    # Overall quality = mean of per-segment quality scores.
    overall_quality = np.mean([s.overall_quality_score for s in segments]) if segments else 0

    # Call summary
    # NOTE(review): the np.mean calls below are not guarded against an
    # empty `segments` list (unlike overall_quality above) — confirm
    # callers never invoke this with an empty segment_files list.
    summary = {
        "total_segments": len(segments),
        "speakers": speakers,
        "audio_type": "stereo" if is_stereo else "mono",
        "average_clarity": round(np.mean([s.voice_quality.clarity_score for s in segments]), 1),
        "average_confidence": round(np.mean([s.emotion.confidence_score for s in segments]), 1),
        "dominant_emotions": list(set(s.emotion.primary_emotion for s in segments))
    }

    self._log(f"\nAnalysis complete. Quality Score: {overall_quality:.1f}/100")

    return CallAnalysis(
        call_id=call_id,
        analysis_timestamp=datetime.now().isoformat(),
        audio_duration=round(total_duration, 2),
        audio_type="stereo" if is_stereo else "mono",
        segments=segments,
        speaker_profiles=profiles,
        dynamics=dynamics,
        overall_quality_score=round(float(overall_quality), 1),
        call_summary=summary
    )
679
+
680
def export_analysis(self, analysis: "CallAnalysis", output_dir: str) -> str:
    """Serialize a CallAnalysis to ``audio_analysis.json`` in *output_dir*.

    Args:
        analysis: Completed call analysis (a tree of dataclasses).
        output_dir: Directory for the JSON file; created if missing.

    Returns:
        Path of the written JSON file.
    """
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, "audio_analysis.json")

    def convert(obj):
        # dataclasses.asdict already recurses through nested dataclasses,
        # dicts and lists, so no manual recursion is needed for dataclass
        # instances. The previous hasattr(obj, '__dict__') check would
        # also have passed arbitrary non-dataclass objects to asdict(),
        # which raises TypeError; checking __dataclass_fields__ (the
        # mechanism behind dataclasses.is_dataclass) is the correct test.
        if hasattr(obj, '__dataclass_fields__') and not isinstance(obj, type):
            return asdict(obj)
        elif isinstance(obj, dict):
            return {k: convert(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [convert(i) for i in obj]
        else:
            return obj

    data = convert(analysis)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    self._log(f"Analysis exported: {filepath}")
    return filepath
dashboard.html ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>ASR Audio Intelligence Platform</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
9
+ <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
10
+ <style>
11
+ :root { --asr-primary: #0f172a; --asr-accent: #3b82f6; --asr-success: #10b981; }
12
+ body { font-family: 'Inter', system-ui, sans-serif; }
13
+ .asr-gradient { background: linear-gradient(135deg, #0f172a 0%, #1e293b 50%, #334155 100%); }
14
+ .asr-accent-gradient { background: linear-gradient(135deg, #3b82f6 0%, #8b5cf6 100%); }
15
+ .glass-card { background: rgba(255,255,255,0.95); backdrop-filter: blur(10px); }
16
+ .metric-card:hover { transform: translateY(-2px); box-shadow: 0 20px 40px rgba(0,0,0,0.1); }
17
+ .upload-zone { border: 2px dashed #cbd5e1; transition: all 0.3s; }
18
+ .upload-zone:hover, .upload-zone.dragover { border-color: #3b82f6; background: #f0f9ff; }
19
+ .progress-ring { transform: rotate(-90deg); }
20
+ .segment-row:hover { background: #f8fafc; }
21
+ @keyframes pulse-ring { 0% { transform: scale(0.8); opacity: 1; } 100% { transform: scale(1.4); opacity: 0; } }
22
+ .live-indicator::before { content: ''; position: absolute; width: 100%; height: 100%; background: #10b981; border-radius: 50%; animation: pulse-ring 1.5s infinite; }
23
+ </style>
24
+ </head>
25
+ <body class="bg-slate-50 min-h-screen">
26
+
27
+ <!-- Header -->
28
+ <header class="asr-gradient text-white sticky top-0 z-40 shadow-xl">
29
+ <div class="container mx-auto px-6 py-4">
30
+ <div class="flex items-center justify-between">
31
+ <div class="flex items-center space-x-4">
32
+ <div class="w-12 h-12 bg-white rounded-xl flex items-center justify-center">
33
+ <span class="text-slate-900 font-black text-xl">ASR</span>
34
+ </div>
35
+ <div>
36
+ <h1 class="text-2xl font-bold tracking-tight">Audio Intelligence Platform</h1>
37
+ <p class="text-slate-400 text-sm">Enterprise Speech Analytics & Transcription</p>
38
+ </div>
39
+ </div>
40
+ <div class="flex items-center space-x-6">
41
+ <div class="text-right">
42
+ <div class="text-xs text-slate-400 uppercase tracking-wider">System Status</div>
43
+ <div class="flex items-center mt-1">
44
+ <span class="relative flex h-3 w-3 mr-2">
45
+ <span class="live-indicator absolute inline-flex h-full w-full rounded-full bg-emerald-400"></span>
46
+ <span class="relative inline-flex rounded-full h-3 w-3 bg-emerald-500"></span>
47
+ </span>
48
+ <span class="font-medium" id="serverStatus">Operational</span>
49
+ </div>
50
+ </div>
51
+ </div>
52
+ </div>
53
+ </div>
54
+ </header>
55
+
56
+ <main class="container mx-auto px-6 py-8">
57
+
58
+ <!-- Upload Section -->
59
+ <section class="glass-card rounded-2xl shadow-lg p-8 mb-8 border border-slate-200">
60
+ <div class="flex items-center justify-between mb-6">
61
+ <div>
62
+ <h2 class="text-xl font-bold text-slate-800">Audio Upload</h2>
63
+ <p class="text-slate-500 text-sm mt-1">Upload audio files for analysis. Stereo files will be automatically separated by channel.</p>
64
+ </div>
65
+ <div class="flex items-center space-x-2 text-sm">
66
+ <span class="px-3 py-1 bg-blue-100 text-blue-700 rounded-full font-medium">Stereo: Split Channels</span>
67
+ <span class="px-3 py-1 bg-slate-100 text-slate-700 rounded-full font-medium">Mono: Single Speaker</span>
68
+ </div>
69
+ </div>
70
+
71
+ <div class="upload-zone rounded-xl p-10 text-center cursor-pointer" id="uploadZone">
72
+ <input type="file" id="fileInput" class="hidden" accept=".wav,.mp3,.m4a,.flac,.ogg,.opus">
73
+ <div class="w-20 h-20 mx-auto mb-4 bg-slate-100 rounded-full flex items-center justify-center">
74
+ <i class="fas fa-cloud-arrow-up text-3xl text-slate-400"></i>
75
+ </div>
76
+ <p class="text-lg text-slate-700 font-medium">Drop audio file here or <span class="text-blue-600 hover:underline">browse</span></p>
77
+ <p class="text-sm text-slate-400 mt-2">WAV, MP3, M4A, FLAC, OGG, OPUS supported</p>
78
+ </div>
79
+
80
+ <div id="uploadProgress" class="hidden mt-6">
81
+ <div class="bg-slate-50 rounded-xl p-6 border border-slate-200">
82
+ <div class="flex items-center justify-between mb-4">
83
+ <div class="flex items-center">
84
+ <div class="w-10 h-10 bg-blue-100 rounded-lg flex items-center justify-center mr-4">
85
+ <i class="fas fa-spinner fa-spin text-blue-600"></i>
86
+ </div>
87
+ <div>
88
+ <p class="font-semibold text-slate-800" id="uploadStatus">Processing...</p>
89
+ <p class="text-sm text-slate-500" id="stageText">Initializing...</p>
90
+ </div>
91
+ </div>
92
+ <span id="progressPercent" class="text-2xl font-bold text-blue-600">0%</span>
93
+ </div>
94
+ <div class="w-full bg-slate-200 rounded-full h-2">
95
+ <div id="progressBar" class="asr-accent-gradient h-2 rounded-full transition-all duration-500" style="width: 0%"></div>
96
+ </div>
97
+ </div>
98
+ </div>
99
+ </section>
100
+
101
+ <!-- Statistics Dashboard -->
102
+ <section class="grid grid-cols-2 md:grid-cols-4 lg:grid-cols-6 gap-4 mb-8">
103
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
104
+ <div class="flex items-center justify-between">
105
+ <div>
106
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Total Calls</p>
107
+ <p class="text-3xl font-bold text-slate-800 mt-1" id="totalCalls">0</p>
108
+ </div>
109
+ <div class="w-12 h-12 bg-blue-50 rounded-xl flex items-center justify-center">
110
+ <i class="fas fa-phone-volume text-blue-600 text-lg"></i>
111
+ </div>
112
+ </div>
113
+ </div>
114
+
115
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
116
+ <div class="flex items-center justify-between">
117
+ <div>
118
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Stereo</p>
119
+ <p class="text-3xl font-bold text-purple-600 mt-1" id="stereoCalls">0</p>
120
+ </div>
121
+ <div class="w-12 h-12 bg-purple-50 rounded-xl flex items-center justify-center">
122
+ <i class="fas fa-code-branch text-purple-600 text-lg"></i>
123
+ </div>
124
+ </div>
125
+ </div>
126
+
127
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
128
+ <div class="flex items-center justify-between">
129
+ <div>
130
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Quality Score</p>
131
+ <p class="text-3xl font-bold text-emerald-600 mt-1" id="avgScore">0</p>
132
+ </div>
133
+ <div class="w-12 h-12 bg-emerald-50 rounded-xl flex items-center justify-center">
134
+ <i class="fas fa-chart-line text-emerald-600 text-lg"></i>
135
+ </div>
136
+ </div>
137
+ </div>
138
+
139
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
140
+ <div class="flex items-center justify-between">
141
+ <div>
142
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Clarity</p>
143
+ <p class="text-3xl font-bold text-cyan-600 mt-1" id="avgClarity">0</p>
144
+ </div>
145
+ <div class="w-12 h-12 bg-cyan-50 rounded-xl flex items-center justify-center">
146
+ <i class="fas fa-microphone text-cyan-600 text-lg"></i>
147
+ </div>
148
+ </div>
149
+ </div>
150
+
151
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
152
+ <div class="flex items-center justify-between">
153
+ <div>
154
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Confidence</p>
155
+ <p class="text-3xl font-bold text-amber-600 mt-1" id="avgConfidence">0</p>
156
+ </div>
157
+ <div class="w-12 h-12 bg-amber-50 rounded-xl flex items-center justify-center">
158
+ <i class="fas fa-shield-check text-amber-600 text-lg"></i>
159
+ </div>
160
+ </div>
161
+ </div>
162
+
163
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
164
+ <div class="flex items-center justify-between">
165
+ <div>
166
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Segments</p>
167
+ <p class="text-3xl font-bold text-rose-600 mt-1" id="totalSegments">0</p>
168
+ </div>
169
+ <div class="w-12 h-12 bg-rose-50 rounded-xl flex items-center justify-center">
170
+ <i class="fas fa-wave-square text-rose-600 text-lg"></i>
171
+ </div>
172
+ </div>
173
+ </div>
174
+ </section>
175
+
176
+ <!-- Calls List -->
177
+ <section class="glass-card rounded-2xl shadow-lg border border-slate-200 overflow-hidden">
178
+ <div class="p-6 border-b border-slate-200 flex items-center justify-between">
179
+ <div>
180
+ <h2 class="text-xl font-bold text-slate-800">Analyzed Recordings</h2>
181
+ <p class="text-sm text-slate-500 mt-1">Click on any recording to view detailed analysis</p>
182
+ </div>
183
+ <button onclick="loadCalls()" class="flex items-center px-4 py-2 bg-slate-100 hover:bg-slate-200 text-slate-700 rounded-lg transition-colors font-medium">
184
+ <i class="fas fa-arrows-rotate mr-2"></i>Refresh
185
+ </button>
186
+ </div>
187
+ <div id="callsList" class="divide-y divide-slate-100"></div>
188
+ </section>
189
+ </main>
190
+
191
+ <!-- Analysis Modal -->
192
+ <div id="analysisModal" class="hidden fixed inset-0 bg-slate-900/60 backdrop-blur-sm z-50 flex items-center justify-center p-4">
193
+ <div class="bg-white rounded-2xl shadow-2xl max-w-7xl w-full max-h-[95vh] overflow-hidden flex flex-col">
194
+ <div class="asr-gradient text-white p-6 flex justify-between items-center shrink-0">
195
+ <div class="flex items-center space-x-4">
196
+ <div class="w-10 h-10 bg-white/20 rounded-lg flex items-center justify-center">
197
+ <i class="fas fa-chart-bar"></i>
198
+ </div>
199
+ <div>
200
+ <h3 class="text-xl font-bold" id="modalTitle">Analysis Report</h3>
201
+ <p class="text-slate-300 text-sm">Comprehensive audio analysis</p>
202
+ </div>
203
+ </div>
204
+ <button onclick="closeModal()" class="w-10 h-10 bg-white/10 hover:bg-white/20 rounded-lg flex items-center justify-center transition-colors">
205
+ <i class="fas fa-xmark text-xl"></i>
206
+ </button>
207
+ </div>
208
+ <div class="p-6 overflow-y-auto flex-1" id="modalContent"></div>
209
+ </div>
210
+ </div>
211
+
212
+ <script>
213
+ const API_BASE = window.location.origin;
214
+ let currentJobId = null;
215
+ let pollInterval = null;
216
+
217
// Page bootstrap: fetch initial data and wire up the upload widget.
document.addEventListener('DOMContentLoaded', () => {
    // Initial data loads (header badge, metric cards, recordings list).
    checkServerHealth();
    loadStatistics();
    loadCalls();

    const uploadZone = document.getElementById('uploadZone');
    const fileInput = document.getElementById('fileInput');

    // Clicking the zone proxies to the hidden <input type="file">.
    uploadZone.addEventListener('click', () => fileInput.click());
    // Drag & drop: highlight while hovering, upload the first file on drop.
    uploadZone.addEventListener('dragover', e => { e.preventDefault(); uploadZone.classList.add('dragover'); });
    uploadZone.addEventListener('dragleave', () => uploadZone.classList.remove('dragover'));
    uploadZone.addEventListener('drop', e => {
        e.preventDefault();
        uploadZone.classList.remove('dragover');
        if (e.dataTransfer.files.length > 0) handleFileUpload(e.dataTransfer.files[0]);
    });
    fileInput.addEventListener('change', e => {
        if (e.target.files.length > 0) handleFileUpload(e.target.files[0]);
    });
});
237
+
238
// Ping the backend health endpoint and reflect the result in the header badge.
async function checkServerHealth() {
    const statusEl = document.getElementById('serverStatus');
    try {
        const response = await fetch(`${API_BASE}/health`);
        const payload = await response.json();
        statusEl.textContent = payload.status === 'healthy' ? 'Operational' : 'Offline';
    } catch {
        statusEl.textContent = 'Offline';
    }
}
245
+
246
// Fetch aggregate statistics and populate the dashboard metric cards.
async function loadStatistics() {
    try {
        const response = await fetch(`${API_BASE}/api/statistics`);
        const stats = await response.json();
        const set = (id, value) => { document.getElementById(id).textContent = value; };
        set('totalCalls', stats.total_calls || 0);
        set('stereoCalls', stats.stereo_calls || 0);
        set('avgScore', (stats.avg_quality_score || 0).toFixed(1));
        set('avgClarity', (stats.avg_clarity || 0).toFixed(1));
        set('avgConfidence', (stats.avg_confidence || 0).toFixed(1));
        set('totalSegments', stats.total_segments || 0);
    } catch (err) { console.error('Stats error:', err); }
}
258
+
259
// Fetch the analyzed call names and render the clickable recordings list.
// Call names originate from uploaded filenames (untrusted input); the
// previous implementation interpolated them raw into innerHTML and into an
// inline onclick attribute, which is a stored-XSS vector. Names are now
// inserted via textContent and the click handler is attached with
// addEventListener, so no user-controlled text is ever parsed as HTML/JS.
async function loadCalls() {
    try {
        const res = await fetch(`${API_BASE}/api/calls`);
        const calls = await res.json();
        const list = document.getElementById('callsList');

        if (calls.length === 0) {
            // Static empty-state markup (no user data involved).
            list.innerHTML = `<div class="p-12 text-center"><div class="w-16 h-16 bg-slate-100 rounded-full flex items-center justify-center mx-auto mb-4"><i class="fas fa-folder-open text-2xl text-slate-400"></i></div><p class="text-slate-600 font-medium">No recordings yet</p><p class="text-slate-400 text-sm mt-1">Upload an audio file to get started</p></div>`;
            return;
        }

        list.innerHTML = '';
        calls.forEach(call => {
            const row = document.createElement('div');
            row.className = 'p-5 flex items-center justify-between hover:bg-slate-50 cursor-pointer transition-colors';
            row.innerHTML = `
                <div class="flex items-center space-x-4">
                    <div class="w-12 h-12 bg-gradient-to-br from-blue-500 to-purple-600 rounded-xl flex items-center justify-center text-white"><i class="fas fa-waveform-lines"></i></div>
                    <div>
                        <p class="font-semibold text-slate-800 call-name"></p>
                        <p class="text-sm text-slate-500">Click to view analysis</p>
                    </div>
                </div>
                <div class="flex items-center space-x-3">
                    <span class="px-3 py-1 bg-emerald-100 text-emerald-700 rounded-full text-sm font-medium">Analyzed</span>
                    <i class="fas fa-chevron-right text-slate-400"></i>
                </div>`;
            // textContent never parses HTML, so arbitrary filenames are safe.
            row.querySelector('.call-name').textContent = call;
            row.addEventListener('click', () => viewAnalysis(call));
            list.appendChild(row);
        });
    } catch (e) { console.error('Calls error:', e); }
}
287
+
288
// Upload the selected file and start polling the resulting job.
// Fix: the previous version called res.json() unconditionally — a non-JSON
// error body (e.g. a proxy 502 page) threw a parse error instead of the
// real failure, and `result.error` being undefined surfaced the literal
// string "undefined" to the user.
async function handleFileUpload(file) {
    const formData = new FormData();
    formData.append('file', file);

    document.getElementById('uploadProgress').classList.remove('hidden');
    updateProgress('Uploading...', 'Transferring file to server', 10);

    try {
        const res = await fetch(`${API_BASE}/api/upload`, { method: 'POST', body: formData });
        // Tolerate non-JSON bodies; fall back to an empty object.
        const result = await res.json().catch(() => ({}));
        if (!res.ok) throw new Error(result.error || `Upload failed (HTTP ${res.status})`);
        currentJobId = result.job_id;
        updateProgress('Processing...', 'Analysis started', 25);
        pollJobStatus();
    } catch (e) {
        updateProgress('Error', e.message, 0);
        setTimeout(() => document.getElementById('uploadProgress').classList.add('hidden'), 3000);
    }
}
307
+
308
// Sync the upload progress card: status/stage labels, bar width, percent readout.
function updateProgress(status, stage, percent) {
    const byId = id => document.getElementById(id);
    byId('uploadStatus').textContent = status;
    byId('stageText').textContent = stage;
    byId('progressBar').style.width = percent + '%';
    byId('progressPercent').textContent = percent + '%';
}
314
+
315
// Poll the job-status endpoint every 1.5s and map backend stages onto the
// progress UI. Fix: the previous catch cleared the interval on ANY fetch
// error, so one transient network hiccup silently abandoned the job with
// no feedback. Polling now tolerates a few consecutive failures before
// giving up, and tells the user when it does.
function pollJobStatus() {
    if (pollInterval) clearInterval(pollInterval);
    let consecutiveErrors = 0;

    // Backend stage -> display text and progress percentage.
    const STAGES = {
        'pending': { text: 'Queued...', progress: 20 },
        'initializing': { text: 'Loading models...', progress: 30 },
        'diarization': { text: 'Separating speakers...', progress: 45 },
        'transcription': { text: 'Transcribing speech...', progress: 65 },
        'audio_analysis': { text: 'Analyzing audio features...', progress: 85 },
        'done': { text: 'Complete!', progress: 100 }
    };

    pollInterval = setInterval(async () => {
        try {
            const res = await fetch(`${API_BASE}/api/jobs/${currentJobId}`);
            const job = await res.json();
            consecutiveErrors = 0;

            if (job.stage && STAGES[job.stage]) {
                let stageText = STAGES[job.stage].text;
                if (job.is_stereo !== null) stageText += job.is_stereo ? ' (Stereo)' : ' (Mono)';
                updateProgress('Processing...', stageText, STAGES[job.stage].progress);
            }

            if (job.status === 'completed') {
                clearInterval(pollInterval);
                updateProgress('Success!', 'Analysis complete', 100);
                setTimeout(() => {
                    document.getElementById('uploadProgress').classList.add('hidden');
                    loadStatistics();
                    loadCalls();
                }, 1500);
            } else if (job.status === 'failed') {
                clearInterval(pollInterval);
                updateProgress('Failed', job.error || 'Unknown error', 0);
                setTimeout(() => document.getElementById('uploadProgress').classList.add('hidden'), 3000);
            }
        } catch (e) {
            // Only abort after repeated failures, and say why.
            if (++consecutiveErrors >= 3) {
                clearInterval(pollInterval);
                updateProgress('Error', 'Lost connection to server', 0);
                setTimeout(() => document.getElementById('uploadProgress').classList.add('hidden'), 3000);
            }
        }
    }, 1500);
}
353
+
354
// Fetch the stored analysis + transcription for one call and render the
// full report into the modal (overview cards, speaker profiles, chat-style
// transcript, per-segment table).
// NOTE(review): server-provided strings (callName, seg.text, seg.speaker,
// emotion labels, strengths/improvements) are interpolated into innerHTML
// without escaping — confirm the backend sanitizes them, otherwise this is
// an XSS sink.
async function viewAnalysis(callName) {
    try {
        const res = await fetch(`${API_BASE}/api/analysis/${callName}`);
        const data = await res.json();
        const a = data.analysis;          // audio analysis document
        const t = data.transcription;     // optional transcription document

        document.getElementById('modalTitle').textContent = callName;

        const isStereo = a.audio_type === 'stereo';
        const profiles = a.speaker_profiles || {};
        const dynamics = a.dynamics || {};

        // One card per speaker profile: header, 4 averaged metrics,
        // strengths (green) and improvements (amber) as chips.
        let profilesHTML = '';
        for (const [spk, p] of Object.entries(profiles)) {
            profilesHTML += `
            <div class="bg-slate-50 rounded-xl p-5 border border-slate-200">
                <div class="flex items-center justify-between mb-4">
                    <div class="flex items-center space-x-3">
                        <div class="w-10 h-10 ${spk === 'CUSTOMER' ? 'bg-blue-100' : spk === 'AGENT' ? 'bg-emerald-100' : 'bg-purple-100'} rounded-full flex items-center justify-center">
                            <i class="fas fa-user ${spk === 'CUSTOMER' ? 'text-blue-600' : spk === 'AGENT' ? 'text-emerald-600' : 'text-purple-600'}"></i>
                        </div>
                        <div>
                            <p class="font-bold text-slate-800">${spk}</p>
                            <p class="text-sm text-slate-500">${p.communication_style} style</p>
                        </div>
                    </div>
                    <div class="text-right">
                        <p class="text-2xl font-bold text-slate-800">${p.overall_score}</p>
                        <p class="text-xs text-slate-500">Quality Score</p>
                    </div>
                </div>
                <div class="grid grid-cols-4 gap-3 mb-4">
                    <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_pitch.toFixed(0)}</p><p class="text-xs text-slate-500">Pitch (Hz)</p></div>
                    <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_energy.toFixed(1)}</p><p class="text-xs text-slate-500">Energy (dB)</p></div>
                    <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_speaking_rate.toFixed(1)}</p><p class="text-xs text-slate-500">Rate (/s)</p></div>
                    <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_clarity.toFixed(0)}</p><p class="text-xs text-slate-500">Clarity</p></div>
                </div>
                <div class="flex flex-wrap gap-2">
                    ${p.strengths.map(s => `<span class="px-2 py-1 bg-emerald-100 text-emerald-700 text-xs rounded-full">${s}</span>`).join('')}
                    ${p.improvements.map(i => `<span class="px-2 py-1 bg-amber-100 text-amber-700 text-xs rounded-full">${i}</span>`).join('')}
                </div>
            </div>
            `;
        }

        // Chat-style transcript: CUSTOMER left / AGENT right / other centered.
        // Also sums per-segment Whisper inference times for the total badge.
        let transcriptHTML = '';
        let totalInferenceTime = 0;
        if (t && t.transcriptions) {
            t.transcriptions.forEach(seg => totalInferenceTime += (seg.inference_time || 0));
            transcriptHTML = t.transcriptions.map(seg => `
            <div class="flex ${seg.speaker === 'CUSTOMER' ? 'justify-start' : seg.speaker === 'AGENT' ? 'justify-end' : 'justify-center'}">
                <div class="max-w-[70%] ${seg.speaker === 'CUSTOMER' ? 'bg-blue-50 border-blue-200' : seg.speaker === 'AGENT' ? 'bg-emerald-50 border-emerald-200' : 'bg-slate-50 border-slate-200'} border rounded-xl p-3">
                    <div class="flex items-center justify-between mb-1">
                        <div class="flex items-center space-x-2">
                            <span class="font-semibold text-sm ${seg.speaker === 'CUSTOMER' ? 'text-blue-700' : seg.speaker === 'AGENT' ? 'text-emerald-700' : 'text-slate-700'}">${seg.speaker}</span>
                            <span class="text-xs text-slate-400">${seg.start_time}</span>
                        </div>
                        <span class="text-xs text-orange-500 font-medium"><i class="fas fa-clock mr-1"></i>${seg.inference_time}s</span>
                    </div>
                    <p class="text-slate-800">${seg.text}</p>
                </div>
            </div>
            `).join('');
        }

        // Assemble the modal body: overview grid, profiles, optional
        // transcript, and the per-segment metrics table.
        document.getElementById('modalContent').innerHTML = `
        <div class="space-y-6">
            <!-- Overview -->
            <div class="grid grid-cols-5 gap-4">
                <div class="bg-gradient-to-br from-slate-800 to-slate-900 text-white rounded-xl p-5 text-center">
                    <p class="text-3xl font-bold">${a.overall_quality_score}</p>
                    <p class="text-slate-300 text-sm mt-1">Quality Score</p>
                </div>
                <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                    <p class="text-3xl font-bold text-slate-800">${a.audio_duration.toFixed(1)}s</p>
                    <p class="text-slate-500 text-sm mt-1">Duration</p>
                </div>
                <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                    <p class="text-3xl font-bold ${isStereo ? 'text-purple-600' : 'text-blue-600'}">${isStereo ? 'STEREO' : 'MONO'}</p>
                    <p class="text-slate-500 text-sm mt-1">Audio Type</p>
                </div>
                <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                    <p class="text-3xl font-bold text-slate-800">${a.segments.length}</p>
                    <p class="text-slate-500 text-sm mt-1">Segments</p>
                </div>
                <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                    <p class="text-3xl font-bold text-emerald-600">${dynamics.engagement_score?.toFixed(0) || 0}</p>
                    <p class="text-slate-500 text-sm mt-1">Engagement</p>
                </div>
            </div>

            <!-- Speaker Profiles -->
            <div>
                <h4 class="font-bold text-slate-800 mb-4 flex items-center"><i class="fas fa-users text-blue-600 mr-2"></i>Speaker Profiles</h4>
                <div class="grid ${isStereo ? 'grid-cols-2' : 'grid-cols-1 max-w-xl'} gap-4">${profilesHTML}</div>
            </div>

            <!-- Transcription -->
            ${t ? `
            <div>
                <div class="flex items-center justify-between mb-4">
                    <h4 class="font-bold text-slate-800 flex items-center"><i class="fas fa-closed-captioning text-blue-600 mr-2"></i>Transcription</h4>
                    <span class="px-3 py-1 bg-orange-100 text-orange-700 rounded-full text-sm font-medium"><i class="fas fa-bolt mr-1"></i>Total: ${totalInferenceTime.toFixed(2)}s</span>
                </div>
                <div class="bg-slate-50 rounded-xl p-4 border border-slate-200 max-h-80 overflow-y-auto space-y-3">${transcriptHTML}</div>
            </div>
            ` : ''}

            <!-- Segment Analysis -->
            <div>
                <h4 class="font-bold text-slate-800 mb-4 flex items-center"><i class="fas fa-wave-square text-blue-600 mr-2"></i>Segment Analysis</h4>
                <div class="bg-slate-50 rounded-xl border border-slate-200 overflow-hidden max-h-96 overflow-y-auto">
                    <table class="w-full text-sm">
                        <thead class="bg-slate-100 sticky top-0">
                            <tr>
                                <th class="px-4 py-3 text-left font-semibold text-slate-600">#</th>
                                <th class="px-4 py-3 text-left font-semibold text-slate-600">Speaker</th>
                                <th class="px-4 py-3 text-left font-semibold text-slate-600">Time</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Pitch</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Energy</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Rate</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Clarity</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Emotion</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Score</th>
                            </tr>
                        </thead>
                        <tbody class="divide-y divide-slate-100">
                            ${a.segments.map((s, i) => `
                            <tr class="segment-row">
                                <td class="px-4 py-3 font-medium text-slate-800">${i+1}</td>
                                <td class="px-4 py-3"><span class="px-2 py-1 ${s.speaker === 'CUSTOMER' ? 'bg-blue-100 text-blue-700' : s.speaker === 'AGENT' ? 'bg-emerald-100 text-emerald-700' : 'bg-purple-100 text-purple-700'} rounded-full text-xs font-medium">${s.speaker}</span></td>
                                <td class="px-4 py-3 text-slate-600">${s.start_time}</td>
                                <td class="px-4 py-3 text-center font-medium">${s.pitch.mean_f0.toFixed(0)} Hz</td>
                                <td class="px-4 py-3 text-center font-medium">${s.energy.mean_rms.toFixed(1)} dB</td>
                                <td class="px-4 py-3 text-center font-medium">${s.rhythm.speaking_rate.toFixed(1)}</td>
                                <td class="px-4 py-3 text-center"><span class="px-2 py-1 ${s.voice_quality.clarity_score > 70 ? 'bg-emerald-100 text-emerald-700' : s.voice_quality.clarity_score > 50 ? 'bg-amber-100 text-amber-700' : 'bg-red-100 text-red-700'} rounded text-xs font-medium">${s.voice_quality.clarity_score.toFixed(0)}</span></td>
                                <td class="px-4 py-3 text-center"><span class="px-2 py-1 bg-slate-100 text-slate-700 rounded text-xs">${s.emotion.primary_emotion}</span></td>
                                <td class="px-4 py-3 text-center font-bold text-slate-800">${s.overall_quality_score.toFixed(0)}</td>
                            </tr>
                            `).join('')}
                        </tbody>
                    </table>
                </div>
            </div>
        </div>
        `;

        document.getElementById('analysisModal').classList.remove('hidden');
    } catch (e) { console.error('Analysis error:', e); alert('Failed to load analysis'); }
}
505
+
506
// Hide the analysis modal; also bound to the Escape key below.
function closeModal() {
    document.getElementById('analysisModal').classList.add('hidden');
}
document.addEventListener('keydown', event => {
    if (event.key === 'Escape') closeModal();
});
508
+ </script>
509
+ </body>
510
+ </html>
main.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ASR Audio Analysis Pipeline
3
+
4
+ Complete pipeline: Diarization + Whisper Transcription + Professional Audio Analysis
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ from stereo_diarizer import StereoCallDiarizer
12
+ from whisper_transcriber import WhisperTranscriber
13
+ from audio_analyzer import AudioAnalyzer
14
+
15
+
16
class ASRPipeline:
    """ASR End-to-end audio analysis pipeline"""
    # Orchestrates three stages per input file:
    #   1. diarization (StereoCallDiarizer) -> per-segment WAVs + timeline
    #   2. transcription (WhisperTranscriber) -> text per segment
    #   3. acoustic analysis (AudioAnalyzer)  -> quality metrics
    # All artifacts are written under <output_folder>/<audio file stem>/.

    def __init__(self,
                 input_folder: str,
                 output_folder: str,
                 whisper_model: str,
                 min_silence_len: int = 500,
                 silence_thresh: int = -40,
                 device: str = "cpu",
                 verbose: bool = True):
        # input_folder: directory scanned (non-recursively) for audio files
        # output_folder: root directory for per-file result folders
        # whisper_model: path or name of the Whisper checkpoint
        # min_silence_len: minimum silence (ms) used to split speech segments
        # silence_thresh: level (dB) below which audio is treated as silence
        # device: inference device for Whisper ("cpu" or "cuda")
        # verbose: enable progress logging

        self.input_folder = Path(input_folder)
        self.output_folder = Path(output_folder)
        self.whisper_model = whisper_model
        self.min_silence_len = min_silence_len
        self.silence_thresh = silence_thresh
        self.device = device
        self.verbose = verbose

        # Aggregate run statistics, updated as files are processed.
        self.stats = {
            'total_files': 0,
            'processed': 0,
            'failed': 0,
            'stereo': 0,
            'mono': 0,
            'failed_files': [],
            'total_duration': 0.0
        }

        self.analyzer = AudioAnalyzer(verbose=self.verbose)
        # The transcriber is created lazily: model loading is expensive and
        # only needed once the first file reaches the transcription stage.
        self.transcriber = None

    def _init_transcriber(self):
        """Lazily create the Whisper transcriber on first use."""
        if self.transcriber is None:
            self.transcriber = WhisperTranscriber(
                self.whisper_model, self.device, self.verbose
            )

    def get_audio_files(self):
        """Return supported audio files in the input folder, sorted by name."""
        formats = {'.wav', '.mp3', '.m4a', '.flac', '.ogg', '.opus'}
        return sorted([
            f for f in self.input_folder.iterdir()
            if f.is_file() and f.suffix.lower() in formats
        ])

    def process_single(self, audio_file: Path) -> bool:
        """Run the full 3-stage pipeline on a single audio file.

        Returns True on success, False if any stage raised an exception.
        """
        output_dir = self.output_folder / audio_file.stem
        output_dir.mkdir(parents=True, exist_ok=True)

        if self.verbose:
            print(f"\n{'='*60}")
            print(f"PROCESSING: {audio_file.name}")
            print(f"{'='*60}")

        try:
            # Step 1: Diarization
            if self.verbose:
                print("\n[1/3] DIARIZATION")

            diarizer = StereoCallDiarizer(
                str(audio_file), self.min_silence_len,
                self.silence_thresh, self.verbose
            )
            diarizer.load_audio()

            if diarizer.is_stereo:
                self.stats['stereo'] += 1
            else:
                self.stats['mono'] += 1

            left, right = diarizer.detect_speech_segments()
            diarizer.create_timeline(left, right)

            segments = diarizer.export_segments(str(output_dir))
            diarizer.export_full_speakers(str(output_dir))
            diarizer.export_transcript_txt(str(output_dir))
            diarizer.export_transcript_json(str(output_dir))

            # pydub AudioSegment length is in milliseconds.
            duration = len(diarizer.audio) / 1000
            self.stats['total_duration'] += duration

            # Step 2: Transcription
            if self.verbose:
                print("\n[2/3] TRANSCRIPTION")

            self._init_transcriber()
            transcribed = self.transcriber.transcribe_segments(
                segments, diarizer.timeline
            )
            self.transcriber.export_transcription(transcribed, str(output_dir))

            # Step 3: Audio Analysis
            if self.verbose:
                print("\n[3/3] AUDIO ANALYSIS")

            analysis = self.analyzer.analyze_call(
                segments, diarizer.timeline,
                audio_file.stem, diarizer.is_stereo
            )
            self.analyzer.export_analysis(analysis, str(output_dir))

            if self.verbose:
                print(f"\nSUCCESS: {audio_file.name}")
                print(f"Type: {'STEREO' if diarizer.is_stereo else 'MONO'}")
                print(f"Duration: {duration:.1f}s | Quality: {analysis.overall_quality_score}/100")

            return True

        except Exception as e:
            # Broad catch on purpose: one bad file must not abort the batch.
            if self.verbose:
                print(f"\nFAILED: {audio_file.name}")
                print(f"Error: {e}")
                import traceback
                traceback.print_exc()
            return False

    def run(self):
        """Process every audio file in the input folder and print a summary."""
        print("\n" + "="*60)
        print("ASR AUDIO ANALYSIS PIPELINE")
        print("="*60)

        files = self.get_audio_files()
        self.stats['total_files'] = len(files)

        if not files:
            print(f"\nNo audio files in {self.input_folder}")
            return

        print(f"\nFound {len(files)} file(s)")
        print(f"Input: {self.input_folder}")
        print(f"Output: {self.output_folder}")

        for i, f in enumerate(files, 1):
            print(f"\n[{i}/{len(files)}]")
            if self.process_single(f):
                self.stats['processed'] += 1
            else:
                self.stats['failed'] += 1
                self.stats['failed_files'].append(f.name)

        print("\n" + "="*60)
        print("COMPLETE")
        print("="*60)
        print(f"Processed: {self.stats['processed']}/{self.stats['total_files']}")
        print(f"Stereo: {self.stats['stereo']} | Mono: {self.stats['mono']}")
        print(f"Total duration: {self.stats['total_duration']:.1f}s")

        if self.stats['failed_files']:
            print(f"\nFailed: {', '.join(self.stats['failed_files'])}")

        print(f"\nResults: {self.output_folder}")
        print("\nRun 'python api_server.py' and open http://localhost:5001")
169
+
170
+
171
+ def main():
172
+ INPUT_FOLDER = "/home/ramal/Downloads/Archive"
173
+ OUTPUT_FOLDER = "output"
174
+ WHISPER_MODEL = "/home/ramal/Desktop/end-to-end/whisper-small-az/checkpoint-157959"
175
+
176
+ if not os.path.exists(INPUT_FOLDER):
177
+ print(f"Error: {INPUT_FOLDER} not found")
178
+ sys.exit(1)
179
+
180
+ os.makedirs(OUTPUT_FOLDER, exist_ok=True)
181
+
182
+ pipeline = ASRPipeline(
183
+ input_folder=INPUT_FOLDER,
184
+ output_folder=OUTPUT_FOLDER,
185
+ whisper_model=WHISPER_MODEL,
186
+ device="cpu",
187
+ verbose=True
188
+ )
189
+
190
+ pipeline.run()
191
+
192
+
193
+ if __name__ == "__main__":
194
+ main()
req.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pydub
2
+ librosa
3
+ numpy
4
+ scipy
5
+ flask
6
+ flask-cors
7
+ torch
8
+ transformers
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ pydub
2
+ librosa
3
+ numpy
4
+ scipy
5
+ flask
6
+ flask-cors
7
+ torch
8
+ transformers
9
+ soundfile
stereo_diarizer.py ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stereo Call Center Audio Diarization Module
3
+
4
+ This module provides professional audio diarization for stereo call center recordings.
5
+ It separates speakers from left/right channels and creates detailed transcription-ready segments.
6
+
7
+ IMPORTANT: Only separates channels for STEREO audio. Mono audio is processed as single speaker.
8
+ """
9
+
10
+ import os
11
+ import json
12
+ from datetime import datetime
13
+ from typing import List, Dict, Tuple, Optional
14
+ from dataclasses import dataclass, asdict
15
+ from pydub import AudioSegment
16
+ from pydub.silence import detect_nonsilent
17
+
18
+
19
@dataclass
class Segment:
    """Represents a single speech segment"""
    turn: int          # 1-based chronological turn number in the conversation
    speaker: str       # 'CUSTOMER'/'AGENT' (stereo) or 'SPEAKER' (mono)
    start_ms: int      # segment start, milliseconds from file start
    end_ms: int        # segment end, milliseconds from file start
    duration_ms: int   # end_ms - start_ms
    start_time: str    # start formatted as HH:MM:SS.mmm
    end_time: str      # end formatted as HH:MM:SS.mmm
    duration: str      # duration formatted as HH:MM:SS.mmm
    channel: str = 'mono'              # source channel: 'left', 'right' or 'mono'
    audio_file: Optional[str] = None   # path of exported per-segment WAV (set on export)


class StereoCallDiarizer:
    """
    Professional stereo call center audio diarization system.

    For STEREO audio: Separates speakers from stereo audio (left/right channels)
    For MONO audio: Processes as single speaker without channel separation

    Attributes:
        input_file (str): Path to input audio file
        min_silence_len (int): Minimum silence length in ms to split segments
        silence_thresh (int): Silence threshold in dB
    """

    def __init__(self, input_file: str, min_silence_len: int = 500,
                 silence_thresh: int = -40, verbose: bool = True):
        """
        Initialize the diarizer.

        Args:
            input_file: Path to audio file (stereo or mono)
            min_silence_len: Minimum silence duration (ms) to split segments
            silence_thresh: Audio level (dB) below which is considered silence
            verbose: Enable/disable logging output
        """
        self.input_file = input_file
        self.min_silence_len = min_silence_len
        self.silence_thresh = silence_thresh
        self.verbose = verbose

        # All audio fields are populated by load_audio().
        self.audio: Optional[AudioSegment] = None
        self.left_channel: Optional[AudioSegment] = None   # stereo only
        self.right_channel: Optional[AudioSegment] = None  # stereo only
        self.mono_channel: Optional[AudioSegment] = None   # mono only
        self.timeline: List[Segment] = []
        self.customer_channel: str = 'left'
        self.is_stereo: bool = False

    def _log(self, message: str):
        """Internal logging method (prints only when verbose)."""
        if self.verbose:
            print(message)

    def load_audio(self) -> bool:
        """
        Load audio file and split into channels if stereo.

        Returns:
            True if stereo, False if mono
        """
        self._log(f"Loading audio: {self.input_file}")
        self.audio = AudioSegment.from_file(self.input_file)

        self.is_stereo = self.audio.channels == 2

        if self.is_stereo:
            self._log(f"STEREO audio detected ({self.audio.channels} channels) - will separate speakers")
            # Split once: split_to_mono() rebuilds both channels on every call,
            # so calling it twice (once per channel) doubled the work.
            self.left_channel, self.right_channel = self.audio.split_to_mono()
        else:
            self._log(f"MONO audio detected ({self.audio.channels} channel) - single speaker mode")
            self.mono_channel = self.audio

        self._log(f"Duration: {len(self.audio)/1000:.2f}s | Sample rate: {self.audio.frame_rate}Hz")
        return self.is_stereo

    def detect_speech_segments(self) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]:
        """
        Detect speech segments.

        For stereo: returns (left_segments, right_segments)
        For mono: returns (mono_segments, [])

        Returns:
            Tuple of segment lists, each entry a (start_ms, end_ms) pair
        """
        self._log("Detecting speech segments...")

        if self.is_stereo:
            left_segments = detect_nonsilent(
                self.left_channel,
                min_silence_len=self.min_silence_len,
                silence_thresh=self.silence_thresh
            )
            right_segments = detect_nonsilent(
                self.right_channel,
                min_silence_len=self.min_silence_len,
                silence_thresh=self.silence_thresh
            )
            self._log(f"Found {len(left_segments)} segments (LEFT), {len(right_segments)} segments (RIGHT)")
            return left_segments, right_segments

        mono_segments = detect_nonsilent(
            self.mono_channel,
            min_silence_len=self.min_silence_len,
            silence_thresh=self.silence_thresh
        )
        self._log(f"Found {len(mono_segments)} segments (MONO)")
        return mono_segments, []

    def create_timeline(self, left_segments: List[Tuple[int, int]],
                        right_segments: List[Tuple[int, int]]) -> List[Segment]:
        """
        Create chronologically ordered timeline of all speech segments.

        For STEREO: First speaker is CUSTOMER, second is AGENT
        For MONO: All segments are marked as SPEAKER

        Args:
            left_segments: Left channel segments (or mono segments)
            right_segments: Right channel segments (empty for mono)

        Returns:
            List of Segment objects sorted by start time
        """
        self._log("Building timeline...")

        if self.is_stereo:
            return self._create_stereo_timeline(left_segments, right_segments)
        return self._create_mono_timeline(left_segments)

    def _segment_dict(self, speaker: str, start: int, end: int, channel: str) -> Dict:
        """Build one timeline entry (a plain dict, turned into a Segment later)."""
        return {
            'speaker': speaker,
            'start_ms': start,
            'end_ms': end,
            'duration_ms': end - start,
            'start_time': self._ms_to_time(start),
            'end_time': self._ms_to_time(end),
            'duration': self._ms_to_time(end - start),
            'channel': channel
        }

    def _finalize_timeline(self, entries: List[Dict]) -> List[Segment]:
        """Sort entries chronologically and assign 1-based turn numbers."""
        entries.sort(key=lambda x: x['start_ms'])
        self.timeline = [Segment(turn=i + 1, **seg) for i, seg in enumerate(entries)]
        return self.timeline

    def _create_stereo_timeline(self, left_segments: List[Tuple[int, int]],
                                right_segments: List[Tuple[int, int]]) -> List[Segment]:
        """Create timeline for stereo audio with speaker separation"""
        # Whoever speaks first is assumed to be the CUSTOMER (the caller).
        first_left = left_segments[0][0] if left_segments else float('inf')
        first_right = right_segments[0][0] if right_segments else float('inf')

        # The two original branches differed only in channel assignment; the
        # speaker labels were identical, so only the channel mapping varies.
        if first_left < first_right:
            customer_segments, agent_segments = left_segments, right_segments
            self.customer_channel = 'left'
        else:
            customer_segments, agent_segments = right_segments, left_segments
            self.customer_channel = 'right'
        self._log(f"First speaker: {self.customer_channel.upper()} channel (CUSTOMER)")

        agent_channel = 'right' if self.customer_channel == 'left' else 'left'

        entries = [
            self._segment_dict('CUSTOMER', start, end, self.customer_channel)
            for start, end in customer_segments
        ] + [
            self._segment_dict('AGENT', start, end, agent_channel)
            for start, end in agent_segments
        ]

        timeline = self._finalize_timeline(entries)
        self._log(f"Timeline created with {len(timeline)} segments (2 speakers)")
        return timeline

    def _create_mono_timeline(self, segments: List[Tuple[int, int]]) -> List[Segment]:
        """Create timeline for mono audio (single speaker)"""
        entries = [
            self._segment_dict('SPEAKER', start, end, 'mono')
            for start, end in segments
        ]

        timeline = self._finalize_timeline(entries)
        self._log(f"Timeline created with {len(timeline)} segments (1 speaker - MONO)")
        return timeline

    @staticmethod
    def _ms_to_time(ms: int) -> str:
        """Convert milliseconds to HH:MM:SS.mmm format"""
        seconds, milliseconds = divmod(int(ms), 1000)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

    def _channel_audio(self, channel: str):
        """Return the source AudioSegment for a segment's channel tag.

        Centralizes the channel-selection logic that was previously duplicated
        in export_segments() and export_full_speakers().
        """
        if not self.is_stereo:
            return self.mono_channel
        return self.left_channel if channel == 'left' else self.right_channel

    def export_segments(self, output_dir: str = "output") -> List[str]:
        """
        Export each segment as individual audio file.

        Args:
            output_dir: Directory to save segment files

        Returns:
            List of created file paths
        """
        os.makedirs(output_dir, exist_ok=True)
        self._log(f"Exporting {len(self.timeline)} audio segments to {output_dir}/")

        file_paths = []

        for segment in self.timeline:
            audio_segment = self._channel_audio(segment.channel)[segment.start_ms:segment.end_ms]

            filename = f"segment_{segment.turn:03d}_{segment.speaker}_{segment.start_ms}ms-{segment.end_ms}ms.wav"
            filepath = os.path.join(output_dir, filename)

            audio_segment.export(filepath, format="wav")
            segment.audio_file = filepath
            file_paths.append(filepath)

        self._log(f"Exported {len(file_paths)} segments")
        return file_paths

    def export_full_speakers(self, output_dir: str = "output") -> Dict[str, str]:
        """
        Export full concatenated audio for each speaker.

        For stereo: Creates CUSTOMER_full.wav and AGENT_full.wav
        For mono: Creates SPEAKER_full.wav

        Args:
            output_dir: Directory to save files

        Returns:
            Dictionary mapping speaker names to file paths
        """
        os.makedirs(output_dir, exist_ok=True)
        self._log("Exporting full speaker audio...")

        result = {}
        speakers = ['CUSTOMER', 'AGENT'] if self.is_stereo else ['SPEAKER']

        for speaker in speakers:
            segments = [s for s in self.timeline if s.speaker == speaker]
            if not segments:
                continue

            parts = [
                self._channel_audio(seg.channel)[seg.start_ms:seg.end_ms]
                for seg in segments
            ]
            # pydub AudioSegment supports sum(): 0 + segment yields the segment.
            combined = sum(parts)

            filepath = os.path.join(output_dir, f"{speaker}_full.wav")
            combined.export(filepath, format="wav")
            result[speaker] = filepath

            self._log(f"{speaker}: {len(combined)/1000:.2f}s ({len(segments)} segments)")

        return result

    def export_transcript_txt(self, output_dir: str = "output") -> str:
        """Export human-readable transcript."""
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, "transcript.txt")

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write("=" * 80 + "\n")
            f.write("CALL CENTER CONVERSATION TRANSCRIPT\n")
            f.write(f"File: {self.input_file}\n")
            f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Duration: {len(self.audio)/1000:.2f} seconds\n")
            f.write(f"Total Segments: {len(self.timeline)}\n")
            f.write(f"Audio Type: {'STEREO' if self.is_stereo else 'MONO'}\n")

            if self.is_stereo:
                f.write(f"CUSTOMER Channel: {self.customer_channel.upper()}\n")
                f.write(f"AGENT Channel: {'RIGHT' if self.customer_channel == 'left' else 'LEFT'}\n")

            f.write("=" * 80 + "\n\n")

            for segment in self.timeline:
                f.write(f"[Turn {segment.turn:03d}] {segment.speaker}\n")
                f.write(f"  Time: {segment.start_time} --> {segment.end_time}\n")
                f.write(f"  Duration: {segment.duration}\n")
                if segment.audio_file:
                    f.write(f"  Audio: {os.path.basename(segment.audio_file)}\n")
                f.write("\n")

        self._log(f"Transcript saved: {filepath}")
        return filepath

    def export_transcript_json(self, output_dir: str = "output") -> str:
        """Export structured JSON transcript."""
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, "transcript.json")

        data = {
            'metadata': {
                'input_file': self.input_file,
                'duration_seconds': len(self.audio) / 1000,
                'sample_rate': self.audio.frame_rate,
                'channels': self.audio.channels,
                'is_stereo': self.is_stereo,
                'total_segments': len(self.timeline),
                'analysis_date': datetime.now().isoformat(),
                'min_silence_len_ms': self.min_silence_len,
                'silence_thresh_db': self.silence_thresh,
            },
            'timeline': [asdict(s) for s in self.timeline]
        }

        if self.is_stereo:
            data['metadata']['customer_channel'] = self.customer_channel
            data['metadata']['agent_channel'] = 'right' if self.customer_channel == 'left' else 'left'

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        self._log(f"JSON saved: {filepath}")
        return filepath

    def export_transcript_rttm(self, output_dir: str = "output") -> str:
        """Export RTTM format transcript (pyannote compatible)."""
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, "transcript.rttm")

        with open(filepath, 'w', encoding='utf-8') as f:
            for segment in self.timeline:
                start_sec = segment.start_ms / 1000
                duration_sec = segment.duration_ms / 1000
                f.write(f"SPEAKER {os.path.basename(self.input_file)} 1 "
                        f"{start_sec:.3f} {duration_sec:.3f} <NA> <NA> "
                        f"{segment.speaker} <NA> <NA>\n")

        self._log(f"RTTM saved: {filepath}")
        return filepath

    def get_statistics(self) -> Dict:
        """Calculate and return statistics about the conversation."""
        total_duration = len(self.audio) / 1000

        def _speaker_stats(segments: List[Segment]) -> Dict:
            # Per-speaker totals; percentages are relative to whole-file duration.
            duration = sum(s.duration_ms for s in segments) / 1000
            return {
                'segments': len(segments),
                'duration': duration,
                'percentage': (duration / total_duration) * 100 if total_duration > 0 else 0,
                'avg_segment': duration / len(segments) if segments else 0
            }

        stats = {'is_stereo': self.is_stereo}

        if self.is_stereo:
            customer = _speaker_stats([s for s in self.timeline if s.speaker == 'CUSTOMER'])
            agent = _speaker_stats([s for s in self.timeline if s.speaker == 'AGENT'])
            total_speech = customer['duration'] + agent['duration']
            stats['customer'] = customer
            stats['agent'] = agent
        else:
            speaker = _speaker_stats([s for s in self.timeline if s.speaker == 'SPEAKER'])
            total_speech = speaker['duration']
            stats['speaker'] = speaker

        silence_duration = total_duration - total_speech
        stats.update({
            'total_duration': total_duration,
            'total_speech': total_speech,
            'silence_duration': silence_duration,
            'silence_percentage': (silence_duration / total_duration) * 100 if total_duration > 0 else 0,
        })
        return stats

    def process(self, output_dir: str = "output", export_segments: bool = True,
                export_full: bool = True, export_transcripts: bool = True) -> Dict:
        """
        Run complete diarization pipeline.

        Args:
            output_dir: Directory for all outputs
            export_segments: Whether to export individual segment files
            export_full: Whether to export full speaker audio files
            export_transcripts: Whether to export transcript files

        Returns:
            Dictionary with results and file paths
        """
        self._log("=" * 80)
        self._log("AUDIO DIARIZATION - PROCESSING")
        self._log("=" * 80)

        # Load and process
        self.load_audio()
        left_seg, right_seg = self.detect_speech_segments()
        self.create_timeline(left_seg, right_seg)

        # Export results
        results = {
            'is_stereo': self.is_stereo,
            'timeline': self.timeline,
            'statistics': self.get_statistics(),
            'files': {}
        }

        if export_segments:
            results['files']['segments'] = self.export_segments(output_dir)

        if export_full:
            results['files']['full_speakers'] = self.export_full_speakers(output_dir)

        if export_transcripts:
            results['files']['transcript_txt'] = self.export_transcript_txt(output_dir)
            results['files']['transcript_json'] = self.export_transcript_json(output_dir)
            results['files']['transcript_rttm'] = self.export_transcript_rttm(output_dir)

        self._log("=" * 80)
        self._log("COMPLETED")
        self._log("=" * 80)

        return results
516
+
517
+
518
# Convenience function for simple usage
def diarize_call(input_file: str, output_dir: str = "output",
                 min_silence_len: int = 500, silence_thresh: int = -40,
                 verbose: bool = True) -> Dict:
    """
    Simple function to diarize a call recording (stereo or mono).

    Args:
        input_file: Path to audio file
        output_dir: Directory for outputs
        min_silence_len: Minimum silence duration in ms
        silence_thresh: Silence threshold in dB
        verbose: Enable logging

    Returns:
        Dictionary with results including timeline, statistics, and file paths
    """
    # One-shot wrapper: build the diarizer and immediately run its pipeline.
    return StereoCallDiarizer(
        input_file, min_silence_len, silence_thresh, verbose
    ).process(output_dir)
537
+
538
+
539
if __name__ == "__main__":
    # Example usage: diarize a sample recording and print a short summary.
    result = diarize_call(
        input_file="call.wav",
        output_dir="output",
        min_silence_len=500,
        silence_thresh=-40
    )

    stats = result['statistics']

    print("\nStatistics:")
    print(f"Audio type: {'STEREO' if result['is_stereo'] else 'MONO'}")
    print(f"Total duration: {stats['total_duration']:.2f}s")

    # Stereo runs report per-role durations; mono runs have a single speaker.
    if result['is_stereo']:
        print(f"Customer: {stats['customer']['duration']:.2f}s")
        print(f"Agent: {stats['agent']['duration']:.2f}s")
    else:
        print(f"Speaker: {stats['speaker']['duration']:.2f}s")
whisper_transcriber.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Whisper Speech-to-Text Transcription Module
3
+
4
+ Uses quantized Whisper model for CPU-optimized transcription.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import time
10
+ from typing import List, Dict, Optional
11
+ from dataclasses import dataclass, asdict
12
+ import torch
13
+ import librosa
14
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
15
+
16
+
17
@dataclass
class TranscribedSegment:
    """Represents a transcribed audio segment"""
    turn: int               # sequential turn number copied from the diarizer timeline
    speaker: str            # speaker label from the diarizer (e.g. customer/agent) — TODO confirm label set
    start_time: str         # segment start timestamp, pre-formatted string from the diarizer
    end_time: str           # segment end timestamp, pre-formatted string
    duration: str           # segment duration, pre-formatted string
    text: str               # Whisper transcription for this segment
    audio_file: str         # basename of the segment's audio file
    inference_time: float   # seconds spent transcribing this segment (rounded to 2 decimals)
28
+
29
+
30
class WhisperTranscriber:
    """
    Whisper-based speech-to-text transcription system.

    Loads a Hugging Face Whisper checkpoint and, when running on CPU,
    applies dynamic int8 quantization to the model's Linear layers to
    speed up inference.
    """

    def __init__(self, model_path: str, device: str = "cpu",
                 verbose: bool = True, language: str = "az"):
        """
        Initialize Whisper transcriber.

        Args:
            model_path: Path to Whisper model checkpoint
            device: Device for inference ('cpu' or 'cuda')
            verbose: Enable logging
            language: Language code forced during generation. Defaults to
                "az" (previous hard-coded behavior); kept as the trailing
                parameter so existing positional callers are unaffected.
        """
        self.model_path = model_path
        self.device = device
        self.verbose = verbose
        self.language = language

        self._log("Loading Whisper model...")
        self._log(f"Model: {model_path}")
        self._log(f"Device: {device}")

        # Load processor and model
        self.processor = WhisperProcessor.from_pretrained(model_path)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_path)
        self.model.to(device)
        self.model.eval()

        # Dynamic int8 quantization of Linear layers: significant CPU
        # speedup for a small accuracy cost. Only applied on CPU — the
        # quantized ops are not meant for CUDA execution.
        if device == "cpu":
            self.model = torch.quantization.quantize_dynamic(
                self.model, {torch.nn.Linear}, dtype=torch.qint8
            )

        self._log("Model loaded successfully")

    def _log(self, message: str):
        """Print *message* when verbose logging is enabled."""
        if self.verbose:
            print(message)

    def transcribe_audio(self, audio_path: str) -> tuple:
        """
        Transcribe a single audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            Tuple of (transcription_text, inference_time_seconds)
        """
        start_time = time.time()

        # Whisper expects 16 kHz mono input; librosa resamples on load.
        # The returned sample rate is always 16000 here, so it is discarded.
        audio, _ = librosa.load(audio_path, sr=16000)

        # Convert the raw waveform into log-mel input features
        inputs = self.processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features.to(self.device)

        # Beam-search decode with the configured language forced
        with torch.no_grad():
            predicted_ids = self.model.generate(
                inputs,
                max_length=448,
                num_beams=5,
                language=self.language,
                task="transcribe"
            )

        # Decode token ids back to text
        transcription = self.processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]

        inference_time = time.time() - start_time

        return transcription.strip(), inference_time

    def transcribe_segments(self, segment_files: List[str],
                            timeline: List) -> List[TranscribedSegment]:
        """
        Transcribe multiple audio segments.

        Args:
            segment_files: List of audio file paths
            timeline: List of segment metadata objects from the diarizer;
                each must expose turn/speaker/start_time/end_time/duration

        Returns:
            List of TranscribedSegment objects. Files and metadata are
            paired by position; extra entries in the longer list are
            silently ignored (zip semantics).
        """
        transcriptions = []
        total_time = 0.0

        self._log(f"\nTranscribing {len(segment_files)} segments...")

        for i, (seg_file, seg_info) in enumerate(zip(segment_files, timeline)):
            self._log(f" [{i+1}/{len(segment_files)}] {os.path.basename(seg_file)}")

            text, inf_time = self.transcribe_audio(seg_file)
            total_time += inf_time

            transcriptions.append(TranscribedSegment(
                turn=seg_info.turn,
                speaker=seg_info.speaker,
                start_time=seg_info.start_time,
                end_time=seg_info.end_time,
                duration=seg_info.duration,
                text=text,
                audio_file=os.path.basename(seg_file),
                inference_time=round(inf_time, 2)
            ))

        self._log(f"Total transcription time: {total_time:.2f}s")

        return transcriptions

    def export_transcription(self, transcriptions: List[TranscribedSegment],
                             output_dir: str) -> Dict[str, str]:
        """
        Export transcriptions to JSON and plain-text conversation files.

        Args:
            transcriptions: List of TranscribedSegment objects
            output_dir: Output directory (created if missing)

        Returns:
            Dictionary mapping format name ('json', 'conversation') to
            the created file path
        """
        os.makedirs(output_dir, exist_ok=True)
        files = {}

        # Full structured dump, one record per segment
        json_path = os.path.join(output_dir, "transcription.json")
        data = {
            'total_segments': len(transcriptions),
            'transcriptions': [asdict(t) for t in transcriptions]
        }
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        files['json'] = json_path

        # Human-readable "[time] speaker: text" transcript
        conv_path = os.path.join(output_dir, "conversation.txt")
        with open(conv_path, 'w', encoding='utf-8') as f:
            for t in transcriptions:
                f.write(f"[{t.start_time}] {t.speaker}: {t.text}\n")
        files['conversation'] = conv_path

        self._log(f"Transcription exported to {output_dir}")

        return files