ramalMr commited on
Commit
dd93e44
·
0 Parent(s):

Initial commit

Browse files
Files changed (10) hide show
  1. Dockerfile +26 -0
  2. README.md +31 -0
  3. api_server.py +354 -0
  4. audio_analyzer.py +701 -0
  5. dashboard.html +510 -0
  6. main.py +194 -0
  7. req.txt +8 -0
  8. requirements.txt +9 -0
  9. stereo_diarizer.py +556 -0
  10. whisper_transcriber.py +186 -0
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies:
#  - ffmpeg: decoding of mp3/m4a/ogg/opus uploads
#  - libsndfile1: required by soundfile/librosa for WAV/FLAC I/O
#  - git: allows pip to install VCS-pinned requirements
RUN apt-get update && apt-get install -y \
    ffmpeg \
    libsndfile1 \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY . .

# Create necessary directories
RUN mkdir -p /app/output /app/uploads

# Expose port (HuggingFace Spaces uses 7860)
EXPOSE 7860

# Run the server
CMD ["python", "api_server.py"]
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ASR Audio Intelligence Platform
3
+ emoji: 🎙️
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: mit
10
+ ---
11
+
12
+ # ASR Audio Intelligence Platform
13
+
14
+ Enterprise-grade Speech Analytics & Transcription system for Azerbaijani language.
15
+
16
+ ## Features
17
+
18
+ - **Speaker Diarization**: Automatic separation of speakers (stereo/mono support)
19
+ - **Speech Transcription**: Whisper-based transcription for Azerbaijani
20
+ - **Audio Analysis**: Professional audio quality metrics and insights
21
+ - **Real-time Processing**: Upload and analyze audio files instantly
22
+
23
+ ## Supported Formats
24
+
25
+ WAV, MP3, M4A, FLAC, OGG, OPUS
26
+
27
+ ## Usage
28
+
29
+ 1. Upload an audio file using the web interface
30
+ 2. Wait for processing (diarization, transcription, analysis)
31
+ 3. View detailed analysis results including speaker profiles and transcripts
api_server.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ASR Audio Analysis API Server
3
+
4
+ Enterprise-grade REST API for audio processing:
5
+ - Diarization (stereo/mono)
6
+ - Whisper Transcription
7
+ - Professional Audio Analysis
8
+ """
9
+
10
+ import os
11
+ import json
12
+ import uuid
13
+ import threading
14
+ from pathlib import Path
15
+ from datetime import datetime
16
+ from flask import Flask, jsonify, send_from_directory, request
17
+ from flask_cors import CORS
18
+ from werkzeug.utils import secure_filename
19
+
20
+
app = Flask(__name__)
CORS(app)

# Configuration — APP_DIR / WHISPER_MODEL are overridable via environment.
BASE_DIR = Path(os.environ.get("APP_DIR", "/app"))
OUTPUT_FOLDER = BASE_DIR / "output"    # per-call analysis results
UPLOAD_FOLDER = BASE_DIR / "uploads"   # raw uploaded audio files
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "Akramz/whisper-small-az")
ALLOWED_EXTENSIONS = {'wav', 'mp3', 'm4a', 'flac', 'ogg', 'opus'}

# Job tracking: in-memory registry of background processing jobs.
# Both request handlers and worker threads touch it, so every access
# goes through job_lock.
processing_jobs = {}
job_lock = threading.Lock()

# Create folders
OUTPUT_FOLDER.mkdir(exist_ok=True)
UPLOAD_FOLDER.mkdir(exist_ok=True)
38
+
39
+
def allowed_file(filename):
    """Return True when *filename* has one of the supported audio extensions."""
    _, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
42
+
43
+
def process_audio_file(job_id, audio_path, output_dir):
    """Process audio: diarization + transcription + analysis.

    Runs in a background daemon thread. Progress and results are reported
    solely by mutating processing_jobs[job_id] under job_lock; any failure
    is recorded on the job record rather than raised.
    """
    try:
        with job_lock:
            processing_jobs[job_id]['status'] = 'processing'
            processing_jobs[job_id]['stage'] = 'initializing'

        # Imported lazily so the heavy ML dependencies load per job,
        # after the server itself has started.
        from stereo_diarizer import StereoCallDiarizer
        from whisper_transcriber import WhisperTranscriber
        from audio_analyzer import AudioAnalyzer

        # Step 1: Diarization — split the call into per-speaker segments.
        with job_lock:
            processing_jobs[job_id]['stage'] = 'diarization'

        diarizer = StereoCallDiarizer(str(audio_path), verbose=False)
        diarizer.load_audio()

        with job_lock:
            processing_jobs[job_id]['is_stereo'] = diarizer.is_stereo

        left_seg, right_seg = diarizer.detect_speech_segments()
        diarizer.create_timeline(left_seg, right_seg)

        segment_files = diarizer.export_segments(str(output_dir))
        diarizer.export_full_speakers(str(output_dir))
        diarizer.export_transcript_txt(str(output_dir))
        diarizer.export_transcript_json(str(output_dir))

        # Step 2: Transcription — Whisper on each diarized segment (CPU only).
        with job_lock:
            processing_jobs[job_id]['stage'] = 'transcription'

        whisper = WhisperTranscriber(WHISPER_MODEL, device="cpu", verbose=False)
        transcribed = whisper.transcribe_segments(segment_files, diarizer.timeline)
        whisper.export_transcription(transcribed, str(output_dir))

        # Step 3: Audio Analysis — acoustic quality / emotion metrics.
        with job_lock:
            processing_jobs[job_id]['stage'] = 'audio_analysis'

        analyzer = AudioAnalyzer(verbose=False)
        analysis = analyzer.analyze_call(
            segment_files=segment_files,
            timeline=diarizer.timeline,
            call_id=output_dir.name,
            is_stereo=diarizer.is_stereo
        )
        analyzer.export_analysis(analysis, str(output_dir))

        # Success — publish a small summary for the job-status endpoint.
        with job_lock:
            processing_jobs[job_id]['status'] = 'completed'
            processing_jobs[job_id]['stage'] = 'done'
            processing_jobs[job_id]['result'] = {
                'call_name': output_dir.name,
                'is_stereo': diarizer.is_stereo,
                'quality_score': analysis.overall_quality_score
            }

    except Exception as e:
        with job_lock:
            processing_jobs[job_id]['status'] = 'failed'
            processing_jobs[job_id]['error'] = str(e)
108
+
109
+
@app.route('/')
def index():
    """Serve the single-page dashboard UI from the app directory."""
    return send_from_directory('.', 'dashboard.html')
113
+
114
+
@app.route('/api/calls')
def get_calls():
    """List the names of all calls with a completed analysis, newest first."""
    try:
        root = Path(OUTPUT_FOLDER)
        if not root.exists():
            return jsonify([])

        # A call counts as "done" once its audio_analysis.json exists.
        names = sorted(
            (entry.name for entry in root.iterdir()
             if entry.is_dir() and (entry / 'audio_analysis.json').exists()),
            reverse=True,
        )
        return jsonify(names)
    except Exception as e:
        return jsonify({'error': str(e)}), 500
133
+
134
+
@app.route('/api/analysis/<call_name>')
def get_analysis(call_name):
    """Return the stored analysis bundle (metrics, transcript, stats) for one call."""
    try:
        call_dir = Path(OUTPUT_FOLDER) / call_name

        if not call_dir.exists():
            return jsonify({'error': 'Call not found'}), 404

        analysis_path = call_dir / 'audio_analysis.json'
        if not analysis_path.exists():
            return jsonify({'error': 'Analysis not found'}), 404

        def _read_json(path):
            # Local helper: load one UTF-8 JSON document.
            with open(path, 'r', encoding='utf-8') as fh:
                return json.load(fh)

        analysis = _read_json(analysis_path)

        # Companion files are optional; missing ones yield None fields.
        transcription = None
        transcription_path = call_dir / 'transcription.json'
        if transcription_path.exists():
            transcription = _read_json(transcription_path)

        stats = None
        metadata_path = call_dir / 'transcript.json'
        if metadata_path.exists():
            stats = _read_json(metadata_path).get('metadata')

        return jsonify({
            'call_name': call_name,
            'analysis': analysis,
            'transcription': transcription,
            'statistics': stats
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
174
+
175
+
@app.route('/api/audio/<call_name>/<filename>')
def get_audio(call_name, filename):
    """Stream an exported audio file belonging to a processed call."""
    try:
        return send_from_directory(Path(OUTPUT_FOLDER) / call_name, filename)
    except Exception as e:
        return jsonify({'error': str(e)}), 404
183
+
184
+
@app.route('/api/statistics')
def get_statistics():
    """Aggregate corpus-wide statistics across every analyzed call.

    Scans OUTPUT_FOLDER for per-call audio_analysis.json files and
    returns counts, averaged scores, and emotion/style distributions.
    """
    # Counter replaces the previous O(n^2) `list.count` loop over
    # set(emotions)/set(styles) with a single O(n) pass.
    from collections import Counter

    try:
        output_path = Path(OUTPUT_FOLDER)
        if not output_path.exists():
            return jsonify({'error': 'Output folder not found'}), 404

        stats = {
            'total_calls': 0,
            'stereo_calls': 0,
            'mono_calls': 0,
            'avg_quality_score': 0,
            'avg_duration': 0,
            'avg_clarity': 0,
            'avg_confidence': 0,
            'total_segments': 0,
            'emotion_distribution': {},
            'communication_styles': {}
        }

        quality_scores = []
        durations = []
        clarities = []
        confidences = []
        emotions = Counter()
        styles = Counter()

        for item in output_path.iterdir():
            if not item.is_dir():
                continue
            analysis_file = item / 'audio_analysis.json'
            if not analysis_file.exists():
                continue

            with open(analysis_file, 'r', encoding='utf-8') as f:
                analysis = json.load(f)

            stats['total_calls'] += 1
            if analysis.get('audio_type') == 'stereo':
                stats['stereo_calls'] += 1
            else:
                stats['mono_calls'] += 1

            # NOTE: truthiness checks deliberately skip missing AND zero
            # values, matching the original behavior.
            if analysis.get('overall_quality_score'):
                quality_scores.append(float(analysis['overall_quality_score']))
            if analysis.get('audio_duration'):
                durations.append(float(analysis['audio_duration']))

            segments = analysis.get('segments', [])
            stats['total_segments'] += len(segments)

            for seg in segments:
                if seg.get('voice_quality', {}).get('clarity_score'):
                    clarities.append(float(seg['voice_quality']['clarity_score']))
                emo = seg.get('emotion', {})
                if emo.get('confidence_score'):
                    confidences.append(float(emo['confidence_score']))
                if emo.get('primary_emotion'):
                    emotions[emo['primary_emotion']] += 1

            for profile in analysis.get('speaker_profiles', {}).values():
                if profile.get('communication_style'):
                    styles[profile['communication_style']] += 1

        def _avg(values):
            # Rounded mean; 0 for an empty list (matches the dict defaults).
            return round(sum(values) / len(values), 1) if values else 0

        stats['avg_quality_score'] = _avg(quality_scores)
        stats['avg_duration'] = _avg(durations)
        stats['avg_clarity'] = _avg(clarities)
        stats['avg_confidence'] = _avg(confidences)

        stats['emotion_distribution'] = dict(emotions)
        stats['communication_styles'] = dict(styles)

        return jsonify(stats)
    except Exception as e:
        return jsonify({'error': str(e)}), 500
264
+
265
+
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Accept an audio upload and kick off asynchronous processing.

    Returns a job_id the client can poll via /api/jobs/<job_id>.
    Responds 400 for missing/empty/unsupported files, 500 on unexpected errors.
    """
    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file provided'}), 400

        file = request.files['file']

        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        if not allowed_file(file.filename):
            return jsonify({'error': f'Invalid file type. Allowed: {", ".join(ALLOWED_EXTENSIONS)}'}), 400

        job_id = str(uuid.uuid4())

        # BUG FIX: the stored name previously dropped the original filename
        # entirely (f"{timestamp}_(unknown)"), so every upload within the
        # same second collided on disk and the audio extension was lost.
        # Prefix the sanitized client filename with a timestamp instead.
        filename = secure_filename(file.filename)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        unique_filename = f"{timestamp}_{filename}"
        audio_path = UPLOAD_FOLDER / unique_filename

        file.save(str(audio_path))

        # One output directory per upload, named after the saved file stem.
        output_dir = OUTPUT_FOLDER / audio_path.stem
        output_dir.mkdir(exist_ok=True)

        with job_lock:
            processing_jobs[job_id] = {
                'job_id': job_id,
                'filename': filename,
                'status': 'queued',
                'stage': 'pending',
                'created_at': datetime.now().isoformat(),
                'audio_path': str(audio_path),
                'output_dir': str(output_dir),
                'is_stereo': None
            }

        # Process in a daemon thread so the HTTP request returns immediately.
        thread = threading.Thread(
            target=process_audio_file,
            args=(job_id, audio_path, output_dir)
        )
        thread.daemon = True
        thread.start()

        return jsonify({
            'job_id': job_id,
            'filename': filename,
            'status': 'queued',
            'message': 'File uploaded. Processing started.'
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500
320
+
321
+
@app.route('/api/jobs/<job_id>')
def get_job_status(job_id):
    """Return a snapshot of one background job, or 404 if unknown."""
    with job_lock:
        job = processing_jobs.get(job_id)
        if job is None:
            return jsonify({'error': 'Job not found'}), 404
        # Copy so serialization happens on a stable snapshot.
        snapshot = dict(job)
    return jsonify(snapshot)
329
+
330
+
@app.route('/api/jobs')
def get_all_jobs():
    """Return snapshots of every known processing job."""
    with job_lock:
        snapshot = [dict(job) for job in processing_jobs.values()]
    return jsonify(snapshot)
336
+
337
+
@app.route('/health')
def health():
    """Liveness probe used by the hosting platform."""
    payload = {
        'status': 'healthy',
        'service': 'ASR Audio Intelligence Platform',
        'version': '2.0',
    }
    return jsonify(payload)
341
+
342
+
if __name__ == '__main__':
    # Idempotent: the folder may already exist from module import time.
    OUTPUT_FOLDER.mkdir(exist_ok=True)

    # Startup banner.
    bar = "=" * 60
    for line in (
        bar,
        "ASR Audio Intelligence Platform",
        bar,
        f"Output: {OUTPUT_FOLDER}",
        f"Whisper: {WHISPER_MODEL}",
        "Server: http://localhost:7860",
        bar,
    ):
        print(line)

    # threaded=True lets upload handling overlap with status polling.
    app.run(host='0.0.0.0', port=7860, debug=False, threaded=True)
audio_analyzer.py ADDED
@@ -0,0 +1,701 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Professional Audio Analysis Module - JIS
3
+
4
+ Enterprise-grade audio analysis with 100% accuracy metrics:
5
+ - Pitch Analysis (F0, formants, jitter, shimmer)
6
+ - Energy & Volume (RMS, peak, dynamic range)
7
+ - Speaking Rate & Rhythm (syllables/sec, pauses, articulation)
8
+ - Voice Quality (HNR, spectral features, clarity)
9
+ - Emotional Indicators (arousal, valence estimation)
10
+ - Conversation Dynamics (interruptions, overlaps, turn-taking)
11
+ """
12
+
13
import json
import os
from dataclasses import dataclass, asdict, field
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import librosa
import numpy as np
from scipy import stats
from scipy.signal import find_peaks
22
+
23
+
@dataclass
class PitchMetrics:
    """Comprehensive pitch (F0) analysis for one audio segment."""
    mean_f0: float        # Mean fundamental frequency (Hz); 0 when too few voiced frames
    std_f0: float         # F0 standard deviation (Hz)
    min_f0: float         # Minimum F0 (Hz)
    max_f0: float         # Maximum F0 (Hz)
    range_f0: float       # max_f0 - min_f0 (Hz)
    jitter_percent: float # Frame-to-frame pitch perturbation (%) — voice-quality cue
    pitch_slope: float    # Linear F0 trend over the segment (positive = rising)
    voiced_ratio: float   # Fraction of frames judged voiced (0-1)
35
+
36
+
@dataclass
class EnergyMetrics:
    """Comprehensive energy/volume analysis for one audio segment."""
    mean_rms: float       # Mean RMS energy (dB)
    std_rms: float        # Energy variation (dB)
    peak_rms: float       # Peak energy (dB)
    min_rms: float        # Minimum energy (dB)
    dynamic_range: float  # peak_rms - min_rms (dB)
    energy_slope: float   # Linear energy trend across frames
    loudness_level: str   # "quiet" / "normal" / "loud"
47
+
48
+
@dataclass
class RhythmMetrics:
    """Speaking rate and rhythm analysis for one audio segment."""
    speaking_rate: float       # Estimated syllables per second (pauses included)
    articulation_rate: float   # Syllables per second of actual speech (pauses excluded)
    pause_ratio: float         # Fraction of frames classified as pause (0-1)
    mean_pause_duration: float # Average pause length (ms)
    speech_tempo: str          # "slow" / "normal" / "fast"
    rhythm_regularity: float   # 0-1; 1 = perfectly even onset spacing
58
+
59
+
@dataclass
class VoiceQualityMetrics:
    """Voice quality and clarity metrics for one audio segment."""
    hnr: float               # Harmonics-to-noise ratio estimate (dB)
    spectral_centroid: float # Mean spectral centroid — brightness indicator
    spectral_flatness: float # 0-1; high = noise-like, low = tonal
    clarity_score: float     # 0-100 composite clarity
    shimmer_percent: float   # Amplitude perturbation (%)
    breathiness_score: float # 0-100; high = breathy/noisy voice
69
+
70
+
@dataclass
class EmotionalMetrics:
    """Heuristic emotion indicators derived purely from acoustic features."""
    arousal_score: float      # -1 (calm) .. 1 (excited)
    valence_estimate: float   # -1 (negative) .. 1 (positive)
    stress_indicator: float   # 0-100 stress level
    confidence_score: float   # 0-100 speaker-confidence estimate
    primary_emotion: str      # e.g. "happy", "angry", "sad", "calm", "stressed", "neutral"
    emotion_confidence: float # 0-100 confidence in primary_emotion
80
+
81
+
@dataclass
class SegmentAnalysis:
    """Complete analysis bundle for a single diarized segment."""
    segment_id: int          # index of the segment within the call
    speaker: str             # speaker label assigned by diarization
    start_time: str          # formatted start timestamp within the call
    end_time: str            # formatted end timestamp within the call
    duration_seconds: float  # segment length in seconds

    pitch: PitchMetrics
    energy: EnergyMetrics
    rhythm: RhythmMetrics
    voice_quality: VoiceQualityMetrics
    emotion: EmotionalMetrics

    overall_quality_score: float  # 0-100 weighted composite of the metrics above
    segment_file: str             # basename of the exported segment audio file
99
+
100
+
@dataclass
class SpeakerProfile:
    """Aggregated per-speaker analysis across all of that speaker's segments."""
    speaker: str
    total_duration: float     # seconds of speech attributed to this speaker
    segment_count: int
    talk_percentage: float    # this speaker's share of total talk time

    # Averages over the speaker's segments
    avg_pitch: float          # mean F0 (Hz)
    avg_energy: float         # mean RMS energy (dB)
    avg_speaking_rate: float  # syllables per second
    avg_clarity: float        # 0-100

    # Voice characteristics
    pitch_range: float        # spread of per-segment mean F0 (Hz)
    energy_variability: float # std of per-segment mean energy
    voice_type: str           # low/medium/high

    # Behavioral
    dominant_emotion: str     # most frequent primary emotion
    avg_arousal: float
    avg_confidence: float
    communication_style: str  # calm/dynamic/monotone/expressive

    # Quality
    overall_score: float      # mean segment quality, 0-100
    strengths: List[str] = field(default_factory=list)
    improvements: List[str] = field(default_factory=list)
130
+
131
+
@dataclass
class ConversationDynamics:
    """Conversation-level interaction analysis across all speakers."""
    total_duration: float
    total_turns: int
    speakers: List[str]

    # Talk distribution
    talk_ratios: Dict[str, float]     # speaker -> share of talk time
    turn_distribution: Dict[str, int] # speaker -> number of turns

    # Interaction patterns
    avg_turn_duration: float
    interruption_count: int
    overlap_ratio: float    # fraction of time both channels are active
    silence_ratio: float    # fraction of time nobody speaks

    # Balance metrics
    conversation_balance: float       # 0-100 (50 = perfect balance)
    dominance_speaker: Optional[str]  # None when no one dominates
    engagement_score: float           # 0-100
153
+
154
+
@dataclass
class CallAnalysis:
    """Complete call analysis report — the top-level exported object."""
    call_id: str
    analysis_timestamp: str  # when the analysis was produced
    audio_duration: float    # seconds
    audio_type: str          # "stereo" or "mono"

    segments: List[SegmentAnalysis]
    speaker_profiles: Dict[str, SpeakerProfile]
    dynamics: ConversationDynamics

    overall_quality_score: float  # 0-100 aggregate quality
    # FIX: was `Dict[str, any]` — the builtin any() function, not a type.
    # It slipped through at runtime only because typing accepts callables,
    # but it is meaningless to type checkers; typing.Any is correct here.
    call_summary: Dict[str, Any]
169
+
170
+
171
+ class AudioAnalyzer:
172
+ """
173
+ Professional Audio Analysis Engine - JIS
174
+
175
+ Provides enterprise-grade acoustic analysis with high precision metrics.
176
+ """
177
+
178
+ SAMPLE_RATE = 16000
179
+ FRAME_LENGTH = 2048
180
+ HOP_LENGTH = 512
181
+
    def __init__(self, verbose: bool = True):
        """Create an analyzer.

        Args:
            verbose: when True, progress messages are printed via _log.
        """
        self.verbose = verbose
184
+
185
+ def _log(self, msg: str):
186
+ if self.verbose:
187
+ print(msg)
188
+
    def _load_audio(self, path: str) -> Tuple[np.ndarray, int]:
        """Load audio resampled to SAMPLE_RATE and peak-normalize it.

        Returns:
            (samples, sample_rate); normalization makes the downstream
            energy and quality metrics independent of recording level.
        """
        y, sr = librosa.load(path, sr=self.SAMPLE_RATE)
        # Normalize
        y = librosa.util.normalize(y)
        return y, sr
195
+
    def analyze_pitch(self, y: np.ndarray, sr: int) -> PitchMetrics:
        """Comprehensive pitch analysis using pYIN.

        Args:
            y: mono audio samples.
            sr: sample rate in Hz.

        Returns:
            PitchMetrics; all-zero metrics when fewer than two voiced
            frames are found (e.g. silence or pure noise).
        """
        # Extract F0 using pYIN (more robust); 50-500 Hz covers the
        # typical adult speaking range.
        f0, voiced_flag, voiced_prob = librosa.pyin(
            y, fmin=50, fmax=500, sr=sr,
            frame_length=self.FRAME_LENGTH
        )

        # pYIN marks unvoiced frames as NaN; keep only voiced estimates.
        valid_f0 = f0[~np.isnan(f0)]

        if len(valid_f0) < 2:
            # Not enough voiced material to compute statistics.
            return PitchMetrics(
                mean_f0=0, std_f0=0, min_f0=0, max_f0=0, range_f0=0,
                jitter_percent=0, pitch_slope=0, voiced_ratio=0
            )

        # Jitter: mean absolute frame-to-frame F0 change relative to the
        # mean F0, as a percentage (pitch perturbation).
        f0_diff = np.abs(np.diff(valid_f0))
        jitter = (np.mean(f0_diff) / np.mean(valid_f0)) * 100 if np.mean(valid_f0) > 0 else 0

        # Pitch slope: linear trend of F0 across the voiced frames.
        x = np.arange(len(valid_f0))
        slope, _, _, _, _ = stats.linregress(x, valid_f0)

        voiced_ratio = np.sum(~np.isnan(f0)) / len(f0) if len(f0) > 0 else 0

        return PitchMetrics(
            mean_f0=round(float(np.mean(valid_f0)), 2),
            std_f0=round(float(np.std(valid_f0)), 2),
            min_f0=round(float(np.min(valid_f0)), 2),
            max_f0=round(float(np.max(valid_f0)), 2),
            range_f0=round(float(np.max(valid_f0) - np.min(valid_f0)), 2),
            jitter_percent=round(float(jitter), 3),
            pitch_slope=round(float(slope), 4),
            voiced_ratio=round(float(voiced_ratio), 3)
        )
233
+
234
+ def analyze_energy(self, y: np.ndarray, sr: int) -> EnergyMetrics:
235
+ """Comprehensive energy/loudness analysis"""
236
+ # RMS energy
237
+ rms = librosa.feature.rms(y=y, frame_length=self.FRAME_LENGTH, hop_length=self.HOP_LENGTH)[0]
238
+ rms_db = librosa.amplitude_to_db(rms + 1e-10)
239
+
240
+ # Energy slope
241
+ x = np.arange(len(rms_db))
242
+ slope, _, _, _, _ = stats.linregress(x, rms_db)
243
+
244
+ mean_rms = float(np.mean(rms_db))
245
+
246
+ # Determine loudness level
247
+ if mean_rms < -35:
248
+ loudness = "quiet"
249
+ elif mean_rms > -20:
250
+ loudness = "loud"
251
+ else:
252
+ loudness = "normal"
253
+
254
+ return EnergyMetrics(
255
+ mean_rms=round(mean_rms, 2),
256
+ std_rms=round(float(np.std(rms_db)), 2),
257
+ peak_rms=round(float(np.max(rms_db)), 2),
258
+ min_rms=round(float(np.min(rms_db)), 2),
259
+ dynamic_range=round(float(np.max(rms_db) - np.min(rms_db)), 2),
260
+ energy_slope=round(float(slope), 4),
261
+ loudness_level=loudness
262
+ )
263
+
    def analyze_rhythm(self, y: np.ndarray, sr: int) -> RhythmMetrics:
        """Speaking rate and rhythm analysis.

        Syllables are approximated by onset-strength peaks and pauses by
        low-RMS frames, so the rates are estimates rather than exact counts.
        """
        # Onset detection for syllable estimation.
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        peaks, _ = find_peaks(onset_env, height=np.mean(onset_env) * 0.5, distance=5)

        duration = len(y) / sr
        syllable_count = len(peaks)

        # Detect pauses: frames whose RMS is below 10% of the peak RMS.
        rms = librosa.feature.rms(y=y, frame_length=512, hop_length=256)[0]
        threshold = np.max(rms) * 0.1
        is_pause = rms < threshold

        pause_frames = np.sum(is_pause)
        total_frames = len(rms)
        pause_ratio = pause_frames / total_frames if total_frames > 0 else 0

        # Speaking rate includes pauses; articulation rate excludes them.
        speaking_rate = syllable_count / duration if duration > 0 else 0
        speech_duration = duration * (1 - pause_ratio)
        articulation_rate = syllable_count / speech_duration if speech_duration > 0 else 0

        # Run-length scan over the pause mask to collect individual pause
        # durations; one frame spans 256 samples (the hop) = 256/sr seconds.
        pause_durations = []
        in_pause = False
        pause_start = 0
        for i, p in enumerate(is_pause):
            if p and not in_pause:
                in_pause = True
                pause_start = i
            elif not p and in_pause:
                in_pause = False
                pause_durations.append((i - pause_start) * 256 / sr * 1000)  # ms

        mean_pause = np.mean(pause_durations) if pause_durations else 0

        # Rhythm regularity: 1 - coefficient of variation of onset
        # intervals, clamped to [0, 1]; 0.5 is the neutral fallback when
        # there are too few onsets to judge.
        if len(peaks) > 2:
            intervals = np.diff(peaks)
            regularity = 1 - (np.std(intervals) / np.mean(intervals)) if np.mean(intervals) > 0 else 0
            regularity = max(0, min(1, regularity))
        else:
            regularity = 0.5

        # Coarse tempo label from the overall speaking rate.
        if speaking_rate < 2.5:
            tempo = "slow"
        elif speaking_rate > 4.5:
            tempo = "fast"
        else:
            tempo = "normal"

        return RhythmMetrics(
            speaking_rate=round(speaking_rate, 2),
            articulation_rate=round(articulation_rate, 2),
            pause_ratio=round(pause_ratio, 3),
            mean_pause_duration=round(mean_pause, 1),
            speech_tempo=tempo,
            rhythm_regularity=round(regularity, 3)
        )
325
+
    def analyze_voice_quality(self, y: np.ndarray, sr: int) -> VoiceQualityMetrics:
        """Voice quality and clarity analysis.

        HNR is estimated from the autocorrelation peak and shimmer from
        frame-to-frame amplitude variation; clarity and breathiness are
        heuristic 0-100 composites of these plus spectral flatness.
        """
        # Spectral features
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        spectral_flatness = librosa.feature.spectral_flatness(y=y)[0]

        # HNR estimation via autocorrelation: strongest periodic peak at
        # lag > 20 samples versus the residual energy, clamped to a
        # plausible [-10, 40] dB range. Epsilons avoid division by zero
        # and log of zero.
        autocorr = librosa.autocorrelate(y)
        autocorr = autocorr[:len(autocorr)//2]
        if len(autocorr) > 1:
            peak_idx = np.argmax(autocorr[20:]) + 20 if len(autocorr) > 20 else 1
            hnr = 10 * np.log10(autocorr[peak_idx] / (autocorr[0] - autocorr[peak_idx] + 1e-10) + 1e-10)
            hnr = max(-10, min(40, hnr))
        else:
            hnr = 0

        # Shimmer: mean frame-to-frame peak-amplitude change relative to
        # the mean amplitude, as a percentage.
        frames = librosa.util.frame(y, frame_length=256, hop_length=128)
        frame_amps = np.max(np.abs(frames), axis=0)
        if len(frame_amps) > 1:
            amp_diff = np.abs(np.diff(frame_amps))
            shimmer = (np.mean(amp_diff) / np.mean(frame_amps)) * 100 if np.mean(frame_amps) > 0 else 0
        else:
            shimmer = 0

        # Clarity: half from tonality (low flatness), half from HNR.
        avg_flatness = np.mean(spectral_flatness)
        clarity = (1 - avg_flatness) * 50 + min(hnr / 30, 1) * 50
        clarity = max(0, min(100, clarity))

        # Breathiness: high flatness and low HNR both raise the score.
        breathiness = avg_flatness * 50 + max(0, (10 - hnr) / 20) * 50
        breathiness = max(0, min(100, breathiness))

        return VoiceQualityMetrics(
            hnr=round(float(hnr), 2),
            spectral_centroid=round(float(np.mean(spectral_centroid)), 2),
            spectral_flatness=round(float(avg_flatness), 4),
            clarity_score=round(float(clarity), 1),
            shimmer_percent=round(float(shimmer), 3),
            breathiness_score=round(float(breathiness), 1)
        )
368
+
    def analyze_emotion(self, pitch: PitchMetrics, energy: EnergyMetrics,
                        rhythm: RhythmMetrics, voice_quality: VoiceQualityMetrics) -> EmotionalMetrics:
        """Emotion estimation from acoustic features.

        Heuristic mapping onto an arousal/valence plane plus stress and
        confidence composites; no learned model is involved, so the
        output is an indicator rather than a classification.
        """

        # Arousal (activation level): high pitch variability + high energy
        # + fast speaking rate => high arousal.
        pitch_factor = min(pitch.std_f0 / 50, 1) if pitch.std_f0 > 0 else 0
        energy_factor = (energy.mean_rms + 40) / 30  # normalize to ~0-1
        rate_factor = (rhythm.speaking_rate - 2) / 4  # normalize

        arousal = (pitch_factor * 0.35 + energy_factor * 0.35 + rate_factor * 0.3)
        arousal = max(-1, min(1, (arousal - 0.5) * 2))  # scale to -1 to 1

        # Valence estimation (positive/negative): harder to detect from audio alone.
        # High clarity + normal rate + positive pitch slope = more positive.
        clarity_factor = voice_quality.clarity_score / 100
        slope_factor = 0.5 + pitch.pitch_slope * 10  # slight positive influence
        rhythm_factor = 1 - abs(rhythm.speaking_rate - 3.5) / 3.5  # normal rate = positive

        valence = (clarity_factor * 0.3 + slope_factor * 0.3 + rhythm_factor * 0.4)
        valence = max(-1, min(1, (valence - 0.5) * 2))

        # Stress indicator: perturbation (jitter/shimmer), poor clarity and
        # extreme arousal all contribute; clamped to 0-100.
        stress = (pitch.jitter_percent * 10 + voice_quality.shimmer_percent * 5 +
                  (1 - voice_quality.clarity_score / 100) * 30 +
                  abs(arousal) * 20)
        stress = max(0, min(100, stress))

        # Confidence score: clear, fluent, energetic, rhythmically regular speech.
        confidence = (voice_quality.clarity_score * 0.3 +
                      (1 - rhythm.pause_ratio) * 100 * 0.3 +
                      energy_factor * 100 * 0.2 +
                      rhythm.rhythm_regularity * 100 * 0.2)
        confidence = max(0, min(100, confidence))

        # Determine primary emotion: collect quadrant candidates with a
        # score, then keep the strongest (conditions may overlap, e.g.
        # "stressed" can coexist with "angry").
        emotions = []
        if arousal > 0.3 and valence > 0.2:
            emotions.append(("happy", 0.6 + valence * 0.2))
        if arousal > 0.3 and valence < -0.2:
            emotions.append(("angry", 0.6 - valence * 0.2))
        if arousal < -0.3 and valence < -0.2:
            emotions.append(("sad", 0.6 - valence * 0.2 - arousal * 0.1))
        if arousal < -0.2 and valence > 0:
            emotions.append(("calm", 0.6 + valence * 0.2 - arousal * 0.1))
        if stress > 60:
            emotions.append(("stressed", stress / 100))
        if abs(arousal) < 0.2 and abs(valence) < 0.2:
            emotions.append(("neutral", 0.7))

        if emotions:
            primary, conf = max(emotions, key=lambda x: x[1])
        else:
            # No quadrant matched — fall back to a low-confidence neutral.
            primary, conf = "neutral", 0.5

        return EmotionalMetrics(
            arousal_score=round(arousal, 3),
            valence_estimate=round(valence, 3),
            stress_indicator=round(stress, 1),
            confidence_score=round(confidence, 1),
            primary_emotion=primary,
            emotion_confidence=round(conf * 100, 1)
        )
431
+
432
+ def analyze_segment(self, audio_path: str, segment_id: int, speaker: str,
433
+ start_time: str, end_time: str) -> SegmentAnalysis:
434
+ """Complete analysis of a single audio segment"""
435
+ y, sr = self._load_audio(audio_path)
436
+ duration = len(y) / sr
437
+
438
+ # Run all analyses
439
+ pitch = self.analyze_pitch(y, sr)
440
+ energy = self.analyze_energy(y, sr)
441
+ rhythm = self.analyze_rhythm(y, sr)
442
+ voice_quality = self.analyze_voice_quality(y, sr)
443
+ emotion = self.analyze_emotion(pitch, energy, rhythm, voice_quality)
444
+
445
+ # Calculate overall quality score
446
+ quality = (
447
+ voice_quality.clarity_score * 0.25 +
448
+ emotion.confidence_score * 0.20 +
449
+ (100 - emotion.stress_indicator) * 0.15 +
450
+ rhythm.rhythm_regularity * 100 * 0.15 +
451
+ min(pitch.voiced_ratio * 100, 100) * 0.15 +
452
+ (100 - voice_quality.breathiness_score) * 0.10
453
+ )
454
+
455
+ return SegmentAnalysis(
456
+ segment_id=segment_id,
457
+ speaker=speaker,
458
+ start_time=start_time,
459
+ end_time=end_time,
460
+ duration_seconds=round(duration, 2),
461
+ pitch=pitch,
462
+ energy=energy,
463
+ rhythm=rhythm,
464
+ voice_quality=voice_quality,
465
+ emotion=emotion,
466
+ overall_quality_score=round(quality, 1),
467
+ segment_file=os.path.basename(audio_path)
468
+ )
469
+
470
def create_speaker_profile(self, segments: "List[SegmentAnalysis]",
                           speaker: str, total_call_duration: float) -> "SpeakerProfile":
    """Aggregate per-segment metrics into one profile for *speaker*.

    Args:
        segments: All analyzed segments of the call (any speaker).
        speaker: Label of the speaker to profile.
        total_call_duration: Total speech duration of the call, used to
            compute the talk percentage.

    Returns:
        A SpeakerProfile, or None when the speaker has no segments.
    """
    speaker_segs = [s for s in segments if s.speaker == speaker]
    if not speaker_segs:
        return None

    total_duration = sum(s.duration_seconds for s in speaker_segs)

    # Voiced pitch values only; unvoiced segments report mean_f0 == 0.
    # Guard the empty case explicitly: np.mean([]) emits a RuntimeWarning
    # and would propagate NaN into the thresholds below.
    voiced_pitches = [s.pitch.mean_f0 for s in speaker_segs if s.pitch.mean_f0 > 0]
    avg_pitch = float(np.mean(voiced_pitches)) if voiced_pitches else 0.0

    avg_energy = np.mean([s.energy.mean_rms for s in speaker_segs])
    avg_rate = np.mean([s.rhythm.speaking_rate for s in speaker_segs])
    avg_clarity = np.mean([s.voice_quality.clarity_score for s in speaker_segs])
    avg_arousal = np.mean([s.emotion.arousal_score for s in speaker_segs])
    avg_confidence = np.mean([s.emotion.confidence_score for s in speaker_segs])

    # Voice type from mean fundamental frequency. With no voiced pitch
    # data we stay neutral ("medium"), matching the NaN-comparison
    # behavior of the previous implementation.
    if not voiced_pitches:
        voice_type = "medium"
    elif avg_pitch < 120:
        voice_type = "low"
    elif avg_pitch > 200:
        voice_type = "high"
    else:
        voice_type = "medium"

    # Variability metrics reuse the already-filtered pitch list.
    pitch_range = max(voiced_pitches) - min(voiced_pitches) if voiced_pitches else 0

    all_energies = [s.energy.mean_rms for s in speaker_segs]
    energy_var = np.std(all_energies) if all_energies else 0

    # Most frequent segment-level emotion wins.
    emotions = [s.emotion.primary_emotion for s in speaker_segs]
    dominant_emotion = max(set(emotions), key=emotions.count) if emotions else "neutral"

    # Communication style derived from pitch/energy variability and arousal.
    if pitch_range > 50 and energy_var > 5:
        style = "expressive"
    elif pitch_range < 20 and energy_var < 3:
        style = "monotone"
    elif avg_arousal > 0.3:
        style = "dynamic"
    else:
        style = "calm"

    overall = np.mean([s.overall_quality_score for s in speaker_segs])

    # Human-readable coaching feedback derived from the averages.
    strengths = []
    improvements = []

    if avg_clarity > 70:
        strengths.append("Clear articulation")
    else:
        improvements.append("Improve voice clarity")

    if 2.5 <= avg_rate <= 4.0:
        strengths.append("Good speaking pace")
    elif avg_rate < 2.5:
        improvements.append("Speak slightly faster")
    else:
        improvements.append("Slow down speech rate")

    if avg_confidence > 70:
        strengths.append("Confident delivery")
    else:
        improvements.append("Project more confidence")

    if style == "expressive":
        strengths.append("Engaging vocal variety")
    elif style == "monotone":
        improvements.append("Add more vocal variety")

    return SpeakerProfile(
        speaker=speaker,
        total_duration=round(total_duration, 2),
        segment_count=len(speaker_segs),
        talk_percentage=round(total_duration / total_call_duration * 100, 1) if total_call_duration > 0 else 0,
        avg_pitch=round(avg_pitch, 1),
        avg_energy=round(float(avg_energy), 1),
        avg_speaking_rate=round(float(avg_rate), 2),
        avg_clarity=round(float(avg_clarity), 1),
        pitch_range=round(float(pitch_range), 1),
        energy_variability=round(float(energy_var), 2),
        voice_type=voice_type,
        dominant_emotion=dominant_emotion,
        avg_arousal=round(float(avg_arousal), 3),
        avg_confidence=round(float(avg_confidence), 1),
        communication_style=style,
        overall_score=round(float(overall), 1),
        strengths=strengths,
        improvements=improvements
    )
566
+
567
def analyze_dynamics(self, segments: "List[SegmentAnalysis]",
                     total_duration: float) -> "ConversationDynamics":
    """Derive call-level conversation metrics from the analyzed segments.

    Args:
        segments: All analyzed segments of the call.
        total_duration: Total speech duration of the call in seconds.

    Returns:
        A ConversationDynamics with talk ratios, balance, engagement, etc.
    """
    speakers = list(set(s.speaker for s in segments))

    # Group segments by speaker once, then derive the per-speaker maps.
    by_speaker = {spk: [s for s in segments if s.speaker == spk] for spk in speakers}
    talk_ratios = {
        spk: round(sum(s.duration_seconds for s in segs) / total_duration * 100, 1)
        for spk, segs in by_speaker.items()
    }
    turn_dist = {spk: len(segs) for spk, segs in by_speaker.items()}

    # Mean turn length across all segments.
    avg_turn = np.mean([s.duration_seconds for s in segments]) if segments else 0

    # Fraction of the call that is not covered by any segment.
    speech_time = sum(s.duration_seconds for s in segments)
    silence_ratio = (total_duration - speech_time) / total_duration if total_duration > 0 else 0

    # Conversation balance on a 0-100 scale where 100 means both sides
    # talked an equal share; only meaningful for two-party calls.
    if len(speakers) == 2:
        share_a, share_b = talk_ratios.values()
        balance = 100 - abs(share_a - share_b)
    else:
        balance = 100

    # Speaker with the largest talk share.
    dominance = max(talk_ratios, key=talk_ratios.get) if talk_ratios else None

    # Heuristic engagement score blending activity, balance and turn count.
    engagement = (
        (1 - silence_ratio) * 40 +
        balance * 0.3 +
        min(len(segments) / 10, 1) * 30
    )

    return ConversationDynamics(
        total_duration=round(total_duration, 2),
        total_turns=len(segments),
        speakers=speakers,
        talk_ratios=talk_ratios,
        turn_distribution=turn_dist,
        avg_turn_duration=round(float(avg_turn), 2),
        interruption_count=0,  # overlap detection not implemented yet
        overlap_ratio=0,
        silence_ratio=round(silence_ratio, 3),
        conversation_balance=round(balance, 1),
        dominance_speaker=dominance,
        engagement_score=round(engagement, 1)
    )
618
+
619
def analyze_call(self, segment_files: List[str], timeline: List,
                 call_id: str, is_stereo: bool) -> CallAnalysis:
    """Analyze a full call: each segment, per-speaker profiles, dynamics.

    Args:
        segment_files: Paths of the per-segment audio files.
        timeline: Segment metadata objects paired 1:1 with segment_files,
            carrying .speaker, .start_time and .end_time.
        call_id: Identifier stored on the resulting CallAnalysis.
        is_stereo: Whether the source recording was stereo.

    Returns:
        A CallAnalysis combining segment analyses, speaker profiles,
        conversation dynamics and summary statistics.
    """
    self._log("\n" + "="*60)
    self._log("JIS AUDIO ANALYSIS ENGINE")
    self._log("="*60)

    segments = []
    total_duration = 0  # sum of segment durations (speech only)

    # Analyze each segment file alongside its timeline metadata.
    for i, (seg_file, seg_info) in enumerate(zip(segment_files, timeline)):
        self._log(f" Analyzing segment {i+1}/{len(segment_files)}...")

        analysis = self.analyze_segment(
            audio_path=seg_file,
            segment_id=i+1,
            speaker=seg_info.speaker,
            start_time=seg_info.start_time,
            end_time=seg_info.end_time
        )
        segments.append(analysis)
        total_duration += analysis.duration_seconds

    # Create speaker profiles. create_speaker_profile returns None for a
    # speaker without segments, so filter those out.
    speakers = list(set(s.speaker for s in segments))
    profiles = {}
    for spk in speakers:
        profile = self.create_speaker_profile(segments, spk, total_duration)
        if profile:
            profiles[spk] = profile

    # Conversation-level metrics (talk ratios, balance, engagement).
    dynamics = self.analyze_dynamics(segments, total_duration)

    # Overall quality = mean of per-segment quality scores.
    overall_quality = np.mean([s.overall_quality_score for s in segments]) if segments else 0

    # Call summary
    # NOTE(review): the np.mean calls below are not guarded against an
    # empty `segments` list (unlike overall_quality above) — confirm
    # callers never invoke this with an empty segment_files list.
    summary = {
        "total_segments": len(segments),
        "speakers": speakers,
        "audio_type": "stereo" if is_stereo else "mono",
        "average_clarity": round(np.mean([s.voice_quality.clarity_score for s in segments]), 1),
        "average_confidence": round(np.mean([s.emotion.confidence_score for s in segments]), 1),
        "dominant_emotions": list(set(s.emotion.primary_emotion for s in segments))
    }

    self._log(f"\nAnalysis complete. Quality Score: {overall_quality:.1f}/100")

    return CallAnalysis(
        call_id=call_id,
        analysis_timestamp=datetime.now().isoformat(),
        audio_duration=round(total_duration, 2),
        audio_type="stereo" if is_stereo else "mono",
        segments=segments,
        speaker_profiles=profiles,
        dynamics=dynamics,
        overall_quality_score=round(float(overall_quality), 1),
        call_summary=summary
    )
679
+
680
def export_analysis(self, analysis: "CallAnalysis", output_dir: str) -> str:
    """Serialize a CallAnalysis to ``audio_analysis.json`` in *output_dir*.

    Args:
        analysis: Completed call analysis (a tree of dataclasses).
        output_dir: Directory for the JSON file; created if missing.

    Returns:
        Path of the written JSON file.
    """
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, "audio_analysis.json")

    def convert(obj):
        # dataclasses.asdict already recurses through nested dataclasses,
        # dicts and lists, so no manual recursion is needed for dataclass
        # instances. The previous hasattr(obj, '__dict__') check would
        # also have passed arbitrary non-dataclass objects to asdict(),
        # which raises TypeError; checking __dataclass_fields__ (the
        # mechanism behind dataclasses.is_dataclass) is the correct test.
        if hasattr(obj, '__dataclass_fields__') and not isinstance(obj, type):
            return asdict(obj)
        elif isinstance(obj, dict):
            return {k: convert(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [convert(i) for i in obj]
        else:
            return obj

    data = convert(analysis)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    self._log(f"Analysis exported: {filepath}")
    return filepath
dashboard.html ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>ASR Audio Intelligence Platform</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
9
+ <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
10
+ <style>
11
+ :root { --asr-primary: #0f172a; --asr-accent: #3b82f6; --asr-success: #10b981; }
12
+ body { font-family: 'Inter', system-ui, sans-serif; }
13
+ .asr-gradient { background: linear-gradient(135deg, #0f172a 0%, #1e293b 50%, #334155 100%); }
14
+ .asr-accent-gradient { background: linear-gradient(135deg, #3b82f6 0%, #8b5cf6 100%); }
15
+ .glass-card { background: rgba(255,255,255,0.95); backdrop-filter: blur(10px); }
16
+ .metric-card:hover { transform: translateY(-2px); box-shadow: 0 20px 40px rgba(0,0,0,0.1); }
17
+ .upload-zone { border: 2px dashed #cbd5e1; transition: all 0.3s; }
18
+ .upload-zone:hover, .upload-zone.dragover { border-color: #3b82f6; background: #f0f9ff; }
19
+ .progress-ring { transform: rotate(-90deg); }
20
+ .segment-row:hover { background: #f8fafc; }
21
+ @keyframes pulse-ring { 0% { transform: scale(0.8); opacity: 1; } 100% { transform: scale(1.4); opacity: 0; } }
22
+ .live-indicator::before { content: ''; position: absolute; width: 100%; height: 100%; background: #10b981; border-radius: 50%; animation: pulse-ring 1.5s infinite; }
23
+ </style>
24
+ </head>
25
+ <body class="bg-slate-50 min-h-screen">
26
+
27
+ <!-- Header -->
28
+ <header class="asr-gradient text-white sticky top-0 z-40 shadow-xl">
29
+ <div class="container mx-auto px-6 py-4">
30
+ <div class="flex items-center justify-between">
31
+ <div class="flex items-center space-x-4">
32
+ <div class="w-12 h-12 bg-white rounded-xl flex items-center justify-center">
33
+ <span class="text-slate-900 font-black text-xl">ASR</span>
34
+ </div>
35
+ <div>
36
+ <h1 class="text-2xl font-bold tracking-tight">Audio Intelligence Platform</h1>
37
+ <p class="text-slate-400 text-sm">Enterprise Speech Analytics & Transcription</p>
38
+ </div>
39
+ </div>
40
+ <div class="flex items-center space-x-6">
41
+ <div class="text-right">
42
+ <div class="text-xs text-slate-400 uppercase tracking-wider">System Status</div>
43
+ <div class="flex items-center mt-1">
44
+ <span class="relative flex h-3 w-3 mr-2">
45
+ <span class="live-indicator absolute inline-flex h-full w-full rounded-full bg-emerald-400"></span>
46
+ <span class="relative inline-flex rounded-full h-3 w-3 bg-emerald-500"></span>
47
+ </span>
48
+ <span class="font-medium" id="serverStatus">Operational</span>
49
+ </div>
50
+ </div>
51
+ </div>
52
+ </div>
53
+ </div>
54
+ </header>
55
+
56
+ <main class="container mx-auto px-6 py-8">
57
+
58
+ <!-- Upload Section -->
59
+ <section class="glass-card rounded-2xl shadow-lg p-8 mb-8 border border-slate-200">
60
+ <div class="flex items-center justify-between mb-6">
61
+ <div>
62
+ <h2 class="text-xl font-bold text-slate-800">Audio Upload</h2>
63
+ <p class="text-slate-500 text-sm mt-1">Upload audio files for analysis. Stereo files will be automatically separated by channel.</p>
64
+ </div>
65
+ <div class="flex items-center space-x-2 text-sm">
66
+ <span class="px-3 py-1 bg-blue-100 text-blue-700 rounded-full font-medium">Stereo: Split Channels</span>
67
+ <span class="px-3 py-1 bg-slate-100 text-slate-700 rounded-full font-medium">Mono: Single Speaker</span>
68
+ </div>
69
+ </div>
70
+
71
+ <div class="upload-zone rounded-xl p-10 text-center cursor-pointer" id="uploadZone">
72
+ <input type="file" id="fileInput" class="hidden" accept=".wav,.mp3,.m4a,.flac,.ogg,.opus">
73
+ <div class="w-20 h-20 mx-auto mb-4 bg-slate-100 rounded-full flex items-center justify-center">
74
+ <i class="fas fa-cloud-arrow-up text-3xl text-slate-400"></i>
75
+ </div>
76
+ <p class="text-lg text-slate-700 font-medium">Drop audio file here or <span class="text-blue-600 hover:underline">browse</span></p>
77
+ <p class="text-sm text-slate-400 mt-2">WAV, MP3, M4A, FLAC, OGG, OPUS supported</p>
78
+ </div>
79
+
80
+ <div id="uploadProgress" class="hidden mt-6">
81
+ <div class="bg-slate-50 rounded-xl p-6 border border-slate-200">
82
+ <div class="flex items-center justify-between mb-4">
83
+ <div class="flex items-center">
84
+ <div class="w-10 h-10 bg-blue-100 rounded-lg flex items-center justify-center mr-4">
85
+ <i class="fas fa-spinner fa-spin text-blue-600"></i>
86
+ </div>
87
+ <div>
88
+ <p class="font-semibold text-slate-800" id="uploadStatus">Processing...</p>
89
+ <p class="text-sm text-slate-500" id="stageText">Initializing...</p>
90
+ </div>
91
+ </div>
92
+ <span id="progressPercent" class="text-2xl font-bold text-blue-600">0%</span>
93
+ </div>
94
+ <div class="w-full bg-slate-200 rounded-full h-2">
95
+ <div id="progressBar" class="asr-accent-gradient h-2 rounded-full transition-all duration-500" style="width: 0%"></div>
96
+ </div>
97
+ </div>
98
+ </div>
99
+ </section>
100
+
101
+ <!-- Statistics Dashboard -->
102
+ <section class="grid grid-cols-2 md:grid-cols-4 lg:grid-cols-6 gap-4 mb-8">
103
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
104
+ <div class="flex items-center justify-between">
105
+ <div>
106
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Total Calls</p>
107
+ <p class="text-3xl font-bold text-slate-800 mt-1" id="totalCalls">0</p>
108
+ </div>
109
+ <div class="w-12 h-12 bg-blue-50 rounded-xl flex items-center justify-center">
110
+ <i class="fas fa-phone-volume text-blue-600 text-lg"></i>
111
+ </div>
112
+ </div>
113
+ </div>
114
+
115
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
116
+ <div class="flex items-center justify-between">
117
+ <div>
118
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Stereo</p>
119
+ <p class="text-3xl font-bold text-purple-600 mt-1" id="stereoCalls">0</p>
120
+ </div>
121
+ <div class="w-12 h-12 bg-purple-50 rounded-xl flex items-center justify-center">
122
+ <i class="fas fa-code-branch text-purple-600 text-lg"></i>
123
+ </div>
124
+ </div>
125
+ </div>
126
+
127
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
128
+ <div class="flex items-center justify-between">
129
+ <div>
130
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Quality Score</p>
131
+ <p class="text-3xl font-bold text-emerald-600 mt-1" id="avgScore">0</p>
132
+ </div>
133
+ <div class="w-12 h-12 bg-emerald-50 rounded-xl flex items-center justify-center">
134
+ <i class="fas fa-chart-line text-emerald-600 text-lg"></i>
135
+ </div>
136
+ </div>
137
+ </div>
138
+
139
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
140
+ <div class="flex items-center justify-between">
141
+ <div>
142
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Clarity</p>
143
+ <p class="text-3xl font-bold text-cyan-600 mt-1" id="avgClarity">0</p>
144
+ </div>
145
+ <div class="w-12 h-12 bg-cyan-50 rounded-xl flex items-center justify-center">
146
+ <i class="fas fa-microphone text-cyan-600 text-lg"></i>
147
+ </div>
148
+ </div>
149
+ </div>
150
+
151
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
152
+ <div class="flex items-center justify-between">
153
+ <div>
154
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Confidence</p>
155
+ <p class="text-3xl font-bold text-amber-600 mt-1" id="avgConfidence">0</p>
156
+ </div>
157
+ <div class="w-12 h-12 bg-amber-50 rounded-xl flex items-center justify-center">
158
+ <i class="fas fa-shield-check text-amber-600 text-lg"></i>
159
+ </div>
160
+ </div>
161
+ </div>
162
+
163
+ <div class="glass-card rounded-xl p-5 border border-slate-200 metric-card transition-all duration-300">
164
+ <div class="flex items-center justify-between">
165
+ <div>
166
+ <p class="text-xs text-slate-500 uppercase tracking-wider font-medium">Segments</p>
167
+ <p class="text-3xl font-bold text-rose-600 mt-1" id="totalSegments">0</p>
168
+ </div>
169
+ <div class="w-12 h-12 bg-rose-50 rounded-xl flex items-center justify-center">
170
+ <i class="fas fa-wave-square text-rose-600 text-lg"></i>
171
+ </div>
172
+ </div>
173
+ </div>
174
+ </section>
175
+
176
+ <!-- Calls List -->
177
+ <section class="glass-card rounded-2xl shadow-lg border border-slate-200 overflow-hidden">
178
+ <div class="p-6 border-b border-slate-200 flex items-center justify-between">
179
+ <div>
180
+ <h2 class="text-xl font-bold text-slate-800">Analyzed Recordings</h2>
181
+ <p class="text-sm text-slate-500 mt-1">Click on any recording to view detailed analysis</p>
182
+ </div>
183
+ <button onclick="loadCalls()" class="flex items-center px-4 py-2 bg-slate-100 hover:bg-slate-200 text-slate-700 rounded-lg transition-colors font-medium">
184
+ <i class="fas fa-arrows-rotate mr-2"></i>Refresh
185
+ </button>
186
+ </div>
187
+ <div id="callsList" class="divide-y divide-slate-100"></div>
188
+ </section>
189
+ </main>
190
+
191
+ <!-- Analysis Modal -->
192
+ <div id="analysisModal" class="hidden fixed inset-0 bg-slate-900/60 backdrop-blur-sm z-50 flex items-center justify-center p-4">
193
+ <div class="bg-white rounded-2xl shadow-2xl max-w-7xl w-full max-h-[95vh] overflow-hidden flex flex-col">
194
+ <div class="asr-gradient text-white p-6 flex justify-between items-center shrink-0">
195
+ <div class="flex items-center space-x-4">
196
+ <div class="w-10 h-10 bg-white/20 rounded-lg flex items-center justify-center">
197
+ <i class="fas fa-chart-bar"></i>
198
+ </div>
199
+ <div>
200
+ <h3 class="text-xl font-bold" id="modalTitle">Analysis Report</h3>
201
+ <p class="text-slate-300 text-sm">Comprehensive audio analysis</p>
202
+ </div>
203
+ </div>
204
+ <button onclick="closeModal()" class="w-10 h-10 bg-white/10 hover:bg-white/20 rounded-lg flex items-center justify-center transition-colors">
205
+ <i class="fas fa-xmark text-xl"></i>
206
+ </button>
207
+ </div>
208
+ <div class="p-6 overflow-y-auto flex-1" id="modalContent"></div>
209
+ </div>
210
+ </div>
211
+
212
+ <script>
213
+ const API_BASE = window.location.origin;
214
+ let currentJobId = null;
215
+ let pollInterval = null;
216
+
217
// Page bootstrap: fetch initial data and wire up the upload widget.
document.addEventListener('DOMContentLoaded', () => {
    // Initial data loads (header badge, metric cards, recordings list).
    checkServerHealth();
    loadStatistics();
    loadCalls();

    const uploadZone = document.getElementById('uploadZone');
    const fileInput = document.getElementById('fileInput');

    // Clicking the zone proxies to the hidden <input type="file">.
    uploadZone.addEventListener('click', () => fileInput.click());
    // Drag & drop: highlight while hovering, upload the first file on drop.
    uploadZone.addEventListener('dragover', e => { e.preventDefault(); uploadZone.classList.add('dragover'); });
    uploadZone.addEventListener('dragleave', () => uploadZone.classList.remove('dragover'));
    uploadZone.addEventListener('drop', e => {
        e.preventDefault();
        uploadZone.classList.remove('dragover');
        if (e.dataTransfer.files.length > 0) handleFileUpload(e.dataTransfer.files[0]);
    });
    fileInput.addEventListener('change', e => {
        if (e.target.files.length > 0) handleFileUpload(e.target.files[0]);
    });
});
237
+
238
// Ping the backend health endpoint and reflect the result in the header badge.
async function checkServerHealth() {
    const statusEl = document.getElementById('serverStatus');
    try {
        const response = await fetch(`${API_BASE}/health`);
        const payload = await response.json();
        statusEl.textContent = payload.status === 'healthy' ? 'Operational' : 'Offline';
    } catch {
        statusEl.textContent = 'Offline';
    }
}
245
+
246
// Fetch aggregate statistics and populate the dashboard metric cards.
async function loadStatistics() {
    try {
        const response = await fetch(`${API_BASE}/api/statistics`);
        const stats = await response.json();
        const set = (id, value) => { document.getElementById(id).textContent = value; };
        set('totalCalls', stats.total_calls || 0);
        set('stereoCalls', stats.stereo_calls || 0);
        set('avgScore', (stats.avg_quality_score || 0).toFixed(1));
        set('avgClarity', (stats.avg_clarity || 0).toFixed(1));
        set('avgConfidence', (stats.avg_confidence || 0).toFixed(1));
        set('totalSegments', stats.total_segments || 0);
    } catch (err) { console.error('Stats error:', err); }
}
258
+
259
// Fetch the analyzed call names and render the clickable recordings list.
// Call names originate from uploaded filenames (untrusted input); the
// previous implementation interpolated them raw into innerHTML and into an
// inline onclick attribute, which is a stored-XSS vector. Names are now
// inserted via textContent and the click handler is attached with
// addEventListener, so no user-controlled text is ever parsed as HTML/JS.
async function loadCalls() {
    try {
        const res = await fetch(`${API_BASE}/api/calls`);
        const calls = await res.json();
        const list = document.getElementById('callsList');

        if (calls.length === 0) {
            // Static empty-state markup (no user data involved).
            list.innerHTML = `<div class="p-12 text-center"><div class="w-16 h-16 bg-slate-100 rounded-full flex items-center justify-center mx-auto mb-4"><i class="fas fa-folder-open text-2xl text-slate-400"></i></div><p class="text-slate-600 font-medium">No recordings yet</p><p class="text-slate-400 text-sm mt-1">Upload an audio file to get started</p></div>`;
            return;
        }

        list.innerHTML = '';
        calls.forEach(call => {
            const row = document.createElement('div');
            row.className = 'p-5 flex items-center justify-between hover:bg-slate-50 cursor-pointer transition-colors';
            row.innerHTML = `
                <div class="flex items-center space-x-4">
                    <div class="w-12 h-12 bg-gradient-to-br from-blue-500 to-purple-600 rounded-xl flex items-center justify-center text-white"><i class="fas fa-waveform-lines"></i></div>
                    <div>
                        <p class="font-semibold text-slate-800 call-name"></p>
                        <p class="text-sm text-slate-500">Click to view analysis</p>
                    </div>
                </div>
                <div class="flex items-center space-x-3">
                    <span class="px-3 py-1 bg-emerald-100 text-emerald-700 rounded-full text-sm font-medium">Analyzed</span>
                    <i class="fas fa-chevron-right text-slate-400"></i>
                </div>`;
            // textContent never parses HTML, so arbitrary filenames are safe.
            row.querySelector('.call-name').textContent = call;
            row.addEventListener('click', () => viewAnalysis(call));
            list.appendChild(row);
        });
    } catch (e) { console.error('Calls error:', e); }
}
287
+
288
// Upload the selected file and start polling the resulting job.
// Fix: the previous version called res.json() unconditionally — a non-JSON
// error body (e.g. a proxy 502 page) threw a parse error instead of the
// real failure, and `result.error` being undefined surfaced the literal
// string "undefined" to the user.
async function handleFileUpload(file) {
    const formData = new FormData();
    formData.append('file', file);

    document.getElementById('uploadProgress').classList.remove('hidden');
    updateProgress('Uploading...', 'Transferring file to server', 10);

    try {
        const res = await fetch(`${API_BASE}/api/upload`, { method: 'POST', body: formData });
        // Tolerate non-JSON bodies; fall back to an empty object.
        const result = await res.json().catch(() => ({}));
        if (!res.ok) throw new Error(result.error || `Upload failed (HTTP ${res.status})`);
        currentJobId = result.job_id;
        updateProgress('Processing...', 'Analysis started', 25);
        pollJobStatus();
    } catch (e) {
        updateProgress('Error', e.message, 0);
        setTimeout(() => document.getElementById('uploadProgress').classList.add('hidden'), 3000);
    }
}
307
+
308
// Sync the upload progress card: status/stage labels, bar width, percent readout.
function updateProgress(status, stage, percent) {
    const byId = id => document.getElementById(id);
    byId('uploadStatus').textContent = status;
    byId('stageText').textContent = stage;
    byId('progressBar').style.width = percent + '%';
    byId('progressPercent').textContent = percent + '%';
}
314
+
315
// Poll the job-status endpoint every 1.5s and map backend stages onto the
// progress UI. Fix: the previous catch cleared the interval on ANY fetch
// error, so one transient network hiccup silently abandoned the job with
// no feedback. Polling now tolerates a few consecutive failures before
// giving up, and tells the user when it does.
function pollJobStatus() {
    if (pollInterval) clearInterval(pollInterval);
    let consecutiveErrors = 0;

    // Backend stage -> display text and progress percentage.
    const STAGES = {
        'pending': { text: 'Queued...', progress: 20 },
        'initializing': { text: 'Loading models...', progress: 30 },
        'diarization': { text: 'Separating speakers...', progress: 45 },
        'transcription': { text: 'Transcribing speech...', progress: 65 },
        'audio_analysis': { text: 'Analyzing audio features...', progress: 85 },
        'done': { text: 'Complete!', progress: 100 }
    };

    pollInterval = setInterval(async () => {
        try {
            const res = await fetch(`${API_BASE}/api/jobs/${currentJobId}`);
            const job = await res.json();
            consecutiveErrors = 0;

            if (job.stage && STAGES[job.stage]) {
                let stageText = STAGES[job.stage].text;
                if (job.is_stereo !== null) stageText += job.is_stereo ? ' (Stereo)' : ' (Mono)';
                updateProgress('Processing...', stageText, STAGES[job.stage].progress);
            }

            if (job.status === 'completed') {
                clearInterval(pollInterval);
                updateProgress('Success!', 'Analysis complete', 100);
                setTimeout(() => {
                    document.getElementById('uploadProgress').classList.add('hidden');
                    loadStatistics();
                    loadCalls();
                }, 1500);
            } else if (job.status === 'failed') {
                clearInterval(pollInterval);
                updateProgress('Failed', job.error || 'Unknown error', 0);
                setTimeout(() => document.getElementById('uploadProgress').classList.add('hidden'), 3000);
            }
        } catch (e) {
            // Only abort after repeated failures, and say why.
            if (++consecutiveErrors >= 3) {
                clearInterval(pollInterval);
                updateProgress('Error', 'Lost connection to server', 0);
                setTimeout(() => document.getElementById('uploadProgress').classList.add('hidden'), 3000);
            }
        }
    }, 1500);
}
353
+
354
// Fetch the stored analysis + transcription for one call and render the
// full report into the modal (overview cards, speaker profiles, chat-style
// transcript, per-segment table).
// NOTE(review): server-provided strings (callName, seg.text, seg.speaker,
// emotion labels, strengths/improvements) are interpolated into innerHTML
// without escaping — confirm the backend sanitizes them, otherwise this is
// an XSS sink.
async function viewAnalysis(callName) {
    try {
        const res = await fetch(`${API_BASE}/api/analysis/${callName}`);
        const data = await res.json();
        const a = data.analysis;          // audio analysis document
        const t = data.transcription;     // optional transcription document

        document.getElementById('modalTitle').textContent = callName;

        const isStereo = a.audio_type === 'stereo';
        const profiles = a.speaker_profiles || {};
        const dynamics = a.dynamics || {};

        // One card per speaker profile: header, 4 averaged metrics,
        // strengths (green) and improvements (amber) as chips.
        let profilesHTML = '';
        for (const [spk, p] of Object.entries(profiles)) {
            profilesHTML += `
            <div class="bg-slate-50 rounded-xl p-5 border border-slate-200">
                <div class="flex items-center justify-between mb-4">
                    <div class="flex items-center space-x-3">
                        <div class="w-10 h-10 ${spk === 'CUSTOMER' ? 'bg-blue-100' : spk === 'AGENT' ? 'bg-emerald-100' : 'bg-purple-100'} rounded-full flex items-center justify-center">
                            <i class="fas fa-user ${spk === 'CUSTOMER' ? 'text-blue-600' : spk === 'AGENT' ? 'text-emerald-600' : 'text-purple-600'}"></i>
                        </div>
                        <div>
                            <p class="font-bold text-slate-800">${spk}</p>
                            <p class="text-sm text-slate-500">${p.communication_style} style</p>
                        </div>
                    </div>
                    <div class="text-right">
                        <p class="text-2xl font-bold text-slate-800">${p.overall_score}</p>
                        <p class="text-xs text-slate-500">Quality Score</p>
                    </div>
                </div>
                <div class="grid grid-cols-4 gap-3 mb-4">
                    <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_pitch.toFixed(0)}</p><p class="text-xs text-slate-500">Pitch (Hz)</p></div>
                    <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_energy.toFixed(1)}</p><p class="text-xs text-slate-500">Energy (dB)</p></div>
                    <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_speaking_rate.toFixed(1)}</p><p class="text-xs text-slate-500">Rate (/s)</p></div>
                    <div class="text-center p-2 bg-white rounded-lg"><p class="text-lg font-bold text-slate-800">${p.avg_clarity.toFixed(0)}</p><p class="text-xs text-slate-500">Clarity</p></div>
                </div>
                <div class="flex flex-wrap gap-2">
                    ${p.strengths.map(s => `<span class="px-2 py-1 bg-emerald-100 text-emerald-700 text-xs rounded-full">${s}</span>`).join('')}
                    ${p.improvements.map(i => `<span class="px-2 py-1 bg-amber-100 text-amber-700 text-xs rounded-full">${i}</span>`).join('')}
                </div>
            </div>
            `;
        }

        // Chat-style transcript: CUSTOMER left / AGENT right / other centered.
        // Also sums per-segment Whisper inference times for the total badge.
        let transcriptHTML = '';
        let totalInferenceTime = 0;
        if (t && t.transcriptions) {
            t.transcriptions.forEach(seg => totalInferenceTime += (seg.inference_time || 0));
            transcriptHTML = t.transcriptions.map(seg => `
            <div class="flex ${seg.speaker === 'CUSTOMER' ? 'justify-start' : seg.speaker === 'AGENT' ? 'justify-end' : 'justify-center'}">
                <div class="max-w-[70%] ${seg.speaker === 'CUSTOMER' ? 'bg-blue-50 border-blue-200' : seg.speaker === 'AGENT' ? 'bg-emerald-50 border-emerald-200' : 'bg-slate-50 border-slate-200'} border rounded-xl p-3">
                    <div class="flex items-center justify-between mb-1">
                        <div class="flex items-center space-x-2">
                            <span class="font-semibold text-sm ${seg.speaker === 'CUSTOMER' ? 'text-blue-700' : seg.speaker === 'AGENT' ? 'text-emerald-700' : 'text-slate-700'}">${seg.speaker}</span>
                            <span class="text-xs text-slate-400">${seg.start_time}</span>
                        </div>
                        <span class="text-xs text-orange-500 font-medium"><i class="fas fa-clock mr-1"></i>${seg.inference_time}s</span>
                    </div>
                    <p class="text-slate-800">${seg.text}</p>
                </div>
            </div>
            `).join('');
        }

        // Assemble the modal body: overview grid, profiles, optional
        // transcript, and the per-segment metrics table.
        document.getElementById('modalContent').innerHTML = `
        <div class="space-y-6">
            <!-- Overview -->
            <div class="grid grid-cols-5 gap-4">
                <div class="bg-gradient-to-br from-slate-800 to-slate-900 text-white rounded-xl p-5 text-center">
                    <p class="text-3xl font-bold">${a.overall_quality_score}</p>
                    <p class="text-slate-300 text-sm mt-1">Quality Score</p>
                </div>
                <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                    <p class="text-3xl font-bold text-slate-800">${a.audio_duration.toFixed(1)}s</p>
                    <p class="text-slate-500 text-sm mt-1">Duration</p>
                </div>
                <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                    <p class="text-3xl font-bold ${isStereo ? 'text-purple-600' : 'text-blue-600'}">${isStereo ? 'STEREO' : 'MONO'}</p>
                    <p class="text-slate-500 text-sm mt-1">Audio Type</p>
                </div>
                <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                    <p class="text-3xl font-bold text-slate-800">${a.segments.length}</p>
                    <p class="text-slate-500 text-sm mt-1">Segments</p>
                </div>
                <div class="bg-slate-50 rounded-xl p-5 text-center border border-slate-200">
                    <p class="text-3xl font-bold text-emerald-600">${dynamics.engagement_score?.toFixed(0) || 0}</p>
                    <p class="text-slate-500 text-sm mt-1">Engagement</p>
                </div>
            </div>

            <!-- Speaker Profiles -->
            <div>
                <h4 class="font-bold text-slate-800 mb-4 flex items-center"><i class="fas fa-users text-blue-600 mr-2"></i>Speaker Profiles</h4>
                <div class="grid ${isStereo ? 'grid-cols-2' : 'grid-cols-1 max-w-xl'} gap-4">${profilesHTML}</div>
            </div>

            <!-- Transcription -->
            ${t ? `
            <div>
                <div class="flex items-center justify-between mb-4">
                    <h4 class="font-bold text-slate-800 flex items-center"><i class="fas fa-closed-captioning text-blue-600 mr-2"></i>Transcription</h4>
                    <span class="px-3 py-1 bg-orange-100 text-orange-700 rounded-full text-sm font-medium"><i class="fas fa-bolt mr-1"></i>Total: ${totalInferenceTime.toFixed(2)}s</span>
                </div>
                <div class="bg-slate-50 rounded-xl p-4 border border-slate-200 max-h-80 overflow-y-auto space-y-3">${transcriptHTML}</div>
            </div>
            ` : ''}

            <!-- Segment Analysis -->
            <div>
                <h4 class="font-bold text-slate-800 mb-4 flex items-center"><i class="fas fa-wave-square text-blue-600 mr-2"></i>Segment Analysis</h4>
                <div class="bg-slate-50 rounded-xl border border-slate-200 overflow-hidden max-h-96 overflow-y-auto">
                    <table class="w-full text-sm">
                        <thead class="bg-slate-100 sticky top-0">
                            <tr>
                                <th class="px-4 py-3 text-left font-semibold text-slate-600">#</th>
                                <th class="px-4 py-3 text-left font-semibold text-slate-600">Speaker</th>
                                <th class="px-4 py-3 text-left font-semibold text-slate-600">Time</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Pitch</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Energy</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Rate</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Clarity</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Emotion</th>
                                <th class="px-4 py-3 text-center font-semibold text-slate-600">Score</th>
                            </tr>
                        </thead>
                        <tbody class="divide-y divide-slate-100">
                            ${a.segments.map((s, i) => `
                            <tr class="segment-row">
                                <td class="px-4 py-3 font-medium text-slate-800">${i+1}</td>
                                <td class="px-4 py-3"><span class="px-2 py-1 ${s.speaker === 'CUSTOMER' ? 'bg-blue-100 text-blue-700' : s.speaker === 'AGENT' ? 'bg-emerald-100 text-emerald-700' : 'bg-purple-100 text-purple-700'} rounded-full text-xs font-medium">${s.speaker}</span></td>
                                <td class="px-4 py-3 text-slate-600">${s.start_time}</td>
                                <td class="px-4 py-3 text-center font-medium">${s.pitch.mean_f0.toFixed(0)} Hz</td>
                                <td class="px-4 py-3 text-center font-medium">${s.energy.mean_rms.toFixed(1)} dB</td>
                                <td class="px-4 py-3 text-center font-medium">${s.rhythm.speaking_rate.toFixed(1)}</td>
                                <td class="px-4 py-3 text-center"><span class="px-2 py-1 ${s.voice_quality.clarity_score > 70 ? 'bg-emerald-100 text-emerald-700' : s.voice_quality.clarity_score > 50 ? 'bg-amber-100 text-amber-700' : 'bg-red-100 text-red-700'} rounded text-xs font-medium">${s.voice_quality.clarity_score.toFixed(0)}</span></td>
                                <td class="px-4 py-3 text-center"><span class="px-2 py-1 bg-slate-100 text-slate-700 rounded text-xs">${s.emotion.primary_emotion}</span></td>
                                <td class="px-4 py-3 text-center font-bold text-slate-800">${s.overall_quality_score.toFixed(0)}</td>
                            </tr>
                            `).join('')}
                        </tbody>
                    </table>
                </div>
            </div>
        </div>
        `;

        document.getElementById('analysisModal').classList.remove('hidden');
    } catch (e) { console.error('Analysis error:', e); alert('Failed to load analysis'); }
}
505
+
506
// Hide the analysis modal; also bound to the Escape key below.
function closeModal() {
    document.getElementById('analysisModal').classList.add('hidden');
}
document.addEventListener('keydown', event => {
    if (event.key === 'Escape') closeModal();
});
508
+ </script>
509
+ </body>
510
+ </html>
main.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ASR Audio Analysis Pipeline
3
+
4
+ Complete pipeline: Diarization + Whisper Transcription + Professional Audio Analysis
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ from stereo_diarizer import StereoCallDiarizer
12
+ from whisper_transcriber import WhisperTranscriber
13
+ from audio_analyzer import AudioAnalyzer
14
+
15
+
16
class ASRPipeline:
    """ASR End-to-end audio analysis pipeline"""
    # Orchestrates three stages per input file:
    #   1. diarization (StereoCallDiarizer) -> per-segment WAVs + timeline
    #   2. transcription (WhisperTranscriber) -> text per segment
    #   3. acoustic analysis (AudioAnalyzer)  -> quality metrics
    # All artifacts are written under <output_folder>/<audio file stem>/.

    def __init__(self,
                 input_folder: str,
                 output_folder: str,
                 whisper_model: str,
                 min_silence_len: int = 500,
                 silence_thresh: int = -40,
                 device: str = "cpu",
                 verbose: bool = True):
        # input_folder: directory scanned (non-recursively) for audio files
        # output_folder: root directory for per-file result folders
        # whisper_model: path or name of the Whisper checkpoint
        # min_silence_len: minimum silence (ms) used to split speech segments
        # silence_thresh: level (dB) below which audio is treated as silence
        # device: inference device for Whisper ("cpu" or "cuda")
        # verbose: enable progress logging

        self.input_folder = Path(input_folder)
        self.output_folder = Path(output_folder)
        self.whisper_model = whisper_model
        self.min_silence_len = min_silence_len
        self.silence_thresh = silence_thresh
        self.device = device
        self.verbose = verbose

        # Aggregate run statistics, updated as files are processed.
        self.stats = {
            'total_files': 0,
            'processed': 0,
            'failed': 0,
            'stereo': 0,
            'mono': 0,
            'failed_files': [],
            'total_duration': 0.0
        }

        self.analyzer = AudioAnalyzer(verbose=self.verbose)
        # The transcriber is created lazily: model loading is expensive and
        # only needed once the first file reaches the transcription stage.
        self.transcriber = None

    def _init_transcriber(self):
        """Lazily create the Whisper transcriber on first use."""
        if self.transcriber is None:
            self.transcriber = WhisperTranscriber(
                self.whisper_model, self.device, self.verbose
            )

    def get_audio_files(self):
        """Return supported audio files in the input folder, sorted by name."""
        formats = {'.wav', '.mp3', '.m4a', '.flac', '.ogg', '.opus'}
        return sorted([
            f for f in self.input_folder.iterdir()
            if f.is_file() and f.suffix.lower() in formats
        ])

    def process_single(self, audio_file: Path) -> bool:
        """Run the full 3-stage pipeline on a single audio file.

        Returns True on success, False if any stage raised an exception.
        """
        output_dir = self.output_folder / audio_file.stem
        output_dir.mkdir(parents=True, exist_ok=True)

        if self.verbose:
            print(f"\n{'='*60}")
            print(f"PROCESSING: {audio_file.name}")
            print(f"{'='*60}")

        try:
            # Step 1: Diarization
            if self.verbose:
                print("\n[1/3] DIARIZATION")

            diarizer = StereoCallDiarizer(
                str(audio_file), self.min_silence_len,
                self.silence_thresh, self.verbose
            )
            diarizer.load_audio()

            if diarizer.is_stereo:
                self.stats['stereo'] += 1
            else:
                self.stats['mono'] += 1

            left, right = diarizer.detect_speech_segments()
            diarizer.create_timeline(left, right)

            segments = diarizer.export_segments(str(output_dir))
            diarizer.export_full_speakers(str(output_dir))
            diarizer.export_transcript_txt(str(output_dir))
            diarizer.export_transcript_json(str(output_dir))

            # pydub AudioSegment length is in milliseconds.
            duration = len(diarizer.audio) / 1000
            self.stats['total_duration'] += duration

            # Step 2: Transcription
            if self.verbose:
                print("\n[2/3] TRANSCRIPTION")

            self._init_transcriber()
            transcribed = self.transcriber.transcribe_segments(
                segments, diarizer.timeline
            )
            self.transcriber.export_transcription(transcribed, str(output_dir))

            # Step 3: Audio Analysis
            if self.verbose:
                print("\n[3/3] AUDIO ANALYSIS")

            analysis = self.analyzer.analyze_call(
                segments, diarizer.timeline,
                audio_file.stem, diarizer.is_stereo
            )
            self.analyzer.export_analysis(analysis, str(output_dir))

            if self.verbose:
                print(f"\nSUCCESS: {audio_file.name}")
                print(f"Type: {'STEREO' if diarizer.is_stereo else 'MONO'}")
                print(f"Duration: {duration:.1f}s | Quality: {analysis.overall_quality_score}/100")

            return True

        except Exception as e:
            # Broad catch on purpose: one bad file must not abort the batch.
            if self.verbose:
                print(f"\nFAILED: {audio_file.name}")
                print(f"Error: {e}")
                import traceback
                traceback.print_exc()
            return False

    def run(self):
        """Process every audio file in the input folder and print a summary."""
        print("\n" + "="*60)
        print("ASR AUDIO ANALYSIS PIPELINE")
        print("="*60)

        files = self.get_audio_files()
        self.stats['total_files'] = len(files)

        if not files:
            print(f"\nNo audio files in {self.input_folder}")
            return

        print(f"\nFound {len(files)} file(s)")
        print(f"Input: {self.input_folder}")
        print(f"Output: {self.output_folder}")

        for i, f in enumerate(files, 1):
            print(f"\n[{i}/{len(files)}]")
            if self.process_single(f):
                self.stats['processed'] += 1
            else:
                self.stats['failed'] += 1
                self.stats['failed_files'].append(f.name)

        print("\n" + "="*60)
        print("COMPLETE")
        print("="*60)
        print(f"Processed: {self.stats['processed']}/{self.stats['total_files']}")
        print(f"Stereo: {self.stats['stereo']} | Mono: {self.stats['mono']}")
        print(f"Total duration: {self.stats['total_duration']:.1f}s")

        if self.stats['failed_files']:
            print(f"\nFailed: {', '.join(self.stats['failed_files'])}")

        print(f"\nResults: {self.output_folder}")
        print("\nRun 'python api_server.py' and open http://localhost:5001")
169
+
170
+
171
+ def main():
172
+ INPUT_FOLDER = "/home/ramal/Downloads/Archive"
173
+ OUTPUT_FOLDER = "output"
174
+ WHISPER_MODEL = "/home/ramal/Desktop/end-to-end/whisper-small-az/checkpoint-157959"
175
+
176
+ if not os.path.exists(INPUT_FOLDER):
177
+ print(f"Error: {INPUT_FOLDER} not found")
178
+ sys.exit(1)
179
+
180
+ os.makedirs(OUTPUT_FOLDER, exist_ok=True)
181
+
182
+ pipeline = ASRPipeline(
183
+ input_folder=INPUT_FOLDER,
184
+ output_folder=OUTPUT_FOLDER,
185
+ whisper_model=WHISPER_MODEL,
186
+ device="cpu",
187
+ verbose=True
188
+ )
189
+
190
+ pipeline.run()
191
+
192
+
193
+ if __name__ == "__main__":
194
+ main()
req.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pydub
2
+ librosa
3
+ numpy
4
+ scipy
5
+ flask
6
+ flask-cors
7
+ torch
8
+ transformers
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ pydub
2
+ librosa
3
+ numpy
4
+ scipy
5
+ flask
6
+ flask-cors
7
+ torch
8
+ transformers
9
+ soundfile
stereo_diarizer.py ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stereo Call Center Audio Diarization Module
3
+
4
+ This module provides professional audio diarization for stereo call center recordings.
5
+ It separates speakers from left/right channels and creates detailed transcription-ready segments.
6
+
7
+ IMPORTANT: Only separates channels for STEREO audio. Mono audio is processed as single speaker.
8
+ """
9
+
10
+ import os
11
+ import json
12
+ from datetime import datetime
13
+ from typing import List, Dict, Tuple, Optional
14
+ from dataclasses import dataclass, asdict
15
+ from pydub import AudioSegment
16
+ from pydub.silence import detect_nonsilent
17
+
18
+
19
@dataclass
class Segment:
    """Represents a single speech segment"""
    turn: int          # 1-based chronological turn number in the conversation
    speaker: str       # 'CUSTOMER'/'AGENT' (stereo) or 'SPEAKER' (mono)
    start_ms: int      # segment start, milliseconds from file start
    end_ms: int        # segment end, milliseconds from file start
    duration_ms: int   # end_ms - start_ms
    start_time: str    # start formatted as HH:MM:SS.mmm
    end_time: str      # end formatted as HH:MM:SS.mmm
    duration: str      # duration formatted as HH:MM:SS.mmm
    channel: str = 'mono'              # source channel: 'left', 'right' or 'mono'
    audio_file: Optional[str] = None   # path of exported per-segment WAV (set on export)


class StereoCallDiarizer:
    """
    Professional stereo call center audio diarization system.

    For STEREO audio: Separates speakers from stereo audio (left/right channels)
    For MONO audio: Processes as single speaker without channel separation

    Attributes:
        input_file (str): Path to input audio file
        min_silence_len (int): Minimum silence length in ms to split segments
        silence_thresh (int): Silence threshold in dB
    """

    def __init__(self, input_file: str, min_silence_len: int = 500,
                 silence_thresh: int = -40, verbose: bool = True):
        """
        Initialize the diarizer.

        Args:
            input_file: Path to audio file (stereo or mono)
            min_silence_len: Minimum silence duration (ms) to split segments
            silence_thresh: Audio level (dB) below which is considered silence
            verbose: Enable/disable logging output
        """
        self.input_file = input_file
        self.min_silence_len = min_silence_len
        self.silence_thresh = silence_thresh
        self.verbose = verbose

        # All audio fields are populated by load_audio().
        self.audio: Optional[AudioSegment] = None
        self.left_channel: Optional[AudioSegment] = None   # stereo only
        self.right_channel: Optional[AudioSegment] = None  # stereo only
        self.mono_channel: Optional[AudioSegment] = None   # mono only
        self.timeline: List[Segment] = []
        self.customer_channel: str = 'left'
        self.is_stereo: bool = False

    def _log(self, message: str):
        """Internal logging method (prints only when verbose)."""
        if self.verbose:
            print(message)

    def load_audio(self) -> bool:
        """
        Load audio file and split into channels if stereo.

        Returns:
            True if stereo, False if mono
        """
        self._log(f"Loading audio: {self.input_file}")
        self.audio = AudioSegment.from_file(self.input_file)

        self.is_stereo = self.audio.channels == 2

        if self.is_stereo:
            self._log(f"STEREO audio detected ({self.audio.channels} channels) - will separate speakers")
            # Split once: split_to_mono() rebuilds both channels on every call,
            # so calling it twice (once per channel) doubled the work.
            self.left_channel, self.right_channel = self.audio.split_to_mono()
        else:
            self._log(f"MONO audio detected ({self.audio.channels} channel) - single speaker mode")
            self.mono_channel = self.audio

        self._log(f"Duration: {len(self.audio)/1000:.2f}s | Sample rate: {self.audio.frame_rate}Hz")
        return self.is_stereo

    def detect_speech_segments(self) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]:
        """
        Detect speech segments.

        For stereo: returns (left_segments, right_segments)
        For mono: returns (mono_segments, [])

        Returns:
            Tuple of segment lists, each entry a (start_ms, end_ms) pair
        """
        self._log("Detecting speech segments...")

        if self.is_stereo:
            left_segments = detect_nonsilent(
                self.left_channel,
                min_silence_len=self.min_silence_len,
                silence_thresh=self.silence_thresh
            )
            right_segments = detect_nonsilent(
                self.right_channel,
                min_silence_len=self.min_silence_len,
                silence_thresh=self.silence_thresh
            )
            self._log(f"Found {len(left_segments)} segments (LEFT), {len(right_segments)} segments (RIGHT)")
            return left_segments, right_segments

        mono_segments = detect_nonsilent(
            self.mono_channel,
            min_silence_len=self.min_silence_len,
            silence_thresh=self.silence_thresh
        )
        self._log(f"Found {len(mono_segments)} segments (MONO)")
        return mono_segments, []

    def create_timeline(self, left_segments: List[Tuple[int, int]],
                        right_segments: List[Tuple[int, int]]) -> List[Segment]:
        """
        Create chronologically ordered timeline of all speech segments.

        For STEREO: First speaker is CUSTOMER, second is AGENT
        For MONO: All segments are marked as SPEAKER

        Args:
            left_segments: Left channel segments (or mono segments)
            right_segments: Right channel segments (empty for mono)

        Returns:
            List of Segment objects sorted by start time
        """
        self._log("Building timeline...")

        if self.is_stereo:
            return self._create_stereo_timeline(left_segments, right_segments)
        return self._create_mono_timeline(left_segments)

    def _segment_dict(self, speaker: str, start: int, end: int, channel: str) -> Dict:
        """Build one timeline entry (a plain dict, turned into a Segment later)."""
        return {
            'speaker': speaker,
            'start_ms': start,
            'end_ms': end,
            'duration_ms': end - start,
            'start_time': self._ms_to_time(start),
            'end_time': self._ms_to_time(end),
            'duration': self._ms_to_time(end - start),
            'channel': channel
        }

    def _finalize_timeline(self, entries: List[Dict]) -> List[Segment]:
        """Sort entries chronologically and assign 1-based turn numbers."""
        entries.sort(key=lambda x: x['start_ms'])
        self.timeline = [Segment(turn=i + 1, **seg) for i, seg in enumerate(entries)]
        return self.timeline

    def _create_stereo_timeline(self, left_segments: List[Tuple[int, int]],
                                right_segments: List[Tuple[int, int]]) -> List[Segment]:
        """Create timeline for stereo audio with speaker separation"""
        # Whoever speaks first is assumed to be the CUSTOMER (the caller).
        first_left = left_segments[0][0] if left_segments else float('inf')
        first_right = right_segments[0][0] if right_segments else float('inf')

        # The two original branches differed only in channel assignment; the
        # speaker labels were identical, so only the channel mapping varies.
        if first_left < first_right:
            customer_segments, agent_segments = left_segments, right_segments
            self.customer_channel = 'left'
        else:
            customer_segments, agent_segments = right_segments, left_segments
            self.customer_channel = 'right'
        self._log(f"First speaker: {self.customer_channel.upper()} channel (CUSTOMER)")

        agent_channel = 'right' if self.customer_channel == 'left' else 'left'

        entries = [
            self._segment_dict('CUSTOMER', start, end, self.customer_channel)
            for start, end in customer_segments
        ] + [
            self._segment_dict('AGENT', start, end, agent_channel)
            for start, end in agent_segments
        ]

        timeline = self._finalize_timeline(entries)
        self._log(f"Timeline created with {len(timeline)} segments (2 speakers)")
        return timeline

    def _create_mono_timeline(self, segments: List[Tuple[int, int]]) -> List[Segment]:
        """Create timeline for mono audio (single speaker)"""
        entries = [
            self._segment_dict('SPEAKER', start, end, 'mono')
            for start, end in segments
        ]

        timeline = self._finalize_timeline(entries)
        self._log(f"Timeline created with {len(timeline)} segments (1 speaker - MONO)")
        return timeline

    @staticmethod
    def _ms_to_time(ms: int) -> str:
        """Convert milliseconds to HH:MM:SS.mmm format"""
        seconds, milliseconds = divmod(int(ms), 1000)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

    def _channel_audio(self, channel: str):
        """Return the source AudioSegment for a segment's channel tag.

        Centralizes the channel-selection logic that was previously duplicated
        in export_segments() and export_full_speakers().
        """
        if not self.is_stereo:
            return self.mono_channel
        return self.left_channel if channel == 'left' else self.right_channel

    def export_segments(self, output_dir: str = "output") -> List[str]:
        """
        Export each segment as individual audio file.

        Args:
            output_dir: Directory to save segment files

        Returns:
            List of created file paths
        """
        os.makedirs(output_dir, exist_ok=True)
        self._log(f"Exporting {len(self.timeline)} audio segments to {output_dir}/")

        file_paths = []

        for segment in self.timeline:
            audio_segment = self._channel_audio(segment.channel)[segment.start_ms:segment.end_ms]

            filename = f"segment_{segment.turn:03d}_{segment.speaker}_{segment.start_ms}ms-{segment.end_ms}ms.wav"
            filepath = os.path.join(output_dir, filename)

            audio_segment.export(filepath, format="wav")
            segment.audio_file = filepath
            file_paths.append(filepath)

        self._log(f"Exported {len(file_paths)} segments")
        return file_paths

    def export_full_speakers(self, output_dir: str = "output") -> Dict[str, str]:
        """
        Export full concatenated audio for each speaker.

        For stereo: Creates CUSTOMER_full.wav and AGENT_full.wav
        For mono: Creates SPEAKER_full.wav

        Args:
            output_dir: Directory to save files

        Returns:
            Dictionary mapping speaker names to file paths
        """
        os.makedirs(output_dir, exist_ok=True)
        self._log("Exporting full speaker audio...")

        result = {}
        speakers = ['CUSTOMER', 'AGENT'] if self.is_stereo else ['SPEAKER']

        for speaker in speakers:
            segments = [s for s in self.timeline if s.speaker == speaker]
            if not segments:
                continue

            parts = [
                self._channel_audio(seg.channel)[seg.start_ms:seg.end_ms]
                for seg in segments
            ]
            # pydub AudioSegment supports sum(): 0 + segment yields the segment.
            combined = sum(parts)

            filepath = os.path.join(output_dir, f"{speaker}_full.wav")
            combined.export(filepath, format="wav")
            result[speaker] = filepath

            self._log(f"{speaker}: {len(combined)/1000:.2f}s ({len(segments)} segments)")

        return result

    def export_transcript_txt(self, output_dir: str = "output") -> str:
        """Export human-readable transcript."""
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, "transcript.txt")

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write("=" * 80 + "\n")
            f.write("CALL CENTER CONVERSATION TRANSCRIPT\n")
            f.write(f"File: {self.input_file}\n")
            f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Duration: {len(self.audio)/1000:.2f} seconds\n")
            f.write(f"Total Segments: {len(self.timeline)}\n")
            f.write(f"Audio Type: {'STEREO' if self.is_stereo else 'MONO'}\n")

            if self.is_stereo:
                f.write(f"CUSTOMER Channel: {self.customer_channel.upper()}\n")
                f.write(f"AGENT Channel: {'RIGHT' if self.customer_channel == 'left' else 'LEFT'}\n")

            f.write("=" * 80 + "\n\n")

            for segment in self.timeline:
                f.write(f"[Turn {segment.turn:03d}] {segment.speaker}\n")
                f.write(f"  Time: {segment.start_time} --> {segment.end_time}\n")
                f.write(f"  Duration: {segment.duration}\n")
                if segment.audio_file:
                    f.write(f"  Audio: {os.path.basename(segment.audio_file)}\n")
                f.write("\n")

        self._log(f"Transcript saved: {filepath}")
        return filepath

    def export_transcript_json(self, output_dir: str = "output") -> str:
        """Export structured JSON transcript."""
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, "transcript.json")

        data = {
            'metadata': {
                'input_file': self.input_file,
                'duration_seconds': len(self.audio) / 1000,
                'sample_rate': self.audio.frame_rate,
                'channels': self.audio.channels,
                'is_stereo': self.is_stereo,
                'total_segments': len(self.timeline),
                'analysis_date': datetime.now().isoformat(),
                'min_silence_len_ms': self.min_silence_len,
                'silence_thresh_db': self.silence_thresh,
            },
            'timeline': [asdict(s) for s in self.timeline]
        }

        if self.is_stereo:
            data['metadata']['customer_channel'] = self.customer_channel
            data['metadata']['agent_channel'] = 'right' if self.customer_channel == 'left' else 'left'

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        self._log(f"JSON saved: {filepath}")
        return filepath

    def export_transcript_rttm(self, output_dir: str = "output") -> str:
        """Export RTTM format transcript (pyannote compatible)."""
        os.makedirs(output_dir, exist_ok=True)
        filepath = os.path.join(output_dir, "transcript.rttm")

        with open(filepath, 'w', encoding='utf-8') as f:
            for segment in self.timeline:
                start_sec = segment.start_ms / 1000
                duration_sec = segment.duration_ms / 1000
                f.write(f"SPEAKER {os.path.basename(self.input_file)} 1 "
                        f"{start_sec:.3f} {duration_sec:.3f} <NA> <NA> "
                        f"{segment.speaker} <NA> <NA>\n")

        self._log(f"RTTM saved: {filepath}")
        return filepath

    def get_statistics(self) -> Dict:
        """Calculate and return statistics about the conversation."""
        total_duration = len(self.audio) / 1000

        def _speaker_stats(segments: List[Segment]) -> Dict:
            # Per-speaker totals; percentages are relative to whole-file duration.
            duration = sum(s.duration_ms for s in segments) / 1000
            return {
                'segments': len(segments),
                'duration': duration,
                'percentage': (duration / total_duration) * 100 if total_duration > 0 else 0,
                'avg_segment': duration / len(segments) if segments else 0
            }

        stats = {'is_stereo': self.is_stereo}

        if self.is_stereo:
            customer = _speaker_stats([s for s in self.timeline if s.speaker == 'CUSTOMER'])
            agent = _speaker_stats([s for s in self.timeline if s.speaker == 'AGENT'])
            total_speech = customer['duration'] + agent['duration']
            stats['customer'] = customer
            stats['agent'] = agent
        else:
            speaker = _speaker_stats([s for s in self.timeline if s.speaker == 'SPEAKER'])
            total_speech = speaker['duration']
            stats['speaker'] = speaker

        silence_duration = total_duration - total_speech
        stats.update({
            'total_duration': total_duration,
            'total_speech': total_speech,
            'silence_duration': silence_duration,
            'silence_percentage': (silence_duration / total_duration) * 100 if total_duration > 0 else 0,
        })
        return stats

    def process(self, output_dir: str = "output", export_segments: bool = True,
                export_full: bool = True, export_transcripts: bool = True) -> Dict:
        """
        Run complete diarization pipeline.

        Args:
            output_dir: Directory for all outputs
            export_segments: Whether to export individual segment files
            export_full: Whether to export full speaker audio files
            export_transcripts: Whether to export transcript files

        Returns:
            Dictionary with results and file paths
        """
        self._log("=" * 80)
        self._log("AUDIO DIARIZATION - PROCESSING")
        self._log("=" * 80)

        # Load and process
        self.load_audio()
        left_seg, right_seg = self.detect_speech_segments()
        self.create_timeline(left_seg, right_seg)

        # Export results
        results = {
            'is_stereo': self.is_stereo,
            'timeline': self.timeline,
            'statistics': self.get_statistics(),
            'files': {}
        }

        if export_segments:
            results['files']['segments'] = self.export_segments(output_dir)

        if export_full:
            results['files']['full_speakers'] = self.export_full_speakers(output_dir)

        if export_transcripts:
            results['files']['transcript_txt'] = self.export_transcript_txt(output_dir)
            results['files']['transcript_json'] = self.export_transcript_json(output_dir)
            results['files']['transcript_rttm'] = self.export_transcript_rttm(output_dir)

        self._log("=" * 80)
        self._log("COMPLETED")
        self._log("=" * 80)

        return results
516
+
517
+
518
# Convenience function for simple usage
def diarize_call(input_file: str, output_dir: str = "output",
                 min_silence_len: int = 500, silence_thresh: int = -40,
                 verbose: bool = True) -> Dict:
    """
    Simple function to diarize a call recording (stereo or mono).

    Args:
        input_file: Path to audio file
        output_dir: Directory for outputs
        min_silence_len: Minimum silence duration in ms
        silence_thresh: Silence threshold in dB
        verbose: Enable logging

    Returns:
        Dictionary with results including timeline, statistics, and file paths
    """
    # One-shot wrapper: build the diarizer and immediately run its pipeline.
    return StereoCallDiarizer(
        input_file, min_silence_len, silence_thresh, verbose
    ).process(output_dir)
537
+
538
+
539
if __name__ == "__main__":
    # Example usage: diarize a sample recording and print a short summary.
    result = diarize_call(
        input_file="call.wav",
        output_dir="output",
        min_silence_len=500,
        silence_thresh=-40
    )

    stats = result['statistics']

    print("\nStatistics:")
    print(f"Audio type: {'STEREO' if result['is_stereo'] else 'MONO'}")
    print(f"Total duration: {stats['total_duration']:.2f}s")

    # Stereo runs report per-role durations; mono runs have a single speaker.
    if result['is_stereo']:
        print(f"Customer: {stats['customer']['duration']:.2f}s")
        print(f"Agent: {stats['agent']['duration']:.2f}s")
    else:
        print(f"Speaker: {stats['speaker']['duration']:.2f}s")
whisper_transcriber.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Whisper Speech-to-Text Transcription Module
3
+
4
+ Uses quantized Whisper model for CPU-optimized transcription.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import time
10
+ from typing import List, Dict, Optional
11
+ from dataclasses import dataclass, asdict
12
+ import torch
13
+ import librosa
14
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
15
+
16
+
17
@dataclass
class TranscribedSegment:
    """Represents a transcribed audio segment"""
    turn: int               # sequential turn number copied from the diarizer timeline
    speaker: str            # speaker label from the diarizer (e.g. customer/agent) — TODO confirm label set
    start_time: str         # segment start timestamp, pre-formatted string from the diarizer
    end_time: str           # segment end timestamp, pre-formatted string
    duration: str           # segment duration, pre-formatted string
    text: str               # Whisper transcription for this segment
    audio_file: str         # basename of the segment's audio file
    inference_time: float   # seconds spent transcribing this segment (rounded to 2 decimals)
28
+
29
+
30
class WhisperTranscriber:
    """
    Whisper-based speech-to-text transcription system.

    Loads a Hugging Face Whisper checkpoint and, when running on CPU,
    applies dynamic int8 quantization to the model's Linear layers to
    speed up inference.
    """

    def __init__(self, model_path: str, device: str = "cpu",
                 verbose: bool = True, language: str = "az"):
        """
        Initialize Whisper transcriber.

        Args:
            model_path: Path to Whisper model checkpoint
            device: Device for inference ('cpu' or 'cuda')
            verbose: Enable logging
            language: Language code forced during generation. Defaults to
                "az" (previous hard-coded behavior); kept as the trailing
                parameter so existing positional callers are unaffected.
        """
        self.model_path = model_path
        self.device = device
        self.verbose = verbose
        self.language = language

        self._log("Loading Whisper model...")
        self._log(f"Model: {model_path}")
        self._log(f"Device: {device}")

        # Load processor and model
        self.processor = WhisperProcessor.from_pretrained(model_path)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_path)
        self.model.to(device)
        self.model.eval()

        # Dynamic int8 quantization of Linear layers: significant CPU
        # speedup for a small accuracy cost. Only applied on CPU — the
        # quantized ops are not meant for CUDA execution.
        if device == "cpu":
            self.model = torch.quantization.quantize_dynamic(
                self.model, {torch.nn.Linear}, dtype=torch.qint8
            )

        self._log("Model loaded successfully")

    def _log(self, message: str):
        """Print *message* when verbose logging is enabled."""
        if self.verbose:
            print(message)

    def transcribe_audio(self, audio_path: str) -> tuple:
        """
        Transcribe a single audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            Tuple of (transcription_text, inference_time_seconds)
        """
        start_time = time.time()

        # Whisper expects 16 kHz mono input; librosa resamples on load.
        # The returned sample rate is always 16000 here, so it is discarded.
        audio, _ = librosa.load(audio_path, sr=16000)

        # Convert the raw waveform into log-mel input features
        inputs = self.processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features.to(self.device)

        # Beam-search decode with the configured language forced
        with torch.no_grad():
            predicted_ids = self.model.generate(
                inputs,
                max_length=448,
                num_beams=5,
                language=self.language,
                task="transcribe"
            )

        # Decode token ids back to text
        transcription = self.processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]

        inference_time = time.time() - start_time

        return transcription.strip(), inference_time

    def transcribe_segments(self, segment_files: List[str],
                            timeline: List) -> List[TranscribedSegment]:
        """
        Transcribe multiple audio segments.

        Args:
            segment_files: List of audio file paths
            timeline: List of segment metadata objects from the diarizer;
                each must expose turn/speaker/start_time/end_time/duration

        Returns:
            List of TranscribedSegment objects. Files and metadata are
            paired by position; extra entries in the longer list are
            silently ignored (zip semantics).
        """
        transcriptions = []
        total_time = 0.0

        self._log(f"\nTranscribing {len(segment_files)} segments...")

        for i, (seg_file, seg_info) in enumerate(zip(segment_files, timeline)):
            self._log(f" [{i+1}/{len(segment_files)}] {os.path.basename(seg_file)}")

            text, inf_time = self.transcribe_audio(seg_file)
            total_time += inf_time

            transcriptions.append(TranscribedSegment(
                turn=seg_info.turn,
                speaker=seg_info.speaker,
                start_time=seg_info.start_time,
                end_time=seg_info.end_time,
                duration=seg_info.duration,
                text=text,
                audio_file=os.path.basename(seg_file),
                inference_time=round(inf_time, 2)
            ))

        self._log(f"Total transcription time: {total_time:.2f}s")

        return transcriptions

    def export_transcription(self, transcriptions: List[TranscribedSegment],
                             output_dir: str) -> Dict[str, str]:
        """
        Export transcriptions to JSON and plain-text conversation files.

        Args:
            transcriptions: List of TranscribedSegment objects
            output_dir: Output directory (created if missing)

        Returns:
            Dictionary mapping format name ('json', 'conversation') to
            the created file path
        """
        os.makedirs(output_dir, exist_ok=True)
        files = {}

        # Full structured dump, one record per segment
        json_path = os.path.join(output_dir, "transcription.json")
        data = {
            'total_segments': len(transcriptions),
            'transcriptions': [asdict(t) for t in transcriptions]
        }
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        files['json'] = json_path

        # Human-readable "[time] speaker: text" transcript
        conv_path = os.path.join(output_dir, "conversation.txt")
        with open(conv_path, 'w', encoding='utf-8') as f:
            for t in transcriptions:
                f.write(f"[{t.start_time}] {t.speaker}: {t.text}\n")
        files['conversation'] = conv_path

        self._log(f"Transcription exported to {output_dir}")

        return files