Spaces:
Running
Running
Add SOP Audio Analyzer app files
Browse files- Full Streamlit app with audio analysis
- SpeechBrain VAD, diarization, voiceprint
- Fraud detection modules
- Database models
- Dockerfile +21 -9
- README.md +67 -12
- app.py +1361 -0
- requirements.txt +28 -3
- src/__init__.py +27 -0
- src/analyzer.py +597 -0
- src/database/__init__.py +3 -0
- src/database/models.py +320 -0
- src/fraud_detection/__init__.py +14 -0
- src/fraud_detection/pause_detector.py +235 -0
- src/fraud_detection/reading_pattern.py +238 -0
- src/fraud_detection/whisper_detector.py +203 -0
- src/phase1_foundation/__init__.py +11 -0
- src/phase1_foundation/diarization.py +199 -0
- src/phase1_foundation/preprocessor.py +102 -0
- src/phase1_foundation/vad.py +117 -0
- src/phase1_foundation/voiceprint.py +199 -0
- src/phase2_background/__init__.py +3 -0
- src/phase2_background/analyzer.py +253 -0
- src/phase6_synthetic/__init__.py +8 -0
- src/phase6_synthetic/detector.py +494 -0
- src/phase6_synthetic/wake_words.py +235 -0
- src/ui/__init__.py +1 -0
Dockerfile
CHANGED
|
@@ -1,20 +1,32 @@
|
|
| 1 |
-
FROM python:3.
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
| 5 |
RUN apt-get update && apt-get install -y \
|
| 6 |
-
|
| 7 |
-
|
| 8 |
git \
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
| 11 |
-
|
| 12 |
-
COPY
|
| 13 |
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
-
|
|
|
|
| 19 |
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
+
# Install system dependencies for audio processing
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
+
ffmpeg \
|
| 8 |
+
libsndfile1 \
|
| 9 |
git \
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
+
# Copy requirements first for better caching
|
| 13 |
+
COPY requirements.txt .
|
| 14 |
|
| 15 |
+
# Install Python dependencies
|
| 16 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 17 |
|
| 18 |
+
# Copy application code
|
| 19 |
+
COPY . .
|
| 20 |
|
| 21 |
+
# Create necessary directories
|
| 22 |
+
RUN mkdir -p data/db data/clips pretrained_models
|
| 23 |
|
| 24 |
+
# Expose Streamlit port (HF Spaces uses 7860)
|
| 25 |
+
EXPOSE 7860
|
| 26 |
+
|
| 27 |
+
# Set environment variables
|
| 28 |
+
ENV STREAMLIT_SERVER_PORT=7860
|
| 29 |
+
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
|
| 30 |
+
|
| 31 |
+
# Run Streamlit
|
| 32 |
+
CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
|
README.md
CHANGED
|
@@ -1,19 +1,74 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: red
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
pinned: false
|
| 11 |
-
short_description: Audio Analyzer
|
| 12 |
---
|
| 13 |
|
| 14 |
-
#
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: SOP Audio Analyzer
|
| 3 |
+
emoji: 🎙️
|
| 4 |
+
colorFrom: blue
|
| 5 |
colorTo: red
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.29.0
|
| 8 |
+
python_version: "3.11"
|
| 9 |
+
app_file: app.py
|
| 10 |
pinned: false
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# SOP Audio Analyzer
|
| 14 |
|
| 15 |
+
Test Integrity Analysis - Voice fraud detection for take-at-home tests.
|
| 16 |
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- 🎤 **Record or upload** audio files
|
| 20 |
+
- 🗣️ **Speaker diarization** - detect multiple voices
|
| 21 |
+
- 🎯 **Voiceprint extraction** - unique ID per speaker
|
| 22 |
+
- 🔈 **Background analysis** - detect whispers, distant voices
|
| 23 |
+
- 🤖 **Synthetic detection** - identify TTS/AI voices
|
| 24 |
+
- 📢 **Wake word detection** - Alexa, Siri, Google
|
| 25 |
+
- 🗄️ **Cross-test tracking** - find same voice across tests
|
| 26 |
+
|
| 27 |
+
## Installation
|
| 28 |
+
|
| 29 |
+
```bash
|
| 30 |
+
# Create virtual environment
|
| 31 |
+
python -m venv venv
|
| 32 |
+
source venv/bin/activate # Linux/Mac
|
| 33 |
+
# or: venv\Scripts\activate # Windows
|
| 34 |
+
|
| 35 |
+
# Install dependencies
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## Run
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
streamlit run app.py
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
## Project Structure
|
| 46 |
+
|
| 47 |
+
```
|
| 48 |
+
sop-audio-analyzer/
|
| 49 |
+
├── app.py # Main Streamlit app
|
| 50 |
+
├── requirements.txt
|
| 51 |
+
├── src/
|
| 52 |
+
│ ├── phase1_foundation/ # VAD, Diarization, Voiceprint
|
| 53 |
+
│ ├── phase2_background/ # Background analysis
|
| 54 |
+
│ ├── phase6_synthetic/ # Synthetic & wake word detection
|
| 55 |
+
│ ├── database/ # SQLite models & queries
|
| 56 |
+
│ └── ui/ # UI components
|
| 57 |
+
├── data/
|
| 58 |
+
│ ├── db/ # SQLite database
|
| 59 |
+
│ └── clips/ # Extracted audio clips
|
| 60 |
+
└── tests/
|
| 61 |
+
└── audio/ # Test audio files
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
## Usage
|
| 65 |
+
|
| 66 |
+
1. **Analyzer tab**: Upload or record audio → Analyze → View results
|
| 67 |
+
2. **Database tab**: Browse all voiceprints → Track across tests
|
| 68 |
+
|
| 69 |
+
## Tech Stack
|
| 70 |
+
|
| 71 |
+
- **SpeechBrain**: VAD, diarization, speaker recognition
|
| 72 |
+
- **Whisper**: Transcription, wake word detection
|
| 73 |
+
- **Streamlit**: Web UI
|
| 74 |
+
- **SQLite**: Voiceprint database
|
app.py
ADDED
|
@@ -0,0 +1,1361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SOP Audio Analyzer - Streamlit UI
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import tempfile
|
| 7 |
+
import wave
|
| 8 |
+
import numpy as np
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
|
| 11 |
+
import av
|
| 12 |
+
import queue
|
| 13 |
+
import threading
|
| 14 |
+
import time
|
| 15 |
+
import plotly.graph_objects as go
|
| 16 |
+
|
| 17 |
+
# Page config
|
| 18 |
+
st.set_page_config(
|
| 19 |
+
page_title="Test Integrity Analysis",
|
| 20 |
+
page_icon="🎙️",
|
| 21 |
+
layout="wide"
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
# ============ SIMPLE LOGIN ============
# NOTE(security): credentials were hard-coded in the source. They can now be
# overridden via environment variables; the defaults preserve the original
# behavior so existing deployments keep working.
APP_USERNAME = os.environ.get("APP_USERNAME", "PTEXAdmin")
APP_PASSWORD = os.environ.get("APP_PASSWORD", "T3st@26")

def check_login():
    """Simple username/password gate for the app.

    Renders a login form until the user authenticates, storing the outcome
    in ``st.session_state['authenticated']``.

    Returns:
        bool: True once the user is authenticated, False while the login
        form is still being shown.
    """
    import hmac  # local import: only needed for the credential comparison

    if 'authenticated' not in st.session_state:
        st.session_state['authenticated'] = False

    if not st.session_state['authenticated']:
        st.markdown("## 🔐 Login")
        st.markdown("Enter credentials to access the application")

        col1, col2, col3 = st.columns([1, 2, 1])
        with col2:
            username = st.text_input("Username", key="login_username")
            password = st.text_input("Password", type="password", key="login_password")
            if st.button("Login", use_container_width=True):
                # Constant-time comparison avoids trivial timing side-channels.
                user_ok = hmac.compare_digest(username, APP_USERNAME)
                pass_ok = hmac.compare_digest(password, APP_PASSWORD)
                if user_ok and pass_ok:
                    st.session_state['authenticated'] = True
                    st.session_state['username'] = username
                    st.rerun()
                else:
                    st.error("Invalid credentials")
        return False
    return True
|
| 50 |
+
|
| 51 |
+
# Initialize analyzer (lazy) - v2 forces reload
@st.cache_resource
def get_analyzer():
    """Build and cache a single AudioAnalyzer for the app process.

    The import is deferred to first call so the heavy model loading inside
    ``src.analyzer`` does not run at app startup.
    """
    from src.analyzer import AudioAnalyzer
    return AudioAnalyzer()
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class AudioProcessor(AudioProcessorBase):
    """Audio processor that records WebRTC frames and exposes level data.

    Frames arrive on the WebRTC worker thread while the UI thread reads
    them, so all access to ``audio_frames`` goes through ``self.lock``.
    """

    # Cap the level queue so a UI that stops draining it cannot grow memory
    # without bound. BUGFIX: the original used an unbounded queue.Queue(),
    # which never raises queue.Full, making the except branch dead code.
    MAX_LEVEL_SAMPLES = 100

    def __init__(self):
        self.audio_frames = []      # raw ndarray chunks, in arrival order
        self.sample_rate = 48000    # WebRTC default capture rate
        self.lock = threading.Lock()
        self.level_queue = queue.Queue(maxsize=self.MAX_LEVEL_SAMPLES)

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        """Store the incoming frame and push its RMS level for the UI."""
        sound = frame.to_ndarray()

        with self.lock:
            self.audio_frames.append(sound.copy())

        # RMS -> dB; floor at 1e-10 to avoid log10(0).
        rms = np.sqrt(np.mean(sound.astype(np.float32) ** 2))
        level_db = 20 * np.log10(max(rms, 1e-10))

        try:
            self.level_queue.put_nowait(level_db)
        except queue.Full:
            pass  # UI is not consuming levels fast enough; drop this sample

        return frame

    def get_audio_data(self):
        """Return all recorded audio concatenated along time, or None."""
        with self.lock:
            if not self.audio_frames:
                return None
            return np.concatenate(self.audio_frames, axis=1)

    def get_frame_count(self):
        """Return the number of frames recorded so far."""
        with self.lock:
            return len(self.audio_frames)

    def clear(self):
        """Discard all recorded frames."""
        with self.lock:
            self.audio_frames = []

    def save_to_wav(self, filepath: str) -> bool:
        """Save the recording to *filepath* as 16-bit mono PCM WAV.

        Multi-channel audio is averaged down to mono and the signal is
        peak-normalized before quantization.

        Returns:
            bool: False when nothing has been recorded, True on success.
        """
        audio_data = self.get_audio_data()
        if audio_data is None or audio_data.size == 0:
            return False

        # Convert to mono if stereo
        if len(audio_data.shape) > 1 and audio_data.shape[0] > 1:
            audio_data = audio_data.mean(axis=0)
        else:
            audio_data = audio_data.flatten()

        # Peak-normalize, then quantize to signed 16-bit
        audio_data = audio_data.astype(np.float32)
        max_val = np.abs(audio_data).max()
        if max_val > 0:
            audio_data = audio_data / max_val
        audio_int16 = (audio_data * 32767).astype(np.int16)

        # Save WAV
        with wave.open(filepath, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(self.sample_rate)
            wf.writeframes(audio_int16.tobytes())

        return True
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def render_waveform(audio_data, sample_rate):
    """Build a plotly waveform figure for *audio_data*, or None if empty.

    The signal is mixed down to mono, downsampled to at most ~1000 points
    for display, and peak-normalized to [-1, 1].
    """
    if audio_data is None:
        return None

    # Flatten to mono
    if len(audio_data.shape) > 1:
        audio_data = audio_data.mean(axis=0)
    else:
        audio_data = audio_data.flatten()

    # Downsample for display. BUGFIX: the original probed `'step' in dir()`
    # to decide whether downsampling had happened; track the stride
    # explicitly instead so the duration math is always well-defined.
    max_points = 1000
    step = 1
    if len(audio_data) > max_points:
        step = len(audio_data) // max_points
        audio_data = audio_data[::step]

    # Time axis in seconds of the (possibly downsampled) signal
    duration = len(audio_data) / (sample_rate / step)
    time_axis = np.linspace(0, duration, len(audio_data))

    # Peak-normalize for a stable y range
    max_val = np.abs(audio_data).max()
    if max_val > 0:
        audio_data = audio_data / max_val

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=time_axis,
        y=audio_data,
        mode='lines',
        line=dict(color='#2563eb', width=1),
        fill='tozeroy',
        fillcolor='rgba(37, 99, 235, 0.3)'
    ))

    fig.update_layout(
        height=150,
        margin=dict(l=0, r=0, t=10, b=30),
        xaxis=dict(title='Time (s)', showgrid=True),
        yaxis=dict(visible=False, range=[-1, 1]),
        showlegend=False
    )

    return fig
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def format_time(seconds: float) -> str:
    """Format a duration in seconds as M:SS, or H:MM:SS for >= 1 hour."""
    total = int(seconds)
    hours = total // 3600
    minutes, secs = divmod(total % 3600, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes}:{secs:02d}"
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def save_audio_to_wav(audio_data, sample_rate, filepath):
    """Write a numpy audio buffer to *filepath* as 16-bit mono PCM WAV.

    Multi-channel input is averaged down to mono and the signal is
    peak-normalized before quantization.

    Returns:
        bool: False when there is no audio to write, True on success.
    """
    if audio_data is None or audio_data.size == 0:
        return False

    # Collapse to a single channel: average across channels if stereo+.
    if len(audio_data.shape) > 1 and audio_data.shape[0] > 1:
        mono = audio_data.mean(axis=0)
    else:
        mono = audio_data.flatten()

    # Peak-normalize, then quantize to signed 16-bit.
    mono = mono.astype(np.float32)
    peak = np.abs(mono).max()
    if peak > 0:
        mono = mono / peak
    samples = (mono * 32767).astype(np.int16)

    with wave.open(filepath, 'wb') as wav_out:
        wav_out.setnchannels(1)
        wav_out.setsampwidth(2)
        wav_out.setframerate(sample_rate)
        wav_out.writeframes(samples.tobytes())

    return True
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def render_analyzer_tab():
    """Render the analyzer tab: record or upload audio, then analyze it."""

    # Initialize session state
    if 'recorded_audio_path' not in st.session_state:
        st.session_state.recorded_audio_path = None

    # Custom CSS for better styling
    st.markdown("""
    <style>
    .recording-container {
        background: linear-gradient(135deg, #1e3a5f 0%, #2d5a87 100%);
        border-radius: 16px;
        padding: 2rem;
        text-align: center;
        margin-bottom: 1rem;
    }
    .upload-icon {
        font-size: 1.5rem;
        cursor: pointer;
    }
    .stAudio > div {
        border-radius: 8px;
    }
    </style>
    """, unsafe_allow_html=True)

    # Main recording area - centered and prominent
    col_left, col_main, col_right = st.columns([1, 3, 1])

    with col_main:
        st.markdown("#### 🎙️ Record Audio")
        st.caption("Minimum 20 seconds required")

        # Audio recorder with visual feedback
        recorded_audio = st.audio_input("", key="audio_recorder", label_visibility="collapsed")

        if recorded_audio:
            # BUGFIX: tempfile.mktemp() is deprecated and race-prone; create
            # the file atomically with NamedTemporaryFile instead.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
                tmp.write(recorded_audio.getbuffer())
                temp_path = tmp.name
            st.session_state.recorded_audio_path = temp_path

            # Show audio duration (uses the module-level `wave` import; the
            # original re-imported wave locally for no reason).
            with wave.open(temp_path, 'rb') as wf:
                frames = wf.getnframes()
                rate = wf.getframerate()
                duration = frames / float(rate)

            if duration < 20:
                st.warning(f"⚠️ Audio: {duration:.1f}s - Need at least 20s")
            else:
                st.success(f"✅ Audio ready: {duration:.1f}s")

            # Analyze button
            if st.button("🔍 Analyze", type="primary", use_container_width=True):
                analyze_recorded_audio(temp_path)

    # Upload button - opens modal dialog
    with col_right:
        if st.button("📤", help="Upload audio file", key="open_upload_modal"):
            st.session_state['show_upload_modal'] = True

    # Upload modal dialog
    @st.dialog("Upload Audio File")
    def upload_dialog():
        uploaded_file = st.file_uploader(
            "Select audio file",
            type=['wav', 'mp3', 'm4a', 'ogg', 'flac'],
            key="audio_uploader_modal"
        )
        if uploaded_file is not None:
            st.audio(uploaded_file, format=f'audio/{uploaded_file.type.split("/")[-1]}')
            if st.button("🔍 Analyze", use_container_width=True):
                st.session_state['show_upload_modal'] = False
                analyze_audio(uploaded_file)

    if st.session_state.get('show_upload_modal', False):
        upload_dialog()
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def analyze_recorded_audio(audio_path: str):
    """Run analysis on a recorded audio file and render the results.

    The temp recording is always deleted afterwards. BUGFIX: the original
    only cleaned up on success, leaking the temp file (and a stale
    ``recorded_audio_path`` in session state) whenever analysis raised.
    """
    try:
        # Clear analyzer cache to ensure fresh analysis
        st.cache_resource.clear()

        # Progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()

        def update_progress(msg, pct):
            progress_bar.progress(pct / 100)
            status_text.text(msg)

        # Run analysis
        analyzer = get_analyzer()
        result = analyzer.analyze(audio_path, progress_callback=update_progress)

        # Clear progress
        progress_bar.empty()
        status_text.empty()

        # Store result in session state
        st.session_state['last_result'] = result

        # Display results
        render_results(result)

    except Exception as e:
        st.error(f"Analysis failed: {str(e)}")
    finally:
        # Always clear the recorded audio, even when analysis failed
        if st.session_state.recorded_audio_path and os.path.exists(st.session_state.recorded_audio_path):
            os.remove(st.session_state.recorded_audio_path)
        st.session_state.recorded_audio_path = None
        st.session_state.recording_complete = False
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
def analyze_audio(uploaded_file):
    """Run analysis on an uploaded audio file and render the results."""
    # Persist the upload to disk so the analyzer can read it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
        tmp.write(uploaded_file.getbuffer())
        tmp_path = tmp.name

    try:
        # Drop any cached analyzer so this run starts fresh.
        st.cache_resource.clear()

        # Progress widgets updated by the analyzer callback.
        progress = st.progress(0)
        status = st.empty()

        def on_progress(message, percent):
            progress.progress(percent / 100)
            status.text(message)

        result = get_analyzer().analyze(tmp_path, progress_callback=on_progress)

        progress.empty()
        status.empty()

        st.session_state['last_result'] = result

        # Keep the raw bytes around so the UI can replay the audio later.
        with open(tmp_path, 'rb') as f:
            st.session_state['last_audio_bytes'] = f.read()

        render_results(result)

    finally:
        # Remove the temp copy whether or not analysis succeeded.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def calc_speaking_time(speaker, max_duration):
    """Return the speaker's net talking time in seconds.

    Overlapping diarization segments are merged and each segment is clipped
    to [0, max_duration], so the result can never exceed the clip length.
    Falls back to the reported total (capped) when no segments exist.
    """
    if not speaker.segments:
        return min(speaker.total_seconds, max_duration)

    intervals = []
    for seg in sorted(speaker.segments, key=lambda s: s['start']):
        start = max(0, seg['start'])
        end = min(seg['end'], max_duration)
        if start >= max_duration:
            continue  # segment lies entirely past the end of the audio
        if intervals and start <= intervals[-1][1]:
            # Overlaps (or touches) the previous interval: extend it.
            prev_start, prev_end = intervals[-1]
            intervals[-1] = (prev_start, max(prev_end, end))
        else:
            intervals.append((start, end))
    return sum(end - start for start, end in intervals)
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
def render_results(result):
    """Render analysis results in a compact three-column layout."""
    st.markdown("---")
    st.markdown("## Analysis Results")

    # Row 1: main speaker | detection flags | alerts
    left, middle, right = st.columns([1.2, 1, 1])

    with left:
        st.markdown("#### 🎤 Main Speaker")
        if result.main_speaker:
            spk = result.main_speaker
            st.code(spk.voiceprint_id, language=None)

            quality_badge = {"High": "🟢", "Medium": "🟡", "Low": "🔴"}.get(spk.quality, "⚪")
            synth_badge = "✅" if not spk.is_synthetic else "⚠️"

            talk_secs = calc_speaking_time(spk, result.duration_seconds)
            st.markdown(f"{quality_badge} Quality: **{spk.quality}** · {talk_secs:.1f}s")
            st.markdown(f"{synth_badge} Synthetic risk: **{spk.synthetic_score:.0%}**")

            # Voice sample playback, if a clip was extracted
            sample_path = getattr(spk, 'clip_path', None)
            if sample_path and os.path.exists(sample_path):
                with open(sample_path, 'rb') as sample:
                    st.audio(sample.read(), format='audio/wav')

    with middle:
        st.markdown("#### 🔍 Detection")
        # Synthetic-voice flag from the main speaker
        is_synth = result.main_speaker and result.main_speaker.is_synthetic
        synth_badge = "⚠️" if is_synth else "✅"
        st.markdown(f"{synth_badge} **Synthetic voice:** {'Yes' if is_synth else 'No'}")
        st.caption("AI-generated (ElevenLabs, clones)")

        # Playback (audio routed through speakers)
        has_playback = getattr(result, 'playback_detected', False)
        playback_pct = getattr(result, 'playback_score', 0.0)
        playback_badge = "🔊" if has_playback else "✅"
        st.markdown(f"{playback_badge} **Playback:** {'Yes' if has_playback else 'No'} ({playback_pct:.0%})")
        st.caption("Audio from speakers")

        # Reading-aloud rhythm
        is_reading = getattr(result, 'reading_pattern_detected', False)
        reading_pct = getattr(result, 'reading_confidence', 0.0)
        reading_badge = "📖" if is_reading else "✅"
        st.markdown(f"{reading_badge} **Reading:** {'Yes' if is_reading else 'No'} ({reading_pct:.0%})")
        st.caption("Unnatural speech rhythm")

    with right:
        st.markdown("#### 🚨 Alerts")
        # Wake words (show at most two)
        if result.wake_words:
            for hit in result.wake_words[:2]:
                st.markdown(f"🔴 **\"{hit['word']}\"** @ {format_time(hit['time'])}")
        else:
            st.markdown("✅ No wake words")
        st.caption("Alexa, Siri, transfer...")

        # Whispering in the background
        has_whispers = getattr(result, 'whisper_detected', False)
        whispers = getattr(result, 'whisper_instances', []) or []
        if has_whispers:
            st.markdown(f"🔇 **{len(whispers)} whispers**")
            for inst in whispers[:2]:
                st.markdown(f"· @ {format_time(inst['start'])} ({inst['confidence']:.0%})")
        else:
            st.markdown("✅ No whispers")
        st.caption("Background voices")

        # Long silences
        has_pauses = getattr(result, 'suspicious_pauses_detected', False)
        pause_list = getattr(result, 'suspicious_pauses', []) or []
        max_pause = getattr(result, 'longest_pause', 0.0)
        if has_pauses:
            st.markdown(f"⏸️ **{len(pause_list)} long pauses** (max {max_pause:.0f}s)")
        else:
            st.markdown("✅ No suspicious pauses")
        st.caption("Silences > 5 seconds")

    # Row 2: additional speakers
    st.markdown("---")

    extras = result.additional_speakers
    if extras:
        combined = sum(calc_speaking_time(s, result.duration_seconds) for s in extras)
        st.markdown(f"#### 👥 Additional Speakers ({len(extras)}) · {combined:.1f}s total")

        letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']

        # Lay speakers out in rows of up to three cards
        per_row = min(3, len(extras))
        for base in range(0, len(extras), per_row):
            chunk = extras[base:base + per_row]
            cards = st.columns(per_row)

            for offset, spk in enumerate(chunk):
                idx = base + offset
                label = f"Speaker {letters[idx]}" if idx < len(letters) else f"Speaker {idx+1}"

                with cards[offset]:
                    kind = "⚠️ synthetic" if spk.is_synthetic else "human"
                    seen = f"🚨 {spk.times_seen}x" if spk.times_seen > 1 else "1st"
                    talk_secs = calc_speaking_time(spk, result.duration_seconds)

                    st.markdown(f"**{label}** · {talk_secs:.1f}s · {kind} · {seen}")
                    st.code(spk.voiceprint_id, language=None)

                    # Voice sample playback
                    sample_path = getattr(spk, 'clip_path', None)
                    if sample_path and os.path.exists(sample_path):
                        with open(sample_path, 'rb') as sample:
                            st.audio(sample.read(), format='audio/wav')

                    # Cross-test history shortcut for repeat voices
                    if spk.times_seen > 1:
                        if st.button("History", key=f"hist_{spk.voiceprint_id}_{idx}"):
                            st.session_state['view_voiceprint'] = spk.voiceprint_id
                            st.session_state['active_tab'] = 'database'
                            st.rerun()
    else:
        st.markdown("✅ No additional speakers detected")

    # Timeline + export
    st.markdown("---")
    st.markdown("#### 📊 Timeline")

    render_timeline(result)

    st.download_button(
        label="📥 Download JSON",
        data=result.to_json(),
        file_name=f"{result.test_id}_analysis.json",
        mime="application/json"
    )
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
def render_timeline(result):
    """Render a per-speaker timeline of the analyzed audio.

    Draws one horizontal row per speaker (main speaker first, then
    additional speakers labelled A, B, C, ... to match the diarization
    labels) plus a bottom "Events" row holding wake words, background
    anomalies, whispers and suspicious pauses.  Finishes with a legend
    caption and, when available, an audio player for synchronized review.

    Args:
        result: Analysis result object exposing ``duration_seconds``,
            ``main_speaker``, ``additional_speakers`` and (optionally)
            the event lists read below.
    """
    import plotly.graph_objects as go

    fig = go.Figure()
    duration = result.duration_seconds

    # Build the y-axis rows: main speaker on top, then additional speakers.
    speakers = []
    speaker_colors = {}

    if result.main_speaker:
        speakers.append(('Main Speaker', result.main_speaker))
        speaker_colors['Main Speaker'] = '#2563eb'

    # Letters (A, B, C...) match the labels used by the diarization view.
    additional_colors = ['#dc2626', '#ea580c', '#ca8a04', '#16a34a', '#9333ea']
    speaker_letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
    for i, speaker in enumerate(result.additional_speakers):
        label = f"Speaker {speaker_letters[i]}" if i < len(speaker_letters) else f"Speaker {i+1}"
        speakers.append((label, speaker))
        speaker_colors[label] = additional_colors[i % len(additional_colors)]

    # Dedicated bottom row for point events.
    speakers.append(('Events', None))

    y_positions = {label: i for i, (label, _) in enumerate(speakers)}
    y_labels = [label for label, _ in speakers]

    bar_height = 0.4

    # Speech segments as rectangle shapes; plotly shapes have no hover text,
    # so each gets an invisible companion marker carrying the tooltip.
    for label, speaker in speakers:
        if speaker is None or not hasattr(speaker, 'segments') or not speaker.segments:
            continue

        y_pos = y_positions[label]
        color = speaker_colors.get(label, '#6b7280')

        for seg in speaker.segments:
            fig.add_shape(
                type="rect",
                x0=seg['start'],
                x1=seg['end'],
                y0=y_pos - bar_height,
                y1=y_pos + bar_height,
                fillcolor=color,
                line=dict(width=0),
                opacity=0.8
            )
            _add_timeline_marker(
                fig,
                x=(seg['start'] + seg['end']) / 2,
                y=y_pos,
                marker=dict(size=1, opacity=0),
                text=f"{label}: {seg['start']:.1f}s - {seg['end']:.1f}s ({seg['end']-seg['start']:.1f}s)",
            )

    events_y = y_positions['Events']

    # Read every event list defensively (the original guarded only two of
    # the four) so results from older analyzer versions still render.
    wake_words = getattr(result, 'wake_words', []) or []
    background_anomalies = getattr(result, 'background_anomalies', []) or []
    whisper_instances = getattr(result, 'whisper_instances', []) or []
    suspicious_pauses = getattr(result, 'suspicious_pauses', []) or []

    # Wake words as red diamonds.
    for ww in wake_words:
        _add_timeline_marker(
            fig, x=ww['time'], y=events_y,
            marker=dict(size=14, color='#dc2626', symbol='diamond'),
            text=f"Wake Word: {ww['word']} ({ww['confidence']:.0%})",
        )

    # Anomalies as yellow triangles.
    for anom in background_anomalies:
        _add_timeline_marker(
            fig, x=anom['start'], y=events_y,
            marker=dict(size=12, color='#eab308', symbol='triangle-up'),
            text=f"Anomaly: {anom['type']} ({anom['confidence']:.0%})",
        )

    # Whispers as purple circles.
    for whisper in whisper_instances:
        _add_timeline_marker(
            fig, x=whisper['start'], y=events_y,
            marker=dict(size=12, color='#9333ea', symbol='circle'),
            text=f"Whisper: {whisper['start']:.1f}s - {whisper['end']:.1f}s ({whisper['confidence']:.0%})",
        )

    # Suspicious pauses as semi-transparent gray bars plus a hover marker.
    for pause in suspicious_pauses:
        fig.add_shape(
            type="rect",
            x0=pause['start'],
            x1=pause['end'],
            y0=events_y - bar_height,
            y1=events_y + bar_height,
            fillcolor='rgba(107, 114, 128, 0.5)',
            line=dict(color='#6b7280', width=1),
        )
        _add_timeline_marker(
            fig, x=(pause['start'] + pause['end']) / 2, y=events_y,
            marker=dict(size=10, color='#6b7280', symbol='square'),
            text=f"Pause: {pause['duration']:.1f}s ({pause['start']:.1f}s - {pause['end']:.1f}s)",
        )

    # Dynamic chart height so every speaker row stays readable.
    row_height = 50
    chart_height = max(180, len(speakers) * row_height + 60)

    fig.update_layout(
        height=chart_height,
        margin=dict(l=100, r=20, t=20, b=40),
        xaxis=dict(
            range=[0, duration],
            title='Time (seconds)',
            showgrid=True,
            gridcolor='rgba(128,128,128,0.2)'
        ),
        yaxis=dict(
            tickmode='array',
            tickvals=list(range(len(y_labels))),
            ticktext=y_labels,
            range=[-0.8, len(y_labels) - 0.2],
            showgrid=True,
            gridcolor='rgba(128,128,128,0.1)'
        ),
        showlegend=False,
        plot_bgcolor='rgba(0,0,0,0)'
    )

    # Legend caption for whichever event types are present.
    if wake_words or background_anomalies or whisper_instances or suspicious_pauses:
        legend_text = []
        if wake_words:
            legend_text.append("◆ Wake Words")
        if background_anomalies:
            legend_text.append("▲ Anomalies")
        if whisper_instances:
            legend_text.append("● Whispers")
        if suspicious_pauses:
            legend_text.append("■ Long Pauses")
        st.caption(" | ".join(legend_text))

    st.plotly_chart(fig, use_container_width=True)

    # Audio player below the timeline for synchronized playback.
    if st.session_state.get('last_audio_bytes'):
        st.markdown("**🔊 Audio Playback**")
        st.audio(st.session_state['last_audio_bytes'], format='audio/wav')
        st.caption("Play audio while viewing the timeline above to follow speaker changes")


def _add_timeline_marker(fig, x, y, marker, text):
    """Add a single hoverable scatter marker to *fig* at (x, y)."""
    import plotly.graph_objects as go

    fig.add_trace(go.Scatter(
        x=[x],
        y=[y],
        mode='markers',
        marker=marker,
        hoverinfo='text',
        hovertext=text,
        showlegend=False
    ))
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
def render_database_tab():
    """Render the voiceprint database tab.

    Shows aggregate KPI cards, a search/filter box, and one compact row per
    stored voiceprint with an inline audio sample, a flag toggle and an
    expander listing every test the voice appeared in (with editable
    name/notes and per-appearance audio clips).
    """
    st.markdown("### Voiceprint Database")

    analyzer = get_analyzer()
    stats = analyzer.get_database_stats()

    # KPI cards with short descriptions.
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        with st.container(border=True):
            st.metric("Tests Analyzed", stats['total_tests'])
            st.caption("Audio files processed")
    with col2:
        with st.container(border=True):
            st.metric("Unique Voices", stats['total_voiceprints'])
            st.caption("Distinct speakers identified")
    with col3:
        with st.container(border=True):
            st.metric("Flagged", stats['flagged_voiceprints'])
            st.caption("Suspicious voices marked")
    with col4:
        with st.container(border=True):
            st.metric("Recurring", stats['multi_appearance'])
            st.caption("Voices in 2+ tests")

    # Honor a pending "clear search" request set on the previous run
    # (the text_input widget state can only be reset across a rerun).
    if st.session_state.get('clear_db_search', False):
        st.session_state['clear_db_search'] = False
        default_search = ""
    else:
        default_search = st.session_state.get('db_search_value', "")

    # Filter input with a clear button.
    filter_col1, filter_col2 = st.columns([5, 1])
    with filter_col1:
        search_query = st.text_input("🔍 Filter", value=default_search, placeholder="Voice ID or File name...", key="db_search_input", label_visibility="collapsed")
        st.session_state['db_search_value'] = search_query
    with filter_col2:
        if search_query and st.button("✕ Clear", key="clear_search", use_container_width=True):
            st.session_state['clear_db_search'] = True
            st.rerun()

    all_vps = analyzer.db.get_all_voiceprints()
    if not all_vps:
        st.info("No voiceprints yet. Analyze audio to get started!")
        return

    # Apply the text filter (voice id, label, or any appearance filename).
    if search_query:
        all_vps = [vp for vp in all_vps if _voiceprint_matches(vp, search_query, analyzer)]
        if not all_vps:
            st.warning(f"No results for '{search_query}'")
            return

    st.caption(f"Showing {len(all_vps)} voiceprint(s)")
    st.markdown("---")

    # One compact row per voiceprint: Flag | Name | Audio | Tests | Button
    for i, vp in enumerate(all_vps):
        flag_icon = "🚨" if vp.is_flagged else ("🟡" if vp.times_seen >= 2 else "✅")
        label = vp.label if hasattr(vp, 'label') and vp.label else ""
        display_name = label if label else vp.id

        appearances = analyzer.get_voiceprint_history(vp.id)

        cols = st.columns([0.3, 1.5, 2.5, 0.5, 0.8])

        with cols[0]:
            st.write(flag_icon)

        with cols[1]:
            st.write(f"**{display_name}** · {vp.total_audio_seconds:.0f}s")

        with cols[2]:
            # Voice sample: most recent clip, if it still exists on disk.
            if appearances and appearances[0].get('clip_path'):
                clip_path = appearances[0]['clip_path']
                if os.path.exists(clip_path):
                    with open(clip_path, 'rb') as f:
                        st.audio(f.read(), format='audio/wav')
                else:
                    st.caption("—")
            else:
                st.caption("—")

        with cols[3]:
            st.write(f"**{vp.times_seen}** tests")

        with cols[4]:
            # Flag toggle button.
            btn_label = "Unflag" if vp.is_flagged else "Flag"
            if st.button(btn_label, key=f"flag_{vp.id}_{i}"):
                analyzer.db.toggle_voiceprint_flag(vp.id, not vp.is_flagged, "Manual" if not vp.is_flagged else None)
                st.rerun()

        # Accordion with editable metadata and per-test appearances.
        if appearances and len(appearances) > 0:
            with st.expander(f"📋 {len(appearances)} test appearances", expanded=False):
                edit_cols = st.columns([1, 2])
                with edit_cols[0]:
                    new_label = st.text_input("Name", value=label, key=f"label_{vp.id}_{i}", placeholder="Add name...")
                    if new_label != label:
                        if st.button("Save name", key=f"save_label_{vp.id}_{i}"):
                            analyzer.db.update_voiceprint_label(vp.id, new_label)
                            st.rerun()
                with edit_cols[1]:
                    notes = vp.notes if hasattr(vp, 'notes') and vp.notes else ""
                    new_notes = st.text_input("Notes", value=notes, key=f"notes_{vp.id}_{i}", placeholder="Add notes...")
                    if new_notes != notes:
                        if st.button("Save notes", key=f"save_notes_{vp.id}_{i}"):
                            analyzer.db.update_voiceprint_notes(vp.id, new_notes)
                            st.rerun()

                # Appearances table header.
                hdr_cols = st.columns([2, 2.5, 1, 1, 1.5])
                hdr_cols[0].caption("**Date**")
                hdr_cols[1].caption("**File**")
                hdr_cols[2].caption("**Role**")
                hdr_cols[3].caption("**Duration**")
                hdr_cols[4].caption("**Audio**")

                for j, app in enumerate(appearances):
                    row_cols = st.columns([2, 2.5, 1, 1, 1.5])
                    row_cols[0].write(_format_appearance_date(app['date']))
                    row_cols[1].write(app['filename'][:30] if app['filename'] else '-')
                    row_cols[2].write('👤' if app['role'] == 'main' else '👥')
                    row_cols[3].write(f"{app['duration']:.0f}s")

                    # Per-appearance audio clip, when still on disk.
                    clip_path = app.get('clip_path')
                    if clip_path and os.path.exists(clip_path):
                        with open(clip_path, 'rb') as f:
                            row_cols[4].audio(f.read(), format='audio/wav')
                    else:
                        row_cols[4].button("▶", disabled=True, key=f"no_audio_{vp.id}_{j}")


def _voiceprint_matches(vp, query, analyzer):
    """Return True when *query* matches the voiceprint id, label or any
    filename of a test it appeared in (case-insensitive substring match)."""
    q = query.lower()
    if q in vp.id.lower():
        return True
    if vp.label and q in vp.label.lower():
        return True
    for app in analyzer.get_voiceprint_history(vp.id):
        if app.get('filename') and q in app['filename'].lower():
            return True
    return False


def _format_appearance_date(raw):
    """Format an ISO timestamp string as ``dd/mm/yyyy HH:MM``.

    Falls back to the raw prefix when parsing fails instead of swallowing
    every exception with a bare ``except``.
    """
    from datetime import datetime

    if not raw:
        return '-'
    try:
        date_obj = datetime.fromisoformat(raw.replace('Z', '+00:00'))
        return date_obj.strftime('%d/%m/%Y %H:%M')
    except (ValueError, AttributeError):
        # Malformed or non-string timestamp — show a best-effort prefix.
        return raw[:16] if len(raw) >= 16 else raw
|
| 889 |
+
|
| 890 |
+
|
| 891 |
+
def render_voiceprint_card(vp, analyzer, compact=False, key_suffix=""):
    """Render a voiceprint summary card with an inline audio player.

    In the full layout the card shows a first-seen caption plus "View" and
    flag-toggle buttons; the compact layout collapses this to a single
    arrow button.
    """
    # Status icon: flagged > recurring > normal.
    if vp.is_flagged:
        flag_icon = "🚨"
    elif vp.times_seen >= 2:
        flag_icon = "🟡"
    else:
        flag_icon = "✅"

    label = vp.label if hasattr(vp, 'label') and vp.label else None
    display_name = f"{label} ({vp.id})" if label else vp.id
    unique_key = f"{vp.id}_{key_suffix}"

    if compact:
        cols = st.columns([4, 1, 1])
    else:
        cols = st.columns([3, 1, 1, 1])

    with cols[0]:
        st.markdown(f"**{flag_icon} {display_name}**")
        if not compact:
            st.caption(f"First: {vp.first_seen.strftime('%Y-%m-%d') if vp.first_seen else '-'} · {vp.total_audio_seconds:.0f}s total")

        # Most recent clip (if it still exists on disk) doubles as a sample.
        appearances = analyzer.get_voiceprint_history(vp.id)
        if appearances and appearances[0].get('clip_path'):
            clip_path = appearances[0]['clip_path']
            if os.path.exists(clip_path):
                with open(clip_path, 'rb') as f:
                    st.audio(f.read(), format='audio/wav')

    with cols[1]:
        st.metric("Tests", vp.times_seen, label_visibility="collapsed")

    if compact:
        with cols[2]:
            if st.button("→", key=f"view_{unique_key}"):
                st.session_state['view_voiceprint'] = vp.id
                st.rerun()
    else:
        with cols[2]:
            if st.button("View", key=f"view_{unique_key}"):
                st.session_state['view_voiceprint'] = vp.id
                st.rerun()
        with cols[3]:
            # Quick flag toggle.
            new_flag = not vp.is_flagged
            flag_label = "Unflag" if vp.is_flagged else "Flag"
            if st.button(flag_label, key=f"flag_{unique_key}"):
                analyzer.db.toggle_voiceprint_flag(vp.id, new_flag, "Manual flag" if new_flag else None)
                st.rerun()
|
| 934 |
+
|
| 935 |
+
|
| 936 |
+
def render_appearance_timeline(timeline_data):
    """Render a small bar chart of voiceprint appearances per day."""
    import plotly.express as px
    import pandas as pd

    if not timeline_data:
        return

    frame = pd.DataFrame(timeline_data)
    frame['date'] = pd.to_datetime(frame['date'])
    frame['day'] = frame['date'].dt.strftime('%Y-%m-%d')

    # Appearances per calendar day, oldest first.
    per_day = (
        frame.groupby('day')
        .size()
        .reset_index(name='count')
        .sort_values('day')
    )

    fig = px.bar(
        per_day,
        x='day',
        y='count',
        labels={'day': 'Date', 'count': 'Appearances'},
    )
    fig.update_layout(
        height=120,
        margin=dict(l=0, r=0, t=10, b=30),
        showlegend=False,
        xaxis=dict(
            type='category',
            tickangle=-45
        )
    )
    fig.update_traces(marker_color='#2563eb')

    st.plotly_chart(fig, use_container_width=True)
|
| 971 |
+
|
| 972 |
+
|
| 973 |
+
def render_voiceprint_detail(vp_id: str):
    """Render the detail view for a single voiceprint.

    Shows flag status, an editable name and notes, aggregate stats, the
    per-day appearance timeline, and a list of all test appearances with
    audio clips.

    Args:
        vp_id: Database id of the voiceprint to display.
    """
    analyzer = get_analyzer()

    vp = analyzer.db.get_voiceprint(vp_id)
    if not vp:
        st.error(f"Voiceprint {vp_id} not found")
        return

    # Header: status icon + id, flag toggle on the right.
    flag_icon = "🚨" if vp.is_flagged else "🟡" if vp.times_seen >= 2 else "✅"
    current_label = vp.label if hasattr(vp, 'label') and vp.label else ""

    col_title, col_flag = st.columns([4, 1])
    with col_title:
        st.markdown(f"## {flag_icon} {vp_id}")
    with col_flag:
        new_flag = not vp.is_flagged
        flag_btn = "🚩 Unflag" if vp.is_flagged else "🚩 Flag"
        if st.button(flag_btn, key="detail_flag"):
            analyzer.db.toggle_voiceprint_flag(vp_id, new_flag, "Manual flag" if new_flag else None)
            st.rerun()

    # Editable name/label.
    new_label = st.text_input("Name/Label", value=current_label, placeholder="e.g., Juan Pérez")
    if new_label != current_label:
        if st.button("💾 Save name"):
            analyzer.db.update_voiceprint_label(vp_id, new_label)
            st.success("Name saved!")
            st.rerun()

    # Stats row.
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Times Seen", vp.times_seen)
    with col2:
        st.metric("Total Audio", f"{vp.total_audio_seconds:.0f}s")
    with col3:
        st.metric("First Seen", vp.first_seen.strftime('%Y-%m-%d') if vp.first_seen else '-')

    if vp.flag_reason:
        st.warning(f"Flag reason: {vp.flag_reason}")

    # Notes/Comments.  Use a real (hidden) label instead of "" — an empty
    # label triggers Streamlit's empty-label accessibility warning.
    st.markdown("#### 📝 Notes")
    current_notes = vp.notes if hasattr(vp, 'notes') and vp.notes else ""
    new_notes = st.text_area(
        "Notes",
        value=current_notes,
        placeholder="Add notes about this voiceprint...",
        height=80,
        label_visibility="collapsed",
    )
    if new_notes != current_notes:
        if st.button("💾 Save notes"):
            analyzer.db.update_voiceprint_notes(vp_id, new_notes)
            st.success("Notes saved!")
            st.rerun()

    # Per-day appearance chart for this voiceprint.
    st.markdown("#### 📈 Appearance Timeline")
    vp_timeline = analyzer.db.get_appearance_timeline(vp_id)
    if vp_timeline:
        render_appearance_timeline(vp_timeline)

    # Per-test appearance list.
    st.markdown("#### 📋 Appearances")
    appearances = analyzer.get_voiceprint_history(vp_id)

    for app in appearances:
        with st.container():
            cols = st.columns([1, 2, 1, 2])

            with cols[0]:
                date_str = app['date'][:10] if app['date'] else '-'
                st.markdown(f"**{date_str}**")

            with cols[1]:
                st.caption(f"{app['filename']}")

            with cols[2]:
                role_icon = "👤" if app['role'] == 'main' else "👥"
                st.markdown(f"{role_icon} {app['duration']:.0f}s")

            with cols[3]:
                # Audio clip of this appearance, when still on disk.
                if app['clip_path'] and os.path.exists(app['clip_path']):
                    with open(app['clip_path'], 'rb') as f:
                        st.audio(f.read(), format='audio/wav')
|
| 1058 |
+
|
| 1059 |
+
|
| 1060 |
+
def render_about_tab():
    """Render the About tab: technical explanations of each detection stage."""

    st.markdown("## How It Works")
    st.markdown("This tool analyzes audio recordings to detect fraud patterns in voice-based assessments.")

    st.markdown("---")

    # Section 1: Speaker Diarization
    st.markdown("### 🎭 Speaker Diarization")
    with st.container(border=True):
        text_col, art_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**What is it?**
Speaker diarization answers the question *"who spoke when?"* — it segments audio by speaker identity.

**How we do it:**
- Extract voice embeddings using **ECAPA-TDNN** neural network
- Cluster similar embeddings to group speech by speaker
- Label speakers as A, B, C... based on speaking time

**Key metric:**
`Speaking time` — total seconds each speaker talks
""")
        with art_col:
            st.markdown("""
```
Audio Timeline:
├─ Speaker A ████░░████░░
├─ Speaker B ░░░░██░░░░██
└─ Speaker C ░░░░░░░░██░░
```
""")

    st.markdown("")

    # Section 2: Voiceprint Matching
    st.markdown("### 🔐 Voiceprint Matching")
    with st.container(border=True):
        text_col, stat_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**What is it?**
A voiceprint is a unique numerical representation (embedding) of a person's voice characteristics.

**How matching works:**
1. Extract 192-dimensional embedding vector
2. Compare with stored voiceprints using **cosine similarity**
3. If similarity > threshold → same person

**Threshold:**
""")
            st.code("similarity_threshold = 0.80 # 80% match required", language="python")
        with stat_col:
            st.metric("Embedding Size", "192-dim")
            st.metric("Match Threshold", "80%")
            st.metric("Model", "ECAPA-TDNN")

    st.markdown("")

    # Section 3: Synthetic Voice Detection
    st.markdown("### 🤖 Synthetic Voice Detection")
    with st.container(border=True):
        st.markdown("""
**What we detect:**
| Type | Description | Indicators |
|------|-------------|------------|
| **TTS** | Text-to-Speech (ElevenLabs, etc.) | Flat pitch, regular timing, smooth spectrum |
| **Voice Clone** | AI-generated voice copy | Unnatural prosody, artifacts |
| **Playback** | Pre-recorded audio through speakers | Room acoustics, compression artifacts |

**Detection methods:**
""")
        pitch_col, timing_col, spectral_col = st.columns(3)
        with pitch_col:
            st.markdown("""
**Pitch Analysis**
```python
# TTS has very consistent pitch
pitch_cv < 0.08 # Coefficient of variation
```
""")
        with timing_col:
            st.markdown("""
**Timing Regularity**
```python
# TTS has robotic timing
timing_std < 0.05 # seconds
```
""")
        with spectral_col:
            st.markdown("""
**Spectral Smoothness**
```python
# Natural speech has texture
spectral_flux > threshold
```
""")

    st.markdown("")

    # Section 4: Wake Word Detection
    st.markdown("### 🎯 Wake Word Detection")
    with st.container(border=True):
        text_col, alert_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**Purpose:**
Detect if someone is using voice assistants or getting external help during the test.

**Words we detect:**
""")
            st.code("""
WAKE_WORDS = [
    "alexa", "siri", "hey google", "ok google",
    "cortana", "hey chat", "transfer", "send money"
]
""", language="python")
        with alert_col:
            st.warning("🔴 **Alert triggered** when wake words detected")

    st.markdown("")

    # Section 5: Fraud Detection Module
    st.markdown("### 🕵️ Fraud Detection Module")
    with st.container(border=True):
        st.markdown("""
Three specialized detectors analyze speech patterns to identify potential cheating:
""")

        # Whisper Detection
        st.markdown("#### 🔇 Whisper Detection")
        text_col, code_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**Purpose:** Detect low-volume background voices that may indicate someone is being prompted.

**How it works:**
- Analyzes audio energy in frames (25ms windows)
- Calculates spectral centroid (whispers have higher frequencies)
- Measures zero-crossing rate (breathy sounds have higher ZCR)
- Filters segments that overlap with main speaker
""")
        with code_col:
            st.code("""
# Whisper characteristics
energy < 30% of main speech
spectral_centroid > 0.15
zero_crossing_rate > 0.1
""", language="python")

        st.markdown("---")

        # Reading Pattern Detection
        st.markdown("#### 📖 Reading Pattern Detection")
        text_col, code_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**Purpose:** Detect if someone is reading prepared answers vs speaking naturally.

**Indicators analyzed:**
| Indicator | Natural Speech | Reading |
|-----------|---------------|---------|
| Speech rate | Variable (CV > 0.15) | Constant |
| Filler words | 2+ per minute | Few/none |
| Pause pattern | Irregular | Regular |
| Self-corrections | Present | Absent |
""")
        with code_col:
            st.code("""
FILLER_WORDS = [
    'um', 'uh', 'like',
    'you know', 'basically',
    'i mean', 'sort of'
]
# < 2 fillers/min = suspicious
""", language="python")

        st.markdown("---")

        # Suspicious Pause Detection
        st.markdown("#### ⏸️ Suspicious Pause Detection")
        text_col, stat_col = st.columns([2, 1])
        with text_col:
            st.markdown("""
**Purpose:** Identify abnormally long silences that may indicate looking up answers.

**Pause classification:**
| Duration | Classification |
|----------|---------------|
| < 2s | Natural thinking pause |
| 2-5s | Extended pause (warning) |
| > 5s | **Suspicious** - may indicate cheating |

**Context captured:** What was said before/after the pause
""")
        with stat_col:
            st.metric("Suspicious Threshold", "> 5 seconds")
            st.caption("Long silences may indicate:")
            st.markdown("""
- Looking up answers
- Receiving external help
- Reading from a source
- Searching on phone/computer
""")

    st.markdown("")

    # Section 6: Technology Stack
    st.markdown("### 🛠️ Technology Stack")
    with st.container(border=True):
        sb_col, models_col, thresh_col = st.columns(3)
        with sb_col:
            st.markdown("""
**🧠 SpeechBrain**
- Open-source speech toolkit
- PyTorch-based
- Pre-trained models
- [speechbrain.github.io](https://speechbrain.github.io)
""")
        with models_col:
            st.markdown("""
**🎤 Models Used**
- `spkrec-ecapa-voxceleb` — Speaker embedding
- `vad-crdnn-libriparty` — Voice Activity Detection
- `asr-wav2vec2` — Transcription
""")
        with thresh_col:
            st.markdown("""
**📊 Thresholds**
| Parameter | Value |
|-----------|-------|
| Min audio | 20s |
| Voice match | 80% |
| Synthetic | 45% |
| Voice sample | 10s |
""")

    st.markdown("")

    # Section 7: Flags & Alerts
    st.markdown("### 🚨 Flags & Alert System")
    with st.container(border=True):
        st.markdown("""
| Icon | Status | Meaning |
|------|--------|---------|
| ✅ | OK | Voice appears normal, seen 1 time |
| 🟡 | Review | Voice seen in 2-3 tests — verify identity |
| 🚨 | Flagged | Voice seen 4+ times OR manually flagged — investigate |

**Detection indicators:**
| Icon | Detection | Description |
|------|-----------|-------------|
| 🔊 | Playback | Audio played through speakers |
| 📖 | Reading | Unnatural speech rhythm (reading prepared text) |
| 🔇 | Whispers | Background voices detected |
| ⏸️ | Long Pauses | Silences > 5 seconds |
| 🔴 | Wake Words | "Alexa", "Siri", etc. detected |

**Auto-flag conditions:**
- Same voice in 4+ different tests
- High synthetic voice score (>45%)
- Wake words detected during test
- Multiple fraud indicators triggered
""")

    st.markdown("---")
    st.caption("Built with SpeechBrain, Streamlit, and PyTorch · [GitHub](https://github.com/daasime/sop-audio-analyzer)")
|
| 1329 |
+
|
| 1330 |
+
|
| 1331 |
+
def main():
    """Streamlit entry point: login gate, sidebar session controls, tab layout.

    Renders nothing past the login form until ``check_login()`` reports an
    authenticated session. Side effects: mutates ``st.session_state`` on
    logout and triggers a rerun.
    """
    # Check login first — bail out early so unauthenticated users see only
    # whatever check_login() itself rendered.
    if not check_login():
        return

    st.title("🎙️ Test Integrity Analysis")
    st.markdown("Monitor and review voice authentication results")

    # Logout button in sidebar.
    with st.sidebar:
        # Plain string: the original used an f-string with no placeholders
        # (ruff F541). No username is tracked in session state to display
        # here — TODO confirm whether one should be.
        st.markdown("**Logged in**")
        if st.button("🚪 Logout"):
            st.session_state['authenticated'] = False
            st.rerun()

    # Main navigation tabs; each renderer owns its tab's full content.
    tab1, tab2, tab3 = st.tabs(["Analyzer", "Database", "About"])

    with tab1:
        render_analyzer_tab()

    with tab2:
        render_database_tab()

    with tab3:
        render_about_tab()


if __name__ == "__main__":
    main()
|
requirements.txt
CHANGED
|
@@ -1,3 +1,28 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core ML - compatible with Python 3.11 (for HF Spaces)
|
| 2 |
+
torch==2.5.1
|
| 3 |
+
torchaudio==2.5.1
|
| 4 |
+
speechbrain>=1.0.0
|
| 5 |
+
|
| 6 |
+
# Audio processing
|
| 7 |
+
librosa>=0.10.0
|
| 8 |
+
soundfile>=0.12.1
|
| 9 |
+
pydub>=0.25.1
|
| 10 |
+
|
| 11 |
+
# Transcription
|
| 12 |
+
openai-whisper>=20231117
|
| 13 |
+
|
| 14 |
+
# Scientific computing
|
| 15 |
+
numpy>=1.24.0
|
| 16 |
+
scipy>=1.10.0
|
| 17 |
+
scikit-learn>=1.3.0
|
| 18 |
+
|
| 19 |
+
# Database
|
| 20 |
+
sqlalchemy>=2.0.0
|
| 21 |
+
|
| 22 |
+
# UI
|
| 23 |
+
streamlit>=1.29.0
|
| 24 |
+
streamlit-webrtc>=0.47.0
|
| 25 |
+
plotly>=5.18.0
|
| 26 |
+
|
| 27 |
+
# Utilities
|
| 28 |
+
python-dotenv>=1.0.0
|
src/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Patch huggingface_hub to handle the deprecated `use_auth_token` parameter.
# This is needed because speechbrain still calls the old API, while newer
# huggingface_hub releases only accept `token`.
import huggingface_hub


def _normalize_token_kwarg(kwargs):
    """Translate the legacy `use_auth_token` kwarg to `token` in place.

    An explicitly supplied `token` wins: the original patch unconditionally
    overwrote it with the deprecated value, which could silently replace a
    valid credential.
    """
    if 'use_auth_token' in kwargs:
        legacy_value = kwargs.pop('use_auth_token')
        kwargs.setdefault('token', legacy_value)
    return kwargs


_original_hf_hub_download = huggingface_hub.hf_hub_download


def _patched_hf_hub_download(*args, **kwargs):
    """Drop-in wrapper around hf_hub_download that accepts the legacy kwarg."""
    return _original_hf_hub_download(*args, **_normalize_token_kwarg(kwargs))


huggingface_hub.hf_hub_download = _patched_hf_hub_download

# Also patch snapshot_download if this huggingface_hub version exposes it.
if hasattr(huggingface_hub, 'snapshot_download'):
    _original_snapshot_download = huggingface_hub.snapshot_download

    def _patched_snapshot_download(*args, **kwargs):
        """Drop-in wrapper around snapshot_download that accepts the legacy kwarg."""
        return _original_snapshot_download(*args, **_normalize_token_kwarg(kwargs))

    huggingface_hub.snapshot_download = _patched_snapshot_download

from .analyzer import AudioAnalyzer, AnalysisResult, SpeakerResult

__all__ = ['AudioAnalyzer', 'AnalysisResult', 'SpeakerResult']
|
src/analyzer.py
ADDED
|
@@ -0,0 +1,597 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main Audio Analyzer - orchestrates all analysis phases.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
import uuid
|
| 6 |
+
import json
|
| 7 |
+
import tempfile
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from typing import Dict, List, Optional, Callable
|
| 10 |
+
from dataclasses import dataclass, asdict
|
| 11 |
+
import numpy as np
|
| 12 |
+
import torch
|
| 13 |
+
import torchaudio
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def to_python_type(obj):
    """Recursively convert numpy types to Python native types for JSON serialization.

    json.dumps cannot handle numpy scalars or arrays; this walks dicts,
    lists and tuples, converting every numpy value it finds. Tuples are
    returned as lists (matching what json.dumps emits for sequences anyway).
    Unrecognized objects are returned unchanged.
    """
    if isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    elif isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {k: to_python_type(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        # Fix: tuples previously fell through unconverted, so a tuple of
        # numpy scalars would still break json.dumps downstream.
        return [to_python_type(i) for i in obj]
    return obj
|
| 31 |
+
|
| 32 |
+
from .phase1_foundation import (
|
| 33 |
+
AudioPreprocessor,
|
| 34 |
+
VoiceActivityDetector,
|
| 35 |
+
SpeakerDiarizer,
|
| 36 |
+
VoiceprintExtractor,
|
| 37 |
+
VoiceprintResult
|
| 38 |
+
)
|
| 39 |
+
from .phase2_background import BackgroundAnalyzer, BackgroundAnomaly
|
| 40 |
+
from .phase6_synthetic import SyntheticDetector, WakeWordDetector, PlaybackDetector
|
| 41 |
+
from .fraud_detection import (
|
| 42 |
+
WhisperDetector, WhisperResult,
|
| 43 |
+
ReadingPatternAnalyzer, ReadingPatternResult,
|
| 44 |
+
SuspiciousPauseDetector, PauseResult
|
| 45 |
+
)
|
| 46 |
+
from .database import Database
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@dataclass
class SpeakerResult:
    """Result for a detected speaker.

    One instance per distinct voice found by diarization; the speaker with
    the most talk time gets role "main", everyone else "additional".
    """
    voiceprint_id: str            # stable ID used to track this voice across tests
    label: str                    # diarizer-assigned label for this recording
    role: str  # "main" or "additional"
    total_seconds: float          # total speaking time attributed to this speaker
    quality: str                  # human-readable voiceprint quality label
    is_synthetic: bool            # True when the combined synthetic score crosses threshold
    synthetic_score: float        # combined synthetic/TTS score
    is_playback: bool = False     # True when playback (speaker replay) was detected
    playback_score: float = 0.0
    playback_indicators: Optional[List[str]] = None   # None until populated by a detector
    times_seen: int = 1           # number of tests this voiceprint has appeared in
    is_flagged: bool = False      # manually flagged OR auto-flagged (seen in 4+ tests)
    segments: Optional[List[dict]] = None   # [{'start': float, 'end': float}, ...]
    clip_path: Optional[str] = None  # Path to audio sample for this speaker
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@dataclass
class AnalysisResult:
    """Complete analysis result for one audio file.

    Aggregates the outputs of every pipeline phase (speakers, background,
    wake words, playback, fraud heuristics) into a single JSON-serializable
    record.
    """
    test_id: str                # unique ID for this analysis run
    filename: str               # original uploaded filename
    duration_seconds: float     # duration of the normalized audio
    analyzed_at: str            # ISO-8601 timestamp of when analysis ran

    # Speakers
    main_speaker: Optional[SpeakerResult]      # None when no speaker yielded an embedding
    additional_speakers: List[SpeakerResult]

    # Background
    background_anomalies: List[dict]           # dicts with start/end/type/amplitude_db/confidence

    # Wake words
    wake_words: List[dict]                     # dicts with word/assistant/time/confidence/context
    assistant_responses: List[dict]

    # Prompt voice (audio from question prompts)
    prompt_voice_detected: bool
    prompt_voice_seconds: float

    # Playback detection (global)
    playback_detected: bool = False
    playback_score: float = 0.0
    playback_indicators: Optional[List[str]] = None

    # Fraud detection - Whisper (background voices)
    whisper_detected: bool = False
    whisper_instances: Optional[List[dict]] = None     # dicts with start/end/confidence

    # Fraud detection - Reading pattern
    reading_pattern_detected: bool = False
    reading_confidence: float = 0.0
    reading_indicators: Optional[List[str]] = None

    # Fraud detection - Suspicious pauses
    suspicious_pauses_detected: bool = False
    suspicious_pauses: Optional[List[dict]] = None     # dicts with start/end/duration/context
    longest_pause: float = 0.0

    def to_dict(self) -> dict:
        """Convert to a JSON-serializable dictionary.

        Every value is passed through explicit casts or to_python_type so
        numpy scalars never leak into the output; Optional list fields fall
        back to [] instead of None.
        """
        result = {
            'test_id': self.test_id,
            'filename': self.filename,
            'duration_seconds': float(self.duration_seconds),
            'analyzed_at': self.analyzed_at,
            # asdict recurses into the nested SpeakerResult dataclasses.
            'main_speaker': to_python_type(asdict(self.main_speaker)) if self.main_speaker else None,
            'additional_speakers': [to_python_type(asdict(s)) for s in self.additional_speakers],
            'background_anomalies': to_python_type(self.background_anomalies),
            'wake_words': to_python_type(self.wake_words),
            'assistant_responses': to_python_type(self.assistant_responses),
            'prompt_voice_detected': bool(self.prompt_voice_detected),
            'prompt_voice_seconds': float(self.prompt_voice_seconds),
            'playback_detected': bool(self.playback_detected),
            'playback_score': float(self.playback_score),
            'playback_indicators': self.playback_indicators or [],
            # Fraud detection fields
            'whisper_detected': bool(self.whisper_detected),
            'whisper_instances': to_python_type(self.whisper_instances or []),
            'reading_pattern_detected': bool(self.reading_pattern_detected),
            'reading_confidence': float(self.reading_confidence),
            'reading_indicators': self.reading_indicators or [],
            'suspicious_pauses_detected': bool(self.suspicious_pauses_detected),
            'suspicious_pauses': to_python_type(self.suspicious_pauses or []),
            'longest_pause': float(self.longest_pause)
        }
        return result

    def to_json(self) -> str:
        """Convert to a pretty-printed JSON string."""
        return json.dumps(self.to_dict(), indent=2)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class AudioAnalyzer:
    """Main analyzer that orchestrates all phases.

    Pipeline order (see analyze): preprocess -> VAD -> diarization ->
    voiceprint extraction + synthetic detection per speaker -> background
    anomalies -> playback detection -> wake words -> fraud heuristics ->
    result compilation + persistence. Heavy model components are lazily
    constructed via properties so importing this class stays cheap.
    """

    def __init__(self, db_path: str = "data/db/voiceprints.db",
                 clips_dir: str = "data/clips",
                 device: str = None):
        """
        Initialize analyzer.

        Args:
            db_path: Path to SQLite database
            clips_dir: Directory to save audio clips
            device: torch device (cuda/cpu)
        """
        self.device = device
        self.clips_dir = clips_dir
        os.makedirs(clips_dir, exist_ok=True)

        # Initialize database
        self.db = Database(db_path)

        # Initialize components (lazy loaded — each stays None until first use)
        self._preprocessor = None
        self._vad = None
        self._diarizer = None
        self._voiceprint = None
        self._background = None
        self._synthetic = None
        self._playback = None
        self._wake_words = None
        # Fraud detectors
        self._whisper_detector = None
        self._reading_pattern = None
        self._pause_detector = None

    @property
    def preprocessor(self):
        # Lazily constructed audio normalizer.
        if self._preprocessor is None:
            self._preprocessor = AudioPreprocessor()
        return self._preprocessor

    @property
    def vad(self):
        # Lazily constructed voice activity detector.
        if self._vad is None:
            self._vad = VoiceActivityDetector(device=self.device)
        return self._vad

    @property
    def diarizer(self):
        # Lazily constructed speaker diarizer.
        if self._diarizer is None:
            self._diarizer = SpeakerDiarizer(device=self.device)
        return self._diarizer

    @property
    def voiceprint_extractor(self):
        # Lazily constructed speaker-embedding extractor.
        if self._voiceprint is None:
            self._voiceprint = VoiceprintExtractor(device=self.device)
        return self._voiceprint

    @property
    def background_analyzer(self):
        # Lazily constructed background-anomaly analyzer.
        if self._background is None:
            self._background = BackgroundAnalyzer()
        return self._background

    @property
    def synthetic_detector(self):
        # Lazily constructed synthetic-voice detector.
        if self._synthetic is None:
            self._synthetic = SyntheticDetector(device=self.device)
        return self._synthetic

    @property
    def playback_detector(self):
        # Lazily constructed speaker-playback/replay detector.
        if self._playback is None:
            self._playback = PlaybackDetector()
        return self._playback

    @property
    def wake_word_detector(self):
        # Lazily constructed wake-word detector (also supplies transcription).
        if self._wake_words is None:
            self._wake_words = WakeWordDetector(model_size="base")
        return self._wake_words

    @property
    def whisper_detector(self):
        # Lazily constructed background-whisper detector.
        if self._whisper_detector is None:
            self._whisper_detector = WhisperDetector()
        return self._whisper_detector

    @property
    def reading_pattern_analyzer(self):
        # Lazily constructed reading-cadence analyzer.
        if self._reading_pattern is None:
            self._reading_pattern = ReadingPatternAnalyzer()
        return self._reading_pattern

    @property
    def pause_detector(self):
        # Lazily constructed suspicious-pause detector.
        if self._pause_detector is None:
            self._pause_detector = SuspiciousPauseDetector()
        return self._pause_detector

    def analyze(self, audio_path: str,
                test_id: str = None,
                progress_callback: Callable[[str, int], None] = None) -> AnalysisResult:
        """
        Run full analysis on audio file.

        Args:
            audio_path: Path to audio file
            test_id: Optional test ID (generated if not provided)
            progress_callback: Optional callback for progress updates;
                called as progress_callback(message, percent)

        Returns:
            AnalysisResult with all findings

        Raises:
            ValueError: if the audio is shorter than the 20s minimum.

        Side effects: writes speaker clips under self.clips_dir and persists
        voiceprints + the full analysis to self.db.
        """
        def update_progress(msg: str, pct: int):
            # No-op when no callback was supplied.
            if progress_callback:
                progress_callback(msg, pct)

        # Generate test ID (timestamp plus a short random suffix for uniqueness)
        if test_id is None:
            test_id = f"test_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:6]}"

        filename = os.path.basename(audio_path)

        # Step 1: Preprocess (resample/normalize; returns waveform + metadata)
        update_progress("Preprocessing audio...", 5)
        waveform, sample_rate, metadata = self.preprocessor.process_file(audio_path)
        duration = metadata['normalized_duration']

        # Validate minimum audio duration (20 seconds)
        MIN_DURATION = 20.0
        if duration < MIN_DURATION:
            raise ValueError(f"Audio too short: {duration:.1f}s. Minimum required: {MIN_DURATION:.0f}s")

        # Save normalized audio to temp file for components that take a path.
        # delete=False so the path survives the with-block; cleaned up in finally.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            temp_path = f.name
        torchaudio.save(temp_path, waveform, sample_rate)

        try:
            # Step 2: VAD — find speech segments
            update_progress("Detecting voice activity...", 15)
            speech_segments = self.vad.detect(temp_path)

            # Step 3: Speaker Diarization — who spoke when
            update_progress("Identifying speakers...", 30)
            speakers = self.diarizer.diarize(temp_path, speech_segments)

            # Step 4: Process speakers
            update_progress("Extracting voiceprints...", 45)
            main_speaker_result = None
            additional_speakers = []

            speaker_list = list(speakers.values())

            # First pass: recalculate actual speaking time for all speakers
            # from their segments (the diarizer's own total may drift).
            for speaker_info in speaker_list:
                actual_speaking_time = sum(seg.end - seg.start for seg in speaker_info.segments)
                actual_speaking_time = min(actual_speaking_time, duration)  # Cap to audio duration
                speaker_info.total_seconds = actual_speaking_time

            # Re-sort by speaking time (most speaking = main speaker)
            speaker_list = sorted(speaker_list, key=lambda s: s.total_seconds, reverse=True)

            for i, speaker_info in enumerate(speaker_list):
                # Speakers with no embedding are skipped entirely — they get
                # neither a DB record nor a SpeakerResult.
                if speaker_info.embedding is not None:
                    vp_result = self.voiceprint_extractor.extract_from_embedding(
                        speaker_info.embedding,
                        speaker_info.total_seconds
                    )

                    # Check for synthetic voice on this speaker's own audio
                    synthetic_result = self._detect_synthetic_for_speaker(
                        waveform, sample_rate, speaker_info
                    )

                    role = "main" if i == 0 else "additional"

                    # Look for a previously-seen matching voiceprint.
                    # NOTE(review): `similarity` is unused — confirm whether it
                    # should be surfaced in the result.
                    existing_vp, similarity = self.db.find_matching_voiceprint(
                        vp_result.to_bytes(),
                        threshold=0.75
                    )

                    if existing_vp:
                        vp_id = existing_vp.id
                        times_seen = existing_vp.times_seen + 1
                        # Auto-flag after 4 sightings, or keep an existing flag.
                        is_flagged = existing_vp.is_flagged or times_seen >= 4
                    else:
                        vp_id = vp_result.voiceprint_id
                        times_seen = 1
                        is_flagged = False

                    # Save a listening clip for this speaker (~10s target)
                    clip_path = self._save_speaker_clip(
                        waveform, sample_rate, speaker_info, test_id, vp_id
                    )

                    # Persist the voiceprint/appearance for this test
                    self.db.add_voiceprint(
                        vp_id=vp_id,
                        embedding=vp_result.to_bytes(),
                        test_id=test_id,
                        filename=filename,
                        role=role,
                        duration=speaker_info.total_seconds,
                        clip_path=clip_path
                    )

                    speaker_result = SpeakerResult(
                        voiceprint_id=vp_id,
                        label=speaker_info.speaker_id,
                        role=role,
                        total_seconds=speaker_info.total_seconds,
                        quality=self.voiceprint_extractor.quality_label(vp_result.quality_score),
                        is_synthetic=synthetic_result.is_synthetic,
                        synthetic_score=synthetic_result.score,
                        times_seen=times_seen,
                        is_flagged=is_flagged,
                        segments=[{'start': s.start, 'end': s.end} for s in speaker_info.segments],
                        clip_path=clip_path
                    )

                    if i == 0:
                        main_speaker_result = speaker_result
                    else:
                        additional_speakers.append(speaker_result)

            # Step 5: Background Analysis (non-speech anomalies)
            update_progress("Analyzing background audio...", 55)
            waveform_np = waveform.squeeze().numpy()
            anomalies = self.background_analyzer.detect_anomalies(
                waveform_np, speech_segments
            )

            # Step 6: Playback Detection (detect if audio is from speakers)
            update_progress("Detecting playback/replay...", 65)
            playback_result = self.playback_detector.detect(waveform_np)

            # Step 7: Wake Word Detection (also yields transcription used below)
            update_progress("Detecting wake words...", 70)
            wake_analysis = self.wake_word_detector.analyze(temp_path)

            # Step 8: Fraud Detection - Whisper, Reading Pattern, Suspicious Pauses
            update_progress("Running fraud detection...", 80)

            # 8a: Whisper detection (background voices outside main speaker's turns)
            main_speaker_segs = []
            if main_speaker_result and main_speaker_result.segments:
                main_speaker_segs = main_speaker_result.segments
            whisper_result = self.whisper_detector.detect(
                waveform_np, sample_rate, main_speaker_segs
            )

            # 8b: Reading pattern detection (uses wake word transcription)
            word_timestamps = wake_analysis.get('word_timestamps', [])
            transcription = wake_analysis.get('transcription', '')
            reading_result = self.reading_pattern_analyzer.analyze(
                transcription, word_timestamps, duration
            )

            # 8c: Suspicious pause detection on the raw speech timeline
            speech_segments_dict = [{'start': s.start, 'end': s.end} for s in speech_segments]
            pause_result = self.pause_detector.detect(speech_segments_dict, duration)

            # Step 9: Compile results
            update_progress("Compiling results...", 90)

            # Detect prompt voice (simplified heuristic: any speech starting in
            # the first 5 seconds is assumed to possibly be the question prompt)
            prompt_seconds = sum(
                s.duration for s in speech_segments
                if s.start < 5.0  # First 5 seconds
            )

            result = AnalysisResult(
                test_id=test_id,
                filename=filename,
                duration_seconds=duration,
                analyzed_at=datetime.now().isoformat(),
                main_speaker=main_speaker_result,
                additional_speakers=additional_speakers,
                background_anomalies=[
                    {
                        'start': a.start,
                        'end': a.end,
                        'type': a.anomaly_type.value,
                        'amplitude_db': a.amplitude_db,
                        'confidence': a.confidence
                    }
                    for a in anomalies
                ],
                wake_words=[
                    {
                        'word': w.word,
                        'assistant': w.assistant,
                        'time': w.time,
                        'confidence': w.confidence,
                        'context': w.context
                    }
                    for w in wake_analysis['wake_words']
                ],
                assistant_responses=wake_analysis['assistant_responses'],
                prompt_voice_detected=prompt_seconds > 0,
                prompt_voice_seconds=prompt_seconds,
                playback_detected=playback_result.is_playback,
                playback_score=playback_result.score,
                playback_indicators=playback_result.indicators,
                # Fraud detection results
                whisper_detected=whisper_result.detected,
                whisper_instances=[
                    {'start': w.start, 'end': w.end, 'confidence': w.confidence}
                    for w in whisper_result.instances
                ],
                reading_pattern_detected=reading_result.is_reading,
                reading_confidence=reading_result.confidence,
                reading_indicators=reading_result.indicators,
                suspicious_pauses_detected=pause_result.detected,
                suspicious_pauses=[
                    {'start': p.start, 'end': p.end, 'duration': p.duration, 'context': p.context}
                    for p in pause_result.pauses
                ],
                longest_pause=pause_result.longest_pause
            )

            # Save the full analysis record to the database
            self.db.save_test_analysis(
                test_id=test_id,
                filename=filename,
                duration=duration,
                results=result.to_dict()
            )

            update_progress("Analysis complete!", 100)

            return result

        finally:
            # Cleanup temp file even when analysis raised
            if os.path.exists(temp_path):
                os.remove(temp_path)

    def _detect_synthetic_for_speaker(self, waveform, sample_rate, speaker_info):
        """Run synthetic detection on speaker's audio.

        Combines both SyntheticDetector (voice characteristics) and
        PlaybackDetector (TTS/speaker playback) for better detection.
        Returns a SyntheticResult built from the combined score.
        """
        from .phase6_synthetic import SyntheticResult

        # Concatenate speaker segments
        segments_audio = []

        for seg in speaker_info.segments[:5]:  # Limit to first 5 segments for speed
            start_sample = int(seg.start * sample_rate)
            end_sample = int(seg.end * sample_rate)
            # Guard against segments extending past the waveform end
            if end_sample <= waveform.shape[1]:
                segments_audio.append(waveform[:, start_sample:end_sample])

        if not segments_audio:
            # Nothing usable — report a zero score rather than failing.
            return SyntheticResult.from_score(0.0)

        speaker_audio = np.concatenate([s.squeeze().numpy() for s in segments_audio])

        # Run both detectors on speaker's audio
        synthetic_result = self.synthetic_detector.detect(speaker_audio)
        playback_result = self.playback_detector.detect(speaker_audio)

        # Combine scores: if either detects synthetic/TTS, flag it.
        # Playback with TTS indicators is strong evidence of synthetic.
        tts_indicators = ['tts_flat_pitch', 'tts_low_pitch_variation', 'tts_regular_timing',
                          'smooth_spectrum', 'slightly_smooth_spectrum']
        has_tts_indicators = any(ind in playback_result.indicators for ind in tts_indicators)

        # Calculate combined score
        if has_tts_indicators:
            # Strong TTS evidence from playback detector
            combined_score = max(synthetic_result.score, playback_result.score * 0.9)
        else:
            # Weight synthetic detector more, but consider playback
            combined_score = synthetic_result.score * 0.7 + playback_result.score * 0.3

        # Boost if both detectors agree (capped at 1.0)
        if synthetic_result.score > 0.4 and playback_result.score > 0.4:
            combined_score = min(1.0, combined_score * 1.2)

        return SyntheticResult.from_score(combined_score, threshold=0.45)

    def _save_speaker_clip(self, waveform, sample_rate, speaker_info, test_id, vp_id):
        """Save audio clip for a speaker (minimum 10 seconds for voice sample).

        Returns the saved clip path, or None when the speaker has no usable
        segments.
        """
        segments = sorted(speaker_info.segments, key=lambda s: s.start)

        if not segments:
            return None

        # Merge overlapping segments first so the clip has no duplicated audio
        merged_segments = []
        for seg in segments:
            if merged_segments and seg.start <= merged_segments[-1][1]:
                # Overlap - extend previous segment
                merged_segments[-1] = (merged_segments[-1][0], max(merged_segments[-1][1], seg.end))
            else:
                merged_segments.append((seg.start, seg.end))

        # Concatenate segments until we have at least 10 seconds for voice sample
        target_duration = 10.0
        clips = []
        total_duration = 0.0

        for start, end in merged_segments:
            start_sample = int(start * sample_rate)
            end_sample = int(end * sample_rate)

            # Skip segments that run past the waveform end
            if end_sample <= waveform.shape[1]:
                clips.append(waveform[:, start_sample:end_sample])
                total_duration += (end - start)

            if total_duration >= target_duration:
                break

        if not clips:
            return None

        # Concatenate all clips along the time axis
        clip = torch.cat(clips, dim=1)

        # Save clip; filename encodes test, voiceprint and clip length
        clip_filename = f"{test_id}_{vp_id}_{total_duration:.1f}s.wav"
        clip_path = os.path.join(self.clips_dir, clip_filename)

        torchaudio.save(clip_path, clip, sample_rate)

        return clip_path

    def get_voiceprint_history(self, vp_id: str) -> List[dict]:
        """Get appearance history for a voiceprint as plain dicts."""
        appearances = self.db.get_voiceprint_appearances(vp_id)
        return [
            {
                'test_id': a.test_id,
                'filename': a.test_filename,
                'role': a.role,
                'duration': a.duration_seconds,
                'date': a.detected_at.isoformat() if a.detected_at else None,
                'clip_path': a.clip_path
            }
            for a in appearances
        ]

    def get_database_stats(self) -> dict:
        """Get database statistics (delegates to the Database layer)."""
        return self.db.get_stats()
|
src/database/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .models import Database, Voiceprint, VoiceprintAppearance, TestAnalysis
|
| 2 |
+
|
| 3 |
+
__all__ = ['Database', 'Voiceprint', 'VoiceprintAppearance', 'TestAnalysis']
|
src/database/models.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Database models for voiceprint tracking.
|
| 3 |
+
"""
|
| 4 |
+
from sqlalchemy import create_engine, Column, String, Float, Integer, DateTime, Boolean, ForeignKey, LargeBinary
|
| 5 |
+
from sqlalchemy.ext.declarative import declarative_base
|
| 6 |
+
from sqlalchemy.orm import sessionmaker, relationship
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
Base = declarative_base()
|
| 11 |
+
|
| 12 |
+
class Voiceprint(Base):
    """Unique voice identity.

    One row per distinct speaker; repeat detections update the counters
    (see Database.add_voiceprint) instead of inserting new rows.
    """
    __tablename__ = 'voiceprints'

    id = Column(String(20), primary_key=True)  # vp_xxxxxxxx
    embedding = Column(LargeBinary, nullable=False)  # 192-dim vector as bytes (read back as float32 by find_matching_voiceprint)
    first_seen = Column(DateTime, default=datetime.utcnow)
    times_seen = Column(Integer, default=1)  # number of tests this voice appeared in
    total_audio_seconds = Column(Float, default=0.0)  # cumulative speech attributed to this voice
    is_flagged = Column(Boolean, default=False)  # set automatically after repeated appearances, or manually
    flag_reason = Column(String(200), nullable=True)  # human-readable reason when is_flagged is True

    # User-editable fields
    label = Column(String(100), nullable=True)  # Human-friendly name (e.g., "Juan Pérez")
    notes = Column(String(1000), nullable=True)  # User comments/notes

    # Relationships
    appearances = relationship("VoiceprintAppearance", back_populates="voiceprint")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class VoiceprintAppearance(Base):
    """Track where a voiceprint appears.

    One row per (voiceprint, test) detection, recorded by
    Database.add_voiceprint.
    """
    __tablename__ = 'voiceprint_appearances'

    id = Column(Integer, primary_key=True, autoincrement=True)
    voiceprint_id = Column(String(20), ForeignKey('voiceprints.id'), nullable=False)
    test_id = Column(String(50), nullable=False)  # test in which the voice was heard
    test_filename = Column(String(200), nullable=False)  # original audio filename of that test
    role = Column(String(20), nullable=False)  # 'main' or 'additional'
    duration_seconds = Column(Float, nullable=False)  # speech attributed to this voice in this test
    detected_at = Column(DateTime, default=datetime.utcnow)
    clip_path = Column(String(500), nullable=True)  # Path to extracted audio clip

    # Relationships
    voiceprint = relationship("Voiceprint", back_populates="appearances")
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class TestAnalysis(Base):
    """Store analysis results per test.

    Summary columns are denormalized for quick listing; the full analysis
    payload is kept verbatim in results_json.
    """
    __tablename__ = 'test_analyses'

    id = Column(Integer, primary_key=True, autoincrement=True)
    test_id = Column(String(50), unique=True, nullable=False)  # one analysis row per test
    filename = Column(String(200), nullable=False)
    duration_seconds = Column(Float, nullable=False)
    analyzed_at = Column(DateTime, default=datetime.utcnow)

    # Main speaker
    main_voiceprint_id = Column(String(20), ForeignKey('voiceprints.id'), nullable=True)
    main_speech_seconds = Column(Float, default=0.0)
    main_quality = Column(String(20), nullable=True)

    # Detection counts
    additional_speakers_count = Column(Integer, default=0)
    background_anomalies_count = Column(Integer, default=0)
    wake_words_count = Column(Integer, default=0)

    # Synthetic detection
    synthetic_score = Column(Float, default=0.0)
    is_synthetic = Column(Boolean, default=False)

    # JSON results (full analysis) — unbounded String is fine for SQLite (stored as TEXT)
    results_json = Column(String, nullable=True)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class Database:
    """SQLite-backed store for voiceprints, their appearances and test analyses.

    Every public method opens a short-lived session and closes it before
    returning.  Sessions are created with ``expire_on_commit=False`` so the
    ORM objects handed back to callers keep their already-loaded attributes
    after the session is closed; with the default setting, reading an
    attribute of the object returned by :meth:`add_voiceprint` (whose
    session commits) would raise ``DetachedInstanceError``.
    """

    def __init__(self, db_path: str = "data/db/voiceprints.db"):
        """Create/open the SQLite file at ``db_path`` and ensure all tables exist."""
        self.db_path = db_path
        os.makedirs(os.path.dirname(db_path), exist_ok=True)
        self.engine = create_engine(f'sqlite:///{db_path}')
        Base.metadata.create_all(self.engine)
        # expire_on_commit=False keeps returned ORM objects readable after close().
        self.Session = sessionmaker(bind=self.engine, expire_on_commit=False)

    def get_session(self):
        """Return a new session; the caller is responsible for closing it."""
        return self.Session()

    def add_voiceprint(self, vp_id: str, embedding: bytes,
                       test_id: str, filename: str, role: str,
                       duration: float, clip_path: str = None):
        """Add or update a voiceprint and record its appearance in a test.

        Args:
            vp_id: Stable voiceprint identifier (``vp_xxxxxxxx``).
            embedding: Speaker embedding serialized as raw bytes.
            test_id: Identifier of the test the voice was heard in.
            filename: Audio filename of that test.
            role: 'main' or 'additional'.
            duration: Seconds of speech attributed to this voice in the test.
            clip_path: Optional path to an extracted audio clip.

        Returns:
            The new or updated Voiceprint row.
        """
        session = self.get_session()
        try:
            # Check if voiceprint exists
            vp = session.query(Voiceprint).filter_by(id=vp_id).first()

            if vp:
                # Update existing identity counters.
                vp.times_seen += 1
                vp.total_audio_seconds += duration

                # Auto-flag voices that keep reappearing across tests.
                if vp.times_seen >= 4:
                    vp.is_flagged = True
                    vp.flag_reason = f"Seen in {vp.times_seen} tests"
            else:
                # First time we see this voice: create the identity row.
                vp = Voiceprint(
                    id=vp_id,
                    embedding=embedding,
                    total_audio_seconds=duration
                )
                session.add(vp)

            # Record this specific appearance.
            appearance = VoiceprintAppearance(
                voiceprint_id=vp_id,
                test_id=test_id,
                test_filename=filename,
                role=role,
                duration_seconds=duration,
                clip_path=clip_path
            )
            session.add(appearance)

            session.commit()
            return vp
        except Exception:
            session.rollback()
            raise  # bare raise preserves the original traceback
        finally:
            session.close()

    def get_voiceprint(self, vp_id: str):
        """Get voiceprint by ID, or None if unknown."""
        session = self.get_session()
        try:
            return session.query(Voiceprint).filter_by(id=vp_id).first()
        finally:
            session.close()

    def get_all_voiceprints(self):
        """Get all voiceprints, most frequently seen first."""
        session = self.get_session()
        try:
            return session.query(Voiceprint).order_by(Voiceprint.times_seen.desc()).all()
        finally:
            session.close()

    def get_flagged_voiceprints(self):
        """Get all voiceprints currently flagged (automatically or manually)."""
        session = self.get_session()
        try:
            return session.query(Voiceprint).filter_by(is_flagged=True).all()
        finally:
            session.close()

    def get_multi_appearance_voiceprints(self, min_appearances: int = 2):
        """Get voiceprints seen in at least ``min_appearances`` tests."""
        session = self.get_session()
        try:
            return session.query(Voiceprint).filter(
                Voiceprint.times_seen >= min_appearances
            ).order_by(Voiceprint.times_seen.desc()).all()
        finally:
            session.close()

    def get_voiceprint_appearances(self, vp_id: str):
        """Get all appearances of a voiceprint, newest first."""
        session = self.get_session()
        try:
            return session.query(VoiceprintAppearance).filter_by(
                voiceprint_id=vp_id
            ).order_by(VoiceprintAppearance.detected_at.desc()).all()
        finally:
            session.close()

    def find_matching_voiceprint(self, embedding: bytes, threshold: float = 0.80):
        """Find an existing voiceprint matching ``embedding``.

        Performs a linear scan with cosine similarity and returns the first
        voiceprint at or above ``threshold`` as ``(voiceprint, similarity)``,
        or ``(None, 0.0)`` when nothing matches.  Degenerate (all-zero)
        embeddings are skipped instead of dividing by zero.
        """
        import numpy as np

        session = self.get_session()
        try:
            new_emb = np.frombuffer(embedding, dtype=np.float32)
            new_norm = np.linalg.norm(new_emb)  # hoisted: invariant across the scan

            for vp in session.query(Voiceprint).all():
                stored_emb = np.frombuffer(vp.embedding, dtype=np.float32)

                # Cosine similarity with a zero-norm guard.
                denom = new_norm * np.linalg.norm(stored_emb)
                if denom == 0:
                    continue
                similarity = np.dot(new_emb, stored_emb) / denom

                if similarity >= threshold:
                    return vp, similarity

            return None, 0.0
        finally:
            session.close()

    def save_test_analysis(self, test_id: str, filename: str,
                           duration: float, results: dict):
        """Save a full test analysis (summary columns + raw JSON payload).

        Raises on a duplicate ``test_id`` (unique constraint).
        """
        import json

        session = self.get_session()
        try:
            analysis = TestAnalysis(
                test_id=test_id,
                filename=filename,
                duration_seconds=duration,
                main_voiceprint_id=results.get('main_voiceprint_id'),
                main_speech_seconds=results.get('main_speech_seconds', 0),
                main_quality=results.get('main_quality'),
                additional_speakers_count=len(results.get('additional_speakers', [])),
                background_anomalies_count=len(results.get('background_anomalies', [])),
                wake_words_count=len(results.get('wake_words', [])),
                synthetic_score=results.get('synthetic_score', 0),
                is_synthetic=results.get('is_synthetic', False),
                results_json=json.dumps(results)
            )
            session.add(analysis)
            session.commit()
            return analysis
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    def get_stats(self):
        """Get database statistics as a plain dict of counts."""
        session = self.get_session()
        try:
            return {
                'total_tests': session.query(TestAnalysis).count(),
                'total_voiceprints': session.query(Voiceprint).count(),
                'flagged_voiceprints': session.query(Voiceprint).filter_by(is_flagged=True).count(),
                'multi_appearance': session.query(Voiceprint).filter(Voiceprint.times_seen >= 2).count()
            }
        finally:
            session.close()

    def update_voiceprint_label(self, vp_id: str, label: str):
        """Update a voiceprint's human-friendly label.

        Returns True on success, False when the voiceprint does not exist.
        """
        session = self.get_session()
        try:
            vp = session.query(Voiceprint).filter_by(id=vp_id).first()
            if vp:
                vp.label = label
                session.commit()
                return True
            return False
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    def update_voiceprint_notes(self, vp_id: str, notes: str):
        """Update a voiceprint's free-form notes.

        Returns True on success, False when the voiceprint does not exist.
        """
        session = self.get_session()
        try:
            vp = session.query(Voiceprint).filter_by(id=vp_id).first()
            if vp:
                vp.notes = notes
                session.commit()
                return True
            return False
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    def toggle_voiceprint_flag(self, vp_id: str, flagged: bool, reason: str = None):
        """Manually flag/unflag a voiceprint.

        Unflagging clears the stored reason.  Returns True on success,
        False when the voiceprint does not exist.
        """
        session = self.get_session()
        try:
            vp = session.query(Voiceprint).filter_by(id=vp_id).first()
            if vp:
                vp.is_flagged = flagged
                vp.flag_reason = reason if flagged else None
                session.commit()
                return True
            return False
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    def get_similarity_threshold(self):
        """Get the current similarity threshold (default 0.80)."""
        # Could be stored in a settings table; for now return the default.
        return 0.80

    def get_appearance_timeline(self, vp_id: str = None):
        """Get appearances over time for a timeline chart.

        When ``vp_id`` is given, restrict to that voiceprint; otherwise
        return every appearance, ordered by detection time (ascending).
        """
        session = self.get_session()
        try:
            query = session.query(VoiceprintAppearance)
            if vp_id:
                query = query.filter_by(voiceprint_id=vp_id)
            appearances = query.order_by(VoiceprintAppearance.detected_at).all()

            return [
                {
                    'date': a.detected_at,
                    'voiceprint_id': a.voiceprint_id,
                    'test_id': a.test_id,
                    'role': a.role,
                    'duration': a.duration_seconds
                }
                for a in appearances
            ]
        finally:
            session.close()
|
src/fraud_detection/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Fraud Detection Module
|
| 3 |
+
Detects suspicious patterns in audio that may indicate cheating.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .whisper_detector import WhisperDetector, WhisperResult, WhisperInstance
|
| 7 |
+
from .reading_pattern import ReadingPatternAnalyzer, ReadingPatternResult
|
| 8 |
+
from .pause_detector import SuspiciousPauseDetector, PauseResult, SuspiciousPause
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
'WhisperDetector', 'WhisperResult', 'WhisperInstance',
|
| 12 |
+
'ReadingPatternAnalyzer', 'ReadingPatternResult',
|
| 13 |
+
'SuspiciousPauseDetector', 'PauseResult', 'SuspiciousPause'
|
| 14 |
+
]
|
src/fraud_detection/pause_detector.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Suspicious Pause Detector
|
| 3 |
+
Detects abnormally long silences that may indicate the speaker is looking up
|
| 4 |
+
answers or receiving help during a test.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
+
from typing import List, Optional
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class SuspiciousPause:
    """A detected suspicious pause (a long silent gap in the audio)."""
    start: float  # pause start time, seconds from audio start
    end: float  # pause end time, seconds
    duration: float  # gap length in seconds (rounded to 2 decimals by the detector)
    context: str = ""  # What happened before/after (transcribed text when available)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class PauseResult:
    """Result of suspicious pause detection."""
    detected: bool  # True when at least one suspicious pause was found
    pauses: List[SuspiciousPause] = field(default_factory=list)
    total_suspicious_time: float = 0.0  # sum of all suspicious pause durations, seconds
    longest_pause: float = 0.0  # duration of the single longest suspicious pause, seconds

    @property
    def count(self) -> int:
        """Number of suspicious pauses detected."""
        return len(self.pauses)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class SuspiciousPauseDetector:
    """
    Flags abnormally long silences between speech segments.

    Natural conversational pauses are short (< 2 s) for thinking or medium
    (2-4 s) for complex thoughts.  Gaps at or above
    ``min_suspicious_duration`` are reported, since they may indicate
    looking up answers, receiving external help, or reading from a source.
    """

    def __init__(self,
                 min_suspicious_duration: float = 5.0,
                 warning_duration: float = 3.0,
                 max_natural_pause: float = 2.0):
        """
        Args:
            min_suspicious_duration: Minimum gap length (s) flagged as suspicious.
            warning_duration: Gap length (s) treated as a warning only.
            max_natural_pause: Upper bound (s) for a natural pause.
        """
        self.min_suspicious_duration = min_suspicious_duration
        self.warning_duration = warning_duration
        self.max_natural_pause = max_natural_pause

    def detect(self, speech_segments: List[dict],
               total_duration: float,
               transcription_segments: List[dict] = None) -> PauseResult:
        """
        Find suspicious gaps before, between and after speech segments.

        Args:
            speech_segments: List of {'start': float, 'end': float} for speech.
            total_duration: Total audio duration in seconds.
            transcription_segments: Optional timestamped transcription used to
                attach textual context to each pause.

        Returns:
            PauseResult listing every gap >= ``min_suspicious_duration``.
        """
        if not speech_segments:
            return PauseResult(detected=False)

        ordered = sorted(speech_segments, key=lambda seg: seg.get('start', 0))
        flagged = []

        # Leading silence before the first speech segment.
        lead = ordered[0].get('start', 0)
        if lead >= self.min_suspicious_duration:
            flagged.append(SuspiciousPause(
                start=0,
                end=lead,
                duration=round(lead, 2),
                context=self._get_context(0, lead, transcription_segments, "start"),
            ))

        # Gaps between consecutive speech segments.
        for earlier, later in zip(ordered, ordered[1:]):
            gap_start = earlier.get('end', 0)
            gap_end = later.get('start', 0)
            if gap_end - gap_start >= self.min_suspicious_duration:
                flagged.append(SuspiciousPause(
                    start=round(gap_start, 2),
                    end=round(gap_end, 2),
                    duration=round(gap_end - gap_start, 2),
                    context=self._get_context(gap_start, gap_end,
                                              transcription_segments, "middle"),
                ))

        # Trailing silence after the last speech segment.
        tail_start = ordered[-1].get('end', 0)
        if total_duration - tail_start >= self.min_suspicious_duration:
            flagged.append(SuspiciousPause(
                start=round(tail_start, 2),
                end=round(total_duration, 2),
                duration=round(total_duration - tail_start, 2),
                context=self._get_context(tail_start, total_duration,
                                          transcription_segments, "end"),
            ))

        return PauseResult(
            detected=bool(flagged),
            pauses=flagged,
            total_suspicious_time=round(sum(p.duration for p in flagged), 2),
            longest_pause=round(max((p.duration for p in flagged), default=0), 2),
        )

    def detect_from_vad(self, vad_result: dict, total_duration: float) -> PauseResult:
        """
        Run :meth:`detect` on the ``segments`` list of a VAD result.

        Args:
            vad_result: VAD output dict with a 'segments' list.
            total_duration: Total audio duration in seconds.

        Returns:
            PauseResult with the detected suspicious pauses.
        """
        return self.detect(vad_result.get('segments', []), total_duration)

    def _get_context(self, start: float, end: float,
                     transcription_segments: List[dict],
                     position: str) -> str:
        """Describe what was said just before/after the pause, when known."""
        if not transcription_segments:
            generic = {
                "start": "Long silence at audio start",
                "end": "Long silence at audio end",
            }
            return generic.get(position, "Long silence mid-conversation")

        before_text = ""
        after_text = ""

        for seg in transcription_segments:
            seg_start = seg.get('start', 0)
            seg_end = seg.get('end', 0)
            words = seg.get('text', '').strip()

            # A segment ending right at the pause boundary supplies the lead-in.
            if start - 1.0 <= seg_end <= start + 0.5:
                before_text = words[-50:]

            # A segment starting right after the pause supplies the follow-up.
            if end - 0.5 <= seg_start <= end + 1.0:
                after_text = words[:50]

        if before_text and after_text:
            return f"After: '{before_text}...' | Before: '...{after_text}'"
        if before_text:
            return f"After: '{before_text}...'"
        if after_text:
            return f"Before: '...{after_text}'"
        return f"Silence at {position} of audio"

    def analyze_pause_pattern(self, speech_segments: List[dict],
                              total_duration: float) -> dict:
        """
        Summarize the overall pause behaviour in the audio.

        Returns a dict of pause statistics (average/max gap, counts per
        severity band, and the fraction of time spent speaking).
        """
        if not speech_segments or len(speech_segments) < 2:
            return {
                'avg_pause': 0,
                'max_pause': 0,
                'pause_count': 0,
                'speech_ratio': 0
            }

        ordered = sorted(speech_segments, key=lambda seg: seg.get('start', 0))

        # Collect inter-segment gaps, ignoring negligible ones (<= 0.1 s).
        gaps = [
            later.get('start', 0) - earlier.get('end', 0)
            for earlier, later in zip(ordered, ordered[1:])
            if later.get('start', 0) - earlier.get('end', 0) > 0.1
        ]

        if not gaps:
            return {
                'avg_pause': 0,
                'max_pause': 0,
                'pause_count': 0,
                'speech_ratio': 1.0
            }

        voiced = sum(seg.get('end', 0) - seg.get('start', 0) for seg in ordered)

        return {
            'avg_pause': round(np.mean(gaps), 2),
            'max_pause': round(max(gaps), 2),
            'pause_count': len(gaps),
            'speech_ratio': round(voiced / total_duration, 2) if total_duration > 0 else 0,
            'natural_pauses': sum(1 for g in gaps if g <= self.max_natural_pause),
            'warning_pauses': sum(1 for g in gaps
                                  if self.max_natural_pause < g < self.min_suspicious_duration),
            'suspicious_pauses': sum(1 for g in gaps if g >= self.min_suspicious_duration),
        }
|
src/fraud_detection/reading_pattern.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reading Pattern Analyzer
|
| 3 |
+
Detects if someone is reading prepared answers vs speaking naturally.
|
| 4 |
+
|
| 5 |
+
Key indicators of reading:
|
| 6 |
+
- Consistent speech rate (no natural variation)
|
| 7 |
+
- Lack of filler words ("um", "uh", "like", "you know")
|
| 8 |
+
- Regular pause patterns
|
| 9 |
+
- Monotonic rhythm
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
from dataclasses import dataclass, field
|
| 14 |
+
from typing import List, Optional
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Common filler words in English.
# Matched case-insensitively on word boundaries by _analyze_filler_words;
# multi-word entries ("you know", "i mean") are matched as whole phrases.
FILLER_WORDS = [
    'um', 'uh', 'uhm', 'umm', 'er', 'ah', 'like', 'you know',
    'basically', 'actually', 'so', 'well', 'i mean', 'kind of',
    'sort of', 'right', 'okay'
]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
class ReadingPatternResult:
    """Result of reading pattern analysis."""
    is_reading: bool  # True when confidence >= the analyzer's reading_threshold
    confidence: float  # 0.0 to 1.0
    indicators: List[str] = field(default_factory=list)  # human-readable reasons behind the verdict
    speech_rate_cv: float = 0.0  # Coefficient of variation of speech rate (low = unnaturally constant pacing)
    filler_word_rate: float = 0.0  # Fillers per minute
    pause_regularity: float = 0.0  # How regular pauses are (higher = more regular)
|
| 35 |
+
|
| 36 |
+
class ReadingPatternAnalyzer:
|
| 37 |
+
"""
|
| 38 |
+
Analyzes speech patterns to detect if someone is reading.
|
| 39 |
+
|
| 40 |
+
Uses transcription with timestamps to analyze:
|
| 41 |
+
- Speech rate variation
|
| 42 |
+
- Filler word frequency
|
| 43 |
+
- Pause patterns
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
def __init__(self,
|
| 47 |
+
min_speech_rate_cv: float = 0.15,
|
| 48 |
+
min_filler_rate: float = 2.0,
|
| 49 |
+
reading_threshold: float = 0.6):
|
| 50 |
+
"""
|
| 51 |
+
Args:
|
| 52 |
+
min_speech_rate_cv: Minimum coefficient of variation for natural speech
|
| 53 |
+
min_filler_rate: Minimum filler words per minute for natural speech
|
| 54 |
+
reading_threshold: Confidence threshold to flag as reading
|
| 55 |
+
"""
|
| 56 |
+
self.min_speech_rate_cv = min_speech_rate_cv
|
| 57 |
+
self.min_filler_rate = min_filler_rate
|
| 58 |
+
self.reading_threshold = reading_threshold
|
| 59 |
+
|
| 60 |
+
def analyze(self, transcription: str, word_timestamps: List[dict],
|
| 61 |
+
duration_seconds: float) -> ReadingPatternResult:
|
| 62 |
+
"""
|
| 63 |
+
Analyze transcription for reading patterns.
|
| 64 |
+
|
| 65 |
+
Args:
|
| 66 |
+
transcription: Full transcription text
|
| 67 |
+
word_timestamps: List of {'word': str, 'start': float, 'end': float}
|
| 68 |
+
duration_seconds: Total audio duration
|
| 69 |
+
|
| 70 |
+
Returns:
|
| 71 |
+
ReadingPatternResult with analysis
|
| 72 |
+
"""
|
| 73 |
+
if not word_timestamps or len(word_timestamps) < 10:
|
| 74 |
+
return ReadingPatternResult(
|
| 75 |
+
is_reading=False,
|
| 76 |
+
confidence=0.0,
|
| 77 |
+
indicators=["Insufficient data for analysis"]
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
indicators = []
|
| 81 |
+
scores = []
|
| 82 |
+
|
| 83 |
+
# 1. Analyze speech rate variation
|
| 84 |
+
speech_rate_cv = self._analyze_speech_rate(word_timestamps)
|
| 85 |
+
if speech_rate_cv < self.min_speech_rate_cv:
|
| 86 |
+
indicators.append(f"Constant speech rate (CV={speech_rate_cv:.2f})")
|
| 87 |
+
scores.append(0.8)
|
| 88 |
+
else:
|
| 89 |
+
scores.append(0.2)
|
| 90 |
+
|
| 91 |
+
# 2. Analyze filler word frequency
|
| 92 |
+
filler_rate = self._analyze_filler_words(transcription, duration_seconds)
|
| 93 |
+
if filler_rate < self.min_filler_rate:
|
| 94 |
+
indicators.append(f"Few filler words ({filler_rate:.1f}/min)")
|
| 95 |
+
scores.append(0.7)
|
| 96 |
+
else:
|
| 97 |
+
scores.append(0.2)
|
| 98 |
+
|
| 99 |
+
# 3. Analyze pause patterns
|
| 100 |
+
pause_regularity = self._analyze_pause_patterns(word_timestamps)
|
| 101 |
+
if pause_regularity > 0.7:
|
| 102 |
+
indicators.append(f"Regular pause pattern ({pause_regularity:.0%})")
|
| 103 |
+
scores.append(0.6)
|
| 104 |
+
else:
|
| 105 |
+
scores.append(0.2)
|
| 106 |
+
|
| 107 |
+
# 4. Check for natural speech markers
|
| 108 |
+
has_corrections = self._has_self_corrections(transcription)
|
| 109 |
+
if not has_corrections:
|
| 110 |
+
indicators.append("No self-corrections detected")
|
| 111 |
+
scores.append(0.5)
|
| 112 |
+
else:
|
| 113 |
+
scores.append(0.1)
|
| 114 |
+
|
| 115 |
+
# Calculate overall confidence
|
| 116 |
+
confidence = np.mean(scores)
|
| 117 |
+
is_reading = confidence >= self.reading_threshold
|
| 118 |
+
|
| 119 |
+
return ReadingPatternResult(
|
| 120 |
+
is_reading=is_reading,
|
| 121 |
+
confidence=round(confidence, 2),
|
| 122 |
+
indicators=indicators,
|
| 123 |
+
speech_rate_cv=round(speech_rate_cv, 3),
|
| 124 |
+
filler_word_rate=round(filler_rate, 2),
|
| 125 |
+
pause_regularity=round(pause_regularity, 2)
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
def _analyze_speech_rate(self, word_timestamps: List[dict]) -> float:
|
| 129 |
+
"""
|
| 130 |
+
Calculate coefficient of variation of speech rate.
|
| 131 |
+
Natural speech has variable rate, reading is more constant.
|
| 132 |
+
"""
|
| 133 |
+
if len(word_timestamps) < 5:
|
| 134 |
+
return 0.0
|
| 135 |
+
|
| 136 |
+
# Calculate words per second in sliding windows
|
| 137 |
+
window_size = 3.0 # seconds
|
| 138 |
+
hop = 1.0 # seconds
|
| 139 |
+
|
| 140 |
+
rates = []
|
| 141 |
+
max_time = word_timestamps[-1].get('end', 0)
|
| 142 |
+
|
| 143 |
+
for start in np.arange(0, max_time - window_size, hop):
|
| 144 |
+
end = start + window_size
|
| 145 |
+
words_in_window = [
|
| 146 |
+
w for w in word_timestamps
|
| 147 |
+
if w.get('start', 0) >= start and w.get('end', 0) <= end
|
| 148 |
+
]
|
| 149 |
+
if words_in_window:
|
| 150 |
+
rate = len(words_in_window) / window_size
|
| 151 |
+
rates.append(rate)
|
| 152 |
+
|
| 153 |
+
if len(rates) < 3:
|
| 154 |
+
return 0.0
|
| 155 |
+
|
| 156 |
+
# Coefficient of variation (std / mean)
|
| 157 |
+
mean_rate = np.mean(rates)
|
| 158 |
+
if mean_rate == 0:
|
| 159 |
+
return 0.0
|
| 160 |
+
|
| 161 |
+
cv = np.std(rates) / mean_rate
|
| 162 |
+
return cv
|
| 163 |
+
|
| 164 |
+
def _analyze_filler_words(self, transcription: str,
                          duration_seconds: float) -> float:
    """
    Count filler words ("um", "uh", ...) per minute of audio.

    Natural speech contains frequent fillers; read-aloud speech has few.
    Matching is case-insensitive on whole words (regex word boundaries).

    Args:
        transcription: Full transcript text.
        duration_seconds: Length of the audio in seconds.

    Returns:
        Filler words per minute, or 0.0 for clips shorter than ~6 seconds
        (a per-minute rate would be meaningless / numerically unstable).
    """
    # Hoisted out of the loop: the original re-imported `re` on every
    # iteration of the FILLER_WORDS loop.
    import re

    text_lower = transcription.lower()

    # \b boundaries so e.g. "um" does not match inside "umbrella".
    filler_count = sum(
        len(re.findall(r'\b' + re.escape(filler) + r'\b', text_lower))
        for filler in FILLER_WORDS
    )

    minutes = duration_seconds / 60.0
    if minutes < 0.1:
        return 0.0

    return filler_count / minutes
|
| 186 |
+
|
| 187 |
+
def _analyze_pause_patterns(self, word_timestamps: List[dict]) -> float:
|
| 188 |
+
"""
|
| 189 |
+
Analyze regularity of pauses between words.
|
| 190 |
+
Reading tends to have more regular pauses.
|
| 191 |
+
"""
|
| 192 |
+
if len(word_timestamps) < 5:
|
| 193 |
+
return 0.0
|
| 194 |
+
|
| 195 |
+
# Calculate gaps between consecutive words
|
| 196 |
+
gaps = []
|
| 197 |
+
for i in range(1, len(word_timestamps)):
|
| 198 |
+
prev_end = word_timestamps[i-1].get('end', 0)
|
| 199 |
+
curr_start = word_timestamps[i].get('start', 0)
|
| 200 |
+
gap = curr_start - prev_end
|
| 201 |
+
if gap > 0.05: # Ignore very small gaps
|
| 202 |
+
gaps.append(gap)
|
| 203 |
+
|
| 204 |
+
if len(gaps) < 3:
|
| 205 |
+
return 0.0
|
| 206 |
+
|
| 207 |
+
# Calculate regularity (inverse of coefficient of variation)
|
| 208 |
+
mean_gap = np.mean(gaps)
|
| 209 |
+
if mean_gap == 0:
|
| 210 |
+
return 0.0
|
| 211 |
+
|
| 212 |
+
cv = np.std(gaps) / mean_gap
|
| 213 |
+
regularity = 1.0 / (1.0 + cv) # Higher = more regular
|
| 214 |
+
|
| 215 |
+
return regularity
|
| 216 |
+
|
| 217 |
+
def _has_self_corrections(self, transcription: str) -> bool:
|
| 218 |
+
"""
|
| 219 |
+
Check for self-corrections which indicate natural speech.
|
| 220 |
+
E.g., "I went to the... I mean, I was going to the store"
|
| 221 |
+
"""
|
| 222 |
+
correction_markers = [
|
| 223 |
+
'i mean', 'sorry', 'no wait', 'actually', 'let me',
|
| 224 |
+
'what i meant', 'no no', 'sorry i', 'wait'
|
| 225 |
+
]
|
| 226 |
+
|
| 227 |
+
text_lower = transcription.lower()
|
| 228 |
+
for marker in correction_markers:
|
| 229 |
+
if marker in text_lower:
|
| 230 |
+
return True
|
| 231 |
+
|
| 232 |
+
# Check for repeated words (stammering/correction)
|
| 233 |
+
words = text_lower.split()
|
| 234 |
+
for i in range(1, len(words)):
|
| 235 |
+
if words[i] == words[i-1] and len(words[i]) > 2:
|
| 236 |
+
return True
|
| 237 |
+
|
| 238 |
+
return False
|
src/fraud_detection/whisper_detector.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Whisper Detector
|
| 3 |
+
Detects low-volume background voices (whispers) that may indicate someone
|
| 4 |
+
is being prompted or helped during a test.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import librosa
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from typing import List, Tuple
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class WhisperInstance:
    """One detected whisper event: where it occurred and how confident we are."""
    start: float       # event start time in seconds
    end: float         # event end time in seconds
    confidence: float  # likelihood (0.0-1.0) that this really is a whisper
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class WhisperResult:
    """Outcome of one whisper-detection pass over an audio clip."""
    detected: bool  # True when at least one whisper instance was found
    instances: List[WhisperInstance] = field(default_factory=list)

    @property
    def count(self) -> int:
        """Number of whisper events that were detected."""
        return len(self.instances)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class WhisperDetector:
    """
    Detects whispers/low background voices in audio.

    Whispers have distinct characteristics:
    - Lower amplitude than normal speech
    - More high-frequency content (less voiced, more fricative)
    - Often occur during pauses in main speaker's speech

    The detector runs a frame-level feature analysis (RMS energy, spectral
    centroid, zero-crossing rate), flags frames matching a whisper profile,
    groups consecutive flagged frames into segments, and keeps segments that
    are long enough, quiet relative to the main speaker, and do not overlap
    the main speaker's own speech.
    """

    def __init__(self,
                 energy_threshold: float = 0.02,
                 min_duration: float = 0.3,
                 max_amplitude_ratio: float = 0.3):
        """
        Args:
            energy_threshold: Minimum energy to consider as potential whisper
                (as a fraction of the main-speech amplitude; frames quieter
                than this are treated as silence, not whisper)
            min_duration: Minimum duration in seconds for a whisper
            max_amplitude_ratio: Max ratio vs main speech (whispers are quieter)
        """
        self.energy_threshold = energy_threshold
        self.min_duration = min_duration
        self.max_amplitude_ratio = max_amplitude_ratio

    def detect(self, waveform: np.ndarray, sample_rate: int,
               main_speaker_segments: List[dict] = None) -> WhisperResult:
        """
        Detect whispers in audio.

        Args:
            waveform: Audio waveform as numpy array
            sample_rate: Sample rate of audio
            main_speaker_segments: Segments where main speaker is talking
                (whispers are checked outside these). Each dict is expected
                to carry 'start'/'end' keys in seconds.

        Returns:
            WhisperResult with detected whisper instances
        """
        # Ensure mono
        if len(waveform.shape) > 1:
            waveform = waveform.mean(axis=0)

        # Calculate main speech amplitude for comparison.
        # 95th percentile of |x| is a robust "loud speech" level that is
        # insensitive to isolated clicks/peaks.
        main_amplitude = np.percentile(np.abs(waveform), 95)

        # Frame-based analysis
        frame_length = int(0.025 * sample_rate)  # 25ms frames
        hop_length = int(0.010 * sample_rate)  # 10ms hop

        # Calculate energy per frame
        energy = librosa.feature.rms(y=waveform, frame_length=frame_length,
                                     hop_length=hop_length)[0]

        # Calculate spectral centroid (whispers have higher centroid)
        spectral_centroid = librosa.feature.spectral_centroid(
            y=waveform, sr=sample_rate,
            n_fft=frame_length, hop_length=hop_length
        )[0]

        # Calculate zero crossing rate (whispers have higher ZCR)
        zcr = librosa.feature.zero_crossing_rate(
            y=waveform, frame_length=frame_length, hop_length=hop_length
        )[0]

        # Normalize features (1e-10 guards against division by zero on
        # silent input; centroid is scaled by the Nyquist frequency).
        energy_norm = energy / (main_amplitude + 1e-10)
        centroid_norm = spectral_centroid / (sample_rate / 2)

        # Identify whisper candidates:
        # - Low energy (but not silent)
        # - High spectral centroid (breathy)
        # - High zero crossing rate
        # NOTE(review): the 0.15 centroid and 0.1 ZCR cutoffs look like
        # empirically tuned constants — confirm against evaluation data.
        whisper_frames = (
            (energy > self.energy_threshold * main_amplitude) &
            (energy_norm < self.max_amplitude_ratio) &
            (centroid_norm > 0.15) &
            (zcr > 0.1)
        )

        # Convert frames to time segments
        frame_times = librosa.frames_to_time(
            np.arange(len(energy)), sr=sample_rate, hop_length=hop_length
        )

        # Group consecutive whisper frames into [start_time, time) segments
        # with a simple run-length state machine.
        instances = []
        in_whisper = False
        start_time = 0

        for i, is_whisper in enumerate(whisper_frames):
            time = frame_times[i] if i < len(frame_times) else frame_times[-1]

            if is_whisper and not in_whisper:
                # Rising edge: a whisper run begins.
                start_time = time
                in_whisper = True
            elif not is_whisper and in_whisper:
                # Falling edge: run ended — validate and possibly record it.
                duration = time - start_time
                if duration >= self.min_duration:
                    # Check if this overlaps with main speaker
                    if not self._overlaps_main_speaker(start_time, time, main_speaker_segments):
                        confidence = self._calculate_confidence(
                            waveform, sample_rate, start_time, time, main_amplitude
                        )
                        if confidence > 0.5:
                            instances.append(WhisperInstance(
                                start=round(start_time, 2),
                                end=round(time, 2),
                                confidence=round(confidence, 2)
                            ))
                in_whisper = False

        # Handle case where audio ends during whisper
        if in_whisper:
            end_time = frame_times[-1] if len(frame_times) > 0 else 0
            duration = end_time - start_time
            if duration >= self.min_duration:
                if not self._overlaps_main_speaker(start_time, end_time, main_speaker_segments):
                    confidence = self._calculate_confidence(
                        waveform, sample_rate, start_time, end_time, main_amplitude
                    )
                    if confidence > 0.5:
                        instances.append(WhisperInstance(
                            start=round(start_time, 2),
                            end=round(end_time, 2),
                            confidence=round(confidence, 2)
                        ))

        return WhisperResult(
            detected=len(instances) > 0,
            instances=instances
        )

    def _overlaps_main_speaker(self, start: float, end: float,
                               segments: List[dict]) -> bool:
        """Check if time range overlaps with main speaker segments.

        Returns False when no segments are supplied (nothing to overlap).
        """
        if not segments:
            return False

        for seg in segments:
            seg_start = seg.get('start', 0)
            seg_end = seg.get('end', 0)
            # Check for overlap (standard half-open interval intersection test)
            if start < seg_end and end > seg_start:
                return True
        return False

    def _calculate_confidence(self, waveform: np.ndarray, sample_rate: int,
                              start: float, end: float,
                              main_amplitude: float) -> float:
        """Calculate confidence that this segment is a whisper.

        Confidence is driven purely by how quiet the segment is relative
        to the main speaker: ratio >= 0.5 scores 0.0, ratio 0.0 scores 1.0,
        linear in between.
        """
        start_sample = int(start * sample_rate)
        end_sample = int(end * sample_rate)

        # Clamp to the waveform; degenerate ranges get zero confidence.
        if end_sample > len(waveform):
            end_sample = len(waveform)
        if start_sample >= end_sample:
            return 0.0

        segment = waveform[start_sample:end_sample]

        # Calculate features for this segment
        seg_amplitude = np.percentile(np.abs(segment), 95)
        amplitude_ratio = seg_amplitude / (main_amplitude + 1e-10)

        # Whisper confidence based on amplitude ratio
        # Lower ratio = more likely whisper
        if amplitude_ratio > 0.5:
            return 0.0

        # Scale confidence: 0.1-0.3 ratio = high confidence
        confidence = 1.0 - (amplitude_ratio / 0.5)
        return min(1.0, max(0.0, confidence))
|
src/phase1_foundation/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .preprocessor import AudioPreprocessor
|
| 2 |
+
from .vad import VoiceActivityDetector, SpeechSegment
|
| 3 |
+
from .diarization import SpeakerDiarizer, SpeakerInfo, SpeakerSegment
|
| 4 |
+
from .voiceprint import VoiceprintExtractor, VoiceprintResult
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
'AudioPreprocessor',
|
| 8 |
+
'VoiceActivityDetector', 'SpeechSegment',
|
| 9 |
+
'SpeakerDiarizer', 'SpeakerInfo', 'SpeakerSegment',
|
| 10 |
+
'VoiceprintExtractor', 'VoiceprintResult'
|
| 11 |
+
]
|
src/phase1_foundation/diarization.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Speaker Diarization - identify who spoke when.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import numpy as np
|
| 6 |
+
from typing import List, Dict, Optional
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class SpeakerSegment:
    """A contiguous stretch of speech attributed to a single speaker."""
    start: float     # segment start time in seconds
    end: float       # segment end time in seconds
    speaker_id: str  # label such as 'speaker_A'

    @property
    def duration(self) -> float:
        """Length of this segment in seconds."""
        return self.end - self.start
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class SpeakerInfo:
    """Aggregate view of one diarized speaker: segments, total time, embedding."""
    speaker_id: str             # label such as 'speaker_A'
    total_seconds: float = 0.0  # cumulative speech time across all segments
    segments: List[SpeakerSegment] = field(default_factory=list)
    embedding: Optional[np.ndarray] = None  # mean speaker embedding vector

    def add_segment(self, segment: SpeakerSegment):
        """Attach a segment and fold its duration into the running total."""
        self.total_seconds += segment.duration
        self.segments.append(segment)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class SpeakerDiarizer:
    """Speaker diarization using embedding clustering.

    Pipeline: slide a fixed window over the audio, extract an ECAPA speaker
    embedding per window (skipping non-speech windows when VAD segments are
    supplied), cluster the embeddings with agglomerative clustering, then
    aggregate windows per cluster into per-speaker statistics.
    """

    def __init__(self, device: str = None):
        # Prefer GPU when available; SpeechBrain accepts 'cuda'/'cpu' strings.
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        # Lazily-created SpeechBrain model (see `embedding_model` property).
        self._embedding_model = None

    @property
    def embedding_model(self):
        """Lazy load embedding model (downloads from the hub on first use)."""
        if self._embedding_model is None:
            from speechbrain.inference.speaker import SpeakerRecognition
            self._embedding_model = SpeakerRecognition.from_hparams(
                source="speechbrain/spkrec-ecapa-voxceleb",
                savedir="pretrained_models/spkrec",
                run_opts={"device": self.device}
            )
        return self._embedding_model

    def diarize(self, audio_path: str,
                speech_segments: List = None,
                window_size: float = 2.0,
                hop_size: float = 0.5,
                num_speakers: Optional[int] = None,
                min_speakers: int = 1,
                max_speakers: int = 5) -> Dict[str, SpeakerInfo]:
        """
        Perform speaker diarization.

        Args:
            audio_path: Path to audio file
            speech_segments: Optional list of speech segments (from VAD);
                each element must expose `.start` and `.end` attributes
            window_size: Window size for embedding extraction (seconds)
            hop_size: Hop size between windows (seconds)
            num_speakers: Known number of speakers (None to estimate)
            min_speakers: Minimum speakers to detect
                NOTE(review): this parameter is currently accepted but never
                used by the implementation — confirm whether a lower bound
                on cluster count should be enforced.
            max_speakers: Maximum speakers to detect

        Returns:
            Dict mapping speaker_id to SpeakerInfo, ordered by total speech
            time (main speaker first)
        """
        import torchaudio

        # Load audio (use soundfile backend to avoid torchcodec dependency)
        waveform, sample_rate = torchaudio.load(audio_path, backend="soundfile")
        if waveform.shape[0] > 1:
            # Downmix multi-channel audio to mono.
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        duration = waveform.shape[1] / sample_rate

        # Extract embeddings for windows
        windows = []
        embeddings = []

        current = 0.0
        while current + window_size <= duration:
            start_sample = int(current * sample_rate)
            end_sample = int((current + window_size) * sample_rate)

            window_audio = waveform[:, start_sample:end_sample]

            # Check if this window has speech (if VAD provided).
            # A window counts as speech when its midpoint falls inside
            # any VAD segment.
            has_speech = True
            if speech_segments:
                has_speech = any(
                    s.start <= current + window_size/2 <= s.end
                    for s in speech_segments
                )

            if has_speech and window_audio.shape[1] > 0:
                # Extract embedding
                emb = self.embedding_model.encode_batch(window_audio)
                emb = emb.squeeze().cpu().numpy()

                windows.append({'start': current, 'end': current + window_size})
                embeddings.append(emb)

            current += hop_size

        if len(embeddings) < 2:
            # Not enough data for clustering — fall back to a single speaker
            # covering all VAD segments (if any).
            speaker_info = SpeakerInfo(speaker_id="speaker_A")
            for seg in (speech_segments or []):
                speaker_info.add_segment(SpeakerSegment(
                    start=seg.start, end=seg.end, speaker_id="speaker_A"
                ))
            if embeddings:
                speaker_info.embedding = embeddings[0]
            return {"speaker_A": speaker_info}

        embeddings_array = np.array(embeddings)

        # Cluster embeddings
        if num_speakers is None:
            # Estimate number of speakers: clusters are merged until the
            # cosine distance between them exceeds 0.7.
            clustering = AgglomerativeClustering(
                n_clusters=None,
                distance_threshold=0.7,
                metric='cosine',
                linkage='average'
            )
        else:
            clustering = AgglomerativeClustering(
                n_clusters=num_speakers,
                metric='cosine',
                linkage='average'
            )

        labels = clustering.fit_predict(embeddings_array)

        # Clamp number of speakers
        unique_labels = np.unique(labels)
        if len(unique_labels) > max_speakers:
            # Re-cluster with max speakers
            clustering = AgglomerativeClustering(
                n_clusters=max_speakers,
                metric='cosine',
                linkage='average'
            )
            labels = clustering.fit_predict(embeddings_array)
            unique_labels = np.unique(labels)

        # Build speaker info
        speakers = {}
        speaker_names = ['speaker_A', 'speaker_B', 'speaker_C', 'speaker_D', 'speaker_E']

        for label in unique_labels:
            speaker_id = speaker_names[label] if label < len(speaker_names) else f"speaker_{label}"
            speakers[speaker_id] = SpeakerInfo(speaker_id=speaker_id)

            # Calculate mean embedding for this speaker
            mask = labels == label
            speaker_embeddings = embeddings_array[mask]
            speakers[speaker_id].embedding = np.mean(speaker_embeddings, axis=0)

        # Assign windows to speakers. Note: windows overlap (hop < window),
        # so per-speaker totals can exceed the wall-clock duration.
        for i, (window, label) in enumerate(zip(windows, labels)):
            speaker_id = speaker_names[label] if label < len(speaker_names) else f"speaker_{label}"
            segment = SpeakerSegment(
                start=window['start'],
                end=window['end'],
                speaker_id=speaker_id
            )
            speakers[speaker_id].add_segment(segment)

        # Sort by total speech time (main speaker first)
        speakers = dict(sorted(
            speakers.items(),
            key=lambda x: x[1].total_seconds,
            reverse=True
        ))

        return speakers

    def get_main_speaker(self, speakers: Dict[str, SpeakerInfo]) -> Optional[SpeakerInfo]:
        """Get the speaker with most speech time.

        Relies on `diarize` returning the dict already sorted by speech time.
        """
        if not speakers:
            return None
        return next(iter(speakers.values()))

    def get_additional_speakers(self, speakers: Dict[str, SpeakerInfo]) -> List[SpeakerInfo]:
        """Get all speakers except the main one."""
        items = list(speakers.values())
        return items[1:] if len(items) > 1 else []
|
src/phase1_foundation/preprocessor.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio preprocessor - normalize audio for analysis.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import torchaudio
|
| 6 |
+
import numpy as np
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Tuple, Optional
|
| 9 |
+
import tempfile
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class AudioPreprocessor:
    """Normalize audio to standard format for analysis.

    Target format: mono, 16 kHz, peak amplitude scaled to 0.95. All file I/O
    goes through torchaudio's soundfile backend.
    """

    TARGET_SAMPLE_RATE = 16000  # Hz — format expected by downstream models
    TARGET_CHANNELS = 1         # mono

    def __init__(self):
        pass

    def load_audio(self, audio_path: str) -> Tuple[torch.Tensor, int]:
        """
        Load audio file.

        Args:
            audio_path: Path to the audio file.

        Returns:
            Tuple of (waveform, sample_rate)
        """
        # Use soundfile backend to avoid torchcodec dependency
        waveform, sample_rate = torchaudio.load(audio_path, backend="soundfile")
        return waveform, sample_rate

    def normalize(self, waveform: torch.Tensor, sample_rate: int) -> Tuple[torch.Tensor, int]:
        """
        Normalize audio to mono, 16kHz, normalized amplitude.

        Args:
            waveform: Input waveform of shape (channels, samples).
            sample_rate: Input sample rate in Hz.

        Returns:
            Tuple of (normalized_waveform, target_sample_rate)
        """
        # Convert to mono by averaging channels.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to 16kHz if needed.
        if sample_rate != self.TARGET_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate,
                new_freq=self.TARGET_SAMPLE_RATE
            )
            waveform = resampler(waveform)

        # Peak-normalize to 0.95 (headroom against clipping). Silent input
        # (max amplitude 0) is left untouched to avoid division by zero.
        max_amp = waveform.abs().max()
        if max_amp > 0:
            waveform = waveform / max_amp * 0.95

        return waveform, self.TARGET_SAMPLE_RATE

    def process_file(self, audio_path: str, output_path: Optional[str] = None) -> Tuple[torch.Tensor, int, dict]:
        """
        Load and normalize audio file.

        Args:
            audio_path: Input file to load.
            output_path: If given, the normalized audio is also written here.

        Returns:
            Tuple of (waveform, sample_rate, metadata) where metadata records
            the original and normalized format parameters.
        """
        # Load
        waveform, orig_sr = self.load_audio(audio_path)
        orig_duration = waveform.shape[1] / orig_sr
        orig_channels = waveform.shape[0]

        # Normalize
        waveform, sample_rate = self.normalize(waveform, orig_sr)

        # Save if output path provided
        if output_path:
            torchaudio.save(output_path, waveform, sample_rate)

        metadata = {
            'original_sample_rate': orig_sr,
            'original_channels': orig_channels,
            'original_duration': orig_duration,
            'normalized_sample_rate': sample_rate,
            'normalized_duration': waveform.shape[1] / sample_rate
        }

        return waveform, sample_rate, metadata

    def get_duration(self, waveform: torch.Tensor, sample_rate: int) -> float:
        """Get duration in seconds."""
        return waveform.shape[1] / sample_rate

    def save_audio(self, waveform: torch.Tensor, sample_rate: int, output_path: str):
        """Save audio to file, creating parent directories as needed."""
        # Fix: os.path.dirname returns '' for a bare filename, and
        # os.makedirs('') raises FileNotFoundError — only create the
        # directory when there actually is one.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        torchaudio.save(output_path, waveform, sample_rate)

    def extract_segment(self, waveform: torch.Tensor, sample_rate: int,
                        start: float, end: float) -> torch.Tensor:
        """Extract a [start, end) time slice (seconds) from the waveform."""
        start_sample = int(start * sample_rate)
        end_sample = int(end * sample_rate)
        return waveform[:, start_sample:end_sample]
|
src/phase1_foundation/vad.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Voice Activity Detection - detect speech segments.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
from typing import List, Tuple, Optional
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@dataclass
class SpeechSegment:
    """A single span of detected speech, in seconds from the start of the audio."""
    start: float  # speech onset time
    end: float    # speech offset time

    @property
    def duration(self) -> float:
        """Length of the speech span in seconds."""
        return self.end - self.start
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class VoiceActivityDetector:
    """Detect speech segments using SpeechBrain VAD.

    The CRDNN model is downloaded from the HuggingFace hub on first access
    and cached under ``pretrained_models/vad``; model loading is deferred
    until the ``model`` property is first used.
    """

    def __init__(self, device: str = None):
        # Prefer GPU when available; SpeechBrain accepts 'cuda'/'cpu' strings.
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        # Lazily-created SpeechBrain VAD instance (see `model` property).
        self._model = None

    @property
    def model(self):
        """Lazy load VAD model."""
        if self._model is None:
            from speechbrain.inference.VAD import VAD
            import warnings
            # Suppress the use_auth_token deprecation warning from speechbrain
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message=".*use_auth_token.*")
                self._model = VAD.from_hparams(
                    source="speechbrain/vad-crdnn-libriparty",
                    savedir="pretrained_models/vad",
                    run_opts={"device": self.device}
                )
        return self._model

    def detect(self, audio_path: str,
               min_speech_duration: float = 0.25,
               min_silence_duration: float = 0.1) -> List[SpeechSegment]:
        """
        Detect speech segments in audio.

        Args:
            audio_path: Path to audio file
            min_speech_duration: Minimum speech duration to keep
            min_silence_duration: Minimum silence to consider as gap

        Returns:
            List of SpeechSegment objects
        """
        # Use get_speech_segments which does full pipeline.
        # close_th merges segments separated by shorter silences than the
        # threshold; len_th discards segments shorter than the minimum.
        boundaries = self.model.get_speech_segments(
            audio_path,
            large_chunk_size=30,
            small_chunk_size=10,
            overlap_small_chunk=True,
            apply_energy_VAD=True,
            double_check=True,
            close_th=min_silence_duration,
            len_th=min_speech_duration
        )

        # Convert to segments
        segments = []
        # boundaries is a tensor with shape [N, 2] where each row is [start, end]
        if boundaries is not None and len(boundaries) > 0:
            for boundary in boundaries:
                start, end = float(boundary[0]), float(boundary[1])
                # Re-apply the duration filter defensively, in case the
                # model returns spans shorter than len_th.
                if end - start >= min_speech_duration:
                    segments.append(SpeechSegment(start=start, end=end))

        return segments

    def detect_from_waveform(self, waveform: torch.Tensor, sample_rate: int,
                             min_speech_duration: float = 0.25) -> List[SpeechSegment]:
        """
        Detect speech segments from waveform tensor.

        Args:
            waveform: Audio waveform tensor
            sample_rate: Sample rate
            min_speech_duration: Minimum speech duration

        Returns:
            List of SpeechSegment objects
        """
        import tempfile
        import torchaudio
        import os

        # Save to temp file (SpeechBrain VAD needs file path)
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            temp_path = f.name

        try:
            torchaudio.save(temp_path, waveform, sample_rate)
            return self.detect(temp_path, min_speech_duration)
        finally:
            # Always remove the temp file, even if detection raises.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    def get_total_speech(self, segments: List[SpeechSegment]) -> float:
        """Get total speech duration from segments."""
        return sum(s.duration for s in segments)

    def get_speech_ratio(self, segments: List[SpeechSegment],
                         total_duration: float) -> float:
        """Get ratio of speech to total duration (0.0 for zero-length audio)."""
        if total_duration == 0:
            return 0.0
        return self.get_total_speech(segments) / total_duration
|
src/phase1_foundation/voiceprint.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Voiceprint extraction - generate unique voice identifiers.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import numpy as np
|
| 6 |
+
import hashlib
|
| 7 |
+
from typing import Tuple, Optional
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class VoiceprintResult:
    """Container for an extracted voiceprint.

    Holds a stable id, the raw speaker embedding, a 0-1 quality estimate,
    and the amount of speech (seconds) the embedding was computed from.
    """
    voiceprint_id: str       # stable id derived from the embedding
    embedding: np.ndarray    # speaker embedding vector
    quality_score: float     # 0-1 quality estimate
    speech_duration: float   # seconds of speech behind the embedding

    def to_bytes(self) -> bytes:
        """Serialize the embedding as raw float32 bytes for storage."""
        as_f32 = self.embedding.astype(np.float32)
        return as_f32.tobytes()

    @classmethod
    def from_bytes(cls, vp_id: str, embedding_bytes: bytes,
                   quality: float = 0.0, duration: float = 0.0):
        """Rebuild a VoiceprintResult from bytes produced by to_bytes()."""
        restored = np.frombuffer(embedding_bytes, dtype=np.float32)
        return cls(voiceprint_id=vp_id, embedding=restored,
                   quality_score=quality, speech_duration=duration)
+
|
| 36 |
+
class VoiceprintExtractor:
    """Extract speaker voiceprints (embeddings) using ECAPA-TDNN.

    The SpeechBrain model is loaded lazily on first use, so constructing
    the extractor is cheap and does not trigger a weight download.
    """

    # Dimensionality of the ECAPA-TDNN speaker embedding.
    EMBEDDING_DIM = 192

    def __init__(self, device: str = None):
        """
        Args:
            device: Torch device string; defaults to CUDA when available.
        """
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self._model = None

    @property
    def model(self):
        """Lazily load and cache the speaker-recognition model."""
        if self._model is None:
            from speechbrain.inference.speaker import SpeakerRecognition
            self._model = SpeakerRecognition.from_hparams(
                source="speechbrain/spkrec-ecapa-voxceleb",
                savedir="pretrained_models/spkrec",
                run_opts={"device": self.device}
            )
        return self._model

    def extract_from_file(self, audio_path: str) -> "VoiceprintResult":
        """
        Extract a voiceprint from an audio file.

        Args:
            audio_path: Path to the audio file.

        Returns:
            VoiceprintResult with ID, embedding and quality score.
        """
        import torchaudio

        # Load audio (soundfile backend avoids a torchcodec dependency).
        waveform, sample_rate = torchaudio.load(audio_path, backend="soundfile")
        duration = waveform.shape[1] / sample_rate

        return self.extract_from_waveform(waveform, sample_rate, duration)

    def extract_from_waveform(self, waveform: torch.Tensor, sample_rate: int,
                              duration: float = None) -> "VoiceprintResult":
        """
        Extract a voiceprint from a waveform tensor.

        Args:
            waveform: Audio waveform tensor of shape (channels, samples).
            sample_rate: Sample rate of the waveform in Hz.
            duration: Optional duration in seconds (derived if omitted).

        Returns:
            VoiceprintResult
        """
        # BUGFIX: torchaudio was previously imported only inside
        # extract_from_file, so calling this method directly with a
        # non-16kHz waveform raised NameError at the resampling step.
        import torchaudio

        if duration is None:
            duration = waveform.shape[1] / sample_rate

        # The embedding model expects mono input.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to the model's expected 16 kHz rate.
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)

        # Run the model and flatten the embedding to a 1-D numpy array.
        embedding = self.model.encode_batch(waveform)
        embedding = embedding.squeeze().cpu().numpy()

        # Deterministic ID derived from the embedding itself.
        vp_id = self._generate_id(embedding)

        # Quality is a heuristic based on duration and signal energy.
        quality = self._calculate_quality(waveform, duration)

        return VoiceprintResult(
            voiceprint_id=vp_id,
            embedding=embedding,
            quality_score=quality,
            speech_duration=duration
        )

    def extract_from_embedding(self, embedding: np.ndarray,
                               duration: float = 0.0) -> "VoiceprintResult":
        """Create a VoiceprintResult from an already-computed embedding."""
        vp_id = self._generate_id(embedding)
        quality = min(1.0, duration / 30.0)  # simple duration-based quality

        return VoiceprintResult(
            voiceprint_id=vp_id,
            embedding=embedding,
            quality_score=quality,
            speech_duration=duration
        )

    def _generate_id(self, embedding: np.ndarray) -> str:
        """Derive a short, deterministic ID by hashing the embedding bytes."""
        emb_bytes = embedding.astype(np.float32).tobytes()
        hash_hex = hashlib.sha256(emb_bytes).hexdigest()[:8]
        return f"vp_{hash_hex}"

    def _calculate_quality(self, waveform: torch.Tensor, duration: float) -> float:
        """
        Heuristic quality score in [0, 1].

        Based on:
        - Duration (more is better, saturates at 30 s)
        - Signal energy (penalizes near-silence and likely clipping)
        """
        # Duration factor (0-1, saturates at 30 s).
        duration_score = min(1.0, duration / 30.0)

        # Energy factor from the overall RMS level.
        rms = torch.sqrt(torch.mean(waveform ** 2)).item()
        if rms < 0.01:    # too quiet
            energy_score = 0.3
        elif rms > 0.9:   # likely clipping
            energy_score = 0.5
        else:
            energy_score = 1.0

        # Weighted blend, rounded for stable display/storage.
        quality = 0.7 * duration_score + 0.3 * energy_score
        return round(quality, 2)

    def compare(self, vp1: "VoiceprintResult", vp2: "VoiceprintResult") -> float:
        """
        Compare two voiceprints.

        Returns:
            Similarity score (0-1); higher means more similar.
        """
        return self.cosine_similarity(vp1.embedding, vp2.embedding)

    def cosine_similarity(self, emb1: np.ndarray, emb2: np.ndarray) -> float:
        """Cosine similarity between two embeddings (0.0 if either is zero)."""
        dot = np.dot(emb1, emb2)
        norm1 = np.linalg.norm(emb1)
        norm2 = np.linalg.norm(emb2)

        # Guard against zero vectors, which have no defined direction.
        if norm1 == 0 or norm2 == 0:
            return 0.0

        return float(dot / (norm1 * norm2))

    def is_same_speaker(self, emb1: np.ndarray, emb2: np.ndarray,
                        threshold: float = 0.75) -> Tuple[bool, float]:
        """
        Check whether two embeddings belong to the same speaker.

        Returns:
            Tuple of (is_same, similarity_score)
        """
        similarity = self.cosine_similarity(emb1, emb2)
        return similarity >= threshold, similarity

    def quality_label(self, score: float) -> str:
        """Map a numeric quality score to 'High' / 'Medium' / 'Low'."""
        if score >= 0.8:
            return "High"
        elif score >= 0.5:
            return "Medium"
        else:
            return "Low"
src/phase2_background/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .analyzer import BackgroundAnalyzer, BackgroundAnomaly, AnomalyType, AudioSource
|
| 2 |
+
|
| 3 |
+
__all__ = ['BackgroundAnalyzer', 'BackgroundAnomaly', 'AnomalyType', 'AudioSource']
|
src/phase2_background/analyzer.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Background Audio Analysis - detect subtle anomalies.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import numpy as np
|
| 6 |
+
import librosa
|
| 7 |
+
from typing import List, Optional
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from enum import Enum
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class AnomalyType(Enum):
    """Kind of sound detected in the amplified background audio."""
    WHISPER = "whisper"              # whisper-like: weak lows, strong highs
    DISTANT_VOICE = "distant_voice"  # voice energy concentrated in the mid band
    SPEAKER_AUDIO = "speaker_audio"  # bandwidth-limited (little high-freq energy)
    UNKNOWN = "unknown"              # no classification heuristic matched
+
class AudioSource(Enum):
    """How the captured audio reached the microphone."""
    DIRECT = "direct"    # live voice straight into the mic (full bandwidth)
    SPEAKER = "speaker"  # played back through a loudspeaker (limited highs)
    PHONE = "phone"      # phone-bandwidth audio (~3.4 kHz cutoff)
    UNKNOWN = "unknown"  # could not be classified
+
@dataclass
class BackgroundAnomaly:
    """A detected background anomaly within a recording."""
    # Time bounds of the anomaly within the recording, in seconds.
    start: float
    end: float
    # Classification produced by the analyzer's spectral heuristics.
    anomaly_type: AnomalyType
    # Window amplitude in dB (20*log10 of window RMS in the analyzer).
    amplitude_db: float
    # Detector confidence in [0, 1].
    confidence: float

    @property
    def duration(self) -> float:
        # Length of the anomaly in seconds.
        return self.end - self.start
+
class BackgroundAnalyzer:
    """Analyze quiet background audio for anomalies (whispers, distant
    voices, audio played through a speaker)."""

    def __init__(self):
        # All analysis below assumes 16 kHz mono audio.
        self.sample_rate = 16000

    def amplify_background(self, waveform: np.ndarray,
                           threshold_db: float = -40,
                           boost_db: float = 25) -> np.ndarray:
        """
        Amplify quiet background regions so faint sounds become audible.

        Args:
            waveform: Audio waveform (1-D numpy array).
            threshold_db: Windows whose RMS level falls below this are boosted.
            boost_db: Gain (dB) applied to quiet windows.

        Returns:
            Amplified waveform, peak-normalized to avoid clipping.
        """
        rms = np.sqrt(np.mean(waveform ** 2))
        if rms == 0:
            return waveform

        window_size = int(0.1 * self.sample_rate)  # 100 ms analysis windows
        hop = window_size // 2
        boost_factor = 10 ** (boost_db / 20)

        # BUGFIX: the previous implementation multiplied the signal in place
        # per window; with 50% window overlap, samples covered by two quiet
        # windows were boosted twice (2 * boost_db, ~+50 dB instead of the
        # documented +25 dB). Build a per-sample gain mask instead so each
        # sample is boosted at most once.
        gain = np.ones(len(waveform))

        for i in range(0, len(waveform) - window_size, hop):
            window = waveform[i:i + window_size]
            window_rms = np.sqrt(np.mean(window ** 2))

            if window_rms > 0:
                window_db = 20 * np.log10(window_rms + 1e-10)
                if window_db < threshold_db:
                    # Assign (not multiply) so overlaps don't compound.
                    gain[i:i + window_size] = boost_factor

        amplified = waveform * gain

        # Peak-normalize to prevent clipping after the boost.
        max_amp = np.abs(amplified).max()
        if max_amp > 0.95:
            amplified = amplified * 0.95 / max_amp

        return amplified

    def detect_anomalies(self, waveform: np.ndarray,
                         speech_segments: List = None,
                         threshold_db: float = -50) -> List["BackgroundAnomaly"]:
        """
        Detect anomalies in background audio.

        Args:
            waveform: Audio waveform.
            speech_segments: Optional VAD segments (objects with .start/.end
                in seconds) to exclude from analysis.
            threshold_db: Minimum amplitude (dB) to consider as an anomaly.

        Returns:
            List of detected anomalies, merged where adjacent.
        """
        anomalies = []

        # Boost quiet regions first so faint events cross the threshold.
        amplified = self.amplify_background(waveform)

        window_size = int(0.5 * self.sample_rate)  # 500 ms analysis windows
        hop = window_size // 4

        for i in range(0, len(amplified) - window_size, hop):
            start_time = i / self.sample_rate
            end_time = (i + window_size) / self.sample_rate

            # Skip windows whose anchor point falls inside main speech.
            if speech_segments:
                in_speech = any(
                    s.start <= start_time + 0.25 <= s.end
                    for s in speech_segments
                )
                if in_speech:
                    continue

            window = amplified[i:i + window_size]
            window_rms = np.sqrt(np.mean(window ** 2))

            if window_rms == 0:
                continue

            window_db = 20 * np.log10(window_rms + 1e-10)

            if window_db > threshold_db:
                anomaly_type = self._classify_anomaly(window)
                confidence = self._calculate_confidence(window, window_db, threshold_db)

                if confidence > 0.3:  # discard very weak detections
                    anomalies.append(BackgroundAnomaly(
                        start=start_time,
                        end=end_time,
                        anomaly_type=anomaly_type,
                        amplitude_db=window_db,
                        confidence=confidence
                    ))

        # Collapse overlapping/adjacent windows of the same type.
        return self._merge_anomalies(anomalies)

    def _classify_anomaly(self, window: np.ndarray) -> "AnomalyType":
        """Classify the anomaly type from the window's frequency-band energies."""
        if len(window) < 512:
            return AnomalyType.UNKNOWN

        # Magnitude spectrum of the window.
        spectrum = np.abs(np.fft.rfft(window))
        freqs = np.fft.rfftfreq(len(window), 1 / self.sample_rate)

        # Energy in low / mid / high bands.
        low_mask = freqs < 300
        mid_mask = (freqs >= 300) & (freqs < 3000)
        high_mask = freqs >= 3000

        low_energy = np.sum(spectrum[low_mask] ** 2)
        mid_energy = np.sum(spectrum[mid_mask] ** 2)
        high_energy = np.sum(spectrum[high_mask] ** 2)

        total = low_energy + mid_energy + high_energy + 1e-10

        # Whisper: little low-frequency energy, relatively strong highs.
        if low_energy / total < 0.1 and high_energy / total > 0.3:
            return AnomalyType.WHISPER

        # Speaker/phone playback: bandwidth-limited, missing highs.
        if high_energy / total < 0.1:
            return AnomalyType.SPEAKER_AUDIO

        # Distant voice: energy dominated by the mid (voice) band.
        if mid_energy / total > 0.5:
            return AnomalyType.DISTANT_VOICE

        return AnomalyType.UNKNOWN

    def _calculate_confidence(self, window: np.ndarray,
                              db: float, threshold: float) -> float:
        """Confidence in [0, 1]: how far the window rises above the threshold."""
        # Saturates 20 dB above the threshold; `window` is currently unused
        # but kept for interface stability.
        db_above = db - threshold
        confidence = min(1.0, db_above / 20)
        return max(0.0, confidence)

    def _merge_anomalies(self, anomalies: List["BackgroundAnomaly"],
                         max_gap: float = 0.5) -> List["BackgroundAnomaly"]:
        """Merge anomalies of the same type separated by less than max_gap s."""
        if not anomalies:
            return []

        anomalies = sorted(anomalies, key=lambda a: a.start)
        merged = [anomalies[0]]

        for anomaly in anomalies[1:]:
            last = merged[-1]

            if (anomaly.anomaly_type == last.anomaly_type and
                    anomaly.start - last.end < max_gap):
                # Extend the last anomaly, keeping the strongest readings.
                merged[-1] = BackgroundAnomaly(
                    start=last.start,
                    end=anomaly.end,
                    anomaly_type=last.anomaly_type,
                    amplitude_db=max(last.amplitude_db, anomaly.amplitude_db),
                    confidence=max(last.confidence, anomaly.confidence)
                )
            else:
                merged.append(anomaly)

        return merged

    def classify_audio_source(self, waveform: np.ndarray) -> "AudioSource":
        """Classify the audio source (direct, speaker, phone) by bandwidth."""
        if len(waveform) < 1024:
            return AudioSource.UNKNOWN

        spectrum = np.abs(np.fft.rfft(waveform))
        freqs = np.fft.rfftfreq(len(waveform), 1 / self.sample_rate)

        total_energy = np.sum(spectrum ** 2)
        if total_energy == 0:
            return AudioSource.UNKNOWN

        # Frequency below which 95% of the energy lies (effective bandwidth).
        cumsum = np.cumsum(spectrum ** 2)
        idx_95 = np.searchsorted(cumsum, 0.95 * total_energy)
        max_freq = freqs[min(idx_95, len(freqs) - 1)]

        # Phone audio typically cuts off around 3.4 kHz.
        if max_freq < 4000:
            return AudioSource.PHONE

        # Loudspeakers typically have limited high-frequency response.
        if max_freq < 8000:
            return AudioSource.SPEAKER

        return AudioSource.DIRECT
src/phase6_synthetic/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .detector import SyntheticDetector, SyntheticResult, PlaybackDetector, PlaybackResult
|
| 2 |
+
from .wake_words import WakeWordDetector, WakeWordDetection, TranscriptionSegment
|
| 3 |
+
|
| 4 |
+
__all__ = [
|
| 5 |
+
'SyntheticDetector', 'SyntheticResult',
|
| 6 |
+
'PlaybackDetector', 'PlaybackResult',
|
| 7 |
+
'WakeWordDetector', 'WakeWordDetection', 'TranscriptionSegment'
|
| 8 |
+
]
|
src/phase6_synthetic/detector.py
ADDED
|
@@ -0,0 +1,494 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Synthetic Voice Detection - detect TTS, AI-generated speech, and playback attacks.
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
import numpy as np
|
| 6 |
+
from typing import Tuple
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class SyntheticResult:
    """Outcome of synthetic-voice detection."""
    score: float        # 0 = genuine, 1 = synthetic
    is_synthetic: bool  # score exceeded the decision threshold
    confidence: str     # "high", "medium" or "low"

    @classmethod
    def from_score(cls, score: float, threshold: float = 0.5):
        """Build a result from a raw score, deriving the flag and confidence.

        Scores far from the 0.5 midpoint are considered more trustworthy.
        """
        if score < 0.2 or score > 0.8:
            band = "high"
        elif score < 0.35 or score > 0.65:
            band = "medium"
        else:
            band = "low"
        return cls(score=score, is_synthetic=score > threshold, confidence=band)
+
|
| 31 |
+
@dataclass
class PlaybackResult:
    """Outcome of playback/replay detection."""
    score: float       # 0 = live, 1 = playback
    is_playback: bool  # score exceeded the decision threshold
    confidence: str    # "high", "medium" or "low"
    indicators: list   # names of the triggered playback indicators

    @classmethod
    def from_score(cls, score: float, indicators: list = None, threshold: float = 0.5):
        """Build a result from a raw score plus any triggered indicators.

        Scores far from the 0.5 midpoint are considered more trustworthy.
        """
        if score < 0.2 or score > 0.8:
            band = "high"
        elif score < 0.35 or score > 0.65:
            band = "medium"
        else:
            band = "low"
        return cls(score=score,
                   is_playback=score > threshold,
                   confidence=band,
                   indicators=indicators if indicators else [])
|
| 58 |
+
class SyntheticDetector:
    """
    Detect synthetic/AI-generated speech.

    Note: For MVP this uses simple spectral heuristics. Production use
    should rely on trained anti-spoofing models (e.g. ASVspoof baselines).
    """

    def __init__(self, device: str = None):
        # Device is kept for future model-based detection; the current
        # heuristics run on CPU via librosa/numpy.
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.sample_rate = 16000

    def detect(self, waveform: np.ndarray) -> "SyntheticResult":
        """
        Score how likely the waveform is synthetic speech.

        Args:
            waveform: Audio waveform (numpy array), assumed 16 kHz.

        Returns:
            SyntheticResult with score and classification.
        """
        if len(waveform) < self.sample_rate:  # under one second of audio
            return SyntheticResult.from_score(0.5)  # not enough signal: uncertain

        # Extract features that tend to differ between real and synthetic.
        features = self._extract_features(waveform)

        # Simple rule-based scoring (a trained classifier should replace this).
        score = self._calculate_score(features)

        return SyntheticResult.from_score(score)

    def _extract_features(self, waveform: np.ndarray) -> dict:
        """Extract spectral features used by the synthetic-voice heuristics."""
        import librosa

        features = {}

        # 1. Spectral flatness (synthetic speech is often more uniform).
        spectral_flatness = librosa.feature.spectral_flatness(y=waveform)
        features['spectral_flatness_mean'] = np.mean(spectral_flatness)
        features['spectral_flatness_std'] = np.std(spectral_flatness)

        # 2. Zero-crossing-rate variability.
        zcr = librosa.feature.zero_crossing_rate(waveform)
        features['zcr_std'] = np.std(zcr)

        # 3. MFCC variance (real speech varies more frame to frame).
        mfccs = librosa.feature.mfcc(y=waveform, sr=self.sample_rate, n_mfcc=13)
        features['mfcc_var'] = np.mean(np.var(mfccs, axis=1))

        # 4. Pitch variation (synthetic pitch contours are often too regular).
        try:
            pitches, magnitudes = librosa.piptrack(y=waveform, sr=self.sample_rate)
            pitch_values = pitches[magnitudes > np.median(magnitudes)]
            if len(pitch_values) > 0:
                features['pitch_std'] = np.std(pitch_values[pitch_values > 0])
            else:
                features['pitch_std'] = 0
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Keep the best-effort fallback,
            # but only for ordinary errors from pitch tracking.
            features['pitch_std'] = 0

        return features

    def _calculate_score(self, features: dict) -> float:
        """Combine the feature heuristics into a 0-1 synthetic-likelihood score."""
        score = 0.0
        count = 0

        # High spectral flatness => more synthetic.
        if features.get('spectral_flatness_mean', 0) > 0.3:
            score += 0.7
        elif features.get('spectral_flatness_mean', 0) > 0.15:
            score += 0.3
        count += 1

        # Low spectral-flatness variation => more synthetic.
        if features.get('spectral_flatness_std', 0) < 0.05:
            score += 0.6
        count += 1

        # Low MFCC variance => more synthetic.
        if features.get('mfcc_var', 0) < 50:
            score += 0.5
        count += 1

        # Low pitch variation => more synthetic.
        if features.get('pitch_std', 0) < 20:
            score += 0.4
        count += 1

        return score / count if count > 0 else 0.5

    def detect_from_file(self, audio_path: str) -> "SyntheticResult":
        """Load an audio file (resampled to 16 kHz mono) and run detection."""
        import librosa
        waveform, _ = librosa.load(audio_path, sr=self.sample_rate)
        return self.detect(waveform)
|
| 160 |
+
class PlaybackDetector:
|
| 161 |
+
"""
|
| 162 |
+
Detect if audio is being played back through speakers (replay attack).
|
| 163 |
+
|
| 164 |
+
Analyzes:
|
| 165 |
+
- Reverberation characteristics (room acoustics from speaker playback)
|
| 166 |
+
- High frequency roll-off (speakers have limited frequency response)
|
| 167 |
+
- Compression artifacts (from audio encoding)
|
| 168 |
+
- TTS/synthetic voice characteristics
|
| 169 |
+
- Spectral unnaturalness
|
| 170 |
+
- Double-talk/echo patterns
|
| 171 |
+
"""
|
| 172 |
+
|
| 173 |
+
    def __init__(self, sample_rate: int = 16000):
        # Sample rate (Hz) assumed by all spectral checks in this detector.
        self.sample_rate = sample_rate
+
def detect(self, waveform: np.ndarray) -> PlaybackResult:
|
| 177 |
+
"""
|
| 178 |
+
Detect if audio is from speaker playback.
|
| 179 |
+
|
| 180 |
+
Args:
|
| 181 |
+
waveform: Audio waveform (numpy array)
|
| 182 |
+
|
| 183 |
+
Returns:
|
| 184 |
+
PlaybackResult with score and indicators
|
| 185 |
+
"""
|
| 186 |
+
if len(waveform) < self.sample_rate:
|
| 187 |
+
return PlaybackResult.from_score(0.5, ["audio_too_short"])
|
| 188 |
+
|
| 189 |
+
indicators = []
|
| 190 |
+
scores = []
|
| 191 |
+
weights = []
|
| 192 |
+
|
| 193 |
+
# 1. Check for high-frequency roll-off (speakers cut off high frequencies)
|
| 194 |
+
hf_score, hf_indicator = self._check_high_freq_rolloff(waveform)
|
| 195 |
+
scores.append(hf_score)
|
| 196 |
+
weights.append(1.5) # Higher weight - very indicative
|
| 197 |
+
if hf_indicator:
|
| 198 |
+
indicators.append(hf_indicator)
|
| 199 |
+
|
| 200 |
+
# 2. Check for reverberation (speaker playback adds room reverb)
|
| 201 |
+
reverb_score, reverb_indicator = self._check_reverberation(waveform)
|
| 202 |
+
scores.append(reverb_score)
|
| 203 |
+
weights.append(1.0)
|
| 204 |
+
if reverb_indicator:
|
| 205 |
+
indicators.append(reverb_indicator)
|
| 206 |
+
|
| 207 |
+
# 3. Check for compression artifacts
|
| 208 |
+
comp_score, comp_indicator = self._check_compression_artifacts(waveform)
|
| 209 |
+
scores.append(comp_score)
|
| 210 |
+
weights.append(1.2)
|
| 211 |
+
if comp_indicator:
|
| 212 |
+
indicators.append(comp_indicator)
|
| 213 |
+
|
| 214 |
+
# 4. Check for unnatural silence patterns (digital silence)
|
| 215 |
+
silence_score, silence_indicator = self._check_digital_silence(waveform)
|
| 216 |
+
scores.append(silence_score)
|
| 217 |
+
weights.append(0.8)
|
| 218 |
+
if silence_indicator:
|
| 219 |
+
indicators.append(silence_indicator)
|
| 220 |
+
|
| 221 |
+
# 5. Check for clipping (common in playback through speakers)
|
| 222 |
+
clip_score, clip_indicator = self._check_clipping(waveform)
|
| 223 |
+
scores.append(clip_score)
|
| 224 |
+
weights.append(0.7)
|
| 225 |
+
if clip_indicator:
|
| 226 |
+
indicators.append(clip_indicator)
|
| 227 |
+
|
| 228 |
+
# 6. Check for TTS characteristics (ElevenLabs, etc.)
|
| 229 |
+
tts_score, tts_indicator = self._check_tts_characteristics(waveform)
|
| 230 |
+
scores.append(tts_score)
|
| 231 |
+
weights.append(2.0) # High weight - very important
|
| 232 |
+
if tts_indicator:
|
| 233 |
+
indicators.append(tts_indicator)
|
| 234 |
+
|
| 235 |
+
# 7. Check spectral smoothness (TTS has unnaturally smooth spectra)
|
| 236 |
+
smooth_score, smooth_indicator = self._check_spectral_smoothness(waveform)
|
| 237 |
+
scores.append(smooth_score)
|
| 238 |
+
weights.append(1.5)
|
| 239 |
+
if smooth_indicator:
|
| 240 |
+
indicators.append(smooth_indicator)
|
| 241 |
+
|
| 242 |
+
# 8. Check for room acoustics double-bounce
|
| 243 |
+
room_score, room_indicator = self._check_room_acoustics(waveform)
|
| 244 |
+
scores.append(room_score)
|
| 245 |
+
weights.append(1.0)
|
| 246 |
+
if room_indicator:
|
| 247 |
+
indicators.append(room_indicator)
|
| 248 |
+
|
| 249 |
+
# Calculate weighted final score
|
| 250 |
+
if scores:
|
| 251 |
+
final_score = np.average(scores, weights=weights)
|
| 252 |
+
else:
|
| 253 |
+
final_score = 0.5
|
| 254 |
+
|
| 255 |
+
# Boost score if multiple indicators detected
|
| 256 |
+
if len(indicators) >= 3:
|
| 257 |
+
final_score = min(1.0, final_score * 1.2)
|
| 258 |
+
if len(indicators) >= 4:
|
| 259 |
+
final_score = min(1.0, final_score * 1.1)
|
| 260 |
+
|
| 261 |
+
return PlaybackResult.from_score(final_score, indicators, threshold=0.45)
|
| 262 |
+
|
| 263 |
+
def _check_high_freq_rolloff(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for high frequency roll-off typical of speakers.

    Live voice has energy up to 8kHz+, while speakers often cut off
    around 4-6kHz, so a weak high/low band energy ratio is evidence
    of playback through a loudspeaker.

    Args:
        waveform: Mono audio samples (assumed sampled at ``self.sample_rate``).

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).
    """
    import librosa

    # Magnitude spectrogram; rows correspond to FFT frequency bins.
    S = np.abs(librosa.stft(waveform))
    freqs = librosa.fft_frequencies(sr=self.sample_rate)

    # Mean energy in the low and high speech bands. (The 2-4kHz mid band
    # computed by an earlier revision was never used and has been dropped.)
    low_band = S[(freqs >= 100) & (freqs < 2000)].mean()
    high_band = S[(freqs >= 4000) & (freqs < 8000)].mean()

    # No low-band energy at all: nothing to compare against, inconclusive.
    if low_band == 0:
        return 0.5, None

    # Ratio of high to low frequency energy; epsilon avoids divide-by-zero.
    high_low_ratio = high_band / (low_band + 1e-10)

    # Very low high-frequency energy suggests speaker playback.
    if high_low_ratio < 0.05:
        return 0.8, "severe_hf_rolloff"
    elif high_low_ratio < 0.15:
        return 0.6, "moderate_hf_rolloff"
    elif high_low_ratio < 0.3:
        return 0.4, None

    return 0.2, None
|
| 295 |
+
|
| 296 |
+
def _check_reverberation(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for excessive reverberation typical of speaker playback.
    Audio played through speakers picks up room acoustics twice.

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).

    NOTE(review): the variance/contrast thresholds below look hand-tuned;
    confirm they were calibrated for this project's sample rate.
    """
    import librosa

    # Calculate spectral centroid variance (reverb smooths this)
    centroid = librosa.feature.spectral_centroid(y=waveform, sr=self.sample_rate)
    centroid_var = np.var(centroid)

    # Calculate spectral contrast (reverb reduces contrast)
    contrast = librosa.feature.spectral_contrast(y=waveform, sr=self.sample_rate)
    contrast_mean = np.mean(contrast)

    # Low variance and contrast suggest reverb/playback
    if centroid_var < 100000 and contrast_mean < 15:
        return 0.7, "high_reverb_detected"
    elif centroid_var < 500000 and contrast_mean < 20:
        return 0.5, "moderate_reverb"

    # Nothing suspicious found: low baseline score, no indicator.
    return 0.2, None
|
| 318 |
+
|
| 319 |
+
def _check_compression_artifacts(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for audio compression artifacts (MP3, AAC, etc.).
    Playback often involves compressed audio.

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).
    """
    import librosa

    # Check for spectral holes (common in lossy compression)
    S = np.abs(librosa.stft(waveform))

    # Count very low energy bins (spectral holes): bins below 0.1% of the
    # spectrogram peak, expressed as a fraction of all bins.
    threshold = np.max(S) * 0.001
    spectral_holes = np.sum(S < threshold) / S.size

    # High number of spectral holes suggests compression
    if spectral_holes > 0.4:
        return 0.7, "compression_artifacts"
    elif spectral_holes > 0.25:
        # Borderline: slight score bump but no named indicator.
        return 0.5, None

    return 0.2, None
|
| 340 |
+
|
| 341 |
+
def _check_digital_silence(self, waveform: np.ndarray) -> Tuple[float, str]:
|
| 342 |
+
"""
|
| 343 |
+
Check for perfectly digital silence (exactly zero values).
|
| 344 |
+
Natural recordings have noise floor, playback may have digital silence.
|
| 345 |
+
"""
|
| 346 |
+
# Count exactly zero samples
|
| 347 |
+
zero_count = np.sum(waveform == 0)
|
| 348 |
+
zero_ratio = zero_count / len(waveform)
|
| 349 |
+
|
| 350 |
+
# Significant perfect zeros suggest digital source
|
| 351 |
+
if zero_ratio > 0.1:
|
| 352 |
+
return 0.8, "digital_silence_detected"
|
| 353 |
+
elif zero_ratio > 0.05:
|
| 354 |
+
return 0.5, "some_digital_silence"
|
| 355 |
+
|
| 356 |
+
return 0.1, None
|
| 357 |
+
|
| 358 |
+
def _check_clipping(self, waveform: np.ndarray) -> Tuple[float, str]:
|
| 359 |
+
"""
|
| 360 |
+
Check for audio clipping (common in speaker playback at high volume).
|
| 361 |
+
"""
|
| 362 |
+
# Normalize
|
| 363 |
+
max_val = np.max(np.abs(waveform))
|
| 364 |
+
if max_val == 0:
|
| 365 |
+
return 0.5, None
|
| 366 |
+
|
| 367 |
+
normalized = waveform / max_val
|
| 368 |
+
|
| 369 |
+
# Count samples at or very near max amplitude
|
| 370 |
+
clip_threshold = 0.99
|
| 371 |
+
clipped_samples = np.sum(np.abs(normalized) > clip_threshold)
|
| 372 |
+
clip_ratio = clipped_samples / len(waveform)
|
| 373 |
+
|
| 374 |
+
if clip_ratio > 0.01:
|
| 375 |
+
return 0.7, "audio_clipping"
|
| 376 |
+
elif clip_ratio > 0.005:
|
| 377 |
+
return 0.5, None
|
| 378 |
+
|
| 379 |
+
return 0.2, None
|
| 380 |
+
|
| 381 |
+
def _check_tts_characteristics(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for TTS/synthetic voice characteristics.
    ElevenLabs and similar TTS have very consistent pitch and timing.

    Two heuristics are tried in order; the first one that fires wins:
    1. Pitch (F0) coefficient of variation — flat pitch suggests TTS.
    2. Onset-interval regularity — metronomic timing suggests TTS.
    Either stage may raise inside librosa (e.g. too-short input); failures
    are swallowed and the next stage (or the 0.3 baseline) is used.

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).
    """
    import librosa

    # Extract pitch (F0)
    try:
        f0, voiced_flag, voiced_probs = librosa.pyin(
            waveform,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sample_rate
        )
        # pyin marks unvoiced frames as NaN; keep only voiced estimates.
        f0_valid = f0[~np.isnan(f0)]

        # Need a minimum number of voiced frames for a stable statistic.
        if len(f0_valid) > 10:
            # TTS has very low pitch variation
            pitch_std = np.std(f0_valid)
            pitch_mean = np.mean(f0_valid)
            pitch_cv = pitch_std / (pitch_mean + 1e-10)  # Coefficient of variation

            # Natural speech has CV > 0.1, TTS often < 0.08
            if pitch_cv < 0.05:
                return 0.85, "tts_flat_pitch"
            elif pitch_cv < 0.08:
                return 0.7, "tts_low_pitch_variation"
            elif pitch_cv < 0.12:
                return 0.5, None
    except Exception:
        # Best-effort: pitch tracking failed, fall through to timing check.
        pass

    # Check for unnaturally regular timing (TTS has consistent phoneme duration)
    try:
        onset_env = librosa.onset.onset_strength(y=waveform, sr=self.sample_rate)
        onset_frames = librosa.onset.onset_detect(onset_envelope=onset_env, sr=self.sample_rate)

        if len(onset_frames) > 5:
            # Frame-index gaps between consecutive onsets.
            intervals = np.diff(onset_frames)
            interval_cv = np.std(intervals) / (np.mean(intervals) + 1e-10)

            # TTS has very regular intervals
            if interval_cv < 0.3:
                return 0.75, "tts_regular_timing"
            elif interval_cv < 0.5:
                return 0.55, None
    except Exception:
        # Best-effort: onset detection failed, fall through to baseline.
        pass

    # Neither heuristic fired: slightly-above-floor baseline score.
    return 0.3, None
|
| 432 |
+
|
| 433 |
+
def _check_spectral_smoothness(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for unnaturally smooth spectrum (common in TTS and compressed playback).

    Measures mel-spectrogram flux (mean absolute frame-to-frame change in
    dB); the lower the flux, the smoother — and more suspicious — the audio.

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).
    """
    import librosa

    # Mel spectrogram in dB, referenced to its own peak.
    power = librosa.feature.melspectrogram(y=waveform, sr=self.sample_rate, n_mels=128)
    db = librosa.power_to_db(power, ref=np.max)

    # Spectral flux: average frame-to-frame variation across mel bands.
    spectral_flux = np.mean(np.abs(np.diff(db, axis=1)))

    # Walk the threshold ladder from smoothest (most suspicious) down.
    ladder = (
        (3.0, 0.8, "smooth_spectrum"),
        (5.0, 0.6, "slightly_smooth_spectrum"),
        (8.0, 0.4, None),
    )
    for upper_bound, score, indicator in ladder:
        if spectral_flux < upper_bound:
            return score, indicator

    return 0.2, None
|
| 455 |
+
|
| 456 |
+
def _check_room_acoustics(self, waveform: np.ndarray) -> Tuple[float, str]:
    """
    Check for room acoustics characteristics from speaker playback.
    Audio played through speakers picks up room reverb.

    Accumulates a score from a 0.3 baseline: +0.3 for low spectral
    bandwidth variance, and up to +0.3 for a low spectral rolloff mean
    (severe rolloff returns immediately with its own indicator).

    Returns:
        Tuple of (suspicion score in [0, 1], indicator label or None).
    """
    import librosa

    # Calculate spectral bandwidth variation
    bandwidth = librosa.feature.spectral_bandwidth(y=waveform, sr=self.sample_rate)
    bandwidth_var = np.var(bandwidth)

    # Calculate spectral rolloff
    rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=self.sample_rate)
    rolloff_mean = np.mean(rolloff)

    # Speaker playback tends to have limited bandwidth and lower rolloff
    score = 0.3

    # Low bandwidth variation suggests processed audio
    if bandwidth_var < 50000:
        score += 0.3

    # Low rolloff frequency suggests speaker limitations
    if rolloff_mean < 3000:
        # Severe rolloff: short-circuit with a capped score and its own label.
        score += 0.3
        return min(score, 0.85), "limited_frequency_range"
    elif rolloff_mean < 4500:
        score += 0.15

    # Moderate combined evidence gets the generic room-acoustics label.
    if score > 0.5:
        return score, "room_acoustics_detected"

    return score, None
|
| 489 |
+
|
| 490 |
+
def detect_from_file(self, audio_path: str) -> PlaybackResult:
    """Load *audio_path* at the detector's sample rate and run detection on it."""
    import librosa

    samples, _sr = librosa.load(audio_path, sr=self.sample_rate)
    return self.detect(samples)
|
src/phase6_synthetic/wake_words.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Wake Word Detection - detect voice assistant usage.
|
| 3 |
+
"""
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
import re
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class WakeWordDetection:
    """A single wake-word hit found in a transcription."""
    word: str  # The wake word that matched, e.g. "alexa"
    assistant: str  # amazon, apple, google, microsoft
    time: float  # Start time (seconds) of the segment containing the match
    confidence: float  # Heuristic match confidence in [0, 1]
    context: str  # Surrounding text
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class TranscriptionSegment:
    """A segment of transcription."""
    start: float  # Segment start time in seconds
    end: float  # Segment end time in seconds
    text: str  # Transcribed text, whitespace-stripped
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class WakeWordDetector:
    """Detect voice assistant wake words using Whisper transcription."""

    # Known wake words per vendor, matched as substrings of the
    # lower-cased transcript (see _calculate_confidence for weighting).
    WAKE_WORDS = {
        "amazon": ["alexa", "echo", "amazon"],
        "apple": ["hey siri", "siri"],
        "google": ["ok google", "hey google", "google"],
        "microsoft": ["cortana", "hey cortana"]
    }

    # Patterns that suggest assistant response
    RESPONSE_PATTERNS = [
        r"here'?s what i found",
        r"according to",
        r"the answer is",
        r"i found this",
        r"let me search",
        r"searching for",
        r"playing .+ by",
        r"the weather (is|today|tomorrow)",
        r"it'?s currently",
        r"\d+ degrees"
    ]

    def __init__(self, model_size: str = "base"):
        """
        Initialize wake word detector.

        Args:
            model_size: Whisper model size (tiny, base, small, medium, large)
        """
        self.model_size = model_size
        self._model = None  # loaded lazily by the `model` property

    @property
    def model(self):
        """Lazy load Whisper model on first access and cache it."""
        if self._model is None:
            import whisper
            self._model = whisper.load_model(self.model_size)
        return self._model

    def transcribe(self, audio_path: str,
                   language: str = "en") -> "List[TranscriptionSegment]":
        """
        Transcribe audio file with Whisper.

        Args:
            audio_path: Path to audio file
            language: Language code

        Returns:
            List of transcription segments with start/end times in seconds.
        """
        result = self.model.transcribe(
            audio_path,
            language=language,
            word_timestamps=True
        )

        return [
            TranscriptionSegment(
                start=seg["start"],
                end=seg["end"],
                text=seg["text"].strip()
            )
            for seg in result.get("segments", [])
        ]

    def detect_wake_words(self, segments: "List[TranscriptionSegment]") -> "List[WakeWordDetection]":
        """
        Detect wake words in transcription.

        Args:
            segments: Transcription segments

        Returns:
            Deduplicated list of wake word detections, ordered by time.
        """
        detections = []

        # NOTE: substring matching means e.g. "echo" also matches "echoes";
        # _calculate_confidence down-weights such partial matches to 0.6.
        for segment in segments:
            text_lower = segment.text.lower()

            for assistant, words in self.WAKE_WORDS.items():
                for word in words:
                    if word in text_lower:
                        # Calculate confidence based on word clarity
                        confidence = self._calculate_confidence(word, text_lower)

                        detections.append(WakeWordDetection(
                            word=word,
                            assistant=assistant,
                            time=segment.start,
                            confidence=confidence,
                            # Keep surrounding text so a reviewer can judge
                            # whether this was a real assistant invocation.
                            context=self._get_context(segment, segments)
                        ))

        # Remove duplicates (same word at similar times)
        return self._deduplicate(detections)

    def detect_assistant_responses(self, segments: "List[TranscriptionSegment]") -> List[dict]:
        """
        Detect patterns that suggest assistant responses.

        Args:
            segments: Transcription segments

        Returns:
            List of dicts with 'time', 'end', 'pattern' and 'text' keys,
            at most one per segment.
        """
        responses = []

        for segment in segments:
            text_lower = segment.text.lower()

            for pattern in self.RESPONSE_PATTERNS:
                if re.search(pattern, text_lower):
                    responses.append({
                        'time': segment.start,
                        'end': segment.end,
                        'pattern': pattern,
                        'text': segment.text
                    })
                    break  # One match per segment

        return responses

    def _calculate_confidence(self, word: str, text: str) -> float:
        """Calculate detection confidence for `word` found inside `text`."""
        # Whole-word match (space-delimited) = highest confidence.
        if f" {word} " in f" {text} ":
            return 0.9

        # Word at start of the segment.
        if text.startswith(word):
            return 0.85

        # Substring inside a longer word: possible false positive.
        return 0.6

    def _get_context(self, segment: "TranscriptionSegment",
                     all_segments: "List[TranscriptionSegment]",
                     context_window: float = 5.0) -> str:
        """Concatenate text of all segments starting within `context_window` seconds."""
        context_parts = []

        for s in all_segments:
            if abs(s.start - segment.start) <= context_window:
                context_parts.append(s.text)

        return " ".join(context_parts)

    def _deduplicate(self, detections: "List[WakeWordDetection]",
                     time_threshold: float = 2.0) -> "List[WakeWordDetection]":
        """Collapse same-word detections closer than `time_threshold` seconds,
        keeping the higher-confidence one. Output is sorted by time."""
        if not detections:
            return []

        detections = sorted(detections, key=lambda d: d.time)

        unique = [detections[0]]

        for detection in detections[1:]:
            last = unique[-1]

            if (detection.word == last.word and
                    abs(detection.time - last.time) < time_threshold):
                # Duplicate: keep whichever has higher confidence.
                if detection.confidence > last.confidence:
                    unique[-1] = detection
            else:
                unique.append(detection)

        return unique

    def analyze(self, audio_path: str) -> dict:
        """
        Full wake word analysis: transcribe, then scan for wake words
        and assistant-response phrasings.

        Returns:
            Dict with 'transcription', 'wake_words' and 'assistant_responses'.
            On any failure the three lists are returned empty (best-effort).
        """
        try:
            segments = self.transcribe(audio_path)
            wake_words = self.detect_wake_words(segments)
            responses = self.detect_assistant_responses(segments)

            return {
                'transcription': segments,
                'wake_words': wake_words,
                'assistant_responses': responses
            }
        except Exception as e:
            # Return empty results on error
            print(f"Wake word detection error: {e}")
            return {
                'transcription': [],
                'wake_words': [],
                'assistant_responses': []
            }
|
src/ui/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# UI components placeholder
|