Yermia committed on
Commit fda93d9 · verified · 1 Parent(s): bc1c0d3

Upload 13 files

src/__init__.py ADDED
@@ -0,0 +1,69 @@
1
+ """
2
+ Meeting Transcriber - Automatic Meeting Minutes System
3
+ =====================================================
4
+
5
+ An end-to-end system for converting recorded meeting audio into
6
+ structured meeting-minutes documents using SpeechBrain and BERT.
7
+
8
+ Modules:
9
+ - config: System configuration
10
+ - audio_processor: Audio preprocessing
11
+ - diarization: Speaker diarization
12
+ - transcriber: ASR transcription
13
+ - summarizer: BERT summarization
14
+ - document_generator: Export to .docx
15
+ - evaluator: Evaluation metrics (WER, DER)
16
+ - pipeline: Main orchestrator
17
+ - utils: Utility functions
18
+
19
+ Example:
20
+ >>> from src.pipeline import MeetingTranscriberPipeline
21
+ >>> pipeline = MeetingTranscriberPipeline()
22
+ >>> result = pipeline.process("meeting.wav", title="Team Meeting")
23
+ >>> print(result.document_path)
24
+ """
25
+
26
+ __version__ = "1.0.0"
27
+ __author__ = "Yermia Turangan"
28
+ __email__ = "yermiaturangan026@student.unsrat.ac.id"
29
+
30
+ from src.audio_processor import AudioConfig, AudioProcessor
31
+ from src.config import Config, load_config
32
+ from src.diarization import DiarizationConfig, SpeakerDiarizer, SpeakerSegment
33
+ from src.document_generator import DocumentGenerator, MeetingMetadata
34
+ from src.evaluator import DERResult, Evaluator, WERResult
35
+ from src.pipeline import MeetingTranscriberPipeline, PipelineConfig, PipelineResult
36
+ from src.summarizer import BERTSummarizer, MeetingSummary, SummarizationConfig
37
+ from src.transcriber import ASRConfig, ASRTranscriber, TranscriptSegment
38
+
39
+ __all__ = [
40
+ # Config
41
+ "Config",
42
+ "load_config",
43
+ # Audio
44
+ "AudioProcessor",
45
+ "AudioConfig",
46
+ # Diarization
47
+ "SpeakerDiarizer",
48
+ "DiarizationConfig",
49
+ "SpeakerSegment",
50
+ # ASR
51
+ "ASRTranscriber",
52
+ "ASRConfig",
53
+ "TranscriptSegment",
54
+ # Summarization
55
+ "BERTSummarizer",
56
+ "SummarizationConfig",
57
+ "MeetingSummary",
58
+ # Document
59
+ "DocumentGenerator",
60
+ "MeetingMetadata",
61
+ # Evaluation
62
+ "Evaluator",
63
+ "WERResult",
64
+ "DERResult",
65
+ # Pipeline
66
+ "MeetingTranscriberPipeline",
67
+ "PipelineConfig",
68
+ "PipelineResult",
69
+ ]
src/audio_processor.py ADDED
@@ -0,0 +1,398 @@
1
+ """
2
+ Audio Processor Module
3
+ ======================
4
+ Handles audio loading, preprocessing, and segmentation.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import List, Optional, Tuple, Union
12
+
13
+ import numpy as np
14
+ import torch
15
+ import torchaudio
16
+ from torchaudio.transforms import Resample
17
+
18
+ try:
19
+ import librosa
20
+
21
+ LIBROSA_AVAILABLE = True
22
+ except ImportError:
23
+ LIBROSA_AVAILABLE = False
24
+
25
+
26
+ @dataclass
27
+ class AudioConfig:
28
+ """Configuration for audio processing"""
29
+
30
+ sample_rate: int = 16000
31
+ mono: bool = True
32
+ normalize: bool = True
33
+ trim_silence: bool = False
34
+ silence_threshold_db: float = -40.0
35
+ max_duration_seconds: Optional[float] = None
36
+
37
+
38
+ @dataclass
39
+ class AudioInfo:
40
+ """Information about loaded audio"""
41
+
42
+ path: str
43
+ duration_seconds: float
44
+ sample_rate: int
45
+ num_channels: int
46
+ num_samples: int
47
+
48
+
49
+ class AudioProcessor:
50
+ """
51
+ Handles all audio preprocessing operations.
52
+ Converts input audio to standardized format for downstream processing.
53
+
54
+ Attributes:
55
+ config: AudioConfig object with processing settings
56
+
57
+ Example:
58
+ >>> processor = AudioProcessor()
59
+ >>> waveform, sr = processor.load_audio("meeting.wav")
60
+ >>> print(f"Duration: {processor.get_duration(waveform, sr):.2f}s")
61
+ """
62
+
63
+ SUPPORTED_FORMATS = {".wav", ".mp3", ".flac", ".ogg", ".m4a", ".wma", ".aac"}
64
+
65
+ def __init__(self, config: Optional[AudioConfig] = None):
66
+ """
67
+ Initialize AudioProcessor.
68
+
69
+ Args:
70
+ config: AudioConfig object (uses defaults if None)
71
+ """
72
+ self.config = config or AudioConfig()
73
+ self._resampler_cache: dict = {}
74
+
75
+ def load_audio(
76
+ self,
77
+ audio_path: Union[str, Path],
78
+ start_time: Optional[float] = None,
79
+ end_time: Optional[float] = None,
80
+ ) -> Tuple[torch.Tensor, int]:
81
+ """
82
+ Load and preprocess audio file.
83
+
84
+ Args:
85
+ audio_path: Path to audio file
86
+ start_time: Start time in seconds (optional)
87
+ end_time: End time in seconds (optional)
88
+
89
+ Returns:
90
+ Tuple of (waveform tensor [1, T], sample_rate)
91
+
92
+ Raises:
93
+ FileNotFoundError: If audio file doesn't exist
94
+ ValueError: If audio format is not supported
95
+ """
96
+ audio_path = Path(audio_path)
97
+
98
+ # Validate file exists
99
+ if not audio_path.exists():
100
+ raise FileNotFoundError(f"Audio file not found: {audio_path}")
101
+
102
+ # Validate format
103
+ if audio_path.suffix.lower() not in self.SUPPORTED_FORMATS:
104
+ raise ValueError(
105
+ f"Unsupported audio format: {audio_path.suffix}. "
106
+ f"Supported formats: {self.SUPPORTED_FORMATS}"
107
+ )
108
+
109
+ # Load audio
110
+ try:
111
+ waveform, orig_sr = torchaudio.load(str(audio_path))
112
+ except Exception as e:
113
+ # Fallback to librosa if torchaudio fails
114
+ if LIBROSA_AVAILABLE:
115
+ try:
116
+ audio_np, orig_sr = librosa.load(str(audio_path), sr=None, mono=False)
117
+ if audio_np.ndim == 1:
118
+ audio_np = audio_np[np.newaxis, :]
119
+ waveform = torch.from_numpy(audio_np).float()
120
+ except Exception:
121
+ # Try pydub (requires ffmpeg) as a robust fallback
122
+ try:
123
+ from pydub import AudioSegment
124
+
125
+ seg = AudioSegment.from_file(str(audio_path))
126
+ orig_sr = seg.frame_rate
127
+ samples = np.array(seg.get_array_of_samples())
128
+
129
+ if seg.channels > 1:
130
+ samples = samples.reshape((-1, seg.channels)).T
131
+ else:
132
+ samples = samples[np.newaxis, :]
133
+
134
+ # Normalize based on sample width
135
+ max_val = float(1 << (8 * seg.sample_width - 1))
136
+ audio_np = samples.astype(np.float32) / max_val
137
+ waveform = torch.from_numpy(audio_np).float()
138
+ except Exception:
139
+ # Try ffmpeg CLI (system binary) to decode to WAV in-memory (no extra Python packages required)
140
+ try:
141
+ import io
142
+ import subprocess
143
+
144
+ import soundfile as sf
145
+
146
+ proc = subprocess.run(
147
+ [
148
+ "ffmpeg",
149
+ "-i",
150
+ str(audio_path),
151
+ "-f",
152
+ "wav",
153
+ "-ar",
154
+ "16000",
155
+ "-ac",
156
+ "1",
157
+ "pipe:1",
158
+ ],
159
+ stdout=subprocess.PIPE,
160
+ stderr=subprocess.DEVNULL,
161
+ check=True,
162
+ )
163
+ out = proc.stdout
164
+
165
+ audio_np, orig_sr = sf.read(io.BytesIO(out), dtype="float32")
166
+ if audio_np.ndim == 1:
167
+ audio_np = audio_np[np.newaxis, :]
168
+ else:
169
+ audio_np = audio_np.T
170
+ waveform = torch.from_numpy(audio_np).float()
171
+ except Exception:
172
+ # Last resort: use ffmpeg-python to decode into WAV bytes and read via soundfile
173
+ try:
174
+ import io
175
+
176
+ import ffmpeg
177
+ import soundfile as sf
178
+
179
+ out, _ = (
180
+ ffmpeg.input(str(audio_path))
181
+ .output("pipe:", format="wav", acodec="pcm_s16le")
182
+ .run(capture_stdout=True, capture_stderr=True)
183
+ )
184
+
185
+ audio_np, orig_sr = sf.read(io.BytesIO(out), dtype="float32")
186
+ if audio_np.ndim == 1:
187
+ audio_np = audio_np[np.newaxis, :]
188
+ else:
189
+ audio_np = audio_np.T
190
+ waveform = torch.from_numpy(audio_np).float()
191
+ except Exception:
192
+ raise RuntimeError(
193
+ "Unsupported file format or decoding backend (ffmpeg) not available. "
194
+ "Please install ffmpeg (and make sure it is on PATH), or use a supported format such as WAV/MP3."
195
+ )
196
+ else:
197
+ raise RuntimeError(f"Failed to load audio: {e}")
198
+
199
+ # Trim to time range if specified
200
+ if start_time is not None or end_time is not None:
201
+ waveform = self._trim_to_range(waveform, orig_sr, start_time, end_time)
202
+
203
+ # Convert to mono if needed
204
+ if self.config.mono and waveform.shape[0] > 1:
205
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
206
+
207
+ # Resample if needed
208
+ if orig_sr != self.config.sample_rate:
209
+ waveform = self._resample(waveform, orig_sr, self.config.sample_rate)
210
+
211
+ # Normalize amplitude
212
+ if self.config.normalize:
213
+ waveform = self._normalize(waveform)
214
+
215
+ # Trim silence if requested
216
+ if self.config.trim_silence:
217
+ waveform = self._trim_silence(waveform)
218
+
219
+ # Enforce max duration
220
+ if self.config.max_duration_seconds:
221
+ max_samples = int(self.config.max_duration_seconds * self.config.sample_rate)
222
+ if waveform.shape[-1] > max_samples:
223
+ waveform = waveform[:, :max_samples]
224
+
225
+ return waveform, self.config.sample_rate
226
+
227
+ def get_audio_info(self, audio_path: Union[str, Path]) -> AudioInfo:
228
+ """
229
+ Get information about audio file without loading full waveform.
230
+
231
+ Args:
232
+ audio_path: Path to audio file
233
+
234
+ Returns:
235
+ AudioInfo object with file details
236
+ """
237
+ audio_path = Path(audio_path)
238
+
239
+ if not audio_path.exists():
240
+ raise FileNotFoundError(f"Audio file not found: {audio_path}")
241
+
242
+ info = torchaudio.info(str(audio_path))
243
+
244
+ return AudioInfo(
245
+ path=str(audio_path),
246
+ duration_seconds=info.num_frames / info.sample_rate,
247
+ sample_rate=info.sample_rate,
248
+ num_channels=info.num_channels,
249
+ num_samples=info.num_frames,
250
+ )
251
+
252
+ def _trim_to_range(
253
+ self,
254
+ waveform: torch.Tensor,
255
+ sample_rate: int,
256
+ start_time: Optional[float],
257
+ end_time: Optional[float],
258
+ ) -> torch.Tensor:
259
+ """Trim waveform to specified time range"""
260
+ start_sample = int((start_time or 0) * sample_rate)
261
+ end_sample = int((end_time or waveform.shape[-1] / sample_rate) * sample_rate)
262
+
263
+ start_sample = max(0, start_sample)
264
+ end_sample = min(waveform.shape[-1], end_sample)
265
+
266
+ return waveform[:, start_sample:end_sample]
267
+
268
+ def _resample(self, waveform: torch.Tensor, orig_sr: int, target_sr: int) -> torch.Tensor:
269
+ """Resample audio to target sample rate with caching"""
270
+ cache_key = (orig_sr, target_sr)
271
+
272
+ if cache_key not in self._resampler_cache:
273
+ self._resampler_cache[cache_key] = Resample(orig_freq=orig_sr, new_freq=target_sr)
274
+
275
+ return self._resampler_cache[cache_key](waveform)
276
+
277
+ def _normalize(self, waveform: torch.Tensor) -> torch.Tensor:
278
+ """Normalize waveform to [-1, 1] range"""
279
+ max_val = torch.max(torch.abs(waveform))
280
+ if max_val > 0:
281
+ waveform = waveform / max_val
282
+ return waveform
283
+
284
+ def _trim_silence(self, waveform: torch.Tensor) -> torch.Tensor:
285
+ """Remove leading and trailing silence"""
286
+ # Convert threshold from dB to amplitude
287
+ threshold = 10 ** (self.config.silence_threshold_db / 20)
288
+
289
+ # Find non-silent regions
290
+ amplitude = torch.abs(waveform).squeeze()
291
+ non_silent = amplitude > threshold
292
+
293
+ if not non_silent.any():
294
+ return waveform
295
+
296
+ # Find first and last non-silent sample
297
+ non_silent_indices = torch.where(non_silent)[0]
298
+ start_idx = non_silent_indices[0].item()
299
+ end_idx = non_silent_indices[-1].item() + 1
300
+
301
+ return waveform[:, start_idx:end_idx]
302
+
303
+ def get_duration(self, waveform: torch.Tensor, sample_rate: int) -> float:
304
+ """Get duration of waveform in seconds"""
305
+ return waveform.shape[-1] / sample_rate
306
+
307
+ def cut_segment(
308
+ self, waveform: torch.Tensor, start_sec: float, end_sec: float, sample_rate: int
309
+ ) -> torch.Tensor:
310
+ """
311
+ Extract a segment from waveform.
312
+
313
+ Args:
314
+ waveform: Input waveform [C, T]
315
+ start_sec: Start time in seconds
316
+ end_sec: End time in seconds
317
+ sample_rate: Sample rate of waveform
318
+
319
+ Returns:
320
+ Segment waveform [C, t]
321
+ """
322
+ start_sample = int(max(0, start_sec) * sample_rate)
323
+ end_sample = int(min(end_sec * sample_rate, waveform.shape[-1]))
324
+
325
+ return waveform[:, start_sample:end_sample]
326
+
327
+ def split_into_chunks(
328
+ self,
329
+ waveform: torch.Tensor,
330
+ chunk_duration: float,
331
+ overlap: float = 0.0,
332
+ sample_rate: Optional[int] = None,
333
+ ) -> List[Tuple[torch.Tensor, float, float]]:
334
+ """
335
+ Split waveform into overlapping chunks.
336
+
337
+ Args:
338
+ waveform: Input waveform
339
+ chunk_duration: Duration of each chunk in seconds
340
+ overlap: Overlap between chunks in seconds
341
+ sample_rate: Sample rate (uses config if None)
342
+
343
+ Returns:
344
+ List of (chunk_waveform, start_sec, end_sec)
345
+ """
346
+ sample_rate = sample_rate or self.config.sample_rate
+ if overlap >= chunk_duration:
+ raise ValueError("overlap must be smaller than chunk_duration")
347
+ total_duration = self.get_duration(waveform, sample_rate)
348
+
349
+ chunks = []
350
+ start = 0.0
351
+
352
+ while start < total_duration:
353
+ end = min(start + chunk_duration, total_duration)
354
+ chunk = self.cut_segment(waveform, start, end, sample_rate)
355
+ chunks.append((chunk, start, end))
356
+ start += chunk_duration - overlap
357
+
358
+ return chunks
359
+
360
+ def add_noise(
361
+ self, waveform: torch.Tensor, noise_level: float = 0.01, noise_type: str = "gaussian"
362
+ ) -> torch.Tensor:
363
+ """
364
+ Add noise to waveform (for data augmentation).
365
+
366
+ Args:
367
+ waveform: Input waveform
368
+ noise_level: Noise amplitude (0-1)
369
+ noise_type: Type of noise ("gaussian", "uniform")
370
+
371
+ Returns:
372
+ Waveform with added noise
373
+ """
374
+ if noise_type == "gaussian":
375
+ noise = torch.randn_like(waveform) * noise_level
376
+ elif noise_type == "uniform":
377
+ noise = (torch.rand_like(waveform) * 2 - 1) * noise_level
378
+ else:
379
+ raise ValueError(f"Unknown noise type: {noise_type}")
380
+
381
+ return waveform + noise
382
+
383
+ def save_audio(
384
+ self,
385
+ waveform: torch.Tensor,
386
+ output_path: Union[str, Path],
387
+ sample_rate: Optional[int] = None,
388
+ ):
389
+ """
390
+ Save waveform to audio file.
391
+
392
+ Args:
393
+ waveform: Waveform to save
394
+ output_path: Output file path
395
+ sample_rate: Sample rate (uses config if None)
396
+ """
397
+ sample_rate = sample_rate or self.config.sample_rate
398
+ torchaudio.save(str(output_path), waveform, sample_rate)
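
For reference, a minimal usage sketch of the AudioProcessor API defined above; the file name meeting.wav and the 30 s / 5 s chunking values are illustrative assumptions, not part of this upload:

from src.audio_processor import AudioConfig, AudioProcessor

# Load and standardize a recording (16 kHz mono, peak-normalized), matching the defaults above.
processor = AudioProcessor(AudioConfig(sample_rate=16000, mono=True, normalize=True))
waveform, sr = processor.load_audio("meeting.wav")  # hypothetical input file
print(f"Duration: {processor.get_duration(waveform, sr):.2f}s")

# Split into 30-second chunks with 5 seconds of overlap, e.g. for chunked ASR.
for chunk, start_sec, end_sec in processor.split_into_chunks(
    waveform, chunk_duration=30.0, overlap=5.0, sample_rate=sr
):
    print(f"chunk {start_sec:.1f}-{end_sec:.1f}s -> {tuple(chunk.shape)}")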
src/config.py ADDED
@@ -0,0 +1,271 @@
1
+ """
2
+ Configuration Module
3
+ ====================
4
+ Handles loading and managing configuration for the entire system.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ from dataclasses import dataclass, field
11
+ from typing import List, Optional
12
+
13
+ import yaml
14
+
15
+
16
+ @dataclass
17
+ class VADConfig:
18
+ """Voice Activity Detection configuration"""
19
+
20
+ threshold: float = 0.5
21
+ min_speech_duration: float = 0.3
22
+ min_silence_duration: float = 0.3
23
+ speech_pad_ms: int = 30
24
+
25
+
26
+ @dataclass
27
+ class SegmentationConfig:
28
+ """Segmentation configuration"""
29
+
30
+ window_duration: float = 1.5
31
+ window_hop: float = 0.75
32
+ min_segment_duration: float = 0.5
33
+
34
+
35
+ @dataclass
36
+ class EmbeddingConfig:
37
+ """Speaker embedding configuration"""
38
+
39
+ model_id: str = "speechbrain/spkrec-ecapa-voxceleb"
40
+ embedding_dim: int = 192
41
+
42
+
43
+ @dataclass
44
+ class ClusteringConfig:
45
+ """Clustering configuration"""
46
+
47
+ method: str = "agglomerative"
48
+ threshold: float = 0.7
49
+ min_cluster_size: int = 2
50
+ linkage: str = "average"
51
+
52
+
53
+ @dataclass
54
+ class AudioConfig:
55
+ """Audio processing configuration"""
56
+
57
+ sample_rate: int = 16000
58
+ mono: bool = True
59
+ normalize: bool = True
60
+ trim_silence: bool = False
61
+ max_duration_minutes: int = 60
62
+
63
+
64
+ @dataclass
65
+ class DiarizationConfig:
66
+ """Speaker diarization configuration"""
67
+
68
+ vad: VADConfig = field(default_factory=VADConfig)
69
+ segmentation: SegmentationConfig = field(default_factory=SegmentationConfig)
70
+ embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
71
+ clustering: ClusteringConfig = field(default_factory=ClusteringConfig)
72
+ merge_gap_threshold: float = 0.5
73
+ min_segment_duration: float = 0.3
74
+ smooth_segments: bool = True
75
+
76
+ # Embedding and collapse options
77
+ use_speechbrain: bool = True
78
+ allow_fallback: bool = False
79
+ collapse_threshold: float = 0.15
80
+ silhouette_collapse_threshold: float = 0.05
81
+
82
+
83
+ @dataclass
84
+ class ASRConfig:
85
+ """ASR configuration"""
86
+
87
+ model_id: str = "indonesian-nlp/wav2vec2-large-xlsr-indonesian"
88
+ chunk_length_s: float = 30.0
89
+ stride_length_s: float = 5.0
90
+ batch_size: int = 4
91
+ return_timestamps: Optional[str] = None
92
+ # Valid values: None (no timestamps), or 'char' / 'word' for CTC timestamp modes
93
+ capitalize_sentences: bool = True
94
+ normalize_whitespace: bool = True
95
+
96
+
97
+ @dataclass
98
+ class SummarizationConfig:
99
+ """Summarization configuration"""
100
+
101
+ model_id: str = "indobenchmark/indobert-base-p1"
102
+ sentence_model_id: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
103
+ num_sentences: int = 5
104
+ min_sentence_length: int = 10
105
+ max_sentence_length: int = 200
106
+ position_weight: float = 0.1
107
+ decision_keywords: List[str] = field(
108
+ default_factory=lambda: [
109
+ "diputuskan",
110
+ "disepakati",
111
+ "kesimpulan",
112
+ "keputusan",
113
+ "jadi",
114
+ "maka",
115
+ "sepakat",
116
+ "setuju",
117
+ "final",
118
+ ]
119
+ )
120
+ action_keywords: List[str] = field(
121
+ default_factory=lambda: [
122
+ "akan",
123
+ "harus",
124
+ "perlu",
125
+ "tolong",
126
+ "mohon",
127
+ "deadline",
128
+ "target",
129
+ "tugas",
130
+ "tanggung jawab",
131
+ "action item",
132
+ "follow up",
133
+ "tindak lanjut",
134
+ ]
135
+ )
136
+
137
+
138
+ @dataclass
139
+ class DocumentConfig:
140
+ """Document generation configuration"""
141
+
142
+ template: str = "default"
143
+ title_font_size: int = 18
144
+ heading_font_size: int = 14
145
+ body_font_size: int = 11
146
+ font_family: str = "Calibri"
147
+ include_timestamps: bool = True
148
+ include_speaker_colors: bool = True
149
+
150
+
151
+ @dataclass
152
+ class EvaluationConfig:
153
+ """Evaluation configuration"""
154
+
155
+ wer_lowercase: bool = True
156
+ wer_remove_punctuation: bool = True
157
+ der_collar: float = 0.25
158
+ der_skip_overlap: bool = False
159
+
160
+
161
+ @dataclass
162
+ class PathsConfig:
163
+ """Paths configuration"""
164
+
165
+ models_dir: str = "./models"
166
+ audio_dir: str = "./data/audio"
167
+ ground_truth_dir: str = "./data/ground_truth"
168
+ output_dir: str = "./data/output"
169
+ cache_dir: str = "./cache"
170
+ logs_dir: str = "./logs"
171
+
172
+
173
+ @dataclass
174
+ class Config:
175
+ """Main configuration class"""
176
+
177
+ audio: AudioConfig = field(default_factory=AudioConfig)
178
+ diarization: DiarizationConfig = field(default_factory=DiarizationConfig)
179
+ asr: ASRConfig = field(default_factory=ASRConfig)
180
+ summarization: SummarizationConfig = field(default_factory=SummarizationConfig)
181
+ document: DocumentConfig = field(default_factory=DocumentConfig)
182
+ evaluation: EvaluationConfig = field(default_factory=EvaluationConfig)
183
+ paths: PathsConfig = field(default_factory=PathsConfig)
184
+ device: str = "auto"
185
+ verbose: bool = True
186
+
187
+ def __post_init__(self):
188
+ """Create directories if they don't exist"""
189
+ for path_attr in [
190
+ "models_dir",
191
+ "audio_dir",
192
+ "ground_truth_dir",
193
+ "output_dir",
194
+ "cache_dir",
195
+ "logs_dir",
196
+ ]:
197
+ path = getattr(self.paths, path_attr)
198
+ os.makedirs(path, exist_ok=True)
199
+
200
+
201
+ def load_config(config_path: str = "config.yaml") -> Config:
202
+ """
203
+ Load configuration from YAML file.
204
+
205
+ Args:
206
+ config_path: Path to config.yaml file
207
+
208
+ Returns:
209
+ Config object with loaded settings
210
+ """
211
+ config = Config()
212
+
213
+ if os.path.exists(config_path):
214
+ with open(config_path, "r", encoding="utf-8") as f:
215
+ yaml_config = yaml.safe_load(f)
216
+
217
+ if yaml_config:
218
+ # Update audio config
219
+ if "audio" in yaml_config:
220
+ for key, value in yaml_config["audio"].items():
221
+ if hasattr(config.audio, key):
222
+ setattr(config.audio, key, value)
223
+
224
+ # Update ASR config
225
+ if "asr" in yaml_config:
226
+ for key, value in yaml_config["asr"].items():
227
+ if hasattr(config.asr, key):
228
+ setattr(config.asr, key, value)
229
+
230
+ # Update summarization config
231
+ if "summarization" in yaml_config:
232
+ for key, value in yaml_config["summarization"].items():
233
+ if hasattr(config.summarization, key):
234
+ setattr(config.summarization, key, value)
235
+
236
+ # Update paths config
237
+ if "paths" in yaml_config:
238
+ for key, value in yaml_config["paths"].items():
239
+ if hasattr(config.paths, key):
240
+ setattr(config.paths, key, value)
241
+
242
+ # Update device
243
+ if "hardware" in yaml_config and "device" in yaml_config["hardware"]:
244
+ config.device = yaml_config["hardware"]["device"]
245
+
246
+ return config
247
+
248
+
249
+ def save_config(config: Config, config_path: str = "config.yaml"):
250
+ """
251
+ Save configuration to YAML file.
252
+
253
+ Args:
254
+ config: Config object to save
255
+ config_path: Path to save config.yaml
256
+ """
257
+ # Convert dataclass to dict
258
+ config_dict = {
259
+ "audio": config.audio.__dict__,
260
+ "asr": config.asr.__dict__,
261
+ "summarization": {
262
+ k: v for k, v in config.summarization.__dict__.items() if not k.endswith("_keywords")
263
+ },
264
+ "document": config.document.__dict__,
265
+ "evaluation": config.evaluation.__dict__,
266
+ "paths": config.paths.__dict__,
267
+ "hardware": {"device": config.device},
268
+ }
269
+
270
+ with open(config_path, "w", encoding="utf-8") as f:
271
+ yaml.dump(config_dict, f, default_flow_style=False, allow_unicode=True)
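
A rough sketch of how load_config consumes a YAML file; it only applies the audio, asr, summarization, paths and hardware.device sections handled above, and the concrete values here are illustrative assumptions:

import yaml

from src.config import load_config

# Write a minimal config.yaml covering only the sections the loader reads.
example = {
    "audio": {"sample_rate": 16000, "trim_silence": True},
    "asr": {"chunk_length_s": 20.0, "batch_size": 2},
    "summarization": {"num_sentences": 7},
    "paths": {"output_dir": "./data/output"},
    "hardware": {"device": "cpu"},
}
with open("config.yaml", "w", encoding="utf-8") as f:
    yaml.dump(example, f, default_flow_style=False, allow_unicode=True)

config = load_config("config.yaml")
print(config.asr.batch_size, config.summarization.num_sentences, config.device)  # -> 2 7 cpu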
src/diarization.py ADDED
@@ -0,0 +1,1504 @@
1
+ """
2
+ Speaker Diarization Module
3
+ ==========================
4
+ Implements VAD + Speaker Embedding + Clustering pipeline for speaker diarization.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import Any, Dict, List, Optional, Tuple
13
+
14
+ import numpy as np
15
+ import torch
16
+ from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering
17
+ from sklearn.metrics import silhouette_score
18
+ from sklearn.preprocessing import StandardScaler
19
+
20
+ from src.utils import setup_logger
21
+
22
+
23
+ @dataclass
24
+ class DiarizationConfig:
25
+ """Configuration for speaker diarization"""
26
+
27
+ # VAD settings
28
+ vad_threshold: float = 0.5
29
+ min_speech_duration: float = 0.3
30
+ min_silence_duration: float = 0.3
31
+
32
+ # Segmentation settings
33
+ segment_window: float = 1.5
34
+ segment_hop: float = 0.75
35
+
36
+ # Clustering settings
37
+ clustering_method: str = "agglomerative"
38
+ clustering_threshold: float = 0.7
39
+ min_cluster_size: int = 2
40
+ max_speakers: Optional[int] = None
41
+
42
+ # Post-processing
43
+ merge_gap_threshold: float = 0.5
44
+ min_segment_duration: float = 0.3
45
+
46
+ # Model settings
47
+ embedding_model_id: str = "speechbrain/spkrec-ecapa-voxceleb"
48
+ use_speechbrain: bool = True # prefer SpeechBrain embeddings
49
+ allow_fallback: bool = False # if False, raise an error when SpeechBrain cannot be loaded
50
+
51
+ # Collapse heuristics
52
+ collapse_threshold: float = 0.15
53
+ # When negative, do not automatically collapse clusters to a single speaker based on silhouette.
54
+ silhouette_collapse_threshold: float = -1.0
55
+
56
+ # Iterative merging (centroid-based)
57
+ iterative_merge_threshold: float = 0.15
58
+ iterative_merge_silhouette_threshold: float = 0.0
59
+ iterative_merge_max_iters: int = 10
60
+
61
+ # Performance tuning
62
+ embedding_batch_size: int = 32
63
+ embedding_cache: bool = True # write/load embedding arrays to cache_dir
64
+ use_fast_embedding: bool = False # use MFCC deterministic embeddings for speed
65
+
66
+ # Optional: target speaker count - if set, clusters will be greedily merged to meet target
67
+ target_num_speakers: Optional[int] = None
68
+ target_force_threshold: float = (
69
+ 1.0 # 1.0 => allow merges regardless of distance; lower = more conservative
70
+ )
71
+
72
+ # Device
73
+ device: str = "cuda" if torch.cuda.is_available() else "cpu"
74
+
75
+
76
+ @dataclass
77
+ class SpeakerSegment:
78
+ """Represents a speaker segment with timing and metadata"""
79
+
80
+ speaker_id: str
81
+ start: float
82
+ end: float
83
+ confidence: float = 1.0
84
+ is_overlap: bool = False
85
+ embedding: Optional[np.ndarray] = None
86
+ metadata: Dict[str, Any] = field(default_factory=dict)
87
+
88
+ @property
89
+ def duration(self) -> float:
90
+ """Get segment duration in seconds"""
91
+ return self.end - self.start
92
+
93
+ def to_dict(self) -> Dict[str, Any]:
94
+ """Convert to dictionary"""
95
+ return {
96
+ "speaker_id": self.speaker_id,
97
+ "start": self.start,
98
+ "end": self.end,
99
+ "confidence": self.confidence,
100
+ "is_overlap": self.is_overlap,
101
+ "duration": self.duration,
102
+ }
103
+
104
+
105
+ class SpeakerDiarizer:
106
+ """
107
+ Speaker Diarization using SpeechBrain ECAPA-TDNN embeddings.
108
+
109
+ Pipeline:
110
+ 1. Voice Activity Detection (VAD)
111
+ 2. Audio segmentation into windows
112
+ 3. Speaker embedding extraction (ECAPA-TDNN)
113
+ 4. Clustering to assign speaker labels
114
+ 5. Post-processing (merging, smoothing)
115
+
116
+ Attributes:
117
+ config: DiarizationConfig object
118
+
119
+ Example:
120
+ >>> diarizer = SpeakerDiarizer()
121
+ >>> segments = diarizer.process(waveform, sample_rate=16000, num_speakers=4)
122
+ >>> for seg in segments:
123
+ ... print(f"{seg.speaker_id}: {seg.start:.2f}s - {seg.end:.2f}s")
124
+ """
125
+
126
+ def __init__(self, config: Optional[DiarizationConfig] = None, models_dir: str = "./models"):
127
+ """
128
+ Initialize SpeakerDiarizer.
129
+
130
+ Args:
131
+ config: DiarizationConfig object
132
+ models_dir: Directory to cache downloaded models
133
+ """
134
+ self.config = config or DiarizationConfig()
135
+ self.models_dir = Path(models_dir)
136
+ self.models_dir.mkdir(parents=True, exist_ok=True)
137
+
138
+ self.device = self.config.device
139
+
140
+ # Setup logger
141
+ self.logger = setup_logger("SpeakerDiarizer")
142
+
143
+ # Model placeholders (lazy loading)
144
+ self._embedding_model = None
145
+ self._vad_model = None
146
+ self._embedding_model_is_speechbrain = False
147
+
148
+ def _load_embedding_model(self):
149
+ """Lazy load speaker embedding model
150
+
151
+ This function will attempt to patch missing torchaudio APIs (e.g., list_audio_backends)
152
+ so that SpeechBrain imports cleanly on environments with older torchaudio builds.
153
+ """
154
+ if self._embedding_model is None:
155
+ # Shim torchaudio compatibility if needed (some torchaudio versions lack list_audio_backends)
156
+ try:
157
+ import importlib.util  # importlib.util is not guaranteed to be available via a bare "import importlib"
158
+
159
+ if importlib.util.find_spec("torchaudio"):
160
+ import torchaudio
161
+
162
+ if not hasattr(torchaudio, "list_audio_backends"):
163
+
164
+ def _list_audio_backends():
165
+ # best-effort guess of available backends; not exhaustive
166
+ backends = []
167
+ try:
168
+ # prefer sox_io and soundfile as common options
169
+ backends.append("sox_io")
170
+ except Exception:
171
+ pass
172
+ try:
173
+ backends.append("soundfile")
174
+ except Exception:
175
+ pass
176
+ if not backends:
177
+ backends = ["sox_io"]
178
+ return backends
179
+
180
+ torchaudio.list_audio_backends = _list_audio_backends
181
+
182
+ if not hasattr(torchaudio, "get_audio_backend"):
183
+ torchaudio.get_audio_backend = lambda: torchaudio.list_audio_backends()[0]
184
+ except Exception:
185
+ # best-effort only, don't prevent embedding loading attempt
186
+ pass
187
+
188
+ try:
189
+ from speechbrain.inference.speaker import EncoderClassifier
190
+
191
+ self.logger.info(f"Loading embedding model: {self.config.embedding_model_id}")
192
+
193
+ import os
194
+
195
+ # Prefer to disable HF symlinks up-front on Windows to prevent permission errors
196
+ os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS", "1")
197
+
198
+ # Try a robust direct download into a local models directory to avoid symlinks entirely
199
+ dest_dir = str(self.models_dir / self.config.embedding_model_id.replace("/", "_"))
200
+ try:
201
+ from huggingface_hub import snapshot_download
202
+
203
+ self.logger.info(
204
+ f"Attempting to snapshot_download model to local dir {dest_dir} (no symlinks)"
205
+ )
206
+ os.makedirs(dest_dir, exist_ok=True)
207
+ snapshot_download(
208
+ repo_id=self.config.embedding_model_id,
209
+ local_dir=dest_dir,
210
+ local_dir_use_symlinks=False,
211
+ )
212
+ # Try to load from the locally downloaded snapshot
213
+ try:
214
+ self._embedding_model = EncoderClassifier.from_hparams(
215
+ source=dest_dir,
216
+ savedir=dest_dir,
217
+ run_opts={"device": self.device},
218
+ )
219
+ self.logger.info("Embedding model loaded successfully from local snapshot")
220
+ # mark that we used speechbrain
221
+ self._embedding_model_is_speechbrain = True
222
+ return
223
+ except Exception as e_local:
224
+ self.logger.warning(f"Local snapshot load failed: {e_local}")
225
+ except Exception:
226
+ # snapshot_download not available or failed; continue with other strategies
227
+ pass
228
+
229
+ try:
230
+ # First try: load directly from hf cache (no savedir) - this typically avoids writing symlinks
231
+ self._embedding_model = EncoderClassifier.from_hparams(
232
+ source=self.config.embedding_model_id,
233
+ run_opts={"device": self.device},
234
+ )
235
+ self.logger.info("Embedding model loaded successfully (from HF cache)")
236
+ self._embedding_model_is_speechbrain = True
237
+ return
238
+ except Exception as e:
239
+ err_msg = str(e)
240
+
241
+ # Detect Windows symlink permission error and retry with savedir + disabled symlink env
242
+ if (
243
+ ("A required privilege" in err_msg)
244
+ or ("symlink" in err_msg.lower())
245
+ or getattr(e, "winerror", None) == 1314
246
+ ):
247
+ try:
248
+ os.environ["HF_HUB_DISABLE_SYMLINKS"] = "1"
249
+ self.logger.warning(
250
+ "Detected symlink/permission issue; retrying model load with HF_HUB_DISABLE_SYMLINKS=1 and specifying savedir"
251
+ )
252
+ self._embedding_model = EncoderClassifier.from_hparams(
253
+ source=self.config.embedding_model_id,
254
+ savedir=str(self.models_dir / "spkrec-ecapa"),
255
+ run_opts={"device": self.device},
256
+ )
257
+ self.logger.info(
258
+ "Embedding model loaded successfully (after disabling symlinks)"
259
+ )
260
+ self._embedding_model_is_speechbrain = True
261
+ return
262
+ except Exception:
263
+ # Try monkeypatching SB fetch to use COPY
264
+ try:
265
+ import speechbrain.utils.fetching as sbfetch
266
+
267
+ orig_fetch = sbfetch.fetch
268
+
269
+ def _fetch_copy(*args, **kwargs):
270
+ kwargs.setdefault("local_strategy", sbfetch.LocalStrategy.COPY)
271
+ return orig_fetch(*args, **kwargs)
272
+
273
+ sbfetch.fetch = _fetch_copy
274
+ self.logger.info(
275
+ "Retrying model load with SpeechBrain fetch set to COPY strategy"
276
+ )
277
+ self._embedding_model = EncoderClassifier.from_hparams(
278
+ source=self.config.embedding_model_id,
279
+ savedir=str(self.models_dir / "spkrec-ecapa"),
280
+ run_opts={"device": self.device},
281
+ )
282
+ self.logger.info(
283
+ "Embedding model loaded successfully (after switching fetch strategy)"
284
+ )
285
+ self._embedding_model_is_speechbrain = True
286
+ return
287
+ except Exception as e3:
288
+ err_msg = str(e3)
289
+ finally:
290
+ try:
291
+ sbfetch.fetch = orig_fetch
292
+ except Exception:
293
+ pass
294
+
295
+ self.logger.error(f"Failed to load SpeechBrain embedding model: {err_msg}")
296
+
297
+ # Try to salvage by copying an existing cached snapshot or downloading directly into dest_dir
298
+ try:
299
+ import re
300
+ import shutil
301
+
302
+ m = re.search(r"'([^']+)'\s*->\s*'([^']+)'", err_msg)
303
+ if m:
304
+ src_file = m.group(1)
305
+ src_dir = os.path.dirname(src_file)
306
+ self.logger.info(
307
+ f"Attempting to copy cached snapshot from {src_dir} to {dest_dir}"
308
+ )
309
+ shutil.copytree(src_dir, dest_dir, dirs_exist_ok=True)
310
+
311
+ # Retry loading from the local copied directory
312
+ try:
313
+ self._embedding_model = EncoderClassifier.from_hparams(
314
+ source=dest_dir,
315
+ savedir=dest_dir,
316
+ run_opts={"device": self.device},
317
+ )
318
+ self.logger.info(
319
+ "Embedding model loaded successfully (after copying cached snapshot)"
320
+ )
321
+ self._embedding_model_is_speechbrain = True
322
+ return
323
+ except Exception as e4:
324
+ err_msg = str(e4)
325
+
326
+ # As a last resort, try to download model files directly into dest_dir using huggingface_hub APIs
327
+ from huggingface_hub import hf_hub_download, list_repo_files
328
+
329
+ self.logger.info(
330
+ f"Attempting direct HF download into {dest_dir} to avoid symlinks"
331
+ )
332
+ os.makedirs(dest_dir, exist_ok=True)
333
+ files = list_repo_files(self.config.embedding_model_id)
334
+ for fname in files:
335
+ if fname.endswith("/"):
336
+ continue
337
+ hf_hub_download(
338
+ repo_id=self.config.embedding_model_id,
339
+ filename=fname,
340
+ local_dir=dest_dir,
341
+ local_dir_use_symlinks=False,
342
+ )
343
+
344
+ # Retry loading now that files are locally present
345
+ self._embedding_model = EncoderClassifier.from_hparams(
346
+ source=dest_dir,
347
+ savedir=dest_dir,
348
+ run_opts={"device": self.device},
349
+ )
350
+ self.logger.info(
351
+ "Embedding model loaded successfully (after direct HF download)"
352
+ )
353
+ self._embedding_model_is_speechbrain = True
354
+ return
355
+ except Exception as e5:
356
+ err_msg = str(e5)
357
+
358
+ self.logger.warning(
359
+ "Common fixes: install a compatible torchaudio (matching your PyTorch), and install 'soundfile' or enable 'sox_io' backend."
360
+ )
361
+
362
+ # If user allows fallback, provide MFCC fallback; otherwise raise an error to enforce SpeechBrain usage
363
+ if getattr(self.config, "allow_fallback", False):
364
+ self.logger.warning(
365
+ "Falling back to MFCC-based deterministic embeddings (will be less accurate)."
366
+ )
367
+ self._embedding_model = "FALLBACK"
368
+ self._fallback_extractor = self._mfcc_embedding
369
+ return
370
+ else:
371
+ raise RuntimeError(
372
+ "Failed to load SpeechBrain embedding model and 'allow_fallback' is False. "
373
+ "Ensure torchaudio and speechbrain are installed, or set 'allow_fallback=True' in DiarizationConfig."
374
+ )
375
+ except Exception:
376
+ # Import of SpeechBrain failed entirely; honor allow_fallback setting
377
+ self.logger.warning(
378
+ "Could not import SpeechBrain; checking 'allow_fallback' setting"
379
+ )
380
+ if getattr(self.config, "allow_fallback", False):
381
+ self.logger.warning(
382
+ "Falling back to MFCC-based deterministic embeddings (allow_fallback=True)"
383
+ )
384
+ self._embedding_model = "FALLBACK"
385
+ self._fallback_extractor = self._mfcc_embedding
386
+ else:
387
+ raise RuntimeError(
388
+ "Failed to import or initialize SpeechBrain embedding model and 'allow_fallback' is False. "
389
+ "Install SpeechBrain or set 'allow_fallback=True' in DiarizationConfig to allow deterministic fallback."
390
+ )
391
+
392
+ def _mfcc_embedding(
393
+ self, segment_np: np.ndarray, sample_rate: int, target_dim: int = 192
394
+ ) -> np.ndarray:
395
+ """Compute a deterministic embedding from audio segment using MFCCs.
396
+
397
+ Falls back to simple waveform statistics if librosa is not available.
398
+ Returns a fixed-size vector of length `target_dim`.
399
+ """
400
+ try:
401
+ import librosa
402
+
403
+ mfcc = librosa.feature.mfcc(y=segment_np, sr=sample_rate, n_mfcc=40)
404
+ mfcc_mean = mfcc.mean(axis=1)
405
+ mfcc_std = mfcc.std(axis=1)
406
+ vec = np.concatenate([mfcc_mean, mfcc_std])
407
+ except Exception:
408
+ # Minimal deterministic fallback: use downsampled waveform statistics + spectral centroid approximation
409
+ vec = []
410
+ vec.append(np.mean(segment_np))
411
+ vec.append(np.std(segment_np))
412
+ # simple spectral centroid proxy
413
+ freqs = np.fft.rfftfreq(len(segment_np), d=1.0 / sample_rate)
414
+ spec = np.abs(np.fft.rfft(segment_np))
415
+ if spec.sum() > 0:
416
+ centroid = float((freqs * spec).sum() / spec.sum()) / (sample_rate / 2)
417
+ else:
418
+ centroid = 0.0
419
+ vec.append(centroid)
420
+ vec = np.array(vec, dtype=float)
421
+
422
+ # Pad or trim to target_dim
423
+ if len(vec) < target_dim:
424
+ padded = np.zeros(target_dim, dtype=float)
425
+ padded[: len(vec)] = vec
426
+ vec = padded
427
+ elif len(vec) > target_dim:
428
+ vec = vec[:target_dim]
429
+
430
+ # normalize
431
+ norm = np.linalg.norm(vec) + 1e-12
432
+ return (vec / norm).astype(np.float32)
433
+
434
+ def process(
435
+ self,
436
+ waveform: torch.Tensor,
437
+ sample_rate: int = 16000,
438
+ num_speakers: Optional[int] = None,
439
+ cache_dir: Optional[str] = None,
440
+ audio_id: Optional[str] = None,
441
+ fast_mode: bool = False,
442
+ ) -> List[SpeakerSegment]:
443
+ """
444
+ Main diarization pipeline.
445
+
446
+ Args:
447
+ waveform: Audio waveform [1, T]
448
+ sample_rate: Audio sample rate
449
+ num_speakers: Known number of speakers (auto-detect if None)
450
+
451
+ Returns:
452
+ List of SpeakerSegment with speaker assignments
453
+ """
454
+ self._load_embedding_model()
455
+
456
+ # Step 1: Voice Activity Detection
457
+ speech_regions = self._detect_speech(waveform, sample_rate)
458
+
459
+ if not speech_regions:
460
+ self.logger.warning("No speech detected in audio")
461
+ return []
462
+
463
+ self.logger.info(f"Detected {len(speech_regions)} speech regions")
464
+
465
+ # Step 2: Create analysis windows
466
+ windows = self._create_windows(speech_regions)
467
+
468
+ if not windows:
469
+ self.logger.warning("No valid windows created")
470
+ return []
471
+
472
+ self.logger.info(f"Created {len(windows)} analysis windows")
473
+
474
+ # Step 3: Extract speaker embeddings
475
+ embeddings = self._extract_embeddings(waveform, windows, sample_rate, cache_dir=cache_dir, audio_id=audio_id, fast_mode=fast_mode)
476
+
477
+ self.logger.info(f"Extracted embeddings with shape: {embeddings.shape}")
478
+
479
+ # Step 4: Cluster embeddings
480
+ labels = self._cluster_embeddings(
481
+ embeddings, num_speakers=num_speakers or self.config.max_speakers
482
+ )
483
+
484
+ num_speakers_found = len(set(labels))
485
+ self.logger.info(f"Found {num_speakers_found} speakers")
486
+
487
+ # Step 5: Create segments from windows and labels
488
+ raw_segments = self._create_segments(windows, labels, embeddings)
489
+
490
+ # Step 6: Post-processing
491
+ processed_segments = self._postprocess_segments(raw_segments)
492
+
493
+ # Step 7: Detect overlapping speech
494
+ processed_segments = self._detect_overlaps(processed_segments)
495
+
496
+ self.logger.info(f"Final: {len(processed_segments)} segments")
497
+
498
+ return processed_segments
499
+
500
+ def auto_tune(
501
+ self, waveform: torch.Tensor, sample_rate: int = 16000, num_speakers: Optional[int] = None
502
+ ) -> dict:
503
+ """Auto-tune clustering-related hyperparameters by searching a simple parameter grid.
504
+
505
+ This method extracts embeddings and tries different clustering thresholds and
506
+ minimum cluster sizes, scoring candidates by silhouette score (and closeness
507
+ to `num_speakers` if provided). The best parameter set is applied to
508
+ `self.config` and returned for inspection.
509
+ """
510
+ # Quick extraction path
511
+ speech_regions = self._detect_speech(waveform, sample_rate)
512
+ if not speech_regions:
513
+ self.logger.warning("Auto-tune: no speech regions detected; aborting tuning")
514
+ return {}
515
+
516
+ windows = self._create_windows(speech_regions)
517
+ if not windows:
518
+ self.logger.warning("Auto-tune: no analysis windows created; aborting tuning")
519
+ return {}
520
+
521
+ embeddings = self._extract_embeddings(waveform, windows, sample_rate)
522
+ if embeddings is None or len(embeddings) < 4:
523
+ self.logger.warning("Auto-tune: insufficient embeddings for tuning; aborting tuning")
524
+ return {}
525
+
526
+ # Parameter grid (coarse)
527
+ clustering_thresholds = [0.95, 0.85, 0.7, 0.5, 0.3, 0.15]
528
+ min_cluster_sizes = [1, 2, 3, 4]
529
+
530
+ best_score = -1e9
531
+ best_params = {
532
+ "clustering_threshold": self.config.clustering_threshold,
533
+ "min_cluster_size": self.config.min_cluster_size,
534
+ "iterative_merge_threshold": self.config.iterative_merge_threshold,
535
+ }
536
+
537
+ # Save original values to restore if needed
538
+ orig_threshold = self.config.clustering_threshold
539
+ orig_min_size = self.config.min_cluster_size
540
+ orig_iter_thresh = self.config.iterative_merge_threshold
541
+
542
+ try:
543
+ for thr in clustering_thresholds:
544
+ for msize in min_cluster_sizes:
545
+ # Temporarily set
546
+ self.config.clustering_threshold = thr
547
+ self.config.min_cluster_size = msize
548
+
549
+ try:
550
+ labels = self._cluster_embeddings(embeddings, num_speakers=None)
551
+ k = len(np.unique(labels))
552
+ if k <= 1:
553
+ sil = 0.0
554
+ else:
555
+ try:
556
+ sil = silhouette_score(embeddings, labels, metric="cosine")
557
+ except Exception:
558
+ sil = 0.0
559
+
560
+ # Scoring: prefer higher silhouette and closeness to desired num_speakers
561
+ score = sil
562
+ if num_speakers is not None:
563
+ score -= 0.1 * abs(k - num_speakers)
564
+ # small penalty for many clusters
565
+ score -= 0.02 * k
566
+
567
+ self.logger.debug(
568
+ f"Auto-tune candidate: thr={thr}, min_size={msize} -> k={k}, sil={sil:.4f}, score={score:.4f}"
569
+ )
570
+
571
+ if score > best_score:
572
+ best_score = score
573
+ best_params = {
574
+ "clustering_threshold": thr,
575
+ "min_cluster_size": msize,
576
+ "achieved_k": k,
577
+ "silhouette": sil,
578
+ }
579
+ except Exception as e:
580
+ self.logger.debug(f"Auto-tune candidate failed: {e}")
581
+ continue
582
+
583
+ # Apply best params
584
+ self.config.clustering_threshold = float(
585
+ best_params.get("clustering_threshold", orig_threshold)
586
+ )
587
+ self.config.min_cluster_size = int(best_params.get("min_cluster_size", orig_min_size))
588
+ # If a desired num_speakers was provided, set target merge accordingly
589
+ if num_speakers is not None:
590
+ self.config.target_num_speakers = int(num_speakers)
591
+
592
+ self.logger.info(f"Auto-tune selected: {best_params}")
593
+ return best_params
594
+ finally:
595
+ # nothing to restore; we've intentionally applied best params
596
+ pass
597
+
598
+ def _detect_speech(self, waveform: torch.Tensor, sample_rate: int) -> List[Tuple[float, float]]:
599
+ """
600
+ Detect speech regions using energy-based VAD.
601
+
602
+ Args:
603
+ waveform: Audio waveform
604
+ sample_rate: Sample rate
605
+
606
+ Returns:
607
+ List of (start, end) tuples for speech regions
608
+ """
609
+ waveform_np = waveform.squeeze().cpu().numpy()
610
+
611
+ # Frame parameters
612
+ frame_length_ms = 25 # 25ms frames
613
+ hop_length_ms = 10 # 10ms hop
614
+
615
+ frame_length = int(frame_length_ms * sample_rate / 1000)
616
+ hop_length = int(hop_length_ms * sample_rate / 1000)
617
+
618
+ # Calculate energy per frame
619
+ num_frames = max(1, 1 + (len(waveform_np) - frame_length) // hop_length)
620
+ energies = np.zeros(num_frames)
621
+
622
+ for i in range(num_frames):
623
+ start_idx = i * hop_length
624
+ end_idx = min(start_idx + frame_length, len(waveform_np))
625
+ frame = waveform_np[start_idx:end_idx]
626
+
627
+ if len(frame) > 0:
628
+ energies[i] = np.sqrt(np.mean(frame**2) + 1e-10)
629
+
630
+ # Compute adaptive threshold
631
+ if len(energies) > 0:
632
+ energy_sorted = np.sort(energies)
633
+ # Use 30th percentile as noise floor estimate
634
+ noise_floor = energy_sorted[int(0.3 * len(energy_sorted))]
635
+ threshold = noise_floor + self.config.vad_threshold * np.std(energies)
636
+ else:
637
+ threshold = self.config.vad_threshold
638
+
639
+ # Find speech regions
640
+ is_speech = energies > threshold
641
+
642
+ # Apply morphological operations to smooth
643
+ # (simple dilation and erosion using convolution)
644
+ kernel_size = max(1, int(self.config.min_speech_duration * 1000 / hop_length_ms))
645
+
646
+ if kernel_size > 1 and len(is_speech) > kernel_size:
647
+ # Simple smoothing
648
+ kernel = np.ones(kernel_size) / kernel_size
649
+ smoothed = np.convolve(is_speech.astype(float), kernel, mode="same")
650
+ is_speech = smoothed > 0.5
651
+
652
+ # Convert to time regions
653
+ regions = []
654
+ in_speech = False
655
+ speech_start = 0.0
656
+
657
+ for i, speech in enumerate(is_speech):
658
+ time = i * hop_length / sample_rate
659
+
660
+ if speech and not in_speech:
661
+ speech_start = time
662
+ in_speech = True
663
+ elif not speech and in_speech:
664
+ duration = time - speech_start
665
+ if duration >= self.config.min_speech_duration:
666
+ regions.append((speech_start, time))
667
+ in_speech = False
668
+
669
+ # Handle last region
670
+ if in_speech:
671
+ end_time = len(waveform_np) / sample_rate
672
+ duration = end_time - speech_start
673
+ if duration >= self.config.min_speech_duration:
674
+ regions.append((speech_start, end_time))
675
+
676
+ # Merge nearby regions
677
+ regions = self._merge_nearby_regions(regions, self.config.min_silence_duration)
678
+
679
+ return regions
680
+
681
+ def _merge_nearby_regions(
682
+ self, regions: List[Tuple[float, float]], min_gap: float
683
+ ) -> List[Tuple[float, float]]:
684
+ """Merge regions that are close together"""
685
+ if not regions:
686
+ return []
687
+
688
+ merged = [regions[0]]
689
+
690
+ for start, end in regions[1:]:
691
+ last_start, last_end = merged[-1]
692
+
693
+ if start - last_end <= min_gap:
694
+ merged[-1] = (last_start, end)
695
+ else:
696
+ merged.append((start, end))
697
+
698
+ return merged
699
+
700
+ def _create_windows(
701
+ self, speech_regions: List[Tuple[float, float]]
702
+ ) -> List[Tuple[float, float]]:
703
+ """Create sliding windows over speech regions for embedding extraction"""
704
+ windows = []
705
+
706
+ for region_start, region_end in speech_regions:
707
+ t = region_start
708
+
709
+ while t < region_end:
710
+ window_end = min(t + self.config.segment_window, region_end)
711
+
712
+ # Only include windows with sufficient duration
713
+ if (window_end - t) >= self.config.min_segment_duration:
714
+ # Avoid creating too many tiny windows across short recordings
715
+ if (region_end - region_start) < (self.config.segment_window * 2):
716
+ # for short regions, use a single window covering the region
717
+ windows.append((region_start, region_end))
718
+ break
719
+ windows.append((t, window_end))
720
+
721
+ t += self.config.segment_hop
722
+
723
+ return windows
724
+
725
+ def _extract_embeddings(
726
+ self,
727
+ waveform: torch.Tensor,
728
+ windows: List[Tuple[float, float]],
729
+ sample_rate: int,
730
+ cache_dir: Optional[str] = None,
731
+ audio_id: Optional[str] = None,
732
+ fast_mode: bool = False,
733
+ ) -> np.ndarray:
734
+ """Extract speaker embeddings for each window.
735
+
736
+ Optimizations implemented:
737
+ - Disk cache (if enabled in config and cache_dir provided)
738
+ - Batch extraction using model's batch API when available
739
+ - Fast MFCC embedding path when `use_fast_embedding` is True
740
+ """
741
+ # Try disk cache first
742
+ if (
743
+ cache_dir
744
+ and audio_id
745
+ and self.config.embedding_cache
746
+ and getattr(self.config, "embedding_cache", True)
747
+ ):
748
+ try:
749
+ import os
750
+
751
+ cache_path = Path(cache_dir) / f"{audio_id}_embeddings.npy"
752
+ if cache_path.exists():
753
+ arr = np.load(str(cache_path))
754
+ if arr.shape[0] == len(windows):
755
+ self.logger.info(f"Loaded embeddings from cache: {cache_path}")
756
+ return arr
757
+ except Exception:
758
+ pass
759
+
760
+ n = len(windows)
761
+ embeddings = [None] * n
762
+
763
+ # If fallback or user requested fast embedding, compute MFCC-based embeddings vectorized
764
+ if (
765
+ (self._embedding_model == "FALLBACK" or self._embedding_model is None)
766
+ or getattr(self.config, "use_fast_embedding", False)
767
+ or fast_mode
768
+ ):
769
+ for i, (start, end) in enumerate(windows):
770
+ start_sample = int(start * sample_rate)
771
+ end_sample = int(end * sample_rate)
772
+ segment = waveform[:, start_sample:end_sample]
773
+ try:
774
+ seg_np = segment.squeeze().cpu().numpy()
775
+ emb = self._fallback_extractor(seg_np, sample_rate)
776
+ except Exception:
777
+ seg_np = segment.squeeze().cpu().numpy()
778
+ emb = self._mfcc_embedding(seg_np, sample_rate)
779
+ embeddings[i] = emb
780
+
781
+ embeddings = np.stack(embeddings, axis=0)
782
+
783
+ # Save to cache
784
+ try:
785
+ if cache_dir and audio_id and self.config.embedding_cache:
786
+ Path(cache_dir).mkdir(parents=True, exist_ok=True)
787
+ np.save(str(Path(cache_dir) / f"{audio_id}_embeddings.npy"), embeddings)
788
+ except Exception:
789
+ pass
790
+
791
+ return embeddings
792
+
793
+ # Otherwise use model batch encoding when available
794
+ batch_size = max(1, int(getattr(self.config, "embedding_batch_size", 32)))
795
+
796
+ # Prepare segment numpy arrays
797
+ segs = []
798
+ seg_indices = []
799
+ for i, (start, end) in enumerate(windows):
800
+ start_sample = int(start * sample_rate)
801
+ end_sample = int(end * sample_rate)
802
+ segment = waveform[:, start_sample:end_sample]
803
+ segs.append(segment)
804
+ seg_indices.append(i)
805
+
806
+ # Try batch processing
807
+ try:
808
+ # If model supports encode_batch on a list or stacked tensor, process in chunks
809
+ for i in range(0, len(segs), batch_size):
810
+ batch = segs[i : i + batch_size]
811
+ # Stack into a tensor batch
812
+ try:
813
+ batch_tensor = torch.stack(
814
+ [b.squeeze(0) if b.dim() == 2 else b for b in batch], dim=0
815
+ )
816
+ except Exception:
817
+ # Some models expect list of tensors; keep as list
818
+ batch_tensor = batch
819
+
820
+ with torch.no_grad():
821
+ try:
822
+ # Move to model device if available
823
+ if hasattr(self._embedding_model, "device") and isinstance(
824
+ batch_tensor, torch.Tensor
825
+ ):
826
+ batch_tensor = batch_tensor.to(self._embedding_model.device)
827
+
828
+ out = None
829
+ # Try the most common batch API names
830
+ if hasattr(self._embedding_model, "encode_batch"):
831
+ out = self._embedding_model.encode_batch(batch_tensor)
832
+ elif hasattr(self._embedding_model, "encode"):
833
+ out = self._embedding_model.encode(batch_tensor)
834
+ else:
835
+ # fallback: try to call on each separately
836
+ out = [self._embedding_model.encode_batch(x) for x in batch]
837
+
838
+ # Normalize outputs into numpy array
839
+ if isinstance(out, torch.Tensor):
840
+ out_np = out.cpu().numpy()
841
+ elif isinstance(out, list):
842
+ out_np = np.stack(
843
+ [
844
+ (
845
+ o.squeeze().cpu().numpy()
846
+ if isinstance(o, torch.Tensor)
847
+ else np.array(o)
848
+ )
849
+ for o in out
850
+ ],
851
+ axis=0,
852
+ )
853
+ else:
854
+ out_np = np.array(out)
855
+
856
+ # assign back to embeddings
857
+ for j, idx in enumerate(range(i, i + out_np.shape[0])):
858
+ embeddings[idx] = out_np[j]
859
+
860
+ except Exception as e:
861
+ # fallback to per-segment extraction for this batch
862
+ self.logger.debug(f"Batch embedding failed, falling back per-segment: {e}")
863
+ for bb_idx, seg in enumerate(batch):
864
+ try:
865
+ with torch.no_grad():
866
+ if hasattr(self._embedding_model, "device") and isinstance(
867
+ seg, torch.Tensor
868
+ ):
869
+ seg = seg.to(self._embedding_model.device)
870
+ emb = self._embedding_model.encode_batch(seg)
871
+ emb = emb.squeeze().cpu().numpy()
872
+ except Exception:
873
+ emb = np.random.randn(192).astype(np.float32)
874
+ embeddings[i + bb_idx] = emb
875
+
876
+ embeddings = np.stack(embeddings, axis=0)
877
+
878
+ # Save to cache
879
+ try:
880
+ if cache_dir and audio_id and self.config.embedding_cache:
881
+ Path(cache_dir).mkdir(parents=True, exist_ok=True)
882
+ np.save(str(Path(cache_dir) / f"{audio_id}_embeddings.npy"), embeddings)
883
+ except Exception:
884
+ pass
885
+
886
+ return embeddings
887
+
888
+ except Exception as e:
889
+ self.logger.warning(f"Batch embedding extraction failed: {e}")
890
+ # final fallback: single extraction loop
891
+ embeddings = []
892
+ for start, end in windows:
893
+ start_sample = int(start * sample_rate)
894
+ end_sample = int(end * sample_rate)
895
+ segment = waveform[:, start_sample:end_sample]
896
+ try:
897
+ with torch.no_grad():
898
+ if hasattr(self._embedding_model, "device"):
899
+ segment = segment.to(self._embedding_model.device)
900
+ emb = self._embedding_model.encode_batch(segment)
901
+ emb = emb.squeeze().cpu().numpy()
902
+ except Exception:
903
+ emb = np.random.randn(192).astype(np.float32)
904
+ embeddings.append(emb)
905
+
906
+ embeddings = np.stack(embeddings, axis=0)
907
+ return embeddings
908
+
909
+ def _cluster_embeddings(
910
+ self, embeddings: np.ndarray, num_speakers: Optional[int] = None, method_override: Optional[str] = None
911
+ ) -> np.ndarray:
912
+ """Cluster embeddings to assign speaker labels, with small-cluster merging.
913
+
914
+ Args:
915
+ embeddings: (N, D) array of embeddings
916
+ num_speakers: Optional target number of speakers
917
+ method_override: If set, use this clustering method ('agglomerative','spectral','kmeans')
918
+ """
919
+ if len(embeddings) < 2:
920
+ return np.zeros(len(embeddings), dtype=int)
921
+
922
+ # Normalize embeddings
923
+ scaler = StandardScaler()
924
+ embeddings_norm = scaler.fit_transform(embeddings)
925
+
926
+ # Support both nested (Config.diarization.clustering) and flat config shapes
927
+ if method_override is not None:
928
+ method = method_override
929
+ # default thresholds - allow config overrides below
930
+ threshold = getattr(self.config, "clustering_threshold", 0.7)
931
+ linkage = getattr(self.config, "clustering_linkage", "average")
932
+ min_size_cfg = getattr(self.config, "min_cluster_size", 2)
933
+ max_speakers_cfg = getattr(self.config, "max_speakers", None)
934
+ elif hasattr(self.config, "clustering"):
935
+ method = self.config.clustering.method
936
+ threshold = self.config.clustering.threshold
937
+ linkage = self.config.clustering.linkage
938
+ min_size_cfg = getattr(
939
+ self.config.clustering,
940
+ "min_cluster_size",
941
+ getattr(self.config, "min_cluster_size", 2),
942
+ )
943
+ max_speakers_cfg = getattr(self.config, "max_speakers", None)
944
+ else:
945
+ method = getattr(self.config, "clustering_method", "spectral")
946
+ threshold = getattr(self.config, "clustering_threshold", 0.7)
947
+ linkage = getattr(self.config, "clustering_linkage", "average")
948
+ min_size_cfg = getattr(self.config, "min_cluster_size", 2)
949
+ max_speakers_cfg = getattr(self.config, "max_speakers", None)
950
+
951
+ if method == "agglomerative":
952
+ if num_speakers is not None:
953
+ clustering = AgglomerativeClustering(
954
+ n_clusters=num_speakers, metric="cosine", linkage=linkage
955
+ )
956
+ else:
957
+ # If no target provided, estimate number of speakers via silhouette search
958
+ est_max = min(8, max(2, len(embeddings) // 2))
959
+ est_min = 2
960
+ best_k = None
961
+ best_score = -1.0
962
+ # Only try silhouette search on reasonably-sized inputs
963
+ if len(embeddings) >= 8:
964
+ for k in range(est_min, est_max + 1):
965
+ try:
966
+ tmp = AgglomerativeClustering(n_clusters=k, metric="cosine", linkage=linkage)
967
+ labels_tmp = tmp.fit_predict(embeddings_norm)
968
+ # silhouette requires at least 2 clusters and < n_samples clusters
969
+ if len(np.unique(labels_tmp)) > 1 and len(np.unique(labels_tmp)) < len(embeddings):
970
+ score = silhouette_score(embeddings_norm, labels_tmp, metric="cosine")
971
+ else:
972
+ score = -1.0
973
+ except Exception:
974
+ score = -1.0
975
+ if score > best_score:
976
+ best_score = score
977
+ best_k = k
978
+ # If silhouette search found a sensible k use it; else fallback to threshold style
979
+ if best_k is not None and best_score > 0.01:
980
+ clustering = AgglomerativeClustering(n_clusters=best_k, metric="cosine", linkage=linkage)
981
+ self.logger.info(f"Agglomerative autodetected k={best_k} (silhouette={best_score:.3f})")
982
+ else:
983
+ clustering = AgglomerativeClustering(
984
+ n_clusters=None,
985
+ distance_threshold=threshold,
986
+ metric="cosine",
987
+ linkage=linkage,
988
+ )
989
+
990
+ elif method == "spectral":
991
+ n_clusters = num_speakers or min(8, len(embeddings) // 2)
992
+ clustering = SpectralClustering(
993
+ n_clusters=n_clusters,
994
+ affinity="nearest_neighbors",
995
+ n_neighbors=min(10, len(embeddings) - 1),
996
+ )
997
+
998
+ elif method == "kmeans":
999
+ n_clusters = num_speakers or min(8, len(embeddings) // 2)
1000
+ clustering = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
1001
+
1002
+ else:
1003
+ raise ValueError(f"Unknown clustering method: {method}")
1004
+
1005
+ try:
1006
+ labels = clustering.fit_predict(embeddings_norm)
1007
+ except Exception as e:
1008
+ self.logger.error(f"Clustering failed: {e}")
1009
+ labels = np.array([i % 2 for i in range(len(embeddings))])
1010
+
1011
+ # Debug: cluster sizes
1012
+ unique, counts = np.unique(labels, return_counts=True)
1013
+ sizes = dict(zip(unique.tolist(), counts.tolist()))
1014
+ self.logger.debug(f"Initial clusters: {len(unique)}, sizes: {sizes}")
1015
+
1016
+ # Global check: if all embeddings are very similar, collapse directly to 1 speaker
1017
+ try:
1018
+ # First, perform a row-normalized (per-embedding) cosine check on raw embeddings
1019
+ row_norm = embeddings / (np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12)
1020
+ n_sample = min(200, len(row_norm))
1021
+ idx = np.linspace(0, len(row_norm) - 1, n_sample).astype(int)
1022
+ sub = row_norm[idx]
1023
+ sims = np.dot(sub, sub.T)
1024
+ sims = np.clip(sims, -1.0, 1.0)
1025
+ dists = 1.0 - sims
1026
+ mean_row_dist = (
1027
+ float(np.mean(dists[np.triu_indices_from(dists, k=1)])) if n_sample > 1 else 1.0
1028
+ )
1029
+ global_row_threshold = getattr(self.config, "global_collapse_threshold", 0.03)
1030
+ # Be more permissive for short recordings (few windows)
1031
+ if len(embeddings) < 40:
1032
+ global_row_threshold = max(global_row_threshold, 0.08)
1033
+ if mean_row_dist < global_row_threshold:
1034
+ self.logger.info(
1035
+ f"Row-normalized embeddings too similar (mean dist={mean_row_dist:.6f}), collapsing to 1 speaker"
1036
+ )
1037
+ return np.zeros(len(embeddings), dtype=int)
1038
+
1039
+ # Next, check on scaled embeddings (existing logic)
1040
+ n_sample = min(200, len(embeddings_norm))
1041
+ idx = np.linspace(0, len(embeddings_norm) - 1, n_sample).astype(int)
1042
+ sub = embeddings_norm[idx]
1043
+ sims = np.dot(sub, sub.T)
1044
+ sims = np.clip(sims, -1.0, 1.0)
1045
+ dists = 1.0 - sims
1046
+ mean_global_dist = (
1047
+ float(np.mean(dists[np.triu_indices_from(dists, k=1)])) if n_sample > 1 else 1.0
1048
+ )
1049
+ global_collapse_threshold = getattr(self.config, "global_collapse_threshold", 0.03)
1050
+ if mean_global_dist < global_collapse_threshold:
1051
+ self.logger.info(
1052
+ f"Global embeddings too similar (mean dist={mean_global_dist:.4f}), collapsing to 1 speaker"
1053
+ )
1054
+ return np.zeros(len(embeddings), dtype=int)
1055
+
1056
+ # Additional small-variance heuristic: if feature-wise std is tiny, collapse as well
1057
+ mean_std = float(np.mean(np.std(embeddings_norm, axis=0)))
1058
+ std_threshold = getattr(self.config, "global_std_threshold", 1e-2)
1059
+ if mean_std < std_threshold:
1060
+ self.logger.info(
1061
+ f"Embeddings have tiny variance (mean std={mean_std:.6f}), collapsing to 1 speaker"
1062
+ )
1063
+ return np.zeros(len(embeddings), dtype=int)
1064
+ except Exception:
1065
+ pass
1066
+
1067
+ # If centroids are very close to each other, this is likely a single-speaker recording.
1068
+ # Compute mean pairwise centroid cosine distance; if below a threshold, collapse to 1 cluster.
1069
+ try:
1070
+ labels_unique = np.unique(labels)
1071
+ centroids = [embeddings_norm[labels == l].mean(axis=0) for l in labels_unique]
1072
+ if len(centroids) > 1:
1073
+ pair_dists = []
1074
+ for i in range(len(centroids)):
1075
+ for j in range(i + 1, len(centroids)):
1076
+ a = centroids[i] / (np.linalg.norm(centroids[i]) + 1e-12)
1077
+ b = centroids[j] / (np.linalg.norm(centroids[j]) + 1e-12)
1078
+ pair_dists.append(1.0 - float(np.dot(a, b)))
1079
+ mean_pair_dist = float(np.mean(pair_dists)) if pair_dists else 1.0
1080
+ else:
1081
+ mean_pair_dist = 1.0
1082
+
1083
+ collapse_threshold = getattr(self.config, "collapse_threshold", 0.15)
1084
+ if mean_pair_dist < collapse_threshold:
1085
+ self.logger.info(
1086
+ f"Centroids too similar (mean dist={mean_pair_dist:.3f}), collapsing to 1 speaker"
1087
+ )
1088
+ labels = np.zeros_like(labels)
1089
+
1090
+ # If SpeechBrain embeddings are used and clusters have a very low silhouette score,
1091
+ # it's likely that the recording is single-speaker and clustering is over-fragmenting.
1092
+ try:
1093
+ if getattr(self.config, "use_speechbrain", True) and getattr(
1094
+ self, "_embedding_model_is_speechbrain", False
1095
+ ):
1096
+ unique_labels = np.unique(labels)
1097
+ if len(unique_labels) > 1:
1098
+ try:
1099
+ score = silhouette_score(embeddings_norm, labels, metric="cosine")
1100
+ if score < getattr(self.config, "silhouette_collapse_threshold", 0.05):
1101
+ self.logger.info(
1102
+ f"Low silhouette score ({score:.4f}) detected with SpeechBrain embeddings; collapsing to 1 speaker"
1103
+ )
1104
+ return np.zeros(len(embeddings), dtype=int)
1105
+ except Exception:
1106
+ pass
1107
+ except Exception:
1108
+ pass
1109
+ except Exception:
1110
+ pass
1111
+
1112
+ # Merge clusters smaller than min_cluster_size
1113
+ min_size = min_size_cfg
1114
+ if min_size and min_size > 1:
1115
+ changed = True
1116
+ while changed:
1117
+ changed = False
1118
+ labels_unique, label_counts = np.unique(labels, return_counts=True)
1119
+ small_labels = [l for l, c in zip(labels_unique, label_counts) if c < min_size]
1120
+ if not small_labels:
1121
+ break
1122
+
1123
+ # compute centroids for existing labels
1124
+ centroids = {l: embeddings_norm[labels == l].mean(axis=0) for l in labels_unique}
1125
+
1126
+ for sl in small_labels:
1127
+ candidates = [l for l in labels_unique if l != sl]
1128
+ if not candidates:
1129
+ continue
1130
+
1131
+ # find nearest centroid (cosine distance)
1132
+ def cosine_dist(a, b):
1133
+ a_norm = a / (np.linalg.norm(a) + 1e-12)
1134
+ b_norm = b / (np.linalg.norm(b) + 1e-12)
1135
+ return 1.0 - float(np.dot(a_norm, b_norm))
1136
+
1137
+ distances = [(c, cosine_dist(centroids[sl], centroids[c])) for c in candidates]
1138
+ nearest = min(distances, key=lambda x: x[1])[0]
1139
+
1140
+ # reassign labels
1141
+ labels[labels == sl] = nearest
1142
+ changed = True
1143
+
1144
+ # Final cluster sizes
1145
+ unique2, counts2 = np.unique(labels, return_counts=True)
1146
+ sizes2 = dict(zip(unique2.tolist(), counts2.tolist()))
1147
+ self.logger.debug(f"Clusters after merge: {len(unique2)}, sizes: {sizes2}")
1148
+
1149
+ # Additional centroid-based merging: merge clusters whose centroids are very close
1150
+ try:
1151
+ labels_unique = np.unique(labels)
1152
+ centroids = {l: embeddings_norm[labels == l].mean(axis=0) for l in labels_unique}
1153
+ # compute pairwise centroid distances
1154
+ pairs = []
1155
+ for i, a in enumerate(labels_unique):
1156
+ for j, b in enumerate(labels_unique):
1157
+ if j <= i:
1158
+ continue
1159
+ dist = 1.0 - float(
1160
+ np.dot(
1161
+ centroids[a] / (np.linalg.norm(centroids[a]) + 1e-12),
1162
+ centroids[b] / (np.linalg.norm(centroids[b]) + 1e-12),
1163
+ )
1164
+ )
1165
+ pairs.append((dist, a, b))
1166
+
1167
+ # merge pairs with distance < threshold
1168
+ pairs.sort()
1169
+ merged = False
1170
+ for dist, a, b in pairs:
1171
+ if dist < threshold:
1172
+ # merge b into a
1173
+ labels[labels == b] = a
1174
+ merged = True
1175
+
1176
+ if merged:
1177
+ labels_unique2, counts2 = np.unique(labels, return_counts=True)
1178
+ sizes2 = dict(zip(labels_unique2.tolist(), counts2.tolist()))
1179
+ self.logger.debug(
1180
+ f"Clusters after centroid-merge: {len(labels_unique2)}, sizes: {sizes2}"
1181
+ )
1182
+
1183
+ # Iterative silhouette-guided merging: try merging closest centroid pairs while it improves or meets configured criteria
1184
+ try:
1185
+ iterative_thresh = getattr(self.config, "iterative_merge_threshold", threshold)
1186
+ silhouette_min = getattr(self.config, "iterative_merge_silhouette_threshold", 0.0)
1187
+ max_merge_iters = getattr(self.config, "iterative_merge_max_iters", 10)
1188
+
1189
+ def compute_centroids(curr_labels):
1190
+ uniq = np.unique(curr_labels)
1191
+ return {l: embeddings_norm[curr_labels == l].mean(axis=0) for l in uniq}
1192
+
1193
+ def pairwise_min_pair(centroids_dict):
1194
+ uniq = list(centroids_dict.keys())
1195
+ best = (1.0, None, None)
1196
+ for i, a in enumerate(uniq):
1197
+ for j in range(i + 1, len(uniq)):
1198
+ b = uniq[j]
1199
+ a_c = centroids_dict[a] / (np.linalg.norm(centroids_dict[a]) + 1e-12)
1200
+ b_c = centroids_dict[b] / (np.linalg.norm(centroids_dict[b]) + 1e-12)
1201
+ dist = 1.0 - float(np.dot(a_c, b_c))
1202
+ if dist < best[0]:
1203
+ best = (dist, a, b)
1204
+ return best
1205
+
1206
+ curr_labels = labels.copy()
1207
+ prev_score = None
1208
+ try:
1209
+ if len(np.unique(curr_labels)) > 1:
1210
+ prev_score = silhouette_score(embeddings_norm, curr_labels, metric="cosine")
1211
+ except Exception:
1212
+ prev_score = None
1213
+
1214
+ iters = 0
1215
+ while iters < max_merge_iters:
1216
+ iters += 1
1217
+ cent = compute_centroids(curr_labels)
1218
+ if len(cent) <= 1:
1219
+ break
1220
+ min_dist, a, b = pairwise_min_pair(cent)
1221
+ if min_dist >= iterative_thresh:
1222
+ break
1223
+
1224
+ # simulate merge and evaluate silhouette
1225
+ next_labels = curr_labels.copy()
1226
+ next_labels[next_labels == b] = a
1227
+
1228
+ try:
1229
+ if len(np.unique(next_labels)) > 1:
1230
+ next_score = silhouette_score(
1231
+ embeddings_norm, next_labels, metric="cosine"
1232
+ )
1233
+ else:
1234
+ next_score = 1.0
1235
+ except Exception:
1236
+ next_score = None
1237
+
1238
+ accept = False
1239
+ if next_score is not None:
1240
+ if prev_score is None:
1241
+ # accept merges that meet a minimum silhouette threshold
1242
+ if next_score >= silhouette_min:
1243
+ accept = True
1244
+ else:
1245
+ # accept if silhouette improves by a small margin or stays acceptable
1246
+ if next_score >= prev_score or next_score >= silhouette_min:
1247
+ accept = True
1248
+
1249
+ if accept:
1250
+ curr_labels = next_labels
1251
+ prev_score = next_score
1252
+ labels = curr_labels.copy()
1253
+ # continue iterating
1254
+ else:
1255
+ break
1256
+
1257
+ if iters > 1:
1258
+ labels_unique2, counts2 = np.unique(labels, return_counts=True)
1259
+ sizes2 = dict(zip(labels_unique2.tolist(), counts2.tolist()))
1260
+ self.logger.debug(
1261
+ f"Clusters after iterative-merge (iters={iters}): {len(labels_unique2)}, sizes: {sizes2}"
1262
+ )
1263
+
1264
+ # If user requested a target speaker count, greedily merge closest centroid pairs until we meet it
1265
+ try:
1266
+ target_k = getattr(self.config, "target_num_speakers", None)
1267
+ force_thresh = float(getattr(self.config, "target_force_threshold", 1.0))
1268
+ if target_k is not None:
1269
+ curr_labels = labels.copy()
1270
+
1271
+ def compute_centroids(curr):
1272
+ uniq = np.unique(curr)
1273
+ return {l: embeddings_norm[curr == l].mean(axis=0) for l in uniq}
1274
+
1275
+ merged_iters = 0
1276
+ while len(np.unique(curr_labels)) > target_k:
1277
+ cent = compute_centroids(curr_labels)
1278
+ if len(cent) <= 1:
1279
+ break
1280
+ # find closest pair
1281
+ uniq = list(cent.keys())
1282
+ best = (1.0, None, None)
1283
+ for i, a in enumerate(uniq):
1284
+ for j in range(i + 1, len(uniq)):
1285
+ b = uniq[j]
1286
+ a_c = cent[a] / (np.linalg.norm(cent[a]) + 1e-12)
1287
+ b_c = cent[b] / (np.linalg.norm(cent[b]) + 1e-12)
1288
+ dist = 1.0 - float(np.dot(a_c, b_c))
1289
+ if dist < best[0]:
1290
+ best = (dist, a, b)
1291
+
1292
+ min_dist, a, b = best
1293
+ # if min_dist is too large and force_thresh < 1.0, break
1294
+ if min_dist > force_thresh and force_thresh < 1.0:
1295
+ self.logger.warning(
1296
+ f"Stopping target-merge early: nearest cluster dist {min_dist:.3f} > force_thresh {force_thresh}"
1297
+ )
1298
+ break
1299
+
1300
+ # merge b into a
1301
+ curr_labels[curr_labels == b] = a
1302
+ merged_iters += 1
1303
+ # safety to avoid infinite loops
1304
+ if merged_iters > 1000:
1305
+ break
1306
+
1307
+ if merged_iters:
1308
+ labels = curr_labels.copy()
1309
+ labels_unique2, counts2 = np.unique(labels, return_counts=True)
1310
+ sizes2 = dict(zip(labels_unique2.tolist(), counts2.tolist()))
1311
+ self.logger.info(
1312
+ f"Clusters after target-merge (target={target_k}, iters={merged_iters}): {len(labels_unique2)}, sizes: {sizes2}"
1313
+ )
1314
+ except Exception:
1315
+ pass
1316
+
1317
+ except Exception:
1318
+ # don't let merging errors break the pipeline
1319
+ pass
1320
+
1321
+ # Heuristic fallback: if still too fragmented, run KMeans with estimated speaker count
1322
+ n_clusters_found = len(np.unique(labels))
1323
+ max_allowed = 20
1324
+ if n_clusters_found > max_allowed:
1325
+ est_k = min(12, max(2, int(len(embeddings) / 80)))
1326
+ self.logger.warning(
1327
+ f"Too many clusters ({n_clusters_found}), falling back to KMeans with k={est_k}"
1328
+ )
1329
+ try:
1330
+ km = KMeans(n_clusters=est_k, random_state=42, n_init=10)
1331
+ labels = km.fit_predict(embeddings_norm)
1332
+ # Re-merge small clusters after KMeans
1333
+ labels_unique2, counts2 = np.unique(labels, return_counts=True)
1334
+ sizes2 = dict(zip(labels_unique2.tolist(), counts2.tolist()))
1335
+ self.logger.info(
1336
+ f"Clusters after KMeans fallback: {len(labels_unique2)}, sizes: {sizes2}"
1337
+ )
1338
+ except Exception as e:
1339
+ self.logger.error(f"KMeans fallback failed: {e}")
1340
+ except Exception:
1341
+ pass
1342
+
1343
+ return labels
1344
+
1345
+ def _create_segments(
1346
+ self, windows: List[Tuple[float, float]], labels: np.ndarray, embeddings: np.ndarray
1347
+ ) -> List[SpeakerSegment]:
1348
+ """Create SpeakerSegment objects from windows and labels"""
1349
+ segments = []
1350
+
1351
+ for (start, end), label, emb in zip(windows, labels, embeddings):
1352
+ segments.append(
1353
+ SpeakerSegment(
1354
+ speaker_id=f"SPEAKER_{label:02d}",
1355
+ start=start,
1356
+ end=end,
1357
+ confidence=1.0,
1358
+ embedding=emb,
1359
+ )
1360
+ )
1361
+
1362
+ # If we used the fallback extractor, update segment embeddings to the deterministic MFCC embeddings
1363
+ if getattr(self, "_fallback_extractor", None) is not None:
1364
+ try:
1365
+ for i, seg in enumerate(segments):
1366
+ # reuse windows to create a deterministic embedding
1367
+ s, e = windows[i]
1368
+ # external code expects embeddings array, but ensure segment.embedding is deterministic
1369
+ if (
1370
+ segments[i].embedding is None
1371
+ or (isinstance(self._embedding_model, str)
1372
+ and self._embedding_model == "FALLBACK")
1373
+ ):
1374
+ # compute on-demand using fallback extractor
1375
+ seg_np = self._extract_waveform_segment(windows[i])
1376
+ segments[i].embedding = self._fallback_extractor(seg_np, getattr(self.config, "sample_rate", 16000))  # assumes a config sample_rate; defaults to 16 kHz
1377
+ except Exception:
1378
+ pass
1379
+
1380
+ return segments
1381
+
1382
+ def _postprocess_segments(self, segments: List[SpeakerSegment]) -> List[SpeakerSegment]:
1383
+ """Post-process segments: merge adjacent, filter short"""
1384
+ if not segments:
1385
+ return []
1386
+
1387
+ # Sort by start time
1388
+ segments = sorted(segments, key=lambda x: x.start)
1389
+
1390
+ # Merge adjacent segments from same speaker
1391
+ merged = [segments[0]]
1392
+
1393
+ for seg in segments[1:]:
1394
+ last = merged[-1]
1395
+ gap = seg.start - last.end
1396
+
1397
+ if seg.speaker_id == last.speaker_id and gap <= self.config.merge_gap_threshold:
1398
+ # Merge: extend last segment
1399
+ last.end = max(last.end, seg.end)
1400
+ last.confidence = (last.confidence + seg.confidence) / 2
1401
+ else:
1402
+ merged.append(seg)
1403
+
1404
+ # Smoothing: fix short isolated segments between identical speakers
1405
+ smoothed = merged
1406
+ if len(smoothed) >= 3:
1407
+ changed = False
1408
+ for i in range(1, len(smoothed) - 1):
1409
+ seg = smoothed[i]
1410
+ prev = smoothed[i - 1]
1411
+ nxt = smoothed[i + 1]
1412
+ threshold = max(1.0, self.config.min_segment_duration)
1413
+ if seg.duration < threshold and prev.speaker_id == nxt.speaker_id:
1414
+ seg.speaker_id = prev.speaker_id
1415
+ changed = True
1416
+
1417
+ if changed:
1418
+ # merge again after smoothing
1419
+ merged2 = [smoothed[0]]
1420
+ for seg in smoothed[1:]:
1421
+ last = merged2[-1]
1422
+ gap = seg.start - last.end
1423
+ if seg.speaker_id == last.speaker_id and gap <= self.config.merge_gap_threshold:
1424
+ last.end = max(last.end, seg.end)
1425
+ last.confidence = (last.confidence + seg.confidence) / 2
1426
+ else:
1427
+ merged2.append(seg)
1428
+ merged = merged2
1429
+
1430
+ # Filter short segments
1431
+ filtered = [seg for seg in merged if seg.duration >= self.config.min_segment_duration]
1432
+
1433
+ return filtered
1434
+
1435
+ def _merge_segments(
1436
+ self, segments: List[SpeakerSegment], max_gap: float = 0.5
1437
+ ) -> List[SpeakerSegment]:
1438
+ """Compatibility helper: merge adjacent segments from same speaker within max_gap"""
1439
+ if not segments:
1440
+ return []
1441
+
1442
+ segments = sorted(segments, key=lambda x: x.start)
1443
+ merged_list = [segments[0]]
1444
+
1445
+ for seg in segments[1:]:
1446
+ last = merged_list[-1]
1447
+ gap = seg.start - last.end
1448
+ if seg.speaker_id == last.speaker_id and gap <= max_gap:
1449
+ # Merge: extend last segment
1450
+ last.end = max(last.end, seg.end)
1451
+ last.confidence = (last.confidence + seg.confidence) / 2
1452
+ else:
1453
+ merged_list.append(seg)
1454
+
1455
+ return merged_list
1456
+
1457
+ def _detect_overlaps(self, segments: List[SpeakerSegment]) -> List[SpeakerSegment]:
1458
+ """Mark segments that overlap with other speakers"""
1459
+ for i, seg1 in enumerate(segments):
1460
+ for j, seg2 in enumerate(segments):
1461
+ if i != j and seg1.speaker_id != seg2.speaker_id:
1462
+ # Check for time overlap
1463
+ overlap_start = max(seg1.start, seg2.start)
1464
+ overlap_end = min(seg1.end, seg2.end)
1465
+
1466
+ if overlap_start < overlap_end:
1467
+ seg1.is_overlap = True
1468
+ seg2.is_overlap = True
1469
+
1470
+ return segments
1471
+
1472
+ def get_speaker_stats(self, segments: List[SpeakerSegment]) -> Dict[str, Dict[str, float]]:
1473
+ """
1474
+ Get statistics for each speaker.
1475
+
1476
+ Returns:
1477
+ Dict mapping speaker_id to stats (total_duration, num_segments, etc.)
1478
+ """
1479
+ stats = {}
1480
+
1481
+ for seg in segments:
1482
+ if seg.speaker_id not in stats:
1483
+ stats[seg.speaker_id] = {
1484
+ "total_duration": 0.0,
1485
+ "num_segments": 0,
1486
+ "avg_segment_duration": 0.0,
1487
+ "overlap_duration": 0.0,
1488
+ }
1489
+
1490
+ stats[seg.speaker_id]["total_duration"] += seg.duration
1491
+ stats[seg.speaker_id]["num_segments"] += 1
1492
+
1493
+ if seg.is_overlap:
1494
+ stats[seg.speaker_id]["overlap_duration"] += seg.duration
1495
+
1496
+ # Calculate averages
1497
+ for speaker_id in stats:
1498
+ num_segs = stats[speaker_id]["num_segments"]
1499
+ if num_segs > 0:
1500
+ stats[speaker_id]["avg_segment_duration"] = (
1501
+ stats[speaker_id]["total_duration"] / num_segs
1502
+ )
1503
+
1504
+ return stats
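
The clustering logic above combines cosine-distance agglomerative clustering with silhouette and centroid-distance heuristics that can collapse an over-fragmented result down to a single speaker. The following standalone sketch is not part of src/diarization.py; it only assumes NumPy and a scikit-learn version that accepts metric="cosine" (as the code above does), and the names base_a/base_b are illustrative. It shows the same core idea on synthetic embeddings:

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)

# Two synthetic "speakers": embeddings scattered around two distinct directions
base_a = np.zeros(16)
base_a[0] = 1.0
base_b = np.zeros(16)
base_b[1] = 1.0
emb = np.vstack([
    base_a + rng.normal(0.0, 0.05, size=(20, 16)),
    base_b + rng.normal(0.0, 0.05, size=(20, 16)),
])

clustering = AgglomerativeClustering(n_clusters=2, metric="cosine", linkage="average")
labels = clustering.fit_predict(emb)

# Collapse to a single speaker when clusters are poorly separated, mirroring
# the silhouette_collapse_threshold heuristic in _cluster_embeddings above
score = silhouette_score(emb, labels, metric="cosine")
if score < 0.05:
    labels = np.zeros(len(emb), dtype=int)

print("clusters:", np.unique(labels).size, "silhouette:", round(float(score), 3))

On this synthetic data the two clusters are well separated, so the silhouette check keeps both; replacing base_b with base_a collapses the result to one speaker, which mirrors the single-speaker heuristics used above.
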
src/document_generator.py ADDED
@@ -0,0 +1,852 @@
1
+ """
2
+ Document Generator Module
3
+ =========================
4
+ Exports meeting minutes to formatted .docx using python-docx.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ import warnings
11
+ from dataclasses import dataclass, field
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional
15
+
16
+ try:
17
+ from docx import Document
18
+ from docx.enum.table import WD_TABLE_ALIGNMENT
19
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
20
+ from docx.oxml import OxmlElement
21
+ from docx.oxml.ns import qn
22
+ from docx.shared import Cm, Pt, RGBColor
23
+
24
+ DOCX_AVAILABLE = True
25
+ except Exception:
26
+ # Minimal fallback implementations for environments without python-docx (used in tests)
27
+ DOCX_AVAILABLE = False
28
+
29
+ class Document:
30
+ def __init__(self):
31
+ self._paragraphs = []
32
+ self.sections = []
33
+
34
+ # Minimal styles container to mimic python-docx for tests
35
+ class DummyStyle:
36
+ def __init__(self):
37
+ self.font = type("F", (), {"name": None, "size": None})()
38
+
39
+ class RFonts:
40
+ def set(self, *args, **kwargs):
41
+ pass
42
+
43
+ class RPr:
44
+ def __init__(self):
45
+ self.rFonts = RFonts()
46
+
47
+ class Element:
48
+ def __init__(self):
49
+ self.rPr = RPr()
50
+
51
+ self._element = Element()
52
+
53
+ class Styles:
54
+ def __init__(self):
55
+ self._styles = {"Normal": DummyStyle()}
56
+
57
+ def __getitem__(self, key):
58
+ return self._styles.setdefault(key, DummyStyle())
59
+
60
+ self.styles = Styles()
61
+
62
+ class Run:
63
+ def __init__(self, text=""):
64
+ self.text = str(text)
65
+ self.bold = False
66
+ self.italic = False
67
+ self.font = type("F", (), {"size": None, "color": type("C", (), {"rgb": None})()})
68
+
69
+ class Paragraph:
70
+ def __init__(self, text=""):
71
+ self.runs = []
72
+ self.paragraph_format = type("PF", (), {"space_after": None})()
73
+ self.alignment = None
74
+ if text:
75
+ self.add_run(text)
76
+
77
+ def add_run(self, text=""):
78
+ # Create a lightweight run-like object for fallback
79
+ run = type(
80
+ "Run",
81
+ (),
82
+ {
83
+ "text": str(text),
84
+ "bold": False,
85
+ "italic": False,
86
+ "font": type(
87
+ "F", (), {"size": None, "color": type("C", (), {"rgb": None})()}
88
+ )(),
89
+ },
90
+ )()
91
+ self.runs.append(run)
92
+ return run
93
+
94
+ def add_paragraph(self, text="", **kwargs):
95
+ # Accept style and other kwargs for compatibility
96
+ para = self.Paragraph(text)
97
+ self._paragraphs.append(para)
98
+ return para
99
+
100
+ def add_heading(self, text, level=None, **kwargs):
101
+ para = self.Paragraph(text)
102
+ self._paragraphs.append(para)
103
+ return para
104
+
105
+ def add_table(self, rows, cols):
106
+ outer = self
107
+
108
+ class Cell:
109
+ def __init__(self):
110
+ self.paragraphs = [outer.Paragraph()]
111
+
112
+ # Minimal _tc structure to support shading and other docx operations in fallback
113
+ class TCPr:
114
+ def append(self, *args, **kwargs):
115
+ pass
116
+
117
+ class TC:
118
+ def get_or_add_tcPr(self):
119
+ return TCPr()
120
+
121
+ self._tc = TC()
122
+
123
+ @property
124
+ def text(self):
125
+ if self.paragraphs and self.paragraphs[0].runs:
126
+ return " ".join(run.text for run in self.paragraphs[0].runs)
127
+ return ""
128
+
129
+ @text.setter
130
+ def text(self, value):
131
+ # Create lightweight run-like object
132
+ self.paragraphs[0].runs = [
133
+ type(
134
+ "Run",
135
+ (),
136
+ {
137
+ "text": str(value),
138
+ "bold": False,
139
+ "italic": False,
140
+ "font": type(
141
+ "F", (), {"size": None, "color": type("C", (), {"rgb": None})()}
142
+ )(),
143
+ },
144
+ )()
145
+ ]
146
+
147
+ class Row:
148
+ def __init__(self, cols):
149
+ self.cells = [Cell() for _ in range(cols)]
150
+
151
+ table = type(
152
+ "Table",
153
+ (),
154
+ {"rows": [Row(cols) for _ in range(rows)], "style": None, "alignment": None},
155
+ )()
156
+ return table
157
+
158
+ def save(self, path):
159
+ # Save a plain text fallback document so tests can verify file exists
160
+ lines = []
161
+ for p in self._paragraphs:
162
+ if hasattr(p, "runs"):
163
+ lines.append(" ".join(getattr(r, "text", "") for r in p.runs))
164
+ else:
165
+ lines.append(str(p))
166
+ with open(path, "w", encoding="utf-8") as f:
167
+ f.write("\n".join(lines))
168
+
169
+ class Pt:
170
+ def __init__(self, value):
171
+ self.value = value
172
+
173
+ class Cm:
174
+ def __init__(self, value):
175
+ self.value = value
176
+
177
+ class RGBColor:
178
+ def __init__(self, r, g, b):
179
+ pass
180
+
181
+ class WD_ALIGN_PARAGRAPH:
182
+ CENTER = 1
183
+
184
+ class WD_TABLE_ALIGNMENT:
185
+ LEFT = 1
186
+
187
+ class OxmlElement:
188
+ def __init__(self, *args, **kwargs):
189
+ pass
190
+
191
+ def set(self, *args, **kwargs):
192
+ pass
193
+
194
+ def qn(x):
195
+ return x
196
+
197
+
198
+ from src.summarizer import MeetingSummary
199
+ from src.transcriber import TranscriptSegment
200
+
201
+
202
+ @dataclass
203
+ class MeetingMetadata:
204
+ """Meeting information for document header"""
205
+
206
+ title: str
207
+ date: str
208
+ time: str = ""
209
+ location: str = ""
210
+ duration: str = ""
211
+ participants: Optional[List[str]] = None
212
+ organizer: str = ""
213
+ agenda: str = ""
214
+
215
+ @classmethod
216
+ def create_default(cls, audio_duration_sec: float = 0) -> "MeetingMetadata":
217
+ """Create default metadata"""
218
+ duration_str = ""
219
+ if audio_duration_sec > 0:
220
+ hours = int(audio_duration_sec // 3600)
221
+ minutes = int((audio_duration_sec % 3600) // 60)
222
+ seconds = int(audio_duration_sec % 60)
223
+
224
+ if hours > 0:
225
+ duration_str = f"{hours} jam {minutes} menit {seconds} detik"
226
+ else:
227
+ duration_str = f"{minutes} menit {seconds} detik"
228
+
229
+ return cls(
230
+ title="Notulensi Rapat",
231
+ date=datetime.now().strftime("%d %B %Y"),
232
+ time=datetime.now().strftime("%H:%M"),
233
+ duration=duration_str,
234
+ )
235
+
236
+
237
+ @dataclass
238
+ class DocumentConfig:
239
+ """Configuration for document generation"""
240
+
241
+ # Font settings
242
+ title_font_size: int = 18
243
+ heading1_font_size: int = 14
244
+ heading2_font_size: int = 12
245
+ body_font_size: int = 11
246
+ font_family: str = "Calibri"
247
+
248
+ # Layout
249
+ page_width: float = 21.0 # cm (A4)
250
+ page_height: float = 29.7 # cm (A4)
251
+ margin_top: float = 2.5
252
+ margin_bottom: float = 2.5
253
+ margin_left: float = 3.0
254
+ margin_right: float = 2.5
255
+
256
+ # Content options
257
+ include_timestamps: bool = True
258
+ include_speaker_colors: bool = True
259
+ include_table_of_contents: bool = False
260
+ include_page_numbers: bool = True
261
+
262
+ # Sections to include
263
+ sections: Dict[str, bool] = field(
264
+ default_factory=lambda: {
265
+ "header": True,
266
+ "meeting_info": True,
267
+ "summary": True,
268
+ "decisions": True,
269
+ "action_items": True,
270
+ "transcript": True,
271
+ "footer": True,
272
+ }
273
+ )
274
+
275
+
276
+ class DocumentGenerator:
277
+ """
278
+ Generates formatted .docx meeting minutes.
279
+
280
+ Structure:
281
+ - Title
282
+ - Meeting Information
283
+ - Executive Summary
284
+ - Key Points
285
+ - Decisions
286
+ - Action Items
287
+ - Full Transcript
288
+ - Footer
289
+
290
+ Attributes:
291
+ config: DocumentConfig object
292
+ output_dir: Output directory path
293
+
294
+ Example:
295
+ >>> generator = DocumentGenerator()
296
+ >>> doc_path = generator.generate(metadata, summary, transcript)
297
+ >>> print(f"Document saved: {doc_path}")
298
+ """
299
+
300
+ # Speaker colors for visual distinction
301
+ SPEAKER_COLORS = [
302
+ RGBColor(0, 102, 204), # Blue
303
+ RGBColor(204, 51, 0), # Red
304
+ RGBColor(0, 153, 51), # Green
305
+ RGBColor(153, 51, 153), # Purple
306
+ RGBColor(204, 102, 0), # Orange
307
+ RGBColor(0, 153, 153), # Teal
308
+ RGBColor(102, 102, 0), # Olive
309
+ RGBColor(153, 0, 76), # Maroon
310
+ ]
311
+
312
+ def __init__(self, config: Optional[DocumentConfig] = None, output_dir: str = "./data/output"):
313
+ """
314
+ Initialize DocumentGenerator.
315
+
316
+ Args:
317
+ config: DocumentConfig object
318
+ output_dir: Directory for output files
319
+ """
320
+ self.config = config or DocumentConfig()
321
+ self.output_dir = Path(output_dir)
322
+ self.output_dir.mkdir(parents=True, exist_ok=True)
323
+
324
+ self._speaker_color_map: Dict[str, RGBColor] = {}
325
+
326
+ def generate(
327
+ self,
328
+ metadata: MeetingMetadata,
329
+ summary: MeetingSummary,
330
+ transcript: List[TranscriptSegment],
331
+ output_filename: Optional[str] = None,
332
+ ) -> str:
333
+ """
334
+ Generate complete meeting minutes document.
335
+
336
+ Args:
337
+ metadata: Meeting information
338
+ summary: Generated summary
339
+ transcript: Transcribed segments with speakers
340
+ output_filename: Output file name (auto-generated if None)
341
+
342
+ Returns:
343
+ Path to generated document
344
+ """
345
+ # Create document
346
+ doc = Document()
347
+
348
+ # Setup document
349
+ self._setup_document(doc)
350
+ self._setup_styles(doc)
351
+
352
+ # Build speaker color map
353
+ self._build_speaker_color_map(transcript)
354
+
355
+ # Add sections
356
+ if self.config.sections.get("header", True):
357
+ self._add_title(doc, metadata)
358
+
359
+ if self.config.sections.get("meeting_info", True):
360
+ self._add_meeting_info(doc, metadata)
361
+
362
+ if self.config.sections.get("summary", True):
363
+ self._add_summary_section(doc, summary)
364
+
365
+ if self.config.sections.get("decisions", True):
366
+ self._add_decisions_section(doc, summary.decisions)
367
+
368
+ if self.config.sections.get("action_items", True):
369
+ self._add_action_items_section(doc, summary.action_items)
370
+
371
+ if self.config.sections.get("transcript", True):
372
+ self._add_transcript_section(doc, transcript)
373
+
374
+ if self.config.sections.get("footer", True):
375
+ self._add_footer(doc)
376
+
377
+ # Generate filename if not provided
378
+ if output_filename is None:
379
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
380
+ safe_title = self._sanitize_filename(metadata.title)[:30]
381
+ ext = ".docx"  # the minimal fallback writer below also emits a .docx package
382
+ output_filename = f"notulensi_{safe_title}_{timestamp}{ext}"
383
+
384
+ # Ensure .docx extension
385
+ if not output_filename.endswith(".docx"):
386
+ output_filename = Path(output_filename).with_suffix(".docx").name
387
+
388
+ output_path = self.output_dir / output_filename
389
+
390
+ # Save document
391
+ if DOCX_AVAILABLE:
392
+ doc.save(str(output_path))
393
+ else:
394
+ # If python-docx is not available, build a minimal valid .docx package so Word can open it.
395
+ warnings.warn(
396
+ "python-docx is not available in the current environment; generating a minimal .docx package instead."
397
+ )
398
+ paragraphs = self._extract_paragraph_texts(doc)
399
+ self._save_minimal_docx(str(output_path), paragraphs)
400
+
401
+ return str(output_path)
402
+
403
+ def _setup_document(self, doc: Document):
404
+ """Configure document settings"""
405
+ # Set page margins
406
+ sections = doc.sections
407
+ for section in sections:
408
+ section.top_margin = Cm(self.config.margin_top)
409
+ section.bottom_margin = Cm(self.config.margin_bottom)
410
+ section.left_margin = Cm(self.config.margin_left)
411
+ section.right_margin = Cm(self.config.margin_right)
412
+
413
+ def _setup_styles(self, doc: Document):
414
+ """Configure document styles"""
415
+ # Normal style
416
+ style = doc.styles["Normal"]
417
+ style.font.name = self.config.font_family
418
+ style.font.size = Pt(self.config.body_font_size)
419
+
420
+ # Set font for East Asian text
421
+ style._element.rPr.rFonts.set(qn("w:eastAsia"), self.config.font_family)
422
+
423
+ def _build_speaker_color_map(self, transcript: List[TranscriptSegment]):
424
+ """Build consistent color mapping for speakers"""
425
+ speakers = sorted(set(seg.speaker_id for seg in transcript))
426
+
427
+ for i, speaker in enumerate(speakers):
428
+ self._speaker_color_map[speaker] = self.SPEAKER_COLORS[i % len(self.SPEAKER_COLORS)]
429
+
430
+ def _add_title(self, doc: Document, metadata: MeetingMetadata):
431
+ """Add document title"""
432
+ # Main title
433
+ title_para = doc.add_paragraph()
434
+ title_run = title_para.add_run("NOTULENSI RAPAT")
435
+ title_run.bold = True
436
+ title_run.font.size = Pt(self.config.title_font_size)
437
+ title_run.font.color.rgb = RGBColor(0, 51, 102)
438
+ title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
439
+
440
+ # Subtitle with meeting title
441
+ if metadata.title and metadata.title != "Notulensi Rapat":
442
+ subtitle_para = doc.add_paragraph()
443
+ subtitle_run = subtitle_para.add_run(metadata.title)
444
+ subtitle_run.bold = True
445
+ subtitle_run.font.size = Pt(self.config.heading1_font_size)
446
+ subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
447
+
448
+ # Generated by note
449
+ note_para = doc.add_paragraph()
450
+ note_run = note_para.add_run("Generated by AI Meeting Transcriber (SpeechBrain + BERT)")
451
+ note_run.italic = True
452
+ note_run.font.size = Pt(9)
453
+ note_run.font.color.rgb = RGBColor(128, 128, 128)
454
+ note_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
455
+
456
+ # Spacer
457
+ doc.add_paragraph()
458
+
459
+ def _add_meeting_info(self, doc: Document, metadata: MeetingMetadata):
460
+ """Add meeting information section"""
461
+ # Section heading
462
+ heading = doc.add_heading("Informasi Rapat", level=1)
463
+ heading.runs[0].font.size = Pt(self.config.heading1_font_size)
464
+
465
+ # Create info table
466
+ info_items = [
467
+ ("Tanggal", metadata.date),
468
+ ("Waktu", metadata.time or "-"),
469
+ ("Lokasi/Platform", metadata.location or "-"),
470
+ ("Durasi", metadata.duration or "-"),
471
+ ("Penyelenggara", metadata.organizer or "-"),
472
+ ]
473
+
474
+ # Filter out empty items
475
+ info_items = [(label, value) for label, value in info_items if value and value != "-"]
476
+
477
+ if info_items:
478
+ table = doc.add_table(rows=len(info_items), cols=2)
479
+ table.style = "Table Grid"
480
+ table.alignment = WD_TABLE_ALIGNMENT.LEFT
481
+
482
+ for i, (label, value) in enumerate(info_items):
483
+ row = table.rows[i]
484
+
485
+ # Label cell
486
+ cell_label = row.cells[0]
487
+ cell_label.text = label
488
+ cell_label.paragraphs[0].runs[0].bold = True
489
+ cell_label.width = Cm(4)
490
+
491
+ # Value cell
492
+ cell_value = row.cells[1]
493
+ cell_value.text = value
494
+
495
+ # Add participants if available
496
+ if metadata.participants:
497
+ doc.add_paragraph()
498
+ para = doc.add_paragraph()
499
+ para.add_run("Peserta Rapat: ").bold = True
500
+ para.add_run(", ".join(metadata.participants))
501
+
502
+ # Add agenda if available
503
+ if metadata.agenda:
504
+ doc.add_paragraph()
505
+ para = doc.add_paragraph()
506
+ para.add_run("Agenda: ").bold = True
507
+ para.add_run(metadata.agenda)
508
+
509
+ # Spacer
510
+ doc.add_paragraph()
511
+
512
+ def _add_summary_section(self, doc: Document, summary: MeetingSummary):
513
+ """Add executive summary section"""
514
+ # Section heading
515
+ heading = doc.add_heading("Ringkasan Eksekutif", level=1)
516
+ heading.runs[0].font.size = Pt(self.config.heading1_font_size)
517
+
518
+ # Overview
519
+ if summary.overview and not self._is_placeholder_text(summary.overview):
520
+ overview_para = doc.add_paragraph()
521
+ overview_para.add_run(summary.overview)
522
+ overview_para.paragraph_format.space_after = Pt(12)
523
+ else:
524
+ overview_para = doc.add_paragraph()
525
+ overview_para.add_run(
526
+ "Ringkasan tidak tersedia. (Model ringkasan tidak dimuat atau data tidak mencukupi.)"
527
+ )
528
+ overview_para.runs[0].italic = True
529
+ overview_para.runs[0].font.color.rgb = RGBColor(128, 128, 128)
530
+
531
+ # Key points (filter placeholders)
532
+ filtered_points = [
533
+ p for p in (summary.key_points or []) if not self._is_placeholder_text(p)
534
+ ]
535
+ if filtered_points:
536
+ subheading = doc.add_heading("Poin-Poin Penting", level=2)
537
+ subheading.runs[0].font.size = Pt(self.config.heading2_font_size)
538
+
539
+ for point in filtered_points:
540
+ para = doc.add_paragraph(point, style="List Bullet")
541
+ else:
542
+ para = doc.add_paragraph()
543
+ para.add_run("Tidak ada poin penting yang dihasilkan secara otomatis.")
544
+ para.runs[0].italic = True
545
+ para.runs[0].font.color.rgb = RGBColor(128, 128, 128)
546
+
547
+ # Topics discussed (filter placeholders)
548
+ topics_filtered = [t for t in (summary.topics or []) if not self._is_placeholder_text(t)]
549
+ if topics_filtered:
550
+ doc.add_paragraph()
551
+ para = doc.add_paragraph()
552
+ para.add_run("Topik yang dibahas: ").bold = True
553
+ para.add_run(", ".join(topics_filtered))
554
+
555
+ # Spacer
556
+ doc.add_paragraph()
557
+
558
+ def _add_decisions_section(self, doc: Document, decisions: List[str]):
559
+ """Add decisions section"""
560
+ # Section heading
561
+ heading = doc.add_heading("Keputusan Rapat", level=1)
562
+ heading.runs[0].font.size = Pt(self.config.heading1_font_size)
563
+
564
+ if decisions:
565
+ for i, decision in enumerate(decisions, 1):
566
+ para = doc.add_paragraph()
567
+ para.add_run(f"{i}. ").bold = True
568
+ para.add_run(decision)
569
+ else:
570
+ para = doc.add_paragraph()
571
+ para.add_run("Tidak ada keputusan yang teridentifikasi secara otomatis.")
572
+ para.runs[0].italic = True
573
+ para.runs[0].font.color.rgb = RGBColor(128, 128, 128)
574
+
575
+ # Spacer
576
+ doc.add_paragraph()
577
+
578
+ def _add_action_items_section(self, doc: Document, action_items: List[Dict[str, str]]):
579
+ """Add action items section"""
580
+ # Section heading
581
+ heading = doc.add_heading("Action Items / Tindak Lanjut", level=1)
582
+ heading.runs[0].font.size = Pt(self.config.heading1_font_size)
583
+
584
+ if action_items:
585
+ # Create table
586
+ table = doc.add_table(rows=len(action_items) + 1, cols=4)
587
+ table.style = "Table Grid"
588
+ table.alignment = WD_TABLE_ALIGNMENT.LEFT
589
+
590
+ # Header row
591
+ headers = ["No.", "Penanggung Jawab", "Tugas", "Deadline"]
592
+ header_row = table.rows[0]
593
+
594
+ for i, header_text in enumerate(headers):
595
+ cell = header_row.cells[i]
596
+ cell.text = header_text
597
+
598
+ # Style header
599
+ for paragraph in cell.paragraphs:
600
+ for run in paragraph.runs:
601
+ run.bold = True
602
+
603
+ # Set header background color
604
+ shading = OxmlElement("w:shd")
605
+ shading.set(qn("w:fill"), "D9E2F3")
606
+ cell._tc.get_or_add_tcPr().append(shading)
607
+
608
+ # Data rows
609
+ for i, item in enumerate(action_items, 1):
610
+ row = table.rows[i]
611
+
612
+ row.cells[0].text = str(i)
613
+ row.cells[1].text = item.get("owner", "-")
614
+ row.cells[2].text = item.get("task", "-")
615
+ row.cells[3].text = item.get("due", "-")
616
+
617
+ # Set column widths
618
+ for row in table.rows:
619
+ row.cells[0].width = Cm(1.0)
620
+ row.cells[1].width = Cm(3.5)
621
+ row.cells[2].width = Cm(9.0)
622
+ row.cells[3].width = Cm(2.5)
623
+ else:
624
+ para = doc.add_paragraph()
625
+ para.add_run("Tidak ada action item yang teridentifikasi secara otomatis.")
626
+ para.runs[0].italic = True
627
+ para.runs[0].font.color.rgb = RGBColor(128, 128, 128)
628
+
629
+ # Spacer
630
+ doc.add_paragraph()
631
+
632
+ def _add_transcript_section(self, doc: Document, transcript: List[TranscriptSegment]):
633
+ """Add full transcript section"""
634
+ # Section heading
635
+ heading = doc.add_heading("Transkrip Percakapan", level=1)
636
+ heading.runs[0].font.size = Pt(self.config.heading1_font_size)
637
+
638
+ if not transcript:
639
+ para = doc.add_paragraph()
640
+ para.add_run("Tidak ada transkrip yang tersedia.")
641
+ para.runs[0].italic = True
642
+ return
643
+
644
+ # Add each segment
645
+ for seg in transcript:
646
+ para = doc.add_paragraph()
647
+
648
+ # Timestamp
649
+ if self.config.include_timestamps:
650
+ timestamp = self._format_timestamp(seg.start, seg.end)
651
+
652
+ # Speaker label with color
653
+ speaker_run = para.add_run(f"{seg.speaker_id} [{timestamp}]: ")
654
+ speaker_run.bold = True
655
+
656
+ if self.config.include_speaker_colors:
657
+ color = self._speaker_color_map.get(seg.speaker_id, RGBColor(0, 0, 0))
658
+ speaker_run.font.color.rgb = color
659
+ else:
660
+ speaker_run = para.add_run(f"{seg.speaker_id}: ")
661
+ speaker_run.bold = True
662
+
663
+ # Transcript text (sanitize placeholder/fallback strings)
664
+ text = seg.text or ""
665
+ cleaned = self._clean_text_for_doc(text)
666
+ para.add_run(cleaned)
667
+
668
+ # Mark overlapping speech
669
+ if seg.is_overlap:
670
+ overlap_run = para.add_run(" [OVERLAP]")
671
+ overlap_run.italic = True
672
+ overlap_run.font.color.rgb = RGBColor(255, 102, 0)
673
+ overlap_run.font.size = Pt(9)
674
+
675
+ def _add_footer(self, doc: Document):
676
+ """Add document footer"""
677
+ # Separator line
678
+ doc.add_paragraph()
679
+ separator = doc.add_paragraph("─" * 70)
680
+ separator.alignment = WD_ALIGN_PARAGRAPH.CENTER
681
+
682
+ # Footer text
683
+ footer_para = doc.add_paragraph()
684
+
685
+ timestamp = datetime.now().strftime("%d %B %Y, %H:%M:%S")
686
+ footer_text = f"Dokumen ini dihasilkan secara otomatis pada {timestamp}"
687
+
688
+ footer_run = footer_para.add_run(footer_text)
689
+ footer_run.italic = True
690
+ footer_run.font.size = Pt(9)
691
+ footer_run.font.color.rgb = RGBColor(128, 128, 128)
692
+ footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
693
+
694
+ # Disclaimer
695
+ disclaimer_para = doc.add_paragraph()
696
+ disclaimer_text = (
697
+ "Hasil transkripsi dan ringkasan mungkin mengandung ketidakakuratan. "
698
+ "Harap verifikasi informasi penting."
699
+ )
700
+
701
+ disclaimer_run = disclaimer_para.add_run(disclaimer_text)
702
+ disclaimer_run.italic = True
703
+ disclaimer_run.font.size = Pt(8)
704
+ disclaimer_run.font.color.rgb = RGBColor(150, 150, 150)
705
+ disclaimer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
706
+
707
+ def _is_placeholder_text(self, text: Optional[str]) -> bool:
708
+ """Detect summarizer/ASR fallback placeholder text."""
709
+ if not text:
710
+ return True
711
+ t = str(text).strip()
712
+ # common placeholder patterns from summarizer / transcriber fallbacks
713
+ if re.search(r"\[\s*Transkripsi placeholder", t, re.I):
714
+ return True
715
+ if re.search(r"placeholder", t, re.I) and len(t) < 120:
716
+ return True
717
+ return False
718
+
719
+ def _clean_text_for_doc(self, text: Optional[str]) -> str:
720
+ """Clean text for document: replace raw placeholders with user-friendly notices."""
721
+ if not text or self._is_placeholder_text(text):
722
+ return "[transkripsi tidak tersedia]"
723
+ # Remove any bracketed placeholder fragments embedded in text
724
+ cleaned = re.sub(r"\[\s*Transkripsi placeholder[^\]]*\]", "", str(text), flags=re.I).strip()
725
+ return cleaned or "[transkripsi tidak tersedia]"
726
+
727
+ @staticmethod
728
+ def _format_timestamp(start: float, end: float) -> str:
729
+ """Format time range as HH:MM:SS"""
730
+
731
+ def sec_to_str(sec: float) -> str:
732
+ sec = max(0.0, float(sec))
733
+ h = int(sec // 3600)
734
+ m = int((sec % 3600) // 60)
735
+ s = int(sec % 60)
736
+
737
+ if h > 0:
738
+ return f"{h:02d}:{m:02d}:{s:02d}"
739
+ return f"{m:02d}:{s:02d}"
740
+
741
+ return f"{sec_to_str(start)}–{sec_to_str(end)}"
742
+
743
+ def _save_minimal_docx(self, path: str, paragraphs: List[str]):
744
+ """Create a minimal valid .docx (zip package) containing plain paragraphs.
745
+ This is a lightweight fallback when python-docx is not installed, to ensure
746
+ the generated file can be opened in Word.
747
+ """
748
+ import zipfile
749
+
750
+ def _escape_xml(s: str) -> str:
751
+ return (
752
+ s.replace("&", "&amp;")
753
+ .replace("<", "&lt;")
754
+ .replace(">", "&gt;")
755
+ .replace('"', "&quot;")
756
+ .replace("'", "&apos;")
757
+ )
758
+
759
+ content_types = (
760
+ '<?xml version="1.0" encoding="UTF-8"?>\n'
761
+ '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">\n'
762
+ ' <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>\n'
763
+ ' <Default Extension="xml" ContentType="application/xml"/>\n'
764
+ ' <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>\n'
765
+ "</Types>"
766
+ )
767
+
768
+ rels = (
769
+ '<?xml version="1.0" encoding="UTF-8"?>\n'
770
+ '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">\n'
771
+ ' <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>\n'
772
+ "</Relationships>"
773
+ )
774
+
775
+ doc_xml_header = (
776
+ '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
777
+ '<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" '
778
+ 'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" '
779
+ 'xmlns:o="urn:schemas-microsoft-com:office:office" '
780
+ 'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" '
781
+ 'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" '
782
+ 'xmlns:v="urn:schemas-microsoft-com:vml" '
783
+ 'xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" '
784
+ 'xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" '
785
+ 'xmlns:w10="urn:schemas-microsoft-com:office:word" '
786
+ 'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" '
787
+ 'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" '
788
+ 'xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" '
789
+ 'xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" '
790
+ 'xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" '
791
+ 'xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">\n'
792
+ " <w:body>\n"
793
+ )
794
+
795
+ doc_xml_footer = (
796
+ " <w:sectPr>\n"
797
+ ' <w:pgSz w:w="11900" w:h="16840"/>\n'
798
+ ' <w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440" w:header="720" w:footer="720" w:gutter="0"/>\n'
799
+ " </w:sectPr>\n"
800
+ " </w:body>\n"
801
+ "</w:document>"
802
+ )
803
+
804
+ # Build paragraphs as simple <w:p><w:r><w:t>text</w:t></w:r></w:p>
805
+ paras_xml = []
806
+ for p in paragraphs:
807
+ t = _escape_xml(p.strip())
808
+ if not t:
809
+ # preserve blank line
810
+ paras_xml.append(" <w:p/>\n")
811
+ else:
812
+ paras_xml.append(f' <w:p><w:r><w:t xml:space="preserve">{t}</w:t></w:r></w:p>\n')
813
+
814
+ doc_xml = doc_xml_header + "".join(paras_xml) + doc_xml_footer
815
+
816
+ with zipfile.ZipFile(path, "w", compression=zipfile.ZIP_DEFLATED) as z:
817
+ z.writestr("[Content_Types].xml", content_types)
818
+ z.writestr("_rels/.rels", rels)
819
+ z.writestr("word/document.xml", doc_xml)
820
+
821
+ def _extract_paragraph_texts(self, doc: Document) -> List[str]:
822
+ """Return paragraph texts from a python-docx Document or the fallback Document"""
823
+ paras: List[str] = []
824
+ # python-docx Document
825
+ try:
826
+ # use the paragraphs attribute when present
827
+ if hasattr(doc, "paragraphs"):
828
+ for p in doc.paragraphs:
829
+ paras.append(p.text)
830
+ return paras
831
+ except Exception:
832
+ pass
833
+
834
+ # fallback minimal Document implementation
835
+ if hasattr(doc, "_paragraphs"):
836
+ for p in doc._paragraphs:
837
+ if hasattr(p, "runs"):
838
+ paras.append(" ".join(getattr(r, "text", "") for r in p.runs))
839
+ else:
840
+ paras.append(str(p))
841
+ return paras
842
+
843
+ @staticmethod
844
+ def _sanitize_filename(filename: str) -> str:
845
+ """Remove invalid characters from filename"""
846
+ import re
847
+
848
+ # Remove invalid characters
849
+ sanitized = re.sub(r'[<>:"/\\|?*]', "", filename)
850
+ # Replace spaces with underscores
851
+ sanitized = sanitized.replace(" ", "_")
852
+ return sanitized
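
When python-docx is missing, generate() falls back to _save_minimal_docx, which zips three OOXML parts ([Content_Types].xml, _rels/.rels, word/document.xml) so the output still opens in Word. A minimal sketch for inspecting such a package follows; the output path is hypothetical and only the standard-library zipfile module is assumed:

import zipfile

# Hypothetical path to a file produced by DocumentGenerator.generate() in fallback mode
path = "data/output/notulensi_demo.docx"

with zipfile.ZipFile(path) as z:
    names = set(z.namelist())
    # The three parts written by _save_minimal_docx
    assert "[Content_Types].xml" in names
    assert "_rels/.rels" in names
    assert "word/document.xml" in names

    xml = z.read("word/document.xml").decode("utf-8")
    # Count non-empty and empty paragraphs emitted by the fallback writer
    print("paragraphs:", xml.count("<w:p>") + xml.count("<w:p/>"))
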
src/evaluator.py ADDED
@@ -0,0 +1,797 @@
1
+ """
2
+ Evaluation Module
3
+ =================
4
+ Implements WER, DER, and other metrics for thesis validation.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import csv
10
+ import re
11
+ from dataclasses import dataclass, field
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Any, Dict, List, Optional, Tuple
15
+
16
+ import numpy as np
17
+
18
+ try:
19
+ from jiwer import cer, mer, process_words, wer, wil
20
+
21
+ JIWER_AVAILABLE = True
22
+ except ImportError:
23
+ JIWER_AVAILABLE = False
24
+ print("[Evaluator] Warning: jiwer not installed. WER calculation will use fallback.")
25
+
26
+
27
+ @dataclass
28
+ class WERResult:
29
+ """Word Error Rate evaluation result"""
30
+
31
+ wer: float
32
+ mer: float = 0.0 # Match Error Rate
33
+ wil: float = 0.0 # Word Information Lost
34
+ cer: float = 0.0 # Character Error Rate
35
+ substitutions: int = 0
36
+ deletions: int = 0
37
+ insertions: int = 0
38
+ hits: int = 0
39
+ reference_length: int = 0
40
+ hypothesis_length: int = 0
41
+
42
+ def to_dict(self) -> Dict[str, Any]:
43
+ """Convert to dictionary"""
44
+ return {
45
+ "wer": self.wer,
46
+ "mer": self.mer,
47
+ "wil": self.wil,
48
+ "cer": self.cer,
49
+ "substitutions": self.substitutions,
50
+ "deletions": self.deletions,
51
+ "insertions": self.insertions,
52
+ "hits": self.hits,
53
+ "reference_length": self.reference_length,
54
+ "hypothesis_length": self.hypothesis_length,
55
+ }
56
+
57
+
58
+ @dataclass
59
+ class DERResult:
60
+ """Diarization Error Rate evaluation result"""
61
+
62
+ der: float
63
+ missed_speech: float = 0.0
64
+ false_alarm: float = 0.0
65
+ speaker_confusion: float = 0.0
66
+ total_duration: float = 0.0
67
+ num_speakers_ref: int = 0
68
+ num_speakers_hyp: int = 0
69
+
70
+ def to_dict(self) -> Dict[str, Any]:
71
+ """Convert to dictionary"""
72
+ return {
73
+ "der": self.der,
74
+ "missed_speech": self.missed_speech,
75
+ "false_alarm": self.false_alarm,
76
+ "speaker_confusion": self.speaker_confusion,
77
+ "total_duration": self.total_duration,
78
+ "num_speakers_ref": self.num_speakers_ref,
79
+ "num_speakers_hyp": self.num_speakers_hyp,
80
+ }
81
+
82
+
83
+ @dataclass
84
+ class SummaryResult:
85
+ """Summary evaluation result (ROUGE/BERTScore)"""
86
+
87
+ rouge: Dict[str, float]
88
+ bertscore: Dict[str, float]
89
+
90
+
91
+ @dataclass
92
+ class EvaluationResult:
93
+ """Combined evaluation result"""
94
+
95
+ sample_name: str
96
+ condition: str
97
+ wer_result: Optional[WERResult] = None
98
+ der_result: Optional[DERResult] = None
99
+ summary_result: Optional[SummaryResult] = None
100
+ metadata: Dict[str, Any] = field(default_factory=dict)
101
+
102
+
103
+ class Evaluator:
104
+ """
105
+ Evaluation metrics calculator for ASR and Diarization.
106
+
107
+ Provides:
108
+ - WER (Word Error Rate) for ASR evaluation
109
+ - DER (Diarization Error Rate) for speaker diarization evaluation
110
+ - Report generation for thesis documentation
111
+
112
+ Example:
113
+ >>> evaluator = Evaluator()
114
+ >>> wer_result = evaluator.calculate_wer(reference, hypothesis)
115
+ >>> print(f"WER: {wer_result.wer:.2%}")
116
+ """
117
+
118
+ def __init__(self, output_dir: str = "./data/output"):
119
+ """
120
+ Initialize Evaluator.
121
+
122
+ Args:
123
+ output_dir: Directory for evaluation outputs
124
+ """
125
+ self.output_dir = Path(output_dir)
126
+ self.output_dir.mkdir(parents=True, exist_ok=True)
127
+
128
+ # =========================================================================
129
+ # Text Preprocessing
130
+ # =========================================================================
131
+
132
+ @staticmethod
133
+ def preprocess_text(
134
+ text: str,
135
+ lowercase: bool = True,
136
+ remove_punctuation: bool = True,
137
+ normalize_whitespace: bool = True,
138
+ remove_filler_words: bool = False,
139
+ ) -> str:
140
+ """
141
+ Preprocess text for fair WER comparison.
142
+
143
+ Args:
144
+ text: Input text
145
+ lowercase: Convert to lowercase
146
+ remove_punctuation: Remove punctuation marks
147
+ normalize_whitespace: Normalize whitespace
148
+ remove_filler_words: Remove filler words (eh, um, etc.)
149
+
150
+ Returns:
151
+ Preprocessed text
152
+ """
153
+ if not text:
154
+ return ""
155
+
156
+ # Lowercase
157
+ if lowercase:
158
+ text = text.lower()
159
+
160
+ # Remove punctuation
161
+ if remove_punctuation:
162
+ text = re.sub(r"[^\w\s]", " ", text)
163
+
164
+ # Remove filler words (common in Indonesian)
165
+ if remove_filler_words:
166
+ filler_words = ["eh", "em", "um", "uh", "ah", "hmm", "eee", "anu"]
167
+ pattern = r"\b(" + "|".join(filler_words) + r")\b"
168
+ text = re.sub(pattern, "", text, flags=re.IGNORECASE)
169
+
170
+ # Normalize whitespace
171
+ if normalize_whitespace:
172
+ text = " ".join(text.split())
173
+
174
+ return text.strip()
175
+
176
+ # =========================================================================
177
+ # WER Calculation
178
+ # =========================================================================
179
+
180
+ def calculate_wer(self, reference: str, hypothesis: str, preprocess: bool = True) -> WERResult:
181
+ """
182
+ Calculate Word Error Rate and related metrics.
183
+
184
+ WER = (S + D + I) / N
185
+ where:
186
+ S = Substitutions
187
+ D = Deletions
188
+ I = Insertions
189
+ N = Total words in reference
190
+
191
+ Args:
192
+ reference: Ground truth text
193
+ hypothesis: ASR output text
194
+ preprocess: Apply text preprocessing
195
+
196
+ Returns:
197
+ WERResult with detailed metrics
198
+ """
199
+ # Preprocess
200
+ if preprocess:
201
+ reference = self.preprocess_text(reference)
202
+ hypothesis = self.preprocess_text(hypothesis)
203
+
204
+ # Handle empty cases
205
+ if not reference:
206
+ return WERResult(
207
+ wer=1.0 if hypothesis else 0.0,
208
+ reference_length=0,
209
+ hypothesis_length=len(hypothesis.split()) if hypothesis else 0,
210
+ )
211
+
212
+ if not hypothesis:
213
+ return WERResult(
214
+ wer=1.0,
215
+ deletions=len(reference.split()),
216
+ reference_length=len(reference.split()),
217
+ hypothesis_length=0,
218
+ )
219
+
220
+ # Use jiwer if available
221
+ if JIWER_AVAILABLE:
222
+ try:
223
+ wer_score = wer(reference, hypothesis)
224
+ mer_score = mer(reference, hypothesis)
225
+ wil_score = wil(reference, hypothesis)
226
+ cer_score = cer(reference, hypothesis)
227
+
228
+ # Get detailed breakdown
229
+ output = process_words(reference, hypothesis)
230
+
231
+ return WERResult(
232
+ wer=wer_score,
233
+ mer=mer_score,
234
+ wil=wil_score,
235
+ cer=cer_score,
236
+ substitutions=output.substitutions,
237
+ deletions=output.deletions,
238
+ insertions=output.insertions,
239
+ hits=output.hits,
240
+ reference_length=len(reference.split()),
241
+ hypothesis_length=len(hypothesis.split()),
242
+ )
243
+ except Exception as e:
244
+ print(f"[Evaluator] jiwer calculation failed: {e}")
245
+
246
+ # Fallback: manual calculation using edit distance
247
+ return self._calculate_wer_manual(reference, hypothesis)
248
+
249
+ def _calculate_wer_manual(self, reference: str, hypothesis: str) -> WERResult:
250
+ """Calculate WER using manual edit distance (fallback)"""
251
+ ref_words = reference.split()
252
+ hyp_words = hypothesis.split()
253
+
254
+ # Dynamic programming for edit distance
255
+ m, n = len(ref_words), len(hyp_words)
256
+ dp = [[0] * (n + 1) for _ in range(m + 1)]
257
+
258
+ # Initialize
259
+ for i in range(m + 1):
260
+ dp[i][0] = i
261
+ for j in range(n + 1):
262
+ dp[0][j] = j
263
+
264
+ # Fill DP table
265
+ for i in range(1, m + 1):
266
+ for j in range(1, n + 1):
267
+ if ref_words[i - 1] == hyp_words[j - 1]:
268
+ dp[i][j] = dp[i - 1][j - 1]
269
+ else:
270
+ dp[i][j] = min(
271
+ dp[i - 1][j] + 1, # Deletion
272
+ dp[i][j - 1] + 1, # Insertion
273
+ dp[i - 1][j - 1] + 1, # Substitution
274
+ )
275
+
276
+ # Backtrack to count operations
277
+ i, j = m, n
278
+ substitutions = deletions = insertions = hits = 0
279
+
280
+ while i > 0 or j > 0:
281
+ if i > 0 and j > 0 and ref_words[i - 1] == hyp_words[j - 1]:
282
+ hits += 1
283
+ i -= 1
284
+ j -= 1
285
+ elif i > 0 and j > 0 and dp[i][j] == dp[i - 1][j - 1] + 1:
286
+ substitutions += 1
287
+ i -= 1
288
+ j -= 1
289
+ elif i > 0 and dp[i][j] == dp[i - 1][j] + 1:
290
+ deletions += 1
291
+ i -= 1
292
+ else:
293
+ insertions += 1
294
+ j -= 1
295
+
296
+ total_errors = substitutions + deletions + insertions
297
+ wer_score = total_errors / len(ref_words) if ref_words else 0.0
298
+
299
+ return WERResult(
300
+ wer=wer_score,
301
+ substitutions=substitutions,
302
+ deletions=deletions,
303
+ insertions=insertions,
304
+ hits=hits,
305
+ reference_length=len(ref_words),
306
+ hypothesis_length=len(hyp_words),
307
+ )
308
+
309
+ def calculate_wer_batch(
310
+ self, references: List[str], hypotheses: List[str], preprocess: bool = True
311
+ ) -> Tuple[float, List[WERResult]]:
312
+ """
313
+ Calculate WER for multiple pairs and return aggregate.
314
+
315
+ Args:
316
+ references: List of reference texts
317
+ hypotheses: List of hypothesis texts
318
+ preprocess: Apply preprocessing
319
+
320
+ Returns:
321
+ Tuple of (weighted average WER, list of individual results)
322
+ """
323
+ if len(references) != len(hypotheses):
324
+ raise ValueError("Reference and hypothesis lists must have same length")
325
+
326
+ results = []
327
+ for ref, hyp in zip(references, hypotheses):
328
+ result = self.calculate_wer(ref, hyp, preprocess)
329
+ results.append(result)
330
+
331
+ # Calculate weighted average WER
332
+ total_ref_words = sum(r.reference_length for r in results)
333
+ total_errors = sum(r.substitutions + r.deletions + r.insertions for r in results)
334
+
335
+ avg_wer = total_errors / total_ref_words if total_ref_words > 0 else 0.0
336
+
337
+ return avg_wer, results
338
+
339
+ # =========================================================================
340
+ # DER Calculation
341
+ # =========================================================================
342
+
343
+ def calculate_der(
344
+ self,
345
+ reference_segments: List[Tuple[str, float, float]],
346
+ hypothesis_segments: List[Tuple[str, float, float]],
347
+ collar: float = 0.25,
348
+ ) -> DERResult:
349
+ """
350
+ Calculate Diarization Error Rate.
351
+
352
+ DER = (Missed Speech + False Alarm + Speaker Confusion) / Total Reference Duration
353
+
354
+ Args:
355
+ reference_segments: Ground truth [(speaker_id, start, end), ...]
356
+ hypothesis_segments: System output [(speaker_id, start, end), ...]
357
+ collar: Forgiveness collar in seconds (standard: 0.25s)
358
+
359
+ Returns:
360
+ DERResult with detailed breakdown
361
+ """
362
+ if not reference_segments:
363
+ return DERResult(
364
+ der=0.0,
365
+ total_duration=0.0,
366
+ num_speakers_ref=0,
367
+ num_speakers_hyp=(
368
+ len(set(s[0] for s in hypothesis_segments)) if hypothesis_segments else 0
369
+ ),
370
+ )
371
+
372
+ # Get unique speakers
373
+ ref_speakers = set(s[0] for s in reference_segments)
374
+ hyp_speakers = set(s[0] for s in hypothesis_segments) if hypothesis_segments else set()
375
+
376
+ # Calculate total reference duration
377
+ total_ref_duration = sum(end - start for _, start, end in reference_segments)
378
+
379
+ if total_ref_duration == 0:
380
+ return DERResult(
381
+ der=0.0,
382
+ total_duration=0.0,
383
+ num_speakers_ref=len(ref_speakers),
384
+ num_speakers_hyp=len(hyp_speakers),
385
+ )
386
+
387
+ # Frame-based evaluation
388
+ resolution = 0.01 # 10ms resolution
389
+
390
+ # Get time range
391
+ all_starts = [s[1] for s in reference_segments + (hypothesis_segments or [])]
392
+ all_ends = [s[2] for s in reference_segments + (hypothesis_segments or [])]
393
+
394
+ min_time = min(all_starts) if all_starts else 0
395
+ max_time = max(all_ends) if all_ends else 0
396
+
397
+ # Initialize counters
398
+ missed_speech = 0.0
399
+ false_alarm = 0.0
400
+ speaker_confusion = 0.0
401
+
402
+ # Frame-by-frame evaluation
403
+ t = min_time
404
+ while t < max_time:
405
+ t_mid = t + resolution / 2
406
+
407
+ # Get reference speakers at time t
408
+ ref_spk_at_t = set()
409
+ for speaker, start, end in reference_segments:
410
+ # Apply collar
411
+ if (start + collar) <= t_mid < (end - collar):
412
+ ref_spk_at_t.add(speaker)
413
+
414
+ # Get hypothesis speakers at time t
415
+ hyp_spk_at_t = set()
416
+ if hypothesis_segments:
417
+ for speaker, start, end in hypothesis_segments:
418
+ if start <= t_mid < end:
419
+ hyp_spk_at_t.add(speaker)
420
+
421
+ # Count errors
422
+ if ref_spk_at_t and not hyp_spk_at_t:
423
+ # Missed speech: reference has speech, hypothesis doesn't
424
+ missed_speech += resolution
425
+ elif hyp_spk_at_t and not ref_spk_at_t:
426
+ # False alarm: hypothesis has speech, reference doesn't
427
+ false_alarm += resolution
428
+ elif ref_spk_at_t and hyp_spk_at_t:
429
+ # Both have speech - check for speaker confusion
430
+ # Simplified: if number of speakers differs, count as confusion
431
+ ref_count = len(ref_spk_at_t)
432
+ hyp_count = len(hyp_spk_at_t)
433
+
434
+ if ref_count != hyp_count:
435
+ # Partial confusion
436
+ confusion_ratio = abs(ref_count - hyp_count) / max(ref_count, hyp_count)
437
+ speaker_confusion += resolution * confusion_ratio
438
+
439
+ t += resolution
440
+
441
+ # Calculate DER
442
+ total_error = missed_speech + false_alarm + speaker_confusion
443
+ der = total_error / total_ref_duration
444
+
445
+ return DERResult(
446
+ der=min(der, 1.0), # Cap at 100%
447
+ missed_speech=missed_speech / total_ref_duration,
448
+ false_alarm=false_alarm / total_ref_duration,
449
+ speaker_confusion=speaker_confusion / total_ref_duration,
450
+ total_duration=total_ref_duration,
451
+ num_speakers_ref=len(ref_speakers),
452
+ num_speakers_hyp=len(hyp_speakers),
453
+ )
454
+
455
+ # =========================================================================
456
+ # Summary evaluation (ROUGE, BERTScore)
457
+ # =========================================================================
458
+
459
+ def calculate_summary_metrics(self, reference: str, hypothesis: str) -> SummaryResult:
460
+ """Calculate ROUGE and BERTScore for summaries.
461
+
462
+ Returns a SummaryResult with compact numeric metrics (ROUGE-1/2/L F1 and BERTScore precision/recall/F1).
463
+ """
464
+ try:
465
+ import evaluate
466
+
467
+ rouge = evaluate.load("rouge")
468
+ bert = evaluate.load("bertscore")
469
+
470
+ # ROUGE expects lists
471
+ rouge_res = rouge.compute(predictions=[hypothesis], references=[reference])
472
+ # bertscore returns lists of precision/recall/f1
473
+ bert_res = bert.compute(predictions=[hypothesis], references=[reference], lang="id")
474
+
475
+ # pick common metrics; the `evaluate` ROUGE metric returns keys "rouge1", "rouge2", "rougeL"
476
+ rouge_out = {
477
+ "rouge1_f": float(rouge_res.get("rouge1", 0.0)),
478
+ "rouge2_f": float(rouge_res.get("rouge2", 0.0)),
479
+ "rougel_f": float(rouge_res.get("rougeL", 0.0)),
480
+ }
481
+
482
+ bert_out = {
483
+ "bertscore_precision": float(bert_res.get("precision", [0.0])[0]),
484
+ "bertscore_recall": float(bert_res.get("recall", [0.0])[0]),
485
+ "bertscore_f1": float(bert_res.get("f1", [0.0])[0]),
486
+ }
487
+
488
+ return SummaryResult(rouge=rouge_out, bertscore=bert_out)
489
+ except Exception as e:
490
+ print(f"[Evaluator] Summary metric computation failed: {e}")
491
+ # fallback: empty metrics
492
+ return SummaryResult(rouge={}, bertscore={})
493
+
494
+ # =========================================================================
495
+ # Report Generation
496
+ # =========================================================================
497
+
498
+ def generate_evaluation_report(
499
+ self,
500
+ wer_results: List[WERResult],
501
+ der_results: Optional[List[DERResult]] = None,
502
+ summary_results: Optional[List[SummaryResult]] = None,
503
+ sample_names: Optional[List[str]] = None,
504
+ condition_name: str = "Unknown",
505
+ metadata: Optional[Dict[str, Any]] = None,
506
+ ) -> str:
507
+ """
508
+ Generate formatted evaluation report for thesis.
509
+
510
+ Args:
511
+ wer_results: List of WER results
512
+ der_results: List of DER results (optional)
+ summary_results: List of summary metric results (ROUGE/BERTScore, optional)
513
+ sample_names: Names for each sample
514
+ condition_name: Name of test condition
515
+ metadata: Optional dictionary of hyperparameters / tuning info used during the run
516
+
517
+ Returns:
518
+ Formatted report string
519
+ """
520
+ lines = []
521
+ lines.append("=" * 70)
522
+ lines.append("LAPORAN EVALUASI SISTEM NOTULENSI RAPAT OTOMATIS")
523
+ lines.append(f"Kondisi: {condition_name}")
524
+ lines.append(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
525
+ lines.append("=" * 70)
526
+ lines.append("")
527
+
528
+ # WER Summary
529
+ lines.append("1. EVALUASI ASR (Word Error Rate)")
530
+ lines.append("-" * 50)
531
+
532
+ if wer_results:
533
+ wer_values = [r.wer for r in wer_results]
534
+ avg_wer = np.mean(wer_values)
535
+ std_wer = np.std(wer_values)
536
+ min_wer = np.min(wer_values)
537
+ max_wer = np.max(wer_values)
538
+
539
+ total_subs = sum(r.substitutions for r in wer_results)
540
+ total_dels = sum(r.deletions for r in wer_results)
541
+ total_ins = sum(r.insertions for r in wer_results)
542
+ total_hits = sum(r.hits for r in wer_results)
543
+
544
+ lines.append(f" Jumlah sampel : {len(wer_results)}")
545
+ lines.append(f" WER rata-rata : {avg_wer:.4f} ({avg_wer*100:.2f}%)")
546
+ lines.append(f" Standar deviasi : {std_wer:.4f}")
547
+ lines.append(f" WER minimum : {min_wer:.4f} ({min_wer*100:.2f}%)")
548
+ lines.append(f" WER maksimum : {max_wer:.4f} ({max_wer*100:.2f}%)")
549
+ lines.append("")
550
+ lines.append(" Detail Error Total:")
551
+ lines.append(f" - Substitutions : {total_subs}")
552
+ lines.append(f" - Deletions : {total_dels}")
553
+ lines.append(f" - Insertions : {total_ins}")
554
+ lines.append(f" - Correct (Hits) : {total_hits}")
555
+
556
+ # Per-sample details
557
+ if sample_names and len(sample_names) == len(wer_results):
558
+ lines.append("")
559
+ lines.append(" Detail per Sampel:")
560
+ for name, result in zip(sample_names, wer_results):
561
+ lines.append(f" - {name}: WER = {result.wer:.4f} ({result.wer*100:.2f}%)")
562
+ else:
563
+ lines.append(" Tidak ada data WER untuk dievaluasi.")
564
+
565
+ lines.append("")
566
+
567
+ # DER Summary
568
+ lines.append("2. EVALUASI DIARIZATION (Diarization Error Rate)")
569
+ lines.append("-" * 50)
570
+
571
+ if der_results:
572
+ der_values = [r.der for r in der_results]
573
+ avg_der = np.mean(der_values)
574
+ std_der = np.std(der_values)
575
+
576
+ avg_missed = np.mean([r.missed_speech for r in der_results])
577
+ avg_fa = np.mean([r.false_alarm for r in der_results])
578
+ avg_conf = np.mean([r.speaker_confusion for r in der_results])
579
+
580
+ lines.append(f" Jumlah sampel : {len(der_results)}")
581
+ lines.append(f" DER rata-rata : {avg_der:.4f} ({avg_der*100:.2f}%)")
582
+ lines.append(f" Standar deviasi : {std_der:.4f}")
583
+ lines.append("")
584
+ lines.append(" Komponen Error (rata-rata):")
585
+ lines.append(f" - Missed Speech : {avg_missed:.4f} ({avg_missed*100:.2f}%)")
586
+ lines.append(f" - False Alarm : {avg_fa:.4f} ({avg_fa*100:.2f}%)")
587
+ lines.append(f" - Speaker Confusion: {avg_conf:.4f} ({avg_conf*100:.2f}%)")
588
+
589
+ # Per-sample details
590
+ if sample_names and len(sample_names) == len(der_results):
591
+ lines.append("")
592
+ lines.append(" Detail per Sampel:")
593
+ for name, result in zip(sample_names, der_results):
594
+ lines.append(f" - {name}: DER = {result.der:.4f} ({result.der*100:.2f}%)")
595
+ else:
596
+ lines.append(" Tidak ada data DER untuk dievaluasi.")
597
+
598
+ lines.append("")
599
+ # Summary evaluation (ROUGE, BERTScore)
600
+ lines.append("3. EVALUASI RINGKASAN (Ringkasan/Abstraksi)")
601
+ lines.append("-" * 50)
602
+ if summary_results:
603
+ try:
604
+ avg_rouge1 = np.mean([r.rouge.get("rouge1_f", 0.0) for r in summary_results])
605
+ avg_rouge2 = np.mean([r.rouge.get("rouge2_f", 0.0) for r in summary_results])
606
+ avg_rougel = np.mean([r.rouge.get("rougel_f", 0.0) for r in summary_results])
607
+ avg_bertscore = np.mean([r.bertscore.get("bertscore_f1", 0.0) for r in summary_results])
608
+ lines.append(f" Jumlah sampel : {len(summary_results)}")
609
+ lines.append(f" ROUGE-1 F1 (avg) : {avg_rouge1:.4f}")
610
+ lines.append(f" ROUGE-2 F1 (avg) : {avg_rouge2:.4f}")
611
+ lines.append(f" ROUGE-L F1 (avg) : {avg_rougel:.4f}")
612
+ lines.append(f" BERTScore F1 (avg) : {avg_bertscore:.4f}")
613
+ except Exception as e:
614
+ lines.append(f" (summary metric aggregation failed: {e})")
615
+ else:
616
+ lines.append(" Tidak ada data ringkasan untuk dievaluasi.")
617
+
618
+ lines.append("")
619
+
620
+ # Include metadata/hyperparameters if provided
621
+ if metadata:
622
+ lines.append("4. CONFIGURATION & HYPERPARAMETERS")
623
+ lines.append("-" * 50)
624
+ try:
625
+ # Print metadata items in sorted order for consistency
626
+ for k in sorted(metadata.keys()):
627
+ v = metadata[k]
628
+ # For nested dicts, pretty-print a compact representation
629
+ if isinstance(v, dict):
630
+ if not v:
631
+ lines.append(f" - {k}: {{}}")
632
+ else:
633
+ lines.append(f" - {k}:")
634
+ for kk, vv in v.items():
635
+ lines.append(f" - {kk}: {vv}")
636
+ else:
637
+ lines.append(f" - {k}: {v}")
638
+ except Exception as e:
639
+ lines.append(f" - (metadata formatting failed: {e})")
640
+
641
+ lines.append("")
642
+
643
+ lines.append("=" * 70)
644
+ lines.append("Catatan:")
645
+ lines.append(
646
+ "- Evaluasi WER menggunakan preprocessing standar (lowercase, hapus tanda baca)"
647
+ )
648
+ lines.append("- Evaluasi DER menggunakan collar forgiveness 0.25 detik")
649
+ lines.append("=" * 70)
650
+
651
+ return "\n".join(lines)
652
+
653
+ def export_results_to_csv(
654
+ self, results: List[EvaluationResult], output_filename: str = "evaluation_results.csv"
655
+ ) -> str:
656
+ """
657
+ Export evaluation results to CSV for thesis appendix.
658
+
659
+ Args:
660
+ results: List of EvaluationResult objects
661
+ output_filename: Output CSV filename
662
+
663
+ Returns:
664
+ Path to saved CSV file
665
+ """
666
+ output_path = self.output_dir / output_filename
667
+
668
+ with open(output_path, "w", newline="", encoding="utf-8") as f:
669
+ writer = csv.writer(f)
670
+
671
+ # Header
672
+ writer.writerow(
673
+ [
674
+ "Sample",
675
+ "Condition",
676
+ "WER",
677
+ "MER",
678
+ "WIL",
679
+ "CER",
680
+ "Substitutions",
681
+ "Deletions",
682
+ "Insertions",
683
+ "Hits",
684
+ "Ref_Words",
685
+ "Hyp_Words",
686
+ "DER",
687
+ "Missed_Speech",
688
+ "False_Alarm",
689
+ "Speaker_Confusion",
690
+ # Summary metrics
691
+ "ROUGE1_F",
692
+ "ROUGE2_F",
693
+ "ROUGEL_F",
694
+ "BERTScore_F1",
695
+ "Duration_Sec",
696
+ "Num_Speakers_Ref",
697
+ "Num_Speakers_Hyp",
698
+ ]
699
+ )
700
+
701
+ # Data rows
702
+ for result in results:
703
+ wer = result.wer_result
704
+ der = result.der_result
705
+
706
+ row = [
707
+ result.sample_name,
708
+ result.condition,
709
+ # WER metrics
710
+ f"{wer.wer:.4f}" if wer else "",
711
+ f"{wer.mer:.4f}" if wer else "",
712
+ f"{wer.wil:.4f}" if wer else "",
713
+ f"{wer.cer:.4f}" if wer else "",
714
+ wer.substitutions if wer else "",
715
+ wer.deletions if wer else "",
716
+ wer.insertions if wer else "",
717
+ wer.hits if wer else "",
718
+ wer.reference_length if wer else "",
719
+ wer.hypothesis_length if wer else "",
720
+ # DER metrics
721
+ f"{der.der:.4f}" if der else "",
722
+ f"{der.missed_speech:.4f}" if der else "",
723
+ f"{der.false_alarm:.4f}" if der else "",
724
+ f"{der.speaker_confusion:.4f}" if der else "",
725
+ # Summary metrics
726
+ f"{result.summary_result.rouge.get('rouge1_f', ''):.4f}" if result.summary_result and result.summary_result.rouge else "",
727
+ f"{result.summary_result.rouge.get('rouge2_f', ''):.4f}" if result.summary_result and result.summary_result.rouge else "",
728
+ f"{result.summary_result.rouge.get('rougel_f', ''):.4f}" if result.summary_result and result.summary_result.rouge else "",
729
+ f"{result.summary_result.bertscore.get('bertscore_f1', ''):.4f}" if result.summary_result and result.summary_result.bertscore else "",
730
+ f"{der.total_duration:.2f}" if der else "",
731
+ der.num_speakers_ref if der else "",
732
+ der.num_speakers_hyp if der else "",
733
+ ]
734
+
735
+ writer.writerow(row)
736
+
737
+ return str(output_path)
738
+
739
+ def generate_summary_table(
740
+ self, results_by_condition: Dict[str, List[EvaluationResult]]
741
+ ) -> str:
742
+ """
743
+ Generate summary table comparing results across conditions.
744
+
745
+ Args:
746
+ results_by_condition: Dict mapping condition name to list of results
747
+
748
+ Returns:
749
+ Formatted table string
750
+ """
751
+ lines = []
752
+ lines.append("")
753
+ lines.append("TABEL RINGKASAN EVALUASI PER KONDISI")
754
+ lines.append("=" * 80)
755
+ lines.append("")
756
+
757
+ # Header
758
+ header = (
759
+ f"{'Kondisi':<20} {'N':>5} {'WER Mean':>10} {'WER Std':>10} "
760
+ f"{'DER Mean':>10} {'DER Std':>10}"
761
+ )
762
+ lines.append(header)
763
+ lines.append("-" * 80)
764
+
765
+ # Data rows
766
+ for condition, results in results_by_condition.items():
767
+ n = len(results)
768
+
769
+ # WER stats
770
+ wer_values = [r.wer_result.wer for r in results if r.wer_result]
771
+ wer_mean = np.mean(wer_values) if wer_values else 0
772
+ wer_std = np.std(wer_values) if wer_values else 0
773
+
774
+ # DER stats
775
+ der_values = [r.der_result.der for r in results if r.der_result]
776
+ der_mean = np.mean(der_values) if der_values else 0
777
+ der_std = np.std(der_values) if der_values else 0
778
+
779
+ row = (
780
+ f"{condition:<20} {n:>5} {wer_mean:>10.4f} {wer_std:>10.4f} "
781
+ f"{der_mean:>10.4f} {der_std:>10.4f}"
782
+ )
783
+ lines.append(row)
784
+
785
+ lines.append("-" * 80)
786
+ lines.append("")
787
+
788
+ return "\n".join(lines)
789
+
790
+ def save_report(self, report: str, filename: str = "evaluation_report.txt") -> str:
791
+ """Save evaluation report to file"""
792
+ output_path = self.output_dir / filename
793
+
794
+ with open(output_path, "w", encoding="utf-8") as f:
795
+ f.write(report)
796
+
797
+ return str(output_path)
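A minimal usage sketch for the Evaluator defined above. The reference/hypothesis strings and segment times are illustrative only; output paths follow the class defaults.

from src.evaluator import Evaluator

evaluator = Evaluator(output_dir="./data/output")

# WER: the hypothesis below has one substitution and one deletion against a
# four-word reference, so WER = (1 + 1 + 0) / 4 = 0.50.
wer = evaluator.calculate_wer(
    reference="rapat dimulai pukul sembilan",
    hypothesis="rapat dimulai jam",
)
print(f"WER: {wer.wer:.2%} (S={wer.substitutions}, D={wer.deletions}, I={wer.insertions})")

# DER: reference and hypothesis segments as (speaker_id, start_sec, end_sec).
der = evaluator.calculate_der(
    reference_segments=[("SPK_A", 0.0, 10.0), ("SPK_B", 10.0, 20.0)],
    hypothesis_segments=[("SPK_1", 0.0, 12.0), ("SPK_2", 12.0, 20.0)],
)
print(f"DER: {der.der:.2%}")

# Combine both metric lists into the plain-text report and save it.
report = evaluator.generate_evaluation_report(
    wer_results=[wer],
    der_results=[der],
    sample_names=["sample_01"],
    condition_name="clean-audio",
)
print(evaluator.save_report(report))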
src/nlp_utils.py ADDED
@@ -0,0 +1,243 @@
1
+ """
2
+ Advanced NLP utilities: NER + dependency parsing wrapper with graceful fallbacks.
3
+
4
+ Provides a small abstraction `AdvancedNLPExtractor` that will use spaCy if available
5
+ (or fallback regex/heuristic extractors) to extract structured action items and
6
+ decisions from sentence-level metadata.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import re
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ try:
16
+ import spacy
17
+ from spacy.language import Language
18
+
19
+ _HAS_SPACY = True
20
+ except Exception:
21
+ _HAS_SPACY = False
22
+
23
+ try:
24
+ from langdetect import detect as _detect_lang
25
+
26
+ _HAS_LANGDETECT = True
27
+ except Exception:
28
+ _HAS_LANGDETECT = False
29
+
30
+ logger = logging.getLogger("AdvancedNLP")
31
+
32
+
33
+ class AdvancedNLPExtractor:
34
+ """Wrapper providing NER and dependency-based extraction.
35
+
36
+ Usage:
37
+ extractor = AdvancedNLPExtractor()
38
+ items = extractor.extract_actions_from_sentences(sent_meta)
39
+
40
+ `sent_meta` is a list of dicts produced by `BERTSummarizer._get_sentences_with_meta`
41
+ where each dict contains at least `text`, `speaker_id`, `start`, `end`.
42
+ """
43
+
44
+ def __init__(self, lang: Optional[str] = None):
45
+ self.lang = lang
46
+ self._nlp: Optional[Language] = None
47
+ if _HAS_SPACY:
48
+ try:
49
+ model = self._choose_model(lang)
50
+ if model is not None:
51
+ self._nlp = spacy.load(model)
52
+ logger.info(f"Loaded spaCy model: {model}")
53
+ except Exception as e:
54
+ logger.warning(f"spaCy model load failed: {e}")
55
+ self._nlp = None
56
+ else:
57
+ logger.debug("spaCy not available; using heuristic fallbacks")
58
+
59
+ def _choose_model(self, lang: Optional[str]) -> Optional[str]:
60
+ # Prefer language-specific small models if available
61
+ if lang is None and _HAS_LANGDETECT:
62
+ return None # leave None to let caller decide based on text
63
+ if lang == "id":
64
+ return "id_core_news_sm"
65
+ if lang == "en":
66
+ return "en_core_web_sm"
67
+ # Fall back to cross-lingual entity model if present
68
+ return "xx_ent_wiki_sm"
69
+
70
+ def _detect_lang(self, text: str) -> Optional[str]:
71
+ if not _HAS_LANGDETECT:
72
+ return None
73
+ try:
74
+ return _detect_lang(text)
75
+ except Exception:
76
+ return None
77
+
78
+ def _get_doc(self, text: str):
79
+ # If spaCy is loaded, use it. Otherwise return None.
80
+ if self._nlp is None:
81
+ # try to lazily pick a model based on language
82
+ if _HAS_SPACY:
83
+ lang = self._detect_lang(text)
84
+ model = self._choose_model(lang)
85
+ if model:
86
+ try:
87
+ self._nlp = spacy.load(model)
88
+ logger.info(f"Lazy-loaded spaCy model: {model}")
89
+ except Exception:
90
+ self._nlp = None
91
+ return None
92
+ try:
93
+ return self._nlp(text)
94
+ except Exception:
95
+ return None
96
+
97
+ def extract_persons(self, text: str) -> List[str]:
98
+ doc = self._get_doc(text)
99
+ if doc is None:
100
+ # simple regex: capitalized words sequences
101
+ names = re.findall(r"\b([A-Z][a-z]{1,20}(?:\s+[A-Z][a-z]{1,20})*)\b", text)
102
+ return list(dict.fromkeys(names))
103
+ persons = [ent.text for ent in doc.ents if ent.label_ in ("PERSON", "PER")]
104
+ # preserve order, unique
105
+ return list(dict.fromkeys(persons))
106
+
107
+ def extract_actions_from_sentences(
108
+ self, sent_meta: List[Dict[str, Any]]
109
+ ) -> List[Dict[str, Any]]:
110
+ """Return candidate action items extracted from sentence metadata.
111
+
112
+ Each returned dict contains: {owner, task, sentence_idx, confidence}
113
+ """
114
+ results: List[Dict[str, Any]] = []
115
+
116
+ texts = [s["text"] for s in sent_meta]
117
+ full = " ".join(texts[: max(1, min(10, len(texts)))])
118
+ lang = self._detect_lang(full) if _HAS_LANGDETECT else None
119
+
120
+ for i, s in enumerate(sent_meta):
121
+ text = s.get("text", "").strip()
122
+ if not text:
123
+ continue
124
+
125
+ # Quick keyword filter (language-agnostic): if no action words, skip
126
+ if not re.search(
127
+ r"\b(akan|harus|perlu|tolong|mohon|harap|deadline|target|tugas|follow up|tindak lanjut|siapkan|buat|bikin|saya|aku|kami|kita)\b",
128
+ text,
129
+ flags=re.IGNORECASE,
130
+ ):
131
+ # also check for English keywords
132
+ if not re.search(
133
+ r"\b(will|shall|must|please|assign|task|deadline|action item|follow up|todo)\b",
134
+ text,
135
+ flags=re.IGNORECASE,
136
+ ):
137
+ continue
138
+
139
+ doc = self._get_doc(text)
140
+ owner: Optional[str] = None
141
+ task: Optional[str] = None
142
+ confidence = 0.5
143
+
144
+ # First, try to find PERSON entities in the sentence
145
+ if doc is not None:
146
+ persons = [ent.text for ent in doc.ents if ent.label_ in ("PERSON", "PER")]
147
+ if persons:
148
+ owner = persons[0]
149
+ confidence = 0.8
150
+
151
+ # dependency parse-based task extraction
152
+ try:
153
+ # find ROOT verb
154
+ root = None
155
+ for token in doc:
156
+ if token.dep_ == "ROOT" and token.pos_ in ("VERB", "AUX"):
157
+ root = token
158
+ break
159
+
160
+ if root is not None:
161
+ # look for direct objects / xcomp / ccomp
162
+ objs = [t for t in doc if t.dep_ in ("dobj", "obj", "xcomp", "ccomp")]
163
+ if objs:
164
+ task = " ".join([tok.text for tok in objs[0].subtree])
165
+ confidence = max(confidence, 0.7)
166
+ else:
167
+ # fallback: use root subtree as task
168
+ task = " ".join([tok.text for tok in root.subtree])
169
+ confidence = max(confidence, 0.6)
170
+
171
+ # If no owner found, search preceding tokens for personal pronouns
172
+ if owner is None:
173
+ pron = [t for t in doc if t.pos_ == "PRON"]
174
+ if pron:
175
+ owner = pron[0].text
176
+ confidence = 0.6
177
+ except Exception:
178
+ pass
179
+
180
+ # Regex fallback to capture "Name akan <action>" in many languages
181
+ if owner is None:
182
+ m = re.search(
183
+ r"\b([A-Z][a-z]{1,20})\b\s+(akan|will|harus|must|to)\s+(?P<task>.+)",
184
+ text,
185
+ flags=re.IGNORECASE,
186
+ )
187
+ if m:
188
+ owner = m.group(1)
189
+ task = m.group("task").strip(" .,:;-")
190
+ confidence = 0.7
191
+
192
+ # Otherwise, check for "Saya akan"/"Aku akan" and attribute to speaker
193
+ if owner is None and re.search(r"\b(saya|aku|kami|kita)\b", text, flags=re.IGNORECASE):
194
+ owner = s.get("speaker_id")
195
+ # try extract phrase after 'akan' or commit verb
196
+ m2 = re.search(
197
+ r"\b(?:akan|saya akan|aku akan|saya akan membuat|aku akan membuat|tolong|siapkan|buat|bikin)\b\s*(?P<task>.+)$",
198
+ text,
199
+ flags=re.IGNORECASE,
200
+ )
201
+ if m2:
202
+ task = m2.group("task").strip(" .,:;-")
203
+ confidence = 0.7
204
+
205
+ # final fallback: if sentence contains action keywords, use whole sentence
206
+ if task is None:
207
+ # trim connectors and filler
208
+ t = re.sub(r"^(oke|ya|nah|baik)\b[:,-]*", "", text, flags=re.IGNORECASE).strip()
209
+ task = t[:300]
210
+
211
+ # Basic length filter
212
+ if task and len(task.split()) < 3:
213
+ continue
214
+
215
+ results.append(
216
+ {
217
+ "owner": owner or s.get("speaker_id"),
218
+ "task": task,
219
+ "sentence_idx": i,
220
+ "confidence": confidence,
221
+ }
222
+ )
223
+
224
+ return results
225
+
226
+
227
+ def extract_decisions_from_sentences(sent_meta: List[Dict[str, Any]]) -> List[str]:
228
+ """Simple decision extraction: look for decision keywords and return cleaned contexts."""
229
+ results: List[str] = []
230
+ decision_kw = re.compile(
231
+ r"\b(diputuskan|disepakati|kesimpulan|keputusan|sepakat|setuju|disetujui|putus|decided|decision)\b",
232
+ flags=re.IGNORECASE,
233
+ )
234
+
235
+ for i, s in enumerate(sent_meta):
236
+ text = s.get("text", "").strip()
237
+ if not text:
238
+ continue
239
+ if decision_kw.search(text):
240
+ cleaned = re.sub(r"\[.*?\]", "", text)
241
+ results.append(cleaned.strip())
242
+
243
+ return results
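A minimal usage sketch for the extractors above. The sentence metadata is illustrative and only mimics the dicts described in the AdvancedNLPExtractor docstring (text, speaker_id, start, end); when spaCy or its models are absent, the class falls back to the regex heuristics.

from src.nlp_utils import AdvancedNLPExtractor, extract_decisions_from_sentences

sent_meta = [
    {"text": "Budi akan menyiapkan laporan penjualan minggu depan.",
     "speaker_id": "SPK_1", "start": 12.0, "end": 16.5},
    {"text": "Disepakati bahwa rapat berikutnya diadakan hari Jumat.",
     "speaker_id": "SPK_2", "start": 30.0, "end": 34.0},
]

extractor = AdvancedNLPExtractor(lang="id")
for item in extractor.extract_actions_from_sentences(sent_meta):
    # e.g. owner="Budi", task="menyiapkan laporan penjualan minggu depan"
    print(item["owner"], "->", item["task"], f"(confidence={item['confidence']})")

print(extract_decisions_from_sentences(sent_meta))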
src/pipeline.py ADDED
@@ -0,0 +1,1121 @@
1
+ """
2
+ Main Pipeline Module
3
+ ====================
4
+ Orchestrates all components for end-to-end meeting transcription.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import time
11
+ from dataclasses import dataclass, field
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Any, Callable, Dict, List, Optional, Tuple
15
+
16
+ import torch
17
+
18
+ from src.audio_processor import AudioConfig, AudioProcessor
19
+ from src.diarization import DiarizationConfig, SpeakerDiarizer, SpeakerSegment
20
+ from src.document_generator import DocumentGenerator, MeetingMetadata
21
+ from src.evaluator import EvaluationResult, Evaluator
22
+ from src.summarizer import BERTSummarizer, MeetingSummary, SummarizationConfig
23
+ from src.transcriber import ASRConfig, ASRTranscriber, TranscriptSegment
24
+
25
+ # Optional speechbrain adapter
26
+ try:
27
+ from src.transcriber_speechbrain import ( # type: ignore
28
+ SpeechBrainASRConfig,
29
+ SpeechBrainTranscriber,
30
+ )
31
+ except Exception:
32
+ SpeechBrainTranscriber = None
33
+ SpeechBrainASRConfig = None
34
+ from src.utils import (
35
+ Timer,
36
+ ensure_dir,
37
+ format_duration,
38
+ sanitize_filename,
39
+ save_json,
40
+ setup_logger,
41
+ )
42
+
43
+
44
+ @dataclass
45
+ class PipelineConfig:
46
+ """Configuration for the complete pipeline"""
47
+
48
+ # Paths
49
+ models_dir: str = "./models"
50
+ output_dir: str = "./data/output"
51
+ cache_dir: str = "./cache"
52
+
53
+ # Audio settings
54
+ sample_rate: int = 16000
55
+
56
+ # Diarization settings
57
+ num_speakers: Optional[int] = None
58
+ min_speech_duration: float = 0.3
59
+ # Target speaker enforcement (convenience wrapper for DiarizationConfig.target_num_speakers)
60
+ target_speakers: Optional[int] = None
61
+
62
+ # ASR settings
63
+ # Default to Whisper Large v3 Turbo for better accuracy (may be slower)
64
+ asr_model_id: str = "large-v3-turbo"
65
+ asr_backend: str = "whisperx" # whisperx preferred for Large models
66
+ asr_language: str = "id"
67
+ whisperx_compute_type: str = "auto"
68
+ whisperx_vad_filter: bool = True
69
+
70
+ # Summarization settings
71
+ num_summary_sentences: int = 5
72
+
73
+ # Device
74
+ device: str = "auto"
75
+
76
+ # Flags
77
+ save_intermediate: bool = True
78
+ verbose: bool = True
79
+
80
+ # Performance options
81
+ fast_mode: bool = False # reduce accuracy for speed
82
+ quick_asr: bool = False # use lightweight ASR where possible
83
+ embedding_cache: bool = True # cache diarization embeddings to disk
84
+
85
+ # Preset mode (deployment = recommended default for production: WhisperX large-v3-turbo int8)
86
+ # Set default to 'fast' to prefer lightweight models (whisper-small) and avoid heavy WhisperX defaults
87
+ preset: str = "fast" # choices: deployment|balanced|fast|accurate
88
+
89
+ # Quick ASR options
90
+ prefer_whisper_small: bool = True
91
+ # Approximate Continuous Speech Tokenizer token rate in Hz (e.g., 7.5). When set,
92
+ # ASR will apply a lossy preprocessor to compress audio for speed. Use with care.
93
+ cst_hz: Optional[float] = 7.5
94
+
95
+ # Compare diarization methods during evaluation
96
+ diarization_compare: bool = False
97
+
98
+ # Allow explicit override for ASR parallel workers (None = auto)
99
+ asr_parallel_workers: Optional[int] = None # override for per-segment ASR parallelism
100
+
101
+ # Optional speaker mapping & diarization tuning
102
+ speaker_map_path: Optional[str] = None
103
+ tune_diarization: bool = False
107
+
108
+ def __post_init__(self):
109
+ # Auto-detect device
110
+ if self.device == "auto":
111
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
112
+
113
+ # Create directories
114
+ ensure_dir(self.models_dir)
115
+ ensure_dir(self.output_dir)
116
+ ensure_dir(self.cache_dir)
117
+
118
+
119
+ @dataclass
120
+ class PipelineResult:
121
+ """Complete result from pipeline processing"""
122
+
123
+ # Input info
124
+ audio_path: str
125
+ audio_duration: float
126
+
127
+ # Processing info
128
+ num_speakers: int
129
+ num_segments: int
130
+ total_words: int
131
+ processing_time: float
132
+
133
+ # Outputs
134
+ segments: List[Dict[str, Any]]
135
+ transcript_text: str
136
+ summary: Dict[str, Any]
137
+ document_path: str
138
+
139
+ # Metadata
140
+ metadata: Dict[str, Any] = field(default_factory=dict)
141
+
142
+ def to_dict(self) -> Dict[str, Any]:
143
+ """Convert to dictionary"""
144
+ return {
145
+ "audio_path": self.audio_path,
146
+ "audio_duration": self.audio_duration,
147
+ "num_speakers": self.num_speakers,
148
+ "num_segments": self.num_segments,
149
+ "total_words": self.total_words,
150
+ "processing_time": self.processing_time,
151
+ "transcript_text": self.transcript_text,
152
+ "summary": self.summary,
153
+ "document_path": self.document_path,
154
+ "metadata": self.metadata,
155
+ }
156
+
157
+ def save(self, filepath: str):
158
+ """Save result to JSON file"""
159
+ save_json(self.to_dict(), filepath)
160
+
161
+
162
+ class MeetingTranscriberPipeline:
163
+ """
164
+ End-to-end pipeline for automatic meeting transcription.
165
+
166
+ Pipeline Flow:
167
+ 1. Audio Loading & Preprocessing
168
+ 2. Speaker Diarization (VAD + Embedding + Clustering)
169
+ 3. ASR Transcription (per speaker segment)
170
+ 4. BERT Summarization (extractive)
171
+ 5. Document Generation (.docx)
172
+
173
+ Attributes:
174
+ config: PipelineConfig object
175
+
176
+ Example:
177
+ >>> pipeline = MeetingTranscriberPipeline()
178
+ >>> result = pipeline.process("meeting.wav", title="Team Meeting")
179
+ >>> print(f"Document saved: {result.document_path}")
180
+ """
181
+
182
+ def __init__(self, config: Optional[PipelineConfig] = None):
183
+ """
184
+ Initialize pipeline.
185
+
186
+ Args:
187
+ config: PipelineConfig object (uses defaults if None)
188
+ """
189
+ self.config = config or PipelineConfig()
190
+
191
+ # Setup logger
192
+ self.logger = setup_logger(
193
+ "MeetingTranscriber",
194
+ log_file=(
195
+ os.path.join(self.config.cache_dir, "pipeline.log")
196
+ if self.config.save_intermediate
197
+ else None
198
+ ),
199
+ )
200
+
201
+ # Component placeholders (lazy loading)
202
+ self._audio_processor = None
203
+ self._diarizer = None
204
+ self._transcriber = None
205
+ self._summarizer = None
206
+ self._doc_generator = None
207
+ self._evaluator = None
208
+
209
+ # Processing state
210
+ self._waveform = None
211
+ self._sample_rate = None
212
+ self._diarization_segments = None
213
+ self._transcript_segments = None
214
+ self._summary = None
215
+ # Diarization tuning result (if autotune was run)
216
+ self._diarization_tune_result = None
217
+
218
+ if self.config.verbose:
219
+ self._log(f"Pipeline initialized with device: {self.config.device}")
220
+ # Log effective CST value for diagnostics
221
+ self._log(f"Pipeline effective cst_hz: {getattr(self.config, 'cst_hz', None)} Hz")
222
+
223
+ # =========================================================================
224
+ # Properties (Lazy Loading)
225
+ # =========================================================================
226
+
227
+ @property
228
+ def audio_processor(self) -> AudioProcessor:
229
+ """Get audio processor (lazy loaded)"""
230
+ if self._audio_processor is None:
231
+ self._audio_processor = AudioProcessor(
232
+ AudioConfig(sample_rate=self.config.sample_rate, mono=True, normalize=True)
233
+ )
234
+ return self._audio_processor
235
+
236
+ @property
237
+ def diarizer(self) -> SpeakerDiarizer:
238
+ """Get diarizer (lazy loaded)"""
239
+ if self._diarizer is None:
240
+ dz_cfg = DiarizationConfig(
241
+ min_speech_duration=self.config.min_speech_duration,
242
+ device=self.config.device,
243
+ )
244
+ # If pipeline has target_speakers configured, propagate to diarizer config
245
+ if getattr(self.config, "target_speakers", None) is not None:
246
+ dz_cfg.target_num_speakers = int(self.config.target_speakers)
247
+ self._diarizer = SpeakerDiarizer(config=dz_cfg, models_dir=self.config.models_dir)
248
+ return self._diarizer
249
+
250
+ @property
251
+ def transcriber(self) -> ASRTranscriber:
252
+ """Get transcriber (lazy loaded)"""
253
+ if self._transcriber is None:
254
+ # Instantiate ASR transcriber; if configured to use SpeechBrain backend prefer adapter
255
+ asr_cfg = ASRConfig(
256
+ model_id=self.config.asr_model_id,
257
+ device=self.config.device,
258
+ backend=getattr(self.config, "asr_backend", "whisper"),
259
+ language=getattr(self.config, "asr_language", "id"),
260
+ whisperx_compute_type=getattr(self.config, "whisperx_compute_type", "auto"),
261
+ whisperx_vad_filter=bool(getattr(self.config, "whisperx_vad_filter", True)),
262
+ )
263
+
264
+ # Apply preset defaults (deployment/balanced/fast/accurate)
265
+ preset = getattr(self.config, "preset", None)
266
+ if preset == "deployment":
267
+ # Deployment preset: prefer WhisperX large-v3-turbo (int8 on CPU), full-audio mapping, tuned parallelism
268
+ asr_cfg.backend = "whisperx"
269
+
270
+ # If user did not explicitly provide a WhisperX-compatible model (e.g. the
271
+ # configured model contains 'wav2vec' or is an existing TF checkpoint),
272
+ # override to a known WhisperX-compatible model id. This avoids trying to
273
+ # load a Transformers checkpoint with WhisperX which expects CTranslate2 format
274
+ # (contains 'model.bin').
275
+ user_model = getattr(self.config, "asr_model_id", "") or ""
276
+ user_model_l = user_model.lower()
277
+ if (
278
+ (not user_model_l)
279
+ or ("wav2vec" in user_model_l)
280
+ or user_model_l.startswith("models/")
281
+ ):
282
+ asr_cfg.model_id = "large-v3-turbo"
283
+ self._log(
284
+ "Preset 'deployment' selected: overriding ASR model to 'large-v3-turbo' for WhisperX compatibility."
285
+ )
286
+ else:
287
+ asr_cfg.model_id = user_model
288
+
289
+ asr_cfg.use_full_audio_for_segments = True
290
+ asr_cfg.whisperx_compute_type = (
291
+ getattr(self.config, "whisperx_compute_type", "int8") or "int8"
292
+ )
293
+ try:
294
+ import os
295
+
296
+ asr_cfg.parallel_workers = min(8, max(1, (os.cpu_count() or 4) - 1))
297
+ except Exception:
298
+ pass
299
+ elif getattr(self.config, "quick_asr", False) or getattr(self.config, "prefer_whisper_small", False):
300
+ # Quick/Lightweight ASR: prefer Whisper small for speed and low memory
301
+ try:
302
+ asr_cfg.model_id = "openai/whisper-small"
303
+ asr_cfg.backend = "whisper"
304
+ # For speed, avoid the costly full-audio alignment step
305
+ asr_cfg.use_full_audio_for_segments = False
306
+ # Increase parallel workers conservatively for per-segment transcription
307
+ import os
308
+
309
+ asr_cfg.parallel_workers = min(8, max(1, (os.cpu_count() or 4) - 1))
310
+ # Larger chunk lengths reduce per-chunk overhead (helps CPU-bound runs)
311
+ asr_cfg.chunk_length_s = max(asr_cfg.chunk_length_s, 60.0)
312
+ # If Pipeline requested CST approximation, propagate to ASR config
313
+ if getattr(self.config, "cst_hz", None) is not None:
314
+ asr_cfg.cst_hz = float(self.config.cst_hz)
315
+ except Exception:
316
+ pass
317
+
318
+ # Allow explicit override from pipeline config
319
+ if getattr(self.config, "asr_parallel_workers", None) is not None:
320
+ try:
321
+ asr_cfg.parallel_workers = int(self.config.asr_parallel_workers)
322
+ except Exception:
323
+ pass
324
+
331
+ if (
332
+ getattr(self.config, "asr_backend", None) == "speechbrain"
333
+ and SpeechBrainTranscriber is not None
334
+ ):
335
+ # Create SpeechBrain adapter and wrap it with existing ASRTranscriber interface by setting backend
336
+ self._transcriber = ASRTranscriber(
337
+ config=asr_cfg, models_dir=self.config.models_dir
338
+ )
339
+ self._transcriber.config.backend = "speechbrain"
340
+ else:
341
+ self._transcriber = ASRTranscriber(
342
+ config=asr_cfg,
343
+ models_dir=self.config.models_dir,
344
+ )
345
+ return self._transcriber
346
+
347
+ @property
348
+ def summarizer(self) -> BERTSummarizer:
349
+ """Get summarizer (lazy loaded)"""
350
+ if self._summarizer is None:
351
+ self._summarizer = BERTSummarizer(
352
+ config=SummarizationConfig(num_sentences=self.config.num_summary_sentences)
353
+ )
354
+ return self._summarizer
355
+
356
+ @property
357
+ def doc_generator(self) -> DocumentGenerator:
358
+ """Get document generator (lazy loaded)"""
359
+ if self._doc_generator is None:
360
+ self._doc_generator = DocumentGenerator(output_dir=self.config.output_dir)
361
+ return self._doc_generator
362
+
363
+ @property
364
+ def evaluator(self) -> Evaluator:
365
+ """Get evaluator (lazy loaded)"""
366
+ if self._evaluator is None:
367
+ self._evaluator = Evaluator(output_dir=self.config.output_dir)
368
+ return self._evaluator
369
+
370
+ # =========================================================================
371
+ # Main Processing Methods
372
+ # =========================================================================
373
+
374
+ def process(
375
+ self,
376
+ audio_path: str,
377
+ title: str = "Notulensi Rapat",
378
+ date: Optional[str] = None,
379
+ location: str = "",
380
+ num_speakers: Optional[int] = None,
381
+ output_filename: Optional[str] = None,
382
+ progress_callback: Optional[Callable[[str, int, int], None]] = None,
383
+ ) -> PipelineResult:
384
+ """
385
+ Process audio file through complete pipeline.
386
+
387
+ Args:
388
+ audio_path: Path to audio file
389
+ title: Meeting title for document
390
+ date: Meeting date (default: today)
391
+ location: Meeting location/platform
392
+ num_speakers: Known number of speakers (auto-detect if None)
393
+ output_filename: Output .docx filename (auto-generated if None)
394
+ progress_callback: Callback function(step_name, current, total)
395
+
396
+ Returns:
397
+ PipelineResult with all outputs and metadata
398
+ """
399
+ start_time = time.time()
400
+
401
+ def update_progress(step: str, current: int, total: int):
402
+ if progress_callback:
403
+ progress_callback(step, current, total)
404
+ if self.config.verbose:
405
+ self._log(f"Step {current}/{total}: {step}")
406
+
407
+ self._log("=" * 60)
408
+ self._log(f"Processing: {audio_path}")
409
+ self._log("=" * 60)
410
+
411
+ # =====================================================================
412
+ # Step 1: Load and preprocess audio
413
+ # =====================================================================
414
+ update_progress("Loading audio", 1, 5)
415
+
416
+ with Timer("Audio loading"):
417
+ self._waveform, self._sample_rate = self.audio_processor.load_audio(audio_path)
418
+
419
+ duration = self.audio_processor.get_duration(self._waveform, self._sample_rate)
420
+ self._log(f"Audio loaded: {format_duration(duration)} ({duration:.2f}s)")
421
+
422
+ # Validate audio duration
423
+ max_duration_minutes = getattr(self.config, "max_duration_minutes", 60)
424
+ max_duration_seconds = max_duration_minutes * 60
425
+ if duration > max_duration_seconds:
426
+ error_msg = (
427
+ f"Audio duration ({duration:.1f}s) exceeds maximum allowed duration "
428
+ f"({max_duration_seconds}s / {max_duration_minutes} minutes). "
429
+ "Please split the audio or increase max_duration_minutes in config."
430
+ )
431
+ self.logger.error(error_msg)
432
+ raise ValueError(error_msg)
433
+
434
+ # =====================================================================
435
+ # Step 2: Speaker diarization (optionally tune hyperparameters first)
436
+ # =====================================================================
437
+ update_progress("Speaker diarization", 2, 5)
438
+
439
+ # Optional automatic tuning step
440
+ if getattr(self.config, "tune_diarization", False):
441
+ self._log("Tuning diarization hyperparameters...")
442
+ try:
443
+ tune_res = self.diarizer.auto_tune(
444
+ self._waveform, self._sample_rate, num_speakers=num_speakers
445
+ )
446
+ # store tuning result for later reporting
447
+ self._diarization_tune_result = tune_res or {}
448
+ except Exception as e:
449
+ self._diarization_tune_result = {}
450
+ self._log(f"Diarization tuning failed (continuing with defaults): {e}")
451
+
452
+ with Timer("Diarization"):
453
+ # Pass cache directory and audio id so diarizer can cache embeddings
454
+ self._diarization_segments = self.diarizer.process(
455
+ self._waveform,
456
+ self._sample_rate,
457
+ num_speakers=num_speakers or self.config.num_speakers,
458
+ cache_dir=self.config.cache_dir,
459
+ audio_id=Path(audio_path).stem,
460
+ fast_mode=self.config.fast_mode,
461
+ )
462
+
463
+ unique_speakers = set(seg.speaker_id for seg in self._diarization_segments)
464
+ self._log(
465
+ f"Found {len(unique_speakers)} speakers, {len(self._diarization_segments)} segments"
466
+ )
467
+
468
+ # =====================================================================
469
+ # Step 3: ASR transcription
470
+ # =====================================================================
471
+ update_progress("Transcribing speech", 3, 5)
472
+
473
+ with Timer("Transcription"):
474
+ self._transcript_segments = self.transcriber.transcribe_segments(
475
+ self._waveform, self._diarization_segments, self._sample_rate
476
+ )
477
+
478
+ total_words = sum(seg.word_count for seg in self._transcript_segments)
479
+ self._log(f"Transcribed {len(self._transcript_segments)} segments, ~{total_words} words")
480
+
481
+ # =====================================================================
482
+ # Step 4: BERT summarization
483
+ # =====================================================================
484
+ update_progress("Generating summary", 4, 5)
485
+
486
+ # If a manual speaker map was provided via config, apply it so summarizer sees mapped names
487
+ if getattr(self.config, "speaker_map_path", None):
488
+ try:
489
+ speaker_map = self._load_speaker_map(self.config.speaker_map_path)
490
+ self._apply_speaker_map(speaker_map)
491
+ except Exception as e:
492
+ self._log(f"Failed to load/apply speaker map: {e}")
493
+
494
+ with Timer("Summarization"):
495
+ self._summary = self.summarizer.summarize(self._transcript_segments)
496
+
497
+ self._log(f"Generated summary with {len(self._summary.key_points)} key points")
498
+
499
+ # =====================================================================
500
+ # Step 5: Generate document
501
+ # =====================================================================
502
+ update_progress("Generating document", 5, 5)
503
+
504
+ # Prepare metadata
505
+ participants = list(unique_speakers)
506
+ # If speaker map provided, map participants accordingly
507
+ if getattr(self.config, "speaker_map_path", None):
508
+ try:
509
+ speaker_map = self._load_speaker_map(self.config.speaker_map_path)
510
+ participants = [speaker_map.get(p, p) for p in participants]
511
+ except Exception:
512
+ pass
513
+
514
+ metadata = MeetingMetadata(
515
+ title=title,
516
+ date=date or datetime.now().strftime("%d %B %Y"),
517
+ time=datetime.now().strftime("%H:%M"),
518
+ location=location,
519
+ duration=format_duration(duration),
520
+ participants=participants,
521
+ )
522
+
523
+ # Generate filename if not provided
524
+ if output_filename is None:
525
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
526
+ safe_title = sanitize_filename(title)[:30]
527
+ output_filename = f"notulensi_{safe_title}_{timestamp}.docx"
528
+
529
+ with Timer("Document generation"):
530
+ doc_path = self.doc_generator.generate(
531
+ metadata=metadata,
532
+ summary=self._summary,
533
+ transcript=self._transcript_segments,
534
+ output_filename=output_filename,
535
+ )
536
+
537
+ self._log(f"Document saved: {doc_path}")
538
+
539
+ # =====================================================================
540
+ # Save intermediate results
541
+ # =====================================================================
542
+ if self.config.save_intermediate:
543
+ self._save_intermediate_results(audio_path, metadata)
544
+
545
+ # Save speaker map alongside results if provided
546
+ if getattr(self.config, "speaker_map_path", None):
547
+ try:
548
+ speaker_map = self._load_speaker_map(self.config.speaker_map_path)
549
+ save_json(
550
+ speaker_map,
551
+ Path(self.config.cache_dir) / f"{Path(audio_path).stem}_speaker_map.json",
552
+ )
553
+ except Exception:
554
+ pass
555
+
556
+ # =====================================================================
557
+ # Build result
558
+ # =====================================================================
559
+ processing_time = time.time() - start_time
560
+
561
+ result = PipelineResult(
562
+ audio_path=audio_path,
563
+ audio_duration=duration,
564
+ num_speakers=len(unique_speakers),
565
+ num_segments=len(self._transcript_segments),
566
+ total_words=total_words,
567
+ processing_time=processing_time,
568
+ segments=[seg.to_dict() for seg in self._transcript_segments],
569
+ transcript_text=self.get_transcript_text(),
570
+ summary=self._summary.to_dict(),
571
+ document_path=doc_path,
572
+ metadata={
573
+ "title": title,
574
+ "date": date or datetime.now().strftime("%Y-%m-%d"),
575
+ "location": location,
576
+ "device": self.config.device,
577
+ "asr_model": self.config.asr_model_id,
578
+ },
579
+ )
580
+
581
+ self._log("=" * 60)
582
+ self._log(f"Processing complete! Total time: {format_duration(processing_time)}")
583
+ self._log(f"Output: {doc_path}")
584
+ self._log("=" * 60)
585
+
586
+ return result
587
+
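A short sketch of wiring a progress callback into process(); the callback signature (step_name, current, total) follows the type hint above, and the audio path, title, and speaker count are illustrative.

from src.pipeline import MeetingTranscriberPipeline, PipelineConfig

def show_progress(step: str, current: int, total: int) -> None:
    # Called once per pipeline stage (loading, diarization, ASR, summary, document).
    print(f"[{current}/{total}] {step}")

pipeline = MeetingTranscriberPipeline(PipelineConfig(preset="fast"))
result = pipeline.process(
    "meeting.wav",
    title="Rapat Koordinasi",
    num_speakers=3,
    progress_callback=show_progress,
)
print(f"Done in {result.processing_time:.1f}s")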
588
+ # =========================================================================
589
+ # Individual Step Methods
590
+ # =========================================================================
591
+
592
+ def load_audio(self, audio_path: str) -> Tuple[torch.Tensor, int]:
593
+ """Load and preprocess audio file"""
594
+ self._waveform, self._sample_rate = self.audio_processor.load_audio(audio_path)
595
+ return self._waveform, self._sample_rate
596
+
597
+ def run_diarization(self, num_speakers: Optional[int] = None) -> List[SpeakerSegment]:
598
+ """Run diarization on loaded audio"""
599
+ if self._waveform is None:
600
+ raise ValueError("Audio not loaded. Call load_audio() first.")
601
+
602
+ self._diarization_segments = self.diarizer.process(
603
+ self._waveform, self._sample_rate, num_speakers=num_speakers
604
+ )
605
+ return self._diarization_segments
606
+
607
+ def run_transcription(self) -> List[TranscriptSegment]:
608
+ """Run ASR on diarized segments"""
609
+ if self._diarization_segments is None:
610
+ raise ValueError("Diarization not done. Call run_diarization() first.")
611
+
612
+ self._transcript_segments = self.transcriber.transcribe_segments(
613
+ self._waveform, self._diarization_segments, self._sample_rate
614
+ )
615
+ return self._transcript_segments
616
+
617
+ def run_summarization(self) -> MeetingSummary:
618
+ """Generate summary from transcript"""
619
+ if self._transcript_segments is None:
620
+ raise ValueError("Transcription not done. Call run_transcription() first.")
621
+
622
+ self._summary = self.summarizer.summarize(self._transcript_segments)
623
+ return self._summary
624
+
625
+ def generate_document(
626
+ self, metadata: MeetingMetadata, output_filename: str = "notulensi.docx"
627
+ ) -> str:
628
+ """Generate .docx document"""
629
+ if self._transcript_segments is None or self._summary is None:
630
+ raise ValueError("Transcript and summary required.")
631
+
632
+ return self.doc_generator.generate(
633
+ metadata=metadata,
634
+ summary=self._summary,
635
+ transcript=self._transcript_segments,
636
+ output_filename=output_filename,
637
+ )
638
+
639
+ # =========================================================================
640
+ # Evaluation Methods
641
+ # =========================================================================
642
+
643
+ def evaluate(
644
+ self,
645
+ reference_transcript: Optional[str] = None,
646
+ reference_diarization: Optional[List[Tuple[str, float, float]]] = None,
647
+ reference_summary: Optional[str] = None,
648
+ sample_name: str = "sample",
649
+ condition: str = "unknown",
650
+ ) -> EvaluationResult:
651
+ """
652
+ Evaluate pipeline output against ground truth.
653
+
654
+ Args:
655
+ reference_transcript: Ground truth transcript text
656
+ reference_diarization: Ground truth diarization [(speaker, start, end), ...]
657
+ reference_summary: Ground truth summary text (for summary evaluation)
658
+ sample_name: Name for this sample
659
+ condition: Test condition name
660
+
661
+ Returns:
662
+ EvaluationResult with WER, DER, and optional summary metrics
663
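+
+ Example (illustrative; the reference file and speaker labels are hypothetical):
+ >>> ref_rttm = [("SPK_A", 0.0, 12.5), ("SPK_B", 12.5, 30.0)]
+ >>> eval_result = pipeline.evaluate(
+ ... reference_transcript=open("ref_transcript.txt").read(),
+ ... reference_diarization=ref_rttm,
+ ... sample_name="meeting_01",
+ ... condition="quiet_room",
+ ... )
+ >>> print(eval_result.wer_result.wer, eval_result.der_result.der)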
+ """
664
+ wer_result = None
665
+ der_result = None
666
+
667
+ # Calculate WER if reference transcript provided
668
+ if reference_transcript and self._transcript_segments:
669
+ hypothesis = self.get_transcript_text()
670
+ wer_result = self.evaluator.calculate_wer(reference_transcript, hypothesis)
671
+ self._log(f"WER: {wer_result.wer:.4f} ({wer_result.wer*100:.2f}%)")
672
+
673
+ # Calculate DER if reference diarization provided
674
+ if reference_diarization and self._diarization_segments:
675
+ hypothesis_diarization = [
676
+ (seg.speaker_id, seg.start, seg.end) for seg in self._diarization_segments
677
+ ]
678
+ der_result = self.evaluator.calculate_der(reference_diarization, hypothesis_diarization)
679
+ self._log(f"DER: {der_result.der:.4f} ({der_result.der*100:.2f}%)")
680
+
681
+ # If reference diarization not provided but reference transcript contains speaker labels,
682
+ # attempt to build a reference diarization by aligning the labeled transcript to the
683
+ # pipeline's transcript segments. This often improves DER accuracy when GT RTTM is missing.
684
+ if not reference_diarization and reference_transcript and self._diarization_segments:
685
+ # Heuristic detection: presence of 'Name:' lines
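+ # e.g. reference lines of the form "Budi: Selamat pagi semua" (speaker name illustrative)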
686
+ if ":" in reference_transcript and any(
687
+ line.strip().endswith(":") or ":" in line
688
+ for line in reference_transcript.splitlines()[:20]
689
+ ):
690
+ try:
691
+ from src.utils import (
692
+ align_reference_to_segments,
693
+ parse_speaker_labeled_text,
694
+ )
695
+
696
+ utterances = parse_speaker_labeled_text(reference_transcript)
697
+ if utterances:
698
+ hyp_segs = self._transcript_segments or []
699
+ # Build reference diarization from alignment
700
+ derived_ref = align_reference_to_segments(utterances, hyp_segs)
701
+ if derived_ref:
702
+ hypothesis_diarization = [
703
+ (seg.speaker_id, seg.start, seg.end)
704
+ for seg in self._diarization_segments
705
+ ]
706
+ der_result = self.evaluator.calculate_der(
707
+ derived_ref, hypothesis_diarization
708
+ )
709
+ self._log(
710
+ f"Derived RTTM used for DER (from speaker-labeled transcript). DER: {der_result.der:.4f} ({der_result.der*100:.2f}%)"
711
+ )
712
+ except Exception as e:
713
+ self._log(f"Auto-alignment for RTTM failed: {e}")
714
+ pass
715
+
716
+ # Summary evaluation (if reference_summary provided)
717
+ summary_result = None
718
+ if reference_summary and self._summary:
719
+ try:
720
+ # Prefer overview text if available, otherwise join key points
721
+ hyp_summary = getattr(self._summary, "overview", "") or " ".join(getattr(self._summary, "key_points", []))
722
+ summary_result = self.evaluator.calculate_summary_metrics(reference_summary, hyp_summary)
723
+ self._log(
724
+ f"Summary metrics - ROUGE1_F: {summary_result.rouge.get('rouge1_f', 0.0):.4f}, BERTScore_F1: {summary_result.bertscore.get('bertscore_f1', 0.0):.4f}"
725
+ )
726
+ except Exception as e:
727
+ self._log(f"Summary evaluation failed: {e}")
728
+
729
+ # Build evaluation metadata: include relevant hyperparameters and tuning info
730
+ metadata: Dict[str, Any] = {}
731
+
732
+ try:
733
+ # ASR config
734
+ asr_cfg = getattr(self.transcriber, "config", None)
735
+ if asr_cfg is not None:
736
+ metadata["asr_backend"] = getattr(asr_cfg, "backend", None)
737
+ metadata["asr_model_id"] = getattr(asr_cfg, "model_id", None)
738
+ metadata["asr_language"] = getattr(asr_cfg, "language", None)
739
+ metadata["asr_use_full_audio_for_segments"] = getattr(
740
+ asr_cfg, "use_full_audio_for_segments", None
741
+ )
742
+ metadata["asr_whisperx_compute_type"] = getattr(asr_cfg, "whisperx_compute_type", None)
743
+ metadata["asr_whisperx_vad_filter"] = getattr(asr_cfg, "whisperx_vad_filter", None)
744
+ metadata["asr_parallel_workers"] = getattr(asr_cfg, "parallel_workers", None)
745
+ except Exception:
746
+ pass
747
+
748
+ try:
749
+ dz_cfg = getattr(self.diarizer, "config", None)
750
+ if dz_cfg is not None:
751
+ # pick a sensible subset of diarizer params
752
+ metadata["diarizer_vad_threshold"] = getattr(dz_cfg, "vad_threshold", None)
753
+ metadata["diarizer_min_speech_duration"] = getattr(dz_cfg, "min_speech_duration", None)
754
+ metadata["diarizer_segment_window"] = getattr(dz_cfg, "segment_window", None)
755
+ metadata["diarizer_segment_hop"] = getattr(dz_cfg, "segment_hop", None)
756
+ metadata["diarizer_clustering_method"] = getattr(dz_cfg, "clustering_method", None)
757
+ metadata["diarizer_clustering_threshold"] = getattr(dz_cfg, "clustering_threshold", None)
758
+ metadata["diarizer_min_cluster_size"] = getattr(dz_cfg, "min_cluster_size", None)
759
+ metadata["diarizer_iterative_merge_threshold"] = getattr(
760
+ dz_cfg, "iterative_merge_threshold", None
761
+ )
762
+ metadata["diarizer_target_num_speakers"] = getattr(dz_cfg, "target_num_speakers", None)
763
+ metadata["diarizer_target_force_threshold"] = getattr(dz_cfg, "target_force_threshold", None)
764
+ metadata["diarizer_merge_gap_threshold"] = getattr(dz_cfg, "merge_gap_threshold", None)
765
+ metadata["diarizer_use_fast_embedding"] = getattr(dz_cfg, "use_fast_embedding", None)
766
+ metadata["diarizer_embedding_model_id"] = getattr(dz_cfg, "embedding_model_id", None)
767
+ except Exception:
768
+ pass
769
+
770
+ metadata["tune_diarization_requested"] = bool(getattr(self.config, "tune_diarization", False))
771
+ metadata["diarization_tune_result"] = self._diarization_tune_result or {}
772
+
773
+ # Reference information
774
+ metadata["reference_transcript_provided"] = bool(reference_transcript)
775
+ metadata["reference_diarization_provided"] = bool(reference_diarization)
776
+ metadata["used_derived_rttm"] = bool("derived_ref" in locals() and derived_ref)
777
+
778
+ # Optional diarization method comparison (agglomerative vs spectral)
779
+ if getattr(self.config, "diarization_compare", False) and reference_diarization:
780
+ try:
781
+ # Recompute speech regions/windows/embeddings for re-clustering
782
+ speech_regions = self.diarizer._detect_speech(self._waveform, self._sample_rate)
783
+ windows = self.diarizer._create_windows(speech_regions)
784
+ embeddings = self.diarizer._extract_embeddings(
785
+ self._waveform, windows, self._sample_rate, cache_dir=self.config.cache_dir, audio_id=Path(sample_name).stem
786
+ )
787
+
788
+ comp_results = {}
789
+ for method in ("agglomerative", "spectral"):
790
+ try:
791
+ labels = self.diarizer._cluster_embeddings(embeddings, num_speakers=None, method_override=method)
792
+ hyp_segments = self.diarizer._create_segments(windows, labels, embeddings)
793
+ hyp_rttm = [(s.speaker_id, s.start, s.end) for s in hyp_segments]
794
+ der_res = self.evaluator.calculate_der(reference_diarization, hyp_rttm)
795
+ comp_results[method] = der_res.to_dict()
796
+ except Exception as e:
797
+ comp_results[method] = {"error": str(e)}
798
+
799
+ metadata["diarization_comparison"] = comp_results
800
+ except Exception as e:
801
+ self._log(f"Diarization comparison failed: {e}")
802
+
803
+ return EvaluationResult(
804
+ sample_name=sample_name,
805
+ condition=condition,
806
+ wer_result=wer_result,
807
+ der_result=der_result,
808
+ summary_result=summary_result,
809
+ metadata=metadata,
810
+ )
811
+
812
+ # =========================================================================
813
+ # Utility Methods
814
+ # =========================================================================
815
+
816
+ def get_transcript_text(self) -> str:
817
+ """Get full transcript as plain text"""
818
+ if self._transcript_segments is None:
819
+ return ""
820
+ return " ".join(seg.text for seg in self._transcript_segments if seg.text)
821
+
822
+ def get_formatted_transcript(self) -> str:
823
+ """Get transcript with speaker labels and timestamps"""
824
+ if self._transcript_segments is None:
825
+ return ""
826
+
827
+ lines = []
828
+ for seg in self._transcript_segments:
829
+ timestamp = format_duration(seg.start)
830
+ lines.append(f"[{timestamp}] {seg.speaker_id}: {seg.text}")
831
+
832
+ return "\n".join(lines)
833
+
834
+ def get_speaker_stats(self) -> Dict[str, Dict[str, Any]]:
835
+ """Get statistics per speaker"""
836
+ if self._transcript_segments is None:
837
+ return {}
838
+
839
+ stats = {}
840
+ for seg in self._transcript_segments:
841
+ if seg.speaker_id not in stats:
842
+ stats[seg.speaker_id] = {"word_count": 0, "duration": 0.0, "segment_count": 0}
843
+
844
+ stats[seg.speaker_id]["word_count"] += seg.word_count
845
+ stats[seg.speaker_id]["duration"] += seg.duration
846
+ stats[seg.speaker_id]["segment_count"] += 1
847
+
848
+ return stats
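+ # Illustrative return shape (values are hypothetical):
+ # {"SPK_0": {"word_count": 812, "duration": 421.7, "segment_count": 35},
+ #  "SPK_1": {"word_count": 540, "duration": 298.2, "segment_count": 28}}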
849
+
850
+ def clear_state(self):
851
+ """Clear internal state for fresh processing"""
852
+ self._waveform = None
853
+ self._sample_rate = None
854
+ self._diarization_segments = None
855
+ self._transcript_segments = None
856
+ self._summary = None
857
+
858
+ def _log(self, message: str):
859
+ """Log message"""
860
+ if self.config.verbose:
861
+ print(f"[Pipeline] {message}")
862
+ self.logger.info(message)
863
+
864
+ def _load_speaker_map(self, path: str) -> dict:
865
+ """Load a speaker map from JSON or YAML file."""
866
+ p = Path(path)
867
+ if not p.exists():
868
+ raise FileNotFoundError(f"Speaker map file not found: {path}")
869
+ try:
870
+ import json
871
+
872
+ with open(p, "r", encoding="utf-8") as fh:
873
+ data = json.load(fh)
874
+ if not isinstance(data, dict):
875
+ raise ValueError("Speaker map must be a JSON object mapping labels to names")
876
+ return data
877
+ except Exception:
878
+ try:
879
+ import yaml
880
+
881
+ with open(p, "r", encoding="utf-8") as fh:
882
+ data = yaml.safe_load(fh)
883
+ if not isinstance(data, dict):
884
+ raise ValueError("Speaker map must be a mapping in YAML/JSON format")
885
+ return data
886
+ except Exception as e:
887
+ raise ValueError(f"Failed to parse speaker map: {e}")
888
+
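+ # Example speaker map file (JSON); labels and names are illustrative:
+ # {"SPK_0": "Budi", "SPK_1": "Sari", "SPK_2": "Andi"}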
889
+ def _apply_speaker_map(self, mapping: dict):
890
+ """Apply speaker mapping to transcript segments and summary action items.
891
+
892
+ This replaces `seg.speaker_id` with the provided name and stores the original id in
893
+ `seg.metadata['original_speaker_id']` for traceability.
894
+ """
895
+ if not mapping:
896
+ return
897
+
898
+ # Update transcript segments if they exist
899
+ if getattr(self, "_transcript_segments", None):
900
+ for seg in self._transcript_segments:
901
+ orig = seg.speaker_id
902
+ mapped = mapping.get(orig)
903
+ if mapped and mapped != orig:
904
+ seg.metadata["original_speaker_id"] = orig
905
+ seg.speaker_id = mapped
906
+
907
+ # Update action item owners in summary
908
+ try:
909
+ for ai in self._summary.action_items or []:
910
+ owner = ai.get("owner")
911
+ if owner and owner in mapping:
912
+ ai["owner"] = mapping[owner]
913
+ except Exception:
914
+ pass
915
+
916
+ # Finally update diarization segments as well (if present)
917
+ try:
918
+ self._log(f"Applying speaker mapping to diarization segments: {mapping}")
919
+ for dseg in self._diarization_segments or []:
920
+ orig = dseg.speaker_id
921
+ mapped = mapping.get(orig)
922
+ self._log(f"Segment {orig} -> mapped: {mapped}")
923
+ if mapped and mapped != orig:
924
+ dseg.metadata["original_speaker_id"] = orig
925
+ dseg.speaker_id = mapped
926
+ self._log(
927
+ f"Post-map speaker ids: {[d.speaker_id for d in (self._diarization_segments or [])]}"
928
+ )
929
+ except Exception as e:
930
+ self._log(f"Error applying speaker map to diarization segments: {e}")
931
+ pass
932
+
933
+ def _save_intermediate_results(self, audio_path: str, metadata: MeetingMetadata):
934
+ """Save intermediate results to JSON"""
935
+ base_name = Path(audio_path).stem
936
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
937
+
938
+ results = {
939
+ "audio_path": audio_path,
940
+ "timestamp": timestamp,
941
+ "metadata": {
942
+ "title": metadata.title,
943
+ "date": metadata.date,
944
+ "duration": metadata.duration,
945
+ },
946
+ "config": {
947
+ "sample_rate": self.config.sample_rate,
948
+ "asr_model": self.config.asr_model_id,
949
+ "device": self.config.device,
950
+ },
951
+ "diarization": [
952
+ {
953
+ "speaker_id": seg.speaker_id,
954
+ "start": seg.start,
955
+ "end": seg.end,
956
+ "is_overlap": seg.is_overlap,
957
+ }
958
+ for seg in (self._diarization_segments or [])
959
+ ],
960
+ "transcript": [seg.to_dict() for seg in (self._transcript_segments or [])],
961
+ "summary": self._summary.to_dict() if self._summary else None,
962
+ }
963
+
964
+ output_path = Path(self.config.cache_dir) / f"{base_name}_{timestamp}_results.json"
965
+ save_json(results, output_path)
966
+
967
+ self._log(f"Intermediate results saved: {output_path}")
968
+
969
+ # ------------------------------------------------------------------
970
+ # Convenience methods for interactive flows (UI, Streamlit)
971
+ # ------------------------------------------------------------------
972
+ def run_diarization(self, audio_path: str) -> dict:
973
+ """Run loading + diarization steps and return a dict with summary info.
974
+
975
+ Returns: {"audio_duration": float, "num_windows": int, "num_speech_regions": int, "unique_speakers": [..], "segments": [..]}
976
+ """
977
+ # Load audio
978
+ self._audio_path = audio_path  # remember the source path for later steps (e.g. continue_from_diarization)
+ self._waveform, self._sample_rate = self.audio_processor.load_audio(audio_path)
979
+ duration = self.audio_processor.get_duration(self._waveform, self._sample_rate)
980
+
981
+ # Run diarization
982
+ self._diarization_segments = self.diarizer.process(
983
+ self._waveform,
984
+ self._sample_rate,
985
+ num_speakers=None,
986
+ cache_dir=self.config.cache_dir,
987
+ audio_id=Path(audio_path).stem,
988
+ fast_mode=self.config.fast_mode,
989
+ )
990
+
991
+ unique_speakers = sorted(list(set(seg.speaker_id for seg in self._diarization_segments)))
992
+
993
+ return {
994
+ "audio_duration": duration,
995
+ "num_segments": len(self._diarization_segments),
996
+ "unique_speakers": unique_speakers,
997
+ "segments": [
998
+ {"speaker_id": s.speaker_id, "start": s.start, "end": s.end}
999
+ for s in self._diarization_segments
1000
+ ],
1001
+ }
1002
+
1003
+ def apply_speaker_map(
1004
+ self, mapping: dict, save_to_cache: bool = False, audio_id: Optional[str] = None
1005
+ ):
1006
+ """Apply a manual speaker mapping to internal state and optionally save the map to cache.
1007
+
1008
+ mapping: dict mapping original speaker id -> desired display name
1009
+ """
1010
+ self._apply_speaker_map(mapping)
1011
+ if save_to_cache and audio_id:
1012
+ try:
1013
+ save_json(mapping, Path(self.config.cache_dir) / f"{audio_id}_speaker_map.json")
1014
+ except Exception:
1015
+ pass
1016
+
1017
+ def continue_from_diarization(
1018
+ self,
1019
+ title: str = "Notulensi Rapat",
1020
+ date: Optional[str] = None,
1021
+ location: str = "",
1022
+ output_filename: Optional[str] = None,
1023
+ progress_callback: Optional[Callable[[str, int, int], None]] = None,
1024
+ ) -> PipelineResult:
1025
+ """Continue processing from the current _waveform and _diarization_segments.
1026
+
1027
+ Runs ASR, summarization, and document generation using existing in-memory diarization.
1028
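+
+ Example (illustrative interactive flow; the mapping values are hypothetical):
+ >>> pipeline.run_diarization("meeting.wav")
+ >>> pipeline.apply_speaker_map({"SPK_0": "Budi", "SPK_1": "Sari"})
+ >>> result = pipeline.continue_from_diarization(title="Rapat Mingguan")
+ >>> print(result.document_path)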
+ """
1029
+ if (
1030
+ getattr(self, "_waveform", None) is None
1031
+ or getattr(self, "_diarization_segments", None) is None
1032
+ ):
1033
+ raise RuntimeError(
1034
+ "Diarization state not found. Run run_diarization(audio_path) first."
1035
+ )
1036
+
1037
+ update_progress = lambda step, cur, total: (
1038
+ progress_callback(step, cur, total) if progress_callback else None
1039
+ )
1040
+
1041
+ # Step 3: ASR
1042
+ update_progress("Transcribing speech", 3, 5)
1043
+ with Timer("Transcription"):
1044
+ self._transcript_segments = self.transcriber.transcribe_segments(
1045
+ self._waveform, self._diarization_segments, self._sample_rate
1046
+ )
1047
+
1048
+ total_words = sum(seg.word_count for seg in self._transcript_segments)
1049
+ self._log(f"Transcribed {len(self._transcript_segments)} segments, ~{total_words} words")
1050
+
1051
+ # Apply speaker map if configured
1052
+ if getattr(self.config, "speaker_map_path", None):
1053
+ try:
1054
+ speaker_map = self._load_speaker_map(self.config.speaker_map_path)
1055
+ self._apply_speaker_map(speaker_map)
1056
+ except Exception as e:
1057
+ self._log(f"Failed to load/apply speaker map: {e}")
1058
+
1059
+ # Step 4: Summarization
1060
+ update_progress("Generating summary", 4, 5)
1061
+ with Timer("Summarization"):
1062
+ self._summary = self.summarizer.summarize(self._transcript_segments)
1063
+
1064
+ self._log(f"Generated summary with {len(self._summary.key_points)} key points")
1065
+
1066
+ # Step 5: Document generation
1067
+ update_progress("Generating document", 5, 5)
1068
+
1069
+ participants = list(set(seg.speaker_id for seg in self._diarization_segments))
1070
+ if getattr(self.config, "speaker_map_path", None):
1071
+ try:
1072
+ speaker_map = self._load_speaker_map(self.config.speaker_map_path)
1073
+ participants = [speaker_map.get(p, p) for p in participants]
1074
+ except Exception:
1075
+ pass
1076
+
1077
+ metadata = MeetingMetadata(
1078
+ title=title,
1079
+ date=date or datetime.now().strftime("%d %B %Y"),
1080
+ time=datetime.now().strftime("%H:%M"),
1081
+ location=location,
1082
+ duration=format_duration(
1083
+ self.audio_processor.get_duration(self._waveform, self._sample_rate)
1084
+ ),
1085
+ participants=participants,
1086
+ )
1087
+
1088
+ if output_filename is None:
1089
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1090
+ safe_title = sanitize_filename(title)[:30]
1091
+ output_filename = f"notulensi_{safe_title}_{timestamp}.docx"
1092
+
1093
+ with Timer("Document generation"):
1094
+ doc_path = self.doc_generator.generate(
1095
+ metadata=metadata,
1096
+ summary=self._summary,
1097
+ transcript=self._transcript_segments,
1098
+ output_filename=output_filename,
1099
+ )
1100
+
1101
+ self._log(f"Document saved: {doc_path}")
1102
+
1103
+ # Save intermediate results
1104
+ if self.config.save_intermediate:
1105
+ self._save_intermediate_results(getattr(self, "_audio_path", output_filename), metadata)
1106
+
1107
+ processing_time = 0.0
1108
+ result = PipelineResult(
1109
+ audio_path=getattr(self, "_audio_path", output_filename),
1110
+ audio_duration=self.audio_processor.get_duration(self._waveform, self._sample_rate),
1111
+ num_speakers=len(set(seg.speaker_id for seg in self._diarization_segments)),
1112
+ num_segments=len(self._transcript_segments),
1113
+ total_words=total_words,
1114
+ processing_time=processing_time,
1115
+ segments=[seg.to_dict() for seg in (self._transcript_segments or [])],
1116
+ transcript_text="\n".join([s.text for s in (self._transcript_segments or [])]),
1117
+ summary=self._summary.to_dict() if self._summary else {},
1118
+ document_path=str(doc_path),
1119
+ )
1120
+
1121
+ return result
src/speaker.py ADDED
@@ -0,0 +1,69 @@
1
+ """Speaker classifier scaffold for multi-task training and evaluation.
2
+
3
+ This module provides a small PyTorch `SpeakerClassifier` that maps embeddings
4
+ (or pooled encoder outputs) to speaker logits, plus helpers to build speaker
5
+ mappings from manifests.
6
+ """
7
+
8
+ import json
9
+ from pathlib import Path
10
+ from typing import Dict, List
11
+
12
+ try:
13
+ import torch
14
+ import torch.nn as nn
15
+ except Exception:
16
+ torch = None
17
+ nn = None
18
+
19
+
20
+ class SpeakerClassifier:
21
+ """A light-weight wrapper that exposes an API-compatible classifier.
22
+
23
+ If PyTorch is available, `SpeakerClassifier.model` is a `nn.Module`.
24
+ Otherwise this is a placeholder to keep the dependency optional in tests.
25
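+
+ Example (sketch; the 192-dim input is only an assumed ECAPA-style embedding size):
+ >>> clf = SpeakerClassifier(input_dim=192, num_speakers=4)
+ >>> logits = clf.forward(torch.randn(8, 192))  # shape (8, 4); requires PyTorch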
+ """
26
+
27
+ def __init__(self, input_dim: int, num_speakers: int, dropout: float = 0.1):
28
+ self.input_dim = input_dim
29
+ self.num_speakers = num_speakers
30
+ self.dropout = dropout
31
+ if torch is not None and nn is not None:
32
+ self.model = nn.Sequential(
33
+ nn.Dropout(p=dropout),
34
+ nn.Linear(input_dim, num_speakers),
35
+ )
36
+ else:
37
+ self.model = None
38
+
39
+ def forward(self, x):
40
+ if self.model is None:
41
+ raise RuntimeError("PyTorch not available for SpeakerClassifier")
42
+ return self.model(x)
43
+
44
+
45
+ def build_speaker_map(manifest_paths: List[str]) -> Dict[str, int]:
46
+ """Read JSONL manifest(s) and return a speaker->id mapping.
47
+
48
+ The manifest format: each line is JSON with optional "speaker" key.
49
+ Labels are returned in deterministic sorted order.
50
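+
+ Example (illustrative manifest line and resulting map; file names are hypothetical):
+ {"audio": "clips/0001.wav", "text": "selamat pagi", "speaker": "SPK_A"}
+ >>> build_speaker_map(["train.jsonl"])
+ {'SPK_A': 0, 'SPK_B': 1}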
+ """
51
+ speakers = set()
52
+ for p in manifest_paths:
53
+ pth = Path(p)
54
+ if not pth.exists():
55
+ continue
56
+ with open(pth, "r", encoding="utf-8") as fh:
57
+ for line in fh:
58
+ line = line.strip()
59
+ if not line:
60
+ continue
61
+ try:
62
+ obj = json.loads(line)
63
+ except Exception:
64
+ continue
65
+ spk = obj.get("speaker")
66
+ if spk is not None:
67
+ speakers.add(str(spk))
68
+ sorted_spks = sorted(speakers)
69
+ return {s: i for i, s in enumerate(sorted_spks)}
src/summarizer.py ADDED
@@ -0,0 +1,1783 @@
1
+ """
2
+ BERT Extractive Summarization Module
3
+ ====================================
4
+ Implements extractive summarization using IndoBERT/mBERT for meeting minutes.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ from dataclasses import dataclass, field
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+
13
+ import numpy as np
14
+
15
+
16
+ def _collapse_repeated_phrases_global(text: str, max_ngram: int = 6, min_repeats: int = 2) -> str:
17
+ """Module-level helper to collapse repeated n-gram phrases.
18
+
19
+ Iteratively collapses repeated adjacent n-gram phrases into a single occurrence.
20
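+
+ Example:
+ >>> _collapse_repeated_phrases_global("jadi contohnya jadi contohnya jadi contohnya")
+ 'jadi contohnya'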
+ """
21
+ if not text or min_repeats < 2:
22
+ return text
23
+ pattern = re.compile(r"(\b(?:\w+\s+){0,%d}\w+\b)(?:\s+\1){%d,}" % (max_ngram - 1, min_repeats - 1), flags=re.IGNORECASE)
24
+ prev = None
25
+ out = text
26
+ while prev != out:
27
+ prev = out
28
+ out = pattern.sub(r"\1", out)
29
+ return out
30
+
31
+ from src.transcriber import TranscriptSegment
32
+
33
+
34
+ @dataclass
35
+ class SummarizationConfig:
36
+ """Configuration for summarization"""
37
+
38
+ # Method: 'extractive' (BERT embeddings) or 'abstractive' (seq2seq model)
39
+ method: str = "extractive"
40
+
41
+ # Models
42
+ # Use a cached/available model for reliability in offline environments
43
+ sentence_model_id: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
44
+ abstractive_model_id: str = "google/mt5-base"
45
+
46
+ # Extractive settings (increase to capture more key points)
47
+ num_sentences: int = 7
48
+ min_sentence_length: int = 6
49
+ max_sentence_length: int = 300
50
+
51
+ # Abstractive settings
52
+ max_input_chars: int = 1000
53
+ max_summary_length: int = 128
54
+ min_summary_length: int = 30
55
+
56
+ # Light abstractive refinement step (run on condensed extractive overview)
57
+ do_abstractive_refinement: bool = True
58
+ abstractive_refine_max_len: int = 80
59
+
60
+ # Generate a comprehensive executive overview (long, covering entire meeting)
61
+ comprehensive_overview: bool = True
62
+ comprehensive_max_length: int = 512
63
+
64
+ # Post-processing options
65
+ polish_overview: bool = True
66
+ semantic_dedup_threshold: float = 0.75
67
+
68
+ # Scoring weights
69
+ position_weight: float = 0.15
70
+ length_weight: float = 0.10
71
+ similarity_weight: float = 0.75
72
+
73
+ # Keywords for detection
74
+ decision_keywords: List[str] = field(
75
+ default_factory=lambda: [
76
+ "diputuskan",
77
+ "disepakati",
78
+ "kesimpulan",
79
+ "keputusan",
80
+ "jadi",
81
+ "maka",
82
+ "sepakat",
83
+ "setuju",
84
+ "final",
85
+ "kesepakatan",
86
+ "disimpulkan",
87
+ "ditetapkan",
88
+ "disetujui",
89
+ "putus",
90
+ ]
91
+ )
92
+
93
+ action_keywords: List[str] = field(
94
+ default_factory=lambda: [
95
+ "akan",
96
+ "harus",
97
+ "perlu",
98
+ "tolong",
99
+ "mohon",
100
+ "harap",
101
+ "deadline",
102
+ "target",
103
+ "tugas",
104
+ "tanggung jawab",
105
+ "action item",
106
+ "follow up",
107
+ "tindak lanjut",
108
+ "dikerjakan",
109
+ "selesaikan",
110
+ "lakukan",
111
+ "siapkan",
112
+ "minggu depan",
113
+ "besok",
114
+ "segera",
115
+ "bikin",
116
+ "buat",
117
+ ]
118
+ )
119
+
120
+ # Device
121
+ device: str = "cpu"
122
+
123
+
124
+ @dataclass
125
+ class MeetingSummary:
126
+ """Structured meeting summary"""
127
+
128
+ overview: str
129
+ key_points: List[str]
130
+ decisions: List[str]
131
+ action_items: List[Dict[str, str]]
132
+ topics: List[str] = field(default_factory=list)
133
+
134
+ def to_dict(self) -> Dict[str, Any]:
135
+ """Convert to dictionary"""
136
+ return {
137
+ "overview": self.overview,
138
+ "key_points": self.key_points,
139
+ "decisions": self.decisions,
140
+ "action_items": self.action_items,
141
+ "topics": self.topics,
142
+ "keywords": getattr(self, "keywords", []),
143
+ }
144
+
145
+ def __str__(self) -> str:
146
+ """String representation"""
147
+ lines = []
148
+ lines.append("=== RINGKASAN RAPAT ===\n")
149
+ lines.append(f"Overview:\n{self.overview}\n")
150
+
151
+ if self.key_points:
152
+ lines.append("Poin-Poin Penting:")
153
+ for i, point in enumerate(self.key_points, 1):
154
+ lines.append(f" {i}. {point}")
155
+ lines.append("")
156
+
157
+ if self.decisions:
158
+ lines.append("Keputusan:")
159
+ for i, decision in enumerate(self.decisions, 1):
160
+ lines.append(f" {i}. {decision}")
161
+ lines.append("")
162
+
163
+ if self.action_items:
164
+ lines.append("Action Items:")
165
+ for i, item in enumerate(self.action_items, 1):
166
+ owner = item.get("owner", "TBD")
167
+ task = item.get("task", "")
168
+ due = item.get("due", "")
169
+ if due:
170
+ lines.append(f" {i}. [{owner}] {task} (Due: {due})")
171
+ else:
172
+ lines.append(f" {i}. [{owner}] {task}")
173
+
174
+ if self.topics:
175
+ lines.append("")
176
+ lines.append("Topik:")
177
+ lines.append(", ".join(self.topics))
178
+
179
+ return "\n".join(lines)
180
+
181
+ def to_json(self) -> str:
182
+ """Return a JSON string for machine-readable outputs."""
183
+ import json
184
+
185
+ return json.dumps(self.to_dict(), ensure_ascii=False, indent=2)
186
+
187
+ def to_yaml(self) -> str:
188
+ """Return a YAML string (requires PyYAML)."""
189
+ try:
190
+ import yaml
191
+
192
+ return yaml.safe_dump(self.to_dict(), allow_unicode=True)
193
+ except Exception:
194
+ # Fallback to JSON if YAML not available
195
+ return self.to_json()
196
+
197
+
198
+ class AbstractiveSummarizer:
199
+ """Abstractive summarizer using HuggingFace transformers pipeline (mt5/mbart/etc)."""
200
+
201
+ def __init__(self, config: Optional[SummarizationConfig] = None):
202
+ self.config = config or SummarizationConfig()
203
+ self._pipeline = None
204
+
205
+ def _load_model(self):
206
+ if self._pipeline is None:
207
+ try:
208
+ from transformers import pipeline
209
+
210
+ device = 0 if self.config.device.startswith("cuda") else -1
211
+ print(f"[Summarizer] Loading abstractive model: {self.config.abstractive_model_id}")
212
+ self._pipeline = pipeline(
213
+ "summarization",
214
+ model=self.config.abstractive_model_id,
215
+ tokenizer=self.config.abstractive_model_id,
216
+ device=device,
217
+ truncation=True,
218
+ )
219
+ print("[Summarizer] Abstractive model loaded successfully")
220
+ except Exception as e:
221
+ print(f"[Summarizer] Warning: abstractive model load failed: {e}")
222
+ self._pipeline = None
223
+
224
+ def _chunk_text(self, text: str) -> List[str]:
225
+ max_chars = int(self.config.max_input_chars)
226
+ if len(text) <= max_chars:
227
+ return [text]
228
+ chunks = []
229
+ start = 0
230
+ while start < len(text):
231
+ end = min(len(text), start + max_chars)
232
+ # try to cut at sentence boundary
233
+ cut = text.rfind(".", start, end)
234
+ if cut <= start:
235
+ cut = end
236
+ chunk = text[start:cut].strip()
237
+ if chunk:
238
+ # prevent repeating identical chunks
239
+ chunk = self._collapse_repeated_phrases(chunk)
240
+ chunks.append(chunk)
241
+ start = cut
242
+ return chunks
243
+
244
+ def _clean_abstractive_output(self, overview: str, full_text: str) -> Tuple[str, List[str]]:
245
+ """Clean artifacts from abstractive model output and produce fallback key points.
246
+
247
+ Returns (overview_clean, key_points)
248
+ """
249
+ overview_clean = self._clean_abstractive_text(overview)
250
+
251
+ # If abstract output is still noisy (placeholders remain or too few alpha tokens), fallback to extractive
252
+ if "<extra_id" in overview or len(re.findall(r"[a-zA-Z]{2,}", overview_clean)) < 10 or re.search(r"\b(\w+)(?:\s+\1){2,}", overview_clean.lower()):
253
+ sentences = BERTSummarizer(self.config)._split_sentences(full_text)
254
+ key_points = [s for s in sentences[: self.config.num_sentences]]
255
+ overview_clean = " ".join(key_points[:3])
256
+ return overview_clean, key_points
257
+
258
+ # Otherwise make sure key points are meaningful and deduplicated
259
+ parts = [s.strip() for s in re.split(r"\.|!|\?", overview_clean) if s.strip()]
260
+ seen_kp = set()
261
+ key_points: List[str] = []
262
+ for p in parts:
263
+ p_clean = re.sub(r"[^\w\s]", "", p) if p else p
264
+ p_clean = re.sub(r"\s+", " ", p_clean).strip()
265
+ if len(p_clean.split()) < 3:
266
+ continue
267
+ low = p_clean.lower()
268
+ if low in seen_kp:
269
+ continue
270
+ seen_kp.add(low)
271
+ key_points.append(p_clean)
272
+ if len(key_points) >= self.config.num_sentences:
273
+ break
274
+
275
+ return overview_clean, key_points
276
+
277
+ def _clean_abstractive_text(self, text: str) -> str:
278
+ """Lightweight cleaning of abstractive text outputs (remove placeholders, collapse punctuation).
279
+
280
+ Kept as a separate method for unit testing/backwards compatibility with older tests.
281
+ Also collapses repeated trivial tokens and reduces punctuation runs.
282
+ """
283
+ t = re.sub(r"<extra_id_\d+>", "", text)
284
+ t = re.sub(r"\)\s*<extra_id_\d+>", "", t)
285
+ # collapse repeated short filler words sequences e.g. "Jadi contohnya Jadi contohnya ..."
286
+ t = self._collapse_repeated_phrases(t)
287
+ t = re.sub(r"\s*[\.]{2,}\s*", ". ", t)
288
+ t = re.sub(r"[!?]{2,}", ".", t)
289
+ t = re.sub(r"\s+", " ", t).strip()
290
+ # Remove leading/trailing hyphens and stray punctuation
291
+ t = re.sub(r"^[-\s]+|[-\s]+$", "", t)
292
+ if not re.search(r"[.!?]$", t):
293
+ t = t + "."
294
+ return t
295
+
296
+ def _generate_keywords(self, text: str, top_k: int = 8) -> List[str]:
297
+ """Generate simple keywords by frequency (fallback)."""
298
+ toks = re.findall(r"\b[a-zA-Z]{4,}\b", text.lower())
299
+ freq = {}
300
+ stop = {"yang","dan","ini","itu","untuk","dengan","juga","sudah","ada","kita","saya","kamu"}
301
+ for w in toks:
302
+ if w in stop:
303
+ continue
304
+ freq[w] = freq.get(w, 0) + 1
305
+ sorted_words = sorted(freq.items(), key=lambda x: x[1], reverse=True)
306
+ return [w for w, _ in sorted_words[:top_k]]
307
+
308
+ def _collapse_repeated_phrases(self, text: str, max_ngram: int = 6, min_repeats: int = 2) -> str:
309
+ """Delegates to module-level collapse helper"""
310
+ return _collapse_repeated_phrases_global(text, max_ngram=max_ngram, min_repeats=min_repeats)
311
+
+
320
+ def _parse_structured_output(self, raw: str, defaults: Dict[str, Any]) -> Tuple[str, List[str]]:
321
+ """Try to parse YAML/JSON or simple structured text into (overview, keywords).
322
+
323
+ If parsing fails, return (cleaned_raw, fallback_keywords)
324
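+
+ Example of an input that parses as YAML (illustrative):
+ overview: Rapat membahas anggaran dan jadwal rilis.
+ keywords: [anggaran, jadwal, rilis]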
+ """
325
+ cleaned = raw.strip()
326
+
327
+ # Try YAML first (if available)
328
+ try:
329
+ import yaml
330
+
331
+ parsed = yaml.safe_load(cleaned)
332
+ if isinstance(parsed, dict):
333
+ ov = parsed.get("overview", "")
334
+ kws = parsed.get("keywords", None)
335
+ if kws is None:
336
+ kws = self._generate_keywords(ov or " ".join(defaults.get("key_points", [])))
337
+ return (ov.strip() if isinstance(ov, str) else "", kws)
338
+ except Exception:
339
+ pass
340
+
341
+ # Try JSON
342
+ try:
343
+ import json
344
+
345
+ parsed = json.loads(cleaned)
346
+ if isinstance(parsed, dict):
347
+ ov = parsed.get("overview", "")
348
+ kws = parsed.get("keywords", None)
349
+ if kws is None:
350
+ kws = self._generate_keywords(ov or " ".join(defaults.get("key_points", [])))
351
+ return (ov.strip() if isinstance(ov, str) else "", kws)
352
+ except Exception:
353
+ pass
354
+
355
+ # Simple heuristic: look for header 'overview:' or 'Ringkasan:' in text
356
+ m = re.search(r"(?im)^(overview|ringkasan)\s*:\s*(.*)$", cleaned)
357
+ if m:
358
+ ov = m.group(2).strip()
359
+ kws = self._generate_keywords(ov or " ".join(defaults.get("key_points", [])))
360
+ return ov, kws
361
+
362
+ # If nothing recognized, return fallback cleaned text and keywords
363
+ return cleaned, self._generate_keywords(cleaned or " ".join(defaults.get("key_points", [])))
364
+
365
+ def _sanitize_for_prompt(self, text: str) -> str:
366
+ """Sanitize text before injecting into the prompt: remove model placeholders, URLs/domains/emails,
367
+ common web-article boilerplate (closing lines like "Semoga bermanfaat"), and collapse repeats."""
368
+ if not text:
369
+ return text
370
+ t = re.sub(r"<extra_id_\d+>", "", text)
371
+ # remove emails
372
+ t = re.sub(r"\b\S+@\S+\.\S+\b", " ", t)
373
+ # remove domain-like tokens (e.g., Eksekutif.com.co.id)
374
+ t = re.sub(r"\b\S+\.(?:com|co\.id|info|id|net|org)(?:\.[a-z]{2,})*\b", " ", t, flags=re.IGNORECASE)
375
+ # remove common article/web boilerplate short phrases that often appear as closings
376
+ t = re.sub(r"(?i)\b(semoga artikel ini bermanfaat(?: bagi anda semua)?|semoga bermanfaat|terima kasih(?: atas masukannya| juga)?)\b[.!\s,]*", " ", t)
377
+ t = re.sub(r"\s+", " ", t).strip()
378
+ t = _collapse_repeated_phrases_global(t)
379
+ return t
380
+
381
+ def _is_repetitive_text(self, text: str, max_run: int = 6) -> bool:
382
+ """Detect highly repetitive model outputs (including repeated n-gram phrases).
383
+
384
+ Returns True if repetition patterns exceed thresholds.
385
+ """
386
+ if not text:
387
+ return False
388
+ # check placeholder presence quickly
389
+ if re.search(r"<extra_id_\d+>", text):
390
+ return True
391
+ # Tokenize
392
+ tokens = re.findall(r"\w+", text.lower())
393
+ if not tokens:
394
+ return False
395
+ # Check simple token runs
396
+ run = 1
397
+ last = tokens[0]
398
+ for tok in tokens[1:]:
399
+ if tok == last:
400
+ run += 1
401
+ if run >= max_run:
402
+ return True
403
+ else:
404
+ last = tok
405
+ run = 1
406
+ # Check n-gram repeated phrase runs for n=1..4
407
+ max_ngram = 4
408
+ n_tokens = len(tokens)
409
+ for n in range(1, max_ngram + 1):
410
+ i = 0
411
+ while i + 2 * n <= n_tokens:
412
+ # compare tokens[i:i+n] with subsequent repeated occurrences
413
+ pattern = tokens[i:i + n]
414
+ run = 1
415
+ j = i + n
416
+ while j + n <= n_tokens and tokens[j:j + n] == pattern:
417
+ run += 1
418
+ j += n
419
+ if run >= max_run:
420
+ return True
421
+ i += 1
422
+ # fallback regex for single-token repetition
423
+ if re.search(r"(\b\w+\b)(?:\s+\1\b){%d,}" % (max_run - 1), text.lower()):
424
+ return True
425
+ return False
426
+
427
+ def _contains_domain_noise(self, text: str) -> bool:
428
+ """Detect domain-like or short web boilerplate noise (e.g., 'Eksekutif.com', 'Semoga artikel ini bermanfaat').
429
+
430
+ Returns True if common domain patterns or boilerplate phrases are found.
431
+ """
432
+ if not text:
433
+ return False
434
+ if re.search(r"\b\S+\.(?:com|co\.id|info|id|net|org)(?:\.[a-z]{2,})*\b", text, flags=re.IGNORECASE):
435
+ return True
436
+ if re.search(r"(?i)\b(semoga artikel ini bermanfaat(?: bagi anda semua)?|semoga bermanfaat|terima kasih)\b", text):
437
+ return True
438
+ return False
439
+
440
+ def _normalize_overview_text(self, text: str) -> str:
441
+ """Normalize overview into a readable paragraph or keep structured lists tidy."""
442
+ if not text:
443
+ return text
444
+ t = text.strip()
445
+ # collapse repeated fragments first
446
+ t = _collapse_repeated_phrases_global(t)
447
+
448
+ # If text contains list markers or section headers, tidy spacing and return
449
+ if "\n-" in t or "Poin-Poin Penting" in t or "Keputusan" in t or "Action Items" in t:
450
+ # normalize newlines and strip extra spaces
451
+ t = re.sub(r"\n\s+", "\n", t)
452
+ t = re.sub(r"\n{2,}", "\n\n", t)
453
+ return t.strip()
454
+
455
+ # Otherwise make a single paragraph and deduplicate near-duplicate fragments
456
+ # split by common separators (newline, bullet, or hyphen sequences)
457
+ if " - " in t:
458
+ parts = [p.strip(" -" ) for p in re.split(r"\s*-\s*", t) if p.strip()]
459
+ else:
460
+ parts = [p.strip() for p in re.split(r"(?<=[.!?])\s+", t) if p.strip()]
461
+
462
+ seen = set()
463
+ uniq = []
464
+ for p in parts:
465
+ norm = re.sub(r"[^a-z0-9 ]", "", p.lower())
466
+ norm = re.sub(r"\s+", " ", norm).strip()
467
+ if not norm:
468
+ continue
469
+ if norm in seen:
470
+ continue
471
+ seen.add(norm)
472
+ uniq.append(p.strip(" -."))
473
+
474
+ para = " ".join(uniq)
475
+ para = re.sub(r"\s+", " ", para).strip()
476
+
477
+ # Remove any leftover emails/domains or short web boilerplate that slipped through
478
+ para = re.sub(r"\b\S+@\S+\.\S+\b", " ", para)
479
+ para = re.sub(r"\b\S+\.(?:com|co\.id|info|id|net|org)(?:\.[a-z]{2,})*\b", " ", para, flags=re.IGNORECASE)
480
+ para = re.sub(r"(?i)\b(semoga artikel ini bermanfaat(?: bagi anda semua)?|semoga bermanfaat|terima kasih(?: atas masukannya| juga)?)\b[.!\s,]*", " ", para)
481
+ para = re.sub(r"\s+", " ", para).strip()
482
+
483
+ if para and not re.search(r"[.!?]$", para):
484
+ para = para + "."
485
+ if para:
486
+ para = para[0].upper() + para[1:]
487
+ return para
488
+
489
+ def _polish_overview(self, overview: str, full_text: str) -> str:
490
+ """Polish overview into an executive, coherent paragraph using abstractive model (if available).
491
+
492
+ Falls back to normalization and deduplication if model not available.
493
+ """
494
+ if not overview:
495
+ return overview
496
+ # Basic normalization first
497
+ overview = _collapse_repeated_phrases_global(overview)
498
+ overview = self._normalize_overview_text(overview)
499
+
500
+ # If model available and config allows, ask for paraphrase/expansion
501
+ if getattr(self.config, "polish_overview", True):
502
+ try:
503
+ self._load_model()
504
+ if self._pipeline is not None:
505
+ prompt = (
506
+ "Paraphrase dan perluas teks berikut menjadi paragraf eksekutif yang jelas, ringkas, dan mudah dibaca. "
507
+ "Jangan sertakan header."
508
+ "\n\nTeks:\n" + overview
509
+ )
510
+ out = self._pipeline(
511
+ prompt,
512
+ max_length=min(getattr(self.config, "comprehensive_max_length", 512), 350),
513
+ min_length=40,
514
+ truncation=True,
515
+ do_sample=False,
516
+ )
517
+ if isinstance(out, list) and out:
518
+ candidate = out[0].get("summary_text", "").strip()
519
+ candidate = self._clean_abstractive_text(candidate)
520
+ candidate = _collapse_repeated_phrases_global(candidate)
521
+ candidate = self._normalize_overview_text(candidate)
522
+ return candidate
523
+ except Exception:
524
+ pass
525
+
526
+ return overview
527
+
528
+ def _semantic_deduplicate(self, items: List[str], threshold: Optional[float] = None) -> List[str]:
529
+ """Deduplicate similar items using sentence-transformer embeddings + cosine similarity.
530
+
531
+ Returns the first occurrence for each semantic group.
532
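+
+ Illustrative behaviour (actual grouping depends on the embedding model and threshold):
+ >>> self._semantic_deduplicate(["Anggaran sudah disetujui", "Anggaran disetujui", "Jadwal rilis digeser"])
+ ['Anggaran sudah disetujui', 'Jadwal rilis digeser']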
+ """
533
+ if not items:
534
+ return []
535
+ thr = threshold if threshold is not None else getattr(self.config, "semantic_dedup_threshold", 0.75)
536
+ # try embeddings
537
+ try:
538
+ embs = self._compute_embeddings(items)
539
+ if embs is not None:
540
+ from sklearn.metrics.pairwise import cosine_similarity
541
+
542
+ sim = cosine_similarity(embs)
543
+ n = len(items)
544
+ taken = set()
545
+ result = []
546
+ for i in range(n):
547
+ if i in taken:
548
+ continue
549
+ result.append(items[i])
550
+ for j in range(i + 1, n):
551
+ if sim[i, j] >= thr:
552
+ taken.add(j)
553
+ # If embeddings didn't merge anything useful, fallback to token-jaccard grouping
554
+ if len(result) == len(items) and len(items) > 1:
555
+ # token Jaccard
556
+ token_sets = [set(re.findall(r"\w+", it.lower())) for it in items]
557
+ taken2 = set()
558
+ result2 = []
559
+ for i in range(len(items)):
560
+ if i in taken2:
561
+ continue
562
+ result2.append(items[i])
563
+ for j in range(i + 1, len(items)):
564
+ if j in taken2:
565
+ continue
566
+ si = token_sets[i]
567
+ sj = token_sets[j]
568
+ if not si or not sj:
569
+ continue
570
+ jacc = len(si & sj) / float(len(si | sj))
571
+ if jacc >= 0.45:
572
+ taken2.add(j)
573
+ return result2
574
+ return result
575
+ else:
576
+ raise ValueError("No embeddings")
577
+ except Exception:
578
+ # fallback to token-jaccard grouping first (robust when embeddings aren't available)
579
+ try:
580
+ token_sets = [set(re.findall(r"\w+", it.lower())) for it in items]
581
+ taken = set()
582
+ res = []
583
+ for i in range(len(items)):
584
+ if i in taken:
585
+ continue
586
+ res.append(items[i])
587
+ si = token_sets[i]
588
+ for j in range(i + 1, len(items)):
589
+ if j in taken:
590
+ continue
591
+ sj = token_sets[j]
592
+ if not si or not sj:
593
+ continue
594
+ jacc = len(si & sj) / float(len(si | sj))
595
+ if jacc >= 0.45:
596
+ taken.add(j)
597
+ return res
598
+ except Exception:
599
+ # final fallback to naive textual deduplication
600
+ seen = set()
601
+ res = []
602
+ for it in items:
603
+ low = re.sub(r"\s+", " ", it.lower()).strip()
604
+ if low in seen:
605
+ continue
606
+ seen.add(low)
607
+ res.append(it)
608
+ return res
609
+
610
+ def _semantic_dedup_action_items(self, actions: List[Dict[str, str]], threshold: Optional[float] = None) -> List[Dict[str, str]]:
611
+ """Deduplicate action items by task text; merge owners when necessary."""
612
+ if not actions:
613
+ return []
614
+ tasks = [a.get("task", "") for a in actions]
615
+ groups = self._semantic_deduplicate(tasks, threshold=threshold)
616
+ # groups contains first representative tasks; now build merged items
617
+ merged = []
618
+ for rep in groups:
619
+ owners = []
620
+ timestamps = []
621
+ dues = set()
622
+ for a in actions:
623
+ if a.get("task", "") == rep or (rep and rep in a.get("task", "")):
624
+ if a.get("owner") and a.get("owner") not in owners:
625
+ owners.append(a.get("owner"))
626
+ if a.get("timestamp"):
627
+ timestamps.append(a.get("timestamp"))
628
+ if a.get("due"):
629
+ dues.add(a.get("due"))
630
+ owner_str = " / ".join(owners) if owners else "TBD"
631
+ merged.append({
632
+ "owner": owner_str,
633
+ "task": rep,
634
+ "timestamp": timestamps[0] if timestamps else "",
635
+ "due": ", ".join(sorted(list(dues))) if dues else "",
636
+ })
637
+ return merged
638
+
639
+ def generate_comprehensive_summary(self, full_text: str, key_points: List[str], decisions: List[str], action_items: List[Dict[str, str]], topics: List[str]) -> Tuple[str, List[str]]:
640
+ """Generate a comprehensive executive summary covering the meeting.
641
+
642
+ Uses the abstractive pipeline with a guided prompt built from extracted components.
643
+ Attempts to request YAML-structured output for reliable parsing; falls back to rule-based assembly.
644
+ Returns (overview_text, keywords)
645
+ """
646
+ # Sanitize inputs to avoid placeholder tokens and repeated garbage before building the prompt
+ key_points = [self._sanitize_for_prompt(k) for k in key_points if k and k.strip()]
+ decisions = [self._sanitize_for_prompt(d) for d in decisions if d and d.strip()]
+ for a in action_items:
+ a['task'] = self._sanitize_for_prompt(a.get('task',''))
+
+ # Deduplicate before sending to model
+ try:
+ key_points = self._semantic_deduplicate(key_points)
+ decisions = self._semantic_deduplicate(decisions)
+ except Exception:
+ key_points = list(dict.fromkeys(key_points))
+ decisions = list(dict.fromkeys(decisions))
+
+ # Build a structured prompt from the sanitized, deduplicated inputs; request YAML output for safe parsing
+ prompt_parts = [
+ "Anda adalah asisten yang menulis ringkasan rapat yang komprehensif dan terstruktur.",
+ "Output harus dalam format YAML dengan kunci: overview, key_points (list), decisions (list), action_items (list of {owner, task, due}), keywords (list).",
+ "Berikan overview naratif yang jelas, serta daftar poin penting, keputusan, dan tindak lanjut.",
+ "Topik yang dibahas:",
+ ", ".join(topics) if topics else "-",
+ "Poin-poin penting:\n" + "\n".join([f"- {p}" for p in key_points]) if key_points else "",
+ "Keputusan:\n" + "\n".join([f"- {d}" for d in decisions]) if decisions else "",
+ "Tindak lanjut (Action Items):\n" + "\n".join([f"- [{a.get('owner','TBD')}] {a.get('task','')}" for a in action_items]) if action_items else "",
+ "Tuliskan field 'overview' minimal 80 kata sebagai paragraf naratif yang merangkum seluruh rapat dengan jelas.",
+ "Mohon hasilkan YAML yang valid."
+ ]
+ prompt = "\n\n".join([p for p in prompt_parts if p])
674
+
675
+ # Use pipeline if available
676
+ try:
677
+ self._load_model()
678
+ if self._pipeline is not None:
679
+ # Try up to 2 attempts: first deterministic, second sampled if repetition/shortness detected
680
+ attempts = 2
681
+ for attempt in range(attempts):
682
+ gen_kwargs = dict(
683
+ max_length=getattr(self.config, "comprehensive_max_length", 512),
684
+ min_length=max(80, int(getattr(self.config, "comprehensive_max_length", 512) * 0.12)),
685
+ truncation=True,
686
+ do_sample=False,
687
+ no_repeat_ngram_size=4,
688
+ repetition_penalty=1.3,
689
+ )
690
+ if attempt == 1:
691
+ # more creative generation if deterministic attempt failed
692
+ gen_kwargs.update({"do_sample": True, "temperature": 0.7, "top_p": 0.9})
693
+
694
+ out = self._pipeline(prompt, **gen_kwargs)
695
+ text = out[0].get("summary_text", "").strip()
696
+
697
+ # collapse repeated fragments, then clean
698
+ text = self._collapse_repeated_phrases(text)
699
+ cleaned = self._clean_abstractive_text(text)
700
+
701
+ # Quick heuristic checks (repetition, too short, or domain-like web boilerplate -> retry)
702
+ if self._is_repetitive_text(cleaned) or len(cleaned.split()) < 20 or self._contains_domain_noise(cleaned):
703
+ # try again (next attempt) with sampling
704
+ if attempt + 1 < attempts:
705
+ continue
706
+
707
+ # Attempt to parse structured YAML/JSON
708
+ overview, keywords = self._parse_structured_output(cleaned, {
709
+ "key_points": key_points,
710
+ "decisions": decisions,
711
+ "action_items": action_items,
712
+ })
713
+
714
+ # Final normalization / optional polish
715
+ overview = self._normalize_overview_text(overview)
716
+ if getattr(self.config, "polish_overview", True):
717
+ overview = self._polish_overview(overview, full_text)
718
+
719
+ # Validate overview quality: non-empty, not too short, not repetitive
720
+ if overview and len(overview.split()) >= 10 and not self._is_repetitive_text(overview):
721
+ return overview, keywords
722
+ else:
723
+ # Try next attempt if available, otherwise break to fallback
724
+ if attempt + 1 < attempts:
725
+ continue
726
+ else:
727
+ break
728
+ except Exception:
729
+ pass
730
+
731
+ # Fallback rule-based assembly: construct a narrative paragraph summarizing meeting,
732
+ # rather than repeating the list headers. Use polishing to turn it into an executive paragraph.
733
+ def _format_action_items(ai_list):
734
+ pairs = []
735
+ for a in ai_list:
736
+ owner = a.get('owner', 'TBD')
737
+ task = a.get('task', '').strip()
738
+ if task:
739
+ pairs.append(f"{owner} akan {task.rstrip('.')}.")
740
+ return " ".join(pairs)
741
+
742
+ def _join_points(pts):
743
+ # join key points into a sentence
744
+ if not pts:
745
+ return ""
746
+ # take up to 4 points to avoid overly long lists
747
+ pts_sample = pts[:4]
748
+ return "; ".join([p.rstrip('.') for p in pts_sample]) + ""
749
+
750
+ narrative_parts = []
751
+ if topics:
752
+ narrative_parts.append("Topik utama yang dibahas meliputi: " + ", ".join(topics) + ".")
753
+ if key_points:
754
+ narrative_parts.append("Beberapa poin penting termasuk: " + _join_points(key_points) + ".")
755
+ if decisions:
756
+ narrative_parts.append("Keputusan utama yang dicapai termasuk: " + ", ".join([d.rstrip('.') for d in decisions]) + ".")
757
+ if action_items:
758
+ narrative_parts.append("Tindak lanjut yang disepakati di antaranya: " + _format_action_items(action_items))
759
+
760
+ assembled = " ".join([p for p in narrative_parts if p]).strip()
761
+ # Normalize and then optionally polish into a smooth executive paragraph
762
+ assembled = self._normalize_overview_text(assembled)
763
+ if getattr(self.config, "polish_overview", True):
764
+ assembled = self._polish_overview(assembled, full_text)
765
+
766
+ keywords = self._generate_keywords(assembled, top_k=8)
767
+ return assembled, keywords
768
+
769
+ def summarize(self, transcript_segments: List[TranscriptSegment]) -> MeetingSummary:
770
+ self._load_model()
771
+
772
+ full_text = " ".join([seg.text for seg in transcript_segments if seg.text])
773
+ if not full_text.strip():
774
+ return MeetingSummary(
775
+ overview="Tidak ada konten yang dapat diringkas.",
776
+ key_points=[],
777
+ decisions=[],
778
+ action_items=[],
779
+ )
780
+
781
+ # Clean up common disfluencies/politeness tokens and ASR annotations
782
+ full_text = re.sub(r"\[OVERLAP\]|\[NOISE\]|<.*?>", "", full_text)
783
+ full_text = re.sub(
784
+ r"\b(oke|ya|oke,|baik|sekarang|sekarang kita|nah|jadi|oke\.|jadi\.)\b",
785
+ "",
786
+ full_text,
787
+ flags=re.IGNORECASE,
788
+ )
789
+ full_text = re.sub(r"\s+", " ", full_text).strip()
790
+
791
+ # Chunk and summarize
792
+ if self._pipeline is None:
793
+ # fallback: return first few sentences
794
+ sentences = BERTSummarizer(self.config)._split_sentences(full_text)
795
+ overview = " ".join(sentences[: min(3, len(sentences))])
796
+ else:
797
+ chunks = self._chunk_text(full_text)
798
+ partial_summaries = []
799
+ for chunk in chunks:
800
+ try:
801
+ out = self._pipeline(
802
+ chunk,
803
+ max_length=self.config.max_summary_length,
804
+ min_length=self.config.min_summary_length,
805
+ truncation=True,
806
+ do_sample=False,
807
+ )
808
+ partial_summaries.append(out[0]["summary_text"].strip())
809
+ except Exception as e:
810
+ print(f"[Summarizer] chunk summarization failed: {e}")
811
+ continue
812
+
813
+ # If multiple partial summaries, join and optionally summarize again
814
+ combined = " ".join(partial_summaries)
815
+ if len(combined) > self.config.max_input_chars and self._pipeline:
816
+ try:
817
+ out = self._pipeline(
818
+ combined,
819
+ max_length=self.config.max_summary_length,
820
+ min_length=self.config.min_summary_length,
821
+ truncation=True,
822
+ do_sample=False,
823
+ )
824
+ overview = out[0]["summary_text"].strip()
825
+ except Exception:
826
+ overview = combined
827
+ else:
828
+ overview = combined
829
+
830
+ # Clean abstractive overview and produce robust key points (use helper)
831
+ overview, key_points = self._clean_abstractive_output(overview, full_text)
832
+
833
+ # Extract decisions and actions via keywords
834
+ sentences = BERTSummarizer(self.config)._split_sentences(full_text)
835
+ decisions = BERTSummarizer(self.config)._extract_decisions(sentences)
836
+ action_items = BERTSummarizer(self.config)._extract_action_items(transcript_segments)
837
+ topics = BERTSummarizer(self.config)._extract_topics(full_text)
838
+
839
+ # Optionally produce a comprehensive overview (uses abstractive pipeline)
840
+ if getattr(self.config, "comprehensive_overview", False):
841
+ try:
842
+ comp_overview, keywords = self.generate_comprehensive_summary(full_text, key_points, decisions, action_items, topics)
843
+ overview = comp_overview
844
+ except Exception:
845
+ keywords = []
846
+
847
+ ms = MeetingSummary(
848
+ overview=overview,
849
+ key_points=key_points,
850
+ decisions=decisions,
851
+ action_items=action_items,
852
+ topics=topics,
853
+ )
854
+ if 'keywords' in locals():
855
+ setattr(ms, 'keywords', keywords)
856
+ return ms
857
+
858
+
859
+ class BERTSummarizer:
860
+ """
861
+ Extractive Summarization using BERT sentence embeddings.
862
+
863
+ Selects most important sentences based on semantic similarity
864
+ to document centroid and other features.
865
+
866
+ Attributes:
867
+ config: SummarizationConfig object
868
+
869
+ Example:
870
+ >>> summarizer = BERTSummarizer()
871
+ >>> summary = summarizer.summarize(transcript_segments)
872
+ >>> print(summary.overview)
873
+ >>> print(summary.decisions)
874
+ """
875
+
876
+ def __init__(self, config: Optional[SummarizationConfig] = None):
877
+ """
878
+ Initialize BERTSummarizer.
879
+
880
+ Args:
881
+ config: SummarizationConfig object
882
+ """
883
+ self.config = config or SummarizationConfig()
884
+ self._model = None
885
+
886
+ def _load_model(self):
887
+ """Lazy load sentence transformer model"""
888
+ if self._model is None:
889
+ try:
890
+ from sentence_transformers import SentenceTransformer
891
+
892
+ print(f"[Summarizer] Loading model: {self.config.sentence_model_id}")
893
+
894
+ self._model = SentenceTransformer(self.config.sentence_model_id)
895
+
896
+ print("[Summarizer] Model loaded successfully")
897
+
898
+ except Exception as e:
899
+ print(f"[Summarizer] Warning: Could not load model: {e}")
900
+ print("[Summarizer] Using fallback mode")
901
+ self._model = "FALLBACK"
902
+
903
+ def _semantic_deduplicate(self, items: List[str], threshold: Optional[float] = None) -> List[str]:
904
+ """Delegate to AbstractiveSummarizer semantic dedup for compatibility."""
905
+ return AbstractiveSummarizer(self.config)._semantic_deduplicate(items, threshold)
906
+
907
+ def _semantic_dedup_action_items(self, actions: List[Dict[str, str]], threshold: Optional[float] = None) -> List[Dict[str, str]]:
908
+ """Delegate to AbstractiveSummarizer action-item dedup for compatibility."""
909
+ return AbstractiveSummarizer(self.config)._semantic_dedup_action_items(actions, threshold)
910
+
911
+ def _collapse_repeated_phrases(self, text: str, max_ngram: int = 6, min_repeats: int = 2) -> str:
912
+ """Delegates to module-level collapse helper for compatibility."""
913
+ return _collapse_repeated_phrases_global(text, max_ngram=max_ngram, min_repeats=min_repeats)
914
+
915
+ def summarize(self, transcript_segments: List[TranscriptSegment]) -> MeetingSummary:
916
+ """
917
+ Generate meeting summary from transcript.
918
+
919
+ Args:
920
+ transcript_segments: List of transcript segments with speaker info
921
+
922
+ Returns:
923
+ MeetingSummary with overview, key points, decisions, and action items
924
+ """
925
+ # If configuration prefers abstractive summarization, delegate to AbstractiveSummarizer
926
+ if getattr(self.config, "method", "extractive") == "abstractive":
927
+ try:
928
+ return AbstractiveSummarizer(self.config).summarize(transcript_segments)
929
+ except Exception as e:
930
+ print(
931
+ f"[Summarizer] Abstractive summarization failed, falling back to extractive: {e}"
932
+ )
933
+
934
+ self._load_model()
935
+
936
+ # Combine all text
937
+ full_text = " ".join([seg.text for seg in transcript_segments if seg.text])
938
+ # Clean up disfluencies and annotations commonly appearing in ASR output
939
+ full_text = re.sub(r"\[OVERLAP\]|\[NOISE\]|<.*?>", "", full_text)
940
+ full_text = re.sub(r"\s+", " ", full_text).strip()
941
+
942
+ if not full_text.strip():
943
+ return MeetingSummary(
944
+ overview="Tidak ada konten yang dapat diringkas.",
945
+ key_points=[],
946
+ decisions=[],
947
+ action_items=[],
948
+ )
949
+
950
+ # Get sentence-level metadata by merging speaker turns
951
+ sent_meta = self._get_sentences_with_meta(transcript_segments)
952
+
953
+ if not sent_meta:
954
+ return MeetingSummary(
955
+ overview="Tidak ada kalimat yang dapat diidentifikasi.",
956
+ key_points=[],
957
+ decisions=[],
958
+ action_items=[],
959
+ )
960
+
961
+ sentences = [s["text"] for s in sent_meta]
962
+
963
+ # Compute embeddings and select a diverse set of representative sentences via MMR
964
+ embeddings = self._compute_embeddings(sentences)
965
+ num_select = min(max(5, self.config.num_sentences + 2), len(sentences))
966
+
967
+ if embeddings is not None:
968
+ selected_idx = self._mmr_selection(sentences, embeddings, k=num_select)
969
+ key_sentences = [sentences[i] for i in selected_idx]
970
+ else:
971
+ # fallback: use earlier scoring
972
+ key_sentences = self._extract_key_sentences(sentences)
973
+
974
+ # Generate a multi-sentence overview with some ordering and cleaning
975
+ overview = self._generate_overview(key_sentences[:3])
976
+
977
+ # Optionally perform a light abstractive refinement on the extractive overview
978
+ if getattr(self.config, "do_abstractive_refinement", False):
979
+ try:
980
+ abs_sum = AbstractiveSummarizer(self.config)
981
+ abs_sum._load_model()
982
+ if abs_sum._pipeline is not None and overview:
983
+ out = abs_sum._pipeline(
984
+ overview,
985
+ max_length=getattr(self.config, "abstractive_refine_max_len", 80),
986
+ min_length=30,
987
+ truncation=True,
988
+ do_sample=False,
989
+ )
990
+ # Expect a single summary text
991
+ if isinstance(out, list) and out:
992
+ raw_overview = out[0].get("summary_text", overview).strip()
993
+ # Use AbstractiveSummarizer's cleaning & fallback logic
994
+ overview_cleaned, _ = abs_sum._clean_abstractive_output(raw_overview, full_text)
995
+ overview = overview_cleaned
996
+ except Exception:
997
+ # Fail silently and use extractive overview
998
+ pass
999
+
1000
+ # Build richer key points: include speaker attribution and short cleaned sentences
1001
+ key_points = []
1002
+ for i in selected_idx if embeddings is not None else list(range(len(key_sentences))):
1003
+ s = sentences[i]
1004
+ sp = sent_meta[i]["speaker_id"]
1005
+ # Short clean
1006
+ s_clean = re.sub(r"\s+", " ", s).strip()
1007
+ key_points.append(f"{s_clean} (oleh {sp})")
1008
+
1009
+ # Extract decisions using expanded context (look for decision keywords and enumerations)
1010
+ decisions = []
1011
+ seen_decisions = set()
1012
+ for i, s in enumerate(sentences):
1013
+ s_clean = re.sub(r"\s+", " ", s).strip()
1014
+ s_lower = s_clean.lower()
1015
+ if any(kw in s_lower for kw in self.config.decision_keywords) or re.match(
1016
+ r"^(pertama|kedua|ketiga|keempat|kelima)\b", s_lower
1017
+ ):
1018
+ context = self._expand_context_for_sentence(sent_meta, i, window=1)
1019
+ dec_text = re.sub(r"\[.*?\]", "", context)
1020
+ dec_text = re.sub(r"\s+", " ", dec_text).strip()
1021
+ # Truncate to a reasonable length (35 words) and remove trailing punctuation
1022
+ words = dec_text.split()
1023
+ dec_text = " ".join(words[:35]).rstrip(" ,.;:")
1024
+ if len(dec_text.split()) < 3:
1025
+ continue
1026
+ if dec_text and dec_text not in seen_decisions:
1027
+ decisions.append(dec_text)
1028
+ seen_decisions.add(dec_text)
1029
+
1030
+ # If no decisions found, try to extract from key_sentences
1031
+ if not decisions:
1032
+ for ks in key_sentences:
1033
+ if any(kw in ks.lower() for kw in self.config.decision_keywords):
1034
+ if ks not in seen_decisions:
1035
+ decisions.append(ks)
1036
+ seen_decisions.add(ks)
1037
+
1038
+ # Apply semantic deduplication to decisions
1039
+ try:
1040
+ decisions = self._semantic_deduplicate(decisions)
1041
+ except Exception:
1042
+ pass
1043
+
1044
+ # Extract action items at sentence level with speaker inference
1045
+ action_items = []
1046
+ seen_tasks = set()
1047
+ action_kw_re = re.compile(
1048
+ r"\b(" + "|".join([re.escape(k) for k in self.config.action_keywords]) + r")\b",
1049
+ flags=re.IGNORECASE,
1050
+ )
1051
+
1052
+ # verbs that indicate an actionable commitment (used to validate generic keyword matches)
1053
+ action_verbs_re = re.compile(r"\b(akan|harus|siapkan|bikin|buat|selesaikan|dikerjakan|tolong|mohon|harap)\b", flags=re.IGNORECASE)
1054
+
1055
+ for i, s in enumerate(sentences):
1056
+ text = re.sub(r"\[OVERLAP\]|\[NOISE\]|<.*?>", "", s).strip()
1057
+ if not text:
1058
+ continue
1059
+
1060
+ # explicit commit patterns
1061
+ commit_re = re.compile(
1062
+ r"\b(aku|saya|kami|kita|kamu)\b.*\b(bertanggung jawab|akan|saya akan|aku akan|aku akan membuat|kamu tolong|tolong|siapkan|bikin|harus|selesaikan|dikerjakan)\b",
1063
+ flags=re.IGNORECASE,
1064
+ )
1065
+
1066
+ owner = None
1067
+ task = None
1068
+
1069
+ if commit_re.search(text):
1070
+ owner = sent_meta[i]["speaker_id"]
1071
+ # try to isolate the actionable clause
1072
+ task = re.sub(
1073
+ r"^.*?\b(bertanggung jawab|akan|saya akan|aku akan|kamu tolong|tolong|siapkan|bikin|harus|selesaikan|dikerjakan)\b",
1074
+ "",
1075
+ text,
1076
+ flags=re.IGNORECASE,
1077
+ )
1078
+ task = task.strip(" .,:;-")
1079
+ if not task:
1080
+ task = text
1081
+
1082
+ elif action_kw_re.search(text):
1083
+ # Validate generic matches for actionability using helper
1084
+ if not self._is_actionable_text(text):
1085
+ continue
1086
+ owner = sent_meta[i]["speaker_id"]
1087
+ task = text
1088
+
1089
+ if task:
1090
+ # Normalize task text
1091
+ task = re.sub(
1092
+ r"^\s*(aku|saya|kami|kita|kamu)\b[:,\s]*", "", task, flags=re.IGNORECASE
1093
+ ).strip()
1094
+ task = re.sub(r"\s+", " ", task).strip(" .,:;-")
1095
+ if len(task.split()) < 3:
1096
+ continue
1097
+ filler_short = {"setuju", "oke", "ya", "nah", "betul"}
1098
+ if task.lower() in filler_short:
1099
+ continue
1100
+ key = task.lower()[:120]
1101
+ if key in seen_tasks:
1102
+ continue
1103
+ seen_tasks.add(key)
1104
+ action_items.append(
1105
+ {
1106
+ "owner": owner or "TBD",
1107
+ "task": task,
1108
+ "timestamp": f"{sent_meta[i]['start']:.1f}s",
1109
+ "due": "",
1110
+ }
1111
+ )
1112
+
1113
+ # Fall back to segment-level action extraction if none found
1114
+ if not action_items:
1115
+ action_items = self._extract_action_items(transcript_segments)
1116
+
1117
+ # Apply semantic deduplication to action items (merge owners when possible)
1118
+ try:
1119
+ action_items = self._semantic_dedup_action_items(action_items)
1120
+ except Exception:
1121
+ pass
1122
+
1123
+ # Extract topics (frequency-based) from cleaned full_text
1124
+ topics = self._extract_topics(full_text)
1125
+
1126
+ # Optionally produce a comprehensive overview (may use abstractive pipeline)
1127
+ if getattr(self.config, "comprehensive_overview", False):
1128
+ try:
1129
+ abs_s = AbstractiveSummarizer(self.config)
1130
+ comp_overview, keywords = abs_s.generate_comprehensive_summary(full_text, key_points, decisions, action_items, topics)
1131
+ overview = comp_overview
1132
+ except Exception:
1133
+ keywords = []
1134
+
1135
+ # Return comprehensive MeetingSummary
1136
+ ms = MeetingSummary(
1137
+ overview=overview,
1138
+ key_points=key_points,
1139
+ decisions=decisions,
1140
+ action_items=action_items,
1141
+ topics=topics,
1142
+ )
1143
+ if 'keywords' in locals():
1144
+ setattr(ms, 'keywords', keywords)
1145
+ return ms
1146
+
1147
+ def _split_sentences(self, text: str) -> List[str]:
1148
+ """Split text into sentences"""
1149
+ # Indonesian sentence splitting
1150
+ # Handle common abbreviations
1151
+ text = re.sub(r"([Dd]r|[Pp]rof|[Bb]pk|[Ii]bu|[Ss]dr|[Nn]o|[Hh]al)\.", r"\1<PERIOD>", text)
1152
+
1153
+ # Split on sentence-ending punctuation
1154
+ sentences = re.split(r"[.!?]+\s*", text)
1155
+
1156
+ # Restore periods in abbreviations
1157
+ sentences = [s.replace("<PERIOD>", ".") for s in sentences]
1158
+
1159
+ # Clean and filter
1160
+ cleaned = []
1161
+ for s in sentences:
1162
+ s = s.strip()
1163
+
1164
+ # Filter by length
1165
+ if len(s) < self.config.min_sentence_length:
1166
+ continue
1167
+ if len(s) > self.config.max_sentence_length:
1168
+ # Truncate very long sentences
1169
+ s = s[: self.config.max_sentence_length] + "..."
1170
+
1171
+ # Collapse trivial repeated fragments inside sentence
1172
+ s = self._collapse_repeated_phrases(s)
1173
+
1174
+ cleaned.append(s)
1175
+
1176
+ return cleaned
1177
+
1178
+ def _merge_speaker_turns(self, segments: List[TranscriptSegment]) -> List[Dict[str, Any]]:
1179
+ """Merge consecutive segments by the same speaker into 'turns' to provide more context.
1180
+
1181
+ Returns a list of dicts: {speaker_id, start, end, text, indices}
1182
+ """
1183
+ turns: List[Dict[str, Any]] = []
1184
+ for i, seg in enumerate(segments):
1185
+ if not seg.text or not seg.text.strip():
1186
+ continue
1187
+ # Clean common ASR artifacts and leading fillers
1188
+ text = re.sub(r"\[OVERLAP\]|\[NOISE\]|<.*?>", "", seg.text)
1189
+ text = re.sub(
1190
+ r"^\s*(oke|ya|nah|oke,|baik|sekarang|jadi)\b[\s,:-]*", "", text, flags=re.IGNORECASE
1191
+ )
1192
+ text = re.sub(r"\s+", " ", text).strip()
1193
+
1194
+ if not text:
1195
+ continue
1196
+
1197
+ if turns and turns[-1]["speaker_id"] == seg.speaker_id:
1198
+ turns[-1]["end"] = seg.end
1199
+ turns[-1]["text"] += " " + text
1200
+ turns[-1]["indices"].append(i)
1201
+ else:
1202
+ turns.append(
1203
+ {
1204
+ "speaker_id": seg.speaker_id,
1205
+ "start": seg.start,
1206
+ "end": seg.end,
1207
+ "text": text,
1208
+ "indices": [i],
1209
+ }
1210
+ )
1211
+ return turns
1212
+
1213
+ def _get_sentences_with_meta(self, segments: List[TranscriptSegment]) -> List[Dict[str, Any]]:
1214
+ """Split merged speaker turns into sentences and keep metadata."""
1215
+ turns = self._merge_speaker_turns(segments)
1216
+ sent_meta: List[Dict[str, Any]] = []
1217
+ for t in turns:
1218
+ sents = self._split_sentences(t["text"])
1219
+ for j, s in enumerate(sents):
1220
+ sent_meta.append(
1221
+ {
1222
+ "text": s,
1223
+ "speaker_id": t["speaker_id"],
1224
+ "start": t["start"],
1225
+ "end": t["end"],
1226
+ "turn_indices": t["indices"],
1227
+ "sent_idx_in_turn": j,
1228
+ }
1229
+ )
1230
+ return sent_meta
1231
+
1232
+ def _compute_embeddings(self, sentences: List[str]):
1233
+ """Compute sentence embeddings using sentence-transformers (lazy load)."""
1234
+ if not sentences:
1235
+ return None
1236
+ try:
1237
+ from sentence_transformers import SentenceTransformer
1238
+
1239
+ model = SentenceTransformer(self.config.sentence_model_id)
1240
+ embs = model.encode(sentences, show_progress_bar=False)
1241
+ return embs
1242
+ except Exception as e:
1243
+ print(f"[Summarizer] Embedding model error: {e}")
1244
+ return None
1245
+
1246
+ def _mmr_selection(
1247
+ self, sentences: List[str], embeddings, k: int = 5, lambda_param: float = 0.6
1248
+ ) -> List[int]:
1249
+ """Maximal Marginal Relevance (MMR) selection for diversity and coverage.
1250
+
1251
+ Returns list of selected sentence indices in original order.
1252
+ """
1253
+ import numpy as _np
1254
+
1255
+ if embeddings is None or len(sentences) <= k:
1256
+ return list(range(min(len(sentences), k)))
1257
+
1258
+ centroid = _np.mean(embeddings, axis=0)
1259
+ # similarity to centroid
1260
+ sim_to_centroid = _np.dot(embeddings, centroid) / (
1261
+ _np.linalg.norm(embeddings, axis=1) * (_np.linalg.norm(centroid) + 1e-8)
1262
+ )
1263
+
1264
+ selected = []
1265
+ candidate_indices = list(range(len(sentences)))
1266
+
1267
+ # pick the top similarity as first
1268
+ first = int(_np.argmax(sim_to_centroid))
1269
+ selected.append(first)
1270
+ candidate_indices.remove(first)
1271
+
1272
+ while len(selected) < k and candidate_indices:
1273
+ mmr_scores = []
1274
+ for idx in candidate_indices:
1275
+ sim_to_sel = max(
1276
+ [
1277
+ _np.dot(embeddings[idx], embeddings[s])
1278
+ / (_np.linalg.norm(embeddings[idx]) * _np.linalg.norm(embeddings[s]) + 1e-8)
1279
+ for s in selected
1280
+ ]
1281
+ )
1282
+ score = lambda_param * sim_to_centroid[idx] - (1 - lambda_param) * sim_to_sel
1283
+ mmr_scores.append((idx, score))
1284
+
1285
+ idx_best, _ = max(mmr_scores, key=lambda x: x[1])
1286
+ selected.append(idx_best)
1287
+ candidate_indices.remove(idx_best)
1288
+
1289
+ # return in original order
1290
+ selected_sorted = sorted(selected)
1291
+ return selected_sorted
1292
+
1293
+ def _expand_context_for_sentence(
1294
+ self, sent_meta: List[Dict[str, Any]], idx: int, window: int = 1
1295
+ ) -> str:
1296
+ """Return concatenated sentence with neighboring contextual sentences for better decision/action extraction."""
1297
+ start = max(0, idx - window)
1298
+ end = min(len(sent_meta), idx + window + 1)
1299
+ return " ".join([s["text"] for s in sent_meta[start:end]])
1300
+
1301
+ def _infer_owner_for_action(self, seg_index: int, sent_meta: List[Dict[str, Any]]) -> str:
1302
+ """Infer owner for an action by looking at the sentence speaker and recent explicit mentions."""
1303
+ # Prefer sentence speaker
1304
+ if 0 <= seg_index < len(sent_meta):
1305
+ return sent_meta[seg_index]["speaker_id"]
1306
+ return "TBD"
1307
+
1308
+ def _extract_key_sentences(self, sentences: List[str]) -> List[str]:
1309
+ """Extract most important sentences using BERT embeddings"""
1310
+ if not sentences:
1311
+ return []
1312
+
1313
+ # Fallback mode: simple heuristics
1314
+ if self._model == "FALLBACK" or len(sentences) <= self.config.num_sentences:
1315
+ return sentences[: self.config.num_sentences]
1316
+
1317
+ try:
1318
+ # Get sentence embeddings
1319
+ embeddings = self._model.encode(sentences, show_progress_bar=False)
1320
+
1321
+ # Calculate document centroid
1322
+ centroid = np.mean(embeddings, axis=0)
1323
+
1324
+ # Calculate importance scores for each sentence
1325
+ scores = []
1326
+
1327
+ for i, (sent, emb) in enumerate(zip(sentences, embeddings)):
1328
+ score = self._calculate_sentence_score(
1329
+ sentence=sent,
1330
+ embedding=emb,
1331
+ centroid=centroid,
1332
+ position=i,
1333
+ total_sentences=len(sentences),
1334
+ )
1335
+ scores.append((i, score, sent))
1336
+
1337
+ # Sort by score
1338
+ scores.sort(key=lambda x: x[1], reverse=True)
1339
+
1340
+ # Get top-k sentences (maintain original order)
1341
+ top_indices = sorted([s[0] for s in scores[: self.config.num_sentences]])
1342
+
1343
+ return [sentences[i] for i in top_indices]
1344
+
1345
+ except Exception as e:
1346
+ print(f"[Summarizer] Embedding extraction failed: {e}")
1347
+ return sentences[: self.config.num_sentences]
1348
+
1349
+ def _calculate_sentence_score(
1350
+ self,
1351
+ sentence: str,
1352
+ embedding: np.ndarray,
1353
+ centroid: np.ndarray,
1354
+ position: int,
1355
+ total_sentences: int,
1356
+ ) -> float:
1357
+ """Calculate importance score for a sentence"""
1358
+
1359
+ # 1. Cosine similarity to centroid
1360
+ similarity = np.dot(embedding, centroid) / (
1361
+ np.linalg.norm(embedding) * np.linalg.norm(centroid) + 1e-8
1362
+ )
1363
+
1364
+ # 2. Position score (favor beginning and end)
1365
+ if total_sentences > 1:
1366
+ normalized_pos = position / (total_sentences - 1)
1367
+ # U-shaped curve: high at start and end
1368
+ position_score = 1.0 - 0.6 * np.sin(np.pi * normalized_pos)
1369
+ else:
1370
+ position_score = 1.0
1371
+
1372
+ # 3. Length score (favor medium-length sentences)
1373
+ word_count = len(sentence.split())
1374
+ optimal_length = 20
1375
+ length_score = 1.0 - min(abs(word_count - optimal_length) / 30, 1.0)
1376
+
1377
+ # 4. Keyword bonus
1378
+ keyword_score = 0.0
1379
+ sentence_lower = sentence.lower()
1380
+
1381
+ for kw in self.config.decision_keywords + self.config.action_keywords:
1382
+ if kw in sentence_lower:
1383
+ keyword_score += 0.1
1384
+
1385
+ keyword_score = min(keyword_score, 0.3) # Cap bonus
1386
+
1387
+ # Combined score
1388
+ score = (
1389
+ self.config.similarity_weight * similarity
1390
+ + self.config.position_weight * position_score
1391
+ + self.config.length_weight * length_score
1392
+ + keyword_score
1393
+ )
1394
+
1395
+ return score
1396
+
1397
+ def _generate_overview(self, key_sentences: List[str]) -> str:
1398
+ """Generate overview from key sentences"""
1399
+ if not key_sentences:
1400
+ return "Tidak ada ringkasan yang dapat dibuat."
1401
+
1402
+ # Use top 2-3 sentences for overview
1403
+ overview_sentences = key_sentences[: min(3, len(key_sentences))]
1404
+ overview = " ".join(overview_sentences)
1405
+
1406
+ # Clean up
1407
+ overview = re.sub(r"\s+", " ", overview).strip()
1408
+
1409
+ return overview
1410
+
1411
+ def _extract_decisions(self, sentences: List[str]) -> List[str]:
1412
+ """Extract decision-related sentences and synthesize enumerated decisions.
1413
+
1414
+ This method collects sentence-level decision mentions, attempts to synthesize
1415
+ clauses from enumerated statements (e.g., "Pertama..., Kedua..."),
1416
+ and performs semantic deduplication to avoid repeated/near-duplicate items.
1417
+ """
1418
+ raw = []
1419
+
1420
+ for sent in sentences:
1421
+ sent_lower = sent.lower()
1422
+
1423
+ # Check for decision keywords
1424
+ if any(kw in sent_lower for kw in self.config.decision_keywords):
1425
+ # Clean the sentence
1426
+ clean_sent = re.sub(r"\s+", " ", sent).strip()
1427
+ if clean_sent and clean_sent not in raw:
1428
+ raw.append(clean_sent)
1429
+
1430
+ # Try to synthesize enumerated decisions from sentences
1431
+ synthesized = self._synthesize_enumerated_decisions(sentences)
1432
+
1433
+ all_decisions = raw + synthesized
1434
+
1435
+ # Deduplicate semantically (Jaccard over tokens)
1436
+ deduped = self._deduplicate_strings(all_decisions)
1437
+
1438
+ # Limit number of decisions returned
1439
+ return deduped[:7]
1440
+
1441
+ def _synthesize_enumerated_decisions(self, sentences: List[str]) -> List[str]:
1442
+ """Extract clauses following enumerations like 'Pertama..., Kedua...' and return list.
1443
+
1444
+ Handles both ordinal words (pertama, kedua, ...) and numbered lists (1., 2.)
1445
+ by splitting and returning non-trivial clauses.
1446
+ """
1447
+ synth: List[str] = []
1448
+ enum_words_re = re.compile(r"\b(pertama|kedua|ketiga|keempat|kelima)\b", flags=re.IGNORECASE)
1449
+
1450
+ for s in sentences:
1451
+ s_clean = s.strip()
1452
+ if enum_words_re.search(s_clean.lower()):
1453
+ # Split by Indonesian ordinal words
1454
+ parts = re.split(r"\bpertama\b|\bkedua\b|\bketiga\b|\bkeempat\b|\bkelima\b", s_clean, flags=re.IGNORECASE)
1455
+ for p in parts:
1456
+ p = p.strip(" .,:;\n-–—")
1457
+ if len(p.split()) >= 3 and p not in synth:
1458
+ synth.append(p)
1459
+
1460
+ # Also handle simple numbered enumerations like '1. ... 2. ...'
1461
+ if re.search(r"\d+\.\s*", s_clean):
1462
+ parts = re.split(r"\d+\.\s*", s_clean)
1463
+ for p in parts:
1464
+ p = p.strip(" .,:;\n-–—")
1465
+ if len(p.split()) >= 3 and p not in synth:
1466
+ synth.append(p)
1467
+
1468
+ return synth
1469
+
1470
+ def _normalize_text_for_dedup(self, text: str) -> str:
1471
+ """Normalize text for lightweight semantic deduplication."""
1472
+ t = text.lower()
1473
+ # remove punctuation, keep alphanumerics and spaces
1474
+ t = re.sub(r"[^a-z0-9\s]+", "", t)
1475
+ t = re.sub(r"\s+", " ", t).strip()
1476
+ return t
1477
+
1478
+ def _deduplicate_strings(self, items: List[str], threshold: float = 0.5) -> List[str]:
1479
+ """Deduplicate items using token Jaccard similarity threshold."""
1480
+ kept: List[str] = []
1481
+ norms: List[str] = []
1482
+
1483
+ for it in items:
1484
+ n = self._normalize_text_for_dedup(it)
1485
+ if not n:
1486
+ continue
1487
+ toks1 = set(n.split())
1488
+ is_dup = False
1489
+ for other in norms:
1490
+ toks2 = set(other.split())
1491
+ if not toks1 or not toks2:
1492
+ continue
1493
+ inter = len(toks1 & toks2)
1494
+ union = len(toks1 | toks2)
1495
+ if union > 0 and (inter / union) >= threshold:
1496
+ is_dup = True
1497
+ break
1498
+ if not is_dup:
1499
+ kept.append(it)
1500
+ norms.append(n)
1501
+
1502
+ return kept
1503
+
1504
+ def _extract_action_items(self, segments: List[TranscriptSegment]) -> List[Dict[str, str]]:
1505
+ """Extract action items with speaker attribution (improved heuristics)
1506
+
1507
+ Heuristics:
1508
+ - Detect explicit commitments like "aku akan", "saya bertanggung jawab", "kamu siapkan" and assign owner
1509
+ - Fallback to keyword-based detection
1510
+ - Normalize duplicate tasks and detect simple due-date mentions like "minggu depan", "besok"
1511
+ - Try to infer explicit owner names mentioned in the clause
1512
+ """
1513
+ action_items: List[Dict[str, str]] = []
1514
+ seen_tasks = set()
1515
+
1516
+ # Try to use AdvancedNLPExtractor (NER + dependency parse) for higher-quality extraction
1517
+ try:
1518
+ from src.nlp_utils import AdvancedNLPExtractor
1519
+
1520
+ extractor = AdvancedNLPExtractor()
1521
+ sent_meta = self._get_sentences_with_meta(segments)
1522
+ nlp_actions = extractor.extract_actions_from_sentences(sent_meta)
1523
+ for item in nlp_actions:
1524
+ task_key = item.get("task", "").lower()[:120]
1525
+ if task_key in seen_tasks:
1526
+ continue
1527
+ seen_tasks.add(task_key)
1528
+ action_items.append(
1529
+ {
1530
+ "owner": item.get("owner", "TBD"),
1531
+ "task": item.get("task", "").strip(),
1532
+ "timestamp": f"{sent_meta[item.get('sentence_idx', 0)]['start']:.1f}s",
1533
+ "due": self._detect_due_from_text(item.get("task", "")),
1534
+ }
1535
+ )
1536
+ except Exception:
1537
+ extractor = None
1538
+
1539
+ commit_re = re.compile(
1540
+ r"\b(aku|saya|kami|kita|kamu)\b.*\b(bertanggung jawab|akan|saya akan|aku akan|aku akan membuat|kamu tolong|tolong|siapkan|bikin|harus|selesaikan|dikerjakan)\b",
1541
+ flags=re.IGNORECASE,
1542
+ )
1543
+
1544
+ # Actionable verbs/phrases to validate generic keyword matches
1545
+ _action_verbs_re = re.compile(r"\b(akan|harus|siapkan|bikin|buat|selesaikan|dikerjakan|tolong|mohon|harap)\b", flags=re.IGNORECASE)
1546
+
1547
+ for seg in segments:
1548
+ if not seg.text:
1549
+ continue
1550
+
1551
+ text = re.sub(r"\[OVERLAP\]|\[NOISE\]|<.*?>", "", seg.text).strip()
1552
+ text_lower = text.lower()
1553
+
1554
+ # 1) explicit commitment patterns
1555
+ if commit_re.search(text_lower):
1556
+ # Try to extract short actionable clause
1557
+ task = re.sub(
1558
+ r"^.*?(bertanggung jawab|akan|membuat|siapkan|tolong|saya akan|aku akan|kamu tolong)\b",
1559
+ "",
1560
+ text,
1561
+ flags=re.IGNORECASE,
1562
+ )
1563
+ task = task.strip(" .,:;-")
1564
+ if not task:
1565
+ # fallback to whole segment
1566
+ task = text
1567
+
1568
+ # Try to detect explicit owner name within the clause (e.g., "Budi akan ...")
1569
+ owner = self._extract_name_as_owner(text) or seg.speaker_id
1570
+
1571
+ task_key = task.lower()[:120]
1572
+ if task_key not in seen_tasks:
1573
+ seen_tasks.add(task_key)
1574
+ action_items.append(
1575
+ {
1576
+ "owner": owner,
1577
+ "task": task,
1578
+ "timestamp": f"{seg.start:.1f}s",
1579
+ "due": self._detect_due_from_text(task),
1580
+ }
1581
+ )
1582
+ continue
1583
+
1584
+ # 2) keyword-based detection
1585
+ if any(kw in text_lower for kw in self.config.action_keywords):
1586
+ # Validate that the segment is actionable (has verbs like 'akan'/'perlu' or explicit name)
1587
+ if not self._is_actionable_text(text):
1588
+ continue
1589
+
1590
+ owner = self._extract_name_as_owner(text) or seg.speaker_id
1591
+ task = text.strip()
1592
+ task_key = task.lower()[:120]
1593
+ if task_key in seen_tasks:
1594
+ continue
1595
+ seen_tasks.add(task_key)
1596
+ action_items.append(
1597
+ {
1598
+ "owner": owner,
1599
+ "task": task,
1600
+ "timestamp": f"{seg.start:.1f}s",
1601
+ "due": self._detect_due_from_text(task),
1602
+ }
1603
+ )
1604
+
1605
+ # Post-process: deduplicate semantically and filter tiny filler tasks
1606
+ processed: List[Dict[str, str]] = []
1607
+ seen_norms = set()
1608
+
1609
+ # Filter out filler / non-actionable phrases (e.g., meeting start/thanks)
1610
+ filler_patterns = [
1611
+ r"\bkita mulai rapat",
1612
+ r"\bitu yang mau kita bahas",
1613
+ r"\bterima kasih",
1614
+ r"\bok(e|ey)?\b",
1615
+ r"\bsip\b",
1616
+ r"\bcukup(kan)? sampai",
1617
+ r"\btidak ada( yang)?\b",
1618
+ r"\biya\b",
1619
+ r"\bsetuju\b",
1620
+ ]
1621
+ filler_re = re.compile("|".join(filler_patterns), flags=re.IGNORECASE)
1622
+
1623
+ for it in action_items:
1624
+ task_text = it.get("task", "")
1625
+
1626
+ # Skip common non-actionable conversational lines
1627
+ if filler_re.search(task_text):
1628
+ continue
1629
+
1630
+ # Ensure the sentence is actionable (has a commitment verb or explicit owner/name)
1631
+ if not self._is_actionable_text(task_text):
1632
+ continue
1633
+
1634
+ norm = self._normalize_text_for_dedup(task_text)[:200]
1635
+ # skip if too short
1636
+ if len(task_text.split()) < 3:
1637
+ continue
1638
+ if norm in seen_norms:
1639
+ continue
1640
+ seen_norms.add(norm)
1641
+ processed.append(it)
1642
+
1643
+ # Limit number of action items
1644
+ return processed[:15]
1645
+
1646
+ def _detect_due_from_text(self, text: str) -> str:
1647
+ """Detect simple due-date hints from text and return a short normalized due string."""
1648
+ t = text.lower()
1649
+ if "besok" in t:
1650
+ return "besok"
1651
+ if "segera" in t or "secepat" in t or "sekarang" in t:
1652
+ return "segera"
1653
+ if "minggu depan" in t:
1654
+ return "1 minggu"
1655
+ m = re.search(r"(\d+)\s*minggu", t)
1656
+ if m:
1657
+ return f"{m.group(1)} minggu"
1658
+ if "2 minggu" in t or "dua minggu" in t:
1659
+ return "2 minggu"
1660
+ if "deadline" in t:
1661
+ # try to capture a following date/token
1662
+ m2 = re.search(r"deadline\s*[:\-\s]*([\w\-\./]+)", t)
1663
+ return m2.group(1) if m2 else "TBD"
1664
+ return ""
1665
+
1666
+ def _extract_name_as_owner(self, text: str) -> Optional[str]:
1667
+ """Return a candidate owner name if a capitalized proper name is explicitly present in the clause.
1668
+
1669
+ Simple heuristic: look for capitalized words (not at sentence start if it's a pronoun) followed by 'akan' or similar.
1670
+ """
1671
+ m = re.search(r"\b([A-Z][a-z]{2,})\b(?=\s+akan|\s+siapkan|\s+tolong|\s+bisa|\s+bertanggung)", text)
1672
+ if m:
1673
+ return m.group(1)
1674
+ return None
1675
+
1676
+ def _is_actionable_text(self, text: str) -> bool:
1677
+ """Return True if text contains indicators of an actionable commitment.
1678
+
1679
+ Indicators:
1680
+ - Commitment verbs (akan, harus, perlu, siapkan, etc.)
1681
+ - Explicit owner mention (capitalized name)
1682
+ - Time indicators / deadlines (besok, minggu depan, deadline)
1683
+ """
1684
+ t = text or ""
1685
+ tl = t.lower()
1686
+ if re.search(r"\b(akan|harus|siapkan|bikin|buat|selesaikan|dikerjakan|tolong|mohon|harap|perlu)\b", tl):
1687
+ return True
1688
+ # Only consider capitalized names as indicators if followed by an action verb
1689
+ if re.search(r"\b([A-Z][a-z]{2,})\b(?=\s+(akan|siapkan|tolong|mohon|harus|selesaikan|buat|bikin))", t):
1690
+ return True
1691
+ if any(k in tl for k in ("deadline", "minggu depan", "besok")):
1692
+ return True
1693
+ return False
1694
+
1695
+ def _extract_topics(self, text: str, num_topics: int = 5) -> List[str]:
1696
+ """Extract main topics from text using simple frequency analysis"""
1697
+ # Simple word frequency approach
1698
+ # Remove common Indonesian stopwords
1699
+ stopwords = {
1700
+ "yang",
1701
+ "dan",
1702
+ "di",
1703
+ "ke",
1704
+ "dari",
1705
+ "ini",
1706
+ "itu",
1707
+ "dengan",
1708
+ "untuk",
1709
+ "pada",
1710
+ "adalah",
1711
+ "dalam",
1712
+ "tidak",
1713
+ "akan",
1714
+ "sudah",
1715
+ "juga",
1716
+ "saya",
1717
+ "kita",
1718
+ "kami",
1719
+ "mereka",
1720
+ "ada",
1721
+ "bisa",
1722
+ "atau",
1723
+ "seperti",
1724
+ "jadi",
1725
+ "kalau",
1726
+ "karena",
1727
+ "tapi",
1728
+ "ya",
1729
+ "apa",
1730
+ "bagaimana",
1731
+ "kenapa",
1732
+ "siapa",
1733
+ "kapan",
1734
+ "dimana",
1735
+ "nya",
1736
+ "kan",
1737
+ "dong",
1738
+ "sih",
1739
+ "kok",
1740
+ "deh",
1741
+ "loh",
1742
+ "lah",
1743
+ }
1744
+
1745
+ # Tokenize and count
1746
+ words = re.findall(r"\b[a-zA-Z]{4,}\b", text.lower())
1747
+ word_counts = {}
1748
+
1749
+ for word in words:
1750
+ if word not in stopwords:
1751
+ word_counts[word] = word_counts.get(word, 0) + 1
1752
+
1753
+ # Sort by frequency
1754
+ sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
1755
+
1756
+ # Return top topics
1757
+ return [word for word, count in sorted_words[:num_topics]]
1758
+
1759
+ def summarize_by_speaker(self, segments: List[TranscriptSegment]) -> Dict[str, str]:
1760
+ """Generate per-speaker summary"""
1761
+ # Group segments by speaker
1762
+ speaker_texts = {}
1763
+
1764
+ for seg in segments:
1765
+ if seg.speaker_id not in speaker_texts:
1766
+ speaker_texts[seg.speaker_id] = []
1767
+ speaker_texts[seg.speaker_id].append(seg.text)
1768
+
1769
+ # Summarize each speaker's contribution
1770
+ speaker_summaries = {}
1771
+
1772
+ for speaker_id, texts in speaker_texts.items():
1773
+ full_text = " ".join(texts)
1774
+ sentences = self._split_sentences(full_text)
1775
+
1776
+ if sentences:
1777
+ # Get top 2 sentences for each speaker
1778
+ key_sentences = self._extract_key_sentences(sentences)[:2]
1779
+ speaker_summaries[speaker_id] = " ".join(key_sentences)
1780
+ else:
1781
+ speaker_summaries[speaker_id] = "Tidak ada kontribusi yang dapat diringkas."
1782
+
1783
+ return speaker_summaries
src/transcriber.py ADDED
@@ -0,0 +1,1108 @@
1
+ """
2
+ ASR Transcription Module
3
+ ========================
4
+ Implements speech-to-text with configurable backends (Whisper, Wav2Vec2).
5
+ Default is Whisper-small for multilingual support; supports beam CTC decoding for CTC models.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import re
12
+ from dataclasses import dataclass, field
13
+ from pathlib import Path
14
+ from typing import Any, Callable, Dict, List, Optional
15
+
16
+ import numpy as np
17
+ import torch
18
+
19
+ from src.diarization import SpeakerSegment
20
+ from src.utils import setup_logger
21
+
22
+
23
+ @dataclass
24
+ class ASRConfig:
25
+ """Configuration for ASR"""
26
+
27
+ model_id: str = "openai/whisper-small"
28
+ chunk_length_s: float = 30.0
29
+ stride_length_s: float = 5.0
30
+ batch_size: int = 4
31
+ return_timestamps: Optional[str] = None # None or 'char'/'word'
32
+
33
+ # Approximate Continuous Speech Tokenizer token rate in Hz (e.g., 7.5). When set,
34
+ # the transcriber will apply a fast lossy compression preprocessor for speed.
35
+ # Default: disabled (None). Use --cst-hz to enable.
36
+ cst_hz: Optional[float] = None
37
+
38
+ # Backend options:
39
+ # - 'whisper': HuggingFace transformers ASR pipeline (seq2seq whisper)
40
+ # - 'transformers': HuggingFace transformers ASR pipeline (CTC wav2vec2, etc)
41
+ # - 'whisperx': WhisperX (faster-whisper + optional alignment; we use transcription + segments)
42
+ # - 'speechbrain': SpeechBrain adapter
43
+ backend: str = "whisper"
44
+
45
+ # Preferred language for whisper (use 'id' for Indonesian)
46
+ language: str = "id"
47
+
48
+ # WhisperX options
49
+ # compute_type common values: "float16" (GPU), "int8" / "int8_float16" (lower VRAM)
50
+ whisperx_compute_type: str = "auto"
51
+ whisperx_vad_filter: bool = True
52
+
53
+ # Use full-audio ASR and align timestamps to diarization segments if available
54
+ use_full_audio_for_segments: bool = False
55
+
56
+ # Quick mode (single-pass full audio + reduced precision) and parallelism
57
+ quick_mode: bool = False
58
+ parallel_workers: int = 4
59
+
60
+ # When not using full-audio timestamps, include a small context window around short segments
61
+ context_window_s: float = 0.5
62
+
63
+ # Decoder options: 'greedy' or 'beam' (beam can use pyctcdecode + kenlm)
64
+ decoder: str = "greedy"
65
+ beam_width: int = 10
66
+ use_lm: bool = False
67
+ lm_path: Optional[str] = None
68
+
69
+ # Text post-processing
70
+ capitalize_sentences: bool = True
71
+ normalize_whitespace: bool = True
72
+ add_punctuation: bool = False
73
+
74
+ # Device
75
+ device: str = "cuda" if torch.cuda.is_available() else "cpu"
76
+
77
+
78
+ @dataclass
79
+ class TranscriptSegment:
80
+ """Transcript segment with speaker and timing information"""
81
+
82
+ speaker_id: str
83
+ start: float
84
+ end: float
85
+ text: str
86
+ confidence: float = 1.0
87
+ is_overlap: bool = False
88
+ language: str = "id"
89
+ metadata: Dict[str, Any] = field(default_factory=dict)
90
+
91
+ @property
92
+ def duration(self) -> float:
93
+ """Get segment duration in seconds"""
94
+ return self.end - self.start
95
+
96
+ @property
97
+ def word_count(self) -> int:
98
+ """Get number of words in text"""
99
+ return len(self.text.split()) if self.text else 0
100
+
101
+ def to_dict(self) -> Dict[str, Any]:
102
+ """Convert to dictionary"""
103
+ return {
104
+ "speaker_id": self.speaker_id,
105
+ "start": self.start,
106
+ "end": self.end,
107
+ "text": self.text,
108
+ "confidence": self.confidence,
109
+ "is_overlap": self.is_overlap,
110
+ "duration": self.duration,
111
+ "word_count": self.word_count,
112
+ }
113
+
114
+
115
+ class ASRTranscriber:
116
+ """
117
+ Automatic Speech Recognition using Wav2Vec2-XLSR. Supports multiple backends including
118
+ HuggingFace `transformers` pipeline and optional SpeechBrain adapter.
119
+
120
+ Transcribes audio segments with speaker information.
121
+ Optimized for Indonesian language with code-switching support.
122
+
123
+ Attributes:
124
+ config: ASRConfig object
125
+
126
+ Example:
127
+ >>> transcriber = ASRTranscriber()
128
+ >>> segments = transcriber.transcribe_segments(waveform, diarization_segments)
129
+ >>> for seg in segments:
130
+ ... print(f"{seg.speaker_id}: {seg.text}")
131
+ """
132
+
133
+ def __init__(self, config: Optional[ASRConfig] = None, models_dir: str = "./models"):
134
+ """
135
+ Initialize ASRTranscriber.
136
+
137
+ Args:
138
+ config: ASRConfig object
139
+ models_dir: Directory to cache downloaded models
140
+ """
141
+ self.config = config or ASRConfig()
142
+ self.models_dir = Path(models_dir)
143
+ self.models_dir.mkdir(parents=True, exist_ok=True)
144
+
145
+ self.device = self.config.device
146
+
147
+ # Setup logger
148
+ self.logger = setup_logger("ASRTranscriber")
149
+ # Log configured CST value for diagnostics
150
+ try:
151
+ self.logger.info(f"ASRTranscriber configured cst_hz: {getattr(self.config, 'cst_hz', None)} Hz")
152
+ except Exception:
153
+ pass
154
+
155
+ # Model placeholders (lazy loading)
156
+ self._pipeline = None
157
+ self._processor = None
158
+ self._model = None
159
+ self._speechbrain_adapter = None
160
+ self._whisperx_model = None
161
+
162
+ def _load_model(self):
163
+ """Lazy load ASR model and pipeline"""
164
+ # If user configured SpeechBrain backend, prefer it
165
+ if getattr(self.config, "backend", "whisper") == "speechbrain":
166
+ if self._speechbrain_adapter is None:
167
+ try:
168
+ from .transcriber_speechbrain import (
169
+ SpeechBrainASRConfig,
170
+ SpeechBrainTranscriber,
171
+ )
172
+
173
+ sb_cfg = SpeechBrainASRConfig(model_id=self.config.model_id, device=self.device)
174
+ self._speechbrain_adapter = SpeechBrainTranscriber(
175
+ sb_cfg, models_dir=str(self.models_dir)
176
+ )
177
+ self.logger.info(
178
+ f"SpeechBrain adapter initialized with model: {self.config.model_id}"
179
+ )
180
+ except Exception as e:
181
+ self.logger.warning(f"Could not initialize SpeechBrain adapter: {e}")
182
+ self._speechbrain_adapter = None
183
+ return
184
+
185
+ # WhisperX backend
186
+ if getattr(self.config, "backend", None) == "whisperx":
187
+ if self._whisperx_model is None:
188
+ try:
189
+ # WhisperX imports torchaudio.AudioMetaData (not present in some builds, e.g., torchaudio 2.8 CPU on Windows)
190
+ import torchaudio
191
+
192
+ if not hasattr(torchaudio, "AudioMetaData"):
193
+ from typing import NamedTuple
194
+
195
+ class AudioMetaData(NamedTuple):
196
+ sample_rate: int
197
+ num_frames: int
198
+ num_channels: int
199
+ bits_per_sample: int = 16
200
+ encoding: str = "PCM_S"
201
+
202
+ # Provide stub to satisfy downstream imports; uses safe defaults
203
+ torchaudio.AudioMetaData = AudioMetaData # type: ignore
204
+
205
+ import whisperx # type: ignore
206
+
207
+ # Allowlist OmegaConf ListConfig for torch.load (needed since PyTorch 2.6 weights_only=True)
208
+ try:
209
+ import typing
210
+
211
+ import torch.serialization as ts
212
+ from omegaconf.base import ContainerMetadata # type: ignore
213
+ from omegaconf.listconfig import ListConfig # type: ignore
214
+
215
+ # Allow torch.load with weights_only=True to unpickle HF configs that store plain list
216
+ # Allowlist common builtin types and container types used inside HF checkpoints
217
+ ts.add_safe_globals([dict, list, int, float, str, tuple, set])
218
+
219
+ # Add collections.defaultdict (needed by some HF checkpoints under newer PyTorch)
220
+ import collections
221
+
222
+ ts.add_safe_globals([collections.defaultdict])
223
+
224
+ # Ensure OmegaConf ListConfig is allowlisted (common in HF configs)
225
+ ts.add_safe_globals([ListConfig])
226
+
227
+ # Allow AnyNode from OmegaConf which some HF configs embed
228
+ try:
229
+ from omegaconf.nodes import AnyNode # type: ignore
230
+
231
+ ts.add_safe_globals([AnyNode])
232
+ except Exception:
233
+ # Not strictly fatal; continue if import fails
234
+ pass
235
+
236
+ # Some checkpoints include TorchVersion objects
237
+ try:
238
+ import torch
239
+
240
+ ts.add_safe_globals([torch.torch_version.TorchVersion])
241
+ except Exception:
242
+ pass
243
+
244
+ # Add ContainerMetadata and Metadata from OmegaConf if present
245
+ try:
246
+ from omegaconf.base import Metadata # type: ignore
247
+
248
+ ts.add_safe_globals([ContainerMetadata, Metadata, typing.Any])
249
+ except Exception:
250
+ ts.add_safe_globals([ContainerMetadata, typing.Any])
251
+ except Exception as e:
252
+ self.logger.warning(f"Could not add ListConfig to torch safe globals: {e}")
253
+
254
+ model_name_or_path = self.config.model_id
255
+ p = Path(str(model_name_or_path))
256
+ if p.exists() and p.is_dir():
257
+ # WhisperX (faster-whisper / CTranslate2) expects a CT2-converted model directory
258
+ # containing model.bin + config files. A folder with only *.safetensors is a
259
+ # HuggingFace Transformers checkpoint and cannot be loaded directly by WhisperX.
260
+ has_model_bin = (p / "model.bin").exists()
261
+ has_safetensors = any(p.glob("*.safetensors"))
262
+ if not has_model_bin and has_safetensors:
263
+ raise RuntimeError(
264
+ "WhisperX backend membutuhkan model format CTranslate2 (ada file 'model.bin'). "
265
+ f"Folder '{p.as_posix()}' hanya berisi *.safetensors (format Transformers), jadi "
266
+ "tidak bisa dipakai langsung oleh WhisperX. "
267
+ "Solusi: pakai nama model WhisperX seperti 'large-v3-turbo' agar auto-download, "
268
+ "atau convert model Transformers -> CTranslate2 memakai ctranslate2 converter."
269
+ )
270
+
271
+ compute_type = getattr(self.config, "whisperx_compute_type", "auto")
272
+ if compute_type == "auto":
273
+ # Sensible default: float16 on CUDA, int8 on CPU
274
+ compute_type = "float16" if self.device == "cuda" else "int8"
275
+
276
+ # WhisperX uses faster-whisper under the hood; model can be a name ("large-v3", "large-v3-turbo")
277
+ # or a local directory containing model weights (e.g. safetensors).
278
+ self.logger.info(
279
+ f"Loading WhisperX model: {model_name_or_path} (device={self.device}, compute_type={compute_type})"
280
+ )
281
+
282
+ # Robust loading: try to parse WeightsUnpickler errors and auto-allowlist missing globals
283
+ def _load_model_with_retry():
284
+ import importlib
285
+ import re
286
+
287
+ import torch.serialization as ts
288
+
289
+ max_attempts = 8
290
+ attempt = 0
291
+ while True:
292
+ try:
293
+ return whisperx.load_model(
294
+ model_name_or_path,
295
+ device=self.device,
296
+ compute_type=compute_type,
297
+ download_root=str(self.models_dir),
298
+ )
299
+ except Exception as e:
300
+ attempt += 1
301
+ if attempt >= max_attempts:
302
+ # Give up and re-raise the original exception
303
+ raise
304
+ msg = str(e)
305
+ # Find module.Class patterns in the error message
306
+ missing = set(
307
+ re.findall(
308
+ r"GLOBAL\s+([\w\.]+)\s+was not an allowed global", msg
309
+ )
310
+ )
311
+ # Also catch suggestions in the message
312
+ more = set(re.findall(r"add_safe_globals\(\[([^\]]+)\]\)", msg))
313
+ for m in more:
314
+ # split comma-separated list like 'collections.defaultdict' or 'omegaconf.nodes.AnyNode'
315
+ parts = [
316
+ p.strip().strip("\"''") for p in m.split(",") if p.strip()
317
+ ]
318
+ missing.update(parts)
319
+
320
+ if not missing:
321
+ # nothing we can do programmatically
322
+ raise
323
+
324
+ for fullname in missing:
325
+ try:
326
+ module_name, cls_name = fullname.rsplit(".", 1)
327
+ mod = importlib.import_module(module_name)
328
+ cls = getattr(mod, cls_name)
329
+ ts.add_safe_globals([cls])
330
+ self.logger.info(
331
+ f"Auto-added {fullname} to torch safe globals"
332
+ )
333
+ except Exception as ie:
334
+ self.logger.warning(
335
+ f"Could not auto-add {fullname} to safe globals: {ie}"
336
+ )
337
+ # retry loop
338
+
339
+ self._whisperx_model = _load_model_with_retry()
340
+ self.logger.info("WhisperX model loaded successfully")
341
+ except Exception as e:
342
+ # When user explicitly requests WhisperX backend, fail loudly with a helpful message.
343
+ self._whisperx_model = None
344
+ raise RuntimeError(f"Failed to load WhisperX model: {e}") from e
345
+
346
+ if self._pipeline is None:
347
+ # If user explicitly selected WhisperX and the WhisperX model loaded OK,
348
+ # prefer WhisperX and skip attempting the Transformers pipeline which may
349
+ # not recognize model names like 'large-v3-turbo' and produce confusing errors.
350
+ if (
351
+ getattr(self.config, "backend", None) == "whisperx"
352
+ and self._whisperx_model is not None
353
+ ):
354
+ self._pipeline = "WHISPERX"
355
+ self.logger.info("WhisperX backend active; skipping Transformers pipeline load")
356
+ else:
357
+ try:
358
+ from transformers import pipeline
359
+
360
+ self.logger.info(f"Loading model: {self.config.model_id}")
361
+
362
+ # Try to use pipeline first (simpler)
363
+ self._pipeline = pipeline(
364
+ "automatic-speech-recognition",
365
+ model=self.config.model_id,
366
+ device=0 if self.device == "cuda" and torch.cuda.is_available() else -1,
367
+ chunk_length_s=self.config.chunk_length_s,
368
+ stride_length_s=(self.config.stride_length_s, self.config.stride_length_s),
369
+ )
370
+
371
+ self.logger.info("Model loaded successfully via pipeline")
372
+
373
+ except Exception as e:
374
+ self.logger.warning(f"Pipeline loading failed: {e}")
375
+ self.logger.info("Attempting direct model loading...")
376
+
377
+ # Attempt direct transformers model loading (Wav2Vec2)
378
+ try:
379
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
380
+
381
+ self._processor = Wav2Vec2Processor.from_pretrained(
382
+ self.config.model_id, cache_dir=str(self.models_dir)
383
+ )
384
+ self._model = Wav2Vec2ForCTC.from_pretrained(
385
+ self.config.model_id, cache_dir=str(self.models_dir)
386
+ )
387
+
388
+ if self.device == "cuda" and torch.cuda.is_available():
389
+ self._model = self._model.cuda()
390
+
391
+ self._model.eval()
392
+ self.logger.info("Model loaded successfully via direct loading")
393
+
394
+ # If user requested beam decoding, try to prepare a CTC beam decoder (pyctcdecode)
395
+ self._ctc_decoder = None
396
+ try:
397
+ if self.config.decoder == "beam":
398
+ from pyctcdecode import build_ctcdecoder
399
+
400
+ # Build label list from tokenizer vocab ordered by id
401
+ vocab = self._processor.tokenizer.get_vocab()
402
+ labels = [t for t, _ in sorted(vocab.items(), key=lambda x: x[1])]
403
+
404
+ if self.config.use_lm and self.config.lm_path:
405
+ self.logger.info("Building CTC decoder with LM...")
406
+ self._ctc_decoder = build_ctcdecoder(
407
+ labels, self.config.lm_path
408
+ )
409
+ else:
410
+ self.logger.info("Building CTC decoder (no LM)")
411
+ self._ctc_decoder = build_ctcdecoder(labels)
412
+
413
+ self.logger.info("CTC decoder ready")
414
+ except Exception as e:
415
+ self.logger.warning(
416
+ f"Could not build CTC decoder (pyctcdecode/kenlm missing or failed): {e}"
417
+ )
418
+ self._ctc_decoder = None
419
+
420
+ except Exception as e2:
421
+ self.logger.error(f"Direct loading also failed: {e2}")
422
+ self.logger.warning("Using fallback placeholder mode")
423
+ self._pipeline = "FALLBACK"
424
+
425
+ def transcribe_segments(
426
+ self,
427
+ waveform: torch.Tensor,
428
+ segments: List[SpeakerSegment],
429
+ sample_rate: int = 16000,
430
+ progress_callback: Optional[Callable[[int, int], None]] = None,
431
+ ) -> List[TranscriptSegment]:
432
+ """
433
+ Transcribe each speaker segment. If `use_full_audio_for_segments` is enabled,
434
+ run ASR once on the full audio and map word/segment timestamps back to
435
+ the diarization segments when the ASR pipeline returns timestamps.
436
+ Falls back to context-augmented per-segment transcription when timestamps
437
+ are not available.
438
+ """
439
+ try:
440
+ self._load_model()
441
+ except Exception as e:
442
+ # If loading the configured ASR backend fails (common when deployment preset
443
+ # forced WhisperX but model_id is a Transformers repo), attempt a safe
444
+ # runtime fallback to a lightweight Whisper model so interactive UI flows
445
+ # remain responsive instead of crashing.
446
+ self.logger.error(
447
+ f"ASR model load failed: {e}. Attempting fallback to 'whisper' backend with 'openai/whisper-small'."
448
+ )
449
+ try:
450
+ self.config.backend = "whisper"
451
+ self.config.model_id = "openai/whisper-small"
452
+ # Clear any partially-initialized model state
453
+ self._pipeline = None
454
+ self._model = None
455
+ self._processor = None
456
+ self._whisperx_model = None
457
+ self._load_model()
458
+ self.logger.info("Fallback ASR model loaded successfully (openai/whisper-small)")
459
+ except Exception as e2:
460
+ self.logger.error(f"Fallback ASR model load also failed: {e2}")
461
+ # Re-raise to let caller handle/report the error
462
+ raise
463
+
464
+ # If SpeechBrain backend adapter is configured, delegate to it
465
+ if (
466
+ getattr(self.config, "backend", None) == "speechbrain"
467
+ and getattr(self, "_speechbrain_adapter", None) is not None
468
+ ):
469
+ try:
470
+ sb_res = self._speechbrain_adapter.transcribe_segments(
471
+ waveform, segments, sample_rate
472
+ )
473
+ for s in sb_res:
474
+ s.text = self._postprocess_text(s.text)
475
+ return sb_res
476
+ except Exception as e:
477
+ self.logger.error(f"SpeechBrain adapter transcription failed: {e}")
478
+
479
+ transcripts = []
480
+ total_segments = len(segments)
481
+
482
+ # If using full-audio mapping, run pipeline once on entire audio and try to align
483
+ full_asr_result = None
484
+ audio_np_full = waveform.squeeze().cpu().numpy()
485
+
486
+ if self.config.use_full_audio_for_segments:
487
+ # If SpeechBrain backend is used, ask adapter to produce full transcription
488
+ if (
489
+ getattr(self.config, "backend", "whisper") == "speechbrain"
490
+ and self._speechbrain_adapter is not None
491
+ ):
492
+ try:
493
+ self.logger.info(
494
+ "Running full-audio ASR via SpeechBrain adapter for alignment to segments"
495
+ )
496
+ full_text = self._speechbrain_adapter.transcribe_full_audio(
497
+ waveform, sample_rate
498
+ )
499
+ # SpeechBrain adapter currently returns plain text; we can't map timestamps, so store as simple str
500
+ full_asr_result = {"text": full_text}
501
+ except Exception as e:
502
+ self.logger.error(f"SpeechBrain full-audio ASR failed: {e}")
503
+ full_asr_result = None
504
+
505
+ elif self._pipeline not in (None, "FALLBACK"):
506
+ try:
507
+ # Whisper (seq2seq) pipelines don't accept 'sampling_rate' kwarg; omit it and set language
508
+ if getattr(self.config, "backend", "transformers") == "whisper":
509
+ kwargs = {}
510
+ # prefer explicit language if configured (e.g., Indonesian 'id')
511
+ kwargs["language"] = self.config.language
512
+ else:
513
+ kwargs = {"sampling_rate": sample_rate}
514
+
515
+ rt = self.config.return_timestamps
516
+ if rt in ("char", "word"):
517
+ kwargs["return_timestamps"] = rt
518
+
519
+ self.logger.info("Running full-audio ASR for alignment to segments")
520
+ full_asr_result = self._pipeline(audio_np_full, **kwargs)
521
+ except Exception as e:
522
+ self.logger.error(f"Full-audio ASR failed: {e}")
523
+ full_asr_result = None
524
+
525
+ # Build list of segment tasks that need per-segment ASR
526
+ tasks = []
527
+ for idx, seg in enumerate(segments):
528
+ # Skip very short segments
529
+ duration = seg.end - seg.start
530
+ if duration < 0.3:
531
+ continue
532
+ tasks.append((idx, seg))
533
+
534
+ # If we have a full-audio ASR result that includes timestamps, map once and avoid per-segment ASR
535
+ if full_asr_result is not None:
536
+ for idx, seg in tasks:
537
+ text = self._map_full_asr_to_segment(full_asr_result, seg)
538
+ if text:
539
+ text = self._postprocess_text(text)
540
+ if text:
541
+ transcripts.append(
542
+ TranscriptSegment(
543
+ speaker_id=seg.speaker_id,
544
+ start=seg.start,
545
+ end=seg.end,
546
+ text=text,
547
+ confidence=seg.confidence,
548
+ is_overlap=seg.is_overlap,
549
+ metadata={
550
+ "embedding": (
551
+ seg.embedding if hasattr(seg, "embedding") else None
552
+ ),
553
+ "asr_model": self.config.model_id,
554
+ },
555
+ )
556
+ )
557
+ # Filter out tasks that were handled by mapping
558
+ tasks = [
559
+ (i, s)
560
+ for (i, s) in tasks
561
+ if not any(t.start == s.start and t.end == s.end for t in transcripts)
562
+ ]
563
+
564
+ # If parallel_workers > 1, run per-segment ASR concurrently; otherwise fall back to the serial loop below
565
+ workers = int(getattr(self.config, "parallel_workers", 1))
566
+ if workers > 1 and tasks:
567
+ import concurrent.futures
568
+
569
+ def _transcribe_task(item):
570
+ idx, seg = item
571
+ # Progress updates, if any, are handled by the caller; errors are logged when results are collected
572
+ # Use context window if available
573
+ if self.config.context_window_s and self._pipeline not in (None, "FALLBACK"):
574
+ ctx_start = max(0.0, seg.start - self.config.context_window_s)
575
+ ctx_end = seg.end + self.config.context_window_s
576
+ cs = int(ctx_start * sample_rate)
577
+ ce = int(min(ctx_end * sample_rate, waveform.shape[-1]))
578
+ audio_np = waveform[:, cs:ce].squeeze().cpu().numpy()
579
+ text = self._transcribe_audio(
580
+ torch.from_numpy(audio_np).unsqueeze(0), sample_rate
581
+ )
582
+ else:
583
+ start_sample = int(seg.start * sample_rate)
584
+ end_sample = int(seg.end * sample_rate)
585
+ audio_segment = waveform[:, start_sample:end_sample]
586
+ text = self._transcribe_audio(audio_segment, sample_rate)
587
+
588
+ text = self._postprocess_text(text)
589
+ return idx, seg, text
590
+
591
+ with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as ex:
592
+ futures = {ex.submit(_transcribe_task, t): t for t in tasks}
593
+ for fut in concurrent.futures.as_completed(futures):
594
+ try:
595
+ idx, seg, text = fut.result()
596
+ if not text or not text.strip():
597
+ continue
598
+ transcripts.append(
599
+ TranscriptSegment(
600
+ speaker_id=seg.speaker_id,
601
+ start=seg.start,
602
+ end=seg.end,
603
+ text=text,
604
+ confidence=seg.confidence,
605
+ is_overlap=seg.is_overlap,
606
+ metadata={
607
+ "embedding": (
608
+ seg.embedding if hasattr(seg, "embedding") else None
609
+ ),
610
+ "asr_model": self.config.model_id,
611
+ },
612
+ )
613
+ )
614
+ except Exception as e:
615
+ self.logger.error(f"Segment transcription failed: {e}")
616
+ else:
617
+ # Serial fallback
618
+ for idx, seg in tasks:
619
+ # create context window
620
+ if self.config.context_window_s and self._pipeline not in (None, "FALLBACK"):
621
+ ctx_start = max(0.0, seg.start - self.config.context_window_s)
622
+ ctx_end = seg.end + self.config.context_window_s
623
+ cs = int(ctx_start * sample_rate)
624
+ ce = int(min(ctx_end * sample_rate, waveform.shape[-1]))
625
+ audio_np = waveform[:, cs:ce].squeeze().cpu().numpy()
626
+ text = self._transcribe_audio(
627
+ torch.from_numpy(audio_np).unsqueeze(0), sample_rate
628
+ )
629
+ else:
630
+ start_sample = int(seg.start * sample_rate)
631
+ end_sample = int(seg.end * sample_rate)
632
+ audio_segment = waveform[:, start_sample:end_sample]
633
+ text = self._transcribe_audio(audio_segment, sample_rate)
634
+
635
+ # Post-process text
636
+ text = self._postprocess_text(text)
637
+
638
+ # Skip empty transcriptions
639
+ if not text or not text.strip():
640
+ continue
641
+
642
+ transcripts.append(
643
+ TranscriptSegment(
644
+ speaker_id=seg.speaker_id,
645
+ start=seg.start,
646
+ end=seg.end,
647
+ text=text,
648
+ confidence=seg.confidence,
649
+ is_overlap=seg.is_overlap,
650
+ metadata={
651
+ "embedding": seg.embedding if hasattr(seg, "embedding") else None,
652
+ "asr_model": self.config.model_id,
653
+ },
654
+ )
655
+ )
656
+
657
+ return transcripts
658
+
659
+ def _detect_language_from_text(self, text: str) -> Optional[str]:
660
+ """Detect top language code from text using langdetect. Returns ISO code or None."""
661
+ try:
662
+ from langdetect import detect_langs
663
+
664
+ if not text or not text.strip():
665
+ return None
666
+ probs = detect_langs(text)
667
+ if not probs:
668
+ return None
669
+ return probs[0].lang
670
+ except Exception:
671
+ return None
672
+
673
+ def _transcribe_audio(self, audio_segment: torch.Tensor, sample_rate: int) -> str:
674
+ """Transcribe a single audio segment
675
+
676
+ Supports `language='auto'` for the Whisper backend, which will perform a quick
677
+ pre-pass (no language hint) and use a text-based language detector to
678
+ choose the language for the final transcription pass.
679
+
680
+ If `self.config.cst_hz` is set, an aggressive lossy preprocessor (approximation
681
+ of a low-rate Continuous Speech Tokenizer) is applied before sending audio to
682
+ the ASR backend. This significantly reduces compute at the cost of precision
683
+ and should be used only when speed is critical.
684
+ """
685
+
686
+ # Fallback mode: only return placeholders when no working ASR backend is available.
687
+ # If user requested WhisperX backend and model is loaded, prefer using WhisperX.
688
+ if self._pipeline == "FALLBACK":
689
+ backend = getattr(self.config, "backend", None)
690
+ if not (backend == "whisperx" and self._whisperx_model is not None):
691
+ duration = audio_segment.shape[-1] / sample_rate
692
+ return f"[Transkripsi placeholder - durasi {duration:.1f}s]"
693
+
694
+ # Convert to numpy
695
+ audio_np = audio_segment.squeeze().cpu().numpy()
696
+
697
+ # Apply CST approximation preprocessor if requested (lossy, speed-optimized)
698
+ if getattr(self.config, "cst_hz", None) is not None:
699
+ try:
700
+ audio_np = self._apply_cst_approximation(audio_np, sample_rate, float(self.config.cst_hz))
701
+ # After approximation we keep the original sample_rate for downstream callers
702
+ self.logger.info(f"Applied CST approximation: {self.config.cst_hz} Hz (lossy)")
703
+ except Exception as e:
704
+ self.logger.warning(f"CST approximation failed, continuing with original audio: {e}")
705
+
706
+ # Ensure float32
707
+ if audio_np.dtype != np.float32:
708
+ audio_np = audio_np.astype(np.float32)
709
+
710
+
711
+ # WhisperX backend
712
+ if getattr(self.config, "backend", None) == "whisperx":
713
+ try:
714
+ if self._whisperx_model is None:
715
+ self._load_model()
716
+ if self._whisperx_model is None:
717
+ return ""
718
+
719
+ language = getattr(self.config, "language", "id")
720
+ # whisperx expects None for auto language
721
+ language_arg = None if language == "auto" else language
722
+
723
+ vad_filter = bool(getattr(self.config, "whisperx_vad_filter", True))
724
+
725
+ # Build kwargs and only pass vad_filter if the transcribe signature accepts it
726
+ from inspect import signature
727
+
728
+ kwargs = {"batch_size": self.config.batch_size}
729
+ if language_arg is not None:
730
+ kwargs["language"] = language_arg
731
+
732
+ try:
733
+ sig = signature(self._whisperx_model.transcribe)
734
+ if "vad_filter" in sig.parameters:
735
+ kwargs["vad_filter"] = vad_filter
736
+ except Exception:
737
+ # If introspection fails, do not pass vad_filter
738
+ pass
739
+
740
+ # First attempt
741
+ try:
742
+ result = self._whisperx_model.transcribe(audio_np, **kwargs)
743
+ except Exception as e_inner:
744
+ self.logger.warning(f"WhisperX transcription failed on first attempt: {e_inner}. Retrying with `vad_filter=False, batch_size=1`")
745
+ # retry with safer options
746
+ try:
747
+ retry_kwargs = kwargs.copy()
748
+ retry_kwargs["batch_size"] = 1
749
+ if "vad_filter" in retry_kwargs:
750
+ retry_kwargs["vad_filter"] = False
751
+ result = self._whisperx_model.transcribe(audio_np, **retry_kwargs)
752
+ except Exception as e_retry:
753
+ self.logger.error(f"WhisperX transcription retry failed: {e_retry}. Falling back to lightweight Whisper model.")
754
+ # Fallback: switch backend to 'whisper' with small model and attempt to load it
755
+ try:
756
+ self.config.backend = "whisper"
757
+ self.config.model_id = "openai/whisper-small"
758
+ # Clear whisperx state
759
+ self._whisperx_model = None
760
+ self._pipeline = None
761
+ self._model = None
762
+ self._processor = None
763
+ self._load_model()
764
+ # attempt pipeline-based transcription
765
+ return self._transcribe_audio(audio_segment, sample_rate)
766
+ except Exception as e_fb:
767
+ self.logger.error(f"Fallback ASR model load/transcription failed: {e_fb}")
768
+ return ""
769
+
770
+ # Normalize result into plain text.
771
+ if isinstance(result, dict):
772
+ # 'text' is common, but some ASR returns 'segments' list
773
+ if "text" in result and result.get("text"):
774
+ return result.get("text", "")
775
+ if "segments" in result and isinstance(result["segments"], list):
776
+ seg_texts = [
777
+ s.get("text", "") for s in result["segments"] if isinstance(s, dict)
778
+ ]
779
+ joined = " ".join(t.strip() for t in seg_texts if t and t.strip())
780
+ return joined or ""
781
+ # fallback to empty
782
+ return ""
783
+ return str(result)
784
+ except Exception as e:
785
+ self.logger.error(f"WhisperX transcription failed: {e}")
786
+ return ""
787
+ # Use pipeline if available
788
+ if self._pipeline is not None and self._pipeline != "FALLBACK":
789
+ try:
790
+ # Whisper backend: handle language auto-detection
791
+ if getattr(self.config, "backend", "transformers") == "whisper":
792
+ if getattr(self.config, "language", "id") == "auto":
793
+ # quick pre-pass to get candidate text
794
+ try:
795
+ quick_kwargs = {}
796
+ rt = self.config.return_timestamps
797
+ if rt in ("char", "word"):
798
+ quick_kwargs["return_timestamps"] = rt
799
+ quick_res = self._pipeline(audio_np, **quick_kwargs)
800
+ quick_text = (
801
+ quick_res.get("text", "")
802
+ if isinstance(quick_res, dict)
803
+ else str(quick_res)
804
+ )
805
+ detected = self._detect_language_from_text(quick_text)
806
+ chosen_lang = detected if detected else "id"
807
+ except Exception:
808
+ chosen_lang = "id"
809
+ else:
810
+ chosen_lang = getattr(self.config, "language", "id")
811
+
812
+ kwargs = {"language": chosen_lang}
813
+ else:
814
+ kwargs = {"sampling_rate": sample_rate}
815
+
816
+ rt = self.config.return_timestamps
817
+ if rt in ("char", "word"):
818
+ kwargs["return_timestamps"] = rt
819
+
820
+ result = self._pipeline(audio_np, **kwargs)
821
+
822
+ # If result is a dict with text
823
+ if isinstance(result, dict):
824
+ # If pipeline returns a list of word/segment timestamps, user may want that via full-audio flow
825
+ if isinstance(result.get("chunks", None), list) or isinstance(
826
+ result.get("segments", None), list
827
+ ):
828
+ return result.get("text", "")
829
+ return result.get("text", "")
830
+ return str(result)
831
+
832
+ except Exception as e:
833
+ self.logger.warning(f"Pipeline transcription failed: {e}")
834
+ # Try to fall back to direct model path (if available)
835
+ self._pipeline = None
836
+ # continue to attempt direct model below
837
+
838
+ # Use direct model if pipeline not available
839
+ if self._model is not None and self._processor is not None:
840
+ try:
841
+ # Process input
842
+ inputs = self._processor(
843
+ audio_np, sampling_rate=sample_rate, return_tensors="pt", padding=True
844
+ )
845
+
846
+ # Move to device
847
+ if self.device == "cuda" and torch.cuda.is_available():
848
+ inputs = {k: v.cuda() for k, v in inputs.items()}
849
+
850
+ # Run inference
851
+ with torch.no_grad():
852
+ logits = self._model(**inputs).logits
853
+
854
+ # If CTC beam decoder available and requested, use it
855
+ if (
856
+ getattr(self, "_ctc_decoder", None) is not None
857
+ and self.config.decoder == "beam"
858
+ ):
859
+ try:
860
+ # Convert logits to probabilities (T, C)
861
+ probs = torch.softmax(logits, dim=-1).cpu().numpy()
862
+ # some models return batch dimension; take first batch
863
+ emissions = probs[0]
864
+
865
+ try:
866
+ # Try simple decode
867
+ transcription = self._ctc_decoder.decode(
868
+ emissions, beam_width=self.config.beam_width
869
+ )
870
+ except Exception:
871
+ # Try beam candidates and pick top
872
+ beams = self._ctc_decoder.decode_beams(
873
+ emissions, beam_width=self.config.beam_width
874
+ )
875
+ transcription = beams[0][0] if beams else ""
876
+
877
+ return transcription if transcription else ""
878
+ except Exception as e:
879
+ self.logger.warning(f"CTC beam decode failed: {e}")
880
+ # fallback to greedy
881
+
882
+ # Fallback: greedy argmax decode
883
+ predicted_ids = torch.argmax(logits, dim=-1)
884
+ transcription = self._processor.batch_decode(predicted_ids)
885
+
886
+ return transcription[0] if transcription else ""
887
+
888
+ except Exception as e:
889
+ self.logger.error(f"Direct model transcription failed: {e}")
890
+ return ""
891
+
892
+ return ""
893
+
894
+ def transcribe_full_audio(self, waveform: torch.Tensor, sample_rate: int = 16000) -> str:
895
+ """
896
+ Transcribe full audio without diarization.
897
+ Useful for baseline comparison.
898
+ """
899
+ self._load_model()
900
+
901
+ # WhisperX: call directly to keep consistency
902
+ if getattr(self.config, "backend", None) == "whisperx":
903
+ audio_np = waveform.squeeze().cpu().numpy().astype(np.float32, copy=False)
904
+ if self._whisperx_model is None:
905
+ return ""
906
+ language = getattr(self.config, "language", "id")
907
+ language_arg = None if language == "auto" else language
908
+ vad_filter = bool(getattr(self.config, "whisperx_vad_filter", True))
909
+ try:
910
+ res = self._whisperx_model.transcribe(
911
+ audio_np,
912
+ batch_size=self.config.batch_size,
913
+ language=language_arg,
914
+ vad_filter=vad_filter,
915
+ )
916
+ text = res.get("text", "") if isinstance(res, dict) else str(res)
917
+ return self._postprocess_text(text)
918
+ except Exception as e:
919
+ self.logger.warning(f"WhisperX full-audio transcription failed: {e}. Retrying with vad_filter=False, batch_size=1")
920
+ try:
921
+ res = self._whisperx_model.transcribe(
922
+ audio_np,
923
+ batch_size=1,
924
+ language=language_arg,
925
+ vad_filter=False,
926
+ )
927
+ text = res.get("text", "") if isinstance(res, dict) else str(res)
928
+ return self._postprocess_text(text)
929
+ except Exception as e2:
930
+ self.logger.error(f"WhisperX full-audio retry failed: {e2}. Falling back to 'whisper-small'.")
931
+ # Fallback to whisper-small pipeline
932
+ try:
933
+ self.config.backend = "whisper"
934
+ self.config.model_id = "openai/whisper-small"
935
+ self._whisperx_model = None
936
+ self._pipeline = None
937
+ self._model = None
938
+ self._processor = None
939
+ self._load_model()
940
+ text = self._transcribe_audio(waveform, sample_rate)
941
+ return self._postprocess_text(text)
942
+ except Exception as e_fb:
943
+ self.logger.error(f"Fallback full-audio ASR failed: {e_fb}")
944
+ return ""
945
+
946
+ text = self._transcribe_audio(waveform, sample_rate)
947
+ return self._postprocess_text(text)
948
+
949
+ def _apply_cst_approximation(self, audio_np: np.ndarray, sample_rate: int, cst_hz: float) -> np.ndarray:
950
+ """Approximate a Continuous Speech Tokenizer by block-averaging audio frames
951
+
952
+ This method is intentionally conservative and reversible only in the sense
953
+ that it produces a downsample-like version of the waveform which is then
954
+ expanded back to the original rate (by repeating block values). This is
955
+ extremely lossy but can reduce model runtime for long audio when you
956
+ accept lower ASR fidelity.
957
+
958
+ Implementation details:
959
+ - token_duration = 1.0 / cst_hz
960
+ - compute mean amplitude per token window
961
+ - expand each token mean to the window length (constant value) to produce
962
+ a waveform of the original sample length
963
+
964
+ Note: This is an approximation to the user's requested ultralow-rate tokenizer
965
+ (7.5 Hz). For best accuracy, tune `cst_hz` and verify results on your data.
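+
+ Example (illustrative): with sample_rate=16000 and cst_hz=7.5, each token
+ window spans round(16000 / 7.5) = 2133 samples (~133 ms), so a 60-second clip
+ is reduced to roughly 450 block means before being expanded back to the
+ original length; the output array always has the same shape as the input.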
966
+ """
967
+ if cst_hz <= 0 or np.isnan(cst_hz):
968
+ return audio_np
969
+
970
+ token_dur = 1.0 / float(cst_hz)
971
+ window_samp = max(1, int(round(token_dur * sample_rate)))
972
+ # Partition audio and compute mean for each window
973
+ n = len(audio_np)
974
+ n_windows = int(np.ceil(n / window_samp))
975
+ means = []
976
+ for i in range(n_windows):
977
+ s = i * window_samp
978
+ e = min(n, s + window_samp)
979
+ if e <= s:
980
+ means.append(0.0)
981
+ else:
982
+ means.append(float(np.mean(audio_np[s:e])))
983
+
984
+ # Reconstruct waveform by repeating means per window
985
+ out = np.zeros(n, dtype=np.float32)
986
+ for i, m in enumerate(means):
987
+ s = i * window_samp
988
+ e = min(n, s + window_samp)
989
+ out[s:e] = m
990
+
991
+ return out
992
+
993
+ def _postprocess_text(self, text: str) -> str:
994
+ """Clean and format transcribed text"""
995
+ if not text:
996
+ return ""
997
+
998
+ # Basic cleaning
999
+ text = text.strip()
1000
+
1001
+ # Remove special tokens and math/code blocks bounded by $$...$$
1002
+ text = re.sub(r"<[^>]+>", "", text)
1003
+ text = re.sub(r"\$\$.*?\$\$", "", text, flags=re.DOTALL)
1004
+
1005
+ # Normalize whitespace
1006
+ if self.config.normalize_whitespace:
1007
+ text = " ".join(text.split())
1008
+
1009
+ # Capitalize first letter of sentences
1010
+ if self.config.capitalize_sentences and text:
1011
+ # Capitalize first character
1012
+ text = text[0].upper() + text[1:] if len(text) > 1 else text.upper()
1013
+
1014
+ # Capitalize after sentence-ending punctuation
1015
+ text = re.sub(r"([.!?]\s+)([a-z])", lambda m: m.group(1) + m.group(2).upper(), text)
1016
+
1017
+ # Add period if missing
1018
+ if text and text[-1] not in ".!?,:;":
1019
+ text += "."
1020
+
1021
+ return text
1022
+
1023
+ def _map_full_asr_to_segment(self, full_result: Any, seg: SpeakerSegment) -> str:
1024
+ """Attempt to extract text for a given segment from a full-audio ASR result.
1025
+
1026
+ Supports multiple result shapes returned by different ASR pipelines:
1027
+ - result['chunks'] or result['segments']: list of dicts with 'start','end','text'
1028
+ - result may also include 'words' lists with per-word timestamps
1029
+ If no timestamped structure is present, returns empty string so caller can fallback.
1030
+ """
1031
+ try:
1032
+ # Prefer 'chunks' (some pipelines) then 'segments'
1033
+ blocks = None
1034
+ if isinstance(full_result, dict):
1035
+ if isinstance(full_result.get("chunks"), list):
1036
+ blocks = full_result["chunks"]
1037
+ elif isinstance(full_result.get("segments"), list):
1038
+ blocks = full_result["segments"]
1039
+ # some pipelines return word-level timestamps
1040
+ elif isinstance(full_result.get("words"), list):
1041
+ words = full_result["words"]
1042
+ text_parts = [
1043
+ w["word"]
1044
+ for w in words
1045
+ if w.get("start") is not None
1046
+ and w.get("end") is not None
1047
+ and (w["start"] >= seg.start and w["end"] <= seg.end)
1048
+ ]
1049
+ return " ".join(text_parts)
1050
+
1051
+ if blocks is None:
1052
+ return ""
1053
+
1054
+ # Concatenate blocks that overlap with seg time window
1055
+ collected = []
1056
+ for b in blocks:
1057
+ bstart = float(b.get("start", 0.0))
1058
+ bend = float(b.get("end", 0.0))
1059
+ if bstart < seg.end and bend > seg.start:
1060
+ collected.append(b.get("text", ""))
1061
+
1062
+ return " ".join([c.strip() for c in collected]).strip()
1063
+ except Exception:
1064
+ return ""
1065
+
1066
+ def get_transcription_stats(self, segments: List[TranscriptSegment]) -> Dict[str, Any]:
1067
+ """
1068
+ Get transcription statistics.
1069
+
1070
+ Args:
1071
+ segments: List of transcript segments
1072
+
1073
+ Returns:
1074
+ Dictionary with statistics
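+
+ Example shape (illustrative numbers):
+ {"total_segments": 12, "total_words": 480, "total_duration": 310.5,
+ "words_per_minute": 92.8, "speakers": {"SPEAKER_00": {"word_count": 260,
+ "duration": 170.0, "segment_count": 7}, ...}}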
1075
+ """
1076
+ if not segments:
1077
+ return {
1078
+ "total_segments": 0,
1079
+ "total_words": 0,
1080
+ "total_duration": 0.0,
1081
+ "words_per_minute": 0.0,
1082
+ "speakers": {},
1083
+ }
1084
+
1085
+ total_words = sum(seg.word_count for seg in segments)
1086
+ total_duration = sum(seg.duration for seg in segments)
1087
+
1088
+ # Per-speaker stats
1089
+ speaker_stats = {}
1090
+ for seg in segments:
1091
+ if seg.speaker_id not in speaker_stats:
1092
+ speaker_stats[seg.speaker_id] = {
1093
+ "word_count": 0,
1094
+ "duration": 0.0,
1095
+ "segment_count": 0,
1096
+ }
1097
+
1098
+ speaker_stats[seg.speaker_id]["word_count"] += seg.word_count
1099
+ speaker_stats[seg.speaker_id]["duration"] += seg.duration
1100
+ speaker_stats[seg.speaker_id]["segment_count"] += 1
1101
+
1102
+ return {
1103
+ "total_segments": len(segments),
1104
+ "total_words": total_words,
1105
+ "total_duration": total_duration,
1106
+ "words_per_minute": (total_words / total_duration * 60) if total_duration > 0 else 0,
1107
+ "speakers": speaker_stats,
1108
+ }
src/transcriber_speechbrain.py ADDED
@@ -0,0 +1,155 @@
1
+ """
2
+ SpeechBrain ASR wrapper (optional)
3
+ Provides a lightweight adapter around SpeechBrain's EncoderASR/EncoderDecoderASR to be used
4
+ as an optional backend in `meeting_transcriber`.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Any, List, Optional
12
+
13
+ import numpy as np
14
+ import torch
15
+
16
+ from src.diarization import SpeakerSegment
17
+ from src.transcriber import TranscriptSegment
18
+
19
+
20
+ @dataclass
21
+ class SpeechBrainASRConfig:
22
+ model_id: str = "speechbrain/asr-crdnn-rnnlm-librispeech"
23
+ device: str = "cuda" if torch.cuda.is_available() else "cpu"
24
+ chunk_length_s: float = 30.0
25
+
26
+
27
+ class SpeechBrainTranscriber:
28
+ """Adapter for SpeechBrain ASR models.
29
+
30
+ Usage:
31
+ t = SpeechBrainTranscriber(config)
32
+ t.transcribe_segments(waveform, segments, sample_rate)
33
+ """
34
+
35
+ def __init__(self, config: Optional[SpeechBrainASRConfig] = None, models_dir: str = "./models"):
36
+ self.config = config or SpeechBrainASRConfig()
37
+ self.models_dir = Path(models_dir)
38
+ self.models_dir.mkdir(parents=True, exist_ok=True)
39
+ self._model = None
40
+
41
+ def _load_model(self):
42
+ if self._model is not None:
43
+ return
44
+
45
+ try:
46
+ # Prefer the new import path to avoid deprecation warnings in SpeechBrain >=1.0
47
+ try:
48
+ from speechbrain.inference import ( # type: ignore
49
+ EncoderASR,
50
+ EncoderDecoderASR,
51
+ )
52
+ except Exception:
53
+ from speechbrain.pretrained import ( # type: ignore
54
+ EncoderASR,
55
+ EncoderDecoderASR,
56
+ )
57
+
58
+ # Try EncoderDecoderASR first (seq2seq), fall back to EncoderASR
59
+ try:
60
+ self._model = EncoderDecoderASR.from_hparams(
61
+ source=self.config.model_id, savedir=str(self.models_dir)
62
+ )
63
+ except Exception:
64
+ self._model = EncoderASR.from_hparams(
65
+ source=self.config.model_id, savedir=str(self.models_dir)
66
+ )
67
+
68
+ except Exception as e:
69
+ print(f"[SpeechBrain] Could not load model: {e}")
70
+ self._model = None
71
+
72
+ def transcribe_full_audio(self, waveform: torch.Tensor, sample_rate: int = 16000) -> str:
73
+ """Transcribe full audio waveform. Returns post-processed text (raw)."""
74
+ self._load_model()
75
+ if self._model is None:
76
+ return ""
77
+
78
+ # SpeechBrain typically expects a file path for convenience; some models accept numpy arrays
79
+ try:
80
+ audio_np = waveform.squeeze().cpu().numpy()
81
+ # Many SpeechBrain models accept numpy arrays for `transcribe_batch`/`transcribe_file`
82
+ # Use transcribe_batch for in-memory audio
83
+ try:
84
+ res = self._model.transcribe_batch([audio_np])
85
+ if isinstance(res, list):
86
+ return str(res[0])
87
+ return str(res)
88
+ except Exception:
89
+ # Fallback: write temporary file
90
+ import tempfile
91
+
92
+ import soundfile as sf
93
+
94
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp:
95
+ sf.write(tmp.name, audio_np.astype("float32"), sample_rate)
96
+ return str(self._model.transcribe_file(tmp.name))
97
+ except Exception as e:
98
+ print(f"[SpeechBrain] Full audio transcription failed: {e}")
99
+ return ""
100
+
101
+ def transcribe_segments(
102
+ self, waveform: torch.Tensor, segments: List[SpeakerSegment], sample_rate: int = 16000
103
+ ) -> List[TranscriptSegment]:
104
+ """Transcribe each segment and return list of TranscriptSegment objects."""
105
+ self._load_model()
106
+ transcripts: List[TranscriptSegment] = []
107
+
108
+ if self._model is None:
109
+ return transcripts
110
+
111
+ for seg in segments:
112
+ start = int(seg.start * sample_rate)
113
+ end = int(seg.end * sample_rate)
114
+ segment_np = waveform[:, start:end].squeeze().cpu().numpy()
115
+
116
+ if segment_np.size == 0:
117
+ continue
118
+
119
+ # Skip extremely short segments
120
+ if seg.end - seg.start < 0.2:
121
+ continue
122
+
123
+ try:
124
+ # prefer in-memory transcribe_batch
125
+ res = self._model.transcribe_batch([segment_np])
126
+ text = str(res[0]) if isinstance(res, list) else str(res)
127
+ except Exception:
128
+ # fallback to temporary file path
129
+ try:
130
+ import tempfile
131
+
132
+ import soundfile as sf
133
+
134
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp:
135
+ sf.write(tmp.name, segment_np.astype("float32"), sample_rate)
136
+ text = str(self._model.transcribe_file(tmp.name))
137
+ except Exception as e:
138
+ print(f"[SpeechBrain] Segment transcription failed: {e}")
139
+ text = ""
140
+
141
+ if not text or not text.strip():
142
+ continue
143
+
144
+ transcripts.append(
145
+ TranscriptSegment(
146
+ speaker_id=seg.speaker_id,
147
+ start=seg.start,
148
+ end=seg.end,
149
+ text=text.strip(),
150
+ confidence=getattr(seg, "confidence", 1.0),
151
+ is_overlap=getattr(seg, "is_overlap", False),
152
+ )
153
+ )
154
+
155
+ return transcripts
src/utils.py ADDED
@@ -0,0 +1,555 @@
1
+ """
2
+ Utility Functions Module
3
+ ========================
4
+ Helper functions used across the system.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import hashlib
10
+ import json
11
+ import logging
12
+ import os
13
+ import re
14
+ import time
15
+ from functools import wraps
16
+ from pathlib import Path
17
+ from typing import Any, List, Optional, Tuple, Union
18
+
19
+ # =============================================================================
20
+ # Logging Setup
21
+ # =============================================================================
22
+
23
+
24
+ def setup_logger(
25
+ name: str = "MeetingTranscriber", level: int = logging.INFO, log_file: Optional[str] = None
26
+ ) -> logging.Logger:
27
+ """
28
+ Setup and return a logger instance.
29
+
30
+ Args:
31
+ name: Logger name
32
+ level: Logging level
33
+ log_file: Optional file path for logging
34
+
35
+ Returns:
36
+ Configured logger instance
37
+ """
38
+ logger = logging.getLogger(name)
39
+ logger.setLevel(level)
40
+
41
+ # Console handler
42
+ console_handler = logging.StreamHandler()
43
+ console_handler.setLevel(level)
44
+
45
+ # Formatter
46
+ formatter = logging.Formatter(
47
+ "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
48
+ )
49
+ console_handler.setFormatter(formatter)
50
+ logger.addHandler(console_handler)
51
+
52
+ # File handler (optional)
53
+ if log_file:
54
+ os.makedirs(os.path.dirname(log_file), exist_ok=True)
55
+ file_handler = logging.FileHandler(log_file, encoding="utf-8")
56
+ file_handler.setLevel(level)
57
+ file_handler.setFormatter(formatter)
58
+ logger.addHandler(file_handler)
59
+
60
+ return logger
61
+
62
+
63
+ # =============================================================================
64
+ # Timing Utilities
65
+ # =============================================================================
66
+
67
+
68
+ def timer(func):
69
+ """Decorator to measure function execution time"""
70
+
71
+ @wraps(func)
72
+ def wrapper(*args, **kwargs):
73
+ start_time = time.time()
74
+ result = func(*args, **kwargs)
75
+ end_time = time.time()
76
+ print(f"[Timer] {func.__name__} took {end_time - start_time:.2f} seconds")
77
+ return result
78
+
79
+ return wrapper
80
+
81
+
82
+ class Timer:
83
+ """Context manager for timing code blocks"""
84
+
85
+ def __init__(self, name: str = "Block"):
86
+ self.name = name
87
+ self.start_time = None
88
+ self.end_time = None
89
+
90
+ def __enter__(self):
91
+ self.start_time = time.time()
92
+ return self
93
+
94
+ def __exit__(self, *args):
95
+ self.end_time = time.time()
96
+ self.elapsed = self.end_time - self.start_time
97
+ print(f"[Timer] {self.name} took {self.elapsed:.2f} seconds")
98
+
99
+
100
+ # =============================================================================
101
+ # File Utilities
102
+ # =============================================================================
103
+
104
+
105
+ def get_file_hash(filepath: Union[str, Path], algorithm: str = "md5") -> str:
106
+ """
107
+ Calculate hash of a file.
108
+
109
+ Args:
110
+ filepath: Path to file
111
+ algorithm: Hash algorithm ('md5', 'sha256')
112
+
113
+ Returns:
114
+ Hex digest of file hash
115
+ """
116
+ hash_func = hashlib.new(algorithm)
117
+
118
+ with open(filepath, "rb") as f:
119
+ for chunk in iter(lambda: f.read(8192), b""):
120
+ hash_func.update(chunk)
121
+
122
+ return hash_func.hexdigest()
123
+
124
+
125
+ def ensure_dir(path: Union[str, Path]) -> Path:
126
+ """Ensure directory exists, create if not"""
127
+ path = Path(path)
128
+ path.mkdir(parents=True, exist_ok=True)
129
+ return path
130
+
131
+
132
+ def list_audio_files(
133
+ directory: Union[str, Path], extensions: Optional[List[str]] = None
134
+ ) -> List[Path]:
135
+ """
136
+ List all audio files in directory.
137
+
138
+ Args:
139
+ directory: Directory to search
140
+ extensions: List of extensions to include (default: common audio formats)
141
+
142
+ Returns:
143
+ List of audio file paths
144
+ """
145
+ if extensions is None:
146
+ extensions = [".wav", ".mp3", ".flac", ".ogg", ".m4a", ".wma", ".aac"]
147
+
148
+ directory = Path(directory)
149
+ audio_files = []
150
+
151
+ for ext in extensions:
152
+ audio_files.extend(directory.glob(f"*{ext}"))
153
+ audio_files.extend(directory.glob(f"*{ext.upper()}"))
154
+
155
+ return sorted(audio_files)
156
+
157
+
158
+ def sanitize_filename(filename: str) -> str:
159
+ """Remove invalid characters from filename"""
160
+ # Remove invalid characters
161
+ sanitized = re.sub(r'[<>:"/\\|?*]', "", filename)
162
+ # Replace spaces with underscores
163
+ sanitized = sanitized.replace(" ", "_")
164
+ # Remove multiple underscores
165
+ sanitized = re.sub(r"_+", "_", sanitized)
166
+ return sanitized.strip("_")
167
+
168
+
169
+ # =============================================================================
170
+ # JSON Utilities
171
+ # =============================================================================
172
+
173
+
174
+ def save_json(data: Any, filepath: Union[str, Path], indent: int = 2):
175
+ """Save data to JSON file"""
176
+ filepath = Path(filepath)
177
+ filepath.parent.mkdir(parents=True, exist_ok=True)
178
+
179
+ with open(filepath, "w", encoding="utf-8") as f:
180
+ json.dump(data, f, ensure_ascii=False, indent=indent, default=str)
181
+
182
+
183
+ def load_json(filepath: Union[str, Path]) -> Any:
184
+ """Load data from JSON file"""
185
+ with open(filepath, "r", encoding="utf-8") as f:
186
+ return json.load(f)
187
+
188
+
189
+ # =============================================================================
190
+ # Text Utilities
191
+ # =============================================================================
192
+
193
+
194
+ def format_duration(seconds: float) -> str:
195
+ """Format duration in seconds to human-readable string"""
196
+ if seconds < 0:
197
+ return "0:00"
198
+
199
+ hours = int(seconds // 3600)
200
+ minutes = int((seconds % 3600) // 60)
201
+ secs = int(seconds % 60)
202
+
203
+ if hours > 0:
204
+ return f"{hours}:{minutes:02d}:{secs:02d}"
205
+ return f"{minutes}:{secs:02d}"
206
+
207
+
208
+ def format_timestamp(seconds: float) -> str:
209
+ """Format timestamp for document display"""
210
+ seconds = max(0, seconds)
211
+ hours = int(seconds // 3600)
212
+ minutes = int((seconds % 3600) // 60)
213
+ secs = int(seconds % 60)
214
+
215
+ if hours > 0:
216
+ return f"{hours:02d}:{minutes:02d}:{secs:02d}"
217
+ return f"{minutes:02d}:{secs:02d}"
218
+
219
+
220
+ def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
221
+ """Truncate text to maximum length"""
222
+ if len(text) <= max_length:
223
+ return text
224
+ return text[: max_length - len(suffix)] + suffix
225
+
226
+
227
+ def clean_text(text: str) -> str:
228
+ """Clean text: normalize whitespace, remove special chars"""
229
+ if not text:
230
+ return ""
231
+
232
+ # Normalize whitespace
233
+ text = " ".join(text.split())
234
+
235
+ # Remove control characters
236
+ text = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", text)
237
+
238
+ return text.strip()
239
+
240
+
241
+ # =============================================================================
242
+ # Progress Utilities
243
+ # =============================================================================
244
+
245
+
246
+ class ProgressTracker:
247
+ """Simple progress tracker for long operations"""
248
+
249
+ def __init__(self, total: int, description: str = "Processing"):
250
+ self.total = total
251
+ self.current = 0
252
+ self.description = description
253
+ self.start_time = time.time()
254
+
255
+ def update(self, n: int = 1):
256
+ """Update progress by n steps"""
257
+ self.current += n
258
+ self._print_progress()
259
+
260
+ def _print_progress(self):
261
+ """Print progress bar"""
262
+ percent = self.current / self.total * 100 if self.total > 0 else 0
263
+ elapsed = time.time() - self.start_time
264
+
265
+ # Estimate remaining time
266
+ if self.current > 0:
267
+ eta = elapsed / self.current * (self.total - self.current)
268
+ eta_str = format_duration(eta)
269
+ else:
270
+ eta_str = "?"
271
+
272
+ bar_length = 30
273
+ filled = int(bar_length * self.current / self.total) if self.total > 0 else 0
274
+ bar = "█" * filled + "░" * (bar_length - filled)
275
+
276
+ print(
277
+ f"\r[{bar}] {percent:5.1f}% ({self.current}/{self.total}) ETA: {eta_str} ",
278
+ end="",
279
+ flush=True,
280
+ )
281
+
282
+ if self.current >= self.total:
283
+ print() # New line at completion
284
+
285
+ def finish(self):
286
+ """Mark progress as complete"""
287
+ self.current = self.total
288
+ self._print_progress()
289
+
290
+ elapsed = time.time() - self.start_time
291
+ print(f"[{self.description}] Completed in {format_duration(elapsed)}")
292
+
293
+
294
+ # =============================================================================
295
+ # Validation Utilities
296
+ # =============================================================================
297
+
298
+
299
+ def validate_audio_file(filepath: Union[str, Path]) -> bool:
300
+ """
301
+ Validate that file exists and is a supported audio format.
302
+
303
+ Args:
304
+ filepath: Path to audio file
305
+
306
+ Returns:
307
+ True if valid, raises exception otherwise
308
+ """
309
+ filepath = Path(filepath)
310
+
311
+ if not filepath.exists():
312
+ raise FileNotFoundError(f"Audio file not found: {filepath}")
313
+
314
+ supported_formats = {".wav", ".mp3", ".flac", ".ogg", ".m4a", ".wma", ".aac"}
315
+
316
+ if filepath.suffix.lower() not in supported_formats:
317
+ raise ValueError(
318
+ f"Unsupported audio format: {filepath.suffix}. "
319
+ f"Supported: {', '.join(supported_formats)}"
320
+ )
321
+
322
+ return True
323
+
324
+
325
+ def validate_ground_truth_file(filepath: Union[str, Path]) -> bool:
326
+ """
327
+ Validate ground truth file format.
328
+
329
+ Args:
330
+ filepath: Path to ground truth file
331
+
332
+ Returns:
333
+ True if valid
334
+ """
335
+ filepath = Path(filepath)
336
+
337
+ if not filepath.exists():
338
+ raise FileNotFoundError(f"Ground truth file not found: {filepath}")
339
+
340
+ supported_formats = {".txt", ".json", ".rttm"}
341
+
342
+ if filepath.suffix.lower() not in supported_formats:
343
+ raise ValueError(
344
+ f"Unsupported ground truth format: {filepath.suffix}. "
345
+ f"Supported: {', '.join(supported_formats)}"
346
+ )
347
+
348
+ return True
349
+
350
+
351
+ # =============================================================================
352
+ # Ground Truth Parsing
353
+ # =============================================================================
354
+
355
+
356
+ def parse_transcript_file(filepath: Union[str, Path]) -> str:
357
+ """
358
+ Parse transcript file (plain text).
359
+
360
+ Args:
361
+ filepath: Path to transcript file
362
+
363
+ Returns:
364
+ Transcript text
365
+ """
366
+ with open(filepath, "r", encoding="utf-8") as f:
367
+ return f.read().strip()
368
+
369
+
370
+ def parse_rttm_file(filepath: Union[str, Path]) -> List[tuple]:
371
+ """
372
+ Parse RTTM (Rich Transcription Time Marked) file for diarization ground truth.
373
+
374
+ RTTM format:
375
+ SPEAKER <file_id> <channel> <start> <duration> <NA> <NA> <speaker_id> <NA> <NA>
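+
+ Example (illustrative): the line
+ SPEAKER audio 1 0.00 5.50 <NA> <NA> SPEAKER_00 <NA> <NA>
+ is parsed into the tuple ("SPEAKER_00", 0.0, 5.5), where the end time is
+ start plus duration.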
376
+
377
+ Args:
378
+ filepath: Path to RTTM file
379
+
380
+ Returns:
381
+ List of (speaker_id, start, end) tuples
382
+ """
383
+ segments = []
384
+
385
+ with open(filepath, "r", encoding="utf-8") as f:
386
+ for line in f:
387
+ line = line.strip()
388
+ if not line or line.startswith("#"):
389
+ continue
390
+
391
+ parts = line.split()
392
+ if len(parts) >= 8 and parts[0] == "SPEAKER":
393
+ start = float(parts[3])
394
+ duration = float(parts[4])
395
+ speaker_id = parts[7]
396
+
397
+ segments.append((speaker_id, start, start + duration))
398
+
399
+ return segments
400
+
401
+
402
+ # -----------------------------------------------------------------------------
403
+ # Helpers for building RTTM from speaker-labeled transcripts
404
+ # -----------------------------------------------------------------------------
405
+
406
+
407
+ def parse_speaker_labeled_text(text: str) -> List[Tuple[str, str]]:
408
+ """Parse speaker-labeled transcript text into a list of (speaker, text).
409
+
410
+ Recognizes lines that start with `Name:` (case-insensitive) as speaker labels.
411
+ Consecutive non-label lines are appended to the current speaker utterance.
412
+
413
+ Returns empty list if input is empty.
414
+ """
415
+ label_re = re.compile(r"^\s*([^:\n\r]{1,80}):\s*(.*)$")
416
+
417
+ items: List[Tuple[str, str]] = []
418
+
419
+ cur_speaker = None
420
+ cur_lines: List[str] = []
421
+
422
+ for raw in text.splitlines():
423
+ line = raw.rstrip("\n\r")
424
+ m = label_re.match(line)
425
+ if m:
426
+ if cur_speaker is not None:
427
+ items.append((cur_speaker, " ".join(l.strip() for l in cur_lines if l.strip())))
428
+ cur_speaker = m.group(1).strip()
429
+ first = m.group(2).strip()
430
+ cur_lines = [first] if first else []
431
+ else:
432
+ if line.strip():
433
+ cur_lines.append(line.strip())
434
+
435
+ if cur_speaker is not None:
436
+ items.append((cur_speaker, " ".join(l.strip() for l in cur_lines if l.strip())))
437
+
438
+ return items
439
+
440
+
441
+ def align_reference_to_segments(
442
+ utterances: List[Tuple[str, str]],
443
+ hyp_segments: List[object],
444
+ min_score: float = 0.20,
445
+ ) -> List[Tuple[str, float, float]]:
446
+ """Align reference speaker utterances to hypothesis transcript segments.
447
+
448
+ Strategy (simple heuristic):
449
+ - Iterate utterances in order and try to find the best contiguous window of
450
+ hypothesis segments (starting from last matched index) whose combined
451
+ words have maximal overlap with the reference utterance words.
452
+ - Overlap score = intersection_words / reference_word_count.
453
+ - Accept match if score >= min_score; assign start/end from matched segments.
454
+
455
+ Returns list of (speaker_id, start, end).
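+
+ Worked example (illustrative): if a reference utterance has 10 distinct
+ words and the best window of hypothesis segments contains 6 of them, the
+ score is 6/10 = 0.6 >= min_score (0.20), so the utterance is assigned the
+ start of the first and the end of the last segment in that window.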
456
+ """
457
+ if not utterances or not hyp_segments:
458
+ return []
459
+
460
+ # Precompute normalized words for hypothesis segments
461
+ hyp_words = []
462
+ for seg in hyp_segments:
463
+ txt = getattr(seg, "text", "") or ""
464
+ words = [w.lower() for w in re.findall(r"\w+", txt)]
465
+ hyp_words.append(words)
466
+
467
+ results: List[Tuple[str, float, float]] = []
468
+ cur_idx = 0
469
+
470
+ for speaker, ref_text in utterances:
471
+ ref_tokens = [w.lower() for w in re.findall(r"\w+", ref_text)]
472
+ if not ref_tokens:
473
+ continue
474
+ ref_set = set(ref_tokens)
475
+
476
+ best_score = 0.0
477
+ best_j = None
478
+ best_k = None
479
+
480
+ # Search windows starting at cur_idx
481
+ for j in range(cur_idx, len(hyp_segments)):
482
+ combined = []
483
+ for k in range(j, len(hyp_segments)):
484
+ combined.extend(hyp_words[k])
485
+ if not combined:
486
+ continue
487
+ comb_set = set(combined)
488
+ score = len(ref_set & comb_set) / max(1, len(ref_set))
489
+
490
+ if score > best_score:
491
+ best_score = score
492
+ best_j = j
493
+ best_k = k
494
+
495
+ # early break if we reach high confidence
496
+ if score >= 0.75:
497
+ break
498
+
499
+ if best_j is not None and best_score >= min_score:
500
+ start = float(getattr(hyp_segments[best_j], "start", 0.0))
501
+ end = float(getattr(hyp_segments[best_k], "end", start))
502
+ spk = re.sub(r"[^0-9A-Za-z_\-]", "_", speaker)
503
+ results.append((spk, start, end))
504
+ cur_idx = best_k + 1
505
+ else:
506
+ # If no match found, skip (could be silence/non-speech)
507
+ continue
508
+
509
+ return results
510
+
511
+
512
+ def create_ground_truth_template(
513
+ output_path: Union[str, Path], audio_duration: float, num_speakers: int = 2
514
+ ):
515
+ """
516
+ Create template ground truth files for annotation.
517
+
518
+ Args:
519
+ output_path: Output directory
520
+ audio_duration: Duration of audio in seconds
521
+ num_speakers: Expected number of speakers
522
+ """
523
+ output_path = Path(output_path)
524
+ output_path.mkdir(parents=True, exist_ok=True)
525
+
526
+ # Create transcript template
527
+ transcript_template = """# Ground Truth Transcript
528
+ # Instruksi: Tulis transkripsi lengkap audio di bawah ini
529
+ # Hapus baris komentar (yang dimulai dengan #) sebelum evaluasi
530
+
531
+ [Tulis transkripsi di sini...]
532
+ """
533
+
534
+ with open(output_path / "transcript.txt", "w", encoding="utf-8") as f:
535
+ f.write(transcript_template)
536
+
537
+ # Create RTTM template
538
+ rttm_template = f"""# Ground Truth Diarization (RTTM Format)
539
+ # Format: SPEAKER <file_id> <channel> <start_time> <duration> <NA> <NA> <speaker_id> <NA> <NA>
540
+ #
541
+ # Contoh:
542
+ # SPEAKER audio 1 0.0 5.5 <NA> <NA> SPEAKER_00 <NA> <NA>
543
+ # SPEAKER audio 1 5.5 3.2 <NA> <NA> SPEAKER_01 <NA> <NA>
544
+ #
545
+ # Audio duration: {audio_duration:.2f} seconds
546
+ # Expected speakers: {num_speakers}
547
+ #
548
+ # Tambahkan baris SPEAKER di bawah:
549
+
550
+ """
551
+
552
+ with open(output_path / "diarization.rttm", "w", encoding="utf-8") as f:
553
+ f.write(rttm_template)
554
+
555
+ print(f"Ground truth templates created in: {output_path}")