Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- Dockerfile +21 -20
- config.yaml +209 -0
- main.py +602 -0
- requirements.txt +72 -3
- setup.py +43 -0
- streamlit_app.py +259 -0
Dockerfile
CHANGED
|
@@ -1,20 +1,21 @@
|
|
| 1 |
-
FROM python:3.
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
COPY
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
RUN
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

# System deps (ffmpeg is needed for audio decoding)
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python deps BEFORE copying the full source tree so this (slow)
# layer is cached and only rebuilt when requirements.txt changes.
# --no-cache-dir keeps pip's wheel cache out of the image.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# Copy project
COPY . /app

# Expose Streamlit port
EXPOSE 8501

# Run Streamlit
CMD ["streamlit", "run", "streamlit_app.py", "--server.port", "8501", "--server.address", "0.0.0.0"]
|
config.yaml
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
# CONFIG.YAML - Configuration for the Automatic Meeting Minutes system
# =============================================================================

# Project information
project:
  name: "Meeting Transcriber"
  version: "1.0.0"
  author: "Yermia Turangan"
  description: "Sistem Notulensi Rapat Otomatis berbasis SpeechBrain dan BERT"

# Audio configuration
audio:
  sample_rate: 16000        # Hz; presumably what the ASR/diarization models expect — confirm
  mono: true
  normalize: true
  trim_silence: false
  max_duration_minutes: 60

# Speaker diarization configuration
diarization:
  # Voice Activity Detection
  vad:
    threshold: 0.5
    min_speech_duration: 0.3    # presumably seconds — confirm against the VAD wrapper
    min_silence_duration: 0.3
    speech_pad_ms: 30

  # Segmentation
  segmentation:
    window_duration: 1.5        # sliding-window length
    window_hop: 0.75            # hop size (half the window -> 50% overlap)
    min_segment_duration: 0.5

  # Speaker embedding
  embedding:
    model_id: "speechbrain/spkrec-ecapa-voxceleb"
    embedding_dim: 192

  # Clustering
  clustering:
    method: "agglomerative" # agglomerative, spectral, kmeans
    threshold: 0.7
    min_cluster_size: 2
    linkage: "average"

  # Post-processing
  postprocessing:
    merge_gap_threshold: 0.5
    min_segment_duration: 0.3
    smooth_segments: true

# ASR (Speech Recognition) configuration
asr:
  # NOTE(review): "whisper/whisper-base" is not a valid Hugging Face repo id;
  # the official checkpoint is "openai/whisper-base". Verify how the selected
  # backend resolves this value before relying on it.
  model_id: "whisper/whisper-base"
  # model_id: "indonesian-nlp/wav2vec2-large-xlsr-indonesian"
  # Alternative: "facebook/wav2vec2-large-xlsr-53"
  chunk_length_s: 30
  stride_length_s: 5
  batch_size: 4
  return_timestamps: false
  backend: "transformers" # options: 'transformers'|'whisper'|'speechbrain'

  # Transcript text post-processing
  text_postprocessing:
    capitalize_sentences: true
    normalize_whitespace: true
    add_punctuation: false # can be enabled if a punctuation model is available

# BERT summarization configuration
summarization:
  # Models
  model_id: "indobenchmark/indobert-base-p1"
  sentence_model_id: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

  # Extractive summarization settings
  extractive:
    num_sentences: 5
    min_sentence_length: 10
    max_sentence_length: 200
    position_weight: 0.1
    similarity_threshold: 0.3

  # Keyword detection (Indonesian trigger words; values must stay verbatim —
  # they are matched against the transcript text)
  keywords:
    # Sentences containing these words are flagged as decisions
    decisions:
      - "diputuskan"
      - "disepakati"
      - "kesimpulan"
      - "keputusan"
      - "jadi"
      - "maka"
      - "sepakat"
      - "setuju"
      - "final"
      - "kesepakatan"

    # Sentences containing these words are flagged as action items
    action_items:
      - "akan"
      - "harus"
      - "perlu"
      - "tolong"
      - "mohon"
      - "deadline"
      - "target"
      - "tugas"
      - "tanggung jawab"
      - "action item"
      - "follow up"
      - "tindak lanjut"
      - "dikerjakan"
      - "selesaikan"

# Document generation configuration
document:
  template: "default"

  # Which sections to include in the generated minutes
  sections:
    header: true
    meeting_info: true
    summary: true
    decisions: true
    action_items: true
    transcript: true
    footer: true

  # Formatting
  formatting:
    title_font_size: 18
    heading_font_size: 14
    body_font_size: 11
    font_family: "Calibri"
    include_timestamps: true
    include_speaker_colors: true

  # Output
  output:
    directory: "./data/output"
    filename_template: "notulensi_{title}_{date}_{timestamp}"

# Evaluation configuration
evaluation:
  # WER settings (text normalization applied before scoring)
  wer:
    lowercase: true
    remove_punctuation: true
    normalize_whitespace: true

  # DER settings
  der:
    collar: 0.25 # Forgiveness collar in seconds
    skip_overlap: false

  # Output
  output:
    save_detailed_results: true
    generate_plots: true
    export_csv: true

# Hardware configuration
hardware:
  device: "auto" # auto, cuda, cpu
  num_workers: 4
  pin_memory: true

  # Memory management
  max_batch_size: 8
  gradient_checkpointing: false

# Filesystem paths
paths:
  models_dir: "./models"
  audio_dir: "./data/audio"
  ground_truth_dir: "./data/ground_truth"
  output_dir: "./data/output"
  cache_dir: "./cache"
  logs_dir: "./logs"

# Logging configuration
logging:
  level: "INFO" # DEBUG, INFO, WARNING, ERROR
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  save_to_file: true
  log_file: "./logs/pipeline.log"

# Experiments (for the thesis evaluation)
experiment:
  name: "baseline_evaluation"
  conditions:
    - name: "bersih"
      description: "Audio bersih, ruangan tenang"
      expected_wer: 0.15
      expected_der: 0.15

    - name: "noisy"
      description: "Audio dengan background noise"
      expected_wer: 0.25
      expected_der: 0.25

    - name: "overlap"
      description: "Audio dengan overlapping speech"
      expected_wer: 0.35
      expected_der: 0.40

    - name: "multispeaker"
      description: "Audio dengan 4-6 speaker"
      expected_wer: 0.25
      expected_der: 0.35
|
main.py
ADDED
|
@@ -0,0 +1,602 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Main Entry Point - Meeting Transcriber System
|
| 4 |
+
==============================================
|
| 5 |
+
|
| 6 |
+
Automatic Meeting Minutes Generation using SpeechBrain + BERT
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
# Basic transcription
|
| 10 |
+
python main.py --audio meeting.wav --title "Team Meeting"
|
| 11 |
+
|
| 12 |
+
# With evaluation
|
| 13 |
+
python main.py --audio meeting.wav --evaluate --reference transcript.txt
|
| 14 |
+
|
| 15 |
+
# Batch processing
|
| 16 |
+
python main.py --batch ./audio_folder/ --output ./results/
|
| 17 |
+
|
| 18 |
+
# Specify number of speakers
|
| 19 |
+
python main.py --audio meeting.wav --speakers 4
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import os
|
| 24 |
+
import sys
|
| 25 |
+
from datetime import datetime
|
| 26 |
+
from pathlib import Path
|
| 27 |
+
from typing import List
|
| 28 |
+
|
| 29 |
+
from src.evaluator import EvaluationResult, Evaluator
|
| 30 |
+
from src.pipeline import MeetingTranscriberPipeline, PipelineConfig, PipelineResult
|
| 31 |
+
from src.utils import (
|
| 32 |
+
format_duration,
|
| 33 |
+
list_audio_files,
|
| 34 |
+
parse_rttm_file,
|
| 35 |
+
parse_transcript_file,
|
| 36 |
+
validate_audio_file,
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def parse_args():
    """Parse and return the command-line arguments.

    Arguments are organized into argparse groups (Input, Meeting Metadata,
    Performance, Output, Evaluation, Model Settings, Misc). The returned
    ``Namespace`` is consumed by ``main`` and the processing helpers.

    Returns:
        argparse.Namespace: parsed CLI options.
    """
    parser = argparse.ArgumentParser(
        description="Sistem Notulensi Rapat Otomatis (SpeechBrain + BERT)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Contoh Penggunaan:
==================

  # Transkripsi dasar
  python main.py --audio rapat.wav

  # Dengan detail rapat
  python main.py --audio rapat.wav --title "Rapat Sprint" --speakers 4 --location "Zoom"

  # Dengan evaluasi WER
  python main.py --audio rapat.wav --evaluate --reference transkrip_manual.txt

  # Batch processing
  python main.py --batch ./folder_audio/ --output ./hasil/

Untuk dokumentasi lengkap, lihat README.md
""",
    )

    # Input arguments; at least one of --audio / --batch is required
    # (enforced later in main(), not by argparse).
    input_group = parser.add_argument_group("Input")
    input_group.add_argument(
        "--audio", "-a", type=str, help="Path ke file audio (.wav, .mp3, .m4a)"
    )
    input_group.add_argument(
        "--batch", "-b", type=str, help="Direktori berisi file audio untuk batch processing"
    )

    # Meeting metadata (used to fill in the generated minutes document)
    meta_group = parser.add_argument_group("Meeting Metadata")
    meta_group.add_argument(
        "--title",
        "-t",
        type=str,
        default="Notulensi Rapat",
        help="Judul rapat (default: 'Notulensi Rapat')",
    )
    meta_group.add_argument(
        "--date", "-d", type=str, default=None, help="Tanggal rapat (default: hari ini)"
    )
    meta_group.add_argument("--location", "-l", type=str, default="", help="Lokasi/platform rapat")
    meta_group.add_argument(
        "--speakers",
        "-s",
        type=int,
        default=None,
        help="Jumlah speaker (opsional, auto-detect jika tidak disebut)",
    )

    meta_group.add_argument(
        "--speaker-map",
        type=str,
        default=None,
        help='Path ke JSON/YAML file yang memetakan speaker label (SPEAKER_00) ke nama (mis: {"SPEAKER_00": "Budi"})',
    )

    meta_group.add_argument(
        "--tune-diarization",
        action="store_true",
        help="Jalankan tuning hyperparameter diarization sebelum clustering (tries several settings)",
    )

    meta_group.add_argument(
        "--target-speakers",
        type=int,
        default=None,
        help="Target jumlah speaker untuk dipaksakan (opsional). Jika diset, pipeline akan mencoba merge cluster hingga jumlah ini.",
    )

    # Performance and tuning flags.
    # Renamed local from misc_group -> perf_group: the name misc_group was
    # previously rebound to the "Misc" group further down, which made the
    # code confusing to follow. Behavior is unchanged.
    perf_group = parser.add_argument_group("Performance")
    perf_group.add_argument(
        "--fast",
        action="store_true",
        help="Aktifkan modus cepat (mengorbankan sedikit akurasi demi kinerja)",
    )
    perf_group.add_argument(
        "--preset",
        type=str,
        choices=["deployment", "balanced", "fast", "accurate"],
        default="deployment",
        help="Preset pipeline yang merekomendasikan konfigurasi (default: deployment - prefer 'large-v3-turbo')",
    )
    perf_group.add_argument(
        "--quick-asr",
        action="store_true",
        help="Gunakan backend ASR lebih ringan/cepat (model kecil) jika memungkinkan (opsional override)",
    )
    perf_group.add_argument(
        "--prefer-whisper-small",
        action="store_true",
        help="Paksa penggunaan `openai/whisper-small` untuk ASR (lebih cepat, lebih ringan)",
    )
    perf_group.add_argument(
        "--cst-hz",
        type=float,
        default=None,
        help="(opsional) Approximate Continuous Speech Tokenizer token rate in Hz (e.g., 7.5). Applies lossy compression preprocessor for speed.",
    )
    perf_group.add_argument(
        "--diarization-compare",
        action="store_true",
        help="Jalankan perbandingan metode diarization (agglomerative vs spectral) selama evaluasi",
    )
    perf_group.add_argument(
        "--parallel-workers",
        type=int,
        default=None,
        help="Override jumlah worker paralel untuk per-segment ASR (default: auto berdasarkan CPU atau preset)",
    )
    perf_group.add_argument(
        "--no-embedding-cache",
        action="store_true",
        help="Nonaktifkan cache embeddings di disk (default: aktif)",
    )

    # Output settings
    output_group = parser.add_argument_group("Output")
    output_group.add_argument(
        "--output",
        "-o",
        type=str,
        default="./data/output",
        help="Direktori output (default: ./data/output)",
    )
    output_group.add_argument(
        "--filename",
        "-f",
        type=str,
        default=None,
        help="Nama file output (auto-generate jika tidak disebut)",
    )

    # Evaluation (reference files for WER / DER / summary metrics)
    eval_group = parser.add_argument_group("Evaluation")
    eval_group.add_argument("--evaluate", "-e", action="store_true", help="Aktifkan mode evaluasi")
    eval_group.add_argument(
        "--reference",
        "-r",
        type=str,
        default=None,
        help="Path ke file reference transcript untuk WER",
    )
    eval_group.add_argument(
        "--reference-rttm", type=str, default=None, help="Path ke file RTTM untuk DER"
    )
    eval_group.add_argument(
        "--reference-summary",
        type=str,
        default=None,
        help="Path ke file reference summary untuk evaluasi ringkasan (ROUGE/BERTScore)",
    )
    eval_group.add_argument(
        "--condition",
        type=str,
        default="unknown",
        help="Nama kondisi untuk evaluasi (misal: bersih, noisy)",
    )

    # Model settings
    model_group = parser.add_argument_group("Model Settings")
    model_group.add_argument(
        "--asr-model",
        type=str,
        default="large-v3-turbo",
        help="ASR model (HF model id / alias / path folder model lokal). Default: large-v3-turbo for better accuracy.",
    )
    model_group.add_argument(
        "--asr-backend",
        type=str,
        default="whisper",
        choices=["whisperx", "whisper", "transformers", "speechbrain"],
        help="Backend ASR (default: whisper)",
    )
    model_group.add_argument(
        "--asr-language",
        type=str,
        default="id",
        help="Kode bahasa (mis: id, en, auto). Untuk WhisperX: 'auto' = autodetect.",
    )
    model_group.add_argument(
        "--whisperx-compute-type",
        type=str,
        default="auto",
        help="WhisperX compute_type (auto|float16|int8|int8_float16). Default auto.",
    )
    model_group.add_argument(
        "--whisperx-no-vad-filter",
        action="store_true",
        help="Matikan VAD filter WhisperX (kadang berguna untuk audio sangat pendek/aneh).",
    )
    model_group.add_argument(
        "--device",
        type=str,
        default="auto",
        choices=["auto", "cuda", "cpu"],
        help="Device untuk inferensi (default: auto)",
    )

    # Misc
    misc_group = parser.add_argument_group("Misc")
    # NOTE(review): store_true with default=True makes --verbose a no-op;
    # verbosity is effectively controlled only by --quiet (see main()).
    # Left as-is to keep CLI behavior unchanged.
    misc_group.add_argument(
        "--verbose", "-v", action="store_true", default=True, help="Output verbose"
    )
    misc_group.add_argument("--quiet", "-q", action="store_true", help="Minimal output")
    misc_group.add_argument(
        "--no-save-intermediate", action="store_true", help="Jangan simpan hasil intermediate"
    )

    return parser.parse_args()
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def print_banner():
    """Print the ASCII-art application banner to stdout.

    Shown once at startup when verbose mode is active (see main()).
    """
    # Box-drawing banner; the literal is emitted verbatim, so do not reflow it.
    banner = """
    ╔══════════════════════════════════════════════════════════════════╗
    ║                                                                  ║
    ║   ███╗   ███╗███████╗███████╗████████╗██╗███╗   ██╗ ██████╗      ║
    ║   ████╗ ████║██╔════╝██╔════╝╚══██╔══╝██║████╗  ██║██╔════╝      ║
    ║   ██╔████╔██║█████╗  █████╗     ██║   ██║██╔██╗ ██║██║  ███╗     ║
    ║   ██║╚██╔╝██║██╔══╝  ██╔══╝     ██║   ██║██║╚██╗██║██║   ██║     ║
    ║   ██║ ╚═╝ ██║███████╗███████╗   ██║   ██║██║ ╚████║╚██████╔╝     ║
    ║   ╚═╝     ╚═╝╚══════╝╚══════╝   ╚═╝   ╚═╝╚═╝  ╚═══╝ ╚═════╝      ║
    ║                                                                  ║
    ║            TRANSCRIBER - Notulensi Rapat Otomatis                ║
    ║                 SpeechBrain + BERT Pipeline                      ║
    ║                                                                  ║
    ╚══════════════════════════════════════════════════════════════════╝
    """
    print(banner)
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def process_single_audio(args, pipeline: MeetingTranscriberPipeline) -> PipelineResult:
    """Run the full pipeline on a single audio file.

    Validates the input path, runs the pipeline, prints a result summary and
    (when ``--evaluate`` was passed) runs the evaluation step.

    Args:
        args: Parsed CLI namespace; reads audio, title, date, location,
            speakers, filename and evaluate.
        pipeline: An initialized pipeline instance.

    Returns:
        The PipelineResult returned by ``pipeline.process``.
    """

    # Validate audio file — presumably raises/exits on a bad path (defined
    # in src.utils; behavior not visible here).
    validate_audio_file(args.audio)

    print(f"\n{'='*60}")
    print(f"Processing: {args.audio}")
    print(f"{'='*60}")

    # Run pipeline (diarization + ASR + summarization + document generation)
    result = pipeline.process(
        audio_path=args.audio,
        title=args.title,
        date=args.date,
        location=args.location,
        num_speakers=args.speakers,
        output_filename=args.filename,
    )

    # Print summary
    print_result_summary(result)

    # Run evaluation if requested
    if args.evaluate:
        run_evaluation(args, pipeline, result)

    return result
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def process_batch(args, pipeline: MeetingTranscriberPipeline) -> List[PipelineResult]:
    """Process every audio file found in ``args.batch``.

    Failures on individual files are collected (not fatal); a batch summary
    is printed at the end. Exits the process with status 1 if the directory
    does not exist or contains no audio files.

    Args:
        args: Parsed CLI namespace; reads batch, date, location, speakers.
        pipeline: An initialized pipeline instance, reused for all files.

    Returns:
        List of PipelineResult for the files that succeeded.
    """

    batch_dir = Path(args.batch)

    if not batch_dir.is_dir():
        print(f"Error: Direktori tidak ditemukan: {args.batch}")
        sys.exit(1)

    # Find audio files (extension filtering lives in src.utils)
    audio_files = list_audio_files(batch_dir)

    if not audio_files:
        print(f"Tidak ada file audio ditemukan di: {args.batch}")
        sys.exit(1)

    print(f"\nDitemukan {len(audio_files)} file audio untuk diproses")
    print("-" * 60)

    results = []
    failed = []  # list of (filename, error message) pairs

    for i, audio_path in enumerate(audio_files, 1):
        print(f"\n[{i}/{len(audio_files)}] Processing: {audio_path.name}")

        try:
            # Generate title from filename: "sprint_review-1.wav" -> "Sprint Review 1"
            title = audio_path.stem.replace("_", " ").replace("-", " ").title()

            result = pipeline.process(
                audio_path=str(audio_path),
                title=title,
                date=args.date,
                location=args.location,
                num_speakers=args.speakers,
            )
            results.append(result)

            # Clear state for next file so results don't leak between runs
            pipeline.clear_state()

        except Exception as e:
            # Best-effort batch: record the failure and keep going
            print(f"Error processing {audio_path.name}: {e}")
            failed.append((audio_path.name, str(e)))
            continue

    # Print batch summary
    print_batch_summary(results, failed, audio_files)

    return results
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
def run_evaluation(args, pipeline: MeetingTranscriberPipeline, result: PipelineResult):
    """Evaluate a pipeline run against optional reference files.

    Loads whichever references the user supplied (transcript for WER, RTTM
    for DER, summary text for ROUGE/BERTScore), runs ``pipeline.evaluate``,
    prints the metrics and writes a timestamped report to ``args.output``.

    NOTE(review): this uses ``Path(args.audio)`` — in batch mode args.audio
    may be None, which would raise here; confirm it is only called from the
    single-file path.
    """

    print(f"\n{'='*60}")
    print("EVALUASI")
    print(f"{'='*60}")

    reference_transcript = None
    reference_diarization = None

    # Load reference transcript (for WER); missing file is a warning, not fatal
    if args.reference:
        if not os.path.exists(args.reference):
            print(f"Warning: File reference tidak ditemukan: {args.reference}")
        else:
            reference_transcript = parse_transcript_file(args.reference)
            print(f"Reference transcript loaded: {len(reference_transcript.split())} words")

    # Load reference diarization (for DER)
    if args.reference_rttm:
        if not os.path.exists(args.reference_rttm):
            print(f"Warning: File RTTM tidak ditemukan: {args.reference_rttm}")
        else:
            reference_diarization = parse_rttm_file(args.reference_rttm)
            print(f"Reference diarization loaded: {len(reference_diarization)} segments")
    else:
        # If user didn't provide an RTTM, try to find a *_vibevoice.rttm for the sample
        # (a project convention for synthetic ground truth); silently skipped on error.
        try:
            audio_stem = Path(args.audio).stem
            cand = Path("data/ground_truth") / f"{audio_stem}_vibevoice.rttm"
            if cand.exists():
                reference_diarization = parse_rttm_file(str(cand))
                print(f"Reference RTTM auto-loaded: {cand} ({len(reference_diarization)} segments)")
        except Exception:
            pass

    # Load reference summary (optional; for ROUGE/BERTScore)
    reference_summary = None
    if getattr(args, "reference_summary", None):
        if not os.path.exists(args.reference_summary):
            print(f"Warning: File reference summary tidak ditemukan: {args.reference_summary}")
        else:
            try:
                reference_summary = Path(args.reference_summary).read_text(encoding="utf-8")
                print(f"Reference summary loaded (len={len(reference_summary.split())} words)")
            except Exception as e:
                print(f"Warning: gagal membaca file summary: {e}")

    # Run evaluation against whatever references were loaded (each may be None)
    eval_result = pipeline.evaluate(
        reference_transcript=reference_transcript,
        reference_diarization=reference_diarization,
        reference_summary=reference_summary,
        sample_name=Path(args.audio).stem,
        condition=args.condition,
    )

    # Print evaluation results
    print_evaluation_results(eval_result)

    # Generate and save report
    evaluator = Evaluator(output_dir=args.output)

    # Wrap single-sample results in lists, as the report API expects batches
    wer_results = [eval_result.wer_result] if eval_result.wer_result else []
    der_results = [eval_result.der_result] if eval_result.der_result else []

    # Pass evaluation metadata for reproducibility & documentation
    report = evaluator.generate_evaluation_report(
        wer_results=wer_results,
        der_results=der_results,
        summary_results=[eval_result.summary_result] if eval_result.summary_result else None,
        sample_names=[eval_result.sample_name],
        condition_name=args.condition,
        metadata=eval_result.metadata,
    )

    # Save report with a timestamped filename to avoid overwriting earlier runs
    report_path = evaluator.save_report(
        report,
        f"evaluation_{eval_result.sample_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
    )
    print(f"\nEvaluation report saved: {report_path}")
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
def print_result_summary(result: PipelineResult):
    """Print a framed summary of one pipeline run's statistics to stdout."""
    rule = "=" * 60
    print(f"\n{rule}")
    print("HASIL PEMROSESAN")
    print(f"{rule}")

    # Table-driven rendering: one (label, value) row per statistic.
    rows = (
        ("Audio Duration", format_duration(result.audio_duration)),
        ("Speakers Found", result.num_speakers),
        ("Total Segments", result.num_segments),
        ("Total Words", result.total_words),
        ("Processing Time", format_duration(result.processing_time)),
        ("Output Document", result.document_path),
    )
    for label, value in rows:
        print(f" {label} : {value}")

    print(f"{rule}")
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
def print_evaluation_results(eval_result: EvaluationResult):
    """Print WER, DER and summary metrics for an evaluation run.

    Sections are skipped entirely when the corresponding result is missing.
    """
    print("\n--- Hasil Evaluasi ---")

    w = eval_result.wer_result
    if w:
        print("\nWord Error Rate (WER):")
        print(f" WER : {w.wer:.4f} ({w.wer*100:.2f}%)")
        print(f" Substitutions : {w.substitutions}")
        print(f" Deletions : {w.deletions}")
        print(f" Insertions : {w.insertions}")
        print(f" Correct : {w.hits}")

    d = eval_result.der_result
    if d:
        print("\nDiarization Error Rate (DER):")
        print(f" DER : {d.der:.4f} ({d.der*100:.2f}%)")
        print(f" Missed Speech : {d.missed_speech:.4f} ({d.missed_speech*100:.2f}%)")
        print(f" False Alarm : {d.false_alarm:.4f} ({d.false_alarm*100:.2f}%)")
        print(
            f" Speaker Confusion : {d.speaker_confusion:.4f} ({d.speaker_confusion*100:.2f}%)"
        )

    # Summary metrics (if available); guarded because the metric dicts may
    # be missing or malformed.
    summ = eval_result.summary_result
    if summ:
        print("\nRingkasan (Summary) Evaluation:")
        try:
            print(f" ROUGE-1 F1 : {summ.rouge.get('rouge1_f', 0.0):.4f}")
            print(f" ROUGE-2 F1 : {summ.rouge.get('rouge2_f', 0.0):.4f}")
            print(f" ROUGE-L F1 : {summ.rouge.get('rougel_f', 0.0):.4f}")
            print(f" BERTScore F1 : {summ.bertscore.get('bertscore_f1', 0.0):.4f}")
        except Exception as e:
            print(f" (failed to print summary metrics: {e})")
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
def print_batch_summary(
    results: List[PipelineResult], failed: List[tuple], total_files: List[Path]
):
    """Print a summary of a batch processing run.

    Args:
        results: Successfully processed pipeline results.
        failed: ``(filename, error_message)`` pairs for files that failed.
        total_files: Every audio file that was scheduled for processing.
    """
    print(f"\n{'='*60}")
    print("RINGKASAN BATCH PROCESSING")
    print(f"{'='*60}")
    print(f" Total files : {len(total_files)}")
    print(f" Successful : {len(results)}")
    print(f" Failed : {len(failed)}")

    if results:
        # Aggregate timing stats only over successful runs.
        total_duration = sum(r.audio_duration for r in results)
        total_time = sum(r.processing_time for r in results)
        avg_time = total_time / len(results)

        print(f" Total audio : {format_duration(total_duration)}")
        print(f" Total proc. time : {format_duration(total_time)}")
        print(f" Avg time/file : {format_duration(avg_time)}")

    if failed:
        print("\n Failed files:")
        for filename, error in failed:
            # Bug fix: previously printed the literal "(unknown)" and discarded
            # the unpacked filename; report the actual failing file instead.
            # Error text is truncated to keep the summary readable.
            print(f" - {filename}: {error[:50]}...")

    print(f"{'='*60}")
|
| 521 |
+
|
| 522 |
+
def main():
    """CLI entry point: parse arguments, configure the pipeline and run it."""
    args = parse_args()

    # --quiet always wins over --verbose.
    verbose = args.verbose and not args.quiet

    if verbose:
        print_banner()

    # Refuse to run without an input source.
    if not (args.audio or args.batch):
        print("Error: Harap tentukan --audio atau --batch")
        print("Gunakan --help untuk informasi penggunaan")
        sys.exit(1)

    # Resolve the "auto" device to whatever hardware is actually available.
    device = args.device
    if device == "auto":
        import torch

        device = "cuda" if torch.cuda.is_available() else "cpu"

    if verbose:
        # Echo the effective run settings.
        for label, value in (
            ("\nDevice", device),
            ("ASR Backend", args.asr_backend),
            ("ASR Model", args.asr_model),
            ("ASR Language", args.asr_language),
            ("Output Dir", args.output),
        ):
            print("{}: {}".format(label, value))

    # Translate CLI flags into the pipeline configuration.
    pipeline = MeetingTranscriberPipeline(
        PipelineConfig(
            output_dir=args.output,
            asr_model_id=args.asr_model,
            asr_backend=args.asr_backend,
            asr_language=args.asr_language,
            whisperx_compute_type=args.whisperx_compute_type,
            whisperx_vad_filter=not args.whisperx_no_vad_filter,
            device=device,
            verbose=verbose,
            save_intermediate=not args.no_save_intermediate,
            fast_mode=args.fast,
            quick_asr=args.quick_asr,
            prefer_whisper_small=args.prefer_whisper_small,
            cst_hz=args.cst_hz,
            diarization_compare=args.diarization_compare,
            embedding_cache=not args.no_embedding_cache,
            target_speakers=args.target_speakers,
            # Newer tuning / speaker-mapping options
            asr_parallel_workers=args.parallel_workers,
            speaker_map_path=args.speaker_map,
            tune_diarization=args.tune_diarization,
            num_speakers=args.speakers,
            preset=args.preset,
        )
    )

    # Dispatch to batch or single-file processing.
    try:
        handler = process_batch if args.batch else process_single_audio
        handler(args, pipeline)

        print("\nSelesai!")

    except KeyboardInterrupt:
        print("\n\nProses dibatalkan oleh user")
        sys.exit(1)
    except Exception as exc:
        print(f"\nError: {exc}")
        if verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
|
requirements.txt
CHANGED
|
@@ -1,3 +1,72 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# REQUIREMENTS - Sistem Notulensi Rapat Otomatis
|
| 3 |
+
# SpeechBrain + BERT Pipeline
|
| 4 |
+
# =============================================================================
|
| 5 |
+
|
| 6 |
+
# Core Deep Learning
|
| 7 |
+
torch>=2.0.0
|
| 8 |
+
torchaudio>=2.0.0
|
| 9 |
+
|
| 10 |
+
# Speech Processing (SpeechBrain)
|
| 11 |
+
speechbrain>=0.5.15
|
| 12 |
+
|
| 13 |
+
# NLP & Transformers
|
| 14 |
+
transformers>=4.30.0
|
| 15 |
+
sentence-transformers>=2.2.0
|
| 16 |
+
tokenizers>=0.13.0
|
| 17 |
+
|
| 18 |
+
# Audio Processing
|
| 19 |
+
librosa>=0.10.0
|
| 20 |
+
soundfile>=0.12.0
|
| 21 |
+
pydub>=0.25.1
|
| 22 |
+
webrtcvad>=2.0.10
|
| 23 |
+
|
| 24 |
+
# Document Generation
|
| 25 |
+
python-docx>=0.8.11
|
| 26 |
+
|
| 27 |
+
# Evaluation Metrics
|
| 28 |
+
jiwer>=3.0.0
|
| 29 |
+
|
| 30 |
+
# Data Processing
|
| 31 |
+
numpy>=1.24.0
|
| 32 |
+
pandas>=2.0.0
|
| 33 |
+
scipy>=1.10.0
|
| 34 |
+
|
| 35 |
+
# Machine Learning
|
| 36 |
+
scikit-learn>=1.3.0
|
| 37 |
+
|
| 38 |
+
# Visualization
|
| 39 |
+
matplotlib>=3.7.0
|
| 40 |
+
seaborn>=0.12.0
|
| 41 |
+
|
| 42 |
+
# Configuration
|
| 43 |
+
pyyaml>=6.0
|
| 44 |
+
python-dotenv>=1.0.0
|
| 45 |
+
|
| 46 |
+
# Utilities
|
| 47 |
+
tqdm>=4.65.0
|
| 48 |
+
colorama>=0.4.6
|
| 49 |
+
tabulate>=0.9.0
|
| 50 |
+
|
| 51 |
+
# Jupyter (untuk notebooks)
|
| 52 |
+
jupyter>=1.0.0
|
| 53 |
+
ipywidgets>=8.0.0
|
| 54 |
+
|
| 55 |
+
# Testing
|
| 56 |
+
pytest>=7.0.0
|
| 57 |
+
|
| 58 |
+
# Web UI
|
| 59 |
+
streamlit>=1.18.0
|
| 60 |
+
|
| 61 |
+
# Optional: GPU monitoring
|
| 62 |
+
# nvidia-ml-py>=12.0.0
|
| 63 |
+
|
| 64 |
+
# Training & Evaluation (for Whisper fine-tuning)
|
| 65 |
+
datasets>=2.14.0
|
| 66 |
+
evaluate>=0.4.0
|
| 67 |
+
accelerate>=0.20.3
|
| 68 |
+
peft>=0.4.0
|
| 69 |
+
transformers[torch]>=4.30.0
|
| 70 |
+
ffmpeg-python>=0.1.18
|
| 71 |
+
langdetect>=1.0.9
|
| 72 |
+
whisperx>=1.0.0
|
setup.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Setup script for Meeting Transcriber package
"""

from setuptools import find_packages, setup

# Long description comes straight from the project README.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Parse requirements.txt, skipping blank lines and comment lines.
# Bug fix: strip each line *before* the comment check so indented
# "  # comment" lines are not picked up as requirements.
requirements = []
with open("requirements.txt", "r", encoding="utf-8") as fh:
    for raw_line in fh:
        requirement = raw_line.strip()
        if requirement and not requirement.startswith("#"):
            requirements.append(requirement)

setup(
    name="meeting-transcriber",
    version="1.0.0",
    author="Yermia Turangan",
    author_email="yermiaturangan026@student.unsrat.ac.id",
    description="Sistem Notulensi Rapat Otomatis berbasis SpeechBrain dan BERT",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/username/meeting-transcriber",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Multimedia :: Sound/Audio :: Speech",
    ],
    python_requires=">=3.8",
    install_requires=requirements,
    entry_points={
        # Expose the CLI pipeline as a console command.
        "console_scripts": [
            "meeting-transcriber=main:main",
        ],
    },
)
|
streamlit_app.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import tempfile
from pathlib import Path

import streamlit as st

from src.pipeline import MeetingTranscriberPipeline, PipelineConfig

# Page-wide configuration must be the first Streamlit call in the script.
st.set_page_config(page_title="Meeting Transcriber", layout="wide")

st.title("Meeting Transcriber — Demo")
st.markdown(
    "Upload an audio file or pick a sample to generate transcript, summary and downloadable DOCX."
)
+
|
| 16 |
+
# Sample audio chooser
|
| 17 |
+
AUDIO_DIR = Path.cwd() / "data" / "audio"
# Build safe sample list: prefer paths relative to cwd, but fall back to absolute paths if not possible
SAMPLES = []
# rglob searches recursively, so sample files in nested folders are picked up too.
for p in AUDIO_DIR.rglob("*.mp3"):
    try:
        SAMPLES.append(str(p.relative_to(Path.cwd())))
    except ValueError:
        # Path is not under cwd (different drive or external mount), use absolute path
        SAMPLES.append(str(p.resolve()))
| 26 |
+
|
| 27 |
+
with st.sidebar:
    st.header("Settings")
    # Detect deployment target (e.g., set STREAMLIT_DEPLOY_TARGET=community on Streamlit Cloud)
    deploy_target = os.getenv("STREAMLIT_DEPLOY_TARGET", "")
    # Community Cloud has no GPU and limited CPU/time; default to 'fast' preset there
    default_index = 0
    default_quick_asr = False
    if deploy_target.lower() == "community":
        default_index = 2  # index into the preset list below -> 'fast'
        default_quick_asr = True
        st.info(
            "Running in Streamlit Community mode: using fast preset and quick ASR for responsiveness."
        )

    preset = st.selectbox(
        "Preset", ["deployment", "balanced", "fast", "accurate"], index=default_index
    )
    quick_asr = st.checkbox("Quick ASR (override)", value=default_quick_asr)
    # 0 lets the pipeline choose the worker count itself (see the button handler below).
    parallel_workers = st.number_input(
        "Parallel workers (0 = auto)", min_value=0, max_value=16, value=0
    )
    sample_choice = st.selectbox("Pick sample audio (optional)", ["None"] + SAMPLES)
| 49 |
+
|
| 50 |
+
uploaded_file = st.file_uploader("Upload audio (.wav, .mp3, .m4a)")

# Determine audio path
# An upload takes priority over a sample selection.
audio_path = None
if uploaded_file is not None:
    # Persist the uploaded bytes to a temp file so downstream code can use a filesystem path.
    tmpdir = tempfile.gettempdir()
    tmp_path = Path(tmpdir) / uploaded_file.name
    with open(tmp_path, "wb") as f:
        f.write(uploaded_file.read())
    audio_path = str(tmp_path)
elif sample_choice and sample_choice != "None":
    audio_path = sample_choice

if not audio_path:
    st.info("Upload an audio file or pick a sample from the sidebar to begin.")
| 65 |
+
|
| 66 |
+
# Interactive flow: run diarization first and allow manual mapping
|
| 67 |
+
# Clear existing session state if user changed audio selection
|
| 68 |
+
# Reset cached diarization artifacts when the selected audio differs from the
# one the current session state was built for.
if "diarization_done" in st.session_state and st.session_state.get("audio_path") != audio_path:
    # Keep only unrelated session keys
    for k in [
        "diarization_done",
        "pipeline",
        "dz_res",
        "sample_segments",
        "snippet_transcripts",
        "result",
        "mapping",
    ]:
        if k in st.session_state:
            del st.session_state[k]
| 81 |
+
|
| 82 |
+
if st.button("Run diarization only"):
    if not audio_path:
        st.error("Please provide audio first.")
    else:
        # Build a fresh pipeline from the sidebar settings.
        cfg = PipelineConfig(preset=preset, quick_asr=quick_asr)
        if parallel_workers and parallel_workers > 0:
            # Only override when the user picked an explicit count; 0 means auto.
            cfg.asr_parallel_workers = int(parallel_workers)
        pipeline = MeetingTranscriberPipeline(cfg)

        with st.spinner("Running diarization..."):
            try:
                dz_res = pipeline.run_diarization(audio_path)
                st.success("Diarization complete")
            except Exception as e:
                # Surface the error in the UI, then re-raise so Streamlit logs it.
                st.error(f"Diarization failed: {e}")
                raise

        # Persist state so interactive widgets survive reruns
        st.session_state["diarization_done"] = True
        st.session_state["pipeline"] = pipeline
        st.session_state["dz_res"] = dz_res
        st.session_state["audio_path"] = audio_path
| 104 |
+
|
| 105 |
+
# If we already have diarization state (either just-run or from previous interaction), show mapping UI
|
| 106 |
+
if st.session_state.get("diarization_done") and audio_path:
    pipeline = st.session_state["pipeline"]
    dz_res = st.session_state["dz_res"]

    # dz_res is used as a dict with 'unique_speakers' and 'num_segments' keys.
    st.write(
        f"Detected {len(dz_res['unique_speakers'])} speakers and {dz_res['num_segments']} segments"
    )

    # Playable sample and quick per-speaker snippets so user can listen/read before mapping
    st.subheader("Sample snippets (listen + read before mapping)")

    # Try to reuse cached sample snippets if present
    sample_segments = st.session_state.get("sample_segments") or {}
    snippet_transcripts = st.session_state.get("snippet_transcripts") or {}

    if not sample_segments:
        try:
            # NOTE(review): reaches into the pipeline's private diarization
            # segment list — assumes it is populated after run_diarization.
            dsegs = pipeline._diarization_segments or []
            for spk in dz_res["unique_speakers"]:
                cand = [s for s in dsegs if s.speaker_id == spk]
                if not cand:
                    continue
                # Use the speaker's longest segment, capped at 10 seconds.
                best = max(cand, key=lambda x: x.duration)
                cap_end = min(best.end, best.start + 10.0)
                from src.diarization import SpeakerSegment

                sample_segments[spk] = SpeakerSegment(
                    speaker_id=best.speaker_id,
                    start=best.start,
                    end=cap_end,
                    confidence=best.confidence,
                    is_overlap=best.is_overlap,
                    metadata=best.metadata.copy() if getattr(best, "metadata", None) else {},
                )
            st.session_state["sample_segments"] = sample_segments
        except Exception as e:
            st.warning(f"Could not prepare sample segments: {e}")
            sample_segments = {}

    # Run quick per-segment ASR for the sample snippets (avoid full-audio mapping for speed)
    if not snippet_transcripts and sample_segments:
        try:
            # Temporarily force single-worker, per-segment transcription on the
            # shared transcriber config; original values are restored below.
            transcriber = pipeline.transcriber
            orig_full_audio = getattr(transcriber.config, "use_full_audio_for_segments", False)
            transcriber.config.use_full_audio_for_segments = False
            orig_workers = getattr(transcriber.config, "parallel_workers", 1)
            transcriber.config.parallel_workers = 1

            transcripts = transcriber.transcribe_segments(
                pipeline._waveform, list(sample_segments.values()), pipeline._sample_rate
            )
            for t in transcripts:
                snippet_transcripts[t.speaker_id] = t.text

            # Restore the transcriber config mutated above.
            transcriber.config.use_full_audio_for_segments = orig_full_audio
            transcriber.config.parallel_workers = orig_workers

            st.session_state["snippet_transcripts"] = snippet_transcripts
        except Exception as e:
            st.warning(f"Quick snippet transcription failed: {e}")

    # Display snippets in columns with audio player + short transcript
    import tempfile

    import soundfile as sf

    mapping = st.session_state.get("mapping") or {}
    st.subheader("Manual speaker mapping")
    audio_id = Path(audio_path).stem
    for spk in dz_res["unique_speakers"]:
        with st.expander(f"Speaker: {spk}"):
            col1, col2 = st.columns([1, 2])
            with col1:
                seg = sample_segments.get(spk)
                if seg is not None:
                    try:
                        # Slice the cached waveform and write a temp WAV for st.audio.
                        sr = pipeline._sample_rate
                        start_sample = int(seg.start * sr)
                        end_sample = int(seg.end * sr)
                        audio_np = (
                            pipeline._waveform[:, start_sample:end_sample].squeeze().cpu().numpy()
                        )
                        # delete=False: the player needs the file after this block;
                        # NOTE(review): these temp WAVs are never cleaned up explicitly.
                        tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
                        sf.write(tmpf.name, audio_np, sr)
                        st.audio(tmpf.name)
                    except Exception as e:
                        st.warning(f"Could not prepare audio snippet: {e}")
                else:
                    st.write("No sample segment available for this speaker")
            with col2:
                st.write("**Sample transcript:**")
                st.write(snippet_transcripts.get(spk, "(no transcription available)"))
                key = f"map_{audio_id}_{spk}"
                # Preserve user input across reruns by using session state keys
                default_val = mapping.get(spk, spk)
                mapping_val = st.text_input(f"Map {spk} to name", value=default_val, key=key)
                mapping[spk] = mapping_val

    st.session_state["mapping"] = mapping

    if st.button("Apply mapping and continue processing"):
        pipeline.apply_speaker_map(mapping, save_to_cache=True, audio_id=audio_id)
        with st.spinner("Running full processing..."):
            try:
                res = pipeline.continue_from_diarization(title="Streamlit run")
                st.session_state["result"] = res
                st.success("Processing complete")
            except Exception as e:
                # Show in UI, then re-raise so Streamlit records the traceback.
                st.error(f"Processing failed: {e}")
                raise
| 216 |
+
|
| 217 |
+
# If result available, display
|
| 218 |
+
# Render the finished pipeline result stored in session state.
if st.session_state.get("result"):
    res = st.session_state["result"]
    st.subheader("Summary")
    st.json(res.summary or {})

    st.subheader("Transcript (first 5000 characters)")
    # Truncate to keep the page responsive for long meetings.
    st.text(res.transcript_text[:5000])

    # Offer the generated DOCX for download when it exists on disk.
    if res.document_path and os.path.exists(res.document_path):
        with open(res.document_path, "rb") as fh:
            doc_bytes = fh.read()
        st.download_button(
            "Download .docx", data=doc_bytes, file_name=Path(res.document_path).name
        )

    st.write("---")
    st.write("Processing metadata:")
    st.write(
        {
            "Audio duration": res.audio_duration,
            "Speakers found": res.num_speakers,
            "Segments": res.num_segments,
            "Total words": res.total_words,
            "Processing time (s)": res.processing_time,
        }
    )

    st.balloons()
| 246 |
+
|
| 247 |
+
# Allow clearing state
|
| 248 |
+
# Manual reset: drop every diarization-related session key so the
# interactive flow can be restarted from scratch.
if st.button("Clear diarization state"):
    for k in [
        "diarization_done",
        "pipeline",
        "dz_res",
        "sample_segments",
        "snippet_transcripts",
        "result",
        "mapping",
    ]:
        if k in st.session_state:
            del st.session_state[k]