Yermia commited on
Commit
920b714
·
verified ·
1 Parent(s): 72508c3

Upload 6 files

Browse files
Files changed (6) hide show
  1. Dockerfile +21 -20
  2. config.yaml +209 -0
  3. main.py +602 -0
  4. requirements.txt +72 -3
  5. setup.py +43 -0
  6. streamlit_app.py +259 -0
Dockerfile CHANGED
@@ -1,20 +1,21 @@
1
- FROM python:3.13.5-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- git \
9
- && rm -rf /var/lib/apt/lists/*
10
-
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
13
-
14
- RUN pip3 install -r requirements.txt
15
-
16
- EXPOSE 8501
17
-
18
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
-
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
1
+ FROM python:3.11-slim
2
+
3
+ # System deps
4
+ RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ ffmpeg \
6
+ && rm -rf /var/lib/apt/lists/*
7
+
8
+ WORKDIR /app
9
+
10
+ # Copy project
11
+ COPY . /app
12
+
13
+ # Install Python deps
14
+ RUN pip install --upgrade pip
15
+ RUN pip install -r requirements.txt
16
+
17
+ # Expose Streamlit port
18
+ EXPOSE 8501
19
+
20
+ # Run Streamlit
21
+ CMD ["streamlit", "run", "streamlit_app.py", "--server.port", "8501", "--server.address", "0.0.0.0"]
config.yaml ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # CONFIG.YAML - Konfigurasi Sistem Notulensi Rapat Otomatis
3
+ # =============================================================================
4
+
5
+ # Informasi Proyek
6
+ project:
7
+ name: "Meeting Transcriber"
8
+ version: "1.0.0"
9
+ author: "Yermia Turangan"
10
+ description: "Sistem Notulensi Rapat Otomatis berbasis SpeechBrain dan BERT"
11
+
12
+ # Konfigurasi Audio
13
+ audio:
14
+ sample_rate: 16000
15
+ mono: true
16
+ normalize: true
17
+ trim_silence: false
18
+ max_duration_minutes: 60
19
+
20
+ # Konfigurasi Speaker Diarization
21
+ diarization:
22
+ # Voice Activity Detection
23
+ vad:
24
+ threshold: 0.5
25
+ min_speech_duration: 0.3
26
+ min_silence_duration: 0.3
27
+ speech_pad_ms: 30
28
+
29
+ # Segmentation
30
+ segmentation:
31
+ window_duration: 1.5
32
+ window_hop: 0.75
33
+ min_segment_duration: 0.5
34
+
35
+ # Speaker Embedding
36
+ embedding:
37
+ model_id: "speechbrain/spkrec-ecapa-voxceleb"
38
+ embedding_dim: 192
39
+
40
+ # Clustering
41
+ clustering:
42
+ method: "agglomerative" # agglomerative, spectral, kmeans
43
+ threshold: 0.7
44
+ min_cluster_size: 2
45
+ linkage: "average"
46
+
47
+ # Post-processing
48
+ postprocessing:
49
+ merge_gap_threshold: 0.5
50
+ min_segment_duration: 0.3
51
+ smooth_segments: true
52
+
53
+ # Konfigurasi ASR (Speech Recognition)
54
+ asr:
55
+ model_id: "openai/whisper-base"
56
+ # model_id: "indonesian-nlp/wav2vec2-large-xlsr-indonesian"
57
+ # Alternatif: "facebook/wav2vec2-large-xlsr-53"
58
+ chunk_length_s: 30
59
+ stride_length_s: 5
60
+ batch_size: 4
61
+ return_timestamps: false
62
+ backend: "transformers" # options: 'transformers'|'whisper'|'speechbrain'
63
+
64
+ # Post-processing teks
65
+ text_postprocessing:
66
+ capitalize_sentences: true
67
+ normalize_whitespace: true
68
+ add_punctuation: false # Bisa diaktifkan jika ada model punctuation
69
+
70
+ # Konfigurasi BERT Summarization
71
+ summarization:
72
+ # Model
73
+ model_id: "indobenchmark/indobert-base-p1"
74
+ sentence_model_id: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
75
+
76
+ # Extractive Summarization Settings
77
+ extractive:
78
+ num_sentences: 5
79
+ min_sentence_length: 10
80
+ max_sentence_length: 200
81
+ position_weight: 0.1
82
+ similarity_threshold: 0.3
83
+
84
+ # Keyword Detection
85
+ keywords:
86
+ decisions:
87
+ - "diputuskan"
88
+ - "disepakati"
89
+ - "kesimpulan"
90
+ - "keputusan"
91
+ - "jadi"
92
+ - "maka"
93
+ - "sepakat"
94
+ - "setuju"
95
+ - "final"
96
+ - "kesepakatan"
97
+
98
+ action_items:
99
+ - "akan"
100
+ - "harus"
101
+ - "perlu"
102
+ - "tolong"
103
+ - "mohon"
104
+ - "deadline"
105
+ - "target"
106
+ - "tugas"
107
+ - "tanggung jawab"
108
+ - "action item"
109
+ - "follow up"
110
+ - "tindak lanjut"
111
+ - "dikerjakan"
112
+ - "selesaikan"
113
+
114
+ # Konfigurasi Document Generation
115
+ document:
116
+ template: "default"
117
+
118
+ # Struktur dokumen
119
+ sections:
120
+ header: true
121
+ meeting_info: true
122
+ summary: true
123
+ decisions: true
124
+ action_items: true
125
+ transcript: true
126
+ footer: true
127
+
128
+ # Formatting
129
+ formatting:
130
+ title_font_size: 18
131
+ heading_font_size: 14
132
+ body_font_size: 11
133
+ font_family: "Calibri"
134
+ include_timestamps: true
135
+ include_speaker_colors: true
136
+
137
+ # Output
138
+ output:
139
+ directory: "./data/output"
140
+ filename_template: "notulensi_{title}_{date}_{timestamp}"
141
+
142
+ # Konfigurasi Evaluasi
143
+ evaluation:
144
+ # WER Settings
145
+ wer:
146
+ lowercase: true
147
+ remove_punctuation: true
148
+ normalize_whitespace: true
149
+
150
+ # DER Settings
151
+ der:
152
+ collar: 0.25 # Forgiveness collar in seconds
153
+ skip_overlap: false
154
+
155
+ # Output
156
+ output:
157
+ save_detailed_results: true
158
+ generate_plots: true
159
+ export_csv: true
160
+
161
+ # Konfigurasi Hardware
162
+ hardware:
163
+ device: "auto" # auto, cuda, cpu
164
+ num_workers: 4
165
+ pin_memory: true
166
+
167
+ # Memory management
168
+ max_batch_size: 8
169
+ gradient_checkpointing: false
170
+
171
+ # Konfigurasi Paths
172
+ paths:
173
+ models_dir: "./models"
174
+ audio_dir: "./data/audio"
175
+ ground_truth_dir: "./data/ground_truth"
176
+ output_dir: "./data/output"
177
+ cache_dir: "./cache"
178
+ logs_dir: "./logs"
179
+
180
+ # Konfigurasi Logging
181
+ logging:
182
+ level: "INFO" # DEBUG, INFO, WARNING, ERROR
183
+ format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
184
+ save_to_file: true
185
+ log_file: "./logs/pipeline.log"
186
+
187
+ # Eksperimen (untuk skripsi)
188
+ experiment:
189
+ name: "baseline_evaluation"
190
+ conditions:
191
+ - name: "bersih"
192
+ description: "Audio bersih, ruangan tenang"
193
+ expected_wer: 0.15
194
+ expected_der: 0.15
195
+
196
+ - name: "noisy"
197
+ description: "Audio dengan background noise"
198
+ expected_wer: 0.25
199
+ expected_der: 0.25
200
+
201
+ - name: "overlap"
202
+ description: "Audio dengan overlapping speech"
203
+ expected_wer: 0.35
204
+ expected_der: 0.40
205
+
206
+ - name: "multispeaker"
207
+ description: "Audio dengan 4-6 speaker"
208
+ expected_wer: 0.25
209
+ expected_der: 0.35
main.py ADDED
@@ -0,0 +1,602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Main Entry Point - Meeting Transcriber System
4
+ ==============================================
5
+
6
+ Automatic Meeting Minutes Generation using SpeechBrain + BERT
7
+
8
+ Usage:
9
+ # Basic transcription
10
+ python main.py --audio meeting.wav --title "Team Meeting"
11
+
12
+ # With evaluation
13
+ python main.py --audio meeting.wav --evaluate --reference transcript.txt
14
+
15
+ # Batch processing
16
+ python main.py --batch ./audio_folder/ --output ./results/
17
+
18
+ # Specify number of speakers
19
+ python main.py --audio meeting.wav --speakers 4
20
+ """
21
+
22
+ import argparse
23
+ import os
24
+ import sys
25
+ from datetime import datetime
26
+ from pathlib import Path
27
+ from typing import List
28
+
29
+ from src.evaluator import EvaluationResult, Evaluator
30
+ from src.pipeline import MeetingTranscriberPipeline, PipelineConfig, PipelineResult
31
+ from src.utils import (
32
+ format_duration,
33
+ list_audio_files,
34
+ parse_rttm_file,
35
+ parse_transcript_file,
36
+ validate_audio_file,
37
+ )
38
+
39
+
40
def parse_args():
    """Build the CLI argument parser and return the parsed namespace."""
    parser = argparse.ArgumentParser(
        description="Sistem Notulensi Rapat Otomatis (SpeechBrain + BERT)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Contoh Penggunaan:
==================

# Transkripsi dasar
python main.py --audio rapat.wav

# Dengan detail rapat
python main.py --audio rapat.wav --title "Rapat Sprint" --speakers 4 --location "Zoom"

# Dengan evaluasi WER
python main.py --audio rapat.wav --evaluate --reference transkrip_manual.txt

# Batch processing
python main.py --batch ./folder_audio/ --output ./hasil/

Untuk dokumentasi lengkap, lihat README.md
""",
    )

    # --- Input sources (one of --audio / --batch is required; validated in main) ---
    grp_input = parser.add_argument_group("Input")
    grp_input.add_argument(
        "--audio", "-a", type=str, help="Path ke file audio (.wav, .mp3, .m4a)"
    )
    grp_input.add_argument(
        "--batch", "-b", type=str, help="Direktori berisi file audio untuk batch processing"
    )

    # --- Meeting metadata embedded into the generated document ---
    grp_meta = parser.add_argument_group("Meeting Metadata")
    grp_meta.add_argument(
        "--title",
        "-t",
        type=str,
        default="Notulensi Rapat",
        help="Judul rapat (default: 'Notulensi Rapat')",
    )
    grp_meta.add_argument(
        "--date", "-d", type=str, default=None, help="Tanggal rapat (default: hari ini)"
    )
    grp_meta.add_argument("--location", "-l", type=str, default="", help="Lokasi/platform rapat")
    grp_meta.add_argument(
        "--speakers",
        "-s",
        type=int,
        default=None,
        help="Jumlah speaker (opsional, auto-detect jika tidak disebut)",
    )
    grp_meta.add_argument(
        "--speaker-map",
        type=str,
        default=None,
        help='Path ke JSON/YAML file yang memetakan speaker label (SPEAKER_00) ke nama (mis: {"SPEAKER_00": "Budi"})',
    )
    grp_meta.add_argument(
        "--tune-diarization",
        action="store_true",
        help="Jalankan tuning hyperparameter diarization sebelum clustering (tries several settings)",
    )
    grp_meta.add_argument(
        "--target-speakers",
        type=int,
        default=None,
        help="Target jumlah speaker untuk dipaksakan (opsional). Jika diset, pipeline akan mencoba merge cluster hingga jumlah ini.",
    )

    # --- Performance / tuning knobs ---
    grp_perf = parser.add_argument_group("Performance")
    grp_perf.add_argument(
        "--fast",
        action="store_true",
        help="Aktifkan modus cepat (mengorbankan sedikit akurasi demi kinerja)",
    )
    grp_perf.add_argument(
        "--preset",
        type=str,
        choices=["deployment", "balanced", "fast", "accurate"],
        default="deployment",
        help="Preset pipeline yang merekomendasikan konfigurasi (default: deployment - prefer 'large-v3-turbo')",
    )
    grp_perf.add_argument(
        "--quick-asr",
        action="store_true",
        help="Gunakan backend ASR lebih ringan/cepat (model kecil) jika memungkinkan (opsional override)",
    )
    grp_perf.add_argument(
        "--prefer-whisper-small",
        action="store_true",
        help="Paksa penggunaan `openai/whisper-small` untuk ASR (lebih cepat, lebih ringan)",
    )
    grp_perf.add_argument(
        "--cst-hz",
        type=float,
        default=None,
        help="(opsional) Approximate Continuous Speech Tokenizer token rate in Hz (e.g., 7.5). Applies lossy compression preprocessor for speed.",
    )
    grp_perf.add_argument(
        "--diarization-compare",
        action="store_true",
        help="Jalankan perbandingan metode diarization (agglomerative vs spectral) selama evaluasi",
    )
    grp_perf.add_argument(
        "--parallel-workers",
        type=int,
        default=None,
        help="Override jumlah worker paralel untuk per-segment ASR (default: auto berdasarkan CPU atau preset)",
    )
    grp_perf.add_argument(
        "--no-embedding-cache",
        action="store_true",
        help="Nonaktifkan cache embeddings di disk (default: aktif)",
    )

    # --- Output location / naming ---
    grp_out = parser.add_argument_group("Output")
    grp_out.add_argument(
        "--output",
        "-o",
        type=str,
        default="./data/output",
        help="Direktori output (default: ./data/output)",
    )
    grp_out.add_argument(
        "--filename",
        "-f",
        type=str,
        default=None,
        help="Nama file output (auto-generate jika tidak disebut)",
    )

    # --- Evaluation against reference material (WER / DER / summary metrics) ---
    grp_eval = parser.add_argument_group("Evaluation")
    grp_eval.add_argument("--evaluate", "-e", action="store_true", help="Aktifkan mode evaluasi")
    grp_eval.add_argument(
        "--reference",
        "-r",
        type=str,
        default=None,
        help="Path ke file reference transcript untuk WER",
    )
    grp_eval.add_argument(
        "--reference-rttm", type=str, default=None, help="Path ke file RTTM untuk DER"
    )
    grp_eval.add_argument(
        "--reference-summary",
        type=str,
        default=None,
        help="Path ke file reference summary untuk evaluasi ringkasan (ROUGE/BERTScore)",
    )
    grp_eval.add_argument(
        "--condition",
        type=str,
        default="unknown",
        help="Nama kondisi untuk evaluasi (misal: bersih, noisy)",
    )

    # --- Model / backend selection ---
    grp_model = parser.add_argument_group("Model Settings")
    grp_model.add_argument(
        "--asr-model",
        type=str,
        default="large-v3-turbo",
        help="ASR model (HF model id / alias / path folder model lokal). Default: large-v3-turbo for better accuracy.",
    )
    grp_model.add_argument(
        "--asr-backend",
        type=str,
        default="whisper",
        choices=["whisperx", "whisper", "transformers", "speechbrain"],
        help="Backend ASR (default: whisper)",
    )
    grp_model.add_argument(
        "--asr-language",
        type=str,
        default="id",
        help="Kode bahasa (mis: id, en, auto). Untuk WhisperX: 'auto' = autodetect.",
    )
    grp_model.add_argument(
        "--whisperx-compute-type",
        type=str,
        default="auto",
        help="WhisperX compute_type (auto|float16|int8|int8_float16). Default auto.",
    )
    grp_model.add_argument(
        "--whisperx-no-vad-filter",
        action="store_true",
        help="Matikan VAD filter WhisperX (kadang berguna untuk audio sangat pendek/aneh).",
    )
    grp_model.add_argument(
        "--device",
        type=str,
        default="auto",
        choices=["auto", "cuda", "cpu"],
        help="Device untuk inferensi (default: auto)",
    )

    # --- Misc flags ---
    # NOTE(review): --verbose defaults to True with action="store_true", so the
    # flag itself is a no-op; only --quiet actually changes verbosity.
    grp_misc = parser.add_argument_group("Misc")
    grp_misc.add_argument(
        "--verbose", "-v", action="store_true", default=True, help="Output verbose"
    )
    grp_misc.add_argument("--quiet", "-q", action="store_true", help="Minimal output")
    grp_misc.add_argument(
        "--no-save-intermediate", action="store_true", help="Jangan simpan hasil intermediate"
    )

    return parser.parse_args()
256
+
257
+
258
def print_banner():
    """Print the ASCII-art application banner to stdout."""
    print(
        """
    ╔══════════════════════════════════════════════════════════════════╗
    ║ ║
    ║ ███╗ ███╗███████╗███████╗████████╗██╗███╗ ██╗ ██████╗ ║
    ║ ████╗ ████║██╔════╝██╔════╝╚══██╔══╝██║████╗ ██║██╔════╝ ║
    ║ ██╔████╔██║█████╗ █████╗ ██║ ██║██╔██╗ ██║██║ ███╗ ║
    ║ ██║╚██╔╝██║██╔══╝ ██╔══╝ ██║ ██║██║╚██╗██║██║ ██║ ║
    ║ ██║ ╚═╝ ██║███████╗███████╗ ██║ ██║██║ ╚████║╚██████╔╝ ║
    ║ ╚═╝ ╚═╝╚══════╝╚══════╝ ╚═╝ ╚═╝╚═╝ ╚═══╝ ╚═════╝ ║
    ║ ║
    ║ TRANSCRIBER - Notulensi Rapat Otomatis ║
    ║ SpeechBrain + BERT Pipeline ║
    ║ ║
    ╚══════════════════════════════════════════════════════════════════╝
    """
    )
276
+
277
+
278
def process_single_audio(args, pipeline: MeetingTranscriberPipeline) -> PipelineResult:
    """Run the full pipeline on one audio file and return its result.

    Validates the input path, processes it, prints a summary, and — when
    ``--evaluate`` was requested — runs the evaluation step as well.
    """
    # Fail early on a missing/unsupported audio file.
    validate_audio_file(args.audio)

    sep = "=" * 60
    print(f"\n{sep}")
    print(f"Processing: {args.audio}")
    print(sep)

    # Execute the whole transcription pipeline for this file.
    result = pipeline.process(
        audio_path=args.audio,
        title=args.title,
        date=args.date,
        location=args.location,
        num_speakers=args.speakers,
        output_filename=args.filename,
    )

    print_result_summary(result)

    # Optional evaluation against reference files.
    if args.evaluate:
        run_evaluation(args, pipeline, result)

    return result
306
+
307
+
308
def process_batch(args, pipeline: MeetingTranscriberPipeline) -> List[PipelineResult]:
    """Run the pipeline over every audio file found in ``args.batch``.

    Failed files are collected (filename, error) and reported at the end
    instead of aborting the whole batch. Returns the successful results.
    """
    batch_dir = Path(args.batch)
    if not batch_dir.is_dir():
        print(f"Error: Direktori tidak ditemukan: {args.batch}")
        sys.exit(1)

    audio_files = list_audio_files(batch_dir)
    if not audio_files:
        print(f"Tidak ada file audio ditemukan di: {args.batch}")
        sys.exit(1)

    total = len(audio_files)
    print(f"\nDitemukan {total} file audio untuk diproses")
    print("-" * 60)

    results: List[PipelineResult] = []
    failed: List[tuple] = []

    for idx, audio_path in enumerate(audio_files, 1):
        print(f"\n[{idx}/{total}] Processing: {audio_path.name}")
        try:
            # Derive a human-friendly title from the filename.
            title = audio_path.stem.replace("_", " ").replace("-", " ").title()
            results.append(
                pipeline.process(
                    audio_path=str(audio_path),
                    title=title,
                    date=args.date,
                    location=args.location,
                    num_speakers=args.speakers,
                )
            )
            # Reset per-file pipeline state before the next item.
            pipeline.clear_state()
        except Exception as e:
            print(f"Error processing {audio_path.name}: {e}")
            failed.append((audio_path.name, str(e)))

    print_batch_summary(results, failed, audio_files)
    return results
358
+
359
+
360
def run_evaluation(args, pipeline: MeetingTranscriberPipeline, result: PipelineResult):
    """Run evaluation with reference files.

    Loads whatever references are available (transcript for WER, RTTM for
    DER, plain-text summary for ROUGE/BERTScore), calls the pipeline's
    ``evaluate`` method, prints the metrics and writes a timestamped report
    into ``args.output``.

    Args:
        args: Parsed CLI namespace (uses reference/reference_rttm/
            reference_summary/condition/audio/output).
        pipeline: The pipeline that just processed the audio.
        result: The pipeline result for the processed file.
            NOTE(review): ``result`` is currently unused here — the pipeline
            presumably keeps its own state from the last run; confirm.
    """

    print(f"\n{'='*60}")
    print("EVALUASI")
    print(f"{'='*60}")

    reference_transcript = None
    reference_diarization = None

    # Load reference transcript (for WER); missing file is a warning, not fatal.
    if args.reference:
        if not os.path.exists(args.reference):
            print(f"Warning: File reference tidak ditemukan: {args.reference}")
        else:
            reference_transcript = parse_transcript_file(args.reference)
            print(f"Reference transcript loaded: {len(reference_transcript.split())} words")

    # Load reference diarization (for DER).
    if args.reference_rttm:
        if not os.path.exists(args.reference_rttm):
            print(f"Warning: File RTTM tidak ditemukan: {args.reference_rttm}")
        else:
            reference_diarization = parse_rttm_file(args.reference_rttm)
            print(f"Reference diarization loaded: {len(reference_diarization)} segments")
    else:
        # If user didn't provide an RTTM, try to find a *_vibevoice.rttm for the sample.
        # Best-effort: any failure here silently falls back to "no reference".
        try:
            audio_stem = Path(args.audio).stem
            cand = Path("data/ground_truth") / f"{audio_stem}_vibevoice.rttm"
            if cand.exists():
                reference_diarization = parse_rttm_file(str(cand))
                print(f"Reference RTTM auto-loaded: {cand} ({len(reference_diarization)} segments)")
        except Exception:
            pass

    # Load reference summary (optional, for ROUGE/BERTScore).
    reference_summary = None
    # getattr guards against namespaces built without the --reference-summary flag.
    if getattr(args, "reference_summary", None):
        if not os.path.exists(args.reference_summary):
            print(f"Warning: File reference summary tidak ditemukan: {args.reference_summary}")
        else:
            try:
                reference_summary = Path(args.reference_summary).read_text(encoding="utf-8")
                print(f"Reference summary loaded (len={len(reference_summary.split())} words)")
            except Exception as e:
                print(f"Warning: gagal membaca file summary: {e}")

    # Run evaluation against whichever references were successfully loaded.
    eval_result = pipeline.evaluate(
        reference_transcript=reference_transcript,
        reference_diarization=reference_diarization,
        reference_summary=reference_summary,
        sample_name=Path(args.audio).stem,
        condition=args.condition,
    )

    # Print evaluation results to the console.
    print_evaluation_results(eval_result)

    # Generate and save a persistent report alongside the pipeline output.
    evaluator = Evaluator(output_dir=args.output)

    wer_results = [eval_result.wer_result] if eval_result.wer_result else []
    der_results = [eval_result.der_result] if eval_result.der_result else []

    # Pass evaluation metadata for reproducibility & documentation.
    report = evaluator.generate_evaluation_report(
        wer_results=wer_results,
        der_results=der_results,
        summary_results=[eval_result.summary_result] if eval_result.summary_result else None,
        sample_names=[eval_result.sample_name],
        condition_name=args.condition,
        metadata=eval_result.metadata,
    )

    # Save report with a timestamp so repeated runs never overwrite each other.
    report_path = evaluator.save_report(
        report,
        f"evaluation_{eval_result.sample_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
    )
    print(f"\nEvaluation report saved: {report_path}")
442
+
443
+
444
def print_result_summary(result: PipelineResult):
    """Print a one-screen summary of a completed pipeline run."""
    sep = "=" * 60
    print(f"\n{sep}")
    print("HASIL PEMROSESAN")
    print(sep)
    print(f" Audio Duration : {format_duration(result.audio_duration)}")
    print(f" Speakers Found : {result.num_speakers}")
    print(f" Total Segments : {result.num_segments}")
    print(f" Total Words : {result.total_words}")
    print(f" Processing Time : {format_duration(result.processing_time)}")
    print(f" Output Document : {result.document_path}")
    print(sep)
456
+
457
+
458
def print_evaluation_results(eval_result: EvaluationResult):
    """Pretty-print WER, DER and (optional) summary metrics to stdout."""
    print("\n--- Hasil Evaluasi ---")

    wer = eval_result.wer_result
    if wer:
        print("\nWord Error Rate (WER):")
        print(f" WER : {wer.wer:.4f} ({wer.wer*100:.2f}%)")
        print(f" Substitutions : {wer.substitutions}")
        print(f" Deletions : {wer.deletions}")
        print(f" Insertions : {wer.insertions}")
        print(f" Correct : {wer.hits}")

    der = eval_result.der_result
    if der:
        print("\nDiarization Error Rate (DER):")
        print(f" DER : {der.der:.4f} ({der.der*100:.2f}%)")
        print(f" Missed Speech : {der.missed_speech:.4f} ({der.missed_speech*100:.2f}%)")
        print(f" False Alarm : {der.false_alarm:.4f} ({der.false_alarm*100:.2f}%)")
        print(
            f" Speaker Confusion : {der.speaker_confusion:.4f} ({der.speaker_confusion*100:.2f}%)"
        )

    # Summary metrics are optional and may come from heterogeneous result
    # objects, so printing is wrapped defensively.
    s = eval_result.summary_result
    if s:
        print("\nRingkasan (Summary) Evaluation:")
        try:
            print(f" ROUGE-1 F1 : {s.rouge.get('rouge1_f', 0.0):.4f}")
            print(f" ROUGE-2 F1 : {s.rouge.get('rouge2_f', 0.0):.4f}")
            print(f" ROUGE-L F1 : {s.rouge.get('rougel_f', 0.0):.4f}")
            print(f" BERTScore F1 : {s.bertscore.get('bertscore_f1', 0.0):.4f}")
        except Exception as e:
            print(f" (failed to print summary metrics: {e})")
492
+
493
+
494
def print_batch_summary(
    results: List[PipelineResult], failed: List[tuple], total_files: List[Path]
):
    """Print batch processing summary.

    Args:
        results: Successfully processed pipeline results.
        failed: ``(filename, error_message)`` pairs for files that raised.
        total_files: Every audio file that was attempted.
    """
    print(f"\n{'='*60}")
    print("RINGKASAN BATCH PROCESSING")
    print(f"{'='*60}")
    print(f" Total files : {len(total_files)}")
    print(f" Successful : {len(results)}")
    print(f" Failed : {len(failed)}")

    if results:
        total_duration = sum(r.audio_duration for r in results)
        total_time = sum(r.processing_time for r in results)
        avg_time = total_time / len(results)

        print(f" Total audio : {format_duration(total_duration)}")
        print(f" Total proc. time : {format_duration(total_time)}")
        print(f" Avg time/file : {format_duration(avg_time)}")

    if failed:
        print("\n Failed files:")
        for filename, error in failed:
            # BUG FIX: report the actual filename; the previous version
            # unpacked `filename` but printed the literal "(unknown)".
            print(f" - {filename}: {error[:50]}...")

    print(f"{'='*60}")
520
+
521
+
522
def main():
    """CLI entry point: parse arguments, build the pipeline, dispatch work."""
    args = parse_args()

    # --quiet always wins over --verbose (which defaults to on).
    verbose = not args.quiet and args.verbose

    if verbose:
        print_banner()

    # At least one input source is required.
    if not args.audio and not args.batch:
        print("Error: Harap tentukan --audio atau --batch")
        print("Gunakan --help untuk informasi penggunaan")
        sys.exit(1)

    # Resolve "auto" to a concrete device; torch is imported lazily so that
    # --help stays fast.
    device = args.device
    if device == "auto":
        import torch

        device = "cuda" if torch.cuda.is_available() else "cpu"

    if verbose:
        print(f"\nDevice: {device}")
        print(f"ASR Backend: {args.asr_backend}")
        print(f"ASR Model: {args.asr_model}")
        print(f"ASR Language: {args.asr_language}")
        print(f"Output Dir: {args.output}")

    # Translate CLI flags into the pipeline configuration.
    config = PipelineConfig(
        output_dir=args.output,
        asr_model_id=args.asr_model,
        asr_backend=args.asr_backend,
        asr_language=args.asr_language,
        whisperx_compute_type=args.whisperx_compute_type,
        whisperx_vad_filter=not args.whisperx_no_vad_filter,
        device=device,
        verbose=verbose,
        save_intermediate=not args.no_save_intermediate,
        fast_mode=args.fast,
        quick_asr=args.quick_asr,
        prefer_whisper_small=args.prefer_whisper_small,
        cst_hz=args.cst_hz,
        diarization_compare=args.diarization_compare,
        embedding_cache=not args.no_embedding_cache,
        target_speakers=args.target_speakers,
        # New flags
        asr_parallel_workers=args.parallel_workers,
        speaker_map_path=args.speaker_map,
        tune_diarization=args.tune_diarization,
        num_speakers=args.speakers,
        preset=args.preset,
    )

    pipeline = MeetingTranscriberPipeline(config)

    # Dispatch to batch or single-file processing.
    try:
        handler = process_batch if args.batch else process_single_audio
        handler(args, pipeline)
        print("\nSelesai!")

    except KeyboardInterrupt:
        print("\n\nProses dibatalkan oleh user")
        sys.exit(1)
    except Exception as e:
        print(f"\nError: {e}")
        if verbose:
            import traceback

            traceback.print_exc()
        sys.exit(1)
599
+
600
+
601
# Script entry point: run the CLI only when executed directly, keeping the
# module importable (e.g. by setup.py's console_scripts) without side effects.
if __name__ == "__main__":
    main()
requirements.txt CHANGED
@@ -1,3 +1,72 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # REQUIREMENTS - Sistem Notulensi Rapat Otomatis
3
+ # SpeechBrain + BERT Pipeline
4
+ # =============================================================================
5
+
6
+ # Core Deep Learning
7
+ torch>=2.0.0
8
+ torchaudio>=2.0.0
9
+
10
+ # Speech Processing (SpeechBrain)
11
+ speechbrain>=0.5.15
12
+
13
+ # NLP & Transformers
14
+ transformers>=4.30.0
15
+ sentence-transformers>=2.2.0
16
+ tokenizers>=0.13.0
17
+
18
+ # Audio Processing
19
+ librosa>=0.10.0
20
+ soundfile>=0.12.0
21
+ pydub>=0.25.1
22
+ webrtcvad>=2.0.10
23
+
24
+ # Document Generation
25
+ python-docx>=0.8.11
26
+
27
+ # Evaluation Metrics
28
+ jiwer>=3.0.0
29
+
30
+ # Data Processing
31
+ numpy>=1.24.0
32
+ pandas>=2.0.0
33
+ scipy>=1.10.0
34
+
35
+ # Machine Learning
36
+ scikit-learn>=1.3.0
37
+
38
+ # Visualization
39
+ matplotlib>=3.7.0
40
+ seaborn>=0.12.0
41
+
42
+ # Configuration
43
+ pyyaml>=6.0
44
+ python-dotenv>=1.0.0
45
+
46
+ # Utilities
47
+ tqdm>=4.65.0
48
+ colorama>=0.4.6
49
+ tabulate>=0.9.0
50
+
51
+ # Jupyter (untuk notebooks)
52
+ jupyter>=1.0.0
53
+ ipywidgets>=8.0.0
54
+
55
+ # Testing
56
+ pytest>=7.0.0
57
+
58
+ # Web UI
59
+ streamlit>=1.18.0
60
+
61
+ # Optional: GPU monitoring
62
+ # nvidia-ml-py>=12.0.0
63
+
64
+ # Training & Evaluation (for Whisper fine-tuning)
65
+ datasets>=2.14.0
66
+ evaluate>=0.4.0
67
+ accelerate>=0.20.3
68
+ peft>=0.4.0
69
+ transformers[torch]>=4.30.0
70
+ ffmpeg-python>=0.1.18
71
+ langdetect>=1.0.9
72
+ whisperx>=1.0.0
setup.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Setup script for Meeting Transcriber package.

Packages the project and exposes a ``meeting-transcriber`` console command
that dispatches to ``main:main``.
"""

from setuptools import find_packages, setup

# Long description for package metadata is taken verbatim from the README.
# NOTE(review): this read fails at build time if README.md is absent — confirm
# it is included in the source distribution.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Reuse requirements.txt as install_requires, skipping blanks and comments.
# NOTE(review): lines with a leading-space "#" would slip through because
# startswith() is checked on the unstripped line — confirm none exist.
with open("requirements.txt", "r", encoding="utf-8") as fh:
    requirements = [line.strip() for line in fh if line.strip() and not line.startswith("#")]

setup(
    name="meeting-transcriber",
    version="1.0.0",
    author="Yermia Turangan",
    author_email="yermiaturangan026@student.unsrat.ac.id",
    description="Sistem Notulensi Rapat Otomatis berbasis SpeechBrain dan BERT",
    long_description=long_description,
    long_description_content_type="text/markdown",
    # NOTE(review): placeholder repository URL — update before publishing.
    url="https://github.com/username/meeting-transcriber",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Multimedia :: Sound/Audio :: Speech",
    ],
    python_requires=">=3.8",
    install_requires=requirements,
    entry_points={
        "console_scripts": [
            "meeting-transcriber=main:main",
        ],
    },
)
streamlit_app.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit demo UI for the Meeting Transcriber pipeline.

Flow: upload (or pick) an audio file, run diarization, listen to one short
snippet per detected speaker, map anonymous speaker labels to real names,
then run the full transcription/summary pipeline. Intermediate objects are
kept in ``st.session_state`` so they survive Streamlit's script reruns.
"""

import os
import tempfile
from pathlib import Path

import streamlit as st

from src.pipeline import MeetingTranscriberPipeline, PipelineConfig

st.set_page_config(page_title="Meeting Transcriber", layout="wide")

st.title("Meeting Transcriber — Demo")
st.markdown(
    "Upload an audio file or pick a sample to generate transcript, summary and downloadable DOCX."
)

# Session-state keys owned by the diarization/mapping workflow. Kept in one
# place so the "reset on new audio" path and the manual "Clear" button can
# never drift apart.
_STATE_KEYS = (
    "diarization_done",
    "pipeline",
    "dz_res",
    "sample_segments",
    "snippet_transcripts",
    "result",
    "mapping",
)


def _clear_diarization_state() -> None:
    """Drop every workflow key from ``st.session_state`` (missing keys are OK)."""
    for key in _STATE_KEYS:
        st.session_state.pop(key, None)


# Sample audio chooser.
AUDIO_DIR = Path.cwd() / "data" / "audio"
# Build a safe sample list: prefer paths relative to cwd, but fall back to
# absolute paths when the file lives outside cwd (different drive / mount).
SAMPLES = []
for sample_path in AUDIO_DIR.rglob("*.mp3"):
    try:
        SAMPLES.append(str(sample_path.relative_to(Path.cwd())))
    except ValueError:
        SAMPLES.append(str(sample_path.resolve()))

with st.sidebar:
    st.header("Settings")
    # Detect deployment target (e.g. set STREAMLIT_DEPLOY_TARGET=community on
    # Streamlit Cloud). Community Cloud has no GPU and limited CPU/time, so
    # default to the 'fast' preset and quick ASR there.
    deploy_target = os.getenv("STREAMLIT_DEPLOY_TARGET", "")
    default_index = 0
    default_quick_asr = False
    if deploy_target.lower() == "community":
        default_index = 2  # 'fast'
        default_quick_asr = True
        st.info(
            "Running in Streamlit Community mode: using fast preset and quick ASR for responsiveness."
        )

    preset = st.selectbox(
        "Preset", ["deployment", "balanced", "fast", "accurate"], index=default_index
    )
    quick_asr = st.checkbox("Quick ASR (override)", value=default_quick_asr)
    parallel_workers = st.number_input(
        "Parallel workers (0 = auto)", min_value=0, max_value=16, value=0
    )
    sample_choice = st.selectbox("Pick sample audio (optional)", ["None"] + SAMPLES)

# Restrict uploads to the formats the label advertises.
uploaded_file = st.file_uploader("Upload audio (.wav, .mp3, .m4a)", type=["wav", "mp3", "m4a"])

# Determine audio path (an upload takes precedence over a sample choice).
audio_path = None
if uploaded_file is not None:
    # Path(...).name strips any directory components a hostile client could
    # embed in the filename, keeping the write inside the temp directory.
    safe_name = Path(uploaded_file.name).name
    tmp_path = Path(tempfile.gettempdir()) / safe_name
    with open(tmp_path, "wb") as f:
        f.write(uploaded_file.read())
    audio_path = str(tmp_path)
elif sample_choice and sample_choice != "None":
    audio_path = sample_choice

if not audio_path:
    st.info("Upload an audio file or pick a sample from the sidebar to begin.")

# Interactive flow: run diarization first and allow manual mapping.
# If the user switched to a different audio file, stale state must be cleared.
if "diarization_done" in st.session_state and st.session_state.get("audio_path") != audio_path:
    _clear_diarization_state()

if st.button("Run diarization only"):
    if not audio_path:
        st.error("Please provide audio first.")
    else:
        cfg = PipelineConfig(preset=preset, quick_asr=quick_asr)
        if parallel_workers and parallel_workers > 0:
            cfg.asr_parallel_workers = int(parallel_workers)
        pipeline = MeetingTranscriberPipeline(cfg)

        with st.spinner("Running diarization..."):
            try:
                dz_res = pipeline.run_diarization(audio_path)
                st.success("Diarization complete")
            except Exception as e:
                st.error(f"Diarization failed: {e}")
                raise

        # Persist state so the interactive widgets survive reruns.
        st.session_state["diarization_done"] = True
        st.session_state["pipeline"] = pipeline
        st.session_state["dz_res"] = dz_res
        st.session_state["audio_path"] = audio_path

# If we already have diarization state (either just-run or from a previous
# interaction), show the mapping UI.
if st.session_state.get("diarization_done") and audio_path:
    pipeline = st.session_state["pipeline"]
    dz_res = st.session_state["dz_res"]

    st.write(
        f"Detected {len(dz_res['unique_speakers'])} speakers and {dz_res['num_segments']} segments"
    )

    # Playable sample and quick per-speaker snippets so the user can
    # listen/read before mapping.
    st.subheader("Sample snippets (listen + read before mapping)")

    # Reuse cached sample snippets when present.
    sample_segments = st.session_state.get("sample_segments") or {}
    snippet_transcripts = st.session_state.get("snippet_transcripts") or {}

    if not sample_segments:
        try:
            dsegs = pipeline._diarization_segments or []
            for spk in dz_res["unique_speakers"]:
                cand = [s for s in dsegs if s.speaker_id == spk]
                if not cand:
                    continue
                # The longest segment is the most representative; cap the
                # playable snippet at 10 seconds.
                best = max(cand, key=lambda x: x.duration)
                cap_end = min(best.end, best.start + 10.0)
                from src.diarization import SpeakerSegment

                sample_segments[spk] = SpeakerSegment(
                    speaker_id=best.speaker_id,
                    start=best.start,
                    end=cap_end,
                    confidence=best.confidence,
                    is_overlap=best.is_overlap,
                    metadata=best.metadata.copy() if getattr(best, "metadata", None) else {},
                )
            st.session_state["sample_segments"] = sample_segments
        except Exception as e:
            st.warning(f"Could not prepare sample segments: {e}")
            sample_segments = {}

    # Run quick per-segment ASR for the sample snippets (avoid full-audio
    # mapping for speed). The shared transcriber config is mutated for this
    # one call and restored in ``finally`` so a failure cannot leave the
    # pipeline permanently misconfigured.
    if not snippet_transcripts and sample_segments:
        try:
            transcriber = pipeline.transcriber
            orig_full_audio = getattr(transcriber.config, "use_full_audio_for_segments", False)
            orig_workers = getattr(transcriber.config, "parallel_workers", 1)
            transcriber.config.use_full_audio_for_segments = False
            transcriber.config.parallel_workers = 1
            try:
                transcripts = transcriber.transcribe_segments(
                    pipeline._waveform, list(sample_segments.values()), pipeline._sample_rate
                )
            finally:
                transcriber.config.use_full_audio_for_segments = orig_full_audio
                transcriber.config.parallel_workers = orig_workers

            for t in transcripts:
                snippet_transcripts[t.speaker_id] = t.text

            st.session_state["snippet_transcripts"] = snippet_transcripts
        except Exception as e:
            st.warning(f"Quick snippet transcription failed: {e}")

    # soundfile is imported lazily on purpose: the upload/diarization path
    # can still run even if the library is missing from the environment.
    import soundfile as sf

    mapping = st.session_state.get("mapping") or {}
    st.subheader("Manual speaker mapping")
    audio_id = Path(audio_path).stem
    for spk in dz_res["unique_speakers"]:
        with st.expander(f"Speaker: {spk}"):
            col1, col2 = st.columns([1, 2])
            with col1:
                seg = sample_segments.get(spk)
                if seg is not None:
                    try:
                        sr = pipeline._sample_rate
                        start_sample = int(seg.start * sr)
                        end_sample = int(seg.end * sr)
                        audio_np = (
                            pipeline._waveform[:, start_sample:end_sample].squeeze().cpu().numpy()
                        )
                        # Close our handle before soundfile writes by name so
                        # the file is not held open twice (required on Windows).
                        tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
                        tmpf.close()
                        sf.write(tmpf.name, audio_np, sr)
                        st.audio(tmpf.name)
                    except Exception as e:
                        st.warning(f"Could not prepare audio snippet: {e}")
                else:
                    st.write("No sample segment available for this speaker")
            with col2:
                st.write("**Sample transcript:**")
                st.write(snippet_transcripts.get(spk, "(no transcription available)"))
                # Widget key embeds the audio id so switching files does not
                # resurrect stale text-input values across reruns.
                key = f"map_{audio_id}_{spk}"
                default_val = mapping.get(spk, spk)
                mapping_val = st.text_input(f"Map {spk} to name", value=default_val, key=key)
                mapping[spk] = mapping_val

    st.session_state["mapping"] = mapping

    if st.button("Apply mapping and continue processing"):
        pipeline.apply_speaker_map(mapping, save_to_cache=True, audio_id=audio_id)
        with st.spinner("Running full processing..."):
            try:
                res = pipeline.continue_from_diarization(title="Streamlit run")
                st.session_state["result"] = res
                st.success("Processing complete")
            except Exception as e:
                st.error(f"Processing failed: {e}")
                raise

    # If a result is available, display it.
    if st.session_state.get("result"):
        res = st.session_state["result"]
        st.subheader("Summary")
        st.json(res.summary or {})

        st.subheader("Transcript (first 5000 characters)")
        st.text(res.transcript_text[:5000])

        if res.document_path and os.path.exists(res.document_path):
            with open(res.document_path, "rb") as fh:
                doc_bytes = fh.read()
            st.download_button(
                "Download .docx", data=doc_bytes, file_name=Path(res.document_path).name
            )

        st.write("---")
        st.write("Processing metadata:")
        st.write(
            {
                "Audio duration": res.audio_duration,
                "Speakers found": res.num_speakers,
                "Segments": res.num_segments,
                "Total words": res.total_words,
                "Processing time (s)": res.processing_time,
            }
        )

        st.balloons()

    # Allow clearing state manually.
    if st.button("Clear diarization state"):
        _clear_diarization_state()