Jedi09 committed on
Commit
be02700
·
verified ·
1 Parent(s): 5652d57

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +299 -92
app.py CHANGED
@@ -1,122 +1,329 @@
1
  """
2
- Speaker Diarization Module
3
- Pyannote-audio ile konuşmacı ayrımı (kim ne zaman konuşuyor).
 
4
  """
5
 
 
 
 
 
6
  import os
7
- from typing import List, Tuple, Optional
8
 
9
# PyTorch 2.6+ compatibility: disable the weights_only restriction so pyannote
# model checkpoints can still be unpickled by torch.load.
# NOTE(review): presumably this must be set before the models are loaded —
# keep it above any pyannote/torch model loading; verify against callers.
os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
 
 
 
 
11
 
12
- import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
# Check for GPU availability once at import time; the diarization pipeline
# below is moved onto this device after loading.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔧 Diarization device: {DEVICE}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
 
19
def get_diarization_pipeline(hf_token: Optional[str] = None):
    """
    Build the pyannote speaker-diarization pipeline.

    Args:
        hf_token: Hugging Face access token; when omitted, the HF_TOKEN
            environment variable is used instead.

    Returns:
        A pipeline moved onto ``DEVICE``, or ``None`` when loading fails
        (pyannote missing, bad/missing token, download error, ...).
    """
    try:
        from pyannote.audio import Pipeline

        # Prefer the explicit argument; otherwise fall back to the environment.
        auth_token = hf_token if hf_token else os.environ.get("HF_TOKEN")
        if not auth_token:
            print("⚠️ HF_TOKEN bulunamadı. pyannote modeli yüklenemeyebilir.")

        dia_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            token=auth_token
        )

        # Run inference on the GPU when one is available.
        dia_pipeline.to(DEVICE)

        print("✅ Diarization pipeline yüklendi!")
        return dia_pipeline
    except Exception as e:
        print(f" Diarization pipeline yüklenemedi: {e}")
        return None
52
 
53
 
54
def diarize_audio(audio_path: str, pipeline, num_speakers: Optional[int] = None) -> List[Tuple[float, float, str]]:
    """
    Perform speaker diarization on an audio file.

    Args:
        audio_path: Path to the audio file.
        pipeline: Pyannote diarization pipeline (may be None).
        num_speakers: Upper bound on the number of speakers; None lets the
            pipeline auto-detect the count.

    Returns:
        List of (start_time, end_time, speaker_label) tuples; empty list when
        the pipeline is unavailable or diarization fails.
    """
    if pipeline is None:
        return []

    try:
        # Auto-detect the speaker count unless an upper bound was given.
        if num_speakers:
            result = pipeline(audio_path, min_speakers=1, max_speakers=num_speakers)
        else:
            result = pipeline(audio_path)

        # Newer pyannote versions return a DiarizeOutput wrapper whose
        # `speaker_diarization` attribute holds the Annotation; older versions
        # return the Annotation directly.
        if hasattr(result, 'speaker_diarization'):
            diarization = result.speaker_diarization
            print("🔍 Using speaker_diarization attribute")
        else:
            diarization = result

        # Flatten the Annotation into plain (start, end, speaker) tuples.
        segments = [
            (segment.start, segment.end, speaker)
            for segment, _track, speaker in diarization.itertracks(yield_label=True)
        ]

        print(f"✅ Diarization tamamlandı: {len(segments)} segment bulundu")
        return segments

    except Exception as e:
        print(f"❌ Diarization hatası: {e}")
        return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
def format_speaker_label(speaker: str) -> str:
    """
    Convert a pyannote speaker label ("SPEAKER_00", "SPEAKER_01", ...) to a
    user-friendly Turkish label ("Kişi 1", "Kişi 2", ...).

    Generalized from the previous hard-coded four-entry map: any
    "SPEAKER_<digits>" label is handled; anything else is returned unchanged.
    """
    prefix = "SPEAKER_"
    if speaker.startswith(prefix):
        index = speaker[len(prefix):]
        if index.isdigit():
            # Pyannote labels are zero-based; humans count from one.
            return f"Kişi {int(index) + 1}"
    return speaker
109
 
110
 
111
def format_timestamp(seconds: float) -> str:
    """Render a duration in seconds as HH:MM:SS, or MM:SS when under an hour."""
    hours_f, remainder = divmod(seconds, 3600)
    minutes_f, secs_f = divmod(remainder, 60)
    hours, minutes, secs = int(hours_f), int(minutes_f), int(secs_f)

    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Danışman-Danışan Transkripsiyon Sistemi
3
+ Speaker diarization + transcription pipeline.
4
+ Zaman damgalı, konuşmacı ayrımlı çıktı.
5
  """
6
 
7
+ import gradio as gr
8
+ from faster_whisper import WhisperModel
9
+ import tempfile
10
+ import time
11
  import os
12
+ import torch
13
 
14
+ from diarization import (
15
+ get_diarization_pipeline,
16
+ diarize_audio,
17
+ format_speaker_label,
18
+ format_timestamp
19
+ )
20
 
21
# ==================== CONFIGURATION ====================
MODEL_SIZE = "small"  # Changed to small for HF Spaces memory limits
# Prefer CUDA when available; faster-whisper also runs on CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# float16 on GPU for speed; int8 quantization on CPU to reduce memory.
COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "int8"
# =======================================================

print(f"🔧 Device: {DEVICE}, Compute: {COMPUTE_TYPE}")

# Load both models once at startup (import time) so every request reuses them.
print("🔄 Whisper model yükleniyor...")
whisper_model = WhisperModel(
    MODEL_SIZE,
    device=DEVICE,
    compute_type=COMPUTE_TYPE
)
print("✅ Whisper model yüklendi!")

# May come back as None (e.g. missing HF_TOKEN); the transcription pipeline
# below falls back to plain transcription in that case.
print("🔄 Diarization pipeline yükleniyor...")
diarization_pipeline = get_diarization_pipeline()
40
+
41
+
42
def get_audio_duration(audio_path: str) -> float:
    """
    Return the duration of an audio file in seconds using ffprobe.

    Args:
        audio_path: Path to the audio file on disk.

    Returns:
        Duration in seconds, or 0.0 when ffprobe is missing, exits with an
        error, or emits non-numeric output.
    """
    import subprocess
    try:
        result = subprocess.run([
            'ffprobe', '-v', 'error',
            '-show_entries', 'format=duration',
            '-of', 'default=noprint_wrappers=1:nokey=1',
            audio_path
        ], capture_output=True, text=True, check=True)
        return float(result.stdout.strip())
    except (subprocess.CalledProcessError, ValueError, OSError):
        # The previous bare `except:` swallowed everything, including
        # KeyboardInterrupt/SystemExit. Catch only the expected failures:
        # ffprobe not installed (OSError), ffprobe failing (CalledProcessError),
        # or unparseable stdout (ValueError) — and report "unknown duration".
        return 0.0
55
 
56
 
57
def transcribe_segment(audio_path: str, start: float, end: float) -> str:
    """
    Return the transcription text covering the [start, end] window of a file.

    faster-whisper cannot transcribe a sub-range directly, so the whole file
    is transcribed and only the segments overlapping the window are kept.
    On any failure a bracketed Turkish error string is returned instead of
    raising.
    """
    try:
        segments, _ = whisper_model.transcribe(
            audio_path,
            language="tr",
            beam_size=5
        )

        # A segment belongs to the window when the two intervals overlap.
        overlapping = [
            seg.text for seg in segments
            if seg.end > start and seg.start < end
        ]
        return " ".join(overlapping).strip()
    except Exception as e:
        return f"[Transkripsiyon hatası: {e}]"
 
80
 
81
 
82
def transcribe_with_diarization(audio_path: str) -> tuple:
    """
    Full pipeline: diarization + transcription.

    Runs pyannote diarization and faster-whisper transcription on the file,
    then merges them by assigning each whisper segment (by its midpoint) to
    the diarization turn it falls into.

    Returns:
        (formatted transcript text, path to a downloadable .txt file or None).
        When the diarization pipeline is unavailable, falls back to a plain
        timestamped transcript with no file attachment.
    """
    start_time = time.time()

    # Get audio duration for stats
    # NOTE(review): `duration` is never used below — the stats use
    # `info.duration` from whisper instead; confirm and consider removing.
    duration = get_audio_duration(audio_path)

    # Step 1: Diarization
    print("🎭 Diarization başlıyor...")
    if diarization_pipeline is None:
        # Fallback: no diarization, just transcribe with per-line timestamps.
        segments, info = whisper_model.transcribe(audio_path, language="tr", beam_size=5)

        full_text = []
        for segment in segments:
            timestamp = format_timestamp(segment.start)
            full_text.append(f"[{timestamp}] {segment.text}")

        result = "\n".join(full_text)
        elapsed = time.time() - start_time

        stats = f"""
───────────────────────────────────
📊 İstatistikler
• Toplam süre: {format_timestamp(info.duration)}
• İşlem süresi: {elapsed:.1f} saniye
• ⚠️ Diarization kullanılamadı (yalnızca transkripsiyon)
───────────────────────────────────"""

        # No downloadable file in the fallback path.
        return result + stats, None

    # Run diarization (capped at 2 speakers: counselor + client use case).
    diarization_segments = diarize_audio(audio_path, diarization_pipeline, num_speakers=2)

    if not diarization_segments:
        return "❌ Diarization başarısız oldu.", None

    # Step 2: Transcribe the whole file once.
    print("🎙️ Transkripsiyon başlıyor...")
    segments, info = whisper_model.transcribe(audio_path, language="tr", beam_size=5)
    whisper_segments = list(segments)  # Convert generator to list

    # Track which whisper segments have been used, so a segment whose midpoint
    # falls into overlapping diarization turns is only emitted once.
    used_whisper_indices = set()

    # Step 3: Merge diarization with transcription
    print("🔗 Birleştirme yapılıyor...")
    transcript_parts = []
    speaker_times = {}  # speaker label -> cumulative speaking time (seconds)

    for start, end, speaker in diarization_segments:
        speaker_label = format_speaker_label(speaker)

        # Track speaker time
        if speaker_label not in speaker_times:
            speaker_times[speaker_label] = 0
        speaker_times[speaker_label] += (end - start)

        # Find whisper segments that overlap with this diarization segment.
        # Only use segments that haven't been used before.
        segment_text = []
        for idx, ws in enumerate(whisper_segments):
            if idx in used_whisper_indices:
                continue
            # Assign by midpoint: the whisper segment belongs to the turn
            # containing the middle of its time span.
            ws_midpoint = (ws.start + ws.end) / 2
            if start <= ws_midpoint <= end:
                segment_text.append(ws.text)
                used_whisper_indices.add(idx)

        # Turns with no transcribed text (silence, noise) are dropped.
        if segment_text:
            text = " ".join(segment_text).strip()
            timestamp_start = format_timestamp(start)
            timestamp_end = format_timestamp(end)
            transcript_parts.append(f"[{timestamp_start} → {timestamp_end}] {speaker_label}:\n{text}\n")

    # Build final output
    header = """═══════════════════════════════════════════════════
📋 GÖRÜŞME TRANSKRİPTİ
═══════════════════════════════════════════════════

"""

    body = "\n".join(transcript_parts)

    # Statistics block appended after the transcript body.
    elapsed = time.time() - start_time
    total_time = info.duration

    stats_lines = [
        "",
        "───────────────────────────────────",
        "📊 İstatistikler",
        f"• Toplam süre: {format_timestamp(total_time)}",
        f"• İşlem süresi: {elapsed:.1f} saniye",
    ]

    # Per-speaker speaking time with percentage of the whole recording.
    for speaker, stime in sorted(speaker_times.items()):
        percentage = (stime / total_time) * 100 if total_time > 0 else 0
        stats_lines.append(f"• {speaker} konuşma: {format_timestamp(stime)} (%{percentage:.0f})")

    stats_lines.append("───────────────────────────────────")
    stats = "\n".join(stats_lines)

    full_result = header + body + stats

    # Create downloadable file (delete=False: Gradio serves it after return).
    txt_file = tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.txt',
        delete=False,
        encoding='utf-8'
    )
    txt_file.write(full_result)
    txt_file.close()

    return full_result, txt_file.name
202
 
203
+
204
def process_audio(audio_path):
    """
    Gradio click handler.

    Validates that an upload is present, then delegates to the full
    diarization + transcription pipeline. Any unexpected failure is turned
    into a user-facing error string rather than a traceback.
    """
    if audio_path is None:
        return "⚠️ Lütfen bir ses dosyası yükleyin.", None

    try:
        result = transcribe_with_diarization(audio_path)
    except Exception as e:
        return f"❌ Beklenmeyen hata: {str(e)}", None
    return result
 
213
 
214
 
215
# ==================== GRADIO UI ====================
# Single-page interface: upload column on the left-hand row, results below,
# static feature/privacy/footer panels, and one click handler.
with gr.Blocks(title="Görüşme Transkripsiyon") as demo:

    # Page banner; the <style> block also hides the default Gradio footer
    # and constrains the overall container width.
    gr.HTML("""
    <style>
    footer { display: none !important; }
    .gradio-container { max-width: 900px !important; margin: auto !important; }
    </style>
    <div style="text-align: center; padding: 40px 20px 30px;
                background: linear-gradient(135deg, #1e3a5f 0%, #2d5a87 100%);
                border-radius: 20px; margin-bottom: 24px; color: white;">
        <h1 style="font-size: 2.2rem; font-weight: 700; margin: 0 0 8px 0;">
            🎙️ Görüşme Transkripsiyon Sistemi
        </h1>
        <p style="font-size: 1rem; opacity: 0.95; margin: 0;">
            Danışman-Danışan görüşmelerini zaman damgalı ve konuşmacı ayrımlı olarak yazıya dökün
        </p>
    </div>
    """)

    # --- Input section: audio upload/record + start button ---
    with gr.Row():
        with gr.Column():
            gr.HTML('<div style="font-weight: 600; margin-bottom: 12px;">📤 Ses Dosyası</div>')

            # filepath type: the handler receives a path on disk, not raw audio.
            audio_input = gr.Audio(
                label="Görüşme Kaydı",
                type="filepath",
                sources=["upload", "microphone"]
            )

            submit_btn = gr.Button(
                "🚀 Transkripsiyon Başlat",
                variant="primary",
                size="lg"
            )

            # Info box
            gr.HTML("""
            <div style="background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%);
                        border: 1px solid #7dd3fc; border-radius: 12px;
                        padding: 16px 20px; margin-top: 16px;">
                <p style="margin: 0; color: #0369a1; font-size: 14px;">
                    ℹ️ <strong>Nasıl Çalışır:</strong><br>
                    1. Ses dosyasını yükleyin (MP3, WAV, M4A)<br>
                    2. AI otomatik olarak konuşmacıları ayırır<br>
                    3. Zaman damgalı transkript oluşturulur
                </p>
            </div>
            """)

    # --- Output section: transcript text + downloadable .txt ---
    with gr.Row():
        with gr.Column():
            gr.HTML('<div style="font-weight: 600; margin-bottom: 12px;">📝 Transkript Sonucu</div>')

            output_text = gr.Textbox(
                label="",
                placeholder="Transkript burada görünecek...",
                lines=20,
                interactive=False
            )

            download_file = gr.File(
                label="📥 Transkripti İndir (.txt)"
            )

    # Features (static 4-tile grid)
    gr.HTML("""
    <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin-top: 24px;">
        <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
            <div style="font-size: 24px; margin-bottom: 6px;">🎭</div>
            <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Konuşmacı Ayrımı</div>
        </div>
        <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
            <div style="font-size: 24px; margin-bottom: 6px;">⏱️</div>
            <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Zaman Damgası</div>
        </div>
        <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
            <div style="font-size: 24px; margin-bottom: 6px;">🔒</div>
            <div style="font-size: 12px; color: #6b7280; font-weight: 500;">%100 Local</div>
        </div>
        <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
            <div style="font-size: 24px; margin-bottom: 6px;">🇹🇷</div>
            <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Türkçe Optimizeli</div>
        </div>
    </div>
    """)

    # Privacy notice
    gr.HTML("""
    <div style="background: #ecfdf5; border: 1px solid #6ee7b7; border-radius: 8px;
                padding: 12px 16px; margin-top: 16px;">
        <p style="margin: 0; color: #047857; font-size: 13px;">
            🔒 <strong>Gizlilik:</strong> Tüm işlemler yerel olarak yapılır.
            Ses dosyalarınız hiçbir sunucuya gönderilmez.
        </p>
    </div>
    """)

    # Footer
    gr.HTML("""
    <div style="text-align: center; padding: 24px 0; color: #9ca3af; font-size: 13px;">
        <p>Powered by Faster-Whisper & Pyannote-Audio • GPU & CPU Destekli</p>
    </div>
    """)

    # Event handling: one click -> (transcript text, downloadable file).
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[output_text, download_file]
    )

# Launch only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch(share=False, show_error=True)