Jedi09 commited on
Commit
5652d57
·
verified ·
1 Parent(s): 179896d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -291
app.py CHANGED
@@ -1,321 +1,122 @@
1
  """
2
- Danışman-Danışan Transkripsiyon Sistemi
3
- Speaker diarization + transcription pipeline.
4
- Zaman damgalı, konuşmacı ayrımlı çıktı.
5
  """
6
 
7
- import gradio as gr
8
- from faster_whisper import WhisperModel
9
- import tempfile
10
- import time
11
  import os
12
- import torch
13
-
14
- from diarization import (
15
- get_diarization_pipeline,
16
- diarize_audio,
17
- format_speaker_label,
18
- format_timestamp
19
- )
20
-
21
- # ==================== CONFIGURATION ====================
22
- MODEL_SIZE = "medium" # Options: tiny, base, small, medium, large-v3
23
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
24
- COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "int8"
25
- # =======================================================
26
-
27
- print(f"🔧 Device: {DEVICE}, Compute: {COMPUTE_TYPE}")
28
 
29
- # Load models at startup
30
- print("🔄 Whisper model yükleniyor...")
31
- whisper_model = WhisperModel(
32
- MODEL_SIZE,
33
- device=DEVICE,
34
- compute_type=COMPUTE_TYPE
35
- )
36
- print("✅ Whisper model yüklendi!")
37
-
38
- print("🔄 Diarization pipeline yükleniyor...")
39
- diarization_pipeline = get_diarization_pipeline()
40
 
 
41
 
42
- def get_audio_duration(audio_path: str) -> float:
43
- """Get audio duration in seconds using ffprobe."""
44
- import subprocess
45
- try:
46
- result = subprocess.run([
47
- 'ffprobe', '-v', 'error',
48
- '-show_entries', 'format=duration',
49
- '-of', 'default=noprint_wrappers=1:nokey=1',
50
- audio_path
51
- ], capture_output=True, text=True, check=True)
52
- return float(result.stdout.strip())
53
- except:
54
- return 0.0
55
 
56
 
57
- def transcribe_segment(audio_path: str, start: float, end: float) -> str:
58
  """
59
- Transcribe a specific segment of audio.
 
 
 
 
 
 
60
  """
61
  try:
62
- # Faster-whisper doesn't support segment extraction directly,
63
- # so we transcribe the whole file and filter by timestamp
64
- segments, _ = whisper_model.transcribe(
65
- audio_path,
66
- language="tr",
67
- beam_size=5
 
 
 
 
 
68
  )
69
 
70
- # Collect text from segments that fall within our time range
71
- text_parts = []
72
- for segment in segments:
73
- # Check if segment overlaps with our range
74
- if segment.end > start and segment.start < end:
75
- text_parts.append(segment.text)
76
 
77
- return " ".join(text_parts).strip()
78
  except Exception as e:
79
- return f"[Transkripsiyon hatası: {e}]"
 
80
 
81
 
82
- def transcribe_with_diarization(audio_path: str) -> tuple:
83
  """
84
- Full pipeline: diarization + transcription.
85
- Returns formatted transcript with speaker labels and timestamps.
86
- """
87
- start_time = time.time()
88
 
89
- # Get audio duration for stats
90
- duration = get_audio_duration(audio_path)
 
 
 
 
 
 
 
 
91
 
92
- # Step 1: Diarization
93
- print("🎭 Diarization başlıyor...")
94
- if diarization_pipeline is None:
95
- # Fallback: no diarization, just transcribe
96
- segments, info = whisper_model.transcribe(audio_path, language="tr", beam_size=5)
 
97
 
98
- full_text = []
99
- for segment in segments:
100
- timestamp = format_timestamp(segment.start)
101
- full_text.append(f"[{timestamp}] {segment.text}")
102
 
103
- result = "\n".join(full_text)
104
- elapsed = time.time() - start_time
 
 
 
 
105
 
106
- stats = f"""
107
- ───────────────────────────────────
108
- 📊 İstatistikler
109
- • Toplam süre: {format_timestamp(info.duration)}
110
- • İşlem süresi: {elapsed:.1f} saniye
111
- • ⚠️ Diarization kullanılamadı (yalnızca transkripsiyon)
112
- ───────────────────────────────────"""
113
 
114
- return result + stats, None
115
-
116
- # Run diarization
117
- diarization_segments = diarize_audio(audio_path, diarization_pipeline, num_speakers=2)
118
-
119
- if not diarization_segments:
120
- return "❌ Diarization başarısız oldu.", None
121
-
122
- # Step 2: Transcribe each segment
123
- print("🎙️ Transkripsiyon başlıyor...")
124
- segments, info = whisper_model.transcribe(audio_path, language="tr", beam_size=5)
125
- whisper_segments = list(segments) # Convert generator to list
126
-
127
- # Step 3: Merge diarization with transcription
128
- print("🔗 Birleştirme yapılıyor...")
129
- transcript_parts = []
130
- speaker_times = {}
131
-
132
- for start, end, speaker in diarization_segments:
133
- speaker_label = format_speaker_label(speaker)
134
-
135
- # Track speaker time
136
- if speaker_label not in speaker_times:
137
- speaker_times[speaker_label] = 0
138
- speaker_times[speaker_label] += (end - start)
139
 
140
- # Find whisper segments that overlap with this diarization segment
141
- segment_text = []
142
- for ws in whisper_segments:
143
- # Check overlap
144
- if ws.end > start and ws.start < end:
145
- segment_text.append(ws.text)
146
-
147
- if segment_text:
148
- text = " ".join(segment_text).strip()
149
- timestamp_start = format_timestamp(start)
150
- timestamp_end = format_timestamp(end)
151
- transcript_parts.append(f"[{timestamp_start} → {timestamp_end}] {speaker_label}:\n{text}\n")
152
-
153
- # Build final output
154
- header = """═══════════════════════════════════════════════════
155
- 📋 GÖRÜŞME TRANSKRİPTİ
156
- ═══════════════════════════════════════════════════
157
-
158
- """
159
-
160
- body = "\n".join(transcript_parts)
161
-
162
- # Statistics
163
- elapsed = time.time() - start_time
164
- total_time = info.duration
165
-
166
- stats_lines = [
167
- "",
168
- "───────────────────────────────────",
169
- "📊 İstatistikler",
170
- f"• Toplam süre: {format_timestamp(total_time)}",
171
- f"• İşlem süresi: {elapsed:.1f} saniye",
172
- ]
173
-
174
- for speaker, stime in sorted(speaker_times.items()):
175
- percentage = (stime / total_time) * 100 if total_time > 0 else 0
176
- stats_lines.append(f"• {speaker} konuşma: {format_timestamp(stime)} (%{percentage:.0f})")
177
-
178
- stats_lines.append("───────────────────────────────────")
179
- stats = "\n".join(stats_lines)
180
-
181
- full_result = header + body + stats
182
-
183
- # Create downloadable file
184
- txt_file = tempfile.NamedTemporaryFile(
185
- mode='w',
186
- suffix='.txt',
187
- delete=False,
188
- encoding='utf-8'
189
- )
190
- txt_file.write(full_result)
191
- txt_file.close()
192
-
193
- return full_result, txt_file.name
194
-
195
-
196
- def process_audio(audio_path):
197
- """Gradio handler."""
198
- if audio_path is None:
199
- return "⚠️ Lütfen bir ses dosyası yükleyin.", None
200
-
201
- try:
202
- return transcribe_with_diarization(audio_path)
203
  except Exception as e:
204
- return f"❌ Beklenmeyen hata: {str(e)}", None
 
205
 
206
 
207
- # ==================== GRADIO UI ====================
208
- with gr.Blocks(title="Görüşme Transkripsiyon") as demo:
209
-
210
- gr.HTML("""
211
- <style>
212
- footer { display: none !important; }
213
- .gradio-container { max-width: 900px !important; margin: auto !important; }
214
- </style>
215
- <div style="text-align: center; padding: 40px 20px 30px;
216
- background: linear-gradient(135deg, #1e3a5f 0%, #2d5a87 100%);
217
- border-radius: 20px; margin-bottom: 24px; color: white;">
218
- <h1 style="font-size: 2.2rem; font-weight: 700; margin: 0 0 8px 0;">
219
- 🎙️ Görüşme Transkripsiyon Sistemi
220
- </h1>
221
- <p style="font-size: 1rem; opacity: 0.95; margin: 0;">
222
- Danışman-Danışan görüşmelerini zaman damgalı ve konuşmacı ayrımlı olarak yazıya dökün
223
- </p>
224
- </div>
225
- """)
226
-
227
- with gr.Row():
228
- with gr.Column():
229
- gr.HTML('<div style="font-weight: 600; margin-bottom: 12px;">📤 Ses Dosyası</div>')
230
-
231
- audio_input = gr.Audio(
232
- label="Görüşme Kaydı",
233
- type="filepath",
234
- sources=["upload", "microphone"]
235
- )
236
-
237
- submit_btn = gr.Button(
238
- "🚀 Transkripsiyon Başlat",
239
- variant="primary",
240
- size="lg"
241
- )
242
-
243
- # Info box
244
- gr.HTML("""
245
- <div style="background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%);
246
- border: 1px solid #7dd3fc; border-radius: 12px;
247
- padding: 16px 20px; margin-top: 16px;">
248
- <p style="margin: 0; color: #0369a1; font-size: 14px;">
249
- ℹ️ <strong>Nasıl Çalışır:</strong><br>
250
- 1. Ses dosyasını yükleyin (MP3, WAV, M4A)<br>
251
- 2. AI otomatik olarak konuşmacıları ayırır<br>
252
- 3. Zaman damgalı transkript oluşturulur
253
- </p>
254
- </div>
255
- """)
256
-
257
- with gr.Row():
258
- with gr.Column():
259
- gr.HTML('<div style="font-weight: 600; margin-bottom: 12px;">📝 Transkript Sonucu</div>')
260
-
261
- output_text = gr.Textbox(
262
- label="",
263
- placeholder="Transkript burada görünecek...",
264
- lines=20,
265
- interactive=False
266
- )
267
-
268
- download_file = gr.File(
269
- label="📥 Transkripti İndir (.txt)"
270
- )
271
-
272
- # Features
273
- gr.HTML("""
274
- <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin-top: 24px;">
275
- <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
276
- <div style="font-size: 24px; margin-bottom: 6px;">🎭</div>
277
- <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Konuşmacı Ayrımı</div>
278
- </div>
279
- <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
280
- <div style="font-size: 24px; margin-bottom: 6px;">⏱️</div>
281
- <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Zaman Damgası</div>
282
- </div>
283
- <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
284
- <div style="font-size: 24px; margin-bottom: 6px;">🔒</div>
285
- <div style="font-size: 12px; color: #6b7280; font-weight: 500;">%100 Local</div>
286
- </div>
287
- <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
288
- <div style="font-size: 24px; margin-bottom: 6px;">🇹🇷</div>
289
- <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Türkçe Optimizeli</div>
290
- </div>
291
- </div>
292
- """)
293
-
294
- # Privacy notice
295
- gr.HTML("""
296
- <div style="background: #ecfdf5; border: 1px solid #6ee7b7; border-radius: 8px;
297
- padding: 12px 16px; margin-top: 16px;">
298
- <p style="margin: 0; color: #047857; font-size: 13px;">
299
- 🔒 <strong>Gizlilik:</strong> Tüm işlemler yerel olarak yapılır.
300
- Ses dosyalarınız hiçbir sunucuya gönderilmez.
301
- </p>
302
- </div>
303
- """)
304
-
305
- # Footer
306
- gr.HTML("""
307
- <div style="text-align: center; padding: 24px 0; color: #9ca3af; font-size: 13px;">
308
- <p>Powered by Faster-Whisper & Pyannote-Audio • GPU & CPU Destekli</p>
309
- </div>
310
- """)
311
-
312
- # Event handling
313
- submit_btn.click(
314
- fn=process_audio,
315
- inputs=[audio_input],
316
- outputs=[output_text, download_file]
317
- )
318
 
319
- # Launch
320
- if __name__ == "__main__":
321
- demo.launch(share=False, show_error=True)
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Speaker Diarization Module
3
+ Pyannote-audio ile konuşmacı ayrımı (kim ne zaman konuşuyor).
 
4
  """
5
 
 
 
 
 
6
  import os
7
+ from typing import List, Tuple, Optional
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ # PyTorch 2.6+ compatibility: Disable weights_only restriction for pyannote models
10
+ os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
 
 
 
 
 
 
 
 
 
11
 
12
+ import torch
13
 
14
+ # Check for GPU availability
15
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
+ print(f"🔧 Diarization device: {DEVICE}")
 
 
 
 
 
 
 
 
 
 
17
 
18
 
19
def get_diarization_pipeline(hf_token: Optional[str] = None):
    """
    Load the pyannote speaker diarization pipeline.

    Args:
        hf_token: Hugging Face token (required for gated pyannote models).
                  Falls back to the HF_TOKEN environment variable when omitted.

    Returns:
        The diarization pipeline moved onto DEVICE, or None if loading failed
        (missing dependency, missing/invalid token, download error, ...).
    """
    try:
        # Imported lazily so a missing pyannote install degrades to None
        # instead of crashing the whole module at import time.
        from pyannote.audio import Pipeline

        # Try to get token from environment if not provided
        token = hf_token or os.environ.get("HF_TOKEN")

        if not token:
            print("⚠️ HF_TOKEN bulunamadı. pyannote modeli yüklenemeyebilir.")

        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            token=token
        )

        # Move to GPU if available
        pipeline.to(DEVICE)

        print("✅ Diarization pipeline yüklendi!")
        return pipeline

    except Exception as e:
        # Fix: use the same ❌ prefix as the module's other error messages
        # (the original printed a stray leading space with no marker).
        print(f"❌ Diarization pipeline yüklenemedi: {e}")
        return None
52
 
53
 
54
def diarize_audio(audio_path: str, pipeline, num_speakers: Optional[int] = None) -> List[Tuple[float, float, str]]:
    """
    Perform speaker diarization on an audio file.

    Args:
        audio_path: Path to the audio file.
        pipeline: Pyannote diarization pipeline (None is tolerated).
        num_speakers: Upper bound on the number of speakers; the pipeline is
            called with min_speakers=1 and max_speakers=num_speakers.
            None lets pyannote auto-detect the speaker count.

    Returns:
        List of (start_time, end_time, speaker_label) tuples, or an empty
        list when no pipeline is available or diarization fails.
    """
    if pipeline is None:
        return []

    try:
        # Run diarization (auto-detect speakers or use the specified bound)
        if num_speakers:
            result = pipeline(audio_path, min_speakers=1, max_speakers=num_speakers)
        else:
            result = pipeline(audio_path)

        segments = []

        # Newer pyannote versions wrap the Annotation in an output object
        # exposing it as `speaker_diarization`; older ones return it directly.
        if hasattr(result, 'speaker_diarization'):
            diarization = result.speaker_diarization
            print("🔍 Using speaker_diarization attribute")
        else:
            diarization = result

        # Flatten the Annotation into (start, end, speaker) tuples.
        for segment, track, speaker in diarization.itertracks(yield_label=True):
            segments.append((segment.start, segment.end, speaker))

        print(f"✅ Diarization tamamlandı: {len(segments)} segment bulundu")
        return segments

    except Exception as e:
        print(f"❌ Diarization hatası: {e}")
        return []
96
 
97
 
98
def format_speaker_label(speaker: str) -> str:
    """
    Convert pyannote speaker labels (SPEAKER_00, SPEAKER_01, ...) to a
    user-friendly format ("Kişi 1", "Kişi 2", ...).

    Generalized from the original hard-coded 4-entry map: any "SPEAKER_NN"
    label maps to "Kişi NN+1" (SPEAKER_00..03 map exactly as before).
    Labels that don't match the pattern are returned unchanged.
    """
    prefix = "SPEAKER_"
    if speaker.startswith(prefix):
        suffix = speaker[len(prefix):]
        if suffix.isdigit():
            # pyannote numbers speakers from 0; humans count from 1.
            return f"Kişi {int(suffix) + 1}"
    return speaker
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
+
111
def format_timestamp(seconds: float) -> str:
    """
    Render a duration in seconds as "HH:MM:SS", or "MM:SS" when the
    duration is under an hour. Fractional seconds are truncated.
    """
    whole = int(seconds)
    hours, remainder = divmod(whole, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"