Jedi09 commited on
Commit
5652d57
·
verified ·
1 Parent(s): 179896d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -291
app.py CHANGED
@@ -1,321 +1,122 @@
1
  """
2
- Danışman-Danışan Transkripsiyon Sistemi
3
- Speaker diarization + transcription pipeline.
4
- Zaman damgalı, konuşmacı ayrımlı çıktı.
5
  """
6
 
7
- import gradio as gr
8
- from faster_whisper import WhisperModel
9
- import tempfile
10
- import time
11
  import os
12
- import torch
13
-
14
- from diarization import (
15
- get_diarization_pipeline,
16
- diarize_audio,
17
- format_speaker_label,
18
- format_timestamp
19
- )
20
-
21
- # ==================== CONFIGURATION ====================
22
- MODEL_SIZE = "medium" # Options: tiny, base, small, medium, large-v3
23
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
24
- COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "int8"
25
- # =======================================================
26
-
27
- print(f"🔧 Device: {DEVICE}, Compute: {COMPUTE_TYPE}")
28
 
29
- # Load models at startup
30
- print("🔄 Whisper model yükleniyor...")
31
- whisper_model = WhisperModel(
32
- MODEL_SIZE,
33
- device=DEVICE,
34
- compute_type=COMPUTE_TYPE
35
- )
36
- print("✅ Whisper model yüklendi!")
37
-
38
- print("🔄 Diarization pipeline yükleniyor...")
39
- diarization_pipeline = get_diarization_pipeline()
40
 
 
41
 
42
- def get_audio_duration(audio_path: str) -> float:
43
- """Get audio duration in seconds using ffprobe."""
44
- import subprocess
45
- try:
46
- result = subprocess.run([
47
- 'ffprobe', '-v', 'error',
48
- '-show_entries', 'format=duration',
49
- '-of', 'default=noprint_wrappers=1:nokey=1',
50
- audio_path
51
- ], capture_output=True, text=True, check=True)
52
- return float(result.stdout.strip())
53
- except:
54
- return 0.0
55
 
56
 
57
- def transcribe_segment(audio_path: str, start: float, end: float) -> str:
58
  """
59
- Transcribe a specific segment of audio.
 
 
 
 
 
 
60
  """
61
  try:
62
- # Faster-whisper doesn't support segment extraction directly,
63
- # so we transcribe the whole file and filter by timestamp
64
- segments, _ = whisper_model.transcribe(
65
- audio_path,
66
- language="tr",
67
- beam_size=5
 
 
 
 
 
68
  )
69
 
70
- # Collect text from segments that fall within our time range
71
- text_parts = []
72
- for segment in segments:
73
- # Check if segment overlaps with our range
74
- if segment.end > start and segment.start < end:
75
- text_parts.append(segment.text)
76
 
77
- return " ".join(text_parts).strip()
78
  except Exception as e:
79
- return f"[Transkripsiyon hatası: {e}]"
 
80
 
81
 
82
- def transcribe_with_diarization(audio_path: str) -> tuple:
83
  """
84
- Full pipeline: diarization + transcription.
85
- Returns formatted transcript with speaker labels and timestamps.
86
- """
87
- start_time = time.time()
88
 
89
- # Get audio duration for stats
90
- duration = get_audio_duration(audio_path)
 
 
 
 
 
 
 
 
91
 
92
- # Step 1: Diarization
93
- print("🎭 Diarization başlıyor...")
94
- if diarization_pipeline is None:
95
- # Fallback: no diarization, just transcribe
96
- segments, info = whisper_model.transcribe(audio_path, language="tr", beam_size=5)
 
97
 
98
- full_text = []
99
- for segment in segments:
100
- timestamp = format_timestamp(segment.start)
101
- full_text.append(f"[{timestamp}] {segment.text}")
102
 
103
- result = "\n".join(full_text)
104
- elapsed = time.time() - start_time
 
 
 
 
105
 
106
- stats = f"""
107
- ───────────────────────────────────
108
- 📊 İstatistikler
109
- • Toplam süre: {format_timestamp(info.duration)}
110
- • İşlem süresi: {elapsed:.1f} saniye
111
- • ⚠️ Diarization kullanılamadı (yalnızca transkripsiyon)
112
- ───────────────────────────────────"""
113
 
114
- return result + stats, None
115
-
116
- # Run diarization
117
- diarization_segments = diarize_audio(audio_path, diarization_pipeline, num_speakers=2)
118
-
119
- if not diarization_segments:
120
- return "❌ Diarization başarısız oldu.", None
121
-
122
- # Step 2: Transcribe each segment
123
- print("🎙️ Transkripsiyon başlıyor...")
124
- segments, info = whisper_model.transcribe(audio_path, language="tr", beam_size=5)
125
- whisper_segments = list(segments) # Convert generator to list
126
-
127
- # Step 3: Merge diarization with transcription
128
- print("🔗 Birleştirme yapılıyor...")
129
- transcript_parts = []
130
- speaker_times = {}
131
-
132
- for start, end, speaker in diarization_segments:
133
- speaker_label = format_speaker_label(speaker)
134
-
135
- # Track speaker time
136
- if speaker_label not in speaker_times:
137
- speaker_times[speaker_label] = 0
138
- speaker_times[speaker_label] += (end - start)
139
 
140
- # Find whisper segments that overlap with this diarization segment
141
- segment_text = []
142
- for ws in whisper_segments:
143
- # Check overlap
144
- if ws.end > start and ws.start < end:
145
- segment_text.append(ws.text)
146
-
147
- if segment_text:
148
- text = " ".join(segment_text).strip()
149
- timestamp_start = format_timestamp(start)
150
- timestamp_end = format_timestamp(end)
151
- transcript_parts.append(f"[{timestamp_start} → {timestamp_end}] {speaker_label}:\n{text}\n")
152
-
153
- # Build final output
154
- header = """═══════════════════════════════════════════════════
155
- 📋 GÖRÜŞME TRANSKRİPTİ
156
- ═══════════════════════════════════════════════════
157
-
158
- """
159
-
160
- body = "\n".join(transcript_parts)
161
-
162
- # Statistics
163
- elapsed = time.time() - start_time
164
- total_time = info.duration
165
-
166
- stats_lines = [
167
- "",
168
- "───────────────────────────────────",
169
- "📊 İstatistikler",
170
- f"• Toplam süre: {format_timestamp(total_time)}",
171
- f"• İşlem süresi: {elapsed:.1f} saniye",
172
- ]
173
-
174
- for speaker, stime in sorted(speaker_times.items()):
175
- percentage = (stime / total_time) * 100 if total_time > 0 else 0
176
- stats_lines.append(f"• {speaker} konuşma: {format_timestamp(stime)} (%{percentage:.0f})")
177
-
178
- stats_lines.append("───────────────────────────────────")
179
- stats = "\n".join(stats_lines)
180
-
181
- full_result = header + body + stats
182
-
183
- # Create downloadable file
184
- txt_file = tempfile.NamedTemporaryFile(
185
- mode='w',
186
- suffix='.txt',
187
- delete=False,
188
- encoding='utf-8'
189
- )
190
- txt_file.write(full_result)
191
- txt_file.close()
192
-
193
- return full_result, txt_file.name
194
-
195
-
196
- def process_audio(audio_path):
197
- """Gradio handler."""
198
- if audio_path is None:
199
- return "⚠️ Lütfen bir ses dosyası yükleyin.", None
200
-
201
- try:
202
- return transcribe_with_diarization(audio_path)
203
  except Exception as e:
204
- return f"❌ Beklenmeyen hata: {str(e)}", None
 
205
 
206
 
207
- # ==================== GRADIO UI ====================
208
- with gr.Blocks(title="Görüşme Transkripsiyon") as demo:
209
-
210
- gr.HTML("""
211
- <style>
212
- footer { display: none !important; }
213
- .gradio-container { max-width: 900px !important; margin: auto !important; }
214
- </style>
215
- <div style="text-align: center; padding: 40px 20px 30px;
216
- background: linear-gradient(135deg, #1e3a5f 0%, #2d5a87 100%);
217
- border-radius: 20px; margin-bottom: 24px; color: white;">
218
- <h1 style="font-size: 2.2rem; font-weight: 700; margin: 0 0 8px 0;">
219
- 🎙️ Görüşme Transkripsiyon Sistemi
220
- </h1>
221
- <p style="font-size: 1rem; opacity: 0.95; margin: 0;">
222
- Danışman-Danışan görüşmelerini zaman damgalı ve konuşmacı ayrımlı olarak yazıya dökün
223
- </p>
224
- </div>
225
- """)
226
-
227
- with gr.Row():
228
- with gr.Column():
229
- gr.HTML('<div style="font-weight: 600; margin-bottom: 12px;">📤 Ses Dosyası</div>')
230
-
231
- audio_input = gr.Audio(
232
- label="Görüşme Kaydı",
233
- type="filepath",
234
- sources=["upload", "microphone"]
235
- )
236
-
237
- submit_btn = gr.Button(
238
- "🚀 Transkripsiyon Başlat",
239
- variant="primary",
240
- size="lg"
241
- )
242
-
243
- # Info box
244
- gr.HTML("""
245
- <div style="background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%);
246
- border: 1px solid #7dd3fc; border-radius: 12px;
247
- padding: 16px 20px; margin-top: 16px;">
248
- <p style="margin: 0; color: #0369a1; font-size: 14px;">
249
- ℹ️ <strong>Nasıl Çalışır:</strong><br>
250
- 1. Ses dosyasını yükleyin (MP3, WAV, M4A)<br>
251
- 2. AI otomatik olarak konuşmacıları ayırır<br>
252
- 3. Zaman damgalı transkript oluşturulur
253
- </p>
254
- </div>
255
- """)
256
-
257
- with gr.Row():
258
- with gr.Column():
259
- gr.HTML('<div style="font-weight: 600; margin-bottom: 12px;">📝 Transkript Sonucu</div>')
260
-
261
- output_text = gr.Textbox(
262
- label="",
263
- placeholder="Transkript burada görünecek...",
264
- lines=20,
265
- interactive=False
266
- )
267
-
268
- download_file = gr.File(
269
- label="📥 Transkripti İndir (.txt)"
270
- )
271
-
272
- # Features
273
- gr.HTML("""
274
- <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin-top: 24px;">
275
- <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
276
- <div style="font-size: 24px; margin-bottom: 6px;">🎭</div>
277
- <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Konuşmacı Ayrımı</div>
278
- </div>
279
- <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
280
- <div style="font-size: 24px; margin-bottom: 6px;">⏱️</div>
281
- <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Zaman Damgası</div>
282
- </div>
283
- <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
284
- <div style="font-size: 24px; margin-bottom: 6px;">🔒</div>
285
- <div style="font-size: 12px; color: #6b7280; font-weight: 500;">%100 Local</div>
286
- </div>
287
- <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
288
- <div style="font-size: 24px; margin-bottom: 6px;">🇹🇷</div>
289
- <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Türkçe Optimizeli</div>
290
- </div>
291
- </div>
292
- """)
293
-
294
- # Privacy notice
295
- gr.HTML("""
296
- <div style="background: #ecfdf5; border: 1px solid #6ee7b7; border-radius: 8px;
297
- padding: 12px 16px; margin-top: 16px;">
298
- <p style="margin: 0; color: #047857; font-size: 13px;">
299
- 🔒 <strong>Gizlilik:</strong> Tüm işlemler yerel olarak yapılır.
300
- Ses dosyalarınız hiçbir sunucuya gönderilmez.
301
- </p>
302
- </div>
303
- """)
304
-
305
- # Footer
306
- gr.HTML("""
307
- <div style="text-align: center; padding: 24px 0; color: #9ca3af; font-size: 13px;">
308
- <p>Powered by Faster-Whisper & Pyannote-Audio • GPU & CPU Destekli</p>
309
- </div>
310
- """)
311
-
312
- # Event handling
313
- submit_btn.click(
314
- fn=process_audio,
315
- inputs=[audio_input],
316
- outputs=[output_text, download_file]
317
- )
318
 
319
- # Launch
320
- if __name__ == "__main__":
321
- demo.launch(share=False, show_error=True)
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Speaker Diarization Module
3
+ Pyannote-audio ile konuşmacı ayrımı (kim ne zaman konuşuyor).
 
4
  """
5
 
 
 
 
 
6
  import os
7
+ from typing import List, Tuple, Optional
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ # PyTorch 2.6+ compatibility: Disable weights_only restriction for pyannote models
10
+ os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
 
 
 
 
 
 
 
 
 
11
 
12
+ import torch
13
 
14
+ # Check for GPU availability
15
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
+ print(f"🔧 Diarization device: {DEVICE}")
 
 
 
 
 
 
 
 
 
 
17
 
18
 
19
def get_diarization_pipeline(hf_token: Optional[str] = None):
    """
    Load the pyannote speaker diarization pipeline.

    Args:
        hf_token: Hugging Face token (required for gated pyannote models).
                  Falls back to the HF_TOKEN environment variable when omitted.

    Returns:
        The diarization pipeline moved onto DEVICE, or None if loading failed
        (missing dependency, missing/invalid token, download error, ...).
    """
    try:
        # Imported lazily so a missing pyannote install degrades to None
        # instead of crashing the whole module at import time.
        from pyannote.audio import Pipeline

        # Try to get token from environment if not provided
        token = hf_token or os.environ.get("HF_TOKEN")

        if not token:
            print("⚠️ HF_TOKEN bulunamadı. pyannote modeli yüklenemeyebilir.")

        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            token=token
        )

        # Move to GPU if available
        pipeline.to(DEVICE)

        print("✅ Diarization pipeline yüklendi!")
        return pipeline

    except Exception as e:
        # Fix: use the same ❌ prefix as the module's other error messages
        # (the original printed a stray leading space with no marker).
        print(f"❌ Diarization pipeline yüklenemedi: {e}")
        return None
52
 
53
 
54
def diarize_audio(audio_path: str, pipeline, num_speakers: Optional[int] = None) -> List[Tuple[float, float, str]]:
    """
    Perform speaker diarization on an audio file.

    Args:
        audio_path: Path to the audio file.
        pipeline: Pyannote diarization pipeline (None is tolerated).
        num_speakers: Upper bound on the number of speakers; the pipeline is
            called with min_speakers=1 and max_speakers=num_speakers.
            None lets pyannote auto-detect the speaker count.

    Returns:
        List of (start_time, end_time, speaker_label) tuples, or an empty
        list when no pipeline is available or diarization fails.
    """
    if pipeline is None:
        return []

    try:
        # Run diarization (auto-detect speakers or use the specified bound)
        if num_speakers:
            result = pipeline(audio_path, min_speakers=1, max_speakers=num_speakers)
        else:
            result = pipeline(audio_path)

        segments = []

        # Newer pyannote versions wrap the Annotation in an output object
        # exposing it as `speaker_diarization`; older ones return it directly.
        if hasattr(result, 'speaker_diarization'):
            diarization = result.speaker_diarization
            print("🔍 Using speaker_diarization attribute")
        else:
            diarization = result

        # Flatten the Annotation into (start, end, speaker) tuples.
        for segment, track, speaker in diarization.itertracks(yield_label=True):
            segments.append((segment.start, segment.end, speaker))

        print(f"✅ Diarization tamamlandı: {len(segments)} segment bulundu")
        return segments

    except Exception as e:
        print(f"❌ Diarization hatası: {e}")
        return []
96
 
97
 
98
def format_speaker_label(speaker: str) -> str:
    """
    Convert pyannote speaker labels (SPEAKER_00, SPEAKER_01, ...) to a
    user-friendly format ("Kişi 1", "Kişi 2", ...).

    Generalized from the original hard-coded 4-entry map: any "SPEAKER_NN"
    label maps to "Kişi NN+1" (SPEAKER_00..03 map exactly as before).
    Labels that don't match the pattern are returned unchanged.
    """
    prefix = "SPEAKER_"
    if speaker.startswith(prefix):
        suffix = speaker[len(prefix):]
        if suffix.isdigit():
            # pyannote numbers speakers from 0; humans count from 1.
            return f"Kişi {int(suffix) + 1}"
    return speaker
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
+
111
def format_timestamp(seconds: float) -> str:
    """
    Render a duration in seconds as "HH:MM:SS", or "MM:SS" when the
    duration is under an hour. Fractional seconds are truncated.
    """
    whole = int(seconds)
    hours, remainder = divmod(whole, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"