Jedi09 committed on
Commit
be02700
·
verified ·
1 Parent(s): 5652d57

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +299 -92
app.py CHANGED
@@ -1,122 +1,329 @@
1
  """
2
- Speaker Diarization Module
3
- Pyannote-audio ile konuşmacı ayrımı (kim ne zaman konuşuyor).
 
4
  """
5
 
 
 
 
 
6
  import os
7
- from typing import List, Tuple, Optional
8
 
9
# PyTorch 2.6+ compatibility: disable the weights_only restriction so pyannote
# model checkpoints can still be unpickled by torch.load.
# NOTE(review): presumably this must be set before the models are loaded —
# keep it above any pyannote/torch model loading; verify against callers.
os.environ["TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD"] = "1"
 
 
 
 
11
 
12
- import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
# Check for GPU availability once at import time; the diarization pipeline
# below is moved onto this device after loading.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔧 Diarization device: {DEVICE}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
 
19
def get_diarization_pipeline(hf_token: Optional[str] = None):
    """
    Build the pyannote speaker-diarization pipeline.

    Args:
        hf_token: Hugging Face access token; when omitted, the HF_TOKEN
            environment variable is used instead.

    Returns:
        A pipeline moved onto ``DEVICE``, or ``None`` when loading fails
        (pyannote missing, bad/missing token, download error, ...).
    """
    try:
        from pyannote.audio import Pipeline

        # Prefer the explicit argument; otherwise fall back to the environment.
        auth_token = hf_token if hf_token else os.environ.get("HF_TOKEN")
        if not auth_token:
            print("⚠️ HF_TOKEN bulunamadı. pyannote modeli yüklenemeyebilir.")

        dia_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            token=auth_token
        )

        # Run inference on the GPU when one is available.
        dia_pipeline.to(DEVICE)

        print("✅ Diarization pipeline yüklendi!")
        return dia_pipeline
    except Exception as e:
        print(f" Diarization pipeline yüklenemedi: {e}")
        return None
52
 
53
 
54
def diarize_audio(audio_path: str, pipeline, num_speakers: Optional[int] = None) -> List[Tuple[float, float, str]]:
    """
    Perform speaker diarization on an audio file.

    Args:
        audio_path: Path to the audio file.
        pipeline: Pyannote diarization pipeline (may be None).
        num_speakers: Upper bound on the number of speakers; None lets the
            pipeline auto-detect the count.

    Returns:
        List of (start_time, end_time, speaker_label) tuples; empty list when
        the pipeline is unavailable or diarization fails.
    """
    if pipeline is None:
        return []

    try:
        # Auto-detect the speaker count unless an upper bound was given.
        if num_speakers:
            result = pipeline(audio_path, min_speakers=1, max_speakers=num_speakers)
        else:
            result = pipeline(audio_path)

        # Newer pyannote versions return a DiarizeOutput wrapper whose
        # `speaker_diarization` attribute holds the Annotation; older versions
        # return the Annotation directly.
        if hasattr(result, 'speaker_diarization'):
            diarization = result.speaker_diarization
            print("🔍 Using speaker_diarization attribute")
        else:
            diarization = result

        # Flatten the Annotation into plain (start, end, speaker) tuples.
        segments = [
            (segment.start, segment.end, speaker)
            for segment, _track, speaker in diarization.itertracks(yield_label=True)
        ]

        print(f"✅ Diarization tamamlandı: {len(segments)} segment bulundu")
        return segments

    except Exception as e:
        print(f"❌ Diarization hatası: {e}")
        return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
def format_speaker_label(speaker: str) -> str:
    """
    Convert a pyannote speaker label ("SPEAKER_00", "SPEAKER_01", ...) to a
    user-friendly Turkish label ("Kişi 1", "Kişi 2", ...).

    Generalized from the previous hard-coded four-entry map: any
    "SPEAKER_<digits>" label is handled; anything else is returned unchanged.
    """
    prefix = "SPEAKER_"
    if speaker.startswith(prefix):
        index = speaker[len(prefix):]
        if index.isdigit():
            # Pyannote labels are zero-based; humans count from one.
            return f"Kişi {int(index) + 1}"
    return speaker
109
 
110
 
111
def format_timestamp(seconds: float) -> str:
    """Render a duration in seconds as HH:MM:SS, or MM:SS when under an hour."""
    hours_f, remainder = divmod(seconds, 3600)
    minutes_f, secs_f = divmod(remainder, 60)
    hours, minutes, secs = int(hours_f), int(minutes_f), int(secs_f)

    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Danışman-Danışan Transkripsiyon Sistemi
3
+ Speaker diarization + transcription pipeline.
4
+ Zaman damgalı, konuşmacı ayrımlı çıktı.
5
  """
6
 
7
+ import gradio as gr
8
+ from faster_whisper import WhisperModel
9
+ import tempfile
10
+ import time
11
  import os
12
+ import torch
13
 
14
+ from diarization import (
15
+ get_diarization_pipeline,
16
+ diarize_audio,
17
+ format_speaker_label,
18
+ format_timestamp
19
+ )
20
 
21
# ==================== CONFIGURATION ====================
MODEL_SIZE = "small"  # Changed to small for HF Spaces memory limits
# Prefer CUDA when available; faster-whisper also runs on CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# float16 on GPU for speed; int8 quantization on CPU to reduce memory.
COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "int8"
# =======================================================

print(f"🔧 Device: {DEVICE}, Compute: {COMPUTE_TYPE}")

# Load both models once at startup (import time) so every request reuses them.
print("🔄 Whisper model yükleniyor...")
whisper_model = WhisperModel(
    MODEL_SIZE,
    device=DEVICE,
    compute_type=COMPUTE_TYPE
)
print("✅ Whisper model yüklendi!")

# May come back as None (e.g. missing HF_TOKEN); the transcription pipeline
# below falls back to plain transcription in that case.
print("🔄 Diarization pipeline yükleniyor...")
diarization_pipeline = get_diarization_pipeline()
40
+
41
+
42
def get_audio_duration(audio_path: str) -> float:
    """
    Return the duration of an audio file in seconds using ffprobe.

    Args:
        audio_path: Path to the audio file on disk.

    Returns:
        Duration in seconds, or 0.0 when ffprobe is missing, exits with an
        error, or emits non-numeric output.
    """
    import subprocess
    try:
        result = subprocess.run([
            'ffprobe', '-v', 'error',
            '-show_entries', 'format=duration',
            '-of', 'default=noprint_wrappers=1:nokey=1',
            audio_path
        ], capture_output=True, text=True, check=True)
        return float(result.stdout.strip())
    except (subprocess.CalledProcessError, ValueError, OSError):
        # The previous bare `except:` swallowed everything, including
        # KeyboardInterrupt/SystemExit. Catch only the expected failures:
        # ffprobe not installed (OSError), ffprobe failing (CalledProcessError),
        # or unparseable stdout (ValueError) — and report "unknown duration".
        return 0.0
55
 
56
 
57
def transcribe_segment(audio_path: str, start: float, end: float) -> str:
    """
    Return the transcription text covering the [start, end] window of a file.

    faster-whisper cannot transcribe a sub-range directly, so the whole file
    is transcribed and only the segments overlapping the window are kept.
    On any failure a bracketed Turkish error string is returned instead of
    raising.
    """
    try:
        segments, _ = whisper_model.transcribe(
            audio_path,
            language="tr",
            beam_size=5
        )

        # A segment belongs to the window when the two intervals overlap.
        overlapping = [
            seg.text for seg in segments
            if seg.end > start and seg.start < end
        ]
        return " ".join(overlapping).strip()
    except Exception as e:
        return f"[Transkripsiyon hatası: {e}]"
 
80
 
81
 
82
def transcribe_with_diarization(audio_path: str) -> tuple:
    """
    Full pipeline: diarization + transcription.

    Runs pyannote diarization and faster-whisper transcription on the file,
    then merges them by assigning each whisper segment (by its midpoint) to
    the diarization turn it falls into.

    Returns:
        (formatted transcript text, path to a downloadable .txt file or None).
        When the diarization pipeline is unavailable, falls back to a plain
        timestamped transcript with no file attachment.
    """
    start_time = time.time()

    # Get audio duration for stats
    # NOTE(review): `duration` is never used below — the stats use
    # `info.duration` from whisper instead; confirm and consider removing.
    duration = get_audio_duration(audio_path)

    # Step 1: Diarization
    print("🎭 Diarization başlıyor...")
    if diarization_pipeline is None:
        # Fallback: no diarization, just transcribe with per-line timestamps.
        segments, info = whisper_model.transcribe(audio_path, language="tr", beam_size=5)

        full_text = []
        for segment in segments:
            timestamp = format_timestamp(segment.start)
            full_text.append(f"[{timestamp}] {segment.text}")

        result = "\n".join(full_text)
        elapsed = time.time() - start_time

        stats = f"""
───────────────────────────────────
📊 İstatistikler
• Toplam süre: {format_timestamp(info.duration)}
• İşlem süresi: {elapsed:.1f} saniye
• ⚠️ Diarization kullanılamadı (yalnızca transkripsiyon)
───────────────────────────────────"""

        # No downloadable file in the fallback path.
        return result + stats, None

    # Run diarization (capped at 2 speakers: counselor + client use case).
    diarization_segments = diarize_audio(audio_path, diarization_pipeline, num_speakers=2)

    if not diarization_segments:
        return "❌ Diarization başarısız oldu.", None

    # Step 2: Transcribe the whole file once.
    print("🎙️ Transkripsiyon başlıyor...")
    segments, info = whisper_model.transcribe(audio_path, language="tr", beam_size=5)
    whisper_segments = list(segments)  # Convert generator to list

    # Track which whisper segments have been used, so a segment whose midpoint
    # falls into overlapping diarization turns is only emitted once.
    used_whisper_indices = set()

    # Step 3: Merge diarization with transcription
    print("🔗 Birleştirme yapılıyor...")
    transcript_parts = []
    speaker_times = {}  # speaker label -> cumulative speaking time (seconds)

    for start, end, speaker in diarization_segments:
        speaker_label = format_speaker_label(speaker)

        # Track speaker time
        if speaker_label not in speaker_times:
            speaker_times[speaker_label] = 0
        speaker_times[speaker_label] += (end - start)

        # Find whisper segments that overlap with this diarization segment.
        # Only use segments that haven't been used before.
        segment_text = []
        for idx, ws in enumerate(whisper_segments):
            if idx in used_whisper_indices:
                continue
            # Assign by midpoint: the whisper segment belongs to the turn
            # containing the middle of its time span.
            ws_midpoint = (ws.start + ws.end) / 2
            if start <= ws_midpoint <= end:
                segment_text.append(ws.text)
                used_whisper_indices.add(idx)

        # Turns with no transcribed text (silence, noise) are dropped.
        if segment_text:
            text = " ".join(segment_text).strip()
            timestamp_start = format_timestamp(start)
            timestamp_end = format_timestamp(end)
            transcript_parts.append(f"[{timestamp_start} → {timestamp_end}] {speaker_label}:\n{text}\n")

    # Build final output
    header = """═══════════════════════════════════════════════════
📋 GÖRÜŞME TRANSKRİPTİ
═══════════════════════════════════════════════════

"""

    body = "\n".join(transcript_parts)

    # Statistics block appended after the transcript body.
    elapsed = time.time() - start_time
    total_time = info.duration

    stats_lines = [
        "",
        "───────────────────────────────────",
        "📊 İstatistikler",
        f"• Toplam süre: {format_timestamp(total_time)}",
        f"• İşlem süresi: {elapsed:.1f} saniye",
    ]

    # Per-speaker speaking time with percentage of the whole recording.
    for speaker, stime in sorted(speaker_times.items()):
        percentage = (stime / total_time) * 100 if total_time > 0 else 0
        stats_lines.append(f"• {speaker} konuşma: {format_timestamp(stime)} (%{percentage:.0f})")

    stats_lines.append("───────────────────────────────────")
    stats = "\n".join(stats_lines)

    full_result = header + body + stats

    # Create downloadable file (delete=False: Gradio serves it after return).
    txt_file = tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.txt',
        delete=False,
        encoding='utf-8'
    )
    txt_file.write(full_result)
    txt_file.close()

    return full_result, txt_file.name
202
 
203
+
204
def process_audio(audio_path):
    """
    Gradio click handler.

    Validates that an upload is present, then delegates to the full
    diarization + transcription pipeline. Any unexpected failure is turned
    into a user-facing error string rather than a traceback.
    """
    if audio_path is None:
        return "⚠️ Lütfen bir ses dosyası yükleyin.", None

    try:
        result = transcribe_with_diarization(audio_path)
    except Exception as e:
        return f"❌ Beklenmeyen hata: {str(e)}", None
    return result
 
213
 
214
 
215
# ==================== GRADIO UI ====================
# Single-page interface: upload column on the left-hand row, results below,
# static feature/privacy/footer panels, and one click handler.
with gr.Blocks(title="Görüşme Transkripsiyon") as demo:

    # Page banner; the <style> block also hides the default Gradio footer
    # and constrains the overall container width.
    gr.HTML("""
    <style>
    footer { display: none !important; }
    .gradio-container { max-width: 900px !important; margin: auto !important; }
    </style>
    <div style="text-align: center; padding: 40px 20px 30px;
                background: linear-gradient(135deg, #1e3a5f 0%, #2d5a87 100%);
                border-radius: 20px; margin-bottom: 24px; color: white;">
        <h1 style="font-size: 2.2rem; font-weight: 700; margin: 0 0 8px 0;">
            🎙️ Görüşme Transkripsiyon Sistemi
        </h1>
        <p style="font-size: 1rem; opacity: 0.95; margin: 0;">
            Danışman-Danışan görüşmelerini zaman damgalı ve konuşmacı ayrımlı olarak yazıya dökün
        </p>
    </div>
    """)

    # --- Input section: audio upload/record + start button ---
    with gr.Row():
        with gr.Column():
            gr.HTML('<div style="font-weight: 600; margin-bottom: 12px;">📤 Ses Dosyası</div>')

            # filepath type: the handler receives a path on disk, not raw audio.
            audio_input = gr.Audio(
                label="Görüşme Kaydı",
                type="filepath",
                sources=["upload", "microphone"]
            )

            submit_btn = gr.Button(
                "🚀 Transkripsiyon Başlat",
                variant="primary",
                size="lg"
            )

            # Info box
            gr.HTML("""
            <div style="background: linear-gradient(135deg, #f0f9ff 0%, #e0f2fe 100%);
                        border: 1px solid #7dd3fc; border-radius: 12px;
                        padding: 16px 20px; margin-top: 16px;">
                <p style="margin: 0; color: #0369a1; font-size: 14px;">
                    ℹ️ <strong>Nasıl Çalışır:</strong><br>
                    1. Ses dosyasını yükleyin (MP3, WAV, M4A)<br>
                    2. AI otomatik olarak konuşmacıları ayırır<br>
                    3. Zaman damgalı transkript oluşturulur
                </p>
            </div>
            """)

    # --- Output section: transcript text + downloadable .txt ---
    with gr.Row():
        with gr.Column():
            gr.HTML('<div style="font-weight: 600; margin-bottom: 12px;">📝 Transkript Sonucu</div>')

            output_text = gr.Textbox(
                label="",
                placeholder="Transkript burada görünecek...",
                lines=20,
                interactive=False
            )

            download_file = gr.File(
                label="📥 Transkripti İndir (.txt)"
            )

    # Features (static 4-tile grid)
    gr.HTML("""
    <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin-top: 24px;">
        <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
            <div style="font-size: 24px; margin-bottom: 6px;">🎭</div>
            <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Konuşmacı Ayrımı</div>
        </div>
        <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
            <div style="font-size: 24px; margin-bottom: 6px;">⏱️</div>
            <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Zaman Damgası</div>
        </div>
        <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
            <div style="font-size: 24px; margin-bottom: 6px;">🔒</div>
            <div style="font-size: 12px; color: #6b7280; font-weight: 500;">%100 Local</div>
        </div>
        <div style="text-align: center; padding: 16px; background: #f9fafb; border-radius: 12px;">
            <div style="font-size: 24px; margin-bottom: 6px;">🇹🇷</div>
            <div style="font-size: 12px; color: #6b7280; font-weight: 500;">Türkçe Optimizeli</div>
        </div>
    </div>
    """)

    # Privacy notice
    gr.HTML("""
    <div style="background: #ecfdf5; border: 1px solid #6ee7b7; border-radius: 8px;
                padding: 12px 16px; margin-top: 16px;">
        <p style="margin: 0; color: #047857; font-size: 13px;">
            🔒 <strong>Gizlilik:</strong> Tüm işlemler yerel olarak yapılır.
            Ses dosyalarınız hiçbir sunucuya gönderilmez.
        </p>
    </div>
    """)

    # Footer
    gr.HTML("""
    <div style="text-align: center; padding: 24px 0; color: #9ca3af; font-size: 13px;">
        <p>Powered by Faster-Whisper & Pyannote-Audio • GPU & CPU Destekli</p>
    </div>
    """)

    # Event handling: one click -> (transcript text, downloadable file).
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[output_text, download_file]
    )

# Launch only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch(share=False, show_error=True)