Spaces:

empirenexus
/

TranscriptWriting

Sleeping

App Files Files Community

jmisak committed on Oct 18, 2025

Commit

b74585c

verified ·

1 Parent(s): 02be25d

Delete audio_transcriber_hf.py

Browse files

Files changed (1) hide show

audio_transcriber_hf.py +0 -104

audio_transcriber_hf.py DELETED Viewed

@@ -1,104 +0,0 @@
-"""
-Audio transcription with speaker diarization
-"""
-from faster_whisper import WhisperModel
-from pyannote.audio import Pipeline
-import torch
-from docx import Document
-import os
-def transcribe_with_diarization(audio_path: str, num_speakers: int = 2) -> str:
-    """
-    Transcribe audio with speaker labels and save the result as a DOCX file.
-    Whisper and the diarization pipeline run on the GPU when torch reports
-    that CUDA is available, and on the CPU otherwise. When the
-    HUGGINGFACE_TOKEN environment variable is unset or empty, diarization is
-    skipped and transcribe_simple() writes the transcript instead.
-    Args:
-        audio_path: Path to audio file (mp3, wav, m4a)
-        num_speakers: Expected number of speakers (default 2 for interviews)
-    Returns:
-        Path to generated DOCX transcript
-    """
-    print("[1/3] Transcribing audio...")
-    # Choose the device at runtime so CPU-only hosts do not crash on model
-    # load; float16 is only used on CUDA, int8 is used on CPU.
-    use_cuda = torch.cuda.is_available()
-    model = WhisperModel(
-        "large-v3",
-        device="cuda" if use_cuda else "cpu",
-        compute_type="float16" if use_cuda else "int8",
-    )
-    # Transcribe with timestamps
-    segments, info = model.transcribe(
-        audio_path,
-        language="en",
-        beam_size=5,
-        word_timestamps=True
-    )
-    # Materialize the result so the segments can be handed to the fallback
-    # or iterated below.
-    segments_list = list(segments)
-    print("[2/3] Identifying speakers...")
-    # Load diarization pipeline
-    # Note: Requires HuggingFace token for pyannote models
-    hf_token = os.getenv("HUGGINGFACE_TOKEN", "")
-    if not hf_token:
-        print("[Warning] No HF token - using simple alternating speakers")
-        return transcribe_simple(segments_list, audio_path)
-    diarization = Pipeline.from_pretrained(
-        "pyannote/speaker-diarization-3.1",
-        use_auth_token=hf_token
-    )
-    if use_cuda:
-        diarization.to(torch.device("cuda"))
-    # Run diarization
-    diarization_result = diarization(audio_path, num_speakers=num_speakers)
-    print("[3/3] Combining transcription + speakers...")
-    # Label each segment with the speaker found at the segment's start time
-    transcript_lines = [
-        f"{get_speaker_at_time(diarization_result, segment.start)}: {segment.text}"
-        for segment in segments_list
-    ]
-    # Save to DOCX
-    doc = Document()
-    doc.add_heading('Interview Transcript', 0)
-    for line in transcript_lines:
-        doc.add_paragraph(line)
-    # Derive the output name from the path stem: whatever the extension (or
-    # its letter case), the result can never equal audio_path, so saving
-    # cannot overwrite the source audio.
-    output_path = os.path.splitext(audio_path)[0] + '_transcript.docx'
-    doc.save(output_path)
-    print(f"✓ Transcript saved: {output_path}")
-    return output_path
-def get_speaker_at_time(diarization_result, timestamp):
-    """Return "Speaker <label>" for the first turn covering timestamp, else "Speaker Unknown"."""
-    tracks = diarization_result.itertracks(yield_label=True)
-    # Both turn boundaries are inclusive; the first covering turn wins.
-    return next(
-        (
-            f"Speaker {label}"
-            for span, _, label in tracks
-            if span.start <= timestamp <= span.end
-        ),
-        "Speaker Unknown",
-    )
-def transcribe_simple(segments_list, audio_path):
-    """
-    Fallback: alternating speakers without diarization.
-    Each segment becomes one "Speaker N: <text>" paragraph in a DOCX file
-    written next to the audio file. The speaker number starts at 1 and
-    flips between 1 and 2 for the following segments.
-    Args:
-        segments_list: Iterable of segments; each must expose .text and may
-            expose .no_speech_prob
-        audio_path: Path of the source audio, used to derive the output name
-    Returns:
-        Path to generated DOCX transcript
-    """
-    doc = Document()
-    doc.add_heading('Interview Transcript', 0)
-    current_speaker = 1
-    for segment in segments_list:
-        doc.add_paragraph(f"Speaker {current_speaker}: {segment.text}")
-        # Simple heuristic: a segment with no_speech_prob above 0.5 hands
-        # the turn to the other speaker, starting with the NEXT segment.
-        # Segments without that attribute never trigger a switch.
-        if hasattr(segment, 'no_speech_prob') and segment.no_speech_prob > 0.5:
-            current_speaker = 3 - current_speaker  # Toggle between 1 and 2
-    # Build the name from the path stem rather than replacing '.mp3' only:
-    # for .wav/.m4a input the old path equalled audio_path, so saving
-    # overwrote the source audio with the DOCX.
-    output_path = os.path.splitext(audio_path)[0] + '_transcript.docx'
-    doc.save(output_path)
-    return output_path