jmisak committed on
Commit
b74585c
·
verified ·
1 Parent(s): 02be25d

Delete audio_transcriber_hf.py

Browse files
Files changed (1) hide show
  1. audio_transcriber_hf.py +0 -104
audio_transcriber_hf.py DELETED
@@ -1,104 +0,0 @@
1
- """
2
- Audio transcription with speaker diarization
3
- """
4
- from faster_whisper import WhisperModel
5
- from pyannote.audio import Pipeline
6
- import torch
7
- from docx import Document
8
- import os
9
-
10
def transcribe_with_diarization(audio_path: str, num_speakers: int = 2) -> str:
    """
    Transcribe audio with speaker labels and save the result as a DOCX file.

    The audio is transcribed with the "large-v3" Whisper model (English),
    then a pyannote diarization pipeline is run and each transcribed segment
    is labelled with the speaker found at the segment's start time.

    If the HUGGINGFACE_TOKEN environment variable is unset or empty, the
    diarization step is skipped and transcribe_simple() produces the
    transcript instead.

    Args:
        audio_path: Path to audio file (mp3, wav, m4a)
        num_speakers: Expected number of speakers (default 2 for interviews)

    Returns:
        Path to generated DOCX transcript
    """
    print("[1/3] Transcribing audio...")

    # Choose the device the same way the diarization step below does, so the
    # function does not crash on hosts without a GPU. float16 is only usable
    # on CUDA; int8 is the CPU-friendly compute type.
    use_cuda = torch.cuda.is_available()
    model = WhisperModel(
        "large-v3",
        device="cuda" if use_cuda else "cpu",
        compute_type="float16" if use_cuda else "int8",
    )

    # Transcribe with timestamps
    segments, info = model.transcribe(
        audio_path,
        language="en",
        beam_size=5,
        word_timestamps=True,
    )

    # Materialize the segments so they can be reused by either code path below
    segments_list = list(segments)
    print("[2/3] Identifying speakers...")

    # Load diarization pipeline
    # Note: Requires HuggingFace token for pyannote models
    hf_token = os.getenv("HUGGINGFACE_TOKEN", "")
    if not hf_token:
        print("[Warning] No HF token - using simple alternating speakers")
        return transcribe_simple(segments_list, audio_path)

    diarization = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=hf_token,
    )

    if use_cuda:
        diarization.to(torch.device("cuda"))

    # Run diarization
    diarization_result = diarization(audio_path, num_speakers=num_speakers)

    print("[3/3] Combining transcription + speakers...")

    # Match segments to speakers: each segment is attributed to whoever is
    # speaking at the moment the segment starts.
    transcript_lines = [
        f"{get_speaker_at_time(diarization_result, segment.start)}: {segment.text}"
        for segment in segments_list
    ]

    # Save to DOCX
    doc = Document()
    doc.add_heading('Interview Transcript', 0)

    for line in transcript_lines:
        doc.add_paragraph(line)

    # Derive the output name from the path stem rather than replacing known
    # extensions: an unrecognised extension would otherwise leave the path
    # unchanged and the DOCX would overwrite the source audio file.
    stem, _ext = os.path.splitext(audio_path)
    output_path = f"{stem}_transcript.docx"
    doc.save(output_path)

    print(f"✓ Transcript saved: {output_path}")
    return output_path
80
-
81
-
82
def get_speaker_at_time(diarization_result, timestamp):
    """
    Look up the speaker label active at ``timestamp``.

    Walks the tracks yielded by ``diarization_result.itertracks(yield_label=True)``
    in order and returns ``"Speaker <label>"`` for the first turn whose
    [start, end] interval (both ends inclusive) contains ``timestamp``.
    Returns ``"Speaker Unknown"`` when no turn contains it.
    """
    tracks = diarization_result.itertracks(yield_label=True)
    labelled = (
        f"Speaker {label}"
        for span, _track, label in tracks
        if span.start <= timestamp <= span.end
    )
    # First hit wins; the default covers timestamps that fall in a gap.
    return next(labelled, "Speaker Unknown")
88
-
89
-
90
def transcribe_simple(segments_list, audio_path):
    """
    Fallback: alternating speakers without diarization.

    Writes one paragraph per segment, labelled "Speaker 1" or "Speaker 2".
    The label switches for the following segment whenever the current
    segment exposes a ``no_speech_prob`` attribute greater than 0.5.

    Args:
        segments_list: Iterable of transcribed segments; each must have a
            ``text`` attribute and may have ``no_speech_prob``.
        audio_path: Path to the source audio file; used only to derive the
            output file name.

    Returns:
        Path to generated DOCX transcript
    """
    doc = Document()
    doc.add_heading('Interview Transcript', 0)

    current_speaker = 1
    for segment in segments_list:
        doc.add_paragraph(f"Speaker {current_speaker}: {segment.text}")
        # Simple heuristic: a segment the model considers likely non-speech
        # is treated as a turn boundary, so the next segment gets the other
        # speaker label.
        if hasattr(segment, 'no_speech_prob') and segment.no_speech_prob > 0.5:
            current_speaker = 3 - current_speaker  # Toggle between 1 and 2

    # Build the name from the path stem: replacing only '.mp3' left other
    # inputs (.wav, .m4a, ...) unchanged, so the DOCX was written over the
    # original audio file.
    stem, _ext = os.path.splitext(audio_path)
    output_path = f"{stem}_transcript.docx"
    doc.save(output_path)
    return output_path