jmisak committed on
Commit
02be25d
·
verified ·
1 Parent(s): 92752f3

Delete audio_transcriber.py

Browse files
Files changed (1) hide show
  1. audio_transcriber.py +0 -100
audio_transcriber.py DELETED
@@ -1,100 +0,0 @@
1
- from faster_whisper import WhisperModel
2
- from speechbrain.inference import EncoderClassifier
3
- from sklearn.cluster import AgglomerativeClustering
4
- from docx import Document
5
- import torch, torchaudio, numpy as np
6
-
7
def transcribe_with_diarization_streaming(audio_path: str, num_speakers: int = 1) -> str:
    """Transcribe an audio file, label speakers, and save a .docx transcript.

    The audio is transcribed with faster-whisper; each segment is printed as
    soon as the transcription generator yields it. When more than one speaker
    is requested, a speaker embedding is computed per segment and the
    embeddings are grouped with agglomerative clustering. Otherwise every
    segment is attributed to a single speaker and no speaker model is loaded.

    Args:
        audio_path: Path to the audio file to transcribe.
        num_speakers: Expected number of speakers. Values <= 1 skip
            diarization. Values larger than the number of usable segments
            are reduced to that number.

    Returns:
        Path of the written transcript: ``audio_path`` with its extension
        replaced by ``_transcript.docx``.
    """
    import os  # local import: only needed to derive the output path

    # Device selection: use CUDA only if a tiny allocation actually succeeds,
    # otherwise fall back to CPU with int8 weights.
    device, compute_type = "cpu", "int8"
    try:
        if not torch.cuda.is_available():
            raise RuntimeError("No CUDA")
        _ = torch.zeros(1).to("cuda")  # sanity check
        device, compute_type = "cuda", "float16"
    except Exception:
        # Deliberate best-effort: any CUDA failure means "run on CPU".
        print("⚠️ CUDA not usable, falling back to CPU")

    # Report the device only after the fallback decision has been made.
    print(f"[1/3] Loading Whisper model on {device}...")
    whisper_model = WhisperModel("large-v3", device=device, compute_type=compute_type)

    print("[2/3] Transcribing...")
    # transcribe() returns a lazy generator; segments are decoded as we iterate.
    segments, info = whisper_model.transcribe(
        audio_path,
        language="en",
        beam_size=5,
        word_timestamps=True,
        vad_filter=True,
    )

    segments_list = []
    for seg in segments:
        print(f"[stream] {seg.start:.2f}-{seg.end:.2f}: {seg.text}")
        segments_list.append(seg)

    # Default outcome: every transcribed segment belongs to one speaker.
    labelled_segments = segments_list
    speaker_labels = None

    if num_speakers > 1 and segments_list:
        print("[3/3] Extracting speaker embeddings...")
        speaker_model = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="models/speaker_embeddings",
            run_opts={"device": device},
        )

        waveform, sample_rate = torchaudio.load(audio_path)
        # torchaudio returns (channels, frames); the encoder treats the first
        # axis as the batch, so downmix to mono to get one embedding per segment.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        total_samples = waveform.shape[1]

        # Build the resampler once instead of once per segment.
        resample = (
            torchaudio.transforms.Resample(sample_rate, 16000)
            if sample_rate != 16000
            else None
        )

        embeddings, valid_segments = [], []
        for seg in segments_list:
            start_sample = int(seg.start * sample_rate)
            # Clamp to the audio length so a timestamp past the end cannot
            # produce an empty slice.
            end_sample = min(int(seg.end * sample_rate), total_samples)
            if end_sample <= start_sample:
                continue
            seg_audio = waveform[:, start_sample:end_sample]
            if resample is not None:
                seg_audio = resample(seg_audio)
            with torch.no_grad():
                emb = speaker_model.encode_batch(seg_audio)
            embeddings.append(emb.squeeze().cpu().numpy())
            valid_segments.append(seg)

        # Clustering needs at least two samples; with fewer, fall through to
        # the single-speaker path below and keep the full transcript.
        if len(embeddings) >= 2:
            num_speakers = min(num_speakers, len(embeddings))
            clustering = AgglomerativeClustering(n_clusters=num_speakers)
            speaker_labels = clustering.fit_predict(np.stack(embeddings))
            labelled_segments = valid_segments

    if speaker_labels is None:
        print("Single speaker detected or no embeddings. Skipping clustering.")
        speaker_labels = [0] * len(labelled_segments)
        num_speakers = 1

    # Build transcript
    doc = Document()
    doc.add_heading('Interview Transcript', 0)
    doc.add_paragraph(f"Detected {num_speakers} speaker(s)")
    doc.add_paragraph("")

    for seg, spk in zip(labelled_segments, speaker_labels):
        doc.add_paragraph(f"Speaker {spk+1}: {seg.text.strip()}")

    # splitext only strips a real file extension, so dots in directory names
    # (e.g. "runs/v1.2/audio") are left intact.
    output_path = os.path.splitext(audio_path)[0] + '_transcript.docx'
    doc.save(output_path)
    print(f"✓ Saved transcript: {output_path}")
    return output_path