Bindu36 committed
Commit 93d69a6 · verified · Parent: a9befeb

Update app.py

Files changed (1): app.py +44 -40
app.py CHANGED
@@ -3,6 +3,7 @@ import whisper
 import datetime
 import torch
 import subprocess
+import os
 from pyannote.audio import Audio
 from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
 from pyannote.core import Segment
@@ -19,52 +20,55 @@ audio = Audio()
 embedding_model = PretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb", device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 
 def transcribe_and_diarize(audio_file, num_speakers=2):
-    path = audio_file.name
-    # Convert to WAV if necessary
-    if path[-3:] != 'wav':
-        subprocess.call(['ffmpeg', '-i', path, 'audio.wav', '-y'])
-        path = 'audio.wav'
-
-    # Transcribe audio
-    result = model.transcribe(path)
-    segments = result["segments"]
-
-    # Get audio duration
-    with contextlib.closing(wave.open(path, 'r')) as f:
-        frames = f.getnframes()
-        rate = f.getframerate()
-        duration = frames / float(rate)
-
-    # Define function to extract segment embeddings
-    def segment_embedding(segment):
-        start = segment["start"]
-        end = min(duration, segment["end"])
-        clip = Segment(start, end)
-        waveform, sample_rate = audio.crop(path, clip)
-        return embedding_model(waveform[None])
-
-    # Extract embeddings for each segment
-    embeddings = np.zeros(shape=(len(segments), 192))
-    for i, segment in enumerate(segments):
-        embeddings[i] = segment_embedding(segment)
-
-    embeddings = np.nan_to_num(embeddings)
-
-    # Perform speaker clustering
-    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-    labels = clustering.labels_
-    for i in range(len(segments)):
-        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
-
-    # Generate transcript
-    transcript = ""
-    for i, segment in enumerate(segments):
-        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-            transcript += "\n" + segment["speaker"] + ' ' + str(datetime.timedelta(seconds=round(segment["start"]))) + '\n'
-        transcript += segment["text"][1:] + ' '
-    transcript += "\n\n"
-
-    return transcript
+    try:
+        path = audio_file.name
+        # Convert to WAV if necessary
+        if not path.endswith('.wav'):
+            subprocess.call(['ffmpeg', '-i', path, 'audio.wav', '-y'])
+            path = 'audio.wav'
+
+        # Transcribe audio
+        result = model.transcribe(path)
+        segments = result["segments"]
+
+        # Get audio duration
+        with contextlib.closing(wave.open(path, 'r')) as f:
+            frames = f.getnframes()
+            rate = f.getframerate()
+            duration = frames / float(rate)
+
+        # Define function to extract segment embeddings
+        def segment_embedding(segment):
+            start = segment["start"]
+            end = min(duration, segment["end"])
+            clip = Segment(start, end)
+            waveform, sample_rate = audio.crop(path, clip)
+            return embedding_model(waveform[None])
+
+        # Extract embeddings for each segment
+        embeddings = np.zeros(shape=(len(segments), 192))
+        for i, segment in enumerate(segments):
+            embeddings[i] = segment_embedding(segment)
+
+        embeddings = np.nan_to_num(embeddings)
+
+        # Perform speaker clustering
+        clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+        labels = clustering.labels_
+        for i in range(len(segments)):
+            segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+        # Generate transcript
+        transcript = ""
+        for i, segment in enumerate(segments):
+            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+                transcript += "\n" + segment["speaker"] + ' ' + str(datetime.timedelta(seconds=round(segment["start"]))) + '\n'
+            transcript += segment["text"][1:] + ' '
+        transcript += "\n\n"
+
+        return transcript
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
 
 iface = gr.Interface(
     fn=transcribe_and_diarize,
@@ -77,4 +81,4 @@ iface = gr.Interface(
     description="Upload an audio file to get a transcription with speaker diarization."
 )
 
-iface.launch()
+iface.launch()
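
The hunks above show only the changed regions of app.py. The hunk headers (import whisper, audio = Audio()) and the names used in the body (model, contextlib, wave, np, AgglomerativeClustering, gr) imply setup lines that sit outside the diff. A minimal sketch of that assumed context, inferred from usage rather than taken from the commit; the Whisper checkpoint name in particular is a guess:

import contextlib
import wave

import gradio as gr
import numpy as np
import whisper
from sklearn.cluster import AgglomerativeClustering

# Assumed: the diff never shows which Whisper checkpoint is loaded.
model = whisper.load_model("base")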
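
Because the updated transcribe_and_diarize wraps its whole body in try/except, a failure now comes back to the Gradio textbox as the string "An error occurred: ..." instead of crashing the Space. For a quick local check without launching the UI, the function only needs an object exposing a .name attribute that points at an audio file, as Gradio's upload wrapper does. A hypothetical direct call (meeting.wav is a placeholder path):

from types import SimpleNamespace

# Hypothetical stand-in for the Gradio upload object; any object
# with a .name attribute pointing at a real audio file works.
upload = SimpleNamespace(name="meeting.wav")
print(transcribe_and_diarize(upload, num_speakers=2))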