BissakaAI committed
Commit 535bed6 · verified · 1 Parent(s): 46b214b

Update app.py

Files changed (1)
app.py +31 -80
app.py CHANGED
@@ -1,95 +1,46 @@
-import os
-import torch
 import gradio as gr
-import librosa
 import numpy as np
-from transformers import AutoProcessor, SeamlessM4Tv2ForSpeechToText
+from faster_whisper import WhisperModel
 
-
-ASR_MODEL_ID = "facebook/seamless-m4t-v2-large"
-HF_TOKEN = os.getenv("HF_TOKEN")
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
-
-processor = AutoProcessor.from_pretrained(
-    ASR_MODEL_ID,
-    token=HF_TOKEN
+# Load model (small = fast, medium = better accuracy)
+model = WhisperModel(
+    "small",
+    device="cuda" if torch.cuda.is_available() else "cpu",
+    compute_type="float16" if torch.cuda.is_available() else "int8"
 )
 
-asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
-    ASR_MODEL_ID,
-    token=HF_TOKEN
-).to(DEVICE)
-
-asr_model.eval()
-
-# Audio preprocessing
-def preprocess_audio(audio):
+def transcribe_stream(audio):
     if audio is None:
-        return None
-
-    # Gradio returns (sr, np.ndarray) OR (np.ndarray, sr)
-    if isinstance(audio, tuple):
-        if isinstance(audio[0], np.ndarray):
-            speech = audio[0]
-            sr = audio[1]
-        else:
-            sr = audio[0]
-            speech = audio[1]
-    else:
-        return None
-
-    # Stereo → mono
-    if speech.ndim > 1:
-        speech = np.mean(speech, axis=1)
-
-    speech = speech.astype(np.float32)
-
-    # Force 16kHz
-    if sr != 16000:
-        speech = librosa.resample(
-            speech,
-            orig_sr=sr,
-            target_sr=16000
-        )
-
-    return speech
-
-
-#ASR
-def transcribe_audio(audio):
-    speech = preprocess_audio(audio)
-
-    if speech is None or len(speech) == 0:
-        return "No audio provided."
-
-    inputs = processor(
-        audio=speech,
-        sampling_rate=16000,
-        return_tensors="pt"
-    ).to(DEVICE)
-
-    with torch.no_grad():
-        generated_ids = asr_model.generate(
-            inputs["input_features"],
-            max_new_tokens=256
-        )
-
-    transcription = processor.batch_decode(
-        generated_ids,
-        skip_special_tokens=True
-    )[0]
-
-    return transcription.strip()
-
+        return ""
+
+    sr, data = audio
+
+    # Convert to mono
+    if data.ndim > 1:
+        data = np.mean(data, axis=1)
+
+    segments, info = model.transcribe(
+        data,
+        language="yo",  # Yoruba (use None for auto-detect)
+        beam_size=5
+    )
+
+    text = ""
+    for seg in segments:
+        text += seg.text + " "
+
+    return text.strip()
 
 demo = gr.Interface(
-    fn=transcribe_audio,
-    inputs=gr.Audio(type="numpy", label="Upload or Record Speech"),
-    outputs=gr.Textbox(label="Transcription"),
-    title="HealthAtlas ASR Service",
-    description="Automatic language detection (Seamless-M4T v2)"
+    fn=transcribe_stream,
+    inputs=gr.Audio(
+        source="microphone",
+        type="numpy",
+        streaming=True
+    ),
+    outputs=gr.Textbox(),
+    title="Real-Time Streaming ASR (Whisper)",
+    description="Low-latency live speech recognition"
)
 
-if __name__ == "__main__":
-    demo.launch()
+demo.launch()
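
A few things in the committed version look fragile: torch is still used in the WhisperModel(...) call but is no longer imported, the numpy audio Gradio hands over is typically int16 PCM at the microphone's native rate (often 44.1 or 48 kHz) while faster-whisper treats a raw ndarray as 16 kHz float32 samples, and Gradio 4.x takes sources=["microphone"] rather than source="microphone". The sketch below is one way the new app.py could address those points, not the committed code: it transcribes each complete recording instead of using streaming=True (chunk-by-chunk streaming needs extra state; a separate sketch covers that pattern), and it reuses librosa for resampling only because the previous version of this file already depended on it.

import gradio as gr
import librosa
import numpy as np
import torch
from faster_whisper import WhisperModel

# Same model choice as the commit; fall back to int8 on CPU.
model = WhisperModel(
    "small",
    device="cuda" if torch.cuda.is_available() else "cpu",
    compute_type="float16" if torch.cuda.is_available() else "int8",
)

def transcribe(audio):
    if audio is None:
        return ""

    sr, data = audio

    # Gradio's numpy audio is usually int16 PCM; scale to float32 in [-1, 1].
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    else:
        data = data.astype(np.float32)

    # Stereo -> mono.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # faster-whisper expects 16 kHz samples when given a raw array.
    if sr != 16000:
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)

    segments, _ = model.transcribe(
        data,
        language="yo",  # Yoruba, as in the commit; None enables auto-detection
        beam_size=5,
    )
    return " ".join(seg.text.strip() for seg in segments)

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),  # Gradio 4.x argument name
    outputs=gr.Textbox(label="Transcription"),
    title="Real-Time Streaming ASR (Whisper)",
    description="Low-latency live speech recognition",
)

if __name__ == "__main__":
    demo.launch()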
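
The streaming=True path needs one more ingredient that the committed function does not have: when Gradio streams microphone audio, each callback invocation receives only the newest chunk, so a running transcript requires state. Below is a hedged sketch of that pattern, loosely following Gradio's real-time speech recognition guide; the full-buffer re-transcription on every chunk is a deliberate simplification (cost grows with recording length), and everything here is illustrative rather than the committed behavior.

import gradio as gr
import librosa
import numpy as np
import torch
from faster_whisper import WhisperModel

model = WhisperModel(
    "small",
    device="cuda" if torch.cuda.is_available() else "cpu",
    compute_type="float16" if torch.cuda.is_available() else "int8",
)

def transcribe_live(stream, new_chunk):
    # new_chunk is (sample_rate, samples) for the latest slice of microphone audio.
    sr, y = new_chunk
    if np.issubdtype(y.dtype, np.integer):
        y = y.astype(np.float32) / np.iinfo(y.dtype).max
    else:
        y = y.astype(np.float32)
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Accumulate everything heard so far in the Gradio state.
    stream = y if stream is None else np.concatenate([stream, y])

    # Re-transcribe the whole buffer each time; resample to the 16 kHz Whisper expects.
    audio = librosa.resample(stream, orig_sr=sr, target_sr=16000) if sr != 16000 else stream
    segments, _ = model.transcribe(audio, language="yo", beam_size=5)
    return stream, " ".join(seg.text.strip() for seg in segments)

demo = gr.Interface(
    fn=transcribe_live,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", "text"],
    live=True,
)

if __name__ == "__main__":
    demo.launch()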
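
One last note on language handling: the old description promised automatic language detection, while the new code pins language="yo" (and Yoruba is one of Whisper's lower-resource languages, so accuracy may be limited). If auto-detection is still wanted, faster-whisper exposes its guess and confidence on the returned info object; a small usage illustration with the same names as above:

# Let the model detect the language instead of forcing Yoruba.
segments, info = model.transcribe(data, language=None, beam_size=5)
print(f"Detected {info.language} (p={info.language_probability:.2f})")
text = " ".join(seg.text.strip() for seg in segments)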