ACloudCenter committed on
Commit
7b150af
·
verified ·
1 Parent(s): ec2f83b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -73
app.py CHANGED
@@ -7,63 +7,35 @@ import librosa
7
  import math
8
  from transformers import MoonshineForConditionalGeneration, AutoProcessor
9
 
10
- # Use GPU if available and set appropriate dtype
11
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
13
 
14
- # Load model and processor - Moonshine Tiny
15
  model = MoonshineForConditionalGeneration.from_pretrained('UsefulSensors/moonshine-tiny').to(device).to(torch_dtype)
16
  processor = AutoProcessor.from_pretrained('UsefulSensors/moonshine-tiny')
17
 
18
- # --- Longer token limits (simple) ---
19
- TOKENS_PER_SEC = 12.0 # was ~7.0 before
20
- MIN_NEW_TOKENS = 48 # was 24; gives short clips more room
21
- MAX_NEW_TOKENS_CAP = 3200 # generous cap to avoid runaway
22
 
23
- # Define transcription function using HF Zero GPU
24
  @spaces.GPU
25
  def transcribe_audio(audio_file):
26
  if not audio_file:
27
  return "No audio provided."
28
-
29
- # Load and preprocess audio
30
  audio_array, sr = sf.read(audio_file)
31
  if audio_array.ndim > 1:
32
- audio_array = np.mean(audio_array, axis=1)
33
-
34
- # Resample if necessary in case the audio file has a different sampling rate
35
  target_sr = processor.feature_extractor.sampling_rate
36
  if sr != target_sr:
37
  audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)
38
-
39
- # Prepare inputs for the model - ensure correct dtype and device
40
- inputs = processor(
41
- audio_array,
42
- sampling_rate=target_sr,
43
- return_tensors="pt"
44
- ).to(device, torch_dtype)
45
-
46
- # Duration-based max_new_tokens calculation (longer limits)
47
  duration_sec = len(audio_array) / float(target_sr)
48
- max_new_tokens = min(
49
- MAX_NEW_TOKENS_CAP,
50
- max(MIN_NEW_TOKENS, int(math.ceil(duration_sec * TOKENS_PER_SEC)))
51
- )
52
-
53
- # Generate transcription with adjusted max_new_tokens
54
- generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
55
- return processor.decode(generated_ids[0], skip_special_tokens=True) # Decode the generated IDs to text
56
 
57
- # Set Gradio theme
58
- theme = gr.themes.Ocean(
59
- primary_hue="indigo",
60
- secondary_hue="fuchsia",
61
- neutral_hue="slate",
62
- ).set(
63
- button_large_radius='*radius_sm'
64
- )
65
 
66
- # Create Gradio interface
67
  with gr.Blocks(theme=theme) as demo:
68
  gr.Markdown("## Moonshine Tiny STT - 27M Parameters")
69
  gr.HTML("""
@@ -73,46 +45,17 @@ with gr.Blocks(theme=theme) as demo:
73
  alt="VibeVoice Banner">
74
  </div>
75
  """)
76
-
77
  with gr.Tabs():
78
  with gr.TabItem("Upload Audio"):
79
- audio_file = gr.Audio(
80
- sources=["upload"],
81
- type="filepath",
82
- label="Upload Audio File"
83
- )
84
- output_text1 = gr.Textbox(
85
- label="Transcription",
86
- placeholder="Transcription will appear here...",
87
- lines=20,
88
- autoscroll=True
89
- )
90
  upload_button = gr.Button("Transcribe Uploaded Audio")
91
- upload_button.click(
92
- fn=transcribe_audio,
93
- inputs=audio_file,
94
- outputs=output_text1
95
- )
96
-
97
  with gr.TabItem("Record Audio"):
98
- audio_mic = gr.Audio(
99
- sources=["microphone"],
100
- type="filepath",
101
- label="Record Audio"
102
- )
103
- output_text2 = gr.Textbox(
104
- label="Transcription",
105
- placeholder="Transcription will appear here...",
106
- lines=20,
107
- autoscroll=True
108
- )
109
  record_button = gr.Button("Transcribe Recorded Audio")
110
- record_button.click(
111
- fn=transcribe_audio,
112
- inputs=audio_mic,
113
- outputs=output_text2
114
- )
115
-
116
  gr.Markdown("""
117
  ### Instructions:
118
  1. Choose either 'Upload Audio' or 'Record Audio' tab
 
7
  import math
8
  from transformers import MoonshineForConditionalGeneration, AutoProcessor
9
 
 
10
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
11
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
12
 
 
13
  model = MoonshineForConditionalGeneration.from_pretrained('UsefulSensors/moonshine-tiny').to(device).to(torch_dtype)
14
  processor = AutoProcessor.from_pretrained('UsefulSensors/moonshine-tiny')
15
 
16
+ TOKENS_PER_SEC = 12.0
17
+ MIN_NEW_TOKENS = 48
18
+ MAX_NEW_TOKENS_CAP = 1600
 
19
 
 
20
  @spaces.GPU
21
  def transcribe_audio(audio_file):
22
  if not audio_file:
23
  return "No audio provided."
 
 
24
  audio_array, sr = sf.read(audio_file)
25
  if audio_array.ndim > 1:
26
+ audio_array = np.mean(audio_array, axis=1)
 
 
27
  target_sr = processor.feature_extractor.sampling_rate
28
  if sr != target_sr:
29
  audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)
30
+ inputs = processor(audio_array, sampling_rate=target_sr, return_tensors="pt")
31
+ inputs = {k: v.to(device) for k, v in inputs.items()}
 
 
 
 
 
 
 
32
  duration_sec = len(audio_array) / float(target_sr)
33
+ max_new_tokens = min(MAX_NEW_TOKENS_CAP, max(MIN_NEW_TOKENS, int(math.ceil(duration_sec * TOKENS_PER_SEC))))
34
+ generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens, no_repeat_ngram_size=4, repetition_penalty=1.05)
35
+ return processor.decode(generated_ids[0], skip_special_tokens=True)
 
 
 
 
 
36
 
37
+ theme = gr.themes.Ocean(primary_hue="indigo", secondary_hue="fuchsia", neutral_hue="slate").set(button_large_radius='*radius_sm')
 
 
 
 
 
 
 
38
 
 
39
  with gr.Blocks(theme=theme) as demo:
40
  gr.Markdown("## Moonshine Tiny STT - 27M Parameters")
41
  gr.HTML("""
 
45
  alt="VibeVoice Banner">
46
  </div>
47
  """)
 
48
  with gr.Tabs():
49
  with gr.TabItem("Upload Audio"):
50
+ audio_file = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")
51
+ output_text1 = gr.Textbox(label="Transcription", placeholder="Transcription will appear here...", lines=20, autoscroll=True)
 
 
 
 
 
 
 
 
 
52
  upload_button = gr.Button("Transcribe Uploaded Audio")
53
+ upload_button.click(fn=transcribe_audio, inputs=audio_file, outputs=output_text1)
 
 
 
 
 
54
  with gr.TabItem("Record Audio"):
55
+ audio_mic = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
56
+ output_text2 = gr.Textbox(label="Transcription", placeholder="Transcription will appear here...", lines=20, autoscroll=True)
 
 
 
 
 
 
 
 
 
57
  record_button = gr.Button("Transcribe Recorded Audio")
58
+ record_button.click(fn=transcribe_audio, inputs=audio_mic, outputs=output_text2)
 
 
 
 
 
59
  gr.Markdown("""
60
  ### Instructions:
61
  1. Choose either 'Upload Audio' or 'Record Audio' tab