michaeltangz committed on
Commit
6fdae11
·
1 Parent(s): 7b34cad

refactor app.py to streamline flash-attn installation and model loading; update requirements.txt to remove unnecessary dependencies

Browse files
Files changed (3) hide show
  1. .gitattributes +0 -1
  2. app.py +56 -33
  3. requirements.txt +0 -11
.gitattributes CHANGED
@@ -25,7 +25,6 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -9,39 +9,19 @@ import time
9
  import numpy as np
10
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
11
  import subprocess
12
-
13
- # Install flash-attn if possible, but don't fail if it doesn't work
14
- try:
15
- subprocess.run(
16
- "pip install flash-attn --no-build-isolation",
17
- env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
18
- shell=True,
19
- )
20
- except Exception as e:
21
- print(f"Flash attention installation skipped: {e}")
22
 
23
  device = "cuda" if torch.cuda.is_available() else "cpu"
24
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
25
  MODEL_NAME = "openai/whisper-large-v3-turbo"
26
 
27
- # Load model with flash attention if available
28
- try:
29
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
30
- MODEL_NAME,
31
- torch_dtype=torch_dtype,
32
- low_cpu_mem_usage=True,
33
- use_safetensors=True,
34
- attn_implementation="flash_attention_2"
35
- )
36
- except Exception as e:
37
- print(f"Could not load with flash_attention_2, falling back to default: {e}")
38
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
39
- MODEL_NAME,
40
- torch_dtype=torch_dtype,
41
- low_cpu_mem_usage=True,
42
- use_safetensors=True
43
- )
44
-
45
  model.to(device)
46
 
47
  processor = AutoProcessor.from_pretrained(MODEL_NAME)
@@ -52,7 +32,7 @@ pipe = pipeline(
52
  model=model,
53
  tokenizer=tokenizer,
54
  feature_extractor=processor.feature_extractor,
55
- chunk_length_s=30,
56
  torch_dtype=torch_dtype,
57
  device=device,
58
  )
@@ -82,7 +62,7 @@ def stream_transcribe(stream, new_chunk):
82
  return stream, transcription, f"{latency:.2f}"
83
  except Exception as e:
84
  print(f"Error during Transcription: {e}")
85
- return stream, str(e), "Error"
86
 
87
  @spaces.GPU
88
  def transcribe(inputs, previous_transcription):
@@ -102,6 +82,25 @@ def transcribe(inputs, previous_transcription):
102
  print(f"Error during Transcription: {e}")
103
  return previous_transcription, "Error"
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  def clear():
106
  return ""
107
 
@@ -135,8 +134,32 @@ with gr.Blocks() as file:
135
  submit_button.click(transcribe, [input_audio_microphone, output], [output, latency_textbox], concurrency_limit=None)
136
  clear_button.click(clear, outputs=[output])
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  with gr.Blocks(theme=gr.themes.Ocean()) as demo:
139
  gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])
140
 
141
- if __name__ == "__main__":
142
- demo.launch(share=False)
 
9
  import numpy as np
10
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
11
  import subprocess
12
+ subprocess.run(
13
+ "pip install flash-attn --no-build-isolation",
14
+ env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
15
+ shell=True,
16
+ )
 
 
 
 
 
17
 
18
  device = "cuda" if torch.cuda.is_available() else "cpu"
19
+ torch_dtype = torch.float16
20
  MODEL_NAME = "openai/whisper-large-v3-turbo"
21
 
22
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
23
+ MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
24
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  model.to(device)
26
 
27
  processor = AutoProcessor.from_pretrained(MODEL_NAME)
 
32
  model=model,
33
  tokenizer=tokenizer,
34
  feature_extractor=processor.feature_extractor,
35
+ chunk_length_s=10,
36
  torch_dtype=torch_dtype,
37
  device=device,
38
  )
 
62
  return stream, transcription, f"{latency:.2f}"
63
  except Exception as e:
64
  print(f"Error during Transcription: {e}")
65
+ return stream, e, "Error"
66
 
67
  @spaces.GPU
68
  def transcribe(inputs, previous_transcription):
 
82
  print(f"Error during Transcription: {e}")
83
  return previous_transcription, "Error"
84
 
85
+ @spaces.GPU
86
+ def translate_and_transcribe(inputs, previous_transcription, target_language):
87
+ start_time = time.time()
88
+ try:
89
+ filename = f"{uuid.uuid4().hex}.wav"
90
+ sample_rate, audio_data = inputs
91
+ scipy.io.wavfile.write(filename, sample_rate, audio_data)
92
+
93
+ translation = pipe(filename, generate_kwargs={"task": "translate", "language": target_language} )["text"]
94
+
95
+ previous_transcription += translation
96
+
97
+ end_time = time.time()
98
+ latency = end_time - start_time
99
+ return previous_transcription, f"{latency:.2f}"
100
+ except Exception as e:
101
+ print(f"Error during Translation and Transcription: {e}")
102
+ return previous_transcription, "Error"
103
+
104
  def clear():
105
  return ""
106
 
 
134
  submit_button.click(transcribe, [input_audio_microphone, output], [output, latency_textbox], concurrency_limit=None)
135
  clear_button.click(clear, outputs=[output])
136
 
137
+ # with gr.Blocks() as translate:
138
+ # with gr.Column():
139
+ # gr.Markdown(f"# Realtime Whisper Large V3 Turbo (Translation): \n Transcribe and Translate Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
140
+ # with gr.Row():
141
+ # input_audio_microphone = gr.Audio(streaming=True)
142
+ # output = gr.Textbox(label="Transcription and Translation", value="")
143
+ # latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
144
+ # target_language_dropdown = gr.Dropdown(
145
+ # choices=["english", "french", "hindi", "spanish", "russian"],
146
+ # label="Target Language",
147
+ # value="<|es|>"
148
+ # )
149
+ # with gr.Row():
150
+ # clear_button = gr.Button("Clear Output")
151
+
152
+ # input_audio_microphone.stream(
153
+ # translate_and_transcribe,
154
+ # [input_audio_microphone, output, target_language_dropdown],
155
+ # [output, latency_textbox],
156
+ # time_limit=45,
157
+ # stream_every=2,
158
+ # concurrency_limit=None
159
+ # )
160
+ # clear_button.click(clear, outputs=[output])
161
+
162
  with gr.Blocks(theme=gr.themes.Ocean()) as demo:
163
  gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])
164
 
165
+ demo.launch()
 
requirements.txt CHANGED
@@ -1,14 +1,3 @@
1
- torch==2.6.0
2
- gradio==4.44.1
3
- numpy==1.24.3
4
- spaces>=0.20.0
5
- accelerate>=0.24.0
6
- safetensors>=0.4.0
7
- sentencepiece>=0.1.99
8
- protobuf>=3.20.0
9
- webrtcvad
10
- librosa
11
- flash-attn
12
  transformers
13
  scipy
14
  accelerate
 
 
 
 
 
 
 
 
 
 
 
 
1
  transformers
2
  scipy
3
  accelerate