Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
import subprocess
|
|
|
|
| 2 |
# Install required libraries
|
| 3 |
subprocess.check_call(["pip", "install", "torch>=1.11.0"])
|
| 4 |
subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
|
| 5 |
subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
|
| 6 |
subprocess.check_call(["pip", "install", "librosa"])
|
| 7 |
subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
|
|
|
|
| 8 |
|
| 9 |
import os
|
| 10 |
import threading
|
|
@@ -44,7 +46,11 @@ else:
|
|
| 44 |
raise ValueError("HF_TOKEN environment variable not set.")
|
| 45 |
|
| 46 |
# Load speech-to-text model (Whisper)
|
| 47 |
-
speech_to_text = pipeline(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
# Load Stable Diffusion model for text-to-image
|
| 50 |
text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
|
@@ -62,7 +68,7 @@ def preprocess_audio(audio_path):
|
|
| 62 |
except Exception as e:
|
| 63 |
return f"Error in preprocessing audio: {str(e)}"
|
| 64 |
|
| 65 |
-
# Speech-to-text function
|
| 66 |
@lru_cache(maxsize=10)
|
| 67 |
def transcribe_audio(audio_path):
|
| 68 |
try:
|
|
@@ -70,7 +76,9 @@ def transcribe_audio(audio_path):
|
|
| 70 |
if isinstance(audio_array, str): # Error message from preprocessing
|
| 71 |
return audio_array
|
| 72 |
result = speech_to_text(audio_array)
|
| 73 |
-
|
|
|
|
|
|
|
| 74 |
except Exception as e:
|
| 75 |
return f"Error in transcription: {str(e)}"
|
| 76 |
|
|
@@ -142,4 +150,4 @@ iface = gr.TabbedInterface(
|
|
| 142 |
)
|
| 143 |
|
| 144 |
# Launch Gradio interface
|
| 145 |
-
iface.launch(debug=True, share=True)
|
|
|
|
| 1 |
import subprocess
|
| 2 |
+
|
| 3 |
# Install required libraries
|
| 4 |
subprocess.check_call(["pip", "install", "torch>=1.11.0"])
|
| 5 |
subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
|
| 6 |
subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
|
| 7 |
subprocess.check_call(["pip", "install", "librosa"])
|
| 8 |
subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
|
| 9 |
+
subprocess.check_call(["pip", "install", "gradio>=3.35.2"])
|
| 10 |
|
| 11 |
import os
|
| 12 |
import threading
|
|
|
|
| 46 |
raise ValueError("HF_TOKEN environment variable not set.")
|
| 47 |
|
| 48 |
# Load speech-to-text model (Whisper)
|
| 49 |
+
speech_to_text = pipeline(
|
| 50 |
+
"automatic-speech-recognition",
|
| 51 |
+
model="openai/whisper-tiny",
|
| 52 |
+
return_timestamps=True
|
| 53 |
+
)
|
| 54 |
|
| 55 |
# Load Stable Diffusion model for text-to-image
|
| 56 |
text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
|
|
|
| 68 |
except Exception as e:
|
| 69 |
return f"Error in preprocessing audio: {str(e)}"
|
| 70 |
|
| 71 |
+
# Speech-to-text function with long-form transcription support
|
| 72 |
@lru_cache(maxsize=10)
|
| 73 |
def transcribe_audio(audio_path):
|
| 74 |
try:
|
|
|
|
| 76 |
if isinstance(audio_array, str): # Error message from preprocessing
|
| 77 |
return audio_array
|
| 78 |
result = speech_to_text(audio_array)
|
| 79 |
+
# Combine text from multiple segments for long-form transcription
|
| 80 |
+
transcription = " ".join(segment["text"] for segment in result["chunks"])
|
| 81 |
+
return transcription
|
| 82 |
except Exception as e:
|
| 83 |
return f"Error in transcription: {str(e)}"
|
| 84 |
|
|
|
|
| 150 |
)
|
| 151 |
|
| 152 |
# Launch Gradio interface
|
| 153 |
+
iface.launch(debug=True, share=True)
|