Spaces:

Sayiqa7
/

Voice_clone_image

Runtime error

App Files Community

Sayiqa7 committed on Dec 17, 2024

Commit

d66e6ff

verified ·

1 Parent(s): 6683fa5

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -4

app.py CHANGED Viewed

@@ -1,10 +1,12 @@
 import subprocess
 # Install required libraries
 subprocess.check_call(["pip", "install", "torch>=1.11.0"])
 subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
 subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
 subprocess.check_call(["pip", "install", "librosa"])
 subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
 import os
 import threading
@@ -44,7 +46,11 @@ else:
     raise ValueError("HF_TOKEN environment variable not set.")
 # Load speech-to-text model (Whisper)
-speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
 # Load Stable Diffusion model for text-to-image
 text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
@@ -62,7 +68,7 @@ def preprocess_audio(audio_path):
     except Exception as e:
         return f"Error in preprocessing audio: {str(e)}"
-# Speech-to-text function
 @lru_cache(maxsize=10)
 def transcribe_audio(audio_path):
     try:
@@ -70,7 +76,9 @@ def transcribe_audio(audio_path):
         if isinstance(audio_array, str):  # Error message from preprocessing
             return audio_array
         result = speech_to_text(audio_array)
-        return result["text"]
     except Exception as e:
         return f"Error in transcription: {str(e)}"
@@ -142,4 +150,4 @@ iface = gr.TabbedInterface(
 )
 # Launch Gradio interface
-iface.launch(debug=True, share=True)

 import subprocess
 # Install required libraries
 subprocess.check_call(["pip", "install", "torch>=1.11.0"])
 subprocess.check_call(["pip", "install", "transformers>=4.31.0"])
 subprocess.check_call(["pip", "install", "diffusers>=0.14.0"])
 subprocess.check_call(["pip", "install", "librosa"])
 subprocess.check_call(["pip", "install", "accelerate>=0.20.1"])
+subprocess.check_call(["pip", "install", "gradio>=3.35.2"])
 import os
 import threading
     raise ValueError("HF_TOKEN environment variable not set.")
 # Load speech-to-text model (Whisper)
+speech_to_text = pipeline(
+    "automatic-speech-recognition",
+    model="openai/whisper-tiny",
+    return_timestamps=True
+)
 # Load Stable Diffusion model for text-to-image
 text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
     except Exception as e:
         return f"Error in preprocessing audio: {str(e)}"
+# Speech-to-text function with long-form transcription support
 @lru_cache(maxsize=10)
 def transcribe_audio(audio_path):
     try:
         if isinstance(audio_array, str):  # Error message from preprocessing
             return audio_array
         result = speech_to_text(audio_array)
+        # Combine text from multiple segments for long-form transcription
+        transcription = " ".join(segment["text"] for segment in result["chunks"])
+        return transcription
     except Exception as e:
         return f"Error in transcription: {str(e)}"
 )
 # Launch Gradio interface
+iface.launch(debug=True, share=True)