Sayiqa committed on
Commit
343125e
·
verified ·
1 Parent(s): 49720ee

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import subprocess
import sys

# Install required libraries at startup (common pattern for hosted demos).
# Invoke pip as `sys.executable -m pip` so packages are installed into the
# interpreter actually running this script, not whichever `pip` binary
# happens to be first on PATH (they can differ in multi-Python environments).
for requirement in ("torch>=1.11.0", "transformers", "diffusers", "librosa"):
    subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])
8
+ import os
9
+ import threading
10
+ import numpy as np
11
+ import diffusers
12
+ from functools import lru_cache
13
+ import gradio as gr
14
+ from transformers import pipeline
15
+ from huggingface_hub import login
16
+ from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
17
+ import librosa
18
+ import torch
19
+
20
def install_missing_packages():
    """Install any required runtime dependency that is not importable.

    Each entry maps an import name to an optional pip version specifier.
    A package is installed only when importing it fails, so repeated runs
    are cheap once the environment is complete.
    """
    import sys  # local import so this function is self-contained

    required_packages = {
        "librosa": None,
        "diffusers": ">=0.14.0",
        "gradio": ">=3.35.2",
        "huggingface_hub": None,
    }
    for package, version in required_packages.items():
        try:
            __import__(package)
        except ImportError:
            requirement = f"{package}{version}" if version else package
            # `sys.executable -m pip` targets the running interpreter's
            # environment; a bare `pip` may belong to a different Python.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", requirement]
            )
34
+
35
install_missing_packages()

# Get Hugging Face token for authentication.
# The token must be supplied via the HF_TOKEN environment variable
# (e.g. a Space secret); the app refuses to start without it because
# the model downloads below require an authenticated session.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)
else:
    raise ValueError("HF_TOKEN environment variable not set.")
43
+
44
# Load speech-to-text model (Whisper).
# NOTE(review): whisper-tiny is the smallest/fastest checkpoint; its
# accuracy is limited — confirm it is adequate for the target audio.
speech_to_text = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    generate_kwargs={"language": "en"},  # Enforce English transcription
)
50
+
51
# Load Stable Diffusion model for text-to-image.
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5"
)
# Run on GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image.to(device)
text_to_image.enable_attention_slicing()  # Optimizes memory usage
text_to_image.safety_checker = None  # Disables safety checker to improve speed
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)  # Faster scheduler
60
+
61
# Preprocess audio file into NumPy array
def preprocess_audio(audio_path):
    """Load *audio_path* resampled to 16 kHz as a float32 NumPy array.

    On failure, returns an error message string instead of raising
    (matching this module's error-as-string convention).
    """
    try:
        waveform, sample_rate = librosa.load(audio_path, sr=16000)  # Resample to 16kHz
        return np.array(waveform, dtype=np.float32)
    except Exception as exc:
        return f"Error in preprocessing audio: {str(exc)}"
68
+
69
# Speech-to-text function
@lru_cache(maxsize=10)
def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* to English text.

    Results are memoized per path (up to 10 entries). Returns the
    transcription, or an error message string on failure.
    """
    try:
        samples = preprocess_audio(audio_path)
        # preprocess_audio signals failure by returning a string.
        if isinstance(samples, str):
            return samples
        return speech_to_text(samples)["text"]
    except Exception as exc:
        return f"Error in transcription: {str(exc)}"
80
+
81
# Text-to-image function
@lru_cache(maxsize=10)
def generate_image_from_text(text):
    """Generate a 256x256 image from *text* with Stable Diffusion.

    Memoized per prompt (up to 10 entries). Returns the generated image,
    or an error message string on failure.
    """
    try:
        # Smaller 256x256 output keeps generation fast.
        result = text_to_image(text, height=256, width=256)
        return result.images[0]
    except Exception as exc:
        return f"Error in image generation: {str(exc)}"
89
+
90
# Combined processing function: transcription, then image generation.
def process_audio_and_generate_image(audio_path):
    """Transcribe *audio_path* and generate an image from the transcription.

    Returns a ``(image, transcription)`` tuple; on failure of either stage
    the image slot is ``None`` and the text slot carries the error message.

    Note: the previous version started transcription and image generation
    in two parallel threads, but the image thread read the transcription
    result immediately — before the transcription thread had produced it —
    so it almost always saw ``None`` and generated no image. The stages are
    data-dependent and must run sequentially.
    """
    transcription = transcribe_audio(audio_path)
    # The helpers signal failure via "Error ..." strings rather than raising.
    if "Error" in transcription:
        return None, transcription

    image = generate_image_from_text(transcription)
    if isinstance(image, str) and "Error" in image:
        return None, image

    return image, transcription
123
+
124
# Gradio interface: one audio-file input, image + transcription outputs.
# The output order matches process_audio_and_generate_image's return tuple.
iface = gr.Interface(
    fn=process_audio_and_generate_image,
    inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
    outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
    title="Voice-to-Image Generator",
    description="Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.",
)
132
+
133
# Launch Gradio interface.
# debug=True surfaces tracebacks in the logs.
# NOTE(review): share=True requests a public tunnel link — confirm it is
# needed; hosted environments typically serve the app directly.
iface.launch(debug=True, share=True)