Sayiqa committed on
Commit
343125e
·
verified ·
1 Parent(s): 49720ee

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import subprocess
import sys

# Install required libraries at startup (common pattern for hosted demos).
# Invoke pip as `sys.executable -m pip` so packages are installed into the
# interpreter actually running this script, not whichever `pip` binary
# happens to be first on PATH (they can differ in multi-Python environments).
for requirement in ("torch>=1.11.0", "transformers", "diffusers", "librosa"):
    subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])
8
+ import os
9
+ import threading
10
+ import numpy as np
11
+ import diffusers
12
+ from functools import lru_cache
13
+ import gradio as gr
14
+ from transformers import pipeline
15
+ from huggingface_hub import login
16
+ from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
17
+ import librosa
18
+ import torch
19
+
20
def install_missing_packages():
    """Install any required runtime dependency that is not importable.

    Each entry maps an import name to an optional pip version specifier.
    A package is installed only when importing it fails, so repeated runs
    are cheap once the environment is complete.
    """
    import sys  # local import so this function is self-contained

    required_packages = {
        "librosa": None,
        "diffusers": ">=0.14.0",
        "gradio": ">=3.35.2",
        "huggingface_hub": None,
    }
    for package, version in required_packages.items():
        try:
            __import__(package)
        except ImportError:
            requirement = f"{package}{version}" if version else package
            # `sys.executable -m pip` targets the running interpreter's
            # environment; a bare `pip` may belong to a different Python.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", requirement]
            )
34
+
35
install_missing_packages()

# Get Hugging Face token for authentication.
# The token must be supplied via the HF_TOKEN environment variable
# (e.g. a Space secret); the app refuses to start without it because
# the model downloads below require an authenticated session.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)
else:
    raise ValueError("HF_TOKEN environment variable not set.")
43
+
44
# Load speech-to-text model (Whisper).
# NOTE(review): whisper-tiny is the smallest/fastest checkpoint; its
# accuracy is limited — confirm it is adequate for the target audio.
speech_to_text = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    generate_kwargs={"language": "en"},  # Enforce English transcription
)
50
+
51
# Load Stable Diffusion model for text-to-image.
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5"
)
# Run on GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image.to(device)
text_to_image.enable_attention_slicing()  # Optimizes memory usage
text_to_image.safety_checker = None  # Disables safety checker to improve speed
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)  # Faster scheduler
60
+
61
# Preprocess audio file into NumPy array
def preprocess_audio(audio_path):
    """Load *audio_path* resampled to 16 kHz as a float32 NumPy array.

    On failure, returns an error message string instead of raising
    (matching this module's error-as-string convention).
    """
    try:
        waveform, sample_rate = librosa.load(audio_path, sr=16000)  # Resample to 16kHz
        return np.array(waveform, dtype=np.float32)
    except Exception as exc:
        return f"Error in preprocessing audio: {str(exc)}"
68
+
69
# Speech-to-text function
@lru_cache(maxsize=10)
def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* to English text.

    Results are memoized per path (up to 10 entries). Returns the
    transcription, or an error message string on failure.
    """
    try:
        samples = preprocess_audio(audio_path)
        # preprocess_audio signals failure by returning a string.
        if isinstance(samples, str):
            return samples
        return speech_to_text(samples)["text"]
    except Exception as exc:
        return f"Error in transcription: {str(exc)}"
80
+
81
# Text-to-image function
@lru_cache(maxsize=10)
def generate_image_from_text(text):
    """Generate a 256x256 image from *text* with Stable Diffusion.

    Memoized per prompt (up to 10 entries). Returns the generated image,
    or an error message string on failure.
    """
    try:
        # Smaller 256x256 output keeps generation fast.
        result = text_to_image(text, height=256, width=256)
        return result.images[0]
    except Exception as exc:
        return f"Error in image generation: {str(exc)}"
89
+
90
# Combined processing function: transcription, then image generation.
def process_audio_and_generate_image(audio_path):
    """Transcribe *audio_path* and generate an image from the transcription.

    Returns a ``(image, transcription)`` tuple; on failure of either stage
    the image slot is ``None`` and the text slot carries the error message.

    Note: the previous version started transcription and image generation
    in two parallel threads, but the image thread read the transcription
    result immediately — before the transcription thread had produced it —
    so it almost always saw ``None`` and generated no image. The stages are
    data-dependent and must run sequentially.
    """
    transcription = transcribe_audio(audio_path)
    # The helpers signal failure via "Error ..." strings rather than raising.
    if "Error" in transcription:
        return None, transcription

    image = generate_image_from_text(transcription)
    if isinstance(image, str) and "Error" in image:
        return None, image

    return image, transcription
123
+
124
# Gradio interface: one audio-file input, image + transcription outputs.
# The output order matches process_audio_and_generate_image's return tuple.
iface = gr.Interface(
    fn=process_audio_and_generate_image,
    inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
    outputs=[gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")],
    title="Voice-to-Image Generator",
    description="Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.",
)
132
+
133
# Launch Gradio interface.
# debug=True surfaces tracebacks in the logs.
# NOTE(review): share=True requests a public tunnel link — confirm it is
# needed; hosted environments typically serve the app directly.
iface.launch(debug=True, share=True)