Sayiqa7 committed on
Commit
3cda8fc
·
verified ·
1 Parent(s): 392280a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import subprocess
import sys

# Install required libraries at startup.
# NOTE(review): runtime pip installs are fragile on hosted platforms —
# these belong in requirements.txt. Kept here for compatibility with the
# original deployment flow.
# Use the current interpreter (`sys.executable -m pip`) so packages are
# installed into the right environment even when "pip" is not on PATH or
# points at a different Python.
_REQUIREMENTS = [
    "torch>=1.11.0",
    "transformers>=4.31.0",
    "diffusers>=0.14.0",
    "librosa",
    "accelerate>=0.20.1",
    "gradio>=3.35.2",
    "huggingface_hub",
]
for _req in _REQUIREMENTS:
    subprocess.check_call([sys.executable, "-m", "pip", "install", _req])
11
+
12
+ import os
13
+ import threading
14
+ import numpy as np
15
+ import librosa
16
+ import torch
17
+ import gradio as gr
18
+ from functools import lru_cache
19
+ from transformers import pipeline
20
+ from huggingface_hub import login
21
+ from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
22

# Ensure required dependencies are installed
def install_missing_packages():
    """Install any required third-party package that is not importable.

    Each package is probed with ``importlib.import_module``; on ImportError
    it is installed (with its optional version specifier) via the current
    interpreter's pip. This is a best-effort safety net on top of the
    unconditional installs performed at the top of the file.
    """
    import importlib
    import sys

    # Import name -> pip version specifier (None = unpinned).
    # NOTE: this mapping only works while import names match pip names.
    required_packages = {
        "librosa": None,
        "diffusers": ">=0.14.0",
        "gradio": ">=3.35.2",
        "huggingface_hub": None,
        "accelerate": ">=0.20.1",
        "transformers": ">=4.31.0",
    }
    for package, version in required_packages.items():
        try:
            importlib.import_module(package)
        except ImportError:
            # Use sys.executable so the install targets this environment,
            # not whatever "pip" happens to resolve to on PATH.
            package_name = f"{package}{version}" if version else package
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

install_missing_packages()
41

# Authenticate with the Hugging Face Hub using a token from the environment.
# A missing (or empty) HF_TOKEN is a hard error: gated model downloads below
# would fail anyway.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable not set.")
login(hf_token)
48

# Load speech-to-text model (Whisper)
# return_timestamps=True makes the pipeline emit per-segment "chunks",
# which transcribe_audio below joins together for long-form audio.
speech_to_text = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    return_timestamps=True
)

# Load Stable Diffusion model for text-to-image
text_to_image = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# Run on GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image.to(device)
# Lower peak memory during attention at a small speed cost.
text_to_image.enable_attention_slicing()
# NOTE(review): setting safety_checker to None disables NSFW filtering on
# generated images — confirm this is intended for the deployment context.
text_to_image.safety_checker = None
# Swap in the DPM-Solver multistep scheduler (typically needs fewer
# denoising steps than the default).
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
63

# Preprocess audio file into NumPy array
def preprocess_audio(audio_path):
    """Load an audio file and return it as a mono float32 NumPy array.

    The audio is resampled to 16 kHz, the sample rate Whisper expects.

    Args:
        audio_path: Path to a WAV/MP3 file on disk.

    Returns:
        ``np.ndarray`` on success, or an error-message ``str`` on failure
        (callers detect errors with ``isinstance(result, str)``).
    """
    try:
        audio, sr = librosa.load(audio_path, sr=16000)  # Resample to 16kHz
        # librosa.load already returns float32, so asarray is a no-copy
        # cast in the common case (np.array always copied).
        return np.asarray(audio, dtype=np.float32)
    except Exception as e:
        # Errors are reported as strings by design — see transcribe_audio.
        return f"Error in preprocessing audio: {str(e)}"
71

# Speech-to-text function with long-form transcription support
@lru_cache(maxsize=10)
def transcribe_audio(audio_path):
    """Transcribe an audio file to text with the Whisper pipeline.

    Results are memoized per file path via lru_cache, so re-submitting the
    same path skips re-transcription.
    NOTE(review): the cache keys on the path only — if the file at that
    path is replaced, a stale transcription is returned.

    Args:
        audio_path: Path to a WAV/MP3 file.

    Returns:
        The transcription string, or an "Error ..." message string.
    """
    try:
        audio_array = preprocess_audio(audio_path)
        if isinstance(audio_array, str):  # Error message from preprocessing
            return audio_array
        result = speech_to_text(audio_array)
        # Long-form output arrives as timestamped "chunks"; join their text.
        # Fall back to the flat "text" field when no chunks are present,
        # instead of raising KeyError as the original did.
        chunks = result.get("chunks")
        if chunks:
            return " ".join(segment["text"] for segment in chunks)
        return result.get("text", "")
    except Exception as e:
        return f"Error in transcription: {str(e)}"
85

# Text-to-image function
@lru_cache(maxsize=10)
def generate_image_from_text(text, height=256, width=256):
    """Generate an image from a text prompt with Stable Diffusion.

    Args:
        text: The prompt to render.
        height: Output image height in pixels (default 256, kept small
            for speed; previously hard-coded).
        width: Output image width in pixels (default 256).

    Returns:
        A PIL image on success, or an "Error ..." message string.
    """
    try:
        # Small images generate much faster than the model's native size.
        image = text_to_image(text, height=height, width=width).images[0]
        return image
    except Exception as e:
        return f"Error in image generation: {str(e)}"
94

# Combined processing function
def process_audio_and_generate_results(audio_path):
    """Transcribe an audio file, then generate an image from the text.

    Bug fix: the original started a transcription thread and an image
    thread simultaneously; the image thread read the still-None
    transcription and exited, so no image was ever produced (and a None
    transcription would have raised ``TypeError`` on ``"Error" in None``).
    Image generation depends on the transcription, so the two steps must
    run sequentially — the threads added no parallelism to remove.

    Args:
        audio_path: Path to the uploaded WAV/MP3 file.

    Returns:
        ``(image, transcription)`` on success, or ``(None, error_message)``
        when either stage fails.
    """
    transcription = transcribe_audio(audio_path)
    # Defensive None check in addition to the error-string convention.
    if transcription is None or "Error" in transcription:
        return None, transcription

    image = generate_image_from_text(transcription)
    if isinstance(image, str) and "Error" in image:
        return None, image

    return image, transcription
128

# --- Gradio UI ------------------------------------------------------------

# Tab 1: plain speech-to-text transcription.
_transcription_input = gr.Audio(
    type="filepath", label="Upload audio file for transcription (WAV/MP3)"
)
speech_to_text_iface = gr.Interface(
    fn=transcribe_audio,
    inputs=_transcription_input,
    outputs=gr.Textbox(label="Transcription"),
    title="Speech-to-Text Transcription",
    description="Upload an audio file to transcribe speech into text.",
)

# Tab 2: transcription plus an image generated from the transcript.
_voice_input = gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)")
_voice_outputs = [gr.Image(label="Generated Image"), gr.Textbox(label="Transcription")]
voice_to_image_iface = gr.Interface(
    fn=process_audio_and_generate_results,
    inputs=_voice_input,
    outputs=_voice_outputs,
    title="Voice-to-Image",
    description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
)

# Combine both tabs into a single app and launch it.
iface = gr.TabbedInterface(
    interface_list=[speech_to_text_iface, voice_to_image_iface],
    tab_names=["Speech-to-Text", "Voice-to-Image"],
)

iface.launch(debug=True, share=True)