Sayiqa committed on
Commit
5a916c5
·
verified ·
1 Parent(s): 90b8ddc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -2
app.py CHANGED
@@ -1,4 +1,65 @@
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- gr.load("models/stabilityai/stable-diffusion-3.5-large").launch()
4
- gr.load("tts_models/multilingual/multi-dataset/xtts_v2").launch()
 
1
+ # import gradio as gr
2
+
3
+ # gr.load("models/stabilityai/stable-diffusion-3.5-large").launch()
4
+ # gr.load("tts_models/multilingual/multi-dataset/xtts_v2").launch()
5
+
6
  import gradio as gr
7
+ from transformers import pipeline
8
+ from diffusers import StableDiffusionPipeline
9
+ import torch
10
+
11
# Load the speech-to-text model (OpenAI Whisper).
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-base")

# Resolve the target device once so the weight dtype can match it:
# float16 weights are only safe on GPU — on CPU many ops are unsupported
# or extremely slow in half precision, so fall back to float32 there.
_device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Stable Diffusion model.
# NOTE(review): "stable-diffusion-3.5-large" is an SD3-family checkpoint, which
# diffusers normally loads via StableDiffusion3Pipeline (or
# AutoPipelineForText2Image) rather than StableDiffusionPipeline — confirm
# against the installed diffusers version before deploying.
text_to_image = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large",
    torch_dtype=torch.float16 if _device == "cuda" else torch.float32,
).to(_device)
18
+
19
+
20
+ # Function to transcribe audio
21
def transcribe_audio(audio_file):
    """Transcribe an audio file to text using the Whisper pipeline.

    Args:
        audio_file: Path to the audio file to transcribe.

    Returns:
        The transcription string on success, or an error-message string
        (prefixed "Error in transcription:") on failure.
    """
    try:
        return speech_to_text(audio_file)["text"]
    except Exception as e:
        # The UI displays strings directly, so failures are reported
        # as text rather than raised.
        return f"Error in transcription: {str(e)}"
28
+
29
+
30
+ # Function to generate image from text
31
def generate_image_from_text(prompt):
    """Render a single image for *prompt* with the Stable Diffusion pipeline.

    Args:
        prompt: Text prompt driving the image generation.

    Returns:
        The generated image on success, or an error-message string
        (prefixed "Error in image generation:") on failure.
    """
    try:
        output = text_to_image(prompt)
        return output.images[0]  # first (and only) image of the batch
    except Exception as e:
        # Report failures as display text, matching transcribe_audio.
        return f"Error in image generation: {str(e)}"
37
+
38
+
39
+ # Combined function: Transcribe and generate image
40
def process_audio_and_generate_image(audio_file):
    """Transcribe *audio_file*, then generate an image from the transcription.

    Args:
        audio_file: Path to the uploaded audio file.

    Returns:
        (image, transcription) on success; (None, error_message) when either
        stage fails.
    """
    transcription = transcribe_audio(audio_file)
    # Match the exact error prefix the helper emits. The previous substring
    # test ('"Error" in transcription') misclassified any legitimate speech
    # that merely contained the word "Error" as a failure.
    if isinstance(transcription, str) and transcription.startswith("Error in transcription:"):
        return None, transcription

    image = generate_image_from_text(transcription)
    if isinstance(image, str) and image.startswith("Error in image generation:"):
        return None, image

    return image, transcription
50
+
51
+
52
# Gradio interface: one audio upload in, generated image + transcription out.
_audio_input = gr.Audio(type="filepath", label="Upload an Audio File (WAV/MP3)")
_outputs = [
    gr.Image(label="Generated Image"),
    gr.Textbox(label="Transcription"),
]

interface = gr.Interface(
    fn=process_audio_and_generate_image,
    inputs=_audio_input,
    outputs=_outputs,
    title="Voice-to-Image Generator",
    description="Upload an audio file to transcribe speech to text and generate an image based on the transcription.",
)

# Launch the interface
interface.launch()