|
|
import os

import gradio as gr
import torch
import transformers
from diffusers import StableDiffusionPipeline
from huggingface_hub import login
from transformers import pipeline
|
|
|
|
|
# Hugging Face authentication.
# Security fix: never hard-code the access token in source. Prefer the
# HF_TOKEN environment variable; the original placeholder remains only as a
# backward-compatible fallback for quick local edits.
hf_token = os.environ.get("HF_TOKEN", "your_huggingface_token_here")

login(hf_token)
|
|
|
|
|
|
|
|
# Whisper-base automatic-speech-recognition pipeline (weights are downloaded
# on first use). Used by transcribe_audio() below.
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-base")
|
|
|
|
|
|
|
|
# Stable Diffusion v1.5 text-to-image pipeline.
# Bug fix: the original requested torch.float16 unconditionally, then moved
# the pipeline to CPU when no GPU was present — fp16 on CPU is unsupported or
# extremely slow and can yield runtime errors / black images. Pick the dtype
# to match the device instead.
_sd_device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if _sd_device == "cuda" else torch.float32,
).to(_sd_device)
|
|
|
|
|
|
|
|
def transcribe_audio(audio_file):
    """Run the Whisper ASR pipeline on *audio_file* and return the text.

    Any failure (bad path, decode error, model error) is converted into a
    human-readable error string rather than raised, so the UI can display it.
    """
    try:
        return speech_to_text(audio_file)["text"]
    except Exception as e:
        return f"Error in transcription: {str(e)}"
|
|
|
|
|
|
|
|
def generate_image_from_text(text):
    """Generate one Stable Diffusion image for the prompt *text*.

    Returns a PIL image on success; on any failure, returns an error string
    (the caller distinguishes the two by type).
    """
    try:
        outputs = text_to_image(text)
        return outputs.images[0]
    except Exception as e:
        return f"Error in image generation: {str(e)}"
|
|
|
|
|
|
|
|
def process_audio_and_generate_image(audio_file):
    """End-to-end pipeline: audio file -> transcription -> generated image.

    Returns a ``(image, transcription)`` pair. When either stage reports an
    error (signalled by an error string), ``image`` is None and the error
    text is returned in the second slot for display in the UI.
    """
    transcription = transcribe_audio(audio_file)
    if "Error" in transcription:
        return None, transcription

    result = generate_image_from_text(transcription)
    generation_failed = isinstance(result, str) and "Error" in result
    if generation_failed:
        return None, result

    return result, transcription
|
|
|
|
|
|
|
|
# Gradio UI: one audio-file input, two outputs (generated image + transcript).
audio_input = gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)")
output_widgets = [
    gr.Image(label="Generated Image"),
    gr.Textbox(label="Transcription"),
]

iface = gr.Interface(
    fn=process_audio_and_generate_image,
    inputs=audio_input,
    outputs=output_widgets,
    title="Speech-to-Text and Image Generation",
    description="Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.",
)
|
|
|
|
|
|
|
|
# NOTE(review): share=True publishes a temporary public *.gradio.live URL —
# anyone with the link can reach this app. Drop it for local-only use.
iface.launch(share=True)
|
|
|