import os
from typing import Literal

import gradio as gr
import torch
import whisper
from diffusers import StableDiffusionPipeline


def get_device() -> Literal["cuda", "cpu"]:
    return "cuda" if torch.cuda.is_available() else "cpu"


def get_token() -> str:
    return os.environ.get("HUGGING_FACE_TOKEN")


def generate_images(prompt: str, scale: float, iterations: int, seed: int, num_images: int) -> list:
    auth_token = get_token()
    device = get_device()

    pipe = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4", use_auth_token=auth_token
    )
    pipe.to(device)

    # Seed the generator so runs are reproducible for a given seed value.
    generator = torch.Generator(device=device).manual_seed(int(seed))
    prompts = [prompt] * int(num_images)

    images = pipe(
        prompts,
        num_inference_steps=int(iterations),
        guidance_scale=float(scale),
        generator=generator,
    ).images

    # Save each generated image to disk and return the file names for display.
    output_file_names = []
    for idx, image in enumerate(images):
        filename = f"output{idx}.png"
        image.save(filename)
        output_file_names.append(filename)

    return output_file_names


def transcribe_audio(model_selected: str, audio_input: str) -> tuple:
    model = whisper.load_model(model_selected)

    # audio_input is the path to the recorded audio file.
    audio = whisper.load_audio(audio_input)
    audio = whisper.pad_or_trim(audio)

    translation_output = ""
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    transcript_options = whisper.DecodingOptions(task="transcribe", fp16=False)
    transcription = whisper.decode(model, mel, transcript_options)
    prompt_for_sd = transcription.text

    # If the audio is not in English, also translate it so Stable Diffusion
    # receives an English prompt.
    if transcription.language != "en":
        translation_options = whisper.DecodingOptions(task="translate", fp16=False)
        translation = whisper.decode(model, mel, translation_options)
        translation_output = translation.text
        prompt_for_sd = translation_output

    return transcription.text, translation_output, str(transcription.language).upper(), prompt_for_sd


with gr.Blocks() as demo:
    gr.HTML(
        """
Try OpenAI Whisper with recorded audio to generate images with Stable Diffusion!