"""Gradio app: text or speech description -> generated prompt -> image.

The description may be typed or supplied as an audio file. Audio is
transcribed with Whisper, Chinese text is translated to English, a
text2text model expands the description into a prompt, and a Stable
Diffusion pipeline (v1.5 or SDXL) renders the image.
"""

import gradio as gr
import torch
from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline
from langdetect import LangDetectException, detect
from transformers import pipeline

# Load models
translator = pipeline(
    "translation", model="Helsinki-NLP/opus-mt-zh-en", device=0
)
prompt_generator = pipeline(
    "text2text-generation", model="google/flan-t5-base", device=0
)
transcriber = pipeline(
    "automatic-speech-recognition", model="openai/whisper-small", device=0
)

pipe_v15 = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe_v15 = pipe_v15.to("cuda")

# The SDXL checkpoint must be loaded with the XL pipeline class; the plain
# StableDiffusionPipeline does not match the SDXL component layout.
pipe_sdxl = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe_sdxl = pipe_sdxl.to("cuda")


def translate_to_english(text):
    """Translate *text* with the zh->en translation pipeline.

    Returns the ``translation_text`` field of the first pipeline result.
    """
    return translator(text)[0]["translation_text"]


def generate_prompt(description):
    """Turn a short description into an image-generation prompt.

    The language of *description* is detected; when the detected code
    starts with ``zh`` the text is first translated to English. The
    (English) description is then passed to the text2text model together
    with a fixed instruction prefix, and the generated text is returned.
    """
    try:
        lang = detect(description)
    except LangDetectException:
        # Default to English if detection fails (e.g. no usable features).
        lang = "en"

    if lang.startswith("zh"):
        english_description = translate_to_english(description)
    else:
        english_description = description

    input_text = (
        "Generate a detailed description for an AI image generator: "
        + english_description
    )
    generated = prompt_generator(
        input_text, max_length=100, num_beams=5, early_stopping=True
    )
    return generated[0]["generated_text"]


def generate_image(prompt, model_choice, guidance_scale):
    """Render *prompt* with the selected Stable Diffusion pipeline.

    ``model_choice == "v1.5"`` selects the v1.5 pipeline; any other value
    (including ``None`` when nothing is selected in the UI) selects SDXL.
    Returns the first image produced by the pipeline.
    """
    pipe = pipe_v15 if model_choice == "v1.5" else pipe_sdxl
    with torch.autocast("cuda"):
        image = pipe(prompt=prompt, guidance_scale=guidance_scale).images[0]
    return image


def app_function(text_description, audio_path, model_choice, guidance_scale):
    """Gradio callback: build a prompt and an image from text or audio.

    When *audio_path* is given it takes precedence over *text_description*
    and is transcribed as Chinese speech. Returns ``(prompt, image)``, or
    ``(message, None)`` when no usable description was provided.
    """
    if audio_path is not None:
        # Decoding options such as the language are forwarded to the model
        # through ``generate_kwargs`` rather than as direct call kwargs.
        description = transcriber(
            audio_path, generate_kwargs={"language": "zh"}
        )["text"]
    else:
        description = text_description

    # Treat missing and whitespace-only input alike.
    description = (description or "").strip()
    if not description:
        return "请通过文本或音频提供描述。", None

    prompt = generate_prompt(description)
    image = generate_image(prompt, model_choice, guidance_scale)
    return prompt, image


iface = gr.Interface(
    fn=app_function,
    inputs=[
        gr.Textbox(label="输入描述(文字)"),
        gr.Audio(label="输入描述(语音)", type="filepath"),
        gr.Dropdown(choices=["v1.5", "SDXL"], label="选择模型"),
        gr.Slider(minimum=1, maximum=20, value=7.5, label="引导比例"),
    ],
    outputs=[
        gr.Textbox(label="生成的提示词"),
        gr.Image(label="生成图像"),
    ],
    title="提示词到图像生成器",
    description="通过文本或上传音频文件输入简短描述,使用 Stable Diffusion 生成图像。如果同时提供两者,将使用音频输入。",
)

if __name__ == "__main__":
    iface.launch()