zuoye / app.py
shenyugan's picture
Update app.py
004f5c9 verified
import gradio as gr
import torch
from diffusers import StableDiffusionPipeline
from diffusers import StableDiffusionXLPipeline
from langdetect import detect
from transformers import pipeline
# Load models.
# All models are loaded once at import time. ``device=0`` / ``.to("cuda")``
# target a CUDA GPU, so this module assumes one is available.

# Chinese -> English translation for descriptions detected as Chinese.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=0)
# Expands a short description into a more detailed image prompt.
prompt_generator = pipeline("text2text-generation", model="google/flan-t5-base", device=0)
# Speech-to-text for the audio input.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=0)

# Stable Diffusion v1.5 in half precision.
pipe_v15 = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe_v15 = pipe_v15.to("cuda")

# SDXL checkpoints must be loaded with the SDXL pipeline class: they carry a
# second text encoder/tokenizer and a UNet that expects extra conditioning,
# which the plain StableDiffusionPipeline does not provide.
pipe_sdxl = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
pipe_sdxl = pipe_sdxl.to("cuda")
def translate_to_english(text):
    """Translate *text* to English using the module-level ``translator`` pipeline.

    Returns the ``translation_text`` field of the first result.
    """
    results = translator(text)
    first_result = results[0]
    return first_result["translation_text"]
def generate_prompt(description):
    """Expand a short description into a detailed image-generation prompt.

    The language of *description* is detected with ``langdetect``. Text whose
    detected language code starts with ``zh`` is first translated to English
    with ``translate_to_english``; any other text is used unchanged. The
    English text is then prefixed with a fixed instruction and passed to the
    ``prompt_generator`` pipeline.

    Args:
        description: The user's description text.

    Returns:
        The ``generated_text`` of the first result from ``prompt_generator``.
    """
    try:
        lang = detect(description)
    except Exception:
        # Detection is best-effort: default to English if it fails. Catching
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        lang = "en"

    if lang.startswith("zh"):
        english_description = translate_to_english(description)
    else:
        english_description = description

    input_text = "Generate a detailed description for an AI image generator: " + english_description
    generated = prompt_generator(input_text, max_length=100, num_beams=5, early_stopping=True)
    return generated[0]["generated_text"]
def generate_image(prompt, model_choice, guidance_scale):
    """Generate one image for *prompt* with the selected diffusion pipeline.

    Args:
        prompt: Text prompt passed to the pipeline.
        model_choice: ``"v1.5"`` selects ``pipe_v15``; any other value
            selects ``pipe_sdxl``.
        guidance_scale: Guidance scale forwarded to the pipeline.

    Returns:
        The first image of the pipeline's output.
    """
    selected_pipe = pipe_v15 if model_choice == "v1.5" else pipe_sdxl
    # Run inference under CUDA autocast.
    with torch.autocast("cuda"):
        output = selected_pipe(prompt=prompt, guidance_scale=guidance_scale)
        image = output.images[0]
    return image
def app_function(text_description, audio_path, model_choice, guidance_scale):
    """Gradio callback: turn a text or audio description into a prompt and image.

    When *audio_path* is given it takes precedence over *text_description*
    and is transcribed with the ``transcriber`` pipeline.

    Args:
        text_description: Description typed by the user (may be empty or None).
        audio_path: Path to an audio file, or None when no audio was given.
        model_choice: Model selector forwarded to ``generate_image``.
        guidance_scale: Guidance scale forwarded to ``generate_image``.

    Returns:
        A ``(prompt, image)`` tuple. When no usable description is available,
        the first element is a user-facing message and the image is None.
    """
    if audio_path is not None:
        # Generation options such as the Whisper language are passed through
        # ``generate_kwargs`` rather than as a direct call argument to the
        # ASR pipeline.
        description = transcriber(audio_path, generate_kwargs={"language": "zh"})["text"]
    else:
        description = text_description

    # Treat None, empty, and whitespace-only descriptions as missing input.
    if not description or not description.strip():
        return "请通过文本或音频提供描述。", None

    prompt = generate_prompt(description)
    image = generate_image(prompt, model_choice, guidance_scale)
    return prompt, image
# Gradio UI: text and audio inputs plus model/guidance controls, wired to
# app_function, which returns the generated prompt and the generated image.
iface = gr.Interface(
    title="提示词到图像生成器",
    description="通过文本或上传音频文件输入简短描述,使用 Stable Diffusion 生成图像。如果同时提供两者,将使用音频输入。",
    fn=app_function,
    # Input order matches app_function's positional parameters.
    inputs=[
        gr.Textbox(label="输入描述(文字)"),
        gr.Audio(label="输入描述(语音)", type="filepath"),
        gr.Dropdown(choices=["v1.5", "SDXL"], label="选择模型"),
        gr.Slider(minimum=1, maximum=20, value=7.5, label="引导比例"),
    ],
    # Output order matches app_function's (prompt, image) return value.
    outputs=[
        gr.Textbox(label="生成的提示词"),
        gr.Image(label="生成图像"),
    ],
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()