|
|
import gradio as gr
import torch
from diffusers import StableDiffusionPipeline, StableDiffusionXLPipeline
from langdetect import detect
from transformers import pipeline
|
|
|
|
|
|
|
|
# Index of the CUDA device on which all three text/audio pipelines are placed.
_GPU_INDEX = 0

# Chinese -> English machine translation, used when the description is Chinese.
translator = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-zh-en",
    device=_GPU_INDEX,
)

# Instruction-following text2text model that expands a short description
# into a richer image-generation prompt.
prompt_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    device=_GPU_INDEX,
)

# Speech-to-text model used for audio descriptions.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=_GPU_INDEX,
)
|
|
|
|
|
# Stable Diffusion v1.5 in half precision, moved to the GPU right after loading.
pipe_v15 = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
).to("cuda")
|
|
|
|
|
# Stable Diffusion XL in half precision on the GPU.
# SDXL checkpoints must be loaded with StableDiffusionXLPipeline: they ship two
# text encoders/tokenizers and a UNet that expects extra conditioning inputs,
# none of which the plain StableDiffusionPipeline class knows how to handle.
pipe_sdxl = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
)
pipe_sdxl = pipe_sdxl.to("cuda")
|
|
|
|
|
def translate_to_english(text):
    """Translate *text* with the module-level ``translator`` pipeline.

    Args:
        text: Source text to translate.

    Returns:
        The ``translation_text`` field of the first translation result.
    """
    first_result = translator(text)[0]
    return first_result["translation_text"]
|
|
|
|
|
def generate_prompt(description):
    """Turn a short user description into a detailed image-generation prompt.

    The description's language is detected first; Chinese input is translated
    to English before being handed to the prompt-generation model.

    Args:
        description: Short free-form description of the desired image.

    Returns:
        The text generated by ``prompt_generator`` for the description.
    """
    try:
        lang = detect(description)
    except Exception:
        # Language detection is best-effort (it fails on input with no usable
        # features); fall back to English. ``Exception`` rather than a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        lang = "en"

    # langdetect reports Chinese as e.g. "zh-cn" / "zh-tw", hence the prefix test.
    if lang.startswith("zh"):
        english_description = translate_to_english(description)
    else:
        english_description = description

    input_text = (
        "Generate a detailed description for an AI image generator: "
        + english_description
    )
    generated = prompt_generator(
        input_text, max_length=100, num_beams=5, early_stopping=True
    )
    return generated[0]["generated_text"]
|
|
|
|
|
def generate_image(prompt, model_choice, guidance_scale):
    """Render one image for *prompt* with the selected diffusion pipeline.

    Args:
        prompt: Text prompt passed to the diffusion pipeline.
        model_choice: ``"v1.5"`` selects ``pipe_v15``; any other value
            selects ``pipe_sdxl``.
        guidance_scale: Guidance scale forwarded to the pipeline.

    Returns:
        The first image produced by the pipeline call.
    """
    selected_pipe = pipe_v15 if model_choice == "v1.5" else pipe_sdxl
    with torch.autocast("cuda"):
        result = selected_pipe(prompt=prompt, guidance_scale=guidance_scale)
        first_image = result.images[0]
    return first_image
|
|
|
|
|
def app_function(text_description, audio_path, model_choice, guidance_scale):
    """Gradio callback: build a prompt from text or audio and render an image.

    When an audio file is supplied it takes precedence over the text box and
    is transcribed first.

    Args:
        text_description: Description typed by the user (may be empty/None).
        audio_path: Path of the recorded/uploaded audio file, or ``None``.
        model_choice: Model name forwarded to ``generate_image``.
        guidance_scale: Guidance scale forwarded to ``generate_image``.

    Returns:
        A ``(prompt, image)`` tuple. If no usable description was provided,
        the first element is a message asking for input and the image is
        ``None``.
    """
    if audio_path is not None:
        # Decoding options such as the language must be passed through
        # ``generate_kwargs``; the ASR pipeline does not accept ``language``
        # as a top-level call argument.
        transcription = transcriber(
            audio_path, generate_kwargs={"language": "zh"}
        )
        description = transcription["text"]
    else:
        description = text_description

    # Treat None, empty and whitespace-only descriptions as "nothing provided"
    # so blank input never reaches the (expensive) generation models.
    if not description or not description.strip():
        return "请通过文本或音频提供描述。", None

    prompt = generate_prompt(description)
    image = generate_image(prompt, model_choice, guidance_scale)
    return prompt, image
|
|
|
|
|
# Input widgets, in the positional order expected by ``app_function``:
# text description, audio file path, model choice, guidance scale.
_input_components = [
    gr.Textbox(label="输入描述(文字)"),
    gr.Audio(label="输入描述(语音)", type="filepath"),
    gr.Dropdown(choices=["v1.5", "SDXL"], label="选择模型"),
    gr.Slider(minimum=1, maximum=20, value=7.5, label="引导比例"),
]

# Output widgets matching the ``(prompt, image)`` tuple returned by the callback.
_output_components = [
    gr.Textbox(label="生成的提示词"),
    gr.Image(label="生成图像"),
]

# The Gradio UI wiring the widgets above to ``app_function``.
iface = gr.Interface(
    fn=app_function,
    inputs=_input_components,
    outputs=_output_components,
    title="提示词到图像生成器",
    description="通过文本或上传音频文件输入简短描述,使用 Stable Diffusion 生成图像。如果同时提供两者,将使用音频输入。",
)
|
|
|
|
|
def main():
    """Launch the Gradio interface."""
    iface.launch()


# Only start the web UI when this file is executed as a script.
if __name__ == "__main__":
    main()