import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import speech_recognition as sr
import torch
from diffusers import StableDiffusionPipeline
from transformers import pipeline


# Resolve the compute device once so the dtype choice and the final `.to()`
# call cannot disagree. Half precision is only requested on CUDA, because
# float16 inference on CPU is unsupported or extremely slow for many ops.
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
_SD_DTYPE = torch.float16 if _DEVICE == "cuda" else torch.float32

# Text-generation pipeline used to expand short descriptions into prompts.
llm_pipe = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")

# Stable Diffusion pipeline used to render the final image.
sd_pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=_SD_DTYPE,
).to(_DEVICE)

# Shared speech-to-text recognizer used by the audio input path.
recognizer = sr.Recognizer()


def enhance_prompt(basic_prompt, style, detail_level, artist_style):
    """Expand a short description into a detailed Stable Diffusion prompt.

    The four arguments are interpolated verbatim into an instruction
    template, which is then completed by the module-level ``llm_pipe``.

    Args:
        basic_prompt: Short description of the desired image.
        style: Visual style label to request.
        detail_level: Requested level of detail.
        artist_style: Artist whose style should be imitated.

    Returns:
        The text generated for the template, stripped of surrounding
        whitespace (the template itself is not included).
    """
    template_lines = [
        "",
        "根据以下简短描述创建一个详细的Stable Diffusion提示:",
        f"原始描述: {basic_prompt}",
        f"风格: {style}",
        f"细节级别: {detail_level}",
        f"艺术家风格: {artist_style}",
        "",
        "请生成一个包含以下元素的详细提示:",
        "- 主体描述",
        "- 环境/背景",
        "- 光照条件",
        "- 色彩风格",
        "- 艺术媒介(如数字绘画、油画等)",
        "- 质量描述(如4K、超详细等)",
        "",
        "生成的提示:",
        "",
    ]
    prompt_template = "\n".join(template_lines)

    outputs = llm_pipe(
        prompt_template,
        # Budget only the newly generated tokens: `max_length` also counts
        # the (long) template, which can leave little or no room for output.
        max_new_tokens=200,
        num_return_sequences=1,
        # Temperature only takes effect when sampling is enabled.
        do_sample=True,
        temperature=0.7,
        # Ask the pipeline to omit the prompt instead of stripping it with
        # str.replace, which silently fails if the echo is not byte-identical.
        return_full_text=False,
    )
    return outputs[0]["generated_text"].strip()


def generate_image(enhanced_prompt, steps, guidance_scale, seed):
    """Render one image with the module-level Stable Diffusion pipeline.

    Args:
        enhanced_prompt: Text prompt passed to ``sd_pipe``.
        steps: Number of inference steps; coerced to ``int``.
        guidance_scale: Guidance scale forwarded to ``sd_pipe`` unchanged.
        seed: Random seed. ``-1`` or ``None`` selects a fresh random seed;
            any other value is coerced to ``int``.

    Returns:
        A ``(image, seed)`` tuple where ``image`` is the first generated
        image and ``seed`` is the integer seed that was actually used.
    """
    # Numeric UI widgets can deliver floats (or None when cleared), while
    # torch.Generator.manual_seed only accepts integers.
    if seed is None or seed == -1:
        seed = torch.randint(0, 2**32, (1,)).item()
    seed = int(seed)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    generator = torch.Generator(device=device).manual_seed(seed)

    result = sd_pipe(
        enhanced_prompt,
        num_inference_steps=int(steps),
        guidance_scale=guidance_scale,
        generator=generator,
    )
    return result.images[0], seed


def process_audio(audio):
    """Transcribe an audio clip to text via the shared ``recognizer``.

    Args:
        audio: A ``(sample_rate, samples)`` pair; it is unpacked in that
            order and the samples are converted to a NumPy array.

    Returns:
        The recognized text, or a string beginning with ``"语音识别错误: "``
        followed by the error message when recognition fails.
    """
    # Deliberately NOT named `sr`: binding that name locally would shadow the
    # speech_recognition module and break `sr.AudioFile` below.
    sample_rate, raw_samples = audio

    samples = np.asarray(raw_samples)
    if np.issubdtype(samples.dtype, np.integer):
        # soundfile treats float data as normalized to [-1, 1]; integer PCM
        # cast to float without scaling would be clipped on write.
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)

    # A unique file per call avoids collisions between concurrent requests.
    fd, temp_file = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sf.write(temp_file, samples, sample_rate, subtype="PCM_16")
        with sr.AudioFile(temp_file) as source:
            recording = recognizer.record(source)
    finally:
        # Runs after the AudioFile context has closed the file, and also
        # when writing/reading raised, so the temp file never leaks.
        os.remove(temp_file)

    try:
        return recognizer.recognize_google(recording, language='en-US')
    except Exception as e:
        # Best-effort by design: callers receive the failure as text.
        return f"语音识别错误: {str(e)}"


def full_process(basic_prompt, style, detail_level, artist_style, steps, guidance_scale, seed, use_audio, audio_input):
    """Run the end-to-end flow: optional speech input, prompt enhancement, image generation.

    When ``use_audio`` is truthy and ``audio_input`` is not ``None``, the
    description is taken from ``process_audio(audio_input)``; otherwise
    ``basic_prompt`` is used as given.

    Returns:
        A ``(enhanced_prompt, image, used_seed)`` tuple, where the last two
        items come from ``generate_image``.
    """
    speech_requested = use_audio and audio_input is not None
    description = process_audio(audio_input) if speech_requested else basic_prompt

    detailed_prompt = enhance_prompt(description, style, detail_level, artist_style)
    picture, final_seed = generate_image(detailed_prompt, steps, guidance_scale, seed)

    return detailed_prompt, picture, final_seed


# Gradio UI: inputs in the left column, results in the right column.
with gr.Blocks(title="魔法树屋图像生成器") as demo:
    gr.Markdown("# 🎨 魔法树屋图像生成器")
    gr.Markdown("输入简短描述或使用语音输入,生成精美图像!")

    with gr.Row():
        with gr.Column():
            # Input mode: either the text box or the audio widget is shown.
            use_audio = gr.Checkbox(label="使用语音输入")
            audio_input = gr.Audio(label="录音", visible=False)

            basic_prompt = gr.Textbox(
                label="简短描述",
                placeholder="例如: 天空中的魔法树屋",
                visible=True,
            )

            def toggle_input(use_audio):
                """Show the audio widget and hide the text box, or vice versa."""
                return {
                    basic_prompt: gr.update(visible=not use_audio),
                    audio_input: gr.update(visible=use_audio),
                }

            use_audio.change(
                toggle_input,
                inputs=use_audio,
                outputs=[basic_prompt, audio_input],
            )

            # Prompt-enhancement options.
            style = gr.Dropdown(
                label="风格",
                choices=["现实主义", "幻想艺术", "赛博朋克", "水墨画", "卡通", "极简主义"],
                value="幻想艺术",
            )

            detail_level = gr.Slider(
                label="细节级别",
                minimum=1,
                maximum=5,
                step=1,
                value=3,
            )

            artist_style = gr.Dropdown(
                label="艺术家风格",
                choices=["无", "梵高", "毕加索", "莫奈", "达利", "宫崎骏"],
                value="无",
            )

            # Image-generation options.
            with gr.Accordion("高级选项", open=False):
                steps = gr.Slider(
                    label="生成步数",
                    minimum=20,
                    maximum=100,
                    step=5,
                    value=50,
                )

                guidance_scale = gr.Slider(
                    label="引导尺度",
                    minimum=1.0,
                    maximum=20.0,
                    step=0.5,
                    value=7.5,
                )

                # precision=0 makes the widget deliver an int; the seed is
                # passed to torch.Generator.manual_seed, which rejects floats.
                seed = gr.Number(
                    label="随机种子 (-1 表示随机)",
                    value=-1,
                    precision=0,
                )

            submit_btn = gr.Button("生成图像", variant="primary")

        with gr.Column():
            # Output widgets.
            enhanced_prompt = gr.Textbox(
                label="生成的提示",
                interactive=False,
            )

            image_output = gr.Image(
                label="生成的图像",
                height=512,
            )

            # precision=0 so the reported seed is shown as an integer.
            used_seed = gr.Number(
                label="使用的种子",
                interactive=False,
                precision=0,
            )

    # Wire the button to the full pipeline; input order matches the
    # positional parameters of full_process.
    submit_btn.click(
        fn=full_process,
        inputs=[
            basic_prompt, style, detail_level, artist_style,
            steps, guidance_scale, seed, use_audio, audio_input,
        ],
        outputs=[enhanced_prompt, image_output, used_seed],
    )


# Enable request queuing for the app.
demo.queue()


if __name__ == "__main__":
    demo.launch()