import gradio as gr from transformers import pipeline import torch from diffusers import StableDiffusionPipeline import soundfile as sf import speech_recognition as sr import numpy as np import os # 初始化组件 # 使用较小的开源LLM进行提示增强 llm_pipe = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1") # 初始化Stable Diffusion sd_pipe = StableDiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16 ).to("cuda" if torch.cuda.is_available() else "cpu") # 语音识别初始化 recognizer = sr.Recognizer() def enhance_prompt(basic_prompt, style, detail_level, artist_style): """使用LLM增强提示词""" prompt_template = f""" 根据以下简短描述创建一个详细的Stable Diffusion提示: 原始描述: {basic_prompt} 风格: {style} 细节级别: {detail_level} 艺术家风格: {artist_style} 请生成一个包含以下元素的详细提示: - 主体描述 - 环境/背景 - 光照条件 - 色彩风格 - 艺术媒介(如数字绘画、油画等) - 质量描述(如4K、超详细等) 生成的提示: """ enhanced_prompt = llm_pipe( prompt_template, max_length=200, num_return_sequences=1, temperature=0.7 )[0]['generated_text'] # 清理生成的文本 enhanced_prompt = enhanced_prompt.replace(prompt_template, "").strip() return enhanced_prompt def generate_image(enhanced_prompt, steps, guidance_scale, seed): """使用Stable Diffusion生成图像""" if seed == -1: seed = torch.randint(0, 2**32, (1,)).item() generator = torch.Generator(device="cuda" if torch.cuda.is_available() else "cpu").manual_seed(seed) image = sd_pipe( enhanced_prompt, num_inference_steps=steps, guidance_scale=guidance_scale, generator=generator ).images[0] return image, seed def process_audio(audio): """处理语音输入""" sr, audio_data = audio audio_array = np.array(audio_data, dtype=np.float32) # 保存临时文件供语音识别使用 temp_file = "temp_audio.wav" sf.write(temp_file, audio_array, sr) with sr.AudioFile(temp_file) as source: audio_data = recognizer.record(source) try: text = recognizer.recognize_google(audio_data, language='en-US') os.remove(temp_file) return text except Exception as e: os.remove(temp_file) return f"语音识别错误: {str(e)}" def full_process(basic_prompt, style, detail_level, artist_style, steps, guidance_scale, seed, use_audio, audio_input): """完整处理流程""" # 处理语音输入 if use_audio and audio_input is not None: basic_prompt = process_audio(audio_input) # 生成增强提示 enhanced_prompt = enhance_prompt(basic_prompt, style, detail_level, artist_style) # 生成图像 image, used_seed = generate_image(enhanced_prompt, steps, guidance_scale, seed) return enhanced_prompt, image, used_seed # Gradio界面 with gr.Blocks(title="魔法树屋图像生成器") as demo: gr.Markdown("# 🎨 魔法树屋图像生成器") gr.Markdown("输入简短描述或使用语音输入,生成精美图像!") with gr.Row(): with gr.Column(): # 输入部分 use_audio = gr.Checkbox(label="使用语音输入") audio_input = gr.Audio(label="录音", visible=False) basic_prompt = gr.Textbox( label="简短描述", placeholder="例如: 天空中的魔法树屋", visible=True ) # 当复选框变化时切换输入方式 def toggle_input(use_audio): return { basic_prompt: gr.update(visible=not use_audio), audio_input: gr.update(visible=use_audio) } use_audio.change( toggle_input, inputs=use_audio, outputs=[basic_prompt, audio_input] ) # 风格选项 style = gr.Dropdown( label="风格", choices=["现实主义", "幻想艺术", "赛博朋克", "水墨画", "卡通", "极简主义"], value="幻想艺术" ) detail_level = gr.Slider( label="细节级别", minimum=1, maximum=5, step=1, value=3 ) artist_style = gr.Dropdown( label="艺术家风格", choices=["无", "梵高", "毕加索", "莫奈", "达利", "宫崎骏"], value="无" ) # 高级选项 with gr.Accordion("高级选项", open=False): steps = gr.Slider( label="生成步数", minimum=20, maximum=100, step=5, value=50 ) guidance_scale = gr.Slider( label="引导尺度", minimum=1.0, maximum=20.0, step=0.5, value=7.5 ) seed = gr.Number( label="随机种子 (-1 表示随机)", value=-1 ) submit_btn = gr.Button("生成图像", variant="primary") with gr.Column(): # 输出部分 enhanced_prompt = gr.Textbox( label="生成的提示", interactive=False ) image_output = gr.Image( label="生成的图像", height=512 ) used_seed = gr.Number( label="使用的种子", interactive=False ) # 连接按钮 submit_btn.click( fn=full_process, inputs=[ basic_prompt, style, detail_level, artist_style, steps, guidance_scale, seed, use_audio, audio_input ], outputs=[enhanced_prompt, image_output, used_seed] ) # 对于Hugging Face Spaces,我们需要设置队列 demo.queue() # 启动应用 if __name__ == "__main__": demo.launch()