import os
import gradio as gr
from openai import OpenAI
from diffusers import StableDiffusionPipeline
from PIL import Image
import speech_recognition as sr
# Requires the OPENAI_API_KEY environment variable to be set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Load the Stable Diffusion v1.5 pipeline (downloads weights on first run).
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# If you have a CUDA GPU, uncomment the next line to run the pipeline on it.
# pipeline.to("cuda")
def generate_stable_diffusion_prompt(brief_description, style_preset=""):
    """Expand a short description into a detailed Stable Diffusion prompt via an LLM.

    Args:
        brief_description (str): Short user-supplied description of the image.
        style_preset (str, optional): Art-style preset woven into the prompt.
            Defaults to "" (no style instruction).

    Returns:
        str: The generated Stable Diffusion prompt, or an error message if the
        API call fails.
    """
    # Only inject a style clause when the user actually picked a preset.
    if style_preset:
        style_instruction = f",请使其具有 {style_preset} 的艺术风格"
    else:
        style_instruction = ""
    prompt = f"""请将以下简短的描述扩展为一个结构良好、详细且富有创意的 Stable Diffusion 图像生成提示词,
包含主体、环境、光照、艺术风格、细节等要素{style_instruction},使其能够生成高质量的图像。描述是:"{brief_description}"。"""
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150,
            n=1,
            stop=None,
            temperature=0.8,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # The error text is surfaced in the UI in place of a prompt.
        return f"生成 Prompt 时出错:{e}"
def generate_image(prompt, num_inference_steps, guidance_scale):
    """Run the Stable Diffusion pipeline on a prompt and return one image.

    Args:
        prompt (str): The Stable Diffusion prompt.
        num_inference_steps (int): Number of denoising iterations.
        guidance_scale (float): Classifier-free guidance strength.

    Returns:
        PIL.Image.Image | None: The generated image, or None if generation failed.
    """
    try:
        # Gradio sliders may deliver floats, so coerce the step count to int.
        result = pipeline(
            prompt,
            num_inference_steps=int(num_inference_steps),
            guidance_scale=guidance_scale,
        )
        return result.images[0]
    except Exception as e:
        print(f"生成图像时出错:{e}")
        return None
def transcribe_audio(audio_file):
    """Transcribe an audio file to text with SpeechRecognition (Google Web API).

    Args:
        audio_file (str): Path to the audio file to transcribe.

    Returns:
        str: The transcribed text, or a human-readable error message.
    """
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_file) as source:
            recorded = recognizer.record(source)
        # zh-CN: the app's UI and prompts are Chinese, so transcribe Mandarin.
        return recognizer.recognize_google(recorded, language="zh-CN")
    except sr.UnknownValueError:
        return "无法识别语音"
    except sr.RequestError as e:
        return f"语音识别服务出错:{e}"
def process(description, num_inference_steps, guidance_scale, style_preset):
    """End-to-end pipeline: description -> LLM prompt -> generated image.

    Args:
        description (str): User-provided description.
        num_inference_steps (int): Number of denoising iterations.
        guidance_scale (float): Classifier-free guidance strength.
        style_preset (str): Art-style preset for prompt generation.

    Returns:
        tuple: (generated prompt string, generated image or None).
    """
    sd_prompt = generate_stable_diffusion_prompt(description, style_preset)
    image = generate_image(sd_prompt, num_inference_steps, guidance_scale)
    return sd_prompt, image
if __name__ == '__main__':
    # Art-style options shared by every input tab.
    STYLE_CHOICES = ["", "赛博朋克", "奇幻", "水彩", "油画", "像素艺术"]

    def _build_generation_controls():
        """Create the style/step/guidance controls and output widgets shared by all tabs.

        Must be called inside an active Gradio layout context so the
        components attach to the current tab.
        """
        style_dropdown = gr.Dropdown(STYLE_CHOICES, label="选择艺术风格 (将影响 Prompt 生成)", value="")
        step_slider = gr.Slider(minimum=10, maximum=100, value=30, step=1, label="迭代步数")
        guidance_slider = gr.Slider(minimum=1.0, maximum=10.0, value=7.5, step=0.5, label="引导强度")
        generate_button = gr.Button("生成图像")
        prompt_output = gr.Textbox(label="生成的 Prompt")
        image_output = gr.Image(label="生成的图像")
        return style_dropdown, step_slider, guidance_slider, generate_button, prompt_output, image_output

    with gr.Blocks() as demo:
        gr.Markdown("# Prompt-to-Image 生成器 (支持文本和语音输入)")
        # BUG FIX: gr.TabbedInterface is a standalone constructor that takes a
        # list of interfaces — it is not a context manager, so the original
        # `with gr.TabbedInterface():` raised at startup. Inside gr.Blocks the
        # correct tab container is gr.Tabs().
        with gr.Tabs():
            with gr.Tab("文本输入"):
                description_input_text = gr.Textbox(label="输入你的简短描述", placeholder="例如:未来城市的日落")
                (style_dropdown_text, step_slider_text, guidance_slider_text,
                 generate_button_text, prompt_output_text, image_output_text) = _build_generation_controls()
                generate_button_text.click(
                    fn=process,
                    inputs=[description_input_text, step_slider_text, guidance_slider_text, style_dropdown_text],
                    outputs=[prompt_output_text, image_output_text],
                )
            with gr.Tab("语音输入"):
                # NOTE(review): `source=` is the Gradio 3.x Audio API; Gradio 4.x
                # renamed it to `sources=[...]` — confirm the installed version.
                audio_input = gr.Audio(source="microphone", label="通过麦克风输入描述")
                transcription_output = gr.Textbox(label="语音转录文本", interactive=False)
                (style_dropdown_audio, step_slider_audio, guidance_slider_audio,
                 generate_button_audio, prompt_output_audio, image_output_audio) = _build_generation_controls()
                # Transcribe automatically whenever the recording changes.
                audio_input.change(fn=transcribe_audio, inputs=audio_input, outputs=transcription_output)
                generate_button_audio.click(
                    fn=process,
                    inputs=[transcription_output, step_slider_audio, guidance_slider_audio, style_dropdown_audio],
                    outputs=[prompt_output_audio, image_output_audio],
                )
            with gr.Tab("上传音频文件"):
                audio_file_input = gr.Audio(source="upload", label="上传音频文件")
                transcription_output_file = gr.Textbox(label="语音转录文本", interactive=False)
                (style_dropdown_file, step_slider_file, guidance_slider_file,
                 generate_button_file, prompt_output_file, image_output_file) = _build_generation_controls()
                audio_file_input.change(fn=transcribe_audio, inputs=audio_file_input, outputs=transcription_output_file)
                generate_button_file.click(
                    fn=process,
                    inputs=[transcription_output_file, step_slider_file, guidance_slider_file, style_dropdown_file],
                    outputs=[prompt_output_file, image_output_file],
                )
    demo.launch()
|
@@ -1,160 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import gradio as gr
|
| 3 |
-
from openai import OpenAI
|
| 4 |
-
from diffusers import StableDiffusionPipeline
|
| 5 |
-
from PIL import Image
|
| 6 |
-
import speech_recognition as sr
|
| 7 |
-
|
| 8 |
-
# 确保你已经设置了 OPENAI_API_KEY 环境变量
|
| 9 |
-
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
|
| 10 |
-
|
| 11 |
-
# 加载 Stable Diffusion v1.5 模型
|
| 12 |
-
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
|
| 13 |
-
# 如果你的 GPU 显存有限,可以尝试取消注释下一行以在 CUDA 上运行
|
| 14 |
-
# pipeline.to("cuda")
|
| 15 |
-
|
| 16 |
-
def generate_stable_diffusion_prompt(brief_description, style_preset=""):
|
| 17 |
-
"""
|
| 18 |
-
使用大语言模型将简短描述转换为适用于 Stable Diffusion 的详细 prompt,
|
| 19 |
-
并可以根据 style_preset 调整 prompt。
|
| 20 |
-
|
| 21 |
-
Args:
|
| 22 |
-
brief_description (str): 用户输入的简短描述。
|
| 23 |
-
style_preset (str, optional): 用户选择的艺术风格预设。默认为 "".
|
| 24 |
-
|
| 25 |
-
Returns:
|
| 26 |
-
str: 生成的 Stable Diffusion prompt。
|
| 27 |
-
"""
|
| 28 |
-
style_instruction = f",请使其具有 {style_preset} 的艺术风格" if style_preset else ""
|
| 29 |
-
prompt = f"""请将以下简短的描述扩展为一个结构良好、详细且富有创意的 Stable Diffusion 图像生成提示词,
|
| 30 |
-
包含主体、环境、光照、艺术风格、细节等要素{style_instruction},使其能够生成高质量的图像。描述是:"{brief_description}"。"""
|
| 31 |
-
try:
|
| 32 |
-
response = client.chat.completions.create(
|
| 33 |
-
model="gpt-3.5-turbo",
|
| 34 |
-
messages=[
|
| 35 |
-
{"role": "user", "content": prompt}
|
| 36 |
-
],
|
| 37 |
-
max_tokens=150,
|
| 38 |
-
n=1,
|
| 39 |
-
stop=None,
|
| 40 |
-
temperature=0.8,
|
| 41 |
-
)
|
| 42 |
-
generated_prompt = response.choices[0].message.content.strip()
|
| 43 |
-
return generated_prompt
|
| 44 |
-
except Exception as e:
|
| 45 |
-
return f"生成 Prompt 时出错:{e}"
|
| 46 |
-
|
| 47 |
-
def generate_image(prompt, num_inference_steps, guidance_scale):
|
| 48 |
-
"""
|
| 49 |
-
使用 Stable Diffusion 模型根据 prompt 生成图像。
|
| 50 |
-
|
| 51 |
-
Args:
|
| 52 |
-
prompt (str): Stable Diffusion prompt。
|
| 53 |
-
num_inference_steps (int): 迭代步数。
|
| 54 |
-
guidance_scale (float): prompt 的引导强度。
|
| 55 |
-
|
| 56 |
-
Returns:
|
| 57 |
-
PIL.Image.Image: 生成的图像。
|
| 58 |
-
"""
|
| 59 |
-
try:
|
| 60 |
-
image = pipeline(prompt, num_inference_steps=int(num_inference_steps), guidance_scale=guidance_scale).images[0]
|
| 61 |
-
return image
|
| 62 |
-
except Exception as e:
|
| 63 |
-
print(f"生成图像时出错:{e}")
|
| 64 |
-
return None
|
| 65 |
-
|
| 66 |
-
def transcribe_audio(audio_file):
|
| 67 |
-
"""
|
| 68 |
-
使用 SpeechRecognition 将上传的音频文件转录为文本。
|
| 69 |
-
|
| 70 |
-
Args:
|
| 71 |
-
audio_file (str): 上传的音频文件路径。
|
| 72 |
-
|
| 73 |
-
Returns:
|
| 74 |
-
str: 转录的文本。
|
| 75 |
-
"""
|
| 76 |
-
try:
|
| 77 |
-
r = sr.Recognizer()
|
| 78 |
-
with sr.AudioFile(audio_file) as source:
|
| 79 |
-
audio_data = r.record(source)
|
| 80 |
-
text = r.recognize_google(audio_data, language="zh-CN")
|
| 81 |
-
return text
|
| 82 |
-
except sr.UnknownValueError:
|
| 83 |
-
return "无法识别语音"
|
| 84 |
-
except sr.RequestError as e:
|
| 85 |
-
return f"语音识别服务出错:{e}"
|
| 86 |
-
|
| 87 |
-
def process(description, num_inference_steps, guidance_scale, style_preset):
|
| 88 |
-
"""
|
| 89 |
-
整合了生成 prompt 和生成图像的流程。
|
| 90 |
-
|
| 91 |
-
Args:
|
| 92 |
-
description (str): 用户输入的描述。
|
| 93 |
-
num_inference_steps (int): 迭代步数。
|
| 94 |
-
guidance_scale (float): 引导强度。
|
| 95 |
-
style_preset (str): 艺术风格预设。
|
| 96 |
-
|
| 97 |
-
Returns:
|
| 98 |
-
tuple: 生成的 prompt 和图像。
|
| 99 |
-
"""
|
| 100 |
-
stable_diffusion_prompt = generate_stable_diffusion_prompt(description, style_preset)
|
| 101 |
-
generated_image = generate_image(stable_diffusion_prompt, num_inference_steps, guidance_scale)
|
| 102 |
-
return stable_diffusion_prompt, generated_image
|
| 103 |
-
|
| 104 |
-
if __name__ == '__main__':
|
| 105 |
-
with gr.Blocks() as demo:
|
| 106 |
-
gr.Markdown("# Prompt-to-Image 生成器 (支持文本和语音输入)")
|
| 107 |
-
with gr.TabbedInterface():
|
| 108 |
-
with gr.Tab("文本输入"):
|
| 109 |
-
description_input_text = gr.Textbox(label="输入你的简短描述", placeholder="例如:未来城市的日落")
|
| 110 |
-
style_choices_text = ["", "赛博朋克", "奇幻", "水彩", "油画", "像素艺术"]
|
| 111 |
-
style_dropdown_text = gr.Dropdown(style_choices_text, label="选择艺术风格 (将影响 Prompt 生成)", value="")
|
| 112 |
-
step_slider_text = gr.Slider(minimum=10, maximum=100, value=30, step=1, label="迭代步数")
|
| 113 |
-
guidance_slider_text = gr.Slider(minimum=1.0, maximum=10.0, value=7.5, step=0.5, label="引导强度")
|
| 114 |
-
generate_button_text = gr.Button("生成图像")
|
| 115 |
-
prompt_output_text = gr.Textbox(label="生成的 Prompt")
|
| 116 |
-
image_output_text = gr.Image(label="生成的图像")
|
| 117 |
-
|
| 118 |
-
generate_button_text.click(
|
| 119 |
-
fn=process,
|
| 120 |
-
inputs=[description_input_text, step_slider_text, guidance_slider_text, style_dropdown_text],
|
| 121 |
-
outputs=[prompt_output_text, image_output_text]
|
| 122 |
-
)
|
| 123 |
-
|
| 124 |
-
with gr.Tab("语音输入"):
|
| 125 |
-
audio_input = gr.Audio(source="microphone", label="通过麦克风输入描述")
|
| 126 |
-
transcription_output = gr.Textbox(label="语音转录文本")
|
| 127 |
-
style_choices_audio = ["", "赛博朋克", "奇幻", "水彩", "油画", "像素艺术"]
|
| 128 |
-
style_dropdown_audio = gr.Dropdown(style_choices_audio, label="选择艺术风格 (将影响 Prompt 生成)", value="")
|
| 129 |
-
step_slider_audio = gr.Slider(minimum=10, maximum=100, value=30, step=1, label="迭代步数")
|
| 130 |
-
guidance_slider_audio = gr.Slider(minimum=1.0, maximum=10.0, value=7.5, step=0.5, label="引导强度")
|
| 131 |
-
generate_button_audio = gr.Button("生成图像")
|
| 132 |
-
prompt_output_audio = gr.Textbox(label="生成的 Prompt")
|
| 133 |
-
image_output_audio = gr.Image(label="生成的图像")
|
| 134 |
-
|
| 135 |
-
audio_input.change(fn=transcribe_audio, inputs=audio_input, outputs=transcription_output)
|
| 136 |
-
generate_button_audio.click(
|
| 137 |
-
fn=process,
|
| 138 |
-
inputs=[transcription_output, step_slider_audio, guidance_slider_audio, style_dropdown_audio],
|
| 139 |
-
outputs=[prompt_output_audio, image_output_audio]
|
| 140 |
-
)
|
| 141 |
-
|
| 142 |
-
with gr.Tab("上传音频文件"):
|
| 143 |
-
audio_file_input = gr.Audio(source="upload", label="上传音频文件")
|
| 144 |
-
transcription_output_file = gr.Textbox(label="语音转录文本")
|
| 145 |
-
style_choices_file = ["", "赛博朋克", "奇幻", "水彩", "油画", "像素艺术"]
|
| 146 |
-
style_dropdown_file = gr.Dropdown(style_choices_file, label="选择艺术风格 (将影响 Prompt 生成)", value="")
|
| 147 |
-
step_slider_file = gr.Slider(minimum=10, maximum=100, value=30, step=1, label="迭代步数")
|
| 148 |
-
guidance_slider_file = gr.Slider(minimum=1.0, maximum=10.0, value=7.5, step=0.5, label="引导强度")
|
| 149 |
-
generate_button_file = gr.Button("生成图像")
|
| 150 |
-
prompt_output_file = gr.Textbox(label="生成的 Prompt")
|
| 151 |
-
image_output_file = gr.Image(label="生成的图像")
|
| 152 |
-
|
| 153 |
-
audio_file_input.change(fn=transcribe_audio, inputs=audio_file_input, outputs=transcription_output_file)
|
| 154 |
-
generate_button_file.click(
|
| 155 |
-
fn=process,
|
| 156 |
-
inputs=[transcription_output_file, step_slider_file, guidance_slider_file, style_dropdown_file],
|
| 157 |
-
outputs=[prompt_output_file, image_output_file]
|
| 158 |
-
)
|
| 159 |
-
|
| 160 |
-
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|