Spaces:

HeChuan1
/

1

No application file

App Files Files Community

HeChuan1 committed on Apr 25, 2025

Commit

d79cf4d

verified ·

1 Parent(s): c09f07c

Delete app.py

Browse files

import os
import gradio as gr
from openai import OpenAI
from diffusers import StableDiffusionPipeline
from PIL import Image
import speech_recognition as sr

# Create the OpenAI client. The API key is read from the OPENAI_API_KEY
# environment variable; os.environ.get yields None when it is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Load the Stable Diffusion v1.5 pipeline once, when this module is executed.
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
# The pipeline is not moved to a GPU here; uncomment the next line to run it on CUDA.
# pipeline.to("cuda")

def generate_stable_diffusion_prompt(brief_description, style_preset=""):
    """Expand a short description into a detailed Stable Diffusion prompt.

    The description, plus an optional art-style hint, is embedded in a Chinese
    instruction and sent to the ``gpt-3.5-turbo`` chat model through the
    module-level ``client``.

    Args:
        brief_description (str): Short description entered by the user.
        style_preset (str, optional): Art style to request. A falsy value
            adds no style instruction. Defaults to "".

    Returns:
        str: The model's reply with surrounding whitespace stripped, or a
        message starting with "生成 Prompt 时出错：" if the request or the
        handling of its response raised.
    """
    if style_preset:
        style_clause = f"，请使其具有 {style_preset} 的艺术风格"
    else:
        style_clause = ""

    # Two physical lines joined by "\n", exactly as the model should see them.
    instruction = (
        "请将以下简短的描述扩展为一个结构良好、详细且富有创意的 Stable Diffusion 图像生成提示词，\n"
        f"包含主体、环境、光照、艺术风格、细节等要素{style_clause}，"
        f'使其能够生成高质量的图像。描述是："{brief_description}"。'
    )
    chat_messages = [{"role": "user", "content": instruction}]

    try:
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
            max_tokens=150,
            n=1,
            stop=None,
            temperature=0.8,
        )
        reply_text = completion.choices[0].message.content
        return reply_text.strip()
    except Exception as e:
        # Failures are reported to the caller as text rather than raised.
        return f"生成 Prompt 时出错：{e}"

def generate_image(prompt, num_inference_steps, guidance_scale):
    """Render an image for ``prompt`` with the module-level Stable Diffusion pipeline.

    Args:
        prompt (str): Stable Diffusion prompt.
        num_inference_steps (int): Number of denoising steps; coerced with
            ``int()`` before being passed to the pipeline.
        guidance_scale (float): How strongly the prompt guides generation.

    Returns:
        The first entry of the pipeline result's ``images`` (expected to be a
        ``PIL.Image.Image``), or None if anything raised. In that case the
        error is printed.
    """
    try:
        result = pipeline(
            prompt,
            num_inference_steps=int(num_inference_steps),
            guidance_scale=guidance_scale,
        )
        first_image = result.images[0]
    except Exception as err:
        print(f"生成图像时出错：{err}")
        return None
    return first_image

def transcribe_audio(audio_file):
    """Transcribe an audio file to text with SpeechRecognition.

    The file is read in full and sent to the Google Web Speech recognizer
    with the language set to ``zh-CN``.

    Args:
        audio_file (str | None): Path of the recorded or uploaded audio file.
            ``None`` or an empty value means there is nothing to transcribe.

    Returns:
        str: The transcribed text; an empty string when no audio was given;
        otherwise a human-readable message describing why transcription
        failed (speech not understood, recognition service error, or
        unreadable audio file).
    """
    # The change event also fires when the audio widget is cleared, in which
    # case there is no file; return an empty transcript instead of letting
    # sr.AudioFile fail on a non-path value.
    if not audio_file:
        return ""

    try:
        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_file) as source:
            audio_data = recognizer.record(source)
        return recognizer.recognize_google(audio_data, language="zh-CN")
    except sr.UnknownValueError:
        return "无法识别语音"
    except sr.RequestError as e:
        return f"语音识别服务出错：{e}"
    except ValueError as e:
        # sr.AudioFile raises ValueError for formats it cannot decode
        # (anything other than WAV / AIFF / FLAC).
        return f"无法读取音频文件：{e}"

def process(description, num_inference_steps, guidance_scale, style_preset):
    """Run the full flow: build a prompt from the description, then render it.

    Args:
        description (str): Description typed by the user or produced by
            speech transcription.
        num_inference_steps (int): Number of denoising steps.
        guidance_scale (float): How strongly the prompt guides generation.
        style_preset (str): Art-style preset used when building the prompt.

    Returns:
        tuple: ``(prompt_text, image)``. ``image`` is None when the
        description is blank, when prompt generation failed, or when image
        generation failed; in the first two cases ``prompt_text`` carries
        the message to show to the user.
    """
    # Nothing to do for a blank description: avoid a paid API call and a
    # full diffusion run on empty input.
    if description is None or not description.strip():
        return "请先输入描述，再生成图像。", None

    stable_diffusion_prompt = generate_stable_diffusion_prompt(description, style_preset)

    # generate_stable_diffusion_prompt reports failures as text with this
    # prefix; do not feed an error message to the image model.
    if stable_diffusion_prompt.startswith("生成 Prompt 时出错："):
        return stable_diffusion_prompt, None

    generated_image = generate_image(stable_diffusion_prompt, num_inference_steps, guidance_scale)
    return stable_diffusion_prompt, generated_image

def _add_generation_controls(description_component):
    """Add the shared generation controls to the tab currently being built.

    Creates the style dropdown, step and guidance sliders, the generate
    button and the prompt/image outputs, then wires the button to
    ``process``. Must be called inside an open Gradio layout context.

    Args:
        description_component: Component whose value is passed to ``process``
            as the description (a text box in every tab).
    """
    style_dropdown = gr.Dropdown(
        ["", "赛博朋克", "奇幻", "水彩", "油画", "像素艺术"],
        label="选择艺术风格 (将影响 Prompt 生成)",
        value="",
    )
    step_slider = gr.Slider(minimum=10, maximum=100, value=30, step=1, label="迭代步数")
    guidance_slider = gr.Slider(minimum=1.0, maximum=10.0, value=7.5, step=0.5, label="引导强度")
    generate_button = gr.Button("生成图像")
    prompt_output = gr.Textbox(label="生成的 Prompt")
    image_output = gr.Image(label="生成的图像")

    generate_button.click(
        fn=process,
        inputs=[description_component, step_slider, guidance_slider, style_dropdown],
        outputs=[prompt_output, image_output],
    )


if __name__ == '__main__':
    with gr.Blocks() as demo:
        gr.Markdown("# Prompt-to-Image 生成器 (支持文本和语音输入)")
        # gr.Tabs is the layout container for gr.Tab; gr.TabbedInterface
        # combines existing Interface objects and cannot be used as a
        # context manager without arguments.
        with gr.Tabs():
            with gr.Tab("文本输入"):
                description_input_text = gr.Textbox(label="输入你的简短描述", placeholder="例如：未来城市的日落")
                _add_generation_controls(description_input_text)

            with gr.Tab("语音输入"):
                # type="filepath" makes Gradio hand transcribe_audio a file
                # path; the default ("numpy") would pass a
                # (sample_rate, array) tuple instead.
                # NOTE(review): `source=` is the Gradio 3.x spelling; Gradio 4+
                # expects `sources=[...]` — confirm the installed version.
                audio_input = gr.Audio(source="microphone", type="filepath", label="通过麦克风输入描述")
                transcription_output = gr.Textbox(label="语音转录文本", interactive=False)
                audio_input.change(fn=transcribe_audio, inputs=audio_input, outputs=transcription_output)
                _add_generation_controls(transcription_output)

            with gr.Tab("上传音频文件"):
                audio_file_input = gr.Audio(source="upload", type="filepath", label="上传音频文件")
                transcription_output_file = gr.Textbox(label="语音转录文本", interactive=False)
                audio_file_input.change(fn=transcribe_audio, inputs=audio_file_input, outputs=transcription_output_file)
                _add_generation_controls(transcription_output_file)

    demo.launch()

Files changed (1) hide show

app.py +0 -160

app.py DELETED Viewed

@@ -1,160 +0,0 @@
-import os
-import gradio as gr
-from openai import OpenAI
-from diffusers import StableDiffusionPipeline
-from PIL import Image
-import speech_recognition as sr
-# 确保你已经设置了 OPENAI_API_KEY 环境变量
-client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
-# 加载 Stable Diffusion v1.5 模型
-pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
-# 如果你的 GPU 显存有限，可以尝试取消注释下一行以在 CUDA 上运行
-# pipeline.to("cuda")
-def generate_stable_diffusion_prompt(brief_description, style_preset=""):
-    """
-    使用大语言模型将简短描述转换为适用于 Stable Diffusion 的详细 prompt，
-    并可以根据 style_preset 调整 prompt。
-    Args:
-        brief_description (str): 用户输入的简短描述。
-        style_preset (str, optional): 用户选择的艺术风格预设。默认为 "".
-    Returns:
-        str: 生成的 Stable Diffusion prompt。
-    """
-    style_instruction = f"，请使其具有 {style_preset} 的艺术风格" if style_preset else ""
-    prompt = f"""请将以下简短的描述扩展为一个结构良好、详细且富有创意的 Stable Diffusion 图像生成提示词，
-    包含主体、环境、光照、艺术风格、细节等要素{style_instruction}，使其能够生成高质量的图像。描述是："{brief_description}"。"""
-    try:
-        response = client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=[
-                {"role": "user", "content": prompt}
-            ],
-            max_tokens=150,
-            n=1,
-            stop=None,
-            temperature=0.8,
-        )
-        generated_prompt = response.choices[0].message.content.strip()
-        return generated_prompt
-    except Exception as e:
-        return f"生成 Prompt 时出错：{e}"
-def generate_image(prompt, num_inference_steps, guidance_scale):
-    """
-    使用 Stable Diffusion 模型根据 prompt 生成图像。
-    Args:
-        prompt (str): Stable Diffusion prompt。
-        num_inference_steps (int): 迭代步数。
-        guidance_scale (float): prompt 的引导强度。
-    Returns:
-        PIL.Image.Image: 生成的图像。
-    """
-    try:
-        image = pipeline(prompt, num_inference_steps=int(num_inference_steps), guidance_scale=guidance_scale).images[0]
-        return image
-    except Exception as e:
-        print(f"生成图像时出错：{e}")
-        return None
-def transcribe_audio(audio_file):
-    """
-    使用 SpeechRecognition 将上传的音频文件转录为文本。
-    Args:
-        audio_file (str): 上传的音频文件路径。
-    Returns:
-        str: 转录的文本。
-    """
-    try:
-        r = sr.Recognizer()
-        with sr.AudioFile(audio_file) as source:
-            audio_data = r.record(source)
-        text = r.recognize_google(audio_data, language="zh-CN")
-        return text
-    except sr.UnknownValueError:
-        return "无法识别语音"
-    except sr.RequestError as e:
-        return f"语音识别服务出错：{e}"
-def process(description, num_inference_steps, guidance_scale, style_preset):
-    """
-    整合了生成 prompt 和生成图像的流程。
-    Args:
-        description (str): 用户输入的描述。
-        num_inference_steps (int): 迭代步数。
-        guidance_scale (float): 引导强度。
-        style_preset (str): 艺术风格预设。
-    Returns:
-        tuple: 生成的 prompt 和图像。
-    """
-    stable_diffusion_prompt = generate_stable_diffusion_prompt(description, style_preset)
-    generated_image = generate_image(stable_diffusion_prompt, num_inference_steps, guidance_scale)
-    return stable_diffusion_prompt, generated_image
-if __name__ == '__main__':
-    with gr.Blocks() as demo:
-        gr.Markdown("# Prompt-to-Image 生成器 (支持文本和语音输入)")
-        with gr.TabbedInterface():
-            with gr.Tab("文本输入"):
-                description_input_text = gr.Textbox(label="输入你的简短描述", placeholder="例如：未来城市的日落")
-                style_choices_text = ["", "赛博朋克", "奇幻", "水彩", "油画", "像素艺术"]
-                style_dropdown_text = gr.Dropdown(style_choices_text, label="选择艺术风格 (将影响 Prompt 生成)", value="")
-                step_slider_text = gr.Slider(minimum=10, maximum=100, value=30, step=1, label="迭代步数")
-                guidance_slider_text = gr.Slider(minimum=1.0, maximum=10.0, value=7.5, step=0.5, label="引导强度")
-                generate_button_text = gr.Button("生成图像")
-                prompt_output_text = gr.Textbox(label="生成的 Prompt")
-                image_output_text = gr.Image(label="生成的图像")
-                generate_button_text.click(
-                    fn=process,
-                    inputs=[description_input_text, step_slider_text, guidance_slider_text, style_dropdown_text],
-                    outputs=[prompt_output_text, image_output_text]
-                )
-            with gr.Tab("语音输入"):
-                audio_input = gr.Audio(source="microphone", label="通过麦克风输入描述")
-                transcription_output = gr.Textbox(label="语音转录文本")
-                style_choices_audio = ["", "赛博朋克", "奇幻", "水彩", "油画", "像素艺术"]
-                style_dropdown_audio = gr.Dropdown(style_choices_audio, label="选择艺术风格 (将影响 Prompt 生成)", value="")
-                step_slider_audio = gr.Slider(minimum=10, maximum=100, value=30, step=1, label="迭代步数")
-                guidance_slider_audio = gr.Slider(minimum=1.0, maximum=10.0, value=7.5, step=0.5, label="引导强度")
-                generate_button_audio = gr.Button("生成图像")
-                prompt_output_audio = gr.Textbox(label="生成的 Prompt")
-                image_output_audio = gr.Image(label="生成的图像")
-                audio_input.change(fn=transcribe_audio, inputs=audio_input, outputs=transcription_output)
-                generate_button_audio.click(
-                    fn=process,
-                    inputs=[transcription_output, step_slider_audio, guidance_slider_audio, style_dropdown_audio],
-                    outputs=[prompt_output_audio, image_output_audio]
-                )
-            with gr.Tab("上传音频文件"):
-                audio_file_input = gr.Audio(source="upload", label="上传音频文件")
-                transcription_output_file = gr.Textbox(label="语音转录文本")
-                style_choices_file = ["", "赛博朋克", "奇幻", "水彩", "油画", "像素艺术"]
-                style_dropdown_file = gr.Dropdown(style_choices_file, label="选择艺术风格 (将影响 Prompt 生成)", value="")
-                step_slider_file = gr.Slider(minimum=10, maximum=100, value=30, step=1, label="迭代步数")
-                guidance_slider_file = gr.Slider(minimum=1.0, maximum=10.0, value=7.5, step=0.5, label="引导强度")
-                generate_button_file = gr.Button("生成图像")
-                prompt_output_file = gr.Textbox(label="生成的 Prompt")
-                image_output_file = gr.Image(label="生成的图像")
-                audio_file_input.change(fn=transcribe_audio, inputs=audio_file_input, outputs=transcription_output_file)
-                generate_button_file.click(
-                    fn=process,
-                    inputs=[transcription_output_file, step_slider_file, guidance_slider_file, style_dropdown_file],
-                    outputs=[prompt_output_file, image_output_file]
-                )
-    demo.launch()