Spaces:

HeChuan1
/

123

Sleeping

App Files Files Community

123 / app.py

HeChuan1

Update app.py

9cb20a6 verified 9 months ago

raw

history blame contribute delete

15 kB

	# app.py

	import gradio as gr
	import os
	import torch
	from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
	from transformers import pipeline
	from openai import OpenAI
	import time
	from PIL import Image

	# --- 配置 ---

	# Read the OpenAI API key from Hugging Face Space Secrets or the environment.
	# Important: the secret must be named 'OPENAI_API_KEY'.
	OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

	# The client is only built when a key was found; otherwise it stays None.
	openai_client = None
	if OPENAI_API_KEY:
	    try:
	        openai_client = OpenAI(api_key=OPENAI_API_KEY)
	    except Exception as exc:
	        # The assignment above never completed, so openai_client is still None.
	        print(f"初始化OpenAI客户端时出错: {exc}")
	else:
	    print("警告：未在环境变量/Secrets中找到OpenAI API密钥。")
	    # To make the key mandatory, raise a ValueError here instead of continuing.
	    print("因缺少API密钥，OpenAI客户端未初始化。")


	# --- 模型加载 ---
	# 在主函数外部加载模型，以防止每次请求都重新加载。
	# Space 启动时可能需要一些时间。

	# 1. Stable Diffusion pipeline (v1.5, picked by the author to keep CPU usage down).
	# float32 is requested for CPU compatibility; generation on CPU will be slow.
	print("正在加载 Stable Diffusion 模型...")
	start_time = time.time()
	try:
	    pipe = StableDiffusionPipeline.from_pretrained(
	        "runwayml/stable-diffusion-v1-5",
	        torch_dtype=torch.float32,
	    )
	    # Replace the default scheduler with DPM-Solver multistep, rebuilt from the
	    # existing scheduler config (author's note: a faster scheduler helps on CPU).
	    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
	    # If RAM is tight on the free tier, pipe.enable_attention_slicing() is the
	    # author's suggested next step; it is intentionally not enabled here.
	except Exception as exc:
	    print(f"加载 Stable Diffusion 模型时出错: {exc}")
	    # Downstream code treats a None pipeline as "model unavailable".
	    pipe = None
	else:
	    print(f"Stable Diffusion 模型加载完成，耗时 {time.time() - start_time:.2f} 秒。")


	# 2. Speech-to-text model (Whisper "base", the author's speed/quality trade-off for CPU).
	print("正在加载语音转文本模型...")
	start_time = time.time()
	try:
	    asr_pipeline = pipeline(
	        "automatic-speech-recognition",
	        model="openai/whisper-base",
	        device=-1,  # -1 pins the pipeline to the CPU
	    )
	except Exception as exc:
	    print(f"加载语音转文本模型时出错: {exc}")
	    # Downstream code treats a None pipeline as "model unavailable".
	    asr_pipeline = None
	else:
	    print(f"语音转文本模型加载完成，耗时 {time.time() - start_time:.2f} 秒。")


	# --- 核心函数 ---

	# 步骤 1: Prompt-to-Prompt 模块
	def generate_detailed_prompt(short_prompt, style_preference="电影感", neg_prompt=""):
	    """Expand a short idea into a detailed Stable Diffusion prompt using the LLM.

	    Args:
	        short_prompt: The user's short description of the desired image.
	        style_preference: Style label to weave into the prompt.
	        neg_prompt: Optional user-supplied negative prompt.

	    Returns:
	        A ``(prompt, negative_prompt)`` tuple of strings. When the OpenAI client
	        is unavailable or the API call fails, a basic prompt built from the
	        inputs is returned instead (a warning toast is shown via ``gr.Warning``);
	        this function does not raise for API errors.
	    """
	    if not openai_client:
	        print("OpenAI 客户端不可用。")
	        gr.Warning("OpenAI API Key 未配置。将使用基础提示词。")
	        # Fallback without an LLM: just the short prompt plus the style.
	        return f"{short_prompt}，风格：{style_preference}", neg_prompt or "丑陋, 模糊, 低质量"

	    # The instructions to the LLM are kept in English on purpose.
	    system_message = "You are an expert prompt generator for Stable Diffusion."
	    user_message = (
	        f"Based on the user's simple idea: '{short_prompt}', "
	        f"generate a detailed and structured prompt for Stable Diffusion v1.5. "
	        f"Incorporate the desired style: '{style_preference}'. "
	        f"Include details about the subject, setting, lighting, composition, and quality keywords (like 'photorealistic', 'highly detailed', '4k', 'masterpiece'). "
	        f"Also, suggest a relevant negative prompt focusing on common image issues (like 'ugly, deformed, blurry, low quality, extra limbs, text, words'). "
	        f"If the user provided a negative prompt ('{neg_prompt}'), incorporate its essence or add to it. "
	        f"Format the output clearly, separating the main prompt and the negative prompt, perhaps using '### Prompt:' and '### Negative Prompt:' labels."
	    )

	    try:
	        print(f"向OpenAI发送请求以生成提示词： {short_prompt}")
	        response = openai_client.chat.completions.create(
	            model="gpt-3.5-turbo",
	            messages=[
	                {"role": "system", "content": system_message},
	                {"role": "user", "content": user_message},
	            ],
	            max_tokens=150,  # cap token usage
	            temperature=0.7,
	        )
	        generated_text = response.choices[0].message.content.strip()
	        print(f"收到来自OpenAI的响应： {generated_text}")

	        final_prompt, final_neg_prompt = _split_prompt_sections(generated_text, neg_prompt)

	        # Never hand back an empty prompt or negative prompt.
	        if not final_prompt:
	            final_prompt = f"{short_prompt}，风格：{style_preference}，高度详细"
	        if not final_neg_prompt:
	            final_neg_prompt = "丑陋, 模糊, 低质量"

	        return final_prompt, final_neg_prompt

	    except Exception as e:
	        # Best-effort: any API or parsing failure degrades to the basic prompt.
	        print(f"调用OpenAI API时出错: {e}")
	        gr.Warning(f"OpenAI API 错误: {e}。将使用基础提示词。")
	        return f"{short_prompt}，风格：{style_preference}，高度详细", neg_prompt or "丑陋, 模糊, 低质量"


	def _split_prompt_sections(generated_text, neg_prompt):
	    """Split LLM output into (prompt, negative_prompt) using the '###' labels.

	    Each labelled section ends at the other label (if that label comes later)
	    or at the end of the text, so the two sections never bleed into each other
	    regardless of the order the LLM emitted them in. Without any label the whole
	    text is the prompt and the user's negative prompt is used unchanged.
	    """
	    prompt_marker = "### Prompt:"
	    neg_prompt_marker = "### Negative Prompt:"
	    user_neg = neg_prompt or ""

	    prompt_idx = generated_text.find(prompt_marker)
	    neg_idx = generated_text.find(neg_prompt_marker)
	    text_end = len(generated_text)

	    final_prompt = generated_text
	    final_neg_prompt = user_neg

	    if prompt_idx != -1:
	        start = prompt_idx + len(prompt_marker)
	        stop = neg_idx if neg_idx > prompt_idx else text_end
	        final_prompt = generated_text[start:stop]
	    elif neg_idx != -1:
	        # Only the negative label is present: everything before it is the
	        # prompt, so the negative terms do not leak into the positive prompt.
	        final_prompt = generated_text[:neg_idx]

	    if neg_idx != -1:
	        start = neg_idx + len(neg_prompt_marker)
	        stop = prompt_idx if prompt_idx > neg_idx else text_end
	        final_neg_prompt = generated_text[start:stop].strip()
	        # Prepend the user's negative prompt unless the LLM already included it;
	        # empty parts are skipped to avoid a dangling comma.
	        if user_neg and user_neg not in final_neg_prompt:
	            final_neg_prompt = ", ".join(
	                part for part in (user_neg, final_neg_prompt) if part
	            )

	    # Remove any label residue left by imperfect formatting.
	    final_prompt = final_prompt.replace(prompt_marker, "").strip()
	    final_neg_prompt = final_neg_prompt.replace(neg_prompt_marker, "").strip()
	    return final_prompt, final_neg_prompt

	# 步骤 2: Prompt-to-Image 模块
	def generate_image(prompt, neg_prompt, guidance, steps):
	    """Generate one image with the module-level Stable Diffusion pipeline.

	    Args:
	        prompt: Positive prompt passed to the pipeline.
	        neg_prompt: Negative prompt passed to the pipeline.
	        guidance: Guidance scale; converted with ``float()``.
	        steps: Number of inference steps; converted with ``int()``.

	    Returns:
	        An ``(image, text)`` tuple. On success ``text`` is the prompt that was
	        used. If the pipeline was never loaded, a grey 512x512 placeholder and
	        an error string are returned; if generation fails, a red 512x512
	        placeholder and the prompt are returned. Failures are not raised.
	    """
	    if not pipe:
	        # gr.Error only reaches the user when it is raised; merely constructing
	        # it is a no-op. gr.Warning shows a toast and lets us keep returning the
	        # placeholder the caller expects.
	        gr.Warning("Stable Diffusion 模型加载失败。无法生成图像。")
	        return Image.new('RGB', (512, 512), color='grey'), "错误：SD 模型未加载。"

	    print(f"正在使用提示词生成图像： {prompt}")
	    print(f"反向提示词: {neg_prompt}, 引导系数: {guidance}, 步数: {steps}")
	    start_time = time.time()
	    try:
	        # Runs on CPU per the module setup, so this is slow.
	        with torch.no_grad():  # inference only, no gradients needed
	            image = pipe(
	                prompt,
	                negative_prompt=neg_prompt,
	                guidance_scale=float(guidance),
	                num_inference_steps=int(steps),
	                num_images_per_prompt=1,
	            ).images[0]
	        print(f"图像生成完成，耗时 {time.time() - start_time:.2f} 秒。")
	        return image, prompt
	    except Exception as e:
	        print(f"图像生成过程中出错: {e}")
	        # Surface the failure to the user while still returning a placeholder.
	        gr.Warning(f"图像生成失败: {e}")
	        return Image.new('RGB', (512, 512), color='red'), prompt


	# 附加功能: 语音转文本函数
	def transcribe_audio(audio_filepath):
	    """Transcribe an audio file with the module-level Whisper pipeline.

	    Args:
	        audio_filepath: Path of the audio file to transcribe, or None.

	    Returns:
	        The transcribed text; an empty string when no audio was provided; or
	        an error message starting with "错误：" when the model is not loaded
	        or transcription fails. Failures are reported through the return
	        value (and a warning toast), never raised.
	    """
	    if not asr_pipeline:
	        gr.Warning("语音转文本模型未加载。")
	        return "错误：ASR模型未加载。"
	    if audio_filepath is None:
	        gr.Warning("未提供音频输入。")
	        return ""

	    print(f"正在转录音频文件： {audio_filepath}")
	    start_time = time.time()
	    try:
	        transcript = asr_pipeline(audio_filepath)
	        print(f"音频转录完成，耗时 {time.time() - start_time:.2f} 秒。")
	        print(f"转录结果： {transcript['text']}")
	        return transcript["text"]
	    except Exception as e:
	        print(f"音频转录过程中出错: {e}")
	        # gr.Error has no effect unless raised; gr.Warning shows a toast while
	        # keeping the string-return contract.
	        gr.Warning(f"音频转录失败: {e}")
	        # Carry the same "错误：" marker as the other failure path so callers
	        # that look for it do not mistake this message for a transcription.
	        return f"错误：音频转录过程中出错: {e}"

	# --- 组合工作流函数 ---
	def process_input(short_prompt_text, audio_input, style, neg_prompt, guidance, steps):
	    """Run the whole workflow triggered by the generate button.

	    Picks the input (a usable audio transcription wins over typed text),
	    expands it into a detailed prompt, then generates the image.

	    Args:
	        short_prompt_text: Text typed by the user; may be empty.
	        audio_input: Path of a recorded audio file, or None.
	        style: Style label forwarded to the prompt generator.
	        neg_prompt: Optional user negative prompt.
	        guidance: Guidance scale forwarded to image generation.
	        steps: Inference step count forwarded to image generation.

	    Returns:
	        A ``(status, prompt, image)`` tuple. ``prompt`` and ``image`` are None
	        when there is no usable input.
	    """
	    # Prefixes of the failure strings transcribe_audio returns instead of
	    # raising. A prefix check avoids rejecting real speech that merely
	    # contains the word "错误".
	    error_prefixes = ("错误：", "音频转录过程中出错")

	    # Whitespace-only text counts as "no text".
	    final_short_prompt = (short_prompt_text or "").strip()

	    # 1. Decide which input to use.
	    if audio_input is not None:
	        print(f"检测到音频输入： {audio_input}")
	        transcription = transcribe_audio(audio_input) or ""
	        transcription_failed = transcription.startswith(error_prefixes)

	        if not transcription_failed and transcription.strip():
	            final_short_prompt = transcription.strip()
	            status = f"使用转录文本： '{transcription[:50]}...'"
	            print(status)
	        else:
	            if transcription_failed:
	                status = "音频转录失败。如文本框有内容将使用文本输入。"
	                no_input_message = "音频转录失败且文本框为空。"
	            else:
	                # Transcription succeeded but was empty (e.g. silence).
	                status = "音频转录结果为空。如文本框有内容将使用文本输入。"
	                no_input_message = "音频转录结果为空且文本框为空。"
	            print(status)
	            if not final_short_prompt:
	                # gr.Error is inert unless raised; gr.Warning shows the toast
	                # and still lets the status box be updated.
	                gr.Warning(no_input_message)
	                return "错误：无有效输入。", None, None

	    if not final_short_prompt:
	        gr.Warning("请输入简短描述或使用语音输入。")
	        return "错误：输入为空。", None, None

	    # 2. Build the detailed prompt.
	    print("正在生成详细提示词...")
	    detailed_prompt, final_neg_prompt = generate_detailed_prompt(final_short_prompt, style, neg_prompt)
	    print(f"使用详细提示词： {detailed_prompt}")
	    print(f"使用反向提示词： {final_neg_prompt}")

	    # 3. Generate the image.
	    image, used_prompt = generate_image(detailed_prompt, final_neg_prompt, guidance, steps)
	    if used_prompt != detailed_prompt:
	        # generate_image echoes the prompt back on a normal run; a different
	        # string is its error message, so report that instead of "complete".
	        status = used_prompt
	    else:
	        status = "图像生成完成！"
	    print(status)
	    return status, used_prompt, image


	# --- Gradio 界面 ---
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# Stable Diffusion 提示词优化与图像生成器")
	    gr.Markdown(
	        "输入简短描述（或使用语音输入！），选择风格，获取详细提示词和生成的图像。\n"
	        "注意：图像生成在 CPU 上运行，将会很慢（可能需要几分钟）。\n"
	        "提示词优化功能需要在 Hugging Face Space Secrets 中设置 `OPENAI_API_KEY`。"
	    )

	    with gr.Row():
	        # Left column: inputs, the trigger button and the status line.
	        with gr.Column(scale=1):
	            inp_short_prompt = gr.Textbox(
	                placeholder="例如：天空中的魔法树屋",
	                label="简短描述",
	            )
	            # type="filepath" hands the handler a path on disk.
	            inp_audio = gr.Audio(
	                label="或者录制语音输入（可选）",
	                type="filepath",
	                sources=["microphone"],
	            )
	            inp_style = gr.Dropdown(
	                choices=["照片写实", "电影感", "奇幻艺术", "动漫", "水彩", "像素艺术", "赛博朋克"],
	                value="电影感",
	                label="图像风格",
	            )
	            inp_neg_prompt = gr.Textbox(
	                placeholder="例如：文字, 词语, 模糊, 变形",
	                label="反向提示词（可选）",
	            )
	            inp_guidance = gr.Slider(
	                label="引导系数 (CFG)",
	                value=7.5,
	                minimum=1,
	                maximum=15,
	                step=0.5,
	            )
	            inp_steps = gr.Slider(
	                label="推理步数（越低越快，细节可能减少）",
	                value=20,
	                minimum=10,
	                maximum=50,
	                step=1,
	            )

	            generate_button = gr.Button("生成图像", variant="primary")

	            out_status = gr.Textbox(value="准备就绪", label="状态", interactive=False)

	        # Right column: the generated prompt and image.
	        with gr.Column(scale=1):
	            out_detailed_prompt = gr.Textbox(lines=5, interactive=False, label="生成的详细提示词")
	            out_image = gr.Image(type="pil", label="生成的图像")

	    # Wire the button to the workflow; list order must match the parameter
	    # order and the return-tuple order of process_input.
	    click_inputs = [
	        inp_short_prompt,
	        inp_audio,
	        inp_style,
	        inp_neg_prompt,
	        inp_guidance,
	        inp_steps,
	    ]
	    click_outputs = [out_status, out_detailed_prompt, out_image]
	    generate_button.click(fn=process_input, inputs=click_inputs, outputs=click_outputs)

	# --- 启动应用 ---
	if __name__ == "__main__":
	    # Report any component that failed to initialise before starting the app.
	    startup_checks = (
	        (pipe, "\n严重错误：Stable Diffusion 模型加载失败。Gradio 应用可能无法正常工作。"),
	        (asr_pipeline, "\n警告：语音转文本模型加载失败。语音输入将无法工作。"),
	        (openai_client, "\n警告：OpenAI客户端未初始化（检查API密钥）。提示词优化将是基础模式。"),
	    )
	    for component, notice in startup_checks:
	        if not component:
	            print(notice)

	    # Enable the request queue (multiple users / long-running jobs), then serve.
	    demo.queue()
	    demo.launch(debug=False)  # use debug=True for more verbose local logs