# app.py

import gradio as gr
import os
import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from transformers import pipeline
from openai import OpenAI
import time
from PIL import Image

# --- Configuration ---

# Read the API key from Hugging Face Space Secrets / the process environment.
# IMPORTANT: the secret must be named 'OPENAI_API_KEY'.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# The client stays None unless a key is present AND construction succeeds;
# code further down checks `openai_client` for truthiness before using it.
openai_client = None

if OPENAI_API_KEY:
    try:
        openai_client = OpenAI(api_key=OPENAI_API_KEY)
    except Exception as init_error:
        print(f"初始化OpenAI客户端时出错: {init_error}")
        openai_client = None  # make sure a failed init leaves no client behind
else:
    # A missing key is deliberately non-fatal: the app keeps running and
    # only reports the problem on the console.
    print("警告：未在环境变量/Secrets中找到OpenAI API密钥。")
    print("因缺少API密钥，OpenAI客户端未初始化。")


# --- Model loading ---
# Both models are loaded once at module import, outside the request
# handlers, so they are not reloaded on every request.
# Space start-up may take a while because of this.

# 1. Stable Diffusion (v1.5, chosen to keep CPU resource usage down).
#    float32 is used for CPU compatibility; generation on CPU is slow.
print("正在加载 Stable Diffusion 模型...")
start_time = time.time()
try:
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float32,
    )
    # Swap in the DPM-Solver multistep scheduler, built from the config of
    # the scheduler the pipeline shipped with (intended to help on CPU).
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(
        pipe.scheduler.config
    )
    # If RAM becomes a problem on the free tier, pipe.enable_attention_slicing()
    # is a possible further optimisation (not enabled here).
    print(f"Stable Diffusion 模型加载完成，耗时 {time.time() - start_time:.2f} 秒。")
except Exception as sd_load_error:
    print(f"加载 Stable Diffusion 模型时出错: {sd_load_error}")
    pipe = None  # a failed load is signalled by pipe being None


# 2. Speech-to-text (Whisper "base", picked as a CPU-friendly size).
print("正在加载语音转文本模型...")
start_time = time.time()
try:
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-base",
        device=-1,  # -1 forces CPU
    )
    print(f"语音转文本模型加载完成，耗时 {time.time() - start_time:.2f} 秒。")
except Exception as asr_load_error:
    print(f"加载语音转文本模型时出错: {asr_load_error}")
    asr_pipeline = None  # a failed load is signalled by asr_pipeline being None


# --- Core functions ---

def _split_llm_prompt_sections(generated_text):
    """Split raw LLM output into its prompt and negative-prompt sections.

    The LLM is asked to label its answer with '### Prompt:' and
    '### Negative Prompt:'. Either label may be missing, and the two
    sections may appear in either order.

    Args:
        generated_text: The raw text returned by the LLM.

    Returns:
        A ``(prompt_text, negative_text)`` tuple. ``negative_text`` is
        ``None`` when the negative label is absent, so the caller can tell
        "no negative section" apart from "empty negative section".
    """
    prompt_marker = "### Prompt:"
    neg_prompt_marker = "### Negative Prompt:"
    prompt_idx = generated_text.find(prompt_marker)
    neg_idx = generated_text.find(neg_prompt_marker)
    text_end = len(generated_text)

    if prompt_idx != -1:
        # The prompt runs up to the negative label only if that label
        # comes AFTER the prompt label; otherwise it runs to the end.
        section_end = neg_idx if neg_idx > prompt_idx else text_end
        prompt_text = generated_text[prompt_idx + len(prompt_marker):section_end]
    elif neg_idx != -1:
        # Unlabelled prompt followed by a labelled negative section: keep
        # the negative keywords out of the positive prompt.
        prompt_text = generated_text[:neg_idx]
    else:
        prompt_text = generated_text

    negative_text = None
    if neg_idx != -1:
        section_end = prompt_idx if prompt_idx > neg_idx else text_end
        negative_text = generated_text[
            neg_idx + len(neg_prompt_marker):section_end
        ].strip()

    return prompt_text.strip(), negative_text


# Step 1: prompt-to-prompt module
def generate_detailed_prompt(short_prompt, style_preference="电影感", neg_prompt=""):
    """Expand a short idea into a detailed Stable Diffusion prompt via an LLM.

    When the OpenAI client is unavailable, or the API call fails, a basic
    prompt built from ``short_prompt`` and ``style_preference`` is returned
    instead and a Gradio warning is emitted.

    Args:
        short_prompt: The user's short description of the desired image.
        style_preference: Style label to fold into the prompt.
        neg_prompt: Optional user-supplied negative prompt.

    Returns:
        A ``(prompt, negative_prompt)`` tuple of non-empty strings.
    """
    default_negative = "丑陋, 模糊, 低质量"
    user_negative = (neg_prompt or "").strip()

    if not openai_client:
        print("OpenAI 客户端不可用。")
        gr.Warning("OpenAI API Key 未配置。将使用基础提示词。")
        # Fallback without an LLM: short prompt plus style only.
        return f"{short_prompt}，风格：{style_preference}", user_negative or default_negative

    # The instructions to the LLM stay in English so the model follows them.
    system_message = "You are an expert prompt generator for Stable Diffusion."
    user_message = (
        f"Based on the user's simple idea: '{short_prompt}', "
        f"generate a detailed and structured prompt for Stable Diffusion v1.5. "
        f"Incorporate the desired style: '{style_preference}'. "
        f"Include details about the subject, setting, lighting, composition, and quality keywords (like 'photorealistic', 'highly detailed', '4k', 'masterpiece'). "
        f"Also, suggest a relevant negative prompt focusing on common image issues (like 'ugly, deformed, blurry, low quality, extra limbs, text, words'). "
        f"If the user provided a negative prompt ('{user_negative}'), incorporate its essence or add to it. "
        f"Format the output clearly, separating the main prompt and the negative prompt, perhaps using '### Prompt:' and '### Negative Prompt:' labels."
    )

    # Only the network call sits inside the try; parsing below is plain
    # string handling.
    try:
        print(f"向OpenAI发送请求以生成提示词： {short_prompt}")
        response = openai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message},
            ],
            max_tokens=150,  # cap token usage
            temperature=0.7,
        )
        # message.content is optional in the API response; treat None as "".
        generated_text = (response.choices[0].message.content or "").strip()
        print(f"收到来自OpenAI的响应： {generated_text}")
    except Exception as e:
        print(f"调用OpenAI API时出错: {e}")
        gr.Warning(f"OpenAI API 错误: {e}。将使用基础提示词。")
        # Fallback on API failure.
        return (
            f"{short_prompt}，风格：{style_preference}，高度详细",
            user_negative or default_negative,
        )

    final_prompt, llm_negative = _split_llm_prompt_sections(generated_text)

    if llm_negative is None:
        # The LLM produced no negative section: keep the user's own.
        final_neg_prompt = user_negative
    elif user_negative and user_negative not in llm_negative:
        # Merge, skipping empty parts so no dangling ", " is produced.
        final_neg_prompt = ", ".join(
            part for part in (user_negative, llm_negative) if part
        )
    else:
        final_neg_prompt = llm_negative

    # Remove any label left behind by imperfect parsing (e.g. a repeated label).
    for marker in ("### Negative Prompt:", "### Prompt:"):
        final_prompt = final_prompt.replace(marker, "")
        final_neg_prompt = final_neg_prompt.replace(marker, "")
    final_prompt = final_prompt.strip()
    final_neg_prompt = final_neg_prompt.strip()

    # Never return empty strings.
    if not final_prompt:
        final_prompt = f"{short_prompt}，风格：{style_preference}，高度详细"
    if not final_neg_prompt:
        final_neg_prompt = default_negative

    return final_prompt, final_neg_prompt

# Step 2: prompt-to-image module
def generate_image(prompt, neg_prompt, guidance, steps):
    """Generate one image with the module-level Stable Diffusion pipeline.

    Failures are reported through a Gradio warning plus a placeholder image
    instead of an exception, so the caller always receives a displayable
    result.

    Args:
        prompt: Positive prompt passed to the pipeline.
        neg_prompt: Negative prompt passed to the pipeline.
        guidance: Guidance scale (CFG); converted with ``float()``.
        steps: Number of inference steps; converted with ``int()``.

    Returns:
        A ``(image, text)`` tuple. On success ``text`` is the prompt used.
        If the pipeline was never loaded, ``image`` is a grey 512x512
        placeholder and ``text`` is an error message. If generation raises,
        ``image`` is a red 512x512 placeholder and ``text`` is the prompt.
    """
    if not pipe:
        # gr.Error is an exception class and only reaches the UI when
        # raised; gr.Warning shows the message while still letting us
        # return a placeholder.
        gr.Warning("Stable Diffusion 模型加载失败。无法生成图像。")
        return Image.new('RGB', (512, 512), color='grey'), "错误：SD 模型未加载。"

    print(f"正在使用提示词生成图像： {prompt}")
    print(f"反向提示词: {neg_prompt}, 引导系数: {guidance}, 步数: {steps}")
    start_time = time.time()
    try:
        # Runs on CPU, so this is slow. Gradients are not needed here.
        with torch.no_grad():
            image = pipe(
                prompt,
                negative_prompt=neg_prompt,
                guidance_scale=float(guidance),
                num_inference_steps=int(steps),
                num_images_per_prompt=1,
            ).images[0]
        print(f"图像生成完成，耗时 {time.time() - start_time:.2f} 秒。")
        return image, prompt
    except Exception as e:
        print(f"图像生成过程中出错: {e}")
        gr.Warning(f"图像生成失败: {e}")
        # Red placeholder marks a generation failure.
        return Image.new('RGB', (512, 512), color='red'), prompt


# Extra feature: speech-to-text
def transcribe_audio(audio_filepath):
    """Transcribe an audio file with the module-level Whisper pipeline.

    Errors are reported in-band: every failure message returned by this
    function starts with "错误：", which is how the caller in this file
    recognises a failed transcription.

    Args:
        audio_filepath: Path of the audio file to transcribe, or ``None``.

    Returns:
        The transcribed text with surrounding whitespace removed, ``""``
        when no audio was supplied, or a string starting with "错误：" when
        the model is unavailable or transcription raised.
    """
    if not asr_pipeline:
        gr.Warning("语音转文本模型未加载。")
        return "错误：ASR模型未加载。"
    if audio_filepath is None:
        gr.Warning("未提供音频输入。")
        return ""

    print(f"正在转录音频文件： {audio_filepath}")
    start_time = time.time()
    try:
        transcript = asr_pipeline(audio_filepath)
        # NOTE(review): Whisper output often carries a leading space;
        # stripping keeps the prompt clean either way.
        text = transcript["text"].strip()
        print(f"音频转录完成，耗时 {time.time() - start_time:.2f} 秒。")
        print(f"转录结果： {text}")
        return text
    except Exception as e:
        print(f"音频转录过程中出错: {e}")
        # gr.Error only reaches the UI when raised; gr.Warning displays the
        # message while still allowing an in-band error return.
        gr.Warning(f"音频转录失败: {e}")
        # The "错误：" prefix is required: without it the caller's error
        # check does not match and the message is used as an image prompt.
        return f"错误：音频转录过程中出错: {e}"

# --- Combined workflow function ---
def process_input(short_prompt_text, audio_input, style, neg_prompt, guidance, steps):
    """Run the full pipeline behind the "generate" button.

    Steps: pick the short prompt (a usable audio transcription takes
    priority over the text box), expand it into a detailed prompt, then
    generate the image.

    Args:
        short_prompt_text: Content of the short-description text box.
        audio_input: Path of the recorded audio, or ``None``.
        style: Selected image style.
        neg_prompt: Optional user negative prompt.
        guidance: Guidance scale (CFG).
        steps: Number of inference steps.

    Returns:
        A ``(status, prompt, image)`` tuple. ``prompt`` and ``image`` are
        ``None`` when no usable input was provided.
    """
    # Whitespace-only text counts as "no text".
    final_short_prompt = (short_prompt_text or "").strip()

    # 1. Choose the input: prefer audio when it yields a usable transcription.
    if audio_input is not None:
        print(f"检测到音频输入： {audio_input}")
        transcription = (transcribe_audio(audio_input) or "").strip()

        # transcribe_audio reports failures in-band. Matching on the known
        # message prefixes (instead of "错误" anywhere in the text) catches
        # the "…出错" failure message and does not misfire on real speech
        # that merely contains the word "错误".
        error_prefixes = ("错误：", "音频转录过程中出错")
        transcription_failed = transcription.startswith(error_prefixes)

        if transcription and not transcription_failed:
            final_short_prompt = transcription
            # Only add an ellipsis when the preview was actually cut.
            preview = transcription[:50] + ("..." if len(transcription) > 50 else "")
            print(f"使用转录文本： '{preview}'")
        else:
            if transcription_failed:
                status = "音频转录失败。如文本框有内容将使用文本输入。"
                no_input_message = "音频转录失败且文本框为空。"
            else:
                # Transcription succeeded but was empty (e.g. silence).
                status = "音频转录结果为空。如文本框有内容将使用文本输入。"
                no_input_message = "音频转录结果为空且文本框为空。"
            print(status)
            if not final_short_prompt:
                # gr.Error only reaches the UI when raised; gr.Warning shows
                # the message and still lets the status box be updated.
                gr.Warning(no_input_message)
                return "错误：无有效输入。", None, None

    if not final_short_prompt:
        gr.Warning("请输入简短描述或使用语音输入。")
        return "错误：输入为空。", None, None

    # 2. Build the detailed prompt.
    print("正在生成详细提示词...")
    detailed_prompt, final_neg_prompt = generate_detailed_prompt(
        final_short_prompt, style, neg_prompt
    )
    print(f"使用详细提示词： {detailed_prompt}")
    print(f"使用反向提示词： {final_neg_prompt}")

    # 3. Generate the image.
    image, used_prompt = generate_image(detailed_prompt, final_neg_prompt, guidance, steps)
    if used_prompt != detailed_prompt:
        # generate_image returns an error message in place of the prompt
        # when the SD model is not loaded; report that as the status and
        # keep the real prompt in the prompt box.
        status = used_prompt
        print(status)
        return status, detailed_prompt, image

    status = "图像生成完成！"
    print(status)
    return status, used_prompt, image


# --- Gradio interface ---
# Builds the UI: inputs and the status box in the left column, the generated
# prompt and image in the right column, and one button wired to process_input.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Stable Diffusion 提示词优化与图像生成器")
    gr.Markdown(
        "输入简短描述（或使用语音输入！），选择风格，获取详细提示词和生成的图像。\n"
        "*注意：图像生成在 CPU 上运行，将会**很慢**（可能需要几分钟）。*\n"
        "*提示词优化功能需要在 Hugging Face Space Secrets 中设置 `OPENAI_API_KEY`。*"
        )

    # An Accordion could organise this better; plain Row/Column is used here.
    with gr.Row():
        with gr.Column(scale=1):
            # Input controls
            inp_short_prompt = gr.Textbox(label="简短描述", placeholder="例如：天空中的魔法树屋")
            inp_audio = gr.Audio(sources=["microphone"], type="filepath", label="或者录制语音输入（可选）") # type='filepath' is usually the better fit for the ASR pipeline
            inp_style = gr.Dropdown(
                label="图像风格",
                choices=["照片写实", "电影感", "奇幻艺术", "动漫", "水彩", "像素艺术", "赛博朋克"],
                value="电影感" # default value
            ) # control type 1: Dropdown
            inp_neg_prompt = gr.Textbox(label="反向提示词（可选）", placeholder="例如：文字, 词语, 模糊, 变形") # control type 2: Textbox
            inp_guidance = gr.Slider(minimum=1, maximum=15, step=0.5, value=7.5, label="引导系数 (CFG)") # control type 3: Slider
            inp_steps = gr.Slider(minimum=10, maximum=50, step=1, value=20, label="推理步数（越低越快，细节可能减少）") # control type 4: another Slider

            generate_button = gr.Button("生成图像", variant="primary") # control type 5: Button

            # Status display (read-only)
            out_status = gr.Textbox(label="状态", value="准备就绪", interactive=False)

        with gr.Column(scale=1):
            # Output controls
            out_detailed_prompt = gr.Textbox(label="生成的详细提示词", interactive=False, lines=5) # control type 6: Textbox (output)
            out_image = gr.Image(label="生成的图像", type="pil") # control type 7: Image output


    # Wire the button: the inputs list follows process_input's parameter
    # order, and the outputs list follows its (status, prompt, image) return.
    generate_button.click(
        fn=process_input,
        inputs=[
            inp_short_prompt,
            inp_audio,
            inp_style,
            inp_neg_prompt,
            inp_guidance,
            inp_steps
        ],
        outputs=[
            out_status,
            out_detailed_prompt,
            out_image
        ]
    )

# --- Launch the app ---
if __name__ == "__main__":
    # Report every component that failed to initialise before launching.
    # The app is started regardless; these are console notices only.
    startup_checks = (
        (pipe, "\n严重错误：Stable Diffusion 模型加载失败。Gradio 应用可能无法正常工作。"),
        (asr_pipeline, "\n警告：语音转文本模型加载失败。语音输入将无法工作。"),
        (openai_client, "\n警告：OpenAI客户端未初始化（检查API密钥）。提示词优化将是基础模式。"),
    )
    for component, notice in startup_checks:
        if not component:
            print(notice)

    # Enable the queue so multiple users / long-running jobs are handled.
    demo.queue()
    # Set debug=True for more verbose logs when running locally.
    demo.launch(debug=False)