import json
import os

import gradio as gr
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer, util

# --- Retrieval setup: embed all FAQ questions once at startup ---
print("Loading embedding model...")
model = SentenceTransformer("BAAI/bge-small-zh-v1.5")

print("Loading FAQs...")
# faqs.json is expected to be a list of objects with "q", "a", and "category" keys.
with open("faqs.json", "r", encoding="utf-8") as f:
    faqs = json.load(f)

print(f"Encoding {len(faqs)} FAQ questions...")
questions = [item["q"] for item in faqs]
# Normalized embeddings make cosine similarity a plain dot product.
faq_embeddings = model.encode(questions, normalize_embeddings=True)
print("Ready!")

# Minimum cosine similarity for a retrieval hit; below this we refuse to answer.
THRESHOLD = 0.55

# --- LLM ---
SYSTEM_PROMPT = """你是一个友好、简洁的 AI 学习答疑助手。
规则:
1. 严格基于"参考资料"回答,不要编造
2. 资料里没有的内容,直接说"我暂时没这方面的资料"
3. 用自然、口语化的中文,避免生硬复读资料原文
4. 控制在 3 句话以内"""

USER_PROMPT_TEMPLATE = """【参考资料】
{context}

【用户问题】
{question}

请基于资料用自然语言回答。"""

client = InferenceClient(
    model="Qwen/Qwen2.5-72B-Instruct",
    token=os.environ.get("HF_TOKEN"),
    timeout=20,
)


def llm_answer(question, top_faqs):
    """Generate an answer grounded in the retrieved FAQs, falling back to the
    best match verbatim when no token is configured or the API call fails."""
    if not os.environ.get("HF_TOKEN"):
        return top_faqs[0]["a"] + "\n\n_(需要在 Space Secrets 设置 HF_TOKEN 以启用 LLM)_"
    context = "\n\n".join(f"Q: {f['q']}\nA: {f['a']}" for f in top_faqs)
    user_prompt = USER_PROMPT_TEMPLATE.format(context=context, question=question)
    try:
        resp = client.chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=200,
            temperature=0.3,
        )
        return resp.choices[0].message.content
    except Exception as e:
        return top_faqs[0]["a"] + f"\n\n_(LLM 暂不可用:{e})_"


# --- Gradio ---
def chat(query):
    if not query or not query.strip():
        return "请输入您的问题", "", ""
    q_emb = model.encode(query, normalize_embeddings=True)
    scores = util.cos_sim(q_emb, faq_embeddings)[0]
    top_idx = scores.argsort(descending=True)[:3].tolist()
    top1 = top_idx[0]
    top1_score = float(scores[top1])
    if top1_score < THRESHOLD:
        reply = "抱歉,我暂时无法理解您的问题。建议换个说法,或查看下方相关问题。"
    else:
        # Pass the top-3 matches as context so the LLM can synthesize across them.
        top3_faqs = [faqs[i] for i in top_idx]
        reply = llm_answer(query, top3_faqs)
    info = (
        f"**类别**: {faqs[top1]['category']} | "
        f"**匹配度**: {top1_score:.2f} | "
        f"**匹配的问题**: {faqs[top1]['q']}"
    )
    related = "### 您可能也想问:\n"
    for i in top_idx[1:]:
        related += f"- {faqs[i]['q']} _(相似度 {float(scores[i]):.2f})_\n"
    return reply, info, related


examples = [
    "embedding 是什么意思?",
    "中文应该用哪个向量模型?",
    "BERT 和 GPT 有什么不一样?",
    "pipeline 是干什么用的?",
    "AI 怎么知道两句话意思一样?",
    "怎么把模型跑到 GPU 上?",
    "为什么 LLM 会胡说八道?",
    "今天天气怎么样?",
]

iface = gr.Interface(
    fn=chat,
    inputs=gr.Textbox(label="您的问题", placeholder="例如:embedding 是什么?", lines=2),
    outputs=[
        gr.Markdown(label="答案"),
        gr.Markdown(label="检索详情"),
        gr.Markdown(label="相关问题"),
    ],
    title="🤖 AI 学习 FAQ 机器人(RAG)",
    # Description updated to match the model actually configured in InferenceClient
    # above (Qwen2.5-72B-Instruct, not 7B).
    description="基于 BAAI/bge-small-zh-v1.5 检索 + Qwen2.5-72B-Instruct 生成 · 30 条 AI 学习 FAQ",
    examples=examples,
    flagging_mode="never",
    theme="soft",
)

if __name__ == "__main__":
    iface.launch()