# rag_engine.py
import os
from typing import List, Dict, Tuple

from syllabus_utils import (
    parse_syllabus_docx,
    parse_syllabus_pdf,
    parse_pptx_slides,
)
from clare_core import (
    get_embedding,
    cosine_similarity,
)
from langsmith import traceable
# NOTE(review): `set_run_metadata` is not a documented export of
# langsmith.run_helpers in all releases — confirm against the pinned
# langsmith version; an ImportError here would crash at module load.
from langsmith.run_helpers import set_run_metadata


def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
    """Build a session-level list of RAG chunks from one file.

    Accepts either input form:
      - an uploaded-file object exposing a ``.name`` path attribute, or
      - a plain string path (used to preload Module10).

    Each chunk dict has the shape::

        {
            "text": str,
            "embedding": List[float],
            "source_file": "module10_responsible_ai.pdf",
            "section": "Literature Review / Paper – chunk 3",
        }

    Returns:
        The list of chunk dicts; an empty list when the path cannot be
        resolved, the extension is unsupported, or parsing/embedding
        fails (best-effort — errors are printed, never raised).
    """
    # 1) Resolve a concrete file path from either input form.
    if isinstance(file, str):
        file_path = file
    else:
        file_path = getattr(file, "name", None)

    if not file_path:
        return []

    ext = os.path.splitext(file_path)[1].lower()
    basename = os.path.basename(file_path)

    try:
        # 2) Parse the file into a list of text blocks by extension.
        if ext == ".docx":
            texts = parse_syllabus_docx(file_path)
        elif ext == ".pdf":
            texts = parse_syllabus_pdf(file_path)
        elif ext == ".pptx":
            texts = parse_pptx_slides(file_path)
        else:
            print(f"[RAG] unsupported file type for RAG: {ext}")
            return []

        # 3) Embed each non-empty text block and attach metadata.
        chunks: List[Dict] = []
        for idx, t in enumerate(texts):
            text = (t or "").strip()
            if not text:
                continue

            emb = get_embedding(text)
            if emb is None:
                # Embedding failed for this block — skip it, keep the rest.
                continue

            section_label = f"{doc_type_val} – chunk {idx + 1}"
            chunks.append(
                {
                    "text": text,
                    "embedding": emb,
                    "source_file": basename,
                    "section": section_label,
                }
            )

        print(
            f"[RAG] built {len(chunks)} chunks from file ({ext}, doc_type={doc_type_val}, path={basename})"
        )
        return chunks

    except Exception as e:
        # Best-effort boundary: one corrupt file must not take the session down.
        print(f"[RAG] error while building chunks: {repr(e)}")
        return []


@traceable(run_type="retriever", name="retrieve_relevant_chunks")
def retrieve_relevant_chunks(
    question: str,
    rag_chunks: List[Dict],
    top_k: int = 3,
) -> Tuple[str, List[Dict]]:
    """Retrieve the ``top_k`` chunks most relevant to ``question``.

    Embeds the question and scores every chunk in ``rag_chunks`` by
    cosine similarity against its stored embedding.

    Returns:
        - context_text: the top chunk texts joined with ``\\n---\\n``
          (fed to the LLM)
        - used_chunks: the chunk dicts actually used this round
          (for references & logging)

    Both are empty when there are no chunks, the question cannot be
    embedded, or no chunk carries both text and an embedding.
    """
    if not rag_chunks:
        return "", []

    q_emb = get_embedding(question)
    if q_emb is None:
        return "", []

    scored = []
    for item in rag_chunks:
        emb = item.get("embedding")
        text = item.get("text", "")
        # Explicit None/empty check instead of `not emb`: identical for
        # List[float], but safe if get_embedding ever returns a numpy
        # array (whose truth value is ambiguous and would raise).
        if emb is None or len(emb) == 0 or not text:
            continue
        sim = cosine_similarity(q_emb, emb)
        scored.append((sim, item))

    if not scored:
        return "", []

    # Sort on the score alone — equal scores must never fall through to
    # comparing the chunk dicts (which would raise TypeError).
    scored.sort(key=lambda x: x[0], reverse=True)
    top_items = scored[:top_k]

    # Concatenated context for the LLM.
    context_text = "\n---\n".join(it["text"] for _sim, it in top_items)

    # Detailed chunks for references & logging.
    used_chunks = [it for _sim, it in top_items]

    # Optional LangSmith metadata; failures here must not break retrieval.
    try:
        previews = [
            {
                "score": float(sim),
                "text_preview": it["text"][:200],
                "source_file": it.get("source_file"),
                "section": it.get("section"),
            }
            for sim, it in top_items
        ]
        set_run_metadata(
            question=question,
            retrieved_chunks=previews,
        )
    except Exception as e:
        print(f"[LangSmith metadata error in retrieve_relevant_chunks] {repr(e)}")

    return context_text, used_chunks