import re
import random
import fitz
import string
import numpy as np
import os
from typing import List, Optional, Tuple, Dict, Any
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline
from uuid import uuid4
import pymupdf4llm
from typing_extensions import override

try:
    from qdrant_client import QdrantClient
    from qdrant_client.http.models import (
        PointStruct,
        Filter,
        FieldCondition,
        MatchValue,
        Distance,
        VectorParams,
    )
    from qdrant_client.http import models as rest
    _HAS_QDRANT = True
except Exception:
    _HAS_QDRANT = False

try:
    import faiss
    _HAS_FAISS = True
except Exception:
    _HAS_FAISS = False

from utils import generate_mcqs_from_text, structure_context_for_llm, new_generate_mcqs_from_text

from huggingface_hub import login

# Authenticate with the Hugging Face Hub; skip quietly if no token is configured.
hf_token = os.environ.get("HF_MODEL_TOKEN")
if hf_token:
    login(token=hf_token)

class RAGMCQ:
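    """
    Retrieval-augmented MCQ generator for PDF sources.

    Pages are extracted as markdown via pymupdf4llm, chunked, and embedded
    with a multilingual SentenceTransformer; retrieval runs against a local
    FAISS/numpy index or a remote Qdrant collection. Retrieved context grounds
    the LLM question generator, and generated items can be validated with a
    cross-encoder plus an extractive QA model.

    Sketch of typical usage (the file path is illustrative):

        rag = RAGMCQ()
        mcqs = rag.generate_from_pdf("lecture.pdf", n_questions=5, mode="rag")
        report = rag.validate_mcqs(mcqs)
    """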
    def __init__(
        self,
        embedder_model: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        generation_model: str = "openai/gpt-oss-120b",
        qdrant_url: str = os.environ.get('QDRANT_URL') or "",
        qdrant_api_key: str = os.environ.get('QDRANT_API_KEY') or "",
        qdrant_prefer_grpc: bool = False,
    ):
        self.embedder = SentenceTransformer(embedder_model)
        self.generation_model = generation_model
        self.qa_pipeline = pipeline("question-answering", model="nguyenvulebinh/vi-mrc-base", tokenizer="nguyenvulebinh/vi-mrc-base")
        self.cross_entail = CrossEncoder("itdainb/PhoRanker")
        self.embeddings = None   # np.array of shape (N, D)
        self.texts = []          # list of chunk texts
        self.metadata = []       # list of dicts (page, chunk_id, char_range)
        self.index = None
        self.dim = self.embedder.get_sentence_embedding_dimension()

        self.qdrant = None
        self.qdrant_url = qdrant_url
        self.qdrant_api_key = qdrant_api_key
        self.qdrant_prefer_grpc = qdrant_prefer_grpc

        if qdrant_url:
            self.connect_qdrant(qdrant_url, qdrant_api_key, qdrant_prefer_grpc)

    def extract_pages(
            self,
            pdf_path: str,
            *,
            pages: Optional[List[int]] = None,
            ignore_images: bool = False,
            dpi: int = 150
        ) -> List[str]:
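            """
            Extract each page of the PDF as a markdown string.

            Uses pymupdf4llm.to_markdown with page_chunks=True, which returns
            one dict per page whose "text" key holds that page's markdown.
            Returns the stripped markdown of every requested page, in order.
            """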
            doc = fitz.open(pdf_path)
            try:
                # request page-wise output (page_chunks=True -> list[dict] per page)
                page_dicts = pymupdf4llm.to_markdown(
                    doc,
                    pages=pages,
                    ignore_images=ignore_images,
                    dpi=dpi,
                    page_chunks=True,
                )

                # to_markdown(..., page_chunks=True) returns a list of dicts, each has key "text" (markdown)
                pages_md: List[str] = []
                for p in page_dicts:
                    txt = p.get("text", "") or ""
                    pages_md.append(txt.strip())

                return pages_md
            finally:
                doc.close()

    def chunk_text(self, text: str, max_chars: int = 1200, overlap: int = 100) -> List[str]:
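        """
        Split text into chunks of at most max_chars characters, preferring
        sentence boundaries and carrying an `overlap`-character tail between
        consecutive chunks; over-long pieces are hard-split as a fallback.

        Example (illustrative): with max_chars=1200 and overlap=100, a
        ~3000-character page yields roughly three chunks, each sharing about
        100 characters of context with its predecessor.
        """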
        text = text.strip()
        if not text:
            return []

        if len(text) <= max_chars:
            return [text]

        # split by sentence-like boundaries
        sentences = re.split(r'(?<=[\.\?\!])\s+', text)
        chunks = []
        cur = ""

        for s in sentences:
            if len(cur) + len(s) + 1 <= max_chars:
                cur += (" " if cur else "") + s
            else:
                if cur:
                    chunks.append(cur)

                # carry a small tail of the previous chunk for context; strip the
                # leading space left behind when the previous chunk was empty
                cur = (cur[-overlap:] + " " + s).strip() if overlap > 0 else s

        if cur:
            chunks.append(cur)

        # if still too long, hard-split
        final = []
        for c in chunks:
            if len(c) <= max_chars:
                final.append(c)
            else:
                for i in range(0, len(c), max_chars):
                    final.append(c[i:i+max_chars])

        return final

    def build_index_from_pdf(self, pdf_path: str, max_chars: int = 1200):
        pages = self.extract_pages(pdf_path)

        self.texts = []
        self.metadata = []

        for p_idx, page_text in enumerate(pages, start=1):
            chunks = self.chunk_text(page_text or "", max_chars=max_chars)
            for cid, ch in enumerate(chunks, start=1):
                self.texts.append(ch)
                self.metadata.append({"page": p_idx, "chunk_id": cid, "length": len(ch)})

        if not self.texts:
            raise RuntimeError("No text extracted from PDF.")

        # save_to_local('test/text_chunks.md', content=self.texts)

        # compute embeddings
        emb = self.embedder.encode(self.texts, convert_to_numpy=True, show_progress_bar=True)
        self.embeddings = emb.astype("float32")
        self._build_faiss_index()

    def _build_faiss_index(self, ef_construction=200, M=32):
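        """
        Build an HNSW index (M neighbors per node, efConstruction build-time
        beam width) over L2-normalized embeddings so that inner product equals
        cosine similarity; falls back to normalized brute-force numpy search
        when faiss is unavailable.
        """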
        if _HAS_FAISS:
            d = self.embeddings.shape[1]
            index = faiss.IndexHNSWFlat(d, M)
            # efConstruction must be set before add(); it only affects graph construction
            index.hnsw.efConstruction = ef_construction
            faiss.normalize_L2(self.embeddings)
            index.add(self.embeddings)
            self.index = index
        else:
            # store normalized embeddings and use brute-force numpy
            norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True) + 1e-10
            self.embeddings = self.embeddings / norms
            self.index = None

    def _retrieve(self, query: str, top_k: int = 3) -> List[Tuple[int, float]]:
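        """
        Return (chunk_index, cosine_similarity) pairs for the top_k chunks
        closest to `query`, using the FAISS index when present or a
        brute-force numpy dot product otherwise.
        """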
        q_emb = self.embedder.encode([query], convert_to_numpy=True).astype("float32")

        if _HAS_FAISS:
            faiss.normalize_L2(q_emb)
            D_list, I_list = self.index.search(q_emb, top_k)
            # D are inner products; return list of (idx, score)
            return [(int(i), float(d)) for i, d in zip(I_list[0], D_list[0]) if i != -1]
        else:
            qn = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-10)
            sims = (self.embeddings @ qn.T).squeeze(axis=1)
            idxs = np.argsort(-sims)[:top_k]
            return [(int(i), float(sims[i])) for i in idxs]

    def generate_from_pdf(
        self,
        pdf_path: str,
        n_questions: int = 10,
        mode: str = "rag", # per_page or rag
        questions_per_page: int = 3, # for per_page mode
        top_k: int = 3, # chunks to retrieve for each question in rag mode
        temperature: float = 0.2,
        enable_fiddler: bool = False,
    ) -> Dict[str, Any]:
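        """
        Generate up to n_questions MCQs from a PDF.

        mode="per_page": walk the chunks in order and request
        questions_per_page questions from each. mode="rag": sample a seed
        sentence from a random chunk, retrieve the top_k related chunks, and
        generate one question per retrieval so questions stay diverse.
        Returns a dict keyed by "1", "2", ... with the generator's payloads.
        """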
        # build index
        self.build_index_from_pdf(pdf_path)

        output: Dict[str, Any] = {}
        qcount = 0

        if mode == "per_page":
            # iterate pages -> chunks
            for idx, meta in enumerate(self.metadata):
                chunk_text = self.texts[idx]

                if not chunk_text.strip():
                    continue

                # ask generator
                try:
                    structured_context = structure_context_for_llm(chunk_text, model=self.generation_model, temperature=0.2, enable_fiddler=enable_fiddler)
                    mcq_block = generate_mcqs_from_text(
                        structured_context, n=questions_per_page, model=self.generation_model, temperature=temperature, enable_fiddler=enable_fiddler
                    )
                except Exception as e:
                    # skip this chunk if generator fails
                    print(f"Generator failed on page {meta['page']} chunk {meta['chunk_id']}: {e}")
                    continue

                if "error" in list(mcq_block.keys()):
                    return output

                for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
                    qcount += 1
                    output[str(qcount)] = mcq_block[item]
                    if qcount >= n_questions:
                        return output

            return output

        elif mode == "rag":
            # strategy: create a few natural short queries by sampling sentences or using chunk summaries.
            # create queries by sampling chunk text sentences.
            # stop when n_questions reached or max_attempts exceeded.
            attempts = 0
            max_attempts = n_questions * 4

            while qcount < n_questions and attempts < max_attempts:
                attempts += 1
                # create a seed query: pick a random chunk, pick a sentence from it
                seed_idx = random.randrange(len(self.texts))
                chunk = self.texts[seed_idx]

                #? investigate better Chunking Strategy
                #with open("chunks.txt", "a", encoding="utf-8") as f:
                    #f.write(chunk + "\n")

                sents = re.split(r'(?<=[\.\?\!])\s+', chunk)
                candidates = [s for s in sents if len(s.strip()) > 20]
                seed_sent = random.choice(candidates) if candidates else chunk[:200]
                query = f"Create questions about: {seed_sent}"

                # retrieve top_k chunks
                retrieved = self._retrieve(query, top_k=top_k)
                context_parts = []
                for ridx, score in retrieved:
                    md = self.metadata[ridx]
                    context_parts.append(f"[page {md['page']}] {self.texts[ridx]}")
                context = "\n\n".join(context_parts)

                # save_to_local('test/context.md', content=context)

                # call generator for 1 question (or small batch) with the retrieved context
                try:
                    # request 1 question at a time to keep diversity
                    structured_context = structure_context_for_llm(context, model=self.generation_model, temperature=0.2, enable_fiddler=enable_fiddler)
                    mcq_block = generate_mcqs_from_text(
                        structured_context, n=1, model=self.generation_model, temperature=temperature, enable_fiddler=enable_fiddler
                    )
                except Exception as e:
                    print(f"Generator failed during RAG attempt {attempts}: {e}")
                    continue

                if "error" in list(mcq_block.keys()):
                    return output

                # append result(s)
                for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
                    payload = mcq_block[item]
                    q_text = (payload.get("câu hỏi") or payload.get("question") or payload.get("stem") or "").strip()
                    options = payload.get("lựa chọn") or payload.get("options") or payload.get("choices") or {}
                    if isinstance(options, list):
                        options = {str(i+1): o for i, o in enumerate(options)}
                    correct_key = payload.get("đáp án") or payload.get("answer") or payload.get("correct") or None
                    correct_text = ""
                    if isinstance(correct_key, str) and correct_key.strip() in options:
                        correct_text = options[correct_key.strip()]
                    else:
                        correct_text = payload.get("correct_text") or correct_key or ""

                    diff_score, diff_label = self._estimate_difficulty_for_generation(
                        q_text=q_text, options={k: str(v) for k,v in options.items()}, correct_text=str(correct_text), context_text=context
                    )
                    payload["difficulty"] = {"score": diff_score, "label": diff_label}

                    qcount += 1
                    output[str(qcount)] = mcq_block[item]
                    if qcount >= n_questions:
                        return output

            return output
        else:
            raise ValueError("mode must be 'per_page' or 'rag'.")

    def validate_mcqs(
        self,
        mcqs: Dict[str, Any],
        top_k: int = 4,
        similarity_threshold: float = 0.5,
        evidence_score_cutoff: float = 0.5,
        use_cross_encoder: bool = True,
        use_qa: bool = True,
        auto_accept_threshold: float = 0.7,
        review_threshold: float = 0.5,
        distractor_too_similar: float = 0.8,
        distractor_too_different: float = 0.15,
        model_verification_temperature: float = 0.0,
    ) -> Dict[str, Any]:
        """
        Upgraded validation pipeline:
            - embedding retrieval (self.index / self.embeddings)
            - cross-encoder entailment scoring (optional)
            - extractive QA consistency check (optional)
            - distractor similarity and type checks
            - aggregate into quality_score and triage_action

        Returns a dict keyed by qid with detailed info and triage decision.
        """
        cross_entail = None
        qa_pipeline = None
        if use_cross_encoder:
            try:
                cross_entail = self.cross_entail
            except Exception:
                cross_entail = None
        if use_qa:
            try:
                qa_pipeline = self.qa_pipeline
            except Exception:
                qa_pipeline = None

        # --- helpers ---
        def _norm_text(s: str) -> str:
            if s is None:
                return ""
            s = s.strip().lower()
            # remove punctuation
            s = s.translate(str.maketrans("", "", string.punctuation))
            # collapse whitespace
            s = " ".join(s.split())
            return s

        def _semantic_search(statement: str, k: int = top_k):
            # returns list of (idx, score) using current embeddings/index
            q_emb = self.embedder.encode([statement], convert_to_numpy=True).astype("float32")
            if _HAS_FAISS and getattr(self, "index", None) is not None:
                try:
                    faiss.normalize_L2(q_emb)
                    D_list, I_list = self.index.search(q_emb, k)
                    return [(int(i), float(d)) for i, d in zip(I_list[0], D_list[0]) if i != -1]
                except Exception:
                    pass
            # fallback to brute force
            qn = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-10)
            sims = (self.embeddings @ qn.T).squeeze(axis=1)
            idxs = np.argsort(-sims)[:k]
            return [(int(i), float(sims[i])) for i in idxs]

        def _compose_context_from_retrieved(retrieved):
            parts = []
            for ridx, score in retrieved:
                md = self.metadata[ridx] if ridx < len(self.metadata) else {}
                page = md.get("page", "?")
                text = self.texts[ridx]
                parts.append(f"[page {page}] {text}")
            return "\n\n".join(parts)

        def _compute_option_embeddings(options_map: Dict[str, str]):
            # returns dict key->embedding
            keys = list(options_map.keys())
            texts = [options_map[k] for k in keys]
            embs = self.embedder.encode(texts, convert_to_numpy=True)
            return dict(zip(keys, embs))

        def _cosine(a, b):
            a = np.asarray(a, dtype=float)
            b = np.asarray(b, dtype=float)
            denom = (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12)
            return float(np.dot(a, b) / denom)

        # --- main loop ---
        report = {}
        for qid, item in mcqs.items():
            # support both Vietnamese keys and English keys
            q_text = (item.get("câu hỏi") or item.get("question") or item.get("q") or item.get("stem") or "").strip()
            options = item.get("lựa chọn") or item.get("options") or item.get("choices") or {}
            # options may be dict mapping letters to text, or list: normalize to dict
            if isinstance(options, list):
                options = {str(i+1): o for i, o in enumerate(options)}
            # correct answer may be a key (like "A") or the text; try both
            correct_key = item.get("đáp án") or item.get("answer") or item.get("correct") or item.get("ans")
            correct_text = ""
            if isinstance(correct_key, str) and correct_key.strip() in options:
                correct_text = options[correct_key.strip()]
            else:
                # maybe the answer is full text
                if isinstance(correct_key, str):
                    correct_text = correct_key.strip()
                else:
                    # fallback to 'correct_text' field
                    correct_text = item.get("correct_text") or item.get("đáp án_text") or ""

            # default empty guard
            options = {k: str(v) for k, v in options.items()}
            correct_text = str(correct_text)

            # prepare statement for retrieval
            statement = f"{q_text} Answer: {correct_text}"
            retrieved = _semantic_search(statement, k=top_k)
            # build context from top retrieved
            context_parts = []
            for ridx, score in retrieved:
                md = self.metadata[ridx] if ridx < len(self.metadata) else {}
                context_parts.append({"idx": ridx, "score": float(score), "page": md.get("page", None), "text": self.texts[ridx]})
            context_text = "\n\n".join([f"[page {p['page']}] {p['text']}" for p in context_parts])

            # Evidence list (embedding-based)
            evidence_list = []
            max_sim = 0.0
            for r in context_parts:
                if r["score"] >= evidence_score_cutoff:
                    snippet = r["text"]
                    evidence_list.append({
                        "idx": r["idx"],
                        "page": r["page"],
                        "score": r["score"],
                        "text": (snippet[:1000] + ("..." if len(snippet) > 1000 else "")),
                    })
                if r["score"] > max_sim:
                    max_sim = float(r["score"])
            supported_by_embeddings = max_sim >= similarity_threshold

            # Cross-encoder entailment scores for each option
            entailment_scores = {}
            correct_entail = 0.0
            try:
                if cross_entail is not None and context_text.strip():
                    # prepare list of (premise, hypothesis)
                    pairs = []
                    opt_keys = list(options.keys())
                    for k in opt_keys:
                        hyp = f"{q_text} Answer: {options[k]}"
                        pairs.append((context_text, hyp))
                    scores = cross_entail.predict(pairs)  # returns list of floats
                    # normalize scores to 0-1 if needed (cross-encoder may return arbitrary positive)
                    # do a min-max normalization across the returned scores
                    # but avoid division by zero
                    min_s = float(min(scores)) if len(scores) else 0.0
                    max_s = float(max(scores)) if len(scores) else 1.0
                    denom = max_s - min_s if max_s - min_s > 1e-6 else 1.0
                    for k, raw in zip(opt_keys, scores):
                        scaled = (raw - min_s) / denom
                        entailment_scores[k] = float(scaled)
                    # find correct key if available
                    # if `correct_text` exactly matches one of options, find that key
                    matched_key = None
                    for k, v in options.items():
                        if _norm_text(v) == _norm_text(correct_text):
                            matched_key = k
                            break
                    if matched_key:
                        correct_entail = entailment_scores.get(matched_key, 0.0)
                    else:
                        # fallback: treat 'correct_text' as a separate hypothesis
                        hyp = f"{q_text} Answer: {correct_text}"
                        raw = cross_entail.predict([(context_text, hyp)])[0]
                        # scale relative to min/max used above
                        correct_entail = float((raw - min_s) / denom)
                else:
                    entailment_scores = {}
                    correct_entail = 0.0
            except Exception:
                entailment_scores = {}
                correct_entail = 0.0

            def embed_cosine_sim(a, b):
                emb = self.embedder.encode([a, b], convert_to_numpy=True, normalize_embeddings=True)
                return float(np.dot(emb[0], emb[1]))

            # QA consistency
            qa_answer = None
            qa_score = 0.0
            qa_agrees = False
            if qa_pipeline is not None and context_text.strip():
                try:
                    qa_res = qa_pipeline(question=q_text, context=context_text)
                    # QA pipelines may return a dict or a list of candidate answers
                    if isinstance(qa_res, list) and len(qa_res) > 0:
                        top = qa_res[0]
                        qa_answer = top.get("answer") if isinstance(top, dict) else str(top)
                    elif isinstance(qa_res, dict):
                        qa_answer = qa_res.get("answer", "")
                    else:
                        qa_answer = str(qa_res)
                    # agreement is scored by embedding similarity between the QA
                    # answer and the keyed correct answer, not the pipeline score
                    qa_score = embed_cosine_sim(qa_answer, correct_text)
                    qa_agrees = (qa_score >= 0.5)
                except Exception:
                    qa_answer = None
                    qa_score = 0.0
                    qa_agrees = False

            try:
                opt_embs = _compute_option_embeddings({**options, "__CORRECT__": correct_text})
                correct_emb = opt_embs.pop("__CORRECT__")
                distractor_similarities = {}
                for k, emb in opt_embs.items():
                    distractor_similarities[k] = float(_cosine(correct_emb, emb))
            except Exception:
                distractor_similarities = {k: None for k in options.keys()}

            # distractor flags
            distractor_penalty = 0.0
            distractor_flags = []
            for k, sim in distractor_similarities.items():
                # skip failed embeddings, the correct option itself (sim ~ 1),
                # and degenerate near-zero similarities
                if sim is None or sim >= 0.999999 or (-0.01 <= sim <= 0):
                    continue
                if sim >= distractor_too_similar:
                    distractor_flags.append({"key": k, "reason": "too_similar", "similarity": sim})
                    distractor_penalty += 0.25
                elif sim <= distractor_too_different:
                    distractor_flags.append({"key": k, "reason": "too_different", "similarity": sim})
                    distractor_penalty += 0.15
            # clamp penalty
            distractor_penalty = min(distractor_penalty, 1.0)

            # Ambiguity detection: how many options have entailment >= threshold
            ambiguous = False
            ambiguous_options = []
            if entailment_scores:
                # count options whose entailment >= max(correct_entail * 0.9, 0.6)
                amb_thresh = max(correct_entail * 0.9, 0.6)
                for k, sc in entailment_scores.items():
                    if sc >= amb_thresh and (options.get(k, "") != correct_text):
                        ambiguous_options.append({"key": k, "score": sc, "text": options[k]})
                ambiguous = len(ambiguous_options) > 0

            # Compose aggregated quality score
            # Components:
            #   - embedding_support: normalized max_sim (0..1)
            #   - entailment: correct_entail (0..1)
            #   - qa_agree: boolean -> 1 or 0 times qa_score
            #   - distractor_penalty: subtracted
            emb_support_norm = max_sim  # embedding similarity typically already 0..1 (inner product normalized)
            entail_component = float(correct_entail)
            qa_component = float(qa_score) if qa_agrees else 0.0

            # weighted sum
            quality_score = (
                0.40 * emb_support_norm +
                0.35 * entail_component +
                0.20 * qa_component -
                0.05 * distractor_penalty
            )
            # clamp to 0..1
            quality_score = max(0.0, min(1.0, quality_score))

            # triage decision
            triage_action = "reject"
            if quality_score >= auto_accept_threshold and not ambiguous:
                triage_action = "pass"
            elif quality_score >= review_threshold:
                triage_action = "review"
            else:
                triage_action = "reject"

            # compile flags/reasons
            flag_reasons = []
            if not supported_by_embeddings:
                flag_reasons.append("no_strong_embedding_evidence")
            if entailment_scores and correct_entail < 0.6:
                flag_reasons.append("low_entailment_score_for_correct")
            if qa_pipeline is not None and qa_answer is not None and not qa_agrees:
                flag_reasons.append("qa_contradiction")
            if ambiguous:
                flag_reasons.append("ambiguous_options_supported")
            if distractor_flags:
                flag_reasons.append({"distractor_issues": distractor_flags})

            # assemble per-question report
            report[qid] = {
                "supported_by_embeddings": bool(supported_by_embeddings),
                "max_similarity": float(max_sim),
                "evidence": evidence_list,
                "entailment_scores": entailment_scores,
                "correct_entailment": float(correct_entail),
                "qa_answer": qa_answer,
                "qa_score": float(qa_score),
                "qa_agrees": bool(qa_agrees),
                "distractor_similarities": distractor_similarities,
                "distractor_flags": distractor_flags,
                "distractor_penalty": float(distractor_penalty),
                "ambiguous_options": ambiguous_options,
                "quality_score": float(quality_score),
                "triage_action": triage_action,
                "flag_reasons": flag_reasons,
            }

        return report

    def connect_qdrant(self, url: str, api_key: str = None, prefer_grpc: bool = False):
        if not _HAS_QDRANT:
            raise RuntimeError("qdrant-client is not installed. Install with `pip install qdrant-client`.")
        self.qdrant_url = url
        self.qdrant_api_key = api_key
        self.qdrant_prefer_grpc = prefer_grpc
        # Create client
        self.qdrant = QdrantClient(url=url, api_key=api_key, prefer_grpc=prefer_grpc)

    def _ensure_collection(self, collection_name: str):
        if self.qdrant is None:
            raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
        try:
            # get_collection will raise if not present
            _ = self.qdrant.get_collection(collection_name)
        except Exception:
            # create collection with vector size = self.dim
            vect_params = VectorParams(size=self.dim, distance=Distance.COSINE)
            self.qdrant.recreate_collection(collection_name=collection_name, vectors_config=vect_params)
            # recreate_collection ensures a clean collection; if you prefer to avoid wiping use create_collection instead.

    def save_pdf_to_qdrant(
        self,
        pdf_path: str,
        filename: str,
        collection: str,
        max_chars: int = 1200,
        batch_size: int = 64,
        overwrite: bool = False,
    ):
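        """
        Chunk and embed a PDF, then upsert the chunks into a Qdrant collection
        in batches of batch_size, storing the text and page/chunk metadata in
        each point's payload. With overwrite=True, existing points for
        `filename` are deleted first. A keyword payload index on "filename"
        is created so later per-file filters stay efficient.
        """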
        if self.qdrant is None:
            raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")

        # extract pages and chunks (re-using your existing helpers)
        pages = self.extract_pages(pdf_path)

        all_chunks = []
        all_meta = []
        for p_idx, page_text in enumerate(pages, start=1):
            chunks = self.chunk_text(page_text or "", max_chars=max_chars)
            for cid, ch in enumerate(chunks, start=1):
                all_chunks.append(ch)
                all_meta.append({"page": p_idx, "chunk_id": cid, "length": len(ch)})

        if not all_chunks:
            raise RuntimeError("No tSext extracted from PDF.")

        # ensure collection exists
        self._ensure_collection(collection)

        # optional: delete previous points for this filename if overwrite
        if overwrite:
            # delete existing points for this filename before re-uploading
            flt = Filter(must=[FieldCondition(key="filename", match=MatchValue(value=filename))])
            try:
                self.qdrant.delete(
                    collection_name=collection,
                    points_selector=rest.FilterSelector(filter=flt),
                )
            except Exception:
                # ignore if deletion fails (e.g. nothing to delete yet)
                pass

        # compute embeddings in batches
        embeddings = self.embedder.encode(all_chunks, convert_to_numpy=True, show_progress_bar=True)
        embeddings = embeddings.astype("float32")

        # prepare points
        points = []
        for i, (emb, md, txt) in enumerate(zip(embeddings, all_meta, all_chunks)):
            pid = str(uuid4())
            source_id = f"{filename}__p{md['page']}__c{md['chunk_id']}"
            payload = {
                "filename": filename,
                "page": md["page"],
                "chunk_id": md["chunk_id"],
                "length": md["length"],
                "text": txt,
                "source_id": source_id,
            }
            points.append(PointStruct(id=pid, vector=emb.tolist(), payload=payload)) # pyright: ignore[reportPossiblyUnboundVariable]

            # upsert in batches
            if len(points) >= batch_size:
                self.qdrant.upsert(collection_name=collection, points=points)
                points = []

        # upsert remaining
        if points:
            self.qdrant.upsert(collection_name=collection, points=points)

        try:
            self.qdrant.create_payload_index(
                collection_name=collection,
                field_name="filename",
                field_schema=rest.PayloadSchemaType.KEYWORD
            )
        except Exception as e:
            print(f"Index creation skipped or failed: {e}")

        return {"status": "ok", "uploaded_chunks": len(all_chunks), "collection": collection, "filename": filename}


    def list_files_in_collection(
        self,
        collection: str,
        payload_field: str = "filename",
        batch_size: int = 500,
    ) -> List[str]:
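        """
        Scroll through the whole collection (payload only, no vectors) and
        return the sorted set of distinct values of `payload_field`, i.e. the
        filenames that have been uploaded.
        """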
        if self.qdrant is None:
            raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")

        # collection_exists may itself raise if the server is unreachable;
        # let that propagate
        if not self.qdrant.collection_exists(collection):
            raise RuntimeError(f"Collection '{collection}' does not exist.")

        filenames = set()
        offset = None

        while True:
            # scroll returns (points, next_offset)
            pts, next_offset = self.qdrant.scroll(
                collection_name=collection,
                limit=batch_size,
                offset=offset,
                with_payload=[payload_field],
                with_vectors=False,
            )

            if not pts:
                break

            for p in pts:
                # p may be a dict-like or an object with .payload
                payload = None
                if hasattr(p, "payload"):
                    payload = p.payload
                elif isinstance(p, dict):
                    # dict-shaped records may nest the payload or be the payload itself
                    payload = p.get("payload") or p
                else:
                    # best-effort fallback: convert to dict if possible
                    try:
                        payload = dict(p)
                    except Exception:
                        payload = None

                if not payload:
                    continue

                # extract candidate value(s)
                val = None
                if isinstance(payload, dict):
                    val = payload.get(payload_field)
                else:
                    # Some payload representations store fields differently; try attribute access
                    val = getattr(payload, payload_field, None)

                # If value is list-like, iterate, else add single
                if isinstance(val, (list, tuple, set)):
                    for v in val:
                        if v is not None:
                            filenames.add(str(v))
                elif val is not None:
                    filenames.add(str(val))

            # stop if no more pages
            if not next_offset:
                break
            offset = next_offset

        return sorted(filenames)


    def list_chunks_for_filename(self, collection: str, filename: str, batch: int = 256) -> List[Dict[str, Any]]:
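        """
        Return every point whose payload "filename" matches, as
        {"point_id", "payload"} dicts, paging through Qdrant's scroll API.
        """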
        if self.qdrant is None:
            raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")

        results = []
        offset = None
        while True:
            # scroll returns (points, next_offset)
            points, next_offset = self.qdrant.scroll(
                collection_name=collection,
                scroll_filter=Filter(
                    must=[
                        FieldCondition(key="filename", match=MatchValue(value=filename))
                    ]
                ),
                limit=batch,
                offset=offset,
                with_payload=True,
                with_vectors=False,
            )
            # points are objects (Record / ScoredPoint-like); get id and payload
            for p in points:
                # p.payload is a dict, p.id is point id
                results.append({"point_id": p.id, "payload": p.payload})
            if not next_offset:
                break
            offset = next_offset
        return results


    def _retrieve_qdrant(self, query: str, collection: str, filename: str = None, top_k: int = 3) -> List[Tuple[Dict[str, Any], float]]:
        if self.qdrant is None:
            raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")

        q_emb = self.embedder.encode([query], convert_to_numpy=True).astype("float32")[0].tolist()
        q_filter = None
        if filename:
            q_filter = Filter(must=[FieldCondition(key="filename", match=MatchValue(value=filename))])

        search_res = self.qdrant.search(
            collection_name=collection,
            query_vector=q_emb,
            query_filter=q_filter,
            limit=top_k,
            with_payload=True,
            with_vectors=False,
        )

        out = []
        for hit in search_res:
            # hit.payload is the stored payload, hit.score is similarity
            out.append((hit.payload, float(getattr(hit, "score", 0.0))))
        return out


    def generate_from_qdrant(
        self,
        filename: str,
        collection: str,
        n_questions: int = 10,
        mode: str = "rag",               # 'per_chunk' or 'rag'
        questions_per_chunk: int = 3,    # used for 'per_chunk'
        top_k: int = 3,                  # retrieval size used in RAG
        temperature: float = 0.2,
        enable_fiddler: bool = False,
    ) -> Dict[str, Any]:
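        """
        Generate MCQs from chunks previously uploaded with save_pdf_to_qdrant.

        Loads every chunk stored for `filename`, re-embeds the texts locally so
        the in-memory index mirrors the remote data, then proceeds like
        generate_from_pdf: mode="per_chunk" walks the chunks in order, while
        mode="rag" retrieves top_k chunks from Qdrant (filtered to this
        filename) for each sampled seed sentence. Each RAG question also gets
        a heuristic difficulty estimate under the "độ khó" key.

        Sketch (the collection name is illustrative):

            rag.save_pdf_to_qdrant("lecture.pdf", filename="lecture.pdf", collection="docs")
            mcqs = rag.generate_from_qdrant("lecture.pdf", collection="docs", n_questions=5)
        """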
        if self.qdrant is None:
            raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")

        # get all chunks for this filename (payload should contain 'text', 'page', 'chunk_id', etc.)
        file_points = self.list_chunks_for_filename(collection=collection, filename=filename)
        if not file_points:
            raise RuntimeError(f"No chunks found for filename={filename} in collection={collection}.")

        # create a local list of texts & metadata for sampling
        texts = []
        metas = []
        for p in file_points:
            payload = p.get("payload", {})
            text = payload.get("text", "")
            texts.append(text)
            metas.append(payload)

        self.texts = texts
        self.metadata = metas
        embeddings = self.embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)
        if embeddings is None or len(embeddings) == 0:
            self.embeddings = None
            self.index = None
        else:
            self.embeddings = embeddings.astype("float32")

            # update dim in case embedder changed unexpectedly
            self.dim = int(self.embeddings.shape[1])

            # build index
            self._build_faiss_index()

        output = {}
        qcount = 0

        if mode == "per_chunk":
            # iterate all chunks (in payload order) and request questions_per_chunk from each
            for i, txt in enumerate(texts):
                if not txt.strip():
                    continue

                try:
                    structured_context = structure_context_for_llm(txt, model=self.generation_model, temperature=0.2, enable_fiddler=enable_fiddler)
                    mcq_block = generate_mcqs_from_text(structured_context, n=questions_per_chunk, model=self.generation_model, temperature=temperature, enable_fiddler=enable_fiddler)
                except Exception as e:
                    print(f"Generator failed on chunk (index {i}): {e}")
                    continue

                if "error" in list(mcq_block.keys()):
                    return output

                for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
                    qcount += 1
                    output[str(qcount)] = mcq_block[item]
                    if qcount >= n_questions:
                        return output
            return output

        elif mode == "rag":
            attempts = 0
            max_attempts = n_questions * 4
            while qcount < n_questions and attempts < max_attempts:
                attempts += 1
                # create a seed query: pick a random chunk, pick a sentence from it
                seed_idx = random.randrange(len(self.texts))
                chunk = self.texts[seed_idx]
                sents = re.split(r'(?<=[\.\?\!])\s+', chunk)
                candidate = [s for s in sents if len(s.strip()) > 20]
                if candidate:
                    seed_sent = random.choice(candidate)
                else:
                    stripped = chunk.strip()
                    seed_sent = (stripped[:200] if stripped else "[no text available]")
                query = f"Create questions about: {seed_sent}"


                # retrieve top_k chunks from the same file (restricted by filename filter)
                retrieved = self._retrieve_qdrant(query=query, collection=collection, filename=filename, top_k=top_k)
                context_parts = []
                for payload, score in retrieved:
                    # payload should contain page & chunk_id and text
                    page = payload.get("page", "?")
                    ctxt = payload.get("text", "")
                    context_parts.append(f"[page {page}] {ctxt}")
                context = "\n\n".join(context_parts)

                try:
                    structured_context = structure_context_for_llm(context, model=self.generation_model, temperature=0.2, enable_fiddler=enable_fiddler)
                    mcq_block = generate_mcqs_from_text(structured_context, n=questions_per_chunk, model=self.generation_model, temperature=temperature, enable_fiddler=enable_fiddler)
                except Exception as e:
                    print(f"Generator failed during RAG attempt {attempts}: {e}")
                    continue

                if "error" in list(mcq_block.keys()):
                    return output

                for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
                    payload = mcq_block[item]
                    q_text = (payload.get("câu hỏi") or payload.get("question") or payload.get("stem") or "").strip()
                    options = payload.get("lựa chọn") or payload.get("options") or payload.get("choices") or {}
                    if isinstance(options, list):
                        options = {str(i+1): o for i, o in enumerate(options)}
                    correct_key = payload.get("đáp án") or payload.get("answer") or payload.get("correct") or None
                    correct_text = ""
                    if isinstance(correct_key, str) and correct_key.strip() in options:
                        correct_text = options[correct_key.strip()]
                    else:
                        correct_text = payload.get("correct_text") or correct_key or ""

                    diff_score, diff_label = self._estimate_difficulty_for_generation(
                        q_text=q_text, options={k: str(v) for k,v in options.items()}, correct_text=str(correct_text), context_text=context
                    )
                    payload["độ khó"] = {"điểm": diff_score, "mức độ": diff_label}

                    qcount += 1
                    output[str(qcount)] = mcq_block[item]
                    if qcount >= n_questions:
                        return output
            return output
        else:
            raise ValueError("mode must be 'per_chunk' or 'rag'.")



    def _estimate_difficulty_for_generation(
        self,
        q_text: str,
        options: Dict[str, str],
        correct_text: str,
        context_text: str,
    ) -> Tuple[float, str]:
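        """
        Heuristic difficulty score in [0, 1] (higher = harder) plus a label
        ("dễ" / "trung bình" / "khó"). Signals: how semantically close the
        distractors sit to the correct answer (closer = harder), an ambiguity
        flag when the best distractor is nearly indistinguishable, the stem
        length, and the gap between the two best distractors (a larger gap
        makes the item easier).
        """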
        def safe_map_sim(s):
            # map potentially [-1,1] cosine-like to [0,1], clamp
            try:
                s = float(s)
            except Exception:
                return 0.0
            mapped = (s + 1.0) / 2.0
            return max(0.0, min(1.0, mapped))

        # embedding support
        emb_support = 0.0
        try:
            stmt = (q_text or "").strip()
            if correct_text:
                stmt = f"{stmt} Answer: {correct_text}"

            # use internal retrieve but map returned score
            res = []
            try:
                res = self._retrieve(stmt, top_k=1)
            except Exception:
                res = []

            if res:
                raw_score = float(res[0][1])
                emb_support = safe_map_sim(raw_score)
            else:
                emb_support = 0.0
        except Exception:
            emb_support = 0.0

        # distractor sims
        mean_sim = 0.0
        distractor_penalty = 0.0
        amb_flag = 0.0
        try:
            keys = list(options.keys())
            texts = [options[k] for k in keys]
            if correct_text is None:
                correct_text = ""

            all_texts = [correct_text] + texts
            embs = self.embedder.encode(all_texts, convert_to_numpy=True)
            embs = np.asarray(embs, dtype=float)
            norms = np.linalg.norm(embs, axis=1, keepdims=True) + 1e-12
            embs = embs / norms
            corr = embs[0]
            opts = embs[1:]

            if opts.size == 0:
                mean_sim = 0.0
                distractor_penalty = 0.0
                gap = 0.0
            else:
                sims = (opts @ corr).tolist() # [-1,1]
                sims_mapped = [safe_map_sim(s) for s in sims] # [0,1]
                mean_sim = float(sum(sims_mapped) / len(sims_mapped))
                # gap between best distractor and second best (higher gap -> easier)
                sorted_s = sorted(sims_mapped, reverse=True)
                top = sorted_s[0]
                second = sorted_s[1] if len(sorted_s) > 1 else 0.0
                gap = top - second
                # penalties: if distractors are extremely close to correct -> higher penalty
                too_close_count = sum(1 for s in sims_mapped if s >= 0.85)
                too_far_count = sum(1 for s in sims_mapped if s <= 0.15)
                distractor_penalty = max(0.0, min(1.0, 0.5 * mean_sim + 0.2 * (too_close_count / max(1, len(sims_mapped))) - 0.2 * (too_far_count / max(1, len(sims_mapped)))))
                amb_flag = 1.0 if top >= 0.9 else 0.0
        except Exception:
            mean_sim = 0.0
            distractor_penalty = 0.0
            amb_flag = 0.0
            gap = 0.0

        # stem length normalized
        qlen = len((q_text or "").strip())
        qlen_norm = min(1.0, qlen / 300.0)

        # combine signals using safer semantics:
        #    higher emb_support -> easier (so we subtract a term)
        #    higher distractor_penalty -> harder (add)
        #    better gap -> easier (subtract)
        # compute score (higher -> harder)
        score = 0
        score += 0.35 * float(distractor_penalty)
        score += 0.20 * float(mean_sim)
        score += 0.22 * float(amb_flag)
        score += 0.05 * float(qlen_norm)
        score -= 0.20 * float(gap)

        # clamp
        score = max(0.0, min(1.0, float(score)))

        # label
        if score <= 0.33:
            label = "dễ"
        elif score <= 0.66:
            label = "trung bình"
        else:
            label = "khó"

        return score, label

class RAGMCQWithDifficulty(RAGMCQ):
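    """
    RAGMCQ variant that threads a target_difficulty ("easy", "mid",
    "difficult") through to new_generate_mcqs_from_text, so the generator is
    asked for questions at a requested difficulty instead of only estimating
    difficulty after generation.
    """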
    def __init__(
        self,
        embedder_model: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        generation_model: str = "openai/gpt-oss-120b",
        qdrant_url: str = os.environ.get('QDRANT_URL') or "",
        qdrant_api_key: str = os.environ.get('QDRANT_API_KEY') or "",
        qdrant_prefer_grpc: bool = False,
    ):
        super().__init__(embedder_model, generation_model, qdrant_url, qdrant_api_key, qdrant_prefer_grpc)

    @override
    def generate_from_pdf(
        self,
        pdf_path: str,
        n_questions: int = 10,
        mode: str = "rag", # per_page or rag
        questions_per_page: int = 3, # for per_page mode
        top_k: int = 3, # chunks to retrieve for each question in rag mode
        temperature: float = 0.2,
        enable_fiddler: bool = False,
        target_difficulty: str = 'easy'  # easy, mid, difficult
    ) -> Dict[str, Any]:
        # build index
        self.build_index_from_pdf(pdf_path)

        output: Dict[str, Any] = {}
        qcount = 0

        if mode == "per_page":
            # iterate pages -> chunks
            for idx, meta in enumerate(self.metadata):
                chunk_text = self.texts[idx]

                if not chunk_text.strip():
                    continue


                # ask generator
                try:
                    structured_context = structure_context_for_llm(chunk_text, model=self.generation_model, temperature=0.2, enable_fiddler=enable_fiddler)
                    mcq_block = new_generate_mcqs_from_text(
                        source_text=structured_context, n=questions_per_page, model=self.generation_model,
                        temperature=temperature, target_difficulty=target_difficulty, enable_fiddler=enable_fiddler
                    )
                except Exception as e:
                    # skip this chunk if generator fails
                    print(f"Generator failed on page {meta['page']} chunk {meta['chunk_id']}: {e}")
                    continue

                if "error" in list(mcq_block.keys()):
                    return output

                for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
                    qcount += 1
                    output[str(qcount)] = mcq_block[item]
                    if qcount >= n_questions:
                        return output

            return output

        # RAG mode: sample seed sentences, retrieve supporting chunks, then generate
        elif mode == "rag":
            # strategy: create a few natural short queries by sampling sentences or using chunk summaries.
            # create queries by sampling chunk text sentences.
            # stop when n_questions reached or max_attempts exceeded.
            attempts = 0
            max_attempts = n_questions * 4

            while qcount < n_questions and attempts < max_attempts:
                attempts += 1
                # create a seed query: pick a random chunk, pick a sentence from it
                seed_idx = random.randrange(len(self.texts))
                chunk = self.texts[seed_idx]

                #? investigate a better chunking strategy

                sents = re.split(r'(?<=[\.\?\!])\s+', chunk)
                # guard against random.choice([]) when no sentence is long enough
                candidates = [s for s in sents if len(s.strip()) > 20]
                seed_sent = random.choice(candidates) if candidates else chunk.strip()[:200]
                query = f"Create questions about: {seed_sent}"

                # retrieve top_k chunks
                retrieved = self._retrieve(query, top_k=top_k)
                context_parts = []
                for ridx, score in retrieved:
                    md = self.metadata[ridx]
                    context_parts.append(f"[page {md['page']}] {self.texts[ridx]}")
                context = "\n\n".join(context_parts)

                # call generator for 1 question (or small batch) with the retrieved context
                try:
                    structured_context = structure_context_for_llm(context, model=self.generation_model, temperature=0.2, enable_fiddler=enable_fiddler)
                    mcq_block = new_generate_mcqs_from_text(
                        source_text=structured_context, n=questions_per_page, model=self.generation_model,
                        temperature=temperature, target_difficulty=target_difficulty, enable_fiddler=enable_fiddler
                    )
                except Exception as e:
                    print(f"Generator failed during RAG attempt {attempts}: {e}")
                    continue

                if "error" in list(mcq_block.keys()):
                    return output

                # append result(s)
                for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
                    payload = mcq_block[item]
                    q_text = (payload.get("câu hỏi") or payload.get("question") or payload.get("stem") or "").strip()
                    options = payload.get("lựa chọn") or payload.get("options") or payload.get("choices") or {}
                    if isinstance(options, list):
                        options = {str(i+1): o for i, o in enumerate(options)}
                    correct_key = payload.get("đáp án") or payload.get("answer") or payload.get("correct") or None
                    concepts = payload.get("khái niệm sử dụng") or payload.get("concepts") or payload.get("concepts used") or None
                    correct_text = ""
                    if isinstance(correct_key, str) and correct_key.strip() in options:
                        correct_text = options[correct_key.strip()]
                    else:
                        correct_text = payload.get("correct_text") or correct_key or ""

                    diff_score, diff_label, components = self._estimate_difficulty_for_generation(
                        q_text=q_text, options={k: str(v) for k, v in options.items()}, correct_text=str(correct_text), context_text=structured_context, concepts_used=concepts
                    )

                    payload["độ khó"] = {"điểm": diff_score, "mức độ": diff_label}

                    qcount += 1
                    output[str(qcount)] = mcq_block[item]
                    if qcount >= n_questions:
                        return output

            return output
        else:
            raise ValueError("mode must be 'per_page' or 'rag'.")

    @override
    def generate_from_qdrant(
        self,
        filename: str,
        collection: str,
        n_questions: int = 10,
        mode: str = "rag",               # 'per_chunk' or 'rag'
        questions_per_chunk: int = 3,    # used for 'per_chunk'
        top_k: int = 3,                  # retrieval size used in RAG
        temperature: float = 0.2,
        enable_fiddler: bool = False,
        target_difficulty: str = 'easy',
    ) -> Dict[str, Any]:
        if self.qdrant is None:
            raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")

        # get all chunks for this filename (payload should contain 'text', 'page', 'chunk_id', etc.)
        file_points = self.list_chunks_for_filename(collection=collection, filename=filename)
        if not file_points:
            raise RuntimeError(f"No chunks found for filename={filename} in collection={collection}.")

        # create a local list of texts & metadata for sampling
        texts = []
        metas = []
        for p in file_points:
            payload = p.get("payload", {})
            text = payload.get("text", "")
            texts.append(text)
            metas.append(payload)

        self.texts = texts
        self.metadata = metas
        embeddings = self.embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)
        if embeddings is None or len(embeddings) == 0:
            self.embeddings = None
            self.index = None
        else:
            self.embeddings = embeddings.astype("float32")

            # update dim in case embedder changed unexpectedly
            self.dim = int(self.embeddings.shape[1])

            # build index
            self._build_faiss_index()

        output = {}
        qcount = 0

        if mode == "per_chunk":
            # iterate all chunks (in payload order) and request questions_per_chunk from each
            for i, txt in enumerate(texts):
                if not txt.strip():
                    continue
                try:
                    structured_context = structure_context_for_llm(txt, model=self.generation_model, temperature=0.2, enable_fiddler=enable_fiddler)
                    mcq_block = new_generate_mcqs_from_text(
                        source_text=structured_context, n=questions_per_chunk, model=self.generation_model,
                        temperature=temperature, target_difficulty=target_difficulty, enable_fiddler=enable_fiddler
                    )
                except Exception as e:
                    print(f"Generator failed on chunk (index {i}): {e}")
                    continue

                if "error" in list(mcq_block.keys()):
                    return output

                for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
                    qcount += 1
                    output[str(qcount)] = mcq_block[item]
                    if qcount >= n_questions:
                        return output
            return output

        elif mode == "rag":
            attempts = 0
            max_attempts = n_questions * 4
            while qcount < n_questions and attempts < max_attempts:
                attempts += 1
                # create a seed query: pick a random chunk, pick a sentence from it
                seed_idx = random.randrange(len(self.texts))
                chunk = self.texts[seed_idx]
                sents = re.split(r'(?<=[\.\?\!])\s+', chunk)
                candidate = [s for s in sents if len(s.strip()) > 20]
                if candidate:
                    seed_sent = random.choice(candidate)
                else:
                    stripped = chunk.strip()
                    seed_sent = (stripped[:200] if stripped else "[no text available]")
                query = f"Create questions about: {seed_sent}"

                # retrieve top_k chunks from the same file (restricted by filename filter)
                retrieved = self._retrieve_qdrant(query=query, collection=collection, filename=filename, top_k=top_k)
                context_parts = []
                for payload, score in retrieved:
                    # payload should contain page & chunk_id and text
                    page = payload.get("page", "?")
                    ctxt = payload.get("text", "")
                    context_parts.append(f"[page {page}] {ctxt}")
                context = "\n\n".join(context_parts)


                # q generation
                try:
                    structured_context = structure_context_for_llm(context, model=self.generation_model, temperature=0.2, enable_fiddler=enable_fiddler)
                    mcq_block = new_generate_mcqs_from_text(
                        source_text=structured_context, n=questions_per_chunk, model=self.generation_model,
                        temperature=temperature, target_difficulty=target_difficulty, enable_fiddler=enable_fiddler
                    )
                except Exception as e:
                    print(f"Generator failed during RAG attempt {attempts}: {e}")
                    continue

                if "error" in list(mcq_block.keys()):
                    return output

                for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
                    payload = mcq_block[item]
                    q_text = (payload.get("câu hỏi") or payload.get("question") or payload.get("stem") or "").strip()
                    options = payload.get("lựa chọn") or payload.get("options") or payload.get("choices") or {}

                    if isinstance(options, list):
                        options = {str(i+1): o for i, o in enumerate(options)}

                    correct_key = payload.get("đáp án") or payload.get("answer") or payload.get("correct") or None
                    concepts = payload.get("khái niệm sử dụng") or payload.get("concepts") or payload.get("concepts used") or None

                    correct_text = ""
                    if isinstance(correct_key, str) and correct_key.strip() in options:
                        correct_text = options[correct_key.strip()]
                    else:
                        correct_text = payload.get("correct_text") or correct_key or ""

                    #? revisit this estimate once the heuristic weights are validated
                    diff_score, diff_label, components = self._estimate_difficulty_for_generation(
                        q_text=q_text, options={k: str(v) for k, v in options.items()}, correct_text=str(correct_text), context_text=structured_context, concepts_used=concepts
                    )

                    payload["độ khó"] = {"điểm": diff_score, "mức độ": diff_label}

                    # shrink the next batch if fewer questions remain than the default batch size,
                    # e.g. 5 requested - 3 generated = 2 remaining < 3 per chunk -> generate only 2
                    if n_questions - qcount < questions_per_chunk:
                        questions_per_chunk = n_questions - qcount

                    qcount += 1

                    output[str(qcount)] = mcq_block[item]
                    if qcount >= n_questions:
                        return output

            return output
        else:
            raise ValueError("mode must be 'per_chunk' or 'rag'.")

    @override
    def _estimate_difficulty_for_generation(
        self,
        q_text: str,
        options: Dict[str, str],
        correct_text: str,
        context_text: str,
        concepts_used: Dict = {}  # callers may also pass None; guarded below
    ) -> Tuple[float, str, Dict[str, float]]:
        def safe_map_sim(s):
            # map potentially [-1,1] cosine-like to [0,1], clamp
            try:
                s = float(s)
            except Exception:
                return 0.0
            mapped = (s + 1.0) / 2.0
            return max(0.0, min(1.0, mapped))
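        # e.g. safe_map_sim(1.0) == 1.0, safe_map_sim(0.0) == 0.5, safe_map_sim(-1.0) == 0.0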

        # embedding support
        emb_support = 0.0
        try:
            stmt = (q_text or "").strip()
            if correct_text:
                stmt = f"{stmt} Answer: {correct_text}"

            # use internal retrieve but map returned score
            res = []
            try:
                res = self._retrieve(stmt, top_k=1)
            except Exception:
                res = []

            if res:
                raw_score = float(res[0][1])
                emb_support = safe_map_sim(raw_score)
            else:
                emb_support = 0.0
        except Exception:
            emb_support = 0.0

        # distractor sims
        mean_sim = 0.0
        distractor_penalty = 0.0
        amb_flag = 0.0
        try:
            keys = list(options.keys())
            texts = [options[k] for k in keys]
            if correct_text is None:
                correct_text = ""

            all_texts = [correct_text] + texts
            embs = self.embedder.encode(all_texts, convert_to_numpy=True)
            embs = np.asarray(embs, dtype=float)
            norms = np.linalg.norm(embs, axis=1, keepdims=True) + 1e-12
            embs = embs / norms
            corr = embs[0]
            opts = embs[1:]

            if opts.size == 0:
                mean_sim = 0.0
                distractor_penalty = 0.0
                gap = 0.0
            else:
                sims = (opts @ corr).tolist() # [-1,1]
                sims_mapped = [safe_map_sim(s) for s in sims] # [0,1]
                mean_sim = float(sum(sims_mapped) / len(sims_mapped))
                # gap between best distractor and second best (higher gap -> easier)
                sorted_s = sorted(sims_mapped, reverse=True)
                top = sorted_s[0]
                second = sorted_s[1] if len(sorted_s) > 1 else 0.0
                gap = top - second
                # penalties: if distractors are extremely close to correct -> higher penalty
                too_close_count = sum(1 for s in sims_mapped if s >= 0.85)
                too_far_count = sum(1 for s in sims_mapped if s <= 0.15)
                distractor_penalty = min(1.0, 0.5 * mean_sim + 0.2 * (too_close_count / max(1, len(sims_mapped))) - 0.2 * (too_far_count / max(1, len(sims_mapped))))
                amb_flag = 1.0 if top >= 0.8 else 0.0  # looser threshold than the base class (0.8 vs 0.9)
        except Exception:
            mean_sim = 0.0
            distractor_penalty = 0.0
            amb_flag = 0.0
            gap = 0.0

        # question length normalized
        question_len = len((q_text or "").strip())
        question_len_norm = min(1.0, question_len / 300.0)

        # number of distinct concepts referenced by the question (None-safe)
        concepts_num = len(concepts_used) if concepts_used else 0
        # not yet weighted into the score; concepts_num is surfaced in `components` below
        concepts_penalty = 0 if concepts_num < 2 else concepts_num

        # combine signals (higher score -> harder question):
        #    higher distractor_penalty / mean_sim / amb_flag -> harder (add)
        #    larger similarity gap between the top two distractors -> easier (subtract)
        # note: emb_support and the concept count are computed above but not yet
        # weighted into the score; they are reported in `components` for tuning
        score = 0.0
        score += 0.35 * float(distractor_penalty)
        score += 0.20 * float(mean_sim)
        score += 0.22 * float(amb_flag)
        score += 0.08 * float(question_len_norm)
        score -= 0.20 * float(gap)
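
        # worked example (illustrative numbers): distractor_penalty=0.5, mean_sim=0.6,
        # amb_flag=1.0, question_len_norm=0.4, gap=0.1:
        #   score = 0.35*0.5 + 0.20*0.6 + 0.22*1.0 + 0.08*0.4 - 0.20*0.1
        #         = 0.175 + 0.12 + 0.22 + 0.032 - 0.02 = 0.527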

        # clamp
        score = max(0.0, min(1.0, float(score)))
        # per-signal contributions, mirroring the weights actually applied above
        components = {
            "distractor_penalty": 0.35 * float(distractor_penalty),
            "mean_sim": 0.20 * float(mean_sim),
            "amb_flag": 0.22 * float(amb_flag),
            "question_len_norm": 0.08 * float(question_len_norm),
            "gap": -0.20 * float(gap),
            # diagnostic raw signals, not part of the score above
            "emb_support": float(emb_support),
            "concepts_num": float(concepts_num),
            "total_score": score,
        }

        # map score to a coarse label: "dễ" = easy, "trung bình" = medium, "khó" = hard
        if score <= 0.56:
            label = "dễ"
        elif score <= 0.755:
            label = "trung bình"
        else:
            label = "khó"

        return score, label, components