namberino committed
Commit dfa5afb · 1 Parent(s): 073c79b

Initial commit
.dockerignore ADDED
@@ -0,0 +1,15 @@
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ *.sqlite3
+ .env
+ .env.*
+ .git
+ .gitignore
+ .wheelhouse
+ wheels
+ dist
+ build
+ .vscode
+ .idea
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
.github/workflows/hf_deploy.yml ADDED
@@ -0,0 +1,19 @@
+ name: Sync to HuggingFace
+ on:
+   push:
+     branches: [ main ]
+
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push https://namberino:$HF_TOKEN@huggingface.co/spaces/namberino/mcq-generator main
.github/workflows/image_scan.yml ADDED
@@ -0,0 +1,25 @@
+ name: Scan docker image for security issues
+
+ on:
+   push:
+     branches: [ main ]
+
+   pull_request:
+
+   workflow_dispatch:
+
+ jobs:
+   trivy:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Build image
+         run: docker build -t mcq-gen:ci .
+
+       - name: Run Trivy scan
+         uses: aquasecurity/trivy-action@0.32.0
+         with:
+           image-ref: mcq-gen:ci
+           format: 'table'
+           exit-code: '1'
.github/workflows/lint_test.yml ADDED
@@ -0,0 +1,30 @@
+ name: Lint, Typecheck, Tests
+
+ on:
+   push:
+     branches: [ main ]
+
+   pull_request:
+
+   workflow_dispatch:
+
+ jobs:
+   lint-and-test:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+
+       - name: Set up Python
+         uses: actions/setup-python@v4
+         with:
+           python-version: "3.11"
+
+       - name: Install dependencies
+         run: |
+           pip install ruff
+
+       - name: Run ruff (lint)
+         run: |
+           ruff check .
.github/workflows/sast.yml ADDED
@@ -0,0 +1,31 @@
+ name: SAST
+
+ on:
+   push:
+   pull_request:
+   workflow_dispatch:
+
+ jobs:
+   sast:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Install tools
+         run: |
+           python -m pip install --upgrade pip
+           pip install semgrep bandit
+
+       - name: Run semgrep
+         run: semgrep --config auto --output semgrep-results.txt || true
+
+       - name: Run bandit
+         run: bandit -r . -f json -o bandit-results.json || true
+
+       - name: Upload SARIF/artifacts
+         uses: actions/upload-artifact@v4
+         with:
+           name: security-reports
+           path: |
+             semgrep-results.txt
+             bandit-results.json
.github/workflows/security_scan.yml ADDED
@@ -0,0 +1,23 @@
+ name: Scan for security issues
+ on:
+   push:
+   pull_request:
+   workflow_dispatch:
+
+ jobs:
+   gitleaks-scan:
+     runs-on: ubuntu-latest
+     name: Scan for secrets and sensitive information
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+
+       - name: Run gitleaks
+         uses: gitleaks/gitleaks-action@v2
+         env:
+           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+           # GITLEAKS_CONFIG: .gitleaks.toml
+           GITLEAKS_ENABLE_UPLOAD_ARTIFACT: true
+           GITLEAKS_ENABLE_SUMMARY: true
.gitignore ADDED
@@ -0,0 +1 @@
+ .vscode
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ FROM python:3.11-slim
+
+ # set HF cache to /tmp for writable FS on Spaces
+ ENV HF_HOME=/tmp/huggingface
+ ENV TOKENIZERS_PARALLELISM=false
+
+ # install system packages needed by some python libs
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     git \
+     wget \
+     libsndfile1 \
+     libgl1 \
+     libglib2.0-0 \
+     poppler-utils \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ # copy requirements and install
+ COPY requirements.txt /app/requirements.txt
+ RUN pip install --upgrade pip
+ # try to be robust to wheels/build issues
+ # RUN pip wheel --no-cache-dir --wheel-dir=/wheels -r /app/requirements.txt || true
+ RUN pip install --no-cache-dir -r /app/requirements.txt
+
+ # copy app code
+ COPY . /app
+
+ EXPOSE 7860
+
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,179 @@
+ import os
+ import shutil
+ import tempfile
+ from typing import Optional
+
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException, BackgroundTasks
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+
+ # Import the user's RAGMCQ implementation
+ from generator import RAGMCQ
+
+ app = FastAPI(title="RAG MCQ Generator API")
+
+ # allow cross-origin requests (adjust in production)
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # global rag instance
+ rag: Optional[RAGMCQ] = None
+
+ class GenerateResponse(BaseModel):
+     mcqs: dict
+     validation: Optional[dict] = None
+
+ class ListResponse(BaseModel):
+     files: list
+
+ @app.on_event("startup")
+ def startup_event():
+     global rag
+
+     # instantiate the heavy object once
+     rag = RAGMCQ(
+         qdrant_url=os.environ['QDRANT_URL'],
+         qdrant_api_key=os.environ['QDRANT_API_KEY']
+     )
+     print("RAGMCQ instance created on startup.")
+
+ @app.get("/health")
+ def health():
+     return {"status": "ok", "ready": rag is not None}
+
+ def _save_upload_to_temp(upload: UploadFile) -> str:
+     suffix = ".pdf"
+     fd, path = tempfile.mkstemp(suffix=suffix)
+     os.close(fd)
+     with open(path, "wb") as out_file:
+         shutil.copyfileobj(upload.file, out_file)
+     return path
+
+ @app.get("/list_collection_files", response_model=ListResponse)
+ async def list_collection_files_endpoint(
+     collection_name: str = "programming"
+ ):
+     global rag
+     if rag is None:
+         raise HTTPException(status_code=503, detail="RAGMCQ not ready on server.")
+
+     files = rag.list_files_in_collection(collection_name)
+
+     return {"files": files}
+
+ @app.post("/generate_saved", response_model=GenerateResponse)
+ async def generate_saved_endpoint(
+     n_questions: int = Form(10),
+     qdrant_filename: str = Form("default_filename"),
+     collection_name: str = Form("programming"),
+     mode: str = Form("rag"),
+     questions_per_chunk: int = Form(3),
+     top_k: int = Form(3),
+     temperature: float = Form(0.2),
+     validate: bool = Form(False),
+     use_model_verification: bool = Form(False)
+ ):
+     global rag
+     if rag is None:
+         raise HTTPException(status_code=503, detail="RAGMCQ not ready on server.")
+
+     try:
+         mcqs = rag.generate_from_qdrant(
+             filename=qdrant_filename,
+             collection=collection_name,
+             n_questions=n_questions,
+             mode=mode,
+             questions_per_chunk=questions_per_chunk,
+             top_k=top_k,
+             temperature=temperature
+         )
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Generation from saved file failed: {e}")
+
+     validation_report = None
+
+     if validate:
+         try:
+             # validate_mcqs expects keys as strings and the normalized content
+             validation_report = rag.validate_mcqs(mcqs, top_k=top_k, use_model_verification=use_model_verification)
+         except Exception as e:
+             # don't fail the whole request for a validation error — return generator output and note the error
+             validation_report = {"error": f"Validation failed: {e}"}
+
+     return {"mcqs": mcqs, "validation": validation_report}
+
+ @app.post("/generate", response_model=GenerateResponse)
+ async def generate_endpoint(
+     background_tasks: BackgroundTasks,
+     file: UploadFile = File(...),
+     n_questions: int = Form(10),
+     qdrant_filename: str = Form("default_filename"),
+     collection_name: str = Form("programming"),
+     mode: str = Form("rag"),
+     questions_per_page: int = Form(3),
+     top_k: int = Form(3),
+     temperature: float = Form(0.2),
+     validate: bool = Form(False),
+     use_model_verification: bool = Form(False)
+ ):
+     global rag
+     if rag is None:
+         raise HTTPException(status_code=503, detail="RAGMCQ not ready on server.")
+
+     # basic file validation
+     if not file.filename.lower().endswith(".pdf"):
+         raise HTTPException(status_code=400, detail="Only PDF files are supported.")
+
+     # save uploaded file to a temp location
+     tmp_path = _save_upload_to_temp(file)
+
+     # ensure file removed afterward
+     def _cleanup(path: str):
+         try:
+             os.remove(path)
+         except Exception:
+             pass
+
+     background_tasks.add_task(_cleanup, tmp_path)
+
+     # save pdf
+     try:
+         rag.save_pdf_to_qdrant(tmp_path, filename=qdrant_filename, collection=collection_name, overwrite=True)
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Could not save file to Qdrant Cloud: {e}")
+
+     # generate
+     try:
+         mcqs = rag.generate_from_pdf(
+             tmp_path,
+             n_questions=n_questions,
+             mode=mode,
+             questions_per_page=questions_per_page,
+             top_k=top_k,
+             temperature=temperature,
+         )
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Generation failed: {e}")
+
+     validation_report = None
+
+     if validate:
+         try:
+             # rag.build_index_from_pdf(tmp_path)
+             # validate_mcqs expects keys as strings and the normalized content
+             validation_report = rag.validate_mcqs(mcqs, top_k=top_k, use_model_verification=use_model_verification)
+         except Exception as e:
+             # don't fail the whole request for a validation error — return generator output and note the error
+             validation_report = {"error": f"Validation failed: {e}"}
+
+     return {"mcqs": mcqs, "validation": validation_report}
+
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run("app:app", host="0.0.0.0", port=8000, log_level="info")
app/app.py ADDED
@@ -0,0 +1,176 @@
+ import os
+ import shutil
+ import tempfile
+ from typing import Optional
+
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException, BackgroundTasks
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+
+ # Import the user's RAGMCQ implementation
+ from generator import RAGMCQ
+
+ app = FastAPI(title="RAG MCQ Generator API")
+
+ # allow cross-origin requests (adjust in production)
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # global rag instance
+ rag: Optional[RAGMCQ] = None
+
+ class GenerateResponse(BaseModel):
+     mcqs: dict
+     validation: Optional[dict] = None
+
+ class ListResponse(BaseModel):
+     files: list
+
+ @app.on_event("startup")
+ def startup_event():
+     global rag
+
+     # instantiate the heavy object once
+     rag = RAGMCQ()
+     print("RAGMCQ instance created on startup.")
+
+ @app.get("/health")
+ def health():
+     return {"status": "ok", "ready": rag is not None}
+
+ def _save_upload_to_temp(upload: UploadFile) -> str:
+     suffix = ".pdf"
+     fd, path = tempfile.mkstemp(suffix=suffix)
+     os.close(fd)
+     with open(path, "wb") as out_file:
+         shutil.copyfileobj(upload.file, out_file)
+     return path
+
+ @app.get("/list_collection_files", response_model=ListResponse)
+ async def list_collection_files_endpoint(
+     collection_name: str = "programming"
+ ):
+     global rag
+     if rag is None:
+         raise HTTPException(status_code=503, detail="RAGMCQ not ready on server.")
+
+     files = rag.list_files_in_collection(collection_name)
+
+     return {"files": files}
+
+ @app.post("/generate_saved", response_model=GenerateResponse)
+ async def generate_saved_endpoint(
+     n_questions: int = Form(10),
+     qdrant_filename: str = Form("default_filename"),
+     collection_name: str = Form("programming"),
+     mode: str = Form("rag"),
+     questions_per_chunk: int = Form(3),
+     top_k: int = Form(3),
+     temperature: float = Form(0.2),
+     validate: bool = Form(False),
+     use_model_verification: bool = Form(False)
+ ):
+     global rag
+     if rag is None:
+         raise HTTPException(status_code=503, detail="RAGMCQ not ready on server.")
+
+     try:
+         mcqs = rag.generate_from_qdrant(
+             filename=qdrant_filename,
+             collection=collection_name,
+             n_questions=n_questions,
+             mode=mode,
+             questions_per_chunk=questions_per_chunk,
+             top_k=top_k,
+             temperature=temperature
+         )
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Generation from saved file failed: {e}")
+
+     validation_report = None
+
+     if validate:
+         try:
+             # validate_mcqs expects keys as strings and the normalized content
+             validation_report = rag.validate_mcqs(mcqs, top_k=top_k, use_model_verification=use_model_verification)
+         except Exception as e:
+             # don't fail the whole request for a validation error — return generator output and note the error
+             validation_report = {"error": f"Validation failed: {e}"}
+
+     return {"mcqs": mcqs, "validation": validation_report}
+
+ @app.post("/generate", response_model=GenerateResponse)
+ async def generate_endpoint(
+     background_tasks: BackgroundTasks,
+     file: UploadFile = File(...),
+     n_questions: int = Form(10),
+     qdrant_filename: str = Form("default_filename"),
+     collection_name: str = Form("programming"),
+     mode: str = Form("rag"),
+     questions_per_page: int = Form(3),
+     top_k: int = Form(3),
+     temperature: float = Form(0.2),
+     validate: bool = Form(False),
+     use_model_verification: bool = Form(False)
+ ):
+     global rag
+     if rag is None:
+         raise HTTPException(status_code=503, detail="RAGMCQ not ready on server.")
+
+     # basic file validation
+     if not file.filename.lower().endswith(".pdf"):
+         raise HTTPException(status_code=400, detail="Only PDF files are supported.")
+
+     # save uploaded file to a temp location
+     tmp_path = _save_upload_to_temp(file)
+
+     # ensure file removed afterward
+     def _cleanup(path: str):
+         try:
+             os.remove(path)
+         except Exception:
+             pass
+
+     background_tasks.add_task(_cleanup, tmp_path)
+
+     # save pdf
+     try:
+         rag.save_pdf_to_qdrant(tmp_path, filename=qdrant_filename, collection=collection_name, overwrite=True)
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Could not save file to Qdrant Cloud: {e}")
+
+     # generate
+     try:
+         mcqs = rag.generate_from_pdf(
+             tmp_path,
+             n_questions=n_questions,
+             mode=mode,
+             questions_per_page=questions_per_page,
+             top_k=top_k,
+             temperature=temperature,
+         )
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Generation failed: {e}")
+
+     validation_report = None
+
+     if validate:
+         try:
+             # rag.build_index_from_pdf(tmp_path)
+             # validate_mcqs expects keys as strings and the normalized content
+             validation_report = rag.validate_mcqs(mcqs, top_k=top_k, use_model_verification=use_model_verification)
+         except Exception as e:
+             # don't fail the whole request for a validation error — return generator output and note the error
+             validation_report = {"error": f"Validation failed: {e}"}
+
+     return {"mcqs": mcqs, "validation": validation_report}
+
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run("app:app", host="0.0.0.0", port=8000, log_level="info")
app/generator.py ADDED
@@ -0,0 +1,695 @@
+ import re
+ import random
+ import numpy as np
+ from typing import List, Tuple, Dict, Any, Optional
+ from sentence_transformers import SentenceTransformer
+ from uuid import uuid4
+ import pymupdf4llm
+ import pymupdf as fitz
+
+ try:
+     from qdrant_client import QdrantClient
+     from qdrant_client.http.models import (
+         PointStruct,
+         Filter,
+         FieldCondition,
+         MatchValue,
+         Distance,
+         VectorParams,
+     )
+     from qdrant_client.http import models as rest
+     _HAS_QDRANT = True
+ except Exception:
+     _HAS_QDRANT = False
+
+ try:
+     import faiss
+     _HAS_FAISS = True
+ except Exception:
+     _HAS_FAISS = False
+
+ from utils import generate_mcqs_from_text, _post_chat, _safe_extract_json
+
+ class RAGMCQ:
+     def __init__(
+         self,
+         embedder_model: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+         hf_model: str = "gpt-oss-120b",
+         qdrant_url: str = None,
+         qdrant_api_key: str = None,
+         qdrant_prefer_grpc: bool = False,
+     ):
+         self.embedder = SentenceTransformer(embedder_model)
+         self.hf_model = hf_model
+         self.embeddings = None  # np.array of shape (N, D)
+         self.texts = []  # list of chunk texts
+         self.metadata = []  # list of dicts (page, chunk_id, char_range)
+         self.index = None
+         self.dim = self.embedder.get_sentence_embedding_dimension()
+
+         self.qdrant = None
+         self.qdrant_url = qdrant_url
+         self.qdrant_api_key = qdrant_api_key
+         self.qdrant_prefer_grpc = qdrant_prefer_grpc
+         if qdrant_url:
+             self.connect_qdrant(qdrant_url, qdrant_api_key, qdrant_prefer_grpc)
+
+     def extract_pages(
+         self,
+         pdf_path: str,
+         *,
+         pages: Optional[List[int]] = None,
+         ignore_images: bool = False,
+         dpi: int = 150
+     ) -> List[str]:
+         doc = fitz.open(pdf_path)
+         try:
+             # request page-wise output (page_chunks=True -> list[dict] per page)
+             page_dicts = pymupdf4llm.to_markdown(
+                 doc,
+                 pages=pages,
+                 ignore_images=ignore_images,
+                 dpi=dpi,
+                 page_chunks=True,
+             )
+
+             # to_markdown(..., page_chunks=True) returns a list of dicts, each has key "text" (markdown)
+             pages_md: List[str] = []
+             for p in page_dicts:
+                 txt = p.get("text", "") or ""
+                 pages_md.append(txt.strip())
+
+             return pages_md
+         finally:
+             doc.close()
+
+     def chunk_text(self, text: str, max_chars: int = 1200) -> List[str]:
+         text = text.strip()
+         if not text:
+             return []
+         if len(text) <= max_chars:
+             return [text]
+
+         # split by sentence-like boundaries
+         sentences = re.split(r'(?<=[\.\?\!])\s+', text)
+         chunks = []
+         cur = ""
+         for s in sentences:
+             if len(cur) + len(s) + 1 <= max_chars:
+                 cur += (" " if cur else "") + s
+             else:
+                 if cur:
+                     chunks.append(cur)
+                 cur = s
+         if cur:
+             chunks.append(cur)
+
+         # if still too long, hard-split
+         final = []
+         for c in chunks:
+             if len(c) <= max_chars:
+                 final.append(c)
+             else:
+                 for i in range(0, len(c), max_chars):
+                     final.append(c[i:i+max_chars])
+         return final
+
+     def build_index_from_pdf(self, pdf_path: str, max_chars: int = 1200):
+         pages = self.extract_pages(pdf_path)
+         self.texts = []
+         self.metadata = []
+
+         for p_idx, page_text in enumerate(pages, start=1):
+             chunks = self.chunk_text(page_text or "", max_chars=max_chars)
+             for cid, ch in enumerate(chunks, start=1):
+                 self.texts.append(ch)
+                 self.metadata.append({"page": p_idx, "chunk_id": cid, "length": len(ch)})
+
+         if not self.texts:
+             raise RuntimeError("No text extracted from PDF.")
+
+         # compute embeddings
+         emb = self.embedder.encode(self.texts, convert_to_numpy=True, show_progress_bar=True)
+         self.embeddings = emb.astype("float32")
+         self._build_faiss_index()
+
+     def _build_faiss_index(self):
+         if _HAS_FAISS:
+             d = self.embeddings.shape[1]
+             index = faiss.IndexFlatIP(d)  # inner product -> cosine if vectors normalized
+             faiss.normalize_L2(self.embeddings)
+             index.add(self.embeddings)
+             self.index = index
+         else:
+             # store normalized embeddings and use brute-force numpy
+             norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True) + 1e-10
+             self.embeddings = self.embeddings / norms
+             self.index = None
+
+     def _retrieve(self, query: str, top_k: int = 3) -> List[Tuple[int, float]]:
+         q_emb = self.embedder.encode([query], convert_to_numpy=True).astype("float32")
+
+         if _HAS_FAISS:
+             faiss.normalize_L2(q_emb)
+             D_list, I_list = self.index.search(q_emb, top_k)
+             # D are inner products; return list of (idx, score)
+             return [(int(i), float(d)) for i, d in zip(I_list[0], D_list[0]) if i != -1]
+         else:
+             qn = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-10)
+             sims = (self.embeddings @ qn.T).squeeze(axis=1)
+             idxs = np.argsort(-sims)[:top_k]
+             return [(int(i), float(sims[i])) for i in idxs]
+
+     def generate_from_pdf(
+         self,
+         pdf_path: str,
+         n_questions: int = 10,
+         mode: str = "rag",  # per_page or rag
+         questions_per_page: int = 3,  # for per_page mode
+         top_k: int = 3,  # chunks to retrieve for each question in rag mode
+         temperature: float = 0.2,
+     ) -> Dict[str, Any]:
+         # build index
+         self.build_index_from_pdf(pdf_path)
+
+         output: Dict[str, Any] = {}
+         qcount = 0
+
+         if mode == "per_page":
+             # iterate pages -> chunks
+             for idx, meta in enumerate(self.metadata):
+                 chunk_text = self.texts[idx]
+
+                 if not chunk_text.strip():
+                     continue
+                 to_gen = questions_per_page
+
+                 # ask generator
+                 try:
+                     mcq_block = generate_mcqs_from_text(
+                         chunk_text, n=to_gen, model=self.hf_model, temperature=temperature
+                     )
+                 except Exception as e:
+                     # skip this chunk if generator fails
+                     print(f"Generator failed on page {meta['page']} chunk {meta['chunk_id']}: {e}")
+                     continue
+
+                 for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
+                     qcount += 1
+                     output[str(qcount)] = mcq_block[item]
+                     if qcount >= n_questions:
+                         return output
+
+             return output
+
+         elif mode == "rag":
+             # strategy: create a few natural short queries by sampling sentences or using chunk summaries.
+             # create queries by sampling chunk text sentences.
+             # stop when n_questions reached or max_attempts exceeded.
+             attempts = 0
+             max_attempts = n_questions * 4
+
+             while qcount < n_questions and attempts < max_attempts:
+                 attempts += 1
+                 # create a seed query: pick a random chunk, pick a sentence from it
+                 seed_idx = random.randrange(len(self.texts))
+                 chunk = self.texts[seed_idx]
+                 sents = re.split(r'(?<=[\.\?\!])\s+', chunk)
+                 # guard against chunks with no sentence longer than 20 chars
+                 candidates = [s for s in sents if len(s.strip()) > 20]
+                 seed_sent = random.choice(candidates) if candidates else chunk[:200]
+                 query = f"Create questions about: {seed_sent}"
+
+                 # retrieve top_k chunks
+                 retrieved = self._retrieve(query, top_k=top_k)
+                 context_parts = []
+                 for ridx, score in retrieved:
+                     md = self.metadata[ridx]
+                     context_parts.append(f"[page {md['page']}] {self.texts[ridx]}")
+                 context = "\n\n".join(context_parts)
+
+                 # call generator for 1 question (or small batch) with the retrieved context
+                 try:
+                     # request 1 question at a time to keep diversity
+                     mcq_block = generate_mcqs_from_text(
+                         context, n=1, model=self.hf_model, temperature=temperature
+                     )
+                 except Exception as e:
+                     print(f"Generator failed during RAG attempt {attempts}: {e}")
+                     continue
+
+                 # append result(s)
+                 for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
+                     qcount += 1
+                     output[str(qcount)] = mcq_block[item]
+                     if qcount >= n_questions:
+                         return output
+
+             return output
+         else:
+             raise ValueError("mode must be 'per_page' or 'rag'.")
+
+     def validate_mcqs(
+         self,
+         mcqs: Dict[str, Any],
+         top_k: int = 4,
+         similarity_threshold: float = 0.5,
+         evidence_score_cutoff: float = 0.5,
+         use_model_verification: bool = True,
+         model_verification_temperature: float = 0.0,
+     ) -> Dict[str, Any]:
+         if self.embeddings is None or not self.texts:
+             raise RuntimeError("Index/embeddings not built. Run build_index_from_pdf() first.")
+
+         report: Dict[str, Any] = {}
+
+         # helper: semantic similarity search on statement -> returns list of (idx, score)
+         def semantic_search(statement: str, k: int = top_k):
+             q_emb = self.embedder.encode([statement], convert_to_numpy=True).astype("float32")
+
+             if _HAS_FAISS:
+                 faiss.normalize_L2(q_emb)
+                 D_list, I_list = self.index.search(q_emb, k)
+                 # D are inner products; return list of (idx, score)
+                 return [(int(i), float(d)) for i, d in zip(I_list[0], D_list[0]) if i != -1]
+             else:
+                 qn = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-10)
+                 sims = (self.embeddings @ qn.T).squeeze(axis=1)
+                 idxs = np.argsort(-sims)[:k]
+                 return [(int(i), float(sims[i])) for i in idxs]
+
+         # helper: verify with model (strict JSON in response)
+         def _verify_with_model(question_text: str, options: Dict[str, str], correct_text: str, context_text: str):
+             system = {
+                 "role": "system",
+                 "content": (
+                     "Bạn là một trợ lý đánh giá tính thực chứng của câu hỏi trắc nghiệm dựa trên đoạn văn được cung cấp. "
+                     "Hãy trả lời DUY NHẤT bằng JSON hợp lệ (không có văn bản khác) theo schema:\n\n"
+                     "{\n"
+                     '  "supported": true/false,  # câu trả lời đúng có được nội dung chứng thực không\n'
+                     '  "confidence": 0.0-1.0,  # mức độ tự tin (số)\n'
+                     '  "evidence": "cụm văn bản ngắn làm bằng chứng hoặc trích dẫn",\n'
+                     '  "reason": "ngắn gọn, vì sao supported hoặc không"\n'
+                     "}\n\n"
+                     "Luôn dựa chỉ trên nội dung trong trường 'Context' dưới đây. Nếu nội dung không chứa bằng chứng, trả về supported: false."
+                 )
+             }
+             user = {
+                 "role": "user",
+                 "content": (
+                     "Câu hỏi:\n" + question_text + "\n\n"
+                     "Lựa chọn:\n" + "\n".join([f"{k}: {v}" for k, v in options.items()]) + "\n\n"
+                     "Đáp án:\n" + correct_text + "\n\n"
+                     "Context:\n" + context_text + "\n\n"
+                     "Hãy trả lời như yêu cầu."
+                 )
+             }
+
+             raw = _post_chat([system, user], model=self.hf_model, temperature=model_verification_temperature)
+
+             # parse JSON object in response
+             try:
+                 parsed = _safe_extract_json(raw)
+             except Exception as e:
+                 return {"error": f"Model verification failed to return JSON: {e}", "raw": raw}
+             return parsed
+
+         # iterate MCQs
+         for qid, item in mcqs.items():
+             q_text = item.get("câu hỏi", "").strip()
+             options = item.get("lựa chọn", {})
+             correct_text = item.get("đáp án", "").strip()
+
+             # form a short declarative statement to embed: "Question: ... Answer: <correct>"
+             statement = f"{q_text} Answer: {correct_text}"
+
+             retrieved = semantic_search(statement, k=top_k)
+             evidence_list = []
+             max_sim = 0.0
+             for idx, score in retrieved:
+                 if score >= evidence_score_cutoff:
+                     evidence_list.append({
+                         "idx": idx,
+                         "page": self.metadata[idx].get("page", None),
+                         "score": float(score),
+                         "text": (self.texts[idx][:1000] + ("..." if len(self.texts[idx]) > 1000 else "")),
+                     })
+
+                 if score > max_sim:
+                     max_sim = float(score)
+
+             supported_by_embeddings = max_sim >= similarity_threshold
+
+             model_verdict = None
+             if use_model_verification:
+                 # build a context string from top retrieved chunks (regardless of cutoff)
+                 context_parts = []
+                 for ridx, sc in retrieved:
+                     md = self.metadata[ridx]
+                     context_parts.append(f"[page {md.get('page')}] {self.texts[ridx]}")
+                 context_text = "\n\n".join(context_parts)
+
+                 try:
+                     parsed = _verify_with_model(q_text, options, correct_text, context_text)
+                     model_verdict = parsed
+                 except Exception as e:
+                     model_verdict = {"error": f"verification exception: {e}"}
+
+             report[qid] = {
+                 "supported_by_embeddings": bool(supported_by_embeddings),
+                 "max_similarity": float(max_sim),
+                 "evidence": evidence_list,
+                 "model_verdict": model_verdict,
+             }
+
+         return report
+
+     def connect_qdrant(self, url: str, api_key: str = None, prefer_grpc: bool = False):
+         if not _HAS_QDRANT:
+             raise RuntimeError("qdrant-client is not installed. Install with `pip install qdrant-client`.")
+         self.qdrant_url = url
+         self.qdrant_api_key = api_key
+         self.qdrant_prefer_grpc = prefer_grpc
+         # Create client
+         self.qdrant = QdrantClient(url=url, api_key=api_key, prefer_grpc=prefer_grpc)
+
+     def _ensure_collection(self, collection_name: str):
+         if self.qdrant is None:
+             raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
+         try:
+             # get_collection will raise if not present
+             _ = self.qdrant.get_collection(collection_name)
+         except Exception:
+             # create collection with vector size = self.dim
+             vect_params = VectorParams(size=self.dim, distance=Distance.COSINE)
+             self.qdrant.recreate_collection(collection_name=collection_name, vectors_config=vect_params)
+             # recreate_collection ensures a clean collection; if you prefer to avoid wiping, use create_collection instead.
+
+     def save_pdf_to_qdrant(
+         self,
+         pdf_path: str,
+         filename: str,
+         collection: str,
+         max_chars: int = 1200,
+         batch_size: int = 64,
+         overwrite: bool = False,
+     ):
+         if self.qdrant is None:
+             raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
+
+         # extract pages and chunks (re-using the existing helpers)
+         pages = self.extract_pages(pdf_path)
+         all_chunks = []
+         all_meta = []
+         for p_idx, page_text in enumerate(pages, start=1):
+             chunks = self.chunk_text(page_text or "", max_chars=max_chars)
+             for cid, ch in enumerate(chunks, start=1):
+                 all_chunks.append(ch)
+                 all_meta.append({"page": p_idx, "chunk_id": cid, "length": len(ch)})
+
+         if not all_chunks:
+             raise RuntimeError("No text extracted from PDF.")
+
+         # ensure collection exists
+         self._ensure_collection(collection)
+
+         # optional: delete previous points for this filename if overwrite
+         if overwrite:
+             # delete by filter: filename == filename
+             flt = Filter(must=[FieldCondition(key="filename", match=MatchValue(value=filename))])
+             try:
+                 # qdrant-client accepts a Filter as the points selector
+                 self.qdrant.delete(collection_name=collection, points_selector=flt)
+             except Exception:
+                 # ignore if deletion fails
+                 pass
+
+         # compute embeddings in batches
+         embeddings = self.embedder.encode(all_chunks, convert_to_numpy=True, show_progress_bar=True)
+         embeddings = embeddings.astype("float32")
+
+         # prepare points
+         points = []
+         for i, (emb, md, txt) in enumerate(zip(embeddings, all_meta, all_chunks)):
+             pid = str(uuid4())
+             source_id = f"{filename}__p{md['page']}__c{md['chunk_id']}"
+             payload = {
+                 "filename": filename,
+                 "page": md["page"],
+                 "chunk_id": md["chunk_id"],
+                 "length": md["length"],
+                 "text": txt,
+                 "source_id": source_id,
+             }
+             points.append(PointStruct(id=pid, vector=emb.tolist(), payload=payload))
+
+             # upsert in batches
+             if len(points) >= batch_size:
+                 self.qdrant.upsert(collection_name=collection, points=points)
+                 points = []
+
+         # upsert remaining
+         if points:
+             self.qdrant.upsert(collection_name=collection, points=points)
+
+         try:
+             self.qdrant.create_payload_index(
+                 collection_name=collection,
+                 field_name="filename",
+                 field_schema=rest.PayloadSchemaType.KEYWORD
+             )
+         except Exception as e:
+             print(f"Index creation skipped or failed: {e}")
+
+         return {"status": "ok", "uploaded_chunks": len(all_chunks), "collection": collection, "filename": filename}
+
+     def list_files_in_collection(
+         self,
+         collection: str,
+         payload_field: str = "filename",
+         batch_size: int = 500,
+     ) -> List[str]:
+         if self.qdrant is None:
+             raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
+
+         # ensure collection exists
+         try:
+             if not self.qdrant.collection_exists(collection):
+                 raise RuntimeError(f"Collection '{collection}' does not exist.")
+         except Exception:
+             # collection_exists may raise if server unreachable
+             raise
+
+         filenames = set()
+         offset = None
+
+         while True:
+             # scroll returns (points, next_offset)
+             pts, next_offset = self.qdrant.scroll(
+                 collection_name=collection,
+                 limit=batch_size,
+                 offset=offset,
+                 with_payload=[payload_field],
+                 with_vectors=False,
+             )
+
+             if not pts:
+                 break
+
+             for p in pts:
+                 # p may be a dict-like or an object with .payload
+                 payload = None
+                 if hasattr(p, "payload"):
+                     payload = p.payload
+                 elif isinstance(p, dict):
+                     # older/newer variants might use nested structures: try common keys
+                     payload = p.get("payload") or p
+                 else:
+                     # best-effort fallback: convert to dict if possible
+                     try:
+                         payload = dict(p)
+                     except Exception:
+                         payload = None
+
+                 if not payload:
+                     continue
+
+                 # extract candidate value(s)
+                 val = None
+                 if isinstance(payload, dict):
+                     val = payload.get(payload_field)
+                 else:
+                     # Some payload representations store fields differently; try attribute access
+                     val = getattr(payload, payload_field, None)
+
+                 # If value is list-like, iterate, else add single
+                 if isinstance(val, (list, tuple, set)):
+                     for v in val:
+                         if v is not None:
+                             filenames.add(str(v))
+                 elif val is not None:
+                     filenames.add(str(val))
+
+             # stop if no more pages
+             if not next_offset:
+                 break
+             offset = next_offset
+
+         return sorted(filenames)
+
+     def list_chunks_for_filename(self, collection: str, filename: str, batch: int = 256) -> List[Dict[str, Any]]:
+         if self.qdrant is None:
+             raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
+
+         results = []
+         offset = None
+         while True:
+             # scroll returns (points, next_offset)
+             points, next_offset = self.qdrant.scroll(
+                 collection_name=collection,
+                 scroll_filter=Filter(
+                     must=[
+                         FieldCondition(key="filename", match=MatchValue(value=filename))
+                     ]
+                 ),
+                 limit=batch,
+                 offset=offset,
+                 with_payload=True,
+                 with_vectors=False,
+             )
+             # points are objects (Record / ScoredPoint-like); get id and payload
+             for p in points:
+                 # p.payload is a dict, p.id is point id
+                 results.append({"point_id": p.id, "payload": p.payload})
+             if not next_offset:
+                 break
+             offset = next_offset
+         return results
+
+     def _retrieve_qdrant(self, query: str, collection: str, filename: str = None, top_k: int = 3) -> List[Tuple[Dict[str, Any], float]]:
+         if self.qdrant is None:
+             raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
+
+         q_emb = self.embedder.encode([query], convert_to_numpy=True).astype("float32")[0].tolist()
+         q_filter = None
+         if filename:
+             q_filter = Filter(must=[FieldCondition(key="filename", match=MatchValue(value=filename))])
+
+         search_res = self.qdrant.search(
+             collection_name=collection,
+             query_vector=q_emb,
+             query_filter=q_filter,
+             limit=top_k,
+             with_payload=True,
+             with_vectors=False,
+         )
+
+         out = []
+         for hit in search_res:
+             # hit.payload is the stored payload, hit.score is similarity
+             out.append((hit.payload, float(getattr(hit, "score", 0.0))))
+         return out
+
+     def generate_from_qdrant(
+         self,
+         filename: str,
+         collection: str,
+         n_questions: int = 10,
+         mode: str = "rag",  # 'per_chunk' or 'rag'
+         questions_per_chunk: int = 3,  # used for 'per_chunk'
+         top_k: int = 3,  # retrieval size used in RAG
+         temperature: float = 0.2,
+     ) -> Dict[str, Any]:
+         if self.qdrant is None:
+             raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
+
+         # get all chunks for this filename (payload should contain 'text', 'page', 'chunk_id', etc.)
+         file_points = self.list_chunks_for_filename(collection=collection, filename=filename)
+         if not file_points:
+             raise RuntimeError(f"No chunks found for filename={filename} in collection={collection}.")
+
+         # create a local list of texts & metadata for sampling
+         texts = []
+         metas = []
+         for p in file_points:
+             payload = p.get("payload", {})
+             text = payload.get("text", "")
+             texts.append(text)
+             metas.append(payload)
+
+         self.texts = texts
+         self.metadata = metas
+         embeddings = self.embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)
+         if embeddings is None or len(embeddings) == 0:
+             self.embeddings = None
+             self.index = None
+         else:
+             self.embeddings = embeddings.astype("float32")
+
+             # update dim in case embedder changed unexpectedly
+             self.dim = int(self.embeddings.shape[1])
+
+             # build index
+             self._build_faiss_index()
+
+         output = {}
+         qcount = 0
+
+         if mode == "per_chunk":
+             # iterate all chunks (in payload order) and request questions_per_chunk from each
+             for i, txt in enumerate(texts):
+                 if not txt.strip():
+                     continue
+                 to_gen = questions_per_chunk
+                 try:
+                     mcq_block = generate_mcqs_from_text(txt, n=to_gen, model=self.hf_model, temperature=temperature)
+                 except Exception as e:
+                     print(f"Generator failed on chunk (index {i}): {e}")
+                     continue
+                 for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
+                     qcount += 1
+                     output[str(qcount)] = mcq_block[item]
+                     if qcount >= n_questions:
+                         return output
+             return output
+
+         elif mode == "rag":
+             attempts = 0
+             max_attempts = n_questions * 4
+             while qcount < n_questions and attempts < max_attempts:
+                 attempts += 1
+                 # sample a seed sentence from a random chunk of this file
+                 seed_idx = random.randrange(len(texts))
+                 chunk = texts[seed_idx]
+                 sents = re.split(r'(?<=[\.\?\!])\s+', chunk)
+                 seed_sent = None
+                 for s in sents:
+                     if len(s.strip()) > 20:
+                         seed_sent = s
+                         break
+                 if not seed_sent:
+                     seed_sent = chunk[:200]
+                 query = f"Create questions about: {seed_sent}"
+
+                 # retrieve top_k chunks from the same file (restricted by filename filter)
+                 retrieved = self._retrieve_qdrant(query=query, collection=collection, filename=filename, top_k=top_k)
+                 context_parts = []
+                 for payload, score in retrieved:
+                     # payload should contain page & chunk_id and text
+                     page = payload.get("page", "?")
+                     ctxt = payload.get("text", "")
+                     context_parts.append(f"[page {page}] {ctxt}")
+                 context = "\n\n".join(context_parts)
+
+                 try:
+                     mcq_block = generate_mcqs_from_text(context, n=1, model=self.hf_model, temperature=temperature)
+                 except Exception as e:
+                     print(f"Generator failed during RAG attempt {attempts}: {e}")
+                     continue
+
+                 for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
+                     qcount += 1
+                     output[str(qcount)] = mcq_block[item]
+                     if qcount >= n_questions:
+                         return output
+             return output
+         else:
+             raise ValueError("mode must be 'per_chunk' or 'rag'.")
app/output.json ADDED
The diff for this file is too large to render. See raw diff
 
app/software_report_template.md ADDED
@@ -0,0 +1,261 @@
+ # Software Report: RAG-based MCQ Generation System
+
+ ## 1. Overview / Abstract
+ The project provides an API service that ingests a PDF document and automatically generates multiple-choice questions (MCQs) using a Retrieval-Augmented Generation (RAG) pipeline. It exposes a FastAPI endpoint (`/generate`) that orchestrates: PDF text extraction → chunking → embedding + indexing → (mode-dependent) context selection → MCQ generation via an LLM (Together AI chat completion) → optional semantic + model-based validation.
+
+ Core components:
+ - Controller (FastAPI endpoints) – handles HTTP, file upload, response shaping.
+ - Use Case (RAGMCQ class) – encapsulates business logic: indexing, retrieval, generation, validation.
+ - Repositories / Data Stores – implicit: in-memory lists of chunks, embeddings, optional FAISS index.
+
+ ## 2. High-Level Workflow Diagram
+ ### Mermaid Activity Diagram
+ ```mermaid
+ flowchart LR
+     A["Client Uploads PDF -> /generate"] --> B{Mode?}
+     B -->|rag| R1["Extract & Chunk PDF"]
+     B -->|per_page| R1
+     R1 --> R2["SentenceTransformer Embeddings"]
+     R2 --> R3{FAISS Available?}
+     R3 -->|Yes| R4["Build FAISS Index"]
+     R3 -->|No| R5["Normalize Embeddings (NumPy)"]
+     R4 --> R6["Question Generation Loop"]
+     R5 --> R6
+     R6 -->|"rag: sample queries + retrieve top-k"| R7["Assemble Context"]
+     R6 -->|"per_page: iterate chunks"| R7
+     R7 --> G1["Prompt LLM (JSON MCQs)"]
+     G1 --> P1["Parse & Validate JSON shape"]
+     P1 --> C{Need more?}
+     C -->|Yes| R6
+     C -->|No| V{Validation requested?}
+     V -->|Yes| V1["Semantic Evidence Search + (Optional) Model Verification"]
+     V -->|No| OUT["Return MCQs"]
+     V1 --> OUT
+ ```
+
+ ### Alternative PlantUML Activity (Optional)
+ ```plantuml
+ @startuml
+ start
+ :Upload PDF (multipart form);
+ :Select params (mode, n_questions,...);
+ :Extract pages via pymupdf4llm;
+ :Chunk text (sentence pack <= max_chars);
+ :Embed chunks (SentenceTransformer);
+ if (FAISS installed?) then (yes)
+   :Build FAISS IndexFlatIP + L2 normalize;
+ else (no)
+   :Keep normalized NumPy embeddings;
+ endif
+ repeat
+   if (mode == per_page) then (per_page)
+     :Take next chunk;
+   else (rag)
+     :Sample seed sentence;
+     :Encode query & retrieve top-k chunks;
+   endif
+   :Assemble context;
+   :Call Together AI chat completion (prompt -> JSON);
+   :Parse JSON + accumulate MCQs;
+ repeat while (Need more questions?) is (yes)
+ if (validate?) then (yes)
+   :For each Q -> build statement;
+   :Similarity search top_k evidence;
+   if (Insufficient sim & model verify on) then (yes)
+     :Call model for verification JSON;
+   endif
+   :Build validation report;
+ endif
+ :Return response JSON;
+ stop
+ @enduml
+ ```
+
+ ## 3. Repository–Controller–Use Case Abstraction
+ | Layer | Responsibility | In This Project |
+ |-------|---------------|-----------------|
+ | Controller | HTTP I/O, request validation, mapping domain results to API schema | `app.py` endpoints (`/health`, `/generate`) |
+ | Use Case | Orchestrates domain flow, independent of HTTP details | `RAGMCQ` methods: `build_index_from_pdf`, `generate_from_pdf`, `validate_mcqs` |
+ | Repository (implicit) | Data persistence / retrieval | In-memory: `texts`, `metadata`, `embeddings`, FAISS index (no external DB) |
+
+ Data Flow (simplified):
+ Client → Controller (`/generate`) → Use Case (`generate_from_pdf`) → (Extract + Chunk + Embed + Index + Retrieve + Generate) → Controller (normalize/optional validation) → Response
+
+ ## 4. Detailed Pipeline Explanation
+ ### 4.1 PDF Text Extraction & Chunking
+ - The file is saved to a temp path, then PyMuPDF (`pymupdf4llm.to_markdown` with `page_chunks=True`) loads each page.
+ - `extract_pages()` returns a list of raw page strings.
+ - `chunk_text()` packs sentences (regex split on punctuation boundaries) into segments up to `max_chars` (default 1200). If a sentence overflows, the existing chunk is flushed. Residual oversize chunks are hard-split. A minimal usage sketch follows this list.
+ - Metadata collected: page number, chunk id, length.
+
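+ As a quick illustration of the packing behavior (a sketch, not part of the committed code; it assumes the provider key env var is set, since `utils` reads it when `generator` is imported):
+
+ ```python
+ from generator import RAGMCQ
+
+ rag = RAGMCQ()  # loads the SentenceTransformer embedder on construction
+
+ # Three short sentences and a tiny budget, so the packer must split.
+ text = "Python is dynamically typed. It supports multiple paradigms. Lists are mutable."
+ for c in rag.chunk_text(text, max_chars=60):
+     # each chunk ends on a sentence boundary and stays under 60 chars,
+     # unless a single sentence is itself longer (then it is hard-split)
+     print(len(c), repr(c))
+ ```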
+ ### 4.2 Embedding Generation
+ - Model: `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` loaded via `SentenceTransformer`.
+ - Batched encoding of all chunks → NumPy array (float32).
+ - If FAISS is installed: L2-normalize embeddings, create `IndexFlatIP` (inner product ~ cosine after normalization), add embeddings.
+ - Else: store manually normalized embeddings for brute-force cosine similarity via matrix multiply; a standalone sketch follows.
+
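+ The equivalence that makes the fallback work: after L2 normalization, inner product equals cosine similarity, so a plain matrix multiply reproduces what `IndexFlatIP` computes. A standalone sketch (toy data, independent of the class):
+
+ ```python
+ import numpy as np
+
+ def build_search(emb: np.ndarray):
+     """Return a top-k search function over row-normalized embeddings."""
+     emb = emb / (np.linalg.norm(emb, axis=1, keepdims=True) + 1e-10)
+
+     def search(q: np.ndarray, k: int = 3):
+         q = q / (np.linalg.norm(q) + 1e-10)
+         sims = emb @ q  # inner product == cosine on unit vectors
+         idxs = np.argsort(-sims)[:k]
+         return [(int(i), float(sims[i])) for i in idxs]
+
+     return search
+
+ # toy corpus: 4 vectors in 5 dimensions
+ rng = np.random.default_rng(0)
+ search = build_search(rng.normal(size=(4, 5)).astype("float32"))
+ print(search(rng.normal(size=5).astype("float32"), k=2))
+ ```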
+ ### 4.3 Retrieval Strategy
+ Two modes:
+ 1. `per_page`: Sequentially process each chunk; each call to the LLM asks for `questions_per_page` new MCQs until the target `n_questions` is reached.
+ 2. `rag`: The loop builds a synthetic query by sampling a random chunk and a sentence from it (see the sketch after this section). Retrieval:
+    - Encode the query → similarity search (FAISS or NumPy).
+    - Take the top-k chunk texts; join them with page tags as context.
+    - Request 1 question per iteration (promotes diversity). Up to `max_attempts = n_questions * 4`.
+
+ Similarity Metric: Inner product on normalized vectors (equivalent to cosine), sorted by descending similarity.
+
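+ The seed-query construction in rag mode is the part that is easiest to misread in prose; this hypothetical standalone variant of the sampling step shows the whole idea:
+
+ ```python
+ import random
+ import re
+
+ def make_seed_query(chunks: list) -> str:
+     """Sample one chunk, then one reasonably long sentence from it."""
+     chunk = random.choice(chunks)
+     sents = re.split(r'(?<=[\.\?\!])\s+', chunk)
+     candidates = [s for s in sents if len(s.strip()) > 20]
+     seed = random.choice(candidates) if candidates else chunk[:200]
+     return f"Create questions about: {seed}"
+
+ chunks = [
+     "Stacks are LIFO structures. Queues are FIFO structures.",
+     "A binary search tree keeps keys ordered for O(log n) average lookup.",
+ ]
+ print(make_seed_query(chunks))  # this string feeds the similarity search as the query
+ ```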
+ ### 4.4 Question Generation Prompt Template
+ Implemented in `generate_mcqs_from_text` (utils):
+ - System message (Vietnamese) forcing a strict JSON schema:
+ ```json
+ {
+   "1": { "câu hỏi": "...", "lựa chọn": {"a":"...","b":"...","c":"...","d":"..."}, "đáp án":"..."},
+   "2": { ... }
+ }
+ ```
+ - Constraints: exactly `n` entries; the answer must be full text identical to one option; no explanations.
+ - User message: instructs generation from the provided source text only.
+ - Post-processing: a regex extracts the first JSON object; attempts `json.loads`; a fallback removes trailing commas. A structural check like the sketch below can catch the remaining failure modes.
+
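+ Because the prompt can only ask for this shape, a defensive structural check on the parsed block is cheap. A sketch (the repo does not ship this helper):
+
+ ```python
+ def check_mcq_block(block: dict, n: int) -> list:
+     """Return a list of structural problems; an empty list means the block looks sane."""
+     problems = []
+     if set(block.keys()) != {str(i) for i in range(1, n + 1)}:
+         problems.append(f"expected keys 1..{n}, got {sorted(block.keys())}")
+     for qid, item in block.items():
+         opts = item.get("lựa chọn", {})
+         if set(opts.keys()) != {"a", "b", "c", "d"}:
+             problems.append(f"Q{qid}: options must be exactly a/b/c/d")
+         if item.get("đáp án") not in opts.values():
+             problems.append(f"Q{qid}: answer text does not match any option")
+     return problems
+ ```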
+ ### 4.5 Validation (Optional)
+ For each MCQ (after normalization in the controller):
+ 1. Construct a statement: `Question + Answer`.
+ 2. Embed the statement → retrieve top_k evidence chunks.
+ 3. Mark `supported_by_embeddings` if max similarity ≥ threshold.
+ 4. If model verification is enabled, call the verification LLM prompt (also JSON-only) to assess `supported`, `confidence`, `evidence`, `reason`.
+
+ ### 4.6 Together AI Integration
+ - Endpoint: `https://api.together.xyz/v1/chat/completions`.
+ - The Authorization header uses the `TOGETHER_KEY` environment variable.
+ - Payload: `{ model, messages, temperature }`.
+ - Response handling: support both OpenAI-like `choices[0].message.content` and fallback `choices[0].text`. A minimal call is sketched below.
+
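+ End to end, a call through the repo's own helper looks like this (a sketch; it requires the configured provider's key env var and network access, and `"gpt-oss-120b"` is simply the repo's default model name):
+
+ ```python
+ from utils import _post_chat
+
+ messages = [
+     {"role": "system", "content": "Reply with one word."},
+     {"role": "user", "content": "Ping?"},
+ ]
+ # _post_chat prefers choices[0].message.content and falls back to choices[0].text
+ print(_post_chat(messages, model="gpt-oss-120b", temperature=0.0))
+ ```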
+ ## 5. API Endpoints
+ ### 5.1 Health Check
+ GET `/health`
+ Response:
+ ```json
+ { "status": "ok", "ready": true }
+ ```
+
+ ### 5.2 Generate MCQs
+ POST `/generate` (multipart/form-data)
+ Fields:
+ - `file` (PDF) – required
+ - `n_questions` (int, default 10)
+ - `qdrant_filename` (string, default "default_filename") – name under which chunks are stored in Qdrant
+ - `collection_name` (string, default "programming")
+ - `mode` ("rag" | "per_page", default "rag")
+ - `questions_per_page` (int, default 3) – used only in per_page mode
+ - `top_k` (int, default 3) – retrieval depth (rag & validation)
+ - `temperature` (float, default 0.2)
+ - `validate` (bool, default false)
+ - `use_model_verification` (bool, default false) – secondary LLM check during validation
+
+ Example Request (curl):
+ ```bash
+ curl -X POST http://localhost:8000/generate \
+   -F "file=@sample.pdf" \
+   -F "n_questions=5" \
+   -F "mode=rag" \
+   -F "top_k=3" \
+   -F "validate=true"
+ ```
+
+ Success Response (validation on, abbreviated):
+ ```json
+ {
+   "mcqs": {
+     "1": { "câu hỏi": "...", "lựa chọn": {"a":"...","b":"...","c":"...","d":"..."}, "đáp án": "..."},
+     "2": { "câu hỏi": "...", "lựa chọn": { ... }, "đáp án": "..." }
+   },
+   "validation": {
+     "1": {
+       "supported_by_embeddings": true,
+       "max_similarity": 0.83,
+       "evidence": [ { "page": 2, "score": 0.81, "text": "Excerpt..." } ],
+       "model_verdict": null
+     }
+   }
+ }
+ ```
+
+ Error Examples:
+ - 400: non-PDF upload
+ - 500: generation pipeline error (e.g., empty PDF or model failure)
+ - 503: service not initialized
+
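+ For programmatic use, the same request can be made with `requests` (a sketch; assumes the server from the Quick Start is listening on localhost:8000):
+
+ ```python
+ import requests
+
+ with open("sample.pdf", "rb") as f:
+     resp = requests.post(
+         "http://localhost:8000/generate",
+         files={"file": ("sample.pdf", f, "application/pdf")},
+         data={
+             "n_questions": 5,
+             "mode": "rag",
+             "top_k": 3,
+             "validate": "true",
+         },
+         timeout=600,  # generation can be slow on large PDFs
+     )
+ resp.raise_for_status()
+ print(resp.json()["mcqs"])
+ ```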
+ ## 6. Data Structures & Types (Conceptual)
+ - Chunk: `{ text: str, page: int, chunk_id: int, length: int }`
+ - MCQ (generated raw): `{ "câu hỏi": str, "lựa chọn": {"a": str, ...}, "đáp án": str }`
+ - Normalized MCQ (API shaping): `{ mcq: str, options: { .. }, correct: str }`
+ - Validation Entry: `{ supported_by_embeddings: bool, max_similarity: float, evidence: [ {page, score, text}... ], model_verdict?: {...} }`
+
+ ## 7. Configuration Points
+ | Parameter | Location | Purpose |
+ |-----------|----------|---------|
+ | `embedder_model` | `RAGMCQ.__init__` | Pretrained SentenceTransformer model name |
+ | `hf_model` | `RAGMCQ.__init__` | LLM model name for generation/verification |
+ | `top_k` | API form field & internal methods | Retrieval depth |
+ | `temperature` | API form field | Creativity vs determinism |
+ | `questions_per_page` | API form field | Batch size per chunk in per_page mode |
+
+ ## 8. Simple Code Improvements (Quick Wins)
+ Below are low-risk refactors to make the code cleaner and more maintainable:
+
+ 1. Environment Variable Safety:
+    ```python
+    def _require_env(name: str) -> str:
+        val = os.getenv(name)
+        if not val:
+            raise RuntimeError(f"Missing required environment variable: {name}")
+        return val
+
+    TOGETHER_KEY = _require_env("TOGETHER_KEY")
+    ```
+ 2. Remove Unused Constant: `API_URL` in `utils.py` is unused (can delete to avoid confusion).
+ 3. Unify Header Construction: Replace separate `HEADERS` / `TOGETHER_HEADERS` with a single function `auth_headers(provider)` that returns the correct dict.
+ 4. Add Dataclass for MCQ:
+    ```python
+    from dataclasses import dataclass
+
+    @dataclass
+    class MCQ:
+        question: str
+        options: Dict[str, str]
+        answer: str
+    ```
+    Helps type clarity in validation.
+ 5. Extract Prompt Templates: Store system/user template strings as module-level constants to avoid duplication and ease future edits.
+ 6. Fail-Fast on Empty PDF: Early check after extraction to return a user-friendly error message rather than a generic 500 later.
+ 7. Replace Random Query Sampling Magic Numbers: Expose `max_attempts_factor` as a parameter (currently `n_questions * 4`).
+ 8. Vector Normalization Consistency: Always keep an unnormalized copy if future scoring types are needed; currently normalization overwrites the original when FAISS is absent.
+ 9. Logging Standardization: Replace scattered `print()` with the Python `logging` module (configurable levels; avoids polluting stdout in production).
+ 10. Validation Normalization: Move `_normalize_mcqs` from `app.py` into `RAGMCQ` (keeps domain logic together; controller stays thin).
+ 11. Error Message Specificity: On generation failure, wrap exceptions with context (page/chunk), but avoid leaking internal stack traces to clients; log the full detail internally.
+ 12. Dependency Pinning: Specify versions in `requirements.txt` for reproducibility (e.g., `sentence-transformers==2.2.2`).
+ 13. Add `/models` Endpoint (Optional): Expose available embedder & generation models for UI introspection.
+ 14. Add Basic Tests: e.g., a test for `chunk_text` (ensures boundaries) and the JSON parsing fallback; a starter sketch follows this list.
+ 15. Reusable Retrieval: Expose a public `retrieve(query, top_k)` method to support future features (like user-specified queries) without duplicating private logic.
+
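+ For item 14, the JSON-parsing fallback is the cheapest behavior to pin down first. A starter sketch (assumes pytest; `chunk_text` boundary tests would follow the same pattern):
+
+ ```python
+ # test_utils.py: run with `pytest`
+ import os
+
+ # utils reads the provider key at import time, so stub it for tests
+ os.environ.setdefault("CEREBRAS_API_KEY", "test-key")
+
+ from utils import _safe_extract_json
+
+ def test_extracts_json_from_fenced_output():
+     raw = '```json\n{"1": {"đáp án": "x"}}\n```'
+     assert _safe_extract_json(raw) == {"1": {"đáp án": "x"}}
+
+ def test_trailing_comma_is_repaired():
+     raw = '{"a": 1, "b": 2,}'
+     assert _safe_extract_json(raw) == {"a": 1, "b": 2}
+ ```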
+ ## 9. Potential Medium-Term Enhancements
+ | Area | Improvement |
+ |------|-------------|
+ | Prompt Robustness | Add JSON schema validation (e.g., `jsonschema`) & auto-regeneration for malformed outputs |
+ | Performance | Embed asynchronously / stream generation if the backend supports it |
+ | Multi-Provider | Abstract a provider strategy for HuggingFace, Together, OpenAI with pluggable client classes |
+ | Caching | Cache embeddings per PDF hash to avoid reprocessing identical documents |
+ | Analytics | Track generation latency, validation pass rate, average similarity in structured logs |
+ | i18n | Parameterize language; currently prompts are in Vietnamese only |
+
+ ## 10. Security & Operational Notes
+ - Ensure `TOGETHER_KEY` is not committed; rely on environment variables / secret managers.
+ - Limit PDF size and number of pages to prevent excessive memory or token usage.
+ - Consider sanitizing extracted text (remove personally identifiable info) before sending it to the LLM if sensitive documents are used.
+ - Add request timeout & retry logic for the LLM API (the current single call may raise immediately); a sketch follows.
+
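+ One minimal shape for that retry logic, sketched as a wrapper around the existing `_post_chat` (the backoff parameters are illustrative):
+
+ ```python
+ import time
+
+ from utils import _post_chat
+
+ def post_chat_with_retry(messages, model, retries: int = 3, backoff: float = 2.0) -> str:
+     """Retry transient LLM API failures with exponential backoff."""
+     last_exc = None
+     for attempt in range(retries):
+         try:
+             return _post_chat(messages, model=model)
+         except Exception as exc:  # narrow to requests.RequestException in real code
+             last_exc = exc
+             time.sleep(backoff ** attempt)
+     raise RuntimeError(f"LLM call failed after {retries} attempts") from last_exc
+ ```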
+ ## 11. Quick Start (Local)
+ 1. Set the API key: `setx TOGETHER_KEY "your_api_key"` (then restart the shell).
+ 2. Install dependencies: `pip install -r requirements.txt`.
+ 3. Run the API: `uvicorn app:app --reload`.
+ 4. POST a PDF to `/generate`.
+
+ ## 12. Summary
+ The system cleanly separates HTTP handling from the core RAG pipeline. Text is chunked at sentence boundaries, embedded, indexed (FAISS if available), and retrieved to assemble focused contexts that guide a JSON-constrained MCQ generation prompt. Optional validation uses embedding similarity and secondary model verification to flag unsupported questions. The suggested refactors improve safety, clarity, extensibility, and readiness for multi-provider expansion.
+
+ ---
+ This report delivers architectural insight, workflow diagrams, detailed pipeline mechanics, the API contract, and actionable improvement ideas for rapid comprehension and iteration.
app/utils.py ADDED
@@ -0,0 +1,88 @@
1
+ import re
2
+ import json
3
+ from typing import Dict, Any
4
+ import requests
5
+ import os
6
+
7
+ # TODO: allow choosing a different provider later, plus dynamic routing when a token expires
8
+ API_URL = "https://api.cerebras.ai/v1/chat/completions"
9
+ CEREBRAS_API_KEY = os.environ['CEREBRAS_API_KEY']
10
+
11
+ HEADERS = {"Authorization": f"Bearer {CEREBRAS_API_KEY}"}
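+ # greedy regex: captures the outermost {...} span in the model output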
12
+ JSON_OBJ_RE = re.compile(r"(\{[\s\S]*\})", re.MULTILINE)
13
+
14
+ def _post_chat(messages: list, model: str, temperature: float = 0.2, timeout: int = 60) -> str:
15
+ payload = {"model": model, "messages": messages, "temperature": temperature}
16
+ resp = requests.post(API_URL, headers=HEADERS, json=payload, timeout=timeout)
17
+ resp.raise_for_status()
18
+ data = resp.json()
19
+
20
+ # handle various shapes
21
+ if "choices" in data and len(data["choices"]) > 0:
22
+ # prefer message.content
23
+ ch = data["choices"][0]
24
+
25
+ if isinstance(ch, dict) and "message" in ch and "content" in ch["message"]:
26
+ return ch["message"]["content"]
27
+
28
+ if "text" in ch:
29
+ return ch["text"]
30
+
31
+ # nothing recognizable in the response: surface the payload for debugging
32
+ raise RuntimeError("Unexpected Cerebras response shape: " + json.dumps(data)[:200])
33
+
34
+ def _safe_extract_json(text: str) -> dict:
35
+ # remove triple backticks
36
+ text = re.sub(r"```(?:json)?\n?", "", text)
37
+ m = JSON_OBJ_RE.search(text)
38
+
39
+ if not m:
40
+ raise ValueError("No JSON object found in model output.")
41
+ js = m.group(1)
42
+
43
+ # try load, fix trailing commas
44
+ try:
45
+ return json.loads(js)
46
+ except json.JSONDecodeError:
47
+ fixed = re.sub(r",\s*([}\]])", r"\1", js)
48
+ return json.loads(fixed)
49
+
50
+ def generate_mcqs_from_text(
51
+ source_text: str,
52
+ n: int = 3,
53
+ model: str = "gpt-oss-120b",
54
+ temperature: float = 0.2,
55
+ ) -> Dict[str, Any]:
56
+ system_message = {
57
+ "role": "system",
58
+ "content": (
59
+ "Bạn là một trợ lý hữu ích chuyên tạo câu hỏi trắc nghiệm. "
60
+ "Chỉ TRẢ VỀ duy nhất một đối tượng JSON theo đúng schema sau và không có bất kỳ văn bản nào khác:\n\n"
61
+ "{\n"
62
+ ' "1": { "câu hỏi": "...", "lựa chọn": {"a":"...","b":"...","c":"...","d":"..."}, "đáp án":"..."},\n'
63
+ ' "2": { ... }\n'
64
+ "}\n\n"
65
+ "Lưu ý:\n"
66
+ f"- Tạo đúng {n} mục, đánh YOUR_API_KEYsố từ 1 tới {n}.\n"
67
+ "- Khóa 'lựa chọn' phải có các phím a, b, c, d.\n"
68
+ "- 'đáp án' phải là toàn văn đáp án đúng (không phải ký tự chữ cái), và giá trị này phải khớp chính xác với một trong các giá trị trong 'lựa chọn'.\n"
69
+ "- Không kèm giải thích hay trường thêm.\n"
70
+ "- Các phương án sai (distractors) phải hợp lý và không lặp lại."
71
+ )
72
+ }
73
+ user_message = {
74
+ "role": "user",
75
+ "content": (
76
+ f"Hãy tạo {n} câu hỏi trắc nghiệm từ nội dung dưới đây. Dùng nội dung này làm nguồn duy nhất để trả lời."
77
+ "Nếu nội dung quá ít để tạo câu hỏi chính xác, hãy tạo các phương án hợp lý nhưng có thể biện minh được.\n\n"
78
+ f"Nội dung:\n\n{source_text}"
79
+ )
80
+ }
81
+
82
+ raw = _post_chat([system_message, user_message], model=model, temperature=temperature)
83
+ parsed = _safe_extract_json(raw)
84
+
85
+ # validate structure and length
86
+ if not isinstance(parsed, dict) or len(parsed) != n:
87
+ raise ValueError(f"Generator returned invalid structure. Raw:\n{raw}")
88
+ return parsed
generator.py ADDED
@@ -0,0 +1,696 @@
1
+ import re
2
+ import random
3
+ import numpy as np
4
+ from typing import List, Tuple, Dict, Any, Optional
5
+ from sentence_transformers import SentenceTransformer
6
+ from uuid import uuid4
7
+ import pymupdf4llm
8
+ import pymupdf as fitz
9
+
10
+ try:
11
+ from qdrant_client import QdrantClient
12
+ from qdrant_client.http.models import (
13
+ PointStruct,
14
+ Filter,
15
+ FieldCondition,
16
+ MatchValue,
17
+ Distance,
18
+ VectorParams,
19
+ )
20
+ from qdrant_client.http import models as rest
21
+ _HAS_QDRANT = True
22
+ except Exception:
23
+ _HAS_QDRANT = False
24
+
25
+ try:
26
+ import faiss
27
+ _HAS_FAISS = True
28
+ except Exception:
29
+ _HAS_FAISS = False
30
+
31
+ from utils import generate_mcqs_from_text, _post_chat, _safe_extract_json
32
+
33
+ class RAGMCQ:
34
+ def __init__(
35
+ self,
36
+ embedder_model: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
37
+ hf_model: str = "openai/gpt-oss-120b:cerebras",
38
+ qdrant_url: Optional[str] = None,
39
+ qdrant_api_key: Optional[str] = None,
40
+ qdrant_prefer_grpc: bool = False,
41
+ ):
42
+ self.embedder = SentenceTransformer(embedder_model)
43
+ self.hf_model = hf_model
44
+ self.embeddings = None # np.array of shape (N, D)
45
+ self.texts = [] # list of chunk texts
46
+ self.metadata = [] # list of dicts (page, chunk_id, char_range)
47
+ self.index = None
48
+ self.dim = self.embedder.get_sentence_embedding_dimension()
49
+
50
+ self.qdrant = None
51
+ self.qdrant_url = qdrant_url
52
+ self.qdrant_api_key = qdrant_api_key
53
+ self.qdrant_prefer_grpc = qdrant_prefer_grpc
54
+ if qdrant_url:
55
+ self.connect_qdrant(qdrant_url, qdrant_api_key, qdrant_prefer_grpc)
56
+
57
+ def extract_pages(
58
+ self,
59
+ pdf_path: str,
60
+ *,
61
+ pages: Optional[List[int]] = None,
62
+ ignore_images: bool = False,
63
+ dpi: int = 150
64
+ ) -> List[str]:
65
+ doc = fitz.open(pdf_path)
66
+ try:
67
+ # request page-wise output (page_chunks=True -> list[dict] per page)
68
+ page_dicts = pymupdf4llm.to_markdown(
69
+ doc,
70
+ pages=pages,
71
+ ignore_images=ignore_images,
72
+ dpi=dpi,
73
+ page_chunks=True,
74
+ )
75
+
76
+ # to_markdown(..., page_chunks=True) returns a list of dicts, each has key "text" (markdown)
77
+ pages_md: List[str] = []
78
+ for p in page_dicts:
79
+ txt = p.get("text", "") or ""
80
+ pages_md.append(txt.strip())
81
+
82
+ return pages_md
83
+ finally:
84
+ doc.close()
85
+
86
+ def chunk_text(self, text: str, max_chars: int = 1200, overlap: int = 100) -> List[str]:
87
+ text = text.strip()
88
+ if not text:
89
+ return []
90
+ if len(text) <= max_chars:
91
+ return [text]
92
+
93
+ # split by sentence-like boundaries
94
+ sentences = re.split(r'(?<=[\.\?\!])\s+', text)
95
+ chunks = []
96
+ cur = ""
97
+ for s in sentences:
98
+ if len(cur) + len(s) + 1 <= max_chars:
99
+ cur += (" " if cur else "") + s
100
+ else:
101
+ if cur:
102
+ chunks.append(cur)
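+ # seed the next chunk with the tail of the previous one so context carries across boundaries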
103
+ cur = (cur[-overlap:] + " " + s) if overlap > 0 else s
104
+ if cur:
105
+ chunks.append(cur)
106
+
107
+ # if still too long, hard-split
108
+ final = []
109
+ for c in chunks:
110
+ if len(c) <= max_chars:
111
+ final.append(c)
112
+ else:
113
+ for i in range(0, len(c), max_chars):
114
+ final.append(c[i:i+max_chars])
115
+ return final
116
+
117
+ def build_index_from_pdf(self, pdf_path: str, max_chars: int = 1200):
118
+ pages = self.extract_pages(pdf_path)
119
+ self.texts = []
120
+ self.metadata = []
121
+
122
+ for p_idx, page_text in enumerate(pages, start=1):
123
+ chunks = self.chunk_text(page_text or "", max_chars=max_chars)
124
+ for cid, ch in enumerate(chunks, start=1):
125
+ self.texts.append(ch)
126
+ self.metadata.append({"page": p_idx, "chunk_id": cid, "length": len(ch)})
127
+
128
+ if not self.texts:
129
+ raise RuntimeError("No text extracted from PDF.")
130
+
131
+ # compute embeddings
132
+ emb = self.embedder.encode(self.texts, convert_to_numpy=True, show_progress_bar=True)
133
+ self.embeddings = emb.astype("float32")
134
+ self._build_faiss_index()
135
+
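+ # HNSW parameters: M sets graph connectivity, ef_construction the build-time search
+ # depth; larger values raise recall at the cost of memory and indexing time.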
136
+ def _build_faiss_index(self, ef_construction=200, M=32):
137
+ if _HAS_FAISS:
138
+ d = self.embeddings.shape[1]
139
+ # inner-product metric + L2-normalized vectors => search scores are cosine similarities
+ index = faiss.IndexHNSWFlat(d, M, faiss.METRIC_INNER_PRODUCT)
140
+ index.hnsw.efConstruction = ef_construction # must be set before add()
141
+ faiss.normalize_L2(self.embeddings)
142
+ index.add(self.embeddings)
143
+ self.index = index
144
+ else:
145
+ # store normalized embeddings and use brute-force numpy
146
+ norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True) + 1e-10
147
+ self.embeddings = self.embeddings / norms
148
+ self.index = None
149
+
150
+ def _retrieve(self, query: str, top_k: int = 3) -> List[Tuple[int, float]]:
151
+ q_emb = self.embedder.encode([query], convert_to_numpy=True).astype("float32")
152
+
153
+ if _HAS_FAISS:
154
+ faiss.normalize_L2(q_emb)
155
+ D_list, I_list = self.index.search(q_emb, top_k)
156
+ # D are inner products; return list of (idx, score)
157
+ return [(int(i), float(d)) for i, d in zip(I_list[0], D_list[0]) if i != -1]
158
+ else:
159
+ qn = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-10)
160
+ sims = (self.embeddings @ qn.T).squeeze(axis=1)
161
+ idxs = np.argsort(-sims)[:top_k]
162
+ return [(int(i), float(sims[i])) for i in idxs]
163
+
164
+ def generate_from_pdf(
165
+ self,
166
+ pdf_path: str,
167
+ n_questions: int = 10,
168
+ mode: str = "rag", # per_page or rag
169
+ questions_per_page: int = 3, # for per_page mode
170
+ top_k: int = 3, # chunks to retrieve for each question in rag mode
171
+ temperature: float = 0.2,
172
+ ) -> Dict[str, Any]:
173
+ # build index
174
+ self.build_index_from_pdf(pdf_path)
175
+
176
+ output: Dict[str, Any] = {}
177
+ qcount = 0
178
+
179
+ if mode == "per_page":
180
+ # iterate pages -> chunks
181
+ for idx, meta in enumerate(self.metadata):
182
+ chunk_text = self.texts[idx]
183
+
184
+ if not chunk_text.strip():
185
+ continue
186
+ to_gen = questions_per_page
187
+
188
+ # ask generator
189
+ try:
190
+ mcq_block = generate_mcqs_from_text(
191
+ chunk_text, n=to_gen, model=self.hf_model, temperature=temperature
192
+ )
193
+ except Exception as e:
194
+ # skip this chunk if generator fails
195
+ print(f"Generator failed on page {meta['page']} chunk {meta['chunk_id']}: {e}")
196
+ continue
197
+
198
+ for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
199
+ qcount += 1
200
+ output[str(qcount)] = mcq_block[item]
201
+ if qcount >= n_questions:
202
+ return output
203
+
204
+ return output
205
+
206
+ elif mode == "rag":
207
+ # strategy: create a few natural short queries by sampling sentences or using chunk summaries.
208
+ # create queries by sampling chunk text sentences.
209
+ # stop when n_questions reached or max_attempts exceeded.
210
+ attempts = 0
211
+ max_attempts = n_questions * 4
212
+
213
+ while qcount < n_questions and attempts < max_attempts:
214
+ attempts += 1
215
+ # create a seed query: pick a random chunk, pick a sentence from it
216
+ seed_idx = random.randrange(len(self.texts))
217
+ chunk = self.texts[seed_idx]
218
+ sents = re.split(r'(?<=[\.\?\!])\s+', chunk)
219
+ long_sents = [s for s in sents if len(s.strip()) > 20]
+ seed_sent = random.choice(long_sents) if long_sents else chunk[:200]
220
+ query = f"Create questions about: {seed_sent}"
221
+
222
+ # retrieve top_k chunks
223
+ retrieved = self._retrieve(query, top_k=top_k)
224
+ context_parts = []
225
+ for ridx, score in retrieved:
226
+ md = self.metadata[ridx]
227
+ context_parts.append(f"[page {md['page']}] {self.texts[ridx]}")
228
+ context = "\n\n".join(context_parts)
229
+
230
+ # call generator for 1 question (or small batch) with the retrieved context
231
+ try:
232
+ # request 1 question at a time to keep diversity
233
+ mcq_block = generate_mcqs_from_text(
234
+ context, n=1, model=self.hf_model, temperature=temperature
235
+ )
236
+ except Exception as e:
237
+ print(f"Generator failed during RAG attempt {attempts}: {e}")
238
+ continue
239
+
240
+ # append result(s)
241
+ for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
242
+ qcount += 1
243
+ output[str(qcount)] = mcq_block[item]
244
+ if qcount >= n_questions:
245
+ return output
246
+
247
+ return output
248
+ else:
249
+ raise ValueError("mode must be 'per_page' or 'rag'.")
250
+
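+ # Validation combines two signals: embedding similarity between "question + answer"
+ # and the indexed chunks, plus an optional LLM verdict constrained to strict JSON.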
251
+ def validate_mcqs(
252
+ self,
253
+ mcqs: Dict[str, Any],
254
+ top_k: int = 4,
255
+ similarity_threshold: float = 0.5,
256
+ evidence_score_cutoff: float = 0.5,
257
+ use_model_verification: bool = True,
258
+ model_verification_temperature: float = 0.0,
259
+ ) -> Dict[str, Any]:
260
+ if self.embeddings is None or not self.texts:
261
+ raise RuntimeError("Index/embeddings not built. Run build_index_from_pdf() first.")
262
+
263
+ report: Dict[str, Any] = {}
264
+
265
+ # helper: semantic similarity search on statement -> returns list of (idx, score)
266
+ def semantic_search(statement: str, k: int = top_k):
267
+ q_emb = self.embedder.encode([statement], convert_to_numpy=True).astype("float32")
268
+
269
+ if _HAS_FAISS:
270
+ faiss.normalize_L2(q_emb)
271
+ D_list, I_list = self.index.search(q_emb, k)
272
+ # D are inner products; return list of (idx, score)
273
+ return [(int(i), float(d)) for i, d in zip(I_list[0], D_list[0]) if i != -1]
274
+ else:
275
+ qn = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-10)
276
+ sims = (self.embeddings @ qn.T).squeeze(axis=1)
277
+ idxs = np.argsort(-sims)[:k]
278
+ return [(int(i), float(sims[i])) for i in idxs]
279
+
280
+ # helper: verify with model (strict JSON in response)
281
+ def _verify_with_model(question_text: str, options: Dict[str, str], correct_text: str, context_text: str):
282
+ system = {
283
+ "role": "system",
284
+ "content": (
285
+ "Bạn là một trợ lý đánh giá tính thực chứng của câu hỏi trắc nghiệm dựa trên đoạn văn được cung cấp. "
286
+ "Hãy trả lời DUY NHẤT bằng JSON hợp lệ (không có văn bản khác) theo schema:\n\n"
287
+ "{\n"
288
+ ' "supported": true/false, # câu trả lời đúng có được nội dung chứng thực không\n'
289
+ ' "confidence": 0.0-1.0, # mức độ tự tin (số)\n'
290
+ ' "evidence": "cụm văn bản ngắn làm bằng chứng hoặc trích dẫn",\n'
291
+ ' "reason": "ngắn gọn, vì sao supported hoặc không"\n'
292
+ "}\n\n"
293
+ "Luôn dựa chỉ trên nội dung trong trường 'Context' dưới đây. Nếu nội dung không chứa bằng chứng, trả về supported: false."
294
+ )
295
+ }
296
+ user = {
297
+ "role": "user",
298
+ "content": (
299
+ "Câu hỏi:\n" + question_text + "\n\n"
300
+ "Lựa chọn:\n" + "\n".join([f"{k}: {v}" for k, v in options.items()]) + "\n\n"
301
+ "Đáp án:\n" + correct_text + "\n\n"
302
+ "Context:\n" + context_text + "\n\n"
303
+ "Hãy trả lời như yêu cầu."
304
+ )
305
+ }
306
+
307
+ raw = _post_chat([system, user], model=self.hf_model, temperature=model_verification_temperature)
308
+
309
+ # parse JSON object in response
310
+ try:
311
+ parsed = _safe_extract_json(raw)
312
+ except Exception as e:
313
+ return {"error": f"Model verification failed to return JSON: {e}", "raw": raw}
314
+ return parsed
315
+
316
+ # iterate MCQs
317
+ for qid, item in mcqs.items():
318
+ q_text = item.get("câu hỏi", "").strip()
319
+ options = item.get("lựa chọn", {})
320
+ correct_text = item.get("đáp án", "").strip()
321
+
322
+ # form a short declarative statement to embed: "Question: ... Answer: <correct>"
323
+ statement = f"{q_text} Answer: {correct_text}"
324
+
325
+ retrieved = semantic_search(statement, k=top_k)
326
+ evidence_list = []
327
+ max_sim = 0.0
328
+ for idx, score in retrieved:
329
+ if score >= evidence_score_cutoff:
330
+ evidence_list.append({
331
+ "idx": idx,
332
+ "page": self.metadata[idx].get("page", None),
333
+ "score": float(score),
334
+ "text": (self.texts[idx][:1000] + ("..." if len(self.texts[idx]) > 1000 else "")),
335
+ })
336
+
337
+ if score > max_sim:
338
+ max_sim = float(score)
339
+
340
+ supported_by_embeddings = max_sim >= similarity_threshold
341
+
342
+ model_verdict = None
343
+ if use_model_verification:
344
+ # build a context string from top retrieved chunks (regardless of cutoff)
345
+ context_parts = []
346
+ for ridx, sc in retrieved:
347
+ md = self.metadata[ridx]
348
+ context_parts.append(f"[page {md.get('page')}] {self.texts[ridx]}")
349
+ context_text = "\n\n".join(context_parts)
350
+
351
+ try:
352
+ parsed = _verify_with_model(q_text, options, correct_text, context_text)
353
+ model_verdict = parsed
354
+ except Exception as e:
355
+ model_verdict = {"error": f"verification exception: {e}"}
356
+
357
+ report[qid] = {
358
+ "supported_by_embeddings": bool(supported_by_embeddings),
359
+ "max_similarity": float(max_sim),
360
+ "evidence": evidence_list,
361
+ "model_verdict": model_verdict,
362
+ }
363
+
364
+ return report
365
+
366
+ def connect_qdrant(self, url: str, api_key: Optional[str] = None, prefer_grpc: bool = False):
367
+ if not _HAS_QDRANT:
368
+ raise RuntimeError("qdrant-client is not installed. Install with `pip install qdrant-client`.")
369
+ self.qdrant_url = url
370
+ self.qdrant_api_key = api_key
371
+ self.qdrant_prefer_grpc = prefer_grpc
372
+ # Create client
373
+ self.qdrant = QdrantClient(url=url, api_key=api_key, prefer_grpc=prefer_grpc)
374
+
375
+ def _ensure_collection(self, collection_name: str):
376
+ if self.qdrant is None:
377
+ raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
378
+ try:
379
+ # get_collection will raise if not present
380
+ _ = self.qdrant.get_collection(collection_name)
381
+ except Exception:
382
+ # create collection with vector size = self.dim
383
+ vect_params = VectorParams(size=self.dim, distance=Distance.COSINE)
384
+ self.qdrant.recreate_collection(collection_name=collection_name, vectors_config=vect_params)
385
+ # recreate_collection ensures a clean collection; if you prefer to avoid wiping use create_collection instead.
386
+
387
+ def save_pdf_to_qdrant(
388
+ self,
389
+ pdf_path: str,
390
+ filename: str,
391
+ collection: str,
392
+ max_chars: int = 1200,
393
+ batch_size: int = 64,
394
+ overwrite: bool = False,
395
+ ):
396
+ if self.qdrant is None:
397
+ raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
398
+
399
+ # extract pages and chunks (re-using your existing helpers)
400
+ pages = self.extract_pages(pdf_path)
401
+ all_chunks = []
402
+ all_meta = []
403
+ for p_idx, page_text in enumerate(pages, start=1):
404
+ chunks = self.chunk_text(page_text or "", max_chars=max_chars)
405
+ for cid, ch in enumerate(chunks, start=1):
406
+ all_chunks.append(ch)
407
+ all_meta.append({"page": p_idx, "chunk_id": cid, "length": len(ch)})
408
+
409
+ if not all_chunks:
410
+ raise RuntimeError("No text extracted from PDF.")
411
+
412
+ # ensure collection exists
413
+ self._ensure_collection(collection)
414
+
415
+ # optional: delete previous points for this filename if overwrite
416
+ if overwrite:
417
+ # delete by filter: filename == filename
418
+ flt = Filter(must=[FieldCondition(key="filename", match=MatchValue(value=filename))])
419
+ try:
420
+ # qdrant-client's delete() takes a points selector, not a raw filter
421
+ self.qdrant.delete(collection_name=collection, points_selector=rest.FilterSelector(filter=flt))
422
+ except Exception:
423
+ # ignore if deletion fails
424
+ pass
425
+
426
+ # compute embeddings in batches
427
+ embeddings = self.embedder.encode(all_chunks, convert_to_numpy=True, show_progress_bar=True)
428
+ embeddings = embeddings.astype("float32")
429
+
430
+ # prepare points
431
+ points = []
432
+ for i, (emb, md, txt) in enumerate(zip(embeddings, all_meta, all_chunks)):
433
+ pid = str(uuid4())
434
+ source_id = f"{filename}__p{md['page']}__c{md['chunk_id']}"
435
+ payload = {
436
+ "filename": filename,
437
+ "page": md["page"],
438
+ "chunk_id": md["chunk_id"],
439
+ "length": md["length"],
440
+ "text": txt,
441
+ "source_id": source_id,
442
+ }
443
+ points.append(PointStruct(id=pid, vector=emb.tolist(), payload=payload))
444
+
445
+ # upsert in batches
446
+ if len(points) >= batch_size:
447
+ self.qdrant.upsert(collection_name=collection, points=points)
448
+ points = []
449
+
450
+ # upsert remaining
451
+ if points:
452
+ self.qdrant.upsert(collection_name=collection, points=points)
453
+
454
+ try:
455
+ self.qdrant.create_payload_index(
456
+ collection_name=collection,
457
+ field_name="filename",
458
+ field_schema=rest.PayloadSchemaType.KEYWORD
459
+ )
460
+ except Exception as e:
461
+ print(f"Index creation skipped or failed: {e}")
462
+
463
+ return {"status": "ok", "uploaded_chunks": len(all_chunks), "collection": collection, "filename": filename}
464
+
465
+ def list_files_in_collection(
466
+ self,
467
+ collection: str,
468
+ payload_field: str = "filename",
469
+ batch_size: int = 500,
470
+ ) -> List[str]:
471
+ if self.qdrant is None:
472
+ raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
473
+
474
+ # ensure collection exists
475
+ # collection_exists may itself raise if the server is unreachable; let that propagate
476
+ if not self.qdrant.collection_exists(collection):
477
+ raise RuntimeError(f"Collection '{collection}' does not exist.")
481
+
482
+ filenames = set()
483
+ offset = None
484
+
485
+ while True:
486
+ # scroll returns (points, next_offset)
487
+ pts, next_offset = self.qdrant.scroll(
488
+ collection_name=collection,
489
+ limit=batch_size,
490
+ offset=offset,
491
+ with_payload=[payload_field],
492
+ with_vectors=False,
493
+ )
494
+
495
+ if not pts:
496
+ break
497
+
498
+ for p in pts:
499
+ # p may be a dict-like or an object with .payload
500
+ payload = None
501
+ if hasattr(p, "payload"):
502
+ payload = p.payload
503
+ elif isinstance(p, dict):
504
+ # older/newer variants might use nested structures: try common keys
505
+ payload = p.get("payload") or p
506
+ else:
507
+ # best-effort fallback: convert to dict if possible
508
+ try:
509
+ payload = dict(p)
510
+ except Exception:
511
+ payload = None
512
+
513
+ if not payload:
514
+ continue
515
+
516
+ # extract candidate value(s)
517
+ val = None
518
+ if isinstance(payload, dict):
519
+ val = payload.get(payload_field)
520
+ else:
521
+ # Some payload representations store fields differently; try attribute access
522
+ val = getattr(payload, payload_field, None)
523
+
524
+ # If value is list-like, iterate, else add single
525
+ if isinstance(val, (list, tuple, set)):
526
+ for v in val:
527
+ if v is not None:
528
+ filenames.add(str(v))
529
+ elif val is not None:
530
+ filenames.add(str(val))
531
+
532
+ # stop if no more pages
533
+ if not next_offset:
534
+ break
535
+ offset = next_offset
536
+
537
+ return sorted(filenames)
538
+
539
+ def list_chunks_for_filename(self, collection: str, filename: str, batch: int = 256) -> List[Dict[str, Any]]:
540
+ if self.qdrant is None:
541
+ raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
542
+
543
+ results = []
544
+ offset = None
545
+ while True:
546
+ # scroll returns (points, next_offset)
547
+ points, next_offset = self.qdrant.scroll(
548
+ collection_name=collection,
549
+ scroll_filter=Filter(
550
+ must=[
551
+ FieldCondition(key="filename", match=MatchValue(value=filename))
552
+ ]
553
+ ),
554
+ limit=batch,
555
+ offset=offset,
556
+ with_payload=True,
557
+ with_vectors=False,
558
+ )
559
+ # points are objects (Record / ScoredPoint-like); get id and payload
560
+ for p in points:
561
+ # p.payload is a dict, p.id is point id
562
+ results.append({"point_id": p.id, "payload": p.payload})
563
+ if not next_offset:
564
+ break
565
+ offset = next_offset
566
+ return results
567
+
568
+ def _retrieve_qdrant(self, query: str, collection: str, filename: Optional[str] = None, top_k: int = 3) -> List[Tuple[Dict[str, Any], float]]:
569
+ if self.qdrant is None:
570
+ raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
571
+
572
+ q_emb = self.embedder.encode([query], convert_to_numpy=True).astype("float32")[0].tolist()
573
+ q_filter = None
574
+ if filename:
575
+ q_filter = Filter(must=[FieldCondition(key="filename", match=MatchValue(value=filename))])
576
+
577
+ search_res = self.qdrant.search(
578
+ collection_name=collection,
579
+ query_vector=q_emb,
580
+ query_filter=q_filter,
581
+ limit=top_k,
582
+ with_payload=True,
583
+ with_vectors=False,
584
+ )
585
+
586
+ out = []
587
+ for hit in search_res:
588
+ # hit.payload is the stored payload, hit.score is similarity
589
+ out.append((hit.payload, float(getattr(hit, "score", 0.0))))
590
+ return out
591
+
592
+ def generate_from_qdrant(
593
+ self,
594
+ filename: str,
595
+ collection: str,
596
+ n_questions: int = 10,
597
+ mode: str = "rag", # 'per_chunk' or 'rag'
598
+ questions_per_chunk: int = 3, # used for 'per_chunk'
599
+ top_k: int = 3, # retrieval size used in RAG
600
+ temperature: float = 0.2,
601
+ ) -> Dict[str, Any]:
602
+ if self.qdrant is None:
603
+ raise RuntimeError("Qdrant client not connected. Call connect_qdrant(...) first.")
604
+
605
+ # get all chunks for this filename (payload should contain 'text', 'page', 'chunk_id', etc.)
606
+ file_points = self.list_chunks_for_filename(collection=collection, filename=filename)
607
+ if not file_points:
608
+ raise RuntimeError(f"No chunks found for filename={filename} in collection={collection}.")
609
+
610
+ # create a local list of texts & metadata for sampling
611
+ texts = []
612
+ metas = []
613
+ for p in file_points:
614
+ payload = p.get("payload", {})
615
+ text = payload.get("text", "")
616
+ texts.append(text)
617
+ metas.append(payload)
618
+
619
+ self.texts = texts
620
+ self.metadata = metas
621
+ embeddings = self.embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)
622
+ if embeddings is None or len(embeddings) == 0:
623
+ # fail fast: the downstream dim lookup and index building would crash on None
624
+ raise RuntimeError("No embeddings computed for the retrieved chunks.")
625
+ self.embeddings = embeddings.astype("float32")
627
+
628
+ # update dim in case embedder changed unexpectedly
629
+ self.dim = int(self.embeddings.shape[1])
630
+
631
+ # build index
632
+ self._build_faiss_index()
633
+
634
+ output = {}
635
+ qcount = 0
636
+
637
+ if mode == "per_chunk":
638
+ # iterate all chunks (in payload order) and request questions_per_chunk from each
639
+ for i, txt in enumerate(texts):
640
+ if not txt.strip():
641
+ continue
642
+ to_gen = questions_per_chunk
643
+ try:
644
+ mcq_block = generate_mcqs_from_text(txt, n=to_gen, model=self.hf_model, temperature=temperature)
645
+ except Exception as e:
646
+ print(f"Generator failed on chunk (index {i}): {e}")
647
+ continue
648
+ for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
649
+ qcount += 1
650
+ output[str(qcount)] = mcq_block[item]
651
+ if qcount >= n_questions:
652
+ return output
653
+ return output
654
+
655
+ elif mode == "rag":
656
+ attempts = 0
657
+ max_attempts = n_questions * 4
658
+ while qcount < n_questions and attempts < max_attempts:
659
+ attempts += 1
660
+ # sample a seed sentence from a random chunk of this file
661
+ seed_idx = random.randrange(len(texts))
662
+ chunk = texts[seed_idx]
663
+ sents = re.split(r'(?<=[\.\?\!])\s+', chunk)
664
+ seed_sent = None
665
+ for s in sents:
666
+ if len(s.strip()) > 20:
667
+ seed_sent = s
668
+ break
669
+ if not seed_sent:
670
+ seed_sent = chunk[:200]
671
+ query = f"Create questions about: {seed_sent}"
672
+
673
+ # retrieve top_k chunks from the same file (restricted by filename filter)
674
+ retrieved = self._retrieve_qdrant(query=query, collection=collection, filename=filename, top_k=top_k)
675
+ context_parts = []
676
+ for payload, score in retrieved:
677
+ # payload should contain page & chunk_id and text
678
+ page = payload.get("page", "?")
679
+ ctxt = payload.get("text", "")
680
+ context_parts.append(f"[page {page}] {ctxt}")
681
+ context = "\n\n".join(context_parts)
682
+
683
+ try:
684
+ mcq_block = generate_mcqs_from_text(context, n=1, model=self.hf_model, temperature=temperature)
685
+ except Exception as e:
686
+ print(f"Generator failed during RAG attempt {attempts}: {e}")
687
+ continue
688
+
689
+ for item in sorted(mcq_block.keys(), key=lambda x: int(x)):
690
+ qcount += 1
691
+ output[str(qcount)] = mcq_block[item]
692
+ if qcount >= n_questions:
693
+ return output
694
+ return output
695
+ else:
696
+ raise ValueError("mode must be 'per_chunk' or 'rag'.")
requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ boto3
2
+ pdfplumber
3
+ faiss-cpu
4
+ sentence-transformers
5
+ fastapi[standard]
6
+ uvicorn[standard]
7
+ qdrant-client
8
+ pymupdf4llm
9
+ requests
10
+ numpy
test/cerebras-api.py ADDED
@@ -0,0 +1,48 @@
1
+ # import os
2
+ # from cerebras.cloud.sdk import Cerebras
3
+ import tiktoken
4
+
5
+ # client = Cerebras(
6
+ # # This is the default and can be omitted
7
+ # api_key=os.environ.get("CEREBRAS_API_KEY")
8
+ # )
9
+
10
+ # stream = client.chat.completions.create(
11
+ # messages=[
12
+ # {
13
+ # "role": "system",
14
+ # "content": ""
15
+ # }
16
+ # ],
17
+ # model="gpt-oss-120b",
18
+ # stream=True,
19
+ # max_completion_tokens=65536,
20
+ # temperature=1,
21
+ # top_p=1
22
+ # )
23
+ import numpy as np
24
+
25
+ INPUT_TOKEN_COUNT = np.array([], dtype=int)
26
+ OUTPUT_TOKEN_COUNT = np.array([], dtype=int)
27
+
28
+ # for chunk in stream:
29
+ # print(chunk.choices[0].delta.content or "", end="")
30
+ with open('../test/mcq_output.json', 'r', encoding='utf-8') as f:
31
+ text = f.read()
32
+
33
+ def count_tokens(text: str, model_name='gpt-oss-120b', encoding_name='cl100k_base') -> int:
34
+ """Look up model encoding; fallback to encoding_name if model not known."""
35
+ try:
36
+ # encoding_for_model can raise if model is unknown to tiktoken
37
+ enc = tiktoken.encoding_for_model(model_name)
38
+ except Exception:
39
+ enc = None
40
+
41
+ if enc is None:
42
+ enc = tiktoken.get_encoding(encoding_name)
43
+
44
+ return len(enc.encode(text))
45
+
46
+ c = count_tokens(text)
47
+ INPUT_TOKEN_COUNT = np.append(INPUT_TOKEN_COUNT, c)
48
+ print(INPUT_TOKEN_COUNT)
test/logging.txt ADDED
@@ -0,0 +1,111 @@
1
+ **********************
2
+ Windows PowerShell transcript start
3
+ Start time: 20250815161919
4
+ Username: MACBOOKM5\boboi
5
+ RunAs User: MACBOOKM5\boboi
6
+ Configuration Name:
7
+ Machine: MACBOOKM5 (Microsoft Windows NT 10.0.22631.0)
8
+ Host Application: C:\WINDOWS\System32\WindowsPowerShell\v1.0\powershell.exe
9
+ Process ID: 20936
10
+ PSVersion: 5.1.22621.5697
11
+ PSEdition: Desktop
12
+ PSCompatibleVersions: 1.0, 2.0, 3.0, 4.0, 5.0, 5.1.22621.5697
13
+ BuildVersion: 10.0.22621.5697
14
+ CLRVersion: 4.0.30319.42000
15
+ WSManStackVersion: 3.0
16
+ PSRemotingProtocolVersion: 2.3
17
+ SerializationVersion: 1.1.0.1
18
+ **********************
19
+ Transcript started, output file is D:\graduation_project\mcq-generator\test\logging.txt
20
+ PS D:\graduation_project\mcq-generator\app>
21
+ (rag-api) uvicorn app:app --reload
22
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\asyncio\base_events.py", li
23
+ ne 608, in run_forever
24
+ self._run_once()
25
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\asyncio\base_events.py", li
26
+ ne 1936, in _run_once
27
+ handle._run()
28
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\asyncio\events.py", line 84
29
+ , in _run
30
+ self._context.run(self._callback, *self._args)
31
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\site-packages\uvicorn\serve
32
+ r.py", line 70, in serve
33
+ with self.capture_signals():
34
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\contextlib.py", line 144, i
35
+ n __exit__
36
+ next(self.gen)
37
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\site-packages\uvicorn\serve
38
+ r.py", line 331, in capture_signals
39
+ signal.raise_signal(captured_signal)
40
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\asyncio\runners.py", line 1
41
+ 57, in _on_sigint
42
+ raise KeyboardInterrupt()
43
+ KeyboardInterrupt
44
+
45
+ During handling of the above exception, another exception occurred:
46
+
47
+ Traceback (most recent call last):
48
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\site-packages\starlette\rou
49
+ ting.py", line 701, in lifespan
50
+ await receive()
51
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\site-packages\uvicorn\lifes
52
+ pan\on.py", line 137, in receive
53
+ return await self.receive_queue.get()
54
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
55
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\asyncio\queues.py", line 15
56
+ 8, in get
57
+ await getter
58
+ asyncio.exceptions.CancelledError
59
+
60
+ INFO: Stopping reloader process [21928]
61
+ (rag-api) TerminatingError(): "The pipeline has been stopped."
62
+ >> TerminatingError(): "The pipeline has been stopped."
63
+ PS D:\graduation_project\mcq-generator\app>
64
+ (rag-api) uvicorn app:app --reload
65
+ warnings.warn(
66
+ INFO: Started server process [20356]
67
+ INFO: Waiting for application startup.
68
+ RAGMCQ instance created on startup.
69
+ INFO: Application startup complete.
70
+ ERROR: Traceback (most recent call last):
71
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\asyncio\runners.py", line 1
72
+ 18, in run
73
+ return self._loop.run_until_complete(task)
74
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
75
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\asyncio\base_events.py", li
76
+ ne 654, in run_until_complete
77
+ return future.result()
78
+ ^^^^^^^^^^^^^^^
79
+ asyncio.exceptions.CancelledError
80
+
81
+ During handling of the above exception, another exception occurred:
82
+
83
+ Traceback (most recent call last):
84
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\asyncio\runners.py", line 1
85
+ 90, in run
86
+ return runner.run(main)
87
+ ^^^^^^^^^^^^^^^^
88
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\asyncio\runners.py", line 1
89
+ 23, in run
90
+ raise KeyboardInterrupt()
91
+ KeyboardInterrupt
92
+
93
+ During handling of the above exception, another exception occurred:
94
+
95
+ Traceback (most recent call last):
96
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\site-packages\starlette\rou
97
+ ting.py", line 701, in lifespan
98
+ await receive()
99
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\site-packages\uvicorn\lifes
100
+ pan\on.py", line 137, in receive
101
+ return await self.receive_queue.get()
102
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
103
+ File "D:\CODE\IDE\Anaconda\envs\rag-api\Lib\asyncio\queues.py", line 15
104
+ 8, in get
105
+ await getter
106
+ asyncio.exceptions.CancelledError
107
+
108
+ INFO: Stopping reloader process [1968]
109
+ (rag-api) TerminatingError(): "The pipeline has been stopped."
110
+ >> TerminatingError(): "The pipeline has been stopped."
111
+ PS D:\graduation_project\mcq-generator\app>
test/mcq_output.json ADDED
@@ -0,0 +1,163 @@
1
+ {
2
+ "mcqs": {
3
+ "1": {
4
+ "câu hỏi": "Trong lớp Str_OutputParser, biểu thức chính quy nào được sử dụng để trích xuất câu trả lời từ chuỗi phản hồi?",
5
+ "lựa chọn": {
6
+ "a": "r\"Answer:\\s*(.*)\"",
7
+ "b": "r\"Respuesta:\\s*(.*)\"",
8
+ "c": "r\"Answer :\\s*(.*)\"",
9
+ "d": "r\"Result:\\s*(.*)\""
10
+ },
11
+ "đáp án": "r\"Answer :\\s*(.*)\""
12
+ },
13
+ "2": {
14
+ "câu hỏi": "Trong dự án RAG, file nào được dùng để khai báo các hàm load file PDF?",
15
+ "lựa chọn": {
16
+ "a": "src/rag/main.py",
17
+ "b": "src/rag/file_loader.py",
18
+ "c": "src/rag/offline_rag.py",
19
+ "d": "src/rag/utils.py"
20
+ },
21
+ "đáp án": "src/rag/file_loader.py"
22
+ },
23
+ "3": {
24
+ "câu hỏi": "Trong file src/rag/vectorstore.py, lớp nào được đặt làm giá trị mặc định cho vector database?",
25
+ "lựa chọn": {
26
+ "a": "FAISS",
27
+ "b": "Chroma",
28
+ "c": "Pinecone",
29
+ "d": "Milvus"
30
+ },
31
+ "đáp án": "Chroma"
32
+ },
33
+ "4": {
34
+ "câu hỏi": "Trong đoạn mã được trích dẫn, tham số nào được sử dụng cho kiểu lượng tử (quantization type) trong cấu hình BitsAndBytesConfig?",
35
+ "lựa chọn": {
36
+ "a": "nf4",
37
+ "b": "int8",
38
+ "c": "fp16",
39
+ "d": "int4"
40
+ },
41
+ "đáp án": "nf4"
42
+ },
43
+ "5": {
44
+ "câu hỏi": "Theo mô tả trong nội dung, bước nào liên quan đến việc tạo cơ sở dữ liệu vector bằng mô hình embedding?",
45
+ "lựa chọn": {
46
+ "a": "Tách danh sách các bài báo khoa học thành các văn bản nhỏ.",
47
+ "b": "Xây dựng một cơ sở dữ liệu vector từ các văn bản nhỏ bằng mô hình embedding.",
48
+ "c": "Truy vấn các mẫu văn bản có liên quan đến câu hỏi đầu vào để làm ngữ cảnh.",
49
+ "d": "Đưa câu prompt (câu hỏi và ngữ cảnh) vào mô hình để nhận câu trả lời."
50
+ },
51
+ "đáp án": "Xây dựng một cơ sở dữ liệu vector từ các văn bản nhỏ bằng mô hình embedding."
52
+ }
53
+ },
54
+ "validation": {
55
+ "1": {
56
+ "supported_by_embeddings": true,
57
+ "max_similarity": 0.5152225494384766,
58
+ "evidence": [
59
+ {
60
+ "idx": 26,
61
+ "page": 15,
62
+ "score": 0.5152225494384766,
63
+ "text": "Ý nghĩa của phương thức `from_template()` trong class PromptTemplate là? ( _a_ ) Đểkhởi tạo prompt template từmột file. ( _b_ ) Đểkhởi tạo prompt template từmột string. ( _c_ ) Đểkhởi tạo prompt template từmột danh sách các tin nhắn. ( _d_ ) Đểkhởi tạo prompt template từmột prompt template có sẵn. 15"
64
+ }
65
+ ],
66
+ "model_verdict": {
67
+ "supported": false,
68
+ "confidence": 0.9,
69
+ "evidence": "",
70
+ "reason": "Context không chứa thông tin về lớp Str_OutputParser hay biểu thức chính quy được sử dụng, vì vậy không thể chứng thực đáp án được đưa ra."
71
+ }
72
+ },
73
+ "2": {
74
+ "supported_by_embeddings": true,
75
+ "max_similarity": 0.694902777671814,
76
+ "evidence": [
77
+ {
78
+ "idx": 4,
79
+ "page": 4,
80
+ "score": 0.694902777671814,
81
+ "text": "**AI VIETNAM (AIO2024)** **aivietnam.edu.vn**\n\n\n_ **src/rag/:** Thư mục dùng đểlưu trữcác code liên quan đến xây dựng RAG, bao gồm:\n\n\n1. **src/rag/file_loader.py:** File code dùng đểkhai báo các hàm load file pdf (vì tài\nliệu của chúng ta thu thập thuộc file pdf). 2. **src/rag/main.py:** File code dùng đểkhai báo hàm khởi tạo chains. 3. **src/rag/offline_rag.py:** File code dùng đểkhai báo PromptTemplate. 4. **src/rag/utils.py:** File code dùng đểkhai báo hàm tách câu trảlời từmodel. 5. **src/rag/vectorstore.py:** File code dùng đểkhai báo hàm khởi tạo hệcơ sởdữliệu\n\nvector. _ **src/app.py:** File code dùng đểkhởi tạo API. _ **requirements.txt:** File code dùng đểkhai báo các thư viện cần thiết đểsửdụng source\ncode. ## II.2. Cập nhật file requirements.txt\n\n\nĐểbắt đầu, chúng ta sẽliệt kê các gói thư viện cần thiết đểchạy được chương trình này."
82
+ },
83
+ {
84
+ "idx": 28,
85
+ "page": 16,
86
+ "score": 0.5763600468635559,
87
+ "text": "document_loaders` `import` `PyPDFLoader`\n\n\n2\n\n\n3 `pdf_loader = PyPDFLoader(url, extract_images =` `True` `)`\n\n\n4\n\n\n5 `docs = pdf_loader.load ()`\n\n\nTham số `extract_images` tại dòng code 3 có chức năng gì? ( _a_ ) Trảvềtất cảảnh từfile pdf. ( _b_ ) Bỏqua ảnh, chỉload text. ( _c_ ) Phân tích ảnh thành vector. ( _d_ ) Chuyển đổi ảnh trong file pdf thành text. 16"
88
+ },
89
+ {
90
+ "idx": 16,
91
+ "page": 9,
92
+ "score": 0.5420067310333252,
93
+ "text": "**AI VIETNAM (AIO2024)** **aivietnam.edu.vn**\n\n\n86 `return` `self.load(files, workers=workers)`\n\n## II.6. Cập nhật file src/rag/vectorstore.py\n\n\nTại file này, ta định nghĩa một class đểkhởi tạo hệcơ sởdữliệu vector. Trong project này, chúng\nta sẽsửdụng Chroma. Vềviệc tìm kiếm tài liệu tương đồng, ta sửdụng FAISS. Như vậy, nội\ndung của file như sau:\n\n\nHình 4: Minh họa việc sửdụng vector database Chroma đểtruy vấn các tài liệu có liên quan\n[làm context trong prompt. Ảnh: Link.](https://heidloff.net/article/retrieval-augmented-generation-chroma-langchain/)\n\n\n1 `from` `typing` `import` `Union`\n\n2 `from` `langchain_chroma` `import` `Chroma`\n\n3 `from` `langchain_community .vectorstores` `import` `FAISS`\n\n4 `from` `langchain_community .embeddings` `import` `HuggingFaceEmbeddings`\n\n\n5\n\n\n6 `class` `VectorDB:`\n\n\n7 `def` `__init__(self,`\n\n\n8 `documents = None,`\n\n9 `vector_db: Union[Chroma, FAISS] = Chroma,`\n\n10 `embedding = HuggingFaceEmbeddings (),`\n\n11 `) -> None` `:`\n\n\n12\n\n\n13 `self.vector_db ..."
94
+ }
95
+ ],
96
+ "model_verdict": {
97
+ "supported": true,
98
+ "confidence": 0.99,
99
+ "evidence": "src/rag/file_loader.py: File code dùng để khai báo các hàm load file pdf",
100
+ "reason": "Context explicitly states that src/rag/file_loader.py declares functions for loading PDF files, matching the answer."
101
+ }
102
+ },
103
+ "3": {
104
+ "supported_by_embeddings": true,
105
+ "max_similarity": 0.579485297203064,
106
+ "evidence": [
107
+ {
108
+ "idx": 20,
109
+ "page": 11,
110
+ "score": 0.579485297203064,
111
+ "text": "Cập nhật file src/rag/main.py\n\n\nTại file này, ta khởi tạo toàn bộcác instance của các class, các hàm mà ta đã khai báo trước đó\nvà kết nối chúng vào trong một hàm duy nhất gọi là `build_rag_chain()` :\n\n\n1 `from` `pydantic` `import` `BaseModel, Field`\n\n\n2\n\n\n3 `from src.rag.file_loader` `import` `Loader`\n\n4 `from src.rag.vectorstore` `import` `VectorDB`\n\n5 `from src.rag.offline_rag` `import` `Offline_RAG`\n\n\n6\n\n\n7 `class` `InputQA(BaseModel):`\n\n8 `question: str = Field (..., title=` `\"Question to ask the model\"` `)`\n\n\n9\n\n\n10 `class` `OutputQA(BaseModel):`\n\n11 `answer: str = Field (..., title=` `\"Answer` `from the model\"` `)`\n\n\n12\n\n\n13 `def` `build_rag_chain (llm, data_dir, data_type):`\n\n14 `doc_loaded = Loader(file_type=data_type).load_dir(data_dir, workers=2)`\n\n15 `retriever = VectorDB(documents = doc_loaded).get_retriever ()`\n\n16 `rag_chain = Offline_RAG(llm).get_chain(retriever)`\n\n\n17\n\n\n18 `return` `rag_chain`\n\n\n11"
112
+ },
113
+ {
114
+ "idx": 16,
115
+ "page": 9,
116
+ "score": 0.5778905749320984,
117
+ "text": "**AI VIETNAM (AIO2024)** **aivietnam.edu.vn**\n\n\n86 `return` `self.load(files, workers=workers)`\n\n## II.6. Cập nhật file src/rag/vectorstore.py\n\n\nTại file này, ta định nghĩa một class đểkhởi tạo hệcơ sởdữliệu vector. Trong project này, chúng\nta sẽsửdụng Chroma. Vềviệc tìm kiếm tài liệu tương đồng, ta sửdụng FAISS. Như vậy, nội\ndung của file như sau:\n\n\nHình 4: Minh họa việc sửdụng vector database Chroma đểtruy vấn các tài liệu có liên quan\n[làm context trong prompt. Ảnh: Link.](https://heidloff.net/article/retrieval-augmented-generation-chroma-langchain/)\n\n\n1 `from` `typing` `import` `Union`\n\n2 `from` `langchain_chroma` `import` `Chroma`\n\n3 `from` `langchain_community .vectorstores` `import` `FAISS`\n\n4 `from` `langchain_community .embeddings` `import` `HuggingFaceEmbeddings`\n\n\n5\n\n\n6 `class` `VectorDB:`\n\n\n7 `def` `__init__(self,`\n\n\n8 `documents = None,`\n\n9 `vector_db: Union[Chroma, FAISS] = Chroma,`\n\n10 `embedding = HuggingFaceEmbeddings (),`\n\n11 `) -> None` `:`\n\n\n12\n\n\n13 `self.vector_db ..."
118
+ }
119
+ ],
120
+ "model_verdict": {
121
+ "supported": true,
122
+ "confidence": 1.0,
123
+ "evidence": "vector_db: Union[Chroma, FAISS] = Chroma",
124
+ "reason": "Mặc định của tham số vector_db trong class VectorDB được đặt là Chroma"
125
+ }
126
+ },
127
+ "4": {
128
+ "supported_by_embeddings": false,
129
+ "max_similarity": 0.43995893001556396,
130
+ "evidence": [],
131
+ "model_verdict": {
132
+ "supported": false,
133
+ "confidence": 0.95,
134
+ "evidence": "",
135
+ "reason": "Trong nội dung Context không có bất kỳ đoạn nào đề cập đến BitsAndBytesConfig hay tham số kiểu lượng tử, vì vậy không thể chứng thực đáp án nf4."
136
+ }
137
+ },
138
+ "5": {
139
+ "supported_by_embeddings": true,
140
+ "max_similarity": 0.6268875598907471,
141
+ "evidence": [
142
+ {
143
+ "idx": 1,
144
+ "page": 2,
145
+ "score": 0.6268875598907471,
146
+ "text": "**AI VIETNAM (AIO2024)** **aivietnam.edu.vn**\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHình 2: Tổng quan vềpipeline của project.\n\n\n**Theo đó:**\n\n\n1. Từdanh sách các bài báo khoa học, ta tách thành các văn bản nhỏ. Từđó, xây dựng một\nhệcơ sởdữliệu vector với một embedding model.\n\n\n2. Bên cạnh câu hỏi đầu vào (question), ta truy vấn các mẫu văn bản có liên quan đến đến\ncâu hỏi, dùng làm ngữcảnh (context) trong câu prompt. Đây là nguồn thông tin mà LLMs\ncó thểdựa vào đểtrảlời câu hỏi.\n\n\n3. Đưa câu prompt vào mô hình (question và context) đểnhận câu trảlời từmô hình.\n\n\n2"
147
+ },
148
+ {
149
+ "idx": 30,
150
+ "page": 17,
151
+ "score": 0.5708718299865723,
152
+ "text": "split_documents (pdf_pages)`\n\n\n18\n\n\n19 _`# Embedding`_ _`model`_\n\n20 `embedding_model = HuggingFaceEmbeddings ()`\n\n\n21\n\n\n22 _`# vector`_ _`store`_\n\n\n23 `chroma_db = Chroma.from_documents(docs, embedding= embedding_model )`\n\n\nNhiệm vụcủa `embedding_model` là gì? ( _a_ ) Dùng biến đổi chuỗi đầu vào thành các vector cho cơ sởdữliệu vector. ( _b_ ) Dùng đểlập chỉmục cho cơ sởdữliệu. ( _c_ ) Dùng đểtìm kiếm tài liệu. ( _d_ ) Dùng đểtính toán độtương đồng. 17"
153
+ }
154
+ ],
155
+ "model_verdict": {
156
+ "supported": true,
157
+ "confidence": 0.99,
158
+ "evidence": "1. Từ danh sách các bài báo khoa học, ta tách thành các văn bản nhỏ. Từ đó, xây dựng một hẹcơ sở dữ liệu vector với một embedding model.",
159
+ "reason": "Context explicitly states that after splitting documents, a vector database is built using an embedding model, matching the chosen answer."
160
+ }
161
+ }
162
+ }
163
+ }
test/output.json ADDED
The diff for this file is too large to render. See raw diff
 
utils.py ADDED
@@ -0,0 +1,86 @@
1
+ import re
2
+ import json
3
+ from typing import Dict, Any
4
+ import requests
5
+ import os
6
+
7
+ API_URL = "https://router.huggingface.co/v1/chat/completions"
8
+ HF_KEY = os.environ['HF_API_KEY']
9
+ HEADERS = {"Authorization": f"Bearer {HF_KEY}"}
10
+ JSON_OBJ_RE = re.compile(r"(\{[\s\S]*\})", re.MULTILINE)
11
+
12
+ def _post_chat(messages: list, model: str, temperature: float = 0.2, timeout: int = 60) -> str:
13
+ payload = {"model": model, "messages": messages, "temperature": temperature}
14
+ resp = requests.post(API_URL, headers=HEADERS, json=payload, timeout=timeout)
15
+ resp.raise_for_status()
16
+ data = resp.json()
17
+
18
+ # handle various shapes
19
+ if "choices" in data and len(data["choices"]) > 0:
20
+ # prefer message.content
21
+ ch = data["choices"][0]
22
+
23
+ if isinstance(ch, dict) and "message" in ch and "content" in ch["message"]:
24
+ return ch["message"]["content"]
25
+
26
+ if "text" in ch:
27
+ return ch["text"]
28
+
29
+ # nothing recognizable in the response
30
+ raise RuntimeError("Unexpected HF response shape: " + json.dumps(data)[:200])
31
+
32
+ def _safe_extract_json(text: str) -> dict:
33
+ # remove triple backticks
34
+ text = re.sub(r"```(?:json)?\n?", "", text)
35
+ m = JSON_OBJ_RE.search(text)
36
+
37
+ if not m:
38
+ raise ValueError("No JSON object found in model output.")
39
+ js = m.group(1)
40
+
41
+ # try load, fix trailing commas
42
+ try:
43
+ return json.loads(js)
44
+ except json.JSONDecodeError:
45
+ fixed = re.sub(r",\s*([}\]])", r"\1", js)
46
+ return json.loads(fixed)
47
+
48
+ def generate_mcqs_from_text(
49
+ source_text: str,
50
+ n: int = 3,
51
+ model: str = "openai/gpt-oss-120b:cerebras",
52
+ temperature: float = 0.2,
53
+ ) -> Dict[str, Any]:
54
+ system_message = {
55
+ "role": "system",
56
+ "content": (
57
+ "Bạn là một trợ lý hữu ích chuyên tạo câu hỏi trắc nghiệm. "
58
+ "Chỉ TRẢ VỀ duy nhất một đối tượng JSON theo đúng schema sau và không có bất kỳ văn bản nào khác:\n\n"
59
+ "{\n"
60
+ ' "1": { "câu hỏi": "...", "lựa chọn": {"a":"...","b":"...","c":"...","d":"..."}, "đáp án":"..."},\n'
61
+ ' "2": { ... }\n'
62
+ "}\n\n"
63
+ "Lưu ý:\n"
64
+ f"- Tạo đúng {n} mục, đánh số từ 1 tới {n}.\n"
65
+ "- Khóa 'lựa chọn' phải có các phím a, b, c, d.\n"
66
+ "- 'đáp án' phải là toàn văn đáp án đúng (không phải ký tự chữ cái), và giá trị này phải khớp chính xác với một trong các giá trị trong 'lựa chọn'.\n"
67
+ "- Không kèm giải thích hay trường thêm.\n"
68
+ "- Các phương án sai (distractors) phải hợp lý và không lặp lại."
69
+ )
70
+ }
71
+ user_message = {
72
+ "role": "user",
73
+ "content": (
74
+ f"Hãy tạo {n} câu hỏi trắc nghiệm từ nội dung dưới đây. Dùng nội dung này làm nguồn duy nhất để trả lời."
75
+ "Nếu nội dung quá ít để tạo câu hỏi chính xác, hãy tạo các phương án hợp lý nhưng có thể biện minh được.\n\n"
76
+ f"Nội dung:\n\n{source_text}"
77
+ )
78
+ }
79
+
80
+ raw = _post_chat([system_message, user_message], model=model, temperature=temperature)
81
+ parsed = _safe_extract_json(raw)
82
+
83
+ # validate structure and length
84
+ if not isinstance(parsed, dict) or len(parsed) != n:
85
+ raise ValueError(f"Generator returned invalid structure. Raw:\n{raw}")
86
+ return parsed