Spaces:

tusarway
/

rag-backend

Running

App Files Files Community

imtrt004 committed on Feb 26

Commit

b5be2eb

0 Parent(s):

Initial backend

Browse files

Files changed (17) hide show

.env.example +5 -0
Dockerfile +22 -0
app.py +206 -0
generation/__init__.py +0 -0
generation/llm.py +49 -0
generation/quiz.py +50 -0
ingestion/__init__.py +0 -0
ingestion/chunker.py +11 -0
ingestion/parser.py +22 -0
model/__init__.py +0 -0
model/loader.py +23 -0
persistence/__init__.py +0 -0
persistence/tier.py +90 -0
requirements.txt +12 -0
retrieval/__init__.py +0 -0
retrieval/embedder.py +26 -0
retrieval/vectorstore.py +48 -0

.env.example ADDED Viewed

	@@ -0,0 +1,5 @@

+# hf-backend HuggingFace Space environment variables
+# Set these in your HF Space settings → Variables and Secrets
+SUPABASE_URL=https://YOUR_PROJECT_REF.supabase.co
+# Use the service role key here, NOT the anon key.
+SUPABASE_KEY=your_service_role_key_here

Dockerfile ADDED Viewed

	@@ -0,0 +1,22 @@

+FROM python:3.12-slim
+WORKDIR /app
+# Build tools needed to compile llama-cpp-python from source
+RUN apt-get update && apt-get install -y \
+    build-essential cmake git curl \
+    && rm -rf /var/lib/apt/lists/*
+# Copy the dependency manifest before the app code so the install layers stay cached
+COPY requirements.txt .
+# Build llama-cpp-python for CPU (no GPU flags). The bundled llama.cpp reads
+# these switches under the GGML_ prefix; NATIVE=OFF keeps the build from
+# targeting the build machine's own CPU features.
+RUN CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_NATIVE=OFF" \
+    pip install llama-cpp-python==0.3.8 --no-cache-dir
+RUN pip install -r requirements.txt --no-cache-dir
+COPY . .
+EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "120"]

app.py ADDED Viewed

	@@ -0,0 +1,206 @@

+from contextlib import asynccontextmanager
+from fastapi import FastAPI, UploadFile, HTTPException, BackgroundTasks
+from fastapi.responses import StreamingResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from supabase import create_client
+import uuid
+import os
+from model.loader import get_llm
+from retrieval.embedder import get_model, embed_chunks, embed_query
+from retrieval.vectorstore import store_chunks, similarity_search
+from ingestion.parser import parse_file
+from ingestion.chunker import smart_chunk
+from generation.llm import stream_answer
+from generation.quiz import generate_quiz
+from persistence.tier import (
+    get_user_tier,
+    get_expiry,
+    can_upload,
+    check_message_limit,
+    Tier,
+)
+def _supa():
+    return create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
+# ─── Lifespan (replaces deprecated @app.on_event) ───────────────────────────
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Startup: warm up both models so first user doesn't wait
+    print("🚀 Warming up models...")
+    get_model()   # BGE-small — ~2s
+    get_llm()     # Qwen3-4B  — ~30s on first boot
+    print("✅ Ready")
+    yield
+    # Shutdown: nothing needed, models unload with process
+app = FastAPI(title="RAG Backend", lifespan=lifespan)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],   # Restrict to your CF domain in production
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# ─── Upload ──────────────────────────────────────────────────────────────────
+@app.post("/upload")
+async def upload(
+    file: UploadFile,
+    user_id: str,
+    bg: BackgroundTasks,
+):
+    content = await file.read()
+    ok, msg = can_upload(user_id, len(content))
+    if not ok:
+        raise HTTPException(status_code=403, detail=msg)
+    tier    = get_user_tier(user_id)
+    expires = get_expiry(tier)
+    doc_id  = str(uuid.uuid4())
+    supa    = _supa()
+    # Store raw file in Supabase Storage
+    supa.storage.from_("documents").upload(
+        path=f"{user_id}/{doc_id}/{file.filename}",
+        file=content,
+        file_options={"content-type": file.content_type or "application/octet-stream"},
+    )
+    # Create doc metadata row
+    supa.table("documents").insert({
+        "id": doc_id,
+        "user_id": user_id,
+        "filename": file.filename,
+        "status": "processing",
+        "tier_at_upload": str(tier),
+        "expires_at": expires.isoformat(),
+    }).execute()
+    # Process in background (parse → chunk → embed → store)
+    bg.add_task(_process_doc, content, doc_id, user_id, expires, file.filename)
+    return {"doc_id": doc_id, "status": "processing", "expires_at": expires.isoformat()}
+async def _process_doc(content, doc_id, user_id, expires, filename):
+    supa = _supa()
+    try:
+        text   = parse_file(content, filename)
+        chunks = smart_chunk(text)
+        embeds = embed_chunks(chunks)
+        store_chunks(doc_id, user_id, chunks, embeds, expires)
+        supa.table("documents").update({"status": "ready", "chunk_count": len(chunks)}) \
+            .eq("id", doc_id).execute()
+    except Exception as e:
+        supa.table("documents").update({"status": "error", "error": str(e)}) \
+            .eq("id", doc_id).execute()
+# ─── Chat ────────────────────────────────────────────────────────────────────
+class ChatRequest(BaseModel):
+    doc_id:     str
+    query:      str
+    user_id:    str
+    session_id: str
+@app.post("/chat")
+async def chat(req: ChatRequest):
+    ok, msg = check_message_limit(req.user_id, req.session_id)
+    if not ok:
+        raise HTTPException(status_code=429, detail=msg)
+    tier    = get_user_tier(req.user_id)
+    expires = get_expiry(tier)
+    q_vec   = embed_query(req.query)
+    chunks  = similarity_search(req.doc_id, q_vec, top_k=5)
+    if not chunks:
+        raise HTTPException(status_code=404, detail="Document expired or not found.")
+    # Scholar tier gets Qwen3's thinking mode for deeper answers
+    use_thinking = (tier == Tier.SCHOLAR)
+    supa         = _supa()
+    full_resp: list[str] = []
+    # Save user message
+    supa.table("chat_history").insert({
+        "doc_id": req.doc_id,
+        "session_id": req.session_id,
+        "user_id": req.user_id,
+        "role": "user",
+        "content": req.query,
+        "expires_at": expires.isoformat(),
+    }).execute()
+    def generate():
+        for token in stream_answer(req.query, chunks, thinking_mode=use_thinking):
+            full_resp.append(token)
+            yield f"data: {token}\n\n"
+        # Persist assistant response after stream completes
+        supa.table("chat_history").insert({
+            "doc_id": req.doc_id,
+            "session_id": req.session_id,
+            "user_id": req.user_id,
+            "role": "assistant",
+            "content": "".join(full_resp),
+            "expires_at": expires.isoformat(),
+        }).execute()
+        yield "data: [DONE]\n\n"
+    return StreamingResponse(
+        generate(),
+        media_type="text/event-stream",
+        headers={"X-Accel-Buffering": "no"},   # disable nginx buffering
+    )
+# ─── Quiz ────────────────────────────────────────────────────────────────────
+class QuizRequest(BaseModel):
+    doc_id:  str
+    query:   str    # last question asked — use same context
+    user_id: str
+@app.post("/quiz")
+async def quiz(req: QuizRequest):
+    tier = get_user_tier(req.user_id)
+    if tier not in (Tier.SCHOLAR, Tier.PRO):
+        raise HTTPException(status_code=403, detail="Quiz mode requires Pro or Scholar plan.")
+    q_vec  = embed_query(req.query)
+    chunks = similarity_search(req.doc_id, q_vec, top_k=3)
+    if not chunks:
+        raise HTTPException(status_code=404, detail="Document not found or expired.")
+    questions = generate_quiz(chunks)
+    return {"questions": questions}
+# ─── Utility ─────────────────────────────────────────────────────────────────
+@app.get("/doc-status/{doc_id}")
+async def doc_status(doc_id: str):
+    supa   = _supa()
+    result = supa.table("documents").select("status,chunk_count,expires_at") \
+        .eq("id", doc_id).single().execute()
+    return result.data
+@app.get("/health")
+def health():
+    return {"status": "alive", "model": "Qwen3-4B-Instruct-Q4_K_M"}

generation/__init__.py ADDED Viewed

File without changes

generation/llm.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from model.loader import get_llm
+from typing import Generator
+SYSTEM_PROMPT = """You are a precise document study assistant by Md Tusar Akon.
+Answer ONLY from the provided context. Be concise and factual.
+If the answer is not in the context, say exactly: "I couldn't find that in your document."
+Never make up or infer information not present in the context."""
+def stream_answer(
+    query: str,
+    context_chunks: list[str],
+    thinking_mode: bool = False,
+) -> Generator[str, None, None]:
+    llm = get_llm()
+    context = "\n\n---\n\n".join(context_chunks)
+    # Qwen3 native thinking toggle — appended to user message
+    think_tag = "/think" if thinking_mode else "/no_think"
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {
+            "role": "user",
+            "content": f"Context:\n{context}\n\nQuestion: {query} {think_tag}",
+        },
+    ]
+    in_think_block = False
+    for chunk in llm.create_chat_completion(
+        messages=messages,
+        max_tokens=600,
+        temperature=0.2,
+        top_p=0.95,
+        top_k=20,
+        stream=True,
+    ):
+        delta = chunk["choices"][0]["delta"].get("content", "")
+        if not delta:
+            continue
+        # Strip <think>...</think> blocks from output stream
+        if "<think>" in delta:
+            in_think_block = True
+        if "</think>" in delta:
+            in_think_block = False
+            continue
+        if not in_think_block:
+            yield delta

generation/quiz.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from model.loader import get_llm
+import json
+import re
+QUIZ_PROMPT = """Based on the context below, generate exactly 3 multiple-choice quiz questions.
+Each question must test understanding of the content, not trivia.
+Context:
+{context}
+Respond ONLY with a JSON array, no markdown, no explanation:
+[
+  {{
+    "question": "...",
+    "options": ["A) ...", "B) ...", "C) ...", "D) ..."],
+    "answer": "A",
+    "explanation": "Brief explanation why"
+  }},
+  ...
+]"""
+def generate_quiz(context_chunks: list[str]) -> list[dict]:
+    llm = get_llm()
+    context = "\n\n".join(context_chunks[:3])  # Use top 3 chunks
+    messages = [
+        {
+            "role": "user",
+            "content": QUIZ_PROMPT.format(context=context) + " /no_think",
+        }
+    ]
+    result = llm.create_chat_completion(
+        messages=messages,
+        max_tokens=800,
+        temperature=0.4,
+        stream=False,
+    )
+    raw = result["choices"][0]["message"]["content"]
+    # Strip any accidental markdown fences
+    raw = re.sub(r"```json|```", "", raw).strip()
+    try:
+        questions = json.loads(raw)
+        return questions if isinstance(questions, list) else []
+    except json.JSONDecodeError:
+        return []

ingestion/__init__.py ADDED Viewed

File without changes

ingestion/chunker.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from langchain_text_splitters import RecursiveCharacterTextSplitter
+def smart_chunk(text: str, chunk_size: int = 512, overlap: int = 64) -> list[str]:
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=overlap,
+        separators=["\n\n", "\n", ".", "!", "?", " ", ""],
+        length_function=len,
+    )
+    return [c for c in splitter.split_text(text) if len(c.strip()) > 30]

ingestion/parser.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import io
+import pymupdf                      # pymupdf 1.25+ import (not fitz)
+from docx import Document
+def parse_file(content: bytes, filename: str) -> str:
+    fname = filename.lower()
+    if fname.endswith(".pdf"):
+        doc = pymupdf.open(stream=content, filetype="pdf")
+        pages = [page.get_text() for page in doc]
+        doc.close()
+        return "\n\n".join(pages)
+    if fname.endswith(".docx"):
+        doc = Document(io.BytesIO(content))
+        return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
+    if fname.endswith(".txt") or fname.endswith(".md"):
+        return content.decode("utf-8", errors="replace")
+    raise ValueError(f"Unsupported file type: {filename}")

model/__init__.py ADDED Viewed

File without changes

model/loader.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from llama_cpp import Llama
+from contextlib import asynccontextmanager
+_llm: Llama | None = None
+def get_llm() -> Llama:
+    global _llm
+    if _llm is None:
+        print("⏳ Loading Qwen3-4B-Instruct Q4_K_M...")
+        _llm = Llama.from_pretrained(
+            repo_id="Qwen/Qwen3-4B-GGUF",
+            filename="qwen3-4b-q4_k_m.gguf",
+            # Use jinja template embedded in GGUF — recommended for Qwen3
+            # avoids any chat_format string mismatch
+            chat_format=None,
+            n_ctx=8192,
+            n_threads=2,        # HF free CPU = 2 vCPUs
+            n_gpu_layers=0,     # CPU only
+            verbose=False,
+        )
+        print("✅ Qwen3-4B loaded and ready")
+    return _llm

persistence/__init__.py ADDED Viewed

File without changes

persistence/tier.py ADDED Viewed

	@@ -0,0 +1,90 @@

+from datetime import datetime, timedelta, UTC
+from enum import StrEnum
+from supabase import create_client
+import os
+def _client():
+    return create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
+class Tier(StrEnum):
+    FREE    = "free"
+    PRO     = "pro"
+    SCHOLAR = "scholar"
+TTL: dict[Tier, timedelta] = {
+    Tier.FREE:    timedelta(hours=3),
+    Tier.PRO:     timedelta(weeks=1),
+    Tier.SCHOLAR: timedelta(days=30),
+}
+FILE_LIMIT_MB: dict[Tier, int]  = {Tier.FREE: 5,  Tier.PRO: 25, Tier.SCHOLAR: 50}
+DOC_LIMIT: dict[Tier, int | None] = {Tier.FREE: 1,  Tier.PRO: 10, Tier.SCHOLAR: None}
+MSG_LIMIT: dict[Tier, int | None] = {Tier.FREE: 5,  Tier.PRO: 100, Tier.SCHOLAR: None}
+def get_user_tier(user_id: str) -> Tier:
+    r = _client().table("profiles").select("tier").eq("id", user_id).single().execute()
+    return Tier(r.data.get("tier", "free"))
+def get_expiry(tier: Tier) -> datetime:
+    return datetime.now(UTC) + TTL[tier]
+def can_upload(user_id: str, file_bytes: int) -> tuple[bool, str]:
+    tier = get_user_tier(user_id)
+    max_bytes = FILE_LIMIT_MB[tier] * 1024 * 1024
+    if file_bytes > max_bytes:
+        return False, f"File exceeds {FILE_LIMIT_MB[tier]}MB limit on {tier} plan."
+    max_docs = DOC_LIMIT[tier]
+    if max_docs is not None:
+        count = (
+            _client()
+            .table("documents")
+            .select("id", count="exact")
+            .eq("user_id", user_id)
+            .execute()
+            .count
+        )
+        if count >= max_docs:
+            return False, f"{tier.capitalize()} allows {max_docs} doc(s). Upgrade to store more."
+    return True, "ok"
+def check_message_limit(user_id: str, session_id: str) -> tuple[bool, str]:
+    tier = get_user_tier(user_id)
+    limit = MSG_LIMIT[tier]
+    if limit is None:
+        return True, "ok"
+    client = _client()
+    if tier == Tier.FREE:
+        count = (
+            client.table("chat_history")
+            .select("id", count="exact")
+            .eq("session_id", session_id)
+            .eq("role", "user")
+            .execute()
+            .count
+        )
+    else:
+        today = datetime.now(UTC).date().isoformat()
+        count = (
+            client.table("chat_history")
+            .select("id", count="exact")
+            .eq("user_id", user_id)
+            .gte("created_at", today)
+            .eq("role", "user")
+            .execute()
+            .count
+        )
+    if count >= limit:
+        return False, f"Message limit reached on {tier} plan. Upgrade to continue."
+    return True, "ok"

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+fastapi==0.129.0
+uvicorn[standard]==0.34.0
+llama-cpp-python==0.3.8
+sentence-transformers==4.1.0
+huggingface-hub==0.29.1
+supabase==2.13.0
+pymupdf==1.25.3
+python-docx==1.1.2
+langchain-text-splitters==0.3.8
+pydantic==2.11.0
+python-multipart==0.0.20
+httpx==0.28.1

retrieval/__init__.py ADDED Viewed

File without changes

retrieval/embedder.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from sentence_transformers import SentenceTransformer
+import numpy as np
+_model: SentenceTransformer | None = None
+def get_model() -> SentenceTransformer:
+    global _model
+    if _model is None:
+        # 130MB, 384-dim, fastest accurate model on CPU
+        _model = SentenceTransformer("BAAI/bge-small-en-v1.5")
+    return _model
+def embed_chunks(chunks: list[str]) -> list[list[float]]:
+    model = get_model()
+    vecs = model.encode(chunks, normalize_embeddings=True, batch_size=32)
+    return vecs.tolist()
+def embed_query(query: str) -> list[float]:
+    model = get_model()
+    # BGE needs this prefix for queries
+    prefixed = f"Represent this sentence for searching: {query}"
+    vec = model.encode(prefixed, normalize_embeddings=True)
+    return vec.tolist()

retrieval/vectorstore.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from supabase import create_client, Client
+from datetime import datetime
+import os
+def _client() -> Client:
+    return create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
+def store_chunks(
+    doc_id: str,
+    user_id: str,
+    chunks: list[str],
+    embeddings: list[list[float]],
+    expires_at: datetime,
+) -> None:
+    client = _client()
+    rows = [
+        {
+            "doc_id": doc_id,
+            "user_id": user_id,
+            "chunk_text": chunk,
+            "embedding": embedding,
+            "chunk_index": i,
+            "expires_at": expires_at.isoformat(),
+        }
+        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
+    ]
+    # Insert in batches of 100 to avoid payload limits
+    for i in range(0, len(rows), 100):
+        client.table("chunks").insert(rows[i : i + 100]).execute()
+def similarity_search(
+    doc_id: str,
+    query_embedding: list[float],
+    top_k: int = 5,
+) -> list[str]:
+    client = _client()
+    result = client.rpc(
+        "match_chunks",
+        {
+            "query_embedding": query_embedding,
+            "doc_id_filter": doc_id,
+            "match_count": top_k,
+        },
+    ).execute()
+    return [r["chunk_text"] for r in result.data]