Spaces:

tusarway
/

rag-backend

Running

App Files Files Community

imtrt004 committed on Feb 28

Commit

391fc60

1 Parent(s): e2cc6a2

feat: line number and multi docs

Browse files

Files changed (8) hide show

app.py +84 -29
generation/groq_llm.py +22 -4
generation/llm.py +23 -3
generation/quiz.py +27 -2
ingestion/chunker.py +51 -14
ingestion/parser.py +52 -7
persistence/tier.py +71 -1
retrieval/vectorstore.py +104 -16

app.py CHANGED Viewed

@@ -7,12 +7,16 @@ from supabase import create_client
 import uuid
 import os
 import json
 from model.loader import get_llm, get_model_name, is_llm_ready
 from retrieval.embedder import get_model, embed_chunks, embed_query
-from retrieval.vectorstore import store_chunks, similarity_search, get_all_chunks
-from ingestion.parser import parse_file
-from ingestion.chunker import smart_chunk
 from generation.llm import stream_answer
 from generation.quiz import generate_quiz
 from generation.groq_llm import stream_answer_groq, generate_quiz_groq
@@ -21,6 +25,8 @@ from persistence.tier import (
     get_expiry,
     can_upload,
     check_message_limit,
     Tier,
 )
 from persistence.queue import (
@@ -194,9 +200,9 @@ async def process_from_storage(
 async def _process_doc(content, doc_id, user_id, expires, filename):
     supa = _supa()
     try:
-        text   = parse_file(content, filename)
-        chunks = smart_chunk(text, filename=filename)
-        embeds = embed_chunks(chunks)
         store_chunks(doc_id, user_id, chunks, embeds, expires)
         supa.table("documents").update({"status": "ready", "chunk_count": len(chunks)}) \
             .eq("id", doc_id).execute()
@@ -208,7 +214,8 @@ async def _process_doc(content, doc_id, user_id, expires, filename):
 # ─── Chat ────────────────────────────────────────────────────────────────────
 class ChatRequest(BaseModel):
-    doc_id:       str
     query:        str
     user_id:      str
     session_id:   str
@@ -231,33 +238,71 @@ async def chat(req: ChatRequest):
     tier    = get_user_tier(req.user_id)
     expires = get_expiry(tier)
-    # DeepMind mode: only Pro and Scholar may activate it (needed before chunk fetch)
-    deepmind_allowed = tier in (Tier.PRO, Tier.SCHOLAR)
     use_deepmind     = req.use_deepmind and deepmind_allowed
     if use_deepmind:
-        # Groq models have 128k context — fetch every chunk in document order
-        chunks = get_all_chunks(req.doc_id)
     else:
         q_vec  = embed_query(req.query)
-        chunks = similarity_search(req.doc_id, q_vec, top_k=15)
     if not chunks:
         raise HTTPException(status_code=404, detail="Document expired or not found.")
     # Scholar tier gets thinking mode on the local model (ignored when DeepMind is on)
     use_thinking = (tier == Tier.SCHOLAR) and not use_deepmind
-    supa         = _supa()
     full_resp: list[str] = []
-    # Save user message
     supa.table("chat_history").insert({
-        "doc_id": req.doc_id,
-        "session_id": req.session_id,
-        "user_id": req.user_id,
-        "role": "user",
-        "content": req.query,
-        "expires_at": expires.isoformat(),
     }).execute()
     def generate():
@@ -268,24 +313,30 @@ async def chat(req: ChatRequest):
         )
         for token in token_iter:
             full_resp.append(token)
-            # JSON-encode so embedded newlines in tokens don't break SSE framing
             yield f"data: {json.dumps(token)}\n\n"
-        # Persist assistant response after stream completes
         supa.table("chat_history").insert({
-            "doc_id": req.doc_id,
-            "session_id": req.session_id,
-            "user_id": req.user_id,
-            "role": "assistant",
-            "content": "".join(full_resp),
-            "expires_at": expires.isoformat(),
         }).execute()
         yield "data: [DONE]\n\n"
     return StreamingResponse(
         generate(),
         media_type="text/event-stream",
-        headers={"X-Accel-Buffering": "no"},   # disable nginx buffering
     )
@@ -317,6 +368,10 @@ async def quiz(req: QuizRequest):
         questions = generate_quiz(chunks)
     return {"questions": questions}
 # ─── Utility ─────────────────────────────────────────────────────────────────

 import uuid
 import os
 import json
+from typing import Optional
 from model.loader import get_llm, get_model_name, is_llm_ready
 from retrieval.embedder import get_model, embed_chunks, embed_query
+from retrieval.vectorstore import (
+    store_chunks, similarity_search, similarity_search_multi,
+    get_all_chunks, get_all_chunks_multi,
+)
+from ingestion.parser import parse_file_pages, parse_file
+from ingestion.chunker import smart_chunk_pages, smart_chunk, ChunkMeta
 from generation.llm import stream_answer
 from generation.quiz import generate_quiz
 from generation.groq_llm import stream_answer_groq, generate_quiz_groq
     get_expiry,
     can_upload,
     check_message_limit,
+    check_deepmind_limit,
+    get_deepmind_usage,
     Tier,
 )
 from persistence.queue import (
 async def _process_doc(content, doc_id, user_id, expires, filename):
     supa = _supa()
     try:
+        pages  = parse_file_pages(content, filename)
+        chunks = smart_chunk_pages(pages, filename=filename)
+        embeds = embed_chunks([c.text for c in chunks])
         store_chunks(doc_id, user_id, chunks, embeds, expires)
         supa.table("documents").update({"status": "ready", "chunk_count": len(chunks)}) \
             .eq("id", doc_id).execute()
 # ─── Chat ────────────────────────────────────────────────────────────────────
 class ChatRequest(BaseModel):
+    doc_id:       str                    # primary document (required for backward compat)
+    doc_ids:      Optional[list[str]] = None  # additional / override doc list for multi-doc chat
     query:        str
     user_id:      str
     session_id:   str
     tier    = get_user_tier(req.user_id)
     expires = get_expiry(tier)
+    # ── DeepMind gate ─────────────────────────────────────────────────────────
+    deepmind_allowed = tier in (Tier.PRO, Tier.SCHOLAR, Tier.FREE)
     use_deepmind     = req.use_deepmind and deepmind_allowed
     if use_deepmind:
+        dm_ok, dm_msg = check_deepmind_limit(req.user_id)
+        if not dm_ok:
+            raise HTTPException(status_code=429, detail=dm_msg)
+    # ── Resolve document list ─────────────────────────────────────────────────
+    # Use doc_ids when provided (multi-doc), otherwise fall back to single doc_id
+    all_doc_ids: list[str] = req.doc_ids if req.doc_ids else [req.doc_id]
+    # ── Fetch filename map for citation display ───────────────────────────────
+    supa = _supa()
+    docs_result = supa.table("documents").select("id, filename") \
+        .in_("id", all_doc_ids).execute()
+    filename_map: dict[str, str] = {
+        d["id"]: d["filename"] for d in (docs_result.data or [])
+    }
+    # ── Retrieve chunks ───────────────────────────────────────────────────────
+    if use_deepmind:
+        # Groq has 128k context — fetch every chunk
+        if len(all_doc_ids) == 1:
+            chunks = get_all_chunks(all_doc_ids[0])
+        else:
+            chunks = get_all_chunks_multi(all_doc_ids)
     else:
         q_vec  = embed_query(req.query)
+        if len(all_doc_ids) == 1:
+            chunks = similarity_search(all_doc_ids[0], q_vec, top_k=15)
+        else:
+            chunks = similarity_search_multi(all_doc_ids, q_vec, top_k=20)
     if not chunks:
         raise HTTPException(status_code=404, detail="Document expired or not found.")
     # Scholar tier gets thinking mode on the local model (ignored when DeepMind is on)
     use_thinking = (tier == Tier.SCHOLAR) and not use_deepmind
     full_resp: list[str] = []
+    # ── Build citation map: source_N → {filename, page, doc_id, text} ────────
+    citation_map = {}
+    for i, chunk in enumerate(chunks, 1):
+        doc_id_chunk  = chunk.doc_id      if hasattr(chunk, "doc_id")      else req.doc_id
+        page_number   = chunk.page_number if hasattr(chunk, "page_number") else 1
+        chunk_text    = chunk.text        if hasattr(chunk, "text")        else str(chunk)
+        citation_map[str(i)] = {
+            "n":        i,
+            "doc_id":   doc_id_chunk,
+            "filename": filename_map.get(doc_id_chunk, "Document"),
+            "page":     page_number,
+            "text":     chunk_text[:400],   # snippet for tooltip/panel
+        }
+    # Save user message (primary doc_id for legacy compatibility)
     supa.table("chat_history").insert({
+        "doc_id":      req.doc_id,
+        "session_id":  req.session_id,
+        "user_id":     req.user_id,
+        "role":        "user",
+        "content":     req.query,
+        "expires_at":  expires.isoformat(),
+        "is_deepmind": use_deepmind,
     }).execute()
     def generate():
         )
         for token in token_iter:
             full_resp.append(token)
             yield f"data: {json.dumps(token)}\n\n"
+        # ── Emit citation map before [DONE] ───────────────────────────────────
+        citations_payload = json.dumps({
+            "__citations__": list(citation_map.values())
+        })
+        yield f"data: {citations_payload}\n\n"
+        # Persist assistant response
         supa.table("chat_history").insert({
+            "doc_id":      req.doc_id,
+            "session_id":  req.session_id,
+            "user_id":     req.user_id,
+            "role":        "assistant",
+            "content":     "".join(full_resp),
+            "expires_at":  expires.isoformat(),
+            "is_deepmind": use_deepmind,
         }).execute()
         yield "data: [DONE]\n\n"
     return StreamingResponse(
         generate(),
         media_type="text/event-stream",
+        headers={"X-Accel-Buffering": "no"},
     )
         questions = generate_quiz(chunks)
     return {"questions": questions}
@app.get("/deepmind-usage/{user_id}")
async def deepmind_usage(user_id: str):
    """Report a user's DeepMind usage for the current day.

    Delegates to ``get_deepmind_usage`` and returns its result unchanged:
    a dict with the keys ``used``, ``limit``, ``remaining`` and ``tier``.
    """
    usage = get_deepmind_usage(user_id)
    return usage
 # ─── Utility ─────────────────────────────────────────────────────────────────

generation/groq_llm.py CHANGED Viewed

@@ -111,9 +111,25 @@ DEFAULT_MODEL = os.environ.get("GROQ_MODEL", "llama-3.3-70b-versatile")
 SYSTEM_PROMPT = """You are a precise document study assistant by Md Tusar Akon.
 Answer ONLY from the provided context. Be concise and factual.
 If the answer is not in the context, say exactly: "I couldn't find that in your document."
 Never make up or infer information not present in the context."""
 QUIZ_PROMPT = """Based on the context below, generate exactly 10 multiple-choice quiz questions.
 Each question must test understanding of the content, not trivia.
@@ -201,7 +217,7 @@ def _inc(key_id: int) -> None:
 def stream_answer_groq(
     query: str,
-    context_chunks: list[str],
 ) -> Generator[str, None, None]:
     """Stream a Groq answer, auto-rotating keys on rate-limit errors."""
     try:
@@ -210,7 +226,7 @@ def stream_answer_groq(
         yield "DeepMind mode requires the `groq` package. Please contact support."
         return
-    context = "\n\n---\n\n".join(context_chunks)
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
         {"role": "user",   "content": f"Context:\n{context}\n\nQuestion: {query}"},
@@ -252,14 +268,16 @@ def stream_answer_groq(
 # ── Quiz generation ────────────────────────────────────────────────────────────
-def generate_quiz_groq(context_chunks: list[str]) -> list[dict]:
     """Generate 10 quiz questions via Groq API with key rotation."""
     try:
         from groq import Groq, RateLimitError  # type: ignore[import]
     except ImportError:
         return []
-    context = "\n\n".join(context_chunks[:5])
     messages = [{"role": "user", "content": QUIZ_PROMPT.format(context=context)}]
     for _attempt in range(_MAX_RETRIES):

 SYSTEM_PROMPT = """You are a precise document study assistant by Md Tusar Akon.
 Answer ONLY from the provided context. Be concise and factual.
+CRITICAL: Whenever you use information from the context, you MUST cite the source using the
+notation [[N]] (e.g., [[1]], [[2]]) immediately after the relevant sentence or phrase.
+Each source reference number N corresponds to the [Source N] header in the context below.
+Multiple citations are written as [[1]][[2]].
 If the answer is not in the context, say exactly: "I couldn't find that in your document."
 Never make up or infer information not present in the context."""
+def _build_context(chunks: list) -> str:
+    """Format chunks with numbered source headers for [Source N] citation notation."""
+    parts = []
+    for i, chunk in enumerate(chunks, 1):
+        text        = chunk.text        if hasattr(chunk, "text")        else str(chunk)
+        page_number = chunk.page_number if hasattr(chunk, "page_number") else 1
+        parts.append(f"[Source {i} \u2014 Page {page_number}]\n{text}")
+    return "\n\n---\n\n".join(parts)
 QUIZ_PROMPT = """Based on the context below, generate exactly 10 multiple-choice quiz questions.
 Each question must test understanding of the content, not trivia.
 def stream_answer_groq(
     query: str,
+    context_chunks: list,
 ) -> Generator[str, None, None]:
     """Stream a Groq answer, auto-rotating keys on rate-limit errors."""
     try:
         yield "DeepMind mode requires the `groq` package. Please contact support."
         return
+    context = _build_context(context_chunks)
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},
         {"role": "user",   "content": f"Context:\n{context}\n\nQuestion: {query}"},
 # ── Quiz generation ────────────────────────────────────────────────────────────
+def generate_quiz_groq(context_chunks: list) -> list[dict]:
     """Generate 10 quiz questions via Groq API with key rotation."""
     try:
         from groq import Groq, RateLimitError  # type: ignore[import]
     except ImportError:
         return []
+    context = "\n\n".join(
+        (c.text if hasattr(c, "text") else str(c)) for c in context_chunks[:5]
+    )
     messages = [{"role": "user", "content": QUIZ_PROMPT.format(context=context)}]
     for _attempt in range(_MAX_RETRIES):

generation/llm.py CHANGED Viewed

@@ -1,23 +1,43 @@
 import torch
 from model.loader import get_tokenizer, get_llm
 from transformers import TextIteratorStreamer
 from threading import Thread
-from typing import Generator
 SYSTEM_PROMPT = """You are a precise document study assistant by Md Tusar Akon.
 Answer ONLY from the provided context. Be concise and factual.
 If the answer is not in the context, say exactly: "I couldn't find that in your document."
 Never make up or infer information not present in the context."""
 def stream_answer(
     query: str,
-    context_chunks: list[str],
     thinking_mode: bool = False,
 ) -> Generator[str, None, None]:
     tokenizer = get_tokenizer()
     model = get_llm()
-    context = "\n\n---\n\n".join(context_chunks)
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},

+from __future__ import annotations
 import torch
 from model.loader import get_tokenizer, get_llm
 from transformers import TextIteratorStreamer
 from threading import Thread
+from typing import Generator, TYPE_CHECKING
+if TYPE_CHECKING:
+    from retrieval.vectorstore import ChunkResult
 SYSTEM_PROMPT = """You are a precise document study assistant by Md Tusar Akon.
 Answer ONLY from the provided context. Be concise and factual.
+CRITICAL: Whenever you use information from the context, you MUST cite the source using the
+notation [[N]] (e.g., [[1]], [[2]]) immediately after the relevant sentence or phrase.
+Each source reference number N corresponds to the [Source N] header in the context below.
+Multiple citations are written as [[1]][[2]].
 If the answer is not in the context, say exactly: "I couldn't find that in your document."
 Never make up or infer information not present in the context."""
+def _build_context(chunks: list) -> str:
+    """Format chunks into a numbered context block with source references."""
+    parts = []
+    for i, chunk in enumerate(chunks, 1):
+        text        = chunk.text        if hasattr(chunk, "text")        else str(chunk)
+        page_number = chunk.page_number if hasattr(chunk, "page_number") else 1
+        parts.append(f"[Source {i} — Page {page_number}]\n{text}")
+    return "\n\n---\n\n".join(parts)
 def stream_answer(
     query: str,
+    context_chunks: list,
     thinking_mode: bool = False,
 ) -> Generator[str, None, None]:
     tokenizer = get_tokenizer()
     model = get_llm()
+    context = _build_context(context_chunks)
     messages = [
         {"role": "system", "content": SYSTEM_PROMPT},

generation/quiz.py CHANGED Viewed

@@ -21,10 +21,12 @@ Respond ONLY with a JSON array, no markdown, no explanation:
 ]"""
-def generate_quiz(context_chunks: list[str]) -> list[dict]:
     tokenizer = get_tokenizer()
     model = get_llm()
-    context = "\n\n".join(context_chunks[:5])
     messages = [{"role": "user", "content": QUIZ_PROMPT.format(context=context)}]
     input_ids = tokenizer.apply_chat_template(
@@ -45,6 +47,29 @@ def generate_quiz(context_chunks: list[str]) -> list[dict]:
     raw = tokenizer.decode(new_tokens, skip_special_tokens=True)
     raw = re.sub(r"```json|```", "", raw).strip()
     try:
         questions = json.loads(raw)
         return questions if isinstance(questions, list) else []

 ]"""
+def generate_quiz(context_chunks: list) -> list[dict]:
     tokenizer = get_tokenizer()
     model = get_llm()
+    # Support both ChunkResult objects and plain strings
+    texts = [(c.text if hasattr(c, "text") else str(c)) for c in context_chunks[:5]]
+    context = "\n\n".join(texts)
     messages = [{"role": "user", "content": QUIZ_PROMPT.format(context=context)}]
     input_ids = tokenizer.apply_chat_template(
     raw = tokenizer.decode(new_tokens, skip_special_tokens=True)
     raw = re.sub(r"```json|```", "", raw).strip()
+    try:
+        questions = json.loads(raw)
+        return questions if isinstance(questions, list) else []
+    except json.JSONDecodeError:
+        return []
+    input_ids = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        return_tensors="pt",
+    )
+    with torch.no_grad():
+        output_ids = model.generate(
+            input_ids,
+            max_new_tokens=2048,
+            do_sample=False,          # greedy - faster on CPU
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    new_tokens = output_ids[0][input_ids.shape[-1]:]
+    raw = tokenizer.decode(new_tokens, skip_special_tokens=True)
+    raw = re.sub(r"```json|```", "", raw).strip()
     try:
         questions = json.loads(raw)
         return questions if isinstance(questions, list) else []

ingestion/chunker.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
 # Map file extension → LangChain Language enum for code-aware splitting
@@ -23,22 +24,58 @@ _EXT_TO_LANGUAGE: dict[str, Language] = {
 }
-def smart_chunk(text: str, chunk_size: int = 1024, overlap: int = 128,
-                filename: str = "") -> list[str]:
     ext = ("."+filename.lower().rsplit(".", 1)[-1]) if "." in filename else ""
     lang = _EXT_TO_LANGUAGE.get(ext)
     if lang is not None:
-        splitter = RecursiveCharacterTextSplitter.from_language(
-            language=lang,
-            chunk_size=chunk_size,
-            chunk_overlap=overlap,
-        )
-    else:
-        splitter = RecursiveCharacterTextSplitter(
-            chunk_size=chunk_size,
-            chunk_overlap=overlap,
-            separators=["\n\n", "\n", ".", "!", "?", " ", ""],
-            length_function=len,
         )
     return [c for c in splitter.split_text(text) if len(c.strip()) > 30]

+from __future__ import annotations
 from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
 # Map file extension → LangChain Language enum for code-aware splitting
 }
def _make_splitter(filename: str, chunk_size: int, overlap: int) -> RecursiveCharacterTextSplitter:
    """Build the text splitter that matches *filename*'s extension.

    The lower-cased extension is looked up in ``_EXT_TO_LANGUAGE``. A hit
    yields a language-aware splitter; a miss (or a filename without a dot)
    yields a generic splitter that breaks on paragraphs, lines, sentence
    punctuation, spaces and finally individual characters.
    """
    if "." in filename:
        ext = "." + filename.lower().rsplit(".", 1)[-1]
    else:
        ext = ""
    lang = _EXT_TO_LANGUAGE.get(ext)

    if lang is None:
        # No code-aware rules for this extension: use prose separators.
        return RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            separators=["\n\n", "\n", ".", "!", "?", " ", ""],
            length_function=len,
        )

    return RecursiveCharacterTextSplitter.from_language(
        language=lang,
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
+# ── ChunkMeta: a chunk of text with its origin page ──────────────────────────
class ChunkMeta:
    """A piece of chunked text tagged with the page it was taken from.

    ``str(chunk)`` returns the text and ``len(chunk)`` its character count,
    which keeps simple legacy call sites working. Instances are not ``str``
    subclasses, so APIs that require real strings need ``chunk.text``.
    """

    __slots__ = ("text", "page_number")

    def __init__(self, text: str, page_number: int):
        self.text = text
        self.page_number = page_number

    def __str__(self) -> str:
        return self.text

    def __repr__(self) -> str:
        # Only a 40-character preview so logs stay readable.
        preview = self.text[:40]
        return f"ChunkMeta(page={self.page_number}, text={preview!r})"

    def __len__(self) -> int:
        return len(self.text)
def smart_chunk_pages(
    pages: list[tuple[int, str]],
    chunk_size: int = 1024,
    overlap: int = 128,
    filename: str = "",
) -> list[ChunkMeta]:
    """Chunk page-tagged text while remembering each chunk's page.

    *pages* holds ``(page_number, page_text)`` tuples. Every page is split on
    its own, so a chunk never spans two pages and its page number is exact.
    Pieces whose stripped length is 30 characters or fewer are dropped.
    The result keeps the input page order, then the order of pieces within
    each page.
    """
    splitter = _make_splitter(filename, chunk_size, overlap)
    return [
        ChunkMeta(piece, page_num)
        for page_num, page_text in pages
        for piece in splitter.split_text(page_text)
        if len(piece.strip()) > 30
    ]
def smart_chunk(text: str, chunk_size: int = 1024, overlap: int = 128,
                filename: str = "") -> list[str]:
    """Legacy entry point: chunk *text* into plain strings, with no page info.

    Pieces whose stripped length is 30 characters or fewer are discarded.
    """
    pieces = _make_splitter(filename, chunk_size, overlap).split_text(text)
    return [piece for piece in pieces if len(piece.strip()) > 30]

ingestion/parser.py CHANGED Viewed

@@ -1,9 +1,12 @@
 import csv
 import io
 import json
 import pymupdf                      # pymupdf 1.25+ import (not fitz)
 from docx import Document
 # Plain-text and code extensions decoded as-is
 _TEXT_EXTENSIONS = {
@@ -28,6 +31,33 @@ _TEXT_EXTENSIONS = {
 }
 def _parse_csv(content: bytes) -> str:
     """Convert CSV to a readable pipe-delimited table."""
     text = content.decode("utf-8", errors="replace")
@@ -62,32 +92,47 @@ def _parse_ipynb(content: bytes) -> str:
     return "\n\n".join(parts)
-def parse_file(content: bytes, filename: str) -> str:
     fname = filename.lower()
     # ── PDF ──────────────────────────────────────────────────────────────────
     if fname.endswith(".pdf"):
         doc = pymupdf.open(stream=content, filetype="pdf")
-        pages = [page.get_text() for page in doc]
         doc.close()
-        return "\n\n".join(pages)
     # ── Word ─────────────────────────────────────────────────────────────────
     if fname.endswith(".docx"):
         doc = Document(io.BytesIO(content))
-        return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
     # ── CSV ──────────────────────────────────────────────────────────────────
     if fname.endswith(".csv"):
-        return _parse_csv(content)
     # ── Jupyter Notebook ─────────────────────────────────────────────────────
     if fname.endswith(".ipynb"):
-        return _parse_ipynb(content)
     # ── Plain text, markdown, RMD, and all code/config files ─────────────────
     ext = "." + fname.rsplit(".", 1)[-1] if "." in fname else ""
     if ext in _TEXT_EXTENSIONS:
-        return content.decode("utf-8", errors="replace")
     raise ValueError(f"Unsupported file type: {filename}")

 import csv
 import io
 import json
+import math
 import pymupdf                      # pymupdf 1.25+ import (not fitz)
 from docx import Document
+# Approximate characters per "page" used when splitting non-PDF content.
+_CHARS_PER_PAGE = 3_000
 # Plain-text and code extensions decoded as-is
 _TEXT_EXTENSIONS = {
 }
+def _assign_pages(text: str) -> list[tuple[int, str]]:
+    """Split flat text into virtual pages of ~_CHARS_PER_PAGE characters each.
+    Returns a list of (page_number, chunk_text) tuples starting from page 1.
+    Splitting is done on paragraph boundaries so words are never cut.
+    """
+    paragraphs = text.split("\n\n")
+    pages: list[tuple[int, str]] = []
+    current_page = 1
+    current_chars = 0
+    current_parts: list[str] = []
+    for para in paragraphs:
+        if current_chars + len(para) > _CHARS_PER_PAGE and current_parts:
+            pages.append((current_page, "\n\n".join(current_parts)))
+            current_page += 1
+            current_parts = []
+            current_chars = 0
+        current_parts.append(para)
+        current_chars += len(para) + 2  # +2 for the "\n\n" separator
+    if current_parts:
+        pages.append((current_page, "\n\n".join(current_parts)))
+    return pages if pages else [(1, text)]
 def _parse_csv(content: bytes) -> str:
     """Convert CSV to a readable pipe-delimited table."""
     text = content.decode("utf-8", errors="replace")
     return "\n\n".join(parts)
+# ── Public API ────────────────────────────────────────────────────────────────
def parse_file_pages(content: bytes, filename: str) -> list[tuple[int, str]]:
    """Parse a file and return a list of (page_number, text) tuples.

    For PDFs each physical page maps to one tuple, keeping its 1-based
    position in the document; pages with no text are dropped.
    For all other formats the extracted text is divided into approximate
    pages by ``_assign_pages``.

    Args:
        content: Raw bytes of the uploaded file.
        filename: Original file name; only its extension is used, compared
            case-insensitively.

    Raises:
        ValueError: If the extension is not a supported type.
    """
    fname = filename.lower()

    # ── PDF ──────────────────────────────────────────────────────────────────
    if fname.endswith(".pdf"):
        doc = pymupdf.open(stream=content, filetype="pdf")
        try:
            pages = [(number, page.get_text()) for number, page in enumerate(doc, 1)]
        finally:
            # Close the document even when text extraction fails part-way.
            doc.close()
        # Filter out blank pages; keep one empty page so callers always get a list entry
        return [(p, t) for p, t in pages if t.strip()] or [(1, "")]

    # ── Word ─────────────────────────────────────────────────────────────────
    if fname.endswith(".docx"):
        doc = Document(io.BytesIO(content))
        text = "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
        return _assign_pages(text)

    # ── CSV ──────────────────────────────────────────────────────────────────
    if fname.endswith(".csv"):
        return _assign_pages(_parse_csv(content))

    # ── Jupyter Notebook ─────────────────────────────────────────────────────
    if fname.endswith(".ipynb"):
        return _assign_pages(_parse_ipynb(content))

    # ── Plain text, markdown, RMD, and all code/config files ─────────────────
    ext = "." + fname.rsplit(".", 1)[-1] if "." in fname else ""
    if ext in _TEXT_EXTENSIONS:
        # Undecodable bytes are replaced rather than aborting the upload.
        return _assign_pages(content.decode("utf-8", errors="replace"))

    raise ValueError(f"Unsupported file type: {filename}")
def parse_file(content: bytes, filename: str) -> str:
    """Legacy entry point: return the whole document as one string.

    Parses via ``parse_file_pages`` and joins the page texts with a blank
    line, discarding the page numbers.
    """
    page_texts = [
        page_text for _page_no, page_text in parse_file_pages(content, filename)
    ]
    return "\n\n".join(page_texts)

persistence/tier.py CHANGED Viewed

@@ -22,7 +22,16 @@ TTL: dict[Tier, timedelta] = {
 FILE_LIMIT_MB: dict[Tier, int]  = {Tier.FREE: 5,  Tier.PRO: 25, Tier.SCHOLAR: 50}
 DOC_LIMIT: dict[Tier, int | None] = {Tier.FREE: 1,  Tier.PRO: 10, Tier.SCHOLAR: None}
-MSG_LIMIT: dict[Tier, int | None] = {Tier.FREE: 5,  Tier.PRO: 100, Tier.SCHOLAR: None}
 def get_user_tier(user_id: str) -> Tier:
@@ -75,6 +84,7 @@ def check_message_limit(user_id: str, session_id: str) -> tuple[bool, str]:
     client = _client()
     if tier == Tier.FREE:
         count = (
             client.table("chat_history")
             .select("id", count="exact")
@@ -84,6 +94,7 @@ def check_message_limit(user_id: str, session_id: str) -> tuple[bool, str]:
             .count
         )
     else:
         today = datetime.now(UTC).date().isoformat()
         count = (
             client.table("chat_history")
@@ -98,3 +109,62 @@ def check_message_limit(user_id: str, session_id: str) -> tuple[bool, str]:
     if count >= limit:
         return False, f"Message limit reached on {tier} plan. Upgrade to continue."
     return True, "ok"

 FILE_LIMIT_MB: dict[Tier, int]  = {Tier.FREE: 5,  Tier.PRO: 25, Tier.SCHOLAR: 50}
 DOC_LIMIT: dict[Tier, int | None] = {Tier.FREE: 1,  Tier.PRO: 10, Tier.SCHOLAR: None}
+# General message limits (per session for FREE, per day for paid)
+MSG_LIMIT: dict[Tier, int | None] = {Tier.FREE: 5,  Tier.PRO: 200, Tier.SCHOLAR: None}
+# DeepMind (Groq) daily message limits — separate quota from general messages
+DEEPMIND_LIMIT: dict[Tier, int | None] = {
+    Tier.FREE:    5,    # Free users get 5 DeepMind messages/day
+    Tier.PRO:     200,  # Pro users get 200 DeepMind messages/day
+    Tier.SCHOLAR: 500,  # Scholar users get 500 DeepMind messages/day
+}
 def get_user_tier(user_id: str) -> Tier:
     client = _client()
     if tier == Tier.FREE:
+        # FREE: count per session
         count = (
             client.table("chat_history")
             .select("id", count="exact")
             .count
         )
     else:
+        # Paid: count per day (UTC)
         today = datetime.now(UTC).date().isoformat()
         count = (
             client.table("chat_history")
     if count >= limit:
         return False, f"Message limit reached on {tier} plan. Upgrade to continue."
     return True, "ok"
def check_deepmind_limit(user_id: str) -> tuple[bool, str]:
    """Check whether the user may send another DeepMind (Groq) message today.

    Usage is the number of ``chat_history`` rows for this user with
    ``role == "user"`` and ``is_deepmind`` set, created on or after the
    start of the current UTC calendar day. The allowance for the user's
    tier comes from ``DEEPMIND_LIMIT``; a limit of ``None`` means unlimited.

    Returns:
        ``(True, "ok")`` when the message is allowed, otherwise ``(False,
        message)`` with a user-facing explanation of the limit.
    """
    tier  = get_user_tier(user_id)
    limit = DEEPMIND_LIMIT[tier]
    if limit is None:
        return True, "ok"

    today = datetime.now(UTC).date().isoformat()
    count = (
        _client()
        .table("chat_history")
        .select("id", count="exact")
        .eq("user_id", user_id)
        .eq("role", "user")
        .eq("is_deepmind", True)
        .gte("created_at", today)
        .execute()
        .count
    ) or 0  # a missing count (None) is treated as zero, as in get_deepmind_usage

    if count >= limit:
        tier_label = tier.capitalize()
        return False, (
            f"DeepMind daily limit reached ({limit} messages/day on {tier_label} plan). "
            "Resets at midnight UTC."
        )
    return True, "ok"
def get_deepmind_usage(user_id: str) -> dict:
    """Return DeepMind usage stats for today: {used, limit, remaining, tier}.

    ``used`` counts the user's ``chat_history`` rows with ``role == "user"``
    and ``is_deepmind`` set, created on or after the start of the current
    UTC calendar day. ``limit`` is the tier's entry in ``DEEPMIND_LIMIT``.
    ``remaining`` is ``None`` when the limit is ``None`` and is otherwise
    never negative. ``tier`` is the tier rendered with ``str()``.
    """
    tier  = get_user_tier(user_id)
    limit = DEEPMIND_LIMIT[tier]
    today = datetime.now(UTC).date().isoformat()
    used = (
        _client()
        .table("chat_history")
        .select("id", count="exact")
        .eq("user_id", user_id)
        .eq("role", "user")
        .eq("is_deepmind", True)
        .gte("created_at", today)
        .execute()
        .count
    ) or 0

    # Clamp at zero: recorded usage can exceed the limit (for example when
    # the user's tier, and with it the limit, changed during the day), and a
    # negative "remaining" is meaningless to clients.
    remaining = None if limit is None else max(0, limit - used)
    return {
        "used":      used,
        "limit":     limit,
        "remaining": remaining,
        "tier":      str(tier),
    }

retrieval/vectorstore.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from supabase import create_client, Client
 from datetime import datetime
 import os
@@ -7,25 +8,44 @@ def _client() -> Client:
     return create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
 def store_chunks(
     doc_id: str,
     user_id: str,
-    chunks: list[str],
     embeddings: list[list[float]],
     expires_at: datetime,
 ) -> None:
     client = _client()
-    rows = [
-        {
-            "doc_id": doc_id,
-            "user_id": user_id,
-            "chunk_text": chunk,
-            "embedding": embedding,
             "chunk_index": i,
-            "expires_at": expires_at.isoformat(),
-        }
-        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
-    ]
     # Insert in batches of 100 to avoid payload limits
     for i in range(0, len(rows), 100):
         client.table("chunks").insert(rows[i : i + 100]).execute()
@@ -35,7 +55,8 @@ def similarity_search(
     doc_id: str,
     query_embedding: list[float],
     top_k: int = 5,
-) -> list[str]:
     client = _client()
     result = client.rpc(
         "match_chunks",
@@ -45,17 +66,84 @@ def similarity_search(
             "match_count": top_k,
         },
     ).execute()
-    return [r["chunk_text"] for r in result.data]
-def get_all_chunks(doc_id: str) -> list[str]:
     """Return every chunk for a document in order, for full-context retrieval."""
     client = _client()
     result = (
         client.table("chunks")
-        .select("chunk_text")
         .eq("doc_id", doc_id)
         .order("chunk_index", desc=False)
         .execute()
     )
-    return [r["chunk_text"] for r in result.data]

+from __future__ import annotations
 from supabase import create_client, Client
 from datetime import datetime
 import os
     return create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
+# ── ChunkResult: rich return type for similarity search ──────────────────────
+class ChunkResult:
+    """Holds chunk text, its page of origin, and source document."""
+    __slots__ = ("text", "page_number", "doc_id")
+    def __init__(self, text: str, page_number: int, doc_id: str):
+        self.text        = text
+        self.page_number = page_number
+        self.doc_id      = doc_id
+    # Behaves like a plain string so old code that does `"\n".join(chunks)` still works
+    def __str__(self)  -> str: return self.text
+    def __repr__(self) -> str: return f"ChunkResult(doc={self.doc_id[:8]}, page={self.page_number})"
 def store_chunks(
     doc_id: str,
     user_id: str,
+    chunks,                         # list[ChunkMeta] or list[str]
     embeddings: list[list[float]],
     expires_at: datetime,
 ) -> None:
     client = _client()
+    rows = []
+    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
+        # Support both ChunkMeta objects (with .text/.page_number) and plain strings
+        text        = chunk.text        if hasattr(chunk, "text")        else str(chunk)
+        page_number = chunk.page_number if hasattr(chunk, "page_number") else 1
+        rows.append({
+            "doc_id":      doc_id,
+            "user_id":     user_id,
+            "chunk_text":  text,
+            "embedding":   embedding,
             "chunk_index": i,
+            "page_number": page_number,
+            "expires_at":  expires_at.isoformat(),
+        })
     # Insert in batches of 100 to avoid payload limits
     for i in range(0, len(rows), 100):
         client.table("chunks").insert(rows[i : i + 100]).execute()
     doc_id: str,
     query_embedding: list[float],
     top_k: int = 5,
+) -> list[ChunkResult]:
+    """Search a single document and return rich ChunkResult objects."""
     client = _client()
     result = client.rpc(
         "match_chunks",
             "match_count": top_k,
         },
     ).execute()
+    return [
+        ChunkResult(
+            text        = r["chunk_text"],
+            page_number = r.get("page_number", 1),
+            doc_id      = str(r.get("doc_id", doc_id)),
+        )
+        for r in result.data
+    ]
+def similarity_search_multi(
+    doc_ids: list[str],
+    query_embedding: list[float],
+    top_k: int = 20,
+) -> list[ChunkResult]:
+    """Search across multiple documents and return rich ChunkResult objects."""
+    if not doc_ids:
+        return []
+    if len(doc_ids) == 1:
+        return similarity_search(doc_ids[0], query_embedding, top_k)
+    client = _client()
+    result = client.rpc(
+        "match_chunks_multi",
+        {
+            "query_embedding": query_embedding,
+            "doc_ids_filter":  doc_ids,
+            "match_count":     top_k,
+        },
+    ).execute()
+    return [
+        ChunkResult(
+            text        = r["chunk_text"],
+            page_number = r.get("page_number", 1),
+            doc_id      = str(r["doc_id"]),
+        )
+        for r in result.data
+    ]
+def get_all_chunks(doc_id: str) -> list[ChunkResult]:
     """Return every chunk for a document in order, for full-context retrieval."""
     client = _client()
     result = (
         client.table("chunks")
+        .select("chunk_text, page_number, doc_id")
         .eq("doc_id", doc_id)
         .order("chunk_index", desc=False)
         .execute()
     )
+    return [
+        ChunkResult(
+            text        = r["chunk_text"],
+            page_number = r.get("page_number", 1),
+            doc_id      = str(r.get("doc_id", doc_id)),
+        )
+        for r in result.data
+    ]
+def get_all_chunks_multi(doc_ids: list[str]) -> list[ChunkResult]:
+    """Return all chunks for multiple documents in document+chunk order."""
+    if not doc_ids:
+        return []
+    client = _client()
+    result = (
+        client.table("chunks")
+        .select("chunk_text, page_number, doc_id, chunk_index")
+        .in_("doc_id", doc_ids)
+        .order("doc_id", desc=False)
+        .order("chunk_index", desc=False)
+        .execute()
+    )
+    return [
+        ChunkResult(
+            text        = r["chunk_text"],
+            page_number = r.get("page_number", 1),
+            doc_id      = str(r["doc_id"]),
+        )
+        for r in result.data
+    ]