imtrt004 commited on
Commit
391fc60
Β·
1 Parent(s): e2cc6a2

feat: line number and multi docs

Browse files
app.py CHANGED
@@ -7,12 +7,16 @@ from supabase import create_client
7
  import uuid
8
  import os
9
  import json
 
10
 
11
  from model.loader import get_llm, get_model_name, is_llm_ready
12
  from retrieval.embedder import get_model, embed_chunks, embed_query
13
- from retrieval.vectorstore import store_chunks, similarity_search, get_all_chunks
14
- from ingestion.parser import parse_file
15
- from ingestion.chunker import smart_chunk
 
 
 
16
  from generation.llm import stream_answer
17
  from generation.quiz import generate_quiz
18
  from generation.groq_llm import stream_answer_groq, generate_quiz_groq
@@ -21,6 +25,8 @@ from persistence.tier import (
21
  get_expiry,
22
  can_upload,
23
  check_message_limit,
 
 
24
  Tier,
25
  )
26
  from persistence.queue import (
@@ -194,9 +200,9 @@ async def process_from_storage(
194
  async def _process_doc(content, doc_id, user_id, expires, filename):
195
  supa = _supa()
196
  try:
197
- text = parse_file(content, filename)
198
- chunks = smart_chunk(text, filename=filename)
199
- embeds = embed_chunks(chunks)
200
  store_chunks(doc_id, user_id, chunks, embeds, expires)
201
  supa.table("documents").update({"status": "ready", "chunk_count": len(chunks)}) \
202
  .eq("id", doc_id).execute()
@@ -208,7 +214,8 @@ async def _process_doc(content, doc_id, user_id, expires, filename):
208
  # ─── Chat ────────────────────────────────────────────────────────────────────
209
 
210
  class ChatRequest(BaseModel):
211
- doc_id: str
 
212
  query: str
213
  user_id: str
214
  session_id: str
@@ -231,33 +238,71 @@ async def chat(req: ChatRequest):
231
  tier = get_user_tier(req.user_id)
232
  expires = get_expiry(tier)
233
 
234
- # DeepMind mode: only Pro and Scholar may activate it (needed before chunk fetch)
235
- deepmind_allowed = tier in (Tier.PRO, Tier.SCHOLAR)
236
  use_deepmind = req.use_deepmind and deepmind_allowed
237
 
238
  if use_deepmind:
239
- # Groq models have 128k context β€” fetch every chunk in document order
240
- chunks = get_all_chunks(req.doc_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  else:
242
  q_vec = embed_query(req.query)
243
- chunks = similarity_search(req.doc_id, q_vec, top_k=15)
 
 
 
244
 
245
  if not chunks:
246
  raise HTTPException(status_code=404, detail="Document expired or not found.")
247
 
248
  # Scholar tier gets thinking mode on the local model (ignored when DeepMind is on)
249
  use_thinking = (tier == Tier.SCHOLAR) and not use_deepmind
250
- supa = _supa()
251
  full_resp: list[str] = []
252
 
253
- # Save user message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  supa.table("chat_history").insert({
255
- "doc_id": req.doc_id,
256
- "session_id": req.session_id,
257
- "user_id": req.user_id,
258
- "role": "user",
259
- "content": req.query,
260
- "expires_at": expires.isoformat(),
 
261
  }).execute()
262
 
263
  def generate():
@@ -268,24 +313,30 @@ async def chat(req: ChatRequest):
268
  )
269
  for token in token_iter:
270
  full_resp.append(token)
271
- # JSON-encode so embedded newlines in tokens don't break SSE framing
272
  yield f"data: {json.dumps(token)}\n\n"
273
 
274
- # Persist assistant response after stream completes
 
 
 
 
 
 
275
  supa.table("chat_history").insert({
276
- "doc_id": req.doc_id,
277
- "session_id": req.session_id,
278
- "user_id": req.user_id,
279
- "role": "assistant",
280
- "content": "".join(full_resp),
281
- "expires_at": expires.isoformat(),
 
282
  }).execute()
283
  yield "data: [DONE]\n\n"
284
 
285
  return StreamingResponse(
286
  generate(),
287
  media_type="text/event-stream",
288
- headers={"X-Accel-Buffering": "no"}, # disable nginx buffering
289
  )
290
 
291
 
@@ -317,6 +368,10 @@ async def quiz(req: QuizRequest):
317
  questions = generate_quiz(chunks)
318
  return {"questions": questions}
319
 
 
 
 
 
320
 
321
  # ─── Utility ─────────────────────────────────────────────────────────────────
322
 
 
7
  import uuid
8
  import os
9
  import json
10
+ from typing import Optional
11
 
12
  from model.loader import get_llm, get_model_name, is_llm_ready
13
  from retrieval.embedder import get_model, embed_chunks, embed_query
14
+ from retrieval.vectorstore import (
15
+ store_chunks, similarity_search, similarity_search_multi,
16
+ get_all_chunks, get_all_chunks_multi,
17
+ )
18
+ from ingestion.parser import parse_file_pages, parse_file
19
+ from ingestion.chunker import smart_chunk_pages, smart_chunk, ChunkMeta
20
  from generation.llm import stream_answer
21
  from generation.quiz import generate_quiz
22
  from generation.groq_llm import stream_answer_groq, generate_quiz_groq
 
25
  get_expiry,
26
  can_upload,
27
  check_message_limit,
28
+ check_deepmind_limit,
29
+ get_deepmind_usage,
30
  Tier,
31
  )
32
  from persistence.queue import (
 
200
  async def _process_doc(content, doc_id, user_id, expires, filename):
201
  supa = _supa()
202
  try:
203
+ pages = parse_file_pages(content, filename)
204
+ chunks = smart_chunk_pages(pages, filename=filename)
205
+ embeds = embed_chunks([c.text for c in chunks])
206
  store_chunks(doc_id, user_id, chunks, embeds, expires)
207
  supa.table("documents").update({"status": "ready", "chunk_count": len(chunks)}) \
208
  .eq("id", doc_id).execute()
 
214
  # ─── Chat ────────────────────────────────────────────────────────────────────
215
 
216
  class ChatRequest(BaseModel):
217
+ doc_id: str # primary document (required for backward compat)
218
+ doc_ids: Optional[list[str]] = None # additional / override doc list for multi-doc chat
219
  query: str
220
  user_id: str
221
  session_id: str
 
238
  tier = get_user_tier(req.user_id)
239
  expires = get_expiry(tier)
240
 
241
+ # ── DeepMind gate ─────────────────────────────────────────────────────────
242
+ deepmind_allowed = tier in (Tier.PRO, Tier.SCHOLAR, Tier.FREE)
243
  use_deepmind = req.use_deepmind and deepmind_allowed
244
 
245
  if use_deepmind:
246
+ dm_ok, dm_msg = check_deepmind_limit(req.user_id)
247
+ if not dm_ok:
248
+ raise HTTPException(status_code=429, detail=dm_msg)
249
+
250
+ # ── Resolve document list ─────────────────────────────────────────────────
251
+ # Use doc_ids when provided (multi-doc), otherwise fall back to single doc_id
252
+ all_doc_ids: list[str] = req.doc_ids if req.doc_ids else [req.doc_id]
253
+
254
+ # ── Fetch filename map for citation display ───────────────────────────────
255
+ supa = _supa()
256
+ docs_result = supa.table("documents").select("id, filename") \
257
+ .in_("id", all_doc_ids).execute()
258
+ filename_map: dict[str, str] = {
259
+ d["id"]: d["filename"] for d in (docs_result.data or [])
260
+ }
261
+
262
+ # ── Retrieve chunks ───────────────────────────────────────────────────────
263
+ if use_deepmind:
264
+ # Groq has 128k context β€” fetch every chunk
265
+ if len(all_doc_ids) == 1:
266
+ chunks = get_all_chunks(all_doc_ids[0])
267
+ else:
268
+ chunks = get_all_chunks_multi(all_doc_ids)
269
  else:
270
  q_vec = embed_query(req.query)
271
+ if len(all_doc_ids) == 1:
272
+ chunks = similarity_search(all_doc_ids[0], q_vec, top_k=15)
273
+ else:
274
+ chunks = similarity_search_multi(all_doc_ids, q_vec, top_k=20)
275
 
276
  if not chunks:
277
  raise HTTPException(status_code=404, detail="Document expired or not found.")
278
 
279
  # Scholar tier gets thinking mode on the local model (ignored when DeepMind is on)
280
  use_thinking = (tier == Tier.SCHOLAR) and not use_deepmind
 
281
  full_resp: list[str] = []
282
 
283
+ # ── Build citation map: source_N β†’ {filename, page, doc_id, text} ────────
284
+ citation_map = {}
285
+ for i, chunk in enumerate(chunks, 1):
286
+ doc_id_chunk = chunk.doc_id if hasattr(chunk, "doc_id") else req.doc_id
287
+ page_number = chunk.page_number if hasattr(chunk, "page_number") else 1
288
+ chunk_text = chunk.text if hasattr(chunk, "text") else str(chunk)
289
+ citation_map[str(i)] = {
290
+ "n": i,
291
+ "doc_id": doc_id_chunk,
292
+ "filename": filename_map.get(doc_id_chunk, "Document"),
293
+ "page": page_number,
294
+ "text": chunk_text[:400], # snippet for tooltip/panel
295
+ }
296
+
297
+ # Save user message (primary doc_id for legacy compatibility)
298
  supa.table("chat_history").insert({
299
+ "doc_id": req.doc_id,
300
+ "session_id": req.session_id,
301
+ "user_id": req.user_id,
302
+ "role": "user",
303
+ "content": req.query,
304
+ "expires_at": expires.isoformat(),
305
+ "is_deepmind": use_deepmind,
306
  }).execute()
307
 
308
  def generate():
 
313
  )
314
  for token in token_iter:
315
  full_resp.append(token)
 
316
  yield f"data: {json.dumps(token)}\n\n"
317
 
318
+ # ── Emit citation map before [DONE] ───────────────────────────────────
319
+ citations_payload = json.dumps({
320
+ "__citations__": list(citation_map.values())
321
+ })
322
+ yield f"data: {citations_payload}\n\n"
323
+
324
+ # Persist assistant response
325
  supa.table("chat_history").insert({
326
+ "doc_id": req.doc_id,
327
+ "session_id": req.session_id,
328
+ "user_id": req.user_id,
329
+ "role": "assistant",
330
+ "content": "".join(full_resp),
331
+ "expires_at": expires.isoformat(),
332
+ "is_deepmind": use_deepmind,
333
  }).execute()
334
  yield "data: [DONE]\n\n"
335
 
336
  return StreamingResponse(
337
  generate(),
338
  media_type="text/event-stream",
339
+ headers={"X-Accel-Buffering": "no"},
340
  )
341
 
342
 
 
368
  questions = generate_quiz(chunks)
369
  return {"questions": questions}
370
 
371
+ @app.get("/deepmind-usage/{user_id}")
372
+ async def deepmind_usage(user_id: str):
373
+ """Return DeepMind daily usage stats for a user: {used, limit, remaining, tier}."""
374
+ return get_deepmind_usage(user_id)
375
 
376
  # ─── Utility ─────────────────────────────────────────────────────────────────
377
 
generation/groq_llm.py CHANGED
@@ -111,9 +111,25 @@ DEFAULT_MODEL = os.environ.get("GROQ_MODEL", "llama-3.3-70b-versatile")
111
 
112
  SYSTEM_PROMPT = """You are a precise document study assistant by Md Tusar Akon.
113
  Answer ONLY from the provided context. Be concise and factual.
 
 
 
 
 
 
114
  If the answer is not in the context, say exactly: "I couldn't find that in your document."
115
  Never make up or infer information not present in the context."""
116
 
 
 
 
 
 
 
 
 
 
 
117
  QUIZ_PROMPT = """Based on the context below, generate exactly 10 multiple-choice quiz questions.
118
  Each question must test understanding of the content, not trivia.
119
 
@@ -201,7 +217,7 @@ def _inc(key_id: int) -> None:
201
 
202
  def stream_answer_groq(
203
  query: str,
204
- context_chunks: list[str],
205
  ) -> Generator[str, None, None]:
206
  """Stream a Groq answer, auto-rotating keys on rate-limit errors."""
207
  try:
@@ -210,7 +226,7 @@ def stream_answer_groq(
210
  yield "DeepMind mode requires the `groq` package. Please contact support."
211
  return
212
 
213
- context = "\n\n---\n\n".join(context_chunks)
214
  messages = [
215
  {"role": "system", "content": SYSTEM_PROMPT},
216
  {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
@@ -252,14 +268,16 @@ def stream_answer_groq(
252
 
253
  # ── Quiz generation ────────────────────────────────────────────────────────────
254
 
255
- def generate_quiz_groq(context_chunks: list[str]) -> list[dict]:
256
  """Generate 10 quiz questions via Groq API with key rotation."""
257
  try:
258
  from groq import Groq, RateLimitError # type: ignore[import]
259
  except ImportError:
260
  return []
261
 
262
- context = "\n\n".join(context_chunks[:5])
 
 
263
  messages = [{"role": "user", "content": QUIZ_PROMPT.format(context=context)}]
264
 
265
  for _attempt in range(_MAX_RETRIES):
 
111
 
112
  SYSTEM_PROMPT = """You are a precise document study assistant by Md Tusar Akon.
113
  Answer ONLY from the provided context. Be concise and factual.
114
+
115
+ CRITICAL: Whenever you use information from the context, you MUST cite the source using the
116
+ notation [[N]] (e.g., [[1]], [[2]]) immediately after the relevant sentence or phrase.
117
+ Each source reference number N corresponds to the [Source N] header in the context below.
118
+ Multiple citations are written as [[1]][[2]].
119
+
120
  If the answer is not in the context, say exactly: "I couldn't find that in your document."
121
  Never make up or infer information not present in the context."""
122
 
123
+
124
+ def _build_context(chunks: list) -> str:
125
+ """Format chunks with numbered source headers for [Source N] citation notation."""
126
+ parts = []
127
+ for i, chunk in enumerate(chunks, 1):
128
+ text = chunk.text if hasattr(chunk, "text") else str(chunk)
129
+ page_number = chunk.page_number if hasattr(chunk, "page_number") else 1
130
+ parts.append(f"[Source {i} \u2014 Page {page_number}]\n{text}")
131
+ return "\n\n---\n\n".join(parts)
132
+
133
  QUIZ_PROMPT = """Based on the context below, generate exactly 10 multiple-choice quiz questions.
134
  Each question must test understanding of the content, not trivia.
135
 
 
217
 
218
  def stream_answer_groq(
219
  query: str,
220
+ context_chunks: list,
221
  ) -> Generator[str, None, None]:
222
  """Stream a Groq answer, auto-rotating keys on rate-limit errors."""
223
  try:
 
226
  yield "DeepMind mode requires the `groq` package. Please contact support."
227
  return
228
 
229
+ context = _build_context(context_chunks)
230
  messages = [
231
  {"role": "system", "content": SYSTEM_PROMPT},
232
  {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
 
268
 
269
  # ── Quiz generation ────────────────────────────────────────────────────────────
270
 
271
+ def generate_quiz_groq(context_chunks: list) -> list[dict]:
272
  """Generate 10 quiz questions via Groq API with key rotation."""
273
  try:
274
  from groq import Groq, RateLimitError # type: ignore[import]
275
  except ImportError:
276
  return []
277
 
278
+ context = "\n\n".join(
279
+ (c.text if hasattr(c, "text") else str(c)) for c in context_chunks[:5]
280
+ )
281
  messages = [{"role": "user", "content": QUIZ_PROMPT.format(context=context)}]
282
 
283
  for _attempt in range(_MAX_RETRIES):
generation/llm.py CHANGED
@@ -1,23 +1,43 @@
 
1
  import torch
2
  from model.loader import get_tokenizer, get_llm
3
  from transformers import TextIteratorStreamer
4
  from threading import Thread
5
- from typing import Generator
 
 
 
6
 
7
  SYSTEM_PROMPT = """You are a precise document study assistant by Md Tusar Akon.
8
  Answer ONLY from the provided context. Be concise and factual.
 
 
 
 
 
 
9
  If the answer is not in the context, say exactly: "I couldn't find that in your document."
10
  Never make up or infer information not present in the context."""
11
 
12
 
 
 
 
 
 
 
 
 
 
 
13
  def stream_answer(
14
  query: str,
15
- context_chunks: list[str],
16
  thinking_mode: bool = False,
17
  ) -> Generator[str, None, None]:
18
  tokenizer = get_tokenizer()
19
  model = get_llm()
20
- context = "\n\n---\n\n".join(context_chunks)
21
 
22
  messages = [
23
  {"role": "system", "content": SYSTEM_PROMPT},
 
1
+ from __future__ import annotations
2
  import torch
3
  from model.loader import get_tokenizer, get_llm
4
  from transformers import TextIteratorStreamer
5
  from threading import Thread
6
+ from typing import Generator, TYPE_CHECKING
7
+
8
+ if TYPE_CHECKING:
9
+ from retrieval.vectorstore import ChunkResult
10
 
11
  SYSTEM_PROMPT = """You are a precise document study assistant by Md Tusar Akon.
12
  Answer ONLY from the provided context. Be concise and factual.
13
+
14
+ CRITICAL: Whenever you use information from the context, you MUST cite the source using the
15
+ notation [[N]] (e.g., [[1]], [[2]]) immediately after the relevant sentence or phrase.
16
+ Each source reference number N corresponds to the [Source N] header in the context below.
17
+ Multiple citations are written as [[1]][[2]].
18
+
19
  If the answer is not in the context, say exactly: "I couldn't find that in your document."
20
  Never make up or infer information not present in the context."""
21
 
22
 
23
+ def _build_context(chunks: list) -> str:
24
+ """Format chunks into a numbered context block with source references."""
25
+ parts = []
26
+ for i, chunk in enumerate(chunks, 1):
27
+ text = chunk.text if hasattr(chunk, "text") else str(chunk)
28
+ page_number = chunk.page_number if hasattr(chunk, "page_number") else 1
29
+ parts.append(f"[Source {i} β€” Page {page_number}]\n{text}")
30
+ return "\n\n---\n\n".join(parts)
31
+
32
+
33
  def stream_answer(
34
  query: str,
35
+ context_chunks: list,
36
  thinking_mode: bool = False,
37
  ) -> Generator[str, None, None]:
38
  tokenizer = get_tokenizer()
39
  model = get_llm()
40
+ context = _build_context(context_chunks)
41
 
42
  messages = [
43
  {"role": "system", "content": SYSTEM_PROMPT},
generation/quiz.py CHANGED
@@ -21,10 +21,12 @@ Respond ONLY with a JSON array, no markdown, no explanation:
21
  ]"""
22
 
23
 
24
- def generate_quiz(context_chunks: list[str]) -> list[dict]:
25
  tokenizer = get_tokenizer()
26
  model = get_llm()
27
- context = "\n\n".join(context_chunks[:5])
 
 
28
 
29
  messages = [{"role": "user", "content": QUIZ_PROMPT.format(context=context)}]
30
  input_ids = tokenizer.apply_chat_template(
@@ -45,6 +47,29 @@ def generate_quiz(context_chunks: list[str]) -> list[dict]:
45
  raw = tokenizer.decode(new_tokens, skip_special_tokens=True)
46
  raw = re.sub(r"```json|```", "", raw).strip()
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  try:
49
  questions = json.loads(raw)
50
  return questions if isinstance(questions, list) else []
 
21
  ]"""
22
 
23
 
24
+ def generate_quiz(context_chunks: list) -> list[dict]:
25
  tokenizer = get_tokenizer()
26
  model = get_llm()
27
+ # Support both ChunkResult objects and plain strings
28
+ texts = [(c.text if hasattr(c, "text") else str(c)) for c in context_chunks[:5]]
29
+ context = "\n\n".join(texts)
30
 
31
  messages = [{"role": "user", "content": QUIZ_PROMPT.format(context=context)}]
32
  input_ids = tokenizer.apply_chat_template(
 
47
  raw = tokenizer.decode(new_tokens, skip_special_tokens=True)
48
  raw = re.sub(r"```json|```", "", raw).strip()
49
 
50
+ try:
51
+ questions = json.loads(raw)
52
+ return questions if isinstance(questions, list) else []
53
+ except json.JSONDecodeError:
54
+ return []
55
+ input_ids = tokenizer.apply_chat_template(
56
+ messages,
57
+ add_generation_prompt=True,
58
+ return_tensors="pt",
59
+ )
60
+
61
+ with torch.no_grad():
62
+ output_ids = model.generate(
63
+ input_ids,
64
+ max_new_tokens=2048,
65
+ do_sample=False, # greedy - faster on CPU
66
+ pad_token_id=tokenizer.eos_token_id,
67
+ )
68
+
69
+ new_tokens = output_ids[0][input_ids.shape[-1]:]
70
+ raw = tokenizer.decode(new_tokens, skip_special_tokens=True)
71
+ raw = re.sub(r"```json|```", "", raw).strip()
72
+
73
  try:
74
  questions = json.loads(raw)
75
  return questions if isinstance(questions, list) else []
ingestion/chunker.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
2
 
3
  # Map file extension β†’ LangChain Language enum for code-aware splitting
@@ -23,22 +24,58 @@ _EXT_TO_LANGUAGE: dict[str, Language] = {
23
  }
24
 
25
 
26
- def smart_chunk(text: str, chunk_size: int = 1024, overlap: int = 128,
27
- filename: str = "") -> list[str]:
28
  ext = ("."+filename.lower().rsplit(".", 1)[-1]) if "." in filename else ""
29
  lang = _EXT_TO_LANGUAGE.get(ext)
30
-
31
  if lang is not None:
32
- splitter = RecursiveCharacterTextSplitter.from_language(
33
- language=lang,
34
- chunk_size=chunk_size,
35
- chunk_overlap=overlap,
36
- )
37
- else:
38
- splitter = RecursiveCharacterTextSplitter(
39
- chunk_size=chunk_size,
40
- chunk_overlap=overlap,
41
- separators=["\n\n", "\n", ".", "!", "?", " ", ""],
42
- length_function=len,
43
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  return [c for c in splitter.split_text(text) if len(c.strip()) > 30]
 
1
+ from __future__ import annotations
2
  from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
3
 
4
  # Map file extension β†’ LangChain Language enum for code-aware splitting
 
24
  }
25
 
26
 
27
+ def _make_splitter(filename: str, chunk_size: int, overlap: int) -> RecursiveCharacterTextSplitter:
 
28
  ext = ("."+filename.lower().rsplit(".", 1)[-1]) if "." in filename else ""
29
  lang = _EXT_TO_LANGUAGE.get(ext)
 
30
  if lang is not None:
31
+ return RecursiveCharacterTextSplitter.from_language(
32
+ language=lang, chunk_size=chunk_size, chunk_overlap=overlap,
 
 
 
 
 
 
 
 
 
33
  )
34
+ return RecursiveCharacterTextSplitter(
35
+ chunk_size=chunk_size, chunk_overlap=overlap,
36
+ separators=["\n\n", "\n", ".", "!", "?", " ", ""],
37
+ length_function=len,
38
+ )
39
+
40
+
41
+ # ── ChunkMeta: a chunk of text with its origin page ──────────────────────────
42
+
43
+ class ChunkMeta:
44
+ """Thin container so callers can access `.text` and `.page_number`."""
45
+ __slots__ = ("text", "page_number")
46
+
47
+ def __init__(self, text: str, page_number: int):
48
+ self.text = text
49
+ self.page_number = page_number
50
+
51
+ # Make it behave like a plain string in legacy code paths
52
+ def __str__(self) -> str: return self.text
53
+ def __repr__(self) -> str: return f"ChunkMeta(page={self.page_number}, text={self.text[:40]!r})"
54
+ def __len__(self) -> int: return len(self.text)
55
+
56
+
57
+ def smart_chunk_pages(
58
+ pages: list[tuple[int, str]], # (page_number, page_text)
59
+ chunk_size: int = 1024,
60
+ overlap: int = 128,
61
+ filename: str = "",
62
+ ) -> list[ChunkMeta]:
63
+ """Split page-tagged text into chunks, preserving page origin.
64
+
65
+ Each page is chunked independently so page numbers stay accurate.
66
+ Returns a list of ChunkMeta objects ordered by (page, chunk_within_page).
67
+ """
68
+ splitter = _make_splitter(filename, chunk_size, overlap)
69
+ result: list[ChunkMeta] = []
70
+ for page_num, text in pages:
71
+ for piece in splitter.split_text(text):
72
+ if len(piece.strip()) > 30:
73
+ result.append(ChunkMeta(piece, page_num))
74
+ return result
75
+
76
+
77
+ def smart_chunk(text: str, chunk_size: int = 1024, overlap: int = 128,
78
+ filename: str = "") -> list[str]:
79
+ """Legacy helper β€” returns plain strings without page info."""
80
+ splitter = _make_splitter(filename, chunk_size, overlap)
81
  return [c for c in splitter.split_text(text) if len(c.strip()) > 30]
ingestion/parser.py CHANGED
@@ -1,9 +1,12 @@
1
  import csv
2
  import io
3
  import json
 
4
  import pymupdf # pymupdf 1.25+ import (not fitz)
5
  from docx import Document
6
 
 
 
7
 
8
  # Plain-text and code extensions decoded as-is
9
  _TEXT_EXTENSIONS = {
@@ -28,6 +31,33 @@ _TEXT_EXTENSIONS = {
28
  }
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def _parse_csv(content: bytes) -> str:
32
  """Convert CSV to a readable pipe-delimited table."""
33
  text = content.decode("utf-8", errors="replace")
@@ -62,32 +92,47 @@ def _parse_ipynb(content: bytes) -> str:
62
  return "\n\n".join(parts)
63
 
64
 
65
- def parse_file(content: bytes, filename: str) -> str:
 
 
 
 
 
 
 
66
  fname = filename.lower()
67
 
68
  # ── PDF ──────────────────────────────────────────────────────────────────
69
  if fname.endswith(".pdf"):
70
  doc = pymupdf.open(stream=content, filetype="pdf")
71
- pages = [page.get_text() for page in doc]
72
  doc.close()
73
- return "\n\n".join(pages)
 
74
 
75
  # ── Word ─────────────────────────────────────────────────────────────────
76
  if fname.endswith(".docx"):
77
  doc = Document(io.BytesIO(content))
78
- return "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
 
79
 
80
  # ── CSV ──────────────────────────────────────────────────────────────────
81
  if fname.endswith(".csv"):
82
- return _parse_csv(content)
83
 
84
  # ── Jupyter Notebook ─────────────────────────────────────────────────────
85
  if fname.endswith(".ipynb"):
86
- return _parse_ipynb(content)
87
 
88
  # ── Plain text, markdown, RMD, and all code/config files ─────────────────
89
  ext = "." + fname.rsplit(".", 1)[-1] if "." in fname else ""
90
  if ext in _TEXT_EXTENSIONS:
91
- return content.decode("utf-8", errors="replace")
92
 
93
  raise ValueError(f"Unsupported file type: {filename}")
 
 
 
 
 
 
 
1
  import csv
2
  import io
3
  import json
4
+ import math
5
  import pymupdf # pymupdf 1.25+ import (not fitz)
6
  from docx import Document
7
 
8
+ # Approximate characters per "page" used when splitting non-PDF content.
9
+ _CHARS_PER_PAGE = 3_000
10
 
11
  # Plain-text and code extensions decoded as-is
12
  _TEXT_EXTENSIONS = {
 
31
  }
32
 
33
 
34
+ def _assign_pages(text: str) -> list[tuple[int, str]]:
35
+ """Split flat text into virtual pages of ~_CHARS_PER_PAGE characters each.
36
+
37
+ Returns a list of (page_number, chunk_text) tuples starting from page 1.
38
+ Splitting is done on paragraph boundaries so words are never cut.
39
+ """
40
+ paragraphs = text.split("\n\n")
41
+ pages: list[tuple[int, str]] = []
42
+ current_page = 1
43
+ current_chars = 0
44
+ current_parts: list[str] = []
45
+
46
+ for para in paragraphs:
47
+ if current_chars + len(para) > _CHARS_PER_PAGE and current_parts:
48
+ pages.append((current_page, "\n\n".join(current_parts)))
49
+ current_page += 1
50
+ current_parts = []
51
+ current_chars = 0
52
+ current_parts.append(para)
53
+ current_chars += len(para) + 2 # +2 for the "\n\n" separator
54
+
55
+ if current_parts:
56
+ pages.append((current_page, "\n\n".join(current_parts)))
57
+
58
+ return pages if pages else [(1, text)]
59
+
60
+
61
  def _parse_csv(content: bytes) -> str:
62
  """Convert CSV to a readable pipe-delimited table."""
63
  text = content.decode("utf-8", errors="replace")
 
92
  return "\n\n".join(parts)
93
 
94
 
95
+ # ── Public API ────────────────────────────────────────────────────────────────
96
+
97
+ def parse_file_pages(content: bytes, filename: str) -> list[tuple[int, str]]:
98
+ """Parse a file and return a list of (page_number, text) tuples.
99
+
100
+ For PDFs each physical page maps to one tuple.
101
+ For all other formats pages are approximated using _CHARS_PER_PAGE.
102
+ """
103
  fname = filename.lower()
104
 
105
  # ── PDF ──────────────────────────────────────────────────────────────────
106
  if fname.endswith(".pdf"):
107
  doc = pymupdf.open(stream=content, filetype="pdf")
108
+ pages = [(i + 1, page.get_text()) for i, page in enumerate(doc)]
109
  doc.close()
110
+ # Filter out blank pages
111
+ return [(p, t) for p, t in pages if t.strip()] or [(1, "")]
112
 
113
  # ── Word ─────────────────────────────────────────────────────────────────
114
  if fname.endswith(".docx"):
115
  doc = Document(io.BytesIO(content))
116
+ text = "\n\n".join(p.text for p in doc.paragraphs if p.text.strip())
117
+ return _assign_pages(text)
118
 
119
  # ── CSV ──────────────────────────────────────────────────────────────────
120
  if fname.endswith(".csv"):
121
+ return _assign_pages(_parse_csv(content))
122
 
123
  # ── Jupyter Notebook ─────────────────────────────────────────────────────
124
  if fname.endswith(".ipynb"):
125
+ return _assign_pages(_parse_ipynb(content))
126
 
127
  # ── Plain text, markdown, RMD, and all code/config files ─────────────────
128
  ext = "." + fname.rsplit(".", 1)[-1] if "." in fname else ""
129
  if ext in _TEXT_EXTENSIONS:
130
+ return _assign_pages(content.decode("utf-8", errors="replace"))
131
 
132
  raise ValueError(f"Unsupported file type: {filename}")
133
+
134
+
135
+ def parse_file(content: bytes, filename: str) -> str:
136
+ """Legacy helper β€” returns the full document as a single string."""
137
+ pages = parse_file_pages(content, filename)
138
+ return "\n\n".join(text for _, text in pages)
persistence/tier.py CHANGED
@@ -22,7 +22,16 @@ TTL: dict[Tier, timedelta] = {
22
 
23
  FILE_LIMIT_MB: dict[Tier, int] = {Tier.FREE: 5, Tier.PRO: 25, Tier.SCHOLAR: 50}
24
  DOC_LIMIT: dict[Tier, int | None] = {Tier.FREE: 1, Tier.PRO: 10, Tier.SCHOLAR: None}
25
- MSG_LIMIT: dict[Tier, int | None] = {Tier.FREE: 5, Tier.PRO: 100, Tier.SCHOLAR: None}
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  def get_user_tier(user_id: str) -> Tier:
@@ -75,6 +84,7 @@ def check_message_limit(user_id: str, session_id: str) -> tuple[bool, str]:
75
 
76
  client = _client()
77
  if tier == Tier.FREE:
 
78
  count = (
79
  client.table("chat_history")
80
  .select("id", count="exact")
@@ -84,6 +94,7 @@ def check_message_limit(user_id: str, session_id: str) -> tuple[bool, str]:
84
  .count
85
  )
86
  else:
 
87
  today = datetime.now(UTC).date().isoformat()
88
  count = (
89
  client.table("chat_history")
@@ -98,3 +109,62 @@ def check_message_limit(user_id: str, session_id: str) -> tuple[bool, str]:
98
  if count >= limit:
99
  return False, f"Message limit reached on {tier} plan. Upgrade to continue."
100
  return True, "ok"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  FILE_LIMIT_MB: dict[Tier, int] = {Tier.FREE: 5, Tier.PRO: 25, Tier.SCHOLAR: 50}
24
  DOC_LIMIT: dict[Tier, int | None] = {Tier.FREE: 1, Tier.PRO: 10, Tier.SCHOLAR: None}
25
+
26
+ # General message limits (per session for FREE, per day for paid)
27
+ MSG_LIMIT: dict[Tier, int | None] = {Tier.FREE: 5, Tier.PRO: 200, Tier.SCHOLAR: None}
28
+
29
+ # DeepMind (Groq) daily message limits β€” separate quota from general messages
30
+ DEEPMIND_LIMIT: dict[Tier, int | None] = {
31
+ Tier.FREE: 5, # Free users get 5 DeepMind messages/day
32
+ Tier.PRO: 200, # Pro users get 200 DeepMind messages/day
33
+ Tier.SCHOLAR: 500, # Scholar users get 500 DeepMind messages/day
34
+ }
35
 
36
 
37
  def get_user_tier(user_id: str) -> Tier:
 
84
 
85
  client = _client()
86
  if tier == Tier.FREE:
87
+ # FREE: count per session
88
  count = (
89
  client.table("chat_history")
90
  .select("id", count="exact")
 
94
  .count
95
  )
96
  else:
97
+ # Paid: count per day (UTC)
98
  today = datetime.now(UTC).date().isoformat()
99
  count = (
100
  client.table("chat_history")
 
109
  if count >= limit:
110
  return False, f"Message limit reached on {tier} plan. Upgrade to continue."
111
  return True, "ok"
112
+
113
+
114
+ def check_deepmind_limit(user_id: str) -> tuple[bool, str]:
115
+ """Check the user's DeepMind (Groq) daily message limit.
116
+
117
+ DeepMind messages are tracked via the is_deepmind flag on chat_history rows.
118
+ Limits: Free=5/day, Pro=200/day, Scholar=500/day (all per UTC calendar day).
119
+ """
120
+ tier = get_user_tier(user_id)
121
+ limit = DEEPMIND_LIMIT[tier]
122
+ if limit is None:
123
+ return True, "ok"
124
+
125
+ today = datetime.now(UTC).date().isoformat()
126
+ count = (
127
+ _client()
128
+ .table("chat_history")
129
+ .select("id", count="exact")
130
+ .eq("user_id", user_id)
131
+ .eq("role", "user")
132
+ .eq("is_deepmind", True)
133
+ .gte("created_at", today)
134
+ .execute()
135
+ .count
136
+ )
137
+
138
+ if count >= limit:
139
+ tier_label = tier.capitalize()
140
+ return False, (
141
+ f"DeepMind daily limit reached ({limit} messages/day on {tier_label} plan). "
142
+ "Resets at midnight UTC."
143
+ )
144
+ return True, "ok"
145
+
146
+
147
+ def get_deepmind_usage(user_id: str) -> dict:
148
+ """Return DeepMind usage stats for today: {used, limit, remaining}."""
149
+ tier = get_user_tier(user_id)
150
+ limit = DEEPMIND_LIMIT[tier]
151
+
152
+ today = datetime.now(UTC).date().isoformat()
153
+ used = (
154
+ _client()
155
+ .table("chat_history")
156
+ .select("id", count="exact")
157
+ .eq("user_id", user_id)
158
+ .eq("role", "user")
159
+ .eq("is_deepmind", True)
160
+ .gte("created_at", today)
161
+ .execute()
162
+ .count
163
+ ) or 0
164
+
165
+ return {
166
+ "used": used,
167
+ "limit": limit,
168
+ "remaining": (limit - used) if limit is not None else None,
169
+ "tier": str(tier),
170
+ }
retrieval/vectorstore.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from supabase import create_client, Client
2
  from datetime import datetime
3
  import os
@@ -7,25 +8,44 @@ def _client() -> Client:
7
  return create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
8
 
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def store_chunks(
11
  doc_id: str,
12
  user_id: str,
13
- chunks: list[str],
14
  embeddings: list[list[float]],
15
  expires_at: datetime,
16
  ) -> None:
17
  client = _client()
18
- rows = [
19
- {
20
- "doc_id": doc_id,
21
- "user_id": user_id,
22
- "chunk_text": chunk,
23
- "embedding": embedding,
 
 
 
 
24
  "chunk_index": i,
25
- "expires_at": expires_at.isoformat(),
26
- }
27
- for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
28
- ]
29
  # Insert in batches of 100 to avoid payload limits
30
  for i in range(0, len(rows), 100):
31
  client.table("chunks").insert(rows[i : i + 100]).execute()
@@ -35,7 +55,8 @@ def similarity_search(
35
  doc_id: str,
36
  query_embedding: list[float],
37
  top_k: int = 5,
38
- ) -> list[str]:
 
39
  client = _client()
40
  result = client.rpc(
41
  "match_chunks",
@@ -45,17 +66,84 @@ def similarity_search(
45
  "match_count": top_k,
46
  },
47
  ).execute()
48
- return [r["chunk_text"] for r in result.data]
 
 
 
 
 
 
 
49
 
50
 
51
- def get_all_chunks(doc_id: str) -> list[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  """Return every chunk for a document in order, for full-context retrieval."""
53
  client = _client()
54
  result = (
55
  client.table("chunks")
56
- .select("chunk_text")
57
  .eq("doc_id", doc_id)
58
  .order("chunk_index", desc=False)
59
  .execute()
60
  )
61
- return [r["chunk_text"] for r in result.data]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
  from supabase import create_client, Client
3
  from datetime import datetime
4
  import os
 
8
  return create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
9
 
10
 
11
+ # ── ChunkResult: rich return type for similarity search ──────────────────────
12
+
13
+ class ChunkResult:
14
+ """Holds chunk text, its page of origin, and source document."""
15
+ __slots__ = ("text", "page_number", "doc_id")
16
+
17
+ def __init__(self, text: str, page_number: int, doc_id: str):
18
+ self.text = text
19
+ self.page_number = page_number
20
+ self.doc_id = doc_id
21
+
22
+ # Behaves like a plain string so old code that does `"\n".join(chunks)` still works
23
+ def __str__(self) -> str: return self.text
24
+ def __repr__(self) -> str: return f"ChunkResult(doc={self.doc_id[:8]}, page={self.page_number})"
25
+
26
+
27
  def store_chunks(
28
  doc_id: str,
29
  user_id: str,
30
+ chunks, # list[ChunkMeta] or list[str]
31
  embeddings: list[list[float]],
32
  expires_at: datetime,
33
  ) -> None:
34
  client = _client()
35
+ rows = []
36
+ for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
37
+ # Support both ChunkMeta objects (with .text/.page_number) and plain strings
38
+ text = chunk.text if hasattr(chunk, "text") else str(chunk)
39
+ page_number = chunk.page_number if hasattr(chunk, "page_number") else 1
40
+ rows.append({
41
+ "doc_id": doc_id,
42
+ "user_id": user_id,
43
+ "chunk_text": text,
44
+ "embedding": embedding,
45
  "chunk_index": i,
46
+ "page_number": page_number,
47
+ "expires_at": expires_at.isoformat(),
48
+ })
 
49
  # Insert in batches of 100 to avoid payload limits
50
  for i in range(0, len(rows), 100):
51
  client.table("chunks").insert(rows[i : i + 100]).execute()
 
55
  doc_id: str,
56
  query_embedding: list[float],
57
  top_k: int = 5,
58
+ ) -> list[ChunkResult]:
59
+ """Search a single document and return rich ChunkResult objects."""
60
  client = _client()
61
  result = client.rpc(
62
  "match_chunks",
 
66
  "match_count": top_k,
67
  },
68
  ).execute()
69
+ return [
70
+ ChunkResult(
71
+ text = r["chunk_text"],
72
+ page_number = r.get("page_number", 1),
73
+ doc_id = str(r.get("doc_id", doc_id)),
74
+ )
75
+ for r in result.data
76
+ ]
77
 
78
 
79
+ def similarity_search_multi(
80
+ doc_ids: list[str],
81
+ query_embedding: list[float],
82
+ top_k: int = 20,
83
+ ) -> list[ChunkResult]:
84
+ """Search across multiple documents and return rich ChunkResult objects."""
85
+ if not doc_ids:
86
+ return []
87
+ if len(doc_ids) == 1:
88
+ return similarity_search(doc_ids[0], query_embedding, top_k)
89
+
90
+ client = _client()
91
+ result = client.rpc(
92
+ "match_chunks_multi",
93
+ {
94
+ "query_embedding": query_embedding,
95
+ "doc_ids_filter": doc_ids,
96
+ "match_count": top_k,
97
+ },
98
+ ).execute()
99
+ return [
100
+ ChunkResult(
101
+ text = r["chunk_text"],
102
+ page_number = r.get("page_number", 1),
103
+ doc_id = str(r["doc_id"]),
104
+ )
105
+ for r in result.data
106
+ ]
107
+
108
+
109
+ def get_all_chunks(doc_id: str) -> list[ChunkResult]:
110
  """Return every chunk for a document in order, for full-context retrieval."""
111
  client = _client()
112
  result = (
113
  client.table("chunks")
114
+ .select("chunk_text, page_number, doc_id")
115
  .eq("doc_id", doc_id)
116
  .order("chunk_index", desc=False)
117
  .execute()
118
  )
119
+ return [
120
+ ChunkResult(
121
+ text = r["chunk_text"],
122
+ page_number = r.get("page_number", 1),
123
+ doc_id = str(r.get("doc_id", doc_id)),
124
+ )
125
+ for r in result.data
126
+ ]
127
+
128
+
129
+ def get_all_chunks_multi(doc_ids: list[str]) -> list[ChunkResult]:
130
+ """Return all chunks for multiple documents in document+chunk order."""
131
+ if not doc_ids:
132
+ return []
133
+ client = _client()
134
+ result = (
135
+ client.table("chunks")
136
+ .select("chunk_text, page_number, doc_id, chunk_index")
137
+ .in_("doc_id", doc_ids)
138
+ .order("doc_id", desc=False)
139
+ .order("chunk_index", desc=False)
140
+ .execute()
141
+ )
142
+ return [
143
+ ChunkResult(
144
+ text = r["chunk_text"],
145
+ page_number = r.get("page_number", 1),
146
+ doc_id = str(r["doc_id"]),
147
+ )
148
+ for r in result.data
149
+ ]