Exodus2004 committed on
Commit
3fa4680
·
2 Parent(s): 48711f1, c66ac65

Merge remote-tracking branch 'upstream/dev' into feat/issue-114-hybrid-search

Browse files
.env.example CHANGED
@@ -122,6 +122,16 @@ HF_TOKEN=your_huggingface_token_here
122
 
123
  # ── RAG Config (Optional — defaults shown) ───────────
124
 
 
 
 
 
 
 
 
 
 
 
125
  # ── ChromaDB (Vector Store) ─────────────────────────────────
126
 
127
  # Directory where ChromaDB persists its vector index to disk.
 
122
 
123
  # ── RAG Config (Optional — defaults shown) ───────────
124
 
125
+ # ── Knowledge Graph / GraphRAG (Optional — defaults shown) ─────────────────
126
+
127
+ # Directory where GraphRAG stores per-document knowledge graphs.
128
+ # Optional — defaults to "./data/graphs"
129
+ # GRAPH_PERSIST_DIR=./data/graphs
130
+
131
+ # Maximum number of graph relationships appended to the RAG prompt.
132
+ # Optional — defaults to 12
133
+ # GRAPH_MAX_RELATIONSHIPS=12
134
+
135
  # ── ChromaDB (Vector Store) ─────────────────────────────────
136
 
137
  # Directory where ChromaDB persists its vector index to disk.
.gitignore CHANGED
@@ -29,3 +29,4 @@ Thumbs.db
29
  # Misc
30
  *.log
31
  static/
 
 
29
  # Misc
30
  *.log
31
  static/
32
+ .planning/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ # ── Python Formatting ─────────────────────────────────────
3
+ - repo: https://github.com/psf/black
4
+ rev: 24.10.0
5
+ hooks:
6
+ - id: black
7
+ language_version: python3
8
+ args: [--line-length=120]
9
+ files: ^backend/
10
+
11
+ # ── Python Linting ────────────────────────────────────────
12
+ - repo: https://github.com/PyCQA/flake8
13
+ rev: 7.1.1
14
+ hooks:
15
+ - id: flake8
16
+ args:
17
+ - --max-line-length=120
18
+ - --select=E9,F63,F7,F82,E501
19
+ - --count
20
+ files: ^backend/
21
+
22
+ # ── JavaScript / TypeScript / JSON / CSS / Markdown Formatting ──
23
+ - repo: https://github.com/pre-commit/mirrors-prettier
24
+ rev: v4.0.0-alpha.8
25
+ hooks:
26
+ - id: prettier
27
+ types_or: [javascript, jsx, ts, tsx, json, css, markdown]
28
+ files: ^frontend/
29
+ exclude: ^frontend/(node_modules|.next|dist|build)/
30
+
31
+ # ── General Hygiene ───────────────────────────────────────
32
+ - repo: https://github.com/pre-commit/pre-commit-hooks
33
+ rev: v5.0.0
34
+ hooks:
35
+ - id: trailing-whitespace
36
+ args: [--markdown-linebreak-ext=md]
37
+ - id: end-of-file-fixer
38
+ - id: check-yaml
39
+ args: [--allow-multiple-documents]
40
+ - id: check-json
41
+ exclude: ^frontend/(node_modules|.next)/
42
+ - id: check-merge-conflict
43
+ - id: check-added-large-files
44
+ args: [--maxkb=1024]
45
+ - id: mixed-line-ending
46
+ args: [--fix=lf]
47
+ exclude: \.(bat|cmd|ps1)$
48
+
49
+ # ── Security ─────────────────────────────────────────────
50
+ - repo: https://github.com/Yelp/detect-secrets
51
+ rev: v1.5.0
52
+ hooks:
53
+ - id: detect-secrets
54
+ args: [--baseline, .secrets.baseline]
55
+ exclude: \.env\.example$
CONTRIBUTING.md CHANGED
@@ -61,6 +61,36 @@ cp ../.env.example .env # Fill in your own dev values
61
  uvicorn app.main:app --reload --port 8000
62
  ```
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  ### Frontend (Next.js)
65
 
66
  ```bash
 
61
  uvicorn app.main:app --reload --port 8000
62
  ```
63
 
64
+ ### Pre-commit Hooks (Required)
65
+
66
+ We use [`pre-commit`](https://pre-commit.com/) to enforce code style automatically before every commit. This prevents style-related CI failures.
67
+
68
+ ```bash
69
+ # Install pre-commit (one-time setup)
70
+ pip install pre-commit
71
+
72
+ # Install the hooks into your local clone (one-time per checkout)
73
+ pre-commit install
74
+
75
+ # (Optional) Run against all files to verify setup
76
+ pre-commit run --all-files
77
+ ```
78
+
79
+ **What the hooks check:**
80
+
81
+ | Hook | Tool | Scope |
82
+ |------|------|-------|
83
+ | Python formatting | `black` (line-length 120) | `backend/` |
84
+ | Python linting | `flake8` (syntax errors, undefined names, lines over 120 chars) | `backend/` |
85
+ | JS/TS/JSON/CSS/MD formatting | `prettier` | `frontend/` |
86
+ | Trailing whitespace / end-of-file newline | `pre-commit-hooks` | All files |
87
+ | YAML/JSON validity | `pre-commit-hooks` | All files |
88
+ | Merge-conflict markers | `pre-commit-hooks` | All files |
89
+ | Large file guard (>1 MB) | `pre-commit-hooks` | All files |
90
+ | Secret detection | `detect-secrets` | All files |
91
+
92
+ > ⚠️ If a hook modifies files, it will block your commit. Just `git add` the auto-fixed files and commit again.
93
+
94
  ### Frontend (Next.js)
95
 
96
  ```bash
Dockerfile CHANGED
@@ -33,7 +33,8 @@ RUN python -m venv "$VIRTUAL_ENV"
33
 
34
  COPY backend/requirements.txt ./requirements.txt
35
  RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
36
- pip install --no-cache-dir -r requirements.txt
 
37
 
38
  # --------------------------------------------------------
39
  # Stage 3: Runtime image with only app code and artifacts
@@ -68,7 +69,7 @@ COPY backend/__init__.py ./backend/__init__.py
68
  COPY --from=frontend-builder /app/frontend/out ./frontend/out
69
 
70
  # Create data directories with proper permissions
71
- RUN mkdir -p /app/data/uploads /app/data/chroma_db /app/data/huggingface && \
72
  chown -R appuser:appuser /app
73
 
74
  # Copy entrypoint
 
33
 
34
  COPY backend/requirements.txt ./requirements.txt
35
  RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
36
+ pip install --no-cache-dir -r requirements.txt && \
37
+ python -m spacy download en_core_web_sm
38
 
39
  # --------------------------------------------------------
40
  # Stage 3: Runtime image with only app code and artifacts
 
69
  COPY --from=frontend-builder /app/frontend/out ./frontend/out
70
 
71
  # Create data directories with proper permissions
72
+ RUN mkdir -p /app/data/uploads /app/data/chroma_db /app/data/graphs /app/data/huggingface && \
73
  chown -R appuser:appuser /app
74
 
75
  # Copy entrypoint
backend/app/config.py CHANGED
@@ -45,6 +45,22 @@ class Settings(BaseSettings):
45
  TOP_K_RETRIEVAL: int = 10
46
  TOP_K_RERANK: int = 5
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  # ── Embeddings (local HuggingFace model) ─────────────
49
  EMBEDDING_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"
50
  EMBEDDING_DIMENSION: int = 384
 
45
  TOP_K_RETRIEVAL: int = 10
46
  TOP_K_RERANK: int = 5
47
 
48
+ # ── Knowledge Graph (GraphRAG) ───────────────────────
49
+ GRAPH_PERSIST_DIR: str = "./data/graphs"
50
+ GRAPH_ENTITY_LABELS: set = {
51
+ "PERSON",
52
+ "ORG",
53
+ "GPE",
54
+ "LOC",
55
+ "PRODUCT",
56
+ "EVENT",
57
+ "WORK_OF_ART",
58
+ "LAW",
59
+ "NORP",
60
+ "FAC",
61
+ }
62
+ GRAPH_MAX_RELATIONSHIPS: int = 12
63
+
64
  # ── Embeddings (local HuggingFace model) ─────────────
65
  EMBEDDING_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"
66
  EMBEDDING_DIMENSION: int = 384
backend/app/models.py CHANGED
@@ -8,11 +8,9 @@ import hashlib
8
  from datetime import datetime, timezone
9
 
10
  from cryptography.fernet import Fernet
11
- from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, Boolean
12
  from sqlalchemy.types import TypeDecorator, CHAR
13
  from sqlalchemy.dialects.postgresql import UUID as PG_UUID
14
- from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, Boolean, Enum as SQLAlchemyEnum
15
- from sqlalchemy.types import TypeDecorator
16
  from sqlalchemy.orm import relationship
17
 
18
  from app.database import Base
@@ -85,11 +83,6 @@ class EncryptedString(TypeDecorator):
85
  return value
86
 
87
 
88
- def generate_uuid():
89
- """Generates a standard unique string identifier for database records."""
90
- return str(uuid.uuid4())
91
-
92
-
93
  class UserRole(str, enum.Enum):
94
  """
95
  Defines the available user roles for Role-Based Access Control (RBAC).
@@ -129,6 +122,7 @@ class User(Base):
129
  documents = relationship("Document", back_populates="owner", cascade="all, delete-orphan")
130
  messages = relationship("ChatMessage", back_populates="user", cascade="all, delete-orphan")
131
  api_keys = relationship("ApiKey", back_populates="user", cascade="all, delete-orphan")
 
132
 
133
 
134
  class ApiKey(Base):
@@ -148,6 +142,22 @@ class ApiKey(Base):
148
  user = relationship("User", back_populates="api_keys")
149
 
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  class Document(Base):
152
  """
153
  Metadata and processing status for files uploaded by users.
@@ -159,11 +169,6 @@ class Document(Base):
159
  filename = Column(String(255), nullable=False) # Stored filename (UUID-based)
160
  original_name = Column(String(255), nullable=False) # User's original filename
161
  file_size = Column(Integer, default=0) # Size in bytes
162
- id = Column(String, primary_key=True, default=generate_uuid)
163
- user_id = Column(String, ForeignKey("users.id"), nullable=False, index=True)
164
- filename = Column(String(255), nullable=False) # Internal UUID-based filename
165
- original_name = Column(String(255), nullable=False) # Original name for user display
166
- file_size = Column(Integer, default=0) # Size in bytes
167
  page_count = Column(Integer, default=0)
168
  chunk_count = Column(Integer, default=0)
169
  status = Column(String(20), default="pending") # pending | processing | ready | failed
@@ -185,6 +190,7 @@ class ChatMessage(Base):
185
  id = Column(GUID, primary_key=True, default=uuid.uuid4)
186
  user_id = Column(GUID, ForeignKey("users.id"), nullable=False, index=True)
187
  document_id = Column(GUID, ForeignKey("documents.id"), nullable=True, index=True)
 
188
  role = Column(String(20), nullable=False) # "user" | "assistant"
189
  content = Column(Text, nullable=False)
190
  sources_json = Column(Text, nullable=True) # JSON representation of retrieved sources
@@ -193,6 +199,7 @@ class ChatMessage(Base):
193
  # Relationships
194
  user = relationship("User", back_populates="messages")
195
  document = relationship("Document", back_populates="messages")
 
196
  shared_message = relationship("SharedMessage", back_populates="message", uselist=False, cascade="all, delete-orphan")
197
 
198
 
 
8
  from datetime import datetime, timezone
9
 
10
  from cryptography.fernet import Fernet
11
+ from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, Boolean, Enum as SQLAlchemyEnum
12
  from sqlalchemy.types import TypeDecorator, CHAR
13
  from sqlalchemy.dialects.postgresql import UUID as PG_UUID
 
 
14
  from sqlalchemy.orm import relationship
15
 
16
  from app.database import Base
 
83
  return value
84
 
85
 
 
 
 
 
 
86
  class UserRole(str, enum.Enum):
87
  """
88
  Defines the available user roles for Role-Based Access Control (RBAC).
 
122
  documents = relationship("Document", back_populates="owner", cascade="all, delete-orphan")
123
  messages = relationship("ChatMessage", back_populates="user", cascade="all, delete-orphan")
124
  api_keys = relationship("ApiKey", back_populates="user", cascade="all, delete-orphan")
125
+ chat_sessions = relationship("ChatSession", back_populates="user", cascade="all, delete-orphan")
126
 
127
 
128
  class ApiKey(Base):
 
142
  user = relationship("User", back_populates="api_keys")
143
 
144
 
145
+ class ChatSession(Base):
146
+ """
147
+ Groups chat messages into logical sessions/threads.
148
+ """
149
+ __tablename__ = "chat_sessions"
150
+
151
+ id = Column(GUID, primary_key=True, default=uuid.uuid4)
152
+ user_id = Column(GUID, ForeignKey("users.id"), nullable=False, index=True)
153
+ title = Column(String(255), nullable=False)
154
+ created_at = Column(DateTime, default=lambda: datetime.now(timezone.utc))
155
+
156
+ # Relationships
157
+ user = relationship("User", back_populates="chat_sessions")
158
+ messages = relationship("ChatMessage", back_populates="session", cascade="all, delete-orphan")
159
+
160
+
161
  class Document(Base):
162
  """
163
  Metadata and processing status for files uploaded by users.
 
169
  filename = Column(String(255), nullable=False) # Stored filename (UUID-based)
170
  original_name = Column(String(255), nullable=False) # User's original filename
171
  file_size = Column(Integer, default=0) # Size in bytes
 
 
 
 
 
172
  page_count = Column(Integer, default=0)
173
  chunk_count = Column(Integer, default=0)
174
  status = Column(String(20), default="pending") # pending | processing | ready | failed
 
190
  id = Column(GUID, primary_key=True, default=uuid.uuid4)
191
  user_id = Column(GUID, ForeignKey("users.id"), nullable=False, index=True)
192
  document_id = Column(GUID, ForeignKey("documents.id"), nullable=True, index=True)
193
+ session_id = Column(GUID, ForeignKey("chat_sessions.id"), nullable=True, index=True)
194
  role = Column(String(20), nullable=False) # "user" | "assistant"
195
  content = Column(Text, nullable=False)
196
  sources_json = Column(Text, nullable=True) # JSON representation of retrieved sources
 
199
  # Relationships
200
  user = relationship("User", back_populates="messages")
201
  document = relationship("Document", back_populates="messages")
202
+ session = relationship("ChatSession", back_populates="messages")
203
  shared_message = relationship("SharedMessage", back_populates="message", uselist=False, cascade="all, delete-orphan")
204
 
205
 
backend/app/rag/agent.py CHANGED
@@ -9,6 +9,7 @@ from typing import List, Dict, Any, Optional, Generator
9
  from huggingface_hub import InferenceClient
10
  from app.config import get_settings
11
  from app.rag.retriever import retrieve
 
12
  from app.rag.prompts import SYSTEM_PROMPT, RAG_PROMPT_TEMPLATE, GREETING_PROMPT
13
  from app.rag.tracing import trace_function
14
 
@@ -48,6 +49,26 @@ def build_context(chunks: List[Dict[str, Any]]) -> str:
48
  return "\n\n---\n\n".join(context_parts)
49
 
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def _chat_messages(system: str, user_content: str) -> list:
52
  """Build messages list for chat completion API."""
53
  return [
@@ -108,7 +129,12 @@ def generate_answer(
108
 
109
  # ── Build prompt ─────────────────────────────────
110
  # Format retrieved chunks into a readable context block, then inject into the RAG prompt template
111
- context = build_context(chunks)
 
 
 
 
 
112
  user_content = RAG_PROMPT_TEMPLATE.format(context=context, question=question)
113
  messages = _chat_messages(SYSTEM_PROMPT, user_content)
114
 
@@ -222,7 +248,12 @@ def generate_answer_stream(
222
 
223
  # ── Build prompt ─────────────────────────────────
224
  # Format retrieved chunks into a readable context block, then inject into the RAG prompt template
225
- context = build_context(chunks)
 
 
 
 
 
226
  user_content = RAG_PROMPT_TEMPLATE.format(context=context, question=question)
227
  messages = _chat_messages(SYSTEM_PROMPT, user_content)
228
 
 
9
  from huggingface_hub import InferenceClient
10
  from app.config import get_settings
11
  from app.rag.retriever import retrieve
12
+ from app.rag.graph_retriever import get_entity_context
13
  from app.rag.prompts import SYSTEM_PROMPT, RAG_PROMPT_TEMPLATE, GREETING_PROMPT
14
  from app.rag.tracing import trace_function
15
 
 
49
  return "\n\n---\n\n".join(context_parts)
50
 
51
 
52
+ def build_augmented_context(
53
+ chunks: List[Dict[str, Any]],
54
+ question: str,
55
+ user_id: str,
56
+ document_id: Optional[str] = None,
57
+ ) -> str:
58
+ """Combine vector-retrieved excerpts with GraphRAG relationships."""
59
+ context = build_context(chunks)
60
+ graph_context = get_entity_context(
61
+ query=question,
62
+ user_id=user_id,
63
+ document_id=document_id,
64
+ )
65
+
66
+ if not graph_context:
67
+ return context
68
+
69
+ return f"{context}\n\n---\n\n{graph_context}"
70
+
71
+
72
  def _chat_messages(system: str, user_content: str) -> list:
73
  """Build messages list for chat completion API."""
74
  return [
 
129
 
130
  # ── Build prompt ─────────────────────────────────
131
  # Format retrieved chunks into a readable context block, then inject into the RAG prompt template
132
+ context = build_augmented_context(
133
+ chunks=chunks,
134
+ question=question,
135
+ user_id=user_id,
136
+ document_id=document_id,
137
+ )
138
  user_content = RAG_PROMPT_TEMPLATE.format(context=context, question=question)
139
  messages = _chat_messages(SYSTEM_PROMPT, user_content)
140
 
 
248
 
249
  # ── Build prompt ─────────────────────────────────
250
  # Format retrieved chunks into a readable context block, then inject into the RAG prompt template
251
+ context = build_augmented_context(
252
+ chunks=chunks,
253
+ question=question,
254
+ user_id=user_id,
255
+ document_id=document_id,
256
+ )
257
  user_content = RAG_PROMPT_TEMPLATE.format(context=context, question=question)
258
  messages = _chat_messages(SYSTEM_PROMPT, user_content)
259
 
backend/app/rag/chunker.py CHANGED
@@ -2,6 +2,7 @@
2
  Smart document chunking using LangChain's RecursiveCharacterTextSplitter.
3
  Supports PDF, DOCX, TXT, and Markdown files with page-level metadata.
4
  """
 
5
  import fitz # PyMuPDF
6
  import docx
7
  from typing import List, Dict, Any
@@ -11,8 +12,72 @@ from app.config import get_settings
11
  settings = get_settings()
12
 
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
15
- """Extract text from PDF with page numbers."""
 
 
 
 
 
 
 
 
16
  doc = fitz.open(filepath)
17
  pages = []
18
 
@@ -22,12 +87,52 @@ def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
22
  pages.append({
23
  "text": text,
24
  "page": page_num + 1,
 
25
  })
26
 
27
  doc.close()
28
  return pages
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def extract_pdf_images(filepath: str) -> List[Dict[str, Any]]:
32
  """Extract images from a PDF and return list of dicts with image bytes and page number.
33
 
@@ -109,6 +214,19 @@ def chunk_document(filepath: str) -> List[Dict[str, Any]]:
109
  for page_data in pages:
110
  text = page_data["text"]
111
  page_num = page_data["page"]
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  # Split this page's text
114
  splits = splitter.split_text(text)
@@ -119,6 +237,7 @@ def chunk_document(filepath: str) -> List[Dict[str, Any]]:
119
  "text": split_text.strip(),
120
  "page": page_num,
121
  "chunk_index": chunk_index,
 
122
  })
123
  chunk_index += 1
124
 
 
2
  Smart document chunking using LangChain's RecursiveCharacterTextSplitter.
3
  Supports PDF, DOCX, TXT, and Markdown files with page-level metadata.
4
  """
5
+ import json
6
  import fitz # PyMuPDF
7
  import docx
8
  from typing import List, Dict, Any
 
12
  settings = get_settings()
13
 
14
 
15
+ def _is_word_inside_bbox(word: Dict[str, Any], bbox: tuple) -> bool:
16
+ """Return True when the word center falls inside a pdfplumber bbox."""
17
+ x0, top, x1, bottom = bbox
18
+ word_x = (float(word["x0"]) + float(word["x1"])) / 2
19
+ word_y = (float(word["top"]) + float(word["bottom"])) / 2
20
+ return x0 <= word_x <= x1 and top <= word_y <= bottom
21
+
22
+
23
+ def _words_to_text(words: List[Dict[str, Any]], line_tolerance: float = 3.0) -> str:
24
+ """Rebuild readable text from positioned pdfplumber words."""
25
+ if not words:
26
+ return ""
27
+
28
+ sorted_words = sorted(words, key=lambda item: (round(float(item["top"]) / line_tolerance), item["x0"]))
29
+ lines: List[List[Dict[str, Any]]] = []
30
+
31
+ for word in sorted_words:
32
+ if not lines:
33
+ lines.append([word])
34
+ continue
35
+
36
+ current_top = sum(float(item["top"]) for item in lines[-1]) / len(lines[-1])
37
+ if abs(float(word["top"]) - current_top) <= line_tolerance:
38
+ lines[-1].append(word)
39
+ else:
40
+ lines.append([word])
41
+
42
+ text_lines = [
43
+ " ".join(item["text"] for item in sorted(line, key=lambda item: item["x0"]))
44
+ for line in lines
45
+ ]
46
+ return "\n".join(line for line in text_lines if line.strip())
47
+
48
+
49
+ def _table_to_markdown(rows: List[List[Any]]) -> str:
50
+ """Serialize extracted table rows into Markdown for retrieval."""
51
+ cleaned_rows = [
52
+ ["" if cell is None else str(cell).replace("\n", " ").strip() for cell in row]
53
+ for row in rows
54
+ if row and any(cell is not None and str(cell).strip() for cell in row)
55
+ ]
56
+ if not cleaned_rows:
57
+ return ""
58
+
59
+ width = max(len(row) for row in cleaned_rows)
60
+ normalized = [row + [""] * (width - len(row)) for row in cleaned_rows]
61
+
62
+ def fmt(row: List[str]) -> str:
63
+ return "| " + " | ".join(cell.replace("|", "\\|") for cell in row) + " |"
64
+
65
+ header = normalized[0]
66
+ separator = ["---"] * width
67
+ body = normalized[1:]
68
+ return "\n".join([fmt(header), fmt(separator), *[fmt(row) for row in body]])
69
+
70
+
71
  def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
72
+ """Extract PDF text while preserving tables as separate bbox-aware chunks."""
73
+ try:
74
+ return extract_pdf_with_tables(filepath)
75
+ except ImportError:
76
+ return extract_pdf_with_pymupdf(filepath)
77
+
78
+
79
+ def extract_pdf_with_pymupdf(filepath: str) -> List[Dict[str, Any]]:
80
+ """Fallback PDF extraction with page numbers using PyMuPDF."""
81
  doc = fitz.open(filepath)
82
  pages = []
83
 
 
87
  pages.append({
88
  "text": text,
89
  "page": page_num + 1,
90
+ "chunk_type": "text",
91
  })
92
 
93
  doc.close()
94
  return pages
95
 
96
 
97
+ def extract_pdf_with_tables(filepath: str) -> List[Dict[str, Any]]:
98
+ """Detect tables with pdfplumber, remove table text from paragraphs, and keep table bboxes."""
99
+ import pdfplumber
100
+
101
+ pages: List[Dict[str, Any]] = []
102
+
103
+ with pdfplumber.open(filepath) as pdf:
104
+ for page_num, page in enumerate(pdf.pages, start=1):
105
+ tables = page.find_tables()
106
+ table_bboxes = [table.bbox for table in tables]
107
+
108
+ words = page.extract_words() or []
109
+ paragraph_words = [
110
+ word for word in words
111
+ if not any(_is_word_inside_bbox(word, bbox) for bbox in table_bboxes)
112
+ ]
113
+ paragraph_text = _words_to_text(paragraph_words)
114
+
115
+ if paragraph_text.strip():
116
+ pages.append({
117
+ "text": paragraph_text,
118
+ "page": page_num,
119
+ "chunk_type": "text",
120
+ })
121
+
122
+ for table_index, table in enumerate(tables):
123
+ table_text = _table_to_markdown(table.extract() or [])
124
+ if table_text.strip():
125
+ pages.append({
126
+ "text": table_text,
127
+ "page": page_num,
128
+ "chunk_type": "table",
129
+ "bbox": json.dumps([round(float(value), 2) for value in table.bbox]),
130
+ "table_index": table_index,
131
+ })
132
+
133
+ return pages
134
+
135
+
136
  def extract_pdf_images(filepath: str) -> List[Dict[str, Any]]:
137
  """Extract images from a PDF and return list of dicts with image bytes and page number.
138
 
 
214
  for page_data in pages:
215
  text = page_data["text"]
216
  page_num = page_data["page"]
217
+ chunk_type = page_data.get("chunk_type", "text")
218
+
219
+ if chunk_type == "table":
220
+ all_chunks.append({
221
+ "text": text.strip(),
222
+ "page": page_num,
223
+ "chunk_index": chunk_index,
224
+ "chunk_type": "table",
225
+ "bbox": page_data.get("bbox", ""),
226
+ "table_index": page_data.get("table_index", 0),
227
+ })
228
+ chunk_index += 1
229
+ continue
230
 
231
  # Split this page's text
232
  splits = splitter.split_text(text)
 
237
  "text": split_text.strip(),
238
  "page": page_num,
239
  "chunk_index": chunk_index,
240
+ "chunk_type": chunk_type,
241
  })
242
  chunk_index += 1
243
 
backend/app/rag/graph_builder.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Knowledge graph construction and persistence for GraphRAG.
3
+ """
4
+ import json
5
+ import logging
6
+ import re
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Iterable, List, Optional
10
+
11
+ import networkx as nx
12
+
13
+ from app.config import get_settings
14
+
15
+ logger = logging.getLogger(__name__)
16
+ settings = get_settings()
17
+
18
+ _nlp = None
19
+
20
+
21
+ @dataclass(frozen=True)
22
+ class Entity:
23
+ id: str
24
+ text: str
25
+ label: str
26
+
27
+
28
+ def _safe_id(value: str) -> str:
29
+ safe = re.sub(r"[^A-Za-z0-9_.-]+", "_", value).strip("._")
30
+ return safe or "unknown"
31
+
32
+
33
+ def get_graph_path(user_id: str, document_id: str) -> Path:
34
+ """Return the on-disk graph path for one user/document pair."""
35
+ filename = f"{_safe_id(user_id)}_{_safe_id(document_id)}.json"
36
+ return Path(settings.GRAPH_PERSIST_DIR) / filename
37
+
38
+
39
+ def iter_graph_paths(user_id: str) -> Iterable[Path]:
40
+ """Yield every persisted graph path for a user."""
41
+ graph_dir = Path(settings.GRAPH_PERSIST_DIR)
42
+ if not graph_dir.exists():
43
+ return []
44
+
45
+ prefix = f"{_safe_id(user_id)}_"
46
+ return sorted(graph_dir.glob(f"{prefix}*.json"))
47
+
48
+
49
+ def _get_nlp():
50
+ """Load the spaCy English NER model lazily."""
51
+ global _nlp
52
+ if _nlp is None:
53
+ import spacy
54
+
55
+ try:
56
+ _nlp = spacy.load("en_core_web_sm")
57
+ except OSError as exc:
58
+ raise RuntimeError(
59
+ "spaCy model 'en_core_web_sm' is required for GraphRAG entity extraction. "
60
+ "Install it with: python -m spacy download en_core_web_sm"
61
+ ) from exc
62
+ return _nlp
63
+
64
+
65
+ def _entity_id(text: str, label: str) -> str:
66
+ normalized = " ".join(text.split()).casefold()
67
+ return f"{label}:{normalized}"
68
+
69
+
70
+ def extract_entities(text: str) -> List[Entity]:
71
+ """Extract configured named entities from text."""
72
+ if not text or not text.strip():
73
+ return []
74
+
75
+ doc = _get_nlp()(text)
76
+ entities: Dict[str, Entity] = {}
77
+
78
+ for ent in doc.ents:
79
+ value = " ".join(ent.text.split()).strip()
80
+ if not value or ent.label_ not in settings.GRAPH_ENTITY_LABELS:
81
+ continue
82
+
83
+ entity_id = _entity_id(value, ent.label_)
84
+ entities.setdefault(
85
+ entity_id,
86
+ Entity(id=entity_id, text=value, label=ent.label_),
87
+ )
88
+
89
+ return list(entities.values())
90
+
91
+
92
+ def build_graph(chunks: List[Dict[str, Any]]) -> nx.Graph:
93
+ """Build an entity co-occurrence graph from document chunks."""
94
+ graph = nx.Graph()
95
+
96
+ for chunk in chunks:
97
+ text = chunk.get("text", "")
98
+ page = chunk.get("page")
99
+ chunk_index = chunk.get("chunk_index")
100
+ entities = extract_entities(text)
101
+
102
+ for entity in entities:
103
+ if graph.has_node(entity.id):
104
+ graph.nodes[entity.id]["mentions"] += 1
105
+ graph.nodes[entity.id]["pages"].add(page)
106
+ graph.nodes[entity.id]["chunks"].add(chunk_index)
107
+ else:
108
+ graph.add_node(
109
+ entity.id,
110
+ name=entity.text,
111
+ label=entity.label,
112
+ mentions=1,
113
+ pages={page},
114
+ chunks={chunk_index},
115
+ )
116
+
117
+ for left_index, left in enumerate(entities):
118
+ for right in entities[left_index + 1:]:
119
+ if graph.has_edge(left.id, right.id):
120
+ graph[left.id][right.id]["weight"] += 1
121
+ graph[left.id][right.id]["pages"].add(page)
122
+ graph[left.id][right.id]["chunks"].add(chunk_index)
123
+ else:
124
+ graph.add_edge(
125
+ left.id,
126
+ right.id,
127
+ weight=1,
128
+ pages={page},
129
+ chunks={chunk_index},
130
+ )
131
+
132
+ _convert_sets_for_json(graph)
133
+ return graph
134
+
135
+
136
+ def _convert_sets_for_json(graph: nx.Graph) -> None:
137
+ for _, data in graph.nodes(data=True):
138
+ data["pages"] = sorted(item for item in data.get("pages", []) if item is not None)
139
+ data["chunks"] = sorted(item for item in data.get("chunks", []) if item is not None)
140
+
141
+ for _, _, data in graph.edges(data=True):
142
+ data["pages"] = sorted(item for item in data.get("pages", []) if item is not None)
143
+ data["chunks"] = sorted(item for item in data.get("chunks", []) if item is not None)
144
+
145
+
146
+ def save_graph(graph: nx.Graph, user_id: str, document_id: str) -> Path:
147
+ """Persist a graph to disk as node-link JSON."""
148
+ graph_path = get_graph_path(user_id, document_id)
149
+ graph_path.parent.mkdir(parents=True, exist_ok=True)
150
+
151
+ data = nx.node_link_data(graph)
152
+ data["metadata"] = {
153
+ "user_id": user_id,
154
+ "document_id": document_id,
155
+ "node_count": graph.number_of_nodes(),
156
+ "edge_count": graph.number_of_edges(),
157
+ }
158
+
159
+ graph_path.write_text(json.dumps(data, ensure_ascii=True, indent=2), encoding="utf-8")
160
+ logger.info(
161
+ "Saved knowledge graph for document %s with %s nodes and %s edges",
162
+ document_id,
163
+ graph.number_of_nodes(),
164
+ graph.number_of_edges(),
165
+ )
166
+ return graph_path
167
+
168
+
169
+ def load_graph(user_id: str, document_id: str) -> Optional[nx.Graph]:
170
+ """Load a persisted graph for one user/document pair."""
171
+ return load_graph_path(get_graph_path(user_id, document_id))
172
+
173
+
174
+ def load_graph_path(graph_path: Path) -> Optional[nx.Graph]:
175
+ """Load a graph from a concrete JSON path."""
176
+ if not graph_path.exists():
177
+ return None
178
+
179
+ data = json.loads(graph_path.read_text(encoding="utf-8"))
180
+ return nx.node_link_graph(data)
181
+
182
+
183
+ def delete_graph(user_id: str, document_id: str) -> None:
184
+ """Delete a persisted graph file if it exists."""
185
+ get_graph_path(user_id, document_id).unlink(missing_ok=True)
backend/app/rag/graph_retriever.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Knowledge graph retrieval for augmenting RAG context.
3
+ """
4
+ import logging
5
+ from typing import Dict, Iterable, List, Optional, Set, Tuple
6
+
7
+ import networkx as nx
8
+
9
+ from app.config import get_settings
10
+ from app.rag.graph_builder import (
11
+ extract_entities,
12
+ iter_graph_paths,
13
+ load_graph,
14
+ load_graph_path,
15
+ )
16
+
17
+ logger = logging.getLogger(__name__)
18
+ settings = get_settings()
19
+
20
+
21
+ def _candidate_graphs(user_id: str, document_id: Optional[str]) -> Iterable[nx.Graph]:
22
+ if document_id:
23
+ graph = load_graph(user_id, document_id)
24
+ return [graph] if graph is not None else []
25
+
26
+ graphs = []
27
+ for path in iter_graph_paths(user_id):
28
+ graph = load_graph_path(path)
29
+ if graph is not None:
30
+ graphs.append(graph)
31
+ return graphs
32
+
33
+
34
+ def _node_name(graph: nx.Graph, node_id: str) -> str:
35
+ return graph.nodes[node_id].get("name", node_id.split(":", 1)[-1])
36
+
37
+
38
+ def _match_query_nodes(graph: nx.Graph, query: str) -> Set[str]:
39
+ query_entities = extract_entities(query)
40
+ matched = {entity.id for entity in query_entities if graph.has_node(entity.id)}
41
+
42
+ if matched:
43
+ return matched
44
+
45
+ query_text = query.casefold()
46
+ for node_id, data in graph.nodes(data=True):
47
+ name = data.get("name", "").casefold()
48
+ if name and name in query_text:
49
+ matched.add(node_id)
50
+
51
+ return matched
52
+
53
+
54
+ def _format_pages(pages: List[int]) -> str:
55
+ if not pages:
56
+ return "unknown pages"
57
+ if len(pages) == 1:
58
+ return f"page {pages[0]}"
59
+ return "pages " + ", ".join(str(page) for page in pages[:4])
60
+
61
+
62
+ def _relationship_key(left: str, right: str) -> Tuple[str, str]:
63
+ return tuple(sorted((left, right)))
64
+
65
+
66
+ def get_entity_context(
67
+ query: str,
68
+ user_id: str,
69
+ document_id: Optional[str] = None,
70
+ ) -> str:
71
+ """Return compact graph relationship context relevant to the query."""
72
+ relationships: Dict[Tuple[str, str], Dict[str, object]] = {}
73
+
74
+ try:
75
+ graphs = _candidate_graphs(user_id=user_id, document_id=document_id)
76
+ for graph in graphs:
77
+ matched_nodes = _match_query_nodes(graph, query)
78
+
79
+ for node_id in matched_nodes:
80
+ neighbors = sorted(
81
+ graph.neighbors(node_id),
82
+ key=lambda neighbor: graph[node_id][neighbor].get("weight", 0),
83
+ reverse=True,
84
+ )
85
+ for neighbor_id in neighbors:
86
+ edge = graph[node_id][neighbor_id]
87
+ left = _node_name(graph, node_id)
88
+ right = _node_name(graph, neighbor_id)
89
+ key = _relationship_key(left.casefold(), right.casefold())
90
+ existing = relationships.setdefault(
91
+ key,
92
+ {
93
+ "left": left,
94
+ "right": right,
95
+ "weight": 0,
96
+ "pages": set(),
97
+ },
98
+ )
99
+ existing["weight"] = int(existing["weight"]) + int(edge.get("weight", 1))
100
+ existing["pages"].update(edge.get("pages", []))
101
+ except Exception as exc:
102
+ logger.warning("GraphRAG context retrieval failed: %s", exc)
103
+ return ""
104
+
105
+ if not relationships:
106
+ return ""
107
+
108
+ ranked = sorted(
109
+ relationships.values(),
110
+ key=lambda item: int(item["weight"]),
111
+ reverse=True,
112
+ )[: settings.GRAPH_MAX_RELATIONSHIPS]
113
+
114
+ lines = ["## Knowledge Graph Context"]
115
+ for item in ranked:
116
+ pages = sorted(item["pages"])
117
+ lines.append(
118
+ f"- {item['left']} is related to {item['right']} "
119
+ f"through document co-occurrence on {_format_pages(pages)} "
120
+ f"(strength: {item['weight']})."
121
+ )
122
+
123
+ return "\n".join(lines)
backend/app/rag/vectorstore.py CHANGED
@@ -91,6 +91,9 @@ def store_chunks(
91
  "document_id": document_id,
92
  "page": chunk["page"],
93
  "chunk_index": chunk["chunk_index"],
 
 
 
94
  # Indicate whether this chunk was originally an image and include a short caption
95
  **({"is_image": True, "image_caption": chunk.get("image_caption", "")}
96
  if chunk.get("is_image") else {}),
@@ -169,6 +172,8 @@ def query_chunks(
169
  "filename": metadata.get("filename", ""),
170
  "document_id": metadata.get("document_id", ""),
171
  "page": metadata.get("page", 1),
 
 
172
  "score": round(similarity, 4),
173
  })
174
 
 
91
  "document_id": document_id,
92
  "page": chunk["page"],
93
  "chunk_index": chunk["chunk_index"],
94
+ "chunk_type": chunk.get("chunk_type", "text"),
95
+ **({"bbox": chunk.get("bbox", "")} if chunk.get("bbox") else {}),
96
+ **({"table_index": chunk.get("table_index", 0)} if chunk.get("chunk_type") == "table" else {}),
97
  # Indicate whether this chunk was originally an image and include a short caption
98
  **({"is_image": True, "image_caption": chunk.get("image_caption", "")}
99
  if chunk.get("is_image") else {}),
 
172
  "filename": metadata.get("filename", ""),
173
  "document_id": metadata.get("document_id", ""),
174
  "page": metadata.get("page", 1),
175
+ "chunk_type": metadata.get("chunk_type", "text"),
176
+ "bbox": metadata.get("bbox", ""),
177
  "score": round(similarity, 4),
178
  })
179
 
backend/app/routes/chat.py CHANGED
@@ -7,20 +7,16 @@ import time
7
  from datetime import datetime
8
  from io import BytesIO
9
  import logging
10
- from typing import Optional
11
 
12
  from fastapi import APIRouter, Depends, HTTPException, Request
13
  from fastapi.responses import Response, StreamingResponse
14
- from reportlab.lib.pagesizes import letter
15
- from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
16
- from reportlab.lib.units import inch
17
- from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer
18
  from sqlalchemy.orm import Session
19
 
20
  from app.auth import get_current_user
21
  from app.database import get_db
22
  from app.metrics import record_query_response_time
23
- from app.models import User, ChatMessage, Document, SharedMessage
24
  from app.rate_limit import limiter
25
  from app.schemas import (
26
  ChatRequest,
@@ -30,6 +26,8 @@ from app.schemas import (
30
  ShareAnswerResponse,
31
  ShareLinkResponse,
32
  SourceChunk,
 
 
33
  )
34
 
35
  logger = logging.getLogger(__name__)
@@ -77,11 +75,139 @@ def create_share_link(
77
  db.commit()
78
 
79
  return ShareLinkResponse(
80
- message_id=message.id,
81
  share_url=f"/share?message_id={message.id}",
82
  )
83
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  def generate_answer(question: str, user_id: str, document_id: Optional[str] = None, hf_token: Optional[str] = None):
86
  from app.rag.agent import generate_answer as _generate_answer
87
 
@@ -102,33 +228,7 @@ def ask_question(
102
  user: User = Depends(get_current_user),
103
  db: Session = Depends(get_db),
104
  ):
105
- """Ask a question with RAG retrieval (non-streaming).
106
-
107
- Processes a user's question by retrieving relevant document chunks,
108
- generating an answer using an LLM, and saving the conversation to chat
109
- history. If a `document_id` is provided, the retrieval is scoped to that
110
- specific document; otherwise, it searches across all documents owned by
111
- the user.
112
-
113
- Args:
114
- payload: ChatRequest containing the `question` text and optionally a
115
- `document_id` to limit the retrieval scope.
116
- user: The currently authenticated user, obtained from the dependency.
117
- db: SQLAlchemy database session, obtained from the dependency.
118
-
119
- Returns:
120
- ChatResponse: An object containing:
121
- - answer: The generated answer text.
122
- - sources: A list of `SourceChunk` objects with metadata about
123
- the retrieved chunks (e.g., filename, page number, text snippet).
124
- - document_id: The document ID that was used (if any).
125
-
126
- Raises:
127
- HTTPException: 404 if the specified `document_id` does not exist or
128
- does not belong to the authenticated user.
129
- HTTPException: 400 if the document exists but its status is not
130
- "ready" (e.g., still processing or failed).
131
- """
132
  started_at = time.perf_counter()
133
  try:
134
  # Validate document exists if specified
@@ -147,6 +247,17 @@ def ask_question(
147
  detail=f"Document is still {doc.status}. Please wait for processing to complete.",
148
  )
149
 
 
 
 
 
 
 
 
 
 
 
 
150
  result = generate_answer(
151
  question=payload.question,
152
  user_id=user.id,
@@ -155,8 +266,8 @@ def ask_question(
155
  )
156
 
157
  # Save to chat history
158
- _save_message(db, user.id, payload.document_id, "user", payload.question)
159
- _save_message(db, user.id, payload.document_id, "assistant", result["answer"], result["sources"])
160
 
161
  return ChatResponse(
162
  answer=result["answer"],
@@ -175,41 +286,7 @@ def ask_question_stream(
175
  user: User = Depends(get_current_user),
176
  db: Session = Depends(get_db),
177
  ):
178
- """Ask a question with Server-Sent Events (SSE) streaming response.
179
-
180
- Processes a user's question using RAG and streams the answer token by
181
- token over SSE. The user's question is saved to chat history immediately.
182
- The assistant's answer is accumulated on the server and saved to history
183
- only after the stream completes. If a `document_id` is provided, retrieval
184
- is scoped to that document.
185
-
186
- Args:
187
- payload: ChatRequest containing the `question` text and optionally a
188
- `document_id` to limit the retrieval scope.
189
- user: The currently authenticated user, obtained from the dependency.
190
- db: SQLAlchemy database session, obtained from the dependency.
191
-
192
- Returns:
193
- StreamingResponse: A FastAPI `StreamingResponse` with:
194
- - media_type: "text/event-stream"
195
- - Headers: Cache-Control, Connection, and X-Accel-Buffering set
196
- for proper SSE behavior.
197
- - Body: A generator yielding SSE messages with `token` (partial
198
- answer) and `sources` (final source metadata) events.
199
-
200
- Raises:
201
- HTTPException: 404 if the specified `document_id` does not exist or
202
- does not belong to the authenticated user.
203
- HTTPException: 400 if the document exists but its status is not
204
- "ready" (e.g., still processing or failed).
205
-
206
- Note:
207
- The streaming response uses a generator `event_stream` that yields
208
- raw SSE chunks. The assistant's full answer is reconstructed from
209
- the stream to save the complete conversation history. A separate
210
- database session is created inside the generator to avoid using the
211
- closed request session.
212
- """
213
  # Validate document
214
  if payload.document_id:
215
  doc = db.query(Document).filter(
@@ -228,8 +305,19 @@ def ask_question_stream(
228
 
229
  started_at = time.perf_counter()
230
 
 
 
 
 
 
 
 
 
 
 
 
231
  # Save user message immediately
232
- _save_message(db, user.id, payload.document_id, "user", payload.question)
233
 
234
  # Stream response
235
  def event_stream():
@@ -260,7 +348,7 @@ def ask_question_stream(
260
  from app.database import SessionLocal
261
  save_db = SessionLocal()
262
  try:
263
- _save_message(save_db, user.id, payload.document_id, "assistant", full_answer, sources)
264
  finally:
265
  save_db.close()
266
  finally:
@@ -283,25 +371,7 @@ def get_chat_history(
283
  user: User = Depends(get_current_user),
284
  db: Session = Depends(get_db),
285
  ):
286
- """Retrieve the complete chat history for a specific document.
287
-
288
- Fetches all messages (both user and assistant) associated with the given
289
- document and the authenticated user, ordered chronologically from oldest
290
- to newest. Assistant messages that contain source metadata will have the
291
- `sources` field populated.
292
-
293
- Args:
294
- document_id: The unique identifier of the document whose chat history is requested.
295
- user: The currently authenticated user, obtained from the dependency.
296
- db: SQLAlchemy database session, obtained from the dependency.
297
-
298
- Returns:
299
- ChatHistoryResponse: An object containing:
300
- - messages: A list of `ChatMessageResponse` objects, each with
301
- `id`, `role` ("user" or "assistant"), `content`, `sources`
302
- (list of `SourceChunk` for assistant messages), and `created_at`.
303
- - document_id: The document ID that was queried.
304
- """
305
  messages = (
306
  db.query(ChatMessage)
307
  .filter(
@@ -322,7 +392,7 @@ def get_chat_history(
322
  pass
323
 
324
  formatted.append(ChatMessageResponse(
325
- id=msg.id,
326
  role=msg.role,
327
  content=msg.content,
328
  sources=sources,
@@ -339,33 +409,7 @@ def export_chat_history(
339
  token: Optional[str] = None,
340
  db: Session = Depends(get_db),
341
  ):
342
- """Export the chat history for a document as a downloadable file.
343
-
344
- Supports Markdown (.md), plain text (.txt), or PDF (.pdf) export. The function accepts
345
- authentication via either the standard `Authorization: Bearer <token>`
346
- header (handled by the dependency chain) or a `token` query parameter to
347
- facilitate browser-initiated downloads that cannot set custom headers.
348
-
349
- Args:
350
- document_id: The unique identifier of the document whose chat history is to be exported.
351
- format: Output format, either "md" (Markdown), "txt" (plain text), or "pdf". Defaults to "md".
352
- token: Optional JWT token passed as a query parameter. Used for browser
353
- downloads when the `Authorization` header is not available.
354
- db: SQLAlchemy database session, obtained from the dependency.
355
-
356
- Returns:
357
- Response: A FastAPI `Response` object with:
358
- - `content`: Formatted chat history as a string or PDF bytes.
359
- - `media_type`: `text/markdown`, `text/plain`, or `application/pdf`.
360
- - `headers`: `Content-Disposition` attachment header with a generated filename.
361
-
362
- Raises:
363
- HTTPException: 401 if neither the token query parameter nor a valid
364
- bearer token provides an authenticated user.
365
- HTTPException: 400 if the `format` parameter is not "md", "txt", or "pdf".
366
- HTTPException: 404 if the document does not exist or does not belong
367
- to the user, or if no chat messages are found for the document.
368
- """
369
  from app.auth import decode_token as _decode
370
 
371
  # Resolve user from query-param token (browser download links can't set headers)
@@ -412,6 +456,7 @@ def export_chat_history(
412
  media_type = "text/plain"
413
  extension = "txt"
414
  else:
 
415
  content = _format_pdf(doc, messages)
416
  media_type = "application/pdf"
417
  extension = "pdf"
@@ -434,20 +479,7 @@ def clear_chat_history(
434
  user: User = Depends(get_current_user),
435
  db: Session = Depends(get_db),
436
  ):
437
- """Delete all chat messages associated with a specific document.
438
-
439
- Removes every chat message (both user and assistant) linked to the given
440
- `document_id` and the authenticated user. The deletion is permanent and
441
- cannot be undone.
442
-
443
- Args:
444
- document_id: The unique identifier of the document whose chat history should be cleared.
445
- user: The currently authenticated user, obtained from the dependency.
446
- db: SQLAlchemy database session, obtained from the dependency.
447
-
448
- Returns:
449
- dict: A simple JSON object with a `message` field confirming the deletion.
450
- """
451
  db.query(ChatMessage).filter(
452
  ChatMessage.user_id == user.id,
453
  ChatMessage.document_id == document_id,
@@ -464,35 +496,22 @@ def _save_message(
464
  role: str,
465
  content: str,
466
  sources: list = None,
 
467
  ):
468
- """Save a chat message to the database.
469
-
470
- Creates a `ChatMessage` record with the provided user, document,
471
- role, content, and optional source metadata. The message is added to
472
- the session and committed immediately. The database session must be
473
- managed by the caller (e.g., closed after use).
474
-
475
- Args:
476
- user_id: The ID of the authenticated user.
477
- document_id: Optional document ID that the message pertains to.
478
- Can be `None` for global chat contexts.
479
- db: SQLAlchemy database session (active, typically from a dependency).
480
- role: The message sender role, e.g., "user" or "assistant".
481
- content: The full text content of the message.
482
- sources: Optional list of source dictionaries (usually from RAG
483
- retrieval) to be stored as JSON. Defaults to `None`.
484
-
485
- Returns:
486
- None
487
-
488
- Note:
489
- The function commits the transaction. It does not close the session,
490
- leaving that responsibility to the caller. If `sources` is provided,
491
- it is serialized using `json.dumps()`.
492
- """
493
  msg = ChatMessage(
494
  user_id=user_id,
495
  document_id=document_id,
 
496
  role=role,
497
  content=content,
498
  sources_json=json.dumps(sources) if sources else None,
@@ -511,7 +530,7 @@ def _share_answer_response(message: ChatMessage) -> ShareAnswerResponse:
511
  sources = []
512
 
513
  return ShareAnswerResponse(
514
- id=message.id,
515
  content=message.content,
516
  created_at=message.created_at,
517
  sources=sources,
@@ -519,28 +538,12 @@ def _share_answer_response(message: ChatMessage) -> ShareAnswerResponse:
519
 
520
 
521
  def _format_markdown(doc, messages) -> str:
522
- """Format chat history as a Markdown document.
523
-
524
- Generates a Markdown string containing the document metadata and the
525
- full conversation. User messages are labeled "You", assistant messages
526
- are labeled "Assistant". For assistant responses, if source information
527
- is available, it is rendered as a numbered list with filename, page,
528
- confidence, and a text preview.
529
-
530
- Args:
531
- doc: The Document object (must have `original_name` attribute).
532
- messages: List of ChatMessage objects, each with attributes:
533
- `role` (str), `content` (str), `created_at` (datetime, optional),
534
- and `sources_json` (str, JSON-encoded list of source dicts).
535
-
536
- Returns:
537
- str: A Markdown string ready for writing to a `.md` file.
538
- """
539
  lines = [
540
  f"# Chat History — {doc.original_name}",
541
  "",
542
  f"**Document:** {doc.original_name} ",
543
- f"**Exported at:** {__import__('datetime').datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ",
544
  f"**Total messages:** {len(messages)}",
545
  "",
546
  "---",
@@ -557,7 +560,6 @@ def _format_markdown(doc, messages) -> str:
557
  lines.append(msg.content)
558
  lines.append("")
559
 
560
- # Include source citations for assistant messages
561
  if msg.role == "assistant" and msg.sources_json:
562
  try:
563
  sources = json.loads(msg.sources_json)
@@ -583,26 +585,10 @@ def _format_markdown(doc, messages) -> str:
583
 
584
 
585
  def _format_plaintext(doc, messages) -> str:
586
- """Format chat history as a plain text document.
587
-
588
- Generates a plain text string containing the document metadata and the
589
- full conversation. User messages are labeled "You", assistant messages
590
- are labeled "Assistant". For assistant responses, if source information
591
- is available, it is rendered as a numbered list with filename, page,
592
- and confidence (text preview is omitted in plain text format).
593
-
594
- Args:
595
- doc: The Document object (must have `original_name` attribute).
596
- messages: List of ChatMessage objects, each with attributes:
597
- `role` (str), `content` (str), `created_at` (datetime, optional),
598
- and `sources_json` (str, JSON‑encoded list of source dicts).
599
-
600
- Returns:
601
- str: A plain text string ready for writing to a `.txt` file.
602
- """
603
  lines = [
604
  f"Chat History — {doc.original_name}",
605
- f"Exported at: {__import__('datetime').datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
606
  f"Total messages: {len(messages)}",
607
  "=" * 60,
608
  "",
@@ -615,7 +601,6 @@ def _format_plaintext(doc, messages) -> str:
615
  lines.append(f"[{role_label}] ({timestamp})")
616
  lines.append(msg.content)
617
 
618
- # Include source citations for assistant messages
619
  if msg.role == "assistant" and msg.sources_json:
620
  try:
621
  sources = json.loads(msg.sources_json)
@@ -633,81 +618,3 @@ def _format_plaintext(doc, messages) -> str:
633
  lines.append("")
634
 
635
  return "\n".join(lines)
636
-
637
-
638
- def _format_pdf(doc, messages) -> bytes:
639
- """Format chat history as a PDF document."""
640
- buffer = BytesIO()
641
- pdf = SimpleDocTemplate(
642
- buffer,
643
- pagesize=letter,
644
- leftMargin=0.75 * inch,
645
- rightMargin=0.75 * inch,
646
- topMargin=0.75 * inch,
647
- bottomMargin=0.75 * inch,
648
- )
649
-
650
- styles = getSampleStyleSheet()
651
- metadata_style = styles["Normal"]
652
- metadata_style.spaceAfter = 6
653
- content_style = ParagraphStyle(
654
- "ChatContent",
655
- parent=styles["BodyText"],
656
- leading=14,
657
- spaceAfter=10,
658
- )
659
- source_style = ParagraphStyle(
660
- "ChatSource",
661
- parent=styles["BodyText"],
662
- leftIndent=14,
663
- leading=12,
664
- spaceAfter=4,
665
- )
666
-
667
- story = [
668
- Paragraph(f"Chat History - {html.escape(doc.original_name)}", styles["Title"]),
669
- Spacer(1, 0.15 * inch),
670
- Paragraph(f"Document: {html.escape(doc.original_name)}", metadata_style),
671
- Paragraph(f"Exported at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", metadata_style),
672
- Paragraph(f"Total messages: {len(messages)}", metadata_style),
673
- Spacer(1, 0.2 * inch),
674
- ]
675
-
676
- for msg in messages:
677
- timestamp = msg.created_at.strftime("%Y-%m-%d %H:%M:%S") if msg.created_at else ""
678
- role_label = "You" if msg.role == "user" else "Assistant"
679
-
680
- story.append(Paragraph(f"<b>{html.escape(role_label)}</b>", styles["Heading3"]))
681
- story.append(Paragraph(html.escape(timestamp), styles["Italic"]))
682
- story.append(Paragraph(_pdf_text(msg.content), content_style))
683
-
684
- if msg.role == "assistant" and msg.sources_json:
685
- try:
686
- sources = json.loads(msg.sources_json)
687
- if sources:
688
- story.append(Paragraph("<b>Sources:</b>", metadata_style))
689
- for i, src in enumerate(sources, 1):
690
- filename = html.escape(str(src.get("filename", "Unknown")))
691
- page = html.escape(str(src.get("page", "?")))
692
- confidence = html.escape(str(src.get("confidence", 0)))
693
- story.append(
694
- Paragraph(
695
- f"[{i}] {filename}, Page {page} (Confidence: {confidence}%)",
696
- source_style,
697
- )
698
- )
699
- text_preview = str(src.get("text", "")).strip()
700
- if text_preview:
701
- story.append(Paragraph(_pdf_text(text_preview), source_style))
702
- except Exception:
703
- pass
704
-
705
- story.append(Spacer(1, 0.15 * inch))
706
-
707
- pdf.build(story)
708
- return buffer.getvalue()
709
-
710
-
711
- def _pdf_text(text: str) -> str:
712
- """Escape text for ReportLab paragraphs while preserving line breaks."""
713
- return html.escape(text or "").replace("\n", "<br/>")
 
7
  from datetime import datetime
8
  from io import BytesIO
9
  import logging
10
+ from typing import Optional, List
11
 
12
  from fastapi import APIRouter, Depends, HTTPException, Request
13
  from fastapi.responses import Response, StreamingResponse
 
 
 
 
14
  from sqlalchemy.orm import Session
15
 
16
  from app.auth import get_current_user
17
  from app.database import get_db
18
  from app.metrics import record_query_response_time
19
+ from app.models import User, ChatMessage, Document, SharedMessage, ChatSession
20
  from app.rate_limit import limiter
21
  from app.schemas import (
22
  ChatRequest,
 
26
  ShareAnswerResponse,
27
  ShareLinkResponse,
28
  SourceChunk,
29
+ ChatSessionCreate,
30
+ ChatSessionResponse,
31
  )
32
 
33
  logger = logging.getLogger(__name__)
 
75
  db.commit()
76
 
77
  return ShareLinkResponse(
78
+ message_id=str(message.id),
79
  share_url=f"/share?message_id={message.id}",
80
  )
81
 
82
 
83
@router.get("/sessions", response_model=List[ChatSessionResponse])
def get_chat_sessions(
    user: User = Depends(get_current_user),
    db: Session = Depends(get_db),
):
    """List the authenticated user's chat sessions, newest first."""
    # Restrict to the caller's own sessions before ordering.
    owned_by_caller = db.query(ChatSession).filter(ChatSession.user_id == user.id)
    newest_first = owned_by_caller.order_by(ChatSession.created_at.desc())
    return newest_first.all()
96
+
97
+
98
@router.post("/sessions", response_model=ChatSessionResponse, status_code=201)
def create_chat_session(
    payload: ChatSessionCreate,
    user: User = Depends(get_current_user),
    db: Session = Depends(get_db),
):
    """Create a chat session owned by the authenticated user.

    The session takes its title from ``payload.title`` and is committed
    before being returned.
    """
    new_session = ChatSession(user_id=user.id, title=payload.title)
    db.add(new_session)
    db.commit()
    # Reload the row after the commit so the returned object is current.
    db.refresh(new_session)
    return new_session
113
+
114
+
115
@router.put("/sessions/{session_id}", response_model=ChatSessionResponse)
def rename_chat_session(
    session_id: str,
    payload: ChatSessionCreate,
    user: User = Depends(get_current_user),
    db: Session = Depends(get_db),
):
    """Set a new title on one of the authenticated user's chat sessions.

    Raises:
        HTTPException: 404 when no session with ``session_id`` belongs to
            the caller.
    """
    # Filtering on the owner as well makes foreign sessions look missing.
    owned_lookup = db.query(ChatSession).filter(
        ChatSession.id == session_id,
        ChatSession.user_id == user.id,
    )
    target = owned_lookup.first()
    if not target:
        raise HTTPException(status_code=404, detail="Chat session not found")

    target.title = payload.title
    db.commit()
    db.refresh(target)
    return target
137
+
138
+
139
@router.delete("/sessions/{session_id}", status_code=204)
def delete_chat_session(
    session_id: str,
    user: User = Depends(get_current_user),
    db: Session = Depends(get_db),
):
    """Delete one of the authenticated user's chat sessions.

    NOTE(review): whether the session's messages are removed as well depends
    on the cascade configured on the ``ChatSession`` model — confirm there.

    Returns:
        An empty ``204 No Content`` response.

    Raises:
        HTTPException: 404 when no session with ``session_id`` belongs to
            the caller.
    """
    session = (
        db.query(ChatSession)
        .filter(
            ChatSession.id == session_id,
            ChatSession.user_id == user.id,
        )
        .first()
    )
    if not session:
        raise HTTPException(status_code=404, detail="Chat session not found")
    db.delete(session)
    db.commit()
    # The decorator declares 204 too, so the generated OpenAPI schema matches
    # what is actually sent instead of advertising the default 200.
    return Response(status_code=204)
159
+
160
+
161
@router.get("/history/session/{session_id}", response_model=ChatHistoryResponse)
def get_session_history(
    session_id: str,
    user: User = Depends(get_current_user),
    db: Session = Depends(get_db),
):
    """Retrieve chat history for a specific chat session.

    Messages of the session that belong to the caller are returned oldest
    first. A message whose stored ``sources_json`` cannot be parsed is still
    returned, with an empty ``sources`` list.

    Returns:
        ChatHistoryResponse with the formatted messages and
        ``document_id=None``.

    Raises:
        HTTPException: 404 when no session with ``session_id`` belongs to
            the caller.
    """
    session = (
        db.query(ChatSession)
        .filter(
            ChatSession.id == session_id,
            ChatSession.user_id == user.id,
        )
        .first()
    )
    if not session:
        raise HTTPException(status_code=404, detail="Chat session not found")

    messages = (
        db.query(ChatMessage)
        .filter(
            ChatMessage.session_id == session_id,
            ChatMessage.user_id == user.id,
        )
        .order_by(ChatMessage.created_at.asc())
        .all()
    )

    formatted = []
    for msg in messages:
        sources = []
        if msg.sources_json:
            try:
                sources = [SourceChunk(**s) for s in json.loads(msg.sources_json)]
            except Exception as exc:
                # Best effort: keep the message, but leave a trace instead of
                # silently hiding corrupt or outdated source metadata.
                logger.warning(
                    "Could not parse sources for chat message %s: %s", msg.id, exc
                )
                sources = []

        formatted.append(
            ChatMessageResponse(
                id=str(msg.id),
                role=msg.role,
                content=msg.content,
                sources=sources,
                created_at=msg.created_at,
            )
        )

    return ChatHistoryResponse(messages=formatted, document_id=None)
209
+
210
+
211
  def generate_answer(question: str, user_id: str, document_id: Optional[str] = None, hf_token: Optional[str] = None):
212
  from app.rag.agent import generate_answer as _generate_answer
213
 
 
228
  user: User = Depends(get_current_user),
229
  db: Session = Depends(get_db),
230
  ):
231
+ """Ask a question with RAG retrieval (non-streaming)."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  started_at = time.perf_counter()
233
  try:
234
  # Validate document exists if specified
 
247
  detail=f"Document is still {doc.status}. Please wait for processing to complete.",
248
  )
249
 
250
+ # Resolve or create session
251
+ session_id = payload.session_id
252
+ if not session_id:
253
+ session = db.query(ChatSession).filter(ChatSession.user_id == user.id).first()
254
+ if not session:
255
+ session = ChatSession(user_id=user.id, title="Default Chat")
256
+ db.add(session)
257
+ db.commit()
258
+ db.refresh(session)
259
+ session_id = session.id
260
+
261
  result = generate_answer(
262
  question=payload.question,
263
  user_id=user.id,
 
266
  )
267
 
268
  # Save to chat history
269
+ _save_message(db, user.id, payload.document_id, "user", payload.question, session_id=session_id)
270
+ _save_message(db, user.id, payload.document_id, "assistant", result["answer"], result["sources"], session_id=session_id)
271
 
272
  return ChatResponse(
273
  answer=result["answer"],
 
286
  user: User = Depends(get_current_user),
287
  db: Session = Depends(get_db),
288
  ):
289
+ """Ask a question with Server-Sent Events (SSE) streaming response."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  # Validate document
291
  if payload.document_id:
292
  doc = db.query(Document).filter(
 
305
 
306
  started_at = time.perf_counter()
307
 
308
+ # Resolve or create session
309
+ session_id = payload.session_id
310
+ if not session_id:
311
+ session = db.query(ChatSession).filter(ChatSession.user_id == user.id).first()
312
+ if not session:
313
+ session = ChatSession(user_id=user.id, title="Default Chat")
314
+ db.add(session)
315
+ db.commit()
316
+ db.refresh(session)
317
+ session_id = session.id
318
+
319
  # Save user message immediately
320
+ _save_message(db, user.id, payload.document_id, "user", payload.question, session_id=session_id)
321
 
322
  # Stream response
323
  def event_stream():
 
348
  from app.database import SessionLocal
349
  save_db = SessionLocal()
350
  try:
351
+ _save_message(save_db, user.id, payload.document_id, "assistant", full_answer, sources, session_id=session_id)
352
  finally:
353
  save_db.close()
354
  finally:
 
371
  user: User = Depends(get_current_user),
372
  db: Session = Depends(get_db),
373
  ):
374
+ """Retrieve the complete chat history for a specific document."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  messages = (
376
  db.query(ChatMessage)
377
  .filter(
 
392
  pass
393
 
394
  formatted.append(ChatMessageResponse(
395
+ id=str(msg.id),
396
  role=msg.role,
397
  content=msg.content,
398
  sources=sources,
 
409
  token: Optional[str] = None,
410
  db: Session = Depends(get_db),
411
  ):
412
+ """Export the chat history for a document as a downloadable file."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  from app.auth import decode_token as _decode
414
 
415
  # Resolve user from query-param token (browser download links can't set headers)
 
456
  media_type = "text/plain"
457
  extension = "txt"
458
  else:
459
+ from app.routes.chat_export import format_pdf as _format_pdf
460
  content = _format_pdf(doc, messages)
461
  media_type = "application/pdf"
462
  extension = "pdf"
 
479
  user: User = Depends(get_current_user),
480
  db: Session = Depends(get_db),
481
  ):
482
+ """Delete all chat messages associated with a specific document."""
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  db.query(ChatMessage).filter(
484
  ChatMessage.user_id == user.id,
485
  ChatMessage.document_id == document_id,
 
496
  role: str,
497
  content: str,
498
  sources: list = None,
499
+ session_id: Optional[str] = None,
500
  ):
501
+ """Save a chat message to the database."""
502
+ if not session_id:
503
+ session = db.query(ChatSession).filter(ChatSession.user_id == user_id).first()
504
+ if not session:
505
+ session = ChatSession(user_id=user_id, title="Default Chat")
506
+ db.add(session)
507
+ db.commit()
508
+ db.refresh(session)
509
+ session_id = session.id
510
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  msg = ChatMessage(
512
  user_id=user_id,
513
  document_id=document_id,
514
+ session_id=session_id,
515
  role=role,
516
  content=content,
517
  sources_json=json.dumps(sources) if sources else None,
 
530
  sources = []
531
 
532
  return ShareAnswerResponse(
533
+ id=str(message.id),
534
  content=message.content,
535
  created_at=message.created_at,
536
  sources=sources,
 
538
 
539
 
540
  def _format_markdown(doc, messages) -> str:
541
+ """Format chat history as a Markdown document."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  lines = [
543
  f"# Chat History — {doc.original_name}",
544
  "",
545
  f"**Document:** {doc.original_name} ",
546
+ f"**Exported at:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ",
547
  f"**Total messages:** {len(messages)}",
548
  "",
549
  "---",
 
560
  lines.append(msg.content)
561
  lines.append("")
562
 
 
563
  if msg.role == "assistant" and msg.sources_json:
564
  try:
565
  sources = json.loads(msg.sources_json)
 
585
 
586
 
587
  def _format_plaintext(doc, messages) -> str:
588
+ """Format chat history as a plain text document."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589
  lines = [
590
  f"Chat History — {doc.original_name}",
591
+ f"Exported at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
592
  f"Total messages: {len(messages)}",
593
  "=" * 60,
594
  "",
 
601
  lines.append(f"[{role_label}] ({timestamp})")
602
  lines.append(msg.content)
603
 
 
604
  if msg.role == "assistant" and msg.sources_json:
605
  try:
606
  sources = json.loads(msg.sources_json)
 
618
  lines.append("")
619
 
620
  return "\n".join(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/routes/documents.py CHANGED
@@ -172,6 +172,15 @@ def _ingest_document(document_id: str, filepath: str, original_name: str, user_i
172
  db.commit()
173
  return
174
 
 
 
 
 
 
 
 
 
 
175
  # Store embeddings in ChromaDB
176
  chunk_count = store_chunks(
177
  chunks=chunks,
@@ -629,6 +638,14 @@ def delete_document(
629
  except Exception as e:
630
  logger.warning(f"Error deleting vectors: {e}")
631
 
 
 
 
 
 
 
 
 
632
  # Delete from database (cascades to chat messages)
633
  db.delete(doc)
634
  db.commit()
 
172
  db.commit()
173
  return
174
 
175
+ # Build and persist a lightweight entity co-occurrence graph for GraphRAG.
176
+ try:
177
+ from app.rag.graph_builder import build_graph, save_graph
178
+
179
+ graph = build_graph(chunks)
180
+ save_graph(graph, user_id=user_id, document_id=document_id)
181
+ except Exception as e:
182
+ logger.warning(f"Could not build knowledge graph for document {document_id}: {e}")
183
+
184
  # Store embeddings in ChromaDB
185
  chunk_count = store_chunks(
186
  chunks=chunks,
 
638
  except Exception as e:
639
  logger.warning(f"Error deleting vectors: {e}")
640
 
641
+ # Delete persisted knowledge graph
642
+ try:
643
+ from app.rag.graph_builder import delete_graph
644
+
645
+ delete_graph(user_id=user.id, document_id=document_id)
646
+ except Exception as e:
647
+ logger.warning(f"Error deleting knowledge graph: {e}")
648
+
649
  # Delete from database (cascades to chat messages)
650
  db.delete(doc)
651
  db.commit()
backend/app/schemas.py CHANGED
@@ -146,6 +146,7 @@ class AdminStatsResponse(BaseModel):
146
  class ChatRequest(BaseModel):
147
  question: str = Field(..., min_length=1, max_length=2000)
148
  document_id: Optional[str] = None
 
149
 
150
 
151
  class SourceChunk(BaseModel):
@@ -192,5 +193,20 @@ class ShareLinkResponse(BaseModel):
192
  share_url: str
193
 
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  # Rebuild models for forward references
196
  TokenResponse.model_rebuild()
 
146
  class ChatRequest(BaseModel):
147
  question: str = Field(..., min_length=1, max_length=2000)
148
  document_id: Optional[str] = None
149
+ session_id: Optional[str] = None
150
 
151
 
152
  class SourceChunk(BaseModel):
 
193
  share_url: str
194
 
195
 
196
+ # ── Chat Session ──────────────────────────────────────
197
+
198
+ class ChatSessionCreate(BaseModel):
199
+ title: str = Field(..., min_length=1, max_length=255)
200
+
201
+
202
+ class ChatSessionResponse(BaseModel):
203
+ id: str
204
+ title: str
205
+ created_at: datetime
206
+
207
+ class Config:
208
+ from_attributes = True
209
+
210
+
211
  # Rebuild models for forward references
212
  TokenResponse.model_rebuild()
backend/requirements.txt CHANGED
@@ -25,6 +25,7 @@ httpx
25
 
26
  # Document Processing
27
  PyMuPDF
 
28
  python-docx
29
 
30
  # LangChain & RAG
@@ -42,6 +43,9 @@ transformers
42
 
43
  # Vector Database
44
  chromadb
 
 
 
45
 
46
  # LLM Inference
47
  huggingface-hub
 
25
 
26
  # Document Processing
27
  PyMuPDF
28
+ pdfplumber
29
  python-docx
30
 
31
  # LangChain & RAG
 
43
 
44
  # Vector Database
45
  chromadb
46
+ networkx>=3.3
47
+ spacy>=3.7
48
+ neo4j>=5.0
49
 
50
  # LLM Inference
51
  huggingface-hub
backend/tests/test_chunker.py CHANGED
@@ -1,7 +1,10 @@
1
  from pathlib import Path
 
 
2
 
3
  import pytest
4
 
 
5
  from app.rag.chunker import chunk_document, get_page_count
6
 
7
 
@@ -36,3 +39,49 @@ def test_get_page_count_for_txt_returns_one(tmp_path):
36
  file_path.write_text("hello", encoding="utf-8")
37
 
38
  assert get_page_count(str(file_path)) == 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from pathlib import Path
2
+ import sys
3
+ import types
4
 
5
  import pytest
6
 
7
+ from app.rag import chunker
8
  from app.rag.chunker import chunk_document, get_page_count
9
 
10
 
 
39
  file_path.write_text("hello", encoding="utf-8")
40
 
41
  assert get_page_count(str(file_path)) == 1
42
+
43
+
44
+ def test_pdf_table_detection_separates_table_from_paragraph(monkeypatch):
45
+ class FakeTable:
46
+ bbox = (40, 90, 300, 160)
47
+
48
+ def extract(self):
49
+ return [["Name", "Amount"], ["Alpha", "$10"]]
50
+
51
+ class FakePage:
52
+ def find_tables(self):
53
+ return [FakeTable()]
54
+
55
+ def extract_words(self):
56
+ return [
57
+ {"text": "Intro", "x0": 40, "x1": 70, "top": 20, "bottom": 30},
58
+ {"text": "paragraph", "x0": 75, "x1": 140, "top": 20, "bottom": 30},
59
+ {"text": "Name", "x0": 45, "x1": 80, "top": 100, "bottom": 110},
60
+ {"text": "Amount", "x0": 160, "x1": 220, "top": 100, "bottom": 110},
61
+ {"text": "Alpha", "x0": 45, "x1": 85, "top": 125, "bottom": 135},
62
+ {"text": "$10", "x0": 160, "x1": 185, "top": 125, "bottom": 135},
63
+ ]
64
+
65
+ class FakePdf:
66
+ pages = [FakePage()]
67
+
68
+ def __enter__(self):
69
+ return self
70
+
71
+ def __exit__(self, exc_type, exc, traceback):
72
+ return False
73
+
74
+ fake_pdfplumber = types.SimpleNamespace(open=lambda _filepath: FakePdf())
75
+ monkeypatch.setitem(sys.modules, "pdfplumber", fake_pdfplumber)
76
+ monkeypatch.setattr(chunker, "extract_pdf_images", lambda _filepath: [])
77
+
78
+ chunks = chunk_document("report.pdf")
79
+
80
+ assert len(chunks) == 2
81
+ assert chunks[0]["chunk_type"] == "text"
82
+ assert chunks[0]["text"] == "Intro paragraph"
83
+ assert "Name" not in chunks[0]["text"]
84
+ assert chunks[1]["chunk_type"] == "table"
85
+ assert chunks[1]["bbox"] == "[40.0, 90.0, 300.0, 160.0]"
86
+ assert "| Name | Amount |" in chunks[1]["text"]
87
+ assert "| Alpha | $10 |" in chunks[1]["text"]
backend/tests/test_documents.py CHANGED
@@ -1,3 +1,9 @@
 
 
 
 
 
 
1
  def test_api_health(client):
2
  response = client.get("/api/health")
3
 
@@ -32,3 +38,76 @@ def test_upload_rejects_unsupported_extension_before_deep_validation(client, aut
32
 
33
  assert response.status_code == 400
34
  assert "not supported" in response.json()["detail"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import types
2
+
3
+ from app.models import Document
4
+ from app.routes.documents import _ingest_document
5
+
6
+
7
  def test_api_health(client):
8
  response = client.get("/api/health")
9
 
 
38
 
39
  assert response.status_code == 400
40
  assert "not supported" in response.json()["detail"]
41
+
42
+
43
+ def test_ingest_document_builds_and_saves_graph(db_session, monkeypatch, tmp_path, user):
44
+ document = Document(
45
+ user_id=user.id,
46
+ filename="graph.txt",
47
+ original_name="graph.txt",
48
+ file_size=128,
49
+ status="pending",
50
+ )
51
+ db_session.add(document)
52
+ db_session.commit()
53
+ db_session.refresh(document)
54
+ user_id = user.id
55
+ document_id = document.id
56
+ chunks = [{"text": "OpenAI works with Microsoft.", "page": 1, "chunk_index": 0}]
57
+ saved = {}
58
+
59
+ monkeypatch.setattr("app.routes.documents.get_page_count", lambda filepath: 1)
60
+ monkeypatch.setattr("app.routes.documents.chunk_document", lambda filepath: chunks)
61
+ monkeypatch.setattr("app.routes.documents.store_chunks", lambda **kwargs: len(chunks))
62
+ monkeypatch.setattr("app.database.SessionLocal", lambda: db_session)
63
+
64
+ fake_summary = types.ModuleType("app.rag.summarizer")
65
+ fake_summary.generate_document_summary = lambda filepath, max_sentences=2: "Summary"
66
+ monkeypatch.setitem(__import__("sys").modules, "app.rag.summarizer", fake_summary)
67
+
68
+ monkeypatch.setattr(
69
+ "app.rag.graph_builder.build_graph",
70
+ lambda received_chunks: {"chunks": received_chunks},
71
+ )
72
+ monkeypatch.setattr(
73
+ "app.rag.graph_builder.save_graph",
74
+ lambda graph, user_id, document_id: saved.update(
75
+ {"graph": graph, "user_id": user_id, "document_id": document_id}
76
+ ),
77
+ )
78
+
79
+ _ingest_document(
80
+ document_id=document_id,
81
+ filepath=str(tmp_path / "graph.txt"),
82
+ original_name=document.original_name,
83
+ user_id=user_id,
84
+ )
85
+
86
+ assert saved == {
87
+ "graph": {"chunks": chunks},
88
+ "user_id": user_id,
89
+ "document_id": document_id,
90
+ }
91
+ refreshed = db_session.get(Document, document_id)
92
+ assert refreshed.status == "ready"
93
+ assert refreshed.chunk_count == 1
94
+
95
+
96
+ def test_delete_document_removes_knowledge_graph(client, auth_headers, ready_document, monkeypatch):
97
+ deleted = {}
98
+
99
+ monkeypatch.setattr("app.routes.documents.delete_document_chunks", lambda **kwargs: None)
100
+ monkeypatch.setattr(
101
+ "app.rag.graph_builder.delete_graph",
102
+ lambda user_id, document_id: deleted.update(
103
+ {"user_id": user_id, "document_id": document_id}
104
+ ),
105
+ )
106
+
107
+ response = client.delete(
108
+ f"/api/v1/documents/{ready_document.id}",
109
+ headers=auth_headers,
110
+ )
111
+
112
+ assert response.status_code == 200
113
+ assert deleted["document_id"] == ready_document.id
backend/tests/test_graph_builder.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from app.rag import graph_builder
4
+
5
+
6
+ class FakeEntity:
7
+ def __init__(self, text, label):
8
+ self.text = text
9
+ self.label_ = label
10
+
11
+
12
+ class FakeDoc:
13
+ def __init__(self, entities):
14
+ self.ents = entities
15
+
16
+
17
+ class FakeNlp:
18
+ def __call__(self, text):
19
+ entities = []
20
+ for value, label in (
21
+ ("OpenAI", "ORG"),
22
+ ("Microsoft", "ORG"),
23
+ ("Azure", "PRODUCT"),
24
+ ("Ignored Date", "DATE"),
25
+ ):
26
+ if value in text:
27
+ entities.append(FakeEntity(value, label))
28
+ return FakeDoc(entities)
29
+
30
+
31
+ def test_extract_entities_filters_configured_labels(monkeypatch):
32
+ monkeypatch.setattr(graph_builder, "_nlp", FakeNlp())
33
+
34
+ entities = graph_builder.extract_entities("OpenAI works with Microsoft on Ignored Date")
35
+
36
+ assert {entity.text for entity in entities} == {"OpenAI", "Microsoft"}
37
+ assert {entity.label for entity in entities} == {"ORG"}
38
+
39
+
40
+ def test_build_graph_tracks_entity_edges_and_weights(monkeypatch):
41
+ monkeypatch.setattr(graph_builder, "_nlp", FakeNlp())
42
+ chunks = [
43
+ {
44
+ "text": "OpenAI works with Microsoft.",
45
+ "page": 1,
46
+ "chunk_index": 0,
47
+ },
48
+ {
49
+ "text": "OpenAI and Microsoft use Azure.",
50
+ "page": 2,
51
+ "chunk_index": 1,
52
+ },
53
+ ]
54
+
55
+ graph = graph_builder.build_graph(chunks)
56
+
57
+ openai_id = "ORG:openai"
58
+ microsoft_id = "ORG:microsoft"
59
+ azure_id = "PRODUCT:azure"
60
+ assert graph.nodes[openai_id]["name"] == "OpenAI"
61
+ assert graph.nodes[openai_id]["pages"] == [1, 2]
62
+ assert graph[openai_id][microsoft_id]["weight"] == 2
63
+ assert graph[openai_id][microsoft_id]["pages"] == [1, 2]
64
+ assert graph.has_edge(microsoft_id, azure_id)
65
+
66
+
67
+ def test_save_load_and_delete_graph_roundtrip(tmp_path, monkeypatch):
68
+ monkeypatch.setattr(graph_builder.settings, "GRAPH_PERSIST_DIR", str(tmp_path))
69
+ graph = graph_builder.build_graph([])
70
+ graph.add_node("ORG:openai", name="OpenAI", label="ORG", mentions=1, pages=[1], chunks=[0])
71
+
72
+ path = graph_builder.save_graph(graph, user_id="user-1", document_id="doc-1")
73
+ payload = json.loads(path.read_text(encoding="utf-8"))
74
+ loaded = graph_builder.load_graph(user_id="user-1", document_id="doc-1")
75
+
76
+ assert payload["metadata"]["document_id"] == "doc-1"
77
+ assert loaded.nodes["ORG:openai"]["name"] == "OpenAI"
78
+
79
+ graph_builder.delete_graph(user_id="user-1", document_id="doc-1")
80
+ assert not path.exists()
81
+
82
+
83
+ def test_empty_chunks_produce_empty_graph(monkeypatch):
84
+ monkeypatch.setattr(graph_builder, "_nlp", FakeNlp())
85
+
86
+ graph = graph_builder.build_graph([])
87
+
88
+ assert graph.number_of_nodes() == 0
89
+ assert graph.number_of_edges() == 0
backend/tests/test_graph_retriever.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.rag import graph_builder, graph_retriever
2
+
3
+
4
+ class FakeEntity:
5
+ def __init__(self, text, label):
6
+ self.text = text
7
+ self.label_ = label
8
+
9
+
10
+ class FakeDoc:
11
+ def __init__(self, entities):
12
+ self.ents = entities
13
+
14
+
15
+ class FakeNlp:
16
+ def __call__(self, text):
17
+ entities = []
18
+ for value, label in (
19
+ ("OpenAI", "ORG"),
20
+ ("Microsoft", "ORG"),
21
+ ("Azure", "PRODUCT"),
22
+ ):
23
+ if value in text:
24
+ entities.append(FakeEntity(value, label))
25
+ return FakeDoc(entities)
26
+
27
+
28
+ def _save_sample_graph(tmp_path, monkeypatch, user_id="user-1", document_id="doc-1"):
29
+ monkeypatch.setattr(graph_builder.settings, "GRAPH_PERSIST_DIR", str(tmp_path))
30
+ monkeypatch.setattr(graph_builder, "_nlp", FakeNlp())
31
+ graph = graph_builder.build_graph(
32
+ [
33
+ {
34
+ "text": "OpenAI works with Microsoft.",
35
+ "page": 1,
36
+ "chunk_index": 0,
37
+ },
38
+ {
39
+ "text": "Microsoft deploys Azure.",
40
+ "page": 2,
41
+ "chunk_index": 1,
42
+ },
43
+ ]
44
+ )
45
+ graph_builder.save_graph(graph, user_id=user_id, document_id=document_id)
46
+
47
+
48
+ def test_get_entity_context_returns_one_hop_relationships(tmp_path, monkeypatch):
49
+ _save_sample_graph(tmp_path, monkeypatch)
50
+
51
+ context = graph_retriever.get_entity_context(
52
+ query="How is OpenAI related to Microsoft?",
53
+ user_id="user-1",
54
+ document_id="doc-1",
55
+ )
56
+
57
+ assert "## Knowledge Graph Context" in context
58
+ assert "OpenAI" in context
59
+ assert "Microsoft" in context
60
+ assert "page 1" in context
61
+
62
+
63
+ def test_get_entity_context_returns_empty_for_no_match(tmp_path, monkeypatch):
64
+ _save_sample_graph(tmp_path, monkeypatch)
65
+
66
+ context = graph_retriever.get_entity_context(
67
+ query="What about Google?",
68
+ user_id="user-1",
69
+ document_id="doc-1",
70
+ )
71
+
72
+ assert context == ""
73
+
74
+
75
+ def test_get_entity_context_returns_empty_for_missing_graph(tmp_path, monkeypatch):
76
+ monkeypatch.setattr(graph_builder.settings, "GRAPH_PERSIST_DIR", str(tmp_path))
77
+ monkeypatch.setattr(graph_builder, "_nlp", FakeNlp())
78
+
79
+ context = graph_retriever.get_entity_context(
80
+ query="OpenAI",
81
+ user_id="user-1",
82
+ document_id="missing",
83
+ )
84
+
85
+ assert context == ""
86
+
87
+
88
+ def test_get_entity_context_isolates_users(tmp_path, monkeypatch):
89
+ _save_sample_graph(tmp_path, monkeypatch, user_id="user-1", document_id="doc-1")
90
+
91
+ context = graph_retriever.get_entity_context(
92
+ query="OpenAI",
93
+ user_id="user-2",
94
+ document_id="doc-1",
95
+ )
96
+
97
+ assert context == ""
backend/tests/test_graphrag_agent.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.rag import agent
2
+
3
+
4
+ class FakeMessage:
5
+ content = "Graph answer"
6
+
7
+
8
+ class FakeChoice:
9
+ message = FakeMessage()
10
+
11
+
12
+ class FakeResponse:
13
+ choices = [FakeChoice()]
14
+
15
+
16
+ class FakeClient:
17
+ def __init__(self):
18
+ self.messages = None
19
+
20
+ def chat_completion(self, messages, **kwargs):
21
+ self.messages = messages
22
+ return FakeResponse()
23
+
24
+
25
+ def test_generate_answer_appends_graph_context_without_changing_sources(monkeypatch):
26
+ client = FakeClient()
27
+ chunks = [
28
+ {
29
+ "text": "Vector context",
30
+ "filename": "doc.pdf",
31
+ "page": 1,
32
+ "score": 0.9,
33
+ "confidence": 100.0,
34
+ }
35
+ ]
36
+
37
+ monkeypatch.setattr(agent, "get_llm_client", lambda: client)
38
+ monkeypatch.setattr(agent, "retrieve", lambda **kwargs: chunks)
39
+ monkeypatch.setattr(
40
+ agent,
41
+ "get_entity_context",
42
+ lambda **kwargs: "## Knowledge Graph Context\n- OpenAI is related to Microsoft on page 1.",
43
+ )
44
+
45
+ result = agent.generate_answer("How are OpenAI and Microsoft related?", "user-1", "doc-1")
46
+
47
+ prompt = client.messages[1]["content"]
48
+ assert "Vector context" in prompt
49
+ assert "Knowledge Graph Context" in prompt
50
+ assert result["sources"] == [
51
+ {
52
+ "text": "Vector context",
53
+ "filename": "doc.pdf",
54
+ "page": 1,
55
+ "score": 0.9,
56
+ "confidence": 100.0,
57
+ }
58
+ ]
59
+
60
+
61
+ def test_generate_answer_stream_appends_graph_context(monkeypatch):
62
+ captured = {}
63
+
64
+ class StreamingClient:
65
+ def chat_completion(self, messages, **kwargs):
66
+ captured["messages"] = messages
67
+ return iter([])
68
+
69
+ monkeypatch.setattr(agent, "get_llm_client", lambda: StreamingClient())
70
+ monkeypatch.setattr(
71
+ agent,
72
+ "retrieve",
73
+ lambda **kwargs: [
74
+ {
75
+ "text": "Vector stream context",
76
+ "filename": "doc.pdf",
77
+ "page": 1,
78
+ "score": 0.9,
79
+ "confidence": 100.0,
80
+ }
81
+ ],
82
+ )
83
+ monkeypatch.setattr(
84
+ agent,
85
+ "get_entity_context",
86
+ lambda **kwargs: "## Knowledge Graph Context\n- OpenAI is related to Microsoft on page 1.",
87
+ )
88
+
89
+ events = list(agent.generate_answer_stream("OpenAI Microsoft", "user-1", "doc-1"))
90
+
91
+ assert events[0].startswith("data:")
92
+ assert "Knowledge Graph Context" in captured["messages"][1]["content"]
frontend/e2e/snapshots.spec.ts ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { expect, test, type Page } from "@playwright/test";
2
+
3
+ const user = {
4
+ id: "user-1",
5
+ username: "tester",
6
+ email: "tester@example.com",
7
+ is_admin: false,
8
+ created_at: "2026-05-28T00:00:00Z",
9
+ };
10
+
11
+ const uploadedDocument = {
12
+ id: "doc-1",
13
+ original_name: "notes.txt",
14
+ file_size: 11,
15
+ page_count: 1,
16
+ chunk_count: 1,
17
+ status: "ready",
18
+ error_message: null,
19
+ uploaded_at: "2026-05-28T00:00:00Z",
20
+ };
21
+
22
+ async function mockDashboardApis(page: Page, documents: typeof uploadedDocument[] = []) {
23
+ await page.route("**/api/v1/auth/me", async (route) => {
24
+ await route.fulfill({ json: user });
25
+ });
26
+
27
+ await page.route("**/api/v1/documents/", async (route) => {
28
+ await route.fulfill({
29
+ json: {
30
+ items: documents,
31
+ total: documents.length,
32
+ page: 1,
33
+ pages: documents.length > 0 ? 1 : 0,
34
+ },
35
+ });
36
+ });
37
+ }
38
+
39
+ test.describe("Frontend Snapshot Tests", () => {
40
+ test("login page snapshot", async ({ page }) => {
41
+ await page.goto("/login");
42
+ await page.waitForSelector("#login-email");
43
+
44
+ if (!process.env.CI) {
45
+ await expect(page).toHaveScreenshot("login-page.png", {
46
+ maxDiffPixelRatio: 0.1,
47
+ threshold: 0.2,
48
+ });
49
+ } else {
50
+ await expect(page.locator("#login-email")).toBeVisible();
51
+ }
52
+ });
53
+
54
+ test("register page snapshot", async ({ page }) => {
55
+ await page.goto("/register");
56
+ await page.waitForSelector("#reg-username");
57
+
58
+ if (!process.env.CI) {
59
+ await expect(page).toHaveScreenshot("register-page.png", {
60
+ maxDiffPixelRatio: 0.1,
61
+ threshold: 0.2,
62
+ });
63
+ } else {
64
+ await expect(page.locator("#reg-username")).toBeVisible();
65
+ }
66
+ });
67
+
68
+ test("dashboard empty page snapshot", async ({ page }) => {
69
+ // Set mock token
70
+ await page.addInitScript(() => {
71
+ localStorage.setItem("token", "access-token");
72
+ localStorage.setItem("refresh_token", "refresh-token");
73
+ });
74
+
75
+ await mockDashboardApis(page, []);
76
+ await page.goto("/dashboard");
77
+ await page.waitForSelector("text=No documents yet");
78
+
79
+ if (!process.env.CI) {
80
+ await expect(page).toHaveScreenshot("dashboard-empty.png", {
81
+ maxDiffPixelRatio: 0.1,
82
+ threshold: 0.2,
83
+ });
84
+ } else {
85
+ await expect(page.locator("text=No documents yet")).toBeVisible();
86
+ }
87
+ });
88
+
89
+ test("dashboard with document page snapshot", async ({ page }) => {
90
+ // Set mock token
91
+ await page.addInitScript(() => {
92
+ localStorage.setItem("token", "access-token");
93
+ localStorage.setItem("refresh_token", "refresh-token");
94
+ });
95
+
96
+ await mockDashboardApis(page, [uploadedDocument]);
97
+ await page.goto("/dashboard");
98
+ await page.waitForSelector("text=notes.txt");
99
+
100
+ if (!process.env.CI) {
101
+ await expect(page).toHaveScreenshot("dashboard-with-doc.png", {
102
+ maxDiffPixelRatio: 0.1,
103
+ threshold: 0.2,
104
+ });
105
+ } else {
106
+ await expect(page.locator("text=notes.txt")).toBeVisible();
107
+ }
108
+ });
109
+ });
frontend/src/app/dashboard/page.tsx CHANGED
@@ -7,8 +7,8 @@ import { useAuth } from "@/lib/auth";
7
  import { api, CONNECTION_ERROR_BANNER_MESSAGE, CONNECTION_ERROR_MESSAGE } from "@/lib/api";
8
  import Header from "@/components/layout/Header";
9
  import DocumentSidebar from "@/components/document/DocumentSidebar";
 
10
  import ChatPanel from "@/components/chat/ChatPanel";
11
-
12
  function PDFViewerSkeleton() {
13
  return (
14
  <div
@@ -164,6 +164,9 @@ export default function DashboardPage() {
164
  </div>
165
  )}
166
 
 
 
 
167
  {/* ── Center: Chat Panel ──────────────────────────────────── */}
168
  <div className="flex-1 min-w-0 flex flex-col">
169
  <ChatPanel
 
7
  import { api, CONNECTION_ERROR_BANNER_MESSAGE, CONNECTION_ERROR_MESSAGE } from "@/lib/api";
8
  import Header from "@/components/layout/Header";
9
  import DocumentSidebar from "@/components/document/DocumentSidebar";
10
+ import ChatSessionSidebar from "@/components/chat/ChatSessionSidebar";
11
  import ChatPanel from "@/components/chat/ChatPanel";
 
12
  function PDFViewerSkeleton() {
13
  return (
14
  <div
 
164
  </div>
165
  )}
166
 
167
+ {/* ── Left-Center: Chat Sessions Sidebar ──── */}
168
+ <ChatSessionSidebar />
169
+
170
  {/* ── Center: Chat Panel ──────────────────────────────────── */}
171
  <div className="flex-1 min-w-0 flex flex-col">
172
  <ChatPanel
frontend/src/components/auth/HuggingFaceTokenModal.tsx ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client";
2
+
3
+ import { useState, useRef, useEffect, isValidElement, type ReactNode } from "react";
4
+ import { Button } from "@/components/ui/button";
5
+ import { Input } from "@/components/ui/input";
6
+ import {
7
+ Dialog,
8
+ DialogContent,
9
+ DialogDescription,
10
+ DialogFooter,
11
+ DialogHeader,
12
+ DialogTitle,
13
+ DialogTrigger,
14
+ } from "@/components/ui/dialog";
15
+ import { useAuthStore } from "@/store/auth-store";
16
+ import { Eye, EyeOff, AlertCircle, CheckCircle2, Loader2, ExternalLink, Key } from "lucide-react";
17
+
18
+ interface HuggingFaceTokenModalProps {
19
+ /** Optional — if provided, allows a button-triggered dialog pattern */
20
+ children?: ReactNode;
21
+ }
22
+
23
+ export default function HuggingFaceTokenModal({ children }: HuggingFaceTokenModalProps) {
24
+ const user = useAuthStore((state) => state.user);
25
+ const setHfToken = useAuthStore((state) => state.setHfToken);
26
+
27
+ const existingToken = user?.hf_token ?? "";
28
+ const hasExistingToken = existingToken.length > 0;
29
+
30
+ const [open, setOpen] = useState(false);
31
+ const [inputToken, setInputToken] = useState(existingToken);
32
+ const [saving, setSaving] = useState(false);
33
+ const [error, setError] = useState<string | null>(null);
34
+ const [success, setSuccess] = useState(false);
35
+ const [showToken, setShowToken] = useState(false);
36
+
37
+ const mountedRef = useRef(true);
38
+ const timeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null);
39
+
40
+ // Cleanup auto-close timeout and unmount guard on unmount
41
+ useEffect(() => {
42
+ return () => {
43
+ mountedRef.current = false;
44
+ if (timeoutRef.current) {
45
+ clearTimeout(timeoutRef.current);
46
+ timeoutRef.current = null;
47
+ }
48
+ };
49
+ }, []);
50
+
51
+ const clearAutoCloseTimeout = () => {
52
+ if (timeoutRef.current) {
53
+ clearTimeout(timeoutRef.current);
54
+ timeoutRef.current = null;
55
+ }
56
+ };
57
+
58
+ const handleOpenChange = (newOpen: boolean) => {
59
+ clearAutoCloseTimeout();
60
+ setOpen(newOpen);
61
+ if (newOpen) {
62
+ // Reset to current store value when opening (picks up changes from background saves)
63
+ const currentToken = useAuthStore.getState().user?.hf_token ?? "";
64
+ setInputToken(currentToken);
65
+ setSaving(false);
66
+ setError(null);
67
+ setSuccess(false);
68
+ setShowToken(false);
69
+ }
70
+ };
71
+
72
+ const handleSave = async () => {
73
+ if (saving) return;
74
+ const token = inputToken.trim();
75
+ if (!token) {
76
+ setError("Please enter a valid token");
77
+ return;
78
+ }
79
+
80
+ setSaving(true);
81
+ setError(null);
82
+ setSuccess(false);
83
+
84
+ try {
85
+ await setHfToken(token);
86
+ if (!mountedRef.current) return;
87
+ setSaving(false);
88
+ setSuccess(true);
89
+ // Auto-close after 1.5s
90
+ timeoutRef.current = setTimeout(() => setOpen(false), 1500);
91
+ } catch (err) {
92
+ if (!mountedRef.current) return;
93
+ setSaving(false);
94
+ setError(err instanceof Error ? err.message : "Failed to save token");
95
+ }
96
+ };
97
+
98
+ const isSaveDisabled = inputToken.trim() === "" || saving;
99
+
100
+ return (
101
+ <Dialog open={open} onOpenChange={handleOpenChange}>
102
+ {children ? (
103
+ <DialogTrigger render={isValidElement(children) ? children : <span>{children}</span>} />
104
+ ) : (
105
+ <DialogTrigger
106
+ render={
107
+ <button className="flex w-full cursor-pointer items-center rounded-sm px-2 py-1.5 text-sm outline-none transition-colors hover:bg-accent hover:text-accent-foreground">
108
+ <Key className="mr-2 h-4 w-4" />
109
+ <span>HuggingFace Token</span>
110
+ </button>
111
+ }
112
+ />
113
+ )}
114
+ <DialogContent className="max-w-md sm:rounded-2xl border-border/40 p-6 md:p-8 bg-background/95 backdrop-blur-xl shadow-2xl" showCloseButton={false}>
115
+ <DialogHeader className="gap-1">
116
+ <DialogTitle className="text-2xl font-bold tracking-tight">
117
+ 🤗 HuggingFace Token
118
+ </DialogTitle>
119
+ <DialogDescription className="text-sm text-muted-foreground mt-1.5">
120
+ Enter your HuggingFace API token to enable inference endpoints and model access.
121
+ </DialogDescription>
122
+ </DialogHeader>
123
+
124
+ <form onSubmit={(e) => { e.preventDefault(); if (!isSaveDisabled) handleSave(); }}>
125
+ <div className="space-y-4 mt-6">
126
+ {/* Token label with configured indicator */}
127
+ <div className="flex items-center gap-2">
128
+ <label htmlFor="hf-token-input" className="text-sm font-medium text-foreground/80">
129
+ Token
130
+ </label>
131
+ {hasExistingToken && (
132
+ <span className="inline-flex items-center gap-1 text-xs text-primary">
133
+ <CheckCircle2 className="w-3 h-3" />
134
+ Token configured
135
+ </span>
136
+ )}
137
+ </div>
138
+
139
+ {/* Input wrapper with visibility toggle */}
140
+ <div className="relative">
141
+ <Input
142
+ id="hf-token-input"
143
+ type={showToken ? "text" : "password"}
144
+ value={inputToken}
145
+ onChange={(e) => {
146
+ setInputToken(e.target.value);
147
+ if (error) setError(null);
148
+ if (success) setSuccess(false);
149
+ }}
150
+ placeholder="hf_..."
151
+ className="pr-10 font-mono"
152
+ disabled={saving}
153
+ autoFocus
154
+ aria-label="HuggingFace API Token"
155
+ />
156
+ <Button
157
+ variant="ghost"
158
+ size="icon-xs"
159
+ className="absolute right-2 top-1/2 -translate-y-1/2"
160
+ onClick={() => setShowToken(!showToken)}
161
+ type="button"
162
+ aria-label={showToken ? "Hide token" : "Show token"}
163
+ disabled={saving}
164
+ >
165
+ {showToken ? <EyeOff className="w-4 h-4" /> : <Eye className="w-4 h-4" />}
166
+ </Button>
167
+ </div>
168
+
169
+ {/* External link */}
170
+ <a
171
+ href="https://huggingface.co/settings/tokens"
172
+ target="_blank"
173
+ rel="noopener noreferrer"
174
+ className="text-xs text-muted-foreground hover:text-primary underline-offset-2 transition-colors inline-flex items-center gap-1"
175
+ >
176
+ <ExternalLink className="w-3 h-3" />
177
+ Get your API token from HuggingFace Settings
178
+ </a>
179
+ </div>
180
+
181
+ {/* Error banner */}
182
+ {error && (
183
+ <div
184
+ className="p-4 border border-destructive/30 bg-destructive/5 rounded-xl text-sm text-destructive flex items-start gap-2 mt-4 animate-in fade-in slide-in-from-top-2 duration-200"
185
+ role="alert"
186
+ aria-live="polite"
187
+ >
188
+ <AlertCircle className="w-4 h-4 mt-0.5 shrink-0" />
189
+ <span>{error}</span>
190
+ </div>
191
+ )}
192
+
193
+ {/* Success banner */}
194
+ {success && (
195
+ <div
196
+ className="p-4 border border-primary/20 bg-primary/5 rounded-xl text-sm text-primary flex items-start gap-2 mt-4 animate-in fade-in slide-in-from-top-2 duration-200"
197
+ aria-live="polite"
198
+ >
199
+ <CheckCircle2 className="w-4 h-4 mt-0.5 shrink-0" />
200
+ <span>Token saved successfully</span>
201
+ </div>
202
+ )}
203
+ </form>
204
+
205
+ {/* Footer */}
206
+ <DialogFooter className="mt-4">
207
+ <Button variant="outline" onClick={() => setOpen(false)}>
208
+ Cancel
209
+ </Button>
210
+ <Button
211
+ onClick={handleSave}
212
+ disabled={isSaveDisabled}
213
+ aria-busy={saving}
214
+ title={hasExistingToken ? "Replace existing token with a new one" : undefined}
215
+ >
216
+ {saving ? (
217
+ <>
218
+ <Loader2 className="w-4 h-4 animate-spin mr-1.5" />
219
+ Saving...
220
+ </>
221
+ ) : hasExistingToken ? (
222
+ "Update Token"
223
+ ) : (
224
+ "Save Token"
225
+ )}
226
+ </Button>
227
+ </DialogFooter>
228
+ </DialogContent>
229
+ </Dialog>
230
+ );
231
+ }
frontend/src/components/chat/ChatPanel.tsx CHANGED
@@ -22,11 +22,13 @@ export default function ChatPanel({ activeDoc, onCitationClick }: Props) {
22
  const input = useChatStore((state) => state.input);
23
  const streaming = useChatStore((state) => state.streaming);
24
  const isTyping = useChatStore((state) => state.isTyping);
 
25
  const setMessages = useChatStore((state) => state.setMessages);
26
  const setInput = useChatStore((state) => state.setInput);
27
  const setStreaming = useChatStore((state) => state.setStreaming);
28
  const setIsTyping = useChatStore((state) => state.setIsTyping);
29
  const resetChat = useChatStore((state) => state.resetChat);
 
30
  const [showExportMenu, setShowExportMenu] = useState(false);
31
  const textareaRef = useRef<HTMLTextAreaElement>(null);
32
  const bottomRef = useRef<HTMLDivElement>(null);
@@ -61,8 +63,13 @@ export default function ChatPanel({ activeDoc, onCitationClick }: Props) {
61
  };
62
  }, [resetChat]);
63
 
64
- // Load history on doc change
65
  useEffect(() => {
 
 
 
 
 
66
  if (!activeDoc) {
67
  prevDocId.current = null;
68
  setMessages([]);
@@ -100,7 +107,7 @@ export default function ChatPanel({ activeDoc, onCitationClick }: Props) {
100
  return () => {
101
  cancelled = true;
102
  };
103
- }, [activeDoc, resetChat, setMessages]);
104
 
105
  const handleSend = async () => {
106
  if (!input.trim() || streaming) return;
@@ -128,6 +135,7 @@ export default function ChatPanel({ activeDoc, onCitationClick }: Props) {
128
  const stream = api.streamPost("/api/v1/chat/ask/stream", {
129
  question,
130
  document_id: activeDoc?.id || null,
 
131
  });
132
 
133
  for await (const event of stream) {
 
22
  const input = useChatStore((state) => state.input);
23
  const streaming = useChatStore((state) => state.streaming);
24
  const isTyping = useChatStore((state) => state.isTyping);
25
+ const activeSessionId = useChatStore((state) => state.activeSessionId);
26
  const setMessages = useChatStore((state) => state.setMessages);
27
  const setInput = useChatStore((state) => state.setInput);
28
  const setStreaming = useChatStore((state) => state.setStreaming);
29
  const setIsTyping = useChatStore((state) => state.setIsTyping);
30
  const resetChat = useChatStore((state) => state.resetChat);
31
+ const fetchSessionHistory = useChatStore((state) => state.fetchSessionHistory);
32
  const [showExportMenu, setShowExportMenu] = useState(false);
33
  const textareaRef = useRef<HTMLTextAreaElement>(null);
34
  const bottomRef = useRef<HTMLDivElement>(null);
 
63
  };
64
  }, [resetChat]);
65
 
66
+ // Load history on activeSessionId or fallback to activeDoc change
67
  useEffect(() => {
68
+ if (activeSessionId) {
69
+ fetchSessionHistory(activeSessionId);
70
+ return;
71
+ }
72
+
73
  if (!activeDoc) {
74
  prevDocId.current = null;
75
  setMessages([]);
 
107
  return () => {
108
  cancelled = true;
109
  };
110
+ }, [activeSessionId, activeDoc, fetchSessionHistory, setMessages]);
111
 
112
  const handleSend = async () => {
113
  if (!input.trim() || streaming) return;
 
135
  const stream = api.streamPost("/api/v1/chat/ask/stream", {
136
  question,
137
  document_id: activeDoc?.id || null,
138
+ session_id: activeSessionId,
139
  });
140
 
141
  for await (const event of stream) {
frontend/src/components/chat/ChatSessionSidebar.tsx ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client";
2
+
3
+ import { useState, useEffect } from "react";
4
+ import { Plus, Edit2, Trash2, MessageSquare, ChevronLeft } from "lucide-react";
5
+ import { useChatStore, type ChatSession } from "@/store/chat-store";
6
+ import { Button } from "@/components/ui/button";
7
+ import { Input } from "@/components/ui/input";
8
+ import { cn } from "@/lib/utils";
9
+
10
/**
 * Collapsible sidebar that lists chat sessions from the chat store and lets
 * the user create, select, rename and delete them.
 *
 * All persistence goes through the store actions (`createSession`,
 * `renameSession`, `deleteSession`, `fetchSessionHistory`); this component
 * only owns transient UI state: open/collapsed, which row is being renamed,
 * the draft title, and the "creating" busy flag.
 */
export default function ChatSessionSidebar() {
  const sessions = useChatStore((state) => state.sessions);
  const activeSessionId = useChatStore((state) => state.activeSessionId);
  const fetchSessions = useChatStore((state) => state.fetchSessions);
  const createSession = useChatStore((state) => state.createSession);
  const renameSession = useChatStore((state) => state.renameSession);
  const deleteSession = useChatStore((state) => state.deleteSession);
  const setActiveSessionId = useChatStore((state) => state.setActiveSessionId);
  const fetchSessionHistory = useChatStore((state) => state.fetchSessionHistory);

  const [isOpen, setIsOpen] = useState(true);
  const [editingId, setEditingId] = useState<string | null>(null);
  const [editTitle, setEditTitle] = useState("");
  const [creating, setCreating] = useState(false);

  // Load sessions on mount
  useEffect(() => {
    fetchSessions();
  }, [fetchSessions]);

  // Close the inline editor only if it still belongs to `id`, so a slow save
  // for one row cannot dismiss an editor the user has since opened on another.
  const closeEditor = (id: string) => {
    setEditingId((current) => (current === id ? null : current));
  };

  /** Creates a session titled "Chat N" and immediately opens it for renaming. */
  const handleCreate = async () => {
    if (creating) return;
    setCreating(true);
    try {
      const defaultTitle = `Chat ${sessions.length + 1}`;
      const newId = await createSession(defaultTitle);
      setEditingId(newId);
      setEditTitle(defaultTitle);
    } catch (err) {
      console.error(err);
    } finally {
      setCreating(false);
    }
  };

  /** Switches the given row into inline-rename mode without selecting it. */
  const handleStartRename = (session: ChatSession, e: React.MouseEvent) => {
    e.stopPropagation();
    setEditingId(session.id);
    setEditTitle(session.title);
  };

  /**
   * Commits the draft title for session `id`. Called from both form submit
   * and input blur.
   *
   * An empty draft or a draft equal to the stored title just closes the
   * editor; no request is sent. The stored title is read from the live store
   * state rather than the render closure so that a blur arriving after a
   * successful submit sees the already-updated title and does not save twice.
   */
  const handleSaveRename = async (id: string, e?: React.FormEvent) => {
    if (e) e.preventDefault();

    const nextTitle = editTitle.trim();
    const storedTitle = useChatStore
      .getState()
      .sessions.find((s) => s.id === id)?.title;

    if (!nextTitle || nextTitle === storedTitle) {
      closeEditor(id);
      return;
    }

    try {
      await renameSession(id, nextTitle);
    } catch (err) {
      console.error(err);
    } finally {
      closeEditor(id);
    }
  };

  /** Deletes a session after the user confirms the browser prompt. */
  const handleDelete = async (id: string, e: React.MouseEvent) => {
    e.stopPropagation();
    if (confirm("Are you sure you want to delete this chat session?")) {
      try {
        await deleteSession(id);
      } catch (err) {
        console.error(err);
      }
    }
  };

  /** Marks a session active and loads its message history. */
  const handleSelectSession = async (id: string) => {
    setActiveSessionId(id);
    await fetchSessionHistory(id);
  };

  return (
    <div className={cn("relative flex h-full border-r border-border/50 bg-card/20 select-none transition-all duration-300", isOpen ? "w-64" : "w-0")}>
      <div className={cn("flex flex-col h-full w-full overflow-hidden transition-opacity duration-200", isOpen ? "opacity-100" : "opacity-0 pointer-events-none")}>
        {/* Sidebar Header */}
        <div className="flex items-center justify-between p-3 border-b border-border/50 shrink-0 bg-card/45">
          <span className="text-xs font-semibold uppercase tracking-wider text-muted-foreground">Chat Sessions</span>
          <Button
            onClick={handleCreate}
            variant="outline"
            size="icon"
            className="h-7 w-7 bg-background/50 hover:bg-accent hover:text-accent-foreground"
            disabled={creating}
            aria-label="New chat session"
          >
            <Plus className="w-4 h-4" />
          </Button>
        </div>

        {/* Sessions List */}
        <div className="flex-1 overflow-y-auto p-2 space-y-1 scrollbar-thin">
          {sessions.length === 0 ? (
            <div className="text-center py-8 px-4">
              <p className="text-xs text-muted-foreground">No chat sessions. Click &quot;+&quot; to start a new chat.</p>
            </div>
          ) : (
            sessions.map((session) => {
              const isActive = session.id === activeSessionId;
              const isEditing = session.id === editingId;

              return (
                <div
                  key={session.id}
                  onClick={() => !isEditing && handleSelectSession(session.id)}
                  aria-current={isActive ? "true" : undefined}
                  className={cn(
                    "group flex items-center justify-between rounded-lg px-3 py-2 text-sm transition-all duration-200 cursor-pointer border",
                    isActive
                      ? "bg-accent/80 border-accent text-accent-foreground shadow-sm"
                      : "border-transparent hover:bg-card/60 hover:text-foreground text-muted-foreground"
                  )}
                >
                  <div className="flex items-center gap-2 min-w-0 flex-1">
                    <MessageSquare className={cn("w-4 h-4 shrink-0", isActive ? "text-primary" : "text-muted-foreground")} />

                    {isEditing ? (
                      <form
                        onSubmit={(e) => handleSaveRename(session.id, e)}
                        className="flex items-center gap-1 w-full"
                        onClick={(e) => e.stopPropagation()}
                      >
                        <Input
                          value={editTitle}
                          onChange={(e) => setEditTitle(e.target.value)}
                          className="h-6 text-xs px-1 py-0 bg-background/50 border-input w-full"
                          autoFocus
                          aria-label="Session title"
                          onBlur={() => handleSaveRename(session.id)}
                        />
                      </form>
                    ) : (
                      <span className="truncate text-xs font-medium">{session.title}</span>
                    )}
                  </div>

                  {!isEditing && (
                    // focus-within keeps the actions visible for keyboard users,
                    // who never trigger the hover state.
                    <div className="flex items-center gap-1 opacity-0 group-hover:opacity-100 focus-within:opacity-100 transition-opacity duration-150 shrink-0 ml-1">
                      <Button
                        variant="ghost"
                        size="icon"
                        className="h-5 w-5 rounded-md hover:bg-background/80"
                        onClick={(e) => handleStartRename(session, e)}
                        aria-label={`Rename ${session.title}`}
                      >
                        <Edit2 className="w-3 h-3" />
                      </Button>
                      <Button
                        variant="ghost"
                        size="icon"
                        className="h-5 w-5 rounded-md hover:bg-destructive/10 hover:text-destructive"
                        onClick={(e) => handleDelete(session.id, e)}
                        aria-label={`Delete ${session.title}`}
                      >
                        <Trash2 className="w-3 h-3" />
                      </Button>
                    </div>
                  )}
                </div>
              );
            })
          )}
        </div>
      </div>

      {/* Collapse Toggle Button */}
      <Button
        onClick={() => setIsOpen((open) => !open)}
        variant="ghost"
        size="icon"
        aria-label={isOpen ? "Collapse chat sessions" : "Expand chat sessions"}
        aria-expanded={isOpen}
        className={cn(
          "absolute -right-3 top-1/2 -translate-y-1/2 z-40 h-6 w-6 rounded-full border border-border bg-background shadow-md hover:bg-accent hover:text-accent-foreground",
          !isOpen && "right-auto -left-3 rotate-180"
        )}
      >
        <ChevronLeft className="w-3.5 h-3.5" />
      </Button>
    </div>
  );
}
frontend/src/components/chat/SourceCard.tsx CHANGED
@@ -13,6 +13,77 @@ import { ChevronDown, ChevronUp, FileText, Eye, TextQuote } from "lucide-react";
13
 
14
  const EXCERPT_THRESHOLD = 200;
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  interface Props {
17
  sources: SourceChunk[];
18
  onPageClick: (page: number) => void;
@@ -36,34 +107,37 @@ export default function SourceCard({ sources = [], onPageClick }: Props) {
36
 
37
  return (
38
  <div className="rounded-lg border border-border/50 bg-card/50 overflow-hidden">
39
- {/* ── Header ──────────────────────────────────── */}
40
- <button
41
- onClick={() => setExpanded(!expanded)}
42
- className="w-full flex items-center justify-between px-3 py-2 text-xs hover:bg-accent/30 transition-colors"
43
- >
44
- <span className="flex items-center gap-1.5 text-muted-foreground">
45
- <FileText className="w-3.5 h-3.5" />
46
- {sources.length} source{sources.length > 1 ? "s" : ""} cited
47
- </span>
48
- {expanded ? (
49
- <ChevronUp className="w-3.5 h-3.5 text-muted-foreground" />
50
- ) : (
51
- <ChevronDown className="w-3.5 h-3.5 text-muted-foreground" />
52
- )}
53
- </button>
54
-
55
- {/* ── Collapsed: Mini badges with hover preview ── */}
56
- {!expanded && (
57
- <div className="px-3 pb-2 flex flex-wrap gap-1">
58
- {sources.map((src, i) => (
 
 
 
59
  <Tooltip key={i}>
60
  <TooltipTrigger className="inline-flex">
61
  <Badge
62
- variant="secondary"
63
- className="text-[10px] h-5 cursor-pointer hover:bg-primary/20 transition-colors"
64
  onClick={() => onPageClick(src.page + 1)}
65
  >
66
- p.{src.page + 1} {src.confidence}%
67
  </Badge>
68
  </TooltipTrigger>
69
  <TooltipContent
@@ -71,74 +145,68 @@ export default function SourceCard({ sources = [], onPageClick }: Props) {
71
  align="center"
72
  className="max-w-xs p-2"
73
  >
 
 
 
 
74
  <p className="text-[11px] leading-relaxed line-clamp-6">
75
  {src.text}
76
  </p>
77
  </TooltipContent>
78
  </Tooltip>
79
- ))}
80
- </div>
81
- )}
 
82
 
83
- {/* ── Expanded: Full source cards ─────────────── */}
84
- {expanded && (
85
- <div className="border-t border-border/30">
86
- {sources.map((src, i) => (
87
- <div
88
- key={i}
89
- className="px-3 py-2.5 border-b border-border/20 last:border-b-0 hover:bg-accent/20 transition-colors"
90
- >
91
- <div className="flex items-center justify-between mb-1.5">
92
- <div className="flex items-center gap-2">
93
- <span className="text-[10px] font-medium text-muted-foreground">
94
- {src.filename}
95
- </span>
96
- <Badge variant="outline" className="text-[9px] h-4 px-1.5">
97
- Page {src.page + 1}
98
- </Badge>
99
- <Badge
100
- variant="secondary"
101
- className={`text-[9px] h-4 px-1.5 ${
102
- src.confidence >= 80
103
- ? "text-emerald-400 bg-emerald-400/10"
104
- : src.confidence >= 50
105
- ? "text-yellow-400 bg-yellow-400/10"
106
- : "text-muted-foreground"
107
- }`}
108
- >
109
- {src.confidence}% match
110
- </Badge>
111
- </div>
112
- <Button
113
- variant="ghost"
114
- size="sm"
115
- className="h-6 px-2 text-[10px]"
116
- onClick={() => onPageClick(src.page + 1)}
117
- >
118
- <Eye className="w-3 h-3 mr-1" />
119
- View
120
- </Button>
121
  </div>
122
- <p
123
- className={`text-[11px] text-muted-foreground leading-relaxed ${
124
- excerptOpen.has(i) ? "" : "line-clamp-3"
125
- }`}
 
126
  >
127
- {src.text}
128
- </p>
129
- {src.text.length > EXCERPT_THRESHOLD && (
130
- <button
131
- onClick={() => toggleExcerpt(i)}
132
- className="mt-1.5 flex items-center gap-1 text-[10px] text-primary/70 hover:text-primary transition-colors"
133
- >
134
- <TextQuote className="w-3 h-3" />
135
- {excerptOpen.has(i) ? "Hide excerpt" : "Show excerpt"}
136
- </button>
137
- )}
138
  </div>
139
- ))}
140
- </div>
141
- )}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  </div>
143
  );
144
  }
 
13
 
14
  const EXCERPT_THRESHOLD = 200;
15
 
16
+ type ConfidenceLevel = "High" | "Medium" | "Low" | "Unknown";
17
+
18
+ interface ConfidenceBadgeMeta {
19
+ label: ConfidenceLevel;
20
+ className: string;
21
+ }
22
+
23
/**
 * Normalizes a retrieval metric to a fraction in the range [0, 1].
 *
 * Values greater than 1 are treated as percentages and divided by 100;
 * values at or below 1 are treated as fractions already. The result is then
 * clamped to [0, 1] so out-of-range inputs (e.g. 250 or a negative score)
 * cannot render as "250%" or a negative percentage.
 *
 * @param value Raw metric; may be missing.
 * @returns The clamped fraction, or `undefined` when `value` is not a finite
 *          number (missing, NaN, or ±Infinity).
 */
const normalizeMetricValue = (value?: number) => {
  // Number.isFinite rejects NaN and ±Infinity; the latter would otherwise
  // flow through and be displayed as "Infinity%".
  if (typeof value !== "number" || !Number.isFinite(value)) return undefined;

  const fraction = value > 1 ? value / 100 : value;
  return Math.min(1, Math.max(0, fraction));
};
27
+
28
/**
 * Formats a metric for display as a whole-number percentage.
 *
 * @param value Raw metric; may be missing.
 * @returns A string such as "85%", or "N/A" when the metric cannot be
 *          normalized.
 */
const formatMetricValue = (value?: number) => {
  const fraction = normalizeMetricValue(value);
  return fraction === undefined ? "N/A" : `${Math.round(fraction * 100)}%`;
};
33
+
34
/**
 * Maps a metric to a coarse confidence level and the badge classes used to
 * render it.
 *
 * Levels: "Unknown" when the metric cannot be normalized, "High" at 0.8 and
 * above, "Medium" at 0.5 and above, otherwise "Low".
 *
 * @param value Raw metric; may be missing.
 * @returns The level label and its Tailwind class string.
 */
const getConfidenceBadgeMeta = (value?: number): ConfidenceBadgeMeta => {
  const fraction = normalizeMetricValue(value);

  if (fraction === undefined) {
    return {
      label: "Unknown",
      className: "border-muted bg-muted/40 text-muted-foreground",
    };
  }

  // Ordered from the highest floor down; the first floor the metric reaches wins.
  const tiers: ReadonlyArray<readonly [number, ConfidenceBadgeMeta]> = [
    [
      0.8,
      {
        label: "High",
        className: "border-emerald-500/30 bg-emerald-500/10 text-emerald-600",
      },
    ],
    [
      0.5,
      {
        label: "Medium",
        className: "border-amber-500/30 bg-amber-500/10 text-amber-600",
      },
    ],
  ];

  for (const [floor, meta] of tiers) {
    if (fraction >= floor) return { ...meta };
  }

  return {
    label: "Low",
    className: "border-red-500/30 bg-red-500/10 text-red-600",
  };
};
63
+
64
/**
 * Picks the metric that drives a source's summary badge: `confidence` when
 * it is present, otherwise `score`.
 */
const getPrimarySourceMetric = (source: SourceChunk) => {
  const { confidence } = source;
  if (confidence !== null && confidence !== undefined) return confidence;
  return source.score;
};
66
+
67
/**
 * Small outlined badge showing a metric's name and its coarse level
 * (e.g. "Score: High"). The formatted percentage is exposed through the
 * badge's `title` attribute.
 */
const MetricBadge = ({
  label,
  value,
}: {
  label: "Score" | "Confidence";
  value?: number;
}) => {
  const { label: level, className: levelClasses } = getConfidenceBadgeMeta(value);
  const hoverText = `${label}: ${formatMetricValue(value)}`;

  return (
    <Badge
      variant="outline"
      className={`h-5 px-1.5 text-[9px] font-medium ${levelClasses}`}
      title={hoverText}
    >
      {label}: {level}
    </Badge>
  );
};
86
+
87
  interface Props {
88
  sources: SourceChunk[];
89
  onPageClick: (page: number) => void;
 
107
 
108
  return (
109
  <div className="rounded-lg border border-border/50 bg-card/50 overflow-hidden">
110
+ <button
111
+ onClick={() => setExpanded(!expanded)}
112
+ className="w-full flex items-center justify-between px-3 py-2 text-xs hover:bg-accent/30 transition-colors"
113
+ >
114
+ <span className="flex items-center gap-1.5 text-muted-foreground">
115
+ <FileText className="w-3.5 h-3.5" />
116
+ {sources.length} source{sources.length > 1 ? "s" : ""} cited
117
+ </span>
118
+ {expanded ? (
119
+ <ChevronUp className="w-3.5 h-3.5 text-muted-foreground" />
120
+ ) : (
121
+ <ChevronDown className="w-3.5 h-3.5 text-muted-foreground" />
122
+ )}
123
+ </button>
124
+
125
+ {!expanded && (
126
+ <div className="px-3 pb-2 flex flex-wrap gap-1">
127
+ {sources.map((src, i) => {
128
+ const badgeMeta = getConfidenceBadgeMeta(
129
+ getPrimarySourceMetric(src)
130
+ );
131
+
132
+ return (
133
  <Tooltip key={i}>
134
  <TooltipTrigger className="inline-flex">
135
  <Badge
136
+ variant="outline"
137
+ className={`text-[10px] h-5 cursor-pointer hover:bg-primary/20 transition-colors ${badgeMeta.className}`}
138
  onClick={() => onPageClick(src.page + 1)}
139
  >
140
+ p.{src.page + 1} - {badgeMeta.label}
141
  </Badge>
142
  </TooltipTrigger>
143
  <TooltipContent
 
145
  align="center"
146
  className="max-w-xs p-2"
147
  >
148
+ <div className="mb-1 flex flex-wrap gap-1">
149
+ <MetricBadge label="Score" value={src.score} />
150
+ <MetricBadge label="Confidence" value={src.confidence} />
151
+ </div>
152
  <p className="text-[11px] leading-relaxed line-clamp-6">
153
  {src.text}
154
  </p>
155
  </TooltipContent>
156
  </Tooltip>
157
+ );
158
+ })}
159
+ </div>
160
+ )}
161
 
162
+ {expanded && (
163
+ <div className="border-t border-border/30">
164
+ {sources.map((src, i) => (
165
+ <div
166
+ key={i}
167
+ className="px-3 py-2.5 border-b border-border/20 last:border-b-0 hover:bg-accent/20 transition-colors"
168
+ >
169
+ <div className="flex items-center justify-between gap-2 mb-1.5">
170
+ <div className="flex min-w-0 flex-wrap items-center gap-2">
171
+ <span className="truncate text-[10px] font-medium text-muted-foreground">
172
+ {src.filename}
173
+ </span>
174
+ <Badge variant="outline" className="h-5 px-1.5 text-[9px]">
175
+ Page {src.page + 1}
176
+ </Badge>
177
+ <MetricBadge label="Score" value={src.score} />
178
+ <MetricBadge label="Confidence" value={src.confidence} />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  </div>
180
+ <Button
181
+ variant="ghost"
182
+ size="sm"
183
+ className="h-6 shrink-0 px-2 text-[10px]"
184
+ onClick={() => onPageClick(src.page + 1)}
185
  >
186
+ <Eye className="w-3 h-3 mr-1" />
187
+ View
188
+ </Button>
 
 
 
 
 
 
 
 
189
  </div>
190
+ <p
191
+ className={`text-[11px] text-muted-foreground leading-relaxed ${
192
+ excerptOpen.has(i) ? "" : "line-clamp-3"
193
+ }`}
194
+ >
195
+ {src.text}
196
+ </p>
197
+ {src.text.length > EXCERPT_THRESHOLD && (
198
+ <button
199
+ onClick={() => toggleExcerpt(i)}
200
+ className="mt-1.5 flex items-center gap-1 text-[10px] text-primary/70 hover:text-primary transition-colors"
201
+ >
202
+ <TextQuote className="w-3 h-3" />
203
+ {excerptOpen.has(i) ? "Hide excerpt" : "Show excerpt"}
204
+ </button>
205
+ )}
206
+ </div>
207
+ ))}
208
+ </div>
209
+ )}
210
  </div>
211
  );
212
  }
frontend/src/components/layout/ContributorsPanel.tsx CHANGED
@@ -1,6 +1,7 @@
1
  "use client";
2
 
3
  import { useState, useEffect } from "react";
 
4
  import { GitBranch, Star, GitPullRequest, Users, X, Trophy, ExternalLink } from "lucide-react";
5
  import { Button } from "@/components/ui/button";
6
  import { api } from "@/lib/api";
@@ -124,7 +125,7 @@ export default function ContributorsPanel({ onClose }: { onClose: () => void })
124
  {medals[i]}
125
  </span>
126
  )}
127
- <img
128
  src={c.avatar_url}
129
  alt={c.login}
130
  width={56}
 
1
  "use client";
2
 
3
  import { useState, useEffect } from "react";
4
+ import Image from "next/image";
5
  import { GitBranch, Star, GitPullRequest, Users, X, Trophy, ExternalLink } from "lucide-react";
6
  import { Button } from "@/components/ui/button";
7
  import { api } from "@/lib/api";
 
125
  {medals[i]}
126
  </span>
127
  )}
128
+ <Image
129
  src={c.avatar_url}
130
  alt={c.login}
131
  width={56}
frontend/src/components/layout/Header.tsx CHANGED
@@ -27,6 +27,7 @@ import {
27
  X,
28
  } from "lucide-react";
29
  import { useTheme } from "next-themes";
 
30
  import { useSyncExternalStore } from "react";
31
 
32
  interface HeaderProps {
 
27
  X,
28
  } from "lucide-react";
29
  import { useTheme } from "next-themes";
30
+
31
  import { useSyncExternalStore } from "react";
32
 
33
  interface HeaderProps {
frontend/src/store/chat-store.ts CHANGED
@@ -1,13 +1,14 @@
1
  "use client";
2
 
3
  import { create } from "zustand";
 
4
 
5
  export interface SourceChunk {
6
  text: string;
7
  filename: string;
8
  page: number;
9
- score: number;
10
- confidence: number;
11
  }
12
 
13
  export interface ChatMsg {
@@ -18,6 +19,12 @@ export interface ChatMsg {
18
  isStreaming?: boolean;
19
  }
20
 
 
 
 
 
 
 
21
  type Setter<T> = T | ((prev: T) => T);
22
 
23
  interface ChatStore {
@@ -25,21 +32,32 @@ interface ChatStore {
25
  input: string;
26
  streaming: boolean;
27
  isTyping: boolean;
 
 
28
  setMessages: (value: Setter<ChatMsg[]>) => void;
29
  setInput: (value: Setter<string>) => void;
30
  setStreaming: (value: Setter<boolean>) => void;
31
  setIsTyping: (value: Setter<boolean>) => void;
 
 
 
 
 
 
 
32
  resetChat: () => void;
33
  }
34
 
35
  const resolveValue = <T,>(value: Setter<T>, current: T): T =>
36
  typeof value === "function" ? (value as (prev: T) => T)(current) : value;
37
 
38
- export const useChatStore = create<ChatStore>((set) => ({
39
  messages: [],
40
  input: "",
41
  streaming: false,
42
  isTyping: false,
 
 
43
 
44
  setMessages(value) {
45
  set((state) => ({ messages: resolveValue(value, state.messages) }));
@@ -57,12 +75,97 @@ export const useChatStore = create<ChatStore>((set) => ({
57
  set((state) => ({ isTyping: resolveValue(value, state.isTyping) }));
58
  },
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  resetChat() {
61
  set({
62
  messages: [],
63
  input: "",
64
  streaming: false,
65
  isTyping: false,
 
 
66
  });
67
  },
68
  }));
 
1
  "use client";
2
 
3
  import { create } from "zustand";
4
+ import { api } from "@/lib/api";
5
 
6
  export interface SourceChunk {
7
  text: string;
8
  filename: string;
9
  page: number;
10
+ score?: number;
11
+ confidence?: number;
12
  }
13
 
14
  export interface ChatMsg {
 
19
  isStreaming?: boolean;
20
  }
21
 
22
+ export interface ChatSession {
23
+ id: string;
24
+ title: string;
25
+ created_at: string;
26
+ }
27
+
28
  type Setter<T> = T | ((prev: T) => T);
29
 
30
  interface ChatStore {
 
32
  input: string;
33
  streaming: boolean;
34
  isTyping: boolean;
35
+ sessions: ChatSession[];
36
+ activeSessionId: string | null;
37
  setMessages: (value: Setter<ChatMsg[]>) => void;
38
  setInput: (value: Setter<string>) => void;
39
  setStreaming: (value: Setter<boolean>) => void;
40
  setIsTyping: (value: Setter<boolean>) => void;
41
+ setSessions: (value: Setter<ChatSession[]>) => void;
42
+ setActiveSessionId: (value: Setter<string | null>) => void;
43
+ fetchSessions: () => Promise<void>;
44
+ createSession: (title: string) => Promise<string>;
45
+ renameSession: (id: string, title: string) => Promise<void>;
46
+ deleteSession: (id: string) => Promise<void>;
47
+ fetchSessionHistory: (id: string) => Promise<void>;
48
  resetChat: () => void;
49
  }
50
 
51
/**
 * Resolves a setter argument to a concrete value: an updater function is
 * called with the current value, anything else is returned as-is.
 */
const resolveValue = <T,>(value: Setter<T>, current: T): T => {
  if (typeof value !== "function") return value;
  const updater = value as (prev: T) => T;
  return updater(current);
};
53
 
54
+ export const useChatStore = create<ChatStore>((set, get) => ({
55
  messages: [],
56
  input: "",
57
  streaming: false,
58
  isTyping: false,
59
+ sessions: [],
60
+ activeSessionId: null,
61
 
62
  setMessages(value) {
63
  set((state) => ({ messages: resolveValue(value, state.messages) }));
 
75
  set((state) => ({ isTyping: resolveValue(value, state.isTyping) }));
76
  },
77
 
78
// Replaces the session list; accepts a new array or an updater of the current one.
setSessions(value) {
  set(({ sessions }) => ({ sessions: resolveValue(value, sessions) }));
},
81
+
82
// Sets the active session id; accepts an id, null, or an updater of the current id.
setActiveSessionId(value) {
  set(({ activeSessionId }) => ({
    activeSessionId: resolveValue(value, activeSessionId),
  }));
},
85
+
86
/**
 * Loads all chat sessions from the API into the store.
 *
 * If no session is active yet and the list is non-empty, the first returned
 * session becomes active and its history is loaded. Failures are logged and
 * swallowed.
 */
async fetchSessions() {
  try {
    const fetched = await api.get<ChatSession[]>("/api/v1/chat/sessions");
    set({ sessions: fetched });

    // Nothing to auto-select, or the user already has an active session.
    if (!(fetched.length > 0) || get().activeSessionId) return;

    const firstId = fetched[0].id;
    set({ activeSessionId: firstId });
    await get().fetchSessionHistory(firstId);
  } catch (error) {
    console.error("Failed to fetch chat sessions:", error);
  }
},
98
+
99
/**
 * Creates a session on the server, puts it at the top of the list, makes it
 * active and clears the visible messages.
 *
 * @returns The id of the new session.
 * @throws Re-throws the API error after logging it.
 */
async createSession(title) {
  try {
    const created = await api.post<ChatSession>("/api/v1/chat/sessions", { title });
    set(({ sessions }) => {
      const withCreated = [created, ...sessions];
      return {
        sessions: withCreated,
        activeSessionId: created.id,
        messages: [],
      };
    });
    return created.id;
  } catch (error) {
    console.error("Failed to create chat session:", error);
    throw error;
  }
},
113
+
114
/**
 * Renames a session on the server and swaps the server's returned record
 * into the local list.
 *
 * @throws Re-throws the API error after logging it.
 */
async renameSession(id, title) {
  try {
    const renamed = await api.put<ChatSession>(`/api/v1/chat/sessions/${id}`, { title });
    const swapIn = (entry: ChatSession) => (entry.id === id ? renamed : entry);
    set(({ sessions }) => ({ sessions: sessions.map(swapIn) }));
  } catch (error) {
    console.error("Failed to rename chat session:", error);
    throw error;
  }
},
125
+
126
/**
 * Deletes a session on the server and removes it from the local list.
 *
 * If the deleted session was active, the first remaining session (if any)
 * becomes active. Afterwards the active session's history is reloaded, or
 * the messages are cleared when no session is left active.
 *
 * @throws Re-throws the API error after logging it.
 */
async deleteSession(id) {
  try {
    await api.delete(`/api/v1/chat/sessions/${id}`);

    set(({ sessions, activeSessionId }) => {
      const remaining = sessions.filter((entry) => entry.id !== id);
      if (activeSessionId !== id) {
        return { sessions: remaining, activeSessionId };
      }
      const fallbackId = remaining.length > 0 ? remaining[0].id : null;
      return { sessions: remaining, activeSessionId: fallbackId };
    });

    const currentId = get().activeSessionId;
    if (!currentId) {
      set({ messages: [] });
      return;
    }
    await get().fetchSessionHistory(currentId);
  } catch (error) {
    console.error("Failed to delete chat session:", error);
    throw error;
  }
},
151
+
152
/**
 * Loads the message history of session `id` into `messages`.
 *
 * The response is dropped if a different session has become active while
 * the request was in flight; otherwise a slow reply for a previously
 * selected session could overwrite the messages of the one now on screen.
 * A response without a `messages` array is treated as an empty history.
 * Failures are logged and leave the current messages untouched.
 *
 * @param id Session whose history should be loaded.
 */
async fetchSessionHistory(id) {
  try {
    const data = await api.get<{ messages: ChatMsg[] }>(`/api/v1/chat/history/session/${id}`);

    // Out-of-order guard: only apply the result if this session is still the
    // active one (or no session is active at all).
    const activeId = get().activeSessionId;
    if (activeId !== null && activeId !== id) return;

    set({ messages: Array.isArray(data?.messages) ? data.messages : [] });
  } catch (err) {
    console.error("Failed to fetch session history:", err);
  }
},
160
+
161
// Restores every piece of chat state, including the session list and the
// active session, to its initial value.
resetChat() {
  const initial: Pick<
    ChatStore,
    "messages" | "input" | "streaming" | "isTyping" | "sessions" | "activeSessionId"
  > = {
    messages: [],
    input: "",
    streaming: false,
    isTyping: false,
    sessions: [],
    activeSessionId: null,
  };
  set(initial);
},
171
  }));
package-lock.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "name": "PDF-Assistant-RAG",
3
+ "lockfileVersion": 3,
4
+ "requires": true,
5
+ "packages": {}
6
+ }
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  flask
2
  python-dotenv
3
  pymupdf
 
4
  flask-login
5
  pymongo
6
  werkzeug
@@ -12,4 +13,4 @@ requests-oauthlib
12
  google-genai
13
  cryptography
14
  gunicorn
15
- pinecone
 
1
  flask
2
  python-dotenv
3
  pymupdf
4
+ pdfplumber
5
  flask-login
6
  pymongo
7
  werkzeug
 
13
  google-genai
14
  cryptography
15
  gunicorn
16
+ pinecone