Spaces:
Sleeping
Sleeping
github-actions[bot] commited on
Commit ·
dba1a8e
1
Parent(s): 6e9aef8
Sync from GitHub e2e802be5157aa05d1251459f529eb7eb4242ef2
Browse files- DATABASE_SCHEMA.md +69 -139
- ER_DIAGRAM.md +44 -41
- app.py +91 -14
- auth/session.py +18 -4
- data/crud.py +28 -0
- docs/DESIGN_BRIEF.md +164 -0
- frontend/app.py +4 -0
- requirements.txt +1 -0
- tests/test_chat_citations.py +119 -0
- tests/test_notebook_management_api.py +46 -1
DATABASE_SCHEMA.md
CHANGED
|
@@ -1,161 +1,91 @@
|
|
| 1 |
# Database Schema
|
| 2 |
|
| 3 |
-
This document
|
| 4 |
|
| 5 |
## Engine and Initialization
|
| 6 |
- ORM: SQLAlchemy 2.x
|
| 7 |
- Base class: `data.db.Base`
|
| 8 |
-
- Default
|
| 9 |
-
-
|
| 10 |
-
|
| 11 |
-
##
|
| 12 |
-
- `users` 1:N `
|
| 13 |
-
- `
|
| 14 |
-
- `
|
| 15 |
-
- `
|
| 16 |
-
- `
|
|
|
|
|
|
|
| 17 |
|
| 18 |
## Tables
|
| 19 |
|
| 20 |
### `users`
|
| 21 |
-
Stores app users.
|
| 22 |
-
|
| 23 |
-
Columns:
|
| 24 |
-
- `id` INTEGER, PK
|
| 25 |
-
- `email` VARCHAR(255), nullable, UNIQUE, indexed
|
| 26 |
-
- `display_name` VARCHAR(255), nullable
|
| 27 |
-
- `avatar_url` VARCHAR(1024), nullable
|
| 28 |
-
- `is_active` BOOLEAN, NOT NULL, default `true`
|
| 29 |
-
- `created_at` DATETIME(timezone=True), NOT NULL, default `now()`
|
| 30 |
-
- `updated_at` DATETIME(timezone=True), NOT NULL, default `now()`, auto-updated on row update
|
| 31 |
-
|
| 32 |
-
Relationships:
|
| 33 |
-
- One-to-many with `oauth_accounts`
|
| 34 |
-
- One-to-many with `documents`
|
| 35 |
-
- One-to-many with `conversations`
|
| 36 |
-
|
| 37 |
-
Indexes and constraints:
|
| 38 |
-
- PK: `id`
|
| 39 |
-
- UNIQUE: `email`
|
| 40 |
-
- INDEX: `email` (implicit from `index=True`)
|
| 41 |
-
|
| 42 |
-
---
|
| 43 |
-
|
| 44 |
-
### `oauth_accounts`
|
| 45 |
-
OAuth provider identities linked to users (supports Hugging Face via `provider='huggingface'`).
|
| 46 |
-
|
| 47 |
Columns:
|
| 48 |
-
- `id` INTEGER
|
| 49 |
-
- `
|
| 50 |
-
- `
|
| 51 |
-
- `
|
| 52 |
-
- `
|
| 53 |
-
- `access_token` TEXT, nullable
|
| 54 |
-
- `refresh_token` TEXT, nullable
|
| 55 |
-
- `token_type` VARCHAR(50), nullable
|
| 56 |
-
- `scope` TEXT, nullable
|
| 57 |
-
- `expires_at` DATETIME(timezone=True), nullable
|
| 58 |
-
- `created_at` DATETIME(timezone=True), NOT NULL, default `now()`
|
| 59 |
-
- `updated_at` DATETIME(timezone=True), NOT NULL, default `now()`, auto-updated on row update
|
| 60 |
-
|
| 61 |
-
Relationships:
|
| 62 |
-
- Many-to-one with `users`
|
| 63 |
-
|
| 64 |
-
Indexes and constraints:
|
| 65 |
-
- PK: `id`
|
| 66 |
-
- UNIQUE: (`provider`, `provider_user_id`) as `uq_provider_user`
|
| 67 |
-
- INDEX: (`user_id`, `provider`) as `ix_oauth_user_provider`
|
| 68 |
-
- INDEX: `provider` (implicit)
|
| 69 |
-
- INDEX: `provider_user_id` (implicit)
|
| 70 |
-
|
| 71 |
-
---
|
| 72 |
-
|
| 73 |
-
### `documents`
|
| 74 |
-
Uploaded/ingested source documents owned by users.
|
| 75 |
|
|
|
|
| 76 |
Columns:
|
| 77 |
-
- `id` INTEGER
|
| 78 |
-
- `
|
| 79 |
-
- `title` VARCHAR(255)
|
| 80 |
-
- `
|
| 81 |
-
- `
|
| 82 |
-
- `storage_path` VARCHAR(1024), nullable
|
| 83 |
-
- `summary` TEXT, nullable
|
| 84 |
-
- `created_at` DATETIME(timezone=True), NOT NULL, default `now()`
|
| 85 |
-
- `updated_at` DATETIME(timezone=True), NOT NULL, default `now()`, auto-updated on row update
|
| 86 |
-
|
| 87 |
-
Relationships:
|
| 88 |
-
- Many-to-one with `users`
|
| 89 |
-
- One-to-many with `chunks`
|
| 90 |
-
|
| 91 |
-
Indexes and constraints:
|
| 92 |
-
- PK: `id`
|
| 93 |
-
- INDEX: (`user_id`, `created_at`) as `ix_documents_user_created`
|
| 94 |
-
|
| 95 |
-
---
|
| 96 |
-
|
| 97 |
-
### `chunks`
|
| 98 |
-
Document chunks for retrieval and embedding linkage.
|
| 99 |
|
|
|
|
| 100 |
Columns:
|
| 101 |
-
- `id` INTEGER
|
| 102 |
-
- `
|
| 103 |
-
- `
|
| 104 |
-
- `
|
| 105 |
-
- `
|
| 106 |
-
- `
|
| 107 |
-
- `
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
Indexes and constraints:
|
| 113 |
-
- PK: `id`
|
| 114 |
-
- UNIQUE: (`document_id`, `chunk_index`) as `uq_document_chunk_index`
|
| 115 |
-
- INDEX: (`document_id`, `chunk_index`) as `ix_chunks_document_index`
|
| 116 |
-
- INDEX: `embedding_id` (implicit)
|
| 117 |
-
|
| 118 |
-
---
|
| 119 |
-
|
| 120 |
-
### `conversations`
|
| 121 |
-
User chat sessions.
|
| 122 |
-
|
| 123 |
Columns:
|
| 124 |
-
- `id` INTEGER
|
| 125 |
-
- `
|
| 126 |
-
- `title` VARCHAR(255)
|
| 127 |
-
- `created_at` DATETIME(timezone=True)
|
| 128 |
-
- `updated_at` DATETIME(timezone=True), NOT NULL, default `now()`, auto-updated on row update
|
| 129 |
-
|
| 130 |
-
Relationships:
|
| 131 |
-
- Many-to-one with `users`
|
| 132 |
-
- One-to-many with `messages`
|
| 133 |
-
|
| 134 |
-
Indexes and constraints:
|
| 135 |
-
- PK: `id`
|
| 136 |
-
- INDEX: (`user_id`, `created_at`) as `ix_conversations_user_created`
|
| 137 |
-
|
| 138 |
-
---
|
| 139 |
|
| 140 |
### `messages`
|
| 141 |
-
Conversation messages, including optional citation payload.
|
| 142 |
-
|
| 143 |
Columns:
|
| 144 |
-
- `id` INTEGER
|
| 145 |
-
- `
|
| 146 |
-
- `role` VARCHAR(20)
|
| 147 |
-
- `content` TEXT
|
| 148 |
-
- `
|
| 149 |
-
- `created_at` DATETIME(timezone=True), NOT NULL, default `now()`
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
-
|
| 156 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
## Notes
|
| 159 |
-
-
|
| 160 |
-
-
|
| 161 |
-
-
|
|
|
|
| 1 |
# Database Schema
|
| 2 |
|
| 3 |
+
This document reflects the active SQLAlchemy models in `data/models.py`.
|
| 4 |
|
| 5 |
## Engine and Initialization
|
| 6 |
- ORM: SQLAlchemy 2.x
|
| 7 |
- Base class: `data.db.Base`
|
| 8 |
+
- Default DB: `sqlite:///./notebooklm.db`
|
| 9 |
+
- Initialization: `data.db.init_db()`
|
| 10 |
+
|
| 11 |
+
## Relationship Overview
|
| 12 |
+
- `users` 1:N `notebooks`
|
| 13 |
+
- `notebooks` 1:N `sources`
|
| 14 |
+
- `notebooks` 1:N `chat_threads`
|
| 15 |
+
- `chat_threads` 1:N `messages`
|
| 16 |
+
- `messages` 1:N `message_citations`
|
| 17 |
+
- `sources` 1:N `message_citations`
|
| 18 |
+
- `notebooks` 1:N `artifacts`
|
| 19 |
|
| 20 |
## Tables
|
| 21 |
|
| 22 |
### `users`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
Columns:
|
| 24 |
+
- `id` INTEGER PK
|
| 25 |
+
- `email` VARCHAR(255) NOT NULL UNIQUE INDEX
|
| 26 |
+
- `display_name` VARCHAR(255) NULL
|
| 27 |
+
- `avatar_url` VARCHAR(1024) NULL
|
| 28 |
+
- `created_at` DATETIME(timezone=True) NOT NULL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
### `notebooks`
|
| 31 |
Columns:
|
| 32 |
+
- `id` INTEGER PK
|
| 33 |
+
- `owner_user_id` INTEGER NOT NULL FK -> `users.id` ON DELETE CASCADE INDEX
|
| 34 |
+
- `title` VARCHAR(255) NOT NULL
|
| 35 |
+
- `created_at` DATETIME(timezone=True) NOT NULL
|
| 36 |
+
- `updated_at` DATETIME(timezone=True) NOT NULL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
+
### `sources`
|
| 39 |
Columns:
|
| 40 |
+
- `id` INTEGER PK
|
| 41 |
+
- `notebook_id` INTEGER NOT NULL FK -> `notebooks.id` ON DELETE CASCADE INDEX
|
| 42 |
+
- `type` VARCHAR(50) NOT NULL
|
| 43 |
+
- `title` VARCHAR(255) NULL
|
| 44 |
+
- `original_name` VARCHAR(1024) NULL
|
| 45 |
+
- `url` VARCHAR(2048) NULL
|
| 46 |
+
- `storage_path` VARCHAR(1024) NULL
|
| 47 |
+
- `status` VARCHAR(50) NOT NULL
|
| 48 |
+
- `ingested_at` DATETIME(timezone=True) NULL
|
| 49 |
+
|
| 50 |
+
### `chat_threads`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
Columns:
|
| 52 |
+
- `id` INTEGER PK
|
| 53 |
+
- `notebook_id` INTEGER NOT NULL FK -> `notebooks.id` ON DELETE CASCADE INDEX
|
| 54 |
+
- `title` VARCHAR(255) NULL
|
| 55 |
+
- `created_at` DATETIME(timezone=True) NOT NULL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
### `messages`
|
|
|
|
|
|
|
| 58 |
Columns:
|
| 59 |
+
- `id` INTEGER PK
|
| 60 |
+
- `thread_id` INTEGER NOT NULL FK -> `chat_threads.id` ON DELETE CASCADE INDEX
|
| 61 |
+
- `role` VARCHAR(20) NOT NULL
|
| 62 |
+
- `content` TEXT NOT NULL
|
| 63 |
+
- `created_at` DATETIME(timezone=True) NOT NULL
|
|
|
|
| 64 |
|
| 65 |
+
### `message_citations`
|
| 66 |
+
Columns:
|
| 67 |
+
- `id` INTEGER PK
|
| 68 |
+
- `message_id` INTEGER NOT NULL FK -> `messages.id` ON DELETE CASCADE INDEX
|
| 69 |
+
- `source_id` INTEGER NOT NULL FK -> `sources.id` ON DELETE CASCADE INDEX
|
| 70 |
+
- `chunk_ref` VARCHAR(255) NULL
|
| 71 |
+
- `quote` TEXT NULL
|
| 72 |
+
- `score` FLOAT NULL
|
| 73 |
+
|
| 74 |
+
### `artifacts`
|
| 75 |
+
Columns:
|
| 76 |
+
- `id` INTEGER PK
|
| 77 |
+
- `notebook_id` INTEGER NOT NULL FK -> `notebooks.id` ON DELETE CASCADE INDEX
|
| 78 |
+
- `type` VARCHAR(50) NOT NULL
|
| 79 |
+
- `title` VARCHAR(255) NULL
|
| 80 |
+
- `status` VARCHAR(50) NOT NULL
|
| 81 |
+
- `file_path` VARCHAR(1024) NULL
|
| 82 |
+
- `metadata` JSON NULL (mapped as `artifact_metadata`)
|
| 83 |
+
- `content` TEXT NULL
|
| 84 |
+
- `error_message` TEXT NULL
|
| 85 |
+
- `created_at` DATETIME(timezone=True) NOT NULL
|
| 86 |
+
- `generated_at` DATETIME(timezone=True) NULL
|
| 87 |
|
| 88 |
## Notes
|
| 89 |
+
- Ownership and isolation are anchored by `notebooks.owner_user_id`.
|
| 90 |
+
- Child records are deleted via `ON DELETE CASCADE`.
|
| 91 |
+
- Schema creation is currently handled with `Base.metadata.create_all(...)` (no Alembic yet).
|
ER_DIAGRAM.md
CHANGED
|
@@ -2,79 +2,82 @@
|
|
| 2 |
|
| 3 |
```mermaid
|
| 4 |
erDiagram
|
| 5 |
-
users ||--o{
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
|
| 11 |
users {
|
| 12 |
int id PK
|
| 13 |
string email UK
|
| 14 |
string display_name
|
| 15 |
string avatar_url
|
| 16 |
-
boolean is_active
|
| 17 |
datetime created_at
|
| 18 |
-
datetime updated_at
|
| 19 |
}
|
| 20 |
|
| 21 |
-
|
| 22 |
int id PK
|
| 23 |
-
int
|
| 24 |
-
string
|
| 25 |
-
string provider_user_id
|
| 26 |
-
string username
|
| 27 |
-
text access_token
|
| 28 |
-
text refresh_token
|
| 29 |
-
string token_type
|
| 30 |
-
text scope
|
| 31 |
-
datetime expires_at
|
| 32 |
datetime created_at
|
| 33 |
datetime updated_at
|
| 34 |
}
|
| 35 |
|
| 36 |
-
|
| 37 |
int id PK
|
| 38 |
-
int
|
|
|
|
| 39 |
string title
|
| 40 |
-
string
|
| 41 |
-
string
|
| 42 |
string storage_path
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
datetime created_at
|
| 45 |
-
datetime updated_at
|
| 46 |
}
|
| 47 |
|
| 48 |
-
|
| 49 |
int id PK
|
| 50 |
-
int
|
| 51 |
-
|
| 52 |
text content
|
| 53 |
-
int token_count
|
| 54 |
-
string embedding_id
|
| 55 |
datetime created_at
|
| 56 |
}
|
| 57 |
|
| 58 |
-
|
| 59 |
int id PK
|
| 60 |
-
int
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
| 64 |
}
|
| 65 |
|
| 66 |
-
|
| 67 |
int id PK
|
| 68 |
-
int
|
| 69 |
-
string
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
text content
|
| 71 |
-
|
| 72 |
datetime created_at
|
|
|
|
| 73 |
}
|
| 74 |
```
|
| 75 |
|
| 76 |
## Notes
|
| 77 |
-
-
|
| 78 |
-
-
|
| 79 |
-
|
| 80 |
-
- `uq_document_chunk_index` on (`document_id`, `chunk_index`)
|
|
|
|
| 2 |
|
| 3 |
```mermaid
|
| 4 |
erDiagram
|
| 5 |
+
users ||--o{ notebooks : owns
|
| 6 |
+
notebooks ||--o{ sources : contains
|
| 7 |
+
notebooks ||--o{ chat_threads : has
|
| 8 |
+
chat_threads ||--o{ messages : contains
|
| 9 |
+
messages ||--o{ message_citations : has
|
| 10 |
+
sources ||--o{ message_citations : cited_by
|
| 11 |
+
notebooks ||--o{ artifacts : generates
|
| 12 |
|
| 13 |
users {
|
| 14 |
int id PK
|
| 15 |
string email UK
|
| 16 |
string display_name
|
| 17 |
string avatar_url
|
|
|
|
| 18 |
datetime created_at
|
|
|
|
| 19 |
}
|
| 20 |
|
| 21 |
+
notebooks {
|
| 22 |
int id PK
|
| 23 |
+
int owner_user_id FK
|
| 24 |
+
string title
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
datetime created_at
|
| 26 |
datetime updated_at
|
| 27 |
}
|
| 28 |
|
| 29 |
+
sources {
|
| 30 |
int id PK
|
| 31 |
+
int notebook_id FK
|
| 32 |
+
string type
|
| 33 |
string title
|
| 34 |
+
string original_name
|
| 35 |
+
string url
|
| 36 |
string storage_path
|
| 37 |
+
string status
|
| 38 |
+
datetime ingested_at
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
chat_threads {
|
| 42 |
+
int id PK
|
| 43 |
+
int notebook_id FK
|
| 44 |
+
string title
|
| 45 |
datetime created_at
|
|
|
|
| 46 |
}
|
| 47 |
|
| 48 |
+
messages {
|
| 49 |
int id PK
|
| 50 |
+
int thread_id FK
|
| 51 |
+
string role
|
| 52 |
text content
|
|
|
|
|
|
|
| 53 |
datetime created_at
|
| 54 |
}
|
| 55 |
|
| 56 |
+
message_citations {
|
| 57 |
int id PK
|
| 58 |
+
int message_id FK
|
| 59 |
+
int source_id FK
|
| 60 |
+
string chunk_ref
|
| 61 |
+
text quote
|
| 62 |
+
float score
|
| 63 |
}
|
| 64 |
|
| 65 |
+
artifacts {
|
| 66 |
int id PK
|
| 67 |
+
int notebook_id FK
|
| 68 |
+
string type
|
| 69 |
+
string title
|
| 70 |
+
string status
|
| 71 |
+
string file_path
|
| 72 |
+
json metadata
|
| 73 |
text content
|
| 74 |
+
text error_message
|
| 75 |
datetime created_at
|
| 76 |
+
datetime generated_at
|
| 77 |
}
|
| 78 |
```
|
| 79 |
|
| 80 |
## Notes
|
| 81 |
+
- User isolation is enforced through ownership on `notebooks.owner_user_id`.
|
| 82 |
+
- Thread, source, citation, and artifact records are notebook-scoped.
|
| 83 |
+
- Artifact metadata is stored in JSON (`artifacts.metadata`).
|
|
|
app.py
CHANGED
|
@@ -2,9 +2,12 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
from contextlib import asynccontextmanager
|
| 4 |
import os
|
|
|
|
|
|
|
| 5 |
from datetime import datetime, timezone
|
| 6 |
from pathlib import Path
|
| 7 |
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
|
|
|
|
| 8 |
|
| 9 |
from fastapi.concurrency import run_in_threadpool
|
| 10 |
from fastapi import APIRouter, BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, Request, UploadFile, status
|
|
@@ -101,14 +104,6 @@ class ThreadResponse(BaseModel):
|
|
| 101 |
created_at: datetime
|
| 102 |
|
| 103 |
|
| 104 |
-
class MessageResponse(BaseModel):
|
| 105 |
-
id: int
|
| 106 |
-
thread_id: int
|
| 107 |
-
role: str
|
| 108 |
-
content: str
|
| 109 |
-
created_at: datetime
|
| 110 |
-
|
| 111 |
-
|
| 112 |
class CitationResponse(BaseModel):
|
| 113 |
source_title: str | None = None
|
| 114 |
source_id: int
|
|
@@ -117,6 +112,15 @@ class CitationResponse(BaseModel):
|
|
| 117 |
score: float | None = None
|
| 118 |
|
| 119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
class ChatRequest(BaseModel):
|
| 121 |
question: str = Field(min_length=1)
|
| 122 |
top_k: int = Field(default=5, ge=1, le=12)
|
|
@@ -191,6 +195,9 @@ class ArtifactResponse(BaseModel):
|
|
| 191 |
|
| 192 |
MAX_HISTORY_MESSAGES = 8
|
| 193 |
MAX_HISTORY_CHARS_PER_MESSAGE = 1000
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
|
| 196 |
def _build_conversation_history(
|
|
@@ -246,6 +253,53 @@ def _append_query_param(url: str, key: str, value: str) -> str:
|
|
| 246 |
return urlunsplit((split.scheme, split.netloc, split.path, updated_query, split.fragment))
|
| 247 |
|
| 248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
@app.get("/health", tags=["system"])
|
| 250 |
def health_check() -> dict[str, str]:
|
| 251 |
return {"status": "ok"}
|
|
@@ -461,6 +515,11 @@ def delete_notebook(
|
|
| 461 |
if notebook is None:
|
| 462 |
raise HTTPException(status_code=404, detail="Notebook not found for this user.")
|
| 463 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
crud.delete_notebook(db=db, notebook=notebook)
|
| 465 |
return NotebookDeleteResponse(status="deleted", notebook_id=notebook_id)
|
| 466 |
|
|
@@ -544,18 +603,18 @@ async def upload_source_for_notebook(
|
|
| 544 |
if notebook is None:
|
| 545 |
raise HTTPException(status_code=404, detail="Notebook not found for this user.")
|
| 546 |
|
| 547 |
-
|
| 548 |
-
upload_dir.mkdir(parents=True, exist_ok=True)
|
| 549 |
-
destination = upload_dir / file.filename
|
| 550 |
content = await file.read()
|
| 551 |
destination.write_bytes(content)
|
|
|
|
|
|
|
| 552 |
|
| 553 |
source = crud.create_source(
|
| 554 |
db=db,
|
| 555 |
notebook_id=notebook_id,
|
| 556 |
source_type="file",
|
| 557 |
-
title=
|
| 558 |
-
original_name=
|
| 559 |
url=None,
|
| 560 |
storage_path=str(destination),
|
| 561 |
status=status,
|
|
@@ -680,9 +739,25 @@ def list_messages_for_thread(
|
|
| 680 |
raise HTTPException(status_code=404, detail="Thread not found for this notebook.")
|
| 681 |
|
| 682 |
messages = crud.list_messages_for_thread(db=db, thread_id=thread_id)
|
|
|
|
| 683 |
return [
|
| 684 |
MessageResponse(
|
| 685 |
-
id=m.id,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
)
|
| 687 |
for m in messages
|
| 688 |
]
|
|
@@ -783,6 +858,7 @@ def chat_on_thread(
|
|
| 783 |
role=user_message.role,
|
| 784 |
content=user_message.content,
|
| 785 |
created_at=user_message.created_at,
|
|
|
|
| 786 |
),
|
| 787 |
assistant_message=MessageResponse(
|
| 788 |
id=assistant_message.id,
|
|
@@ -790,6 +866,7 @@ def chat_on_thread(
|
|
| 790 |
role=assistant_message.role,
|
| 791 |
content=assistant_message.content,
|
| 792 |
created_at=assistant_message.created_at,
|
|
|
|
| 793 |
),
|
| 794 |
citations=citations,
|
| 795 |
)
|
|
|
|
| 2 |
|
| 3 |
from contextlib import asynccontextmanager
|
| 4 |
import os
|
| 5 |
+
import re
|
| 6 |
+
import shutil
|
| 7 |
from datetime import datetime, timezone
|
| 8 |
from pathlib import Path
|
| 9 |
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
|
| 10 |
+
from uuid import uuid4
|
| 11 |
|
| 12 |
from fastapi.concurrency import run_in_threadpool
|
| 13 |
from fastapi import APIRouter, BackgroundTasks, Depends, FastAPI, File, Form, HTTPException, Request, UploadFile, status
|
|
|
|
| 104 |
created_at: datetime
|
| 105 |
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
class CitationResponse(BaseModel):
|
| 108 |
source_title: str | None = None
|
| 109 |
source_id: int
|
|
|
|
| 112 |
score: float | None = None
|
| 113 |
|
| 114 |
|
| 115 |
+
class MessageResponse(BaseModel):
|
| 116 |
+
id: int
|
| 117 |
+
thread_id: int
|
| 118 |
+
role: str
|
| 119 |
+
content: str
|
| 120 |
+
created_at: datetime
|
| 121 |
+
citations: list[CitationResponse] = Field(default_factory=list)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
class ChatRequest(BaseModel):
|
| 125 |
question: str = Field(min_length=1)
|
| 126 |
top_k: int = Field(default=5, ge=1, le=12)
|
|
|
|
| 195 |
|
| 196 |
MAX_HISTORY_MESSAGES = 8
|
| 197 |
MAX_HISTORY_CHARS_PER_MESSAGE = 1000
|
| 198 |
+
MAX_UPLOAD_FILENAME_LENGTH = 255
|
| 199 |
+
SAFE_FILENAME_RE = re.compile(r"[^A-Za-z0-9._-]+")
|
| 200 |
+
UPLOADS_ROOT = Path("uploads")
|
| 201 |
|
| 202 |
|
| 203 |
def _build_conversation_history(
|
|
|
|
| 253 |
return urlunsplit((split.scheme, split.netloc, split.path, updated_query, split.fragment))
|
| 254 |
|
| 255 |
|
| 256 |
+
def _sanitize_upload_filename(filename: str | None) -> str:
|
| 257 |
+
raw_name = Path(str(filename or "")).name.replace("\x00", "").strip()
|
| 258 |
+
sanitized = SAFE_FILENAME_RE.sub("_", raw_name).strip("._-")
|
| 259 |
+
if not sanitized:
|
| 260 |
+
sanitized = f"upload_{uuid4().hex[:10]}.bin"
|
| 261 |
+
if len(sanitized) > MAX_UPLOAD_FILENAME_LENGTH:
|
| 262 |
+
ext = Path(sanitized).suffix[:20]
|
| 263 |
+
stem_limit = max(1, MAX_UPLOAD_FILENAME_LENGTH - len(ext))
|
| 264 |
+
sanitized = f"{Path(sanitized).stem[:stem_limit]}{ext}"
|
| 265 |
+
return sanitized
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def _resolve_notebook_upload_path(notebook_id: int, filename: str | None) -> Path:
|
| 269 |
+
upload_dir = UPLOADS_ROOT / f"notebook_{notebook_id}"
|
| 270 |
+
upload_dir.mkdir(parents=True, exist_ok=True)
|
| 271 |
+
upload_dir_resolved = upload_dir.resolve()
|
| 272 |
+
|
| 273 |
+
safe_name = _sanitize_upload_filename(filename)
|
| 274 |
+
destination = (upload_dir_resolved / safe_name).resolve()
|
| 275 |
+
if destination.parent != upload_dir_resolved:
|
| 276 |
+
raise HTTPException(status_code=400, detail="Invalid upload filename.")
|
| 277 |
+
|
| 278 |
+
if destination.exists():
|
| 279 |
+
destination = (upload_dir_resolved / f"{destination.stem}_{uuid4().hex[:8]}{destination.suffix}").resolve()
|
| 280 |
+
return destination
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def _remove_tree_within_root(root: Path, target: Path) -> None:
|
| 284 |
+
if not target.exists():
|
| 285 |
+
return
|
| 286 |
+
root_resolved = root.resolve()
|
| 287 |
+
target_resolved = target.resolve()
|
| 288 |
+
if target_resolved == root_resolved or root_resolved not in target_resolved.parents:
|
| 289 |
+
raise RuntimeError(f"Refusing to delete path outside root: {target_resolved}")
|
| 290 |
+
shutil.rmtree(target_resolved)
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
def _cleanup_notebook_storage(owner_user_id: int, notebook_id: int) -> None:
|
| 294 |
+
storage_base = Path(os.getenv("STORAGE_BASE_DIR", "data"))
|
| 295 |
+
notebook_root = storage_base / "users" / str(owner_user_id) / "notebooks"
|
| 296 |
+
notebook_path = notebook_root / str(notebook_id)
|
| 297 |
+
_remove_tree_within_root(notebook_root, notebook_path)
|
| 298 |
+
|
| 299 |
+
upload_path = UPLOADS_ROOT / f"notebook_{notebook_id}"
|
| 300 |
+
_remove_tree_within_root(UPLOADS_ROOT, upload_path)
|
| 301 |
+
|
| 302 |
+
|
| 303 |
@app.get("/health", tags=["system"])
|
| 304 |
def health_check() -> dict[str, str]:
|
| 305 |
return {"status": "ok"}
|
|
|
|
| 515 |
if notebook is None:
|
| 516 |
raise HTTPException(status_code=404, detail="Notebook not found for this user.")
|
| 517 |
|
| 518 |
+
try:
|
| 519 |
+
_cleanup_notebook_storage(owner_user_id=current_user.id, notebook_id=notebook_id)
|
| 520 |
+
except Exception as exc:
|
| 521 |
+
raise HTTPException(status_code=500, detail=f"Failed to delete notebook storage: {exc}") from exc
|
| 522 |
+
|
| 523 |
crud.delete_notebook(db=db, notebook=notebook)
|
| 524 |
return NotebookDeleteResponse(status="deleted", notebook_id=notebook_id)
|
| 525 |
|
|
|
|
| 603 |
if notebook is None:
|
| 604 |
raise HTTPException(status_code=404, detail="Notebook not found for this user.")
|
| 605 |
|
| 606 |
+
destination = _resolve_notebook_upload_path(notebook_id=notebook_id, filename=file.filename)
|
|
|
|
|
|
|
| 607 |
content = await file.read()
|
| 608 |
destination.write_bytes(content)
|
| 609 |
+
original_name = Path(str(file.filename or destination.name)).name
|
| 610 |
+
source_title = title or original_name or destination.name
|
| 611 |
|
| 612 |
source = crud.create_source(
|
| 613 |
db=db,
|
| 614 |
notebook_id=notebook_id,
|
| 615 |
source_type="file",
|
| 616 |
+
title=source_title,
|
| 617 |
+
original_name=original_name,
|
| 618 |
url=None,
|
| 619 |
storage_path=str(destination),
|
| 620 |
status=status,
|
|
|
|
| 739 |
raise HTTPException(status_code=404, detail="Thread not found for this notebook.")
|
| 740 |
|
| 741 |
messages = crud.list_messages_for_thread(db=db, thread_id=thread_id)
|
| 742 |
+
citations_by_message = crud.list_message_citations_for_thread(db=db, thread_id=thread_id)
|
| 743 |
return [
|
| 744 |
MessageResponse(
|
| 745 |
+
id=m.id,
|
| 746 |
+
thread_id=m.thread_id,
|
| 747 |
+
role=m.role,
|
| 748 |
+
content=m.content,
|
| 749 |
+
created_at=m.created_at,
|
| 750 |
+
citations=[
|
| 751 |
+
CitationResponse(
|
| 752 |
+
source_title=entry.get("source_title"),
|
| 753 |
+
source_id=int(entry.get("source_id", 0)),
|
| 754 |
+
chunk_ref=(str(entry.get("chunk_ref")) if entry.get("chunk_ref") else None),
|
| 755 |
+
quote=(str(entry.get("quote")) if entry.get("quote") else None),
|
| 756 |
+
score=(float(entry["score"]) if entry.get("score") is not None else None),
|
| 757 |
+
)
|
| 758 |
+
for entry in citations_by_message.get(m.id, [])
|
| 759 |
+
if int(entry.get("source_id", 0)) > 0
|
| 760 |
+
],
|
| 761 |
)
|
| 762 |
for m in messages
|
| 763 |
]
|
|
|
|
| 858 |
role=user_message.role,
|
| 859 |
content=user_message.content,
|
| 860 |
created_at=user_message.created_at,
|
| 861 |
+
citations=[],
|
| 862 |
),
|
| 863 |
assistant_message=MessageResponse(
|
| 864 |
id=assistant_message.id,
|
|
|
|
| 866 |
role=assistant_message.role,
|
| 867 |
content=assistant_message.content,
|
| 868 |
created_at=assistant_message.created_at,
|
| 869 |
+
citations=citations,
|
| 870 |
),
|
| 871 |
citations=citations,
|
| 872 |
)
|
auth/session.py
CHANGED
|
@@ -15,6 +15,7 @@ from data.db import get_db
|
|
| 15 |
AUTH_MODE_DEV = "dev"
|
| 16 |
AUTH_MODE_HF = "hf_oauth"
|
| 17 |
AUTH_BRIDGE_SALT = "streamlit-auth-bridge"
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
@dataclass(frozen=True)
|
|
@@ -36,18 +37,31 @@ def get_auth_mode() -> str:
|
|
| 36 |
|
| 37 |
def configure_session_middleware(app) -> None:
|
| 38 |
"""Attach Starlette session middleware once during app setup."""
|
| 39 |
-
secret = os.getenv("APP_SESSION_SECRET",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
app.add_middleware(
|
| 41 |
SessionMiddleware,
|
| 42 |
secret_key=secret,
|
| 43 |
-
same_site=
|
| 44 |
-
https_only=
|
| 45 |
max_age=60 * 60 * 24 * 7, # 7 days
|
| 46 |
)
|
| 47 |
|
| 48 |
|
| 49 |
def _bridge_serializer() -> URLSafeTimedSerializer:
|
| 50 |
-
secret = os.getenv("APP_SESSION_SECRET",
|
| 51 |
return URLSafeTimedSerializer(secret_key=secret, salt=AUTH_BRIDGE_SALT)
|
| 52 |
|
| 53 |
|
|
|
|
| 15 |
AUTH_MODE_DEV = "dev"
|
| 16 |
AUTH_MODE_HF = "hf_oauth"
|
| 17 |
AUTH_BRIDGE_SALT = "streamlit-auth-bridge"
|
| 18 |
+
DEFAULT_DEV_SESSION_SECRET = "dev-only-session-secret-change-me"
|
| 19 |
|
| 20 |
|
| 21 |
@dataclass(frozen=True)
|
|
|
|
| 37 |
|
| 38 |
def configure_session_middleware(app) -> None:
|
| 39 |
"""Attach Starlette session middleware once during app setup."""
|
| 40 |
+
secret = os.getenv("APP_SESSION_SECRET", DEFAULT_DEV_SESSION_SECRET).strip()
|
| 41 |
+
auth_mode = get_auth_mode()
|
| 42 |
+
if auth_mode == AUTH_MODE_HF and (not secret or secret == DEFAULT_DEV_SESSION_SECRET):
|
| 43 |
+
raise RuntimeError("APP_SESSION_SECRET must be set to a non-default value in hf_oauth mode.")
|
| 44 |
+
same_site = os.getenv("SESSION_COOKIE_SAMESITE", "lax").strip().lower()
|
| 45 |
+
if same_site not in {"lax", "strict", "none"}:
|
| 46 |
+
same_site = "lax"
|
| 47 |
+
secure_default = "1" if auth_mode == AUTH_MODE_HF else "0"
|
| 48 |
+
https_only = os.getenv("SESSION_COOKIE_SECURE", secure_default).strip().lower() in {
|
| 49 |
+
"1",
|
| 50 |
+
"true",
|
| 51 |
+
"yes",
|
| 52 |
+
"on",
|
| 53 |
+
}
|
| 54 |
app.add_middleware(
|
| 55 |
SessionMiddleware,
|
| 56 |
secret_key=secret,
|
| 57 |
+
same_site=same_site,
|
| 58 |
+
https_only=https_only,
|
| 59 |
max_age=60 * 60 * 24 * 7, # 7 days
|
| 60 |
)
|
| 61 |
|
| 62 |
|
| 63 |
def _bridge_serializer() -> URLSafeTimedSerializer:
|
| 64 |
+
secret = os.getenv("APP_SESSION_SECRET", DEFAULT_DEV_SESSION_SECRET)
|
| 65 |
return URLSafeTimedSerializer(secret_key=secret, salt=AUTH_BRIDGE_SALT)
|
| 66 |
|
| 67 |
|
data/crud.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
|
|
|
| 3 |
from datetime import datetime
|
| 4 |
from data.models import Artifact
|
| 5 |
from sqlalchemy.orm import Session
|
|
@@ -212,6 +213,33 @@ def create_message_citations(
|
|
| 212 |
db.refresh(row)
|
| 213 |
return rows
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
def get_artifact(db: Session, artifact_id: int) -> Artifact | None:
|
| 216 |
return db.get(Artifact, artifact_id)
|
| 217 |
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
+
from collections import defaultdict
|
| 4 |
from datetime import datetime
|
| 5 |
from data.models import Artifact
|
| 6 |
from sqlalchemy.orm import Session
|
|
|
|
| 213 |
db.refresh(row)
|
| 214 |
return rows
|
| 215 |
|
| 216 |
+
|
| 217 |
+
def list_message_citations_for_thread(
|
| 218 |
+
db: Session, thread_id: int
|
| 219 |
+
) -> dict[int, list[dict[str, int | str | float | None]]]:
|
| 220 |
+
rows = (
|
| 221 |
+
db.query(MessageCitation, Source.title)
|
| 222 |
+
.join(Source, Source.id == MessageCitation.source_id)
|
| 223 |
+
.join(Message, Message.id == MessageCitation.message_id)
|
| 224 |
+
.filter(Message.thread_id == thread_id)
|
| 225 |
+
.order_by(MessageCitation.id.asc())
|
| 226 |
+
.all()
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
citations_by_message: dict[int, list[dict[str, int | str | float | None]]] = defaultdict(list)
|
| 230 |
+
for citation, source_title in rows:
|
| 231 |
+
citations_by_message[int(citation.message_id)].append(
|
| 232 |
+
{
|
| 233 |
+
"source_id": int(citation.source_id),
|
| 234 |
+
"source_title": source_title,
|
| 235 |
+
"chunk_ref": citation.chunk_ref,
|
| 236 |
+
"quote": citation.quote,
|
| 237 |
+
"score": citation.score,
|
| 238 |
+
}
|
| 239 |
+
)
|
| 240 |
+
return dict(citations_by_message)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
def get_artifact(db: Session, artifact_id: int) -> Artifact | None:
|
| 244 |
return db.get(Artifact, artifact_id)
|
| 245 |
|
docs/DESIGN_BRIEF.md
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NotebookLM Clone Design Brief
|
| 2 |
+
|
| 3 |
+
## 1. System Overview
|
| 4 |
+
This system is a full-stack NotebookLM-style application that supports:
|
| 5 |
+
- source ingestion (`.pdf`, `.pptx`, `.txt`, web URL)
|
| 6 |
+
- retrieval-augmented chat with citations
|
| 7 |
+
- artifact generation (report, quiz, podcast transcript + audio)
|
| 8 |
+
- strict per-user data isolation with multiple notebooks per user
|
| 9 |
+
|
| 10 |
+
The stack is optimized for Hugging Face Spaces deployment:
|
| 11 |
+
- frontend: Streamlit (`frontend/app.py`)
|
| 12 |
+
- backend API: FastAPI (`app.py`)
|
| 13 |
+
- metadata store: SQLite via SQLAlchemy (`data/models.py`, `data/crud.py`)
|
| 14 |
+
- vector store: ChromaDB per user+notebook (`src/ingestion/vectorstore.py`)
|
| 15 |
+
- ingestion/artifact services: `src/ingestion/*`, `src/artifacts/*`
|
| 16 |
+
|
| 17 |
+
## 2. Architecture Diagram
|
| 18 |
+
```mermaid
|
| 19 |
+
flowchart TD
|
| 20 |
+
A[Streamlit Frontend] --> B[FastAPI Backend]
|
| 21 |
+
B --> C[Auth Layer<br/>HF OAuth / Dev Auth]
|
| 22 |
+
B --> D[Notebook & Source APIs]
|
| 23 |
+
B --> E[Thread & Chat APIs]
|
| 24 |
+
B --> F[Artifact APIs]
|
| 25 |
+
|
| 26 |
+
D --> G[Ingestion Service]
|
| 27 |
+
G --> H[Extractors<br/>PDF/PPTX/TXT/URL]
|
| 28 |
+
G --> I[Chunker]
|
| 29 |
+
G --> J[Embedding Adapter]
|
| 30 |
+
G --> K[ChromaDB]
|
| 31 |
+
|
| 32 |
+
E --> K
|
| 33 |
+
E --> L[LLM Client]
|
| 34 |
+
E --> M[Message + Citation Tables]
|
| 35 |
+
|
| 36 |
+
F --> L
|
| 37 |
+
F --> N[TTS Adapter<br/>Edge/OpenAI/ElevenLabs]
|
| 38 |
+
F --> O[Artifacts on Disk]
|
| 39 |
+
|
| 40 |
+
B --> P[(SQLite DB)]
|
| 41 |
+
    B --> Q["/data + uploads Storage"]
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
## 3. Component Responsibilities
|
| 45 |
+
- `frontend/app.py`
|
| 46 |
+
- authentication-aware UI
|
| 47 |
+
- notebook switching
|
| 48 |
+
- source upload/URL ingestion
|
| 49 |
+
- chat interface + citation display
|
| 50 |
+
- artifact generation, preview, and downloads
|
| 51 |
+
- `app.py`
|
| 52 |
+
- route orchestration and auth enforcement
|
| 53 |
+
- notebook/source/thread/artifact lifecycle endpoints
|
| 54 |
+
- chat orchestration with retrieval + prompting
|
| 55 |
+
- background podcast generation
|
| 56 |
+
- `auth/oauth.py`, `auth/session.py`
|
| 57 |
+
- HF OAuth code exchange
|
| 58 |
+
- secure session bridging to Streamlit
|
| 59 |
+
- current-user resolution
|
| 60 |
+
- `src/ingestion/*`
|
| 61 |
+
- extraction, chunking, embedding, vector upsert/query
|
| 62 |
+
- `src/artifacts/*`
|
| 63 |
+
- report/quiz/podcast generation and storage
|
| 64 |
+
- pluggable TTS providers (`edge`, `openai`, `elevenlabs`)
|
| 65 |
+
- `data/models.py`, `data/crud.py`
|
| 66 |
+
- relational schema and ownership-scoped queries
|
| 67 |
+
|
| 68 |
+
## 4. Data Model and Storage Strategy
|
| 69 |
+
Relational entities:
|
| 70 |
+
- `users`
|
| 71 |
+
- `notebooks` (`owner_user_id` foreign key)
|
| 72 |
+
- `sources` (per notebook)
|
| 73 |
+
- `chat_threads` and `messages`
|
| 74 |
+
- `message_citations` (assistant message -> source references)
|
| 75 |
+
- `artifacts` (status, metadata, content, file path)
|
| 76 |
+
|
| 77 |
+
Filesystem layout:
|
| 78 |
+
```text
|
| 79 |
+
<STORAGE_BASE_DIR>/users/<user_id>/notebooks/<notebook_id>/
|
| 80 |
+
files_raw/
|
| 81 |
+
files_extracted/
|
| 82 |
+
chroma/
|
| 83 |
+
artifacts/reports/
|
| 84 |
+
artifacts/quizzes/
|
| 85 |
+
artifacts/podcasts/
|
| 86 |
+
uploads/notebook_<notebook_id>/
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
Design rationale:
|
| 90 |
+
- SQLite keeps operational complexity low for MVP.
|
| 91 |
+
- Chroma per notebook enables practical RAG retrieval with low infra overhead.
|
| 92 |
+
- Disk layout mirrors ownership boundaries for simple cleanup and auditability.
|
| 93 |
+
|
| 94 |
+
## 5. End-to-End Flow
|
| 95 |
+
### 5.1 Ingestion
|
| 96 |
+
1. User uploads file or submits URL from Streamlit.
|
| 97 |
+
2. Backend verifies notebook ownership and validates URL safety (if URL).
|
| 98 |
+
3. Source record is created with `processing` status.
|
| 99 |
+
4. Ingestion service extracts text, chunks, embeds, and upserts into Chroma.
|
| 100 |
+
5. Source status transitions to `ready` or `failed`.
|
| 101 |
+
|
| 102 |
+
### 5.2 Retrieval + Chat
|
| 103 |
+
1. User sends a message in a notebook thread.
|
| 104 |
+
2. Backend checks notebook/thread ownership.
|
| 105 |
+
3. Query embedding is computed and top-k chunks are retrieved from notebook Chroma.
|
| 106 |
+
4. Prompt is assembled with conversation history and retrieved context.
|
| 107 |
+
5. LLM generates an answer.
|
| 108 |
+
6. Assistant message and structured citations are persisted.
|
| 109 |
+
7. UI shows answer and citations; citations remain available on subsequent reloads.
|
| 110 |
+
|
| 111 |
+
## 6. Security Plan
|
| 112 |
+
Authentication and identity:
|
| 113 |
+
- `AUTH_MODE=hf_oauth` for production deployments.
|
| 114 |
+
- Session-based current-user identity with signed bridge tokens.
|
| 115 |
+
|
| 116 |
+
User isolation:
|
| 117 |
+
- all notebook/thread/source/artifact endpoints verify ownership (`owner_user_id`)
|
| 118 |
+
- retrieval path binds queries to current user and notebook
|
| 119 |
+
|
| 120 |
+
Path/data protection:
|
| 121 |
+
- upload filenames are sanitized and constrained to notebook upload roots
|
| 122 |
+
- deletion is bounded to expected storage roots to prevent unsafe recursive deletes
|
| 123 |
+
- URL ingestion blocks local/private network targets (SSRF reduction)
|
| 124 |
+
|
| 125 |
+
Operational controls:
|
| 126 |
+
- environment-based secrets (`APP_SESSION_SECRET`, API keys)
|
| 127 |
+
- CI test gate before deploy
|
| 128 |
+
|
| 129 |
+
## 7. Milestone Plan
|
| 130 |
+
### MVP (Milestone 1)
|
| 131 |
+
- auth + sessions
|
| 132 |
+
- notebook CRUD + isolation checks
|
| 133 |
+
- ingestion for PDF/PPTX/TXT/URL
|
| 134 |
+
- notebook-scoped RAG chat with citations
|
| 135 |
+
|
| 136 |
+
### Milestone 2
|
| 137 |
+
- artifact generation endpoints (report/quiz/podcast)
|
| 138 |
+
- transcript/audio persistence and frontend playback/download
|
| 139 |
+
- improved chat UX and citation persistence in history
|
| 140 |
+
|
| 141 |
+
### Milestone 3 (Extensions)
|
| 142 |
+
- compare retrieval techniques (baseline semantic vs hybrid/rerank)
|
| 143 |
+
- latency/quality benchmarking and report
|
| 144 |
+
- stronger observability and error analytics
|
| 145 |
+
|
| 146 |
+
## 8. Key Risks and Mitigations
|
| 147 |
+
- LLM/API cost volatility
|
| 148 |
+
- mitigate with model selection defaults, request limits, caching opportunities
|
| 149 |
+
- HF `/data` ephemerality on free tier
|
| 150 |
+
- document tradeoff; optional HF dataset persistence extension
|
| 151 |
+
- retrieval quality drift across document types
|
| 152 |
+
- tune chunking and top-k; evaluate reranking/hybrid methods
|
| 153 |
+
- URL ingestion abuse
|
| 154 |
+
- strict scheme/host/IP/redirect/content-size checks
|
| 155 |
+
- dependency/runtime mismatch
|
| 156 |
+
- CI tests and pinned dependency strategy where practical
|
| 157 |
+
|
| 158 |
+
## 9. Specifications and References in Repo
|
| 159 |
+
- ingestion spec: `docs/INGESTION_SPEC.md`
|
| 160 |
+
- architecture spec: `docs/STREAMLIT_ARCHITECTURE_SPEC.md`
|
| 161 |
+
- integration notes: `INTEGRATION.md`
|
| 162 |
+
- schema docs: `ER_DIAGRAM.md`, `DATABASE_SCHEMA.md`
|
| 163 |
+
|
| 164 |
+
This brief is intended for export to PDF as the 2-4 page design deliverable.
|
frontend/app.py
CHANGED
|
@@ -525,8 +525,12 @@ elif page == "Notebooks":
|
|
| 525 |
for msg in message_result:
|
| 526 |
role = msg.get("role", "unknown")
|
| 527 |
content = msg.get("content", "")
|
|
|
|
| 528 |
if role == "assistant":
|
| 529 |
st.markdown(f"**Assistant:** {content}")
|
|
|
|
|
|
|
|
|
|
| 530 |
else:
|
| 531 |
st.markdown(f"**You:** {content}")
|
| 532 |
else:
|
|
|
|
| 525 |
for msg in message_result:
|
| 526 |
role = msg.get("role", "unknown")
|
| 527 |
content = msg.get("content", "")
|
| 528 |
+
citations = msg.get("citations", [])
|
| 529 |
if role == "assistant":
|
| 530 |
st.markdown(f"**Assistant:** {content}")
|
| 531 |
+
if isinstance(citations, list) and citations:
|
| 532 |
+
with st.expander("Citations", expanded=False):
|
| 533 |
+
st.dataframe(citations, use_container_width=True)
|
| 534 |
else:
|
| 535 |
st.markdown(f"**You:** {content}")
|
| 536 |
else:
|
requirements.txt
CHANGED
|
@@ -25,6 +25,7 @@ nltk
|
|
| 25 |
tqdm
|
| 26 |
pytest
|
| 27 |
edge-tts
|
|
|
|
| 28 |
pydub
|
| 29 |
ffmpeg-python
|
| 30 |
# NOTE: install ffmpeg system binary separately (e.g., `brew install ffmpeg`)
|
|
|
|
| 25 |
tqdm
|
| 26 |
pytest
|
| 27 |
edge-tts
|
| 28 |
+
elevenlabs>=1.0.0
|
| 29 |
pydub
|
| 30 |
ffmpeg-python
|
| 31 |
# NOTE: install ffmpeg system binary separately (e.g., `brew install ffmpeg`)
|
tests/test_chat_citations.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Integration tests for citation persistence in chat threads.
|
| 3 |
+
"""
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import pathlib
|
| 7 |
+
import sys
|
| 8 |
+
from unittest.mock import patch
|
| 9 |
+
|
| 10 |
+
import pytest
|
| 11 |
+
from fastapi.testclient import TestClient
|
| 12 |
+
from sqlalchemy import create_engine
|
| 13 |
+
from sqlalchemy.orm import sessionmaker
|
| 14 |
+
|
| 15 |
+
ROOT = pathlib.Path(__file__).resolve().parents[1]
|
| 16 |
+
sys.path.insert(0, str(ROOT))
|
| 17 |
+
|
| 18 |
+
from app import app
|
| 19 |
+
from data.db import Base, get_db
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@pytest.fixture()
def db_engine(tmp_path):
    """Provide a throwaway SQLite engine with the full app schema.

    The database file lives under pytest's ``tmp_path``, so each test
    function gets a fully isolated database; all tables are dropped and
    the engine disposed on teardown.
    """
    db_file = tmp_path / "test_chat_citations.db"
    engine = create_engine(
        f"sqlite:///{db_file}",
        # SQLite rejects cross-thread connection use by default; disabled
        # here since the FastAPI TestClient may service requests on a
        # different thread than the one that opened the connection.
        connect_args={"check_same_thread": False},
    )
    # Imported for its side effect only: registering every ORM model on
    # Base so create_all() below emits all tables.
    import data.models  # noqa: F401

    Base.metadata.create_all(bind=engine)
    yield engine
    # Teardown: remove the schema, then release pooled connections.
    Base.metadata.drop_all(bind=engine)
    engine.dispose()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@pytest.fixture()
def db_session(db_engine):
    """Yield a SQLAlchemy session bound to the per-test engine, closing it on teardown."""
    session_factory = sessionmaker(autocommit=False, autoflush=False, bind=db_engine)
    db = session_factory()
    yield db
    db.close()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@pytest.fixture()
def client(db_session, monkeypatch):
    """FastAPI test client running in dev-auth mode against the test DB."""
    # Dev auth mode avoids the HF OAuth flow; the session secret is pinned
    # to a fixed test value (presumably consumed by the session layer —
    # TODO confirm against auth/session.py).
    monkeypatch.setenv("AUTH_MODE", "dev")
    monkeypatch.setenv("APP_SESSION_SECRET", "chat-citations-test-secret")

    def _override_get_db():
        # Hand every request the single per-test session instead of a
        # freshly created production session.
        yield db_session

    app.dependency_overrides[get_db] = _override_get_db
    with TestClient(app, raise_server_exceptions=True) as c:
        yield c
    # Teardown: remove the override so later tests see a clean app.
    app.dependency_overrides.clear()
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def test_thread_messages_include_persisted_citations(client):
    """End-to-end check that chat citations survive a round trip.

    Creates a notebook, a ready source, and a thread; runs one chat turn
    with retrieval and LLM completion mocked out; then re-fetches the
    thread's messages and verifies the stored assistant message carries
    the same citation the chat response reported.
    """
    # --- Arrange: notebook -> source -> thread, all owned by the dev user.
    create_notebook = client.post("/notebooks", json={"title": "Citation Notebook"})
    assert create_notebook.status_code == 200
    notebook_id = int(create_notebook.json()["id"])

    create_source = client.post(
        f"/notebooks/{notebook_id}/sources",
        json={
            "type": "text",
            "title": "Lecture Notes",
            "status": "ready",
        },
    )
    assert create_source.status_code == 200
    source_id = int(create_source.json()["id"])

    create_thread = client.post(
        f"/notebooks/{notebook_id}/threads",
        json={"title": "Q&A"},
    )
    assert create_thread.status_code == 200
    thread_id = int(create_thread.json()["id"])

    # Canned retrieval hit pointing at the source created above
    # (presumably mirrors the row shape returned by
    # app.query_notebook_chunks — confirm against the real query path).
    retrieval_rows = [
        {
            "chunk_id": "chunk-1",
            "score": 0.12,
            "document": "Neural networks learn from examples.",
            "metadata": {
                "source_id": str(source_id),
                "source_title": "Lecture Notes",
                "chunk_index": 0,
            },
        }
    ]

    # --- Act: one chat turn with retrieval and completion both mocked.
    with patch("app.query_notebook_chunks", return_value=retrieval_rows), patch(
        "app.generate_chat_completion", return_value="They learn from examples in the data."
    ):
        chat_resp = client.post(
            f"/threads/{thread_id}/chat",
            params={"notebook_id": notebook_id},
            json={"question": "How do neural networks learn?", "top_k": 5},
        )

    # --- Assert: the immediate chat response reports the citation...
    assert chat_resp.status_code == 200
    chat_payload = chat_resp.json()
    assert len(chat_payload["citations"]) == 1
    assert int(chat_payload["citations"][0]["source_id"]) == source_id

    # ...and the citation was persisted with the stored assistant message.
    messages_resp = client.get(
        f"/threads/{thread_id}/messages",
        params={"notebook_id": notebook_id},
    )
    assert messages_resp.status_code == 200
    messages = messages_resp.json()
    assistant_message = next((m for m in messages if m["role"] == "assistant"), None)
    assert assistant_message is not None
    assert len(assistant_message["citations"]) == 1
    assert int(assistant_message["citations"][0]["source_id"]) == source_id
    assert assistant_message["citations"][0]["source_title"] == "Lecture Notes"
|
tests/test_notebook_management_api.py
CHANGED
|
@@ -3,6 +3,7 @@ Integration tests for notebook rename/delete management endpoints.
|
|
| 3 |
"""
|
| 4 |
from __future__ import annotations
|
| 5 |
|
|
|
|
| 6 |
import pathlib
|
| 7 |
import sys
|
| 8 |
from unittest.mock import patch
|
|
@@ -15,6 +16,7 @@ from sqlalchemy.orm import sessionmaker
|
|
| 15 |
ROOT = pathlib.Path(__file__).resolve().parents[1]
|
| 16 |
sys.path.insert(0, str(ROOT))
|
| 17 |
|
|
|
|
| 18 |
from app import app
|
| 19 |
from data.db import Base, get_db
|
| 20 |
|
|
@@ -43,9 +45,11 @@ def db_session(db_engine):
|
|
| 43 |
|
| 44 |
|
| 45 |
@pytest.fixture()
|
| 46 |
-
def client(db_session, monkeypatch):
|
| 47 |
monkeypatch.setenv("AUTH_MODE", "dev")
|
| 48 |
monkeypatch.setenv("APP_SESSION_SECRET", "notebook-mgmt-test-secret")
|
|
|
|
|
|
|
| 49 |
|
| 50 |
def _override_get_db():
|
| 51 |
yield db_session
|
|
@@ -179,3 +183,44 @@ def test_create_url_source_accepts_public_url(client):
|
|
| 179 |
assert payload["status"] == "ready"
|
| 180 |
assert payload["ingested_at"] is not None
|
| 181 |
mock_ingest.assert_called_once()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
"""
|
| 4 |
from __future__ import annotations
|
| 5 |
|
| 6 |
+
import os
|
| 7 |
import pathlib
|
| 8 |
import sys
|
| 9 |
from unittest.mock import patch
|
|
|
|
| 16 |
ROOT = pathlib.Path(__file__).resolve().parents[1]
|
| 17 |
sys.path.insert(0, str(ROOT))
|
| 18 |
|
| 19 |
+
import app as app_module
|
| 20 |
from app import app
|
| 21 |
from data.db import Base, get_db
|
| 22 |
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
@pytest.fixture()
|
| 48 |
+
def client(db_session, monkeypatch, tmp_path):
|
| 49 |
monkeypatch.setenv("AUTH_MODE", "dev")
|
| 50 |
monkeypatch.setenv("APP_SESSION_SECRET", "notebook-mgmt-test-secret")
|
| 51 |
+
monkeypatch.setenv("STORAGE_BASE_DIR", str(tmp_path / "storage"))
|
| 52 |
+
monkeypatch.setattr("app.UPLOADS_ROOT", tmp_path / "uploads")
|
| 53 |
|
| 54 |
def _override_get_db():
|
| 55 |
yield db_session
|
|
|
|
| 183 |
assert payload["status"] == "ready"
|
| 184 |
assert payload["ingested_at"] is not None
|
| 185 |
mock_ingest.assert_called_once()
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def test_upload_source_sanitizes_filename(client):
    """A path-traversal upload filename is reduced to its safe basename."""
    create_resp = client.post("/notebooks", json={"title": "Uploads"})
    assert create_resp.status_code == 200
    notebook_id = create_resp.json()["id"]

    # Skip real ingestion; only upload/storage handling is under test here.
    with patch("app.ingest_source", return_value=1):
        upload_resp = client.post(
            f"/notebooks/{notebook_id}/sources/upload",
            data={"status": "pending"},
            # Hostile filename attempting to escape the upload root.
            files={"file": ("../../../../evil.txt", b"hello world", "text/plain")},
        )

    assert upload_resp.status_code == 200
    payload = upload_resp.json()
    # Traversal components stripped; only the basename is recorded.
    assert payload["original_name"] == "evil.txt"
    assert payload["storage_path"] is not None
    # Stored path must stay inside this notebook's upload directory.
    assert ".." not in payload["storage_path"]
    assert f"notebook_{notebook_id}" in payload["storage_path"]
    assert pathlib.Path(payload["storage_path"]).exists()
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def test_delete_notebook_removes_notebook_storage_and_uploads(client):
    """Deleting a notebook also removes its on-disk storage and upload trees."""
    create_resp = client.post("/notebooks", json={"title": "Delete storage"})
    assert create_resp.status_code == 200
    notebook_id = create_resp.json()["id"]

    # Seed the notebook's storage tree with a marker file (user id "1" is
    # presumably the dev-auth user — TODO confirm against the auth fixture).
    storage_root = pathlib.Path(os.environ["STORAGE_BASE_DIR"]) / "users" / "1" / "notebooks" / str(notebook_id)
    storage_root.mkdir(parents=True, exist_ok=True)
    (storage_root / "marker.txt").write_text("x", encoding="utf-8")

    # Seed the upload directory for the same notebook.
    upload_root = pathlib.Path(app_module.UPLOADS_ROOT) / f"notebook_{notebook_id}"
    upload_root.mkdir(parents=True, exist_ok=True)
    (upload_root / "upload.txt").write_text("x", encoding="utf-8")

    delete_resp = client.delete(f"/notebooks/{notebook_id}")
    assert delete_resp.status_code == 200

    # Both trees must be gone after deletion.
    assert not storage_root.exists()
    assert not upload_root.exists()
|