Update README, app, and requirements for PDF ingestion feature
Browse files- README.md +1 -1
- app.py +158 -5
- backend/ingestion_service.py +72 -0
- requirements.txt +2 -1
README.md
CHANGED
|
@@ -4,7 +4,7 @@ emoji: 📓
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: "4.44.
|
| 8 |
python_version: "3.10"
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
|
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: "4.44.1"
|
| 8 |
python_version: "3.10"
|
| 9 |
app_file: app.py
|
| 10 |
pinned: false
|
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from pathlib import Path
|
|
|
|
| 2 |
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
|
|
@@ -7,9 +8,30 @@ load_dotenv(Path(__file__).resolve().parent.parent / ".env")
|
|
| 7 |
load_dotenv(Path(__file__).resolve().parent / ".env")
|
| 8 |
|
| 9 |
import gradio as gr
|
|
|
|
| 10 |
|
|
|
|
| 11 |
from backend.notebook_service import create_notebook, list_notebooks, rename_notebook, delete_notebook
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
# Theme: adapts to light/dark mode
|
| 14 |
theme = gr.themes.Soft(
|
| 15 |
primary_hue="blue",
|
|
@@ -137,6 +159,99 @@ def _initial_load(profile: gr.OAuthProfile | None):
|
|
| 137 |
return state, selected, status, *updates
|
| 138 |
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
def _build_row_updates(notebooks):
|
| 141 |
"""Return gr.update values for each row: visibility, then text value."""
|
| 142 |
out = []
|
|
@@ -173,6 +288,25 @@ with gr.Blocks(
|
|
| 173 |
)
|
| 174 |
create_btn = gr.Button("Create", variant="primary", scale=1)
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
gr.Markdown("---")
|
| 177 |
gr.Markdown("**Your notebooks** (selected notebook used for chat/ingestion)")
|
| 178 |
|
|
@@ -195,14 +329,30 @@ with gr.Blocks(
|
|
| 195 |
|
| 196 |
status = gr.Markdown("Sign in with Hugging Face to manage notebooks.", elem_classes=["status"])
|
| 197 |
|
| 198 |
-
demo.load(_initial_load, inputs=None, outputs=[nb_state, selected_notebook_id, status] + row_outputs)
|
|
|
|
| 199 |
|
| 200 |
# Create button
|
| 201 |
create_btn.click(
|
| 202 |
_safe_create,
|
| 203 |
inputs=[create_txt, nb_state, selected_notebook_id],
|
| 204 |
outputs=[create_txt, nb_state, selected_notebook_id, status] + row_outputs,
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
# Per-row: Rename, Delete, Select (profile injected by Gradio for OAuth)
|
| 208 |
for i in range(MAX_NOTEBOOKS):
|
|
@@ -215,18 +365,21 @@ with gr.Blocks(
|
|
| 215 |
_safe_rename,
|
| 216 |
inputs=[gr.State(i), name_txt, nb_state, selected_notebook_id],
|
| 217 |
outputs=[nb_state, selected_notebook_id, status] + row_outputs,
|
|
|
|
| 218 |
)
|
| 219 |
delete_btn.click(
|
| 220 |
_safe_delete,
|
| 221 |
inputs=[gr.State(i), nb_state, selected_notebook_id],
|
| 222 |
outputs=[nb_state, selected_notebook_id, status] + row_outputs,
|
| 223 |
-
|
|
|
|
| 224 |
def _on_select():
|
| 225 |
return "Selected notebook updated. Use this for chat/ingestion."
|
| 226 |
select_btn.click(
|
| 227 |
_select_notebook,
|
| 228 |
inputs=[gr.State(i), nb_state],
|
| 229 |
outputs=[selected_notebook_id],
|
| 230 |
-
|
|
|
|
| 231 |
|
| 232 |
-
demo.launch()
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
+
import shutil
|
| 3 |
|
| 4 |
from dotenv import load_dotenv
|
| 5 |
|
|
|
|
| 8 |
load_dotenv(Path(__file__).resolve().parent / ".env")
|
| 9 |
|
| 10 |
import gradio as gr
|
| 11 |
+
import gradio_client.utils as gradio_client_utils
|
| 12 |
|
| 13 |
+
from backend.ingestion_service import ingest_pdf_chunks, remove_chunks_for_source
|
| 14 |
from backend.notebook_service import create_notebook, list_notebooks, rename_notebook, delete_notebook
|
| 15 |
|
| 16 |
+
# Keep a handle on the unpatched gradio_client helper so that the wrapper
# `_patched_gradio_get_type` can delegate to it for non-boolean schemas.
_original_gradio_get_type = gradio_client_utils.get_type
|
| 17 |
+
_original_json_schema_to_python_type = gradio_client_utils._json_schema_to_python_type
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _patched_gradio_get_type(schema):
|
| 21 |
+
if isinstance(schema, bool):
|
| 22 |
+
return "Any"
|
| 23 |
+
return _original_gradio_get_type(schema)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _patched_json_schema_to_python_type(schema, defs=None):
|
| 27 |
+
if isinstance(schema, bool):
|
| 28 |
+
return "Any"
|
| 29 |
+
return _original_json_schema_to_python_type(schema, defs)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Install the wrapper in place of the library function; the wrapper returns
# "Any" for boolean schemas and otherwise calls the saved original.
gradio_client_utils.get_type = _patched_gradio_get_type
|
| 33 |
+
gradio_client_utils._json_schema_to_python_type = _patched_json_schema_to_python_type
|
| 34 |
+
|
| 35 |
# Theme: adapts to light/dark mode
|
| 36 |
theme = gr.themes.Soft(
|
| 37 |
primary_hue="blue",
|
|
|
|
| 159 |
return state, selected, status, *updates
|
| 160 |
|
| 161 |
|
| 162 |
+
def _safe_upload_pdfs(files, selected_id, profile: gr.OAuthProfile | None):
    """Copy uploaded PDFs into the selected notebook's folder and index them.

    Args:
        files: Either a single path string, or an iterable whose items are
            path strings or objects exposing the path as ``.name``.
        selected_id: Identifier of the currently selected notebook.
        profile: OAuth profile injected by Gradio; ``None`` when signed out.

    Returns:
        A status message for the UI. Any exception raised along the way is
        caught and reported in the returned message instead of propagating.
    """
    try:
        user_id = _user_id(profile)
        if not user_id:
            return "Please sign in with Hugging Face before uploading PDFs."
        if not selected_id:
            return "Select a notebook first, then upload PDFs."
        if not files:
            return "Choose at least one PDF to upload."

        # Normalise the upload payload into a flat list of non-empty paths.
        if isinstance(files, str):
            file_paths = [files]
        else:
            file_paths = [
                path
                for item in files
                if (path := getattr(item, "name", item))
            ]
        if not file_paths:
            return "No files were received. Try uploading again."

        notebook_key = str(selected_id)
        target_dir = Path("data") / "uploads" / user_id / notebook_key
        target_dir.mkdir(parents=True, exist_ok=True)

        uploaded = []
        total_chunks = 0
        for raw_path in file_paths:
            source_path = Path(raw_path)
            if source_path.suffix.lower() != ".pdf":
                continue

            # Never overwrite an earlier upload: append _1, _2, ... to the
            # stem until an unused file name is found.
            destination = target_dir / source_path.name
            index = 1
            while destination.exists():
                destination = target_dir / f"{source_path.stem}_{index}{source_path.suffix}"
                index += 1

            shutil.copy2(source_path, destination)
            uploaded.append(destination.name)
            total_chunks += ingest_pdf_chunks(notebook_key, destination.name, destination)

        if not uploaded:
            return "Only .pdf files are allowed."
        return f"Uploaded {len(uploaded)} PDF(s): {', '.join(uploaded)}. Indexed {total_chunks} chunk(s) for RAG."
    except Exception as error:
        return f"Error uploading PDFs: {error}"
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def _list_uploaded_pdfs(selected_id, profile: gr.OAuthProfile | None):
    """Build the dropdown update listing the selected notebook's PDFs.

    Args:
        selected_id: Identifier of the currently selected notebook.
        profile: OAuth profile injected by Gradio; ``None`` when signed out.

    Returns:
        A ``gr.update`` whose ``choices`` are the sorted PDF file names in the
        notebook's upload folder and whose ``value`` is the first name (or
        ``None``). The choices are empty when there is no user, no selected
        notebook, or no upload folder yet.
    """
    user_id = _user_id(profile)
    if not user_id or not selected_id:
        return gr.update(choices=[], value=None)

    target_dir = Path("data") / "uploads" / user_id / str(selected_id)
    if not target_dir.is_dir():
        return gr.update(choices=[], value=None)

    # Match the extension case-insensitively, exactly like the upload and
    # remove handlers do. A case-sensitive "*.pdf" glob would hide files saved
    # as e.g. "REPORT.PDF", leaving them impossible to remove from the UI.
    pdf_names = sorted(
        entry.name
        for entry in target_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )
    selected_name = pdf_names[0] if pdf_names else None
    return gr.update(choices=pdf_names, value=selected_name)
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def _safe_remove_pdf(file_name, selected_id, profile: gr.OAuthProfile | None):
    """Remove one uploaded PDF, and its stored chunks, from the selected notebook.

    Args:
        file_name: Name of the PDF chosen in the dropdown. Only its final path
            component is used, so it cannot point outside the notebook folder.
        selected_id: Identifier of the currently selected notebook.
        profile: OAuth profile injected by Gradio; ``None`` when signed out.

    Returns:
        A status message for the UI. Any exception raised along the way is
        caught and reported in the returned message instead of propagating.
    """
    try:
        user_id = _user_id(profile)
        if not user_id:
            return "Please sign in with Hugging Face before removing PDFs."
        if not selected_id:
            return "Select a notebook first."
        if not file_name:
            return "Select a PDF to remove."

        safe_name = Path(file_name).name
        target_file = Path("data") / "uploads" / user_id / str(selected_id) / safe_name
        if not target_file.is_file() or target_file.suffix.lower() != ".pdf":
            return "Selected PDF was not found."

        # Delete the chunks before the file. If the chunk deletion fails, the
        # file is still on disk and listed, so the user can simply retry.
        # Unlinking first would leave orphaned chunks behind a file that this
        # handler can no longer find.
        remove_chunks_for_source(str(selected_id), safe_name)
        target_file.unlink()
        return f"Removed PDF: {safe_name}"
    except Exception as error:
        return f"Error removing PDF: {error}"
|
| 253 |
+
|
| 254 |
+
|
| 255 |
def _build_row_updates(notebooks):
|
| 256 |
"""Return gr.update values for each row: visibility, then text value."""
|
| 257 |
out = []
|
|
|
|
| 288 |
)
|
| 289 |
create_btn = gr.Button("Create", variant="primary", scale=1)
|
| 290 |
|
| 291 |
+
with gr.Row():
|
| 292 |
+
pdf_upload_btn = gr.UploadButton(
|
| 293 |
+
"Upload PDFs",
|
| 294 |
+
file_types=[".pdf"],
|
| 295 |
+
file_count="multiple",
|
| 296 |
+
type="filepath",
|
| 297 |
+
variant="secondary",
|
| 298 |
+
)
|
| 299 |
+
|
| 300 |
+
with gr.Row():
|
| 301 |
+
uploaded_pdf_dd = gr.Dropdown(
|
| 302 |
+
label="Uploaded PDFs",
|
| 303 |
+
choices=[],
|
| 304 |
+
value=None,
|
| 305 |
+
scale=3,
|
| 306 |
+
allow_custom_value=False,
|
| 307 |
+
)
|
| 308 |
+
remove_pdf_btn = gr.Button("Remove selected PDF", variant="stop", scale=1)
|
| 309 |
+
|
| 310 |
gr.Markdown("---")
|
| 311 |
gr.Markdown("**Your notebooks** (selected notebook used for chat/ingestion)")
|
| 312 |
|
|
|
|
| 329 |
|
| 330 |
status = gr.Markdown("Sign in with Hugging Face to manage notebooks.", elem_classes=["status"])
|
| 331 |
|
| 332 |
+
demo.load(_initial_load, inputs=None, outputs=[nb_state, selected_notebook_id, status] + row_outputs, api_name=False)
|
| 333 |
+
demo.load(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd], api_name=False)
|
| 334 |
|
| 335 |
# Create button
|
| 336 |
create_btn.click(
|
| 337 |
_safe_create,
|
| 338 |
inputs=[create_txt, nb_state, selected_notebook_id],
|
| 339 |
outputs=[create_txt, nb_state, selected_notebook_id, status] + row_outputs,
|
| 340 |
+
api_name=False,
|
| 341 |
+
).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
|
| 342 |
+
|
| 343 |
+
pdf_upload_btn.upload(
|
| 344 |
+
_safe_upload_pdfs,
|
| 345 |
+
inputs=[pdf_upload_btn, selected_notebook_id],
|
| 346 |
+
outputs=[status],
|
| 347 |
+
api_name=False,
|
| 348 |
+
).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
|
| 349 |
+
|
| 350 |
+
remove_pdf_btn.click(
|
| 351 |
+
_safe_remove_pdf,
|
| 352 |
+
inputs=[uploaded_pdf_dd, selected_notebook_id],
|
| 353 |
+
outputs=[status],
|
| 354 |
+
api_name=False,
|
| 355 |
+
).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
|
| 356 |
|
| 357 |
# Per-row: Rename, Delete, Select (profile injected by Gradio for OAuth)
|
| 358 |
for i in range(MAX_NOTEBOOKS):
|
|
|
|
| 365 |
_safe_rename,
|
| 366 |
inputs=[gr.State(i), name_txt, nb_state, selected_notebook_id],
|
| 367 |
outputs=[nb_state, selected_notebook_id, status] + row_outputs,
|
| 368 |
+
api_name=False,
|
| 369 |
)
|
| 370 |
delete_btn.click(
|
| 371 |
_safe_delete,
|
| 372 |
inputs=[gr.State(i), nb_state, selected_notebook_id],
|
| 373 |
outputs=[nb_state, selected_notebook_id, status] + row_outputs,
|
| 374 |
+
api_name=False,
|
| 375 |
+
).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
|
| 376 |
def _on_select():
|
| 377 |
return "Selected notebook updated. Use this for chat/ingestion."
|
| 378 |
select_btn.click(
|
| 379 |
_select_notebook,
|
| 380 |
inputs=[gr.State(i), nb_state],
|
| 381 |
outputs=[selected_notebook_id],
|
| 382 |
+
api_name=False,
|
| 383 |
+
).then(_on_select, None, [status]).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
|
| 384 |
|
| 385 |
+
demo.launch(show_api=False)
|
backend/ingestion_service.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PDF ingestion for RAG: extract text, chunk, and persist to chunks table."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from pypdf import PdfReader
|
| 6 |
+
|
| 7 |
+
from backend.db import supabase
|
| 8 |
+
|
| 9 |
+
DEFAULT_CHUNK_SIZE = 1200
|
| 10 |
+
DEFAULT_CHUNK_OVERLAP = 200
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _extract_pdf_text(pdf_path: Path) -> str:
|
| 14 |
+
reader = PdfReader(str(pdf_path))
|
| 15 |
+
pages = []
|
| 16 |
+
for page in reader.pages:
|
| 17 |
+
pages.append(page.extract_text() or "")
|
| 18 |
+
return "\n".join(pages).strip()
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _chunk_text(text: str, chunk_size: int = DEFAULT_CHUNK_SIZE, overlap: int = DEFAULT_CHUNK_OVERLAP) -> list[str]:
|
| 22 |
+
clean = " ".join(text.split())
|
| 23 |
+
if not clean:
|
| 24 |
+
return []
|
| 25 |
+
|
| 26 |
+
chunks: list[str] = []
|
| 27 |
+
start = 0
|
| 28 |
+
step = max(1, chunk_size - overlap)
|
| 29 |
+
|
| 30 |
+
while start < len(clean):
|
| 31 |
+
end = min(len(clean), start + chunk_size)
|
| 32 |
+
chunks.append(clean[start:end])
|
| 33 |
+
start += step
|
| 34 |
+
|
| 35 |
+
return chunks
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def ingest_pdf_chunks(notebook_id: str, source_id: str, pdf_path: Path) -> int:
    """Replace the stored chunks of one PDF and report how many were written.

    The PDF is read and chunked first. Then all rows in the ``chunks`` table
    matching ``notebook_id`` and ``source_id`` are deleted, and the fresh
    chunks are inserted in batches of 100 rows.

    Args:
        notebook_id: Notebook the chunks belong to.
        source_id: Identifier of the source file; also stored as the
            ``file_name`` metadata value.
        pdf_path: Location of the PDF on disk.

    Returns:
        Number of chunk rows inserted (0 when the PDF yields no text).
    """
    pieces = _chunk_text(_extract_pdf_text(pdf_path))

    # Clear whatever an earlier ingestion of this source left behind, even if
    # the new extraction turned out to be empty.
    stale_rows = supabase.table("chunks").delete()
    stale_rows.eq("notebook_id", notebook_id).eq("source_id", source_id).execute()

    if not pieces:
        return 0

    total = len(pieces)
    location = str(pdf_path)
    rows = []
    for position, piece in enumerate(pieces):
        rows.append(
            {
                "notebook_id": notebook_id,
                "source_id": source_id,
                "content": piece,
                "metadata": {
                    "file_name": source_id,
                    "file_path": location,
                    "chunk_index": position,
                    "total_chunks": total,
                },
            }
        )

    batch_size = 100
    offset = 0
    while offset < total:
        batch = rows[offset:offset + batch_size]
        supabase.table("chunks").insert(batch).execute()
        offset += batch_size

    return total
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def remove_chunks_for_source(notebook_id: str, source_id: str) -> None:
    """Delete every ``chunks`` row matching the notebook and source file.

    Args:
        notebook_id: Notebook whose chunks are being pruned.
        source_id: Identifier of the source file whose chunks are removed.
    """
    query = supabase.table("chunks").delete()
    query = query.eq("notebook_id", notebook_id)
    query = query.eq("source_id", source_id)
    query.execute()
|
requirements.txt
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
-
gradio[oauth]==4.44.
|
| 2 |
huggingface_hub==0.24.7
|
| 3 |
supabase>=2.0.0
|
| 4 |
python-dotenv>=1.0.0
|
| 5 |
realtime==2.3.0
|
|
|
|
|
|
| 1 |
+
gradio[oauth]==4.44.1
|
| 2 |
huggingface_hub==0.24.7
|
| 3 |
supabase>=2.0.0
|
| 4 |
python-dotenv>=1.0.0
|
| 5 |
realtime==2.3.0
|
| 6 |
+
pypdf>=4.2.0
|