tAnboyyy committed on
Commit
fa22601
·
1 Parent(s): 69068b7

Update README, app, and requirements for PDF ingestion feature

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +158 -5
  3. backend/ingestion_service.py +72 -0
  4. requirements.txt +2 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 📓
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: "4.44.0"
8
  python_version: "3.10"
9
  app_file: app.py
10
  pinned: false
 
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: "4.44.1"
8
  python_version: "3.10"
9
  app_file: app.py
10
  pinned: false
app.py CHANGED
@@ -1,4 +1,5 @@
1
  from pathlib import Path
 
2
 
3
  from dotenv import load_dotenv
4
 
@@ -7,9 +8,30 @@ load_dotenv(Path(__file__).resolve().parent.parent / ".env")
7
  load_dotenv(Path(__file__).resolve().parent / ".env")
8
 
9
  import gradio as gr
 
10
 
 
11
  from backend.notebook_service import create_notebook, list_notebooks, rename_notebook, delete_notebook
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # Theme: adapts to light/dark mode
14
  theme = gr.themes.Soft(
15
  primary_hue="blue",
@@ -137,6 +159,99 @@ def _initial_load(profile: gr.OAuthProfile | None):
137
  return state, selected, status, *updates
138
 
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  def _build_row_updates(notebooks):
141
  """Return gr.update values for each row: visibility, then text value."""
142
  out = []
@@ -173,6 +288,25 @@ with gr.Blocks(
173
  )
174
  create_btn = gr.Button("Create", variant="primary", scale=1)
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  gr.Markdown("---")
177
  gr.Markdown("**Your notebooks** (selected notebook used for chat/ingestion)")
178
 
@@ -195,14 +329,30 @@ with gr.Blocks(
195
 
196
  status = gr.Markdown("Sign in with Hugging Face to manage notebooks.", elem_classes=["status"])
197
 
198
- demo.load(_initial_load, inputs=None, outputs=[nb_state, selected_notebook_id, status] + row_outputs)
 
199
 
200
  # Create button
201
  create_btn.click(
202
  _safe_create,
203
  inputs=[create_txt, nb_state, selected_notebook_id],
204
  outputs=[create_txt, nb_state, selected_notebook_id, status] + row_outputs,
205
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  # Per-row: Rename, Delete, Select (profile injected by Gradio for OAuth)
208
  for i in range(MAX_NOTEBOOKS):
@@ -215,18 +365,21 @@ with gr.Blocks(
215
  _safe_rename,
216
  inputs=[gr.State(i), name_txt, nb_state, selected_notebook_id],
217
  outputs=[nb_state, selected_notebook_id, status] + row_outputs,
 
218
  )
219
  delete_btn.click(
220
  _safe_delete,
221
  inputs=[gr.State(i), nb_state, selected_notebook_id],
222
  outputs=[nb_state, selected_notebook_id, status] + row_outputs,
223
- )
 
224
  def _on_select():
225
  return "Selected notebook updated. Use this for chat/ingestion."
226
  select_btn.click(
227
  _select_notebook,
228
  inputs=[gr.State(i), nb_state],
229
  outputs=[selected_notebook_id],
230
- ).then(_on_select, None, [status])
 
231
 
232
- demo.launch()
 
1
  from pathlib import Path
2
+ import shutil
3
 
4
  from dotenv import load_dotenv
5
 
 
8
  load_dotenv(Path(__file__).resolve().parent / ".env")
9
 
10
  import gradio as gr
11
+ import gradio_client.utils as gradio_client_utils
12
 
13
+ from backend.ingestion_service import ingest_pdf_chunks, remove_chunks_for_source
14
  from backend.notebook_service import create_notebook, list_notebooks, rename_notebook, delete_notebook
15
 
16
+ _original_gradio_get_type = gradio_client_utils.get_type
17
+ _original_json_schema_to_python_type = gradio_client_utils._json_schema_to_python_type
18
+
19
+
20
def _patched_gradio_get_type(schema):
    """Return the type name for a JSON schema, tolerating boolean schemas.

    A bare ``True``/``False`` schema is reported as ``"Any"``; every other
    schema is delegated to the ``get_type`` implementation that was saved
    in ``_original_gradio_get_type``.
    """
    return "Any" if isinstance(schema, bool) else _original_gradio_get_type(schema)
24
+
25
+
26
def _patched_json_schema_to_python_type(schema, defs=None):
    """Convert a JSON schema to a Python type string, tolerating boolean schemas.

    Non-boolean schemas are forwarded, together with ``defs``, to the
    implementation saved in ``_original_json_schema_to_python_type``;
    a boolean schema is reported as ``"Any"``.
    """
    if not isinstance(schema, bool):
        return _original_json_schema_to_python_type(schema, defs)
    return "Any"
30
+
31
+
32
+ gradio_client_utils.get_type = _patched_gradio_get_type
33
+ gradio_client_utils._json_schema_to_python_type = _patched_json_schema_to_python_type
34
+
35
  # Theme: adapts to light/dark mode
36
  theme = gr.themes.Soft(
37
  primary_hue="blue",
 
159
  return state, selected, status, *updates
160
 
161
 
162
def _safe_upload_pdfs(files, selected_id, profile: gr.OAuthProfile | None):
    """Copy uploaded PDFs into the selected notebook's folder and index them.

    Args:
        files: A single path string, or an iterable of path strings / objects
            that expose their path as ``.name``.
        selected_id: Identifier of the currently selected notebook.
        profile: OAuth profile of the signed-in user, or ``None``.

    Returns:
        A status message describing the outcome. Any exception is reported in
        the message instead of being raised.
    """
    try:
        owner = _user_id(profile)
        if not owner:
            return "Please sign in with Hugging Face before uploading PDFs."
        if not selected_id:
            return "Select a notebook first, then upload PDFs."
        if not files:
            return "Choose at least one PDF to upload."

        # Normalise the payload to a flat list of non-empty paths.
        if isinstance(files, str):
            incoming = [files]
        else:
            candidates = (getattr(item, "name", item) for item in files)
            incoming = [path for path in candidates if path]
        if not incoming:
            return "No files were received. Try uploading again."

        notebook_key = str(selected_id)
        upload_dir = Path("data") / "uploads" / owner / notebook_key
        upload_dir.mkdir(parents=True, exist_ok=True)

        stored_names = []
        chunk_total = 0
        for raw_path in incoming:
            src = Path(raw_path)
            if src.suffix.lower() != ".pdf":
                continue  # anything that is not a PDF is skipped

            # Pick the first free name: the original, then <stem>_1, <stem>_2, ...
            dest = upload_dir / src.name
            attempt = 0
            while dest.exists():
                attempt += 1
                dest = upload_dir / f"{src.stem}_{attempt}{src.suffix}"

            shutil.copy2(src, dest)
            stored_names.append(dest.name)
            chunk_total += ingest_pdf_chunks(notebook_key, dest.name, dest)

        if not stored_names:
            return "Only .pdf files are allowed."
        return f"Uploaded {len(stored_names)} PDF(s): {', '.join(stored_names)}. Indexed {chunk_total} chunk(s) for RAG."
    except Exception as error:
        return f"Error uploading PDFs: {error}"
215
+
216
+
217
def _list_uploaded_pdfs(selected_id, profile: gr.OAuthProfile | None):
    """Build a dropdown update listing the PDFs stored for the selected notebook.

    Args:
        selected_id: Identifier of the currently selected notebook.
        profile: OAuth profile of the signed-in user, or ``None``.

    Returns:
        A ``gr.update`` whose ``choices`` are the sorted PDF file names in the
        notebook's upload folder and whose ``value`` is the first of them.
        Both are empty/``None`` when there is no user, no selected notebook,
        no upload folder, or no PDFs.
    """
    user_id = _user_id(profile)
    if not user_id or not selected_id:
        return gr.update(choices=[], value=None)

    target_dir = Path("data") / "uploads" / user_id / str(selected_id)
    if not target_dir.is_dir():
        return gr.update(choices=[], value=None)

    # Match on the lower-cased suffix rather than glob("*.pdf"): the upload
    # handler accepts the extension in any casing and keeps the original file
    # name, while glob patterns are case-sensitive on POSIX, so "REPORT.PDF"
    # would be stored but never listed (and therefore never removable).
    pdf_names = sorted(
        entry.name
        for entry in target_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )
    selected_name = pdf_names[0] if pdf_names else None
    return gr.update(choices=pdf_names, value=selected_name)
230
+
231
+
232
def _safe_remove_pdf(file_name, selected_id, profile: gr.OAuthProfile | None):
    """Remove one uploaded PDF, and its indexed chunks, from the selected notebook.

    Args:
        file_name: Name of the PDF chosen in the dropdown.
        selected_id: Identifier of the currently selected notebook.
        profile: OAuth profile of the signed-in user, or ``None``.

    Returns:
        A status message describing the outcome. Any exception is reported in
        the message instead of being raised.
    """
    try:
        user_id = _user_id(profile)
        if not user_id:
            return "Please sign in with Hugging Face before removing PDFs."
        if not selected_id:
            return "Select a notebook first."
        if not file_name:
            return "Select a PDF to remove."

        # Keep only the final path component so the name cannot point
        # outside the notebook's upload folder.
        safe_name = Path(file_name).name
        target_file = Path("data") / "uploads" / user_id / str(selected_id) / safe_name
        if not target_file.is_file() or target_file.suffix.lower() != ".pdf":
            return "Selected PDF was not found."

        # Delete the indexed chunks before the file. If the database call
        # fails, the file is still present and the removal can be retried;
        # the reverse order would leave chunks behind with no file left to
        # select for a second attempt.
        remove_chunks_for_source(str(selected_id), safe_name)
        target_file.unlink()
        return f"Removed PDF: {safe_name}"
    except Exception as error:
        return f"Error removing PDF: {error}"
253
+
254
+
255
  def _build_row_updates(notebooks):
256
  """Return gr.update values for each row: visibility, then text value."""
257
  out = []
 
288
  )
289
  create_btn = gr.Button("Create", variant="primary", scale=1)
290
 
291
+ with gr.Row():
292
+ pdf_upload_btn = gr.UploadButton(
293
+ "Upload PDFs",
294
+ file_types=[".pdf"],
295
+ file_count="multiple",
296
+ type="filepath",
297
+ variant="secondary",
298
+ )
299
+
300
+ with gr.Row():
301
+ uploaded_pdf_dd = gr.Dropdown(
302
+ label="Uploaded PDFs",
303
+ choices=[],
304
+ value=None,
305
+ scale=3,
306
+ allow_custom_value=False,
307
+ )
308
+ remove_pdf_btn = gr.Button("Remove selected PDF", variant="stop", scale=1)
309
+
310
  gr.Markdown("---")
311
  gr.Markdown("**Your notebooks** (selected notebook used for chat/ingestion)")
312
 
 
329
 
330
  status = gr.Markdown("Sign in with Hugging Face to manage notebooks.", elem_classes=["status"])
331
 
332
+ demo.load(_initial_load, inputs=None, outputs=[nb_state, selected_notebook_id, status] + row_outputs, api_name=False)
333
+ demo.load(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd], api_name=False)
334
 
335
  # Create button
336
  create_btn.click(
337
  _safe_create,
338
  inputs=[create_txt, nb_state, selected_notebook_id],
339
  outputs=[create_txt, nb_state, selected_notebook_id, status] + row_outputs,
340
+ api_name=False,
341
+ ).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
342
+
343
+ pdf_upload_btn.upload(
344
+ _safe_upload_pdfs,
345
+ inputs=[pdf_upload_btn, selected_notebook_id],
346
+ outputs=[status],
347
+ api_name=False,
348
+ ).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
349
+
350
+ remove_pdf_btn.click(
351
+ _safe_remove_pdf,
352
+ inputs=[uploaded_pdf_dd, selected_notebook_id],
353
+ outputs=[status],
354
+ api_name=False,
355
+ ).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
356
 
357
  # Per-row: Rename, Delete, Select (profile injected by Gradio for OAuth)
358
  for i in range(MAX_NOTEBOOKS):
 
365
  _safe_rename,
366
  inputs=[gr.State(i), name_txt, nb_state, selected_notebook_id],
367
  outputs=[nb_state, selected_notebook_id, status] + row_outputs,
368
+ api_name=False,
369
  )
370
  delete_btn.click(
371
  _safe_delete,
372
  inputs=[gr.State(i), nb_state, selected_notebook_id],
373
  outputs=[nb_state, selected_notebook_id, status] + row_outputs,
374
+ api_name=False,
375
+ ).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
376
  def _on_select():
377
  return "Selected notebook updated. Use this for chat/ingestion."
378
  select_btn.click(
379
  _select_notebook,
380
  inputs=[gr.State(i), nb_state],
381
  outputs=[selected_notebook_id],
382
+ api_name=False,
383
+ ).then(_on_select, None, [status]).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
384
 
385
+ demo.launch(show_api=False)
backend/ingestion_service.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF ingestion for RAG: extract text, chunk, and persist to chunks table."""
2
+
3
+ from pathlib import Path
4
+
5
+ from pypdf import PdfReader
6
+
7
+ from backend.db import supabase
8
+
9
DEFAULT_CHUNK_SIZE = 1200
DEFAULT_CHUNK_OVERLAP = 200


def _extract_pdf_text(pdf_path: Path) -> str:
    """Return the text of every page of the PDF, joined by newlines and stripped.

    Pages for which the reader yields no text contribute an empty string.
    """
    reader = PdfReader(str(pdf_path))
    return "\n".join(page.extract_text() or "" for page in reader.pages).strip()


def _chunk_text(text: str, chunk_size: int = DEFAULT_CHUNK_SIZE, overlap: int = DEFAULT_CHUNK_OVERLAP) -> list[str]:
    """Split text into overlapping, fixed-size character windows.

    Whitespace runs are collapsed to single spaces before splitting.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks. Negative
            values are treated as 0; values of ``chunk_size`` or more fall
            back to a stride of one character.

    Returns:
        The chunks in document order, or an empty list when the text holds
        nothing but whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    clean = " ".join(text.split())
    if not clean:
        return []

    # Clamp the overlap at zero: a negative overlap would make the stride
    # larger than the window and silently skip text between chunks.
    step = max(1, chunk_size - max(0, overlap))
    total = len(clean)

    chunks: list[str] = []
    start = 0
    while True:
        end = min(total, start + chunk_size)
        chunks.append(clean[start:end])
        # Stop as soon as a window reaches the end of the text; any further
        # window would only repeat the tail of the chunk just emitted.
        if end == total:
            break
        start += step

    return chunks
36
+
37
+
38
def ingest_pdf_chunks(notebook_id: str, source_id: str, pdf_path: Path) -> int:
    """Extract, chunk, and store the text of one PDF in the ``chunks`` table.

    Rows already stored for the same notebook/source pair are deleted first,
    then the new rows are inserted in batches of 100.

    Args:
        notebook_id: Notebook the chunks belong to.
        source_id: Identifier of the source file; also stored as the
            ``file_name`` metadata entry.
        pdf_path: Location of the PDF to read.

    Returns:
        The number of chunk rows inserted (0 when no text was extracted).
    """
    pieces = _chunk_text(_extract_pdf_text(pdf_path))

    # Clear whatever was stored for this source before writing new rows.
    supabase.table("chunks").delete().eq("notebook_id", notebook_id).eq("source_id", source_id).execute()

    if not pieces:
        return 0

    piece_count = len(pieces)
    location = str(pdf_path)
    records = []
    for position, piece in enumerate(pieces):
        records.append(
            {
                "notebook_id": notebook_id,
                "source_id": source_id,
                "content": piece,
                "metadata": {
                    "file_name": source_id,
                    "file_path": location,
                    "chunk_index": position,
                    "total_chunks": piece_count,
                },
            }
        )

    # Insert in fixed-size slices rather than one request for all rows.
    page = 100
    cursor = 0
    while cursor < piece_count:
        supabase.table("chunks").insert(records[cursor:cursor + page]).execute()
        cursor += page

    return piece_count
68
+
69
+
70
def remove_chunks_for_source(notebook_id: str, source_id: str) -> None:
    """Delete every ``chunks`` row matching the given notebook and source file."""
    query = supabase.table("chunks").delete()
    query = query.eq("notebook_id", notebook_id).eq("source_id", source_id)
    query.execute()
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
- gradio[oauth]==4.44.0
2
  huggingface_hub==0.24.7
3
  supabase>=2.0.0
4
  python-dotenv>=1.0.0
5
  realtime==2.3.0
 
 
1
+ gradio[oauth]==4.44.1
2
  huggingface_hub==0.24.7
3
  supabase>=2.0.0
4
  python-dotenv>=1.0.0
5
  realtime==2.3.0
6
+ pypdf>=4.2.0