Thanmay Mohandas Das commited on
Commit
8b6162c
·
unverified ·
2 Parent(s): b7e4f670599cd0

Merge branch 'main' into feature/podcast-generation

Browse files
README_PYTHON.md ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python Version
2
+
3
+ **Gradio does not work reliably with Python 3.13.** Use Python 3.10, 3.11, or 3.12.
4
+
5
+ ## Quick run
6
+
7
+ ```powershell
8
+ .\run.bat
9
+ ```
10
+
11
+ Or manually:
12
+
13
+ ```powershell
14
+ py -3.10 -m pip install -r requirements.txt
15
+ py -3.10 app.py
16
+ ```
17
+
18
+ ## Install Python 3.10
19
+
20
+ If you don't have Python 3.10:
21
+
22
+ 1. Download from https://www.python.org/downloads/release/python-31011/
23
+ 2. Run installer, check "Add Python to PATH"
24
+ 3. Restart terminal, then run `.\run.bat`
app.py CHANGED
@@ -1,5 +1,16 @@
1
  from pathlib import Path
2
  import shutil
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  from dotenv import load_dotenv
5
 
@@ -7,15 +18,21 @@ from dotenv import load_dotenv
7
  load_dotenv(Path(__file__).resolve().parent.parent / ".env")
8
  load_dotenv(Path(__file__).resolve().parent / ".env")
9
 
 
10
  from datetime import datetime
11
  import gradio as gr
 
12
  import gradio_client.utils as gradio_client_utils
13
 
 
14
  from backend.ingestion_service import ingest_pdf_chunks, ingest_url_chunks, remove_chunks_for_source
15
  from backend.notebook_service import create_notebook, list_notebooks, rename_notebook, delete_notebook
16
  from backend.podcast_service import generate_podcast, generate_podcast_audio
 
 
17
 
18
  import hashlib
 
19
 
20
  _original_gradio_get_type = gradio_client_utils.get_type
21
  _original_json_schema_to_python_type = gradio_client_utils._json_schema_to_python_type
@@ -36,43 +53,76 @@ def _patched_json_schema_to_python_type(schema, defs=None):
36
  gradio_client_utils.get_type = _patched_gradio_get_type
37
  gradio_client_utils._json_schema_to_python_type = _patched_json_schema_to_python_type
38
 
39
- # Theme: adapts to light/dark mode
40
  theme = gr.themes.Soft(
41
  primary_hue="blue",
42
  secondary_hue="slate",
43
- font=gr.themes.GoogleFont("Inter"),
44
  )
45
 
46
  CUSTOM_CSS = """
47
- .container { max-width: 720px; margin: 0 auto; padding: 0 24px; }
48
- .login-center { display: flex; flex-direction: column; align-items: center; justify-content: center; gap: 12px; padding: 24px 0; }
49
- .login-center .login-btn-wrap { display: flex; justify-content: center; width: 100%; }
50
- .login-center .login-btn-wrap button { display: inline-flex; align-items: center; gap: 8px; }
51
- .hero { font-size: 1.5rem; font-weight: 600; color: #1e293b; margin-bottom: 8px; }
52
- .sub { font-size: 0.875rem; color: #64748b; margin-bottom: 24px; }
53
- .nb-row { display: flex; align-items: center; gap: 12px; padding: 10px 0; border-bottom: 1px solid #e2e8f0; }
54
- .nb-row:last-child { border-bottom: none; }
55
- .gr-button { min-height: 36px !important; padding: 0 16px !important; font-weight: 500 !important; border-radius: 8px !important; }
56
- .gr-input { min-height: 40px !important; border-radius: 8px !important; }
57
- .status { font-size: 0.875rem; color: #64748b; margin-top: 16px; padding: 12px 16px; background: #f8fafc; border-radius: 8px; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  @media (prefers-color-scheme: dark) {
59
- .hero { color: #f1f5f9 !important; }
60
- .sub { color: #94a3b8 !important; }
61
- .nb-row { border-color: #334155 !important; }
62
- .status { color: #94a3b8 !important; background: #1e293b !important; }
 
 
 
63
  }
64
- .dark .hero { color: #f1f5f9 !important; }
65
- .dark .sub { color: #94a3b8 !important; }
66
- .dark .nb-row { border-color: #334155 !important; }
67
- .dark .status { color: #94a3b8 !important; background: #1e293b !important; }
 
 
 
68
  """
69
 
70
- MAX_NOTEBOOKS = 20
71
-
72
-
73
  def _user_id(profile: gr.OAuthProfile | None) -> str | None:
74
  """Extract user_id from HF OAuth profile. None if not logged in."""
75
- return profile.name if profile else None
 
 
 
 
 
 
 
 
76
 
77
 
78
  def _get_notebooks(user_id: str | None):
@@ -86,70 +136,59 @@ def _safe_create(new_name, state, selected_id, profile: gr.OAuthProfile | None =
86
  try:
87
  user_id = _user_id(profile)
88
  if not user_id:
89
- return gr.skip(), gr.skip(), gr.skip(), "Please sign in with Hugging Face", *([gr.skip()] * (MAX_NOTEBOOKS * 2))
90
  name = (new_name or "").strip() or "Untitled Notebook"
91
  nb = create_notebook(user_id, name)
92
  if nb:
93
  notebooks = _get_notebooks(user_id)
94
- state = [(n["notebook_id"], n["name"]) for n in notebooks]
95
- updates = _build_row_updates(notebooks)
96
- new_selected = nb["notebook_id"]
97
  status = f"Created: {nb['name']}"
98
- return "", state, new_selected, status, *updates
99
- return gr.skip(), gr.skip(), gr.skip(), "Failed to create", *([gr.skip()] * (MAX_NOTEBOOKS * 2))
100
  except Exception as e:
101
- return gr.skip(), gr.skip(), gr.skip(), f"Error: {e}", *([gr.skip()] * (MAX_NOTEBOOKS * 2))
102
 
103
 
104
  def _safe_rename(idx, new_name, state, selected_id, profile: gr.OAuthProfile | None = None):
105
  """Rename notebook at index."""
106
  try:
107
  if idx is None or idx < 0 or idx >= len(state):
108
- return gr.skip(), gr.skip(), gr.skip(), *([gr.skip()] * (MAX_NOTEBOOKS * 2))
109
  nb_id, _ = state[idx]
110
  name = (new_name or "").strip()
111
  if not name:
112
- return gr.skip(), gr.skip(), gr.skip(), "Enter a name.", *([gr.skip()] * (MAX_NOTEBOOKS * 2))
113
  user_id = _user_id(profile)
114
  if not user_id:
115
- return gr.skip(), gr.skip(), gr.skip(), "Please sign in", *([gr.skip()] * (MAX_NOTEBOOKS * 2))
116
  ok = rename_notebook(user_id, nb_id, name)
117
  if ok:
118
  notebooks = _get_notebooks(user_id)
119
- state = [(n["notebook_id"], n["name"]) for n in notebooks]
120
- updates = _build_row_updates(notebooks)
121
- return state, selected_id, f"Renamed to: {name}", *updates
122
- return gr.skip(), gr.skip(), gr.skip(), "Failed to rename", *([gr.skip()] * (MAX_NOTEBOOKS * 2))
123
  except Exception as e:
124
- return gr.skip(), gr.skip(), gr.skip(), f"Error: {e}", *([gr.skip()] * (MAX_NOTEBOOKS * 2))
125
 
126
 
127
  def _safe_delete(idx, state, selected_id, profile: gr.OAuthProfile | None = None):
128
  """Delete notebook at index."""
129
  try:
130
  if idx is None or idx < 0 or idx >= len(state):
131
- return gr.skip(), gr.skip(), gr.skip(), *([gr.skip()] * (MAX_NOTEBOOKS * 2))
132
  nb_id, _ = state[idx]
133
  user_id = _user_id(profile)
134
  if not user_id:
135
- return gr.skip(), gr.skip(), gr.skip(), "Please sign in", *([gr.skip()] * (MAX_NOTEBOOKS * 2))
136
  ok = delete_notebook(user_id, nb_id)
137
  if ok:
138
  notebooks = _get_notebooks(user_id)
139
- state = [(n["notebook_id"], n["name"]) for n in notebooks]
140
- updates = _build_row_updates(notebooks)
141
  new_selected = notebooks[0]["notebook_id"] if notebooks else None
142
- return state, new_selected, "Notebook deleted", *updates
143
- return gr.skip(), gr.skip(), gr.skip(), "Failed to delete", *([gr.skip()] * (MAX_NOTEBOOKS * 2))
144
  except Exception as e:
145
- return gr.skip(), gr.skip(), gr.skip(), f"Error: {e}", *([gr.skip()] * (MAX_NOTEBOOKS * 2))
146
-
147
-
148
- def _select_notebook(idx, state):
149
- """Set selected notebook when user interacts with a row."""
150
- if idx is None or idx < 0 or idx >= len(state):
151
- return gr.skip()
152
- return state[idx][0]
153
 
154
 
155
  def _initial_load(profile: gr.OAuthProfile | None = None):
@@ -158,9 +197,10 @@ def _initial_load(profile: gr.OAuthProfile | None = None):
158
  notebooks = _get_notebooks(user_id)
159
  state = [(n["notebook_id"], n["name"]) for n in notebooks]
160
  selected = notebooks[0]["notebook_id"] if notebooks else None
161
- updates = _build_row_updates(notebooks)
162
  status = f"Signed in as {user_id}" if user_id else "Sign in with Hugging Face to manage notebooks."
163
- return state, selected, status, *updates
 
 
164
 
165
 
166
  def _safe_upload_pdfs(files, selected_id, profile: gr.OAuthProfile | None = None):
@@ -311,17 +351,7 @@ def _safe_remove_url(url, selected_id, profile: gr.OAuthProfile | None = None):
311
 
312
 
313
 
314
- def _build_row_updates(notebooks):
315
- """Return gr.update values for each row: visibility, then text value."""
316
- out = []
317
- for i in range(MAX_NOTEBOOKS):
318
- visible = i < len(notebooks)
319
- name = notebooks[i]["name"] if visible else ""
320
- out.append(gr.update(visible=visible))
321
- out.append(gr.update(value=name, visible=visible))
322
- return out
323
-
324
- #Upload Handler Functions
325
  def _do_upload(text_content, title, notebook_id, profile: gr.OAuthProfile | None):
326
  """Handle direct text input and ingestion."""
327
  from backend.ingestion_txt import ingest_txt
@@ -493,91 +523,217 @@ def _submit_quiz(questions, *answers):
493
 
494
  lines.append(f"\n**Score: {score}/{len(questions)}**")
495
  return "\n\n".join(lines)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
 
497
  with gr.Blocks(
498
  title="NotebookLM Clone - Notebooks",
499
  theme=theme,
500
  css=CUSTOM_CSS,
501
  ) as demo:
502
- gr.HTML('<div class="container"><p class="hero">Notebook Manager</p><p class="sub">Create notebook below, then manage with Rename and Delete</p></div>')
503
-
504
- with gr.Row(elem_classes=["login-center"]):
505
- gr.Markdown("**Sign in with Hugging Face to access your notebooks**")
506
- with gr.Row(elem_classes=["login-btn-wrap"]):
507
- login_btn = gr.LoginButton(value="🤗 Login with Hugging Face", size="lg")
508
-
509
- nb_state = gr.State([])
510
- selected_notebook_id = gr.State(None)
511
-
512
- # Create section: text box + Create button
513
- with gr.Row():
514
- create_txt = gr.Textbox(
515
- label="Create notebook",
516
- placeholder="Enter new notebook name",
517
- value="",
518
- scale=3,
519
- )
520
- create_btn = gr.Button("Create", variant="primary", scale=1)
521
-
522
- with gr.Row():
523
- pdf_upload_btn = gr.UploadButton(
524
- "Upload PDFs",
525
- file_types=[".pdf"],
526
- file_count="multiple",
527
- type="filepath",
528
- variant="secondary",
529
- )
530
-
531
- with gr.Row():
532
- uploaded_pdf_dd = gr.Dropdown(
533
- label="Uploaded PDFs",
534
- choices=[],
535
- value=None,
536
- scale=3,
537
- allow_custom_value=False,
538
- )
539
- remove_pdf_btn = gr.Button("Remove selected PDF", variant="stop", scale=1)
540
-
541
- with gr.Row():
542
- url_txt = gr.Textbox(
543
- label="Ingest web URL",
544
- placeholder="https://example.com",
545
- value="",
546
- scale=3,
547
- )
548
- ingest_url_btn = gr.Button("Ingest URL", variant="primary", scale=1)
549
- remove_url_btn = gr.Button("Delete URL", variant="stop", scale=1)
550
-
551
- gr.Markdown("---")
552
- gr.Markdown("**Your notebooks** (selected notebook used for chat/ingestion)")
553
-
554
- # Rows: each notebook has [name] [Rename] [Delete]
555
- row_components = []
556
- row_outputs = []
557
- for i in range(MAX_NOTEBOOKS):
558
- with gr.Row(visible=False) as row:
559
- name_txt = gr.Textbox(
560
- value="",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
  show_label=False,
562
- scale=3,
563
- min_width=200,
564
  )
565
- rename_btn = gr.Button("Rename", scale=1, min_width=80)
566
- delete_btn = gr.Button("Delete", variant="stop", scale=1, min_width=80)
567
- select_btn = gr.Button("Select", scale=1, min_width=70)
568
- row_components.append({"row": row, "name": name_txt, "rename": rename_btn, "delete": delete_btn, "select": select_btn})
569
- row_outputs.extend([row, name_txt])
570
 
571
- status = gr.Markdown("Sign in with Hugging Face to manage notebooks.", elem_classes=["status"])
572
-
573
- demo.load(_initial_load, inputs=None, outputs=[nb_state, selected_notebook_id, status] + row_outputs, api_name=False)
 
 
 
574
  demo.load(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd], api_name=False)
575
 
576
- # Create button
 
 
 
 
 
 
 
 
 
 
577
  create_btn.click(
578
  _safe_create,
579
  inputs=[create_txt, nb_state, selected_notebook_id],
580
- outputs=[create_txt, nb_state, selected_notebook_id, status] + row_outputs,
581
  api_name=False,
582
  ).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
583
 
@@ -747,7 +903,17 @@ with gr.Blocks(
747
  api_name=False,
748
  )
749
 
750
- demo.launch()
751
-
752
-
 
 
 
 
 
 
 
753
 
 
 
 
 
1
  from pathlib import Path
2
  import shutil
3
+ import sys
4
+ import warnings
5
+
6
+ # Flush print immediately
7
+ def _log(msg):
8
+ print(msg, flush=True)
9
+
10
+ _log("1. Loading env...")
11
+ # Suppress noisy dependency warnings
12
+ warnings.filterwarnings("ignore", message=".*urllib3.*")
13
+ warnings.filterwarnings("ignore", message=".*chardet.*")
14
 
15
  from dotenv import load_dotenv
16
 
 
18
  load_dotenv(Path(__file__).resolve().parent.parent / ".env")
19
  load_dotenv(Path(__file__).resolve().parent / ".env")
20
 
21
+ _log("2. Loading Gradio...")
22
  from datetime import datetime
23
  import gradio as gr
24
+ _log("2a. Loading gradio_client...")
25
  import gradio_client.utils as gradio_client_utils
26
 
27
+ _log("3. Loading backend...")
28
  from backend.ingestion_service import ingest_pdf_chunks, ingest_url_chunks, remove_chunks_for_source
29
  from backend.notebook_service import create_notebook, list_notebooks, rename_notebook, delete_notebook
30
  from backend.podcast_service import generate_podcast, generate_podcast_audio
31
+ from backend.chat_service import load_chat
32
+ from backend.rag_service import rag_chat
33
 
34
  import hashlib
35
+ _log("4. Imports done.")
36
 
37
  _original_gradio_get_type = gradio_client_utils.get_type
38
  _original_json_schema_to_python_type = gradio_client_utils._json_schema_to_python_type
 
53
  gradio_client_utils.get_type = _patched_gradio_get_type
54
  gradio_client_utils._json_schema_to_python_type = _patched_json_schema_to_python_type
55
 
56
+ # Theme: adapts to light/dark mode (use default font to avoid network fetch on startup)
57
  theme = gr.themes.Soft(
58
  primary_hue="blue",
59
  secondary_hue="slate",
 
60
  )
61
 
62
  CUSTOM_CSS = """
63
+ .gradio-container { max-width: 1000px !important; margin: 0 auto !important; }
64
+ .container { max-width: 1000px; margin: 0 auto; padding: 0 24px; }
65
+
66
+ .header-bar { padding: 12px 0; border-bottom: 1px solid #e2e8f0; margin-bottom: 24px; display: flex !important; justify-content: space-between !important; align-items: center !important; white-space: nowrap; }
67
+ .login-center { display: flex; justify-content: center; width: 100%; }
68
+ #auth-text { white-space: nowrap; margin: 8px 0 16px 0; font-size: 0.95rem; opacity: 0.9; }
69
+ .gr-button { padding: 14px 28px !important; font-size: 0.9rem !important; border-radius: 12px !important; white-space: nowrap !important; width: auto !important; }
70
+ .gr-button[aria-label*="Logout"] { min-width: auto !important; display: inline-flex !important; align-items: center !important; justify-content: center !important; }
71
+ .header-bar .gr-button { padding-left: 40px !important; padding-right: 40px !important; min-width: 220px !important; font-size: 0.8rem !important; }
72
+ .dark .header-bar { border-bottom: 1px solid #334155; }
73
+
74
+ .hero-section { margin-bottom: 16px; }
75
+ .login-container { padding: 12px 0; }
76
+ .create-strip { padding: 18px; border-radius: 16px; }
77
+ .create-row { display: flex !important; align-items: center !important; gap: 16px !important; }
78
+ .create-label { white-space: nowrap; font-size: 0.95rem; margin: 0; min-width: 180px; }
79
+ .create-row .gr-textbox { flex: 1 !important; }
80
+ .create-row .gr-textbox textarea,
81
+ .create-row .gr-textbox input { border-radius: 10px !important; }
82
+ .create-row .gr-button { border-radius: 10px !important; padding: 10px 20px !important; }
83
+ .hero-title { font-size: 2rem; font-weight: 700; color: #1e293b; margin: 0 0 8px 0; }
84
+ .hero-sub { font-size: 1rem; color: #64748b; margin: 0; line-height: 1.5; }
85
+
86
+ .section-card { padding: 24px; border-radius: 16px; background: #f8fafc; margin-bottom: 24px; box-shadow: 0 2px 8px rgba(0,0,0,0.06); }
87
+ .notebook-card { padding: 14px 20px; border-radius: 12px; background: #fff; margin-bottom: 8px; border: 1px solid #e2e8f0; display: flex; align-items: center; gap: 12px; transition: background 0.15s ease; }
88
+ .notebook-card:hover { background: #f8fafc; }
89
+
90
+ .section-title { font-size: 1.125rem; font-weight: 600; color: #1e293b; margin: 0 0 16px 0; }
91
+ .section-row { display: flex !important; align-items: center !important; gap: 16px !important; margin-bottom: 12px; }
92
+ .section-row .gr-textbox { flex: 1 !important; }
93
+ .section-row .gr-button { border-radius: 10px !important; padding: 10px 20px !important; }
94
+
95
+ .status { font-size: 0.875rem; color: #64748b; margin-top: 16px; padding: 12px 16px; background: #f1f5f9; border-radius: 12px; }
96
+
97
  @media (prefers-color-scheme: dark) {
98
+ .hero-title { color: #f1f5f9 !important; }
99
+ .hero-sub { color: #94a3b8 !important; }
100
+ .section-card { background: #1e293b !important; box-shadow: 0 2px 8px rgba(0,0,0,0.3); }
101
+ .section-title { color: #f1f5f9 !important; }
102
+ .notebook-card { background: #334155 !important; border-color: #475569; }
103
+ .notebook-card:hover { background: #475569 !important; }
104
+ .status { color: #94a3b8 !important; background: #334155 !important; }
105
  }
106
+ .dark .hero-title { color: #f1f5f9 !important; }
107
+ .dark .hero-sub { color: #94a3b8 !important; }
108
+ .dark .section-card { background: #1e293b !important; }
109
+ .dark .section-title { color: #f1f5f9 !important; }
110
+ .dark .notebook-card { background: #334155 !important; border-color: #475569; }
111
+ .dark .notebook-card:hover { background: #475569 !important; }
112
+ .dark .status { color: #94a3b8 !important; background: #334155 !important; }
113
  """
114
 
 
 
 
115
  def _user_id(profile: gr.OAuthProfile | None) -> str | None:
116
  """Extract user_id from HF OAuth profile. None if not logged in."""
117
+ if not profile:
118
+ return None
119
+ return (
120
+ getattr(profile, "id", None)
121
+ or getattr(profile, "sub", None)
122
+ or getattr(profile, "preferred_username", None)
123
+ or getattr(profile, "username", None)
124
+ or getattr(profile, "name", None)
125
+ )
126
 
127
 
128
  def _get_notebooks(user_id: str | None):
 
136
  try:
137
  user_id = _user_id(profile)
138
  if not user_id:
139
+ return gr.skip(), gr.skip(), gr.skip(), "Please sign in with Hugging Face"
140
  name = (new_name or "").strip() or "Untitled Notebook"
141
  nb = create_notebook(user_id, name)
142
  if nb:
143
  notebooks = _get_notebooks(user_id)
144
+ new_state = [(n["notebook_id"], n["name"]) for n in notebooks]
 
 
145
  status = f"Created: {nb['name']}"
146
+ return "", new_state, nb["notebook_id"], status
147
+ return gr.skip(), gr.skip(), gr.skip(), "Failed to create"
148
  except Exception as e:
149
+ return gr.skip(), gr.skip(), gr.skip(), f"Error: {e}"
150
 
151
 
152
  def _safe_rename(idx, new_name, state, selected_id, profile: gr.OAuthProfile | None = None):
153
  """Rename notebook at index."""
154
  try:
155
  if idx is None or idx < 0 or idx >= len(state):
156
+ return gr.skip(), gr.skip(), "Invalid selection"
157
  nb_id, _ = state[idx]
158
  name = (new_name or "").strip()
159
  if not name:
160
+ return gr.skip(), gr.skip(), "Enter a name."
161
  user_id = _user_id(profile)
162
  if not user_id:
163
+ return gr.skip(), gr.skip(), "Please sign in"
164
  ok = rename_notebook(user_id, nb_id, name)
165
  if ok:
166
  notebooks = _get_notebooks(user_id)
167
+ new_state = [(n["notebook_id"], n["name"]) for n in notebooks]
168
+ return new_state, selected_id, f"Renamed to: {name}"
169
+ return gr.skip(), gr.skip(), "Failed to rename"
 
170
  except Exception as e:
171
+ return gr.skip(), gr.skip(), f"Error: {e}"
172
 
173
 
174
  def _safe_delete(idx, state, selected_id, profile: gr.OAuthProfile | None = None):
175
  """Delete notebook at index."""
176
  try:
177
  if idx is None or idx < 0 or idx >= len(state):
178
+ return gr.skip(), gr.skip(), "Invalid selection"
179
  nb_id, _ = state[idx]
180
  user_id = _user_id(profile)
181
  if not user_id:
182
+ return gr.skip(), gr.skip(), "Please sign in"
183
  ok = delete_notebook(user_id, nb_id)
184
  if ok:
185
  notebooks = _get_notebooks(user_id)
186
+ new_state = [(n["notebook_id"], n["name"]) for n in notebooks]
 
187
  new_selected = notebooks[0]["notebook_id"] if notebooks else None
188
+ return new_state, new_selected, "Notebook deleted"
189
+ return gr.skip(), gr.skip(), "Failed to delete"
190
  except Exception as e:
191
+ return gr.skip(), gr.skip(), f"Error: {e}"
 
 
 
 
 
 
 
192
 
193
 
194
  def _initial_load(profile: gr.OAuthProfile | None = None):
 
197
  notebooks = _get_notebooks(user_id)
198
  state = [(n["notebook_id"], n["name"]) for n in notebooks]
199
  selected = notebooks[0]["notebook_id"] if notebooks else None
 
200
  status = f"Signed in as {user_id}" if user_id else "Sign in with Hugging Face to manage notebooks."
201
+ auth_update = f"You are logged in as {getattr(profile, 'name', None) or user_id} ({_user_id(profile)})" if user_id else ""
202
+ auth_row_visible = bool(user_id)
203
+ return state, selected, status, auth_update, gr.update(visible=auth_row_visible), gr.update(visible=bool(user_id)), gr.update(visible=not bool(user_id))
204
 
205
 
206
  def _safe_upload_pdfs(files, selected_id, profile: gr.OAuthProfile | None = None):
 
351
 
352
 
353
 
354
+ # ── Upload Handler Functions ──────────────────────────────────
 
 
 
 
 
 
 
 
 
 
355
  def _do_upload(text_content, title, notebook_id, profile: gr.OAuthProfile | None):
356
  """Handle direct text input and ingestion."""
357
  from backend.ingestion_txt import ingest_txt
 
523
 
524
  lines.append(f"\n**Score: {score}/{len(questions)}**")
525
  return "\n\n".join(lines)
526
+ def _chat_history_to_pairs(messages: list[dict]) -> list[tuple[str, str]]:
527
+ """Convert load_chat output to Gradio Chatbot format [(user, assistant), ...]."""
528
+ pairs = []
529
+ i = 0
530
+ while i < len(messages):
531
+ m = messages[i]
532
+ if m["role"] == "user":
533
+ user_content = m["content"] or ""
534
+ asst_content = ""
535
+ if i + 1 < len(messages) and messages[i + 1]["role"] == "assistant":
536
+ asst_content = messages[i + 1]["content"] or ""
537
+ i += 1
538
+ pairs.append((user_content, asst_content))
539
+ i += 1
540
+ return pairs
541
+
542
+
543
+ def _load_chat_history(notebook_id) -> tuple[list[tuple[str, str]], list[tuple[str, str]]]:
544
+ """Load chat for notebook. Returns (history_pairs, history_pairs) for State and Chatbot."""
545
+ if not notebook_id:
546
+ return [], []
547
+ messages = load_chat(notebook_id)
548
+ pairs = _chat_history_to_pairs(messages)
549
+ return pairs, pairs
550
+
551
+
552
+ def _on_chat_submit(query, notebook_id, chat_history, profile: gr.OAuthProfile | None):
553
+ """Handle chat submit: call RAG, return updated history."""
554
+ if not notebook_id:
555
+ return "", chat_history, "Select a notebook first."
556
+ if not query or not query.strip():
557
+ return "", chat_history, "Enter a message."
558
+ user_id = _user_id(profile)
559
+ if not user_id:
560
+ return "", chat_history, "Please sign in first."
561
+ try:
562
+ answer, updated = rag_chat(notebook_id, query.strip(), chat_history)
563
+ return "", updated, ""
564
+ except Exception as e:
565
+ return "", chat_history, f"Error: {e}"
566
 
567
  with gr.Blocks(
568
  title="NotebookLM Clone - Notebooks",
569
  theme=theme,
570
  css=CUSTOM_CSS,
571
  ) as demo:
572
+ with gr.Row(elem_classes=["header-bar"]):
573
+ gr.Markdown("### 📓 NotebookLM Clone")
574
+ login_btn = gr.LoginButton(value="🤗 Login with Hugging Face", size="lg")
575
+
576
+ with gr.Row(visible=False) as auth_info_row:
577
+ auth_text = gr.Markdown("", elem_id="auth-text")
578
+
579
+ gr.HTML("""
580
+ <div class="container hero-section">
581
+ <h1 class="hero-title">📓 NotebookLM Clone</h1>
582
+ <p class="hero-sub">Chat with your documents. Generate reports, quizzes, and podcasts with citations.</p>
583
+ </div>
584
+ """)
585
+
586
+ with gr.Column(visible=False, elem_classes=["login-container"]) as login_container:
587
+ gr.Markdown("**Sign in with Hugging Face to access your notebooks.**", elem_classes=["login-center"])
588
+
589
+ with gr.Column(visible=False) as app_content:
590
+ nb_state = gr.State([])
591
+ selected_notebook_id = gr.State(None)
592
+
593
+ with gr.Group(elem_classes=["create-strip"]):
594
+ with gr.Row(elem_classes=["create-row"]):
595
+ gr.Markdown("Create new notebook", elem_classes=["create-label"])
596
+ create_txt = gr.Textbox(
597
+ placeholder="Enter new notebook name",
598
+ show_label=False,
599
+ container=False,
600
+ value="",
601
+ )
602
+ create_btn = gr.Button("Create", variant="primary", size="sm")
603
+
604
+ with gr.Group(elem_classes=["section-card"]):
605
+ gr.Markdown("**Sources**", elem_classes=["section-title"])
606
+ gr.Markdown("*Upload PDFs, ingest URLs, or add text to your selected notebook*")
607
+ with gr.Row(elem_classes=["section-row"]):
608
+ pdf_upload_btn = gr.UploadButton(
609
+ "Upload PDFs",
610
+ file_types=[".pdf"],
611
+ file_count="multiple",
612
+ type="filepath",
613
+ variant="secondary",
614
+ )
615
+ with gr.Row(elem_classes=["section-row"]):
616
+ uploaded_pdf_dd = gr.Dropdown(
617
+ label="Uploaded PDFs",
618
+ choices=[],
619
+ value=None,
620
+ scale=3,
621
+ allow_custom_value=False,
622
+ )
623
+ remove_pdf_btn = gr.Button("Remove selected PDF", variant="stop", scale=1)
624
+ with gr.Row(elem_classes=["section-row"]):
625
+ url_txt = gr.Textbox(
626
+ label="Ingest web URL",
627
+ placeholder="https://example.com",
628
+ value="",
629
+ scale=3,
630
+ )
631
+ ingest_url_btn = gr.Button("Ingest URL", variant="primary", scale=1)
632
+ remove_url_btn = gr.Button("Delete URL", variant="stop", scale=1)
633
+
634
+ gr.HTML("<br>")
635
+ gr.Markdown("**Your Notebooks**", elem_classes=["section-title"])
636
+ gr.Markdown("*Selected notebook is used for chat and ingestion*", elem_id="sub-hint")
637
+ gr.HTML("<br>")
638
+
639
+ status = gr.Markdown("Sign in with Hugging Face to manage notebooks.", elem_classes=["status"])
640
+
641
+ @gr.render(inputs=[nb_state])
642
+ def render_notebooks(state):
643
+ if not state:
644
+ gr.Markdown("No notebooks yet. Create one to get started.")
645
+ else:
646
+ for i, (nb_id, name) in enumerate(state):
647
+ idx = i
648
+ with gr.Row(elem_classes=["notebook-card"]):
649
+ name_txt = gr.Textbox(value=name, show_label=False, scale=4, min_width=240, key=f"nb-name-{nb_id}")
650
+ select_btn = gr.Button("Select", variant="primary", scale=1, min_width=80, size="sm")
651
+ rename_btn = gr.Button("Rename", variant="secondary", scale=1, min_width=80, size="sm")
652
+ delete_btn = gr.Button("Delete", variant="secondary", scale=1, min_width=80, size="sm")
653
+
654
+ def on_select(nb_id=nb_id):
655
+ return nb_id
656
+
657
+ def on_select_status():
658
+ return "Selected notebook updated. Use this for chat/ingestion."
659
+
660
+ select_btn.click(
661
+ on_select,
662
+ inputs=None,
663
+ outputs=[selected_notebook_id],
664
+ ).then(on_select_status, None, [status]).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
665
+
666
+ rename_btn.click(
667
+ _safe_rename,
668
+ inputs=[gr.State(idx), name_txt, nb_state, selected_notebook_id],
669
+ outputs=[nb_state, selected_notebook_id, status],
670
+ api_name=False,
671
+ )
672
+
673
+ delete_btn.click(
674
+ _safe_delete,
675
+ inputs=[gr.State(idx), nb_state, selected_notebook_id],
676
+ outputs=[nb_state, selected_notebook_id, status],
677
+ api_name=False,
678
+ ).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
679
+
680
+ gr.HTML("<br>")
681
+
682
+ with gr.Group(elem_classes=["section-card"]):
683
+ gr.Markdown("**Add Text**", elem_classes=["section-title"])
684
+ gr.Markdown("*Select a notebook above, then paste or type your text*")
685
+ with gr.Row():
686
+ txt_title = gr.Textbox(
687
+ label="Title",
688
+ placeholder="Give this text a name (e.g. 'Lecture Notes Week 1')",
689
+ scale=1,
690
+ )
691
+ txt_input = gr.Textbox(
692
+ label="Text Content",
693
+ placeholder="Paste or type your text here...",
694
+ lines=10,
695
+ )
696
+ submit_btn = gr.Button("Save & Process", variant="primary")
697
+ upload_status = gr.Markdown("", elem_classes=["status"])
698
+ sources_display = gr.Markdown("")
699
+
700
+ with gr.Group(elem_classes=["section-card"]):
701
+ gr.Markdown("**Chat**", elem_classes=["section-title"])
702
+ gr.Markdown("*Ask questions about your notebook sources. Answers are grounded in retrieved chunks with citations.*")
703
+ chat_history_state = gr.State([])
704
+ chatbot = gr.Chatbot(label="Chat history", height=400)
705
+ chat_input = gr.Textbox(
706
+ label="Message",
707
+ placeholder="Ask a question about your sources...",
708
  show_label=False,
709
+ lines=2,
 
710
  )
711
+ chat_submit_btn = gr.Button("Send", variant="primary")
712
+ chat_status = gr.Markdown("", elem_classes=["status"])
 
 
 
713
 
714
+ demo.load(
715
+ _initial_load,
716
+ inputs=None,
717
+ outputs=[nb_state, selected_notebook_id, status, auth_text, auth_info_row, app_content, login_container],
718
+ api_name=False,
719
+ )
720
  demo.load(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd], api_name=False)
721
 
722
+ def _on_notebook_select_for_chat(notebook_id):
723
+ hist, _ = _load_chat_history(notebook_id)
724
+ return hist, hist
725
+
726
+ selected_notebook_id.change(
727
+ _on_notebook_select_for_chat,
728
+ inputs=[selected_notebook_id],
729
+ outputs=[chat_history_state, chatbot],
730
+ api_name=False,
731
+ )
732
+
733
  create_btn.click(
734
  _safe_create,
735
  inputs=[create_txt, nb_state, selected_notebook_id],
736
+ outputs=[create_txt, nb_state, selected_notebook_id, status],
737
  api_name=False,
738
  ).then(_list_uploaded_pdfs, inputs=[selected_notebook_id], outputs=[uploaded_pdf_dd])
739
 
 
903
  api_name=False,
904
  )
905
 
906
+ chat_submit_btn.click(
907
+ _on_chat_submit,
908
+ inputs=[chat_input, selected_notebook_id, chat_history_state],
909
+ outputs=[chat_input, chat_history_state, chat_status],
910
+ api_name=False,
911
+ ).then(
912
+ lambda h: (h, h),
913
+ inputs=[chat_history_state],
914
+ outputs=[chat_history_state, chatbot],
915
+ )
916
 
917
+ if __name__ == "__main__":
918
+ _log("5. Launching Gradio...")
919
+ demo.launch()
backend/embedding_service.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared embedding service - 384-dim vectors for RAG (ingestion + retrieval). Uses MiniLM for low memory."""
2
+
3
+ from sentence_transformers import SentenceTransformer
4
+
5
+ _MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
6
+ _model = None
7
+
8
+
9
+ def _get_model() -> SentenceTransformer:
10
+ """Lazy-load the embedding model."""
11
+ global _model
12
+ if _model is None:
13
+ _model = SentenceTransformer(_MODEL_NAME)
14
+ return _model
15
+
16
+
17
+ def encode(texts: list[str], task: str = "search_document") -> list[list[float]]:
18
+ """
19
+ Embed texts. Returns list of 384-dim vectors.
20
+
21
+ Args:
22
+ texts: List of strings to embed.
23
+ task: Unused (MiniLM doesn't need prefix); kept for API compatibility.
24
+ """
25
+ if not texts:
26
+ return []
27
+
28
+ model = _get_model()
29
+ embeddings = model.encode(texts, show_progress_bar=False)
30
+ return [e.tolist() for e in embeddings]
backend/ingestion_service.py CHANGED
@@ -1,10 +1,11 @@
1
- """PDF ingestion for RAG: extract text, chunk, and persist to chunks table."""
2
 
3
  from pathlib import Path
4
 
5
  from pypdf import PdfReader
6
 
7
  from backend.db import supabase
 
8
 
9
  import requests
10
  from bs4 import BeautifulSoup
@@ -39,7 +40,7 @@ def _chunk_text(text: str, chunk_size: int = DEFAULT_CHUNK_SIZE, overlap: int =
39
 
40
 
41
  def ingest_pdf_chunks(notebook_id: str, source_id: str, pdf_path: Path) -> int:
42
- """Extract and store chunks for a single PDF. Returns number of chunks inserted."""
43
  text = _extract_pdf_text(pdf_path)
44
  chunks = _chunk_text(text)
45
 
@@ -48,11 +49,14 @@ def ingest_pdf_chunks(notebook_id: str, source_id: str, pdf_path: Path) -> int:
48
  if not chunks:
49
  return 0
50
 
 
 
51
  rows = [
52
  {
53
  "notebook_id": notebook_id,
54
  "source_id": source_id,
55
  "content": chunk,
 
56
  "metadata": {
57
  "file_name": source_id,
58
  "file_path": str(pdf_path),
@@ -88,6 +92,7 @@ def _extract_url_text(url: str) -> str:
88
  return " ".join(text.split()).strip()
89
 
90
  def ingest_url_chunks(notebook_id: str, source_id: str, url: str) -> int:
 
91
  text = _extract_url_text(url)
92
  chunks = _chunk_text(text)
93
 
@@ -96,11 +101,14 @@ def ingest_url_chunks(notebook_id: str, source_id: str, url: str) -> int:
96
  if not chunks:
97
  return 0
98
 
 
 
99
  rows = [
100
  {
101
  "notebook_id": notebook_id,
102
  "source_id": source_id,
103
  "content": chunk,
 
104
  "metadata": {
105
  "url": url,
106
  "chunk_index": index,
 
1
+ """PDF ingestion for RAG: extract text, chunk, embed, and persist to chunks table."""
2
 
3
  from pathlib import Path
4
 
5
  from pypdf import PdfReader
6
 
7
  from backend.db import supabase
8
+ from backend.embedding_service import encode as embed_texts
9
 
10
  import requests
11
  from bs4 import BeautifulSoup
 
40
 
41
 
42
  def ingest_pdf_chunks(notebook_id: str, source_id: str, pdf_path: Path) -> int:
43
+ """Extract, embed, and store chunks for a single PDF. Returns number of chunks inserted."""
44
  text = _extract_pdf_text(pdf_path)
45
  chunks = _chunk_text(text)
46
 
 
49
  if not chunks:
50
  return 0
51
 
52
+ embeddings = embed_texts(chunks, task="search_document")
53
+
54
  rows = [
55
  {
56
  "notebook_id": notebook_id,
57
  "source_id": source_id,
58
  "content": chunk,
59
+ "embedding": embeddings[index],
60
  "metadata": {
61
  "file_name": source_id,
62
  "file_path": str(pdf_path),
 
92
  return " ".join(text.split()).strip()
93
 
94
  def ingest_url_chunks(notebook_id: str, source_id: str, url: str) -> int:
95
+ """Extract, embed, and store chunks for a URL. Returns number of chunks inserted."""
96
  text = _extract_url_text(url)
97
  chunks = _chunk_text(text)
98
 
 
101
  if not chunks:
102
  return 0
103
 
104
+ embeddings = embed_texts(chunks, task="search_document")
105
+
106
  rows = [
107
  {
108
  "notebook_id": notebook_id,
109
  "source_id": source_id,
110
  "content": chunk,
111
+ "embedding": embeddings[index],
112
  "metadata": {
113
  "url": url,
114
  "chunk_index": index,
backend/ingestion_txt.py CHANGED
@@ -17,6 +17,9 @@ from sentence_transformers import SentenceTransformer
17
  # Load model once at module level (not on every call)
18
  _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
19
  # Constants
 
 
 
20
  MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
21
 
22
 
@@ -114,16 +117,14 @@ def chunk_text(text: str, source_id: str, notebook_id: str, filename: str = "")
114
  # Embed + Store
115
  def embed_and_store_chunks(chunks: list[dict]) -> None:
116
  """
117
- Embed chunks using sentence-transformers and store in pgvector.
118
  """
119
  if not chunks:
120
  return
121
 
122
- # Embed all chunks in one batch
123
  texts = [c["content"] for c in chunks]
124
- embeddings = _model.encode(texts, show_progress_bar=False)
125
 
126
- # Build rows for Supabase insert
127
  rows = []
128
  for chunk, embedding in zip(chunks, embeddings):
129
  rows.append({
@@ -131,7 +132,7 @@ def embed_and_store_chunks(chunks: list[dict]) -> None:
131
  "source_id": str(chunk["source_id"]),
132
  "notebook_id": str(chunk["notebook_id"]),
133
  "content": chunk["content"],
134
- "embedding": embedding.tolist(),
135
  "metadata": chunk["metadata"]
136
  })
137
 
 
17
  # Load model once at module level (not on every call)
18
  _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
19
  # Constants
20
+ from backend.embedding_service import encode as embed_texts
21
+ # ── Constants ────────────────────────────────────────────────
22
+
23
  MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
24
 
25
 
 
117
  # Embed + Store
118
  def embed_and_store_chunks(chunks: list[dict]) -> None:
119
  """
120
+ Embed chunks using the shared 384-dim MiniLM model and store in pgvector.
121
  """
122
  if not chunks:
123
  return
124
 
 
125
  texts = [c["content"] for c in chunks]
126
+ embeddings = embed_texts(texts, task="search_document")
127
 
 
128
  rows = []
129
  for chunk, embedding in zip(chunks, embeddings):
130
  rows.append({
 
132
  "source_id": str(chunk["source_id"]),
133
  "notebook_id": str(chunk["notebook_id"]),
134
  "content": chunk["content"],
135
+ "embedding": embedding,
136
  "metadata": chunk["metadata"]
137
  })
138
 
backend/rag_service.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RAG chat service - retrieve chunks, call LLM, persist messages."""
2
+
3
+ import os
4
+ import re
5
+
6
+ from openai import OpenAI
7
+
8
+ from backend.chat_service import save_message, load_chat
9
+ from backend.retrieval_service import retrieve_chunks
10
+
11
+ MAX_HISTORY_MESSAGES = 20
12
+ # Together AI - you have recent usage. Or :groq for Groq.
13
+ DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct:together"
14
+ TOP_K = 5
15
+
16
+ _client: OpenAI | None = None
17
+
18
+
19
def _get_client() -> OpenAI:
    """Return a cached OpenAI client pointed at the Hugging Face router.

    The client is constructed once and reused across calls.

    Raises:
        RuntimeError: if the HF_TOKEN environment variable is not set.
            Previously a missing token silently produced ``api_key=None``
            and an opaque authentication failure on the first completion
            request; failing fast here yields a clear message (rag_chat's
            try/except still surfaces it as a chat-visible error).
    """
    global _client
    if _client is None:
        token = os.getenv("HF_TOKEN")
        if not token:
            raise RuntimeError(
                "HF_TOKEN environment variable is not set; it is required "
                "to authenticate against the Hugging Face router."
            )
        _client = OpenAI(
            base_url="https://router.huggingface.co/v1",
            api_key=token,
        )
    return _client
28
+
29
+
30
+ def _validate_citations(text: str, num_chunks: int) -> str:
31
+ """Strip or fix citation numbers [N] where N > num_chunks."""
32
+ if num_chunks <= 0:
33
+ return text
34
+
35
+ def replace_citation(match):
36
+ n = int(match.group(1))
37
+ if 1 <= n <= num_chunks:
38
+ return match.group(0)
39
+ return ""
40
+
41
+ return re.sub(r"\[(\d+)\]", replace_citation, text)
42
+
43
+
44
def rag_chat(notebook_id: str, query: str, chat_history: list) -> tuple[str, list]:
    """
    RAG chat: retrieve chunks, build prompt, call LLM, persist, return answer and updated history.

    Args:
        notebook_id: Notebook whose chunks are searched and whose chat log
            receives both turns of this exchange.
        query: The user's new message.
        chat_history: list of [user_msg, assistant_msg] pairs (Gradio Chatbot format).

    Returns:
        (assistant_reply, updated_history) — updated_history is a new list;
        the caller's chat_history is not mutated in place.
    """
    # Persist the user turn immediately so it is recorded even if retrieval
    # or the LLM call fails below.
    save_message(notebook_id, "user", query)

    chunks = retrieve_chunks(notebook_id, query, top_k=TOP_K)

    # Number the passages [1]..[k]; the system prompt instructs the model to
    # cite those numbers, and _validate_citations strips out-of-range ones.
    context_parts = []
    for i, c in enumerate(chunks, 1):
        context_parts.append(f"[{i}] {c['content']}")
    context = "\n\n".join(context_parts) if context_parts else "(No relevant sources found.)"

    system_content = (
        "You are a helpful assistant. Answer ONLY from the provided context. "
        "Cite sources using [1], [2], etc. corresponding to the numbered passages. "
        "If the answer is not in the context, say so clearly.\n\n"
        f"Context:\n{context}"
    )

    # Truncate history to last MAX_HISTORY_MESSAGES (pairs -> 2*N messages)
    max_pairs = MAX_HISTORY_MESSAGES // 2
    truncated = chat_history[-max_pairs:] if len(chat_history) > max_pairs else chat_history

    messages = [{"role": "system", "content": system_content}]
    for user_msg, asst_msg in truncated:
        # Skip empty slots (e.g. a pair whose assistant reply is still pending).
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if asst_msg:
            messages.append({"role": "assistant", "content": asst_msg})
    messages.append({"role": "user", "content": query})

    try:
        client = _get_client()
        response = client.chat.completions.create(
            model=DEFAULT_MODEL,
            messages=messages,
            max_tokens=512,
        )
        raw_answer = response.choices[0].message.content or ""
        answer = _validate_citations(raw_answer, len(chunks))
    except Exception as e:
        # Broad catch is deliberate: surface any provider/network error as a
        # chat message instead of crashing the Gradio callback.
        answer = f"Error calling model: {e}"

    # Persist the assistant turn (or the error text) alongside the user turn.
    save_message(notebook_id, "assistant", answer)

    updated_history = chat_history + [[query, answer]]
    return answer, updated_history
backend/retrieval_service.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Retrieval service - vector similarity search for RAG."""
2
+
3
+ from backend.db import supabase
4
+ from backend.embedding_service import encode
5
+
6
+
7
def retrieve_chunks(notebook_id: str, query: str, top_k: int = 5) -> list[dict]:
    """Vector-search the chunks table for passages closest to *query*.

    Embeds the query and calls the ``match_chunks`` Postgres RPC,
    restricting results to the given notebook.

    Returns:
        Up to ``top_k`` dicts with keys ``id``, ``content``, ``metadata``
        and ``similarity``. A blank query — or any failure during the
        RPC round-trip — yields an empty list so callers degrade
        gracefully instead of crashing.
    """
    cleaned = (query or "").strip()
    if not cleaned:
        return []

    embedded_query = encode([cleaned], task="search_query")[0]

    params = {
        "query_embedding": embedded_query,
        "match_count": top_k,
        "p_notebook_id": notebook_id,
    }
    try:
        response = supabase.rpc("match_chunks", params).execute()
        hits = []
        for row in response.data or []:
            hits.append(
                {
                    "id": str(row["id"]),
                    "content": row["content"],
                    "metadata": row.get("metadata") or {},
                    "similarity": float(row.get("similarity", 0)),
                }
            )
        return hits
    except Exception:
        # Best-effort retrieval: treat any database error as "no matches".
        return []
db/migrate_to_384.sql ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Migration: Switch from 1536-dim to 384-dim embeddings (MiniLM)
-- Run this in Supabase SQL Editor if you already have the chunks table with vector(1536)
-- Destructive: step 2 deletes all stored chunk rows, so every source must be
-- re-ingested after this migration completes.

-- 1. Drop the ivfflat index (required before altering column)
drop index if exists idx_chunks_embedding;

-- 2. Clear existing chunks (old 1536-dim embeddings are incompatible)
truncate table chunks;

-- 3. Replace embedding column with 384-dim version
alter table chunks drop column embedding;
alter table chunks add column embedding vector(384);

-- 4. Recreate the ivfflat index (run AFTER ingesting new PDF/TXT - requires rows)
-- create index if not exists idx_chunks_embedding on chunks using ivfflat (embedding vector_cosine_ops) with (lists = 100);

-- 5. Update match_chunks RPC
-- Returns the match_count chunks of one notebook nearest to query_embedding.
-- <=> is pgvector's cosine-distance operator; 1 - distance converts it to a
-- similarity score where higher means closer.
create or replace function match_chunks(
  query_embedding vector(384),
  match_count int,
  p_notebook_id uuid
)
returns table (id uuid, content text, metadata jsonb, similarity float)
language plpgsql as $$
begin
  return query
  select c.id, c.content, c.metadata,
         1 - (c.embedding <=> query_embedding) as similarity
  from chunks c
  where c.notebook_id = p_notebook_id
    and c.embedding is not null
  order by c.embedding <=> query_embedding
  limit match_count;
end;
$$;
db/schema.sql CHANGED
@@ -33,19 +33,40 @@ create index if not exists idx_artifacts_notebook_id on artifacts(notebook_id);
33
  -- pgvector extension for embeddings
34
  create extension if not exists vector;
35
 
36
- -- chunks with embeddings (for RAG)
37
  create table if not exists chunks (
38
  id uuid primary key default gen_random_uuid(),
39
  notebook_id uuid not null references notebooks(id) on delete cascade,
40
  source_id text,
41
  content text not null,
42
- embedding vector(1536),
43
  metadata jsonb,
44
  created_at timestamptz default now()
45
  );
46
  create index if not exists idx_chunks_notebook_id on chunks(notebook_id);
47
- -- Vector index (run after you have data; ivfflat requires rows):
48
- -- create index idx_chunks_embedding on chunks using ivfflat (embedding vector_cosine_ops) with (lists = 100);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  -- sources table (ingestion pipeline)
51
  create table if not exists sources (
 
33
  -- pgvector extension for embeddings
34
  create extension if not exists vector;
35
 
36
+ -- chunks with embeddings (for RAG) - 384 dims for MiniLM
37
  create table if not exists chunks (
38
  id uuid primary key default gen_random_uuid(),
39
  notebook_id uuid not null references notebooks(id) on delete cascade,
40
  source_id text,
41
  content text not null,
42
+ embedding vector(384),
43
  metadata jsonb,
44
  created_at timestamptz default now()
45
  );
46
  create index if not exists idx_chunks_notebook_id on chunks(notebook_id);
47
+
48
-- Vector index for fast similarity search. NOTE: ivfflat trains its lists on
-- the rows present at creation time — building it on an empty table gives poor
-- recall; prefer creating it after initial ingestion (see db/migrate_to_384.sql).
49
+ create index if not exists idx_chunks_embedding on chunks using ivfflat (embedding vector_cosine_ops) with (lists = 100);
50
+
51
-- RPC for RAG retrieval: top-k chunks by cosine similarity, filtered by notebook_id
-- (same definition as in db/migrate_to_384.sql — keep the two copies in sync).
create or replace function match_chunks(
  query_embedding vector(384),
  match_count int,
  p_notebook_id uuid
)
returns table (id uuid, content text, metadata jsonb, similarity float)
language plpgsql as $$
begin
  return query
  -- 1 - cosine distance (<=>) = cosine similarity; ordering by distance
  -- ascending returns the closest chunks first. Null embeddings are skipped.
  select c.id, c.content, c.metadata,
         1 - (c.embedding <=> query_embedding) as similarity
  from chunks c
  where c.notebook_id = p_notebook_id
    and c.embedding is not null
  order by c.embedding <=> query_embedding
  limit match_count;
end;
$$;
70
 
71
  -- sources table (ingestion pipeline)
72
  create table if not exists sources (
requirements.txt CHANGED
@@ -1,10 +1,12 @@
1
  gradio[oauth]==4.44.1
2
  huggingface_hub==0.24.7
 
3
  supabase>=2.0.0
4
  python-dotenv>=1.0.0
5
  realtime==2.3.0
6
  chardet>=5.0.0
7
  sentence-transformers>=2.0.0
 
8
  pypdf>=4.2.0
9
  beautifulsoup4>=4.12.3
10
  pyttsx3>=2.90
 
1
  gradio[oauth]==4.44.1
2
  huggingface_hub==0.24.7
3
+ openai>=1.0.0
4
  supabase>=2.0.0
5
  python-dotenv>=1.0.0
6
  realtime==2.3.0
7
  chardet>=5.0.0
8
  sentence-transformers>=2.0.0
9
+ einops>=0.7.0
10
  pypdf>=4.2.0
11
  beautifulsoup4>=4.12.3
12
  pyttsx3>=2.90
run.bat ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off
REM Gradio has issues with Python 3.13 - use 3.10, 3.11, or 3.12
REM Probe the Windows launcher for each supported version in order.
REM "py -X.Y --version" exits non-zero when that interpreter is absent,
REM so "&&" only takes the goto when the version actually exists.
echo Checking for Python 3.10/3.11/3.12...
py -3.10 --version 2>nul && goto run310
py -3.11 --version 2>nul && goto run311
py -3.12 --version 2>nul && goto run312
REM No supported interpreter found: explain and bail out with an error code.
echo.
echo Python 3.10, 3.11, or 3.12 not found.
echo Gradio does NOT work with Python 3.13.
echo Install Python 3.10 from https://www.python.org/downloads/
pause
exit /b 1

REM Each label installs dependencies quietly (-q) and then runs the app
REM under the matching interpreter.
:run310
echo Using Python 3.10
py -3.10 -m pip install -r requirements.txt -q
py -3.10 app.py
goto end

:run311
echo Using Python 3.11
py -3.11 -m pip install -r requirements.txt -q
py -3.11 app.py
goto end

:run312
echo Using Python 3.12
py -3.12 -m pip install -r requirements.txt -q
py -3.12 app.py
goto end

:end
REM Keep the console window open so the user can read any output.
pause