Hitakshi26 committed on
Commit
8593064
·
1 Parent(s): ca39256

Testing Flow

Browse files
README.md CHANGED
@@ -1,12 +1,39 @@
1
  ---
2
- title: GPP1 - NotebookLM Clone
3
  emoji: 📓
4
- colorFrom: indigo
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: "4.44.1"
8
- python_version: "3.10"
9
  app_file: app.py
10
  pinned: false
11
  hf_oauth: true
12
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: NotebookLM Clone (GPP1)
3
  emoji: 📓
4
+ colorFrom: blue
5
+ colorTo: pink
6
  sdk: gradio
7
  sdk_version: "4.44.1"
 
8
  app_file: app.py
9
  pinned: false
10
  hf_oauth: true
11
+ ---
12
+
13
+ # NotebookLM Clone (HF OAuth + Chroma + RAG)
14
+
15
+ ## Overview
16
+ This project is a simplified clone of Google NotebookLM. Users can create multiple notebooks, upload sources (PDF/PPTX/TXT/URL), chat with their sources using Retrieval-Augmented Generation (RAG) with citations, and generate study artifacts (report, quiz, podcast).
17
+
18
+ ## Features
19
+ - HF OAuth login (per-user isolation)
20
+ - Multi-notebook support: create/rename/delete
21
+ - Ingestion: PDF / PPTX / TXT / URL
22
+ - Chunking + Embedding (Sentence-Transformers all-MiniLM-L6-v2)
23
+ - Vector search using ChromaDB (persistent per notebook)
24
+ - Chat with citations
25
+ - Artifact generation:
26
+ - report (.md)
27
+ - quiz with answer key (.md)
28
+ - podcast transcript (.md) + audio (.mp3)
29
+
30
+ ## Environment Variables
31
+ ### Hugging Face Space
32
+ - DATA_ROOT=/data
33
+
34
+ ## Local Dev
35
+ 1. Create and activate a virtual environment, then install the dependencies:
36
+ - pip install -r requirements.txt
37
+ 2. Run:
38
+ - python app.py
39
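+ Note: HF OAuth is best tested in a Space. When run locally (neither `SPACE_ID` nor `HF_SPACE_ID` set), the app falls back to the `localuser` account.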
app.py CHANGED
@@ -1,7 +1,12 @@
1
  import os
2
 
3
- # Disable Chroma telemetry noise
4
- os.environ["ANONYMIZED_TELEMETRY"] = "FALSE"
 
 
 
 
 
5
 
6
  from src.frontend.ui import build_app
7
 
@@ -12,4 +17,4 @@ if __name__ == "__main__":
12
  server_name="0.0.0.0",
13
  server_port=int(os.getenv("PORT", "7860")),
14
  show_api=False,
15
- )
 
1
  import os
2
 
3
+ # ----- Disable telemetry / analytics noise -----
4
+ # Gradio analytics (UI usage pings)
5
+ os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
6
+ # HF hub telemetry (optional)
7
+ os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
8
+ # Chroma telemetry (we also disable via Settings in chroma_store.py)
9
+ os.environ["ANONYMIZED_TELEMETRY"] = "False"
10
 
11
  from src.frontend.ui import build_app
12
 
 
17
  server_name="0.0.0.0",
18
  server_port=int(os.getenv("PORT", "7860")),
19
  show_api=False,
20
+ )
src/backend/__pycache__/artifacts.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/artifacts.cpython-310.pyc and b/src/backend/__pycache__/artifacts.cpython-310.pyc differ
 
src/backend/__pycache__/auth.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/auth.cpython-310.pyc and b/src/backend/__pycache__/auth.cpython-310.pyc differ
 
src/backend/__pycache__/ingest.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/ingest.cpython-310.pyc and b/src/backend/__pycache__/ingest.cpython-310.pyc differ
 
src/backend/__pycache__/llm.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/llm.cpython-310.pyc and b/src/backend/__pycache__/llm.cpython-310.pyc differ
 
src/backend/__pycache__/notebooks.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/notebooks.cpython-310.pyc and b/src/backend/__pycache__/notebooks.cpython-310.pyc differ
 
src/backend/__pycache__/rag.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/rag.cpython-310.pyc and b/src/backend/__pycache__/rag.cpython-310.pyc differ
 
src/backend/auth.py CHANGED
@@ -2,32 +2,51 @@ import os
2
  import gradio as gr
3
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  def require_login(request: gr.Request) -> str:
6
  """
7
- Hugging Face Spaces OAuth provides user info via request in some Gradio versions,
8
- but not always. We use multiple fallbacks:
9
- 1) request.username (best case)
10
- 2) HF-proxy headers (x-forwarded-*)
11
- 3) local/dev fallback
12
  """
13
- # 1) Best-case Gradio field
14
  username = getattr(request, "username", None)
15
  if username:
16
  return str(username)
17
 
18
- # 2) Fallback: HF spaces headers (varies by proxy/version)
19
- headers = getattr(request, "headers", {}) or {}
 
20
  for key in [
21
  "x-forwarded-user",
22
  "x-hf-user",
23
  "x-forwarded-preferred-username",
24
  "x-auth-request-preferred-username",
25
  ]:
26
- if key in headers and headers[key]:
27
- return str(headers[key])
 
28
 
29
- # 3) Optional local fallback (so app doesn't hard-crash during dev)
30
- if os.getenv("HF_SPACE_ID") is None:
 
31
  return "localuser"
32
 
33
- raise gr.Error("Please log in using 'Sign in with Hugging Face' to use this app.")
 
2
  import gradio as gr
3
 
4
 
5
+ def _get_headers_dict(request: gr.Request) -> dict:
6
+ h = getattr(request, "headers", None) or {}
7
+ # Normalize keys to lowercase
8
+ out = {}
9
+ try:
10
+ for k, v in dict(h).items():
11
+ out[str(k).lower()] = v
12
+ except Exception:
13
+ return {}
14
+ return out
15
+
16
+
17
+ def _first_value(v):
18
+ # Some frameworks store header values as lists
19
+ if isinstance(v, (list, tuple)) and v:
20
+ return v[0]
21
+ return v
22
+
23
+
24
def require_login(request: gr.Request) -> str:
    """Resolve the username for the current request.

    Lookup order:
      1. The request's ``username`` attribute, when truthy.
      2. The first truthy value among a fixed list of proxy headers.
      3. ``"localuser"`` when neither ``SPACE_ID`` nor ``HF_SPACE_ID`` is set
         in the environment (presumably a local/dev run).

    Raises:
        gr.Error: If no username was found and one of those environment
            variables is set.
    """
    # 1) Attribute populated directly on the request
    direct_name = getattr(request, "username", None)
    if direct_name:
        return str(direct_name)

    # 2) Proxy headers, checked in priority order
    header_map = _get_headers_dict(request)
    candidate_headers = (
        "x-forwarded-user",
        "x-hf-user",
        "x-forwarded-preferred-username",
        "x-auth-request-preferred-username",
    )
    for header_name in candidate_headers:
        candidate = _first_value(header_map.get(header_name))
        if candidate:
            return str(candidate)

    # 3) Local/dev fallback so the app does not hard-fail outside a Space
    space_env_present = (
        os.getenv("SPACE_ID") is not None or os.getenv("HF_SPACE_ID") is not None
    )
    if not space_env_present:
        return "localuser"

    raise gr.Error("Please log in using 'Sign in with Hugging Face' to use this app.")
src/backend/ingest.py CHANGED
@@ -1,4 +1,6 @@
1
- import os, pathlib
 
 
2
  import requests
3
  from bs4 import BeautifulSoup
4
  from pypdf import PdfReader
@@ -11,6 +13,17 @@ from src.utils.text import safe_name
11
 
12
  EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
13
 
 
 
 
 
 
 
 
 
 
 
 
14
  def simple_chunk(text: str, max_chars=2200, overlap=250):
15
  text = "\n".join([ln.strip() for ln in (text or "").splitlines() if ln.strip()]).strip()
16
  if not text:
@@ -21,19 +34,25 @@ def simple_chunk(text: str, max_chars=2200, overlap=250):
21
  while start < len(text):
22
  end = min(len(text), start + max_chars)
23
  out.append(text[start:end])
24
- if end == len(text): break
 
25
  start = max(0, end - overlap)
26
  return out
27
 
 
 
 
 
28
  def extract_pdf(path: str):
29
  reader = PdfReader(path)
30
  items = []
31
  for i, page in enumerate(reader.pages):
32
  txt = (page.extract_text() or "").strip()
33
  if txt:
34
- items.append({"text": txt, "page": i+1})
35
  return items
36
 
 
37
  def extract_pptx(path: str):
38
  prs = Presentation(path)
39
  items = []
@@ -44,41 +63,67 @@ def extract_pptx(path: str):
44
  texts.append(shape.text)
45
  txt = "\n".join(t.strip() for t in texts if t.strip()).strip()
46
  if txt:
47
- items.append({"text": txt, "slide": i+1})
48
  return items
49
 
 
50
  def extract_txt(path: str):
51
  with open(path, "r", encoding="utf-8", errors="ignore") as f:
52
  txt = f.read().strip()
53
- return [{"text": txt, "page": None}] if txt else []
 
54
 
55
  def extract_url(url: str):
56
  r = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
57
  r.raise_for_status()
58
  soup = BeautifulSoup(r.text, "html.parser")
59
- for tag in soup(["script","style","noscript"]):
60
  tag.decompose()
61
  text = soup.get_text("\n")
62
  text = "\n".join([ln.strip() for ln in text.splitlines() if ln.strip()])
63
- return [{"text": text[:200000], "page": None}]
64
-
65
- def upsert_extracted(username: str, notebook_id: str, source_title: str, source_id: str, extracted_items: list[dict]) -> int:
 
 
 
 
 
 
 
 
 
 
 
66
  col = get_collection(username, notebook_id)
 
67
  ids, docs, metas = [], [], []
68
 
69
- for item in extracted_items:
70
- page = item.get("page", None)
71
- slide = item.get("slide", None)
 
 
 
 
 
 
 
 
 
 
72
 
73
- for j, ch in enumerate(simple_chunk(item["text"])):
74
- ids.append(f"{source_id}::chunk{j}")
 
 
75
  docs.append(ch)
76
 
77
  meta = {
78
  "source_title": str(source_title),
79
  "source_id": str(source_id),
80
  }
81
- # IMPORTANT: Chroma metadata cannot include None
82
  if page is not None:
83
  meta["page"] = int(page)
84
  if slide is not None:
@@ -94,36 +139,32 @@ def upsert_extracted(username: str, notebook_id: str, source_title: str, source_
94
  return len(docs)
95
 
96
 
97
- def ingest_files(username: str, notebook_id: str, filepaths) -> int:
98
- """
99
- filepaths may be:
100
- - list[str]
101
- - list[Gradio file objects]
102
- """
103
  ensure_tree(username, notebook_id)
 
104
  raw_dir = os.path.join(nb_root(username, notebook_id), "files_raw")
105
  ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
 
 
 
 
106
  added = 0
107
 
108
- # Normalize gradio file objects -> local paths
109
- normalized_paths = []
110
- if isinstance(filepaths, (list, tuple)):
111
- for f in filepaths:
112
- if f is None:
113
- continue
114
- # Gradio may pass objects with .name
115
- if hasattr(f, "name") and isinstance(f.name, str):
116
- normalized_paths.append(f.name)
117
- elif isinstance(f, str):
118
- normalized_paths.append(f)
119
- elif isinstance(f, dict) and "name" in f:
120
- normalized_paths.append(f["name"])
121
- elif isinstance(filepaths, str):
122
- normalized_paths = [filepaths]
123
-
124
- for fp in normalized_paths:
125
- dest = os.path.join(raw_dir, os.path.basename(fp))
126
- pathlib.Path(dest).write_bytes(pathlib.Path(fp).read_bytes())
127
 
128
  ext = os.path.splitext(dest)[1].lower()
129
  if ext == ".pdf":
@@ -135,24 +176,27 @@ def ingest_files(username: str, notebook_id: str, filepaths) -> int:
135
  else:
136
  continue
137
 
138
- # save extracted
139
- ex_path = os.path.join(ex_dir, os.path.basename(dest) + ".txt")
140
- with open(ex_path, "w", encoding="utf-8") as f:
141
  for item in extracted:
 
142
  if item.get("page") is not None:
143
  loc = f"page={item.get('page')}"
144
  elif item.get("slide") is not None:
145
  loc = f"slide={item.get('slide')}"
146
- else:
147
- loc = ""
148
- f.write(f"\n--- {loc} ---\n{item['text']}\n")
 
 
149
 
150
  added += upsert_extracted(
151
- username,
152
- notebook_id,
153
- os.path.basename(dest),
154
- f"file:{os.path.basename(dest)}",
155
- extracted,
156
  )
157
 
158
  return added
@@ -160,9 +204,24 @@ def ingest_files(username: str, notebook_id: str, filepaths) -> int:
160
 
161
  def ingest_url(username: str, notebook_id: str, url: str) -> int:
162
  ensure_tree(username, notebook_id)
 
163
  extracted = extract_url(url)
164
  ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
165
- fname = safe_name(url.replace("https://","").replace("http://","").replace("/","_")) + ".txt"
 
 
 
166
  with open(os.path.join(ex_dir, fname), "w", encoding="utf-8") as f:
167
- f.write(extracted[0]["text"])
168
- return upsert_extracted(username, notebook_id, url, f"url:{url}", extracted)
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pathlib
3
+ import hashlib
4
  import requests
5
  from bs4 import BeautifulSoup
6
  from pypdf import PdfReader
 
13
 
14
  EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
15
 
16
+
17
+ # -------------------------
18
+ # Helpers
19
+ # -------------------------
20
+ def _sha10_bytes(b: bytes) -> str:
21
+ return hashlib.sha256(b).hexdigest()[:10]
22
+
23
+ def _sha10_text(s: str) -> str:
24
+ return hashlib.sha256((s or "").encode("utf-8", errors="ignore")).hexdigest()[:10]
25
+
26
+
27
  def simple_chunk(text: str, max_chars=2200, overlap=250):
28
  text = "\n".join([ln.strip() for ln in (text or "").splitlines() if ln.strip()]).strip()
29
  if not text:
 
34
  while start < len(text):
35
  end = min(len(text), start + max_chars)
36
  out.append(text[start:end])
37
+ if end == len(text):
38
+ break
39
  start = max(0, end - overlap)
40
  return out
41
 
42
+
43
+ # -------------------------
44
+ # Extractors
45
+ # -------------------------
46
def extract_pdf(path: str):
    """Extract per-page text from the PDF at *path*.

    Returns:
        A list of ``{"text": ..., "page": ...}`` dicts with 1-based page
        numbers. Pages whose extracted text is empty after stripping are
        omitted.
    """
    items = []
    for page_number, page in enumerate(PdfReader(path).pages, start=1):
        page_text = (page.extract_text() or "").strip()
        if not page_text:
            continue
        items.append({"text": page_text, "page": page_number})
    return items
54
 
55
+
56
  def extract_pptx(path: str):
57
  prs = Presentation(path)
58
  items = []
 
63
  texts.append(shape.text)
64
  txt = "\n".join(t.strip() for t in texts if t.strip()).strip()
65
  if txt:
66
+ items.append({"text": txt, "slide": i + 1})
67
  return items
68
 
69
+
70
def extract_txt(path: str):
    """Read the text file at *path* and return it as one extracted item.

    The file is decoded as UTF-8 with undecodable bytes ignored, and the
    content is stripped of surrounding whitespace.

    Returns:
        ``[{"text": content}]``, or an empty list when the stripped content
        is empty.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        content = handle.read().strip()
    if not content:
        return []
    return [{"text": content}]
74
+
75
 
76
def extract_url(url: str):
    """Fetch *url* and return its visible text as a single extracted item.

    The page is requested with a 15-second timeout and a browser-like
    User-Agent. ``script``, ``style`` and ``noscript`` elements are removed,
    blank lines are dropped, and each remaining line is stripped.

    Returns:
        A one-element list ``[{"text": ...}]`` with the text truncated to
        200000 characters.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status()`` on an
            error response.
    """
    response = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    for node in soup(["script", "style", "noscript"]):
        node.decompose()

    stripped_lines = (line.strip() for line in soup.get_text("\n").splitlines())
    text = "\n".join(line for line in stripped_lines if line)

    # Hard cap so a very large page does not produce an unbounded number of chunks.
    return [{"text": text[:200000]}]
86
+
87
+
88
+ # -------------------------
89
+ # Chroma upsert
90
+ # -------------------------
91
+ def upsert_extracted(
92
+ username: str,
93
+ notebook_id: str,
94
+ source_title: str,
95
+ source_id: str,
96
+ extracted_items: list[dict],
97
+ ) -> int:
98
  col = get_collection(username, notebook_id)
99
+
100
  ids, docs, metas = [], [], []
101
 
102
+ for item_idx, item in enumerate(extracted_items):
103
+ page = item.get("page")
104
+ slide = item.get("slide")
105
+
106
+ # stable location string (never None)
107
+ if page is not None:
108
+ loc = f"p{int(page)}"
109
+ elif slide is not None:
110
+ loc = f"s{int(slide)}"
111
+ else:
112
+ loc = f"item{item_idx}"
113
+
114
+ chunks = simple_chunk(item.get("text", ""))
115
 
116
+ for chunk_idx, ch in enumerate(chunks):
117
+ # ✅ unique per (source + loc + chunk)
118
+ cid = f"{source_id}::{loc}::chunk{chunk_idx}"
119
+ ids.append(cid)
120
  docs.append(ch)
121
 
122
  meta = {
123
  "source_title": str(source_title),
124
  "source_id": str(source_id),
125
  }
126
+ # Chroma metadata cannot contain None → only set if present
127
  if page is not None:
128
  meta["page"] = int(page)
129
  if slide is not None:
 
139
  return len(docs)
140
 
141
 
142
+ # -------------------------
143
+ # Public API used by callbacks.py
144
+ # -------------------------
145
+ def ingest_files(username: str, notebook_id: str, files) -> int:
 
 
146
  ensure_tree(username, notebook_id)
147
+
148
  raw_dir = os.path.join(nb_root(username, notebook_id), "files_raw")
149
  ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
150
+
151
+ os.makedirs(raw_dir, exist_ok=True)
152
+ os.makedirs(ex_dir, exist_ok=True)
153
+
154
  added = 0
155
 
156
+ for f in (files or []):
157
+ fp = getattr(f, "name", None)
158
+ if not fp:
159
+ continue
160
+
161
+ # copy uploaded file to raw_dir
162
+ src_path = pathlib.Path(fp)
163
+ file_bytes = src_path.read_bytes()
164
+
165
+ base = os.path.basename(fp)
166
+ dest = os.path.join(raw_dir, base)
167
+ pathlib.Path(dest).write_bytes(file_bytes)
 
 
 
 
 
 
 
168
 
169
  ext = os.path.splitext(dest)[1].lower()
170
  if ext == ".pdf":
 
176
  else:
177
  continue
178
 
179
+ # save extracted text
180
+ ex_path = os.path.join(ex_dir, base + ".txt")
181
+ with open(ex_path, "w", encoding="utf-8") as ftxt:
182
  for item in extracted:
183
+ loc = ""
184
  if item.get("page") is not None:
185
  loc = f"page={item.get('page')}"
186
  elif item.get("slide") is not None:
187
  loc = f"slide={item.get('slide')}"
188
+ ftxt.write(f"\n--- {loc} ---\n{item.get('text','')}\n")
189
+
190
+ # Add a hash so repeated ingest of same filename won't collide
191
+ file_hash = _sha10_bytes(file_bytes)
192
+ source_id = f"file:{base}:{file_hash}"
193
 
194
  added += upsert_extracted(
195
+ username=username,
196
+ notebook_id=notebook_id,
197
+ source_title=base,
198
+ source_id=source_id,
199
+ extracted_items=extracted,
200
  )
201
 
202
  return added
 
204
 
205
def ingest_url(username: str, notebook_id: str, url: str) -> int:
    """Fetch *url*, save its extracted text, and upsert it into the notebook.

    The extracted text is written to the notebook's ``files_extracted``
    directory under a filename derived from the URL, then handed to
    ``upsert_extracted`` with a source id that embeds a hash of the text.

    Returns:
        Whatever ``upsert_extracted`` returns for the extracted items.
    """
    ensure_tree(username, notebook_id)

    extracted = extract_url(url)
    ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
    os.makedirs(ex_dir, exist_ok=True)

    # Save the extracted page text under a filesystem-friendly name.
    without_scheme = url.replace("https://", "").replace("http://", "")
    fname = safe_name(without_scheme.replace("/", "_")) + ".txt"
    target_path = os.path.join(ex_dir, fname)
    with open(target_path, "w", encoding="utf-8") as out:
        page_text = extracted[0].get("text", "")
        out.write(page_text)

    # The source id includes a hash of the text, so re-ingesting the same URL
    # with different content gets a different id.
    source_id = f"url:{url}:{_sha10_text(page_text)}"

    return upsert_extracted(
        username=username,
        notebook_id=notebook_id,
        source_title=url,
        source_id=source_id,
        extracted_items=extracted,
    )
src/frontend/__pycache__/callbacks.cpython-310.pyc CHANGED
Binary files a/src/frontend/__pycache__/callbacks.cpython-310.pyc and b/src/frontend/__pycache__/callbacks.cpython-310.pyc differ
 
src/frontend/__pycache__/ui.cpython-310.pyc CHANGED
Binary files a/src/frontend/__pycache__/ui.cpython-310.pyc and b/src/frontend/__pycache__/ui.cpython-310.pyc differ
 
src/frontend/ui.py CHANGED
@@ -11,15 +11,13 @@ from src.frontend.callbacks import (
11
  on_report,
12
  on_quiz,
13
  on_podcast,
14
- on_download
15
  )
16
  from src.backend.auth import require_login
17
 
18
 
19
  def build_app():
20
-
21
  with gr.Blocks(title="NotebookLM Clone") as demo:
22
-
23
  gr.Markdown("# 📓 NotebookLM Clone (HF Auth + Chroma + RAG)")
24
 
25
  login = gr.LoginButton()
@@ -27,296 +25,156 @@ def build_app():
27
 
28
  username_state = gr.State("")
29
 
30
- # ---------- LOAD ----------
31
- def on_load(request: gr.Request):
32
- username = require_login(request)
33
- dd, chat, arts = ui_bootstrap(username)
34
- return username, dd, chat, arts
35
-
36
  with gr.Row():
37
-
38
- # ---------- LEFT PANEL ----------
39
  with gr.Column(scale=1):
 
40
 
41
- user_box = gr.Textbox(
42
- label="User",
43
- interactive=False
44
- )
45
-
46
- notebook_dd = gr.Dropdown(
47
- label="Notebooks",
48
- choices=[],
49
- interactive=True
50
- )
51
-
52
- nb_new = gr.Textbox(
53
- label="Create notebook",
54
- placeholder="Name"
55
- )
56
 
 
57
  btn_create = gr.Button("Create")
58
 
59
- nb_rename = gr.Textbox(
60
- label="Rename notebook",
61
- placeholder="New name"
62
- )
63
-
64
  btn_rename = gr.Button("Rename")
65
 
66
- btn_delete = gr.Button(
67
- "Delete current",
68
- variant="stop"
69
- )
70
 
71
- # ---------- INGEST ----------
72
  gr.Markdown("## Ingest")
73
-
74
- file_up = gr.File(
75
- label="Upload PDF/PPTX/TXT",
76
- file_count="multiple"
77
- )
78
-
79
  btn_ingest_files = gr.Button("Ingest Files")
 
80
 
81
- ingest_status = gr.Textbox(
82
- label="Status",
83
- interactive=False
84
- )
85
-
86
- url_in = gr.Textbox(
87
- label="URL",
88
- placeholder="https://..."
89
- )
90
-
91
  btn_ingest_url = gr.Button("Ingest URL")
 
92
 
93
- url_status = gr.Textbox(
94
- label="Status",
95
- interactive=False
96
- )
97
-
98
- # ---------- ARTIFACTS ----------
99
  gr.Markdown("## Artifacts")
100
-
101
- topic = gr.Textbox(
102
- label="Topic / prompt"
103
- )
104
-
105
- extra = gr.Textbox(
106
- label="Extra prompt (optional)"
107
- )
108
-
109
  btn_report = gr.Button("Generate Report")
110
  btn_quiz = gr.Button("Generate Quiz")
111
  btn_podcast = gr.Button("Generate Podcast")
112
 
113
- artifact_status = gr.Textbox(
114
- label="Artifact status",
115
- interactive=False
116
- )
117
-
118
- artifacts_list = gr.Dropdown(
119
- label="Artifacts",
120
- choices=[],
121
- interactive=True
122
- )
123
-
124
  download_btn = gr.Button("Download selected")
 
 
125
 
126
- download_file = gr.File(
127
- label="Download",
128
- interactive=False
129
- )
130
-
131
- podcast_audio = gr.Audio(
132
- label="Podcast Audio",
133
- interactive=False
134
- )
135
-
136
- # ---------- RIGHT PANEL ----------
137
  with gr.Column(scale=2):
138
-
139
- chatbot = gr.Chatbot(
140
- height=520,
141
- label="Chat (RAG + citations)"
142
- )
143
-
144
  msg = gr.Textbox(label="Message")
145
-
146
  send = gr.Button("Send")
147
 
148
- # ---------- EVENTS (API DISABLED FIX) ----------
 
 
 
 
 
149
 
150
  demo.load(
151
  on_load,
152
  inputs=None,
153
  outputs=[
154
  username_state,
 
155
  notebook_dd,
156
  chatbot,
157
- artifacts_list
158
  ],
159
  queue=False,
160
- api_name=False
161
- )
162
-
163
- username_state.change(
164
- lambda u: u,
165
- inputs=username_state,
166
- outputs=user_box,
167
- queue=False,
168
- api_name=False
169
  )
170
 
 
171
  notebook_dd.change(
172
  on_switch_notebook,
173
  inputs=[username_state, notebook_dd],
174
  outputs=[chatbot, artifacts_list],
175
  queue=False,
176
- api_name=False
177
  )
178
 
179
  btn_create.click(
180
  on_create_notebook,
181
  inputs=[username_state, nb_new],
182
- outputs=[
183
- notebook_dd,
184
- chatbot,
185
- artifacts_list
186
- ],
187
  queue=False,
188
- api_name=False
189
  )
190
 
191
  btn_rename.click(
192
  on_rename_notebook,
193
- inputs=[
194
- username_state,
195
- notebook_dd,
196
- nb_rename
197
- ],
198
  outputs=[notebook_dd],
199
  queue=False,
200
- api_name=False
201
  )
202
 
203
  btn_delete.click(
204
  on_delete_notebook,
205
- inputs=[
206
- username_state,
207
- notebook_dd
208
- ],
209
- outputs=[
210
- notebook_dd,
211
- chatbot,
212
- artifacts_list
213
- ],
214
  queue=False,
215
- api_name=False
216
  )
217
 
218
  btn_ingest_files.click(
219
  on_ingest_files,
220
- inputs=[
221
- username_state,
222
- notebook_dd,
223
- file_up
224
- ],
225
  outputs=[ingest_status],
226
  queue=True,
227
- api_name=False
228
  )
229
 
230
  btn_ingest_url.click(
231
  on_ingest_url,
232
- inputs=[
233
- username_state,
234
- notebook_dd,
235
- url_in
236
- ],
237
  outputs=[url_status],
238
  queue=True,
239
- api_name=False
240
  )
241
 
242
  send.click(
243
  on_chat,
244
- inputs=[
245
- username_state,
246
- notebook_dd,
247
- chatbot,
248
- msg
249
- ],
250
- outputs=[
251
- chatbot,
252
- msg
253
- ],
254
  queue=True,
255
- api_name=False
256
  )
257
 
258
  btn_report.click(
259
  on_report,
260
- inputs=[
261
- username_state,
262
- notebook_dd,
263
- topic,
264
- extra
265
- ],
266
- outputs=[
267
- artifact_status,
268
- artifacts_list,
269
- download_file
270
- ],
271
  queue=True,
272
- api_name=False
273
  )
274
 
275
  btn_quiz.click(
276
  on_quiz,
277
- inputs=[
278
- username_state,
279
- notebook_dd,
280
- topic,
281
- extra
282
- ],
283
- outputs=[
284
- artifact_status,
285
- artifacts_list,
286
- download_file
287
- ],
288
  queue=True,
289
- api_name=False
290
  )
291
 
292
  btn_podcast.click(
293
  on_podcast,
294
- inputs=[
295
- username_state,
296
- notebook_dd,
297
- topic,
298
- extra
299
- ],
300
- outputs=[
301
- artifact_status,
302
- artifacts_list,
303
- download_file,
304
- podcast_audio
305
- ],
306
  queue=True,
307
- api_name=False
308
  )
309
 
310
  download_btn.click(
311
  on_download,
312
- inputs=[
313
- username_state,
314
- notebook_dd,
315
- artifacts_list
316
- ],
317
  outputs=[download_file],
318
  queue=False,
319
- api_name=False
320
  )
321
 
322
- return demo
 
11
  on_report,
12
  on_quiz,
13
  on_podcast,
14
+ on_download,
15
  )
16
  from src.backend.auth import require_login
17
 
18
 
19
  def build_app():
 
20
  with gr.Blocks(title="NotebookLM Clone") as demo:
 
21
  gr.Markdown("# 📓 NotebookLM Clone (HF Auth + Chroma + RAG)")
22
 
23
  login = gr.LoginButton()
 
25
 
26
  username_state = gr.State("")
27
 
28
+ # ---------- UI ----------
 
 
 
 
 
29
  with gr.Row():
 
 
30
  with gr.Column(scale=1):
31
+ user_box = gr.Textbox(label="User", interactive=False)
32
 
33
+ notebook_dd = gr.Dropdown(label="Notebooks", choices=[], interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
+ nb_new = gr.Textbox(label="Create notebook", placeholder="Name")
36
  btn_create = gr.Button("Create")
37
 
38
+ nb_rename = gr.Textbox(label="Rename notebook", placeholder="New name")
 
 
 
 
39
  btn_rename = gr.Button("Rename")
40
 
41
+ btn_delete = gr.Button("Delete current", variant="stop")
 
 
 
42
 
 
43
  gr.Markdown("## Ingest")
44
+ file_up = gr.File(label="Upload PDF/PPTX/TXT", file_count="multiple")
 
 
 
 
 
45
  btn_ingest_files = gr.Button("Ingest Files")
46
+ ingest_status = gr.Textbox(label="Status", interactive=False)
47
 
48
+ url_in = gr.Textbox(label="URL", placeholder="https://...")
 
 
 
 
 
 
 
 
 
49
  btn_ingest_url = gr.Button("Ingest URL")
50
+ url_status = gr.Textbox(label="Status", interactive=False)
51
 
 
 
 
 
 
 
52
  gr.Markdown("## Artifacts")
53
+ topic = gr.Textbox(label="Topic / prompt")
54
+ extra = gr.Textbox(label="Extra prompt (optional)")
 
 
 
 
 
 
 
55
  btn_report = gr.Button("Generate Report")
56
  btn_quiz = gr.Button("Generate Quiz")
57
  btn_podcast = gr.Button("Generate Podcast")
58
 
59
+ artifact_status = gr.Textbox(label="Artifact status", interactive=False)
60
+ artifacts_list = gr.Dropdown(label="Artifacts", choices=[], interactive=True)
 
 
 
 
 
 
 
 
 
61
  download_btn = gr.Button("Download selected")
62
+ download_file = gr.File(label="Download", interactive=False)
63
+ podcast_audio = gr.Audio(label="Podcast Audio", interactive=False)
64
 
 
 
 
 
 
 
 
 
 
 
 
65
  with gr.Column(scale=2):
66
+ chatbot = gr.Chatbot(height=520, label="Chat (RAG + citations)")
 
 
 
 
 
67
  msg = gr.Textbox(label="Message")
 
68
  send = gr.Button("Send")
69
 
70
+ # ---------- LOAD ----------
71
+ def on_load(request: gr.Request):
72
+ username = require_login(request)
73
+ dd, chat, arts = ui_bootstrap(username)
74
+ # ✅ Return user_box value directly so it always shows
75
+ return username, username, dd, chat, arts
76
 
77
  demo.load(
78
  on_load,
79
  inputs=None,
80
  outputs=[
81
  username_state,
82
+ user_box, # ✅ always filled
83
  notebook_dd,
84
  chatbot,
85
+ artifacts_list,
86
  ],
87
  queue=False,
88
+ api_name=False,
 
 
 
 
 
 
 
 
89
  )
90
 
91
+ # ---------- EVENTS ----------
92
  notebook_dd.change(
93
  on_switch_notebook,
94
  inputs=[username_state, notebook_dd],
95
  outputs=[chatbot, artifacts_list],
96
  queue=False,
97
+ api_name=False,
98
  )
99
 
100
  btn_create.click(
101
  on_create_notebook,
102
  inputs=[username_state, nb_new],
103
+ outputs=[notebook_dd, chatbot, artifacts_list],
 
 
 
 
104
  queue=False,
105
+ api_name=False,
106
  )
107
 
108
  btn_rename.click(
109
  on_rename_notebook,
110
+ inputs=[username_state, notebook_dd, nb_rename],
 
 
 
 
111
  outputs=[notebook_dd],
112
  queue=False,
113
+ api_name=False,
114
  )
115
 
116
  btn_delete.click(
117
  on_delete_notebook,
118
+ inputs=[username_state, notebook_dd],
119
+ outputs=[notebook_dd, chatbot, artifacts_list],
 
 
 
 
 
 
 
120
  queue=False,
121
+ api_name=False,
122
  )
123
 
124
  btn_ingest_files.click(
125
  on_ingest_files,
126
+ inputs=[username_state, notebook_dd, file_up],
 
 
 
 
127
  outputs=[ingest_status],
128
  queue=True,
129
+ api_name=False,
130
  )
131
 
132
  btn_ingest_url.click(
133
  on_ingest_url,
134
+ inputs=[username_state, notebook_dd, url_in],
 
 
 
 
135
  outputs=[url_status],
136
  queue=True,
137
+ api_name=False,
138
  )
139
 
140
  send.click(
141
  on_chat,
142
+ inputs=[username_state, notebook_dd, chatbot, msg],
143
+ outputs=[chatbot, msg],
 
 
 
 
 
 
 
 
144
  queue=True,
145
+ api_name=False,
146
  )
147
 
148
  btn_report.click(
149
  on_report,
150
+ inputs=[username_state, notebook_dd, topic, extra],
151
+ outputs=[artifact_status, artifacts_list, download_file],
 
 
 
 
 
 
 
 
 
152
  queue=True,
153
+ api_name=False,
154
  )
155
 
156
  btn_quiz.click(
157
  on_quiz,
158
+ inputs=[username_state, notebook_dd, topic, extra],
159
+ outputs=[artifact_status, artifacts_list, download_file],
 
 
 
 
 
 
 
 
 
160
  queue=True,
161
+ api_name=False,
162
  )
163
 
164
  btn_podcast.click(
165
  on_podcast,
166
+ inputs=[username_state, notebook_dd, topic, extra],
167
+ outputs=[artifact_status, artifacts_list, download_file, podcast_audio],
 
 
 
 
 
 
 
 
 
 
168
  queue=True,
169
+ api_name=False,
170
  )
171
 
172
  download_btn.click(
173
  on_download,
174
+ inputs=[username_state, notebook_dd, artifacts_list],
 
 
 
 
175
  outputs=[download_file],
176
  queue=False,
177
+ api_name=False,
178
  )
179
 
180
+ return demo
src/storage/__pycache__/artifact_store.cpython-310.pyc CHANGED
Binary files a/src/storage/__pycache__/artifact_store.cpython-310.pyc and b/src/storage/__pycache__/artifact_store.cpython-310.pyc differ
 
src/storage/__pycache__/chat_store.cpython-310.pyc CHANGED
Binary files a/src/storage/__pycache__/chat_store.cpython-310.pyc and b/src/storage/__pycache__/chat_store.cpython-310.pyc differ
 
src/storage/__pycache__/chroma_store.cpython-310.pyc CHANGED
Binary files a/src/storage/__pycache__/chroma_store.cpython-310.pyc and b/src/storage/__pycache__/chroma_store.cpython-310.pyc differ
 
src/storage/__pycache__/index_store.cpython-310.pyc CHANGED
Binary files a/src/storage/__pycache__/index_store.cpython-310.pyc and b/src/storage/__pycache__/index_store.cpython-310.pyc differ
 
src/storage/__pycache__/paths.cpython-310.pyc CHANGED
Binary files a/src/storage/__pycache__/paths.cpython-310.pyc and b/src/storage/__pycache__/paths.cpython-310.pyc differ
 
src/storage/chroma_store.py CHANGED
@@ -1,17 +1,25 @@
1
  import os
2
  import chromadb
 
3
 
4
  from src.storage.paths import nb_root
5
 
6
  # Cache clients by persist_dir to avoid "ephemeral with different settings"
7
  _CLIENTS: dict[str, chromadb.PersistentClient] = {}
8
 
 
 
 
 
 
 
9
  def chroma_client(username: str, notebook_id: str) -> chromadb.PersistentClient:
10
  persist_dir = os.path.join(nb_root(username, notebook_id), "chroma")
11
  os.makedirs(persist_dir, exist_ok=True)
12
 
13
  if persist_dir not in _CLIENTS:
14
- _CLIENTS[persist_dir] = chromadb.PersistentClient(path=persist_dir)
 
15
  return _CLIENTS[persist_dir]
16
 
17
  def get_collection(username: str, notebook_id: str, name: str = "chunks"):
 
1
  import os
2
  import chromadb
3
+ from chromadb.config import Settings
4
 
5
  from src.storage.paths import nb_root
6
 
7
  # Cache clients by persist_dir to avoid "ephemeral with different settings"
8
  _CLIENTS: dict[str, chromadb.PersistentClient] = {}
9
 
10
+ # One shared Settings object (important: consistent settings!)
11
+ _SETTINGS = Settings(
12
+ anonymized_telemetry=False, # ✅ disables telemetry (stops capture() errors)
13
+ allow_reset=True,
14
+ )
15
+
16
def chroma_client(username: str, notebook_id: str) -> chromadb.PersistentClient:
    """Return the cached Chroma client for a notebook, creating it on first use.

    The client persists to the ``chroma`` directory under the notebook root
    (created if missing) and is cached in ``_CLIENTS`` keyed by that path, so
    every call for the same notebook reuses one client built with the shared
    ``_SETTINGS``.
    """
    persist_dir = os.path.join(nb_root(username, notebook_id), "chroma")
    os.makedirs(persist_dir, exist_ok=True)

    try:
        return _CLIENTS[persist_dir]
    except KeyError:
        _CLIENTS[persist_dir] = chromadb.PersistentClient(path=persist_dir, settings=_SETTINGS)
        return _CLIENTS[persist_dir]
24
 
25
  def get_collection(username: str, notebook_id: str, name: str = "chunks"):
src/utils/__pycache__/text.cpython-310.pyc CHANGED
Binary files a/src/utils/__pycache__/text.cpython-310.pyc and b/src/utils/__pycache__/text.cpython-310.pyc differ