Hitakshi26 committed on
Commit
1bfb390
·
1 Parent(s): b40b09c
app.py CHANGED
@@ -1,12 +1,9 @@
 
 
1
  import os
2
 
3
- # ----- Disable telemetry / analytics noise -----
4
- # Gradio analytics (UI usage pings)
5
- os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
6
- # HF hub telemetry (optional)
7
- os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
8
- # Chroma telemetry (we also disable via Settings in chroma_store.py)
9
- os.environ["ANONYMIZED_TELEMETRY"] = "False"
10
 
11
  from src.frontend.ui import build_app
12
 
@@ -17,4 +14,4 @@ if __name__ == "__main__":
17
  server_name="0.0.0.0",
18
  server_port=int(os.getenv("PORT", "7860")),
19
  show_api=False,
20
- )
 
1
+ #"use client"
2
+
3
  import os
4
 
5
+ # Disable Chroma telemetry noise
6
+ os.environ["ANONYMIZED_TELEMETRY"] = "FALSE"
 
 
 
 
 
7
 
8
  from src.frontend.ui import build_app
9
 
 
14
  server_name="0.0.0.0",
15
  server_port=int(os.getenv("PORT", "7860")),
16
  show_api=False,
17
+ )
requirements.txt CHANGED
@@ -6,6 +6,5 @@ python-pptx==1.0.2
6
  beautifulsoup4==4.12.3
7
  requests==2.32.3
8
  gTTS==2.5.3
9
- huggingface_hub==0.24.6
10
- pydub==0.25.1
11
 
 
6
  beautifulsoup4==4.12.3
7
  requests==2.32.3
8
  gTTS==2.5.3
9
+ huggingface_hub>=0.31.0,<1.0
 
10
 
src/backend/__pycache__/artifacts.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/artifacts.cpython-310.pyc and b/src/backend/__pycache__/artifacts.cpython-310.pyc differ
 
src/backend/__pycache__/auth.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/auth.cpython-310.pyc and b/src/backend/__pycache__/auth.cpython-310.pyc differ
 
src/backend/__pycache__/ingest.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/ingest.cpython-310.pyc and b/src/backend/__pycache__/ingest.cpython-310.pyc differ
 
src/backend/__pycache__/llm.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/llm.cpython-310.pyc and b/src/backend/__pycache__/llm.cpython-310.pyc differ
 
src/backend/__pycache__/notebooks.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/notebooks.cpython-310.pyc and b/src/backend/__pycache__/notebooks.cpython-310.pyc differ
 
src/backend/__pycache__/rag.cpython-310.pyc CHANGED
Binary files a/src/backend/__pycache__/rag.cpython-310.pyc and b/src/backend/__pycache__/rag.cpython-310.pyc differ
 
src/backend/auth.py CHANGED
@@ -2,67 +2,32 @@ import os
2
  import gradio as gr
3
 
4
 
5
- def _header_get(headers: dict, key: str):
6
- """Case-insensitive header lookup."""
7
- if not headers:
8
- return None
9
- lk = key.lower()
10
- for k, v in headers.items():
11
- if str(k).lower() == lk:
12
- return v
13
- return None
14
-
15
-
16
- def get_username_from_request(request: gr.Request) -> str | None:
17
  """
18
- Try multiple ways to extract username from HF Spaces OAuth / proxy.
19
- Different Gradio + Spaces versions expose this differently.
 
 
 
20
  """
21
- if request is None:
22
- return None
23
-
24
- # 1) Best-case: gradio sets request.username
25
  username = getattr(request, "username", None)
26
  if username:
27
  return str(username)
28
 
29
- # 2) Headers (varies by HF proxy / auth config)
30
- headers = getattr(request, "headers", None) or {}
31
- for key in (
32
  "x-forwarded-user",
33
  "x-hf-user",
34
  "x-forwarded-preferred-username",
35
  "x-auth-request-preferred-username",
36
- "hf-user",
37
- ):
38
- v = _header_get(headers, key)
39
- if v:
40
- return str(v)
41
-
42
- # 3) Some deployments put user info in query params / cookies (rare)
43
- # Keep simple: if not found, return None
44
- return None
45
-
46
 
47
- def require_login(request: gr.Request) -> str:
48
- """
49
- 'Strict' login:
50
- - If REQUIRE_LOGIN=1, enforce that we got a username.
51
- - Otherwise, gracefully fall back to a guest user (so the app runs).
52
- """
53
- username = get_username_from_request(request)
54
- if username:
55
- return username
56
-
57
- # Local/dev always allow
58
  if os.getenv("HF_SPACE_ID") is None:
59
  return "localuser"
60
 
61
- # On Spaces: optionally enforce
62
- if os.getenv("REQUIRE_LOGIN", "0") == "1":
63
- raise gr.Error("Please log in using 'Sign in with Hugging Face' to use this app.")
64
-
65
- # Default: allow guest mode so the app works
66
- # NOTE: single shared guest account. If you want per-user separation without auth:
67
- # use request.client.host or a random session id (but keep simple for submission).
68
- return "guest"
 
2
  import gradio as gr
3
 
4
 
5
def _header_value(headers, key: str):
    """Return the non-empty value stored under ``key`` in ``headers``, ignoring case.

    An exact-key lookup is tried first. If that finds nothing and ``headers``
    exposes ``items()``, its entries are scanned with a case-insensitive name
    comparison: HTTP header names are case-insensitive, but a plain ``dict``
    lookup is not. Returns ``None`` when no non-empty value is found.
    """
    if key in headers and headers[key]:
        return headers[key]

    items = getattr(headers, "items", None)
    if not callable(items):
        return None

    wanted = key.lower()
    for name, value in items():
        if value and str(name).lower() == wanted:
            return value
    return None


def require_login(request: gr.Request) -> str:
    """Resolve the current user's name, or raise if nobody is logged in.

    Resolution order:
      1) ``request.username`` when it is set and non-empty.
      2) The first non-empty proxy header among ``x-forwarded-user``,
         ``x-hf-user``, ``x-forwarded-preferred-username`` and
         ``x-auth-request-preferred-username`` (matched case-insensitively).
      3) ``"localuser"`` when the ``HF_SPACE_ID`` environment variable is
         not set, so the app does not hard-fail during local development.

    Args:
        request: The Gradio request for the current event.

    Returns:
        The resolved username as a string.

    Raises:
        gr.Error: If no username could be resolved and ``HF_SPACE_ID`` is set.
    """
    # 1) Best-case Gradio field
    username = getattr(request, "username", None)
    if username:
        return str(username)

    # 2) Fallback: proxy headers (which one is present varies by proxy/version)
    # NOTE(review): these headers are trusted as-is; confirm the fronting proxy
    # strips client-supplied copies, otherwise a caller could spoof a username.
    headers = getattr(request, "headers", None) or {}
    for key in (
        "x-forwarded-user",
        "x-hf-user",
        "x-forwarded-preferred-username",
        "x-auth-request-preferred-username",
    ):
        value = _header_value(headers, key)
        if value:
            return str(value)

    # 3) Optional local fallback (so app doesn't hard-crash during dev)
    if os.getenv("HF_SPACE_ID") is None:
        return "localuser"

    raise gr.Error("Please log in using 'Sign in with Hugging Face' to use this app.")
 
 
 
 
 
 
 
src/backend/ingest.py CHANGED
@@ -1,6 +1,4 @@
1
- import os
2
- import pathlib
3
- import hashlib
4
  import requests
5
  from bs4 import BeautifulSoup
6
  from pypdf import PdfReader
@@ -14,15 +12,13 @@ from src.utils.text import safe_name
14
  EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
15
 
16
 
17
- # -------------------------
18
- # Helpers
19
- # -------------------------
20
- def _sha10_bytes(b: bytes) -> str:
21
- return hashlib.sha256(b).hexdigest()[:10]
22
-
23
- def _sha10_text(s: str) -> str:
24
- return hashlib.sha256((s or "").encode("utf-8", errors="ignore")).hexdigest()[:10]
25
-
26
 
27
  def simple_chunk(text: str, max_chars=2200, overlap=250):
28
  text = "\n".join([ln.strip() for ln in (text or "").splitlines() if ln.strip()]).strip()
@@ -34,25 +30,19 @@ def simple_chunk(text: str, max_chars=2200, overlap=250):
34
  while start < len(text):
35
  end = min(len(text), start + max_chars)
36
  out.append(text[start:end])
37
- if end == len(text):
38
- break
39
  start = max(0, end - overlap)
40
  return out
41
 
42
-
43
- # -------------------------
44
- # Extractors
45
- # -------------------------
46
  def extract_pdf(path: str):
47
  reader = PdfReader(path)
48
  items = []
49
  for i, page in enumerate(reader.pages):
50
  txt = (page.extract_text() or "").strip()
51
  if txt:
52
- items.append({"text": txt, "page": i + 1})
53
  return items
54
 
55
-
56
  def extract_pptx(path: str):
57
  prs = Presentation(path)
58
  items = []
@@ -63,108 +53,63 @@ def extract_pptx(path: str):
63
  texts.append(shape.text)
64
  txt = "\n".join(t.strip() for t in texts if t.strip()).strip()
65
  if txt:
66
- items.append({"text": txt, "slide": i + 1})
67
  return items
68
 
69
-
70
  def extract_txt(path: str):
71
  with open(path, "r", encoding="utf-8", errors="ignore") as f:
72
  txt = f.read().strip()
73
- return [{"text": txt}] if txt else []
74
-
75
 
76
  def extract_url(url: str):
77
  r = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
78
  r.raise_for_status()
79
  soup = BeautifulSoup(r.text, "html.parser")
80
- for tag in soup(["script", "style", "noscript"]):
81
  tag.decompose()
82
  text = soup.get_text("\n")
83
  text = "\n".join([ln.strip() for ln in text.splitlines() if ln.strip()])
84
- # hard cap so we don’t embed infinite pages
85
- return [{"text": text[:200000]}]
86
-
87
-
88
- # -------------------------
89
- # Chroma upsert
90
- # -------------------------
91
- def upsert_extracted(
92
- username: str,
93
- notebook_id: str,
94
- source_title: str,
95
- source_id: str,
96
- extracted_items: list[dict],
97
- ) -> int:
98
- col = get_collection(username, notebook_id)
99
 
 
 
100
  ids, docs, metas = [], [], []
101
-
102
  for item_idx, item in enumerate(extracted_items):
103
- page = item.get("page")
104
- slide = item.get("slide")
105
-
106
- # stable location string (never None)
107
- if page is not None:
108
- loc = f"p{int(page)}"
109
- elif slide is not None:
110
- loc = f"s{int(slide)}"
111
- else:
112
- loc = f"item{item_idx}"
113
-
114
- chunks = simple_chunk(item.get("text", ""))
115
-
116
- for chunk_idx, ch in enumerate(chunks):
117
- # ✅ unique per (source + loc + chunk)
118
- cid = f"{source_id}::{loc}::chunk{chunk_idx}"
119
- ids.append(cid)
120
  docs.append(ch)
121
-
122
  meta = {
123
- "source_title": str(source_title),
124
- "source_id": str(source_id),
 
 
125
  }
126
- # Chroma metadata cannot contain None only set if present
127
- if page is not None:
128
- meta["page"] = int(page)
129
- if slide is not None:
130
- meta["slide"] = int(slide)
131
-
132
  metas.append(meta)
133
-
134
  if not docs:
135
  return 0
136
-
137
  embs = EMBED_MODEL.encode(docs, normalize_embeddings=True).tolist()
138
  col.upsert(ids=ids, documents=docs, metadatas=metas, embeddings=embs)
139
  return len(docs)
140
 
141
-
142
- # -------------------------
143
- # Public API used by callbacks.py
144
- # -------------------------
145
  def ingest_files(username: str, notebook_id: str, files) -> int:
146
  ensure_tree(username, notebook_id)
147
-
148
  raw_dir = os.path.join(nb_root(username, notebook_id), "files_raw")
149
  ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
150
-
151
- os.makedirs(raw_dir, exist_ok=True)
152
- os.makedirs(ex_dir, exist_ok=True)
153
-
154
  added = 0
155
 
156
  for f in (files or []):
157
- fp = getattr(f, "name", None)
158
  if not fp:
159
  continue
160
 
161
- # copy uploaded file to raw_dir
162
- src_path = pathlib.Path(fp)
163
- file_bytes = src_path.read_bytes()
164
 
165
- base = os.path.basename(fp)
166
- dest = os.path.join(raw_dir, base)
167
- pathlib.Path(dest).write_bytes(file_bytes)
168
 
169
  ext = os.path.splitext(dest)[1].lower()
170
  if ext == ".pdf":
@@ -176,52 +121,31 @@ def ingest_files(username: str, notebook_id: str, files) -> int:
176
  else:
177
  continue
178
 
179
- # save extracted text
180
- ex_path = os.path.join(ex_dir, base + ".txt")
181
  with open(ex_path, "w", encoding="utf-8") as ftxt:
182
  for item in extracted:
183
  loc = ""
184
- if item.get("page") is not None:
185
  loc = f"page={item.get('page')}"
186
- elif item.get("slide") is not None:
187
  loc = f"slide={item.get('slide')}"
188
- ftxt.write(f"\n--- {loc} ---\n{item.get('text','')}\n")
189
-
190
- # ✅ Add a hash so repeated ingest of same filename won't collide
191
- file_hash = _sha10_bytes(file_bytes)
192
- source_id = f"file:{base}:{file_hash}"
193
 
194
  added += upsert_extracted(
195
- username=username,
196
- notebook_id=notebook_id,
197
- source_title=base,
198
- source_id=source_id,
199
- extracted_items=extracted,
200
  )
201
 
202
  return added
203
 
204
-
205
  def ingest_url(username: str, notebook_id: str, url: str) -> int:
206
  ensure_tree(username, notebook_id)
207
-
208
  extracted = extract_url(url)
209
  ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
210
- os.makedirs(ex_dir, exist_ok=True)
211
-
212
- # save extracted page text
213
- fname = safe_name(url.replace("https://", "").replace("http://", "").replace("/", "_")) + ".txt"
214
  with open(os.path.join(ex_dir, fname), "w", encoding="utf-8") as f:
215
- f.write(extracted[0].get("text", ""))
216
-
217
- # ✅ Hash text so re-ingest doesn’t collide
218
- text_hash = _sha10_text(extracted[0].get("text", ""))
219
- source_id = f"url:{url}:{text_hash}"
220
-
221
- return upsert_extracted(
222
- username=username,
223
- notebook_id=notebook_id,
224
- source_title=url,
225
- source_id=source_id,
226
- extracted_items=extracted,
227
- )
 
1
+ import os, pathlib
 
 
2
  import requests
3
  from bs4 import BeautifulSoup
4
  from pypdf import PdfReader
 
12
  EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
13
 
14
 
15
def _file_path_from_gradio_obj(file_obj):
    """Return the filesystem path carried by an uploaded-file value.

    Accepted shapes:
      * a plain ``str`` path, returned unchanged;
      * an ``os.PathLike`` object (e.g. ``pathlib.Path``) whose ``os.fspath``
        value is a ``str``;
      * any object exposing a ``str`` ``name`` attribute.

    Returns:
        The path as a ``str``, or ``None`` when no string path can be derived.
    """
    if isinstance(file_obj, str):
        return file_obj

    # PathLike must be checked before ``.name``: for pathlib paths ``.name``
    # is only the final component, which would silently drop the directory.
    if isinstance(file_obj, os.PathLike):
        fs_path = os.fspath(file_obj)
        if isinstance(fs_path, str):
            return fs_path

    path = getattr(file_obj, "name", None)
    if isinstance(path, str):
        return path
    return None
 
 
22
 
23
  def simple_chunk(text: str, max_chars=2200, overlap=250):
24
  text = "\n".join([ln.strip() for ln in (text or "").splitlines() if ln.strip()]).strip()
 
30
  while start < len(text):
31
  end = min(len(text), start + max_chars)
32
  out.append(text[start:end])
33
+ if end == len(text): break
 
34
  start = max(0, end - overlap)
35
  return out
36
 
 
 
 
 
def extract_pdf(path: str):
    """Extract the text of every non-empty page of the PDF at ``path``.

    Returns:
        A list of ``{"text": ..., "page": ...}`` dicts in page order, where
        ``page`` is the 1-based page number. Pages whose extracted text is
        empty after stripping are omitted.
    """
    extracted = []
    for page_number, pdf_page in enumerate(PdfReader(path).pages, start=1):
        raw_text = pdf_page.extract_text()
        page_text = raw_text.strip() if raw_text else ""
        if not page_text:
            continue
        extracted.append({"text": page_text, "page": page_number})
    return extracted
45
 
 
46
  def extract_pptx(path: str):
47
  prs = Presentation(path)
48
  items = []
 
53
  texts.append(shape.text)
54
  txt = "\n".join(t.strip() for t in texts if t.strip()).strip()
55
  if txt:
56
+ items.append({"text": txt, "slide": i+1})
57
  return items
58
 
 
def extract_txt(path: str):
    """Read a text file and return it as a single extracted item.

    The file is decoded as UTF-8 with undecodable bytes ignored.

    Returns:
        ``[{"text": <stripped content>, "page": None}]``, or ``[]`` when the
        file is empty after stripping.
    """
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading byte-order
    # mark; with plain "utf-8" the BOM survives strip() (U+FEFF is not
    # whitespace) and ends up in the indexed text.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as handle:
        content = handle.read().strip()
    if not content:
        return []
    return [{"text": content, "page": None}]
 
63
 
def extract_url(url: str):
    """Download a web page and return its visible text as one extracted item.

    The page is fetched with a 15-second timeout and a browser-like
    User-Agent; ``raise_for_status()`` is called on the response. ``script``,
    ``style`` and ``noscript`` elements are removed before the text is
    collected, blank lines are dropped, and the result is capped at
    200000 characters.

    Returns:
        A one-element list: ``[{"text": <page text>, "page": None}]``.
    """
    response = requests.get(url, timeout=15, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()

    document = BeautifulSoup(response.text, "html.parser")
    # Remove elements whose contents are not readable page text.
    for element in document(["script","style","noscript"]):
        element.decompose()

    stripped_lines = (line.strip() for line in document.get_text("\n").splitlines())
    visible_text = "\n".join(line for line in stripped_lines if line)
    return [{"text": visible_text[:200000], "page": None}]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
def upsert_extracted(username: str, notebook_id: str, source_title: str, source_id: str, extracted_items: list[dict]) -> int:
    """Chunk, embed and upsert extracted items into the notebook's collection.

    Each item's ``"text"`` is split with ``simple_chunk``. Every chunk gets an
    id built from the source id, the item index, the chunk index within the
    item and a running chunk index across the whole call. Its metadata holds
    ``source_title``, ``source_id``, ``page`` and ``slide``, with any entry
    whose value is ``None`` left out.

    Returns:
        The number of chunks upserted (``0`` when there is nothing to index).
    """
    collection = get_collection(username, notebook_id)
    chunk_ids, chunk_texts, chunk_metas = [], [], []

    for item_pos, extracted in enumerate(extracted_items):
        for chunk_pos, chunk_text in enumerate(simple_chunk(extracted["text"])):
            # The running index equals the number of chunks collected so far.
            running_index = len(chunk_ids)
            chunk_ids.append(f"{source_id}::item{item_pos}::chunk{chunk_pos}::{running_index}")
            chunk_texts.append(chunk_text)

            candidates = (
                ("source_title", source_title),
                ("source_id", source_id),
                ("page", extracted.get("page")),
                ("slide", extracted.get("slide")),
            )
            # Keep only the entries that actually carry a value.
            chunk_metas.append(dict(pair for pair in candidates if pair[1] is not None))

    if len(chunk_texts) == 0:
        return 0

    vectors = EMBED_MODEL.encode(chunk_texts, normalize_embeddings=True).tolist()
    collection.upsert(
        ids=chunk_ids,
        documents=chunk_texts,
        metadatas=chunk_metas,
        embeddings=vectors,
    )
    return len(chunk_texts)
96
 
 
 
 
 
97
  def ingest_files(username: str, notebook_id: str, files) -> int:
98
  ensure_tree(username, notebook_id)
 
99
  raw_dir = os.path.join(nb_root(username, notebook_id), "files_raw")
100
  ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
 
 
 
 
101
  added = 0
102
 
103
  for f in (files or []):
104
+ fp = _file_path_from_gradio_obj(f)
105
  if not fp:
106
  continue
107
 
108
+ if not os.path.exists(fp):
109
+ continue
 
110
 
111
+ dest = os.path.join(raw_dir, os.path.basename(fp))
112
+ pathlib.Path(dest).write_bytes(pathlib.Path(fp).read_bytes())
 
113
 
114
  ext = os.path.splitext(dest)[1].lower()
115
  if ext == ".pdf":
 
121
  else:
122
  continue
123
 
124
+ ex_path = os.path.join(ex_dir, os.path.basename(dest) + ".txt")
 
125
  with open(ex_path, "w", encoding="utf-8") as ftxt:
126
  for item in extracted:
127
  loc = ""
128
+ if item.get("page"):
129
  loc = f"page={item.get('page')}"
130
+ elif item.get("slide"):
131
  loc = f"slide={item.get('slide')}"
132
+ ftxt.write(f"\n--- {loc} ---\n{item['text']}\n")
 
 
 
 
133
 
134
  added += upsert_extracted(
135
+ username,
136
+ notebook_id,
137
+ os.path.basename(dest),
138
+ f"file:{os.path.basename(dest)}",
139
+ extracted,
140
  )
141
 
142
  return added
143
 
 
def ingest_url(username: str, notebook_id: str, url: str) -> int:
    """Fetch a URL, save its extracted text, and index it for the notebook.

    The page text is written to ``files_extracted/<safe name>.txt`` under the
    notebook root and then upserted with source id ``url:<url>``.

    Args:
        username: Owner of the notebook.
        notebook_id: Notebook to ingest into.
        url: Address of the page to fetch.

    Returns:
        The number of chunks added, as reported by ``upsert_extracted``.
    """
    ensure_tree(username, notebook_id)
    extracted = extract_url(url)

    ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
    # Create the directory explicitly so the open() below does not depend on
    # ensure_tree having created this particular subfolder.
    os.makedirs(ex_dir, exist_ok=True)

    fname = safe_name(url.replace("https://","").replace("http://","").replace("/","_")) + ".txt"
    with open(os.path.join(ex_dir, fname), "w", encoding="utf-8") as f:
        f.write(extracted[0]["text"])

    return upsert_extracted(username, notebook_id, url, f"url:{url}", extracted)
 
 
 
 
 
 
 
 
 
 
 
src/backend/llm.py CHANGED
@@ -1,20 +1,48 @@
1
  import os
2
  import gradio as gr
3
  from huggingface_hub import InferenceClient
 
4
 
5
  HF_INFERENCE_TOKEN = os.environ.get("HF_INFERENCE_TOKEN","").strip()
 
 
6
  HF_LLM_MODEL = os.environ.get("HF_LLM_MODEL","HuggingFaceH4/zephyr-7b-beta").strip()
7
 
8
- _client = InferenceClient(model=HF_LLM_MODEL, token=HF_INFERENCE_TOKEN) if HF_INFERENCE_TOKEN else None
9
 
10
  def llm_generate(prompt: str, max_new_tokens=450, temperature=0.2) -> str:
11
  if _client is None:
12
- raise gr.Error("HF_INFERENCE_TOKEN not set. Add it in Space secrets.")
13
- out = _client.text_generation(
14
- prompt,
15
- max_new_tokens=max_new_tokens,
16
- temperature=temperature,
17
- do_sample=temperature > 0,
18
- return_full_text=False,
19
- )
20
- return (out or "").strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import gradio as gr
3
  from huggingface_hub import InferenceClient
4
+ from huggingface_hub.utils import HfHubHTTPError
5
 
6
  HF_INFERENCE_TOKEN = os.environ.get("HF_INFERENCE_TOKEN","").strip()
7
+ HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
8
+ HF_API_TOKEN = HF_INFERENCE_TOKEN or HF_TOKEN
9
  HF_LLM_MODEL = os.environ.get("HF_LLM_MODEL","HuggingFaceH4/zephyr-7b-beta").strip()
10
 
11
+ _client = InferenceClient(model=HF_LLM_MODEL, token=HF_API_TOKEN) if HF_API_TOKEN else None
12
 
def llm_generate(prompt: str, max_new_tokens=450, temperature=0.2) -> str:
    """Generate a completion for ``prompt`` with the configured inference client.

    ``text_generation`` is tried first. If it raises a ``ValueError`` whose
    message says the model only supports the conversational task, the prompt
    is retried once through ``chat.completions.create`` as a single user
    message.

    Args:
        prompt: Full prompt text to send to the model.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature; sampling is enabled only when > 0.

    Returns:
        The generated text, stripped (empty string if nothing came back).

    Raises:
        gr.Error: If no token is configured or the request fails. The
            underlying exception is attached as ``__cause__`` where one exists.
    """
    if _client is None:
        raise gr.Error("Set HF_INFERENCE_TOKEN (or HF_TOKEN) in Space secrets or local environment.")
    try:
        out = _client.text_generation(
            prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=temperature > 0,
            return_full_text=False,
        )
        return (out or "").strip()
    except ValueError as e:
        msg = str(e)
        if "not supported for task text-generation" in msg or "Supported task: conversational" in msg:
            # The model is chat-only: retry through the chat-completions API.
            try:
                resp = _client.chat.completions.create(
                    model=HF_LLM_MODEL,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_new_tokens,
                    temperature=temperature,
                )
                choice = (resp.choices or [None])[0]
                content = getattr(getattr(choice, "message", None), "content", "") if choice else ""
                return (content or "").strip()
            except Exception as inner:
                # Chain the cause so logs show the real failure, not only the UI text.
                raise gr.Error(f"LLM request failed after conversational fallback: {inner}") from inner
        raise gr.Error(f"LLM request failed: {msg}") from e
    except HfHubHTTPError as e:
        msg = str(e)
        if "api-inference.huggingface.co is no longer supported" in msg or "410 Client Error" in msg:
            raise gr.Error(
                "Your Hugging Face Hub client is outdated for inference routing. "
                "Upgrade `huggingface_hub` and restart the app."
            ) from e
        raise gr.Error(f"LLM request failed: {msg}") from e
src/backend/rag.py CHANGED
@@ -8,13 +8,18 @@ EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
8
  def retrieve(username: str, notebook_id: str, query: str, k=6):
9
  col = get_collection(username, notebook_id)
10
 
 
 
 
 
 
11
  qemb = EMBED_MODEL.encode([query], normalize_embeddings=True).tolist()
12
 
13
  res = col.query(
14
- query_embeddings=qemb,
15
- n_results=k,
16
- include=["documents", "metadatas", "distances"]
17
- )
18
 
19
  ids = res.get("ids", [[]])[0]
20
  docs = res.get("documents", [[]])[0]
 
8
  def retrieve(username: str, notebook_id: str, query: str, k=6):
9
  col = get_collection(username, notebook_id)
10
 
11
+ current_count = col.count()
12
+ if current_count <= 0:
13
+ return []
14
+ n_results = min(k, current_count)
15
+
16
  qemb = EMBED_MODEL.encode([query], normalize_embeddings=True).tolist()
17
 
18
  res = col.query(
19
+ query_embeddings=qemb,
20
+ n_results=n_results,
21
+ include=["documents", "metadatas", "distances"],
22
+ )
23
 
24
  ids = res.get("ids", [[]])[0]
25
  docs = res.get("documents", [[]])[0]
src/frontend/__pycache__/callbacks.cpython-310.pyc CHANGED
Binary files a/src/frontend/__pycache__/callbacks.cpython-310.pyc and b/src/frontend/__pycache__/callbacks.cpython-310.pyc differ
 
src/frontend/__pycache__/ui.cpython-310.pyc CHANGED
Binary files a/src/frontend/__pycache__/ui.cpython-310.pyc and b/src/frontend/__pycache__/ui.cpython-310.pyc differ
 
src/frontend/callbacks.py CHANGED
@@ -91,7 +91,12 @@ def on_ingest_files(username: str, notebook_id: str, files):
91
  _require_notebook(notebook_id)
92
  if not files:
93
  raise gr.Error("Upload at least one file.")
94
- added = ingest_files_backend(username, notebook_id, files)
 
 
 
 
 
95
  return f"Ingested files. Added {added} chunks."
96
 
97
 
@@ -100,7 +105,12 @@ def on_ingest_url(username: str, notebook_id: str, url: str):
100
  url = (url or "").strip()
101
  if not url:
102
  raise gr.Error("Enter a URL.")
103
- added = ingest_url_backend(username, notebook_id, url)
 
 
 
 
 
104
  return f"Ingested URL. Added {added} chunks."
105
 
106
 
 
91
  _require_notebook(notebook_id)
92
  if not files:
93
  raise gr.Error("Upload at least one file.")
94
+ try:
95
+ added = ingest_files_backend(username, notebook_id, files)
96
+ except Exception as e:
97
+ raise gr.Error(f"File ingest failed: {e}")
98
+ if added == 0:
99
+ raise gr.Error("No chunks were indexed. Use supported files (PDF/PPTX/TXT) with extractable text.")
100
  return f"Ingested files. Added {added} chunks."
101
 
102
 
 
105
  url = (url or "").strip()
106
  if not url:
107
  raise gr.Error("Enter a URL.")
108
+ try:
109
+ added = ingest_url_backend(username, notebook_id, url)
110
+ except Exception as e:
111
+ raise gr.Error(f"URL ingest failed: {e}")
112
+ if added == 0:
113
+ raise gr.Error("No chunks were indexed from the URL.")
114
  return f"Ingested URL. Added {added} chunks."
115
 
116
 
src/frontend/ui.py CHANGED
@@ -16,79 +16,163 @@ from src.frontend.callbacks import (
16
  from src.backend.auth import require_login
17
 
18
 
19
- def build_app():
20
- with gr.Blocks(title="NotebookLM Clone") as demo:
21
- gr.Markdown("# 📓 NotebookLM Clone (HF Auth + Chroma + RAG)")
 
 
 
22
 
23
- gr.LoginButton().activate()
 
 
 
 
 
 
24
 
 
 
 
 
 
25
 
26
- username_state = gr.State("")
 
 
 
27
 
28
- # ---------- UI ----------
29
- with gr.Row():
30
- with gr.Column(scale=1):
31
- user_box = gr.Textbox(label="User", interactive=False)
 
 
32
 
33
- notebook_dd = gr.Dropdown(label="Notebooks", choices=[], interactive=True)
 
 
 
 
 
34
 
35
- nb_new = gr.Textbox(label="Create notebook", placeholder="Name")
36
- btn_create = gr.Button("Create")
 
37
 
38
- nb_rename = gr.Textbox(label="Rename notebook", placeholder="New name")
39
- btn_rename = gr.Button("Rename")
 
40
 
41
- btn_delete = gr.Button("Delete current", variant="stop")
 
 
 
42
 
43
- gr.Markdown("## Ingest")
44
- file_up = gr.File(label="Upload PDF/PPTX/TXT", file_count="multiple")
45
- btn_ingest_files = gr.Button("Ingest Files")
46
- ingest_status = gr.Textbox(label="Status", interactive=False)
47
 
48
- url_in = gr.Textbox(label="URL", placeholder="https://...")
49
- btn_ingest_url = gr.Button("Ingest URL")
50
- url_status = gr.Textbox(label="Status", interactive=False)
 
 
 
 
 
51
 
52
- gr.Markdown("## Artifacts")
53
- topic = gr.Textbox(label="Topic / prompt")
54
- extra = gr.Textbox(label="Extra prompt (optional)")
55
- btn_report = gr.Button("Generate Report")
56
- btn_quiz = gr.Button("Generate Quiz")
57
- btn_podcast = gr.Button("Generate Podcast")
 
 
 
58
 
59
- artifact_status = gr.Textbox(label="Artifact status", interactive=False)
60
- artifacts_list = gr.Dropdown(label="Artifacts", choices=[], interactive=True)
61
- download_btn = gr.Button("Download selected")
62
- download_file = gr.File(label="Download", interactive=False)
63
- podcast_audio = gr.Audio(label="Podcast Audio", interactive=False)
64
 
65
- with gr.Column(scale=2):
66
- chatbot = gr.Chatbot(height=520, label="Chat (RAG + citations)")
67
- msg = gr.Textbox(label="Message")
68
- send = gr.Button("Send")
69
 
70
- # ---------- LOAD ----------
71
  def on_load(request: gr.Request):
72
- username = require_login(request) # will fall back to "guest" if missing
73
  dd, chat, arts = ui_bootstrap(username)
74
  return username, dd, chat, arts
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  demo.load(
78
  on_load,
79
  inputs=None,
80
- outputs=[
81
- username_state,
82
- user_box, # ✅ always filled
83
- notebook_dd,
84
- chatbot,
85
- artifacts_list,
86
- ],
 
 
87
  queue=False,
88
  api_name=False,
89
  )
90
 
91
- # ---------- EVENTS ----------
92
  notebook_dd.change(
93
  on_switch_notebook,
94
  inputs=[username_state, notebook_dd],
 
16
  from src.backend.auth import require_login
17
 
18
 
19
+ CUSTOM_CSS = """
20
+ .gradio-container {
21
+ max-width: 1320px !important;
22
+ margin: 0 auto !important;
23
+ padding-top: 18px !important;
24
+ }
25
 
26
+ .hero {
27
+ border: 1px solid rgba(255,255,255,.08);
28
+ border-radius: 16px;
29
+ padding: 16px 18px;
30
+ background: linear-gradient(145deg, rgba(50,87,255,.16), rgba(145,92,255,.12));
31
+ backdrop-filter: blur(6px);
32
+ }
33
 
34
+ .hero h1 {
35
+ margin: 0;
36
+ font-size: 1.5rem;
37
+ letter-spacing: .2px;
38
+ }
39
 
40
+ .hero p {
41
+ margin: 6px 0 0;
42
+ opacity: .88;
43
+ }
44
 
45
+ .panel {
46
+ border: 1px solid rgba(255,255,255,.08);
47
+ border-radius: 14px;
48
+ background: linear-gradient(180deg, rgba(255,255,255,.04), rgba(255,255,255,.02));
49
+ padding: 10px;
50
+ }
51
 
52
+ .chat-panel {
53
+ border: 1px solid rgba(255,255,255,.08);
54
+ border-radius: 14px;
55
+ padding: 10px;
56
+ background: linear-gradient(180deg, rgba(255,255,255,.04), rgba(255,255,255,.02));
57
+ }
58
 
59
+ .chat-panel .message-wrap {
60
+ border-radius: 12px;
61
+ }
62
 
63
+ .chat-input textarea {
64
+ min-height: 92px !important;
65
+ }
66
 
67
+ .primary-btn button {
68
+ border-radius: 10px !important;
69
+ }
70
+ """
71
 
 
 
 
 
72
 
73
+ def build_app():
74
+ theme = gr.themes.Soft(
75
+ primary_hue="blue",
76
+ secondary_hue="indigo",
77
+ neutral_hue="slate",
78
+ spacing_size="md",
79
+ radius_size="lg",
80
+ )
81
 
82
+ with gr.Blocks(title="NotebookLM Clone", theme=theme, css=CUSTOM_CSS) as demo:
83
+ gr.Markdown(
84
+ """
85
+ <div class='hero'>
86
+ <h1>📓 NotebookLM Clone</h1>
87
+ <p>Organize notebooks, ingest sources, and chat with RAG-backed citations.</p>
88
+ </div>
89
+ """
90
+ )
91
 
92
+ login = gr.LoginButton()
93
+ login.activate()
 
 
 
94
 
95
+ username_state = gr.State("")
 
 
 
96
 
 
97
  def on_load(request: gr.Request):
98
+ username = require_login(request)
99
  dd, chat, arts = ui_bootstrap(username)
100
  return username, dd, chat, arts
101
 
102
+ with gr.Row(equal_height=True):
103
+ with gr.Column(scale=1, min_width=360, elem_classes=["panel"]):
104
+ user_box = gr.Textbox(label="User", interactive=False)
105
+
106
+ with gr.Accordion("Notebook", open=True):
107
+ notebook_dd = gr.Dropdown(
108
+ label="Notebooks",
109
+ choices=[],
110
+ interactive=True,
111
+ )
112
+ nb_new = gr.Textbox(label="Create notebook", placeholder="Name")
113
+ btn_create = gr.Button("Create", elem_classes=["primary-btn"])
114
+ nb_rename = gr.Textbox(label="Rename notebook", placeholder="New name")
115
+ btn_rename = gr.Button("Rename")
116
+ btn_delete = gr.Button("Delete current", variant="stop")
117
+
118
+ with gr.Accordion("Ingest", open=True):
119
+ file_up = gr.File(label="Upload PDF / PPTX / TXT", file_count="multiple")
120
+ btn_ingest_files = gr.Button("Ingest Files", elem_classes=["primary-btn"])
121
+ ingest_status = gr.Textbox(label="File ingest status", interactive=False)
122
+ url_in = gr.Textbox(label="URL", placeholder="https://...")
123
+ btn_ingest_url = gr.Button("Ingest URL")
124
+ url_status = gr.Textbox(label="URL ingest status", interactive=False)
125
+
126
+ with gr.Accordion("Artifacts", open=False):
127
+ topic = gr.Textbox(label="Topic / prompt")
128
+ extra = gr.Textbox(label="Extra prompt (optional)")
129
+ with gr.Row():
130
+ btn_report = gr.Button("Generate Report")
131
+ btn_quiz = gr.Button("Generate Quiz")
132
+ btn_podcast = gr.Button("Generate Podcast")
133
+ artifact_status = gr.Textbox(label="Artifact status", interactive=False)
134
+ artifacts_list = gr.Dropdown(label="Artifacts", choices=[], interactive=True)
135
+ download_btn = gr.Button("Download selected")
136
+ download_file = gr.File(label="Download", interactive=False)
137
+ podcast_audio = gr.Audio(label="Podcast Audio", interactive=False)
138
+
139
+ with gr.Column(scale=2, min_width=560, elem_classes=["chat-panel"]):
140
+ chatbot = gr.Chatbot(
141
+ height=520,
142
+ label="Chat (RAG + citations)",
143
+ bubble_full_width=False,
144
+ )
145
+
146
+ with gr.Row():
147
+ msg = gr.Textbox(
148
+ label="Message",
149
+ placeholder="Ask about your uploaded sources...",
150
+ elem_classes=["chat-input"],
151
+ scale=5,
152
+ )
153
+ send = gr.Button(
154
+ "Send",
155
+ variant="primary",
156
+ scale=1,
157
+ elem_classes=["primary-btn"],
158
+ )
159
 
160
  demo.load(
161
  on_load,
162
  inputs=None,
163
+ outputs=[username_state, notebook_dd, chatbot, artifacts_list],
164
+ queue=False,
165
+ api_name=False,
166
+ )
167
+
168
+ username_state.change(
169
+ lambda u: u,
170
+ inputs=username_state,
171
+ outputs=user_box,
172
  queue=False,
173
  api_name=False,
174
  )
175
 
 
176
  notebook_dd.change(
177
  on_switch_notebook,
178
  inputs=[username_state, notebook_dd],
src/storage/__pycache__/artifact_store.cpython-310.pyc CHANGED
Binary files a/src/storage/__pycache__/artifact_store.cpython-310.pyc and b/src/storage/__pycache__/artifact_store.cpython-310.pyc differ
 
src/storage/__pycache__/chat_store.cpython-310.pyc CHANGED
Binary files a/src/storage/__pycache__/chat_store.cpython-310.pyc and b/src/storage/__pycache__/chat_store.cpython-310.pyc differ
 
src/storage/__pycache__/chroma_store.cpython-310.pyc CHANGED
Binary files a/src/storage/__pycache__/chroma_store.cpython-310.pyc and b/src/storage/__pycache__/chroma_store.cpython-310.pyc differ
 
src/storage/__pycache__/index_store.cpython-310.pyc CHANGED
Binary files a/src/storage/__pycache__/index_store.cpython-310.pyc and b/src/storage/__pycache__/index_store.cpython-310.pyc differ
 
src/storage/__pycache__/paths.cpython-310.pyc CHANGED
Binary files a/src/storage/__pycache__/paths.cpython-310.pyc and b/src/storage/__pycache__/paths.cpython-310.pyc differ
 
src/storage/chroma_store.py CHANGED
@@ -1,27 +1,29 @@
1
  import os
2
  import chromadb
3
- from chromadb.config import Settings
4
-
5
  from src.storage.paths import nb_root
6
 
7
- # Cache clients by persist_dir to avoid "ephemeral with different settings"
8
- _CLIENTS: dict[str, chromadb.PersistentClient] = {}
9
 
10
- # One shared Settings object (important: consistent settings!)
11
- _SETTINGS = Settings(
12
- anonymized_telemetry=False, # ✅ disables telemetry (stops capture() errors)
13
- allow_reset=True,
14
- )
 
15
 
16
- def chroma_client(username: str, notebook_id: str) -> chromadb.PersistentClient:
17
- persist_dir = os.path.join(nb_root(username, notebook_id), "chroma")
18
  os.makedirs(persist_dir, exist_ok=True)
19
 
20
- if persist_dir not in _CLIENTS:
21
- _CLIENTS[persist_dir] = chromadb.PersistentClient(path=persist_dir, settings=_SETTINGS)
 
 
 
 
22
 
23
- return _CLIENTS[persist_dir]
24
 
25
- def get_collection(username: str, notebook_id: str, name: str = "chunks"):
 
26
  client = chroma_client(username, notebook_id)
27
- return client.get_or_create_collection(name=name)
 
 
 
 
1
  import os
2
  import chromadb
 
 
3
  from src.storage.paths import nb_root
4
 
 
 
5
 
6
def chroma_client(username, notebook_id):
    """Return a persistent Chroma client for one notebook.

    The client stores its data in the ``chroma`` directory under the
    notebook root, which is created if missing, and is configured with
    anonymized telemetry turned off.
    """
    notebook_dir = nb_root(username, notebook_id)
    persist_dir = os.path.join(notebook_dir, "chroma")
    os.makedirs(persist_dir, exist_ok=True)

    telemetry_off = chromadb.config.Settings(anonymized_telemetry=False)
    return chromadb.PersistentClient(path=persist_dir, settings=telemetry_off)
21
 
 
22
 
23
def get_collection(username, notebook_id, name: str = "notebook"):
    """Return (creating it if needed) a collection in the notebook's Chroma store.

    Args:
        username: Owner of the notebook.
        notebook_id: Notebook whose Chroma store is used.
        name: Collection name. Defaults to ``"notebook"``, so existing
            two-argument callers behave exactly as before.

    Returns:
        The collection returned by ``get_or_create_collection``.
    """
    client = chroma_client(username, notebook_id)
    return client.get_or_create_collection(name=name)
src/utils/__pycache__/text.cpython-310.pyc CHANGED
Binary files a/src/utils/__pycache__/text.cpython-310.pyc and b/src/utils/__pycache__/text.cpython-310.pyc differ