Hitakshi26 committed on
Commit
c60446c
·
1 Parent(s): 7921d06

Fix storage paths/index_path + chroma persistence + ingestion + telemetry

Browse files
app.py CHANGED
@@ -1,4 +1,8 @@
1
  import os
 
 
 
 
2
  from src.frontend.ui import build_app
3
 
4
  demo = build_app()
 
1
  import os
2
+
3
+ # Disable Chroma telemetry noise
4
+ os.environ["ANONYMIZED_TELEMETRY"] = "FALSE"
5
+
6
  from src.frontend.ui import build_app
7
 
8
  demo = build_app()
src/backend/ingest.py CHANGED
@@ -81,13 +81,17 @@ def upsert_extracted(username: str, notebook_id: str, source_title: str, source_
81
  col.upsert(ids=ids, documents=docs, metadatas=metas, embeddings=embs)
82
  return len(docs)
83
 
84
- def ingest_files(username: str, notebook_id: str, filepaths: list[str]) -> int:
85
  ensure_tree(username, notebook_id)
86
  raw_dir = os.path.join(nb_root(username, notebook_id), "files_raw")
87
  ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
88
  added = 0
89
 
90
- for fp in filepaths:
 
 
 
 
91
  dest = os.path.join(raw_dir, os.path.basename(fp))
92
  pathlib.Path(dest).write_bytes(pathlib.Path(fp).read_bytes())
93
 
@@ -101,17 +105,25 @@ def ingest_files(username: str, notebook_id: str, filepaths: list[str]) -> int:
101
  else:
102
  continue
103
 
104
- # save extracted
105
  ex_path = os.path.join(ex_dir, os.path.basename(dest) + ".txt")
106
- with open(ex_path, "w", encoding="utf-8") as f:
107
  for item in extracted:
108
- loc = f"page={item.get('page')}" if item.get("page") else f"slide={item.get('slide')}" if item.get("slide") else ""
109
- f.write(f"\n--- {loc} ---\n{item['text']}\n")
 
 
 
 
110
 
111
- added += upsert_extracted(username, notebook_id, os.path.basename(dest), f"file:{os.path.basename(dest)}", extracted)
 
 
 
 
 
 
112
 
113
  return added
114
-
115
  def ingest_url(username: str, notebook_id: str, url: str) -> int:
116
  ensure_tree(username, notebook_id)
117
  extracted = extract_url(url)
 
81
  col.upsert(ids=ids, documents=docs, metadatas=metas, embeddings=embs)
82
  return len(docs)
83
 
84
+ def ingest_files(username: str, notebook_id: str, files) -> int:
85
  ensure_tree(username, notebook_id)
86
  raw_dir = os.path.join(nb_root(username, notebook_id), "files_raw")
87
  ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
88
  added = 0
89
 
90
+ for f in (files or []):
91
+ fp = getattr(f, "name", None)
92
+ if not fp:
93
+ continue
94
+
95
  dest = os.path.join(raw_dir, os.path.basename(fp))
96
  pathlib.Path(dest).write_bytes(pathlib.Path(fp).read_bytes())
97
 
 
105
  else:
106
  continue
107
 
 
108
  ex_path = os.path.join(ex_dir, os.path.basename(dest) + ".txt")
109
+ with open(ex_path, "w", encoding="utf-8") as ftxt:
110
  for item in extracted:
111
+ loc = ""
112
+ if item.get("page"):
113
+ loc = f"page={item.get('page')}"
114
+ elif item.get("slide"):
115
+ loc = f"slide={item.get('slide')}"
116
+ ftxt.write(f"\n--- {loc} ---\n{item['text']}\n")
117
 
118
+ added += upsert_extracted(
119
+ username,
120
+ notebook_id,
121
+ os.path.basename(dest),
122
+ f"file:{os.path.basename(dest)}",
123
+ extracted,
124
+ )
125
 
126
  return added
 
127
  def ingest_url(username: str, notebook_id: str, url: str) -> int:
128
  ensure_tree(username, notebook_id)
129
  extracted = extract_url(url)
src/backend/notebooks.py CHANGED
@@ -1,49 +1,44 @@
 
 
1
  import uuid
2
- import gradio as gr
3
- from src.storage.index_store import load_index, save_index, list_notebooks
4
- from src.storage.paths import ensure_tree
5
- from src.utils.text import safe_name
6
  from datetime import datetime
7
 
8
- def now_iso():
 
 
 
 
9
  return datetime.utcnow().isoformat() + "Z"
10
 
 
11
  def create_notebook(username: str, name: str) -> str:
12
- name = safe_name(name)
13
- idx = load_index(username)
14
  nb_id = str(uuid.uuid4())
15
- idx["notebooks"].append({
16
- "id": nb_id,
17
- "name": name,
18
- "created_at": now_iso(),
19
- "updated_at": now_iso(),
20
- })
21
  save_index(username, idx)
 
22
  ensure_tree(username, nb_id)
23
  return nb_id
24
 
 
25
  def rename_notebook(username: str, notebook_id: str, new_name: str):
26
- new_name = safe_name(new_name)
27
- if not new_name:
28
- raise gr.Error("Notebook name cannot be empty.")
29
  idx = load_index(username)
30
- found = False
31
- for nb in idx.get("notebooks", []):
32
  if nb["id"] == notebook_id:
33
  nb["name"] = new_name
34
- nb["updated_at"] = now_iso()
35
- found = True
36
  break
37
- if not found:
38
- raise gr.Error("Notebook not found.")
39
  save_index(username, idx)
40
 
 
41
  def delete_notebook(username: str, notebook_id: str):
42
- import shutil, os
43
- from src.storage.paths import nb_root
 
 
 
 
44
  idx = load_index(username)
45
- idx["notebooks"] = [n for n in idx.get("notebooks", []) if n["id"] != notebook_id]
46
- save_index(username, idx)
47
- base = nb_root(username, notebook_id)
48
- if os.path.exists(base):
49
- shutil.rmtree(base, ignore_errors=True)
 
1
+ import os
2
+ import shutil
3
  import uuid
 
 
 
 
4
  from datetime import datetime
5
 
6
+ from src.storage.index_store import load_index, save_index
7
+ from src.storage.paths import nb_root, ensure_tree
8
+
9
+
10
+ def _now():
11
  return datetime.utcnow().isoformat() + "Z"
12
 
13
+
14
  def create_notebook(username: str, name: str) -> str:
 
 
15
  nb_id = str(uuid.uuid4())
16
+
17
+ idx = load_index(username)
18
+ idx.append({"id": nb_id, "name": name or "Untitled", "created_at": _now(), "updated_at": _now()})
 
 
 
19
  save_index(username, idx)
20
+
21
  ensure_tree(username, nb_id)
22
  return nb_id
23
 
24
+
25
  def rename_notebook(username: str, notebook_id: str, new_name: str):
 
 
 
26
  idx = load_index(username)
27
+ for nb in idx:
 
28
  if nb["id"] == notebook_id:
29
  nb["name"] = new_name
30
+ nb["updated_at"] = _now()
 
31
  break
 
 
32
  save_index(username, idx)
33
 
34
+
35
  def delete_notebook(username: str, notebook_id: str):
36
+ # remove folder
37
+ p = nb_root(username, notebook_id)
38
+ if os.path.exists(p):
39
+ shutil.rmtree(p, ignore_errors=True)
40
+
41
+ # remove from index
42
  idx = load_index(username)
43
+ idx = [nb for nb in idx if nb.get("id") != notebook_id]
44
+ save_index(username, idx)
 
 
 
src/backend/rag.py CHANGED
@@ -8,81 +8,61 @@ EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
8
  def retrieve(username: str, notebook_id: str, query: str, k=6):
9
  col = get_collection(username, notebook_id)
10
 
11
- qemb = EMBED_MODEL.encode(
12
- [query],
13
- normalize_embeddings=True
14
- ).tolist()
15
 
16
- # Compatible with HF Chroma version
17
  res = col.query(
18
  query_embeddings=qemb,
19
  n_results=k,
20
- include=["documents", "metadatas", "distances"]
21
  )
22
 
23
- # Safe extraction (prevents crashes)
24
  ids = res.get("ids", [[]])[0]
25
  docs = res.get("documents", [[]])[0]
26
  mets = res.get("metadatas", [[]])[0]
27
  dists = res.get("distances", [[]])[0]
28
 
29
  hits = []
30
-
31
  for i in range(len(docs)):
32
- hits.append({
33
- "id": ids[i] if i < len(ids) else f"chunk_{i}",
34
- "doc": docs[i],
35
- "meta": mets[i] if i < len(mets) else {},
36
- "distance": dists[i] if i < len(dists) else None
37
- })
38
-
 
39
  return hits
40
 
41
 
42
  def format_sources(hits):
43
  lines = []
44
-
45
  for i, h in enumerate(hits, start=1):
46
- m = h["meta"] or {}
47
-
48
  loc = ""
49
  if m.get("page"):
50
  loc = f"p.{m['page']}"
51
-
52
  if m.get("slide"):
53
  loc = f"slide {m['slide']}"
54
-
55
- title = m.get("source_title", "source")
56
-
57
  lines.append(f"[S{i}] {title} {loc}".strip())
58
-
59
  return "\n".join(lines)
60
 
61
 
62
  def context_block(hits):
63
  blocks = []
64
-
65
  for i, h in enumerate(hits, start=1):
66
- m = h["meta"] or {}
67
-
68
  loc = ""
69
  if m.get("page"):
70
  loc = f"(page {m['page']})"
71
-
72
  if m.get("slide"):
73
  loc = f"(slide {m['slide']})"
74
-
75
- title = m.get("source_title", "source")
76
-
77
- blocks.append(
78
- f"[S{i}] {title} {loc}\n{h['doc']}"
79
- )
80
-
81
  return "\n\n---\n\n".join(blocks)
82
 
83
 
84
  def rag_answer(query: str, hits):
85
-
86
  if not hits:
87
  return "Not found in the provided sources. (No indexed chunks yet.)"
88
 
@@ -90,31 +70,19 @@ def rag_answer(query: str, hits):
90
  You are a research assistant.
91
 
92
  Answer ONLY using the sources below.
93
-
94
  Every non-trivial claim must end with citations like [S1] or [S2].
95
-
96
- If not present in sources say:
97
- Not found in the provided sources.
98
 
99
  Question:
100
  {query}
101
 
102
-
103
  Sources list:
104
  {format_sources(hits)}
105
 
106
-
107
  Source excerpts:
108
  {context_block(hits)}
109
 
110
-
111
  Answer with citations:
112
  """
113
-
114
- ans = llm_generate(
115
- prompt,
116
- max_new_tokens=450,
117
- temperature=0.2
118
- )
119
-
120
  return f"{ans}\n\nSources:\n{format_sources(hits)}"
 
8
  def retrieve(username: str, notebook_id: str, query: str, k=6):
9
  col = get_collection(username, notebook_id)
10
 
11
+ qemb = EMBED_MODEL.encode([query], normalize_embeddings=True).tolist()
 
 
 
12
 
 
13
  res = col.query(
14
  query_embeddings=qemb,
15
  n_results=k,
16
+ include=["documents", "metadatas", "distances"],
17
  )
18
 
 
19
  ids = res.get("ids", [[]])[0]
20
  docs = res.get("documents", [[]])[0]
21
  mets = res.get("metadatas", [[]])[0]
22
  dists = res.get("distances", [[]])[0]
23
 
24
  hits = []
 
25
  for i in range(len(docs)):
26
+ hits.append(
27
+ {
28
+ "id": ids[i] if i < len(ids) else f"chunk_{i}",
29
+ "doc": docs[i],
30
+ "meta": mets[i] if i < len(mets) else {},
31
+ "distance": dists[i] if i < len(dists) else None,
32
+ }
33
+ )
34
  return hits
35
 
36
 
37
  def format_sources(hits):
38
  lines = []
 
39
  for i, h in enumerate(hits, start=1):
40
+ m = h.get("meta") or {}
41
+ title = m.get("source_title", "source")
42
  loc = ""
43
  if m.get("page"):
44
  loc = f"p.{m['page']}"
 
45
  if m.get("slide"):
46
  loc = f"slide {m['slide']}"
 
 
 
47
  lines.append(f"[S{i}] {title} {loc}".strip())
 
48
  return "\n".join(lines)
49
 
50
 
51
  def context_block(hits):
52
  blocks = []
 
53
  for i, h in enumerate(hits, start=1):
54
+ m = h.get("meta") or {}
55
+ title = m.get("source_title", "source")
56
  loc = ""
57
  if m.get("page"):
58
  loc = f"(page {m['page']})"
 
59
  if m.get("slide"):
60
  loc = f"(slide {m['slide']})"
61
+ blocks.append(f"[S{i}] {title} {loc}\n{h.get('doc','')}")
 
 
 
 
 
 
62
  return "\n\n---\n\n".join(blocks)
63
 
64
 
65
  def rag_answer(query: str, hits):
 
66
  if not hits:
67
  return "Not found in the provided sources. (No indexed chunks yet.)"
68
 
 
70
  You are a research assistant.
71
 
72
  Answer ONLY using the sources below.
 
73
  Every non-trivial claim must end with citations like [S1] or [S2].
74
+ If not present in sources, say: Not found in the provided sources.
 
 
75
 
76
  Question:
77
  {query}
78
 
 
79
  Sources list:
80
  {format_sources(hits)}
81
 
 
82
  Source excerpts:
83
  {context_block(hits)}
84
 
 
85
  Answer with citations:
86
  """
87
+ ans = llm_generate(prompt, max_new_tokens=450, temperature=0.2)
 
 
 
 
 
 
88
  return f"{ans}\n\nSources:\n{format_sources(hits)}"
src/storage/index_store.py CHANGED
@@ -1,31 +1,32 @@
1
- import os, json
2
- from datetime import datetime
3
- from .paths import user_root, index_path, ensure_tree
4
 
5
- def now_iso():
6
- return datetime.utcnow().isoformat() + "Z"
7
 
8
- def load_index(username: str) -> dict:
9
  os.makedirs(user_root(username), exist_ok=True)
10
- p = index_path(username)
11
- if not os.path.exists(p):
12
- with open(p, "w", encoding="utf-8") as f:
13
- json.dump({"notebooks": []}, f, indent=2)
14
- with open(p, "r", encoding="utf-8") as f:
 
15
  return json.load(f)
16
 
17
- def save_index(username: str, idx: dict):
18
- with open(index_path(username), "w", encoding="utf-8") as f:
19
- json.dump(idx, f, indent=2)
20
 
21
- def list_notebooks(username: str):
22
- idx = load_index(username)
23
- return [(nb["name"], nb["id"]) for nb in idx.get("notebooks", [])]
 
 
24
 
25
- def touch_updated(username: str, notebook_id: str):
 
 
 
 
26
  idx = load_index(username)
27
- for nb in idx.get("notebooks", []):
28
- if nb["id"] == notebook_id:
29
- nb["updated_at"] = now_iso()
30
- break
31
- save_index(username, idx)
 
1
+ import json
2
+ import os
3
+ from .paths import user_root, index_path
4
 
 
 
5
 
6
+ def load_index(username: str):
7
  os.makedirs(user_root(username), exist_ok=True)
8
+ ip = index_path(username)
9
+ if not os.path.exists(ip):
10
+ with open(ip, "w", encoding="utf-8") as f:
11
+ f.write("[]")
12
+ return []
13
+ with open(ip, "r", encoding="utf-8") as f:
14
  return json.load(f)
15
 
 
 
 
16
 
17
+ def save_index(username: str, items):
18
+ os.makedirs(user_root(username), exist_ok=True)
19
+ ip = index_path(username)
20
+ with open(ip, "w", encoding="utf-8") as f:
21
+ json.dump(items, f, ensure_ascii=False, indent=2)
22
 
23
+
24
+ def list_notebooks(username: str):
25
+ """
26
+ Returns list of tuples (label, id) for gr.Dropdown choices.
27
+ """
28
  idx = load_index(username)
29
+ out = []
30
+ for nb in idx:
31
+ out.append((nb.get("name", "Untitled"), nb.get("id")))
32
+ return out
 
src/storage/paths.py CHANGED
@@ -1,18 +1,41 @@
1
  import os
2
 
 
3
  DATA_ROOT = os.getenv("DATA_ROOT", "./data")
4
 
5
- def user_root(username: str):
6
- return os.path.join(DATA_ROOT, username)
7
 
8
- def nb_root(username: str, notebook_id: str):
 
 
 
 
 
 
 
 
 
 
9
  return os.path.join(user_root(username), notebook_id)
10
 
11
- def ensure_tree(username: str, notebook_id: str):
12
 
 
 
13
  base = nb_root(username, notebook_id)
14
 
 
15
  os.makedirs(base, exist_ok=True)
 
16
  os.makedirs(os.path.join(base, "files_raw"), exist_ok=True)
17
  os.makedirs(os.path.join(base, "files_extracted"), exist_ok=True)
18
- os.makedirs(os.path.join(base, "artifacts"), exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
 
3
+ # HF Spaces uses /data; locally it can fall back to ./data
4
  DATA_ROOT = os.getenv("DATA_ROOT", "./data")
5
 
 
 
6
 
7
+ def user_root(username: str) -> str:
8
+ return os.path.join(DATA_ROOT, "users", username, "notebooks")
9
+
10
+
11
+ def index_path(username: str) -> str:
12
+ # /data/users/<username>/notebooks/index.json
13
+ return os.path.join(user_root(username), "index.json")
14
+
15
+
16
+ def nb_root(username: str, notebook_id: str) -> str:
17
+ # /data/users/<username>/notebooks/<uuid>/
18
  return os.path.join(user_root(username), notebook_id)
19
 
 
20
 
21
+ def ensure_tree(username: str, notebook_id: str):
22
+ # Ensure notebook folder layout exists
23
  base = nb_root(username, notebook_id)
24
 
25
+ os.makedirs(user_root(username), exist_ok=True)
26
  os.makedirs(base, exist_ok=True)
27
+
28
  os.makedirs(os.path.join(base, "files_raw"), exist_ok=True)
29
  os.makedirs(os.path.join(base, "files_extracted"), exist_ok=True)
30
+ os.makedirs(os.path.join(base, "chroma"), exist_ok=True)
31
+
32
+ os.makedirs(os.path.join(base, "chat"), exist_ok=True)
33
+ os.makedirs(os.path.join(base, "artifacts", "reports"), exist_ok=True)
34
+ os.makedirs(os.path.join(base, "artifacts", "quizzes"), exist_ok=True)
35
+ os.makedirs(os.path.join(base, "artifacts", "podcasts"), exist_ok=True)
36
+
37
+ # Make sure index.json exists
38
+ ip = index_path(username)
39
+ if not os.path.exists(ip):
40
+ with open(ip, "w", encoding="utf-8") as f:
41
+ f.write("[]")