Hitakshi26 committed on
Commit
ca39256
·
1 Parent(s): c60446c

Fixed Storage

Browse files
src/backend/ingest.py CHANGED
@@ -65,33 +65,63 @@ def extract_url(url: str):
65
  def upsert_extracted(username: str, notebook_id: str, source_title: str, source_id: str, extracted_items: list[dict]) -> int:
66
  col = get_collection(username, notebook_id)
67
  ids, docs, metas = [], [], []
 
68
  for item in extracted_items:
 
 
 
69
  for j, ch in enumerate(simple_chunk(item["text"])):
70
  ids.append(f"{source_id}::chunk{j}")
71
  docs.append(ch)
72
- metas.append({
73
- "source_title": source_title,
74
- "source_id": source_id,
75
- "page": item.get("page"),
76
- "slide": item.get("slide"),
77
- })
 
 
 
 
 
 
 
78
  if not docs:
79
  return 0
 
80
  embs = EMBED_MODEL.encode(docs, normalize_embeddings=True).tolist()
81
  col.upsert(ids=ids, documents=docs, metadatas=metas, embeddings=embs)
82
  return len(docs)
83
 
84
- def ingest_files(username: str, notebook_id: str, files) -> int:
 
 
 
 
 
 
85
  ensure_tree(username, notebook_id)
86
  raw_dir = os.path.join(nb_root(username, notebook_id), "files_raw")
87
  ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
88
  added = 0
89
 
90
- for f in (files or []):
91
- fp = getattr(f, "name", None)
92
- if not fp:
93
- continue
 
 
 
 
 
 
 
 
 
 
 
94
 
 
95
  dest = os.path.join(raw_dir, os.path.basename(fp))
96
  pathlib.Path(dest).write_bytes(pathlib.Path(fp).read_bytes())
97
 
@@ -105,15 +135,17 @@ def ingest_files(username: str, notebook_id: str, files) -> int:
105
  else:
106
  continue
107
 
 
108
  ex_path = os.path.join(ex_dir, os.path.basename(dest) + ".txt")
109
- with open(ex_path, "w", encoding="utf-8") as ftxt:
110
  for item in extracted:
111
- loc = ""
112
- if item.get("page"):
113
  loc = f"page={item.get('page')}"
114
- elif item.get("slide"):
115
  loc = f"slide={item.get('slide')}"
116
- ftxt.write(f"\n--- {loc} ---\n{item['text']}\n")
 
 
117
 
118
  added += upsert_extracted(
119
  username,
@@ -124,6 +156,8 @@ def ingest_files(username: str, notebook_id: str, files) -> int:
124
  )
125
 
126
  return added
 
 
127
  def ingest_url(username: str, notebook_id: str, url: str) -> int:
128
  ensure_tree(username, notebook_id)
129
  extracted = extract_url(url)
 
65
  def upsert_extracted(username: str, notebook_id: str, source_title: str, source_id: str, extracted_items: list[dict]) -> int:
66
  col = get_collection(username, notebook_id)
67
  ids, docs, metas = [], [], []
68
+
69
  for item in extracted_items:
70
+ page = item.get("page", None)
71
+ slide = item.get("slide", None)
72
+
73
  for j, ch in enumerate(simple_chunk(item["text"])):
74
  ids.append(f"{source_id}::chunk{j}")
75
  docs.append(ch)
76
+
77
+ meta = {
78
+ "source_title": str(source_title),
79
+ "source_id": str(source_id),
80
+ }
81
+ # IMPORTANT: Chroma metadata cannot include None
82
+ if page is not None:
83
+ meta["page"] = int(page)
84
+ if slide is not None:
85
+ meta["slide"] = int(slide)
86
+
87
+ metas.append(meta)
88
+
89
  if not docs:
90
  return 0
91
+
92
  embs = EMBED_MODEL.encode(docs, normalize_embeddings=True).tolist()
93
  col.upsert(ids=ids, documents=docs, metadatas=metas, embeddings=embs)
94
  return len(docs)
95
 
96
+
97
+ def ingest_files(username: str, notebook_id: str, filepaths) -> int:
98
+ """
99
+ filepaths may be:
100
+ - list[str]
101
+ - list[Gradio file objects]
102
+ """
103
  ensure_tree(username, notebook_id)
104
  raw_dir = os.path.join(nb_root(username, notebook_id), "files_raw")
105
  ex_dir = os.path.join(nb_root(username, notebook_id), "files_extracted")
106
  added = 0
107
 
108
+ # Normalize gradio file objects -> local paths
109
+ normalized_paths = []
110
+ if isinstance(filepaths, (list, tuple)):
111
+ for f in filepaths:
112
+ if f is None:
113
+ continue
114
+ # Gradio may pass objects with .name
115
+ if hasattr(f, "name") and isinstance(f.name, str):
116
+ normalized_paths.append(f.name)
117
+ elif isinstance(f, str):
118
+ normalized_paths.append(f)
119
+ elif isinstance(f, dict) and "name" in f:
120
+ normalized_paths.append(f["name"])
121
+ elif isinstance(filepaths, str):
122
+ normalized_paths = [filepaths]
123
 
124
+ for fp in normalized_paths:
125
  dest = os.path.join(raw_dir, os.path.basename(fp))
126
  pathlib.Path(dest).write_bytes(pathlib.Path(fp).read_bytes())
127
 
 
135
  else:
136
  continue
137
 
138
+ # save extracted
139
  ex_path = os.path.join(ex_dir, os.path.basename(dest) + ".txt")
140
+ with open(ex_path, "w", encoding="utf-8") as f:
141
  for item in extracted:
142
+ if item.get("page") is not None:
 
143
  loc = f"page={item.get('page')}"
144
+ elif item.get("slide") is not None:
145
  loc = f"slide={item.get('slide')}"
146
+ else:
147
+ loc = ""
148
+ f.write(f"\n--- {loc} ---\n{item['text']}\n")
149
 
150
  added += upsert_extracted(
151
  username,
 
156
  )
157
 
158
  return added
159
+
160
+
161
  def ingest_url(username: str, notebook_id: str, url: str) -> int:
162
  ensure_tree(username, notebook_id)
163
  extracted = extract_url(url)
src/backend/rag.py CHANGED
@@ -11,10 +11,10 @@ def retrieve(username: str, notebook_id: str, query: str, k=6):
11
  qemb = EMBED_MODEL.encode([query], normalize_embeddings=True).tolist()
12
 
13
  res = col.query(
14
- query_embeddings=qemb,
15
- n_results=k,
16
- include=["documents", "metadatas", "distances"],
17
- )
18
 
19
  ids = res.get("ids", [[]])[0]
20
  docs = res.get("documents", [[]])[0]
 
11
  qemb = EMBED_MODEL.encode([query], normalize_embeddings=True).tolist()
12
 
13
  res = col.query(
14
+ query_embeddings=qemb,
15
+ n_results=k,
16
+ include=["documents", "metadatas", "distances"]
17
+ )
18
 
19
  ids = res.get("ids", [[]])[0]
20
  docs = res.get("documents", [[]])[0]
src/storage/chroma_store.py CHANGED
@@ -1,27 +1,19 @@
1
  import os
2
  import chromadb
3
- from src.storage.paths import nb_root
4
-
5
-
6
- def chroma_client(username, notebook_id):
7
 
8
- persist_dir = os.path.join(
9
- nb_root(username, notebook_id),
10
- "chroma"
11
- )
12
 
13
- return chromadb.Client(
14
- chromadb.config.Settings(
15
- persist_directory=persist_dir,
16
- anonymized_telemetry=False
17
- )
18
- )
19
 
 
 
 
20
 
21
- def get_collection(username, notebook_id):
 
 
22
 
 
23
  client = chroma_client(username, notebook_id)
24
-
25
- return client.get_or_create_collection(
26
- name="notebook"
27
- )
 
1
  import os
2
  import chromadb
 
 
 
 
3
 
4
+ from src.storage.paths import nb_root
 
 
 
5
 
6
# One client per persist directory, reused across calls; this avoids Chroma's
# "ephemeral with different settings" error.
_CLIENTS: dict[str, chromadb.PersistentClient] = {}


def chroma_client(username: str, notebook_id: str) -> chromadb.PersistentClient:
    """Return the cached persistent Chroma client for a notebook.

    The client stores its data in the ``chroma`` sub-directory of the
    notebook root; the directory is created if it does not exist yet.
    """
    store_path = os.path.join(nb_root(username, notebook_id), "chroma")
    os.makedirs(store_path, exist_ok=True)

    client = _CLIENTS.get(store_path)
    if client is None:
        # First request for this directory: create the client and remember it.
        client = chromadb.PersistentClient(path=store_path)
        _CLIENTS[store_path] = client
    return client
16
 
17
def get_collection(username: str, notebook_id: str, name: str = "chunks"):
    """Return the notebook's collection called ``name``, creating it if absent."""
    return chroma_client(username, notebook_id).get_or_create_collection(name=name)