Azizahalq committed on
Commit
e2d584e
·
verified ·
1 Parent(s): 037781c

Update rag_mini.py

Browse files
Files changed (1) hide show
  1. rag_mini.py +46 -41
rag_mini.py CHANGED
@@ -10,17 +10,29 @@ from pathlib import Path
10
  from typing import List, Tuple
11
 
12
  # ---------- Paths ----------
13
- ROOT_DIR = Path(__file__).parent.resolve()
14
- DATA_ROOT = ROOT_DIR / "MaterialMind"
15
- DATA_DIR = DATA_ROOT / "sources" # kept for compatibility; not required
16
- DEFAULT_TOPK = 5
17
- DEFAULT_MODEL = "" # kept for compatibility with imports
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # Base folder where the Chroma catalog (or its subfolder) lives
20
- _BASE_INDEX_DIR = Path(os.getenv("INDEX_DIR", str(DATA_ROOT / "index" / "chroma_v3"))).resolve()
 
21
 
22
  # ---------- Embedding ----------
23
- EMB_MODEL = "BAAI/bge-small-en-v1.5"
24
  _EMBED_FAST = None
25
  _EMBED_ST = None
26
 
@@ -41,45 +53,44 @@ def _init_embedder():
41
  def _embed(texts: List[str]) -> List[List[float]]:
42
  _init_embedder()
43
  if _EMBED_FAST is not None:
44
- # FastEmbed returns a generator of np arrays
45
- return [v for v in _EMBED_FAST.embed(texts)]
46
  return _EMBED_ST.encode(texts, normalize_embeddings=True).tolist()
47
 
48
  # ---------- Catalog discovery ----------
49
  def _has_catalog(dirpath: Path) -> bool:
50
  """Heuristic check for a Chroma catalog in dirpath."""
51
- key_files = [
 
 
52
  "chroma.sqlite3",
 
 
53
  "index_metadata.pickle",
54
  "data_level0.bin",
55
  "length.bin",
56
  "link_lists.bin",
57
  "header.bin",
58
- "chroma.sqlite",
59
- "chroma-collections.parquet",
60
- ]
61
  return any((dirpath / kf).exists() for kf in key_files)
62
 
63
  def _find_catalog_dir(base: Path) -> Path:
64
- """
65
- Return the directory that actually contains the Chroma catalog.
66
- Handles a nested UUID subfolder (as in your screenshot).
67
- """
68
- # direct
69
  if _has_catalog(base):
70
  return base
71
 
72
- # look at one or two levels deep for chroma.sqlite3 (or other catalog markers)
73
- hits = list(base.rglob("chroma.sqlite3"))
74
- if hits:
75
- return hits[0].parent
76
-
77
- # fallback to any of the other known files
78
- for pat in ["**/chroma-collections.parquet", "**/index_metadata.pickle",
79
- "**/data_level0.bin", "**/chroma.sqlite"]:
80
- alt = list(base.rglob(pat))
81
- if alt:
82
- return alt[0].parent
83
 
84
  return base # may be empty; ensure_ready() will warn
85
 
@@ -89,7 +100,7 @@ INDEX_DIR = _find_catalog_dir(_BASE_INDEX_DIR)
89
  def _get_client_and_collection():
90
  """
91
  Open the persistent Chroma client on INDEX_DIR and return an existing collection.
92
- If no collection is listed, fall back to 'materialmind'.
93
  """
94
  import chromadb
95
  client = chromadb.PersistentClient(path=str(INDEX_DIR))
@@ -99,23 +110,20 @@ def _get_client_and_collection():
99
  # Use the first existing collection (the one you built locally)
100
  return client, client.get_collection(cols[0].name)
101
  except Exception as e:
102
- print(f"[RAG] Warning: list_collections failed: {e}")
103
 
104
  # Fallback: create/get 'materialmind'
105
  return client, client.get_or_create_collection(name="materialmind")
106
 
107
  # ---------- Public API ----------
108
  def ensure_ready() -> None:
109
- """
110
- Called by app.py at startup. Verifies the index path and prints a small stat.
111
- """
112
  INDEX_DIR.mkdir(parents=True, exist_ok=True)
113
  if not _has_catalog(INDEX_DIR):
114
  print(f"[RAG] WARNING: No Chroma catalog found in: {INDEX_DIR}")
115
- print(" If you uploaded a nested folder (UUID), set INDEX_DIR env/secret to that folder,")
116
- print(" or keep the layout: MaterialMind/index/chroma_v3/<uuid>/chroma.sqlite3")
117
  return
118
-
119
  try:
120
  stats = index_stats()
121
  print(f"[RAG] Index OK at {INDEX_DIR} — chunks={stats.get('count')}")
@@ -130,7 +138,6 @@ def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
130
  if not _has_catalog(INDEX_DIR):
131
  print("[RAG] search aborted: catalog not found")
132
  return []
133
-
134
  try:
135
  _, col = _get_client_and_collection()
136
  qvec = _embed([query])[0]
@@ -158,9 +165,7 @@ def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
158
  return hits
159
 
160
  def index_stats() -> dict:
161
- """
162
- Return simple stats {count: int}. Safe if catalog missing.
163
- """
164
  if not _has_catalog(INDEX_DIR):
165
  return {"count": 0, "note": f"no catalog at {INDEX_DIR}"}
166
  try:
 
10
  from typing import List, Tuple
11
 
12
  # ---------- Paths ----------
13
# Directory that contains this module; the data tree is located relative to it.
ROOT_DIR = Path(__file__).parent.resolve()
# Top-level folder of the MaterialMind data, a sibling of this module.
DATA_ROOT = ROOT_DIR.joinpath("MaterialMind")
# Default number of results for search() when the caller does not pass k.
DEFAULT_TOPK = 5
# Not used for retrieval; retained so existing imports of this name still work.
DEFAULT_MODEL = ""
17
+
18
def _normalize_to_dir(p: Path) -> Path:
    """Map a path that may point at a catalog file to its containing directory.

    Args:
        p: A directory, or a file inside it (e.g. ``.../chroma.sqlite3``).
            The path does not have to exist.

    Returns:
        ``p.parent`` when *p* is an existing file, or when *p* does not exist
        as a directory but its name ends in ``.sqlite``/``.sqlite3``;
        otherwise *p* unchanged.
    """
    try:
        if p.is_file():
            return p.parent
        if p.is_dir():
            # An existing directory is always returned as-is, even if its
            # name happens to look like a SQLite file.
            return p
    except (OSError, ValueError):
        # The path could not be inspected (permissions, invalid characters).
        # Fall through to the purely name-based guess below.
        pass
    # Nothing usable on disk: a name ending in .sqlite/.sqlite3 is treated as
    # a file path, so the caller gets the folder it would live in.
    if p.name.endswith((".sqlite3", ".sqlite")):
        return p.parent
    return p
29
 
30
  # Base folder where the Chroma catalog (or its subfolder) lives
31
# INDEX_DIR may be unset, empty, or whitespace-only. In all of those cases use
# the bundled default: Path("") would resolve to the current working
# directory, which would then be searched recursively for a catalog.
_index_dir_env = (os.getenv("INDEX_DIR") or "").strip()
_base_from_env = Path(_index_dir_env) if _index_dir_env else DATA_ROOT / "index" / "chroma_v3"
# Accept a path to a catalog file as well as a folder, then make it absolute.
_BASE_INDEX_DIR = _normalize_to_dir(_base_from_env).resolve()
33
 
34
  # ---------- Embedding ----------
35
# Identifier of the embedding model.
EMB_MODEL = "BAAI/bge-small-en-v1.5"
# Embedder handles read by _embed(); both start empty.
# NOTE(review): presumably populated by _init_embedder() — confirm there.
_EMBED_FAST = _EMBED_ST = None
38
 
 
53
def _embed(texts: List[str]) -> List[List[float]]:
    """Embed *texts* and return one plain list of floats per input text.

    Uses the ``_EMBED_FAST`` backend when it is set, otherwise ``_EMBED_ST``
    with ``normalize_embeddings=True``.
    """
    _init_embedder()
    fast_backend = _EMBED_FAST
    if fast_backend is None:
        encoded = _EMBED_ST.encode(texts, normalize_embeddings=True)
        return encoded.tolist()
    # Each vector from embed() is converted with .tolist() so callers only
    # ever see plain Python lists.
    return [vector.tolist() for vector in fast_backend.embed(texts)]
59
 
60
  # ---------- Catalog discovery ----------
61
def _has_catalog(dirpath: Path) -> bool:
    """Heuristically report whether *dirpath* holds a Chroma catalog.

    True only when *dirpath* is an existing directory that directly contains
    at least one of the known catalog file names; subfolders are not searched.
    """
    # is_dir() is already False for a path that does not exist.
    if not dirpath.is_dir():
        return False
    marker_names = (
        "chroma.sqlite3",
        "chroma.sqlite",
        "chroma-collections.parquet",
        "index_metadata.pickle",
        "data_level0.bin",
        "length.bin",
        "link_lists.bin",
        "header.bin",
    )
    for marker in marker_names:
        if dirpath.joinpath(marker).exists():
            return True
    return False
76
 
77
def _find_catalog_dir(base: Path) -> Path:
    """Return the directory that actually contains the Chroma catalog.

    Args:
        base: The catalog folder, one of its ancestors, or a path to a file
            (which is first reduced to a folder by ``_normalize_to_dir``).

    Returns:
        *base* (normalized) when it holds a catalog itself; otherwise the
        parent of the first catalog marker file found anywhere below it.
        If nothing is found, the normalized *base* is returned unchanged.
    """
    base = _normalize_to_dir(base)
    if _has_catalog(base):
        return base

    # The catalog may sit in a nested subfolder. rglob() is already recursive,
    # so bare file names suffice, and next() stops the walk at the first match
    # instead of materializing every hit in the tree.
    for marker in (
        "chroma.sqlite3",
        "chroma.sqlite",
        "chroma-collections.parquet",
        "index_metadata.pickle",
        "data_level0.bin",
    ):
        hit = next(base.rglob(marker), None)
        if hit is not None:
            return hit.parent

    return base  # may be empty; ensure_ready() will warn
96
 
 
100
  def _get_client_and_collection():
101
  """
102
  Open the persistent Chroma client on INDEX_DIR and return an existing collection.
103
+ If none is listed, fall back to 'materialmind'.
104
  """
105
  import chromadb
106
  client = chromadb.PersistentClient(path=str(INDEX_DIR))
 
110
  # Use the first existing collection (the one you built locally)
111
  return client, client.get_collection(cols[0].name)
112
  except Exception as e:
113
+ print(f"[RAG] list_collections failed: {e}")
114
 
115
  # Fallback: create/get 'materialmind'
116
  return client, client.get_or_create_collection(name="materialmind")
117
 
118
  # ---------- Public API ----------
119
  def ensure_ready() -> None:
120
+ """Called by app.py at startup. Verifies the index path and prints a small stat."""
 
 
121
  INDEX_DIR.mkdir(parents=True, exist_ok=True)
122
  if not _has_catalog(INDEX_DIR):
123
  print(f"[RAG] WARNING: No Chroma catalog found in: {INDEX_DIR}")
124
+ print(" If your files are inside a UUID folder, set INDEX_DIR to that folder,")
125
+ print(" e.g. MaterialMind/index/chroma_v3/22ddc0e5-...-d7a3fde16dd5")
126
  return
 
127
  try:
128
  stats = index_stats()
129
  print(f"[RAG] Index OK at {INDEX_DIR} — chunks={stats.get('count')}")
 
138
  if not _has_catalog(INDEX_DIR):
139
  print("[RAG] search aborted: catalog not found")
140
  return []
 
141
  try:
142
  _, col = _get_client_and_collection()
143
  qvec = _embed([query])[0]
 
165
  return hits
166
 
167
  def index_stats() -> dict:
168
+ """Return simple stats {count: int}. Safe if catalog missing."""
 
 
169
  if not _has_catalog(INDEX_DIR):
170
  return {"count": 0, "note": f"no catalog at {INDEX_DIR}"}
171
  try: