Azizahalq committed on
Commit
71a256d
·
1 Parent(s): 8f2affe

Update rag_mini.py

Browse files
Files changed (1) hide show
  1. rag_mini.py +21 -8
rag_mini.py CHANGED
@@ -3,14 +3,18 @@ import os, re, uuid, textwrap, hashlib, json, shutil
3
  from pathlib import Path
4
  from typing import Iterable, List, Tuple, Dict, Any
5
 
 
 
 
6
  # --- Paths relative to repo ---
7
  ROOT_DIR = Path(__file__).parent.resolve()
8
  DATA_DIR = ROOT_DIR / "MaterialMind" / "sources"
9
  INDEX_DIR = ROOT_DIR / "MaterialMind" / "index" / "chroma_v3"
10
  MANIFEST = ROOT_DIR / "MaterialMind" / "index" / "manifest.json"
11
 
12
- DEFAULT_TOPK = 5
13
- EMB_MODEL = "BAAI/bge-small-en-v1.5"
 
14
 
15
  def ensure_dirs():
16
  DATA_DIR.mkdir(parents=True, exist_ok=True)
@@ -41,7 +45,6 @@ def embed_texts(texts: List[str]) -> List[List[float]]:
41
 
42
  # --- Loaders ---
43
  def normalize_spaces(text: str) -> str:
44
- import re
45
  text = text.replace("\r", "\n")
46
  text = re.sub(r"[ \t]+", " ", text)
47
  text = re.sub(r"\n{3,}", "\n\n", text)
@@ -122,7 +125,7 @@ def get_collection(reset: bool = False):
122
  if reset:
123
  try: client.delete_collection("materialmind")
124
  except Exception: pass
125
- return client.get_or_create_collection(name="materialmind") # embeddings provided manually
126
 
127
  def add_batch(col, ids, docs, metas):
128
  embs = embed_texts(docs)
@@ -145,7 +148,6 @@ def build_index(batch_size=256) -> int:
145
 
146
  # --- Incremental update with manifest ---
147
  def file_sig(path: Path):
148
- import hashlib
149
  h = hashlib.sha1()
150
  try:
151
  with open(path, "rb") as f:
@@ -203,7 +205,6 @@ def update_index():
203
 
204
  # --- Search ---
205
  def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
206
- import chromadb
207
  col = get_collection(reset=False)
208
  qvec = embed_texts([query])[0]
209
  res = col.query(query_embeddings=[qvec], n_results=k, include=["documents", "metadatas"])
@@ -218,7 +219,7 @@ def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
218
  # --- HF dataset bootstrap ---
219
  def bootstrap_corpus_and_index():
220
  """
221
- Download the dataset Azizahalq/materialmind-corpus into DATA_DIR,
222
  then build or update the local vector index.
223
  """
224
  ensure_dirs()
@@ -230,7 +231,6 @@ def bootstrap_corpus_and_index():
230
  local_dir=DATA_DIR, local_dir_use_symlinks=False,
231
  ignore_patterns=["*.ipynb", ".*", "__pycache__/*"]
232
  )
233
- # If index is empty, build fresh; otherwise update
234
  if not any(INDEX_DIR.iterdir()):
235
  n = build_index()
236
  print(f"[BUILD] indexed {n} chunks")
@@ -238,3 +238,16 @@ def bootstrap_corpus_and_index():
238
  update_index()
239
  except Exception as e:
240
  print(f"[WARN] dataset bootstrap failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from pathlib import Path
4
  from typing import Iterable, List, Tuple, Dict, Any
5
 
6
+ # Make Chroma quieter on Spaces
7
+ os.environ.setdefault("CHROMADB_DISABLE_TELEMETRY", "1")
8
+
9
  # --- Paths relative to repo ---
10
  ROOT_DIR = Path(__file__).parent.resolve()
11
  DATA_DIR = ROOT_DIR / "MaterialMind" / "sources"
12
  INDEX_DIR = ROOT_DIR / "MaterialMind" / "index" / "chroma_v3"
13
  MANIFEST = ROOT_DIR / "MaterialMind" / "index" / "manifest.json"
14
 
15
+ DEFAULT_TOPK = int(os.getenv("DEFAULT_TOPK", "5"))
16
+ DEFAULT_MODEL = os.getenv("LLM_MODEL", "HuggingFaceH4/zephyr-7b-beta")
17
+ EMB_MODEL = os.getenv("EMB_MODEL", "BAAI/bge-small-en-v1.5")
18
 
19
  def ensure_dirs():
20
  DATA_DIR.mkdir(parents=True, exist_ok=True)
 
45
 
46
  # --- Loaders ---
47
  def normalize_spaces(text: str) -> str:
 
48
  text = text.replace("\r", "\n")
49
  text = re.sub(r"[ \t]+", " ", text)
50
  text = re.sub(r"\n{3,}", "\n\n", text)
 
125
  if reset:
126
  try: client.delete_collection("materialmind")
127
  except Exception: pass
128
+ return client.get_or_create_collection(name="materialmind") # we provide embeddings manually
129
 
130
  def add_batch(col, ids, docs, metas):
131
  embs = embed_texts(docs)
 
148
 
149
  # --- Incremental update with manifest ---
150
  def file_sig(path: Path):
 
151
  h = hashlib.sha1()
152
  try:
153
  with open(path, "rb") as f:
 
205
 
206
  # --- Search ---
207
  def search(query: str, k: int = DEFAULT_TOPK) -> List[Tuple[str, str]]:
 
208
  col = get_collection(reset=False)
209
  qvec = embed_texts([query])[0]
210
  res = col.query(query_embeddings=[qvec], n_results=k, include=["documents", "metadatas"])
 
219
  # --- HF dataset bootstrap ---
220
  def bootstrap_corpus_and_index():
221
  """
222
+ Download dataset (default: Azizahalq/materialmind-corpus) into DATA_DIR,
223
  then build or update the local vector index.
224
  """
225
  ensure_dirs()
 
231
  local_dir=DATA_DIR, local_dir_use_symlinks=False,
232
  ignore_patterns=["*.ipynb", ".*", "__pycache__/*"]
233
  )
 
234
  if not any(INDEX_DIR.iterdir()):
235
  n = build_index()
236
  print(f"[BUILD] indexed {n} chunks")
 
238
  update_index()
239
  except Exception as e:
240
  print(f"[WARN] dataset bootstrap failed: {e}")
241
+
242
+ def ensure_ready():
243
+ """
244
+ Ensure folders exist; if sources/ is empty, pull the dataset and build index.
245
+ """
246
+ ensure_dirs()
247
+ is_empty = not any(DATA_DIR.glob("*"))
248
+ if is_empty:
249
+ print("[BOOTSTRAP] sources/ is empty → pulling dataset and indexing…")
250
+ bootstrap_corpus_and_index()
251
+ elif not any(INDEX_DIR.glob("*")):
252
+ print("[BOOTSTRAP] index folder empty → building from existing sources/")
253
+ build_index()