Shubham 10000 committed on
Commit
bf11009
·
1 Parent(s): a3517b2

version 2.0 storage file & requirement changes for sentence-transformers

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. storage.py +175 -99
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  streamlit>=1.20.0
2
  requests>=2.28.0
3
  pypdf>=3.0.0
4
- numpy>=1.23.0
 
 
1
  streamlit>=1.20.0
2
  requests>=2.28.0
3
  pypdf>=3.0.0
4
+ numpy>=1.23.0
5
+ sentence-transformers>=2.2.2
storage.py CHANGED
@@ -13,16 +13,11 @@ logging.basicConfig(level=logging.INFO)
13
 
14
  class VectorIndex:
15
  """
16
- Lightweight vector index using Hugging Face Inference API for embeddings
17
- and NumPy for similarity search. Supports .pdf and .txt uploads.
18
- Stores data under a runtime directory (ephemeral in Spaces).
19
-
20
- To operate:
21
- - Set HF_HUB_TOKEN as a Space Secret (recommended) or env var.
22
- - Default embedding model: "sentence-transformers/all-MiniLM-L6-v2".
23
- - If the HF model is not accessible from the Inference API, either pick a public model
24
- that supports embeddings or enable local fallback (use_local_fallback=True) and
25
- install sentence-transformers in requirements.txt.
26
  """
27
 
28
  def __init__(
@@ -61,6 +56,7 @@ class VectorIndex:
61
  if os.path.exists(self.emb_path):
62
  self.embeddings = np.load(self.emb_path)
63
  if self.embeddings is None:
 
64
  self.embeddings = np.zeros((0, 384), dtype=np.float32)
65
  logger.info(f"Loaded store: {len(self.doc_store)} chunks")
66
  except Exception as e:
@@ -111,93 +107,177 @@ class VectorIndex:
111
  return chunks
112
 
113
  # ---------------- embeddings via HF Inference API ---------------- #
114
- def _get_embeddings_api(self, texts: List[str]) -> List[List[float]]:
115
  """
116
- Use HF Inference API at /models/{model} endpoint.
117
- Returns list of vectors for each input text.
118
-
119
- Raises RuntimeError containing HF response body for easy debugging.
120
  """
121
  import requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  model_path = self.embedding_model
124
- # Use new models endpoint (more robust than pipeline/... path)
125
  url = f"https://api-inference.huggingface.co/models/{model_path}"
126
  headers = {"Content-Type": "application/json"}
127
  if self.hf_token:
128
  headers["Authorization"] = f"Bearer {self.hf_token}"
129
- payload = {"inputs": texts}
130
 
131
- try:
132
- resp = requests.post(url, headers=headers, json=payload, timeout=90)
133
 
134
- # Log and surface HF error responses (status >= 400)
135
- if resp.status_code >= 400:
136
- body_text = None
 
 
 
137
  try:
138
- body_text = resp.json()
139
- except Exception:
140
- body_text = resp.text
141
- logger.error(f"HF Inference error: status={resp.status_code}, body={body_text}")
 
 
142
 
143
- if resp.status_code == 403:
144
- if self.hf_token:
145
- raise RuntimeError(
146
- f"HF Inference API 403 Forbidden. Response: {body_text}. "
147
- "Your HF_HUB_TOKEN may be invalid/expired or lacks model access. "
148
- "Check the token in Space Secrets and the model permissions."
149
- )
150
- else:
151
- raise RuntimeError(
152
- f"HF Inference API 403 Forbidden. Response: {body_text}. "
153
- "Unauthenticated requests may be blocked for this model. Add HF_HUB_TOKEN to Space Secrets."
154
- )
155
- if resp.status_code == 404:
156
- raise RuntimeError(
157
- f"HF Inference API 404 Not Found. Response: {body_text}. Model path may be incorrect: '{model_path}'. "
158
- "Try a public model or verify the model id."
159
- )
160
- raise RuntimeError(f"HF Inference API HTTP {resp.status_code}. Response: {body_text}")
161
-
162
- data = resp.json()
163
- vectors = []
164
- # Normalize possible returned shapes: pooled vector or token vectors
165
- for item in data:
166
- if isinstance(item, list) and item and all(isinstance(x, (int, float)) for x in item):
167
- vectors.append(item)
168
- elif isinstance(item, list) and item and isinstance(item[0], list):
169
- token_vecs = np.asarray(item, dtype=np.float32)
170
- if token_vecs.ndim == 2:
171
- avg = token_vecs.mean(axis=0).tolist()
172
- vectors.append(avg)
173
- else:
174
- vectors.append(token_vecs.flatten().tolist())
175
- else:
176
- raise ValueError("Unexpected embedding format from HF Inference API")
177
- if not vectors or len(vectors) != len(texts):
178
- raise RuntimeError("Embeddings API returned unexpected number of vectors.")
179
- return vectors
180
 
 
 
 
 
 
 
 
 
 
 
 
181
  except Exception as e:
182
- # Attempt local fallback if enabled
183
- if self.use_local_fallback:
184
- try:
185
- from sentence_transformers import SentenceTransformer
186
- except Exception as imp_err:
187
- raise RuntimeError(
188
- f"Embedding API failed: {e}. Local fallback requested but sentence-transformers is not installed: {imp_err}"
189
- )
190
- try:
191
- local_model_name = model_path.split("sentence-transformers/")[-1]
192
- model = SentenceTransformer(local_model_name)
193
- emb = model.encode(texts, convert_to_numpy=True)
194
- return emb.tolist()
195
- except Exception as local_e:
196
- raise RuntimeError(f"Embedding API failed: {e}. Local fallback also failed: {local_e}")
197
- raise RuntimeError(
198
- f"Embedding API failed: {e}. If you see a 403, check HF_HUB_TOKEN and model access. "
199
- "Add HF_HUB_TOKEN to Space Secrets or enable a local fallback with sentence-transformers."
200
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  # ---------------- index operations ---------------- #
203
  def add_file(self, file_path: str, source: str = "user-upload", metadata: dict = None) -> int:
@@ -233,15 +313,13 @@ class VectorIndex:
233
  )
234
  self.embeddings = np.vstack([self.embeddings, vecs])
235
  for j, chunk in enumerate(batch):
236
- self.doc_store.append(
237
- {
238
- "chunk_id": str(uuid.uuid4()),
239
- "content": chunk,
240
- "source": source,
241
- "metadata": metadata or {},
242
- "vector_idx": len(self.doc_store),
243
- }
244
- )
245
  added += len(batch)
246
  self._persist()
247
  logger.info(f"Added {added} chunks from {os.path.basename(file_path)}")
@@ -266,14 +344,12 @@ class VectorIndex:
266
  results = []
267
  for idx in idxs:
268
  entry = self.doc_store[idx]
269
- results.append(
270
- {
271
- "content": entry["content"],
272
- "metadata": entry.get("metadata", {}),
273
- "source": entry.get("source"),
274
- "score": float(sims[idx]),
275
- }
276
- )
277
  return results
278
 
279
  def list_documents(self) -> List[Dict]:
 
13
 
14
  class VectorIndex:
15
  """
16
+ Robust VectorIndex for HF Inference API embeddings with multiple request shape fallbacks
17
+ and optional local sentence-transformers fallback.
18
+
19
+ Usage:
20
+ vi = VectorIndex(storage_dir="/tmp/vector_data", hf_token_env_value=HF_HUB_TOKEN, use_local_fallback=False)
 
 
 
 
 
21
  """
22
 
23
  def __init__(
 
56
  if os.path.exists(self.emb_path):
57
  self.embeddings = np.load(self.emb_path)
58
  if self.embeddings is None:
59
+ # default shape if no embeddings yet
60
  self.embeddings = np.zeros((0, 384), dtype=np.float32)
61
  logger.info(f"Loaded store: {len(self.doc_store)} chunks")
62
  except Exception as e:
 
107
  return chunks
108
 
109
  # ---------------- embeddings via HF Inference API ---------------- #
110
def _call_hf(self, url: str, headers: dict, payload) -> Dict:
    """
    POST *payload* as JSON to the HF Inference models endpoint.

    Args:
        url: full models-endpoint URL for the embedding model.
        headers: HTTP headers (Content-Type, optional Authorization).
        payload: JSON-serializable request body.

    Returns:
        A dict with keys:
          - "status": HTTP status code (int)
          - "body":  parsed JSON body, or the raw text when the body is not JSON
          - "raw":   the underlying requests.Response object
    """
    import requests
    resp = requests.post(url, headers=headers, json=payload, timeout=90)
    # The API normally returns JSON, but some error pages are plain text;
    # fall back to the raw text so the caller can still surface the error.
    try:
        body = resp.json()
    except Exception:
        body = resp.text
    return {"status": resp.status_code, "body": body, "raw": resp}
123
+
124
+ def _parse_embedding_response(self, data, expected_len: int) -> List[List[float]]:
125
+ """
126
+ Parse known embedding shapes from HF response body into list-of-vectors.
127
+ Raises on unexpected formats.
128
+ """
129
+ vectors = []
130
+ # If the model returned a dict containing embeddings under some key, try to find them
131
+ if isinstance(data, dict):
132
+ # common key candidates
133
+ for key in ("embeddings", "embedding", "vectors", "array"):
134
+ if key in data:
135
+ data = data[key]
136
+ break
137
+
138
+ if isinstance(data, list):
139
+ # Case: list of vectors or list of token vectors per input
140
+ # If each item is a list of floats -> direct
141
+ if all(isinstance(item, list) and item and all(isinstance(x, (int, float)) for x in item) for item in data):
142
+ # Might be list-of-vectors for batch
143
+ if len(data) == expected_len:
144
+ return [list(map(float, v)) for v in data]
145
+ # If returned token vectors for a single input, handle below
146
+ # If data is a nested list (token vectors), try averaging per item
147
+ # Try to coerce one vector per input
148
+ # Heuristic: if len(data) == expected_len and each entry is vector -> done
149
+ # If len(data) == 1 and expected_len >1, maybe API returned single vector for first input
150
+ # Fallback: if length mismatch but elements are lists of lists (token vectors), average them
151
+ out = []
152
+ for item in data:
153
+ if isinstance(item, list) and item and all(isinstance(x, (int, float)) for x in item):
154
+ out.append([float(x) for x in item])
155
+ elif isinstance(item, list) and item and isinstance(item[0], list):
156
+ arr = np.asarray(item, dtype=np.float32)
157
+ if arr.ndim == 2:
158
+ out.append(arr.mean(axis=0).tolist())
159
+ else:
160
+ out.append(arr.flatten().tolist())
161
+ else:
162
+ # unknown item shape
163
+ raise ValueError("Unexpected embedding item format")
164
+ if len(out) == expected_len:
165
+ return out
166
+ # If out length differs, but equals 1 and expected >1, maybe API returned pooled vector for all inputs -> broadcast
167
+ if len(out) == 1 and expected_len > 1:
168
+ return [out[0] for _ in range(expected_len)]
169
+ return out
170
+
171
+ raise ValueError("Unexpected embedding response format")
172
+
173
def _get_embeddings_api(self, texts: List[str]) -> List[List[float]]:
    """
    Robust embedding retrieval that attempts multiple request formats to
    handle the different hosted pipeline types.

    Tries, in order:
      1) batch inputs:     {"inputs": texts}
      2) per-text calls:   {"inputs": single_text} for each text
      3) similarity-style: {"inputs": {"sentences"/"sentence"/"texts": texts}}
    If every HF attempt fails and use_local_fallback is True, falls back to a
    local sentence-transformers model. HF response bodies are included in the
    raised errors for debugging.

    Args:
        texts: input strings to embed.

    Returns:
        One float vector per input text.

    Raises:
        RuntimeError: when all HF attempts (and the optional local fallback)
        fail; the message includes the last HF status and body.
    """
    model_path = self.embedding_model
    url = f"https://api-inference.huggingface.co/models/{model_path}"
    headers = {"Content-Type": "application/json"}
    if self.hf_token:
        headers["Authorization"] = f"Bearer {self.hf_token}"

    # Every (name, response) pair is kept so failures can be logged in full.
    attempts = []

    # Attempt A: batch inputs (most common)
    try:
        payload = {"inputs": texts}
        res = self._call_hf(url, headers, payload)
        attempts.append(("batch", res))
        if res["status"] < 400:
            try:
                return self._parse_embedding_response(res["body"], len(texts))
            except Exception as e:
                # parsing failed; proceed to next attempt
                logger.info(f"Batch parse failed: {e}")
    except Exception as e:
        logger.info(f"Batch request failed: {e}")

    # Attempt B: single-item calls (some models only accept single input)
    try:
        per_vecs = []
        ok = True
        for t in texts:
            payload = {"inputs": t}
            res = self._call_hf(url, headers, payload)
            attempts.append(("single", res))
            if res["status"] >= 400:
                ok = False
                break
            try:
                parsed = self._parse_embedding_response(res["body"], 1)
                per_vecs.extend(parsed)
            except Exception as e:
                logger.info(f"Single parse failed for input: {e}")
                ok = False
                break
        if ok and len(per_vecs) == len(texts):
            return per_vecs
    except Exception as e:
        logger.info(f"Single-item requests failed: {e}")

    # Attempt C: similarity-style payloads
    try:
        for key in ("sentences", "sentence", "texts"):
            payload = {"inputs": {key: texts}}
            res = self._call_hf(url, headers, payload)
            attempts.append((f"key:{key}", res))
            if res["status"] < 400:
                try:
                    return self._parse_embedding_response(res["body"], len(texts))
                except Exception as e:
                    logger.info(f"Parse after key {key} failed: {e}")
    except Exception as e:
        logger.info(f"Similarity-key attempts failed: {e}")

    # All HF attempts failed: build an informative error showing the
    # attempts and the last HF body if available.
    last_body = None
    last_status = None
    if attempts:
        last_status = attempts[-1][1]["status"]
        last_body = attempts[-1][1]["body"]
    logger.error("HF embedding attempts failed. Attempts summary:")
    for name, res in attempts:
        logger.error(f"Attempt '{name}': status={res['status']}, body={res['body']}")

    # Optional local fallback via sentence-transformers.
    if self.use_local_fallback:
        try:
            from sentence_transformers import SentenceTransformer
        except Exception as imp_err:
            raise RuntimeError(
                f"Embedding API failed (HF attempts). Last status={last_status}, body={last_body}. "
                f"Local fallback requested but sentence-transformers not installed: {imp_err}"
            )
        try:
            # SentenceTransformer resolves bare model names against the
            # sentence-transformers namespace, so strip the org prefix.
            local_model_name = model_path.split("sentence-transformers/")[-1]
            model = SentenceTransformer(local_model_name)
            emb = model.encode(texts, convert_to_numpy=True)
            return emb.tolist()
        except Exception as local_e:
            raise RuntimeError(
                f"Embedding API failed (HF attempts). Last status={last_status}, body={last_body}. "
                f"Local fallback also failed: {local_e}"
            )

    # No fallback: raise with HF details
    raise RuntimeError(
        f"Embedding API failed after multiple request formats. Last status={last_status}, body={last_body}. "
        "If you see 403, check HF_HUB_TOKEN and model access. Consider enabling local fallback with sentence-transformers."
    )
281
 
282
  # ---------------- index operations ---------------- #
283
  def add_file(self, file_path: str, source: str = "user-upload", metadata: dict = None) -> int:
 
313
  )
314
  self.embeddings = np.vstack([self.embeddings, vecs])
315
  for j, chunk in enumerate(batch):
316
+ self.doc_store.append({
317
+ "chunk_id": str(uuid.uuid4()),
318
+ "content": chunk,
319
+ "source": source,
320
+ "metadata": metadata or {},
321
+ "vector_idx": len(self.doc_store),
322
+ })
 
 
323
  added += len(batch)
324
  self._persist()
325
  logger.info(f"Added {added} chunks from {os.path.basename(file_path)}")
 
344
  results = []
345
  for idx in idxs:
346
  entry = self.doc_store[idx]
347
+ results.append({
348
+ "content": entry["content"],
349
+ "metadata": entry.get("metadata", {}),
350
+ "source": entry.get("source"),
351
+ "score": float(sims[idx]),
352
+ })
 
 
353
  return results
354
 
355
  def list_documents(self) -> List[Dict]: