Qar-Raz commited on
Commit
f5ff6c4
·
1 Parent(s): c7256ee

hf-space: fix docker checks and include data module for API runtime

Browse files
Files changed (3) hide show
  1. Dockerfile +2 -2
  2. data/__init__.py +0 -0
  3. data/vector_db.py +245 -0
Dockerfile CHANGED
@@ -17,8 +17,8 @@ RUN pip install --upgrade pip && pip install -r requirements.txt
17
 
18
  COPY . .
19
 
20
- # Fail fast during build if critical runtime folders are missing from context.
21
- RUN test -d /app/backend && test -d /app/data && test -d /app/results
22
 
23
  # Hugging Face Spaces exposes apps on port 7860 by default.
24
  EXPOSE 7860
 
17
 
18
  COPY . .
19
 
20
+ # Fail fast during build if critical runtime modules are missing from context.
21
+ RUN test -d /app/backend && test -d /app/retriever && test -d /app/models && test -f /app/config.yaml
22
 
23
  # Hugging Face Spaces exposes apps on port 7860 by default.
24
  EXPOSE 7860
data/__init__.py ADDED
File without changes
data/vector_db.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import re
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List
6
+ from pinecone import Pinecone, ServerlessSpec
7
+
8
+
9
+ # Added caching to reduce startup time on subsequent runs
10
+ # --@Qamar
11
+
12
+ def slugify_technique(name):
13
+ """Converts 'Sentence Splitter' to 'sentence-splitter' for Pinecone naming."""
14
+ return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
15
+
16
+ def get_index_by_name(api_key: str, index_name: str):
17
+ """
18
+ Directly connects to a Pinecone index by its full string name.
19
+ Useful for the API/Production side where the name is already known.
20
+ """
21
+ pc = Pinecone(api_key=api_key)
22
+
23
+ # Check if it exists first to avoid a 404 crash
24
+ existing_indexes = [idx.name for idx in pc.list_indexes()]
25
+ if index_name not in existing_indexes:
26
+ raise ValueError(f"Index '{index_name}' does not exist in your Pinecone project.")
27
+
28
+ print(f" Connecting to Index: {index_name}")
29
+ return pc.Index(index_name)
30
+
31
+ def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cosine"):
32
+ """
33
+ Creates/Returns an index specifically for a technique.
34
+ Example: 'arxiv-index-token'
35
+ """
36
+ pc = Pinecone(api_key=api_key)
37
+ tech_slug = slugify_technique(technique)
38
+ full_index_name = f"{base_name}-{tech_slug}"
39
+
40
+ existing_indexes = [idx.name for idx in pc.list_indexes()]
41
+
42
+ if full_index_name not in existing_indexes:
43
+ print(f" Creating specialized index: {full_index_name}...")
44
+ pc.create_index(
45
+ name=full_index_name,
46
+ dimension=dimension,
47
+ metric=metric,
48
+ spec=ServerlessSpec(cloud="aws", region="us-east-1")
49
+ )
50
+ # Wait for index to spin up
51
+ while not pc.describe_index(full_index_name).status['ready']:
52
+ time.sleep(1)
53
+
54
+ # Use our new helper to return the index object
55
+ return get_index_by_name(api_key, full_index_name)
56
+
57
+ def refresh_pinecone_index(index, final_chunks, batch_size=100):
58
+ """
59
+ Refreshes the specific index. Since index is now technique-specific,
60
+ we just check if it's already populated.
61
+ """
62
+ if not final_chunks:
63
+ print("No chunks provided to refresh.")
64
+ return False
65
+
66
+ try:
67
+ # Check current stats for this specific index
68
+ stats = index.describe_index_stats()
69
+ current_count = stats.get('total_vector_count', 0)
70
+ expected_count = len(final_chunks)
71
+
72
+ print(f" Index Stats -> Existing: {current_count} | New Chunks: {expected_count}")
73
+
74
+ if current_count == 0:
75
+ print(f"➕ Index is empty. Upserting {expected_count} vectors...")
76
+ vectors = prepare_vectors_for_upsert(final_chunks)
77
+ upsert_to_pinecone(index, vectors, batch_size)
78
+ return True
79
+
80
+ elif current_count < expected_count:
81
+ # Simple check to see if we need to top up or refresh
82
+ print(f" Vector count mismatch ({current_count} < {expected_count}). Updating index...")
83
+ vectors = prepare_vectors_for_upsert(final_chunks)
84
+ upsert_to_pinecone(index, vectors, batch_size)
85
+ return True
86
+
87
+ else:
88
+ print(f" Index is already populated with {current_count} vectors. Ready for search.")
89
+ return False
90
+
91
+ except Exception as e:
92
+ print(f" Error refreshing index: {e}")
93
+ return False
94
+
95
+ # Utility functions remain the same as previous version
96
+ def prepare_vectors_for_upsert(final_chunks):
97
+ vectors = []
98
+ for chunk in final_chunks:
99
+ meta = chunk.get('metadata', {})
100
+ metadata_payload = dict(meta) if isinstance(meta, dict) else {}
101
+ metadata_payload.setdefault('text', meta.get('text', "") if isinstance(meta, dict) else "")
102
+ metadata_payload.setdefault('title', meta.get('title', "") if isinstance(meta, dict) else "")
103
+ metadata_payload.setdefault('url', meta.get('url', "") if isinstance(meta, dict) else "")
104
+ metadata_payload.setdefault('chunk_index', meta.get('chunk_index', 0) if isinstance(meta, dict) else 0)
105
+ metadata_payload.setdefault('technique', meta.get('technique', "unknown") if isinstance(meta, dict) else "unknown")
106
+ metadata_payload.setdefault('chunking_technique', meta.get('chunking_technique', "unknown") if isinstance(meta, dict) else "unknown")
107
+
108
+ vectors.append({
109
+ 'id': chunk['id'],
110
+ 'values': chunk['values'],
111
+ 'metadata': metadata_payload
112
+ })
113
+ return vectors
114
+
115
+ def upsert_to_pinecone(index, chunks, batch_size=100):
116
+ for i in range(0, len(chunks), batch_size):
117
+ batch = chunks[i : i + batch_size]
118
+ index.upsert(vectors=batch)
119
+
120
+ # Some methods for loading chunks back from Pinecone with local caching to speed up BM25 initialization
121
+
122
+ def _sanitize_index_name(index_name: str) -> str:
123
+ return re.sub(r'[^a-zA-Z0-9._-]+', '-', index_name).strip('-') or 'default-index'
124
+
125
+
126
+ def _chunk_cache_path(cache_dir: str, index_name: str) -> Path:
127
+ cache_root = Path(cache_dir)
128
+ cache_root.mkdir(parents=True, exist_ok=True)
129
+ safe_name = _sanitize_index_name(index_name)
130
+ return cache_root / f"bm25_chunks_{safe_name}.json"
131
+
132
+
133
+ def _read_chunk_cache(path: Path) -> Dict[str, Any]:
134
+ with path.open("r", encoding="utf-8") as f:
135
+ return json.load(f)
136
+
137
+
138
+ def _write_chunk_cache(path: Path, payload: Dict[str, Any]) -> None:
139
+ with path.open("w", encoding="utf-8") as f:
140
+ json.dump(payload, f)
141
+
142
+
143
+ def load_chunks_with_local_cache(
144
+ index,
145
+ index_name: str,
146
+ cache_dir: str = ".cache",
147
+ batch_size: int = 100,
148
+ force_refresh: bool = False,
149
+ ) -> tuple[List[Dict[str, Any]], str]:
150
+
151
+ cache_file = _chunk_cache_path(cache_dir=cache_dir, index_name=index_name)
152
+ stats = index.describe_index_stats()
153
+ current_count = stats.get("total_vector_count", 0)
154
+
155
+ if not force_refresh and cache_file.exists():
156
+ try:
157
+ cached_payload = _read_chunk_cache(cache_file)
158
+ cached_meta = cached_payload.get("meta", {})
159
+ cached_count = cached_meta.get("vector_count", -1)
160
+ cached_chunks = cached_payload.get("chunks", [])
161
+
162
+ if cached_count == current_count and cached_chunks:
163
+ print(
164
+ f" Loaded BM25 chunk cache: {cache_file} "
165
+ f"(chunks={len(cached_chunks)}, vectors={cached_count})"
166
+ )
167
+ return cached_chunks, "cache"
168
+
169
+ print(
170
+ " BM25 cache stale or empty. "
171
+ f"cache_vectors={cached_count}, pinecone_vectors={current_count}. Refreshing..."
172
+ )
173
+ except Exception as e:
174
+ print(f" Failed to read BM25 cache ({cache_file}): {e}. Refreshing from Pinecone...")
175
+
176
+ chunks = load_chunks_from_pinecone(index=index, batch_size=batch_size)
177
+ payload = {
178
+ "meta": {
179
+ "index_name": index_name,
180
+ "vector_count": current_count,
181
+ "updated_at_epoch_s": int(time.time()),
182
+ },
183
+ "chunks": chunks,
184
+ }
185
+
186
+ try:
187
+ _write_chunk_cache(cache_file, payload)
188
+ print(f" Saved BM25 chunk cache: {cache_file} (chunks={len(chunks)})")
189
+ except Exception as e:
190
+ print(f" Failed to write BM25 cache ({cache_file}): {e}")
191
+
192
+ return chunks, "pinecone"
193
+
194
+
195
+ def load_chunks_from_pinecone(index, batch_size: int = 100) -> list[dict[str, any]]:
196
+ """
197
+ Scans the Pinecone index to retrieve all text metadata for the BM25 corpus.
198
+ """
199
+ stats = index.describe_index_stats()
200
+ namespaces = list(stats.get('namespaces', {}).keys())
201
+ # If no namespaces are explicitly named, Pinecone uses an empty string for the default
202
+ if not namespaces:
203
+ namespaces = [""]
204
+
205
+ all_chunks: List[Dict[str, Any]] = []
206
+ seen_ids = set()
207
+
208
+ print(f"Loading vectors for BM25 from namespaces: {namespaces}")
209
+
210
+ for ns in namespaces:
211
+ # Pinecone's list() generator returns batches of IDs
212
+ for id_batch in index.list(namespace=ns, limit=batch_size):
213
+ if not id_batch:
214
+ continue
215
+
216
+ # Fetch the actual content (metadata) for this batch of IDs
217
+ fetched = index.fetch(ids=id_batch, namespace=ns)
218
+ vectors = getattr(fetched, "vectors", {})
219
+
220
+ for vector_id, vector_data in vectors.items():
221
+ if vector_id in seen_ids:
222
+ continue
223
+ seen_ids.add(vector_id)
224
+
225
+ # Safely extract metadata
226
+ metadata = getattr(vector_data, "metadata", {})
227
+ if metadata is None:
228
+ metadata = {}
229
+ if not isinstance(metadata, dict):
230
+ metadata = dict(metadata)
231
+
232
+ text = metadata.get("text")
233
+
234
+ if not text:
235
+ continue
236
+
237
+ all_chunks.append({
238
+ "id": vector_id,
239
+ "metadata": metadata
240
+ })
241
+
242
+ print(f" Finished namespace: '{ns if ns else 'default'}'")
243
+
244
+ print(f"Total chunks loaded into memory: {len(all_chunks)}")
245
+ return all_chunks