Spaces:

Qar-Raz
/

NLP-RAG

Sleeping

App Files Files Community

Qar-Raz commited on Apr 3

Commit

f5ff6c4

1 Parent(s): c7256ee

hf-space: fix docker checks and include data module for API runtime

Browse files

Files changed (3) hide show

Dockerfile +2 -2
data/__init__.py +0 -0
data/vector_db.py +245 -0

Dockerfile CHANGED Viewed

@@ -17,8 +17,8 @@ RUN pip install --upgrade pip && pip install -r requirements.txt
 COPY . .
-# Fail fast during build if critical runtime folders are missing from context.
-RUN test -d /app/backend && test -d /app/data && test -d /app/results
 # Hugging Face Spaces exposes apps on port 7860 by default.
 EXPOSE 7860

 COPY . .
+# Fail fast during build if critical runtime modules are missing from context.
+RUN test -d /app/backend && test -d /app/retriever && test -d /app/models && test -f /app/config.yaml
 # Hugging Face Spaces exposes apps on port 7860 by default.
 EXPOSE 7860

data/__init__.py ADDED Viewed

File without changes

data/vector_db.py ADDED Viewed

	@@ -0,0 +1,245 @@

+import time
+import re
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+from pinecone import Pinecone, ServerlessSpec
+# Added cacheing to reduce consecutive startup time
+# --@Qamar
+def slugify_technique(name):
+    """Converts 'Sentence Splitter' to 'sentence-splitter' for Pinecone naming."""
+    return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
+def get_index_by_name(api_key: str, index_name: str):
+    """
+    Directly connects to a Pinecone index by its full string name.
+    Useful for the API/Production side where the name is already known.
+    """
+    pc = Pinecone(api_key=api_key)
+    # Check if it exists first to avoid a 404 crash
+    existing_indexes = [idx.name for idx in pc.list_indexes()]
+    if index_name not in existing_indexes:
+        raise ValueError(f"Index '{index_name}' does not exist in your Pinecone project.")
+    print(f" Connecting to Index: {index_name}")
+    return pc.Index(index_name)
+def get_pinecone_index(api_key, base_name, technique, dimension=384, metric="cosine"):
+    """
+    Creates/Returns an index specifically for a technique.
+    Example: 'arxiv-index-token'
+    """
+    pc = Pinecone(api_key=api_key)
+    tech_slug = slugify_technique(technique)
+    full_index_name = f"{base_name}-{tech_slug}"
+    existing_indexes = [idx.name for idx in pc.list_indexes()]
+    if full_index_name not in existing_indexes:
+        print(f" Creating specialized index: {full_index_name}...")
+        pc.create_index(
+            name=full_index_name,
+            dimension=dimension,
+            metric=metric,
+            spec=ServerlessSpec(cloud="aws", region="us-east-1")
+        )
+        # Wait for index to spin up
+        while not pc.describe_index(full_index_name).status['ready']:
+            time.sleep(1)
+    # Use our new helper to return the index object
+    return get_index_by_name(api_key, full_index_name)
+def refresh_pinecone_index(index, final_chunks, batch_size=100):
+    """
+    Refreshes the specific index. Since index is now technique-specific,
+    we just check if it's already populated.
+    """
+    if not final_chunks:
+        print("No chunks provided to refresh.")
+        return False
+    try:
+        # Check current stats for this specific index
+        stats = index.describe_index_stats()
+        current_count = stats.get('total_vector_count', 0)
+        expected_count = len(final_chunks)
+        print(f" Index Stats -> Existing: {current_count} | New Chunks: {expected_count}")
+        if current_count == 0:
+            print(f"➕ Index is empty. Upserting {expected_count} vectors...")
+            vectors = prepare_vectors_for_upsert(final_chunks)
+            upsert_to_pinecone(index, vectors, batch_size)
+            return True
+        elif current_count < expected_count:
+            # Simple check to see if we need to top up or refresh
+            print(f" Vector count mismatch ({current_count} < {expected_count}). Updating index...")
+            vectors = prepare_vectors_for_upsert(final_chunks)
+            upsert_to_pinecone(index, vectors, batch_size)
+            return True
+        else:
+            print(f" Index is already populated with {current_count} vectors. Ready for search.")
+            return False
+    except Exception as e:
+        print(f" Error refreshing index: {e}")
+        return False
+# Utility functions remain the same as previous version
+def prepare_vectors_for_upsert(final_chunks):
+    vectors = []
+    for chunk in final_chunks:
+        meta = chunk.get('metadata', {})
+        metadata_payload = dict(meta) if isinstance(meta, dict) else {}
+        metadata_payload.setdefault('text', meta.get('text', "") if isinstance(meta, dict) else "")
+        metadata_payload.setdefault('title', meta.get('title', "") if isinstance(meta, dict) else "")
+        metadata_payload.setdefault('url', meta.get('url', "") if isinstance(meta, dict) else "")
+        metadata_payload.setdefault('chunk_index', meta.get('chunk_index', 0) if isinstance(meta, dict) else 0)
+        metadata_payload.setdefault('technique', meta.get('technique', "unknown") if isinstance(meta, dict) else "unknown")
+        metadata_payload.setdefault('chunking_technique', meta.get('chunking_technique', "unknown") if isinstance(meta, dict) else "unknown")
+        vectors.append({
+            'id': chunk['id'],
+            'values': chunk['values'],
+            'metadata': metadata_payload
+        })
+    return vectors
+def upsert_to_pinecone(index, chunks, batch_size=100):
+    for i in range(0, len(chunks), batch_size):
+        batch = chunks[i : i + batch_size]
+        index.upsert(vectors=batch)
+# Some methods for loading chunks back from Pinecone with local caching to speed up BM25 initialization
+def _sanitize_index_name(index_name: str) -> str:
+    return re.sub(r'[^a-zA-Z0-9._-]+', '-', index_name).strip('-') or 'default-index'
+def _chunk_cache_path(cache_dir: str, index_name: str) -> Path:
+    cache_root = Path(cache_dir)
+    cache_root.mkdir(parents=True, exist_ok=True)
+    safe_name = _sanitize_index_name(index_name)
+    return cache_root / f"bm25_chunks_{safe_name}.json"
+def _read_chunk_cache(path: Path) -> Dict[str, Any]:
+    with path.open("r", encoding="utf-8") as f:
+        return json.load(f)
+def _write_chunk_cache(path: Path, payload: Dict[str, Any]) -> None:
+    with path.open("w", encoding="utf-8") as f:
+        json.dump(payload, f)
+def load_chunks_with_local_cache(
+    index,
+    index_name: str,
+    cache_dir: str = ".cache",
+    batch_size: int = 100,
+    force_refresh: bool = False,
+) -> tuple[List[Dict[str, Any]], str]:
+    cache_file = _chunk_cache_path(cache_dir=cache_dir, index_name=index_name)
+    stats = index.describe_index_stats()
+    current_count = stats.get("total_vector_count", 0)
+    if not force_refresh and cache_file.exists():
+        try:
+            cached_payload = _read_chunk_cache(cache_file)
+            cached_meta = cached_payload.get("meta", {})
+            cached_count = cached_meta.get("vector_count", -1)
+            cached_chunks = cached_payload.get("chunks", [])
+            if cached_count == current_count and cached_chunks:
+                print(
+                    f" Loaded BM25 chunk cache: {cache_file} "
+                    f"(chunks={len(cached_chunks)}, vectors={cached_count})"
+                )
+                return cached_chunks, "cache"
+            print(
+                " BM25 cache stale or empty. "
+                f"cache_vectors={cached_count}, pinecone_vectors={current_count}. Refreshing..."
+            )
+        except Exception as e:
+            print(f" Failed to read BM25 cache ({cache_file}): {e}. Refreshing from Pinecone...")
+    chunks = load_chunks_from_pinecone(index=index, batch_size=batch_size)
+    payload = {
+        "meta": {
+            "index_name": index_name,
+            "vector_count": current_count,
+            "updated_at_epoch_s": int(time.time()),
+        },
+        "chunks": chunks,
+    }
+    try:
+        _write_chunk_cache(cache_file, payload)
+        print(f" Saved BM25 chunk cache: {cache_file} (chunks={len(chunks)})")
+    except Exception as e:
+        print(f" Failed to write BM25 cache ({cache_file}): {e}")
+    return chunks, "pinecone"
+def load_chunks_from_pinecone(index, batch_size: int = 100) -> list[dict[str, any]]:
+    """
+    Scans the Pinecone index to retrieve all text metadata for the BM25 corpus.
+    """
+    stats = index.describe_index_stats()
+    namespaces = list(stats.get('namespaces', {}).keys())
+    # If no namespaces are explicitly named, Pinecone uses an empty string for the default
+    if not namespaces:
+        namespaces = [""]
+    all_chunks: List[Dict[str, Any]] = []
+    seen_ids = set()
+    print(f"Loading vectors for BM25 from namespaces: {namespaces}")
+    for ns in namespaces:
+        # Pinecone's list() generator returns batches of IDs
+        for id_batch in index.list(namespace=ns, limit=batch_size):
+            if not id_batch:
+                continue
+            # Fetch the actual content (metadata) for this batch of IDs
+            fetched = index.fetch(ids=id_batch, namespace=ns)
+            vectors = getattr(fetched, "vectors", {})
+            for vector_id, vector_data in vectors.items():
+                if vector_id in seen_ids:
+                    continue
+                seen_ids.add(vector_id)
+                # Safely extract metadata
+                metadata = getattr(vector_data, "metadata", {})
+                if metadata is None:
+                    metadata = {}
+                if not isinstance(metadata, dict):
+                    metadata = dict(metadata)
+                text = metadata.get("text")
+                if not text:
+                    continue
+                all_chunks.append({
+                    "id": vector_id,
+                    "metadata": metadata
+                })
+        print(f" Finished namespace: '{ns if ns else 'default'}'")
+    print(f"Total chunks loaded into memory: {len(all_chunks)}")
+    return all_chunks