samwaugh committed on
Commit
267162d
·
1 Parent(s): a02b702
backend/runner/config.py CHANGED
@@ -6,7 +6,7 @@ All runner modules should import from this module instead of defining their own
6
  import os
7
  import json
8
  from pathlib import Path
9
- from typing import Any, Dict, Optional
10
 
11
  # Try to import required libraries
12
  try:
@@ -155,47 +155,25 @@ def load_json_datasets() -> Optional[Dict[str, Any]]:
155
  return None
156
 
157
  def load_embeddings_datasets() -> Optional[Dict[str, Any]]:
158
- """Load embeddings datasets from Hugging Face"""
159
- if not HF_HUB_AVAILABLE:
160
- print("⚠️ huggingface_hub library not available - skipping HF embeddings loading")
161
  return None
162
 
163
  try:
164
- print(f" Loading embeddings files from {ARTEFACT_EMBEDDINGS_DATASET}...")
165
 
166
- # Download the files to local paths
167
- clip_embeddings_path = hf_hub_download(
168
- repo_id=ARTEFACT_EMBEDDINGS_DATASET,
169
- filename='clip_embeddings.safetensors',
170
- repo_type="dataset"
171
- )
172
- paintingclip_embeddings_path = hf_hub_download(
173
- repo_id=ARTEFACT_EMBEDDINGS_DATASET,
174
- filename='paintingclip_embeddings.safetensors',
175
- repo_type="dataset"
176
- )
177
- clip_sentence_ids_path = hf_hub_download(
178
- repo_id=ARTEFACT_EMBEDDINGS_DATASET,
179
- filename='clip_embeddings_sentence_ids.json',
180
- repo_type="dataset"
181
- )
182
- paintingclip_sentence_ids_path = hf_hub_download(
183
- repo_id=ARTEFACT_EMBEDDINGS_DATASET,
184
- filename='paintingclip_embeddings_sentence_ids.json',
185
- repo_type="dataset"
186
- )
187
 
188
- print(f"βœ… Successfully downloaded embeddings files:")
189
- print(f" CLIP embeddings: {clip_embeddings_path}")
190
- print(f" PaintingCLIP embeddings: {paintingclip_embeddings_path}")
191
- print(f" CLIP sentence IDs: {clip_sentence_ids_path}")
192
- print(f" PaintingCLIP sentence IDs: {paintingclip_sentence_ids_path}")
193
 
 
194
  return {
195
- 'clip_embeddings_path': clip_embeddings_path,
196
- 'paintingclip_embeddings_path': paintingclip_embeddings_path,
197
- 'clip_sentence_ids_path': clip_sentence_ids_path,
198
- 'paintingclip_sentence_ids_path': paintingclip_sentence_ids_path
199
  }
200
  except Exception as e:
201
  print(f"❌ Failed to load embeddings datasets from HF: {e}")
 
6
  import os
7
  import json
8
  from pathlib import Path
9
+ from typing import Any, Dict, Optional, List, Tuple
10
 
11
  # Try to import required libraries
12
  try:
 
155
  return None
156
 
157
  def load_embeddings_datasets() -> Optional[Dict[str, Any]]:
158
+ """Load embeddings datasets from Hugging Face using streaming"""
159
+ if not DATASETS_AVAILABLE:
160
+ print("⚠️ datasets library not available - skipping HF embeddings loading")
161
  return None
162
 
163
  try:
164
+ print(f" Loading embeddings using streaming from {ARTEFACT_EMBEDDINGS_DATASET}...")
165
 
166
+ # Use streaming to avoid downloading large files
167
+ dataset = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, split='train', streaming=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
+ print(f"βœ… Successfully loaded streaming dataset")
170
+ print(f" Dataset type: {type(dataset)}")
 
 
 
171
 
172
+ # Return the streaming dataset for on-demand processing
173
  return {
174
+ 'streaming_dataset': dataset,
175
+ 'use_streaming': True,
176
+ 'repo_id': ARTEFACT_EMBEDDINGS_DATASET
 
177
  }
178
  except Exception as e:
179
  print(f"❌ Failed to load embeddings datasets from HF: {e}")
backend/runner/inference.py CHANGED
@@ -68,34 +68,27 @@ TOP_K = 25 # Number of results to return
68
  # ─────────────────────────────────────────────────────────────────────────────
69
 
70
  def load_embeddings_from_hf():
71
- """Load embeddings from HF dataset"""
72
  try:
73
- print(f"πŸ” Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
74
 
75
- # Get the downloaded file paths from config
76
  if not EMBEDDINGS_DATASETS:
77
  print("❌ No embeddings datasets loaded")
78
  return None
79
 
80
- # Load sentence IDs
81
- with open(EMBEDDINGS_DATASETS['clip_sentence_ids_path'], 'r') as f:
82
- clip_sentence_ids = json.load(f)
83
- with open(EMBEDDINGS_DATASETS['paintingclip_sentence_ids_path'], 'r') as f:
84
- paintingclip_sentence_ids = json.load(f)
85
-
86
- # Load embeddings using safetensors
87
- import safetensors
88
- clip_embeddings = safetensors.safe_open(EMBEDDINGS_DATASETS['clip_embeddings_path'], framework="pt")
89
- paintingclip_embeddings = safetensors.safe_open(EMBEDDINGS_DATASETS['paintingclip_embeddings_path'], framework="pt")
90
-
91
- print(f"βœ… Successfully loaded embeddings from HF:")
92
- print(f" CLIP: {len(clip_sentence_ids)} embeddings")
93
- print(f" PaintingCLIP: {len(paintingclip_sentence_ids)} embeddings")
94
-
95
- return {
96
- "clip": (clip_embeddings, clip_sentence_ids),
97
- "paintingclip": (paintingclip_embeddings, paintingclip_sentence_ids)
98
- }
99
  except Exception as e:
100
  print(f"❌ Failed to load embeddings from HF: {e}")
101
  return None
@@ -173,15 +166,26 @@ def _initialize_pipeline():
173
  if embeddings_data is None:
174
  raise ValueError(f"Failed to load embeddings from HF dataset: {ARTEFACT_EMBEDDINGS_DATASET}")
175
 
176
- if MODEL_TYPE == "clip":
177
- embeddings, sentence_ids = embeddings_data["clip"]
 
 
 
 
 
 
 
178
  else:
179
- embeddings, sentence_ids = embeddings_data["paintingclip"]
180
-
181
- if embeddings is None or sentence_ids is None:
182
- raise ValueError(f"Failed to load embeddings for model type: {MODEL_TYPE}")
183
-
184
- print(f"πŸ” Loaded {len(sentence_ids)} embeddings with shape {embeddings.shape}")
 
 
 
 
185
  except Exception as e:
186
  print(f"❌ Error loading embeddings: {e}")
187
  raise
@@ -521,3 +525,97 @@ def st_load_file(file_path: Path) -> Any:
521
  except Exception as e:
522
  print(f"❌ Error loading {file_path}: {e}")
523
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  # ─────────────────────────────────────────────────────────────────────────────
69
 
70
  def load_embeddings_from_hf():
71
+ """Load embeddings from HF dataset using streaming"""
72
  try:
73
+ print(f" Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
74
 
 
75
  if not EMBEDDINGS_DATASETS:
76
  print("❌ No embeddings datasets loaded")
77
  return None
78
 
79
+ # Check if we're using streaming
80
+ if EMBEDDINGS_DATASETS.get('use_streaming', False):
81
+ print("βœ… Using streaming embeddings dataset")
82
+ return {
83
+ "streaming": True,
84
+ "dataset": EMBEDDINGS_DATASETS['streaming_dataset'],
85
+ "repo_id": EMBEDDINGS_DATASETS['repo_id']
86
+ }
87
+ else:
88
+ # Fallback to old method if not streaming
89
+ print("⚠️ Using fallback embedding loading method")
90
+ return None
91
+
 
 
 
 
 
 
92
  except Exception as e:
93
  print(f"❌ Failed to load embeddings from HF: {e}")
94
  return None
 
166
  if embeddings_data is None:
167
  raise ValueError(f"Failed to load embeddings from HF dataset: {ARTEFACT_EMBEDDINGS_DATASET}")
168
 
169
+ # Check if we're using streaming
170
+ if embeddings_data.get("streaming", False):
171
+ print("βœ… Using streaming embeddings - will load on-demand")
172
+ # For streaming, we'll load embeddings as needed during inference
173
+ return {
174
+ "streaming": True,
175
+ "dataset": embeddings_data["dataset"],
176
+ "repo_id": embeddings_data["repo_id"]
177
+ }
178
  else:
179
+ # Old code path for non-streaming
180
+ if MODEL_TYPE == "clip":
181
+ embeddings, sentence_ids = embeddings_data["clip"]
182
+ else:
183
+ embeddings, sentence_ids = embeddings_data["paintingclip"]
184
+
185
+ if embeddings is None or sentence_ids is None:
186
+ raise ValueError(f"Failed to load embeddings for model type: {MODEL_TYPE}")
187
+
188
+ print(f"πŸ” Loaded {len(sentence_ids)} embeddings with shape {embeddings.shape}")
189
  except Exception as e:
190
  print(f"❌ Error loading embeddings: {e}")
191
  raise
 
525
  except Exception as e:
526
  print(f"❌ Error loading {file_path}: {e}")
527
  return None
528
+
529
+ def load_embedding_for_sentence(sentence_id: str, model_type: str = "clip") -> Optional[torch.Tensor]:
530
+ """Load a single embedding for a specific sentence using streaming"""
531
+ try:
532
+ if not EMBEDDINGS_DATASETS or not EMBEDDINGS_DATASETS.get('use_streaming', False):
533
+ print("❌ Streaming embeddings not available")
534
+ return None
535
+
536
+ dataset = EMBEDDINGS_DATASETS['streaming_dataset']
537
+
538
+ # Search for the sentence in the streaming dataset
539
+ for item in dataset:
540
+ if item.get('sentence_id') == sentence_id:
541
+ # Extract the appropriate embedding based on model type
542
+ if model_type == "clip" and 'clip_embedding' in item:
543
+ return torch.tensor(item['clip_embedding'])
544
+ elif model_type == "paintingclip" and 'paintingclip_embedding' in item:
545
+ return torch.tensor(item['paintingclip_embedding'])
546
+ else:
547
+ print(f"⚠️ Embedding not found for {model_type} in sentence {sentence_id}")
548
+ return None
549
+
550
+ print(f"⚠️ Sentence {sentence_id} not found in streaming dataset")
551
+ return None
552
+
553
+ except Exception as e:
554
+ print(f"❌ Error loading streaming embedding for {sentence_id}: {e}")
555
+ return None
556
+
557
+ def get_top_k_embeddings(query_embedding: torch.Tensor, k: int = 10, model_type: str = "clip") -> List[Tuple[str, float]]:
558
+ """Get top-k most similar embeddings using streaming"""
559
+ try:
560
+ if not EMBEDDINGS_DATASETS or not EMBEDDINGS_DATASETS.get('use_streaming', False):
561
+ print("❌ Streaming embeddings not available")
562
+ return []
563
+
564
+ dataset = EMBEDDINGS_DATASETS['streaming_dataset']
565
+ similarities = []
566
+
567
+ # Process embeddings in batches to avoid memory issues
568
+ batch_size = 1000
569
+ batch = []
570
+
571
+ for item in dataset:
572
+ batch.append(item)
573
+
574
+ if len(batch) >= batch_size:
575
+ # Process batch
576
+ batch_similarities = process_embedding_batch(batch, query_embedding, model_type)
577
+ similarities.extend(batch_similarities)
578
+ batch = []
579
+
580
+ # Keep only top-k so far
581
+ similarities.sort(key=lambda x: x[1], reverse=True)
582
+ similarities = similarities[:k]
583
+
584
+ # Process remaining items
585
+ if batch:
586
+ batch_similarities = process_embedding_batch(batch, query_embedding, model_type)
587
+ similarities.extend(batch_similarities)
588
+ similarities.sort(key=lambda x: x[1], reverse=True)
589
+ similarities = similarities[:k]
590
+
591
+ return similarities
592
+
593
+ except Exception as e:
594
+ print(f"❌ Error getting top-k embeddings: {e}")
595
+ return []
596
+
597
+ def process_embedding_batch(batch: List[Dict], query_embedding: torch.Tensor, model_type: str) -> List[Tuple[str, float]]:
598
+ """Process a batch of embeddings to find similarities"""
599
+ similarities = []
600
+
601
+ for item in batch:
602
+ try:
603
+ sentence_id = item.get('sentence_id', '')
604
+
605
+ # Get the appropriate embedding
606
+ if model_type == "clip" and 'clip_embedding' in item:
607
+ embedding = torch.tensor(item['clip_embedding'])
608
+ elif model_type == "paintingclip" and 'paintingclip_embedding' in item:
609
+ embedding = torch.tensor(item['paintingclip_embedding'])
610
+ else:
611
+ continue
612
+
613
+ # Calculate similarity
614
+ similarity = F.cosine_similarity(query_embedding.unsqueeze(0), embedding.unsqueeze(0), dim=1)
615
+ similarities.append((sentence_id, similarity.item()))
616
+
617
+ except Exception as e:
618
+ print(f"⚠️ Error processing item in batch: {e}")
619
+ continue
620
+
621
+ return similarities
requirements.txt CHANGED
@@ -21,4 +21,4 @@ numpy>=1.24.0
21
  # Optional: GPU acceleration (if available)
22
  # torchvision>=0.15.0 # Uncomment if you need additional vision models
23
 
24
- safetensors>=0.4.0
 
21
  # Optional: GPU acceleration (if available)
22
  # torchvision>=0.15.0 # Uncomment if you need additional vision models
23
 
24
+ safetensors>=0.4.0