Spaces:

samwaugh
/

ArteFact

Paused

App Files Community

samwaugh committed on Sep 3, 2025

Commit

5b7bc70

1 Parent(s): 8e05ec6

New approach

Browse files

Files changed (1) hide show

backend/runner/inference.py +56 -24

backend/runner/inference.py CHANGED Viewed

@@ -68,27 +68,64 @@ TOP_K = 25  # Number of results to return
 # ─────────────────────────────────────────────────────────────────────────────
 def load_embeddings_from_hf():
-    """Load embeddings from HF dataset using streaming"""
     try:
         print(f" Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
-        if not EMBEDDINGS_DATASETS:
-            print("❌ No embeddings datasets loaded")
-            return None
-        # Check if we're using streaming
-        if EMBEDDINGS_DATASETS.get('use_streaming', False):
-            print("✅ Using streaming embeddings dataset")
-            return {
-                "streaming": True,
-                "dataset": EMBEDDINGS_DATASETS['streaming_dataset'],
-                "repo_id": EMBEDDINGS_DATASETS['repo_id']
-            }
-        else:
-            # Fallback to old method if not streaming
-            print("⚠️  Using fallback embedding loading method")
-            return None
     except Exception as e:
         print(f"❌ Failed to load embeddings from HF: {e}")
         return None
@@ -174,7 +211,7 @@ def _initialize_pipeline():
             # The calling code will need to handle this case
             return processor, model, "STREAMING", "STREAMING", "STREAMING", device
         else:
-            # Old code path for non-streaming
             if MODEL_TYPE == "clip":
                 embeddings, sentence_ids = embeddings_data["clip"]
             else:
@@ -773,12 +810,7 @@ def process_embedding_batch_streaming(
     # Debug: show first few items to understand the data structure
     for i, item in enumerate(batch[:3]):
         print(f" Item {i}: keys = {list(item.keys())}")
-        if 'clip_embedding' in item:
-            print(f"🔍 Item {i}: clip_embedding shape = {len(item['clip_embedding'])}")
-        if 'paintingclip_embedding' in item:
-            print(f" Item {i}: paintingclip_embedding shape = {len(item['paintingclip_embedding'])}")
-        if 'sentence_id' in item:
-            print(f" Item {i}: sentence_id = {item['sentence_id']}")
     for item in batch:
         try:

 # ─────────────────────────────────────────────────────────────────────────────
 def load_embeddings_from_hf():
     """Load the CLIP and PaintingCLIP embeddings from the HF dataset repo.
     Four files are fetched from the dataset named by
     ``ARTEFACT_EMBEDDINGS_DATASET``: one ``.safetensors`` file and one
     sentence-ID JSON file per model. The tensor stored under the
     ``"embeddings"`` key of each safetensors file is used.
     Returns:
         A dict with the keys ``"clip"`` and ``"paintingclip"``, each mapping
         to an ``(embeddings, sentence_ids)`` tuple, or ``None`` if any step
         (import, download, parsing) fails. The failure is printed, not raised.
     """
     try:
         print(f" Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
         # Imports stay inside the try block so that a missing package is
         # reported through the same "return None" path as any other failure.
         from huggingface_hub import hf_hub_download
         # Import the submodule explicitly: a bare `import safetensors` does
         # not guarantee that `safetensors.torch` is available as an attribute.
         from safetensors.torch import load_file
         def _fetch(filename):
             """Download one file from the embeddings dataset; return its local path."""
             return hf_hub_download(
                 repo_id=ARTEFACT_EMBEDDINGS_DATASET,
                 filename=filename,
                 repo_type="dataset",
             )
         def _read_ids(path):
             """Parse a sentence-ID JSON file."""
             # JSON is UTF-8 by specification; do not depend on the locale default.
             with open(path, "r", encoding="utf-8") as f:
                 return json.load(f)
         # Download the safetensors files and their sentence-ID companions
         print("🔍 Downloading CLIP embeddings...")
         clip_embeddings_path = _fetch("clip_embeddings.safetensors")
         clip_ids_path = _fetch("clip_embeddings_sentence_ids.json")
         print("🔍 Downloading PaintingCLIP embeddings...")
         paintingclip_embeddings_path = _fetch("paintingclip_embeddings.safetensors")
         paintingclip_ids_path = _fetch("paintingclip_embeddings_sentence_ids.json")
         # Load the embeddings
         print("🔍 Loading CLIP embeddings...")
         clip_embeddings = load_file(clip_embeddings_path)["embeddings"]
         print("🔍 Loading PaintingCLIP embeddings...")
         paintingclip_embeddings = load_file(paintingclip_embeddings_path)["embeddings"]
         # Load the sentence IDs
         clip_sentence_ids = _read_ids(clip_ids_path)
         paintingclip_sentence_ids = _read_ids(paintingclip_ids_path)
         print(f"✅ Loaded CLIP embeddings: {clip_embeddings.shape}")
         print(f"✅ Loaded PaintingCLIP embeddings: {paintingclip_embeddings.shape}")
         return {
             "clip": (clip_embeddings, clip_sentence_ids),
             "paintingclip": (paintingclip_embeddings, paintingclip_sentence_ids),
         }
     except Exception as e:
         # Deliberate catch-all: callers receive None on failure.
         print(f"❌ Failed to load embeddings from HF: {e}")
         return None
             # The calling code will need to handle this case
             return processor, model, "STREAMING", "STREAMING", "STREAMING", device
         else:
+            # New code path for safetensors files
             if MODEL_TYPE == "clip":
                 embeddings, sentence_ids = embeddings_data["clip"]
             else:
     # Debug: show first few items to understand the data structure
     for i, item in enumerate(batch[:3]):
         print(f" Item {i}: keys = {list(item.keys())}")
+        print(f" Item {i}: full item = {item}")
     for item in batch:
         try: