Spaces:

samwaugh
/

ArteFact

Paused

App Files Files Community

samwaugh committed on Sep 3, 2025

Commit

ae940aa

1 Parent(s): 5b7bc70

Download instead of stream

Browse files

Files changed (2) hide show

backend/runner/config.py +7 -13
backend/runner/inference.py +67 -58

backend/runner/config.py CHANGED Viewed

@@ -155,24 +155,18 @@ def load_json_datasets() -> Optional[Dict[str, Any]]:
         return None
 def load_embeddings_datasets() -> Optional[Dict[str, Any]]:
-    """Load embeddings datasets from Hugging Face using streaming"""
-    if not DATASETS_AVAILABLE:
-        print("⚠️  datasets library not available - skipping HF embeddings loading")
         return None
     try:
-        print(f" Loading embeddings using streaming from {ARTEFACT_EMBEDDINGS_DATASET}...")
-        # Use streaming to avoid downloading large files
-        dataset = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, split='train', streaming=True)
-        print(f"✅ Successfully loaded streaming dataset")
-        print(f" Dataset type: {type(dataset)}")
-        # Return the streaming dataset for on-demand processing
         return {
-            'streaming_dataset': dataset,
-            'use_streaming': True,
             'repo_id': ARTEFACT_EMBEDDINGS_DATASET
         }
     except Exception as e:

         return None
 def load_embeddings_datasets() -> Optional[Dict[str, Any]]:
+    """Load embeddings datasets from Hugging Face using direct file download"""
+    if not HF_HUB_AVAILABLE:
+        print("⚠️  huggingface_hub library not available - skipping HF embeddings loading")
         return None
     try:
+        print(f" Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
+        # Return a flag indicating we should use direct file download
+        # The actual loading will be done in inference.py
         return {
+            'use_direct_download': True,
             'repo_id': ARTEFACT_EMBEDDINGS_DATASET
         }
     except Exception as e:

backend/runner/inference.py CHANGED Viewed

@@ -72,60 +72,72 @@ def load_embeddings_from_hf():
     try:
         print(f" Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
-        # Download the safetensors files
-        from huggingface_hub import hf_hub_download
-        import safetensors
-        # Download CLIP embeddings
-        print("🔍 Downloading CLIP embeddings...")
-        clip_embeddings_path = hf_hub_download(
-            repo_id=ARTEFACT_EMBEDDINGS_DATASET,
-            filename="clip_embeddings.safetensors",
-            repo_type="dataset"
-        )
-        clip_ids_path = hf_hub_download(
-            repo_id=ARTEFACT_EMBEDDINGS_DATASET,
-            filename="clip_embeddings_sentence_ids.json",
-            repo_type="dataset"
-        )
-        # Download PaintingCLIP embeddings
-        print("🔍 Downloading PaintingCLIP embeddings...")
-        paintingclip_embeddings_path = hf_hub_download(
-            repo_id=ARTEFACT_EMBEDDINGS_DATASET,
-            filename="paintingclip_embeddings.safetensors",
-            repo_type="dataset"
-        )
-        paintingclip_ids_path = hf_hub_download(
-            repo_id=ARTEFACT_EMBEDDINGS_DATASET,
-            filename="paintingclip_embeddings_sentence_ids.json",
-            repo_type="dataset"
-        )
-        # Load the embeddings
-        print("🔍 Loading CLIP embeddings...")
-        clip_embeddings = safetensors.torch.load_file(clip_embeddings_path)['embeddings']
-        print("🔍 Loading PaintingCLIP embeddings...")
-        paintingclip_embeddings = safetensors.torch.load_file(paintingclip_embeddings_path)['embeddings']
-        # Load the sentence IDs
-        with open(clip_ids_path, 'r') as f:
-            clip_sentence_ids = json.load(f)
-        with open(paintingclip_ids_path, 'r') as f:
-            paintingclip_sentence_ids = json.load(f)
-        print(f"✅ Loaded CLIP embeddings: {clip_embeddings.shape}")
-        print(f"✅ Loaded PaintingCLIP embeddings: {paintingclip_embeddings.shape}")
-        return {
-            "clip": (clip_embeddings, clip_sentence_ids),
-            "paintingclip": (paintingclip_embeddings, paintingclip_sentence_ids)
-        }
     except Exception as e:
         print(f"❌ Failed to load embeddings from HF: {e}")
         return None
@@ -203,15 +215,12 @@ def _initialize_pipeline():
         if embeddings_data is None:
             raise ValueError(f"Failed to load embeddings from HF dataset: {ARTEFACT_EMBEDDINGS_DATASET}")
-        # Check if we're using streaming
         if embeddings_data.get("streaming", False):
             print("✅ Using streaming embeddings - will load on-demand")
-            # For streaming, we need to handle this differently
-            # We'll return the components but mark embeddings as streaming
-            # The calling code will need to handle this case
             return processor, model, "STREAMING", "STREAMING", "STREAMING", device
         else:
-            # New code path for safetensors files
             if MODEL_TYPE == "clip":
                 embeddings, sentence_ids = embeddings_data["clip"]
             else:

     try:
         print(f" Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
+        if not EMBEDDINGS_DATASETS:
+            print("❌ No embeddings datasets loaded")
+            return None
+        # Check if we're using direct download
+        if EMBEDDINGS_DATASETS.get('use_direct_download', False):
+            print("✅ Using direct file download for embeddings")
+            # Download the safetensors files
+            from huggingface_hub import hf_hub_download
+            import safetensors
+            # Download CLIP embeddings
+            print("🔍 Downloading CLIP embeddings...")
+            clip_embeddings_path = hf_hub_download(
+                repo_id=ARTEFACT_EMBEDDINGS_DATASET,
+                filename="clip_embeddings.safetensors",
+                repo_type="dataset"
+            )
+            clip_ids_path = hf_hub_download(
+                repo_id=ARTEFACT_EMBEDDINGS_DATASET,
+                filename="clip_embeddings_sentence_ids.json",
+                repo_type="dataset"
+            )
+            # Download PaintingCLIP embeddings
+            print("🔍 Downloading PaintingCLIP embeddings...")
+            paintingclip_embeddings_path = hf_hub_download(
+                repo_id=ARTEFACT_EMBEDDINGS_DATASET,
+                filename="paintingclip_embeddings.safetensors",
+                repo_type="dataset"
+            )
+            paintingclip_ids_path = hf_hub_download(
+                repo_id=ARTEFACT_EMBEDDINGS_DATASET,
+                filename="paintingclip_embeddings_sentence_ids.json",
+                repo_type="dataset"
+            )
+            # Load the embeddings
+            print("🔍 Loading CLIP embeddings...")
+            clip_embeddings = safetensors.torch.load_file(clip_embeddings_path)['embeddings']
+            print("🔍 Loading PaintingCLIP embeddings...")
+            paintingclip_embeddings = safetensors.torch.load_file(paintingclip_embeddings_path)['embeddings']
+            # Load the sentence IDs
+            with open(clip_ids_path, 'r') as f:
+                clip_sentence_ids = json.load(f)
+            with open(paintingclip_ids_path, 'r') as f:
+                paintingclip_sentence_ids = json.load(f)
+            print(f"✅ Loaded CLIP embeddings: {clip_embeddings.shape}")
+            print(f"✅ Loaded PaintingCLIP embeddings: {paintingclip_embeddings.shape}")
+            return {
+                "clip": (clip_embeddings, clip_sentence_ids),
+                "paintingclip": (paintingclip_embeddings, paintingclip_sentence_ids)
+            }
+        else:
+            # Fallback to old method if not using direct download
+            print("⚠️  Using fallback embedding loading method")
+            return None
     except Exception as e:
         print(f"❌ Failed to load embeddings from HF: {e}")
         return None
         if embeddings_data is None:
             raise ValueError(f"Failed to load embeddings from HF dataset: {ARTEFACT_EMBEDDINGS_DATASET}")
+        # Check if we're using streaming (old approach)
         if embeddings_data.get("streaming", False):
             print("✅ Using streaming embeddings - will load on-demand")
             return processor, model, "STREAMING", "STREAMING", "STREAMING", device
         else:
+            # New code path for direct file download
             if MODEL_TYPE == "clip":
                 embeddings, sentence_ids = embeddings_data["clip"]
             else: