Add logging for large output

- backend/runner/inference.py  +54 -3
- consolidate_embeddings.py    +0 -81
backend/runner/inference.py
CHANGED

@@ -646,6 +646,7 @@ def run_inference_streaming(
     """Run inference using streaming embeddings"""
     try:
         print(f"🔍 Running streaming inference for {image_path}")
+        start_time = time.time()
 
         # Load and preprocess the image
         print(f"🔍 Loading and preprocessing image: {image_path}")
@@ -653,12 +654,14 @@ def run_inference_streaming(
         print(f"✅ Image loaded successfully, size: {image.size}")
 
         # Compute image embedding
+        print(f"🔍 Computing image embedding...")
         inputs = processor(images=image, return_tensors="pt")
         inputs = {k: v.to(device) for k, v in inputs.items()}
 
         with torch.no_grad():
             image_features = model.get_image_features(**inputs)
             image_embedding = F.normalize(image_features.squeeze(0), dim=-1)
+        print(f"✅ Image embedding computed successfully")
 
         # Get streaming dataset
         if not EMBEDDINGS_DATASETS or not EMBEDDINGS_DATASETS.get('use_streaming', False):
@@ -670,14 +673,35 @@ def run_inference_streaming(
         results = []
         batch_size = 1000
         batch = []
+        total_processed = 0
+        batch_count = 0
 
-        print(f"🔍
+        print(f"🔍 Starting streaming processing of 3.1M+ sentence embeddings...")
+        print(f"🔍 Batch size: {batch_size}")
+        print(f"🔍 Target top-k: {top_k}")
+
+        # Estimate total items for progress tracking
+        try:
+            # Try to get dataset size if available
+            if hasattr(dataset, '__len__'):
+                total_items = len(dataset)
+                print(f"🔍 Total embeddings to process: {total_items:,}")
+            else:
+                total_items = None
+                print(f"🔍 Dataset size unknown (streaming mode)")
+        except:
+            total_items = None
 
         for item in dataset:
             batch.append(item)
+            total_processed += 1
 
             if len(batch) >= batch_size:
+                batch_count += 1
+                batch_start_time = time.time()
+
                 # Process batch
+                print(f"🔍 Processing batch {batch_count} ({total_processed:,} items processed)...")
                 batch_results = process_embedding_batch_streaming(
                     batch, image_embedding, model_type, device
                 )
@@ -688,10 +712,29 @@ def run_inference_streaming(
                 results.sort(key=lambda x: x["score"], reverse=True)
                 results = results[:top_k]
 
-
+                batch_time = time.time() - batch_start_time
+                elapsed_time = time.time() - start_time
+
+                # Progress reporting
+                if total_items:
+                    progress_pct = (total_processed / total_items) * 100
+                    print(f"🔍 Batch {batch_count} completed in {batch_time:.2f}s")
+                    print(f"🔍 Progress: {total_processed:,}/{total_items:,} ({progress_pct:.1f}%)")
+                    print(f"🔍 Elapsed time: {elapsed_time:.1f}s")
+                    print(f"🔍 Current top score: {results[0]['score']:.4f}" if results else "🔍 Current top score: N/A")
+                    print(f"🔍 Estimated time remaining: {((elapsed_time / total_processed) * (total_items - total_processed)):.1f}s")
+                else:
+                    print(f"🔍 Batch {batch_count} completed in {batch_time:.2f}s")
+                    print(f"🔍 Total processed: {total_processed:,}")
+                    print(f"🔍 Elapsed time: {elapsed_time:.1f}s")
+                    print(f"🔍 Current top score: {results[0]['score']:.4f}" if results else "🔍 Current top score: N/A")
+
+                print(f"🔍 Current top result: {results[0]['english_original'][:100]}..." if results else "No results yet")
+                print("─" * 80)
 
         # Process remaining items
         if batch:
+            print(f"🔍 Processing final batch of {len(batch)} items...")
            batch_results = process_embedding_batch_streaming(
                 batch, image_embedding, model_type, device
             )
@@ -699,7 +742,15 @@ def run_inference_streaming(
         results.sort(key=lambda x: x["score"], reverse=True)
         results = results[:top_k]
 
-
+        total_time = time.time() - start_time
+        print(f"✅ Streaming inference completed!")
+        print(f"🔍 Total time: {total_time:.2f}s")
+        print(f"🔍 Total embeddings processed: {total_processed:,}")
+        print(f"🔍 Final results: {len(results)} items")
+        if results:
+            print(f"🔍 Top result score: {results[0]['score']:.4f}")
+            print(f"🔍 Top result: {results[0]['english_original'][:100]}...")
+
         return results
 
     except Exception as e:
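Note: the scoring helper process_embedding_batch_streaming is called above but not shown in this commit. As a rough sketch only — assuming each streamed item carries an "embedding" vector and an "english_original" sentence (field names inferred from the result dicts the loop reads) — a cosine-similarity batch scorer could look like this; the real helper may differ:

    import torch
    import torch.nn.functional as F

    def process_embedding_batch_streaming(batch, image_embedding, model_type, device):
        # Hypothetical item layout: {"embedding": [...], "english_original": "..."}.
        # model_type is unused in this sketch; the real helper may branch on it.
        embs = torch.tensor([item["embedding"] for item in batch], device=device)
        embs = F.normalize(embs, dim=-1)
        # Both sides are unit-norm, so the dot product is cosine similarity.
        scores = embs @ image_embedding.to(device)
        return [
            {"score": score.item(), "english_original": item["english_original"]}
            for score, item in zip(scores, batch)
        ]

Two design points visible in the diff: sorting and slicing to top_k after every batch bounds memory to roughly top_k + batch_size entries regardless of dataset size, and the ETA formula is a simple linear extrapolation — e.g. after 250,000 of 3,100,000 items in 50 s, (50 / 250000) * (3100000 - 250000) ≈ 570 s remaining.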
consolidate_embeddings.py
DELETED

@@ -1,81 +0,0 @@
-#!/usr/bin/env python3
-import json
-import sys
-from pathlib import Path
-from typing import List, Tuple
-
-import torch
-from safetensors.torch import save_file
-
-ROOT = Path(__file__).resolve().parent
-DATA_DIR = ROOT / "data" / "embeddings"
-CLIP_DIR = DATA_DIR / "CLIP_Embeddings"
-PAINTINGCLIP_DIR = DATA_DIR / "PaintingCLIP_Embeddings"
-
-def load_one(pt_path: Path) -> torch.Tensor:
-    """Load a single .pt embedding, handling dict-or-tensor variants."""
-    obj = torch.load(pt_path, map_location="cpu", weights_only=True)
-    if isinstance(obj, torch.Tensor):
-        return obj
-    if isinstance(obj, dict):
-        for k in ("embedding", "embeddings", "features"):
-            if k in obj:
-                t = obj[k]
-                if isinstance(t, torch.Tensor):
-                    return t
-    raise ValueError(f"Unsupported .pt content in {pt_path}")
-
-def derive_id_from_filename(stem: str) -> str:
-    """
-    - CLIP: Wxxxx_sYYYY_clip → Wxxxx_sYYYY
-    - PaintingCLIP: Wxxxx_sYYYY_painting_clip → Wxxxx_sYYYY
-    """
-    if stem.endswith("_painting_clip"):
-        return stem[: -len("_painting_clip")]
-    if stem.endswith("_clip"):
-        return stem[: -len("_clip")]
-    return stem  # fallback
-
-def consolidate_dir(indir: Path) -> Tuple[torch.Tensor, List[str]]:
-    pt_files = sorted(indir.glob("*.pt"))
-    if not pt_files:
-        raise RuntimeError(f"No .pt files found under {indir}")
-
-    embs: List[torch.Tensor] = []
-    ids: List[str] = []
-
-    for i, p in enumerate(pt_files, 1):
-        e = load_one(p).float()
-        if e.ndim > 1:
-            e = e.squeeze()
-        if e.ndim != 1:
-            raise ValueError(f"Embedding is not 1D in {p}: shape={tuple(e.shape)}")
-        embs.append(e)
-        ids.append(derive_id_from_filename(p.stem))
-        if i % 1000 == 0:
-            print(f"... processed {i} files from {indir}")
-
-    # Stack to [N, D]
-    embeddings = torch.stack(embs, dim=0).contiguous()
-    return embeddings, ids
-
-def save_as_safetensors(embeddings: torch.Tensor, ids: List[str], out_prefix: Path) -> None:
-    out_st = out_prefix.with_suffix(".safetensors")
-    out_json = out_prefix.with_name(out_prefix.name + "_sentence_ids.json")
-    save_file({"embeddings": embeddings}, str(out_st))
-    with open(out_json, "w", encoding="utf-8") as f:
-        json.dump(ids, f, ensure_ascii=False, indent=2)
-    print(f"Saved embeddings: {out_st} [{tuple(embeddings.shape)}]")
-    print(f"Saved sentence IDs: {out_json} [{len(ids)} ids]")
-
-def main():
-    print("Consolidating CLIP...")
-    clip_emb, clip_ids = consolidate_dir(CLIP_DIR)
-    save_as_safetensors(clip_emb, clip_ids, DATA_DIR / "clip_embeddings")
-
-    print("Consolidating PaintingCLIP...")
-    pclip_emb, pclip_ids = consolidate_dir(PAINTINGCLIP_DIR)
-    save_as_safetensors(pclip_emb, pclip_ids, DATA_DIR / "paintingclip_embeddings")
-
-if __name__ == "__main__":
-    main()
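Note: the deleted script wrote a single "embeddings" tensor per model plus an index-aligned JSON list of sentence IDs. For reference, a minimal sketch of reading those artifacts back with safetensors' standard loader (file names taken from the script above; paths assume running from the directory it lived in):

    import json
    from safetensors.torch import load_file

    # "embeddings" is the only key written by save_as_safetensors above.
    embeddings = load_file("data/embeddings/clip_embeddings.safetensors")["embeddings"]  # [N, D]
    with open("data/embeddings/clip_embeddings_sentence_ids.json", encoding="utf-8") as f:
        ids = json.load(f)
    assert len(ids) == embeddings.shape[0]  # IDs are row-aligned with the matrix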