# small_object_detection / nomic_fewshot.py
# Napron's picture
# modified nomic encode images
# 736b2a1 verified
"""
Few-shot object classification using Nomic embed-vision-v1.5 + embed-text-v1.5 via ONNX Runtime.
Same treatment as current PyTorch version:
- vision refs -> average image embeddings
- text prompts -> average text embeddings
- combine with text_weight
This version uses:
- nomic-ai/nomic-embed-text-v1.5 -> ONNX
- nomic-ai/nomic-embed-vision-v1.5 -> ONNX
Transformers is used only for preprocessing:
- AutoTokenizer
- AutoImageProcessor
"""
import time
from pathlib import Path
import numpy as np
import onnxruntime as ort
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import AutoImageProcessor, AutoTokenizer
from jina_fewshot import CLASS_PROMPTS, IMAGE_EXTS
def _l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray:
x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0)
norms = np.linalg.norm(x, axis=axis, keepdims=True)
norms = np.maximum(norms, eps)
return (x / norms).astype(np.float32)
def _mean_pool(last_hidden_state: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
"""
last_hidden_state: [B, T, D]
attention_mask: [B, T]
"""
mask = attention_mask.astype(np.float32)[..., None] # [B, T, 1]
summed = np.sum(last_hidden_state * mask, axis=1)
denom = np.clip(np.sum(mask, axis=1), 1e-9, None)
return summed / denom
def _pick_output(outputs: list[np.ndarray], output_names: list[str], kind: str) -> np.ndarray:
"""
Try to find the main embedding tensor robustly.
For both text and vision Nomic ONNX exports, we expect a 3D tensor [B, T, D]
or sometimes a 2D tensor [B, D].
"""
# Prefer names that look like hidden states / embeddings
preferred_keywords = [
"last_hidden_state",
"hidden_state",
"sentence_embedding",
"embedding",
"embeddings",
]
for kw in preferred_keywords:
for i, name in enumerate(output_names):
if kw in name.lower():
arr = outputs[i]
if arr.ndim in (2, 3):
return arr
# Fallback: first 3D output, then first 2D output
for arr in outputs:
if arr.ndim == 3:
return arr
for arr in outputs:
if arr.ndim == 2:
return arr
raise RuntimeError(
f"Could not identify a usable {kind} ONNX output. "
f"Output names={output_names}, shapes={[getattr(o, 'shape', None) for o in outputs]}"
)
def _download_onnx_model(repo_id: str, filename: str = "onnx/model.onnx") -> str:
    """Fetch an ONNX export from the Hugging Face Hub and return its local cache path."""
    print(f" Downloading ONNX model from {repo_id} ...")
    local_path = hf_hub_download(repo_id=repo_id, filename=filename)
    print(f" Downloaded: {local_path}")
    return local_path
class NomicTextEncoderONNX:
    """
    ONNX Runtime wrapper for nomic-embed-text-v1.5.

    Pipeline: text -> token hidden states -> mean pool -> L2 normalize.
    """

    def __init__(self, device: str = "cuda"):
        self.device = device
        self.repo_id = "nomic-ai/nomic-embed-text-v1.5"
        print("[*] Loading nomic-embed-text-v1.5 (ONNX)...")
        start = time.perf_counter()
        onnx_path = _download_onnx_model(self.repo_id)
        # Use CUDA only when requested AND actually available in this ORT build.
        use_cuda = device == "cuda" and "CUDAExecutionProvider" in ort.get_available_providers()
        if use_cuda:
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]
        print(f" ONNX providers: {providers}")
        self.session = ort.InferenceSession(onnx_path, providers=providers)
        self.tokenizer = AutoTokenizer.from_pretrained(self.repo_id, trust_remote_code=True)
        self.input_names = [inp.name for inp in self.session.get_inputs()]
        self.output_names = [out.name for out in self.session.get_outputs()]
        print(f" ONNX inputs: {self.input_names}")
        print(f" ONNX outputs: {self.output_names}")
        # Map the graph's input names onto the tensors we feed; exports vary in naming.
        self._ids_name = None
        self._mask_name = None
        self._token_type_name = None
        for name in self.input_names:
            lowered = name.lower()
            if "input_ids" in lowered:
                self._ids_name = name
            elif "attention" in lowered:
                self._mask_name = name
            elif "token_type" in lowered:
                self._token_type_name = name
        print(
            f" Mapped: input_ids={self._ids_name}, "
            f"attention_mask={self._mask_name}, token_type_ids={self._token_type_name}"
        )
        # Sanity check: run one tiny forward pass and report the embedding norm.
        probe = self.encode_texts(["a red square"])
        probe_norm = float(np.linalg.norm(probe[0]))
        print(f" [SANITY] text embed norm={probe_norm:.4f}")
        print(f"[*] Loaded in {time.perf_counter() - start:.1f}s\n")

    def encode_texts(self, texts: list[str]) -> np.ndarray:
        """Embed *texts* (with the Nomic 'classification:' task prefix).

        Returns a [B, D] float32 array whose rows are L2-normalized.
        """
        tokens = self.tokenizer(
            [f"classification: {t}" for t in texts],
            padding=True,
            truncation=True,
            return_tensors="np",
            max_length=512,
        )
        input_ids = np.asarray(tokens["input_ids"], dtype=np.int64)
        attention_mask = np.asarray(tokens["attention_mask"], dtype=np.int64)
        # Only feed inputs the graph actually declares.
        feeds = {}
        for feed_name, tensor in (
            (self._ids_name, input_ids),
            (self._mask_name, attention_mask),
            (self._token_type_name, np.zeros_like(input_ids, dtype=np.int64)),
        ):
            if feed_name is not None:
                feeds[feed_name] = tensor
        raw_outputs = self.session.run(self.output_names, feeds)
        main_out = _pick_output(raw_outputs, self.output_names, kind="text")
        # Mirror the PyTorch reference: mean-pool 3D hidden states; 2D is pre-pooled.
        if main_out.ndim == 3:
            pooled = _mean_pool(main_out, attention_mask)
        elif main_out.ndim == 2:
            pooled = main_out
        else:
            raise RuntimeError(f"Unexpected text output rank: {main_out.ndim}")
        return _l2_normalize(pooled, axis=1)
class NomicVisionEncoderONNX:
    """
    ONNX Runtime wrapper for nomic-embed-vision-v1.5.

    Pipeline: image -> hidden states -> CLS token -> L2 normalize.
    """

    def __init__(self, device: str = "cuda"):
        self.device = device
        self.repo_id = "nomic-ai/nomic-embed-vision-v1.5"
        print("[*] Loading nomic-embed-vision-v1.5 (ONNX)...")
        start = time.perf_counter()
        onnx_path = _download_onnx_model(self.repo_id)
        # Use CUDA only when requested AND actually available in this ORT build.
        use_cuda = device == "cuda" and "CUDAExecutionProvider" in ort.get_available_providers()
        if use_cuda:
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]
        print(f" ONNX providers: {providers}")
        self.session = ort.InferenceSession(onnx_path, providers=providers)
        self.processor = AutoImageProcessor.from_pretrained(self.repo_id, trust_remote_code=True)
        self.input_names = [inp.name for inp in self.session.get_inputs()]
        self.output_names = [out.name for out in self.session.get_outputs()]
        print(f" ONNX inputs: {self.input_names}")
        print(f" ONNX outputs: {self.output_names}")
        # Find the pixel-values input; exports vary in exact naming.
        self._pixel_name = next(
            (name for name in self.input_names if "pixel" in name.lower()), None
        )
        print(f" Mapped: pixel_values={self._pixel_name}")
        # Sanity check: encode a solid-color dummy image and report its norm.
        dummy = Image.new("RGB", (224, 224), color=(255, 0, 0))
        probe = self.encode_images([dummy])
        probe_norm = float(np.linalg.norm(probe[0]))
        print(f" [SANITY] image embed norm={probe_norm:.4f}")
        print(f"[*] Loaded in {time.perf_counter() - start:.1f}s\n")

    def encode_images(self, images: list[Image.Image]) -> np.ndarray:
        """Embed PIL images; returns a [B, D] float32 array with L2-normalized rows."""
        rgb_images = [im.convert("RGB") for im in images]
        processed = self.processor(images=rgb_images, return_tensors="pt")
        if "pixel_values" not in processed:
            raise RuntimeError(f"Processor did not return pixel_values. Keys={list(processed.keys())}")
        pixel_values = processed["pixel_values"].detach().cpu().numpy().astype(np.float32)
        if self._pixel_name is None:
            raise RuntimeError(f"Could not find pixel input in ONNX inputs: {self.input_names}")
        raw_outputs = self.session.run(self.output_names, {self._pixel_name: pixel_values})
        main_out = _pick_output(raw_outputs, self.output_names, kind="vision")
        if main_out.ndim == 3:
            embs = main_out[:, 0, :]  # CLS token
        elif main_out.ndim == 2:
            embs = main_out
        else:
            raise RuntimeError(f"Unexpected vision output rank: {main_out.ndim}")
        return _l2_normalize(embs, axis=1)
def build_refs_nomic(
    encoder: NomicVisionEncoderONNX,
    refs_dir: Path,
    batch_size: int = 16,
    text_encoder: NomicTextEncoderONNX | None = None,
    text_weight: float = 0.3,
):
    """
    Build one reference embedding per class from the subfolders of `refs_dir`.

    Same treatment as Jina:
    - average reference image embeddings
    - average class prompt text embeddings (when `text_encoder` is given)
    - combine with `text_weight`

    Args:
        encoder: vision encoder used for the reference images.
        refs_dir: directory with one subfolder of reference images per class.
        batch_size: number of images encoded per forward pass.
        text_encoder: optional text encoder; when None, refs are image-only.
        text_weight: blend factor (text vs. image) for the combined embedding.

    Returns:
        (labels, embeddings): class names and a [num_classes, D] float32 array
        of L2-normalized reference embeddings.

    Raises:
        ValueError: if `refs_dir` has no subfolders, or no subfolder contains
            any usable image.
    """
    class_dirs = sorted(d for d in refs_dir.iterdir() if d.is_dir())
    if not class_dirs:
        raise ValueError(f"No subfolders in {refs_dir}")
    labels = []
    embeddings = []
    if text_encoder is not None:
        print(f" Text weight: {text_weight:.1f} | Image weight: {1 - text_weight:.1f}\n")
    for d in class_dirs:
        name = d.name
        paths = sorted(str(p) for p in d.iterdir() if p.suffix.lower() in IMAGE_EXTS)
        if not paths:
            continue
        all_embs = []
        for i in range(0, len(paths), batch_size):
            # Image.open is lazy and keeps the file handle open; use a context
            # manager so descriptors are released as soon as pixels are decoded.
            batch = []
            for p in paths[i:i + batch_size]:
                with Image.open(p) as img:
                    batch.append(img.convert("RGB"))
            all_embs.append(encoder.encode_images(batch))
        img_embs = np.concatenate(all_embs, axis=0)
        img_avg = np.nan_to_num(img_embs.mean(axis=0), nan=0.0, posinf=0.0, neginf=0.0)
        img_avg = img_avg / (np.linalg.norm(img_avg) + 1e-12)
        if text_encoder is not None:
            prompts = CLASS_PROMPTS.get(name, [f"a {name}", f"a person holding a {name}"])
            text_embs = text_encoder.encode_texts(prompts)
            text_avg = np.nan_to_num(text_embs.mean(axis=0), nan=0.0, posinf=0.0, neginf=0.0)
            text_avg = text_avg / (np.linalg.norm(text_avg) + 1e-12)
            combined = (1.0 - text_weight) * img_avg + text_weight * text_avg
            combined = np.nan_to_num(combined, nan=0.0, posinf=0.0, neginf=0.0)
            combined = combined / (np.linalg.norm(combined) + 1e-12)
            labels.append(name)
            embeddings.append(combined)
            # Diagnostic: agreement between the image and text prototypes.
            sim = float(np.dot(img_avg, text_avg))
            print(
                f" {name:<14}: {len(paths)} imgs + {len(prompts)} prompts | "
                f"img-text sim: {sim:.4f}"
            )
        else:
            labels.append(name)
            embeddings.append(img_avg)
            print(f" {name:<14}: {len(paths)} imgs")
    # np.stack on an empty list raises a cryptic error; fail with a clear message.
    if not embeddings:
        raise ValueError(f"No reference images found under {refs_dir}")
    return labels, np.stack(embeddings).astype(np.float32)
# Backward-compatible aliases: callers importing the non-ONNX names keep working.
NomicTextEncoder = NomicTextEncoderONNX
NomicVisionEncoder = NomicVisionEncoderONNX