Spaces:

lablab-ai-amd-developer-hackathon
/

ROCKIT-Vision-Intelligence

Sleeping

App Files Files Community

ROCKIT-Vision-Intelligence / embedding.py

Billavenu

adding filleeeesssss

fb12ddc verified 19 days ago

raw

history blame contribute delete

9.03 kB

	# HF_Space_hipVS/embedding.py
	# ============================
	# Multimodal embedding + LLM calls.
	#
	# Embedding strategy: NO CAPTIONING.
	# GPU: Qwen3-VL-Embedding (2B or 8B) — encodes images AND text into same space
	# CPU: CLIP ViT-L/14 — same idea, lighter weight
	#
	# LLM strategy:
	# Primary: Qwen3-35B-A3B (local or HF Inference API)
	# Fallback: Qwen3-1.7B or HF Inference API

	import logging
	import io
	import numpy as np
	from PIL import Image as PILImage

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# ── Lazy-loaded model singletons ─────────────────────────────────────────────
	# All of these start unset; _load_embed_model() fills them in on first use so
	# that importing this module does not by itself load any model.

	_embed_model = None  # embedding model instance (from CLIPModel or AutoModel)
	_embed_processor = None  # matching processor (from CLIPProcessor or AutoProcessor)
	_embed_tokenizer = None  # NOTE(review): nothing in the visible code ever assigns this
	_is_clip = False  # set to True when the loaded checkpoint took the CLIP path


	def _load_embed_model():
	    """
	    Lazy-init the multimodal embedding model and its processor.

	    CLIP path: taken when "clip" appears in EMBED_MODEL (case-insensitive);
	    loads CLIPModel + CLIPProcessor via transformers.
	    Other path: loads EMBED_MODEL via AutoModel + AutoProcessor with
	    trust_remote_code=True, in float16 when USE_GPU is set, else float32.

	    The call is a no-op once a model has been loaded. The module globals are
	    only updated after BOTH the model and the processor loaded successfully,
	    so a failure part-way through leaves the module unloaded and the next
	    call retries instead of returning early with a missing processor.

	    Raises:
	        Whatever the underlying ``from_pretrained`` / ``.to`` calls raise.
	    """
	    global _embed_model, _embed_processor, _embed_tokenizer, _is_clip
	    if _embed_model is not None:
	        return

	    import torch
	    from config import EMBED_MODEL, DEVICE, USE_GPU

	    use_clip = "clip" in EMBED_MODEL.lower()

	    if use_clip:
	        # ── CLIP path (CPU fallback) ────────────────────────────────────
	        from transformers import CLIPModel, CLIPProcessor

	        logger.info("Loading CLIP model: %s on %s", EMBED_MODEL, DEVICE)
	        model = CLIPModel.from_pretrained(EMBED_MODEL).to(DEVICE)
	        processor = CLIPProcessor.from_pretrained(EMBED_MODEL)
	        loaded_message = "CLIP model loaded"
	    else:
	        # ── Qwen3-VL-Embedding path (GPU) ──────────────────────────────
	        from transformers import AutoModel, AutoProcessor

	        dtype = torch.float16 if USE_GPU else torch.float32
	        logger.info("Loading Qwen3-VL-Embedding: %s on %s", EMBED_MODEL, DEVICE)
	        # SECURITY NOTE: trust_remote_code=True executes Python code shipped
	        # in the model repository; only point EMBED_MODEL at trusted repos.
	        model = AutoModel.from_pretrained(
	            EMBED_MODEL,
	            torch_dtype=dtype,
	            trust_remote_code=True,
	        ).to(DEVICE)
	        processor = AutoProcessor.from_pretrained(
	            EMBED_MODEL,
	            trust_remote_code=True,
	        )
	        loaded_message = "Qwen3-VL-Embedding model loaded"

	    model.eval()

	    # Publish everything together, with _embed_model last: the early-return
	    # guard above keys on _embed_model, so it must never be set while the
	    # processor is still missing.
	    _is_clip = use_clip
	    _embed_processor = processor
	    _embed_model = model
	    logger.info(loaded_message)


	# ── Text Embedding ──────────────────────────────────────────────────────────

	def embed_text(text: str) -> np.ndarray:
	    """
	    Embed a single text string into the shared multimodal vector space.

	    Delegates to embed_texts() with a batch of one so that the single and
	    batch paths share one implementation (processor call, feature
	    extraction and L2 normalization) and cannot drift apart.

	    Args:
	        text: The string to embed.

	    Returns:
	        A 1-D, L2-normalized float32 numpy vector (an all-zero vector is
	        returned unchanged rather than divided by zero).
	    """
	    # Row 0 of the (1, D) batch result is the (D,) vector for `text`.
	    return embed_texts([text])[0]


	def embed_texts(texts: list[str]) -> np.ndarray:
	    """
	    Batch-embed multiple texts into the shared multimodal vector space.

	    Args:
	        texts: The strings to embed.

	    Returns:
	        An (N, D) float32 array whose rows are L2-normalized (all-zero rows
	        are left as zeros). For an empty list/tuple an empty (0, 0) float32
	        array is returned without loading or calling the model; D cannot be
	        known without running the model, hence the zero width.
	    """
	    # Nothing to encode: return early so the processor/model are never
	    # invoked with a zero-length batch.
	    if isinstance(texts, (list, tuple)) and not texts:
	        return np.empty((0, 0), dtype=np.float32)

	    import torch
	    from config import DEVICE

	    _load_embed_model()

	    with torch.no_grad():
	        # The processor call is the same for both model families.
	        inputs = _embed_processor(text=texts, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
	        if _is_clip:
	            features = _embed_model.get_text_features(**inputs)
	        else:
	            outputs = _embed_model(**inputs)
	            # Prefer the model's pooled output when it provides one;
	            # otherwise fall back to the hidden state at position 0.
	            # NOTE(review): confirm first-position pooling is what the
	            # configured checkpoint expects.
	            features = getattr(outputs, "pooler_output", None)
	            if features is None:
	                features = outputs.last_hidden_state[:, 0, :]

	    vecs = features.cpu().float().numpy()
	    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
	    # Swap zero norms for 1 so zero rows stay zero instead of becoming NaN.
	    norms = np.where(norms == 0, 1, norms)
	    return vecs / norms


	# ── Image Embedding (direct, no captioning) ─────────────────────────────────

	def embed_image(image: PILImage.Image) -> np.ndarray:
	    """
	    Encode a PIL image straight into the shared vector space (no captioning).

	    The image is converted to RGB when it is in any other mode, run through
	    the loaded processor and model, and the resulting feature vector is
	    L2-normalized (a zero vector is returned as-is).

	    Returns:
	        A 1-D float32 numpy vector.
	    """
	    import torch
	    from config import DEVICE

	    _load_embed_model()

	    rgb = image if image.mode == "RGB" else image.convert("RGB")

	    with torch.no_grad():
	        # Both model families take the image through the same processor call.
	        batch = _embed_processor(images=rgb, return_tensors="pt").to(DEVICE)
	        if _is_clip:
	            pooled = _embed_model.get_image_features(**batch)
	        else:
	            # Non-CLIP model: use its pooled output when present, else the
	            # hidden state at position 0.
	            result = _embed_model(**batch)
	            pooled = getattr(result, "pooler_output", None)
	            if pooled is None:
	                pooled = result.last_hidden_state[:, 0, :]

	    embedding = pooled.squeeze(0).cpu().float().numpy()
	    length = np.linalg.norm(embedding)
	    if length > 0:
	        return embedding / length
	    return embedding


	def embed_image_bytes(data: bytes, mime_type: str = "image/jpeg") -> np.ndarray:
	    """
	    Decode raw image bytes with PIL and embed the result via embed_image().

	    `mime_type` is accepted but not consulted by this function.
	    Returns the normalized float32 vector produced by embed_image().
	    """
	    buffer = io.BytesIO(data)
	    decoded = PILImage.open(buffer)
	    return embed_image(decoded)


	# ── LLM Summarization ──────────────────────────────────────────────────────

	def _llm_safe_float(value, default=0.0):
	    """Coerce *value* to float for numeric formatting, or return *default*."""
	    try:
	        return float(value)
	    except (TypeError, ValueError):
	        return default


	def _llm_format_results(search_results, mode):
	    """Render one bullet line per hit; video hits also carry their timestamp."""
	    if mode == "video":
	        return "\n".join(
	            f" - Video: {r.get('video_name', '?')}, "
	            f"Time: {r.get('timestamp_label', '?')} "
	            f"({_llm_safe_float(r.get('timestamp_sec', 0)):.1f}s), "
	            f"Score: {_llm_safe_float(r.get('score', 0)):.4f}"
	            for r in search_results
	        )
	    return "\n".join(
	        f" - Image: {r.get('file_name', '?')}, "
	        f"Score: {_llm_safe_float(r.get('score', 0)):.4f}"
	        for r in search_results
	    )


	def llm_summarize(query: str, search_results: list[dict], mode: str = "image") -> str:
	    """
	    Pass search results through an LLM for human-friendly interpretation.

	    Tries LLM_MODEL, then LLM_FALLBACK, through the Hugging Face
	    InferenceClient; if neither yields non-empty text, returns a plain-text
	    listing of the results instead.

	    Args:
	        query: The user's search query, echoed in the prompt and the output.
	        search_results: Result dicts. Video mode reads 'video_name',
	            'timestamp_label', 'timestamp_sec' and 'score'; any other mode
	            reads 'file_name' and 'score'. Missing keys fall back to '?' / 0,
	            and non-numeric numbers are rendered as 0 rather than raising.
	        mode: "video" for video results; anything else is treated as image.

	    Returns:
	        The stripped LLM response, a "no results" message when
	        `search_results` is empty, or the plain-text fallback.
	    """
	    # Answer the empty case first: it needs neither config nor the LLM.
	    if not search_results:
	        return f'No results found for "{query}". Try uploading more media or using different search terms.'

	    from config import LLM_MODEL, LLM_FALLBACK, HF_TOKEN

	    # Build prompt context
	    results_text = _llm_format_results(search_results, mode)
	    if mode == "video":
	        instruction = (
	            "You are a vision search assistant. Summarize the video search results below. "
	            "Highlight the most relevant moments and time ranges. Be concise. Use markdown."
	        )
	    else:
	        instruction = (
	            "You are a vision search assistant. Summarize the image search results below. "
	            "Highlight the most relevant matches. Be concise. Use markdown."
	        )

	    prompt = (
	        f"{instruction}\n\n"
	        f"User query: \"{query}\"\n\n"
	        f"Search results ({len(search_results)} matches):\n{results_text}\n\n"
	        f"Summary:"
	    )

	    # Primary first, then fallback; dict.fromkeys keeps order and drops the
	    # duplicate when both settings name the same model, so a failing model
	    # is not queried twice.
	    for model_id in dict.fromkeys((LLM_MODEL, LLM_FALLBACK)):
	        try:
	            # Imported inside the try so a missing/broken huggingface_hub
	            # degrades to the plain-text fallback instead of raising.
	            from huggingface_hub import InferenceClient

	            client = InferenceClient(
	                model=model_id,
	                token=HF_TOKEN or None,
	            )
	            response = client.text_generation(
	                prompt,
	                max_new_tokens=300,
	                temperature=0.7,
	                do_sample=True,
	            )
	            if response and response.strip():
	                return response.strip()
	        except Exception as e:
	            # Deliberate best-effort boundary: any failure moves on to the
	            # next model and, ultimately, to the plain-text fallback.
	            logger.warning("LLM %s failed: %s", model_id, e)

	    # Plain text fallback
	    return (
	        f"Found {len(search_results)} results for \"{query}\"\n\n"
	        f"_(LLM summary unavailable)_\n\n"
	        f"```\n{results_text}\n```"
	    )