# HF_Space_hipVS/embedding.py
# ============================
# Multimodal embedding + LLM calls.
#
# Embedding strategy: NO CAPTIONING.
#   GPU: Qwen3-VL-Embedding (2B or 8B) — encodes images AND text into same space
#   CPU: CLIP ViT-L/14 — same idea, lighter weight
#
# LLM strategy:
#   Primary:  Qwen3-35B-A3B (local or HF Inference API)
#   Fallback: Qwen3-1.7B or HF Inference API

import logging
import io

import numpy as np
from PIL import Image as PILImage

logger = logging.getLogger(__name__)

# ── Lazy-loaded model singletons ─────────────────────────────────────────────
_embed_model = None
_embed_processor = None
_embed_tokenizer = None  # never assigned in this module; kept for compatibility
_is_clip = False


def _load_embed_model():
    """
    Lazy-init the multimodal embedding model (no-op once loaded).

    The branch is chosen from ``config.EMBED_MODEL``:
      * name contains "clip" -> CLIPModel + CLIPProcessor
      * anything else        -> AutoModel + AutoProcessor with
        ``trust_remote_code=True`` (float16 when ``config.USE_GPU`` is truthy,
        float32 otherwise)

    Model and processor are loaded into locals first and only published to the
    module globals once both succeeded, so a failure while loading the
    processor cannot leave a half-initialised singleton behind.
    """
    global _embed_model, _embed_processor, _is_clip
    if _embed_model is not None:
        return

    import torch
    from config import EMBED_MODEL, DEVICE, USE_GPU

    use_clip = "clip" in EMBED_MODEL.lower()

    if use_clip:
        # ── CLIP path (CPU fallback) ────────────────────────────────────
        from transformers import CLIPModel, CLIPProcessor

        logger.info("Loading CLIP model: %s on %s", EMBED_MODEL, DEVICE)
        model = CLIPModel.from_pretrained(EMBED_MODEL).to(DEVICE)
        processor = CLIPProcessor.from_pretrained(EMBED_MODEL)
    else:
        # ── Qwen3-VL-Embedding path (GPU) ──────────────────────────────
        from transformers import AutoModel, AutoProcessor

        dtype = torch.float16 if USE_GPU else torch.float32
        logger.info("Loading Qwen3-VL-Embedding: %s on %s", EMBED_MODEL, DEVICE)
        model = AutoModel.from_pretrained(
            EMBED_MODEL,
            torch_dtype=dtype,
            trust_remote_code=True,
        ).to(DEVICE)
        processor = AutoProcessor.from_pretrained(
            EMBED_MODEL,
            trust_remote_code=True,
        )

    model.eval()

    # Publish the model last: `_embed_model is not None` is the "already
    # loaded" guard above, so everything else must be in place before it.
    _embed_processor = processor
    _is_clip = use_clip
    _embed_model = model

    if use_clip:
        logger.info("CLIP model loaded")
    else:
        logger.info("Qwen3-VL-Embedding model loaded")


# ── Shared helpers ───────────────────────────────────────────────────────────
def _pool_features(outputs):
    """
    Reduce a non-CLIP model output to one vector per input.

    Uses ``outputs.pooler_output`` when it exists and is not None, otherwise
    the hidden state at sequence position 0.
    """
    pooled = getattr(outputs, "pooler_output", None)
    if pooled is not None:
        return pooled
    # NOTE(review): position-0 pooling suits encoder-style ([CLS]) models; if
    # EMBED_MODEL is a causal decoder, last-token pooling is probably what the
    # model expects — confirm against the model card before relying on this.
    return outputs.last_hidden_state[:, 0, :]


def _l2_normalize(vectors: np.ndarray) -> np.ndarray:
    """
    L2-normalize along the last axis and return a float32 array.

    Zero-norm vectors are returned unchanged (divided by 1) rather than
    producing NaNs.
    """
    vectors = np.asarray(vectors, dtype=np.float32)
    norms = np.linalg.norm(vectors, axis=-1, keepdims=True)
    norms[norms == 0] = 1.0
    return vectors / norms


def _embed_text_batch(texts) -> np.ndarray:
    """Embed a non-empty batch of strings; returns an (N, D) normalized array."""
    import torch
    from config import DEVICE

    _load_embed_model()

    with torch.no_grad():
        inputs = _embed_processor(
            text=texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(DEVICE)
        if _is_clip:
            features = _embed_model.get_text_features(**inputs)
        else:
            # Qwen3-VL-Embedding: text-only input
            features = _pool_features(_embed_model(**inputs))

    return _l2_normalize(features.cpu().float().numpy())


# ── Text Embedding ───────────────────────────────────────────────────────────
def embed_text(text: str) -> np.ndarray:
    """
    Embed a text string into the shared multimodal vector space.

    Works with both the CLIP and the Qwen3-VL-Embedding branch.
    Returns a normalized 1-D float32 numpy vector.
    """
    return _embed_text_batch([text])[0]


def embed_texts(texts: list[str]) -> np.ndarray:
    """
    Batch embed multiple texts. Returns (N, D) float32 array.

    An empty input returns an empty ``(0, 0)`` float32 array without loading
    the model (the embedding dimension is unknown until the model is loaded).
    """
    if len(texts) == 0:
        return np.empty((0, 0), dtype=np.float32)
    return _embed_text_batch(texts)


# ── Image Embedding (direct, no captioning) ─────────────────────────────────
def embed_image(image: PILImage.Image) -> np.ndarray:
    """
    Embed a PIL Image directly into the shared vector space.

    No captioning step — the image goes straight through the model's
    processor. Non-RGB images are converted to RGB first.
    Returns a normalized 1-D float32 numpy vector.
    """
    import torch
    from config import DEVICE

    _load_embed_model()

    if image.mode != "RGB":
        image = image.convert("RGB")

    with torch.no_grad():
        # NOTE(review): some vision-language processors require a text prompt
        # with image placeholders; confirm that an image-only call is accepted
        # by the non-CLIP EMBED_MODEL in use.
        inputs = _embed_processor(images=image, return_tensors="pt").to(DEVICE)
        if _is_clip:
            features = _embed_model.get_image_features(**inputs)
        else:
            # Qwen3-VL-Embedding: image input via processor
            features = _pool_features(_embed_model(**inputs))

    return _l2_normalize(features.squeeze(0).cpu().float().numpy())


def embed_image_bytes(data: bytes, mime_type: str = "image/jpeg") -> np.ndarray:
    """
    Embed raw image bytes. Returns normalized float32 vector.

    ``mime_type`` is accepted for interface compatibility but is not used.
    The decoded image is closed once the embedding has been computed.
    """
    with PILImage.open(io.BytesIO(data)) as image:
        return embed_image(image)


# ── LLM Summarization ──────────────────────────────────────────────────────
def _as_float(value, default: float = 0.0) -> float:
    """Coerce *value* to float, falling back to *default* for None/garbage."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _format_results(search_results: list[dict], mode: str) -> str:
    """Render search hits as one bullet line per result for the prompt."""
    if mode == "video":
        return "\n".join(
            f" - Video: {r.get('video_name', '?')}, "
            f"Time: {r.get('timestamp_label', '?')} "
            f"({_as_float(r.get('timestamp_sec', 0)):.1f}s), "
            f"Score: {_as_float(r.get('score', 0)):.4f}"
            for r in search_results
        )
    return "\n".join(
        f" - Image: {r.get('file_name', '?')}, "
        f"Score: {_as_float(r.get('score', 0)):.4f}"
        for r in search_results
    )


def llm_summarize(query: str, search_results: list[dict], mode: str = "image") -> str:
    """
    Pass search results through an LLM for human-friendly interpretation.

    Tries the HF Inference API with ``config.LLM_MODEL`` and then
    ``config.LLM_FALLBACK``; if neither yields non-empty text (or
    ``huggingface_hub`` is not installed), returns a plain markdown listing
    of the results instead. Never raises because of an LLM failure.

    ``mode == "video"`` formats results with video name / timestamp; any other
    value formats them as image results. An empty ``search_results`` returns a
    "no results" message immediately.
    """
    if not search_results:
        return f'No results found for "{query}". Try uploading more media or using different search terms.'

    from config import LLM_MODEL, LLM_FALLBACK, HF_TOKEN

    # Build prompt context
    results_text = _format_results(search_results, mode)
    if mode == "video":
        instruction = (
            "You are a vision search assistant. Summarize the video search results below. "
            "Highlight the most relevant moments and time ranges. Be concise. Use markdown."
        )
    else:
        instruction = (
            "You are a vision search assistant. Summarize the image search results below. "
            "Highlight the most relevant matches. Be concise. Use markdown."
        )

    prompt = (
        f"{instruction}\n\n"
        f"User query: \"{query}\"\n\n"
        f"Search results ({len(search_results)} matches):\n{results_text}\n\n"
        f"Summary:"
    )

    # Try the HF Inference API with the primary model, then the fallback.
    try:
        from huggingface_hub import InferenceClient
    except ImportError as exc:
        # Import once, outside the loop: a missing package affects every model.
        logger.warning("huggingface_hub unavailable, skipping LLM summary: %s", exc)
    else:
        for model_id in (LLM_MODEL, LLM_FALLBACK):
            try:
                client = InferenceClient(
                    model=model_id,
                    token=HF_TOKEN or None,
                )
                response = client.text_generation(
                    prompt,
                    max_new_tokens=300,
                    temperature=0.7,
                    do_sample=True,
                )
                if response and response.strip():
                    return response.strip()
            except Exception as e:
                # Deliberate best-effort boundary: any API/network failure
                # falls through to the next model, then to plain text.
                logger.warning("LLM %s failed: %s", model_id, e)
                continue

    # Plain text fallback
    return (
        f"**Found {len(search_results)} results for \"{query}\"**\n\n"
        f"_(LLM summary unavailable)_\n\n"
        f"```\n{results_text}\n```"
    )