Spaces:

sanskar753
/

event_retrieval

Runtime error

sanskar407

change requirement.txt

546ee63 about 2 months ago

14.6 kB

	"""
	embed_worker.py — Single-GPU embedding worker.

	Usage (run one per GPU in separate terminals):
	CUDA_VISIBLE_DEVICES=5 python embed_worker.py --start 12586 --end 16000 --out embeddings_gpu5.pkl
	CUDA_VISIBLE_DEVICES=6 python embed_worker.py --start 16000 --end 20000 --out embeddings_gpu6.pkl
	CUDA_VISIBLE_DEVICES=7 python embed_worker.py --start 20000 --end 24586 --out embeddings_gpu7.pkl

	The existing embeddings_blip2.pkl already has frames 0-12585 — don't re-do those.
	"""

	import argparse, os, sys, pickle
	import ssl
	# WARNING: this replaces the default HTTPS context factory with the
	# *unverified* one, so HTTPS connections that rely on the default context
	# skip certificate verification for the whole process.
	# NOTE(review): presumably a workaround for model downloads behind a proxy —
	# confirm it is still needed before keeping it.
	ssl._create_default_https_context = ssl._create_unverified_context

	# ── PATHS — adjust if needed ──────────────────────────────────────────────────
	FRAMES_ROOT = "/media/RTCIN15TB/Datasets/NvidiaPhisicalAIFrames"
	CACHE_DIR = "model_cache"

	# The cache folder lives next to this script; it is created on import and
	# exported through both cache-related environment variables.
	script_dir = os.path.dirname(os.path.abspath(__file__))
	local_cache_path = os.path.join(script_dir, CACHE_DIR)
	os.makedirs(local_cache_path, exist_ok=True)
	for _cache_env_var in ('HF_HOME', 'LAVIS_CACHE_ROOT'):
	    os.environ[_cache_env_var] = local_cache_path

	# ── LAVIS path ────────────────────────────────────────────────────────────────
	# Prefer a LAVIS checkout sitting beside the project root (it must contain a
	# "lavis" sub-directory); otherwise fall back to the fixed shared location.
	path_to_project_root = os.path.abspath(os.path.join(script_dir, ".."))
	path_to_lavis_parent_dir = os.path.join(path_to_project_root, "LAVIS")
	_local_lavis_found = all(
	    os.path.isdir(candidate)
	    for candidate in (
	        path_to_lavis_parent_dir,
	        os.path.join(path_to_lavis_parent_dir, "lavis"),
	    )
	)
	if not _local_lavis_found:
	    path_to_lavis_parent_dir = "/media/RTCIN7TBDriveB/Interns/RDT2/gte3kor/LAVIS"
	# Put the chosen directory first on the import path.
	sys.path.insert(0, path_to_lavis_parent_dir)

	# ── Patches (same as main script) ─────────────────────────────────────────────
	import torch, torch.nn as nn, torch.distributions.constraints as constraints, inspect
	from transformers.modeling_utils import PreTrainedModel
	from lavis.models.blip2_models.blip2_qformer import Blip2Qformer

	# Make the positive-definite constraint check tolerate meta tensors: for a
	# meta tensor the patched check answers with an all-True boolean tensor shaped
	# like the input instead of running the original check.
	_pd_constraint_cls = getattr(constraints, '_PositiveDefinite', None)
	if _pd_constraint_cls is not None and hasattr(_pd_constraint_cls, 'check'):
	    _orig_pdc = _pd_constraint_cls.check

	    def _patched_pdc(self, value):
	        """Delegate to the original check unless *value* is a meta tensor."""
	        is_meta_tensor = isinstance(value, torch.Tensor) and value.is_meta
	        if not is_meta_tensor:
	            return _orig_pdc(self, value)
	        return torch.ones_like(value, dtype=torch.bool, device=value.device)

	    constraints._PositiveDefinite.check = _patched_pdc

	# Patch PreTrainedModel._init_added_embeddings_weights_with_mean (only when the
	# installed transformers version has it) so that resizing embeddings whose old
	# weights live on the meta device does not go through the original mean-based
	# initialisation.
	if hasattr(PreTrainedModel, '_init_added_embeddings_weights_with_mean'):
	    _orig_iae = PreTrainedModel._init_added_embeddings_weights_with_mean

	    def _patched_iae(self, new_emb, old_emb, num_added, *args, **kwargs):
	        """Meta-device-aware wrapper around the original initialiser.

	        Any extra positional or keyword arguments are forwarded untouched to
	        the original method, so the wrapper accepts whatever trailing
	        arguments the caller supplies.

	        NOTE(review): the parameter names are this patch's own labels —
	        confirm they line up with the argument order of the installed
	        transformers version.
	        """
	        both_embeddings = isinstance(new_emb, nn.Embedding) and isinstance(old_emb, nn.Embedding)
	        if not both_embeddings:
	            return _orig_iae(self, new_emb, old_emb, num_added, *args, **kwargs)

	        new_w, old_w = new_emb.weight, old_emb.weight
	        if num_added > 0 and old_w.device.type == 'meta':
	            # Rows [old_rows, new_rows) are the freshly added ones.
	            start, end = old_w.shape[0], new_w.shape[0]
	            if new_w.device.type != 'meta' and start < end:
	                # The new weights are real tensors: give the added rows a
	                # plain normal init using the model's configured std.
	                with torch.no_grad():
	                    new_w[start:end].normal_(mean=0.0, std=self.config.initializer_range)
	            # The old weights are on meta, so the original initialiser is
	            # skipped entirely on this path.
	            return
	        return _orig_iae(self, new_emb, old_emb, num_added, *args, **kwargs)

	    PreTrainedModel._init_added_embeddings_weights_with_mean = _patched_iae

	_orig_lsd = nn.Module.load_state_dict
	# Resolved once at import: whether this torch version's load_state_dict
	# accepts the `assign` keyword.
	_LSD_ACCEPTS_ASSIGN = 'assign' in inspect.signature(_orig_lsd).parameters


	def _grid_shape(n_tokens):
	    """Return ``(h, w)`` with ``h * w == n_tokens`` and ``h`` the largest divisor <= sqrt.

	    A perfect square yields a square grid; otherwise the most balanced
	    factor pair is used (``(1, n_tokens)`` for a prime count).

	    Raises:
	        ValueError: if *n_tokens* is smaller than 1.
	    """
	    import math
	    if n_tokens < 1:
	        raise ValueError(f"need at least one patch token, got {n_tokens}")
	    for h in range(int(math.sqrt(n_tokens)), 0, -1):
	        if n_tokens % h == 0:
	            return h, n_tokens // h


	def _resize_pos_embed(ckpt_pos, n_target_patches):
	    """Resample a ``[1, 1 + N, dim]`` position embedding to ``[1, 1 + n_target_patches, dim]``.

	    The first token along dim 1 is carried over unchanged (it is treated as
	    the class token); the remaining tokens are laid out on a 2-D grid,
	    bicubically interpolated to the target grid and flattened again.
	    """
	    cls_tok = ckpt_pos[:, :1, :]
	    patches = ckpt_pos[:, 1:, :]
	    dim = patches.shape[-1]
	    src_h, src_w = _grid_shape(patches.shape[1])
	    dst_h, dst_w = _grid_shape(n_target_patches)
	    # [1, N, dim] -> [1, dim, h, w], the layout interpolate expects.
	    grid = patches.reshape(1, src_h, src_w, dim).permute(0, 3, 1, 2)
	    grid = nn.functional.interpolate(
	        grid.float(), size=(dst_h, dst_w), mode='bicubic', align_corners=False
	    )
	    patches = grid.permute(0, 2, 3, 1).reshape(1, dst_h * dst_w, dim)
	    # Interpolation ran in float32; cast back so the class token and the
	    # patches share one dtype when concatenated.
	    return torch.cat([cls_tok, patches.to(ckpt_pos.dtype)], dim=1)


	def _patched_lsd(self, state_dict, strict=True, assign=False):
	    """Replacement for ``nn.Module.load_state_dict`` with BLIP-2 checkpoint fix-ups.

	    For ``Blip2Qformer`` instances only, *state_dict* is adjusted IN PLACE
	    before loading:
	      * the two ``Qformer.cls.predictions`` tensors are truncated along dim 0
	        when the checkpoint holds more rows than the model;
	      * ``visual_encoder.pos_embed`` is interpolated to the model's token
	        count when the shapes differ.
	    For every module, ``assign`` is forced to True when any parameter is on
	    the meta device. The call is then delegated to the original method,
	    whose result is returned.
	    """
	    if isinstance(self, Blip2Qformer):
	        model_sd = self.state_dict()

	        for key in ["Qformer.cls.predictions.bias", "Qformer.cls.predictions.decoder.weight"]:
	            if key in state_dict and key in model_sd:
	                ckpt_t, model_t = state_dict[key], model_sd[key]
	                # Only truncate; a checkpoint with FEWER rows cannot be
	                # narrowed and is left for load_state_dict to report.
	                if ckpt_t.shape[0] > model_t.shape[0]:
	                    state_dict[key] = ckpt_t.narrow(0, 0, model_t.shape[0])

	        pos_key = "visual_encoder.pos_embed"
	        if pos_key in state_dict and pos_key in model_sd:
	            ckpt_pos, model_pos = state_dict[pos_key], model_sd[pos_key]
	            resizable = (
	                ckpt_pos.dim() == 3 and model_pos.dim() == 3
	                and ckpt_pos.shape[0] == 1
	                and ckpt_pos.shape[1] > 1 and model_pos.shape[1] > 1
	            )
	            # Unexpected layouts are passed through untouched so the normal
	            # size-mismatch error surfaces instead of a reshape failure here.
	            if ckpt_pos.shape != model_pos.shape and resizable:
	                state_dict[pos_key] = _resize_pos_embed(ckpt_pos, model_pos.shape[1] - 1)
	                print(f"INFO: Interpolated pos_embed {ckpt_pos.shape} → {state_dict['visual_encoder.pos_embed'].shape}")

	    # Force assign=True whenever the module still has meta-device parameters.
	    if any(p.is_meta for p in self.parameters()):
	        assign = True
	    if _LSD_ACCEPTS_ASSIGN:
	        return _orig_lsd(self, state_dict, strict=strict, assign=assign)
	    return _orig_lsd(self, state_dict, strict=strict)


	nn.Module.load_state_dict = _patched_lsd
	print("INFO: Patched nn.Module.load_state_dict.")

	# ── Main ──────────────────────────────────────────────────────────────────────
	from lavis.models import load_model_and_preprocess
	from PIL import Image
	import numpy as np

	def discover_frames(root):
	    """List every ``.jpg`` file found at ``root/<chunk>/<video>/``.

	    Chunk entries that are not directories or whose name starts with a dot
	    are ignored, as are non-directory entries inside a chunk. Chunks and
	    videos are visited in sorted order, and the frames of each video are
	    sorted too; the extension test is case-insensitive.

	    Args:
	        root: directory holding the chunk folders.

	    Returns:
	        A list of frame paths, each built by joining onto *root*.
	    """
	    frames = []
	    for chunk_name in sorted(os.listdir(root)):
	        chunk_dir = os.path.join(root, chunk_name)
	        if chunk_name.startswith('.') or not os.path.isdir(chunk_dir):
	            continue
	        for video_name in sorted(os.listdir(chunk_dir)):
	            video_dir = os.path.join(chunk_dir, video_name)
	            if os.path.isdir(video_dir):
	                video_frames = [
	                    os.path.join(video_dir, entry)
	                    for entry in os.listdir(video_dir)
	                    if entry.lower().endswith('.jpg')
	                ]
	                video_frames.sort()
	                frames += video_frames
	    return frames


	def _save_embeddings(embedding_dict, out_pkl):
	    """Pickle *embedding_dict* to *out_pkl* through a temp file and a rename.

	    The output doubles as the resume file, so it is never truncated in
	    place: an interrupted write leaves the previous checkpoint intact.
	    """
	    tmp_path = out_pkl + ".tmp"
	    with open(tmp_path, "wb") as f:
	        pickle.dump(embedding_dict, f)
	    os.replace(tmp_path, out_pkl)


	def run(start: int, end: int, out_pkl: str, override_paths=None):
	    """Embed a slice of frames with BLIP-2 and store the result in a pickle.

	    Args:
	        start: index of the first frame to process (inclusive) within the
	            list returned by ``discover_frames(FRAMES_ROOT)``.
	        end: index one past the last frame to process. ``None`` or a
	            negative value means "through the last frame"; values past the
	            end are clamped.
	        out_pkl: output pickle path, a dict mapping frame path to a CPU
	            embedding tensor. If the file already exists it is loaded and
	            paths already present in it are skipped (resume support).
	        override_paths: optional explicit list of image paths. When given,
	            frame discovery is skipped and *start* / *end* are ignored.

	    Raises:
	        ValueError: if *start* is negative.
	    """
	    device = "cuda:0"  # CUDA_VISIBLE_DEVICES remaps the GPU to index 0

	    # Resolve the work list before touching the model so that an empty or
	    # out-of-range request exits without loading it.
	    if override_paths is not None:
	        all_paths = list(override_paths)
	        start, end = 0, len(all_paths)
	    else:
	        print("Discovering frames …")
	        all_paths = discover_frames(FRAMES_ROOT)

	    total = len(all_paths)
	    if start < 0:
	        raise ValueError(f"start must be >= 0, got {start}")
	    if end is None or end < 0:
	        end = total
	    end = min(end, total)  # clamp safety

	    if start >= total:
	        print(f"⚠️ Start {start} exceeds total frames {total}")
	        return

	    slice_paths = all_paths[start:end]
	    print(f"Processing range [{start}, {end}) out of {total}")

	    # Resume support: load existing partial output if present.
	    if os.path.exists(out_pkl):
	        # NOTE: unpickling can execute arbitrary code — only point --out at
	        # files produced by this script.
	        with open(out_pkl, "rb") as f:
	            embedding_dict = pickle.load(f)
	        print(f"Resumed from {out_pkl}: {len(embedding_dict)} already done.")
	    else:
	        embedding_dict = {}

	    todo = [p for p in slice_paths if p not in embedding_dict]
	    print(f"{len(todo)} frames still need embedding.")

	    BATCH_SIZE = 32
	    SAVE_EVERY = 500

	    if todo:
	        model, vis_processors, _ = load_model_and_preprocess(
	            name="blip2",
	            model_type="gen3_518_518",
	            is_eval=True,
	            device=device
	        )
	        model.eval()
	        preprocess = vis_processors["eval"]

	        computed = 0
	        last_saved = 0
	        with torch.no_grad():
	            for i in range(0, len(todo), BATCH_SIZE):
	                batch_paths = todo[i: i + BATCH_SIZE]
	                images, valid_paths = [], []
	                for p in batch_paths:
	                    # Best effort: an unreadable frame is reported and
	                    # skipped so one bad file does not abort the whole run.
	                    try:
	                        with Image.open(p) as raw:
	                            images.append(preprocess(raw.convert("RGB")))
	                        valid_paths.append(p)
	                    except Exception as e:
	                        print(f" WARNING: {p}: {e}")
	                if not images:
	                    continue

	                image_tensor = torch.stack(images, dim=0).to(device)
	                # Keep index 0 along dim 1 of the projected image embeddings
	                # (presumably the first query token — TODO confirm).
	                feats = model.extract_features(
	                    {"image": image_tensor}, mode="image"
	                ).image_embeds_proj[:, 0, :]

	                for path, emb in zip(valid_paths, feats):
	                    embedding_dict[path] = emb.cpu()
	                computed += len(valid_paths)

	                # Checkpoint once at least SAVE_EVERY new embeddings have
	                # accumulated; an exact-multiple test would rarely fire
	                # because `computed` advances a whole batch at a time.
	                if computed - last_saved >= SAVE_EVERY:
	                    print(f" [{computed}/{len(todo)}] Saving checkpoint → {out_pkl}")
	                    _save_embeddings(embedding_dict, out_pkl)
	                    last_saved = computed

	    print(f"Done. Saving final output → {out_pkl} ({len(embedding_dict)} embeddings)")
	    _save_embeddings(embedding_dict, out_pkl)


	def collect_folder_jpgs(folder):
	    """Return the sorted paths of every ``*.jpg`` file at any depth under *folder*.

	    Files directly inside *folder* are included because a recursive ``**``
	    also matches zero directories. *folder* is escaped, so glob
	    metacharacters in its name are taken literally.
	    """
	    import glob
	    pattern = os.path.join(glob.escape(folder), "**", "*.jpg")
	    return sorted(glob.glob(pattern, recursive=True))


	if __name__ == "__main__":
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--start", type=int, default=0)
	    parser.add_argument("--end", type=int, default=-1)
	    parser.add_argument("--out", type=str, required=True)
	    parser.add_argument("--folder", type=str, default=None,
	                        help="If set, embed only JPGs under this specific folder (overrides --start/--end)")
	    args = parser.parse_args()

	    if args.folder:
	        # Folder mode: embed exactly the JPGs found under the given folder.
	        specific_paths = collect_folder_jpgs(args.folder)
	        print(f"Folder mode: found {len(specific_paths)} JPGs under {args.folder}")
	        run(0, len(specific_paths), args.out, override_paths=specific_paths)
	    else:
	        # A negative --end (the default) means "through the last frame";
	        # run() expects None for that.
	        run(args.start, None if args.end < 0 else args.end, args.out)