import json, os, numpy as np, gradio as gr, torch from PIL import Image from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import normalize # Load embeddings JSON_PATH = None for p in ["action_embeddings.json", "/app/action_embeddings.json"]: if os.path.exists(p): JSON_PATH = p break print(f"Loading from: {JSON_PATH}") with open(JSON_PATH, "r") as f: saved = json.load(f) ACTION_LABELS = saved["action_names"] EMBEDDINGS_MAT = normalize(np.array(saved["embeddings"])) DS_INDICES = saved.get("ds_indices", list(range(len(ACTION_LABELS)))) print(f"Loaded {len(ACTION_LABELS)} embeddings, dim={EMBEDDINGS_MAT.shape[1]}") # Centroids ACTION_NAMES = sorted(set(ACTION_LABELS)) centroid_dict = {} for aname in ACTION_NAMES: vecs = np.array([EMBEDDINGS_MAT[i] for i, l in enumerate(ACTION_LABELS) if l == aname]) centroid_dict[aname] = normalize(vecs.mean(axis=0).reshape(1,-1))[0] CENTROID_NAMES = list(centroid_dict.keys()) CENTROID_MATRIX = np.array(list(centroid_dict.values())) ACTION_EMOJI = { "calling":"๐","clapping":"๐","cycling":"๐ด","dancing":"๐", "drinking":"๐ฅค","eating":"๐ฝ๏ธ","fighting":"๐ฅ","hugging":"๐ค", "laughing":"๐","listening_to_music":"๐ง","running":"๐", "sitting":"๐ช","sleeping":"๐ด","texting":"๐ฑ","using_laptop":"๐ป", } # load _proc = _model = _ds = None def get_model(): global _proc, _model if _proc is None: from transformers import CLIPProcessor, CLIPModel print("Loading CLIP...") _proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") _model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", low_cpu_mem_usage=True).eval() print("CLIP ready.") return _proc, _model def get_ds(): global _ds if _ds is None: from datasets import load_dataset, concatenate_datasets import random random.seed(42) # Load full dataset df_full = concatenate_datasets([ load_dataset("Bingsu/Human_Action_Recognition", split="train"), load_dataset("Bingsu/Human_Action_Recognition", split="test"), ]) calling_label = [i for i, label in 
enumerate(df_full['labels']) if label == 0] other_labels = [i for i, label in enumerate(df_full['labels']) if label != 0] sampled_calling = random.sample(calling_label, 840) balanced_labels = sampled_calling + other_labels random.shuffle(balanced_labels) _ds = df_full.select(balanced_labels) print(f"Dataset ready: {len(_ds)} images") return _ds def embed(pil_img): proc, model = get_model() clean_image = pil_img.convert("RGB") processed_inputs = proc(images=clean_image, return_tensors="pt") with torch.no_grad(): feats = model.get_image_features(**processed_inputs) user_vector = feats.pooler_output.squeeze().cpu().numpy() return normalize(user_vector.reshape(1, -1))[0] def recommend(query_image): if query_image is None: return "", [], "" vec = embed(query_image) sims = cosine_similarity(vec.reshape(1,-1), EMBEDDINGS_MAT)[0] top3 = sims.argsort()[-3:][::-1] csims = cosine_similarity(vec.reshape(1,-1), CENTROID_MATRIX)[0] pred = CENTROID_NAMES[csims.argmax()] conf = round(float(csims.max()), 4) print(f"Pred: {pred} ({conf:.1%}) | Top3 scores: {[round(float(sims[i]),4) for i in top3]}") print(f"Top3 labels: {[ACTION_LABELS[i] for i in top3]}") emoji = ACTION_EMOJI.get(pred, "") pred_html = f"""
Upload any photo of a person — detect the action + find the 3 most similar images from 5,000 real photos