"""Gradio demo: detect a human action in a photo and show similar images.

At import time the script loads precomputed, labelled image embeddings from
``action_embeddings.json`` and builds one L2-normalised centroid per action
label.  The CLIP model and the image dataset are loaded lazily on the first
request because both are slow to initialise.
"""
import json
import os

import gradio as gr
import numpy as np
import torch
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# ---------------------------------------------------------------------------
# Load embeddings
# ---------------------------------------------------------------------------
_JSON_CANDIDATES = ("action_embeddings.json", "/app/action_embeddings.json")

# First candidate that exists wins; None when the file is missing everywhere.
JSON_PATH = next((p for p in _JSON_CANDIDATES if os.path.exists(p)), None)
if JSON_PATH is None:
    # Fail with an actionable message instead of the TypeError that
    # ``open(None)`` would raise.
    raise FileNotFoundError(
        "Embeddings file not found; looked in: " + ", ".join(_JSON_CANDIDATES)
    )

print(f"Loading from: {JSON_PATH}")
with open(JSON_PATH, "r", encoding="utf-8") as f:
    saved = json.load(f)

ACTION_LABELS = saved["action_names"]
# Rows are L2-normalised so cosine similarity is comparable across rows.
EMBEDDINGS_MAT = normalize(np.array(saved["embeddings"]))
# Maps embedding row -> index into the dataset returned by get_ds().
# Falls back to the identity mapping when the JSON has no "ds_indices" key.
DS_INDICES = saved.get("ds_indices", list(range(len(ACTION_LABELS))))
print(f"Loaded {len(ACTION_LABELS)} embeddings, dim={EMBEDDINGS_MAT.shape[1]}")

# ---------------------------------------------------------------------------
# Centroids: one normalised mean embedding per distinct action label
# ---------------------------------------------------------------------------
ACTION_NAMES = sorted(set(ACTION_LABELS))
centroid_dict = {}
for aname in ACTION_NAMES:
    rows = [i for i, lbl in enumerate(ACTION_LABELS) if lbl == aname]
    mean_vec = EMBEDDINGS_MAT[rows].mean(axis=0)
    centroid_dict[aname] = normalize(mean_vec.reshape(1, -1))[0]
CENTROID_NAMES = list(centroid_dict.keys())
CENTROID_MATRIX = np.array(list(centroid_dict.values()))

ACTION_EMOJI = {
    "calling": "📞", "clapping": "👏", "cycling": "🚴", "dancing": "💃",
    "drinking": "🥤", "eating": "🍽️", "fighting": "🥊", "hugging": "🤗",
    "laughing": "😂", "listening_to_music": "🎧", "running": "🏃",
    "sitting": "🪑", "sleeping": "😴", "texting": "📱", "using_laptop": "💻",
}

# ---------------------------------------------------------------------------
# Lazy loaders
# ---------------------------------------------------------------------------
_CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
_DATASET_NAME = "Bingsu/Human_Action_Recognition"
_CALLING_SAMPLE_SIZE = 840

_proc = _model = _ds = None


def get_model():
    """Return the cached ``(processor, model)`` pair, loading CLIP on first use.

    Both objects are loaded into locals before either global is assigned, so
    a failure while loading the model does not leave a processor cached next
    to a ``None`` model.
    """
    global _proc, _model
    if _proc is None or _model is None:
        from transformers import CLIPProcessor, CLIPModel

        print("Loading CLIP...")
        proc = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)
        model = CLIPModel.from_pretrained(
            _CLIP_CHECKPOINT, low_cpu_mem_usage=True
        ).eval()
        _proc, _model = proc, model
        print("CLIP ready.")
    return _proc, _model


def get_ds():
    """Return the cached image dataset, building it on first use.

    The train and test splits are concatenated; rows with label ``0`` are
    down-sampled to 840 and the resulting index list is shuffled.  A private
    ``random.Random(42)`` is used so the selection is reproducible without
    reseeding the process-wide ``random`` module.

    Raises:
        ValueError: if fewer than 840 rows carry label ``0``
            (raised by ``Random.sample``).
    """
    global _ds
    if _ds is None:
        from datasets import load_dataset, concatenate_datasets
        import random

        rng = random.Random(42)

        # Load full dataset
        df_full = concatenate_datasets([
            load_dataset(_DATASET_NAME, split="train"),
            load_dataset(_DATASET_NAME, split="test"),
        ])

        # Read the label column once and split the row indices in one pass.
        calling_label = []
        other_labels = []
        for i, label in enumerate(df_full["labels"]):
            (calling_label if label == 0 else other_labels).append(i)

        sampled_calling = rng.sample(calling_label, _CALLING_SAMPLE_SIZE)
        balanced_labels = sampled_calling + other_labels
        rng.shuffle(balanced_labels)
        _ds = df_full.select(balanced_labels)
        print(f"Dataset ready: {len(_ds)} images")
    return _ds


def embed(pil_img):
    """Return the L2-normalised CLIP image embedding of ``pil_img`` as 1-D array.

    Args:
        pil_img: a PIL image; it is converted to RGB before preprocessing.
    """
    proc, model = get_model()
    clean_image = pil_img.convert("RGB")
    processed_inputs = proc(images=clean_image, return_tensors="pt")
    with torch.no_grad():
        feats = model.get_image_features(**processed_inputs)
    # NOTE(review): depending on the installed transformers release this call
    # returns either a plain tensor or an output object exposing
    # ``pooler_output`` -- accept both; confirm against the pinned version.
    feats = getattr(feats, "pooler_output", feats)
    user_vector = feats.squeeze().cpu().numpy()
    return normalize(user_vector.reshape(1, -1))[0]


# ---------------------------------------------------------------------------
# Rendering helpers
# ---------------------------------------------------------------------------
def _render_prediction(pred, conf):
    """Build the 'Detected Action' panel text for label ``pred``."""
    emoji = ACTION_EMOJI.get(pred, "")
    return (
        "\n\nDetected Action\n\n"
        f"{emoji} {pred.replace('_',' ').title()}\n\n"
        f"Confidence: {conf:.1%}\n\n"
    )


def _render_score_row(rank, label, score):
    """Build one line of the similarity-score list."""
    emj = ACTION_EMOJI.get(label, "")
    return f"\n\n#{rank} {emj} {label.replace('_',' ').title()} {score:.4f}\n\n"


def _load_match_image(ds, embedding_idx):
    """Fetch the dataset image behind an embedding row.

    Best effort: any failure (bad index, decode error, ...) is logged and a
    grey 120x120 placeholder is returned so the gallery still renders.
    """
    try:
        real_idx = DS_INDICES[int(embedding_idx)]
        return ds[int(real_idx)]["image"].convert("RGB")
    except Exception as exc:  # deliberately broad: keep the UI responsive
        print(f"Could not load image for embedding {int(embedding_idx)}: {exc!r}")
        return Image.new("RGB", (120, 120), (60, 60, 60))


def recommend(query_image):
    """Classify ``query_image`` and find its three nearest stored embeddings.

    Args:
        query_image: PIL image from the Gradio input, or ``None`` when the
            input has been cleared.

    Returns:
        ``(pred_html, gallery, scores_html)`` where ``gallery`` is a list of
        ``(image, caption)`` tuples.  For ``None`` input the result is
        ``("", [], "")``.
    """
    if query_image is None:
        return "", [], ""

    query = embed(query_image).reshape(1, -1)

    # Nearest individual embeddings, best first.
    sims = cosine_similarity(query, EMBEDDINGS_MAT)[0]
    top3 = sims.argsort()[-3:][::-1]

    # Predicted label = closest class centroid.
    csims = cosine_similarity(query, CENTROID_MATRIX)[0]
    pred = CENTROID_NAMES[csims.argmax()]
    conf = round(float(csims.max()), 4)

    print(f"Pred: {pred} ({conf:.1%}) | Top3 scores: {[round(float(sims[i]),4) for i in top3]}")
    print(f"Top3 labels: {[ACTION_LABELS[i] for i in top3]}")

    pred_html = _render_prediction(pred, conf)

    ds = get_ds()
    gallery = []
    score_rows = []
    for rank, idx in enumerate(top3, 1):
        label = ACTION_LABELS[idx]
        score = round(float(sims[idx]), 4)
        emj = ACTION_EMOJI.get(label, "")
        img = _load_match_image(ds, idx)
        gallery.append((img, f"#{rank} {emj} {label.replace('_',' ')} · {score:.4f}"))
        score_rows.append(_render_score_row(rank, label, score))

    bars = "".join(score_rows)
    scores_html = f"\n\nSimilarity Scores\n\n{bars}\n\n"
    return pred_html, gallery, scores_html


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
_HEADER_HTML = (
    "\n\n🏃\n\nHuman Action Recommender\n\n"
    "Upload any photo of a person → detect the action + find the 3 most "
    "similar images from 5,000 real photos\n\n"
)

_CLASSES_HTML = (
    "\n\n15 Action Classes\n\n"
    "📞 calling 👏 clapping 🚴 cycling 💃 dancing 🥤 drinking 🍽️ eating "
    "🥊 fighting 🤗 hugging 😂 laughing 🎧 listening 🏃 running 🪑 sitting "
    "😴 sleeping 📱 texting 💻 using laptop\n\n"
    "First request loads the AI model\n\n"
)

_PLACEHOLDER_HTML = "\n\n👆\n\nUpload a photo to detect the action\n\n"

_FOOTER_HTML = (
    "\n\nDataset: Bingsu/Human_Action_Recognition · 18,000 images · "
    "15 classes | Model: openai/clip-vit-base-patch32 · CLIP embeddings · "
    "Cosine Similarity\n\n"
)

with gr.Blocks(title="Human Action Recommender") as demo:
    gr.HTML(_HEADER_HTML)

    with gr.Row(equal_height=False):
        with gr.Column(scale=1, min_width=300):
            inp = gr.Image(
                type="pil",
                label="📤 Upload a Photo",
                height=320,
                sources=["upload", "webcam"],
            )
            btn = gr.Button("🔍 Analyse Action", variant="primary", size="lg")
            gr.HTML(_CLASSES_HTML)

        with gr.Column(scale=2, min_width=500):
            pred_out = gr.HTML(_PLACEHOLDER_HTML)
            gallery_out = gr.Gallery(
                label="🖼️ Top-3 Most Similar Images",
                columns=3,
                rows=1,
                height=260,
                object_fit="cover",
                show_label=True,
            )
            scores_out = gr.HTML("")

    # Both the button and a change of the input image trigger the analysis.
    btn.click(fn=recommend, inputs=[inp],
              outputs=[pred_out, gallery_out, scores_out])
    inp.change(fn=recommend, inputs=[inp],
               outputs=[pred_out, gallery_out, scores_out])

    gr.HTML(_FOOTER_HTML)

if __name__ == "__main__":
    demo.launch()