| import json, os, numpy as np, gradio as gr, torch |
| from PIL import Image |
| from sklearn.metrics.pairwise import cosine_similarity |
| from sklearn.preprocessing import normalize |
|
|
| |
# Locations searched, in order, for the precomputed embedding index.
_EMBEDDING_FILE_CANDIDATES = ("action_embeddings.json", "/app/action_embeddings.json")

JSON_PATH = next((p for p in _EMBEDDING_FILE_CANDIDATES if os.path.exists(p)), None)
if JSON_PATH is None:
    # Fail with an actionable message; otherwise open(None) below would raise
    # an opaque TypeError.
    raise FileNotFoundError(
        "Embedding index not found; looked in: " + ", ".join(_EMBEDDING_FILE_CANDIDATES)
    )

print(f"Loading from: {JSON_PATH}")
with open(JSON_PATH, "r", encoding="utf-8") as f:
    saved = json.load(f)

# ACTION_LABELS[i] is the action name attached to row i of EMBEDDINGS_MAT.
ACTION_LABELS = saved["action_names"]
# Rows are rescaled by sklearn's `normalize` (L2 norm per row by default).
EMBEDDINGS_MAT = normalize(np.array(saved["embeddings"]))
# DS_INDICES[i] is used later to look up the dataset image for row i; when the
# file carries no "ds_indices" key the row number itself is used.
DS_INDICES = saved.get("ds_indices", list(range(len(ACTION_LABELS))))
print(f"Loaded {len(ACTION_LABELS)} embeddings, dim={EMBEDDINGS_MAT.shape[1]}")
|
|
| |
# Collect, in a single pass over the labels, which embedding rows belong to
# each action.
rows_by_action = {}
for row_idx, action in enumerate(ACTION_LABELS):
    rows_by_action.setdefault(action, []).append(row_idx)

ACTION_NAMES = sorted(rows_by_action)

# A class centroid is the mean of that class's rows, passed through
# `normalize` again so it can be compared with cosine similarity like any
# other row.
centroid_dict = {}
for action in ACTION_NAMES:
    class_mean = EMBEDDINGS_MAT[rows_by_action[action]].mean(axis=0)
    centroid_dict[action] = normalize(class_mean.reshape(1, -1))[0]

# Parallel name list / matrix: row k of CENTROID_MATRIX is CENTROID_NAMES[k].
CENTROID_NAMES = list(centroid_dict)
CENTROID_MATRIX = np.array([centroid_dict[name] for name in CENTROID_NAMES])
|
|
# Emoji shown next to each action label in the UI.
#
# The literals are written as ASCII-only escapes so the glyphs survive any
# re-encoding of this file: the previous literals had been corrupted into
# mojibake (mostly a lone Greek "pi").
# NOTE(review): for calling, clapping, cycling, dancing, laughing, running and
# sleeping the original bytes were partly lost, so those glyphs are inferred
# from the label -- confirm they match the intended design.
ACTION_EMOJI = {
    "calling": "\U0001F4DE",             # telephone receiver
    "clapping": "\U0001F44F",            # clapping hands
    "cycling": "\U0001F6B4",             # bicyclist
    "dancing": "\U0001F483",             # dancer
    "drinking": "\U0001F964",            # cup with straw
    "eating": "\U0001F37D\uFE0F",        # fork and knife with plate (emoji style)
    "fighting": "\U0001F94A",            # boxing glove
    "hugging": "\U0001F917",             # hugging face
    "laughing": "\U0001F602",            # face with tears of joy
    "listening_to_music": "\U0001F3A7",  # headphone
    "running": "\U0001F3C3",             # runner
    "sitting": "\U0001FA91",             # chair
    "sleeping": "\U0001F634",            # sleeping face
    "texting": "\U0001F4F1",             # mobile phone
    "using_laptop": "\U0001F4BB",        # personal computer
}
|
|
| |
# Lazily populated caches: the CLIP processor, the CLIP model and the image
# dataset are created on first use by get_model() / get_ds().
_proc = _model = _ds = None


def get_model():
    """Return the cached ``(processor, model)`` CLIP pair, loading it on first use.

    The ``openai/clip-vit-base-patch32`` checkpoint is loaded through
    ``transformers`` and the model is switched to eval mode.  Later calls
    return the same two objects.

    Returns:
        tuple: ``(CLIPProcessor, CLIPModel)``.
    """
    global _proc, _model
    # Check both halves: testing only the processor would hand out
    # ``(processor, None)`` forever if an earlier call had failed midway.
    if _proc is None or _model is None:
        # Imported here so that importing this module stays cheap.
        from transformers import CLIPModel, CLIPProcessor

        checkpoint = "openai/clip-vit-base-patch32"
        print("Loading CLIP...")
        processor = CLIPProcessor.from_pretrained(checkpoint)
        model = CLIPModel.from_pretrained(checkpoint, low_cpu_mem_usage=True).eval()
        # Publish the pair only once both loads have succeeded.
        _proc, _model = processor, model
        print("CLIP ready.")
    return _proc, _model
|
|
def get_ds():
    """Return the cached, rebalanced image dataset, building it on first use.

    The train and test splits of ``Bingsu/Human_Action_Recognition`` are
    concatenated.  Rows with label ``0`` are down-sampled to 840, all other
    rows are kept, and the resulting index list is shuffled.  Sampling and
    shuffling are driven by a generator seeded with 42, so the row order is
    reproducible.

    Returns:
        The selected ``datasets`` dataset; the same object on every call.

    Raises:
        ValueError: if fewer than 840 rows carry label ``0``.
    """
    global _ds
    if _ds is None:
        import random

        from datasets import concatenate_datasets, load_dataset

        # A private generator gives the same draws as ``random.seed(42)`` on
        # the module-level functions, without reseeding the process-wide RNG
        # and without being disturbed by other users of that RNG.
        rng = random.Random(42)

        df_full = concatenate_datasets([
            load_dataset("Bingsu/Human_Action_Recognition", split="train"),
            load_dataset("Bingsu/Human_Action_Recognition", split="test"),
        ])
        # Read the label column once and reuse it for both partitions.
        labels = df_full["labels"]
        calling_label = [i for i, label in enumerate(labels) if label == 0]
        other_labels = [i for i, label in enumerate(labels) if label != 0]
        sampled_calling = rng.sample(calling_label, 840)
        balanced_labels = sampled_calling + other_labels
        rng.shuffle(balanced_labels)
        _ds = df_full.select(balanced_labels)
        print(f"Dataset ready: {len(_ds)} images")
    return _ds
|
|
def embed(pil_img):
    """Return the CLIP image embedding of ``pil_img`` as a normalised 1-D array.

    Args:
        pil_img: a PIL image; it is converted to RGB before preprocessing.

    Returns:
        numpy.ndarray: the image feature vector after sklearn ``normalize``
        (L2 by default), so it can be compared with the stored rows by cosine
        similarity.
    """
    proc, model = get_model()
    clean_image = pil_img.convert("RGB")
    processed_inputs = proc(images=clean_image, return_tensors="pt")
    with torch.no_grad():
        feats = model.get_image_features(**processed_inputs)
    # ``get_image_features`` is documented as returning the feature tensor
    # itself, whereas this code was written against an output object exposing
    # ``pooler_output``.  Accept both so the app does not depend on the
    # installed transformers release.
    if not isinstance(feats, torch.Tensor):
        feats = feats.pooler_output
    user_vector = feats.squeeze().cpu().numpy()
    return normalize(user_vector.reshape(1, -1))[0]
|
|
def _prediction_card_html(pred, conf):
    """Render the gradient "Detected Action" card for label ``pred`` and score ``conf``."""
    emoji = ACTION_EMOJI.get(pred, "")
    return f"""
<div style="background:linear-gradient(135deg,#667eea,#764ba2);border-radius:16px;
            padding:24px 28px;color:white;margin-bottom:8px;">
  <div style="font-size:12px;opacity:0.8;text-transform:uppercase;
              letter-spacing:0.08em;margin-bottom:6px;">Detected Action</div>
  <div style="font-size:30px;font-weight:700;margin-bottom:4px;">
    {emoji} {pred.replace('_', ' ').title()}</div>
  <div style="font-size:14px;opacity:0.85;">
    Confidence: <strong>{conf:.1%}</strong></div>
</div>"""


def _score_bar_html(rank, emj, label, score):
    """Render one labelled similarity bar for the neighbour at position ``rank``."""
    # Cosine similarity can be negative; clamp so the CSS width stays in 0-100%.
    pct = max(0, min(int(score * 100), 100))
    return f"""
<div style="margin-bottom:14px;">
  <div style="display:flex;justify-content:space-between;margin-bottom:5px;">
    <span style="font-size:14px;font-weight:500;">
      #{rank} {emj} {label.replace('_', ' ').title()}</span>
    <span style="font-size:13px;color:#7c3aed;font-weight:600;">{score:.4f}</span>
  </div>
  <div style="background:#f3f0ff;border-radius:99px;height:8px;overflow:hidden;">
    <div style="background:linear-gradient(90deg,#667eea,#764ba2);
                width:{pct}%;height:100%;border-radius:99px;"></div>
  </div>
</div>"""


def recommend(query_image):
    """Classify ``query_image`` and fetch its three nearest stored images.

    The image is embedded with ``embed``.  The predicted action is the class
    whose centroid has the highest cosine similarity to that embedding; the
    value shown as "Confidence" is that raw similarity.  The gallery holds
    the three stored embeddings most similar to the query, best first.

    Args:
        query_image: PIL image from the Gradio input, or ``None`` when the
            input has been cleared.

    Returns:
        tuple: ``(pred_html, gallery, scores_html)`` where ``gallery`` is a
        list of ``(PIL image, caption)`` pairs.  For ``None`` input the
        result is ``("", [], "")``.
    """
    if query_image is None:
        return "", [], ""

    query = embed(query_image).reshape(1, -1)
    sims = cosine_similarity(query, EMBEDDINGS_MAT)[0]
    # argsort is ascending: take the last three and reverse for best-first.
    top3 = sims.argsort()[-3:][::-1]
    csims = cosine_similarity(query, CENTROID_MATRIX)[0]
    pred = CENTROID_NAMES[csims.argmax()]
    conf = round(float(csims.max()), 4)

    print(f"Pred: {pred} ({conf:.1%}) | Top3 scores: {[round(float(sims[i]),4) for i in top3]}")
    print(f"Top3 labels: {[ACTION_LABELS[i] for i in top3]}")

    pred_html = _prediction_card_html(pred, conf)

    ds = get_ds()
    gallery = []
    bars = []
    for rank, idx in enumerate(top3, 1):
        label = ACTION_LABELS[idx]
        score = round(float(sims[idx]), 4)
        emj = ACTION_EMOJI.get(label, "")
        # Translate the embedding row number into the dataset row it came from.
        real_idx = DS_INDICES[int(idx)]
        try:
            img = ds[int(real_idx)]["image"].convert("RGB")
        except Exception as exc:
            # Best effort: keep the result usable with a grey placeholder, but
            # say why, and let KeyboardInterrupt / SystemExit propagate.
            print(f"Could not load dataset image {real_idx}: {exc}")
            img = Image.new("RGB", (120, 120), (60, 60, 60))
        gallery.append((img, f"#{rank} {emj} {label.replace('_', ' ')} \u00b7 {score:.4f}"))
        bars.append(_score_bar_html(rank, emj, label, score))

    bars_html = "".join(bars)
    scores_html = f"""
<div style="background:var(--color-background-primary);
            border:0.5px solid var(--color-border-tertiary);
            border-radius:16px;padding:20px 24px;">
  <div style="font-size:12px;font-weight:600;color:#7c3aed;text-transform:uppercase;
              letter-spacing:0.06em;margin-bottom:16px;">Similarity Scores</div>
  {bars_html}
</div>"""

    return pred_html, gallery, scores_html
|
|
|
|
| |
# Non-ASCII glyphs in this section are written as escapes: the previous
# literals had been corrupted into mojibake by an encoding round-trip.

# Legend captions that differ from the plain "underscores to spaces" form.
_CHIP_LABEL_OVERRIDES = {"listening_to_music": "listening"}
_CHIP_STYLE = (
    "background:#f3f0ff;color:#5b21b6;font-size:11px;"
    "padding:3px 9px;border-radius:99px;"
)

# One pill per entry of ACTION_EMOJI, so the legend stays in step with the
# emoji table used when rendering results.
_action_chips_html = "\n".join(
    f'<span style="{_CHIP_STYLE}">{glyph} '
    f'{_CHIP_LABEL_OVERRIDES.get(action, action.replace("_", " "))}</span>'
    for action, glyph in ACTION_EMOJI.items()
)

with gr.Blocks(title="Human Action Recommender") as demo:
    # Page header.
    # NOTE(review): the header glyph was unrecoverable from the corrupted
    # source; a runner is used as a stand-in -- confirm the intended emoji.
    gr.HTML("""
<div style="text-align:center;padding:2rem 0 1.5rem;">
  <div style="font-size:48px;margin-bottom:12px;">\U0001F3C3</div>
  <h1 style="font-size:2rem;font-weight:700;margin:0 0 8px;
             background:linear-gradient(135deg,#667eea,#764ba2);
             -webkit-background-clip:text;-webkit-text-fill-color:transparent;">
    Human Action Recommender</h1>
  <p style="font-size:1rem;color:#6b7280;margin:0;">
    Upload any photo of a person \u2014 detect the action +
    find the 3 most similar images from 5,000 real photos
  </p>
</div>""")

    with gr.Row(equal_height=False):
        # Left column: image input, trigger button and the class legend.
        with gr.Column(scale=1, min_width=300):
            inp = gr.Image(type="pil", label="\U0001F4E4 Upload a Photo",
                           height=320, sources=["upload", "webcam"])
            btn = gr.Button("\U0001F50D Analyse Action", variant="primary", size="lg")
            gr.HTML(f"""
<div style="margin-top:16px;background:var(--color-background-secondary);
            border-radius:14px;padding:16px 20px;">
  <div style="font-size:12px;font-weight:600;color:#7c3aed;
              text-transform:uppercase;letter-spacing:0.08em;margin-bottom:10px;">
    {len(ACTION_EMOJI)} Action Classes</div>
  <div style="display:flex;flex-wrap:wrap;gap:6px;">
{_action_chips_html}
  </div>
  <div style="margin-top:12px;font-size:12px;color:#9ca3af;">
    First request loads the AI model
  </div>
</div>""")

        # Right column: prediction card, neighbour gallery and score bars.
        with gr.Column(scale=2, min_width=500):
            # NOTE(review): the placeholder glyph was unrecoverable from the
            # corrupted source; an up-pointing hand is a stand-in -- confirm.
            pred_out = gr.HTML("""
<div style="background:var(--color-background-secondary);border-radius:16px;
            padding:24px;text-align:center;color:var(--color-text-secondary);">
  <div style="font-size:36px;margin-bottom:8px;">\U0001F446</div>
  <div>Upload a photo to detect the action</div>
</div>""")
            gallery_out = gr.Gallery(
                label="\U0001F5BC\uFE0F Top-3 Most Similar Images",
                columns=3, rows=1, height=260,
                object_fit="cover", show_label=True)
            scores_out = gr.HTML("")

    # Run the recommender both on an explicit click and whenever the input
    # image changes.
    btn.click(fn=recommend, inputs=[inp], outputs=[pred_out, gallery_out, scores_out])
    inp.change(fn=recommend, inputs=[inp], outputs=[pred_out, gallery_out, scores_out])

    # Footer with dataset / model attribution.
    gr.HTML("""
<div style="text-align:center;padding:1.5rem 0 0.5rem;
            border-top:0.5px solid var(--color-border-tertiary);margin-top:1rem;">
  <span style="font-size:12px;color:#9ca3af;">
    Dataset: Bingsu/Human_Action_Recognition \u00b7 18,000 images \u00b7 15 classes |
    Model: openai/clip-vit-base-patch32 \u00b7 CLIP embeddings \u00b7 Cosine Similarity
  </span>
</div>""")


if __name__ == "__main__":
    demo.launch()