romi2001's picture
Update app.py
e1998f2 verified
import json, os, numpy as np, gradio as gr, torch
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
# Load the precomputed embeddings: one label and one vector per reference image.
# The first candidate path that exists wins.
_EMBEDDING_PATHS = ["action_embeddings.json", "/app/action_embeddings.json"]
JSON_PATH = next((p for p in _EMBEDDING_PATHS if os.path.exists(p)), None)
if JSON_PATH is None:
    # Fail with an actionable message instead of the opaque TypeError that
    # open(None) would raise.
    raise FileNotFoundError(
        "Embeddings file not found; looked in: " + ", ".join(_EMBEDDING_PATHS)
    )
print(f"Loading from: {JSON_PATH}")
with open(JSON_PATH, "r", encoding="utf-8") as f:
    saved = json.load(f)

# Label of each stored embedding (row i of EMBEDDINGS_MAT belongs to ACTION_LABELS[i]).
ACTION_LABELS = saved["action_names"]
# Rows are L2-normalised by sklearn's normalize().
EMBEDDINGS_MAT = normalize(np.array(saved["embeddings"]))
# Maps an embedding row to a dataset row; defaults to the identity mapping
# when the file carries no "ds_indices" key.
DS_INDICES = saved.get("ds_indices", list(range(len(ACTION_LABELS))))
if len(ACTION_LABELS) != EMBEDDINGS_MAT.shape[0]:
    # A mismatch would otherwise surface later as an IndexError on lookup.
    raise ValueError(
        f"{JSON_PATH}: {len(ACTION_LABELS)} labels but "
        f"{EMBEDDINGS_MAT.shape[0]} embedding rows"
    )
print(f"Loaded {len(ACTION_LABELS)} embeddings, dim={EMBEDDINGS_MAT.shape[1]}")
# Centroids: for every distinct action label, the re-normalised mean of the
# embedding rows carrying that label.
ACTION_NAMES = sorted(set(ACTION_LABELS))

# Collect the row numbers of each label in a single pass over the labels.
_rows_by_action = {}
for _row, _label in enumerate(ACTION_LABELS):
    _rows_by_action.setdefault(_label, []).append(_row)

centroid_dict = {}
for aname in ACTION_NAMES:
    vecs = EMBEDDINGS_MAT[_rows_by_action[aname]]
    mean_vec = vecs.mean(axis=0)
    # normalize() works on 2-D input, hence the reshape and the [0] afterwards.
    centroid_dict[aname] = normalize(mean_vec.reshape(1, -1))[0]

# Parallel name list / matrix so that row k of CENTROID_MATRIX is CENTROID_NAMES[k].
CENTROID_NAMES = list(centroid_dict)
CENTROID_MATRIX = np.array([centroid_dict[name] for name in CENTROID_NAMES])
# Emoji shown next to each action label in the UI.
# Written as \U escapes so the values survive a re-save in the wrong encoding
# (the previous literals had been corrupted into mojibake).
ACTION_EMOJI = {
    "calling": "\U0001F4DE",             # telephone receiver
    "clapping": "\U0001F44F",            # clapping hands
    "cycling": "\U0001F6B4",             # bicyclist
    "dancing": "\U0001F483",             # dancer
    "drinking": "\U0001F964",            # cup with straw
    "eating": "\U0001F37D\uFE0F",        # fork and knife with plate (emoji style)
    "fighting": "\U0001F94A",            # boxing glove
    "hugging": "\U0001F917",             # hugging face
    "laughing": "\U0001F602",            # face with tears of joy
    "listening_to_music": "\U0001F3A7",  # headphone
    "running": "\U0001F3C3",             # runner
    "sitting": "\U0001FA91",             # chair
    "sleeping": "\U0001F634",            # sleeping face
    "texting": "\U0001F4F1",             # mobile phone
    "using_laptop": "\U0001F4BB",        # personal computer
}
# Lazily populated singletons: CLIP processor, CLIP model and the image dataset.
_proc = _model = _ds = None


def get_model():
    """Return the cached ``(processor, model)`` pair, loading CLIP on first use.

    Returns:
        tuple: ``(CLIPProcessor, CLIPModel)`` for ``openai/clip-vit-base-patch32``;
        the model has been put in eval mode.
    """
    global _proc, _model
    if _proc is None or _model is None:
        # Deferred import so that importing this module stays cheap.
        from transformers import CLIPProcessor, CLIPModel

        print("Loading CLIP...")
        proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        model = CLIPModel.from_pretrained(
            "openai/clip-vit-base-patch32", low_cpu_mem_usage=True
        ).eval()
        # Publish both globals together: if the model load raised after the
        # processor had been cached, later calls would skip loading and
        # return a None model.
        _proc, _model = proc, model
        print("CLIP ready.")
    return _proc, _model
def get_ds():
    """Return the cached image dataset, building it on the first call.

    The train and test splits of ``Bingsu/Human_Action_Recognition`` are
    concatenated, rows with label 0 are down-sampled to 840, every other row
    is kept, and the resulting index list is shuffled before selection.

    NOTE(review): DS_INDICES presumably refers to positions in this shuffled
    selection, so the seed and the order of the random calls must stay as
    they are - confirm against the script that produced the embeddings file.
    """
    global _ds
    if _ds is not None:
        return _ds

    from datasets import load_dataset, concatenate_datasets
    import random

    random.seed(42)

    # Load full dataset
    parts = [
        load_dataset("Bingsu/Human_Action_Recognition", split=split_name)
        for split_name in ("train", "test")
    ]
    combined = concatenate_datasets(parts)

    # Partition row numbers into "label 0" and "everything else".
    zero_rows, remaining_rows = [], []
    for row, label in enumerate(combined['labels']):
        if label == 0:
            zero_rows.append(row)
        else:
            remaining_rows.append(row)

    keep = random.sample(zero_rows, 840) + remaining_rows
    random.shuffle(keep)

    _ds = combined.select(keep)
    print(f"Dataset ready: {len(_ds)} images")
    return _ds
def embed(pil_img):
    """Return the L2-normalised CLIP image embedding of ``pil_img``.

    Args:
        pil_img: a PIL image; it is converted to RGB before preprocessing.

    Returns:
        numpy.ndarray: 1-D unit-length feature vector.
    """
    proc, model = get_model()
    rgb_image = pil_img.convert("RGB")
    inputs = proc(images=rgb_image, return_tensors="pt")
    with torch.no_grad():
        feats = model.get_image_features(**inputs)
    # NOTE(review): some transformers releases return the feature tensor
    # directly, others an output object exposing it as ``pooler_output`` -
    # accept both rather than fail with AttributeError on a plain tensor.
    feats = getattr(feats, "pooler_output", feats)
    user_vector = feats.squeeze().cpu().numpy()
    # normalize() expects 2-D input; take the single row back out afterwards.
    return normalize(user_vector.reshape(1, -1))[0]
def _prediction_card_html(pred, conf):
    """Render the "Detected Action" card for label ``pred`` with score ``conf``."""
    emoji = ACTION_EMOJI.get(pred, "")
    return f"""
    <div style="background:linear-gradient(135deg,#667eea,#764ba2);border-radius:16px;
                padding:24px 28px;color:white;margin-bottom:8px;">
      <div style="font-size:12px;opacity:0.8;text-transform:uppercase;
                  letter-spacing:0.08em;margin-bottom:6px;">Detected Action</div>
      <div style="font-size:30px;font-weight:700;margin-bottom:4px;">
        {emoji} {pred.replace('_',' ').title()}</div>
      <div style="font-size:14px;opacity:0.85;">
        Confidence: <strong>{conf:.1%}</strong></div>
    </div>"""


def _score_bar_html(rank, emj, label, score):
    """Render one labelled similarity bar for the match at position ``rank``."""
    # Clamp to 0..100: a negative cosine similarity would otherwise produce
    # an invalid negative CSS width.
    pct = max(0, min(int(score * 100), 100))
    return f"""
    <div style="margin-bottom:14px;">
      <div style="display:flex;justify-content:space-between;margin-bottom:5px;">
        <span style="font-size:14px;font-weight:500;">
          #{rank} {emj} {label.replace('_',' ').title()}</span>
        <span style="font-size:13px;color:#7c3aed;font-weight:600;">{score:.4f}</span>
      </div>
      <div style="background:#f3f0ff;border-radius:99px;height:8px;overflow:hidden;">
        <div style="background:linear-gradient(90deg,#667eea,#764ba2);
                    width:{pct}%;height:100%;border-radius:99px;"></div>
      </div>
    </div>"""


def recommend(query_image):
    """Classify ``query_image`` and fetch its three nearest reference images.

    The predicted action is the label whose centroid has the highest cosine
    similarity to the query embedding; the gallery holds the three stored
    embeddings most similar to the query, best first.

    Args:
        query_image: PIL image from the UI, or ``None`` when the input is empty.

    Returns:
        tuple: ``(prediction_html, gallery_items, scores_html)`` where
        ``gallery_items`` is a list of ``(PIL.Image, caption)`` pairs.
        Returns ``("", [], "")`` when ``query_image`` is ``None``.
    """
    if query_image is None:
        return "", [], ""

    query = embed(query_image).reshape(1, -1)
    sims = cosine_similarity(query, EMBEDDINGS_MAT)[0]
    top3 = sims.argsort()[-3:][::-1]  # indices of the three highest scores, descending
    csims = cosine_similarity(query, CENTROID_MATRIX)[0]
    pred = CENTROID_NAMES[csims.argmax()]
    conf = round(float(csims.max()), 4)
    print(f"Pred: {pred} ({conf:.1%}) | Top3 scores: {[round(float(sims[i]),4) for i in top3]}")
    print(f"Top3 labels: {[ACTION_LABELS[i] for i in top3]}")

    pred_html = _prediction_card_html(pred, conf)

    ds = get_ds()
    gallery = []
    bars = []
    for rank, idx in enumerate(top3, 1):
        label = ACTION_LABELS[idx]
        score = round(float(sims[idx]), 4)
        emj = ACTION_EMOJI.get(label, "")
        try:
            real_idx = DS_INDICES[int(idx)]
            img = ds[int(real_idx)]["image"].convert("RGB")
        except Exception as exc:
            # Best effort: keep the result usable with a grey placeholder, but
            # do not swallow KeyboardInterrupt/SystemExit and do log the cause.
            print(f"Could not load dataset image for embedding {int(idx)}: {exc!r}")
            img = Image.new("RGB", (120, 120), (60, 60, 60))
        gallery.append((img, f"#{rank} {emj} {label.replace('_',' ')} \u00b7 {score:.4f}"))
        bars.append(_score_bar_html(rank, emj, label, score))

    bars_html = "".join(bars)
    scores_html = f"""
    <div style="background:var(--color-background-primary);
                border:0.5px solid var(--color-border-tertiary);
                border-radius:16px;padding:20px 24px;">
      <div style="font-size:12px;font-weight:600;color:#7c3aed;text-transform:uppercase;
                  letter-spacing:0.06em;margin-bottom:16px;">Similarity Scores</div>
      {bars_html}
    </div>"""
    return pred_html, gallery, scores_html
# UI
# Non-ASCII characters in the user-facing strings below are written as
# \u / \U escapes so they survive a re-save in the wrong encoding (the
# previous literals had been corrupted into mojibake).

# (emoji, display text) for each chip in the "Action Classes" panel.
_CLASS_CHIPS = [
    ("\U0001F4DE", "calling"),
    ("\U0001F44F", "clapping"),
    ("\U0001F6B4", "cycling"),
    ("\U0001F483", "dancing"),
    ("\U0001F964", "drinking"),
    ("\U0001F37D\uFE0F", "eating"),
    ("\U0001F94A", "fighting"),
    ("\U0001F917", "hugging"),
    ("\U0001F602", "laughing"),
    ("\U0001F3A7", "listening"),
    ("\U0001F3C3", "running"),
    ("\U0001FA91", "sitting"),
    ("\U0001F634", "sleeping"),
    ("\U0001F4F1", "texting"),
    ("\U0001F4BB", "using laptop"),
]
_CHIP_STYLE = (
    "background:#f3f0ff;color:#5b21b6;font-size:11px;"
    "padding:3px 9px;border-radius:99px;"
)
_CHIPS_HTML = "\n".join(
    f'<span style="{_CHIP_STYLE}">{icon} {text}</span>'
    for icon, text in _CLASS_CHIPS
)

with gr.Blocks(title="Human Action Recommender") as demo:
    # Page header.
    gr.HTML("""
    <div style="text-align:center;padding:2rem 0 1.5rem;">
      <div style="font-size:48px;margin-bottom:12px;">\U0001F3C3</div>
      <h1 style="font-size:2rem;font-weight:700;margin:0 0 8px;
                 background:linear-gradient(135deg,#667eea,#764ba2);
                 -webkit-background-clip:text;-webkit-text-fill-color:transparent;">
        Human Action Recommender</h1>
      <p style="font-size:1rem;color:#6b7280;margin:0;">
        Upload any photo of a person \u2192 detect the action +
        find the 3 most similar images from 5,000 real photos
      </p>
    </div>""")

    with gr.Row(equal_height=False):
        # Left column: image input, trigger button and the class legend.
        with gr.Column(scale=1, min_width=300):
            inp = gr.Image(type="pil", label="\U0001F4E4 Upload a Photo",
                           height=320, sources=["upload", "webcam"])
            btn = gr.Button("\U0001F50D Analyse Action", variant="primary", size="lg")
            gr.HTML(f"""
            <div style="margin-top:16px;background:var(--color-background-secondary);
                        border-radius:14px;padding:16px 20px;">
              <div style="font-size:12px;font-weight:600;color:#7c3aed;
                          text-transform:uppercase;letter-spacing:0.08em;margin-bottom:10px;">
                {len(_CLASS_CHIPS)} Action Classes</div>
              <div style="display:flex;flex-wrap:wrap;gap:6px;">
            {_CHIPS_HTML}
              </div>
              <div style="margin-top:12px;font-size:12px;color:#9ca3af;">
                First request loads the AI model
              </div>
            </div>""")

        # Right column: prediction card, gallery of matches and score bars.
        with gr.Column(scale=2, min_width=500):
            pred_out = gr.HTML("""
            <div style="background:var(--color-background-secondary);border-radius:16px;
                        padding:24px;text-align:center;color:var(--color-text-secondary);">
              <div style="font-size:36px;margin-bottom:8px;">\U0001F446</div>
              <div>Upload a photo to detect the action</div>
            </div>""")
            gallery_out = gr.Gallery(
                label="\U0001F5BC\uFE0F Top-3 Most Similar Images",
                columns=3, rows=1, height=260,
                object_fit="cover", show_label=True)
            scores_out = gr.HTML("")

    # recommend() runs both on an explicit button click and whenever the
    # image input changes.
    btn.click(fn=recommend, inputs=[inp], outputs=[pred_out, gallery_out, scores_out])
    inp.change(fn=recommend, inputs=[inp], outputs=[pred_out, gallery_out, scores_out])

    # Page footer.
    gr.HTML("""
    <div style="text-align:center;padding:1.5rem 0 0.5rem;
                border-top:0.5px solid var(--color-border-tertiary);margin-top:1rem;">
      <span style="font-size:12px;color:#9ca3af;">
        Dataset: Bingsu/Human_Action_Recognition \u00b7 18,000 images \u00b7 15 classes &nbsp;|&nbsp;
        Model: openai/clip-vit-base-patch32 \u00b7 CLIP embeddings \u00b7 Cosine Similarity
      </span>
    </div>""")

if __name__ == "__main__":
    demo.launch()