active-orgs / app.py
Violette's picture
Violette HF Staff
Update app.py
a260d90 verified
import os
import requests
import pandas as pd
import gradio as gr
# Root of the Hugging Face Hub REST API; endpoint paths are appended to it.
BASE = "https://huggingface.co/api"

# Optional access token taken from the environment. An unset variable and a
# whitespace-only value both collapse to the empty string.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()

# Headers sent with every request. The Authorization entry is only present
# when a non-empty token was found above.
HEADERS = {
    "User-Agent": "hf-most-active-orgs-multi-seed/1.0",
    **({"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}),
}
def get_json(url: str):
    """GET *url* with the shared ``HEADERS`` and decode the body as JSON.

    Failures are reported through the return value instead of being raised:
    any ``Exception`` raised while requesting or decoding is caught here.

    Args:
        url: Absolute URL to request.

    Returns:
        A ``(data, error)`` pair. On an HTTP 200 whose body decodes, ``data``
        is the parsed JSON and ``error`` is ``None``. Otherwise ``data`` is
        ``None`` and ``error`` is either ``"<status>: <first 180 chars of the
        body>"`` or ``"<ExceptionType>: <exception text>"``.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
        if response.status_code == 200:
            return response.json(), None
        failure = f"{response.status_code}: {response.text[:180]}"
    except Exception as exc:
        # Deliberate catch-all: callers branch on the error string and never
        # expect this helper to raise.
        failure = f"{type(exc).__name__}: {exc}"
    return None, failure
def normalize_plan(p):
    """Return a lowercase display label for a raw plan value.

    Args:
        p: Raw plan value. Any falsy value (``None``, ``""``, ``0``) is
            treated as "no plan"; other values are converted with ``str``.

    Returns:
        ``""`` for a falsy or whitespace-only input, ``"enterprise plus"``
        when the cleaned value is ``"plus"``, and the cleaned (stripped,
        lowercased) value otherwise.
    """
    if not p:
        return ""
    # Strip as well as lowercase so values such as " Plus " still hit the mapping below.
    label = str(p).strip().lower()
    # Only "plus" is rewritten; every other label passes through unchanged.
    return "enterprise plus" if label == "plus" else label
def fetch_top_models(sort_key: str, limit: int):
    """Query ``{BASE}/models`` for *limit* entries ordered by *sort_key*.

    The request always carries ``direction=-1`` (presumably descending order
    on the Hub API -- confirm against the API docs).

    Args:
        sort_key: Field to sort on, e.g. ``downloads``, ``likes`` or
            ``lastModified``. It is placed in the query string as-is.
        limit: Value for the ``limit`` query parameter.

    Returns:
        The ``(data, error)`` pair produced by ``get_json`` for that URL.
    """
    query = "&".join((f"limit={limit}", f"sort={sort_key}", "direction=-1"))
    return get_json(f"{BASE}/models?{query}")
def extract_owner_from_model_id(m):
    """Return the part of a model's identifier that precedes the first ``/``.

    Args:
        m: Mapping describing a model. The identifier is read from
            ``"modelId"``, falling back to ``"id"`` when that is missing or
            falsy.

    Returns:
        The text before the first slash, or ``None`` when the identifier is
        not a string or contains no slash.
    """
    identifier = m.get("modelId") or m.get("id")
    if not isinstance(identifier, str):
        return None
    owner, slash, _ = identifier.partition("/")
    # An empty separator means there was no "/" at all, hence no owner prefix.
    return owner if slash else None
def build_table(
    top_downloads=500,
    top_likes=500,
    top_recent=500,
    org_scan_limit=3000,
    rows_to_show=400,
):
    """Discover model owners from three model listings and rank them.

    Owners are collected from the models sorted by ``downloads``, ``likes``
    and ``lastModified``. Each owner's ``/organizations/{org}/overview`` is
    then fetched and scored as
    ``numModels * 3 + numDatasets * 3 + numSpaces * 2 + numFollowers``.

    Args:
        top_downloads: Number of models requested for the downloads seed.
        top_likes: Number of models requested for the likes seed.
        top_recent: Number of models requested for the lastModified seed.
        org_scan_limit: Maximum number of discovered owners to look up.
        rows_to_show: Number of ranked rows rendered in the HTML table.
            All five values are passed through ``int()`` before use
            (presumably because UI sliders may deliver floats).

    Returns:
        A ``(html, debug_markdown, csv_path)`` tuple. ``csv_path`` points to
        a CSV holding every ranked row, or is ``None`` when a seed request
        failed or no owner could be ranked; in those cases ``html`` is a
        ``<pre>`` block with a short message.
    """
    import html  # local import: used to escape remote text embedded in HTML

    dbg = [
        f"top_downloads={top_downloads}",
        f"top_likes={top_likes}",
        f"top_recent={top_recent}",
        f"org_scan_limit={org_scan_limit}",
        f"rows_to_show={rows_to_show}",
        f"HF_TOKEN_present={bool(HF_TOKEN)}",
    ]

    def debug_markdown():
        # Render the debug lines collected so far under a Markdown heading.
        return "\n".join(["### Debug"] + dbg)

    # 1) + 2) Fetch each seed and record which seed(s) every owner came from.
    # Tuples are (sort key sent to the API, requested size, flag column).
    seeds = (
        ("downloads", top_downloads, "from_downloads"),
        ("likes", top_likes, "from_likes"),
        ("lastModified", top_recent, "from_recent"),
    )
    org_sources = {}  # owner -> {"from_downloads": 0/1, "from_likes": 0/1, "from_recent": 0/1}
    for sort_key, limit, flag in seeds:
        models, err = fetch_top_models(sort_key, int(limit))
        if err or not isinstance(models, list):
            detail = err or f"unexpected payload type {type(models).__name__}"
            # The error may contain part of a remote response body, so it is
            # escaped before being placed inside markup.
            message = html.escape(str(detail))
            return (
                f"<pre>/api/models {sort_key} failed: {message}</pre>",
                debug_markdown(),
                None,
            )
        for model in models:
            if not isinstance(model, dict):
                continue
            owner = extract_owner_from_model_id(model)
            if not owner:
                continue
            flags = org_sources.setdefault(
                owner, {"from_downloads": 0, "from_likes": 0, "from_recent": 0}
            )
            flags[flag] = 1

    # Owners keep first-seen order; only the first `org_scan_limit` are looked up.
    orgs = list(org_sources)[: int(org_scan_limit)]
    dbg.append(f"orgs_discovered_union={len(org_sources)}")
    dbg.append(f"orgs_scanned={len(orgs)}")
    dbg.append(
        "orgs_by_seed_counts="
        f"downloads={sum(v['from_downloads'] for v in org_sources.values())}, "
        f"likes={sum(v['from_likes'] for v in org_sources.values())}, "
        f"recent={sum(v['from_recent'] for v in org_sources.values())}"
    )

    # 3) Fetch organization overviews and score them.
    rows = []
    failures = 0
    for org in orgs:
        data, e = get_json(f"{BASE}/organizations/{org}/overview")
        if e or not isinstance(data, dict):
            # NOTE(review): owners that are user accounts rather than
            # organizations presumably end up here -- confirm.
            failures += 1
            continue

        num_models = int(data.get("numModels") or 0)
        num_datasets = int(data.get("numDatasets") or 0)
        num_spaces = int(data.get("numSpaces") or 0)
        num_followers = int(data.get("numFollowers") or 0)
        num_users = int(data.get("numUsers") or 0)

        # "Activity footprint" score (same logic as the author's other Space).
        activity_score = (
            num_models * 3 + num_datasets * 3 + num_spaces * 2 + num_followers
        )

        # Bonus "multi-seed" score: how many of the three seeds listed this owner.
        src = org_sources[org]
        from_downloads = int(src["from_downloads"])
        from_likes = int(src["from_likes"])
        from_recent = int(src["from_recent"])

        rows.append({
            "org": org,
            "plan": normalize_plan(data.get("plan")),
            "seed_score(0-3)": from_downloads + from_likes + from_recent,
            "from_downloads": from_downloads,
            "from_likes": from_likes,
            "from_recent": from_recent,
            "num_models": num_models,
            "num_datasets": num_datasets,
            "num_spaces": num_spaces,
            "num_users": num_users,
            "num_followers": num_followers,
            "activity_score": activity_score,
        })

    dbg.append(f"orgs_ranked={len(rows)}")
    dbg.append(f"org_overview_failures={failures}")

    if not rows:
        # Nothing could be ranked: re-query the first owner so the debug
        # panel shows what the overview endpoint actually answered.
        probe_org = orgs[0] if orgs else "N/A"
        if orgs:
            probe, probe_err = get_json(f"{BASE}/organizations/{probe_org}/overview")
        else:
            probe, probe_err = None, "no orgs"
        dbg.append(f"probe_org={probe_org}")
        dbg.append(f"probe_err={probe_err}")
        dbg.append(f"probe_type={type(probe).__name__ if probe is not None else 'None'}")
        dbg.append(f"probe_keys={(list(probe.keys())[:20] if isinstance(probe, dict) else '')}")
        return "<pre>No orgs ranked (see debug)</pre>", debug_markdown(), None

    df = pd.DataFrame(rows).sort_values(
        by=["activity_score", "seed_score(0-3)"],
        ascending=[False, False],
    )

    # Only the first N rows are rendered in the UI ...
    df_show = df.head(int(rows_to_show))

    # ... while the CSV offered for download holds the full ranking.
    # NOTE(review): the path is fixed, so overlapping runs write the same file.
    csv_path = "/tmp/most_active_orgs_multi_seed.csv"
    df.to_csv(csv_path, index=False)

    # Wrap the table in a scrollable container.
    html_table = f"""
<div style="max-height: 1100px; overflow:auto; border:1px solid #333; border-radius:10px; padding:8px;">
{df_show.to_html(index=False)}
</div>
"""
    return html_table, debug_markdown(), csv_path
# UI definition: three seed-size sliders, two limit sliders, a refresh button
# and three outputs (HTML table, debug Markdown, CSV download).
with gr.Blocks() as demo:
    gr.Markdown(value="# 🏆 Most Active Orgs (Global) — multi-seed discovery")
    gr.Markdown(
        value=(
            "Découverte des orgs via 3 seeds de modèles: **downloads + likes + récents (lastModified)**, "
            "puis ranking via `/api/organizations/{org}/overview`."
        )
    )

    # Seed sizes: how many models to request for each sort order.
    with gr.Row():
        top_downloads = gr.Slider(
            minimum=50,
            maximum=1000,
            value=500,
            step=50,
            label="Top models by downloads (N)",
        )
        top_likes = gr.Slider(
            minimum=50,
            maximum=1000,
            value=500,
            step=50,
            label="Top models by likes (N)",
        )
        top_recent = gr.Slider(
            minimum=50,
            maximum=1000,
            value=500,
            step=50,
            label="Top models by recency (lastModified) (N)",
        )

    # Caps on how many owners are looked up and how many rows are rendered.
    with gr.Row():
        org_scan_limit = gr.Slider(
            minimum=100,
            maximum=8000,
            value=3000,
            step=100,
            label="Max orgs to evaluate (union cap)",
        )
        rows_to_show = gr.Slider(
            minimum=50,
            maximum=2000,
            value=400,
            step=50,
            label="Rows to display (visible)",
        )

    btn = gr.Button(value="Refresh Ranking")

    # Outputs, in the same order as the tuple returned by build_table.
    out = gr.HTML()
    dbg = gr.Markdown()
    csv_file = gr.File(label="Download full CSV")

    btn.click(
        fn=build_table,
        inputs=[top_downloads, top_likes, top_recent, org_scan_limit, rows_to_show],
        outputs=[out, dbg, csv_file],
    )

# Start the app as soon as the module is executed.
demo.launch()