|
|
|
|
|
|
|
|
from __future__ import annotations

import hashlib
import json
import shutil
import subprocess
import tempfile
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from pathlib import Path

import gradio as gr

from modular_graph_and_candidates import build_graph_json, generate_html
|
|
|
|
|
# Upstream Transformers repository; used as the pre-filled value of the
# "Repo / fork URL" text box in the UI.
HF_MAIN_REPO = "https://github.com/huggingface/transformers"
|
|
|
|
|
|
|
|
def clone_or_cache(repo_url: str) -> Path:
    """Return a local shallow checkout of *repo_url*, cloning at most once per 24 h.

    The checkout lives under the system temp directory in a folder named after
    a SHA-256 digest of the URL. A hidden ``.cloned_at`` file inside it stores
    the UTC ISO-8601 timestamp of the last clone. If that stamp is less than
    24 h old the existing checkout is reused; otherwise the directory is wiped
    and cloned afresh.

    The function is deliberately not memoised in-process: an ``lru_cache``
    would keep returning the first result and the 24 h check would never be
    re-evaluated in a long-running process.

    Args:
        repo_url: URL of the Git repository to clone.

    Returns:
        Path of the directory containing the checkout.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits non-zero.
        OSError: If ``git`` cannot be started or the stamp cannot be written.
    """
    # ``hash(str)`` is salted per interpreter process, so it cannot name a
    # directory that has to be found again after a restart; use a stable digest.
    digest = hashlib.sha256(repo_url.encode("utf-8")).hexdigest()[:16]
    cache_dir = Path(tempfile.gettempdir()) / f"repo_{digest}"
    stamp = cache_dir / ".cloned_at"

    try:
        last = datetime.fromisoformat(stamp.read_text().strip())
    except (OSError, ValueError):
        # Missing, unreadable or malformed stamp: treat the cache as stale.
        last = None

    if last is not None:
        if last.tzinfo is None:
            # A stamp without a UTC offset is interpreted as UTC.
            last = last.replace(tzinfo=timezone.utc)
        if datetime.now(timezone.utc) - last < timedelta(days=1):
            return cache_dir

    # Stale or absent: start from a clean directory (also removes any
    # half-finished clone left behind by an earlier failure).
    shutil.rmtree(cache_dir, ignore_errors=True)

    # "--" ends option parsing so a URL beginning with "-" can never be
    # interpreted by git as a command-line option.
    subprocess.check_call(
        ["git", "clone", "--depth", "1", "--", repo_url, str(cache_dir)]
    )
    stamp.write_text(datetime.now(timezone.utc).isoformat())
    return cache_dir
|
|
|
|
|
|
|
|
def run(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
    """Build the candidate graph for *repo_url* and return the UI outputs.

    Args:
        repo_url: Repository to analyse; obtained locally via ``clone_or_cache``.
        threshold: Forwarded to ``build_graph_json`` as ``threshold``.
        multimodal: Forwarded to ``build_graph_json`` as ``multimodal``.
        sim_method: Forwarded to ``build_graph_json`` as ``sim_method``.

    Returns:
        A ``(html, json_path)`` tuple: the result of ``generate_html(graph)``
        and the path (as ``str``) of a temporary ``.json`` file holding the
        JSON-serialised graph.
    """
    repo_path = clone_or_cache(repo_url)

    graph = build_graph_json(
        transformers_dir=repo_path,
        threshold=threshold,
        multimodal=multimodal,
        sim_method=sim_method,
    )

    html = generate_html(graph)

    # NamedTemporaryFile creates and opens the file in one step. The deprecated
    # tempfile.mktemp() only returns a name, leaving a window in which another
    # process can create that path first. delete=False keeps the file on disk
    # after the handle is closed so it can still be read via the returned path.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".json", encoding="utf-8", delete=False
    ) as fh:
        json.dump(graph, fh)

    return html, fh.name
|
|
|
|
|
|
|
|
# Page layout: a heading, one row of input controls, then the two outputs.
with gr.Blocks(css="body{background:#fafafa;}") as demo:
    gr.Markdown(value="## 🔍 Modular‑candidate explorer for 🤗 Transformers")

    # All inputs and the trigger button share a single row.
    with gr.Row():
        repo_in = gr.Text(label="Repo / fork URL", value=HF_MAIN_REPO)
        thresh = gr.Slider(
            minimum=0.50,
            maximum=0.95,
            step=0.01,
            value=0.78,
            label="Similarity ≥",
        )
        multi_cb = gr.Checkbox(label="Only multimodal models")
        sim_radio = gr.Radio(
            choices=["jaccard", "embedding"],
            value="jaccard",
            label="Similarity metric",
        )
        go_btn = gr.Button(value="Build graph")

    # Outputs: the rendered graph and the downloadable JSON file.
    html_out = gr.HTML()
    json_out = gr.File(label="Download graph.json")

    # Clicking the button calls run() with the four inputs, in signature order,
    # and routes its two return values to the two output components.
    go_btn.click(
        fn=run,
        inputs=[repo_in, thresh, multi_cb, sim_radio],
        outputs=[html_out, json_out],
    )


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()