Spaces:
Running
Running
| """CLI-1M Dataset Explorer — carosh/cli-1m | |
| Random-row viewer with bucket / shell / language filters. | |
| Deploy to HuggingFace Spaces (CPU Free tier). | |
| """ | |
| import random | |
| import gradio as gr | |
| from datasets import load_dataset | |
# Dataset revision handed to ``load_dataset`` by ``_load``.
_REVISION = "v1.0-rc1"

# Module-level cache for the dataset; stays ``None`` until the first query
# triggers a load.
_DS = None

# Dropdown choices. The leading "(any)" entry means "do not filter on this
# field".
SHELL_OPTS = [
    "(any)",
    "bash",
    "zsh",
    "fish",
    "powershell",
    "nu",
    "oils-osh",
]

LANG_OPTS = [
    "(any)",
    "en",
    "zh",
    "de",
    "es",
    "fr",
    "ja",
    "it",
    "pt",
    "ru",
    "ar",
    "hi",
    "ko",
    "he",
]

# Bucket names are hard-coded so that startup does not need a full dataset
# scan to discover them.
BUCKET_OPTS = [
    "(any)",
    "devops",
    "cloud",
    "database",
    "security",
    "pkg_mgmt",
    "finance_web3",
    "bio_science",
    "data_ml",
    "network",
    "media",
    "editor_term",
    "editor_writer",
    "lang_tool",
    "mobile_embed",
    "modern_unix",
    "systems",
    "web_api",
    "misc",
]
def _load():
    """Return the cached dataset, loading it on the first call.

    The ``train`` split of the ``sample`` config of ``carosh/cli-1m`` is
    loaded at revision ``_REVISION`` and stored in the module-level ``_DS``
    so later calls reuse the same object.
    """
    global _DS
    if _DS is not None:
        return _DS

    _DS = load_dataset(
        "carosh/cli-1m",
        name="sample",
        revision=_REVISION,
        split="train",
    )
    return _DS
def _bucket_names(value):
    """Normalize a row's ``bucket`` field to a list of bucket names.

    A list is returned unchanged, a bare string is treated as a single
    bucket, and anything else (e.g. ``None``) yields an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return [value]
    return []


def _format_row(position, row):
    """Render one row as a Markdown fragment.

    The fragment holds a metadata header, the first user message, and the
    first assistant message inside a code fence tagged with the row's shell.
    """
    messages = row.get("messages") or []
    user_msg = next(
        (m["content"] for m in messages if m.get("role") == "user"), ""
    )
    assistant_msg = next(
        (m["content"] for m in messages if m.get("role") == "assistant"), ""
    )
    bucket = ", ".join(_bucket_names(row.get("bucket")))
    return (
        f"---\n**Row {position}** · `shell={row.get('shell')}` · "
        f"`lang={row.get('language')}` · `bucket={bucket}`\n\n"
        f"**User:** {user_msg}\n\n"
        f"```{row.get('shell', 'bash')}\n{assistant_msg}\n```\n"
    )


def sample_rows(shell_filter, lang_filter, bucket_filter, n_rows, seed):
    """Filter the dataset and render a random sample of rows as Markdown.

    Args:
        shell_filter: Shell to match exactly, or "(any)" / None / "" for no
            shell filter.
        lang_filter: Language code to match exactly, or "(any)" / None / ""
            for no language filter.
        bucket_filter: Bucket name that must appear in the row's buckets, or
            "(any)" / None / "" for no bucket filter.
        n_rows: Number of rows to show; clamped to the range
            ``0 .. number of matching rows``.
        seed: Seed for the sampling RNG. ``None`` or a blank value gives an
            unseeded (non-reproducible) sample.

    Returns:
        A ``(markdown, match_count)`` tuple of strings. If the dataset cannot
        be loaded, ``markdown`` carries the error text and ``match_count`` is
        empty.
    """
    try:
        ds = _load()
    except Exception as e:  # UI boundary: show the failure instead of crashing
        return f"Error loading dataset: {e}", ""

    def is_active(choice):
        # A cleared dropdown (None / "") means the same as "(any)".
        return choice not in (None, "", "(any)")

    want_shell = is_active(shell_filter)
    want_lang = is_active(lang_filter)
    want_bucket = is_active(bucket_filter)

    def matches(row):
        if want_shell and row["shell"] != shell_filter:
            return False
        if want_lang and row["language"] != lang_filter:
            return False
        if want_bucket and bucket_filter not in _bucket_names(row["bucket"]):
            return False
        return True

    # One pass over the data covers every active criterion; skip the pass
    # entirely when nothing is being filtered.
    if want_shell or want_lang or want_bucket:
        filtered = ds.filter(matches)
    else:
        filtered = ds

    total = len(filtered)
    if total == 0:
        return "No rows match the selected filters.", "0"

    # A blank seed field arrives as None; int(None) would raise, so treat
    # None / blank as "no seed".
    if seed is None or not str(seed).strip():
        rng = random.Random()
    else:
        rng = random.Random(int(seed))

    # Clamp so a negative or oversized request cannot make rng.sample raise.
    n = max(0, min(int(n_rows), total))
    rows = filtered.select(rng.sample(range(total), n))

    parts = [f"**{total:,} rows match** — showing {n}\n"]
    parts.extend(
        _format_row(position, row) for position, row in enumerate(rows, start=1)
    )
    return "\n".join(parts), f"{total:,}"
# Markdown shown at the top of the page: title, dataset link, and usage hint.
_INTRO_MD = (
    "# CLI-1M Dataset Explorer\n"
    "Browsing [`carosh/cli-1m`](https://huggingface.co/datasets/carosh/cli-1m) "
    f"— `sample` config (50k stratified rows), revision `{_REVISION}`\n\n"
    "Filter by shell, language, or industry bucket, then click **Sample rows**."
)

# Markdown shown at the bottom of the page: related links and license.
_FOOTER_MD = (
    "---\n"
    "**Links:** "
    "[Dataset card](https://huggingface.co/datasets/carosh/cli-1m) · "
    "[Eval split (gated)](https://huggingface.co/datasets/carosh/cli-1m-eval) · "
    "[Source repo](https://github.com/wildcard/caro-eval) · "
    "Apache-2.0"
)

# Page layout. Components render in creation order, so the statement order
# below is the visual order of the page.
with gr.Blocks(title="CLI-1M Explorer", theme=gr.themes.Soft()) as demo:
    gr.Markdown(value=_INTRO_MD)

    # First row: the three filter dropdowns.
    with gr.Row():
        shell_dd = gr.Dropdown(choices=SHELL_OPTS, value="(any)", label="Shell")
        lang_dd = gr.Dropdown(choices=LANG_OPTS, value="(any)", label="Language")
        bucket_dd = gr.Dropdown(
            choices=BUCKET_OPTS, value="(any)", label="Industry bucket"
        )

    # Second row: sample size and RNG seed.
    with gr.Row():
        n_rows = gr.Slider(
            minimum=1, maximum=20, value=5, step=1, label="Rows to show"
        )
        seed = gr.Number(value=42, label="Random seed (blank = random)")

    sample_btn = gr.Button(value="Sample rows", variant="primary")
    match_count = gr.Textbox(label="Matching rows", interactive=False)
    output = gr.Markdown()

    # The button feeds the five controls to sample_rows, whose two return
    # values fill the Markdown output and the match counter.
    sample_btn.click(
        sample_rows,
        [shell_dd, lang_dd, bucket_dd, n_rows, seed],
        [output, match_count],
    )

    gr.Markdown(value=_FOOTER_MD)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()