""" statlens — command-line entry point. Subcommands: statlens serve spin up local LLM backend + FastAPI; open browser statlens download fetch the LoRA adapter into the HF cache (NOT the 64 GB base — that's the user's job) statlens info show GPU / cache state / paths statlens classify one-shot CLI run (TSV + context.txt → result folder) """ from __future__ import annotations import os import signal import sys import webbrowser import click from . import ( __version__, DEFAULT_BASE_MODEL_REF, DEFAULT_LORA_REPO, DEFAULT_LORA_SUBFOLDER, DEFAULT_QUANTIZATION, LOCAL_LLM_PORT, LOCAL_WEB_PORT, ) from . import runtime def _common_serve_opts(f): """Decorator that attaches the model/LoRA/quantization options shared by `serve` and `classify`. The LLM backend is LLaMA-Factory (`llamafactory-cli api`); options here map directly to its YAML config. """ f = click.option("--quantization", default=DEFAULT_QUANTIZATION, type=click.Choice(["bitsandbytes", "awq", "gptq", "none"]), show_default=True, help="LLM weight-quantization mode (passed to LLaMA-Factory)")(f) f = click.option("--lora-path", default=None, type=click.Path(), help="local path to the LoRA folder; " "if omitted, downloaded from HF")(f) f = click.option("--base-model", default=None, type=click.Path(), help="local path to the BF16 base model directory; " "if omitted, auto-detect from common paths " "(~/models/qwen3-32b, /root/autodl-tmp/..., HF cache)")(f) return f @click.group(context_settings={"help_option_names": ["-h", "--help"]}) @click.version_option(__version__, prog_name="statlens") def main(): """statLens — DEA method selector. See `statlens serve --help`.""" # ─────────────────────────────── serve ─────────────────────────────── @main.command() @_common_serve_opts @click.option("--web-port", default=LOCAL_WEB_PORT, show_default=True, type=int) @click.option("--llm-port", default=LOCAL_LLM_PORT, show_default=True, type=int) @click.option("--no-browser", is_flag=True, help="don't auto-open browser") @click.option("--no-gpu-check", is_flag=True, help="skip GPU sanity check") def serve(base_model, lora_path, quantization, web_port, llm_port, no_browser, no_gpu_check): """Start the LLM backend (LLaMA-Factory) + the web app on http://localhost:7860 .""" if not no_gpu_check: runtime.check_gpu() base_path = runtime.resolve_base_model(base_model) lora_path = runtime.ensure_lora_cached(lora_path) llm_proc = runtime.start_server( base_path, lora_path, port=llm_port, quantization=quantization, ) def shutdown(signum=None, frame=None): runtime.stop_server(llm_proc) sys.exit(0) signal.signal(signal.SIGINT, shutdown) signal.signal(signal.SIGTERM, shutdown) try: # show animated progress; abort if subprocess dies runtime.wait_for_server(port=llm_port, proc=llm_proc) except (TimeoutError, RuntimeError) as e: click.secho(f"\n[statlens] {e}", fg="red") runtime.stop_server(llm_proc) sys.exit(2) os.environ["STATLENS_LLM_ENDPOINT"] = f"http://127.0.0.1:{llm_port}/v1" # LLaMA-Factory's API exposes the model under the name "gpt-3.5-turbo". os.environ["STATLENS_LLM_MODEL"] = "gpt-3.5-turbo" web_url = f"http://localhost:{web_port}/" # Prominent ready banner — only what end users care about. click.echo("") click.secho("══════════════════════════════════════════════════════", fg="green") click.secho(f" ✅ statLens ready", fg="green", bold=True) click.secho(f" open in browser: {web_url}", fg="green") click.secho(f" Ctrl+C to stop.", fg="green") click.secho("══════════════════════════════════════════════════════", fg="green") click.echo("") if not no_browser: import threading threading.Timer(1.0, lambda: webbrowser.open(web_url)).start() import uvicorn from . import server as server_mod try: # Quiet uvicorn so it doesn't drown the ready banner; users see HTTP # requests via the LLM log if they want. uvicorn.run(server_mod.app, host="127.0.0.1", port=web_port, log_level="warning") finally: runtime.stop_server(llm_proc) # ─────────────────────────── download ─────────────────────────── @main.command() @click.option("--lora-repo", default=DEFAULT_LORA_REPO, show_default=True) @click.option("--lora-subfolder", default=DEFAULT_LORA_SUBFOLDER, show_default=True) def download(lora_repo, lora_subfolder): """Pre-fetch the LoRA adapter (~1 GB) into the HF cache. statLens does NOT auto-download the 64 GB base model. Get it yourself with: huggingface-cli download Qwen/Qwen3-32B --local-dir ~/models/qwen3-32b """ p = runtime.ensure_lora_cached(None, lora_repo, lora_subfolder) click.secho(f"\n✅ LoRA cached at {p}", fg="green") click.echo( "\nReminder: download the BF16 base model separately, e.g.\n" " huggingface-cli download Qwen/Qwen3-32B --local-dir ~/models/qwen3-32b" ) # ─────────────────────────── info ─────────────────────────── @main.command() def info(): """Show GPU, cache, and default endpoint information.""" click.echo(f"statlens version: {__version__}\n") try: import torch if torch.cuda.is_available(): for i in range(torch.cuda.device_count()): p = torch.cuda.get_device_properties(i) click.echo(f" GPU {i}: {p.name} · {p.total_memory/1024**3:.1f} GB") else: click.echo(" GPU: (none)") except Exception as e: click.echo(f" GPU: (torch not importable: {e})") click.echo("") click.echo(f" base model (reference) : {DEFAULT_BASE_MODEL_REF} (NOT auto-downloaded)") click.echo(f" base model (resolved) : " f"{os.environ.get('STATLENS_BASE_MODEL', '(unset — pass --base-model)')}") click.echo(f" default quantization : {DEFAULT_QUANTIZATION}") click.echo(f" lora repo : {DEFAULT_LORA_REPO} (subfolder {DEFAULT_LORA_SUBFOLDER})") click.echo("") from huggingface_hub.constants import HF_HUB_CACHE click.echo(f" HF cache root : {HF_HUB_CACHE}") d = runtime.cache_dir_for(DEFAULT_LORA_REPO) if d.exists(): sz = sum(p.stat().st_size for p in d.rglob("*") if p.is_file()) click.echo(f" · {DEFAULT_LORA_REPO} ✓ ({sz/1024**3:.2f} GB at {d})") else: click.echo(f" · {DEFAULT_LORA_REPO} ✗ (run `statlens download` to fetch)") # ─────────────────────────── classify ─────────────────────────── @main.command() @_common_serve_opts @click.option("--tsv", required=True, type=click.Path(exists=True, dir_okay=False)) @click.option("--context", required=True, type=click.Path(exists=True, dir_okay=False)) @click.option("--out", required=True, type=click.Path(file_okay=False)) @click.option("--endpoint", default=None, help="OpenAI-compatible endpoint (default: spin up a temporary local LLM backend)") @click.option("--model", default="gpt-3.5-turbo", show_default=True) @click.option("--keep-llm", is_flag=True, help="don't shut down the LLM backend after classifying") def classify(base_model, lora_path, quantization, tsv, context, out, endpoint, model, keep_llm): """One-shot CLI: classify a single TSV+context, run pipeline, exit.""" from pathlib import Path from .statlens_run import run_one if endpoint is None: runtime.check_gpu() base_path = runtime.resolve_base_model(base_model) lora_resolved = runtime.ensure_lora_cached(lora_path) proc = runtime.start_server(base_path, lora_resolved, quantization=quantization) try: runtime.wait_for_server() endpoint = f"http://127.0.0.1:{LOCAL_LLM_PORT}/v1" run_one(Path(tsv), Path(context), Path(out), endpoint, model) finally: if not keep_llm: runtime.stop_server(proc) else: from pathlib import Path run_one(Path(tsv), Path(context), Path(out), endpoint, model) if __name__ == "__main__": main()