ethanker
/

symbol-fim-model

PyTorch

symbol_fim_transformer

Model card Files Files and versions

xet

Community

ethanker committed on Nov 7, 2025

Commit

fad9f17

verified ·

1 Parent(s): 65998db

Upload push_to_hf.py with huggingface_hub

Browse files

Files changed (1) hide show

push_to_hf.py +267 -0

push_to_hf.py ADDED Viewed

	@@ -0,0 +1,267 @@

+#!/usr/bin/env python
+import argparse
+import json
+import os
+from pathlib import Path
+from typing import Generator
+import torch
+from datasets import Dataset, load_dataset
+from huggingface_hub import HfApi, Repository, create_repo, upload_file
+from omegaconf import OmegaConf
+from rich.console import Console
+from rich.panel import Panel
+from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn
+from models import SymbolFIMModel
+# Module-level rich console; every status message in this script is printed through it.
+console = Console()
+def load_jsonl(path: str) -> Generator[dict, None, None]:
+    with open(path, "r", encoding="utf-8") as handle:
+        for line in handle:
+            if not line.strip():
+                continue
+            yield json.loads(line)
+def push_model(
+    model_path: str,
+    config_path: str,
+    repo_id: str,
+    token: str,
+    private: bool = False,
+) -> None:
+    console.print(Panel(f"[bold cyan]Pushing Model to {repo_id}[/bold cyan]", border_style="cyan"))
+    cfg = OmegaConf.load(config_path)
+    model = SymbolFIMModel(
+        vocab_size=260,
+        d_model=cfg.model.d_model,
+        n_layers=cfg.model.n_layers,
+        n_heads=cfg.model.n_heads,
+        window=cfg.model.window,
+        max_len=cfg.max_len,
+        ast_head_cfg=None,
+    )
+    if not os.path.exists(model_path):
+        raise FileNotFoundError(f"Model not found at {model_path}")
+    state_dict = torch.load(model_path, map_location="cpu", weights_only=False)
+    model.load_state_dict(state_dict)
+    model.eval()
+    try:
+        repo_url = create_repo(repo_id=repo_id, token=token, private=private, exist_ok=True)
+    except Exception as e:
+        if "403" in str(e) or "Forbidden" in str(e):
+            console.print(f"[bold yellow]⚠️  Warning:[/bold yellow] Cannot create repo. Make sure:")
+            console.print("  1. Your HF token has 'write' permissions (not just 'read')")
+            console.print("  2. The repo exists at https://huggingface.co/{repo_id}")
+            console.print("  3. You have access to the namespace")
+            console.print(f"\n[yellow]Trying to upload to existing repo...[/yellow]")
+        else:
+            raise
+    api = HfApi(token=token)
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+        console=console
+    ) as progress:
+        task = progress.add_task("[cyan]Uploading model files...", total=100)
+        model_dir = Path("/tmp/hf_model_upload")
+        model_dir.mkdir(exist_ok=True)
+        torch.save(model.state_dict(), model_dir / "pytorch_model.bin")
+        progress.update(task, advance=30)
+        model_config = {
+            "vocab_size": model.vocab_size,
+            "d_model": cfg.model.d_model,
+            "n_layers": cfg.model.n_layers,
+            "n_heads": cfg.model.n_heads,
+            "window": cfg.model.window,
+            "max_len": cfg.max_len,
+            "model_type": "symbol_fim_transformer",
+        }
+        with open(model_dir / "config.json", "w") as f:
+            json.dump(model_config, f, indent=2)
+        progress.update(task, advance=20)
+        api.upload_folder(
+            folder_path=str(model_dir),
+            repo_id=repo_id,
+            repo_type="model",
+            token=token,
+        )
+        progress.update(task, advance=50)
+    console.print(f"[green]✓[/green] Model pushed to: [cyan]https://huggingface.co/{repo_id}[/cyan]")
+def push_dataset(
+    dataset_path: str,
+    repo_id: str,
+    token: str,
+    private: bool = False,
+    max_samples: int = None,
+) -> None:
+    console.print(Panel(f"[bold cyan]Pushing Dataset to {repo_id}[/bold cyan]", border_style="cyan"))
+    records = []
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+        console=console
+    ) as progress:
+        task = progress.add_task("[cyan]Loading dataset...", total=None)
+        for idx, record in enumerate(load_jsonl(dataset_path)):
+            records.append(record)
+            if max_samples and idx + 1 >= max_samples:
+                break
+            if (idx + 1) % 1000 == 0:
+                progress.update(task, description=f"[cyan]Loaded {idx + 1:,} samples...")
+        progress.update(task, completed=True)
+        console.print(f"[green]✓[/green] Loaded {len(records):,} samples")
+    dataset = Dataset.from_list(records)
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+        console=console
+    ) as progress:
+        task = progress.add_task("[cyan]Pushing dataset to Hub...", total=100)
+        dataset.push_to_hub(
+            repo_id=repo_id,
+            token=token,
+            private=private,
+        )
+        progress.update(task, completed=True)
+    console.print(f"[green]✓[/green] Dataset pushed to: [cyan]https://huggingface.co/datasets/{repo_id}[/cyan]")
+def push_code(
+    code_dir: str,
+    repo_id: str,
+    token: str,
+    private: bool = False,
+) -> None:
+    console.print(Panel(f"[bold cyan]Pushing Code to {repo_id}[/bold cyan]", border_style="cyan"))
+    create_repo(repo_id=repo_id, token=token, private=private, exist_ok=True)
+    api = HfApi(token=token)
+    code_path = Path(code_dir)
+    files_to_upload = []
+    for ext in ["*.py", "*.yaml", "*.yml", "*.txt", "*.md"]:
+        files_to_upload.extend(code_path.rglob(ext))
+    files_to_upload = [f for f in files_to_upload if "__pycache__" not in str(f)]
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+        console=console
+    ) as progress:
+        task = progress.add_task(f"[cyan]Uploading {len(files_to_upload)} files...", total=len(files_to_upload))
+        for file_path in files_to_upload:
+            rel_path = file_path.relative_to(code_path)
+            api.upload_file(
+                path_or_fileobj=str(file_path),
+                path_in_repo=str(rel_path),
+                repo_id=repo_id,
+                repo_type="model",
+                token=token,
+            )
+            progress.update(task, advance=1)
+    console.print(f"[green]✓[/green] Code pushed to: [cyan]https://huggingface.co/{repo_id}[/cyan]")
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Push experiment to Hugging Face Hub")
+    parser.add_argument("--model-path", type=str, default="/workspace/runs/model.pt")
+    parser.add_argument("--config-path", type=str, required=True)
+    parser.add_argument("--dataset-path", type=str, required=True)
+    parser.add_argument("--code-dir", type=str, default="/workspace/experiments")
+    parser.add_argument("--model-repo", type=str, required=True, help="HF repo ID for model (e.g., username/model-name)")
+    parser.add_argument("--dataset-repo", type=str, required=True, help="HF repo ID for dataset (e.g., username/dataset-name)")
+    parser.add_argument("--code-repo", type=str, help="HF repo ID for code (optional, defaults to model-repo)")
+    parser.add_argument("--token", type=str, help="HF token (or set HF_TOKEN env var)")
+    parser.add_argument("--private", action="store_true", help="Make repos private")
+    parser.add_argument("--max-dataset-samples", type=int, help="Limit dataset samples (for testing)")
+    parser.add_argument("--push-model", action="store_true", default=True)
+    parser.add_argument("--push-dataset", action="store_true", default=True)
+    parser.add_argument("--push-code", action="store_true", default=True)
+    args = parser.parse_args()
+    token = args.token or os.getenv("HF_TOKEN")
+    if not token:
+        console.print("[bold red]Error:[/bold red] HF token required. Set --token or HF_TOKEN env var")
+        return
+    code_repo = args.code_repo or args.model_repo
+    try:
+        if args.push_model:
+            push_model(
+                model_path=args.model_path,
+                config_path=args.config_path,
+                repo_id=args.model_repo,
+                token=token,
+                private=args.private,
+            )
+            console.print()
+        if args.push_dataset:
+            push_dataset(
+                dataset_path=args.dataset_path,
+                repo_id=args.dataset_repo,
+                token=token,
+                private=args.private,
+                max_samples=args.max_dataset_samples,
+            )
+            console.print()
+        if args.push_code:
+            push_code(
+                code_dir=args.code_dir,
+                repo_id=code_repo,
+                token=token,
+                private=args.private,
+            )
+            console.print()
+        console.print(Panel("[bold green]✓ All components pushed successfully![/bold green]", border_style="green"))
+    except Exception as e:
+        console.print(f"[bold red]Error:[/bold red] {e}")
+        raise
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()