File size: 2,680 Bytes
16dc556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""Export the blessed v6 adapter (Modal volume) -> merged bf16 -> GGUF Q8_0 -> HF Hub.

Q8_0 only: Q4_K_M corrupts this model (documented v4 failure). CPU instance (merge +
convert are RAM-bound, no GPU needed). The HF token is passed as a function argument at
spawn time — transient, never baked into the image or logged.

    uv run modal run scripts/modal_export_gguf.py --repo ricalanis/scrubdata-qwen3-4b-v6-q8
"""

import modal

# Container image for the export job, built up one layer at a time.
image = modal.Image.debian_slim(python_version="3.11")
# git is required for the llama.cpp clone in the last layer.
image = image.apt_install("git")
image = image.pip_install(
    "torch",
    "transformers>=4.45",
    "peft",
    "accelerate",
    "sentencepiece",
    "huggingface_hub",
    "gguf",
    "numpy",
    "safetensors",
)
# Shallow clone: only the current tree is needed, and it lands at /llama.cpp.
image = image.run_commands(
    "git clone --depth 1 https://github.com/ggml-org/llama.cpp /llama.cpp"
)

# Modal app whose functions run in the image above by default.
app = modal.App("scrubdata-export-gguf", image=image)
# Named Modal volume looked up by name; functions in this file mount it themselves.
adapter_vol = modal.Volume.from_name("scrubdata-v5-adapter")


@app.function(cpu=8, memory=49152, timeout=3600, volumes={"/vol": adapter_vol})
def export(repo: str, hf_token: str, adapter: str = "/vol/v5_seed21"):
    """Merge the adapter into the base model, convert to Q8_0 GGUF, and upload it.

    Steps: load the base model in bf16, apply and merge the PEFT adapter, save the
    merged model and tokenizer to /tmp/merged, run llama.cpp's
    ``convert_hf_to_gguf.py`` with ``--outtype q8_0``, then upload the resulting
    file to the Hugging Face Hub.

    Args:
        repo: Hub model repo id to upload to; created if it does not exist.
        hf_token: Hugging Face token used for repo creation and the upload.
        adapter: Adapter location handed to ``PeftModel.from_pretrained``; the
            default points under the ``/vol`` volume mount.

    Returns:
        The ``repo`` id that was uploaded to.

    Raises:
        subprocess.CalledProcessError: If the conversion script exits non-zero.

    NOTE(review): the default adapter path is named ``v5_seed21`` while the output
    artifacts are labelled v6 — confirm this is the intended adapter.
    """
    import gc
    import os
    import subprocess
    import sys

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel
    from huggingface_hub import HfApi

    base_id = "unsloth/Qwen3-4B-Instruct-2507"
    merged_dir = "/tmp/merged"
    # Single source of truth for the artifact name, used for both the local
    # output path and the path inside the Hub repo so the two cannot drift.
    gguf_name = "scrubdata-qwen3-4b-v6.Q8_0.gguf"
    out = os.path.join("/tmp", gguf_name)

    print("loading base + adapter (CPU, bf16)...", flush=True)
    base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16)
    model = PeftModel.from_pretrained(base, adapter).merge_and_unload()
    tok = AutoTokenizer.from_pretrained(base_id)
    model.save_pretrained(merged_dir, safe_serialization=True)
    tok.save_pretrained(merged_dir)
    print("merged model saved", flush=True)

    # The converter runs in a child process and reads the files saved above, so
    # the in-memory weights are no longer needed; drop them to lower peak RAM.
    del base, model
    gc.collect()

    # Use the interpreter running this function (the one the image's packages
    # were installed into) rather than whatever "python" resolves to on PATH.
    subprocess.run(
        [sys.executable, "/llama.cpp/convert_hf_to_gguf.py", merged_dir,
         "--outfile", out, "--outtype", "q8_0"],
        check=True,
    )
    print(f"GGUF ready: {os.path.getsize(out)/1e9:.2f} GB", flush=True)

    api = HfApi(token=hf_token)
    api.create_repo(repo, repo_type="model", exist_ok=True)
    api.upload_file(path_or_fileobj=out, path_in_repo=gguf_name,
                    repo_id=repo, repo_type="model",
                    commit_message="v6 (mixA): Q8_0 GGUF — hospital repair 0.475/0.185")
    print(f"uploaded to https://huggingface.co/{repo}", flush=True)
    return repo


@app.local_entrypoint()
def main(repo: str = "ricalanis/scrubdata-qwen3-4b-v6-q8"):
    """Read the local Hugging Face token and launch ``export`` without waiting.

    Args:
        repo: Hub model repo id forwarded to ``export``.

    Raises:
        RuntimeError: If ``huggingface_hub.get_token()`` returns no token.

    NOTE(review): ``export.spawn`` returns immediately and this function then
    exits; presumably the script must be run with ``modal run --detach`` for the
    spawned call to outlive the local process — confirm against Modal's docs.
    """
    from huggingface_hub import get_token

    token = get_token()
    # Explicit check instead of ``assert``: asserts are stripped under
    # ``python -O``, which would let a missing token reach the remote call.
    if not token:
        raise RuntimeError("no local HF token found")
    call = export.spawn(repo=repo, hf_token=token)
    print(f"Launched detached. call_id={call.object_id}")