File size: 4,541 Bytes
1fbdc40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d112ab
1fbdc40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3c2b7f
1fbdc40
7d112ab
 
 
 
f3c2b7f
 
59c9a78
f3c2b7f
59c9a78
f3c2b7f
1fbdc40
 
 
 
 
 
f3c2b7f
1fbdc40
 
 
 
 
 
 
f3c2b7f
1fbdc40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# /// script
# dependencies = ["transformers", "peft", "huggingface_hub", "torch"]
# ///

"""
Convert fine-tuned LoRA model to GGUF format with quantization.
Merges adapter with base model, converts to GGUF, and uploads to Hub.
"""

import os
import subprocess
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Configuration from environment variables or defaults.
#   ADAPTER_MODEL - Hub repo of the trained LoRA adapter
#   BASE_MODEL    - Hub repo of the base model the adapter was trained on
#   OUTPUT_REPO   - Hub repo to receive the GGUF artifacts
#   QUANTIZATION  - llama.cpp quantization type (e.g. Q4_K_M, Q5_K_M, Q8_0)
ADAPTER_MODEL = os.getenv("ADAPTER_MODEL", "nathens/qwen-codeforces-sft")
BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen2.5-0.5B")
OUTPUT_REPO = os.getenv("OUTPUT_REPO", "nathens/my-model-gguf")
QUANTIZATION = os.getenv("QUANTIZATION", "Q4_K_M")

print("🔧 Converting model to GGUF")  # plain string: no placeholders, so no f-prefix
print(f"  Base model: {BASE_MODEL}")
print(f"  Adapter: {ADAPTER_MODEL}")
print(f"  Output: {OUTPUT_REPO}")
print(f"  Quantization: {QUANTIZATION}")

# Step 1: Load base model and tokenizer.
# fp16 + device_map="auto" keeps memory low; trust_remote_code is required
# for model repos that ship custom modeling code.
print("\n📦 Loading base model and tokenizer...")
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

# Step 2: Attach the LoRA adapter, then fold its weights into the base
# so we end up with a single plain checkpoint (no PEFT wrapper).
print(f"🔀 Loading and merging LoRA adapter from {ADAPTER_MODEL}...")
peft_wrapped = PeftModel.from_pretrained(base, ADAPTER_MODEL)
print("⚙️  Merging adapter weights into base model...")
merged = peft_wrapped.merge_and_unload()

# Step 3: Persist the merged checkpoint + tokenizer; the GGUF converter
# below reads from this directory.
print("💾 Saving merged model...")
merged_dir = "./merged_model"
merged.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)
print(f"✅ Merged model saved to {merged_dir}")

# Step 4: Fetch and build llama.cpp, which provides the HF->GGUF converter
# and the llama-quantize binary.
print("\n📥 Installing llama.cpp for GGUF conversion...")
subprocess.run(["apt-get", "update", "-qq"], check=True)
subprocess.run(["apt-get", "install", "-y", "-qq", "git", "build-essential", "cmake"], check=True)
# Shallow clone: the full history is not needed just to build the tools.
subprocess.run(
    ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git"],
    check=True,
)

# Parallel build jobs: os.cpu_count() avoids shelling out to `nproc` and
# works even where coreutils is absent; fall back to 1 if undetectable.
nproc = str(os.cpu_count() or 1)
print(f"Building llama.cpp with {nproc} cores using CMake...")

# Use CMake to build (CPU only - CUDA not needed for conversion/quantization)
os.makedirs("llama.cpp/build", exist_ok=True)
subprocess.run(["cmake", "-B", "llama.cpp/build", "-S", "llama.cpp"], check=True)
subprocess.run(["cmake", "--build", "llama.cpp/build", "--config", "Release", "-j", nproc], check=True)

# Step 5: Convert the merged HF checkpoint to a full-precision GGUF file.
print("\n🔄 Converting to GGUF format...")
f16_gguf = "./model-f16.gguf"
quantized_gguf = f"./model-{QUANTIZATION}.gguf"
convert_cmd = [
    "python3",
    "llama.cpp/convert_hf_to_gguf.py",
    merged_dir,
    "--outfile",
    f16_gguf,
    "--outtype",
    "f16",
]
subprocess.run(convert_cmd, check=True)
print("✅ Converted to FP16 GGUF")

# Step 6: Quantize the fp16 GGUF down to the requested type.
print(f"\n⚡ Quantizing to {QUANTIZATION}...")
quantize_cmd = [
    "./llama.cpp/build/bin/llama-quantize",
    f16_gguf,
    quantized_gguf,
    QUANTIZATION,
]
subprocess.run(quantize_cmd, check=True)
print(f"✅ Quantized to {QUANTIZATION}")

# Step 7: Upload artifacts to the Hugging Face Hub.
print(f"\n📤 Uploading to {OUTPUT_REPO}...")
from huggingface_hub import HfApi
api = HfApi()

# Create repo if it doesn't exist. Best-effort: a permission/auth problem
# here will surface again on upload_file, so just report and continue.
try:
    api.create_repo(OUTPUT_REPO, repo_type="model", exist_ok=True)
except Exception as e:
    print(f"Note: {e}")

# Upload the quantized GGUF file
api.upload_file(
    path_or_fileobj=f"./model-{QUANTIZATION}.gguf",
    path_in_repo=f"model-{QUANTIZATION}.gguf",
    repo_id=OUTPUT_REPO,
    repo_type="model"
)

# Also upload the original FP16 version
api.upload_file(
    path_or_fileobj="./model-f16.gguf",
    path_in_repo="model-f16.gguf",
    repo_id=OUTPUT_REPO,
    repo_type="model"
)

# Upload tokenizer files. Not every tokenizer emits all of these (e.g. only
# BPE tokenizers produce vocab.json/merges.txt), so check existence
# explicitly rather than swallowing arbitrary upload errors — a blanket
# `except: pass` would also hide real auth/network failures.
tokenizer_files = [
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.json",
    "merges.txt",
    "special_tokens_map.json",
]
for name in tokenizer_files:
    local_path = os.path.join(merged_dir, name)
    if os.path.isfile(local_path):
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=name,
            repo_id=OUTPUT_REPO,
            repo_type="model"
        )

print("\n✅ Conversion complete!")
print(f"📁 GGUF model available at: https://huggingface.co/{OUTPUT_REPO}")
print("\n💡 To use with Ollama:")
print(f"   1. Download: huggingface-cli download {OUTPUT_REPO} model-{QUANTIZATION}.gguf")
print("   2. Create Modelfile with the downloaded GGUF")
print("   3. Run: ollama create my-model -f Modelfile")
print("   4. Use: ollama run my-model")