""" |
|
|
GGUF Conversion for QMD Query Expansion 1.7B Model |
|
|
|
|
|
Loads base model, applies SFT adapter, then GRPO adapter, merges all, |
|
|
and converts to GGUF format for use with Ollama/llama.cpp/LM Studio. |
|
|
""" |
|
|
|
|
|
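# Typical invocation (the script filename below is just a placeholder):
#   HF_TOKEN=<your token> python convert_to_gguf.py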
import os
import sys
import subprocess

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import HfApi, login
BASE_MODEL = "Qwen/Qwen3-1.7B"
SFT_MODEL = "tobil/qmd-query-expansion-1.7B-sft"
GRPO_MODEL = "tobil/qmd-query-expansion-1.7B-grpo"
OUTPUT_REPO = "tobil/qmd-query-expansion-1.7B-gguf"
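# Pipeline overview: load base model -> merge SFT adapter -> merge GRPO adapter
# -> save merged HF checkpoint -> convert to FP16 GGUF -> quantize -> upload.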
def run_command(cmd, description):
    """Run a command with error handling."""
    print(f" {description}...")
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f" ❌ Command failed: {' '.join(cmd)}")
        if e.stderr:
            print(f" STDERR: {e.stderr[:500]}")
        return False
    except FileNotFoundError:
        print(f" ❌ Command not found: {cmd[0]}")
        return False
print("π QMD Query Expansion 1.7B GGUF Conversion") |
|
|
print("=" * 60) |
|
|
|
|
|
|
|
|
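# build-essential and cmake are required later to compile llama-quantize (Step 7);
# git is needed to clone llama.cpp (Step 5).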
print("\nπ¦ Installing build dependencies...") |
|
|
subprocess.run(["apt-get", "update", "-qq"], capture_output=True) |
|
|
subprocess.run(["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"], capture_output=True) |
|
|
print(" β
Build tools ready") |
|
|
|
|
|
|
|
|
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    print("\n🔐 Logging in to HuggingFace...")
    login(token=hf_token)
    print(" ✅ Logged in")
print(f"\nπ§ Step 1: Loading base model {BASE_MODEL}...") |
|
|
base_model = AutoModelForCausalLM.from_pretrained( |
|
|
BASE_MODEL, |
|
|
torch_dtype=torch.bfloat16, |
|
|
device_map="auto", |
|
|
trust_remote_code=True, |
|
|
) |
|
|
print(" β
Base model loaded") |
|
|
|
|
|
|
|
|
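# Merge order matters: the SFT adapter is merged into the base model first, then
# the GRPO adapter is merged on top of the already-merged SFT weights.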
print(f"\nπ§ Step 2: Loading SFT adapter {SFT_MODEL}...") |
|
|
model = PeftModel.from_pretrained(base_model, SFT_MODEL) |
|
|
print(" Merging SFT adapter...") |
|
|
model = model.merge_and_unload() |
|
|
print(" β
SFT merged") |
|
|
|
|
|
|
|
|
print(f"\nπ§ Step 3: Loading GRPO adapter {GRPO_MODEL}...") |
|
|
model = PeftModel.from_pretrained(model, GRPO_MODEL) |
|
|
print(" Merging GRPO adapter...") |
|
|
merged_model = model.merge_and_unload() |
|
|
print(" β
GRPO merged - final model ready") |
|
|
|
|
|
|
|
|
print("\nπ Loading tokenizer...") |
|
|
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True) |
|
|
print(" β
Tokenizer loaded") |
|
|
|
|
|
|
|
|
print("\nπΎ Step 4: Saving merged model to disk...") |
|
|
merged_dir = "/tmp/merged_model" |
|
|
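# Saving the merged weights (safetensors) together with the tokenizer produces a
# standard Hugging Face model directory, which is what convert_hf_to_gguf.py expects.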
merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)
print(f" ✅ Saved to {merged_dir}")
print("\nπ₯ Step 5: Setting up llama.cpp...") |
|
|
if not os.path.exists("/tmp/llama.cpp"): |
|
|
run_command( |
|
|
["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"], |
|
|
"Cloning llama.cpp" |
|
|
) |
|
|
|
|
|
|
|
|
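# requirements.txt provides the Python dependencies of llama.cpp's conversion scripts;
# sentencepiece and protobuf are added on top since some tokenizers may require them.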
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-r", "/tmp/llama.cpp/requirements.txt"], capture_output=True)
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "sentencepiece", "protobuf"], capture_output=True)
print(" ✅ llama.cpp ready")
print("\nπ Step 6: Converting to GGUF format (FP16)...") |
|
|
gguf_output_dir = "/tmp/gguf_output" |
|
|
os.makedirs(gguf_output_dir, exist_ok=True) |
|
|
|
|
|
model_name = "qmd-query-expansion-1.7B" |
|
|
gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf" |
|
|
|
|
|
convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py" |
|
|
if not run_command( |
|
|
[sys.executable, convert_script, merged_dir, "--outfile", gguf_file, "--outtype", "f16"], |
|
|
"Converting to FP16 GGUF" |
|
|
): |
|
|
print(" β Conversion failed!") |
|
|
sys.exit(1) |
|
|
|
|
|
size_mb = os.path.getsize(gguf_file) / (1024 * 1024) |
|
|
print(f" β
FP16 GGUF created: {size_mb:.1f} MB") |
|
|
|
|
|
|
|
|
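# The FP16 GGUF acts as the master copy: every quantized variant in Step 8 is
# produced from this file rather than from the Hugging Face checkpoint.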
print("\nβοΈ Step 7: Building quantize tool...") |
|
|
os.makedirs("/tmp/llama.cpp/build", exist_ok=True) |
|
|
|
|
|
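# Only the llama-quantize target is built, with CUDA disabled (-DGGML_CUDA=OFF),
# since quantization runs on the CPU; skipping the rest of the project keeps the
# build step short.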
run_command(
    ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"],
    "Configuring with CMake"
)
run_command(
    ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
    "Building llama-quantize"
)

quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
print(" ✅ Quantize tool built")
print("\nβοΈ Step 8: Creating quantized versions...") |
|
|
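# Rough size/quality tradeoff: Q8_0 stays closest to FP16 but is the largest file,
# Q5_K_M sits in the middle, and Q4_K_M is the smallest of the three and the usual
# default for local inference (hence "recommended").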
quant_formats = [
    ("Q4_K_M", "4-bit medium (recommended)"),
    ("Q5_K_M", "5-bit medium"),
    ("Q8_0", "8-bit"),
]

quantized_files = []
for quant_type, description in quant_formats:
    print(f" Creating {quant_type} ({description})...")
    quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"

    if run_command([quantize_bin, gguf_file, quant_file, quant_type], f"Quantizing to {quant_type}"):
        size_mb = os.path.getsize(quant_file) / (1024 * 1024)
        print(f" ✅ {quant_type}: {size_mb:.1f} MB")
        quantized_files.append((quant_file, quant_type))
    else:
        print(f" ⚠️ Skipping {quant_type}")
print("\nβοΈ Step 9: Uploading to Hugging Face Hub...") |
|
|
api = HfApi() |
|
|
|
|
|
print(f" Creating repository: {OUTPUT_REPO}") |
|
|
api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True) |
|
|
|
|
|
|
|
|
print(" Uploading FP16...") |
|
|
api.upload_file( |
|
|
path_or_fileobj=gguf_file, |
|
|
path_in_repo=f"{model_name}-f16.gguf", |
|
|
repo_id=OUTPUT_REPO, |
|
|
) |
|
|
print(" β
FP16 uploaded") |
|
|
|
|
|
|
|
|
for quant_file, quant_type in quantized_files: |
|
|
print(f" Uploading {quant_type}...") |
|
|
api.upload_file( |
|
|
path_or_fileobj=quant_file, |
|
|
path_in_repo=f"{model_name}-{quant_type.lower()}.gguf", |
|
|
repo_id=OUTPUT_REPO, |
|
|
) |
|
|
print(f" β
{quant_type} uploaded") |
|
|
|
|
|
|
|
|
print("\nπ Creating README...") |
|
|
readme_content = f"""--- |
|
|
base_model: {BASE_MODEL} |
|
|
tags: |
|
|
- gguf |
|
|
- llama.cpp |
|
|
- quantized |
|
|
- query-expansion |
|
|
- qmd |
|
|
--- |
|
|
|
|
|
# QMD Query Expansion 1.7B (GGUF) |
|
|
|
|
|
GGUF conversion of the QMD Query Expansion model for use with Ollama, llama.cpp, and LM Studio. |
|
|
|
|
|
## Model Details |
|
|
|
|
|
- **Base Model:** {BASE_MODEL} |
|
|
- **SFT Adapter:** {SFT_MODEL} |
|
|
- **GRPO Adapter:** {GRPO_MODEL} |
|
|
- **Task:** Query expansion for hybrid search (lex/vec/hyde format) |
|
|
|
|
|
## Available Quantizations |
|
|
|
|
|
| File | Quant | Description | |
|
|
|------|-------|-------------| |
|
|
| {model_name}-f16.gguf | F16 | Full precision | |
|
|
| {model_name}-q8_0.gguf | Q8_0 | 8-bit | |
|
|
| {model_name}-q5_k_m.gguf | Q5_K_M | 5-bit medium | |
|
|
| {model_name}-q4_k_m.gguf | Q4_K_M | 4-bit medium (recommended) | |
|
|
|
|
|
## Usage |
|
|
|
|
|
### With Ollama |
|
|
|
|
|
```bash |
|
|
# Download |
|
|
huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf --local-dir . |
|
|
|
|
|
# Create Modelfile |
|
|
echo 'FROM ./{model_name}-q4_k_m.gguf' > Modelfile |
|
|
|
|
|
# Create and run |
|
|
ollama create qmd-expand -f Modelfile |
|
|
ollama run qmd-expand |
|
|
``` |
|
|
|
|
|
### Prompt Format |
|
|
|
|
|
Use Qwen3 chat format with `/no_think`: |
|
|
|
|
|
``` |
|
|
<|im_start|>user |
|
|
/no_think Expand this search query: your query here<|im_end|> |
|
|
<|im_start|>assistant |
|
|
``` |
|
|
|
|
|
### Expected Output |
|
|
|
|
|
``` |
|
|
lex: keyword variation 1 |
|
|
lex: keyword variation 2 |
|
|
vec: natural language reformulation |
|
|
hyde: Hypothetical document passage answering the query. |
|
|
``` |
|
|
|
|
|
## License |
|
|
|
|
|
Apache 2.0 (inherited from Qwen3) |
|
|
""" |
|
|
|
|
|
api.upload_file(
    path_or_fileobj=readme_content.encode(),
    path_in_repo="README.md",
    repo_id=OUTPUT_REPO,
)
print(" ✅ README uploaded")

print("\n" + "=" * 60)
print("✅ GGUF Conversion Complete!")
print(f"📦 Repository: https://huggingface.co/{OUTPUT_REPO}")
print("=" * 60)