#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "transformers>=4.36.0",
#     "peft>=0.7.0",
#     "torch>=2.0.0",
#     "accelerate>=0.24.0",
#     "huggingface_hub>=0.20.0",
#     "sentencepiece>=0.1.99",
#     "protobuf>=3.20.0",
#     "numpy",
#     "gguf",
# ]
# ///
"""
GGUF Conversion for QMD Query Expansion 1.7B Model

Loads base model, applies SFT adapter, then GRPO adapter, merges all,
and converts to GGUF format for use with Ollama/llama.cpp/LM Studio.
"""

import os
import subprocess
import sys

import torch
from huggingface_hub import HfApi, login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Configuration
BASE_MODEL = "Qwen/Qwen3-1.7B"
SFT_MODEL = "tobil/qmd-query-expansion-1.7B-sft"
GRPO_MODEL = "tobil/qmd-query-expansion-1.7B-grpo"
OUTPUT_REPO = "tobil/qmd-query-expansion-1.7B-gguf"


def run_command(cmd, description):
    """Run a command with error handling."""
    print(f" {description}...")
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f" āŒ Command failed: {' '.join(cmd)}")
        if e.stderr:
            print(f" STDERR: {e.stderr[:500]}")
        return False
    except FileNotFoundError:
        print(f" āŒ Command not found: {cmd[0]}")
        return False


print("šŸ”„ QMD Query Expansion 1.7B GGUF Conversion")
print("=" * 60)

# Install build tools
print("\nšŸ“¦ Installing build dependencies...")
subprocess.run(["apt-get", "update", "-qq"], capture_output=True)
subprocess.run(
    ["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"],
    capture_output=True,
)
print(" āœ… Build tools ready")

# Login to HuggingFace
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    print("\nšŸ” Logging in to HuggingFace...")
    login(token=hf_token)
    print(" āœ… Logged in")

# Step 1: Load base model
print(f"\nšŸ”§ Step 1: Loading base model {BASE_MODEL}...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
print(" āœ… Base model loaded")

# Step 2: Load and merge SFT adapter
print(f"\nšŸ”§ Step 2: Loading SFT adapter {SFT_MODEL}...")
model = PeftModel.from_pretrained(base_model, SFT_MODEL)
print(" Merging SFT adapter...")
model = model.merge_and_unload()
print(" āœ… SFT merged")

# Step 3: Load and merge GRPO adapter
print(f"\nšŸ”§ Step 3: Loading GRPO adapter {GRPO_MODEL}...")
model = PeftModel.from_pretrained(model, GRPO_MODEL)
print(" Merging GRPO adapter...")
merged_model = model.merge_and_unload()
print(" āœ… GRPO merged - final model ready")

# Load tokenizer
print("\nšŸ“ Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
print(" āœ… Tokenizer loaded")

# Step 4: Save merged model
print("\nšŸ’¾ Step 4: Saving merged model to disk...")
merged_dir = "/tmp/merged_model"
merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)
print(f" āœ… Saved to {merged_dir}")

# Step 5: Setup llama.cpp
print("\nšŸ“„ Step 5: Setting up llama.cpp...")
if not os.path.exists("/tmp/llama.cpp"):
    run_command(
        ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", "/tmp/llama.cpp"],
        "Cloning llama.cpp"
    )

# Install Python deps
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", "-r", "/tmp/llama.cpp/requirements.txt"],
    capture_output=True,
)
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", "sentencepiece", "protobuf"],
    capture_output=True,
)
print(" āœ… llama.cpp ready")
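# Optional guard (added sketch, not part of the original flow): current
# llama.cpp checkouts ship the converter as convert_hf_to_gguf.py at the
# repo root, which Step 6 relies on. Fail fast here if the clone or a
# layout change left it missing.
if not os.path.exists("/tmp/llama.cpp/convert_hf_to_gguf.py"):
    print(" āŒ convert_hf_to_gguf.py not found - llama.cpp clone may have failed")
    sys.exit(1)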
# Step 6: Convert to GGUF (FP16)
print("\nšŸ”„ Step 6: Converting to GGUF format (FP16)...")
gguf_output_dir = "/tmp/gguf_output"
os.makedirs(gguf_output_dir, exist_ok=True)

model_name = "qmd-query-expansion-1.7B"
gguf_file = f"{gguf_output_dir}/{model_name}-f16.gguf"
convert_script = "/tmp/llama.cpp/convert_hf_to_gguf.py"

if not run_command(
    [sys.executable, convert_script, merged_dir, "--outfile", gguf_file, "--outtype", "f16"],
    "Converting to FP16 GGUF"
):
    print(" āŒ Conversion failed!")
    sys.exit(1)

size_mb = os.path.getsize(gguf_file) / (1024 * 1024)
print(f" āœ… FP16 GGUF created: {size_mb:.1f} MB")

# Step 7: Build quantize tool
print("\nāš™ļø Step 7: Building quantize tool...")
os.makedirs("/tmp/llama.cpp/build", exist_ok=True)
run_command(
    ["cmake", "-B", "/tmp/llama.cpp/build", "-S", "/tmp/llama.cpp", "-DGGML_CUDA=OFF"],
    "Configuring with CMake"
)
run_command(
    ["cmake", "--build", "/tmp/llama.cpp/build", "--target", "llama-quantize", "-j", "4"],
    "Building llama-quantize"
)
quantize_bin = "/tmp/llama.cpp/build/bin/llama-quantize"
print(" āœ… Quantize tool built")

# Step 8: Create quantized versions
print("\nāš™ļø Step 8: Creating quantized versions...")
quant_formats = [
    ("Q4_K_M", "4-bit medium (recommended)"),
    ("Q5_K_M", "5-bit medium"),
    ("Q8_0", "8-bit"),
]

quantized_files = []
for quant_type, description in quant_formats:
    print(f" Creating {quant_type} ({description})...")
    quant_file = f"{gguf_output_dir}/{model_name}-{quant_type.lower()}.gguf"
    if run_command([quantize_bin, gguf_file, quant_file, quant_type], f"Quantizing to {quant_type}"):
        size_mb = os.path.getsize(quant_file) / (1024 * 1024)
        print(f" āœ… {quant_type}: {size_mb:.1f} MB")
        quantized_files.append((quant_file, quant_type))
    else:
        print(f" āš ļø Skipping {quant_type}")

# Step 9: Upload to Hub
print("\nā˜ļø Step 9: Uploading to Hugging Face Hub...")
api = HfApi()

print(f" Creating repository: {OUTPUT_REPO}")
api.create_repo(repo_id=OUTPUT_REPO, repo_type="model", exist_ok=True)

# Upload F16
print(" Uploading FP16...")
api.upload_file(
    path_or_fileobj=gguf_file,
    path_in_repo=f"{model_name}-f16.gguf",
    repo_id=OUTPUT_REPO,
)
print(" āœ… FP16 uploaded")

# Upload quantized versions
for quant_file, quant_type in quantized_files:
    print(f" Uploading {quant_type}...")
    api.upload_file(
        path_or_fileobj=quant_file,
        path_in_repo=f"{model_name}-{quant_type.lower()}.gguf",
        repo_id=OUTPUT_REPO,
    )
    print(f" āœ… {quant_type} uploaded")

# Create README
print("\nšŸ“ Creating README...")
readme_content = f"""---
base_model: {BASE_MODEL}
tags:
- gguf
- llama.cpp
- quantized
- query-expansion
- qmd
---

# QMD Query Expansion 1.7B (GGUF)

GGUF conversion of the QMD Query Expansion model for use with Ollama, llama.cpp, and LM Studio.

## Model Details

- **Base Model:** {BASE_MODEL}
- **SFT Adapter:** {SFT_MODEL}
- **GRPO Adapter:** {GRPO_MODEL}
- **Task:** Query expansion for hybrid search (lex/vec/hyde format)

## Available Quantizations

| File | Quant | Description |
|------|-------|-------------|
| {model_name}-f16.gguf | F16 | Full precision |
| {model_name}-q8_0.gguf | Q8_0 | 8-bit |
| {model_name}-q5_k_m.gguf | Q5_K_M | 5-bit medium |
| {model_name}-q4_k_m.gguf | Q4_K_M | 4-bit medium (recommended) |

## Usage

### With Ollama

```bash
# Download
huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf --local-dir .
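
# (Optional) confirm the download landed before wiring it into Ollama
ls -lh {model_name}-q4_k_m.gguf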
# Create Modelfile
echo 'FROM ./{model_name}-q4_k_m.gguf' > Modelfile

# Create and run
ollama create qmd-expand -f Modelfile
ollama run qmd-expand
```

### Prompt Format

Use Qwen3 chat format with `/no_think`:

```
<|im_start|>user
/no_think Expand this search query: your query here<|im_end|>
<|im_start|>assistant
```

### Expected Output

```
lex: keyword variation 1
lex: keyword variation 2
vec: natural language reformulation
hyde: Hypothetical document passage answering the query.
```

## License

Apache 2.0 (inherited from Qwen3)
"""

api.upload_file(
    path_or_fileobj=readme_content.encode(),
    path_in_repo="README.md",
    repo_id=OUTPUT_REPO,
)
print(" āœ… README uploaded")

print("\n" + "=" * 60)
print("āœ… GGUF Conversion Complete!")
print(f"šŸ“¦ Repository: https://huggingface.co/{OUTPUT_REPO}")
print("=" * 60)
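
# Post-run hint (added sketch): echo the README's Ollama steps so they can
# be copy-pasted for a local smoke test. Nothing here is executed.
print("\nNext steps for a local smoke test:")
print(f" huggingface-cli download {OUTPUT_REPO} {model_name}-q4_k_m.gguf --local-dir .")
print(f" echo 'FROM ./{model_name}-q4_k_m.gguf' > Modelfile")
print(" ollama create qmd-expand -f Modelfile && ollama run qmd-expand")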