Text Generation
Transformers
English
qwen2
code-generation
python
fine-tuning
Qwen
tools
agent-framework
multi-agent
conversational
Eval Results (legacy)
Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use my-ai-stack/Stack-2-9-finetuned with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use my-ai-stack/Stack-2-9-finetuned with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "my-ai-stack/Stack-2-9-finetuned"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
- SGLang
How to use my-ai-stack/Stack-2-9-finetuned with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "my-ai-stack/Stack-2-9-finetuned" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "my-ai-stack/Stack-2-9-finetuned" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
| #!/usr/bin/env python3 | |
| """ | |
| Simple standalone training script for Stack 2.9. | |
| Loads quantized models in bfloat16 (optional 4-bit or 8-bit quantization via bitsandbytes) and unquantized models in float32. | |
| """ | |
| import argparse | |
| import os | |
| import sys | |
| from pathlib import Path | |
| import numpy as np # Ensure numpy is available (Kaggle pip installs can break it) | |
| import yaml | |
| from datasets import load_dataset | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| TrainingArguments, | |
| Trainer, | |
| DataCollatorForLanguageModeling, | |
| ) | |
| from peft import LoraConfig, get_peft_model, TaskType | |
| import torch | |
def load_config(config_path: str) -> dict:
    """Read a YAML configuration file and return its top-level mapping.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        The parsed configuration as a dict.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the document is empty or its top level is not a
            mapping (``yaml.safe_load`` returns ``None`` or a non-dict then).
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere and
    # would mis-decode non-ASCII characters in the config.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # Fail here with a clear message instead of a later, unrelated-looking
    # TypeError when the caller indexes into the result.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file {config_path!r} must contain a YAML mapping at the "
            f"top level, got {type(config).__name__}"
        )
    return config
def load_model_and_tokenizer(
    model_name: str,
    trust_remote_code: bool = True,
    use_4bit: bool = False,
    use_8bit: bool = False,
    use_fp16: bool = True,
):
    """Load the base causal language model and its tokenizer.

    The model is always loaded with ``device_map="auto"``. Depending on the
    flags it is loaded 4-bit quantized (NF4, double quantization, bfloat16
    compute), 8-bit quantized, or unquantized in float32.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.
        trust_remote_code: Forwarded to both ``from_pretrained`` calls.
        use_4bit: Load with 4-bit bitsandbytes quantization. Takes precedence
            over ``use_8bit`` when both are set.
        use_8bit: Load with 8-bit bitsandbytes quantization.
        use_fp16: Accepted for interface compatibility; not read by this
            function.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, trust_remote_code=trust_remote_code
    )
    # Padding to a fixed length requires a pad token; some causal-LM
    # tokenizers ship without one, so fall back to EOS in that case.
    if tokenizer.pad_token is None and tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token

    # Arguments shared by every loading mode.
    model_kwargs = {
        "trust_remote_code": trust_remote_code,
        "device_map": "auto",
    }

    if use_4bit:
        # Imported lazily so bitsandbytes support is only needed when used.
        from transformers import BitsAndBytesConfig

        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
        model_kwargs["torch_dtype"] = torch.bfloat16
    elif use_8bit:
        from transformers import BitsAndBytesConfig

        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_has_fp16_weight=False,
        )
        model_kwargs["torch_dtype"] = torch.bfloat16
    else:
        # No quantization: load in FP32 and disable the KV cache, as the
        # original unquantized path did.
        model_kwargs["torch_dtype"] = torch.float32
        model_kwargs["use_cache"] = False

    model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
    return model, tokenizer
def _format_example(instruction, output):
    """Build the training text for one record from its instruction/output."""
    if instruction and output:
        return f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
    # Only one (or neither) field is usable: fall back to whichever is set.
    return output or instruction or ""


def load_data(
    data_path: str,
    tokenizer,
    max_length: int = 2048,
    train_split: float = 0.9,
    seed: "int | None" = None,
):
    """Load a JSON dataset, tokenize it and split it into train/eval parts.

    Each record is rendered from its ``instruction`` and ``output`` fields,
    tokenized with truncation and padding to ``max_length``, and given
    ``labels`` equal to its ``input_ids``.

    Args:
        data_path: Path (or paths) passed as ``data_files`` to the JSON loader.
        tokenizer: Tokenizer callable applied to the rendered texts.
        max_length: Truncation and padding length in tokens.
        train_split: A value below 1 is the fraction of samples used for
            training. A value of exactly 1 uses all data for training. A value
            above 1 is an absolute number of training samples; if it is not
            smaller than the dataset, all data is used for training.
        seed: Optional seed forwarded to ``train_test_split`` so the split is
            reproducible. ``None`` keeps the unseeded behaviour.

    Returns:
        ``(train_dataset, eval_dataset)``; ``eval_dataset`` is ``None`` when
        all data is used for training.

    Raises:
        ValueError: If the dataset has neither an ``instruction`` nor an
            ``output`` column.
    """
    raw_dataset = load_dataset("json", data_files=data_path, split="train")

    columns = set(raw_dataset.column_names)
    has_instruction = "instruction" in columns
    has_output = "output" in columns
    if not (has_instruction or has_output):
        raise ValueError(
            "Dataset must contain an 'instruction' and/or 'output' column, "
            f"found: {sorted(columns)}"
        )

    def tokenize_function(examples):
        instructions = examples["instruction"] if has_instruction else None
        outputs = examples["output"] if has_output else None
        # A missing column is replaced by one empty string PER ROW. A
        # single-element default would make zip() stop after the first row
        # and silently drop the rest of the batch.
        if instructions is None:
            instructions = [""] * len(outputs)
        if outputs is None:
            outputs = [""] * len(instructions)

        texts = [
            _format_example(instr, out)
            for instr, out in zip(instructions, outputs)
        ]
        tokenized = tokenizer(
            texts, truncation=True, max_length=max_length, padding="max_length"
        )
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    tokenized_dataset = raw_dataset.map(
        tokenize_function, batched=True, remove_columns=raw_dataset.column_names
    )

    total_samples = len(tokenized_dataset)

    # Exactly 1 means "100% of the data", not "one training sample".
    if train_split == 1:
        return tokenized_dataset, None

    if train_split > 1:
        # Absolute sample count.
        train_size = int(train_split)
        if train_size >= total_samples:
            return tokenized_dataset, None
    else:
        # Fraction of the dataset.
        train_size = train_split

    split = tokenized_dataset.train_test_split(train_size=train_size, seed=seed)
    return split["train"], split["test"]
def train(config: dict):
    """Fine-tune the configured base model with LoRA and save the adapter.

    Args:
        config: Parsed configuration. The sections ``model``, ``data``,
            ``lora``, ``training`` and ``output`` are required; ``hardware``
            and ``quantization`` are optional.

    Returns:
        The ``Trainer`` instance, after training has finished and the adapter
        has been saved to ``config["output"]["lora_dir"]``.

    Raises:
        KeyError: If a required section or key is missing from ``config``.
    """
    import inspect

    model_config = config["model"]
    data_config = config["data"]
    lora_config = config["lora"]
    training_config = config["training"]
    output_config = config["output"]
    hardware_config = config.get("hardware", {})
    quantization_config = config.get("quantization", {})

    use_4bit = hardware_config.get("use_4bit", False) or quantization_config.get("enabled", False)
    use_8bit = hardware_config.get("use_8bit", False)

    # Set environment variables for better CUDA memory management
    # expandable_segments:False fixes a known PyTorch bug (#124807, #128829) with gradient checkpointing
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False"

    cuda_available = torch.cuda.is_available()

    # Clear CUDA cache before loading
    if cuda_available:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    # GPU BF16 support check — use the proper PyTorch API
    supports_bf16 = torch.cuda.is_bf16_supported() if cuda_available else False
    gpu_name = torch.cuda.get_device_name(0) if cuda_available else "N/A"
    print(f" GPU: {gpu_name}")
    print(f" BF16 supported: {supports_bf16}")

    # Forwarded to the loader for interface compatibility.
    use_fp16 = not supports_bf16

    # Trainer-level mixed precision is deliberately disabled (P100/Pascal AMP
    # has GradScaler bugs with fp16). Log the values actually passed to
    # TrainingArguments rather than the detected capability, so the output
    # does not claim a precision mode that is not in effect.
    amp_bf16 = False
    amp_fp16 = False
    if training_config.get("bf16", False) and not supports_bf16:
        print(" ⚠️ bf16 requested but GPU doesn't support it")
    print(f" Mixed precision (Trainer AMP): bf16={amp_bf16}, fp16={amp_fp16}")

    # Load model and tokenizer
    print(f"Loading model: {model_config['name']} (4bit={use_4bit}, 8bit={use_8bit})")
    model, tokenizer = load_model_and_tokenizer(
        model_name=model_config["name"],
        trust_remote_code=model_config.get("trust_remote_code", True),
        use_4bit=use_4bit,
        use_8bit=use_8bit,
        use_fp16=use_fp16,
    )

    # Print memory stats after model loading
    if cuda_available:
        allocated = torch.cuda.memory_allocated() / 1024**3
        reserved = torch.cuda.memory_reserved() / 1024**3
        print(f" GPU memory after model load: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")

    # Load data
    print(f"Loading dataset: {data_config['input_path']}")
    train_dataset, eval_dataset = load_data(
        data_path=data_config["input_path"],
        tokenizer=tokenizer,
        max_length=data_config.get("max_length", 2048),
        train_split=data_config.get("train_split", 0.9),
    )
    has_eval = eval_dataset is not None and len(eval_dataset) > 0
    print(f" Train samples: {len(train_dataset)}")
    if has_eval:
        print(f" Eval samples: {len(eval_dataset)}")
    else:
        print(" No eval set (using all data for training)")

    # Apply LoRA. Both the long ("lora_alpha") and short ("alpha") key
    # spellings are accepted for alpha and dropout.
    peft_config = LoraConfig(
        r=lora_config["r"],
        lora_alpha=lora_config.get("lora_alpha", lora_config.get("alpha", 32)),
        lora_dropout=lora_config.get("lora_dropout", lora_config.get("dropout", 0.05)),
        target_modules=lora_config["target_modules"],
        bias=lora_config.get("bias", "none"),
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # Training arguments
    output_dir = output_config["lora_dir"]
    os.makedirs(output_dir, exist_ok=True)

    # Paged optimizer for quantized models; the fused AdamW kernel is only
    # selected when a CUDA device is present.
    if use_4bit or use_8bit:
        optim = "paged_adamw_32bit"
    elif cuda_available:
        optim = "adamw_torch_fused"
    else:
        optim = "adamw_torch"

    training_kwargs = dict(
        output_dir=output_dir,
        num_train_epochs=training_config["num_epochs"],
        per_device_train_batch_size=training_config["batch_size"],
        gradient_accumulation_steps=training_config["gradient_accumulation"],
        learning_rate=training_config["learning_rate"],
        warmup_steps=training_config.get("warmup_steps", 100),
        weight_decay=training_config.get("weight_decay", 0.01),
        max_grad_norm=training_config.get("max_grad_norm", 1.0),
        logging_steps=training_config.get("logging_steps", 10),
        save_steps=training_config.get("save_steps", 100),
        save_total_limit=training_config.get("save_total_limit", 2),
        bf16=amp_bf16,
        fp16=amp_fp16,  # Disabled — P100/Pascal AMP has GradScaler bugs with fp16
        gradient_checkpointing=training_config.get("gradient_checkpointing", True),
        gradient_checkpointing_kwargs={"use_reentrant": False},
        eval_steps=training_config.get("eval_steps", 100) if has_eval else None,
        report_to="none",
        dataloader_num_workers=0,
        remove_unused_columns=False,
        optim=optim,
    )
    # `evaluation_strategy` was renamed to `eval_strategy` in newer
    # transformers releases; pick whichever this installation accepts.
    ta_params = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in ta_params else "evaluation_strategy"
    training_kwargs[strategy_key] = "steps" if has_eval else "no"
    training_args = TrainingArguments(**training_kwargs)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )

    # Likewise, Trainer's `tokenizer` argument was renamed `processing_class`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        **{tokenizer_key: tokenizer},
    )

    print("=" * 60)
    print("Starting training...")
    print("=" * 60)
    trainer.train()
    print("Training completed!")

    # Save final adapter
    trainer.save_model(output_dir)
    print(f"✅ Adapter saved to {output_dir}")
    return trainer
def main():
    """Command-line entry point: load the YAML config and run training.

    Requires ``--config``. Any exception raised by ``train`` is reported with
    its traceback and the process exits with status 1.
    """
    rule = "=" * 60

    cli = argparse.ArgumentParser()
    cli.add_argument("--config", type=str, required=True, help="Path to YAML config")
    options = cli.parse_args()

    for line in (rule, "Stack 2.9 Simple Training", rule):
        print(line)

    config = load_config(options.config)
    print(f"Config loaded: {options.config}")
    print(f"Model: {config['model']['name']}")
    print(f"Data: {config['data']['input_path']}")

    try:
        train(config)
        for line in ("\n" + rule, "✅ TRAINING SUCCESS", rule):
            print(line)
    except Exception as err:
        for line in ("\n" + rule, f"❌ TRAINING FAILED: {err}", rule):
            print(line)
        # Imported here because it is only needed on the failure path.
        import traceback

        traceback.print_exc()
        sys.exit(1)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()