Stack-2-9-finetuned

Tags: Text Generation, Transformers, English, qwen2, code-generation, python, fine-tuning, Qwen, tools, agent-framework, multi-agent, conversational
Instructions for using my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. The sections below cover each option.
- Libraries
- Transformers
How to use my-ai-stack/Stack-2-9-finetuned with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```

- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use my-ai-stack/Stack-2-9-finetuned with vLLM:
Install from pip and serve the model:
```shell
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "my-ai-stack/Stack-2-9-finetuned"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "my-ai-stack/Stack-2-9-finetuned",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```

Use Docker
```shell
docker run --runtime nvidia --gpus all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
  -p 8000:8000 \
  --ipc=host \
  vllm/vllm-openai:latest \
  --model "my-ai-stack/Stack-2-9-finetuned"
```
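However you start it, the vLLM server exposes an OpenAI-compatible API, so you can also query it from Python. A minimal sketch, assuming `pip install openai` and a server running on localhost:8000:

```python
# Query the vLLM server through its OpenAI-compatible endpoint.
from openai import OpenAI

# The server does not check API keys, so any placeholder string works.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="my-ai-stack/Stack-2-9-finetuned",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
)
print(response.choices[0].message.content)
```

The same client works against the SGLang server in the next section; only the `base_url` port changes.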
- SGLang
How to use my-ai-stack/Stack-2-9-finetuned with SGLang:
Install from pip and serve the model:
```shell
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "my-ai-stack/Stack-2-9-finetuned" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "my-ai-stack/Stack-2-9-finetuned",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```

Use Docker images
```shell
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "my-ai-stack/Stack-2-9-finetuned" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "my-ai-stack/Stack-2-9-finetuned",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```

- Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
```shell
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
```
Below is the extended-context fine-tuning script for Stack 2.9, which extends Qwen2.5-Coder-1.5B from a 32K to a 128K context window:

```python
#!/usr/bin/env python3
"""
Extended Context Fine-tuning for Stack 2.9

Extends Qwen2.5-Coder-1.5B from 32K to 128K context using RoPE scaling.
Run on a cloud GPU (an A100-class card is needed for long contexts; a T4
is only enough for short-context smoke tests).
"""
import argparse
import os

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset


def parse_args():
    parser = argparse.ArgumentParser(description="Extended context fine-tuning")
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to base Qwen2.5-Coder-1.5B model")
    parser.add_argument("--data-path", type=str, required=True,
                        help="Path to training data (jsonl)")
    parser.add_argument("--output-dir", type=str, default="./output/stack-2.9-128k",
                        help="Output directory")
    parser.add_argument("--context-length", type=int, default=131072,
                        help="Target context length (default: 128K)")
    parser.add_argument("--lora-rank", type=int, default=64,
                        help="LoRA rank (default: 64)")
    parser.add_argument("--epochs", type=int, default=3,
                        help="Training epochs (default: 3)")
    parser.add_argument("--batch-size", type=int, default=1,
                        help="Per-device batch size (default: 1)")
    parser.add_argument("--grad-accum", type=int, default=16,
                        help="Gradient accumulation steps (default: 16)")
    parser.add_argument("--lr", type=float, default=5e-5,
                        help="Learning rate (default: 5e-5)")
    parser.add_argument("--push-to-hub", action="store_true",
                        help="Push final model to the Hugging Face Hub")
    parser.add_argument("--hub-model-id", type=str, default=None,
                        help="HF model ID for push")
    return parser.parse_args()


def main():
    args = parse_args()
    print(f"Loading model from {args.model_path}")
    print(f"Target context: {args.context_length}")

    # Load tokenizer and update its max length
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    tokenizer.model_max_length = args.context_length

    # Load model with extended context config.
    # The model's config.json should already have rope_scaling set, e.g. a
    # YaRN-style entry for a 4x (32K -> 128K) extension:
    #   "rope_scaling": {
    #       "type": "yarn",
    #       "factor": 4.0,
    #       "original_max_position_embeddings": 32768
    #   }
    model = AutoModelForCausalLM.from_pretrained(
        args.model_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True,
    )

    # Verify context extension
    print(f"Model max_position_embeddings: {model.config.max_position_embeddings}")
    if hasattr(model.config, "rope_scaling"):
        print(f"RoPE scaling: {model.config.rope_scaling}")

    # Gradient checkpointing with LoRA needs the embedding outputs to require
    # grads, otherwise backprop fails ("element 0 of tensors does not require grad").
    model.enable_input_require_grads()

    # Attach LoRA for efficient fine-tuning
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=args.lora_rank,
        lora_alpha=args.lora_rank * 2,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Load dataset
    print(f"Loading data from {args.data_path}")
    dataset = load_dataset("json", data_files=args.data_path, split="train")

    # Tokenize with long context. Note: padding every example to the full
    # context length is simple but memory-hungry; packing shorter examples
    # together would be more efficient.
    def tokenize(example):
        text = example.get("text", "")
        encoding = tokenizer(
            text,
            truncation=True,
            max_length=args.context_length,
            padding="max_length",
            return_special_tokens_mask=True,
        )
        encoding["labels"] = encoding["input_ids"].copy()
        return encoding

    dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

    # Training args
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=args.grad_accum,
        learning_rate=args.lr,
        num_train_epochs=args.epochs,
        lr_scheduler_type="cosine",
        warmup_ratio=0.05,
        max_grad_norm=0.3,
        fp16=True,
        bf16=False,
        gradient_checkpointing=True,
        logging_steps=10,
        save_steps=500,
        save_total_limit=3,
        optim="paged_adamw_32bit",
        report_to="none",
        push_to_hub=args.push_to_hub,
        hub_model_id=args.hub_model_id,
        hub_strategy="every_save",  # "save" is not a valid HubStrategy value
    )

    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # Causal LM, not masked
    )

    # Train
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )
    print("Starting extended context fine-tuning...")
    trainer.train()

    # Save the merged model. merge_and_unload() returns the base model with
    # the LoRA weights folded in; keep the return value and save that.
    merged_dir = os.path.join(args.output_dir, "merged")
    print(f"Saving merged model to {merged_dir}")
    merged_model = trainer.model.merge_and_unload()
    merged_model.save_pretrained(merged_dir)
    tokenizer.save_pretrained(merged_dir)

    if args.push_to_hub and args.hub_model_id:
        print(f"Pushing to Hugging Face Hub: {args.hub_model_id}")
        # hub_model_id is already set in TrainingArguments; push_to_hub takes
        # no model-id positional argument.
        trainer.push_to_hub()


if __name__ == "__main__":
    main()
```
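Assuming the script is saved as `extend_context.py` (the filename and the base-model ID here are illustrative), a typical invocation would be `python extend_context.py --model-path Qwen/Qwen2.5-Coder-1.5B --data-path train.jsonl --push-to-hub --hub-model-id my-ai-stack/Stack-2-9-finetuned`. Once training finishes, the merged checkpoint loads like any other Transformers model. A minimal sketch, using the script's default output path:

```python
# Load the merged 128K-context checkpoint and run a short generation.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

merged_dir = "./output/stack-2.9-128k/merged"  # default from the script above
tokenizer = AutoTokenizer.from_pretrained(merged_dir)
model = AutoModelForCausalLM.from_pretrained(
    merged_dir,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Should report the extended window (131072) if config.json was updated.
print(model.config.max_position_embeddings)

inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```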