Text Generation
Transformers
English
qwen2
code-generation
python
fine-tuning
Qwen
tools
agent-framework
multi-agent
conversational
Eval Results (legacy)
Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use my-ai-stack/Stack-2-9-finetuned with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned") messages = [ {"role": "user", "content": "Who are you?"}, ] pipe(messages)# Load model directly from transformers import AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned") model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned") messages = [ {"role": "user", "content": "Who are you?"}, ] inputs = tokenizer.apply_chat_template( messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", ).to(model.device) outputs = model.generate(**inputs, max_new_tokens=40) print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:])) - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use my-ai-stack/Stack-2-9-finetuned with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "my-ai-stack/Stack-2-9-finetuned" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "my-ai-stack/Stack-2-9-finetuned", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
- SGLang
How to use my-ai-stack/Stack-2-9-finetuned with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "my-ai-stack/Stack-2-9-finetuned" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "my-ai-stack/Stack-2-9-finetuned", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "my-ai-stack/Stack-2-9-finetuned" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "my-ai-stack/Stack-2-9-finetuned", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }' - Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
| #!/usr/bin/env python3 | |
| """ | |
| Evaluate Stack 2.9 on tool-use test suite. | |
| Supports: local model (transformers) or vLLM API. | |
| """ | |
| import json | |
| import argparse | |
| from pathlib import Path | |
| from typing import Dict, Any, List | |
| import openai | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| import torch | |
| def load_test_cases(path: str) -> List[Dict]: | |
| with open(path, 'r') as f: | |
| return json.load(f) | |
| def predict_with_transformers(model_path: str, prompt: str) -> Dict: | |
| """Query local model for tool prediction.""" | |
| tokenizer = AutoTokenizer.from_pretrained(model_path) | |
| model = AutoModelForCausalLM.from_pretrained( | |
| model_path, | |
| device_map="auto", | |
| torch_dtype=torch.float16 | |
| ) | |
| inputs = tokenizer(prompt, return_tensors="pt").to(model.device) | |
| outputs = model.generate( | |
| **inputs, | |
| max_new_tokens=200, | |
| temperature=0.2, | |
| do_sample=False | |
| ) | |
| response = tokenizer.decode(outputs[0], skip_special_tokens=True) | |
| # Parse tool call from response (simple version - assumes structured format) | |
| # This is a placeholder - real parser needs to extract JSON tool use | |
| return { | |
| "tool": "UnknownTool", | |
| "params": {}, | |
| "raw_response": response | |
| } | |
| def predict_with_vllm(api_url: str, prompt: str, model_name: str = "stack-2.9") -> Dict: | |
| """Query vLLM server.""" | |
| client = openai.OpenAI( | |
| base_url=api_url, | |
| api_key="dummy" | |
| ) | |
| response = client.chat.completions.create( | |
| model=model_name, | |
| messages=[{"role": "user", "content": prompt}], | |
| max_tokens=200, | |
| temperature=0.2 | |
| ) | |
| content = response.choices[0].message.content | |
| # Parse tool call from content | |
| return { | |
| "tool": "UnknownTool", | |
| "params": {}, | |
| "raw_response": content | |
| } | |
| def evaluate_predictions( | |
| test_cases: List[Dict], | |
| predictions: List[Dict], | |
| tool_catalog: Dict[str, Any] = None | |
| ) -> Dict[str, Any]: | |
| """Compare predictions against ground truth.""" | |
| total = len(test_cases) | |
| correct_tool = 0 | |
| correct_params = 0 | |
| exact_match = 0 | |
| per_tool = {} | |
| for tc, pred in zip(test_cases, predictions): | |
| expected_tool = tc["expected_tool"] | |
| pred_tool = pred["tool"] | |
| # Tool accuracy | |
| tool_correct = pred_tool == expected_tool | |
| if tool_correct: | |
| correct_tool += 1 | |
| # Parameter accuracy (simple: exact match of params dict, or partial?) | |
| expected_params = tc["expected_params"] | |
| pred_params = pred["params"] | |
| # For now, check if all expected params are present with same values | |
| param_correct = all( | |
| pred_params.get(k) == v for k, v in expected_params.items() | |
| ) if expected_params else True | |
| if param_correct: | |
| correct_params += 1 | |
| # Exact match (tool + all params) | |
| if tool_correct and param_correct: | |
| exact_match += 1 | |
| # Track per-tool stats | |
| if expected_tool not in per_tool: | |
| per_tool[expected_tool] = {"total": 0, "correct_tool": 0, "correct_params": 0} | |
| per_tool[expected_tool]["total"] += 1 | |
| if tool_correct: | |
| per_tool[expected_tool]["correct_tool"] += 1 | |
| if param_correct: | |
| per_tool[expected_tool]["correct_params"] += 1 | |
| return { | |
| "total_examples": total, | |
| "tool_accuracy": correct_tool / total if total > 0 else 0, | |
| "parameter_accuracy": correct_params / total if total > 0 else 0, | |
| "exact_match_accuracy": exact_match / total if total > 0 else 0, | |
| "per_tool_breakdown": per_tool | |
| } | |
| def main(): | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("--test-cases", type=str, default="stack-2.9-eval/tool_use/test_cases.json") | |
| parser.add_argument("--catalog", type=str, default="training-data/tools/catalog.json") | |
| parser.add_argument("--model-path", type=str, help="HuggingFace model path for transformers") | |
| parser.add_argument("--vllm-api", type=str, help="vLLM API URL (e.g., http://localhost:8000)") | |
| parser.add_argument("--output", type=str, default="stack-2.9-eval/tool_use/results.json") | |
| parser.add_argument("--limit", type=int, help="Limit number of test cases to evaluate") | |
| args = parser.parse_args() | |
| test_cases_path = Path(args.test_cases) | |
| catalog_path = Path(args.catalog) | |
| if not test_cases_path.exists(): | |
| print(f"❌ Test cases not found: {test_cases_path}") | |
| print(" Run generate_tool_use_tests.py first") | |
| return | |
| test_cases = load_test_cases(test_cases_path) | |
| if args.limit: | |
| test_cases = test_cases[:args.limit] | |
| print(f"🧪 Evaluating {len(test_cases)} tool-use test cases") | |
| # Load tool catalog | |
| tool_catalog = None | |
| if catalog_path.exists(): | |
| with open(catalog_path, 'r') as f: | |
| tool_catalog = {t["tool"]: t for t in json.load(f)} | |
| print(f"✅ Loaded tool catalog ({len(tool_catalog)} tools)") | |
| # Generate predictions | |
| predictions = [] | |
| for i, tc in enumerate(test_cases): | |
| prompt = tc["prompt"] | |
| if args.model_path: | |
| pred = predict_with_transformers(args.model_path, prompt) | |
| elif args.vllm_api: | |
| pred = predict_with_vllm(args.vllm_api, prompt) | |
| else: | |
| # Mock predictor (random baseline) | |
| import random | |
| tools = list(tool_catalog.keys()) if tool_catalog else ["UnknownTool"] | |
| pred = { | |
| "tool": random.choice(tools), | |
| "params": {}, | |
| "raw_response": "Mock prediction" | |
| } | |
| predictions.append(pred) | |
| if (i+1) % 10 == 0: | |
| print(f" Processed {i+1}/{len(test_cases)}...", end='\r') | |
| print(f"\n📊 Evaluating predictions...") | |
| # Evaluate | |
| results = evaluate_predictions(test_cases, predictions, tool_catalog) | |
| # Save results | |
| output_path = Path(args.output) | |
| output_path.parent.mkdir(parents=True, exist_ok=True) | |
| with open(output_path, 'w') as f: | |
| json.dump(results, f, indent=2) | |
| print(f"\n✅ Evaluation results:") | |
| print(f" Tool accuracy: {results['tool_accuracy']*100:.1f}%") | |
| print(f" Param accuracy: {results['parameter_accuracy']*100:.1f}%") | |
| print(f" Exact match: {results['exact_match_accuracy']*100:.1f}%") | |
| print(f"\n Results saved to: {output_path}") | |
| # Show per-tool breakdown (top 5 worst) | |
| if results['per_tool_breakdown']: | |
| print("\n📉 Worst performing tools:") | |
| sorted_tools = sorted( | |
| results['per_tool_breakdown'].items(), | |
| key=lambda x: x[1]['correct_tool'] / x[1]['total'] if x[1]['total']>0 else 0 | |
| )[:5] | |
| for tool, stats in sorted_tools: | |
| acc = stats['correct_tool'] / stats['total'] * 100 | |
| print(f" {tool}: {acc:.1f}% ({stats['correct_tool']}/{stats['total']})") | |
| if __name__ == "__main__": | |
| main() |