Text Generation
Transformers
English
qwen2
code-generation
python
fine-tuning
Qwen
tools
agent-framework
multi-agent
conversational
Eval Results (legacy)
Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use my-ai-stack/Stack-2-9-finetuned with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use my-ai-stack/Stack-2-9-finetuned with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "my-ai-stack/Stack-2-9-finetuned"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
- SGLang
How to use my-ai-stack/Stack-2-9-finetuned with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "my-ai-stack/Stack-2-9-finetuned" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "my-ai-stack/Stack-2-9-finetuned" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
- Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
| #!/usr/bin/env python3 | |
| """ | |
| Benchmarks for Stack 2.9 - Token Efficiency Tests | |
| Token optimization benchmarks. | |
| """ | |
| import pytest | |
| import sys | |
| from pathlib import Path | |
| from unittest.mock import MagicMock, patch | |
| # Add stack_cli to path | |
| sys.path.insert(0, str(Path(__file__).parent.parent / "stack_cli")) | |
| from stack_cli.agent import StackAgent, create_agent, AgentResponse | |
class TestTokenUsage:
    """Sanity checks on the size of agent responses and workspace context."""

    def test_response_token_efficiency(self):
        """A processed read request must come back with non-empty content."""
        with patch('stack_cli.context.create_context_manager'), \
                patch('stack_cli.tools.get_tool') as get_tool_stub:
            # Every tool lookup resolves to a callable returning a tiny payload.
            fake_tool = MagicMock(return_value={"success": True, "content": "x"})
            get_tool_stub.return_value = fake_tool

            reply = StackAgent().process("read test.py")

            # The reply has to carry some content.
            assert reply.content is not None
            assert len(reply.content) > 0

    def test_context_truncation(self):
        """The workspace context is produced as a string."""
        with patch('stack_cli.context.create_context_manager'), \
                patch('stack_cli.context.Path'), \
                patch.object(Path, 'exists', return_value=False):
            from stack_cli.context import ContextManager

            manager = ContextManager("/tmp")

            # Generate the context and check its type.
            workspace_text = manager.get_workspace_context()
            assert isinstance(workspace_text, str)
class TestPromptEfficiency:
    """Test prompt efficiency."""

    def test_intent_parsing_tokens(self):
        """Parsing a simple read request yields an intent and a confidence."""
        from stack_cli.agent import QueryUnderstanding

        qu = QueryUnderstanding()

        # Parse should be efficient
        result = qu.parse("read test.py")

        # Result should have required fields
        assert "intent" in result
        assert "confidence" in result
        assert result["intent"] == "file_read"

    def test_tool_selection_tokens(self):
        """Selecting tools for a file read returns a non-empty list."""
        from stack_cli.agent import ToolSelector

        ts = ToolSelector()

        # Selection should be minimal
        tools = ts.select("file_read", {})

        # Should return list of tools.  isinstance() needs the expected type
        # as its second argument; calling it with one argument raises
        # TypeError before the assertion is ever evaluated.
        assert isinstance(tools, list)
        assert len(tools) > 0
class TestResponseEfficiency:
    """Size checks on text produced by the response generator."""

    def test_response_generation_size(self):
        """A single successful read call renders to a bounded, non-empty string."""
        with patch('stack_cli.context.create_context_manager'):
            from stack_cli.agent import ResponseGenerator, ToolCall

            generator = ResponseGenerator()
            read_call = ToolCall(
                tool_name="read",
                arguments={"path": "test.py"},
                result={"success": True, "content": "test content"},
                success=True,
            )

            rendered = generator.generate([read_call], "file_read", {})

            # Must be text, must not be empty, and must stay under the cap.
            assert isinstance(rendered, str)
            assert 0 < len(rendered) < 10000

    def test_clarification_efficiency(self):
        """A clarification prompt is rendered as a short string."""
        with patch('stack_cli.context.create_context_manager'):
            from stack_cli.agent import ResponseGenerator

            question = ResponseGenerator().generate_clarification("Which file?")

            # Clarifications are expected to stay brief.
            assert isinstance(question, str)
            assert len(question) < 200
class TestContextTokenEfficiency:
    """Size checks on what the context manager reports."""

    def test_context_summary_size(self):
        """The context summary serialises to JSON below the size limit."""
        with patch('stack_cli.context.create_context_manager'), \
                patch('stack_cli.context.Path'), \
                patch.object(Path, 'exists', return_value=False):
            from stack_cli.context import ContextManager

            manager = ContextManager("/tmp")
            overview = manager.get_context_summary()

            # json.dumps doubles as the serialisability check.
            import json
            encoded = json.dumps(overview)

            assert len(encoded) < 10000

    def test_workspace_context_size(self):
        """The workspace context stays below the size limit."""
        with patch('stack_cli.context.create_context_manager'), \
                patch('stack_cli.context.Path'), \
                patch.object(Path, 'exists', return_value=False):
            from stack_cli.context import ContextManager

            workspace_text = ContextManager("/tmp").get_workspace_context()

            assert len(workspace_text) < 10000
class TestToolSchemasEfficiency:
    """Size and shape checks on the tool schemas."""

    def test_schemas_compactness(self):
        """All schemas together serialise to JSON below the size limit."""
        from stack_cli.tools import get_tool_schemas
        import json

        encoded = json.dumps(get_tool_schemas())

        assert len(encoded) < 50000

    def test_schema_required_fields(self):
        """Every schema carries its top-level keys and parameter keys."""
        from stack_cli.tools import get_tool_schemas

        for entry in get_tool_schemas():
            # Top-level keys each schema must expose.
            for key in ("name", "description", "parameters"):
                assert key in entry

            # Keys required inside the parameters object.
            param_spec = entry["parameters"]
            for key in ("type", "properties"):
                assert key in param_spec
class TestConversationEfficiency:
    """Growth checks on conversation history and session summaries."""

    def test_history_truncation(self):
        """Fifty processed queries leave at most 100 history entries."""
        with patch('stack_cli.context.create_context_manager'), \
                patch('stack_cli.tools.get_tool') as get_tool_stub:
            get_tool_stub.return_value = MagicMock(return_value={"success": True})

            agent = StackAgent()

            # Drive a long conversation through the agent.
            for turn in range(50):
                agent.process(f"query {turn}")

            # The history may have been trimmed; it must not exceed the bound.
            assert len(agent.conversation_history) <= 100

    def test_summary_efficiency(self):
        """A session summary after ten messages serialises to under 1000 chars."""
        # NOTE(review): if StackAgent builds its context manager through the
        # patched factory, `session` is a mock and json.dumps may not accept
        # the summary -- confirm against stack_cli.agent.
        with patch('stack_cli.context.create_context_manager'):
            agent = StackAgent()
            session = agent.context_manager.session

            # Seed the session with a handful of user messages.
            for idx in range(10):
                session.add_message("user", f"message {idx}")

            digest = session.get_summary()

            # The summary is expected to stay compact once serialised.
            import json
            encoded = json.dumps(digest)
            assert len(encoded) < 1000
class TestTokenOptimization:
    """Checks on the limits applied to large responses and contexts."""

    def test_response_capping(self):
        """A 10000-character tool result renders to fewer than 15000 characters."""
        with patch('stack_cli.context.create_context_manager'):
            from stack_cli.agent import ResponseGenerator, ToolCall

            generator = ResponseGenerator()

            # Feed the generator an oversized file body.
            oversized = "x" * 10000
            read_call = ToolCall(
                tool_name="read",
                arguments={"path": "test.py"},
                result={"success": True, "content": oversized},
                success=True,
            )

            rendered = generator.generate([read_call], "file_read", {})

            # Output length must stay under the ceiling.
            assert len(rendered) < 15000

    def test_context_truncation_strategy(self):
        """The workspace context spans fewer than 100 lines."""
        with patch('stack_cli.context.create_context_manager'), \
                patch('stack_cli.context.Path'), \
                patch.object(Path, 'exists', return_value=False):
            from stack_cli.context import ContextManager

            manager = ContextManager("/tmp")

            # Path.exists is forced to False for this call.
            workspace_text = manager.get_workspace_context()

            # Count newline-separated lines and bound them.
            assert len(workspace_text.split('\n')) < 100
if __name__ == "__main__":
    # pytest.main returns the run's exit code; pass it to sys.exit so that
    # running this file as a script reports failures to the calling shell
    # instead of always exiting with status 0.
    sys.exit(pytest.main([__file__, "-v"]))