Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use my-ai-stack/Stack-2-9-finetuned with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use my-ai-stack/Stack-2-9-finetuned with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "my-ai-stack/Stack-2-9-finetuned"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/my-ai-stack/Stack-2-9-finetuned

SGLang

How to use my-ai-stack/Stack-2-9-finetuned with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "my-ai-stack/Stack-2-9-finetuned" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "my-ai-stack/Stack-2-9-finetuned" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
```
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
```

Stack-2-9-finetuned / samples / benchmarks / test_token_efficiency.py

walidsobhie-code

refactor: Squeeze folders further - cleaner structure

65888d5 about 2 months ago

raw

history blame

9.75 kB

	#!/usr/bin/env python3
	"""
	Benchmarks for Stack 2.9 - Token Efficiency Tests
	Token optimization benchmarks.
	"""

	import pytest
	import sys
	from pathlib import Path
	from unittest.mock import MagicMock, patch

	# Add stack_cli to path
	sys.path.insert(0, str(Path(__file__).parent.parent / "stack_cli"))

	from stack_cli.agent import StackAgent, create_agent, AgentResponse


	class TestTokenUsage:
	    """Checks on what the agent and the context manager hand back."""

	    def test_response_token_efficiency(self):
	        """Processing a read request must yield a non-empty response body."""
	        fake_tool = MagicMock(return_value={"success": True, "content": "x"})

	        with patch('stack_cli.context.create_context_manager'), \
	                patch('stack_cli.tools.get_tool') as tool_lookup:
	            # Every tool lookup resolves to the same canned tool.
	            tool_lookup.return_value = fake_tool

	            reply = StackAgent().process("read test.py")

	            # The body has to exist and carry at least one character.
	            assert reply.content is not None
	            assert len(reply.content) > 0

	    def test_context_truncation(self):
	        """The workspace context must be returned as a string."""
	        # Path is replaced inside stack_cli.context and Path.exists is
	        # forced to False for the duration of the test.
	        with patch('stack_cli.context.create_context_manager'), \
	                patch('stack_cli.context.Path'), \
	                patch.object(Path, 'exists', return_value=False):
	            from stack_cli.context import ContextManager

	            manager = ContextManager("/tmp")
	            workspace_text = manager.get_workspace_context()

	            assert isinstance(workspace_text, str)


	class TestPromptEfficiency:
	    """Test prompt efficiency."""

	    def test_intent_parsing_tokens(self):
	        """Test intent parsing token usage.

	        Parses a simple read request and checks that the result exposes
	        the ``intent`` and ``confidence`` keys and that the intent is
	        ``file_read``.
	        """
	        from stack_cli.agent import QueryUnderstanding

	        qu = QueryUnderstanding()

	        # Parse should be efficient
	        result = qu.parse("read test.py")

	        # Result should have required fields
	        assert "intent" in result
	        assert "confidence" in result
	        assert result["intent"] == "file_read"

	    def test_tool_selection_tokens(self):
	        """Test tool selection token usage.

	        Selects tools for the ``file_read`` intent with an empty second
	        argument and checks that a non-empty list comes back.
	        """
	        from stack_cli.agent import ToolSelector

	        ts = ToolSelector()

	        # Selection should be minimal
	        tools = ts.select("file_read", {})

	        # Should return list of tools. isinstance() requires the expected
	        # type as its second argument; the previous one-argument call
	        # raised TypeError before anything was actually checked.
	        assert isinstance(tools, list)
	        assert len(tools) > 0


	class TestResponseEfficiency:
	    """Size checks on the text produced by ``ResponseGenerator``."""

	    def test_response_generation_size(self):
	        """One successful read call must give non-empty text under 10000 chars."""
	        with patch('stack_cli.context.create_context_manager'):
	            from stack_cli.agent import ResponseGenerator, ToolCall

	            generator = ResponseGenerator()
	            read_call = ToolCall(
	                tool_name="read",
	                arguments={"path": "test.py"},
	                result={"success": True, "content": "test content"},
	                success=True,
	            )

	            text = generator.generate([read_call], "file_read", {})

	            assert isinstance(text, str)
	            # Something must be produced, but not an excessive amount.
	            assert 0 < len(text) < 10000

	    def test_clarification_efficiency(self):
	        """A clarification prompt must be a string shorter than 200 chars."""
	        with patch('stack_cli.context.create_context_manager'):
	            from stack_cli.agent import ResponseGenerator

	            question = ResponseGenerator().generate_clarification("Which file?")

	            assert isinstance(question, str)
	            assert len(question) < 200


	class TestContextTokenEfficiency:
	    """Upper bounds on the size of ``ContextManager`` output."""

	    def test_context_summary_size(self):
	        """The context summary must serialize to fewer than 10000 JSON chars."""
	        import json

	        with patch('stack_cli.context.create_context_manager'), \
	                patch('stack_cli.context.Path'), \
	                patch.object(Path, 'exists', return_value=False):
	            from stack_cli.context import ContextManager

	            manager = ContextManager("/tmp")

	            # json.dumps doubles as a check that the summary is serializable.
	            encoded = json.dumps(manager.get_context_summary())

	            assert len(encoded) < 10000

	    def test_workspace_context_size(self):
	        """The workspace context must be shorter than 10000 units."""
	        with patch('stack_cli.context.create_context_manager'), \
	                patch('stack_cli.context.Path'), \
	                patch.object(Path, 'exists', return_value=False):
	            from stack_cli.context import ContextManager

	            workspace_text = ContextManager("/tmp").get_workspace_context()

	            assert len(workspace_text) < 10000


	class TestToolSchemasEfficiency:
	    """Size and shape checks on the tool schemas."""

	    def test_schemas_compactness(self):
	        """All schemas together must serialize to fewer than 50000 JSON chars."""
	        import json
	        from stack_cli.tools import get_tool_schemas

	        encoded = json.dumps(get_tool_schemas())

	        assert len(encoded) < 50000

	    def test_schema_required_fields(self):
	        """Each schema must expose its core keys and a typed parameter block."""
	        from stack_cli.tools import get_tool_schemas

	        for entry in get_tool_schemas():
	            # Top-level keys every schema has to carry.
	            for key in ("name", "description", "parameters"):
	                assert key in entry

	            # The parameter block needs a type and a property map.
	            parameter_block = entry["parameters"]
	            for key in ("type", "properties"):
	                assert key in parameter_block


	class TestConversationEfficiency:
	    """Bounds on conversation history and session summaries."""

	    def test_history_truncation(self):
	        """After 50 queries the history must hold at most 100 entries."""
	        with patch('stack_cli.context.create_context_manager'), \
	                patch('stack_cli.tools.get_tool') as tool_lookup:
	            tool_lookup.return_value = MagicMock(return_value={"success": True})

	            agent = StackAgent()
	            for turn in range(50):
	                agent.process(f"query {turn}")

	            # The history is allowed to be truncated, just not unbounded.
	            assert len(agent.conversation_history) <= 100

	    def test_summary_efficiency(self):
	        """A session summary after 10 messages must stay under 1000 JSON chars."""
	        import json

	        with patch('stack_cli.context.create_context_manager'):
	            session = StackAgent().context_manager.session

	            for index in range(10):
	                session.add_message("user", f"message {index}")

	            encoded = json.dumps(session.get_summary())

	            assert len(encoded) < 1000


	class TestTokenOptimization:
	    """Checks that oversized inputs do not produce oversized output."""

	    def test_response_capping(self):
	        """A 10000-char tool result must give a response under 15000 chars."""
	        with patch('stack_cli.context.create_context_manager'):
	            from stack_cli.agent import ResponseGenerator, ToolCall

	            generator = ResponseGenerator()
	            oversized = "x" * 10000
	            read_call = ToolCall(
	                tool_name="read",
	                arguments={"path": "test.py"},
	                result={"success": True, "content": oversized},
	                success=True,
	            )

	            text = generator.generate([read_call], "file_read", {})

	            assert len(text) < 15000

	    def test_context_truncation_strategy(self):
	        """The workspace context must span fewer than 100 lines."""
	        with patch('stack_cli.context.create_context_manager'), \
	                patch('stack_cli.context.Path'), \
	                patch.object(Path, 'exists', return_value=False):
	            from stack_cli.context import ContextManager

	            workspace_text = ContextManager("/tmp").get_workspace_context()

	            # Measured in newline-separated lines.
	            line_count = len(workspace_text.split('\n'))

	            assert line_count < 100


	if __name__ == "__main__":
	    # pytest.main() returns the exit code rather than exiting, so hand it
	    # to sys.exit(); otherwise running this file as a script reports
	    # success even when tests fail.
	    sys.exit(pytest.main([__file__, "-v"]))