Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use my-ai-stack/Stack-2-9-finetuned with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use my-ai-stack/Stack-2-9-finetuned with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "my-ai-stack/Stack-2-9-finetuned"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/my-ai-stack/Stack-2-9-finetuned

SGLang

How to use my-ai-stack/Stack-2-9-finetuned with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "my-ai-stack/Stack-2-9-finetuned" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "my-ai-stack/Stack-2-9-finetuned" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
```
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
```

Stack-2-9-finetuned

File size: 4,327 Bytes

239da7a

"""
HuggingFace Spaces Deployment for Stack 2.9

Free inference API on HuggingFace Spaces.
https://huggingface.co/docs/hub/spaces-sdks-docker
"""

# =============================================================================
# app.py - Stack 2.9 Inference API
# Deploy this to HuggingFace Spaces for free inference
# =============================================================================

import os
import json
from typing import Optional, List, Dict
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import requests

app = FastAPI(title="Stack 2.9 API")

# Model configuration
MODEL_NAME = os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-Coder-7B-Instruct")
API_URL = os.environ.get("API_URL", "")  # Your model API URL
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # HuggingFace token

# ============================================================================
# Request/Response Models
# ============================================================================

class ChatMessage(BaseModel):
    role: str
    content: str

class ChatRequest(BaseModel):
    messages: List[ChatMessage]
    max_tokens: int = 1024
    temperature: float = 0.7
    top_p: float = 0.9

class ChatResponse(BaseModel):
    content: str
    model: str
    usage: Optional[Dict] = None

class CompletionRequest(BaseModel):
    prompt: str
    max_tokens: int = 512
    temperature: float = 0.7

# ============================================================================
# Health Check
# ============================================================================

@app.get("/health")
async def health():
    return {"status": "healthy", "model": MODEL_NAME}

@app.get("/")
async def root():
    return {
        "name": "Stack 2.9",
        "version": "1.0.0",
        "model": MODEL_NAME,
        "endpoints": {
            "chat": "/v1/chat/completions",
            "complete": "/v1/completions",
            "health": "/health"
        }
    }

# ============================================================================
# OpenAI-Compatible API
# ============================================================================

@app.post("/v1/chat/completions", response_model=ChatResponse)
async def chat_completions(request: ChatRequest):
    """OpenAI-compatible chat endpoint"""

    if API_URL:
        # Use external API
        response = requests.post(
            f"{API_URL}/v1/chat/completions",
            headers={"Authorization": f"Bearer {HF_TOKEN}"},
            json={
                "messages": [m.dict() for m in request.messages],
                "max_tokens": request.max_tokens,
                "temperature": request.temperature,
            },
            timeout=60
        )
        return response.json()

    # Placeholder for local model
    raise HTTPException(
        status_code=503,
        detail="No model API configured. Set API_URL environment variable."
    )

@app.post("/v1/completions")
async def completions(request: CompletionRequest):
    """OpenAI-compatible completion endpoint"""

    if API_URL:
        response = requests.post(
            f"{API_URL}/v1/completions",
            headers={"Authorization": f"Bearer {HF_TOKEN}"},
            json={
                "prompt": request.prompt,
                "max_tokens": request.max_tokens,
                "temperature": request.temperature,
            },
            timeout=60
        )
        return response.json()

    raise HTTPException(
        status_code=503,
        detail="No model API configured"
    )

# ============================================================================
# Model Info
# ============================================================================

@app.get("/v1/models")
async def list_models():
    return {
        "object": "list",
        "data": [
            {
                "id": MODEL_NAME,
                "object": "model",
                "created": 1700000000,
                "owned_by": "stack-2.9"
            }
        ]
    }

# ============================================================================
# Run Server
# ============================================================================

if __name__ == "__main__":
    import uvicorn
    port = int(os.environ.get("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=port)