Text Generation
Transformers
English
qwen2
code-generation
python
fine-tuning
Qwen
tools
agent-framework
multi-agent
conversational
Eval Results (legacy)
Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use my-ai-stack/Stack-2-9-finetuned with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use my-ai-stack/Stack-2-9-finetuned with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "my-ai-stack/Stack-2-9-finetuned"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "my-ai-stack/Stack-2-9-finetuned",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
- SGLang
How to use my-ai-stack/Stack-2-9-finetuned with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "my-ai-stack/Stack-2-9-finetuned" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "my-ai-stack/Stack-2-9-finetuned",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "my-ai-stack/Stack-2-9-finetuned" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "my-ai-stack/Stack-2-9-finetuned",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
# Stack 2.9 Local Deployment Script
# Usage: ./local_deploy.sh [options]
#
# MODEL_PATH, MODEL_NAME, MODEL_FORMAT, GPU_MEMORY_UTILIZATION and LOG_LEVEL
# may be supplied through the environment; the values below are only used
# when the corresponding variable is unset or empty, e.g.
#   MODEL_NAME=org/model LOG_LEVEL=DEBUG ./local_deploy.sh
set -e

# Colors for output (backslash escapes; expanded at print time)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Default configuration
COMPOSE_FILE="docker-compose.yml"
MODEL_PATH="${MODEL_PATH:-./models}"
MODEL_NAME="${MODEL_NAME:-meta-llama/Llama-3.1-8B-Instruct}" # Will be replaced with Stack 2.9
MODEL_FORMAT="${MODEL_FORMAT:-hf}"
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.9}"
LOG_LEVEL="${LOG_LEVEL:-INFO}"
# Function to print colored output
#
# print_status MESSAGE
#   Writes "[INFO] MESSAGE" to stdout, with the tag wrapped in ${BLUE}/${NC}.
#   Only the color variables are escape-expanded (%b). MESSAGE is printed
#   verbatim (%s) so backslashes inside it (paths, model names) stay intact.
print_status() {
  printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "${1-}"
}
# print_success MESSAGE
#   Writes "[SUCCESS] MESSAGE" to stdout, with the tag wrapped in
#   ${GREEN}/${NC}. The color variables are escape-expanded (%b); MESSAGE is
#   emitted verbatim (%s) so embedded backslashes are not interpreted.
print_success() {
  printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "${1-}"
}
# print_warning MESSAGE
#   Writes "[WARNING] MESSAGE" to stderr (diagnostics must not pollute
#   stdout), with the tag wrapped in ${YELLOW}/${NC}. The color variables are
#   escape-expanded (%b); MESSAGE is emitted verbatim (%s).
print_warning() {
  printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "${1-}" >&2
}
# print_error MESSAGE
#   Writes "[ERROR] MESSAGE" to stderr, with the tag wrapped in ${RED}/${NC}.
#   The color variables are escape-expanded (%b); MESSAGE is emitted verbatim
#   (%s). This helper only prints: callers decide whether to exit.
print_error() {
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "${1-}" >&2
}
# Function to check prerequisites
#
# Verifies the tooling this script drives:
#   * `docker` must be resolvable, otherwise the script exits with status 1.
#   * A Compose front end must exist. The `docker compose` plugin is probed
#     first, then a standalone `docker-compose`; the one found is stored in
#     the global COMPOSE_CMD. If neither is found the script exits 1.
#   * If the output of `docker info` does not contain "nvidia", two warnings
#     are printed; this is not fatal.
check_prerequisites() {
  print_status "Checking prerequisites..."

  # Check Docker
  command -v docker &> /dev/null || {
    print_error "Docker is not installed or not in PATH"
    exit 1
  }

  # Check Docker Compose (v1 or v2)
  local compose_frontend=""
  if docker compose version &> /dev/null; then
    compose_frontend="docker compose"
  elif command -v docker-compose &> /dev/null; then
    compose_frontend="docker-compose"
  fi
  if [[ -z "${compose_frontend}" ]]; then
    print_error "Docker Compose is not installed or not in PATH"
    exit 1
  fi
  COMPOSE_CMD="${compose_frontend}"

  # Check NVIDIA Docker support
  docker info 2> /dev/null | grep -q "nvidia" || {
    print_warning "NVIDIA Docker support not detected. GPU acceleration may not work."
    print_warning "Ensure nvidia-docker2 is installed and configured."
  }

  print_success "Prerequisites check passed"
}
# Function to setup environment
#
# Creates the `models` and `logs` directories in the current directory
# (mode 755) and (re)writes `.env` with the current values of MODEL_PATH,
# MODEL_NAME, MODEL_FORMAT, GPU_MEMORY_UTILIZATION and LOG_LEVEL, one
# KEY=value pair per line. An existing `.env` is overwritten.
setup_environment() {
  print_status "Setting up environment..."

  # Create directories
  local -a work_dirs=(models logs)
  mkdir -p "${work_dirs[@]}"
  chmod 755 "${work_dirs[@]}"

  # Create .env file
  {
    printf 'MODEL_PATH=%s\n' "${MODEL_PATH}"
    printf 'MODEL_NAME=%s\n' "${MODEL_NAME}"
    printf 'MODEL_FORMAT=%s\n' "${MODEL_FORMAT}"
    printf 'GPU_MEMORY_UTILIZATION=%s\n' "${GPU_MEMORY_UTILIZATION}"
    printf 'LOG_LEVEL=%s\n' "${LOG_LEVEL}"
  } > .env

  print_success "Environment setup complete"
}
| # Function to download model | |
| download_model() { | |
| print_status "Downloading model (this may take a while)..." | |
| if [ ! -d "models/${MODEL_NAME##*/}" ]; then | |
| print_status "Downloading ${MODEL_NAME}..." | |
| # Use HuggingFace Hub to download model | |
| if command -v huggingface-cli &> /dev/null; then | |
| huggingface-cli download ${MODEL_NAME} --local-dir models | |
| elif command -v git &> /dev/null; then | |
| git lfs install | |
| git clone https://huggingface.co/${MODEL_NAME} models/${MODEL_NAME##*/} | |
| else | |
| print_error "Neither huggingface-cli nor git is available for model download" | |
| exit 1 | |
| fi | |
| print_success "Model downloaded successfully" | |
| else | |
| print_warning "Model already exists, skipping download" | |
| fi | |
| } | |
# Function to start services
#
# Brings the Compose stack up in detached mode, waits a fixed 30 seconds,
# then inspects `ps` output. If any line contains "Up" the start is reported
# as successful; otherwise the Compose logs are printed and the script exits
# with status 1.
#
# Globals read: COMPOSE_CMD (set by check_prerequisites), COMPOSE_FILE.
# NOTE(review): the "Up" match succeeds when at least one service is up; it
# does not prove that every service started.
start_services() {
  print_status "Starting services..."
  # COMPOSE_CMD holds "docker compose" or "docker-compose" and must be
  # word-split into command + subcommand; the file path must not be.
  # shellcheck disable=SC2086
  ${COMPOSE_CMD} -f "${COMPOSE_FILE}" up -d

  print_status "Waiting for services to be ready..."
  sleep 30

  # Check if services are running
  # shellcheck disable=SC2086
  if ${COMPOSE_CMD} -f "${COMPOSE_FILE}" ps | grep -q "Up"; then
    print_success "Services started successfully"
  else
    print_error "Failed to start services"
    # shellcheck disable=SC2086
    ${COMPOSE_CMD} -f "${COMPOSE_FILE}" logs
    exit 1
  fi
}
# Function to check status
#
# Prints the Compose `ps` table, then probes http://localhost:8000/health
# with a short python3/urllib one-liner. A successful probe is reported via
# print_success, anything else (including python3 being unavailable) via
# print_warning; the function itself does not fail on an unhealthy server.
#
# Globals read: COMPOSE_CMD, COMPOSE_FILE.
check_status() {
  print_status "Checking service status..."
  # COMPOSE_CMD must word-split ("docker compose"); the file path must not.
  # shellcheck disable=SC2086
  ${COMPOSE_CMD} -f "${COMPOSE_FILE}" ps

  print_status "Health check..."
  # timeout=10 keeps the probe from blocking indefinitely when the port
  # accepts connections but the server never answers.
  if python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=10).read()" &> /dev/null; then
    print_success "vLLM server is healthy"
  else
    print_warning "vLLM server health check failed"
  fi
}
# Function to show usage
#
# Prints the help text to stdout: the invocation line (using $0), the
# supported options, and the environment variables the script mentions.
show_usage() {
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    "" \
    "Options:" \
    " -h, --help Show this help message" \
    " --no-model Skip model download" \
    " --force-download Force download even if model exists" \
    " --clean Clean up before deployment" \
    "" \
    "Environment variables:" \
    " MODEL_PATH Path to model directory" \
    " MODEL_NAME HuggingFace model name" \
    " MODEL_FORMAT Model format (hf, safetensors, etc.)" \
    " GPU_MEMORY_UTILIZATION GPU memory utilization (0.0-1.0)" \
    " LOG_LEVEL Log level (DEBUG, INFO, WARNING, ERROR)"
}
# Parse command line arguments
#
# Each flag starts as "false" and flips to "true" when its option is seen.
# -h/--help prints the usage text and exits 0; any unrecognized argument
# prints an error plus the usage text and exits 1. Every accepted option is
# shifted away, so no positional parameters remain after the loop.
NO_MODEL=false
FORCE_DOWNLOAD=false
CLEAN=false

while (( $# > 0 )); do
  case "$1" in
    -h | --help)
      show_usage
      exit 0
      ;;
    --no-model) NO_MODEL=true ;;
    --force-download) FORCE_DOWNLOAD=true ;;
    --clean) CLEAN=true ;;
    *)
      print_error "Unknown option: $1"
      show_usage
      exit 1
      ;;
  esac
  # Only reached by the flag arms; the help and error arms exit above.
  shift
done
# Clean up if requested
#
# With --clean, tears down the Compose stack (including volumes) and removes
# the local `models` and `logs` directories before deployment starts.
#
# COMPOSE_CMD is otherwise only assigned inside check_prerequisites, which
# runs later from main(). Resolve it here when it is still empty so that
# `--clean` does not try to execute "-f" as a command.
if [[ "${CLEAN}" == "true" ]]; then
  print_status "Cleaning up existing deployment..."

  if [[ -z "${COMPOSE_CMD:-}" ]]; then
    if docker compose version &> /dev/null; then
      COMPOSE_CMD="docker compose"
    elif command -v docker-compose &> /dev/null; then
      COMPOSE_CMD="docker-compose"
    else
      print_error "Docker Compose is not installed or not in PATH"
      exit 1
    fi
  fi

  # COMPOSE_CMD must word-split ("docker compose"); the file path must not.
  # shellcheck disable=SC2086
  ${COMPOSE_CMD} -f "${COMPOSE_FILE}" down -v
  rm -rf -- models logs
fi
# Main deployment process
#
# Runs the deployment steps in order: prerequisite check, environment setup,
# optional model download, service start-up and status check, then prints
# the service URLs and the commands for stopping the stack / tailing logs.
#
# Globals read: NO_MODEL, FORCE_DOWNLOAD, MODEL_NAME, COMPOSE_CMD,
# COMPOSE_FILE. Positional arguments are not used.
main() {
  print_status "Starting Stack 2.9 local deployment..."
  echo "==================================="

  check_prerequisites
  setup_environment

  # Download model if not skipped
  if [[ "${NO_MODEL}" != "false" ]]; then
    print_warning "Model download skipped as requested"
  elif [[ "${FORCE_DOWNLOAD}" == "true" ]] || [ ! -d "models/${MODEL_NAME##*/}" ]; then
    download_model
  else
    print_warning "Model exists and --force-download not specified, skipping download"
  fi

  start_services
  check_status

  print_success "Stack 2.9 deployment completed successfully!"
  printf '%s\n' \
    "" \
    "Service URLs:" \
    " vLLM API: http://localhost:8000" \
    " Prometheus: http://localhost:9090" \
    " Grafana: http://localhost:3000" \
    " Traefik Dashboard: http://localhost:8080" \
    "" \
    "Health check: http://localhost:8000/health" \
    "" \
    "To stop services: ${COMPOSE_CMD} -f ${COMPOSE_FILE} down" \
    "To view logs: ${COMPOSE_CMD} -f ${COMPOSE_FILE} logs -f"
}
| # Run main function. | |
| # The option-parsing loop earlier in the script shifts every accepted argument away, so "$@" is empty here; main takes its settings from the NO_MODEL and FORCE_DOWNLOAD globals. | |
| main "$@" | |