Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use my-ai-stack/Stack-2-9-finetuned with Transformers:

# High-level helper: let a pipeline handle tokenization and generation
from transformers import pipeline

generator = pipeline(
    task="text-generation",
    model="my-ai-stack/Stack-2-9-finetuned",
)
# A chat is a list of {"role", "content"} turns.
chat = [{"role": "user", "content": "Who are you?"}]
generator(chat)

# Load the tokenizer and model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned")

chat = [{"role": "user", "content": "Who are you?"}]

# Render the chat with the model's chat template and tokenize it to tensors.
encoded = tokenizer.apply_chat_template(
    chat,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
encoded = encoded.to(model.device)

generated = model.generate(**encoded, max_new_tokens=40)

# Slice off the prompt tokens so only the newly generated text is printed.
prompt_length = encoded["input_ids"].shape[-1]
print(tokenizer.decode(generated[0][prompt_length:]))

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use my-ai-stack/Stack-2-9-finetuned with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "my-ai-stack/Stack-2-9-finetuned"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/my-ai-stack/Stack-2-9-finetuned

SGLang

How to use my-ai-stack/Stack-2-9-finetuned with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "my-ai-stack/Stack-2-9-finetuned" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "my-ai-stack/Stack-2-9-finetuned" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
```
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
```

Stack-2-9-finetuned / stack / training / upload_hf.py

walidsobhie-code

refactor: Squeeze folders further - cleaner structure

65888d5 2 months ago

raw

history blame

9.66 kB

	#!/usr/bin/env python3
	"""
	Stack 2.9 HuggingFace Upload Script
	Pushes the trained model to HuggingFace Hub with proper model card.
	"""

	import argparse
	import os
	import sys
	from pathlib import Path

	# Add parent to path
	sys.path.insert(0, str(Path(__file__).parent.parent))


	def parse_args(argv=None):
	    """Parse the command-line options of the upload script.

	    Args:
	        argv: Optional list of argument strings. ``None`` (the default)
	            makes argparse read ``sys.argv[1:]``.

	    Returns:
	        argparse.Namespace with ``model_path``, ``repo_id``, ``token``,
	        ``private``, ``create_model_card``, ``push_to_hub`` and
	        ``add_spaces``.
	    """
	    parser = argparse.ArgumentParser(description="Upload Stack 2.9 to HuggingFace")
	    parser.add_argument(
	        "--model-path",
	        type=str,
	        default="./output/stack-2.9-quantized",
	        help="Path to quantized model"
	    )
	    parser.add_argument(
	        "--repo-id",
	        type=str,
	        required=True,
	        help="HuggingFace repo ID (e.g., 'username/stack-2.9')"
	    )
	    parser.add_argument(
	        "--token",
	        type=str,
	        default=None,
	        help="HuggingFace token (or set HF_TOKEN env var)"
	    )
	    parser.add_argument(
	        "--private",
	        action="store_true",
	        help="Create private repo"
	    )
	    # The two flags below default to True. A store_true flag with a True
	    # default can never be switched off, so each one gets a --no-...
	    # counterpart that writes False into the same destination.
	    parser.add_argument(
	        "--create-model-card",
	        action="store_true",
	        default=True,
	        help="Create model card automatically"
	    )
	    parser.add_argument(
	        "--no-create-model-card",
	        dest="create_model_card",
	        action="store_false",
	        help="Do not create a model card"
	    )
	    parser.add_argument(
	        "--push-to-hub",
	        action="store_true",
	        default=True,
	        help="Actually push to Hub (else just prepare locally)"
	    )
	    parser.add_argument(
	        "--no-push-to-hub",
	        dest="push_to_hub",
	        action="store_false",
	        help="Only prepare files locally, do not push to Hub"
	    )
	    parser.add_argument(
	        "--add-spaces",
	        action="store_true",
	        help="Create Gradio Spaces demo"
	    )
	    return parser.parse_args(argv)


	def get_token():
	    """Look up the HuggingFace token in the environment.

	    ``HF_TOKEN`` wins when it is set to a non-empty value; otherwise the
	    value of ``HUGGING_FACE_TOKEN`` is returned (``None`` when unset).
	    """
	    primary = os.environ.get("HF_TOKEN")
	    if primary:
	        return primary
	    return os.environ.get("HUGGING_FACE_TOKEN")


	def create_model_card(args, base_model: str = "Qwen/Qwen2.5-Coder-32B") -> str:
	    """Generate the model card (README.md) content.

	    Args:
	        args: Parsed CLI namespace; only ``args.repo_id`` is read.
	        base_model: Hub ID of the base model. The full ID goes into the
	            YAML front matter; the part after the last ``/`` is used as
	            the display name in the card text.

	    Returns:
	        The complete model card as a string, starting with the YAML
	        front matter.
	    """
	    # Display name without the organisation prefix, e.g. "Qwen2.5-Coder-32B".
	    base_name = base_model.split("/")[-1]

	    # Read existing benchmarks if available
	    benchmarks = ""
	    benchmarks_path = Path(__file__).parent.parent / "BENCHMARKS.md"
	    if benchmarks_path.exists():
	        # Explicit encoding so the result does not depend on the platform default.
	        benchmarks_content = benchmarks_path.read_text(encoding="utf-8")
	        # Keep the text after "## Results", cut at the next "#" character.
	        if "## Results" in benchmarks_content:
	            benchmarks = benchmarks_content.split("## Results")[1].split("#")[0]

	    # Doubled braces are literal braces inside the f-string.
	    model_card = f"""---
	title: Stack 2.9
	base_model: {base_model}
	tags:
	- stack-2.9
	- open-source
	- claude-competitor
	- code-generation
	- qwen
	- fine-tuned
	- transformers
	pipeline_tag: text-generation
	license: apache-2.0
	---

	# Stack 2.9

	Stack 2.9 is a fine-tuned version of {base_name}, specialized for code generation and software development tasks.

	## Model Details

	- Base Model: {base_name}
	- Training Data: Curated coding examples and educational content
	- Fine-tuning Method: LoRA + Merge
	- Quantization: 4-bit (bitsandbytes)

	## Quick Start

	```python
	from transformers import AutoModelForCausalLM, AutoTokenizer

	model = AutoModelForCausalLM.from_pretrained(
	"{args.repo_id}",
	torch_dtype="auto",
	device_map="auto"
	)

	tokenizer = AutoTokenizer.from_pretrained("{args.repo_id}")

	# Chat format
	messages = [
	{{"role": "system", "content": "You are Stack, a helpful coding assistant."}},
	{{"role": "user", "content": "Write a Python function to calculate fibonacci numbers"}}
	]

	text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	inputs = tokenizer(text, return_tensors="pt").to(model.device)

	outputs = model.generate(**inputs, max_new_tokens=512)
	print(tokenizer.decode(outputs[0], skip_special_tokens=True))
	```

	## Requirements

	```bash
	pip install "transformers>=4.40.0" "torch>=2.0.0" accelerate
	```

	## Inference with vLLM

	```bash
	vllm serve {args.repo_id} --dtype half
	```

	## Benchmarks

	{benchmarks}

	## Limitations

	- Trained on limited dataset; may not cover all edge cases
	- Context window: 32K tokens
	- Model may produce incorrect code; always verify outputs

	## License

	Apache 2.0 - See LICENSE file for details.

	## Citation

	```bibtex
	@misc{{stack-2.9,
	title = {{Stack 2.9}},
	author = {{Stack Team}},
	year = {{2024}},
	url = {{https://huggingface.co/{args.repo_id}}}
	}}
	```
	"""
	    return model_card


	def create_gradio_demo(repo_id: str) -> str:
	    """Create the source code of a simple Gradio demo app for the model.

	    Args:
	        repo_id: Hub repo ID; substituted for the ``{{REPO_ID}}``
	            placeholder in the generated code.

	    Returns:
	        Python source for an ``app.py`` that loads the model and serves a
	        ``gr.Interface`` with prompt, max-tokens and temperature inputs.
	    """
	    import textwrap

	    # A plain (non-f) string, so the single braces of the dict literals
	    # need no escaping. dedent() removes the margin shared by every line,
	    # so the generated file is indented correctly however this literal is
	    # indented here.
	    demo_code = textwrap.dedent('''\
	        import gradio as gr
	        from transformers import AutoModelForCausalLM, AutoTokenizer
	        import torch

	        # Load model
	        MODEL_NAME = "{{REPO_ID}}"

	        print("Loading model...")
	        model = AutoModelForCausalLM.from_pretrained(
	            MODEL_NAME,
	            torch_dtype=torch.float16,
	            device_map="auto",
	            trust_remote_code=True
	        )
	        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

	        def generate(prompt, max_tokens, temperature):
	            messages = [
	                {"role": "system", "content": "You are Stack, a helpful coding assistant."},
	                {"role": "user", "content": prompt}
	            ]

	            text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	            inputs = tokenizer(text, return_tensors="pt").to(model.device)

	            with torch.no_grad():
	                outputs = model.generate(
	                    **inputs,
	                    max_new_tokens=int(max_tokens),
	                    temperature=temperature,
	                    do_sample=True
	                )

	            # Decode only the newly generated tokens. Splitting the full text
	            # on the word "assistant" would cut off any answer containing it.
	            new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
	            return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

	        demo = gr.Interface(
	            fn=generate,
	            inputs=[
	                gr.Textbox(label="Prompt", placeholder="Write a Python function..."),
	                gr.Slider(32, 1024, value=512, step=32, label="Max Tokens"),
	                gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
	            ],
	            outputs=gr.Markdown(label="Response"),
	            title="Stack 2.9 Demo",
	            description="Try Stack 2.9 - a code generation model"
	        )

	        demo.launch()
	    ''')
	    return demo_code.replace("{{REPO_ID}}", repo_id)


	def _write_model_card(args):
	    """Render the model card and save it as README.md in the model folder."""
	    print("Creating model card...")
	    model_card = create_model_card(args)
	    model_card_path = os.path.join(args.model_path, "README.md")
	    # Explicit UTF-8 so the write does not depend on the platform default.
	    with open(model_card_path, "w", encoding="utf-8") as f:
	        f.write(model_card)
	    print(f" Created: {model_card_path}")


	def _push_model(args, token):
	    """Create the model repo if needed and upload the model folder.

	    Exits the process with status 1 when huggingface_hub is missing or the
	    upload fails.
	    """
	    print("\nPushing to HuggingFace Hub...")
	    try:
	        from huggingface_hub import HfApi, create_repo
	    except ImportError:
	        print("Error: huggingface_hub not installed")
	        print("Run: pip install huggingface_hub")
	        sys.exit(1)

	    try:
	        api = HfApi(token=token)

	        # Best effort: a failure here is reported but does not stop the
	        # upload attempt below.
	        try:
	            create_repo(
	                args.repo_id,
	                token=token,
	                private=args.private,
	                repo_type="model",
	                exist_ok=True
	            )
	            print(f" Repo created/verified: {args.repo_id}")
	        except Exception as e:
	            print(f" Repo creation: {e}")

	        # Upload model files
	        print(" Uploading model files...")
	        api.upload_folder(
	            folder_path=args.model_path,
	            repo_id=args.repo_id,
	            repo_type="model",
	            commit_message="Upload Stack 2.9 model"
	        )

	        print(f"\n✓ Successfully uploaded to https://huggingface.co/{args.repo_id}")
	    except Exception as e:
	        print(f"Upload failed: {e}")
	        sys.exit(1)


	def _create_spaces_demo(args, token):
	    """Write the Gradio demo files locally and, when pushing, create the Space repo.

	    The files go to ./stack-2.9-spaces. Only the Space repository itself is
	    created on the Hub; the generated files are not uploaded.
	    """
	    print("\nCreating Gradio Spaces demo...")
	    demo_code = create_gradio_demo(args.repo_id)
	    spaces_dir = "./stack-2.9-spaces"
	    os.makedirs(spaces_dir, exist_ok=True)

	    with open(os.path.join(spaces_dir, "app.py"), "w", encoding="utf-8") as f:
	        f.write(demo_code)

	    # Create requirements
	    requirements = [
	        "gradio",
	        "transformers>=4.40.0",
	        "torch>=2.0.0",
	        "accelerate",
	    ]
	    with open(os.path.join(spaces_dir, "requirements.txt"), "w", encoding="utf-8") as f:
	        f.write("\n".join(requirements) + "\n")

	    # Create Spaces config. UTF-8 is required: the front matter has an emoji.
	    readme_lines = [
	        "---",
	        "title: Stack 2.9 Demo",
	        "emoji: 🤖",
	        "colorFrom: blue",
	        "colorTo: purple",
	        "sdk: gradio",
	        "app_file: app.py",
	        "pinned: false",
	        "---",
	        "",
	        "# Stack 2.9 Gradio Demo",
	        "",
	        "Live demo of Stack 2.9 code generation model.",
	        "",
	        # The link uses the same "owner/name" ID the Space is created under.
	        f"[Launch on HuggingFace Spaces](https://huggingface.co/spaces/{args.repo_id})",
	    ]
	    with open(os.path.join(spaces_dir, "README.md"), "w", encoding="utf-8") as f:
	        f.write("\n".join(readme_lines) + "\n")

	    print(f" Created: {spaces_dir}/")

	    # Optionally create the Space repo
	    if args.push_to_hub:
	        try:
	            from huggingface_hub import create_repo
	            # Spaces and models are separate repo types, so the Space can
	            # reuse the model's ID unchanged.
	            spaces_repo = args.repo_id
	            create_repo(
	                spaces_repo,
	                token=token,
	                repo_type="space",
	                space_sdk="gradio",  # required for repo_type="space"
	                exist_ok=True
	            )
	            print(f" Spaces repo: https://huggingface.co/spaces/{spaces_repo}")
	        except Exception as e:
	            print(f" Spaces creation: {e}")


	def main():
	    """Run the upload workflow: model card, Hub upload, optional Spaces demo.

	    Returns:
	        0 on success. Fatal problems (missing token when pushing, missing
	        model path, failed upload) end the process via ``sys.exit(1)``.
	    """
	    args = parse_args()

	    token = args.token or get_token()
	    # A token is only needed when something is pushed to the Hub.
	    if args.push_to_hub and not token:
	        print("Error: No HuggingFace token provided")
	        print("Set HF_TOKEN environment variable or pass --token")
	        sys.exit(1)

	    print("=" * 60)
	    print("Stack 2.9 HuggingFace Upload")
	    print("=" * 60)
	    print(f"Model path: {args.model_path}")
	    print(f"Repo ID: {args.repo_id}")
	    print(f"Private: {args.private}")
	    print("=" * 60)

	    # Validate model path
	    if not os.path.exists(args.model_path):
	        print(f"Error: Model path {args.model_path} does not exist")
	        sys.exit(1)

	    if args.create_model_card:
	        _write_model_card(args)

	    if args.push_to_hub:
	        _push_model(args, token)

	    if args.add_spaces:
	        _create_spaces_demo(args, token)

	    print("\n✓ Upload complete!")
	    return 0


	# Script entry point: main()'s return value becomes the process exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())