Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use my-ai-stack/Stack-2-9-finetuned with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text-generation pipeline for the fine-tuned checkpoint.
pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned")
# Chat-style input: a list of {"role", "content"} message dicts.
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Run the pipeline on the conversation (the return value is not captured here).
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer and causal-LM weights are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Render the conversation with the tokenizer's chat template, requesting
# PyTorch tensors in a dict, then move the result to the model's device.
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

# Generate at most 40 new tokens.
outputs = model.generate(**inputs, max_new_tokens=40)
# Slice off the first input_ids-length positions so only the continuation is printed.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use my-ai-stack/Stack-2-9-finetuned with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "my-ai-stack/Stack-2-9-finetuned"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/my-ai-stack/Stack-2-9-finetuned

SGLang

How to use my-ai-stack/Stack-2-9-finetuned with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "my-ai-stack/Stack-2-9-finetuned" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "my-ai-stack/Stack-2-9-finetuned" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "my-ai-stack/Stack-2-9-finetuned",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
```
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
```

Stack-2-9-finetuned / stack / deploy / app.py

walidsobhie-code

refactor: Squeeze folders further - cleaner structure

65888d5 about 2 months ago

raw

history blame

19.8 kB

	"""
	Stack 2.9 - Pattern-Based AI Coding Assistant
	HuggingFace Spaces Demo

	A Gradio interface for Stack 2.9 powered by Qwen2.5-Coder-7B
	with tool integration and pattern memory.
	"""

	import json
	import os
	import time
	from datetime import datetime
	from typing import Dict, Generator, List, Optional

	import gradio as gr

	# ============================================================
	# Pattern Memory System
	# ============================================================

	class SelfEvolutionMemory:
	    """Simple in-memory pattern memory system for demo purposes.

	    Everything is kept in plain Python containers on the instance; nothing
	    is persisted.

	    Attributes:
	        conversations: One dict per recorded interaction (timestamp, user
	            input, assistant response, tools used, interaction id).
	        learned_patterns: Maps a tool name to
	            ``{"count", "contexts", "last_used"}`` where ``last_used`` is the
	            interaction counter value at the tool's latest use.
	        code_snippets: Responses that contained a fenced code block.
	        preferences: Reserved dict; not read or written by this class.
	        interaction_count: Number of interactions recorded so far.
	    """

	    def __init__(self):
	        self.conversations = []
	        self.learned_patterns = {}
	        self.code_snippets = []
	        self.preferences = {}
	        self.interaction_count = 0

	    def add_interaction(
	        self,
	        user_input: str,
	        assistant_response: str,
	        tools_used: Optional[List[str]] = None,
	    ):
	        """Record an interaction for learning.

	        Args:
	            user_input: The user's message.
	            assistant_response: The reply that was shown to the user.
	            tools_used: Names of the tools associated with this turn;
	                ``None`` is treated as "no tools".
	        """
	        tools = tools_used or []
	        self.interaction_count += 1
	        self.conversations.append({
	            "timestamp": datetime.now().isoformat(),
	            "user_input": user_input,
	            "assistant_response": assistant_response,
	            "tools_used": tools,
	            "interaction_id": self.interaction_count,
	        })

	        # Extract patterns from the interaction
	        self._learn_from_interaction(user_input, assistant_response, tools)

	    def _learn_from_interaction(self, user_input: str, response: str, tools: List[str]):
	        """Update tool-usage counters and collect code-bearing responses."""
	        # Track tool usage patterns
	        for tool in tools:
	            pattern = self.learned_patterns.setdefault(
	                tool, {"count": 0, "contexts": [], "last_used": 0}
	            )
	            pattern["count"] += 1
	            # Only the first 100 characters of the prompt are kept as context.
	            pattern["contexts"].append(user_input[:100])
	            # Remember when the tool was last used so get_stats() can report
	            # genuinely recent tools rather than the first ones ever seen.
	            pattern["last_used"] = self.interaction_count

	        # Extract code snippets if present
	        if "```" in response:
	            self.code_snippets.append({
	                "timestamp": datetime.now().isoformat(),
	                "snippet": response,
	            })

	    def get_context(self) -> str:
	        """Get accumulated context for the model.

	        Returns:
	            A Markdown summary: a header with the interaction count, the five
	            most-used tools (if any), and the number of stored code snippets
	            (if any).
	        """
	        context_parts = [f"## Pattern Memory ({self.interaction_count} interactions)"]

	        if self.learned_patterns:
	            context_parts.append("\n### Tool Usage Patterns:")
	            most_used = sorted(
	                self.learned_patterns.items(),
	                key=lambda item: item[1]["count"],
	                reverse=True,
	            )[:5]
	            for tool, data in most_used:
	                context_parts.append(f"- {tool}: used {data['count']} times")

	        if self.code_snippets:
	            context_parts.append(f"\n### Learned {len(self.code_snippets)} code patterns")

	        return "\n".join(context_parts)

	    def get_stats(self) -> Dict:
	        """Get memory statistics.

	        Returns:
	            Dict with ``total_interactions``, ``tool_patterns``,
	            ``code_snippets`` (counts) and ``recent_tools`` (up to five tool
	            names, most recently used first).
	        """
	        recent_tools = sorted(
	            self.learned_patterns,
	            # .get() tolerates entries created without a "last_used" key.
	            key=lambda name: self.learned_patterns[name].get("last_used", 0),
	            reverse=True,
	        )[:5]
	        return {
	            "total_interactions": self.interaction_count,
	            "tool_patterns": len(self.learned_patterns),
	            "code_snippets": len(self.code_snippets),
	            "recent_tools": recent_tools,
	        }


	# Global memory instance: created once at import time and shared by the
	# prompt builders, chat handlers and stats widgets in this module.
	memory = SelfEvolutionMemory()

	# ============================================================
	# Tool System
	# ============================================================

	class Tool:
	    """Base tool class.

	    Bundles a tool's public name and human-readable description with the
	    coroutine function that implements it.
	    """

	    def __init__(self, name: str, description: str, func):
	        """Store the tool metadata.

	        Args:
	            name: Identifier the tool is registered under.
	            description: One-line summary shown in the system prompt.
	            func: Async callable implementing the tool.
	        """
	        self.name = name
	        self.description = description
	        self.func = func

	    async def execute(self, *args, **kwargs):
	        """Await the wrapped coroutine function and return its result.

	        All positional and keyword arguments are forwarded unchanged, so
	        zero-argument tools and tools with keyword parameters can both be
	        invoked through the same entry point.
	        """
	        return await self.func(*args, **kwargs)


	# Tool implementations (simplified for demo)
	async def tool_file_read(path: str) -> str:
	    """Read a file.

	    Args:
	        path: Path of the text file to read.

	    Returns:
	        At most the first 5000 characters of the file, or a human-readable
	        error string (errors are reported in-band, never raised).
	    """
	    try:
	        # Explicit UTF-8 keeps results independent of the host locale.
	        with open(path, 'r', encoding="utf-8") as f:
	            # Ask for 5000 characters up front instead of loading the whole
	            # file and slicing, so huge files do not blow up memory.
	            return f.read(5000)  # Limit output
	    except FileNotFoundError:
	        return f"File not found: {path}"
	    except Exception as e:
	        return f"Error reading file: {str(e)}"


	async def tool_file_write(path: str, content: str) -> str:
	    """Write to a file.

	    Missing parent directories are created first. Any existing file at
	    ``path`` is overwritten.

	    Args:
	        path: Destination file path.
	        content: Text to write.

	    Returns:
	        A success message, or a human-readable error string (errors are
	        reported in-band, never raised).
	    """
	    try:
	        # A bare filename has an empty dirname; fall back to the current dir.
	        parent = os.path.dirname(path) or "."
	        os.makedirs(parent, exist_ok=True)
	        # Explicit UTF-8 so non-ASCII content (e.g. emoji) is written
	        # correctly regardless of the host locale.
	        with open(path, 'w', encoding="utf-8") as f:
	            f.write(content)
	        return f"Successfully wrote to {path}"
	    except Exception as e:
	        return f"Error writing file: {str(e)}"


	async def tool_git_status() -> str:
	    """Get git status.

	    Runs ``git status --short`` in the process's current working directory.

	    Returns:
	        The short status output, ``"No changes"`` when git succeeds with no
	        output, or a ``"Git error: ..."`` string when git cannot be run,
	        times out, or exits with a non-zero status (for example outside a
	        repository).
	    """
	    import subprocess

	    try:
	        result = subprocess.run(
	            ["git", "status", "--short"],
	            capture_output=True,
	            text=True,
	            timeout=30,  # never let a hung git block the handler forever
	        )
	    except Exception as e:
	        return f"Git error: {str(e)}"

	    # A failing git prints nothing on stdout; without this check a failure
	    # would be reported as "No changes".
	    if result.returncode != 0:
	        detail = result.stderr.strip() or f"git exited with status {result.returncode}"
	        return f"Git error: {detail}"

	    return result.stdout or "No changes"


	async def tool_web_search(query: str) -> str:
	    """Search the web.

	    Demo stub: no network request is made. The query is only echoed back
	    inside a canned, placeholder result list.

	    Args:
	        query: Search terms to echo in the placeholder output.

	    Returns:
	        The placeholder result text.
	    """
	    # Return a demo response since we can't make actual API calls
	    return f"🔍 Search results for '{query}':\n\n1. [Result 1] - Description here\n2. [Result 2] - Description here\n3. [Result 3] - Description here\n\n(Install brave-search to enable real search)"


	async def tool_run_command(cmd: str) -> str:
	    """Run a shell command.

	    Args:
	        cmd: Command line, interpreted by the system shell.

	    Returns:
	        The command's stdout when stderr is empty; otherwise a combined
	        ``Output:`` / ``Errors:`` report. A non-zero exit status is appended
	        as ``Exit code: N`` so silent failures are visible. If the command
	        cannot be started or exceeds the 30 second timeout, a
	        ``"Command error: ..."`` string is returned instead.
	    """
	    import subprocess

	    # SECURITY: `cmd` is handed to the shell verbatim (shell=True). That is
	    # this tool's purpose, but it means arbitrary code execution for whoever
	    # controls `cmd`; only expose it to trusted callers / sandboxed hosts.
	    try:
	        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=30)
	    except Exception as e:
	        return f"Command error: {str(e)}"

	    if result.stderr:
	        report = f"Output:\n{result.stdout}\n\nErrors:\n{result.stderr}"
	    else:
	        report = result.stdout

	    # Surface failures that produced no stderr (e.g. `exit 3`), which would
	    # otherwise come back as an empty string.
	    if result.returncode != 0:
	        exit_note = f"Exit code: {result.returncode}"
	        report = f"{report}\n\n{exit_note}" if report else exit_note

	    return report


	async def tool_create_directory(path: str) -> str:
	    """Create ``path`` (and any missing parents) as a directory.

	    An already existing directory is not an error. Returns a confirmation
	    message, or ``"Error: ..."`` describing why creation failed.
	    """
	    try:
	        os.makedirs(path, exist_ok=True)
	    except Exception as err:
	        return f"Error: {err}"
	    return f"Directory created: {path}"


	async def tool_list_directory(path: str = ".") -> str:
	    """Describe the entries of ``path``, one per line.

	    Only the first 50 names returned by ``os.listdir`` are shown.
	    Directories are rendered as ``📁 name/`` and everything else as
	    ``📄 name``. On failure an ``"Error: ..."`` string is returned.
	    """
	    try:
	        rendered = []
	        for entry in os.listdir(path)[:50]:
	            if os.path.isdir(os.path.join(path, entry)):
	                rendered.append(f"📁 {entry}/")
	            else:
	                rendered.append(f"📄 {entry}")
	        return "\n".join(rendered)
	    except Exception as err:
	        return f"Error: {err}"


	# Register tools
	# Registry of every available tool, keyed by the tool's own name. Built from
	# (name, description, coroutine function) triples so the key and Tool.name
	# cannot drift apart.
	TOOLS = {
	    spec[0]: Tool(*spec)
	    for spec in (
	        ("file_read", "Read a file from the filesystem", tool_file_read),
	        ("file_write", "Write content to a file", tool_file_write),
	        ("git_status", "Check git repository status", tool_git_status),
	        ("web_search", "Search the web for information", tool_web_search),
	        ("run_command", "Execute a shell command", tool_run_command),
	        ("create_directory", "Create a new directory", tool_create_directory),
	        ("list_directory", "List files in a directory", tool_list_directory),
	    )
	}


	def get_tool_descriptions() -> str:
	    """Render the tool registry as a bullet list.

	    Returns one ``- name: description`` line per entry of ``TOOLS``, in
	    registry order, joined with newlines.
	    """
	    bullet_lines = []
	    for tool in TOOLS.values():
	        bullet_lines.append(f"- {tool.name}: {tool.description}")
	    return "\n".join(bullet_lines)


	# ============================================================
	# Model Interface
	# ============================================================

	class StackModel:
	    """Stack 2.9 model interface using transformers.

	    The instance starts empty; ``load()`` populates ``tokenizer`` and
	    ``model``. Both generation methods return a "not loaded" message until
	    *both* are available, so a half-initialised instance is safe to call.
	    """

	    def __init__(self, model_id: str = "Qwen/Qwen2.5-Coder-7B-Instruct"):
	        """Remember the checkpoint name; nothing is downloaded here."""
	        self.model_id = model_id
	        self.model = None
	        self.tokenizer = None
	        self.pipeline = None

	    def load(self):
	        """Load the model with 4-bit quantization for HF Spaces."""
	        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
	        import torch

	        print(f"Loading {self.model_id}...")

	        # 4-bit quantization config for 16GB GPU
	        bnb_config = BitsAndBytesConfig(
	            load_in_4bit=True,
	            bnb_4bit_compute_dtype=torch.float16,
	            bnb_4bit_use_double_quant=True,
	            bnb_4bit_quant_type="nf4"
	        )

	        # Load tokenizer
	        self.tokenizer = AutoTokenizer.from_pretrained(
	            self.model_id,
	            trust_remote_code=True
	        )

	        # Load model with quantization
	        self.model = AutoModelForCausalLM.from_pretrained(
	            self.model_id,
	            quantization_config=bnb_config,
	            device_map="auto",
	            trust_remote_code=True
	        )

	        print("Model loaded successfully!")

	    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
	        """Generate a response.

	        Args:
	            prompt: The user's message.
	            max_tokens: Upper bound on newly generated tokens.
	            temperature: Sampling temperature; values <= 0 switch to greedy
	                decoding instead of sampling.

	        Returns:
	            The generated continuation with surrounding whitespace removed,
	            or a "not loaded" notice if ``load()`` has not completed.
	        """
	        # load() sets the tokenizer before the model, so both must be checked.
	        if self.model is None or self.tokenizer is None:
	            return "Model not loaded. Please wait for initialization."

	        # Build the prompt with system and tools
	        system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.

	## Available Tools
	{get_tool_descriptions()}

	## Your Capabilities
	- Write, read, and execute code
	- Use git for version control
	- Search the web for information
	- Create and manage files
	- Execute shell commands

	## Self-Evolution
	You learn from each interaction. After responding, summarize what tools you used.

	{memory.get_context()}

	## Instructions
	1. Be helpful and concise
	2. Use tools when needed
	3. Learn from the conversation
	4. Provide code examples when relevant

	Now respond to the user:"""

	        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"

	        # Tokenize
	        inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)

	        # Generate. Sampling parameters are only passed when sampling is on,
	        # because a zero temperature is not valid for sampling.
	        sampling = temperature > 0
	        generation_kwargs = {
	            "max_new_tokens": max_tokens,
	            "do_sample": sampling,
	            "repetition_penalty": 1.1,
	        }
	        if sampling:
	            generation_kwargs["temperature"] = temperature
	            generation_kwargs["top_p"] = 0.9
	        outputs = self.model.generate(**inputs, **generation_kwargs)

	        # Decode only the newly generated tokens. Splitting the full text on
	        # "Assistant:" would cut the answer short whenever that marker also
	        # occurs in the user's message or in the generated text.
	        prompt_length = inputs["input_ids"].shape[-1]
	        response = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

	        return response.strip()

	    def generate_streaming(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7):
	        """Generate with streaming (yields text pieces as they are produced).

	        Args:
	            prompt: The user's message.
	            max_tokens: Upper bound on newly generated tokens.
	            temperature: Sampling temperature; values <= 0 pick the most
	                likely token at every step.

	        Yields:
	            Successive non-empty text fragments whose concatenation is the
	            response, or a single "not loaded" notice if ``load()`` has not
	            completed.
	        """
	        # load() sets the tokenizer before the model, so both must be checked.
	        if self.model is None or self.tokenizer is None:
	            yield "Model not loaded. Please wait for initialization."
	            return

	        # torch is only imported inside load(); import it here as well so the
	        # sampling loop below does not fail with a NameError.
	        import torch

	        system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.

	## Available Tools
	{get_tool_descriptions()}

	## Self-Evolution Memory
	{memory.get_context()}

	Now respond to the user:"""

	        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"

	        inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)

	        eos_token_id = self.tokenizer.eos_token_id
	        # First step feeds the whole prompt; later steps feed only the newest
	        # token and reuse the cached keys/values, avoiding a full re-run of
	        # the growing sequence for every generated token.
	        step_input = inputs.input_ids
	        past_key_values = None
	        generated_token_ids = []
	        decoded_text = ""
	        emitted_text = ""

	        # Generate token by token
	        for _ in range(max_tokens):
	            with torch.no_grad():
	                outputs = self.model(
	                    input_ids=step_input,
	                    past_key_values=past_key_values,
	                    use_cache=True,
	                )
	            past_key_values = outputs.past_key_values
	            # Work in float32 for a numerically stable softmax.
	            next_token_logits = outputs.logits[:, -1, :].float()

	            if temperature > 0:
	                # Apply temperature, then sample
	                probs = torch.softmax(next_token_logits / temperature, dim=-1)
	                next_token = torch.multinomial(probs, num_samples=1)
	            else:
	                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

	            token_id = next_token.item()

	            # Stop on EOS (it carries no visible text, so nothing is yielded)
	            if token_id == eos_token_id:
	                break

	            generated_token_ids.append(token_id)
	            step_input = next_token

	            # Decode everything generated so far and emit only the new tail:
	            # a single character may span several tokens, so decoding tokens
	            # one at a time can produce replacement characters.
	            decoded_text = self.tokenizer.decode(generated_token_ids, skip_special_tokens=True)
	            if decoded_text.endswith("\ufffd"):
	                # Incomplete multi-byte character; wait for the next token.
	                continue
	            fragment = decoded_text[len(emitted_text):]
	            emitted_text = decoded_text
	            if fragment:
	                yield fragment

	        # Flush any text that was held back when generation ended.
	        if len(decoded_text) > len(emitted_text):
	            yield decoded_text[len(emitted_text):]


	# Global model instance. Starts as None; rebound by initialize_model() and by
	# the loader thread in the __main__ block. The chat handlers treat None as
	# "still loading".
	model = None


	def initialize_model():
	    """Initialize the model on startup.

	    Builds a ``StackModel`` with the default checkpoint and loads it. The
	    module-level ``model`` is only rebound once loading has succeeded, so
	    other code never observes a partially loaded instance and a failed load
	    leaves the previous value untouched.

	    Returns:
	        The loaded ``StackModel``, or ``None`` if loading failed (the error
	        is printed, not raised).
	    """
	    global model
	    try:
	        candidate = StackModel()
	        candidate.load()
	    except Exception as e:
	        # Best-effort startup: report the failure and let the UI keep
	        # showing its "loading" message instead of crashing the app.
	        print(f"Failed to load model: {e}")
	        return None
	    model = candidate
	    return model


	# ============================================================
	# Gradio Interface
	# ============================================================

	def format_tools_used(tools_used: List[str]) -> str:
	    """Build the "Tools Used" footer appended to a chat reply.

	    Returns an empty string when no tools were used; otherwise a
	    blank-line-separated footer listing the tool names, comma separated.
	    """
	    if tools_used:
	        joined_names = ", ".join(tools_used)
	        return f"\n\n🔧 Tools Used: {joined_names}"
	    return ""


	def chat_response(message: str, history: List[List[str]]) -> str:
	    """Process a chat message and return response.

	    Args:
	        message: The user's message.
	        history: Prior chat turns; accepted for interface compatibility and
	            not read here.

	    Returns:
	        Always a string: the model's reply followed by a "Tools Used"
	        footer when keywords matched, or a "loading" notice while the model
	        is unavailable. (The loading path previously returned a tuple, which
	        callers appended to the chat history as if it were text.)
	    """
	    if model is None or model.model is None:
	        return "⏳ Model is loading. Please wait..."

	    # Check if we need to use tools based on the message. Matching is a plain
	    # case-insensitive substring test; the tools are only *tagged* for the
	    # footer and for pattern memory, they are not executed here.
	    # NOTE(review): short keywords such as 'git' and 'ls' also match inside
	    # longer words (e.g. "digit", "tools") - confirm whether that is intended.
	    keyword_map = (
	        ("git_status", ('git status', 'git')),
	        ("web_search", ('search', 'find', 'look up')),
	        ("list_directory", ('list files', 'directory', 'ls')),
	        ("run_command", ('run ', 'execute', 'command')),
	    )
	    message_lower = message.lower()
	    tools_used = [
	        tool_name
	        for tool_name, keywords in keyword_map
	        if any(kw in message_lower for kw in keywords)
	    ]

	    # Generate response
	    try:
	        response = model.generate(message, max_tokens=512)
	    except Exception as e:
	        # Show the failure in the chat instead of breaking the UI callback.
	        response = f"I encountered an error: {str(e)}"

	    # Add tools used to response
	    response += format_tools_used(tools_used)

	    # Record in memory
	    memory.add_interaction(message, response, tools_used)

	    return response


	def chat_response_stream(message: str, history: List[List[str]]) -> Generator[str, None, None]:
	    """Process a chat message with streaming.

	    Args:
	        message: The user's message.
	        history: Prior chat turns; accepted for interface compatibility and
	            not read here.

	    Yields:
	        The response accumulated so far after every streamed fragment (each
	        yield is the full text to display, not a delta). While the model is
	        unavailable a single "loading" notice is yielded instead.
	    """
	    if model is None or model.model is None:
	        yield "⏳ Model is loading. Please wait..."
	        return

	    # Tag tools by case-insensitive substring match; they are recorded for
	    # the footer and pattern memory, not executed here.
	    keyword_map = (
	        ("git_status", ('git status', 'git')),
	        ("web_search", ('search', 'find')),
	        ("list_directory", ('list', 'directory')),
	    )
	    message_lower = message.lower()
	    tools_used = [
	        tool_name
	        for tool_name, keywords in keyword_map
	        if any(kw in message_lower for kw in keywords)
	    ]

	    full_response = ""

	    # Stream the response
	    try:
	        for token in model.generate_streaming(message, max_tokens=256):
	            full_response += token
	            yield full_response
	    except Exception as e:
	        # Mirror chat_response(): report the failure in the chat rather than
	        # aborting the stream, keeping whatever text was already produced.
	        error_note = f"I encountered an error: {str(e)}"
	        full_response = f"{full_response}\n\n{error_note}" if full_response else error_note
	        yield full_response

	    # Add tools used
	    if tools_used:
	        full_response += format_tools_used(tools_used)
	        yield full_response

	    # Record in memory
	    memory.add_interaction(message, full_response, tools_used)


	# Example prompts for the UI
	# Sample messages offered through gr.Examples in create_gradio_app(); several
	# contain the keywords that chat_response() uses to tag tools (git, search,
	# list files).
	EXAMPLE_PROMPTS = [
	"Hello! What can you help me with?",
	"Check git status of this repository",
	"Search for best practices for Python async programming",
	"List the files in the current directory",
	"Write a simple Python function to calculate fibonacci",
	"How do I use Git to create a new branch?",
	"What's your memory of our conversation?",
	]


	def create_gradio_app():
	    """Create the Gradio interface.

	    Builds the Blocks layout (header, memory stats, chatbot, message box,
	    example prompts, memory viewer, footer) and wires the event handlers.

	    Returns:
	        The constructed ``gr.Blocks`` app, not yet launched.
	    """

	    with gr.Blocks(
	        title="Stack 2.9 - Pattern-Based AI Coding Assistant",
	        # gr.themes.Soft is configured through hue names (primary_hue /
	        # secondary_hue / neutral_hue); it has no primary_color,
	        # secondary_color or tertiary_color parameters. The hex values used
	        # before (#6366f1, #818cf8, #a5b4fc) are shades of indigo.
	        theme=gr.themes.Soft(
	            primary_hue="indigo",
	            secondary_hue="indigo"
	        )
	    ) as app:

	        # Header
	        gr.Markdown("""
	# 🚀 Stack 2.9 - Pattern-Based AI Coding Assistant

	Powered by Qwen2.5-Coder-7B with 4-bit quantization

	---
	""")

	        # Memory stats display
	        with gr.Row():
	            with gr.Column(scale=1):
	                stats_display = gr.Markdown(
	                    "📊 Memory Stats\n\n- Interactions: 0\n- Tools learned: 0\n- Code patterns: 0",
	                    elem_id="stats"
	                )
	            with gr.Column(scale=3):
	                pass  # Spacer

	        # Chat interface
	        chatbot = gr.Chatbot(
	            height=500,
	            show_copy_button=True,
	            bubble_full_width=False
	        )

	        with gr.Row():
	            msg = gr.Textbox(
	                label="Message",
	                placeholder="Ask me anything...",
	                scale=4,
	                lines=3
	            )
	            submit_btn = gr.Button("Send", variant="primary", scale=1)

	        # Clear button
	        with gr.Row():
	            clear_btn = gr.Button("🗑️ Clear Chat")

	        # Example prompts
	        gr.Examples(
	            examples=EXAMPLE_PROMPTS,
	            inputs=msg,
	            label="Example Prompts"
	        )

	        # Memory visualization
	        with gr.Accordion("🧠 Self-Evolution Memory", open=False):
	            memory_display = gr.Textbox(
	                label="Memory Content",
	                lines=10,
	                interactive=False
	            )

	        # Functions
	        def respond(message, history):
	            """Answer `message`, append the turn, and clear the textbox."""
	            response = chat_response(message, history)
	            # Build a new list instead of mutating the component's value in
	            # place; also tolerates an unset (None) history.
	            updated_history = list(history or [])
	            updated_history.append([message, response])
	            return "", updated_history

	        def update_stats():
	            """Render the current memory statistics as Markdown."""
	            stats = memory.get_stats()
	            return f"""📊 Memory Stats

	- Interactions: {stats['total_interactions']}
	- Tool Patterns: {stats['tool_patterns']}
	- Code Snippets: {stats['code_snippets']}

	Recent Tools: {', '.join(stats['recent_tools']) if stats['recent_tools'] else 'None'}"""

	        def update_memory():
	            """Return the memory summary shown in the accordion."""
	            return memory.get_context()

	        def clear_chat():
	            """Reset the chatbot to no turns and empty the textbox."""
	            return [], ""

	        # Button click handlers
	        # NOTE(review): both events register api_name="send"; confirm how the
	        # installed Gradio version resolves the duplicate endpoint name.
	        submit_btn.click(respond, [msg, chatbot], [msg, chatbot], api_name="send")
	        msg.submit(respond, [msg, chatbot], [msg, chatbot], api_name="send")

	        clear_btn.click(clear_chat, outputs=[chatbot, msg])

	        # Refresh the stats and memory panels whenever the chat changes
	        chatbot.change(update_stats, outputs=[stats_display])
	        chatbot.change(update_memory, outputs=[memory_display])

	        # Footer
	        gr.Markdown("""
	---
	### About Stack 2.9

	Stack 2.9 is a pattern-based AI coding assistant that:
	- 🔍 Uses Qwen2.5-Coder-7B (4-bit, ~4GB VRAM)
	- 🛠️ Integrates 7 tools (file, git, web, search, shell)
	- 🧠 Remembers interactions and learns patterns
	- ⚡ Provides fast, streaming responses

	Deployed on HuggingFace Spaces with Gradio
	""")

	    return app


	# ============================================================
	# Main Entry Point
	# ============================================================

	if __name__ == "__main__":
	    # Script entry point: parse CLI flags, start loading the model in the
	    # background, then build and serve the Gradio app.
	    import argparse
	    import threading

	    parser = argparse.ArgumentParser(description="Stack 2.9 - HuggingFace Spaces Demo")
	    parser.add_argument("--share", action="store_true", help="Create a public share link")
	    parser.add_argument("--port", type=int, default=7860, help="Port to run on")
	    parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-Coder-7B-Instruct", help="Model ID")
	    args = parser.parse_args()

	    print("=" * 50)
	    print("🚀 Stack 2.9 - Pattern-Based AI Coding Assistant")
	    print("=" * 50)
	    print(f"Model: {args.model}")
	    print("Loading model...")

	    # Initialize model in a thread so the UI can come up immediately.
	    def load_model_thread():
	        """Load the checkpoint selected with --model and publish it."""
	        global model
	        try:
	            # Honour --model: the flag was parsed and printed but the
	            # default checkpoint was always the one being loaded.
	            candidate = StackModel(args.model)
	            candidate.load()
	        except Exception as e:
	            # Keep serving the UI (with its "loading" notice) on failure.
	            print(f"Failed to load model: {e}")
	            return
	        # Publish only after a successful load.
	        model = candidate

	    # Daemon thread: a still-running load must not keep the process alive
	    # after the server has been shut down.
	    loader_thread = threading.Thread(target=load_model_thread, daemon=True)
	    loader_thread.start()

	    # Create and launch app
	    app = create_gradio_app()

	    print(f"\n🚀 Launching Gradio on port {args.port}...")
	    print("📝 Note: Model loads in background. Chat will work once loaded.\n")

	    app.launch(
	        server_name="0.0.0.0",  # listen on all interfaces
	        server_port=args.port,
	        share=args.share
	    )