Text Generation
Transformers
English
qwen2
code-generation
python
fine-tuning
Qwen
tools
agent-framework
multi-agent
conversational
Eval Results (legacy)
Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use my-ai-stack/Stack-2-9-finetuned with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use my-ai-stack/Stack-2-9-finetuned with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "my-ai-stack/Stack-2-9-finetuned"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "my-ai-stack/Stack-2-9-finetuned",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
```
Use Docker
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
- SGLang
How to use my-ai-stack/Stack-2-9-finetuned with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "my-ai-stack/Stack-2-9-finetuned" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "my-ai-stack/Stack-2-9-finetuned",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "my-ai-stack/Stack-2-9-finetuned" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "my-ai-stack/Stack-2-9-finetuned",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
```
- Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
| """ | |
| Stack 2.9 - Pattern-Based AI Coding Assistant | |
| HuggingFace Spaces Demo | |
| A Gradio interface for Stack 2.9 powered by Qwen2.5-Coder-7B | |
| with tool integration and pattern memory. | |
| """ | |
import json
import os
import re
import time
from datetime import datetime
from typing import Dict, Generator, List, Optional

import gradio as gr
| # ============================================================ | |
| # Pattern Memory System | |
| # ============================================================ | |
class SelfEvolutionMemory:
    """Simple in-memory pattern memory system for demo purposes.

    All state lives on the instance; nothing is written to disk.  The
    interaction log, per-tool contexts and snippet list are never trimmed, so
    they grow for as long as the instance is alive.
    """

    def __init__(self):
        # Every recorded interaction, oldest first.
        self.conversations = []
        # tool name -> {"count": int, "contexts": [first 100 chars of user input]}
        self.learned_patterns = {}
        # Responses that contained a ``` fence, each with a timestamp.
        self.code_snippets = []
        # Not read or written by any method of this class.
        self.preferences = {}
        # Number of add_interaction() calls; also used as the interaction id.
        self.interaction_count = 0

    def add_interaction(self, user_input: str, assistant_response: str,
                        tools_used: Optional[List[str]] = None):
        """Record one user/assistant exchange and update the derived patterns.

        Args:
            user_input: The user's message.
            assistant_response: The reply shown to the user.
            tools_used: Names of the tools associated with this exchange;
                ``None`` is treated as "no tools".
        """
        # Copy so later mutation of the caller's list cannot alter the log.
        tools = list(tools_used) if tools_used else []
        self.interaction_count += 1
        self.conversations.append({
            "timestamp": datetime.now().isoformat(),
            "user_input": user_input,
            "assistant_response": assistant_response,
            "tools_used": tools,
            "interaction_id": self.interaction_count,
        })
        self._learn_from_interaction(user_input, assistant_response, tools)

    def _learn_from_interaction(self, user_input: str, response: str, tools: List[str]):
        """Update per-tool usage counters and store responses that contain code."""
        for tool in tools:
            entry = self.learned_patterns.setdefault(tool, {"count": 0, "contexts": []})
            entry["count"] += 1
            # Only a short prefix of the prompt is kept as context.
            entry["contexts"].append(user_input[:100])

        # A ``` fence anywhere in the response marks it as containing code;
        # the whole response is stored, not just the fenced part.
        if "```" in response:
            self.code_snippets.append({
                "timestamp": datetime.now().isoformat(),
                "snippet": response,
            })

    def _recent_tools(self, limit: int = 5) -> List[str]:
        """Return up to *limit* distinct tool names, most recently used first."""
        recent: List[str] = []
        for interaction in reversed(self.conversations):
            for tool in interaction["tools_used"]:
                if tool not in recent:
                    recent.append(tool)
                    if len(recent) == limit:
                        return recent
        return recent

    def get_context(self) -> str:
        """Return a Markdown summary of the memory for inclusion in a prompt.

        Lists the five most-used tools (by count) and the number of stored
        code snippets, each section only when it is non-empty.
        """
        context_parts = [f"## Pattern Memory ({self.interaction_count} interactions)"]
        if self.learned_patterns:
            context_parts.append("\n### Tool Usage Patterns:")
            by_count = sorted(
                self.learned_patterns.items(),
                key=lambda item: item[1]["count"],
                reverse=True,
            )
            for tool, data in by_count[:5]:
                context_parts.append(f"- {tool}: used {data['count']} times")
        if self.code_snippets:
            context_parts.append(f"\n### Learned {len(self.code_snippets)} code patterns")
        return "\n".join(context_parts)

    def get_stats(self) -> Dict:
        """Return summary counters plus the most recently used tool names.

        ``recent_tools`` holds at most five names ordered from most to least
        recently used, derived from the interaction log.
        """
        return {
            "total_interactions": self.interaction_count,
            "tool_patterns": len(self.learned_patterns),
            "code_snippets": len(self.code_snippets),
            "recent_tools": self._recent_tools(5),
        }
# Global memory instance shared by the prompt builders in StackModel, the chat
# handlers and the stats/memory panels of the UI.
memory = SelfEvolutionMemory()
| # ============================================================ | |
| # Tool System | |
| # ============================================================ | |
class Tool:
    """A named, described wrapper around an awaitable callable."""

    def __init__(self, name: str, description: str, func):
        self.name, self.description, self.func = name, description, func

    async def execute(self, *args, **kwargs):
        """Await the wrapped callable with the given arguments and return its result."""
        outcome = await self.func(*args, **kwargs)
        return outcome
| # Tool implementations (simplified for demo) | |
async def tool_file_read(path: str) -> str:
    """Return up to the first 5000 characters of the text file at *path*.

    The file is decoded as UTF-8.  Failures are reported in the returned
    string instead of being raised: a missing file yields
    ``"File not found: <path>"`` and any other error yields
    ``"Error reading file: <message>"``.
    """
    try:
        with open(path, 'r', encoding="utf-8") as f:
            # Read only what will be returned instead of loading the whole
            # file and slicing afterwards.
            return f.read(5000)
    except FileNotFoundError:
        return f"File not found: {path}"
    except Exception as e:
        return f"Error reading file: {str(e)}"
async def tool_file_write(path: str, content: str) -> str:
    """Write *content* to *path* as UTF-8, creating parent directories as needed.

    An existing file is overwritten.  Returns ``"Successfully wrote to <path>"``
    on success, or ``"Error writing file: <message>"`` if anything fails; no
    exception is propagated to the caller.
    """
    try:
        parent = os.path.dirname(path)
        # A bare filename has no parent component, so there is nothing to create.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, 'w', encoding="utf-8") as f:
            f.write(content)
        return f"Successfully wrote to {path}"
    except Exception as e:
        return f"Error writing file: {str(e)}"
async def tool_git_status() -> str:
    """Return ``git status --short`` for the current working directory.

    Returns the command's output, ``"No changes"`` when git succeeds with no
    output, or a ``"Git error: ..."`` string when git cannot be started or
    exits with a non-zero status (for example outside a repository).
    """
    import subprocess

    try:
        result = subprocess.run(
            ["git", "status", "--short"], capture_output=True, text=True
        )
    except Exception as e:
        return f"Git error: {str(e)}"

    # Without this check a failed invocation has empty stdout and would be
    # reported as a clean working tree.
    if result.returncode != 0:
        detail = result.stderr.strip() or f"exit code {result.returncode}"
        return f"Git error: {detail}"
    return result.stdout or "No changes"
async def tool_web_search(query: str) -> str:
    """Return placeholder search results for *query*.

    No network request is made; the returned text is a fixed template with
    the query interpolated into its first line.
    """
    # Return a demo response since we can't make actual API calls
    return f"π Search results for '{query}':\n\n1. [Result 1] - Description here\n2. [Result 2] - Description here\n3. [Result 3] - Description here\n\n(Install brave-search to enable real search)"
async def tool_run_command(cmd: str) -> str:
    """Run *cmd* through the system shell and report what it printed.

    Returns stdout alone when nothing was written to stderr; otherwise a
    combined "Output/Errors" report.  The command is given 30 seconds; a
    timeout or any other failure is returned as ``"Command error: ..."``.
    """
    import subprocess

    # NOTE(review): shell=True executes the string verbatim, so callers must
    # not pass untrusted text here.
    try:
        completed = subprocess.run(
            cmd, shell=True, capture_output=True, text=True, timeout=30
        )
        if not completed.stderr:
            return completed.stdout
        return f"Output:\n{completed.stdout}\n\nErrors:\n{completed.stderr}"
    except Exception as exc:
        return f"Command error: {str(exc)}"
async def tool_create_directory(path: str) -> str:
    """Create *path* (including missing parents) and report the outcome.

    An already-existing directory counts as success.  Failures are returned
    as ``"Error: <message>"`` rather than raised.
    """
    try:
        os.makedirs(path, exist_ok=True)
    except Exception as exc:
        return f"Error: {str(exc)}"
    else:
        return f"Directory created: {path}"
async def tool_list_directory(path: str = ".") -> str:
    """List at most the first 50 entries of *path*, one per line.

    Directories are shown with a trailing slash.  Any failure (for example a
    missing path) is returned as ``"Error: <message>"``.
    """
    try:
        rendered = []
        for entry in os.listdir(path)[:50]:
            if os.path.isdir(os.path.join(path, entry)):
                rendered.append(f"π {entry}/")
            else:
                rendered.append(f"π {entry}")
        return "\n".join(rendered)
    except Exception as err:
        return f"Error: {str(err)}"
# Tool registry: each tool's name mapped to its Tool wrapper, in display order.
TOOLS = {
    name: Tool(name, description, func)
    for name, description, func in (
        ("file_read", "Read a file from the filesystem", tool_file_read),
        ("file_write", "Write content to a file", tool_file_write),
        ("git_status", "Check git repository status", tool_git_status),
        ("web_search", "Search the web for information", tool_web_search),
        ("run_command", "Execute a shell command", tool_run_command),
        ("create_directory", "Create a new directory", tool_create_directory),
        ("list_directory", "List files in a directory", tool_list_directory),
    )
}
def get_tool_descriptions() -> str:
    """Render every registered tool as a Markdown bullet, one per line."""
    bullet_lines = (
        f"- **{tool.name}**: {tool.description}" for tool in TOOLS.values()
    )
    return "\n".join(bullet_lines)
| # ============================================================ | |
| # Model Interface | |
| # ============================================================ | |
class StackModel:
    """Stack 2.9 model interface using transformers.

    The model is prompted with a plain ``User:`` / ``Assistant:`` transcript
    preceded by a system prompt that embeds the tool list and the current
    pattern-memory summary.
    """

    def __init__(self, model_id: str = "Qwen/Qwen2.5-Coder-7B-Instruct"):
        self.model_id = model_id
        # Both stay None until load() has completed.
        self.model = None
        self.tokenizer = None
        # Not assigned anywhere else in this class.
        self.pipeline = None

    def load(self):
        """Load the model with 4-bit quantization for HF Spaces.

        Imports transformers and torch lazily so that merely constructing a
        StackModel does not require them.  Exceptions raised while loading
        propagate to the caller.
        """
        from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
        import torch

        print(f"Loading {self.model_id}...")

        # 4-bit quantization config for 16GB GPU
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4"
        )

        # NOTE(review): trust_remote_code=True allows code shipped in the model
        # repository to run locally; confirm it is required for this model.
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_id,
            trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )
        print("Model loaded successfully!")

    def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
        """Generate a complete response to *prompt*.

        Args:
            prompt: The user's message.
            max_tokens: Maximum number of new tokens to sample.
            temperature: Sampling temperature passed to ``model.generate``.

        Returns:
            The generated text with the prompt removed and surrounding
            whitespace stripped, or a fixed "not loaded" message when the
            tokenizer or model is not available yet.
        """
        # load() sets the tokenizer before the model, so both must be checked.
        if self.tokenizer is None or self.model is None:
            return "Model not loaded. Please wait for initialization."

        # Build the prompt with system and tools
        system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.
## Available Tools
{get_tool_descriptions()}
## Your Capabilities
- Write, read, and execute code
- Use git for version control
- Search the web for information
- Create and manage files
- Execute shell commands
## Self-Evolution
You learn from each interaction. After responding, summarize what tools you used.
{memory.get_context()}
## Instructions
1. Be helpful and concise
2. Use tools when needed
3. Learn from the conversation
4. Provide code examples when relevant
Now respond to the user:"""
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"

        inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1
        )

        # Decode only the newly generated tokens.  Splitting the full text on
        # "Assistant:" would cut off any reply that itself contains that text.
        prompt_length = inputs["input_ids"].shape[-1]
        response = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        return response.strip()

    def generate_streaming(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7):
        """Generate with streaming (yields tokens).

        Args:
            prompt: The user's message.
            max_tokens: Maximum number of tokens to sample.
            temperature: Sampling temperature; values <= 0 select the most
                likely token at every step instead of sampling.

        Yields:
            The decoded text of each sampled token, in order.  Generation
            stops at the tokenizer's EOS token or after *max_tokens* tokens.
            When the tokenizer or model is not loaded, a single "not loaded"
            message is yielded instead.
        """
        if self.tokenizer is None or self.model is None:
            yield "Model not loaded. Please wait for initialization."
            return

        # Imported here (as in load()) because torch is not a module-level import.
        import torch

        system_prompt = f"""You are Stack 2.9 - a pattern-based AI coding assistant.
## Available Tools
{get_tool_descriptions()}
## Self-Evolution Memory
{memory.get_context()}
Now respond to the user:"""
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"
        inputs = self.tokenizer(full_prompt, return_tensors="pt").to(self.model.device)
        eos_token_id = self.tokenizer.eos_token_id

        # Generate token by token.  The first step feeds the whole prompt; later
        # steps feed only the newest token and reuse the cached attention state,
        # so the prompt is not re-processed on every iteration.
        step_input = inputs.input_ids
        past_key_values = None
        for _ in range(max_tokens):
            with torch.no_grad():
                outputs = self.model(
                    input_ids=step_input,
                    past_key_values=past_key_values,
                    use_cache=True,
                )
            past_key_values = outputs.past_key_values
            next_token_logits = outputs.logits[:, -1, :]

            if temperature > 0:
                # Apply temperature, then sample
                probs = torch.softmax(next_token_logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                # Dividing by a non-positive temperature is meaningless; go greedy.
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

            # Stop on EOS before emitting anything for it.
            if next_token.item() == eos_token_id:
                break

            yield self.tokenizer.decode(next_token[0], skip_special_tokens=True)
            step_input = next_token
# Global model instance; None until a model has been loaded successfully.
model = None


def initialize_model(model_id: Optional[str] = None):
    """Initialize the model on startup.

    Args:
        model_id: Model repository id to load.  ``None`` uses StackModel's
            default.

    Returns:
        The loaded StackModel, or ``None`` if construction or loading failed
        (the error is printed, not raised).

    The module-level ``model`` is only replaced after a successful load, so a
    failed attempt never leaves a half-initialised instance published.
    """
    global model
    try:
        candidate = StackModel() if model_id is None else StackModel(model_id)
        candidate.load()
    except Exception as e:
        print(f"Failed to load model: {e}")
        return None
    model = candidate
    return model
| # ============================================================ | |
| # Gradio Interface | |
| # ============================================================ | |
def format_tools_used(tools_used: List[str]) -> str:
    """Return a Markdown "Tools Used" footer, or an empty string when no tools were used."""
    return f"\n\nπ§ **Tools Used**: {', '.join(tools_used)}" if tools_used else ""
# (tool name, trigger pattern) pairs used by both chat handlers.  Patterns are
# anchored at a word start so that, for example, "tools" does not trigger "ls"
# and "digit" does not trigger "git".
_TOOL_TRIGGERS = (
    ("git_status", re.compile(r"\bgit")),
    ("web_search", re.compile(r"\b(?:search|find|look up)")),
    ("list_directory", re.compile(r"\b(?:list files|directory|ls\b)")),
    ("run_command", re.compile(r"\b(?:run |execute|command)")),
)

_LOADING_MESSAGE = "β³ Model is loading. Please wait..."


def _detect_tools(message: str) -> List[str]:
    """Return the names of the tools whose trigger words appear in *message*."""
    lowered = message.lower()
    return [tool for tool, pattern in _TOOL_TRIGGERS if pattern.search(lowered)]


def chat_response(message: str, history: List[List[str]]) -> str:
    """Process a chat message and return the response text.

    Tools are only *tagged* from keywords in the message; none of them is
    executed here.  The tagged names are appended to the reply and recorded
    in the shared memory together with the exchange.

    Args:
        message: The user's message.
        history: Prior chat turns (not used by this function).

    Returns:
        The reply text, or a fixed "loading" message while no model is
        available.  Generation errors are returned as text, not raised.
    """
    if model is None or model.model is None:
        return _LOADING_MESSAGE

    tools_used = _detect_tools(message)

    try:
        response = model.generate(message, max_tokens=512)
    except Exception as e:
        response = f"I encountered an error: {str(e)}"

    response += format_tools_used(tools_used)
    memory.add_interaction(message, response, tools_used)
    return response


def chat_response_stream(message: str, history: List[List[str]]) -> Generator[str, None, None]:
    """Process a chat message with streaming.

    Yields the response accumulated so far after each generated token, then
    once more with the "Tools Used" footer when any tool was tagged.  The
    final text is recorded in the shared memory.  Tool tagging follows the
    same keyword rules as ``chat_response``.
    """
    if model is None or model.model is None:
        yield _LOADING_MESSAGE
        return

    tools_used = _detect_tools(message)
    full_response = ""

    # Stream the response; report a mid-stream failure in the transcript
    # instead of letting it escape to the UI.
    try:
        for token in model.generate_streaming(message, max_tokens=256):
            full_response += token
            yield full_response
    except Exception as e:
        full_response += f"I encountered an error: {str(e)}"
        yield full_response

    if tools_used:
        full_response += format_tools_used(tools_used)
        yield full_response

    memory.add_interaction(message, full_response, tools_used)
# Example prompts for the UI: shown as clickable examples that fill the
# message box.
EXAMPLE_PROMPTS = [
    "Hello! What can you help me with?",
    "Check git status of this repository",
    "Search for best practices for Python async programming",
    "List the files in the current directory",
    "Write a simple Python function to calculate fibonacci",
    "How do I use Git to create a new branch?",
    "What's your memory of our conversation?",
]
def create_gradio_app():
    """Create the Gradio interface.

    Builds the Blocks layout (header, memory stats, chat area, examples,
    memory viewer, footer), wires the event handlers and returns the app
    without launching it.
    """
    with gr.Blocks(
        title="Stack 2.9 - Pattern-Based AI Coding Assistant",
        # Built-in themes are configured through *_hue arguments; indigo
        # matches the #6366f1 / #818cf8 palette this UI was designed around.
        theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="indigo"),
    ) as app:
        # Header.  Blank lines keep the rule from turning the preceding
        # paragraph into a heading.
        gr.Markdown(
            "# π Stack 2.9 - Pattern-Based AI Coding Assistant\n\n"
            "Powered by **Qwen2.5-Coder-7B** with 4-bit quantization\n\n"
            "---"
        )

        # Memory stats display
        with gr.Row():
            with gr.Column(scale=1):
                stats_display = gr.Markdown(
                    "π **Memory Stats**\n\n- Interactions: 0\n- Tools learned: 0\n- Code patterns: 0",
                    elem_id="stats"
                )
            with gr.Column(scale=3):
                pass  # Spacer

        # Chat interface
        chatbot = gr.Chatbot(
            height=500,
            show_copy_button=True,
            bubble_full_width=False
        )
        with gr.Row():
            msg = gr.Textbox(
                label="Message",
                placeholder="Ask me anything...",
                scale=4,
                lines=3
            )
            submit_btn = gr.Button("Send", variant="primary", scale=1)

        # Clear button
        with gr.Row():
            clear_btn = gr.Button("ποΈ Clear Chat")

        # Example prompts
        gr.Examples(
            examples=EXAMPLE_PROMPTS,
            inputs=msg,
            label="Example Prompts"
        )

        # Memory visualization
        with gr.Accordion("π§ Self-Evolution Memory", open=False):
            memory_display = gr.Textbox(
                label="Memory Content",
                lines=10,
                interactive=False
            )

        def respond(message, history):
            """Append the assistant's reply to the chat and clear the input box."""
            history = list(history or [])
            # Nothing to send: leave the conversation untouched.
            if not message or not message.strip():
                return "", history
            response = chat_response(message, history)
            # Defensive: accept a (text, history) tuple as well as a plain string.
            if isinstance(response, tuple):
                response = response[0]
            history.append([message, response])
            return "", history

        def update_stats():
            """Render the current memory statistics as Markdown."""
            stats = memory.get_stats()
            recent = ', '.join(stats['recent_tools']) if stats['recent_tools'] else 'None'
            return "\n".join([
                "π **Memory Stats**",
                "",
                f"- **Interactions**: {stats['total_interactions']}",
                f"- **Tool Patterns**: {stats['tool_patterns']}",
                f"- **Code Snippets**: {stats['code_snippets']}",
                "",
                f"**Recent Tools**: {recent}",
            ])

        def update_memory():
            """Return the memory summary shown in the accordion."""
            return memory.get_context()

        def clear_chat():
            """Reset both the conversation and the input box."""
            return [], ""

        # Button click handlers.  Each event gets its own API name so the two
        # endpoints do not collide.
        submit_btn.click(respond, [msg, chatbot], [msg, chatbot], api_name="send")
        msg.submit(respond, [msg, chatbot], [msg, chatbot], api_name="send_on_enter")
        clear_btn.click(clear_chat, outputs=[chatbot, msg])

        # Refresh the stats and memory panels whenever the conversation changes.
        chatbot.change(update_stats, outputs=[stats_display])
        chatbot.change(update_memory, outputs=[memory_display])

        # Footer
        gr.Markdown(
            "---\n\n"
            "### About Stack 2.9\n\n"
            "Stack 2.9 is a pattern-based AI coding assistant that:\n\n"
            "- π Uses **Qwen2.5-Coder-7B** (4-bit, ~4GB VRAM)\n"
            "- π οΈ Integrates **7 tools** (file, git, web, search, shell)\n"
            "- π§ Remembers interactions and learns patterns\n"
            "- β‘ Provides fast, streaming responses\n\n"
            "Deployed on **HuggingFace Spaces** with Gradio"
        )
    return app
| # ============================================================ | |
| # Main Entry Point | |
| # ============================================================ | |
if __name__ == "__main__":
    import argparse
    import threading

    parser = argparse.ArgumentParser(description="Stack 2.9 - HuggingFace Spaces Demo")
    parser.add_argument("--share", action="store_true", help="Create a public share link")
    parser.add_argument("--port", type=int, default=7860, help="Port to run on")
    parser.add_argument("--model", type=str, default="Qwen/Qwen2.5-Coder-7B-Instruct", help="Model ID")
    args = parser.parse_args()

    print("=" * 50)
    print("π Stack 2.9 - Pattern-Based AI Coding Assistant")
    print("=" * 50)
    print(f"Model: {args.model}")
    print("Loading model...")

    def load_model_thread():
        """Load the model named by --model and publish it once it is ready."""
        global model
        try:
            candidate = StackModel(args.model)
            candidate.load()
        except Exception as e:
            print(f"Failed to load model: {e}")
            return
        # Publish only a fully loaded model; until then the chat handlers keep
        # answering with their "loading" message.
        model = candidate

    # Load in the background so the UI comes up immediately.  A daemon thread
    # does not keep the process alive if the server is stopped mid-load.
    loader_thread = threading.Thread(target=load_model_thread, daemon=True)
    loader_thread.start()

    # Create and launch app
    app = create_gradio_app()
    print(f"\nπ Launching Gradio on port {args.port}...")
    print("π Note: Model loads in background. Chat will work once loaded.\n")
    app.launch(
        server_name="0.0.0.0",
        server_port=args.port,
        share=args.share
    )