Text Generation
Transformers
English
qwen2
code-generation
python
fine-tuning
Qwen
tools
agent-framework
multi-agent
conversational
Eval Results (legacy)
Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use my-ai-stack/Stack-2-9-finetuned with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use my-ai-stack/Stack-2-9-finetuned with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "my-ai-stack/Stack-2-9-finetuned"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
- SGLang
How to use my-ai-stack/Stack-2-9-finetuned with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "my-ai-stack/Stack-2-9-finetuned" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "my-ai-stack/Stack-2-9-finetuned" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "my-ai-stack/Stack-2-9-finetuned",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
# Interactive command-line chat with the Stack 2.9 model, with an optional
# DuckDuckGo web-search command.
import torch
import requests  # NOTE(review): not referenced in the visible script - confirm before removing
from transformers import AutoModelForCausalLM, AutoTokenizer
from ddgs import DDGS

# Persona and behavioural rules that are prepended to every user prompt.
SYSTEM_PROMPT = """You are Stack 2.9, an expert AI coding assistant.
- Answer questions naturally and helpfully
- When the user asks for code, write clean complete code
- When the user asks a question, answer in plain language
- Be concise and practical
- If asked to search the internet, use the search: command"""

# NOTE(review): this is a machine-specific absolute path, so the script only
# runs where that directory exists. Point it at your own checkpoint directory
# (or a Hub model id) when running elsewhere.
MODEL_NAME = "/Users/walidsobhi/stack-2-9-final-model"

# The model is read from MODEL_NAME (a local path here), so the status message
# reports the actual location rather than claiming a Hub download.
print(f"Loading model from {MODEL_NAME}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # half precision weights
    device_map="auto",          # let the library place the weights on the available device(s)
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("✅ Ready!\n")

# Generation settings (passed to model.generate in the chat loop)
MAX_TOKENS = 200      # used as max_new_tokens
TEMPERATURE = 0.4     # sampling temperature
TOP_P = 0.9           # nucleus-sampling cutoff
REP_PENALTY = 1.2     # repetition penalty

print(f"Settings: max_tokens={MAX_TOKENS}, temperature={TEMPERATURE}, top_p={TOP_P}")
print("Commands: search:<query> - search the web, quit/exit - stop\n")
def web_search(query, count=5):
    """Search the web using DuckDuckGo (no API key needed).

    Args:
        query: Search terms. An empty or whitespace-only query is rejected
            without contacting the search backend.
        count: Maximum number of results to request and return.

    Returns:
        On success: ``{"success": True, "results": [...], "query": query}``,
        where each result is the first 200 characters of a hit's ``body`` text.
        On failure: ``{"success": False, "error": message}``. Search errors are
        reported through this dict rather than raised, so the caller can show
        the message and keep running.
    """
    if not query or not query.strip():
        return {"success": False, "error": "Empty search query"}

    try:
        results = []
        with DDGS() as ddgs:
            for hit in ddgs.text(query, max_results=count):
                # A hit without body text is skipped instead of failing the
                # whole search with a KeyError.
                snippet = (hit.get("body") or "")[:200]
                if snippet:
                    results.append(snippet)
                # Hard cap, in case the backend yields more than requested.
                if len(results) >= count:
                    break
    except Exception as e:
        # Deliberate best-effort boundary: network/backend failures become an
        # error result instead of crashing the interactive session.
        return {"success": False, "error": str(e)}

    if results:
        return {"success": True, "results": results, "query": query}
    return {"success": False, "error": "No results found"}
# Interactive loop: read one line, handle built-in commands, otherwise ask the
# model and print its reply. Ends on quit/exit/q, Ctrl-C, or end of input.
while True:
    try:
        prompt = input("You: ")
        user_text = prompt.strip()

        # Ignore blank lines.
        if not user_text:
            continue

        # Commands are matched on the trimmed, lower-cased text so stray
        # whitespace (e.g. "quit ") does not defeat them.
        command = user_text.lower()
        if command in ('quit', 'exit', 'q'):
            break

        # Handle search command
        if command.startswith("search:"):
            query = user_text[len("search:"):].strip()
            print("🔍 Searching...")
            result = web_search(query)
            if result["success"]:
                print(f"✅ Results for '{result['query']}':\n")
                for i, r in enumerate(result["results"], 1):
                    print(f" {i}. {r}")
            else:
                print(f"❌ Search failed: {result['error']}")
            continue

        # Prepend system prompt
        full_prompt = f"{SYSTEM_PROMPT}\n\nUser: {user_text}\nAssistant:"
        inputs = tokenizer(full_prompt, return_tensors='pt').to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            repetition_penalty=REP_PENALTY,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Decode only the newly generated tokens. The output sequence starts
        # with the prompt tokens, so slicing them off isolates the reply
        # without searching the text for an "Assistant:" marker (which would
        # truncate any reply that itself contains that marker).
        prompt_len = inputs["input_ids"].shape[-1]
        response = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

        # Stop at common stop points (the model starting a new turn, or a
        # long run of blank lines).
        for stop in ('\n\n\n', 'User:', 'You:'):
            if stop in response:
                response = response.split(stop)[0].strip()

        print(f"AI: {response}\n")
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C at any point, or end of input (Ctrl-D / piped stdin), ends
        # the session cleanly instead of with a traceback.
        print("\nExiting...")
        break

print("Goodbye!")