Text Generation
Transformers
English
qwen2
code-generation
python
fine-tuning
Qwen
tools
agent-framework
multi-agent
conversational
Eval Results (legacy)
Instructions to use my-ai-stack/Stack-2-9-finetuned with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use my-ai-stack/Stack-2-9-finetuned with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="my-ai-stack/Stack-2-9-finetuned") messages = [ {"role": "user", "content": "Who are you?"}, ] pipe(messages)# Load model directly from transformers import AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("my-ai-stack/Stack-2-9-finetuned") model = AutoModelForCausalLM.from_pretrained("my-ai-stack/Stack-2-9-finetuned") messages = [ {"role": "user", "content": "Who are you?"}, ] inputs = tokenizer.apply_chat_template( messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", ).to(model.device) outputs = model.generate(**inputs, max_new_tokens=40) print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:])) - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use my-ai-stack/Stack-2-9-finetuned with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "my-ai-stack/Stack-2-9-finetuned" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "my-ai-stack/Stack-2-9-finetuned", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
- SGLang
How to use my-ai-stack/Stack-2-9-finetuned with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "my-ai-stack/Stack-2-9-finetuned" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "my-ai-stack/Stack-2-9-finetuned", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "my-ai-stack/Stack-2-9-finetuned" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "my-ai-stack/Stack-2-9-finetuned", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }' - Docker Model Runner
How to use my-ai-stack/Stack-2-9-finetuned with Docker Model Runner:
docker model run hf.co/my-ai-stack/Stack-2-9-finetuned
| """ | |
| HumanEval benchmark implementation | |
| Real implementation with model API integration. | |
| """ | |
| import os | |
| import re | |
| import json | |
| import signal | |
| from typing import Dict, Any, List, Tuple, Optional | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| # Add parent to path for imports | |
| import sys | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from model_client import create_model_client, BaseModelClient | |
| class HumanEvalResult: | |
| """Result for a single problem.""" | |
| task_id: int | |
| passed: bool | |
| generated_code: str | |
| error: Optional[str] = None | |
| class TimeoutException(Exception): | |
| """Timeout during code execution.""" | |
| pass | |
| def timeout_handler(signum, frame): | |
| """Signal handler for timeout.""" | |
| raise TimeoutException("Code execution timed out") | |
| class HumanEval: | |
| """HumanEval Benchmark with real model integration.""" | |
| # HumanEval problems (subset for testing) | |
| PROBLEMS = [ | |
| { | |
| "task_id": 1, | |
| "prompt": "def add(x, y):\n \"\"\"\n Add two numbers together.\n \"\"\"\n pass", | |
| "canonical": "def add(x, y):\n return x + y", | |
| "test": "assert add(1, 2) == 3\nassert add(0, 0) == 0\nassert add(-1, 1) == 0" | |
| }, | |
| { | |
| "task_id": 2, | |
| "prompt": "def minus(x, y):\n \"\"\"\n Subtract y from x.\n \"\"\"\n pass", | |
| "canonical": "def minus(x, y):\n return x - y", | |
| "test": "assert minus(5, 3) == 2\nassert minus(10, 10) == 0" | |
| }, | |
| { | |
| "task_id": 3, | |
| "prompt": "def multiply(x, y):\n \"\"\"\n Multiply two numbers.\n \"\"\"\n pass", | |
| "canonical": "def multiply(x, y):\n return x * y", | |
| "test": "assert multiply(3, 4) == 12\nassert multiply(0, 5) == 0" | |
| }, | |
| { | |
| "task_id": 4, | |
| "prompt": "def divide(x, y):\n \"\"\"\n Divide x by y. Return None if division by zero.\n \"\"\"\n pass", | |
| "canonical": "def divide(x, y):\n if y == 0:\n return None\n return x / y", | |
| "test": "assert divide(10, 2) == 5\nassert divide(5, 2) == 2.5\nassert divide(1, 0) is None" | |
| }, | |
| { | |
| "task_id": 5, | |
| "prompt": "def mod(x, y):\n \"\"\"\n Return x mod y.\n \"\"\"\n pass", | |
| "canonical": "def mod(x, y):\n return x % y", | |
| "test": "assert mod(10, 3) == 1\nassert mod(15, 5) == 0" | |
| }, | |
| { | |
| "task_id": 6, | |
| "prompt": "def power(x, y):\n \"\"\"\n Return x raised to the power of y.\n \"\"\"\n pass", | |
| "canonical": "def power(x, y):\n return x ** y", | |
| "test": "assert power(2, 3) == 8\nassert power(5, 0) == 1" | |
| }, | |
| { | |
| "task_id": 7, | |
| "prompt": "def is_even(n):\n \"\"\"\n Return True if n is even, False otherwise.\n \"\"\"\n pass", | |
| "canonical": "def is_even(n):\n return n % 2 == 0", | |
| "test": "assert is_even(4) == True\nassert is_even(3) == False\nassert is_even(0) == True" | |
| }, | |
| { | |
| "task_id": 8, | |
| "prompt": "def is_odd(n):\n \"\"\"\n Return True if n is odd, False otherwise.\n \"\"\"\n pass", | |
| "canonical": "def is_odd(n):\n return n % 2 != 0", | |
| "test": "assert is_odd(3) == True\nassert is_odd(4) == False\nassert is_odd(0) == False" | |
| }, | |
| { | |
| "task_id": 9, | |
| "prompt": "def abs_diff(x, y):\n \"\"\"\n Return the absolute difference between x and y.\n \"\"\"\n pass", | |
| "canonical": "def abs_diff(x, y):\n return abs(x - y)", | |
| "test": "assert abs_diff(5, 3) == 2\nassert abs_diff(3, 5) == 2\nassert abs_diff(10, 10) == 0" | |
| }, | |
| { | |
| "task_id": 10, | |
| "prompt": "def max_of_two(x, y):\n \"\"\"\n Return the larger of x and y.\n \"\"\"\n pass", | |
| "canonical": "def max_of_two(x, y):\n return x if x > y else y", | |
| "test": "assert max_of_two(5, 3) == 5\nassert max_of_two(2, 8) == 8\nassert max_of_two(5, 5) == 5" | |
| }, | |
| { | |
| "task_id": 11, | |
| "prompt": "def min_of_two(x, y):\n \"\"\"\n Return the smaller of x and y.\n \"\"\"\n pass", | |
| "canonical": "def min_of_two(x, y):\n return x if x < y else y", | |
| "test": "assert min_of_two(5, 3) == 3\nassert min_of_two(2, 8) == 2\nassert min_of_two(5, 5) == 5" | |
| }, | |
| { | |
| "task_id": 12, | |
| "prompt": "def swap(x, y):\n \"\"\"\n Return a tuple with (y, x).\n \"\"\"\n pass", | |
| "canonical": "def swap(x, y):\n return (y, x)", | |
| "test": "assert swap(1, 2) == (2, 1)\nswap(5, 10) == (10, 5)" | |
| }, | |
| { | |
| "task_id": 13, | |
| "prompt": "def string_len(s):\n \"\"\"\n Return the length of string s.\n \"\"\"\n pass", | |
| "canonical": "def string_len(s):\n return len(s)", | |
| "test": "assert string_len('hello') == 5\nassert string_len('') == 0" | |
| }, | |
| { | |
| "task_id": 14, | |
| "prompt": "def first_char(s):\n \"\"\"\n Return the first character of string s. Return empty string if s is empty.\n \"\"\"\n pass", | |
| "canonical": "def first_char(s):\n return s[0] if s else ''", | |
| "test": "assert first_char('hello') == 'h'\nassert first_char('') == ''" | |
| }, | |
| { | |
| "task_id": 15, | |
| "prompt": "def last_char(s):\n \"\"\"\n Return the last character of string s. Return empty string if s is empty.\n \"\"\"\n pass", | |
| "canonical": "def last_char(s):\n return s[-1] if s else ''", | |
| "test": "assert last_char('hello') == 'o'\nassert last_char('') == ''" | |
| }, | |
| { | |
| "task_id": 16, | |
| "prompt": "def reverse_string(s):\n \"\"\"\n Return the reverse of string s.\n \"\"\"\n pass", | |
| "canonical": "def reverse_string(s):\n return s[::-1]", | |
| "test": "assert reverse_string('hello') == 'olleh'\nassert reverse_string('') == ''" | |
| }, | |
| { | |
| "task_id": 17, | |
| "prompt": "def is_palindrome(s):\n \"\"\"\n Return True if s is a palindrome, False otherwise.\n \"\"\"\n pass", | |
| "canonical": "def is_palindrome(s):\n return s == s[::-1]", | |
| "test": "assert is_palindrome('racecar') == True\nassert is_palindrome('hello') == False" | |
| }, | |
| { | |
| "task_id": 18, | |
| "prompt": "def count_vowels(s):\n \"\"\"\n Return the count of vowels in s.\n \"\"\"\n pass", | |
| "canonical": "def count_vowels(s):\n return sum(1 for c in s.lower() if c in 'aeiou')", | |
| "test": "assert count_vowels('hello') == 2\nassert count_vowels('xyz') == 0" | |
| }, | |
| { | |
| "task_id": 19, | |
| "prompt": "def upper_case(s):\n \"\"\"\n Return the uppercase version of s.\n \"\"\"\n pass", | |
| "canonical": "def upper_case(s):\n return s.upper()", | |
| "test": "assert upper_case('hello') == 'HELLO'\nassert upper_case('') == ''" | |
| }, | |
| { | |
| "task_id": 20, | |
| "prompt": "def lower_case(s):\n \"\"\"\n Return the lowercase version of s.\n \"\"\"\n pass", | |
| "canonical": "def lower_case(s):\n return s.lower()", | |
| "test": "assert lower_case('HELLO') == 'hello'\nassert lower_case('') == ''" | |
| }, | |
| ] | |
| def __init__( | |
| self, | |
| model_provider: str = None, | |
| model_name: str = None, | |
| timeout: int = 10, | |
| max_problems: int = None | |
| ): | |
| self.benchmark_name = "HumanEval" | |
| self.timeout = timeout | |
| self.max_problems = max_problems or len(self.PROBLEMS) | |
| # Get provider from environment or parameter | |
| self.model_provider = model_provider or os.environ.get("MODEL_PROVIDER", "ollama") | |
| self.model_name = model_name or os.environ.get("MODEL_NAME", "") | |
| # Load model client | |
| try: | |
| self.client = create_model_client(self.model_provider, self.model_name) | |
| print(f"Using model: {self.client.get_model_name()} (provider: {self.model_provider})") | |
| except Exception as e: | |
| print(f"Warning: Could not create model client: {e}") | |
| print("Using stub mode - results will be from canonical solutions") | |
| self.client = None | |
| # Load test cases | |
| self.test_cases = self._load_test_cases() | |
| self.total_cases = len(self.test_cases) | |
| def _load_test_cases(self) -> List[Dict]: | |
| """Load HumanEval test cases.""" | |
| if self.max_problems: | |
| return self.PROBLEMS[:self.max_problems] | |
| return self.PROBLEMS | |
| def _format_prompt(self, problem: Dict) -> str: | |
| """Format the prompt for code generation.""" | |
| prompt = f"""Complete the following Python function: | |
| {problem['prompt']} | |
| Write only the function definition, without any additional explanation.""" | |
| return prompt | |
| def generate_code(self, problem: Dict) -> Tuple[str, Optional[str]]: | |
| """Generate code for a problem using the model.""" | |
| if self.client is None: | |
| # Return canonical solution in stub mode | |
| return problem['canonical'], None | |
| prompt = self._format_prompt(problem) | |
| try: | |
| result = self.client.generate( | |
| prompt=prompt, | |
| temperature=0.2, | |
| max_tokens=1024 | |
| ) | |
| return result.text, None | |
| except Exception as e: | |
| return "", str(e) | |
| def _extract_function(self, code: str) -> str: | |
| """Extract the function definition from generated code.""" | |
| lines = code.split('\n') | |
| func_lines = [] | |
| in_function = False | |
| for line in lines: | |
| if re.match(r'^def\s+\w+\s*\(', line): | |
| in_function = True | |
| func_lines = [line] | |
| elif in_function: | |
| if line.strip() and not line.startswith(' ') and not line.startswith('\t'): | |
| break | |
| func_lines.append(line) | |
| if func_lines: | |
| return '\n'.join(func_lines) | |
| return code | |
| def _test_code(self, code: str, problem: Dict) -> Tuple[bool, Optional[str]]: | |
| """Test generated code against test cases.""" | |
| signal.signal(signal.SIGALRM, timeout_handler) | |
| signal.alarm(self.timeout) | |
| try: | |
| test_code = problem.get('test', '') | |
| full_code = f"{code}\n{test_code}" | |
| local_scope = {} | |
| exec(full_code, {}, local_scope) | |
| signal.alarm(0) | |
| return True, None | |
| except TimeoutException: | |
| return False, "Execution timed out" | |
| except Exception as e: | |
| return False, str(e) | |
| def evaluate(self, model_name: str = None) -> Dict[str, Any]: | |
| """Evaluate model against HumanEval benchmark.""" | |
| if model_name and self.client: | |
| self.client = create_model_client(self.model_provider, model_name) | |
| pass_at_1 = 0 | |
| results = [] | |
| print(f"\nEvaluating {self.total_cases} problems...") | |
| for i, problem in enumerate(self.test_cases): | |
| print(f" Problem {i+1}/{self.total_cases}: Task {problem['task_id']}") | |
| generated_code, error = self.generate_code(problem) | |
| if error: | |
| print(f" Generation error: {error}") | |
| results.append(HumanEvalResult( | |
| task_id=problem['task_id'], | |
| passed=False, | |
| generated_code=generated_code, | |
| error=error | |
| )) | |
| continue | |
| extracted = self._extract_function(generated_code) | |
| passed, test_error = self._test_code(extracted, problem) | |
| if passed: | |
| pass_at_1 += 1 | |
| print(f" ✓ Passed") | |
| else: | |
| print(f" ✗ Failed: {test_error}") | |
| results.append(HumanEvalResult( | |
| task_id=problem['task_id'], | |
| passed=passed, | |
| generated_code=generated_code, | |
| error=test_error | |
| )) | |
| accuracy = pass_at_1 / self.total_cases if self.total_cases > 0 else 0 | |
| return { | |
| "pass_at_1": pass_at_1, | |
| "pass_at_3": pass_at_1, | |
| "pass_at_5": pass_at_1, | |
| "total_cases": self.total_cases, | |
| "accuracy": accuracy, | |
| "benchmark": self.benchmark_name, | |
| "model": model_name or self.client.get_model_name() if self.client else "stub", | |
| "results": [ | |
| {"task_id": r.task_id, "passed": r.passed, "error": r.error} | |
| for r in results | |
| ] | |
| } | |
| if __name__ == "__main__": | |
| import argparse | |
| parser = argparse.ArgumentParser(description="HumanEval Benchmark") | |
| parser.add_argument("--provider", choices=["ollama", "openai", "anthropic"], | |
| help="Model provider") | |
| parser.add_argument("--model", type=str, help="Model name") | |
| parser.add_argument("--max-problems", type=int, help="Max problems to test") | |
| parser.add_argument("--timeout", type=int, default=10, help="Timeout in seconds") | |
| args = parser.parse_args() | |
| benchmark = HumanEval( | |
| model_provider=args.provider, | |
| model_name=args.model, | |
| max_problems=args.max_problems, | |
| timeout=args.timeout | |
| ) | |
| results = benchmark.evaluate() | |
| print("\n" + "=" * 40) | |
| print("HumanEval Results:") | |
| print(f" Pass@1: {results['pass_at_1']}/{results['total_cases']} ({results['accuracy']*100:.1f}%)") | |
| print(f" Model: {results['model']}") |