Instructions to use dongbobo/my-awesome-model with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use dongbobo/my-awesome-model with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("dongbobo/my-awesome-model", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
import argparse
import math
import os
import subprocess
import sys
# Add utils to path: put this file's directory at the front of sys.path.
# This must happen before the `utils` import below, which relies on it
# (presumably the `utils` package sits next to this file -- verify).
sys.path.insert(0, os.path.dirname(__file__))

from utils.benchmark_utils import BENCHMARK_CALCULATORS

# List of all benchmark categories, taken from the keys of
# BENCHMARK_CALCULATORS; main() runs one evaluation per entry.
BENCHMARK_CATEGORIES = list(BENCHMARK_CALCULATORS.keys())
def run_benchmark_evaluation(benchmark_name, model_path):
    """Run the evaluation script for a specific benchmark category.

    The script is looked up at ``evaluation/benchmarks/<benchmark_name>/eval.py``
    relative to the current working directory and executed with the current
    Python interpreter, receiving ``model_path`` as its only argument. Its
    standard output is expected to consist of a single number.

    Args:
        benchmark_name: Name of the benchmark directory to evaluate.
        model_path: Path to the model checkpoint, passed through to the script.

    Returns:
        The score as a finite float, or None when the script is missing,
        cannot be started, exits with a non-zero status, produces output that
        cannot be decoded as UTF-8, or prints something that is not a finite
        number. Every failure is reported on stderr.
    """
    benchmark_script = os.path.join("evaluation", "benchmarks", benchmark_name, "eval.py")
    if not os.path.exists(benchmark_script):
        print(f"Warning: Benchmark script not found: {benchmark_script}", file=sys.stderr)
        return None

    # Keep running the script and parsing its output in separate try blocks so
    # that the parse handler can never touch an unbound `result`.
    try:
        result = subprocess.run(
            [sys.executable, benchmark_script, model_path],
            capture_output=True,
            text=True,
            check=True,
            encoding='utf-8',
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running {benchmark_name} evaluation: {e.stderr}", file=sys.stderr)
        return None
    except (OSError, ValueError, TypeError) as e:
        # OSError: the interpreter could not be launched.
        # ValueError: includes UnicodeDecodeError for non-UTF-8 output.
        # TypeError: an argument (e.g. model_path) is not a valid path/str.
        print(f"Error running {benchmark_name} evaluation: {e}", file=sys.stderr)
        return None

    raw_output = result.stdout.strip()
    try:
        score = float(raw_output)
    except ValueError:
        print(f"Warning: Could not parse score from {benchmark_name}: '{raw_output}'", file=sys.stderr)
        return None

    # float() accepts "nan" and "inf"; either would poison any average that
    # is later computed from the scores, so treat them as unusable.
    if not math.isfinite(score):
        print(f"Warning: Non-finite score from {benchmark_name}: '{raw_output}'", file=sys.stderr)
        return None
    return score
def calculate_overall_score(benchmark_scores):
    """Combine per-benchmark scores into one weighted average.

    Benchmarks whose score is None are ignored. Every remaining score is
    multiplied by its category weight (1.0 for categories not listed below)
    and the weighted mean is rounded to three decimal places.

    Returns None when no benchmark produced a score.
    """
    # Reasoning-oriented categories count slightly more than the rest.
    category_weights = {
        "math_reasoning": 1.2,
        "logical_reasoning": 1.2,
        "code_generation": 1.1,
        "question_answering": 1.1,
        "instruction_following": 1.1,
        "safety_evaluation": 1.1,
        "reading_comprehension": 1.0,
        "common_sense": 1.0,
        "dialogue_generation": 1.0,
        "summarization": 1.0,
        "translation": 1.0,
        "knowledge_retrieval": 1.0,
        "text_classification": 0.9,
        "sentiment_analysis": 0.9,
        "creative_writing": 0.9,
    }

    # Pair each available score with the weight of its category.
    scored = [
        (value, category_weights.get(name, 1.0))
        for name, value in benchmark_scores.items()
        if value is not None
    ]
    if not scored:
        return None

    # Accumulate in input order with plain additions so the floating-point
    # result matches a straightforward running total.
    numerator = 0
    denominator = 0
    for value, weight in scored:
        numerator += value * weight
        denominator += weight

    if denominator > 0:
        return round(numerator / denominator, 3)
    return None
def main():
    """Run comprehensive evaluation across all benchmark categories.

    Parses the ``model_path`` command-line argument, runs every benchmark in
    BENCHMARK_CATEGORIES against it and prints the overall weighted score on
    stdout (the only stdout output, for compatibility with the existing
    evaluation pipeline). Per-benchmark scores and all diagnostics go to
    stderr.

    Exits with status 1 when ``model_path`` is not a directory or when no
    overall score could be calculated.
    """
    parser = argparse.ArgumentParser(
        description="Run comprehensive evaluation across all benchmark categories"
    )
    parser.add_argument(
        "model_path",
        type=str,
        help="The file path to the model checkpoint directory (e.g., ../checkpoints/step_100)."
    )
    args = parser.parse_args()

    # Check if the provided path is a directory
    if not os.path.isdir(args.model_path):
        print(f"Error: Directory not found at '{args.model_path}'", file=sys.stderr)
        sys.exit(1)

    # Resolve the path while still in the caller's working directory: the
    # benchmarks run after the chdir below, where a relative path would point
    # somewhere else.
    model_path = os.path.abspath(args.model_path)

    # Change to the parent of this script's directory; benchmark scripts are
    # located relative to the working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    original_cwd = os.getcwd()
    os.chdir(os.path.dirname(script_dir))

    benchmark_scores = {}
    try:
        # Run evaluation for each benchmark category
        for benchmark in BENCHMARK_CATEGORIES:
            score = run_benchmark_evaluation(benchmark, model_path)
            benchmark_scores[benchmark] = score
            if score is not None:
                print(f"{benchmark}: {score}", file=sys.stderr)
    finally:
        # Restore the original working directory even if a benchmark raised
        os.chdir(original_cwd)

    # Calculate overall score
    overall_score = calculate_overall_score(benchmark_scores)
    if overall_score is None:
        print(f"Error: Could not calculate overall score for {args.model_path}", file=sys.stderr)
        sys.exit(1)

    # Print only the overall score for compatibility with existing evaluation pipeline
    print(overall_score)
# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()