Spaces:
Sleeping
Sleeping
RGB Evaluation
feat: Show all 9 LLM models in app dropdown, add comprehensive code review and metric analysis documentation
b1ccc5d
"""
Configuration for RGB RAG Evaluation
"""
import os
from typing import List
# Directory two levels up from this file (the parent of the directory that
# contains this module); both paths below are anchored to it.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))

# Data directory
DATA_DIR = os.path.join(_PROJECT_ROOT, "data")

# Results directory
RESULTS_DIR = os.path.join(_PROJECT_ROOT, "results")
# Dataset files: maps each evaluation task name to the JSON file it uses.
# Noise robustness and negative rejection intentionally share one file.
DATASETS = {
    "noise_robustness": "en_refine.json",
    "negative_rejection": "en_refine.json",
    "information_integration": "en_int.json",
    "counterfactual_robustness": "en_fact.json",
}
# Dataset URLs (from GitHub): maps each dataset file name to its raw
# download location in the chen700564/RGB repository.
DATASET_URLS = {
    "en_refine.json": "https://raw.githubusercontent.com/chen700564/RGB/main/data/en_refine.json",
    "en_int.json": "https://raw.githubusercontent.com/chen700564/RGB/main/data/en_int.json",
    "en_fact.json": "https://raw.githubusercontent.com/chen700564/RGB/main/data/en_fact.json",
}
# Default models to evaluate (first 5 as primary)
DEFAULT_MODELS: List[str] = [
    "meta-llama/llama-4-maverick-17b-128e-instruct",  # Llama 4 Maverick 17B
    # NOTE(review): the name suggests a prompt-safety classifier rather than
    # a generative chat model -- confirm it belongs in a RAG answer evaluation.
    "meta-llama/llama-prompt-guard-2-86m",  # Llama Prompt Guard 2 86M
    "llama-3.1-8b-instant",  # Llama 3.1 8B - Fast
    "openai/gpt-oss-120b",  # GPT OSS 120B
    "moonshotai/kimi-k2-instruct",  # Moonshot Kimi K2 Instruct
]

# Additional available models
# NOTE: "moonshotai/kimi-k2-instruct" is also listed in DEFAULT_MODELS. It is
# kept here so this list's contents stay unchanged for existing consumers;
# the overlap is removed when ALL_MODELS is built.
ADDITIONAL_MODELS: List[str] = [
    "moonshotai/kimi-k2-instruct-0905",  # Kimi K2 Instruct 0905
    "moonshotai/kimi-k2-instruct",  # Kimi K2 Instruct
    "llama-3.3-70b-versatile",  # Llama 3.3 70B
    "meta-llama/llama-4-scout-17b-16e-instruct",  # Llama 4 Scout 17B
    "qwen/qwen3-32b",  # Qwen 3 32B
]

# All available models (for UI dropdown)
# dict.fromkeys() drops repeated model IDs while keeping first-seen order, so
# a model present in both lists appears exactly once (defaults come first).
ALL_MODELS: List[str] = list(dict.fromkeys(DEFAULT_MODELS + ADDITIONAL_MODELS))
# Evaluation settings
EVALUATION_CONFIG = {
    "temperature": 0.0,  # Use deterministic outputs for reproducibility
    "max_tokens": 1024,  # Maximum response tokens
    "rate_limit_delay": 0.5,  # Seconds between API calls
    "retry_count": 3,  # Number of retries on failure
}
# Metrics to report: maps each evaluation task name to the list of metric
# names reported for it.
METRICS = {
    "noise_robustness": ["accuracy"],
    "negative_rejection": ["rejection_rate"],
    "information_integration": ["accuracy"],
    "counterfactual_robustness": ["error_detection_rate", "error_correction_rate"],
}