Spaces:

gopikrishnait
/

RGBMetrics

Sleeping

RGBMetrics / src /config.py

RGB Evaluation

feat: Show all 9 LLM models in app dropdown, add comprehensive code review and metric analysis documentation

b1ccc5d 19 days ago

2.33 kB

	"""
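	Configuration for the RGB RAG evaluation: data/results paths, dataset files
	and download URLs, model lists, evaluation settings, and reported metrics.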
	"""

	import os
	from typing import List

	# Project root: the directory one level above the folder holding this file.
	# abspath() guards against a relative __file__ (e.g. this file executed
	# directly on Python < 3.9), where dirname(dirname("config.py")) collapses
	# to "" and both paths below would silently become relative to the CWD.
	_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

	# Data directory: <project root>/data
	DATA_DIR = os.path.join(_PROJECT_ROOT, "data")

	# Results directory: <project root>/results
	RESULTS_DIR = os.path.join(_PROJECT_ROOT, "results")

	# Evaluation task -> dataset file name.
	# Noise robustness and negative rejection both read the same file.
	DATASETS = dict(
	    noise_robustness="en_refine.json",
	    negative_rejection="en_refine.json",
	    information_integration="en_int.json",
	    counterfactual_robustness="en_fact.json",
	)

	# Dataset file name -> raw GitHub download URL.
	# Every file sits under the same data/ folder of the chen700564/RGB repo,
	# so each URL is the shared prefix followed by the file name.
	DATASET_URLS = {
	    filename: "https://raw.githubusercontent.com/chen700564/RGB/main/data/" + filename
	    for filename in ("en_refine.json", "en_int.json", "en_fact.json")
	}

	# Default models to evaluate (first 5 as primary)
	# NOTE(review): "llama-prompt-guard-2-86m" looks like a prompt-safety
	# classifier rather than a chat model - confirm it belongs in this list.
	DEFAULT_MODELS: List[str] = [
	    "meta-llama/llama-4-maverick-17b-128e-instruct",  # Llama 4 Maverick 17B
	    "meta-llama/llama-prompt-guard-2-86m",  # Llama Prompt Guard 2 86M
	    "llama-3.1-8b-instant",  # Llama 3.1 8B - Fast
	    "openai/gpt-oss-120b",  # GPT OSS 120B
	    "moonshotai/kimi-k2-instruct",  # Moonshot Kimi K2 Instruct
	]

	# Additional available models
	# NOTE: "moonshotai/kimi-k2-instruct" also appears in DEFAULT_MODELS; the
	# entry is kept here so this list's contents stay unchanged for callers.
	ADDITIONAL_MODELS: List[str] = [
	    "moonshotai/kimi-k2-instruct-0905",  # Kimi K2 Instruct 0905
	    "moonshotai/kimi-k2-instruct",  # Kimi K2 Instruct
	    "llama-3.3-70b-versatile",  # Llama 3.3 70B
	    "meta-llama/llama-4-scout-17b-16e-instruct",  # Llama 4 Scout 17B
	    "qwen/qwen3-32b",  # Qwen 3 32B
	]

	# All available models (for UI dropdown): defaults first, then the additional
	# ones. Plain concatenation would list the shared Kimi K2 entry twice, so
	# repeats are dropped with dict.fromkeys(), which keeps first-seen order.
	ALL_MODELS: List[str] = list(dict.fromkeys(DEFAULT_MODELS + ADDITIONAL_MODELS))

	# Evaluation settings shared by every run
	EVALUATION_CONFIG = dict(
	    temperature=0.0,  # deterministic outputs, for reproducibility
	    max_tokens=1024,  # cap on response tokens
	    rate_limit_delay=0.5,  # pause between API calls, in seconds
	    retry_count=3,  # retries allowed on failure
	)

	# Evaluation task -> names of the metrics reported for it
	METRICS = dict(
	    noise_robustness=["accuracy"],
	    negative_rejection=["rejection_rate"],
	    information_integration=["accuracy"],
	    counterfactual_robustness=["error_detection_rate", "error_correction_rate"],
	)