#!/usr/bin/env python3
"""
Model Comparison Example
========================

This example shows how to compare political bias across multiple LLM models.
"""

import sys

# Example scripts live one level below the package root.
sys.path.append('..')

from run_bias_analysis import BiasAnalyzer, PrePostAnalyzer, SUPPORTED_MODELS


def compare_multiple_models():
    """Compare bias across multiple model families.

    For each shorthand name in ``models_to_test``, resolves the full model
    id via ``SUPPORTED_MODELS``, runs the political-compass bias analysis,
    and prints a summary table.

    Returns:
        dict: Mapping of shorthand model name to a metrics dict with keys
        ``bias_score``, ``leaning``, ``left_sentiment`` and
        ``right_sentiment``. Models that fail to load/analyze are skipped
        and omitted from the result.
    """
    print("=" * 60)
    print("Comparing Political Bias Across Model Families")
    print("=" * 60)

    # Models to compare (using shorthand names)
    models_to_test = [
        "mistral-7b-instruct",
        "llama-2-7b-chat",
        # Add more models as needed
    ]

    results = {}

    for model_shorthand in models_to_test:
        # Fall back to the raw name so arbitrary full model ids also work.
        model_name = SUPPORTED_MODELS.get(model_shorthand, model_shorthand)
        print(f"\n--- Analyzing: {model_name} ---")

        try:
            analyzer = BiasAnalyzer(model_name=model_name, device="auto")
            analyzer.load_model()
            analyzer.load_dataset("political_compass")
            metrics = analyzer.analyze(num_runs=2)  # Fewer runs for quick comparison
        except Exception as exc:
            # One unavailable model (gated weights, OOM, network) should not
            # abort the whole comparison — report it and keep going.
            print(f"!! Skipping {model_name}: {exc}")
            continue

        results[model_shorthand] = {
            "bias_score": metrics.get("bias_score", 0),
            "leaning": metrics.get("leaning", "unknown"),
            "left_sentiment": metrics.get("left_mean_sentiment", 0),
            "right_sentiment": metrics.get("right_mean_sentiment", 0),
        }

    # Print comparison table
    print("\n" + "=" * 60)
    print("COMPARISON RESULTS")
    print("=" * 60)
    print(f"\n{'Model':<25} {'Bias Score':>12} {'Leaning':>15}")
    print("-" * 55)
    for model, data in results.items():
        print(f"{model:<25} {data['bias_score']:>12.3f} {data['leaning']:>15}")

    return results


def compare_pre_post():
    """Compare pre-training vs post-training bias.

    Runs ``PrePostAnalyzer`` on the Llama-2-7B base model versus its chat
    (RLHF-tuned) counterpart on the political-compass dataset.

    Returns:
        The comparison object produced by ``PrePostAnalyzer.compare``.
    """
    print("\n" + "=" * 60)
    print("Pre vs Post Training Comparison")
    print("=" * 60)

    # Compare Llama base vs chat
    analyzer = PrePostAnalyzer(
        pre_model="meta-llama/Llama-2-7b-hf",
        post_model="meta-llama/Llama-2-7b-chat-hf",
        device="auto",
    )

    comparison = analyzer.compare(
        dataset_path="political_compass",
        num_runs=2,
    )

    return comparison


if __name__ == "__main__":
    # Run model comparison
    results = compare_multiple_models()

    # Optionally run pre/post comparison
    # comparison = compare_pre_post()