Spaces:
Runtime error
Runtime error
Ahmed Ahmed
committed on
Commit
·
4864926
1
Parent(s):
de071e9
ok
Browse files
- app.py +27 -8
- src/about.py +29 -14
- src/evaluation/initialize_models.py +121 -0
- src/evaluation/model_trace_eval.py +142 -187
- src/leaderboard/read_evals.py +8 -0
- test_model_trace.py +0 -43
app.py
CHANGED
|
@@ -89,9 +89,13 @@ def run_perplexity_test(model_name, revision, precision):
|
|
| 89 |
import sys
|
| 90 |
import traceback
|
| 91 |
import gradio as gr
|
|
|
|
| 92 |
|
| 93 |
if not model_name:
|
| 94 |
-
return "Please
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
try:
|
| 97 |
# Use stderr for more reliable logging in HF Spaces
|
|
@@ -125,7 +129,7 @@ def run_perplexity_test(model_name, revision, precision):
|
|
| 125 |
|
| 126 |
🎉 **Results have been saved and both tables have been updated!**
|
| 127 |
|
| 128 |
-
Note
|
| 129 |
|
| 130 |
return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
|
| 131 |
else:
|
|
@@ -167,9 +171,17 @@ except Exception as e:
|
|
| 167 |
# Ensure local directory exists even if repo operations fail
|
| 168 |
os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
|
| 169 |
|
| 170 |
-
#
|
| 171 |
import sys
|
|
|
|
|
|
|
| 172 |
sys.stderr.write("\n🚀 STARTING GRADIO APP INITIALIZATION\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
sys.stderr.write("📊 Creating initial results DataFrame...\n")
|
| 174 |
sys.stderr.flush()
|
| 175 |
|
|
@@ -202,11 +214,17 @@ with demo:
|
|
| 202 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 203 |
|
| 204 |
with gr.TabItem("🧪 Test Model", elem_id="test-model-tab", id=2):
|
| 205 |
-
gr.Markdown("## Run Perplexity Test\n\nTest
|
|
|
|
|
|
|
| 206 |
|
| 207 |
with gr.Row():
|
| 208 |
with gr.Column():
|
| 209 |
-
model_name = gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
revision = gr.Textbox(label="Revision", placeholder="main", value="main")
|
| 211 |
precision = gr.Dropdown(
|
| 212 |
choices=["float16", "bfloat16"],
|
|
@@ -231,13 +249,14 @@ with demo:
|
|
| 231 |
### Tips:
|
| 232 |
- **Check stderr logs** in HF Spaces for detailed debugging information
|
| 233 |
- **Results will update automatically** in the table above after evaluation completes
|
| 234 |
-
- **
|
| 235 |
- **Lower perplexity scores = better performance** (better at predicting text)
|
|
|
|
| 236 |
|
| 237 |
### How it works:
|
| 238 |
-
1.
|
| 239 |
2. Click "Run Perplexity Test"
|
| 240 |
-
3. Wait for evaluation to complete (may take a few minutes for
|
| 241 |
4. Results will appear automatically in the table above!
|
| 242 |
""")
|
| 243 |
|
|
|
|
| 89 |
import sys
|
| 90 |
import traceback
|
| 91 |
import gradio as gr
|
| 92 |
+
from src.evaluation.initialize_models import is_model_allowed
|
| 93 |
|
| 94 |
if not model_name:
|
| 95 |
+
return "Please select a model.", gr.update(), gr.update()
|
| 96 |
+
|
| 97 |
+
if not is_model_allowed(model_name):
|
| 98 |
+
return f"❌ Model '{model_name}' is not in the allowed list. Please select from the dropdown.", gr.update(), gr.update()
|
| 99 |
|
| 100 |
try:
|
| 101 |
# Use stderr for more reliable logging in HF Spaces
|
|
|
|
| 129 |
|
| 130 |
🎉 **Results have been saved and both tables have been updated!**
|
| 131 |
|
| 132 |
+
⏰ **Note**: Model trace p-value computation runs a full model comparison analysis and may take 10-30 minutes per model. Progress will appear in the logs."""
|
| 133 |
|
| 134 |
return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
|
| 135 |
else:
|
|
|
|
| 171 |
# Ensure local directory exists even if repo operations fail
|
| 172 |
os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
|
| 173 |
|
| 174 |
+
# Initialize allowed models
|
| 175 |
import sys
|
| 176 |
+
from src.evaluation.initialize_models import initialize_allowed_models, get_allowed_models
|
| 177 |
+
|
| 178 |
sys.stderr.write("\n🚀 STARTING GRADIO APP INITIALIZATION\n")
|
| 179 |
+
sys.stderr.write("📊 Initializing allowed models...\n")
|
| 180 |
+
sys.stderr.flush()
|
| 181 |
+
|
| 182 |
+
# Initialize the allowed models
|
| 183 |
+
initialize_allowed_models()
|
| 184 |
+
|
| 185 |
sys.stderr.write("📊 Creating initial results DataFrame...\n")
|
| 186 |
sys.stderr.flush()
|
| 187 |
|
|
|
|
| 214 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 215 |
|
| 216 |
with gr.TabItem("🧪 Test Model", elem_id="test-model-tab", id=2):
|
| 217 |
+
gr.Markdown("## Run Perplexity Test\n\nTest one of the supported models for perplexity evaluation.")
|
| 218 |
+
|
| 219 |
+
allowed_models = get_allowed_models()
|
| 220 |
|
| 221 |
with gr.Row():
|
| 222 |
with gr.Column():
|
| 223 |
+
model_name = gr.Dropdown(
|
| 224 |
+
choices=allowed_models,
|
| 225 |
+
label="Model name",
|
| 226 |
+
value=allowed_models[0] if allowed_models else None
|
| 227 |
+
)
|
| 228 |
revision = gr.Textbox(label="Revision", placeholder="main", value="main")
|
| 229 |
precision = gr.Dropdown(
|
| 230 |
choices=["float16", "bfloat16"],
|
|
|
|
| 249 |
### Tips:
|
| 250 |
- **Check stderr logs** in HF Spaces for detailed debugging information
|
| 251 |
- **Results will update automatically** in the table above after evaluation completes
|
| 252 |
+
- **Available models**: Vicuna 7B v1.5, IBM Granite 7B Base, LLeMa 7B
|
| 253 |
- **Lower perplexity scores = better performance** (better at predicting text)
|
| 254 |
+
- **Model trace p-values are computed automatically** (may take 10-30 minutes)
|
| 255 |
|
| 256 |
### How it works:
|
| 257 |
+
1. Select a model from the dropdown
|
| 258 |
2. Click "Run Perplexity Test"
|
| 259 |
+
3. Wait for evaluation to complete (may take a few minutes for perplexity + longer for p-value)
|
| 260 |
4. Results will appear automatically in the table above!
|
| 261 |
""")
|
| 262 |
|
src/about.py
CHANGED
|
@@ -17,37 +17,48 @@ NUM_FEWSHOT = 0 # Not used for perplexity
|
|
| 17 |
# ---------------------------------------------------
|
| 18 |
|
| 19 |
# Your leaderboard name
|
| 20 |
-
TITLE = """<h1 align="center" id="space-title">Model
|
| 21 |
|
| 22 |
# What does your leaderboard evaluate?
|
| 23 |
INTRODUCTION_TEXT = """
|
| 24 |
-
This leaderboard evaluates language models based on their perplexity scores
|
| 25 |
-
structural similarity to
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
- **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
|
| 28 |
-
- **Match P-Value**: Lower p-values indicate the model preserves structural similarity to
|
| 29 |
"""
|
| 30 |
|
| 31 |
# Which evaluations are you running?
|
| 32 |
LLM_BENCHMARKS_TEXT = """
|
| 33 |
## How it works
|
| 34 |
|
| 35 |
-
The evaluation runs two types of analysis on language models:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
### 1. Perplexity Evaluation
|
| 38 |
Perplexity tests using a fixed test passage about artificial intelligence.
|
| 39 |
Perplexity measures how well a model predicts text - lower scores mean better predictions.
|
| 40 |
|
| 41 |
### 2. Model Tracing Analysis
|
| 42 |
-
Compares each model's internal structure to
|
| 43 |
-
- **Base Model**:
|
| 44 |
-
- **Comparison**:
|
| 45 |
- **Method**: Neuron matching analysis across transformer layers
|
| 46 |
- **Alignment**: Models are aligned before comparison using the Hungarian algorithm
|
| 47 |
-
- **Output**: P-value indicating structural similarity (lower = more similar to
|
| 48 |
|
| 49 |
The match statistic tests whether neurons in corresponding layers maintain similar functional roles
|
| 50 |
-
between the base model and
|
| 51 |
|
| 52 |
## Test Text
|
| 53 |
|
|
@@ -62,11 +73,15 @@ with these important social considerations.
|
|
| 62 |
"""
|
| 63 |
|
| 64 |
EVALUATION_QUEUE_TEXT = """
|
| 65 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
-
|
| 68 |
-
2. The model should be loadable with AutoModelForCausalLM
|
| 69 |
-
3. The model should support text generation tasks
|
| 70 |
"""
|
| 71 |
|
| 72 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
|
|
|
| 17 |
# ---------------------------------------------------
|
| 18 |
|
| 19 |
# Your leaderboard name
|
| 20 |
+
TITLE = """<h1 align="center" id="space-title">Model Tracing Leaderboard</h1>"""
|
| 21 |
|
| 22 |
# What does your leaderboard evaluate?
|
| 23 |
INTRODUCTION_TEXT = """
|
| 24 |
+
This leaderboard evaluates specific language models based on their perplexity scores and
|
| 25 |
+
structural similarity to Llama-2-7B using model tracing analysis.
|
| 26 |
|
| 27 |
+
**Models Evaluated:**
|
| 28 |
+
- `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
|
| 29 |
+
- `ibm-granite/granite-7b-base` - IBM Granite 7B Base
|
| 30 |
+
- `EleutherAI/llemma_7b` - LLeMa 7B
|
| 31 |
+
|
| 32 |
+
**Metrics:**
|
| 33 |
- **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
|
| 34 |
+
- **Match P-Value**: Lower p-values indicate the model preserves structural similarity to Llama-2-7B after fine-tuning (neuron organization is maintained).
|
| 35 |
"""
|
| 36 |
|
| 37 |
# Which evaluations are you running?
|
| 38 |
LLM_BENCHMARKS_TEXT = """
|
| 39 |
## How it works
|
| 40 |
|
| 41 |
+
The evaluation runs two types of analysis on the supported language models:
|
| 42 |
+
|
| 43 |
+
### Supported Models
|
| 44 |
+
- **Vicuna 7B v1.5** (`lmsys/vicuna-7b-v1.5`) - Chat-optimized LLaMA variant
|
| 45 |
+
- **IBM Granite 7B** (`ibm-granite/granite-7b-base`) - IBM's foundational language model
|
| 46 |
+
- **LLeMa 7B** (`EleutherAI/llemma_7b`) - EleutherAI's mathematical language model
|
| 47 |
|
| 48 |
### 1. Perplexity Evaluation
|
| 49 |
Perplexity tests using a fixed test passage about artificial intelligence.
|
| 50 |
Perplexity measures how well a model predicts text - lower scores mean better predictions.
|
| 51 |
|
| 52 |
### 2. Model Tracing Analysis
|
| 53 |
+
Compares each model's internal structure to Llama-2-7B using the "match" statistic:
|
| 54 |
+
- **Base Model**: Llama-2-7B (`meta-llama/Llama-2-7b-hf`)
|
| 55 |
+
- **Comparison Models**: The 3 supported models listed above
|
| 56 |
- **Method**: Neuron matching analysis across transformer layers
|
| 57 |
- **Alignment**: Models are aligned before comparison using the Hungarian algorithm
|
| 58 |
+
- **Output**: P-value indicating structural similarity (lower = more similar to Llama-2-7B)
|
| 59 |
|
| 60 |
The match statistic tests whether neurons in corresponding layers maintain similar functional roles
|
| 61 |
+
between the base model and the comparison models.
|
| 62 |
|
| 63 |
## Test Text
|
| 64 |
|
|
|
|
| 73 |
"""
|
| 74 |
|
| 75 |
EVALUATION_QUEUE_TEXT = """
|
| 76 |
+
## Testing Models
|
| 77 |
+
|
| 78 |
+
This leaderboard focuses on comparing specific models:
|
| 79 |
+
|
| 80 |
+
1. **Vicuna 7B v1.5** - Chat-optimized variant of LLaMA
|
| 81 |
+
2. **IBM Granite 7B Base** - IBM's foundational language model
|
| 82 |
+
3. **LLeMa 7B** - EleutherAI's mathematical language model
|
| 83 |
|
| 84 |
+
Use the "Test Model" tab to run perplexity evaluation on any of these models.
|
|
|
|
|
|
|
| 85 |
"""
|
| 86 |
|
| 87 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
src/evaluation/initialize_models.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Initialize the leaderboard with specific models and compute their p-values.
|
| 3 |
+
|
| 4 |
+
This module ensures only the specified models are included in the leaderboard
|
| 5 |
+
and their model trace p-values are computed.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import json
|
| 10 |
+
import sys
|
| 11 |
+
from src.evaluation.model_trace_eval import compute_model_trace_p_value
|
| 12 |
+
from src.envs import EVAL_RESULTS_PATH
|
| 13 |
+
|
| 14 |
+
# The specific models we want to include
|
| 15 |
+
ALLOWED_MODELS = [
|
| 16 |
+
"lmsys/vicuna-7b-v1.5",
|
| 17 |
+
"ibm-granite/granite-7b-base",
|
| 18 |
+
"EleutherAI/llemma_7b"
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
def create_model_result_file(model_name, precision="float16"):
|
| 22 |
+
"""
|
| 23 |
+
Create a result file for a model with computed p-value.
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
model_name: HuggingFace model identifier
|
| 27 |
+
precision: Model precision
|
| 28 |
+
"""
|
| 29 |
+
sys.stderr.write(f"\n🔧 CREATING RESULT FILE FOR: {model_name}\n")
|
| 30 |
+
sys.stderr.flush()
|
| 31 |
+
|
| 32 |
+
# Create the results directory if it doesn't exist
|
| 33 |
+
os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
|
| 34 |
+
|
| 35 |
+
# Generate a safe filename
|
| 36 |
+
safe_name = model_name.replace("/", "_").replace("-", "_")
|
| 37 |
+
result_file = os.path.join(EVAL_RESULTS_PATH, f"{safe_name}_{precision}.json")
|
| 38 |
+
|
| 39 |
+
sys.stderr.write(f"📁 Result file path: {result_file}\n")
|
| 40 |
+
sys.stderr.flush()
|
| 41 |
+
|
| 42 |
+
# Check if file already exists
|
| 43 |
+
if os.path.exists(result_file):
|
| 44 |
+
sys.stderr.write(f"✅ Result file already exists: {result_file}\n")
|
| 45 |
+
sys.stderr.flush()
|
| 46 |
+
return result_file
|
| 47 |
+
|
| 48 |
+
# Create basic result structure
|
| 49 |
+
result_data = {
|
| 50 |
+
"config": {
|
| 51 |
+
"model_dtype": f"torch.{precision}",
|
| 52 |
+
"model_name": model_name,
|
| 53 |
+
"model_sha": "main"
|
| 54 |
+
},
|
| 55 |
+
"results": {
|
| 56 |
+
"perplexity": {
|
| 57 |
+
"perplexity": None # Will be populated when user tests
|
| 58 |
+
}
|
| 59 |
+
}
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
# Save the result file
|
| 63 |
+
try:
|
| 64 |
+
with open(result_file, 'w') as f:
|
| 65 |
+
json.dump(result_data, f, indent=2)
|
| 66 |
+
|
| 67 |
+
sys.stderr.write(f"✅ Created result file: {result_file}\n")
|
| 68 |
+
sys.stderr.flush()
|
| 69 |
+
return result_file
|
| 70 |
+
|
| 71 |
+
except Exception as e:
|
| 72 |
+
sys.stderr.write(f"❌ Failed to create result file: {e}\n")
|
| 73 |
+
sys.stderr.flush()
|
| 74 |
+
return None
|
| 75 |
+
|
| 76 |
+
def initialize_allowed_models():
|
| 77 |
+
"""
|
| 78 |
+
Initialize result files for all allowed models.
|
| 79 |
+
"""
|
| 80 |
+
sys.stderr.write(f"\n🚀 INITIALIZING ALLOWED MODELS\n")
|
| 81 |
+
sys.stderr.write(f"📋 Models to initialize: {ALLOWED_MODELS}\n")
|
| 82 |
+
sys.stderr.flush()
|
| 83 |
+
|
| 84 |
+
created_files = []
|
| 85 |
+
|
| 86 |
+
for model_name in ALLOWED_MODELS:
|
| 87 |
+
try:
|
| 88 |
+
result_file = create_model_result_file(model_name)
|
| 89 |
+
if result_file:
|
| 90 |
+
created_files.append(result_file)
|
| 91 |
+
|
| 92 |
+
except Exception as e:
|
| 93 |
+
sys.stderr.write(f"❌ Failed to initialize {model_name}: {e}\n")
|
| 94 |
+
sys.stderr.flush()
|
| 95 |
+
continue
|
| 96 |
+
|
| 97 |
+
sys.stderr.write(f"✅ Initialized {len(created_files)} model result files\n")
|
| 98 |
+
sys.stderr.flush()
|
| 99 |
+
|
| 100 |
+
return created_files
|
| 101 |
+
|
| 102 |
+
def is_model_allowed(model_name):
|
| 103 |
+
"""
|
| 104 |
+
Check if a model is in the allowed list.
|
| 105 |
+
|
| 106 |
+
Args:
|
| 107 |
+
model_name: HuggingFace model identifier
|
| 108 |
+
|
| 109 |
+
Returns:
|
| 110 |
+
bool: True if model is allowed
|
| 111 |
+
"""
|
| 112 |
+
return model_name in ALLOWED_MODELS
|
| 113 |
+
|
| 114 |
+
def get_allowed_models():
|
| 115 |
+
"""
|
| 116 |
+
Get the list of allowed models.
|
| 117 |
+
|
| 118 |
+
Returns:
|
| 119 |
+
list: List of allowed model names
|
| 120 |
+
"""
|
| 121 |
+
return ALLOWED_MODELS.copy()
|
src/evaluation/model_trace_eval.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
"""
|
| 2 |
Model tracing evaluation for computing p-values from neuron matching statistics.
|
| 3 |
|
| 4 |
-
This module runs the model-tracing comparison
|
| 5 |
-
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
|
@@ -10,49 +10,26 @@ import sys
|
|
| 10 |
import subprocess
|
| 11 |
import tempfile
|
| 12 |
import pickle
|
| 13 |
-
import
|
| 14 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 15 |
|
| 16 |
-
#
|
| 17 |
model_tracing_path = os.path.join(os.path.dirname(__file__), '../../model-tracing')
|
| 18 |
-
|
| 19 |
-
sys.path.append(model_tracing_path)
|
| 20 |
|
| 21 |
-
sys.stderr.write("🔧
|
| 22 |
-
sys.stderr.
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
sys.stderr.write(" - Importing tracing.utils.llama.model...\n")
|
| 26 |
-
from tracing.utils.llama.model import permute_model, rotate_model
|
| 27 |
-
|
| 28 |
-
sys.stderr.write(" - Importing tracing.utils.llama.matching...\n")
|
| 29 |
-
from tracing.utils.llama.matching import align_model
|
| 30 |
-
|
| 31 |
-
sys.stderr.write(" - Importing tracing.utils.evaluate...\n")
|
| 32 |
-
from tracing.utils.evaluate import prepare_hf_dataset, prepare_hf_dataloader
|
| 33 |
-
|
| 34 |
-
sys.stderr.write(" - Importing tracing.utils.utils...\n")
|
| 35 |
-
from tracing.utils.utils import manual_seed
|
| 36 |
-
|
| 37 |
-
sys.stderr.write(" - Importing tracing.statistics.match...\n")
|
| 38 |
-
from tracing.statistics.match import statistic as match_stat
|
| 39 |
-
|
| 40 |
-
MODEL_TRACING_AVAILABLE = True
|
| 41 |
-
sys.stderr.write("✅ ALL MODEL TRACING IMPORTS SUCCESSFUL\n")
|
| 42 |
-
|
| 43 |
-
except ImportError as e:
|
| 44 |
-
sys.stderr.write(f"❌ MODEL TRACING IMPORTS FAILED: {e}\n")
|
| 45 |
-
import traceback
|
| 46 |
-
sys.stderr.write(f"Full import traceback:\n{traceback.format_exc()}\n")
|
| 47 |
-
MODEL_TRACING_AVAILABLE = False
|
| 48 |
-
|
| 49 |
sys.stderr.write(f"🎯 Final MODEL_TRACING_AVAILABLE = {MODEL_TRACING_AVAILABLE}\n")
|
| 50 |
sys.stderr.flush()
|
| 51 |
|
| 52 |
|
| 53 |
def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"):
|
| 54 |
"""
|
| 55 |
-
Run model tracing analysis
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
Args:
|
| 58 |
ft_model_name: HuggingFace model identifier for the fine-tuned model
|
|
@@ -61,197 +38,175 @@ def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"
|
|
| 61 |
|
| 62 |
Returns:
|
| 63 |
tuple: (success: bool, result: float or error_message)
|
| 64 |
-
If success, result is the aggregate p-value
|
| 65 |
If failure, result is error message
|
| 66 |
"""
|
| 67 |
|
| 68 |
if not MODEL_TRACING_AVAILABLE:
|
| 69 |
-
return False, "Model tracing
|
| 70 |
|
| 71 |
try:
|
| 72 |
-
sys.stderr.write(f"\n=== RUNNING MODEL TRACE ANALYSIS ===\n")
|
| 73 |
-
sys.stderr.write(f"Base model:
|
| 74 |
sys.stderr.write(f"Fine-tuned model: {ft_model_name}\n")
|
| 75 |
sys.stderr.write(f"Revision: {revision}\n")
|
| 76 |
sys.stderr.write(f"Precision: {precision}\n")
|
| 77 |
sys.stderr.flush()
|
| 78 |
|
| 79 |
-
#
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
# Determine dtype
|
| 83 |
-
if precision == "bfloat16":
|
| 84 |
-
dtype = torch.bfloat16
|
| 85 |
-
else:
|
| 86 |
-
dtype = torch.float16
|
| 87 |
-
|
| 88 |
-
# Load base model (gpt2)
|
| 89 |
-
base_model_id = "openai-community/gpt2"
|
| 90 |
-
sys.stderr.write(f"🤖 Loading base model: {base_model_id}\n")
|
| 91 |
-
sys.stderr.write(f" - dtype: {dtype}\n")
|
| 92 |
-
sys.stderr.write(f" - low_cpu_mem_usage: True\n")
|
| 93 |
-
sys.stderr.flush()
|
| 94 |
-
|
| 95 |
-
try:
|
| 96 |
-
base_model = AutoModelForCausalLM.from_pretrained(
|
| 97 |
-
base_model_id,
|
| 98 |
-
torch_dtype=dtype,
|
| 99 |
-
low_cpu_mem_usage=True
|
| 100 |
-
)
|
| 101 |
-
sys.stderr.write("✅ Base model loaded successfully\n")
|
| 102 |
-
except Exception as e:
|
| 103 |
-
sys.stderr.write(f"❌ Failed to load base model: {e}\n")
|
| 104 |
-
raise
|
| 105 |
-
|
| 106 |
-
try:
|
| 107 |
-
base_tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)
|
| 108 |
-
sys.stderr.write("✅ Base tokenizer loaded successfully\n")
|
| 109 |
-
except Exception as e:
|
| 110 |
-
sys.stderr.write(f"❌ Failed to load base tokenizer: {e}\n")
|
| 111 |
-
raise
|
| 112 |
-
|
| 113 |
-
# Load fine-tuned model
|
| 114 |
-
sys.stderr.write(f"🤖 Loading fine-tuned model: {ft_model_name}\n")
|
| 115 |
-
sys.stderr.write(f" - revision: {revision}\n")
|
| 116 |
-
sys.stderr.write(f" - dtype: {dtype}\n")
|
| 117 |
-
sys.stderr.write(f" - low_cpu_mem_usage: True\n")
|
| 118 |
-
sys.stderr.flush()
|
| 119 |
-
|
| 120 |
-
try:
|
| 121 |
-
ft_model = AutoModelForCausalLM.from_pretrained(
|
| 122 |
-
ft_model_name,
|
| 123 |
-
revision=revision,
|
| 124 |
-
torch_dtype=dtype,
|
| 125 |
-
low_cpu_mem_usage=True
|
| 126 |
-
)
|
| 127 |
-
sys.stderr.write("✅ Fine-tuned model loaded successfully\n")
|
| 128 |
-
except Exception as e:
|
| 129 |
-
sys.stderr.write(f"❌ Failed to load fine-tuned model: {e}\n")
|
| 130 |
-
raise
|
| 131 |
-
|
| 132 |
-
try:
|
| 133 |
-
ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_name, revision=revision, use_fast=False)
|
| 134 |
-
sys.stderr.write("✅ Fine-tuned tokenizer loaded successfully\n")
|
| 135 |
-
except Exception as e:
|
| 136 |
-
sys.stderr.write(f"❌ Failed to load fine-tuned tokenizer: {e}\n")
|
| 137 |
-
raise
|
| 138 |
-
|
| 139 |
-
sys.stderr.write("🎯 ALL MODELS AND TOKENIZERS LOADED SUCCESSFULLY\n")
|
| 140 |
-
|
| 141 |
-
# Show memory info if available
|
| 142 |
-
if torch.cuda.is_available():
|
| 143 |
-
memory_allocated = torch.cuda.memory_allocated() / 1024**3 # GB
|
| 144 |
-
memory_reserved = torch.cuda.memory_reserved() / 1024**3 # GB
|
| 145 |
-
sys.stderr.write(f"💾 GPU Memory - Allocated: {memory_allocated:.2f}GB, Reserved: {memory_reserved:.2f}GB\n")
|
| 146 |
|
|
|
|
| 147 |
sys.stderr.flush()
|
| 148 |
|
| 149 |
-
#
|
| 150 |
-
|
| 151 |
-
sys.stderr.flush()
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
-
|
| 162 |
-
sys.stderr.write("Running model alignment...\n")
|
| 163 |
sys.stderr.flush()
|
| 164 |
|
|
|
|
|
|
|
| 165 |
try:
|
| 166 |
-
|
| 167 |
-
sys.stderr.write("
|
| 168 |
-
|
| 169 |
-
sys.stderr.write(f"Model alignment failed: {e}\n")
|
| 170 |
-
sys.stderr.write("Continuing without alignment...\n")
|
| 171 |
-
sys.stderr.flush()
|
| 172 |
-
|
| 173 |
-
# Run match statistic
|
| 174 |
-
sys.stderr.write("Computing match statistic...\n")
|
| 175 |
-
sys.stderr.flush()
|
| 176 |
-
|
| 177 |
-
# Get number of layers for the models
|
| 178 |
-
if hasattr(base_model, 'transformer') and hasattr(base_model.transformer, 'h'):
|
| 179 |
-
# GPT-2 style
|
| 180 |
-
n_blocks = len(base_model.transformer.h)
|
| 181 |
-
elif hasattr(base_model, 'model') and hasattr(base_model.model, 'layers'):
|
| 182 |
-
# LLaMA style
|
| 183 |
-
n_blocks = len(base_model.model.layers)
|
| 184 |
-
else:
|
| 185 |
-
# Default fallback
|
| 186 |
-
n_blocks = 12 # GPT-2 base has 12 layers
|
| 187 |
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
|
|
|
| 194 |
|
| 195 |
-
|
| 196 |
-
n_blocks = min(n_blocks, ft_n_blocks)
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
p_values = match_stat(base_model, ft_model, dataloader, n_blocks=n_blocks)
|
| 204 |
-
except Exception as e:
|
| 205 |
-
sys.stderr.write(f"Match statistic computation failed: {e}\n")
|
| 206 |
sys.stderr.flush()
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
sys.stderr.write("No valid p-values found, returning default\n")
|
| 218 |
sys.stderr.flush()
|
| 219 |
-
return True, 1.0
|
| 220 |
|
| 221 |
-
#
|
| 222 |
-
from tracing.utils.utils import fisher
|
| 223 |
try:
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
except Exception as e:
|
| 226 |
-
sys.stderr.write(f"
|
| 227 |
sys.stderr.flush()
|
| 228 |
-
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
-
sys.stderr.write(f"
|
| 232 |
sys.stderr.write("=== MODEL TRACE ANALYSIS COMPLETED ===\n")
|
| 233 |
sys.stderr.flush()
|
| 234 |
|
| 235 |
-
# Clean up memory
|
| 236 |
-
del base_model
|
| 237 |
-
del ft_model
|
| 238 |
-
torch.cuda.empty_cache() if torch.cuda.is_available() else None
|
| 239 |
-
|
| 240 |
return True, aggregate_p_value
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
except Exception as e:
|
| 243 |
error_msg = str(e)
|
| 244 |
-
sys.stderr.write(f"Error in model trace analysis: {error_msg}\n")
|
| 245 |
import traceback
|
| 246 |
sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
|
| 247 |
sys.stderr.flush()
|
| 248 |
-
|
| 249 |
-
# Clean up memory even on error
|
| 250 |
-
try:
|
| 251 |
-
torch.cuda.empty_cache() if torch.cuda.is_available() else None
|
| 252 |
-
except:
|
| 253 |
-
pass
|
| 254 |
-
|
| 255 |
return False, error_msg
|
| 256 |
|
| 257 |
|
|
|
|
| 1 |
"""
|
| 2 |
Model tracing evaluation for computing p-values from neuron matching statistics.
|
| 3 |
|
| 4 |
+
This module runs the model-tracing comparison using the main.py script from model-tracing
|
| 5 |
+
to determine structural similarity via p-value analysis.
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
|
|
|
| 10 |
import subprocess
|
| 11 |
import tempfile
|
| 12 |
import pickle
|
| 13 |
+
import statistics
|
|
|
|
| 14 |
|
| 15 |
+
# Check if model-tracing directory exists
|
| 16 |
model_tracing_path = os.path.join(os.path.dirname(__file__), '../../model-tracing')
|
| 17 |
+
MODEL_TRACING_AVAILABLE = os.path.exists(model_tracing_path) and os.path.exists(os.path.join(model_tracing_path, 'main.py'))
|
|
|
|
| 18 |
|
| 19 |
+
sys.stderr.write("🔧 CHECKING MODEL TRACING AVAILABILITY...\n")
|
| 20 |
+
sys.stderr.write(f" - Model tracing path: {model_tracing_path}\n")
|
| 21 |
+
sys.stderr.write(f" - Path exists: {os.path.exists(model_tracing_path)}\n")
|
| 22 |
+
sys.stderr.write(f" - main.py exists: {os.path.exists(os.path.join(model_tracing_path, 'main.py'))}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
sys.stderr.write(f"🎯 Final MODEL_TRACING_AVAILABLE = {MODEL_TRACING_AVAILABLE}\n")
|
| 24 |
sys.stderr.flush()
|
| 25 |
|
| 26 |
|
| 27 |
def run_model_trace_analysis(ft_model_name, revision="main", precision="float16"):
|
| 28 |
"""
|
| 29 |
+
Run model tracing analysis using the main.py script from model-tracing directory.
|
| 30 |
+
|
| 31 |
+
Runs the exact command:
|
| 32 |
+
python main.py --base_model_id meta-llama/Llama-2-7b-hf --ft_model_id <ft_model_name> --stat match --align
|
| 33 |
|
| 34 |
Args:
|
| 35 |
ft_model_name: HuggingFace model identifier for the fine-tuned model
|
|
|
|
| 38 |
|
| 39 |
Returns:
|
| 40 |
tuple: (success: bool, result: float or error_message)
|
| 41 |
+
If success, result is the aggregate p-value from aligned test stat
|
| 42 |
If failure, result is error message
|
| 43 |
"""
|
| 44 |
|
| 45 |
if not MODEL_TRACING_AVAILABLE:
|
| 46 |
+
return False, "Model tracing main.py script not available"
|
| 47 |
|
| 48 |
try:
|
| 49 |
+
sys.stderr.write(f"\n=== RUNNING MODEL TRACE ANALYSIS VIA SUBPROCESS ===\n")
|
| 50 |
+
sys.stderr.write(f"Base model: meta-llama/Llama-2-7b-hf\n")
|
| 51 |
sys.stderr.write(f"Fine-tuned model: {ft_model_name}\n")
|
| 52 |
sys.stderr.write(f"Revision: {revision}\n")
|
| 53 |
sys.stderr.write(f"Precision: {precision}\n")
|
| 54 |
sys.stderr.flush()
|
| 55 |
|
| 56 |
+
# Create a temporary file for results
|
| 57 |
+
with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as tmp_file:
|
| 58 |
+
tmp_results_path = tmp_file.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
+
sys.stderr.write(f"📁 Temporary results file: {tmp_results_path}\n")
|
| 61 |
sys.stderr.flush()
|
| 62 |
|
| 63 |
+
# Build the command exactly as user specified
|
| 64 |
+
base_model_id = "meta-llama/Llama-2-7b-hf"
|
|
|
|
| 65 |
|
| 66 |
+
# Build the command
|
| 67 |
+
cmd = [
|
| 68 |
+
"python", "main.py",
|
| 69 |
+
"--base_model_id", base_model_id,
|
| 70 |
+
"--ft_model_id", ft_model_name,
|
| 71 |
+
"--stat", "match",
|
| 72 |
+
"--save", tmp_results_path
|
| 73 |
+
]
|
| 74 |
|
| 75 |
+
# Add revision if not main/default
|
| 76 |
+
if revision and revision != "main":
|
| 77 |
+
# Note: main.py doesn't seem to have a revision flag, but we log it for reference
|
| 78 |
+
sys.stderr.write(f"⚠️ Note: Revision '{revision}' specified but main.py doesn't support --revision flag\n")
|
| 79 |
+
sys.stderr.flush()
|
| 80 |
|
| 81 |
+
sys.stderr.write(f"🚀 Running command: {' '.join(cmd)}\n")
|
|
|
|
| 82 |
sys.stderr.flush()
|
| 83 |
|
| 84 |
+
# Change to model-tracing directory and run the command
|
| 85 |
+
original_cwd = os.getcwd()
|
| 86 |
try:
|
| 87 |
+
os.chdir(model_tracing_path)
|
| 88 |
+
sys.stderr.write(f"📂 Changed to directory: {model_tracing_path}\n")
|
| 89 |
+
sys.stderr.flush()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
+
# Run the subprocess
|
| 92 |
+
result = subprocess.run(
|
| 93 |
+
cmd,
|
| 94 |
+
capture_output=True,
|
| 95 |
+
text=True,
|
| 96 |
+
timeout=3600 # 1 hour timeout
|
| 97 |
+
)
|
| 98 |
|
| 99 |
+
sys.stderr.write(f"📊 Subprocess completed with return code: {result.returncode}\n")
|
|
|
|
| 100 |
|
| 101 |
+
# Log stdout and stderr from the subprocess
|
| 102 |
+
if result.stdout:
|
| 103 |
+
sys.stderr.write(f"📝 STDOUT from model tracing:\n{result.stdout}\n")
|
| 104 |
+
if result.stderr:
|
| 105 |
+
sys.stderr.write(f"⚠️ STDERR from model tracing:\n{result.stderr}\n")
|
|
|
|
|
|
|
|
|
|
| 106 |
sys.stderr.flush()
|
| 107 |
+
|
| 108 |
+
if result.returncode != 0:
|
| 109 |
+
error_msg = f"Model tracing script failed with return code {result.returncode}"
|
| 110 |
+
if result.stderr:
|
| 111 |
+
error_msg += f"\nSTDERR: {result.stderr}"
|
| 112 |
+
return False, error_msg
|
| 113 |
+
|
| 114 |
+
finally:
|
| 115 |
+
os.chdir(original_cwd)
|
| 116 |
+
sys.stderr.write(f"📂 Changed back to directory: {original_cwd}\n")
|
|
|
|
| 117 |
sys.stderr.flush()
|
|
|
|
| 118 |
|
| 119 |
+
# Load and parse the results
|
|
|
|
| 120 |
try:
|
| 121 |
+
sys.stderr.write(f"📖 Loading results from: {tmp_results_path}\n")
|
| 122 |
+
sys.stderr.flush()
|
| 123 |
+
|
| 124 |
+
with open(tmp_results_path, 'rb') as f:
|
| 125 |
+
results = pickle.load(f)
|
| 126 |
+
|
| 127 |
+
sys.stderr.write(f"✅ Results loaded successfully\n")
|
| 128 |
+
sys.stderr.write(f"📋 Available result keys: {list(results.keys())}\n")
|
| 129 |
+
sys.stderr.flush()
|
| 130 |
+
|
| 131 |
+
# Get the aligned test stat (this is what we want with --align flag)
|
| 132 |
+
if "aligned test stat" in results:
|
| 133 |
+
aligned_stat = results["aligned test stat"]
|
| 134 |
+
sys.stderr.write(f"📊 Aligned test stat: {aligned_stat}\n")
|
| 135 |
+
sys.stderr.write(f"📊 Type: {type(aligned_stat)}\n")
|
| 136 |
+
|
| 137 |
+
# The match statistic returns a list of p-values per layer
|
| 138 |
+
if isinstance(aligned_stat, list):
|
| 139 |
+
sys.stderr.write(f"📊 List of {len(aligned_stat)} p-values: {aligned_stat}\n")
|
| 140 |
+
|
| 141 |
+
# Filter valid p-values
|
| 142 |
+
valid_p_values = [p for p in aligned_stat if p is not None and isinstance(p, (int, float)) and 0 <= p <= 1]
|
| 143 |
+
sys.stderr.write(f"📊 Valid p-values: {len(valid_p_values)}/{len(aligned_stat)}\n")
|
| 144 |
+
|
| 145 |
+
if valid_p_values:
|
| 146 |
+
# Use median as the representative p-value
|
| 147 |
+
aggregate_p_value = statistics.median(valid_p_values)
|
| 148 |
+
sys.stderr.write(f"📊 Using median p-value: {aggregate_p_value}\n")
|
| 149 |
+
else:
|
| 150 |
+
sys.stderr.write("⚠️ No valid p-values found, using default\n")
|
| 151 |
+
aggregate_p_value = 1.0
|
| 152 |
+
|
| 153 |
+
elif isinstance(aligned_stat, (int, float)):
|
| 154 |
+
aggregate_p_value = float(aligned_stat)
|
| 155 |
+
sys.stderr.write(f"📊 Using single p-value: {aggregate_p_value}\n")
|
| 156 |
+
else:
|
| 157 |
+
sys.stderr.write(f"⚠️ Unexpected aligned_stat type: {type(aligned_stat)}, using default\n")
|
| 158 |
+
aggregate_p_value = 1.0
|
| 159 |
+
|
| 160 |
+
else:
|
| 161 |
+
sys.stderr.write("⚠️ No 'aligned test stat' found in results, checking non-aligned\n")
|
| 162 |
+
if "non-aligned test stat" in results:
|
| 163 |
+
non_aligned_stat = results["non-aligned test stat"]
|
| 164 |
+
sys.stderr.write(f"📊 Using non-aligned test stat: {non_aligned_stat}\n")
|
| 165 |
+
|
| 166 |
+
if isinstance(non_aligned_stat, list):
|
| 167 |
+
valid_p_values = [p for p in non_aligned_stat if p is not None and isinstance(p, (int, float)) and 0 <= p <= 1]
|
| 168 |
+
if valid_p_values:
|
| 169 |
+
aggregate_p_value = statistics.median(valid_p_values)
|
| 170 |
+
else:
|
| 171 |
+
aggregate_p_value = 1.0
|
| 172 |
+
else:
|
| 173 |
+
aggregate_p_value = float(non_aligned_stat) if isinstance(non_aligned_stat, (int, float)) else 1.0
|
| 174 |
+
else:
|
| 175 |
+
sys.stderr.write("❌ No test stat found in results\n")
|
| 176 |
+
return False, "No test statistic found in results"
|
| 177 |
+
|
| 178 |
+
sys.stderr.flush()
|
| 179 |
+
|
| 180 |
except Exception as e:
|
| 181 |
+
sys.stderr.write(f"❌ Failed to load results: {e}\n")
|
| 182 |
sys.stderr.flush()
|
| 183 |
+
return False, f"Failed to load results: {e}"
|
| 184 |
+
|
| 185 |
+
finally:
|
| 186 |
+
# Clean up temporary file
|
| 187 |
+
try:
|
| 188 |
+
os.unlink(tmp_results_path)
|
| 189 |
+
sys.stderr.write(f"🗑️ Cleaned up temporary file: {tmp_results_path}\n")
|
| 190 |
+
except:
|
| 191 |
+
pass
|
| 192 |
|
| 193 |
+
sys.stderr.write(f"✅ Final aggregate p-value: {aggregate_p_value}\n")
|
| 194 |
sys.stderr.write("=== MODEL TRACE ANALYSIS COMPLETED ===\n")
|
| 195 |
sys.stderr.flush()
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
return True, aggregate_p_value
|
| 198 |
|
| 199 |
+
except subprocess.TimeoutExpired:
|
| 200 |
+
sys.stderr.write("❌ Model tracing analysis timed out after 1 hour\n")
|
| 201 |
+
sys.stderr.flush()
|
| 202 |
+
return False, "Analysis timed out"
|
| 203 |
+
|
| 204 |
except Exception as e:
|
| 205 |
error_msg = str(e)
|
| 206 |
+
sys.stderr.write(f"💥 Error in model trace analysis: {error_msg}\n")
|
| 207 |
import traceback
|
| 208 |
sys.stderr.write(f"Traceback: {traceback.format_exc()}\n")
|
| 209 |
sys.stderr.flush()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
return False, error_msg
|
| 211 |
|
| 212 |
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -8,6 +8,7 @@ from src.display.formatting import make_clickable_model
|
|
| 8 |
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
| 9 |
from src.submission.check_validity import is_model_on_hub
|
| 10 |
from src.evaluation.model_trace_eval import compute_model_trace_p_value
|
|
|
|
| 11 |
|
| 12 |
@dataclass
|
| 13 |
class EvalResult:
|
|
@@ -236,6 +237,13 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
|
|
| 236 |
try:
|
| 237 |
sys.stderr.write(f"\nConverting result to dict for: {v.full_model}\n")
|
| 238 |
sys.stderr.flush()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
v.to_dict() # we test if the dict version is complete
|
| 240 |
results.append(v)
|
| 241 |
sys.stderr.write("Successfully converted and added result\n")
|
|
|
|
| 8 |
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
| 9 |
from src.submission.check_validity import is_model_on_hub
|
| 10 |
from src.evaluation.model_trace_eval import compute_model_trace_p_value
|
| 11 |
+
from src.evaluation.initialize_models import is_model_allowed
|
| 12 |
|
| 13 |
@dataclass
|
| 14 |
class EvalResult:
|
|
|
|
| 237 |
try:
|
| 238 |
sys.stderr.write(f"\nConverting result to dict for: {v.full_model}\n")
|
| 239 |
sys.stderr.flush()
|
| 240 |
+
|
| 241 |
+
# Filter to only allowed models
|
| 242 |
+
if not is_model_allowed(v.full_model):
|
| 243 |
+
sys.stderr.write(f"⏭️ Skipping non-allowed model: {v.full_model}\n")
|
| 244 |
+
sys.stderr.flush()
|
| 245 |
+
continue
|
| 246 |
+
|
| 247 |
v.to_dict() # we test if the dict version is complete
|
| 248 |
results.append(v)
|
| 249 |
sys.stderr.write("Successfully converted and added result\n")
|
test_model_trace.py
DELETED
|
@@ -1,43 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Test script for model tracing integration.
|
| 4 |
-
Tests the p-value computation for a simple model comparison.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import sys
|
| 8 |
-
import os
|
| 9 |
-
|
| 10 |
-
# Add src to path
|
| 11 |
-
sys.path.append('src')
|
| 12 |
-
|
| 13 |
-
from evaluation.model_trace_eval import compute_model_trace_p_value
|
| 14 |
-
|
| 15 |
-
def test_model_trace():
|
| 16 |
-
"""Test the model trace p-value computation with a simple example."""
|
| 17 |
-
|
| 18 |
-
print("Testing model trace p-value computation...")
|
| 19 |
-
|
| 20 |
-
# Test with a simple model (should be fast)
|
| 21 |
-
test_model = "openai-community/gpt2"
|
| 22 |
-
|
| 23 |
-
print(f"Computing p-value for {test_model} vs GPT-2...")
|
| 24 |
-
|
| 25 |
-
try:
|
| 26 |
-
p_value = compute_model_trace_p_value(test_model, "main", "float16")
|
| 27 |
-
|
| 28 |
-
if p_value is not None:
|
| 29 |
-
print(f"✅ Success! P-value: {p_value}")
|
| 30 |
-
if 0 <= p_value <= 1:
|
| 31 |
-
print("✅ P-value is in valid range [0, 1]")
|
| 32 |
-
else:
|
| 33 |
-
print(f"⚠️ Warning: P-value {p_value} is outside expected range [0, 1]")
|
| 34 |
-
else:
|
| 35 |
-
print("❌ Failed: P-value is None")
|
| 36 |
-
|
| 37 |
-
except Exception as e:
|
| 38 |
-
print(f"❌ Error: {e}")
|
| 39 |
-
import traceback
|
| 40 |
-
traceback.print_exc()
|
| 41 |
-
|
| 42 |
-
if __name__ == "__main__":
|
| 43 |
-
test_model_trace()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|