Delete scripts
Browse files- scripts/README.md +0 -109
- scripts/generate_data.py +0 -198
- scripts/train.py +0 -30
scripts/README.md
DELETED
|
@@ -1,109 +0,0 @@
|
|
| 1 |
-
# Scripts Directory
|
| 2 |
-
|
| 3 |
-
This directory contains utility scripts for the Pico training framework.
|
| 4 |
-
|
| 5 |
-
## generate_data.py
|
| 6 |
-
|
| 7 |
-
A script to automatically generate `data.json` from training log files for the dashboard.
|
| 8 |
-
|
| 9 |
-
### What it does
|
| 10 |
-
|
| 11 |
-
This script parses log files from the `runs/` directory and extracts:
|
| 12 |
-
- **Training metrics**: Loss, learning rate, and inf/NaN counts at each step
|
| 13 |
-
- **Evaluation results**: Paloma evaluation metrics
|
| 14 |
-
- **Model configuration**: Architecture parameters (d_model, n_layers, etc.)
|
| 15 |
-
|
| 16 |
-
### Usage
|
| 17 |
-
|
| 18 |
-
```bash
|
| 19 |
-
# Generate data.json from the default runs directory
|
| 20 |
-
python scripts/generate_data.py
|
| 21 |
-
|
| 22 |
-
# Specify custom runs directory
|
| 23 |
-
python scripts/generate_data.py --runs-dir /path/to/runs
|
| 24 |
-
|
| 25 |
-
# Specify custom output file
|
| 26 |
-
python scripts/generate_data.py --output /path/to/output.json
|
| 27 |
-
```
|
| 28 |
-
|
| 29 |
-
### How it works
|
| 30 |
-
|
| 31 |
-
1. **Scans runs directory**: Looks for subdirectories containing training runs
|
| 32 |
-
2. **Finds log files**: Locates `.log` files in each run's `logs/` subdirectory
|
| 33 |
-
3. **Parses log content**: Uses regex patterns to extract structured data
|
| 34 |
-
4. **Generates JSON**: Creates a structured JSON file for the dashboard
|
| 35 |
-
|
| 36 |
-
### Log Format Requirements
|
| 37 |
-
|
| 38 |
-
The script expects log files with the following format:
|
| 39 |
-
|
| 40 |
-
```
|
| 41 |
-
2025-08-29 02:09:12 - pico-train - INFO - Step 500 -- π Training Metrics
|
| 42 |
-
2025-08-29 02:09:12 - pico-train - INFO - βββ Loss: 10.8854
|
| 43 |
-
2025-08-29 02:09:12 - pico-train - INFO - βββ Learning Rate: 3.13e-06
|
| 44 |
-
2025-08-29 02:09:12 - pico-train - INFO - βββ Inf/NaN count: 0
|
| 45 |
-
```
|
| 46 |
-
|
| 47 |
-
And evaluation results:
|
| 48 |
-
|
| 49 |
-
```
|
| 50 |
-
2025-08-29 02:15:26 - pico-train - INFO - Step 1000 -- π Evaluation Results
|
| 51 |
-
2025-08-29 02:15:26 - pico-train - INFO - βββ paloma: 7.125172406420199e+27
|
| 52 |
-
```
|
| 53 |
-
|
| 54 |
-
### Output Format
|
| 55 |
-
|
| 56 |
-
The generated `data.json` has this structure:
|
| 57 |
-
|
| 58 |
-
```json
|
| 59 |
-
{
|
| 60 |
-
"runs": [
|
| 61 |
-
{
|
| 62 |
-
"run_name": "model-name",
|
| 63 |
-
"log_file": "log_filename.log",
|
| 64 |
-
"training_metrics": [
|
| 65 |
-
{
|
| 66 |
-
"step": 0,
|
| 67 |
-
"loss": 10.9914,
|
| 68 |
-
"learning_rate": 0.0,
|
| 69 |
-
"inf_nan_count": 0
|
| 70 |
-
}
|
| 71 |
-
],
|
| 72 |
-
"evaluation_results": [
|
| 73 |
-
{
|
| 74 |
-
"step": 1000,
|
| 75 |
-
"paloma": 59434.76600609756
|
| 76 |
-
}
|
| 77 |
-
],
|
| 78 |
-
"config": {
|
| 79 |
-
"d_model": 96,
|
| 80 |
-
"n_layers": 12,
|
| 81 |
-
"max_seq_len": 2048,
|
| 82 |
-
"vocab_size": 50304,
|
| 83 |
-
"lr": 0.0003,
|
| 84 |
-
"max_steps": 200000,
|
| 85 |
-
"batch_size": 8
|
| 86 |
-
}
|
| 87 |
-
}
|
| 88 |
-
],
|
| 89 |
-
"summary": {
|
| 90 |
-
"total_runs": 1,
|
| 91 |
-
"run_names": ["model-name"]
|
| 92 |
-
}
|
| 93 |
-
}
|
| 94 |
-
```
|
| 95 |
-
|
| 96 |
-
### When to use
|
| 97 |
-
|
| 98 |
-
- **After training**: Generate updated dashboard data
|
| 99 |
-
- **Adding new runs**: Include new training sessions in the dashboard
|
| 100 |
-
- **Debugging**: Verify log parsing is working correctly
|
| 101 |
-
- **Dashboard setup**: Initial setup of the training metrics dashboard
|
| 102 |
-
|
| 103 |
-
### Troubleshooting
|
| 104 |
-
|
| 105 |
-
If the script doesn't find any data:
|
| 106 |
-
1. Check that log files exist in `runs/*/logs/`
|
| 107 |
-
2. Verify log format matches the expected pattern
|
| 108 |
-
3. Ensure log files contain training metrics entries
|
| 109 |
-
4. Check file permissions and encoding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/generate_data.py
DELETED
|
@@ -1,198 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Script to generate data.json from training log files.
|
| 4 |
-
|
| 5 |
-
This script parses log files from the runs directory and extracts:
|
| 6 |
-
- Training metrics (loss, learning rate, inf/nan count)
|
| 7 |
-
- Evaluation results (paloma metrics)
|
| 8 |
-
- Model configuration parameters
|
| 9 |
-
|
| 10 |
-
The output is saved to plots/data.json for the dashboard.
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
import json
|
| 14 |
-
import re
|
| 15 |
-
from pathlib import Path
|
| 16 |
-
from typing import Any, Dict, List, Optional
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def parse_training_metrics(log_content: str) -> List[Dict[str, Any]]:
    """Extract per-step training metrics from raw log text.

    Matches four consecutive log lines per step (step header, loss,
    learning rate, inf/NaN count) and returns one dict per step,
    sorted by ascending step number.
    """
    # NOTE(review): the "π"/"βββ" markers mirror the exact characters the
    # trainer writes to its logs; the pattern must match them verbatim.
    ts = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
    pattern = (
        ts + r"Step (\d+) -- π Training Metrics\n"
        + ts + r"βββ Loss: ([\d.]+)\n"
        + ts + r"βββ Learning Rate: ([\d.e+-]+)\n"
        + ts + r"βββ Inf/NaN count: (\d+)"
    )

    parsed = [
        {
            "step": int(step),
            "loss": float(loss),
            "learning_rate": float(lr),
            "inf_nan_count": int(bad_count),
        }
        for step, loss, lr, bad_count in re.findall(pattern, log_content)
    ]

    return sorted(parsed, key=lambda row: row["step"])
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
def parse_evaluation_results(log_content: str) -> List[Dict[str, Any]]:
    """Extract per-step paloma evaluation results from raw log text.

    Entries whose paloma value does not parse as a float (e.g. "inf")
    are silently skipped. Results are sorted by ascending step number.
    """
    # NOTE(review): markers must match the trainer's log output verbatim.
    ts = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
    pattern = (
        ts + r"Step (\d+) -- π Evaluation Results\n"
        + ts + r"βββ paloma: ([\d.e+-]+)"
    )

    parsed = []
    for step, raw_value in re.findall(pattern, log_content):
        try:
            value = float(raw_value)
        except ValueError:
            # Non-numeric paloma value — drop this entry.
            continue
        parsed.append({"step": int(step), "paloma": value})

    return sorted(parsed, key=lambda row: row["step"])
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
def extract_config_from_log(log_content: str) -> Dict[str, Any]:
    """Pull model/optimizer hyperparameters out of raw log text.

    Returns a dict containing only the keys that were both found in the
    log and parsed cleanly; integer-valued fields are cast to int, `lr`
    to float.
    """
    # Fields whose captured value is an integer; everything else is a float.
    int_fields = (
        "d_model",
        "n_layers",
        "max_seq_len",
        "vocab_size",
        "max_steps",
        "batch_size",
    )
    field_patterns = {
        "d_model": r"d_model: (\d+)",
        "n_layers": r"n_layers: (\d+)",
        "max_seq_len": r"max_seq_len: (\d+)",
        "vocab_size": r"vocab_size: (\d+)",
        "lr": r"lr: ([\d.e+-]+)",
        "max_steps": r"max_steps: (\d+)",
        "batch_size": r"batch_size: (\d+)",
    }

    config: Dict[str, Any] = {}
    for name, pat in field_patterns.items():
        hit = re.search(pat, log_content)
        if hit is None:
            continue
        caster = int if name in int_fields else float
        try:
            config[name] = caster(hit.group(1))
        except ValueError:
            # Captured text did not parse as the expected numeric type.
            continue

    return config
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
def process_run_directory(run_path: Path) -> Optional[Dict[str, Any]]:
    """Collect metrics, evaluation results, and config for one run directory.

    Reads the most recently modified ``*.log`` file under
    ``<run_path>/logs/``. Returns None when the logs directory is missing,
    contains no log files, or yields no parsable training metrics.
    """
    logs_dir = run_path / "logs"
    if not logs_dir.exists():
        return None

    candidates = list(logs_dir.glob("*.log"))
    if not candidates:
        return None

    # Newest log wins: it reflects the latest (re)start of this run.
    newest = max(candidates, key=lambda p: p.stat().st_mtime)
    content = newest.read_text(encoding="utf-8")

    training_metrics = parse_training_metrics(content)
    if not training_metrics:
        # A run with no parsed training steps is useless for the dashboard.
        return None

    return {
        "run_name": run_path.name,
        "log_file": newest.name,
        "training_metrics": training_metrics,
        "evaluation_results": parse_evaluation_results(content),
        "config": extract_config_from_log(content),
    }
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
def generate_data_json(runs_dir: str = "runs", output_file: str = "plots/data.json"):
    """Aggregate every run under *runs_dir* into a dashboard JSON file.

    Walks the immediate subdirectories of *runs_dir*, parses each run's
    logs via ``process_run_directory``, and writes the combined structure
    (``runs`` list plus a ``summary``) to *output_file*. Prints progress
    to stdout; returns early (writing nothing) if the runs directory is
    missing or no run produced usable data.
    """
    runs_path = Path(runs_dir)
    if not runs_path.exists():
        print(f"Runs directory {runs_dir} not found!")
        return

    collected = []
    for run_dir in runs_path.iterdir():
        if not run_dir.is_dir():
            continue
        print(f"Processing run: {run_dir.name}")
        parsed = process_run_directory(run_dir)
        if parsed is None:
            print("  β No valid data found")
            continue
        collected.append(parsed)
        print(f"  β Found {len(parsed['training_metrics'])} training metrics")
        print(
            f"  β Found {len(parsed['evaluation_results'])} evaluation results"
        )

    if not collected:
        print("No valid runs found!")
        return

    payload = {
        "runs": collected,
        "summary": {
            "total_runs": len(collected),
            "run_names": [run["run_name"] for run in collected],
        },
    }

    # Create the parent directory if needed, then write pretty-printed JSON.
    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)

    print(f"\nβ Generated {output_file} with {len(collected)} runs")
    print(
        f"β Total training metrics: {sum(len(run['training_metrics']) for run in collected)}"
    )
    print(
        f"β Total evaluation results: {sum(len(run['evaluation_results']) for run in collected)}"
    )
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
if __name__ == "__main__":
    # CLI entry point: regenerate the dashboard data file from training logs.
    import argparse

    cli = argparse.ArgumentParser(
        description="Generate data.json from training logs"
    )
    cli.add_argument("--runs-dir", default="runs", help="Path to runs directory")
    cli.add_argument("--output", default="plots/data.json", help="Output file path")
    parsed_args = cli.parse_args()

    generate_data_json(parsed_args.runs_dir, parsed_args.output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/train.py
DELETED
|
@@ -1,30 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
"""
Minimal CLI entry point for training the Pico language model.

Prefer `poetry run train`, which invokes this script; the actual training
logic lives in `src/training/trainer.py`.
"""

from pathlib import Path

import click

from src.training.trainer import Trainer


@click.command()
@click.option(
    "--config_path",
    "config_path",
    type=click.Path(exists=True, path_type=Path),
    help="Path to the training configuration file",
)
def main(config_path: Path) -> None:
    """Train the Pico language model using the specified configuration."""
    # Trainer takes a string path; click hands us a Path via path_type.
    Trainer(config_path=str(config_path)).train()


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|