ThomasTheMaker commited on
Commit
84cb578
·
verified ·
1 Parent(s): e2c9f4d

Delete scripts

Browse files
Files changed (3) hide show
  1. scripts/README.md +0 -109
  2. scripts/generate_data.py +0 -198
  3. scripts/train.py +0 -30
scripts/README.md DELETED
@@ -1,109 +0,0 @@
1
- # Scripts Directory
2
-
3
- This directory contains utility scripts for the Pico training framework.
4
-
5
- ## generate_data.py
6
-
7
- A script to automatically generate `data.json` from training log files for the dashboard.
8
-
9
- ### What it does
10
-
11
- This script parses log files from the `runs/` directory and extracts:
12
- - **Training metrics**: Loss, learning rate, and inf/NaN counts at each step
13
- - **Evaluation results**: Paloma evaluation metrics
14
- - **Model configuration**: Architecture parameters (d_model, n_layers, etc.)
15
-
16
- ### Usage
17
-
18
- ```bash
19
- # Generate data.json from the default runs directory
20
- python scripts/generate_data.py
21
-
22
- # Specify custom runs directory
23
- python scripts/generate_data.py --runs-dir /path/to/runs
24
-
25
- # Specify custom output file
26
- python scripts/generate_data.py --output /path/to/output.json
27
- ```
28
-
29
- ### How it works
30
-
31
- 1. **Scans runs directory**: Looks for subdirectories containing training runs
32
- 2. **Finds log files**: Locates `.log` files in each run's `logs/` subdirectory
33
- 3. **Parses log content**: Uses regex patterns to extract structured data
34
- 4. **Generates JSON**: Creates a structured JSON file for the dashboard
35
-
36
- ### Log Format Requirements
37
-
38
- The script expects log files with the following format:
39
-
40
- ```
41
- 2025-08-29 02:09:12 - pico-train - INFO - Step 500 -- 🔄 Training Metrics
42
- 2025-08-29 02:09:12 - pico-train - INFO - ├── Loss: 10.8854
43
- 2025-08-29 02:09:12 - pico-train - INFO - ├── Learning Rate: 3.13e-06
44
- 2025-08-29 02:09:12 - pico-train - INFO - └── Inf/NaN count: 0
45
- ```
46
-
47
- And evaluation results:
48
-
49
- ```
50
- 2025-08-29 02:15:26 - pico-train - INFO - Step 1000 -- 📊 Evaluation Results
51
- 2025-08-29 02:15:26 - pico-train - INFO - └── paloma: 7.125172406420199e+27
52
- ```
53
-
54
- ### Output Format
55
-
56
- The generated `data.json` has this structure:
57
-
58
- ```json
59
- {
60
- "runs": [
61
- {
62
- "run_name": "model-name",
63
- "log_file": "log_filename.log",
64
- "training_metrics": [
65
- {
66
- "step": 0,
67
- "loss": 10.9914,
68
- "learning_rate": 0.0,
69
- "inf_nan_count": 0
70
- }
71
- ],
72
- "evaluation_results": [
73
- {
74
- "step": 1000,
75
- "paloma": 59434.76600609756
76
- }
77
- ],
78
- "config": {
79
- "d_model": 96,
80
- "n_layers": 12,
81
- "max_seq_len": 2048,
82
- "vocab_size": 50304,
83
- "lr": 0.0003,
84
- "max_steps": 200000,
85
- "batch_size": 8
86
- }
87
- }
88
- ],
89
- "summary": {
90
- "total_runs": 1,
91
- "run_names": ["model-name"]
92
- }
93
- }
94
- ```
95
-
96
- ### When to use
97
-
98
- - **After training**: Generate updated dashboard data
99
- - **Adding new runs**: Include new training sessions in the dashboard
100
- - **Debugging**: Verify log parsing is working correctly
101
- - **Dashboard setup**: Initial setup of the training metrics dashboard
102
-
103
- ### Troubleshooting
104
-
105
- If the script doesn't find any data:
106
- 1. Check that log files exist in `runs/*/logs/`
107
- 2. Verify log format matches the expected pattern
108
- 3. Ensure log files contain training metrics entries
109
- 4. Check file permissions and encoding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/generate_data.py DELETED
@@ -1,198 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Script to generate data.json from training log files.
4
-
5
- This script parses log files from the runs directory and extracts:
6
- - Training metrics (loss, learning rate, inf/nan count)
7
- - Evaluation results (paloma metrics)
8
- - Model configuration parameters
9
-
10
- The output is saved to plots/data.json for the dashboard.
11
- """
12
-
13
- import json
14
- import re
15
- from pathlib import Path
16
- from typing import Any, Dict, List, Optional
17
-
18
-
19
def parse_training_metrics(log_content: str) -> List[Dict[str, Any]]:
    """Parse training-metric reports from raw log text.

    Each report is four consecutive log lines (step header, loss, learning
    rate, inf/NaN count), every one prefixed with the same timestamped
    "pico-train - INFO" header.

    Args:
        log_content: Full text of a training log file.

    Returns:
        A list of ``{"step", "loss", "learning_rate", "inf_nan_count"}``
        dicts sorted by step. Reports whose numbers fail to parse are
        skipped rather than raising.
    """
    metrics: List[Dict[str, Any]] = []

    # The timestamp/level prefix repeats on every line of the four-line
    # report, so it is factored out instead of being spelled four times.
    prefix = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
    pattern = (
        prefix + r"Step (\d+) -- 🔄 Training Metrics\n"
        + prefix + r"├── Loss: ([\d.]+)\n"
        + prefix + r"├── Learning Rate: ([\d.e+-]+)\n"
        + prefix + r"└── Inf/NaN count: (\d+)"
    )

    for step, loss, lr, inf_nan in re.findall(pattern, log_content):
        try:
            metrics.append(
                {
                    "step": int(step),
                    "loss": float(loss),
                    "learning_rate": float(lr),
                    "inf_nan_count": int(inf_nan),
                }
            )
        except ValueError:
            # "[\d.e+-]+" can match non-numeric runs (e.g. "e+-"); skip the
            # entry instead of aborting the whole parse, mirroring the
            # handling in parse_evaluation_results.
            continue

    return sorted(metrics, key=lambda x: x["step"])
39
-
40
-
41
def parse_evaluation_results(log_content: str) -> List[Dict[str, Any]]:
    """Parse evaluation-result reports (the paloma metric) from raw log text.

    Each report is two consecutive log lines: a step header followed by the
    paloma value, both prefixed with the timestamped "pico-train - INFO"
    header.

    Args:
        log_content: Full text of a training log file.

    Returns:
        A list of ``{"step", "paloma"}`` dicts sorted by step. Entries whose
        paloma value does not parse as a float are skipped.
    """
    results: List[Dict[str, Any]] = []

    # Same repeated timestamp/level prefix as in parse_training_metrics.
    prefix = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
    pattern = (
        prefix + r"Step (\d+) -- 📊 Evaluation Results\n"
        + prefix + r"└── paloma: ([\d.e+-]+)"
    )

    for step, paloma in re.findall(pattern, log_content):
        try:
            results.append({"step": int(step), "paloma": float(paloma)})
        except ValueError:
            # "[\d.e+-]+" can match non-numeric runs (e.g. "e+-"); skip them.
            continue

    return sorted(results, key=lambda x: x["step"])
59
-
60
-
61
def extract_config_from_log(log_content: str) -> Dict[str, Any]:
    """Extract model configuration values ("key: value" lines) from log text.

    Only a fixed set of keys is looked for; the first occurrence of each
    wins. Keys that are absent or whose value fails to parse are omitted.
    """
    # Keys cast to int; everything else (currently only "lr") casts to float.
    int_keys = {
        "d_model",
        "n_layers",
        "max_seq_len",
        "vocab_size",
        "max_steps",
        "batch_size",
    }
    patterns = {
        "d_model": r"d_model: (\d+)",
        "n_layers": r"n_layers: (\d+)",
        "max_seq_len": r"max_seq_len: (\d+)",
        "vocab_size": r"vocab_size: (\d+)",
        "lr": r"lr: ([\d.e+-]+)",
        "max_steps": r"max_steps: (\d+)",
        "batch_size": r"batch_size: (\d+)",
    }

    config: Dict[str, Any] = {}
    for key, pattern in patterns.items():
        found = re.search(pattern, log_content)
        if found is None:
            continue
        caster = int if key in int_keys else float
        try:
            config[key] = caster(found.group(1))
        except ValueError:
            # A malformed value (e.g. "e+-" matched by the lr pattern) is
            # silently skipped, matching the established parser behavior.
            continue
    return config
95
-
96
-
97
def process_run_directory(run_path: Path) -> Optional[Dict[str, Any]]:
    """Collect metrics, evaluation results, and config for one run directory.

    Returns None when the run has no ``logs/`` directory, no ``*.log`` files,
    or no parseable training metrics.
    """
    logs_dir = run_path / "logs"
    if not logs_dir.exists():
        return None

    log_files = list(logs_dir.glob("*.log"))
    if not log_files:
        return None

    # Only the most recently modified log file is parsed; older logs for the
    # same run are ignored.
    latest_log = max(log_files, key=lambda p: p.stat().st_mtime)
    content = latest_log.read_text(encoding="utf-8")

    training_metrics = parse_training_metrics(content)
    if not training_metrics:
        # A run without any training metrics is not worth reporting.
        return None

    return {
        "run_name": run_path.name,
        "log_file": latest_log.name,
        "training_metrics": training_metrics,
        "evaluation_results": parse_evaluation_results(content),
        "config": extract_config_from_log(content),
    }
132
-
133
-
134
def generate_data_json(runs_dir: str = "runs", output_file: str = "plots/data.json"):
    """Build the dashboard data file from every run under *runs_dir*.

    Scans each subdirectory of *runs_dir*, extracts training metrics,
    evaluation results, and configuration via process_run_directory, and
    writes the aggregate JSON to *output_file* (parent directories are
    created as needed). Prints a short progress report; returns early if the
    runs directory is missing or no run yields usable data.

    Args:
        runs_dir: Directory containing one subdirectory per training run.
        output_file: Path of the JSON file to write.
    """
    runs_path = Path(runs_dir)
    if not runs_path.exists():
        print(f"Runs directory {runs_dir} not found!")
        return

    runs_data = []

    # Sort directories by name so data.json is deterministic across
    # filesystems (Path.iterdir order is platform-dependent).
    for run_dir in sorted(runs_path.iterdir()):
        if not run_dir.is_dir():
            continue
        print(f"Processing run: {run_dir.name}")
        run_data = process_run_directory(run_dir)
        if run_data:
            runs_data.append(run_data)
            print(f" ✓ Found {len(run_data['training_metrics'])} training metrics")
            print(
                f" ✓ Found {len(run_data['evaluation_results'])} evaluation results"
            )
        else:
            print(" ✗ No valid data found")

    if not runs_data:
        print("No valid runs found!")
        return

    output_data = {
        "runs": runs_data,
        "summary": {
            "total_runs": len(runs_data),
            "run_names": [run["run_name"] for run in runs_data],
        },
    }

    # Ensure the output directory exists before writing.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n✓ Generated {output_file} with {len(runs_data)} runs")
    print(
        f"✓ Total training metrics: {sum(len(run['training_metrics']) for run in runs_data)}"
    )
    print(
        f"✓ Total evaluation results: {sum(len(run['evaluation_results']) for run in runs_data)}"
    )
186
-
187
if __name__ == "__main__":
    # argparse is only needed when the module is run as a script, so the
    # import is kept local to the entry-point guard.
    import argparse

    cli = argparse.ArgumentParser(description="Generate data.json from training logs")
    cli.add_argument("--runs-dir", default="runs", help="Path to runs directory")
    cli.add_argument("--output", default="plots/data.json", help="Output file path")
    namespace = cli.parse_args()

    generate_data_json(namespace.runs_dir, namespace.output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/train.py DELETED
@@ -1,30 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- A minimal script to train the Pico language model. In practice, you should just use the
4
- `poetry run train` command to run the training pipeline. Doing so will invoke this script.
5
- Training logic is located in `src/training/trainer.py`.
6
- """
7
-
8
- from pathlib import Path
9
-
10
- import click
11
-
12
- from src.training.trainer import Trainer
13
-
14
-
15
@click.command()
@click.option(
    "--config_path",
    "config_path",
    type=click.Path(exists=True, path_type=Path),
    help="Path to the training configuration file",
)
def main(config_path: Path) -> None:
    """Train the Pico language model using the specified configuration."""
    # All training logic lives in src/training/trainer.py; this entry point
    # only forwards the config path (as a string) and starts the run.
    Trainer(config_path=str(config_path)).train()


if __name__ == "__main__":
    main()