ThomasTheMaker commited on
Commit
84cb578
·
verified ·
1 Parent(s): e2c9f4d

Delete scripts

Browse files
Files changed (3) hide show
  1. scripts/README.md +0 -109
  2. scripts/generate_data.py +0 -198
  3. scripts/train.py +0 -30
scripts/README.md DELETED
@@ -1,109 +0,0 @@
1
- # Scripts Directory
2
-
3
- This directory contains utility scripts for the Pico training framework.
4
-
5
- ## generate_data.py
6
-
7
- A script to automatically generate `data.json` from training log files for the dashboard.
8
-
9
- ### What it does
10
-
11
- This script parses log files from the `runs/` directory and extracts:
12
- - **Training metrics**: Loss, learning rate, and inf/NaN counts at each step
13
- - **Evaluation results**: Paloma evaluation metrics
14
- - **Model configuration**: Architecture parameters (d_model, n_layers, etc.)
15
-
16
- ### Usage
17
-
18
- ```bash
19
- # Generate data.json from the default runs directory
20
- python scripts/generate_data.py
21
-
22
- # Specify custom runs directory
23
- python scripts/generate_data.py --runs-dir /path/to/runs
24
-
25
- # Specify custom output file
26
- python scripts/generate_data.py --output /path/to/output.json
27
- ```
28
-
29
- ### How it works
30
-
31
- 1. **Scans runs directory**: Looks for subdirectories containing training runs
32
- 2. **Finds log files**: Locates `.log` files in each run's `logs/` subdirectory
33
- 3. **Parses log content**: Uses regex patterns to extract structured data
34
- 4. **Generates JSON**: Creates a structured JSON file for the dashboard
35
-
36
- ### Log Format Requirements
37
-
38
- The script expects log files with the following format:
39
-
40
- ```
41
- 2025-08-29 02:09:12 - pico-train - INFO - Step 500 -- 🔄 Training Metrics
42
- 2025-08-29 02:09:12 - pico-train - INFO - ├── Loss: 10.8854
43
- 2025-08-29 02:09:12 - pico-train - INFO - ├── Learning Rate: 3.13e-06
44
- 2025-08-29 02:09:12 - pico-train - INFO - └── Inf/NaN count: 0
45
- ```
46
-
47
- And evaluation results:
48
-
49
- ```
50
- 2025-08-29 02:15:26 - pico-train - INFO - Step 1000 -- 📊 Evaluation Results
51
- 2025-08-29 02:15:26 - pico-train - INFO - └── paloma: 7.125172406420199e+27
52
- ```
53
-
54
- ### Output Format
55
-
56
- The generated `data.json` has this structure:
57
-
58
- ```json
59
- {
60
- "runs": [
61
- {
62
- "run_name": "model-name",
63
- "log_file": "log_filename.log",
64
- "training_metrics": [
65
- {
66
- "step": 0,
67
- "loss": 10.9914,
68
- "learning_rate": 0.0,
69
- "inf_nan_count": 0
70
- }
71
- ],
72
- "evaluation_results": [
73
- {
74
- "step": 1000,
75
- "paloma": 59434.76600609756
76
- }
77
- ],
78
- "config": {
79
- "d_model": 96,
80
- "n_layers": 12,
81
- "max_seq_len": 2048,
82
- "vocab_size": 50304,
83
- "lr": 0.0003,
84
- "max_steps": 200000,
85
- "batch_size": 8
86
- }
87
- }
88
- ],
89
- "summary": {
90
- "total_runs": 1,
91
- "run_names": ["model-name"]
92
- }
93
- }
94
- ```
95
-
96
- ### When to use
97
-
98
- - **After training**: Generate updated dashboard data
99
- - **Adding new runs**: Include new training sessions in the dashboard
100
- - **Debugging**: Verify log parsing is working correctly
101
- - **Dashboard setup**: Initial setup of the training metrics dashboard
102
-
103
- ### Troubleshooting
104
-
105
- If the script doesn't find any data:
106
- 1. Check that log files exist in `runs/*/logs/`
107
- 2. Verify log format matches the expected pattern
108
- 3. Ensure log files contain training metrics entries
109
- 4. Check file permissions and encoding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/generate_data.py DELETED
@@ -1,198 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Script to generate data.json from training log files.
4
-
5
- This script parses log files from the runs directory and extracts:
6
- - Training metrics (loss, learning rate, inf/nan count)
7
- - Evaluation results (paloma metrics)
8
- - Model configuration parameters
9
-
10
- The output is saved to plots/data.json for the dashboard.
11
- """
12
-
13
- import json
14
- import re
15
- from pathlib import Path
16
- from typing import Any, Dict, List, Optional
17
-
18
-
19
def parse_training_metrics(log_content: str) -> List[Dict[str, Any]]:
    """Parse training-metric reports from raw log text.

    Each report is four consecutive log lines (step header, loss, learning
    rate, inf/NaN count), every one prefixed with the same timestamped
    "pico-train - INFO" header.

    Args:
        log_content: Full text of a training log file.

    Returns:
        A list of ``{"step", "loss", "learning_rate", "inf_nan_count"}``
        dicts sorted by step. Reports whose numbers fail to parse are
        skipped rather than raising.
    """
    metrics: List[Dict[str, Any]] = []

    # The timestamp/level prefix repeats on every line of the four-line
    # report, so it is factored out instead of being spelled four times.
    prefix = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
    pattern = (
        prefix + r"Step (\d+) -- 🔄 Training Metrics\n"
        + prefix + r"├── Loss: ([\d.]+)\n"
        + prefix + r"├── Learning Rate: ([\d.e+-]+)\n"
        + prefix + r"└── Inf/NaN count: (\d+)"
    )

    for step, loss, lr, inf_nan in re.findall(pattern, log_content):
        try:
            metrics.append(
                {
                    "step": int(step),
                    "loss": float(loss),
                    "learning_rate": float(lr),
                    "inf_nan_count": int(inf_nan),
                }
            )
        except ValueError:
            # "[\d.e+-]+" can match non-numeric runs (e.g. "e+-"); skip the
            # entry instead of aborting the whole parse, mirroring the
            # handling in parse_evaluation_results.
            continue

    return sorted(metrics, key=lambda x: x["step"])
39
-
40
-
41
def parse_evaluation_results(log_content: str) -> List[Dict[str, Any]]:
    """Parse evaluation-result reports (the paloma metric) from raw log text.

    Each report is two consecutive log lines: a step header followed by the
    paloma value, both prefixed with the timestamped "pico-train - INFO"
    header.

    Args:
        log_content: Full text of a training log file.

    Returns:
        A list of ``{"step", "paloma"}`` dicts sorted by step. Entries whose
        paloma value does not parse as a float are skipped.
    """
    results: List[Dict[str, Any]] = []

    # Same repeated timestamp/level prefix as in parse_training_metrics.
    prefix = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} - pico-train - INFO - "
    pattern = (
        prefix + r"Step (\d+) -- 📊 Evaluation Results\n"
        + prefix + r"└── paloma: ([\d.e+-]+)"
    )

    for step, paloma in re.findall(pattern, log_content):
        try:
            results.append({"step": int(step), "paloma": float(paloma)})
        except ValueError:
            # "[\d.e+-]+" can match non-numeric runs (e.g. "e+-"); skip them.
            continue

    return sorted(results, key=lambda x: x["step"])
59
-
60
-
61
def extract_config_from_log(log_content: str) -> Dict[str, Any]:
    """Extract model configuration values ("key: value" lines) from log text.

    Only a fixed set of keys is looked for; the first occurrence of each
    wins. Keys that are absent or whose value fails to parse are omitted.
    """
    # Keys cast to int; everything else (currently only "lr") casts to float.
    int_keys = {
        "d_model",
        "n_layers",
        "max_seq_len",
        "vocab_size",
        "max_steps",
        "batch_size",
    }
    patterns = {
        "d_model": r"d_model: (\d+)",
        "n_layers": r"n_layers: (\d+)",
        "max_seq_len": r"max_seq_len: (\d+)",
        "vocab_size": r"vocab_size: (\d+)",
        "lr": r"lr: ([\d.e+-]+)",
        "max_steps": r"max_steps: (\d+)",
        "batch_size": r"batch_size: (\d+)",
    }

    config: Dict[str, Any] = {}
    for key, pattern in patterns.items():
        found = re.search(pattern, log_content)
        if found is None:
            continue
        caster = int if key in int_keys else float
        try:
            config[key] = caster(found.group(1))
        except ValueError:
            # A malformed value (e.g. "e+-" matched by the lr pattern) is
            # silently skipped, matching the established parser behavior.
            continue
    return config
95
-
96
-
97
def process_run_directory(run_path: Path) -> Optional[Dict[str, Any]]:
    """Collect metrics, evaluation results, and config for one run directory.

    Returns None when the run has no ``logs/`` directory, no ``*.log`` files,
    or no parseable training metrics.
    """
    logs_dir = run_path / "logs"
    if not logs_dir.exists():
        return None

    log_files = list(logs_dir.glob("*.log"))
    if not log_files:
        return None

    # Only the most recently modified log file is parsed; older logs for the
    # same run are ignored.
    latest_log = max(log_files, key=lambda p: p.stat().st_mtime)
    content = latest_log.read_text(encoding="utf-8")

    training_metrics = parse_training_metrics(content)
    if not training_metrics:
        # A run without any training metrics is not worth reporting.
        return None

    return {
        "run_name": run_path.name,
        "log_file": latest_log.name,
        "training_metrics": training_metrics,
        "evaluation_results": parse_evaluation_results(content),
        "config": extract_config_from_log(content),
    }
132
-
133
-
134
def generate_data_json(runs_dir: str = "runs", output_file: str = "plots/data.json"):
    """Build the dashboard data file from every run under *runs_dir*.

    Scans each subdirectory of *runs_dir*, extracts training metrics,
    evaluation results, and configuration via process_run_directory, and
    writes the aggregate JSON to *output_file* (parent directories are
    created as needed). Prints a short progress report; returns early if the
    runs directory is missing or no run yields usable data.

    Args:
        runs_dir: Directory containing one subdirectory per training run.
        output_file: Path of the JSON file to write.
    """
    runs_path = Path(runs_dir)
    if not runs_path.exists():
        print(f"Runs directory {runs_dir} not found!")
        return

    runs_data = []

    # Sort directories by name so data.json is deterministic across
    # filesystems (Path.iterdir order is platform-dependent).
    for run_dir in sorted(runs_path.iterdir()):
        if not run_dir.is_dir():
            continue
        print(f"Processing run: {run_dir.name}")
        run_data = process_run_directory(run_dir)
        if run_data:
            runs_data.append(run_data)
            print(f" ✓ Found {len(run_data['training_metrics'])} training metrics")
            print(
                f" ✓ Found {len(run_data['evaluation_results'])} evaluation results"
            )
        else:
            print(" ✗ No valid data found")

    if not runs_data:
        print("No valid runs found!")
        return

    output_data = {
        "runs": runs_data,
        "summary": {
            "total_runs": len(runs_data),
            "run_names": [run["run_name"] for run in runs_data],
        },
    }

    # Ensure the output directory exists before writing.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n✓ Generated {output_file} with {len(runs_data)} runs")
    print(
        f"✓ Total training metrics: {sum(len(run['training_metrics']) for run in runs_data)}"
    )
    print(
        f"✓ Total evaluation results: {sum(len(run['evaluation_results']) for run in runs_data)}"
    )
186
-
187
if __name__ == "__main__":
    # argparse is only needed when the module is run as a script, so the
    # import is kept local to the entry-point guard.
    import argparse

    cli = argparse.ArgumentParser(description="Generate data.json from training logs")
    cli.add_argument("--runs-dir", default="runs", help="Path to runs directory")
    cli.add_argument("--output", default="plots/data.json", help="Output file path")
    namespace = cli.parse_args()

    generate_data_json(namespace.runs_dir, namespace.output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/train.py DELETED
@@ -1,30 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- A minimal script to train the Pico language model. In practice, you should just use the
4
- `poetry run train` command to run the training pipeline. Doing so will invoke this script.
5
- Training logic is located in `src/training/trainer.py`.
6
- """
7
-
8
- from pathlib import Path
9
-
10
- import click
11
-
12
- from src.training.trainer import Trainer
13
-
14
-
15
@click.command()
@click.option(
    "--config_path",
    "config_path",
    type=click.Path(exists=True, path_type=Path),
    help="Path to the training configuration file",
)
def main(config_path: Path) -> None:
    """Train the Pico language model using the specified configuration."""
    # All training logic lives in src/training/trainer.py; this entry point
    # only forwards the config path (as a string) and starts the run.
    Trainer(config_path=str(config_path)).train()


if __name__ == "__main__":
    main()