Spaces:
Sleeping
Sleeping
Update train.py
Browse files
train.py
CHANGED
|
@@ -10,13 +10,21 @@
|
|
| 10 |
# python train.py --mode validate → validate ADI weights against dataset
|
| 11 |
# python train.py --mode finetune → finetune SmolLM2 on collected data (future)
|
| 12 |
# =============================================================================
|
| 13 |
-
|
| 14 |
import argparse
|
| 15 |
import json
|
| 16 |
import logging
|
| 17 |
from datetime import datetime
|
| 18 |
from pathlib import Path
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
import model as model_module
|
| 21 |
from adi import DumpindexAnalyzer
|
| 22 |
|
|
@@ -28,7 +36,8 @@ logger = logging.getLogger("train")
|
|
| 28 |
# Mode 1 — Export dataset to training format
|
| 29 |
# =============================================================================
|
| 30 |
|
| 31 |
-
def export_dataset(output_path: str =
|
|
|
|
| 32 |
"""
|
| 33 |
Export HF dataset logs to JSONL format for training.
|
| 34 |
Filters: only HIGH_PRIORITY and MEDIUM_PRIORITY entries with actual responses.
|
|
@@ -95,7 +104,7 @@ def validate_adi():
|
|
| 95 |
"samples": len(labeled),
|
| 96 |
"weights": analyzer.weights,
|
| 97 |
}
|
| 98 |
-
|
| 99 |
logger.info("Results saved β validation_results.json")
|
| 100 |
|
| 101 |
|
|
@@ -104,16 +113,10 @@ def validate_adi():
|
|
| 104 |
# =============================================================================
|
| 105 |
|
| 106 |
def finetune():
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
Placeholder — requires export first + enough data (>500 samples recommended).
|
| 110 |
-
"""
|
| 111 |
-
train_file = Path("train_data.jsonl")
|
| 112 |
-
if not train_file.exists():
|
| 113 |
-
logger.error("train_data.jsonl not found β run: python train.py --mode export first")
|
| 114 |
return
|
| 115 |
-
|
| 116 |
-
lines = train_file.read_text().strip().splitlines()
|
| 117 |
logger.info(f"Training samples available: {len(lines)}")
|
| 118 |
|
| 119 |
if len(lines) < 100:
|
|
|
|
| 10 |
# python train.py --mode validate → validate ADI weights against dataset
|
| 11 |
# python train.py --mode finetune → finetune SmolLM2 on collected data (future)
|
| 12 |
# =============================================================================
|
| 13 |
+
import os
|
| 14 |
import argparse
|
| 15 |
import json
|
| 16 |
import logging
|
| 17 |
from datetime import datetime
|
| 18 |
from pathlib import Path
|
| 19 |
|
| 20 |
+
# ── Path Resolution ───────────────────────────────────────────────────────────
|
| 21 |
+
# HF Spaces: /tmp/ (used because the rest of the filesystem is read-only)
|
| 22 |
+
# Local dev: current directory
|
| 23 |
+
_TMP = Path("/tmp") if os.getenv("SPACE_ID") else Path(".")
|
| 24 |
+
|
| 25 |
+
TRAIN_DATA = _TMP / "train_data.jsonl"
|
| 26 |
+
VALID_RESULT = _TMP / "validation_results.json"
|
| 27 |
+
|
| 28 |
import model as model_module
|
| 29 |
from adi import DumpindexAnalyzer
|
| 30 |
|
|
|
|
| 36 |
# Mode 1 — Export dataset to training format
|
| 37 |
# =============================================================================
|
| 38 |
|
| 39 |
+
def export_dataset(output_path: str = None):
|
| 40 |
+
output = Path(output_path) if output_path else TRAIN_DATA
|
| 41 |
"""
|
| 42 |
Export HF dataset logs to JSONL format for training.
|
| 43 |
Filters: only HIGH_PRIORITY and MEDIUM_PRIORITY entries with actual responses.
|
|
|
|
| 104 |
"samples": len(labeled),
|
| 105 |
"weights": analyzer.weights,
|
| 106 |
}
|
| 107 |
+
VALID_RESULT.write_text(json.dumps(result, indent=2))
|
| 108 |
logger.info("Results saved β validation_results.json")
|
| 109 |
|
| 110 |
|
|
|
|
| 113 |
# =============================================================================
|
| 114 |
|
| 115 |
def finetune():
|
| 116 |
+
if not TRAIN_DATA.exists():
|
| 117 |
+
logger.error(f"train_data.jsonl not found at {TRAIN_DATA}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
return
|
| 119 |
+
lines = TRAIN_DATA.read_text().strip().splitlines()
|
|
|
|
| 120 |
logger.info(f"Training samples available: {len(lines)}")
|
| 121 |
|
| 122 |
if len(lines) < 100:
|