|
|
""" |
|
|
Combine Training Data and Retrain v7. |
|
|
|
|
|
Adds ICICI samples to existing training data and retrains the model. |
|
|
|
|
|
Author: Ranjit Behera |
|
|
""" |
|
|
|
|
|
import json |
|
|
import subprocess |
|
|
import random |
|
|
from pathlib import Path |
|
|
|
|
|
def _load_jsonl(path, keys=None):
    """Read a JSON Lines file into a list of dicts.

    Blank lines are skipped so a trailing newline or an accidental empty
    line does not abort the whole run with a JSON decode error.

    Args:
        path: File to read (decoded as UTF-8).
        keys: Optional iterable of keys; when given, each record is reduced
            to exactly these keys (a missing key raises ``KeyError``).

    Returns:
        The parsed records, in file order.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            record = json.loads(line)
            if keys is not None:
                record = {key: record[key] for key in keys}
            records.append(record)
    return records


def _write_jsonl(path, records):
    """Write *records* to *path*, one JSON document per line (UTF-8)."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)


def _backup_file(path):
    """Move *path* to ``<name>.jsonl.bak`` if it exists; otherwise do nothing.

    ``Path.replace`` is used instead of ``Path.rename`` so that a backup left
    over from an earlier run is overwritten on every platform.
    """
    if path.exists():
        path.replace(path.with_suffix('.jsonl.bak'))


def _mentions_icici(sample):
    """Return True if any text-bearing field of *sample* mentions ICICI.

    Existing samples may carry a ``text`` field while the ICICI samples added
    by this script only have ``prompt``/``completion``, so all three are
    inspected.
    """
    return any(
        "ICICI" in str(sample.get(field, "")).upper()
        for field in ("text", "prompt", "completion")
    )


def combine_and_retrain():
    """Merge the ICICI samples into the training data and launch v7 training.

    Steps:
      1. Load ``data/training/train.jsonl``.
      2. Load ``data/synthetic/icici_samples.jsonl``, keeping only the
         ``prompt`` and ``completion`` keys of each record.
      3. Concatenate, shuffle with a fixed seed (42) and split 90/10 into new
         training and validation sets.
      4. Back up the current train/valid files to ``*.jsonl.bak`` and write
         the new ones in their place.
      5. Run ``mlx_lm.lora`` on ``data/training``.

    Notes:
      * The existing ``valid.jsonl`` is backed up but NOT merged into the
        combined data; the new validation set is carved out of the combined
        training + ICICI samples.
      * Paths are relative to the current working directory.
      * A training failure (non-zero exit or the executable not being
        runnable) is reported on stdout; the function still returns normally.

    Raises:
        FileNotFoundError: If the training or ICICI input file is missing.
        KeyError: If an ICICI record lacks ``prompt`` or ``completion``.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    print("=" * 60)
    print("🔄 COMBINING DATA AND RETRAINING v7")
    print("=" * 60)

    train_file = Path("data/training/train.jsonl")
    valid_file = Path("data/training/valid.jsonl")
    icici_file = Path("data/synthetic/icici_samples.jsonl")

    print("\n1. Loading existing training data...")
    existing_train = _load_jsonl(train_file)
    print(f" Loaded {len(existing_train)} existing samples")

    print("\n2. Loading ICICI samples...")
    icici_samples = _load_jsonl(icici_file, keys=('prompt', 'completion'))
    print(f" Loaded {len(icici_samples)} ICICI samples")

    print("\n3. Combining datasets...")
    combined = existing_train + icici_samples
    # Fixed seed keeps the shuffle (and therefore the split) reproducible.
    random.seed(42)
    random.shuffle(combined)

    split_idx = int(len(combined) * 0.9)
    new_train = combined[:split_idx]
    new_valid = combined[split_idx:]

    print(f" New training set: {len(new_train)}")
    print(f" New validation set: {len(new_valid)}")

    print("\n4. Saving combined data...")

    # Everything has been loaded into memory by now, so moving the originals
    # aside is safe. The validation file is never read above and may not
    # exist, hence the existence check inside _backup_file.
    _backup_file(train_file)
    _backup_file(valid_file)

    _write_jsonl(train_file, new_train)
    _write_jsonl(valid_file, new_valid)

    print(f" Saved to {train_file}")
    print(f" Saved to {valid_file}")

    icici_count = sum(1 for s in new_train if _mentions_icici(s))
    print(f"\n ICICI samples in training: ~{icici_count}")

    print("\n✅ Data combined successfully!")
    print("\n5. Starting v7 training...")

    cmd = [
        "mlx_lm.lora",
        "--model", "models/base/phi3-finance-base",
        "--data", "data/training",
        "--train",
        "--iters", "500",
        "--batch-size", "1",
        "--num-layers", "16",
        "--learning-rate", "1e-5",
        "--adapter-path", "models/adapters/finance-lora-v7",
        "--max-seq-length", "1024",
    ]

    print(f"Command: {' '.join(cmd)}")

    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the executable being missing or not runnable, which
        # is reported the same way as a non-zero exit status.
        print(f"\n❌ Training failed: {e}")
    else:
        print("\n✅ v7 training complete!")
|
|
|
|
|
if __name__ == "__main__": |
|
|
combine_and_retrain() |
|
|
|