#!/usr/bin/env python3
"""
Prepare dataset for LoRA training - Stack 2.9 Local Version
"""
import json
import argparse
from pathlib import Path

from datasets import Dataset
from transformers import AutoTokenizer

def prepare_dataset(input_path, output_dir, max_length=2048, test_split=0.1):
    """Load a JSONL file, tokenize it, and save train/eval splits to disk."""
    
    print(f"Loading data from: {input_path}")
    
    # Load JSONL, skipping blank lines so stray empty lines don't crash json.loads
    with open(input_path, 'r') as f:
        data = [json.loads(line) for line in f if line.strip()]
    
    print(f"Loaded {len(data)} examples")
    
    # Format each record as a single 'text' field for causal LM training.
    # Records matching none of the known schemas are skipped; the counts
    # printed below make any drops visible.
    formatted_data = []
    for item in data:
        if 'prompt' in item and 'completion' in item:
            text = item['prompt'] + item['completion']
        elif 'input' in item and 'output' in item:
            text = item['input'] + item['output']
        elif 'instruction' in item and 'output' in item:
            text = item['instruction'] + ' ' + item['output']
        else:
            continue
        formatted_data.append({'text': text})
    
    print(f"Formatted {len(formatted_data)} examples "
          f"({len(data) - len(formatted_data)} skipped)")
    
    # Create HuggingFace dataset
    dataset = Dataset.from_list(formatted_data)
    
    # Load the tokenizer for the base model; it must match the model the
    # LoRA adapters will be trained on (Qwen2.5-Coder-7B is hardcoded here)
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B", trust_remote_code=True)
    if tokenizer.pad_token is None:
        # some tokenizers define no pad token; fall back to EOS for padding
        tokenizer.pad_token = tokenizer.eos_token
    
    # Tokenize every example to a fixed length. padding='max_length' pads all
    # sequences up to max_length up front, which is simple but memory-heavy;
    # dynamic padding via a data collator at train time is a leaner alternative.
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors=None  # keep plain lists so datasets can serialize them
        )
    
    dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=['text']
    )
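    # Note: only input_ids/attention_mask are stored here. For causal-LM training,
    # labels are typically created at train time, e.g. with transformers'
    # DataCollatorForLanguageModeling(tokenizer, mlm=False), which copies
    # input_ids into labels; this script does not add labels itself.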
    
    # Split train/eval (fixed seed for a reproducible split)
    split = dataset.train_test_split(test_size=test_split, seed=42)
    train_data = split['train']
    eval_data = split['test']
    
    # Save
    output_path = Path(output_dir)
    train_path = output_path / "train"
    eval_path = output_path / "eval"
    
    train_data.save_to_disk(str(train_path))
    eval_data.save_to_disk(str(eval_path))
    
    print(f"Saved to: {output_dir}")
    print(f"  Train: {len(train_data)} examples")
    print(f"  Eval: {len(eval_data)} examples")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare a JSONL dataset for LoRA training")
    parser.add_argument("--input", type=str, default="training-data/generated/synthetic_50k.jsonl",
                        help="input JSONL file")
    parser.add_argument("--output", type=str, default="stack-2.9-training/data",
                        help="directory for the train/ and eval/ splits")
    parser.add_argument("--max-length", type=int, default=2048,
                        help="max token length per example")
    parser.add_argument("--test-split", type=float, default=0.1,
                        help="fraction of examples held out for eval")
    args = parser.parse_args()
    
    # Resolve the input path relative to the workspace root
    # (the output path stays relative to the current working directory)
    input_path = Path(args.input)
    if not input_path.is_absolute():
        input_path = Path("/Users/walidsobhi/.openclaw/workspace/stack-2.9") / input_path
    
    prepare_dataset(
        str(input_path),
        args.output,
        args.max_length,
        args.test_split
    )