prelington committed on
Commit
3a7ae3c
·
verified ·
1 Parent(s): f1a7dc4

Create data_preprocessor.py

Browse files
Files changed (1) hide show
  1. data_preprocessor.py +124 -0
data_preprocessor.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PyPilot Data Preprocessor - Handles massive code datasets
3
+ """
4
+ import json
5
+ import pickle
6
+ import multiprocessing as mp
7
+ from pathlib import Path
8
+ from datasets import load_dataset
9
+ import tokenizers
10
+ from tokenizers import Tokenizer
11
+ from tokenizers.models import BPE
12
+ from tokenizers.trainers import BpeTrainer
13
+ from tokenizers.pre_tokenizers import Whitespace
14
+
15
class PyPilotDataPreprocessor:
    """Load, tokenize and chunk source-code corpora for PyPilot training.

    The class bundles four independent steps: downloading a code dataset,
    training a BPE tokenizer on it, reading local source files in parallel,
    and slicing code into overlapping (input, target) training pairs.
    """

    # File suffix (lowercase) -> language label stored in processed samples.
    _EXTENSION_LANGUAGES = {
        '.py': 'python',
        '.js': 'javascript',
        '.java': 'java',
        '.cpp': 'cpp',
        '.cc': 'cpp',
        '.go': 'go',
        '.rs': 'rust',
        '.ts': 'typescript',
    }

    # Lowercase identifiers used by this class -> language names as spelled
    # on the codeparrot/github-code dataset card. Names not listed here are
    # forwarded to the dataset loader unchanged.
    _DATASET_LANGUAGE_NAMES = {
        'python': 'Python',
        'javascript': 'JavaScript',
        'java': 'Java',
        'cpp': 'C++',
        'go': 'GO',
        'rust': 'Rust',
        'typescript': 'TypeScript',
    }

    def __init__(self):
        self.supported_languages = ['python', 'javascript', 'java', 'cpp', 'go', 'rust']
        self.processed_data = {}

    def load_github_dataset(self, language='python', split='train'):
        """Load one language subset of the codeparrot/github-code dataset.

        Args:
            language: Language identifier. Lowercase identifiers such as
                ``'python'`` or ``'cpp'`` are translated to the spelling the
                dataset expects; anything else is passed through as given.
            split: Dataset split to load.

        Returns:
            The loaded dataset, or ``None`` when loading fails (the error is
            printed rather than raised, so callers must check for ``None``).
        """
        print(f"πŸ“₯ Loading {language} code dataset...")
        try:
            dataset_language = self._DATASET_LANGUAGE_NAMES.get(
                str(language).lower(), language
            )
            dataset = load_dataset(
                "codeparrot/github-code",
                split=split,
                languages=[dataset_language],
            )
            print(f"βœ… Loaded {len(dataset)} {language} files")
            return dataset
        except Exception as e:
            # Deliberate best-effort boundary: network, auth and dataset
            # script errors are all reported the same way to the caller.
            print(f"❌ Error loading dataset: {e}")
            return None

    def build_tokenizer(self, dataset, vocab_size=50000,
                        save_path="./pypilot_tokenizer.json"):
        """Train a BPE tokenizer on the dataset's ``'code'`` column and save it.

        Args:
            dataset: Sized, sliceable dataset whose slices expose a ``'code'``
                list of strings.
            vocab_size: Target vocabulary size for the BPE trainer.
            save_path: Where the trained tokenizer JSON is written.

        Returns:
            The trained ``Tokenizer``.
        """
        print("πŸ”€ Building custom code tokenizer...")

        # The model must be told which token stands for unknown input;
        # listing "[UNK]" among the special tokens alone does not do that.
        unk_token = "[UNK]"
        tokenizer = Tokenizer(BPE(unk_token=unk_token))
        # NOTE(review): Whitespace pre-tokenization discards indentation and
        # newlines, which are significant in code - confirm this is intended.
        tokenizer.pre_tokenizer = Whitespace()

        trainer = BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[PAD]", unk_token, "[CLS]", "[SEP]", "[MASK]", "[EOL]"],
        )

        total = len(dataset)

        # Feed the trainer in batches so the whole corpus is never
        # materialized as one Python list.
        def batch_iterator(batch_size=1000):
            for start in range(0, total, batch_size):
                yield dataset[start:start + batch_size]['code']

        tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=total)
        tokenizer.save(str(save_path))
        print("βœ… Tokenizer built and saved!")
        return tokenizer

    @staticmethod
    def _read_source_file(file_path):
        """Read one UTF-8 file; return a content record or an error record.

        This is a static method (not a closure) so that multiprocessing can
        pickle it by name and ship it to worker processes.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            # Per-file best effort: one unreadable file must not abort the
            # whole batch, so the failure is returned as data.
            return {'error': str(e), 'file_path': str(file_path)}
        return {
            'file_path': str(file_path),
            'content': content,
            'length': len(content),
        }

    def parallel_process_files(self, file_paths, num_processes=8):
        """Read source files, in parallel when more than one worker is useful.

        Args:
            file_paths: Iterable of paths to read as UTF-8 text.
            num_processes: Worker process count. ``None`` lets
                multiprocessing pick a default; values of 1 or less read the
                files serially in the current process.

        Returns:
            A list of dicts with keys ``file_path``, ``content``, ``length``
            and ``language`` for every file that could be read. Unreadable
            files are skipped and counted in the printed summary.
        """
        file_paths = list(file_paths)
        print(f"⚑ Processing {len(file_paths)} files with {num_processes} processes...")

        serial = num_processes is not None and num_processes <= 1
        if not file_paths:
            results = []
        elif serial or len(file_paths) == 1:
            results = [self._read_source_file(path) for path in file_paths]
        else:
            # Never start more workers than there are files to read.
            pool_size = (
                None if num_processes is None
                else min(num_processes, len(file_paths))
            )
            with mp.Pool(pool_size) as pool:
                results = pool.map(self._read_source_file, file_paths)

        successful = []
        for record in results:
            if 'error' in record:
                continue
            # Language detection runs in the parent so that the worker stays
            # a plain function and overrides of detect_language are honored.
            record['language'] = self.detect_language(record['file_path'])
            successful.append(record)

        failed = len(results) - len(successful)
        if failed:
            print(f"Skipped {failed} files that could not be read")
        print(f"βœ… Processed {len(successful)} files successfully")
        return successful

    def detect_language(self, file_path):
        """Return the language label for a path's suffix, or ``'unknown'``.

        The suffix is matched case-insensitively, so ``Main.PY`` is python.
        """
        suffix = Path(file_path).suffix.lower()
        return self._EXTENSION_LANGUAGES.get(suffix, 'unknown')

    def create_training_pairs(self, code_samples, context_size=512):
        """Slice each sample into overlapping next-character prediction pairs.

        Each pair is a window of ``context_size`` characters and the same
        window shifted right by one character. Windows advance by half the
        context size (at least one character). Samples that are not longer
        than ``context_size`` produce no pairs.

        Args:
            code_samples: Iterable of mappings holding the source text under
                ``'content'`` (as produced by parallel_process_files) or
                ``'code'`` (the dataset column used by build_tokenizer), and
                optionally a ``'language'`` entry.
            context_size: Window length in characters; must be at least 1.

        Returns:
            A list of dicts with keys ``input``, ``target`` and ``language``.

        Raises:
            ValueError: If ``context_size`` is smaller than 1.
        """
        if context_size < 1:
            raise ValueError("context_size must be a positive integer")

        print("πŸ”„ Creating training pairs...")
        # A context size of 1 would otherwise give a zero step for range().
        stride = max(1, context_size // 2)
        training_pairs = []

        for sample in code_samples:
            code = sample.get('content') or sample.get('code') or ''
            language = sample.get('language', 'unknown')
            # The range is empty unless the sample is longer than one window,
            # which guarantees every target has the full window length.
            for start in range(0, len(code) - context_size, stride):
                training_pairs.append({
                    'input': code[start:start + context_size],
                    'target': code[start + 1:start + context_size + 1],
                    'language': language,
                })

        print(f"βœ… Created {len(training_pairs)} training pairs")
        return training_pairs
111
+
112
if __name__ == "__main__":
    # Example usage: download the Python subset, train a tokenizer on it,
    # then build and save character-level training pairs.
    preprocessor = PyPilotDataPreprocessor()

    dataset = preprocessor.load_github_dataset('python')
    if dataset:
        tokenizer = preprocessor.build_tokenizer(dataset)

        # Dataset rows keep their source text under 'code' (the column
        # build_tokenizer trains on), while create_training_pairs reads
        # 'content'. Adapt each row lazily so the pairs are not silently empty.
        samples = (
            {'content': row['code'], 'language': row.get('language', 'unknown')}
            for row in dataset
        )
        training_data = preprocessor.create_training_pairs(samples)

        # Save processed data
        # NOTE(review): every pair is held in memory before pickling; for a
        # very large dataset this may need streaming output instead.
        with open('processed_training_data.pkl', 'wb') as f:
            pickle.dump(training_data, f)
        print("πŸ’Ύ Training data saved!")