prelington
/

PyPilot

coding_assistant

Model card Files Files and versions

xet

Community

prelington committed on Oct 2, 2025

Commit

3a7ae3c

verified ·

1 Parent(s): f1a7dc4

Create data_preprocessor.py

Browse files

Files changed (1) hide show

data_preprocessor.py +124 -0

data_preprocessor.py ADDED Viewed

	@@ -0,0 +1,124 @@

+"""
+PyPilot Data Preprocessor - Handles massive code datasets
+"""
+import json
+import pickle
+import multiprocessing as mp
+from pathlib import Path
+from datasets import load_dataset
+import tokenizers
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.trainers import BpeTrainer
+from tokenizers.pre_tokenizers import Whitespace
+class PyPilotDataPreprocessor:
+    """Load, tokenize and chunk source-code corpora for PyPilot training.
+
+    Instances hold two attributes, both set in ``__init__``:
+    ``supported_languages`` (a list of language names; no method in this
+    class reads it) and ``processed_data`` (an empty dict; no method in
+    this class writes it).
+    """
+
+    # File extension -> language name, used by detect_language().
+    # Keys are lower-case; the lookup lower-cases the suffix first.
+    _EXTENSION_LANGUAGES = {
+        '.py': 'python',
+        '.js': 'javascript',
+        '.java': 'java',
+        '.cpp': 'cpp',
+        '.cc': 'cpp',
+        '.go': 'go',
+        '.rs': 'rust',
+        '.ts': 'typescript',
+    }
+
+    def __init__(self):
+        self.supported_languages = ['python', 'javascript', 'java', 'cpp', 'go', 'rust']
+        self.processed_data = {}
+
+    def load_github_dataset(self, language='python', split='train'):
+        """Load one language of the ``codeparrot/github-code`` dataset.
+
+        Args:
+            language: Forwarded as the single entry of ``languages=[...]``.
+            split: Dataset split name forwarded to ``load_dataset``.
+
+        Returns:
+            The object returned by ``load_dataset``, or ``None`` when
+            loading (or taking its length for the log line) raised.
+        """
+        print(f"📥 Loading {language} code dataset...")
+        # NOTE(review): the dataset may expect capitalised language names
+        # (e.g. "Python") - confirm against the dataset card.
+        try:
+            dataset = load_dataset("codeparrot/github-code", split=split, languages=[language])
+            print(f"✅ Loaded {len(dataset)} {language} files")
+            return dataset
+        except Exception as e:
+            # Deliberate best-effort: report the failure and let the caller
+            # test the result for None instead of crashing here.
+            print(f"❌ Error loading dataset: {e}")
+            return None
+
+    def build_tokenizer(self, dataset, vocab_size=50000,
+                        save_path="./pypilot_tokenizer.json"):
+        """Train a BPE tokenizer on ``dataset`` and save it as JSON.
+
+        Args:
+            dataset: Sized object supporting ``dataset[i:j]['code']``, which
+                must yield a batch of source strings.
+            vocab_size: Target vocabulary size passed to ``BpeTrainer``.
+            save_path: Where the trained tokenizer is written.
+
+        Returns:
+            The trained ``Tokenizer``.
+        """
+        print("🔤 Building custom code tokenizer...")
+        # "[UNK]" is registered as a special token below; naming it here as
+        # the model's unknown token is what lets the model actually emit it.
+        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+        tokenizer.pre_tokenizer = Whitespace()
+        trainer = BpeTrainer(
+            vocab_size=vocab_size,
+            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "[EOL]"]
+        )
+
+        def batch_iterator(batch_size=1000):
+            # Feed the trainer slices so the corpus is read batch by batch.
+            for start in range(0, len(dataset), batch_size):
+                yield dataset[start:start + batch_size]['code']
+
+        tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
+        tokenizer.save(save_path)
+        print("✅ Tokenizer built and saved!")
+        return tokenizer
+
+    def _read_file(self, file_path):
+        """Read one file; return its record, or an ``'error'`` record."""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+        except Exception as e:
+            # Best-effort: one unreadable file must not abort the batch.
+            return {'error': str(e), 'file_path': str(file_path)}
+        return {
+            'file_path': str(file_path),
+            'content': content,
+            'length': len(content),
+            'language': self.detect_language(file_path)
+        }
+
+    def parallel_process_files(self, file_paths, num_processes=8):
+        """Read ``file_paths`` with a pool of worker processes.
+
+        Args:
+            file_paths: Sized collection of paths to read as UTF-8 text.
+            num_processes: Size of the ``multiprocessing.Pool``.
+
+        Returns:
+            A list of dicts with ``file_path``, ``content``, ``length`` and
+            ``language`` keys, one per file that could be read.
+        """
+        print(f"⚡ Processing {len(file_paths)} files with {num_processes} processes...")
+        if file_paths:
+            # The worker is a bound method rather than a nested function:
+            # Pool.map pickles its callable, and local functions cannot be
+            # pickled. Pickling the bound method also pickles this instance.
+            with mp.Pool(num_processes) as pool:
+                results = pool.map(self._read_file, file_paths)
+        else:
+            # Nothing to do; avoid starting worker processes.
+            results = []
+        successful = [r for r in results if 'error' not in r]
+        skipped = len(results) - len(successful)
+        if skipped:
+            print(f"⚠️ Skipped {skipped} files that could not be read")
+        print(f"✅ Processed {len(successful)} files successfully")
+        return successful
+
+    def detect_language(self, file_path):
+        """Return the language name for ``file_path``'s extension.
+
+        The extension is matched case-insensitively. Extensions missing
+        from the lookup table return ``'unknown'``.
+        """
+        suffix = Path(file_path).suffix.lower()
+        return self._EXTENSION_LANGUAGES.get(suffix, 'unknown')
+
+    def create_training_pairs(self, code_samples, context_size=512):
+        """Create character-level (input, target) pairs from code samples.
+
+        Each sample's text is cut into windows of ``context_size``
+        characters that advance by half a window. The target is the same
+        window shifted one character to the right. Samples no longer than
+        ``context_size`` produce no pairs.
+
+        Args:
+            code_samples: Iterable of mappings. The text is read from
+                ``'content'``, falling back to ``'code'``; ``'language'``
+                is optional and defaults to ``'unknown'``.
+            context_size: Window length in characters; must be at least 1.
+
+        Returns:
+            A list of dicts with ``input``, ``target`` and ``language``.
+
+        Raises:
+            ValueError: If ``context_size`` is smaller than 1.
+        """
+        if context_size < 1:
+            raise ValueError("context_size must be at least 1")
+        print("🔄 Creating training pairs...")
+        # Half-window overlap; never 0, which range() rejects as a step.
+        stride = max(1, context_size // 2)
+        training_pairs = []
+        for sample in code_samples:
+            # Records from parallel_process_files() use 'content'; rows of
+            # the dataset read by build_tokenizer() use 'code'.
+            code = sample.get('content') or sample.get('code') or ''
+            if len(code) <= context_size:
+                continue
+            language = sample.get('language', 'unknown')
+            # Stopping before len(code) - context_size leaves room for the
+            # shifted target, so input and target always have equal length.
+            for start in range(0, len(code) - context_size, stride):
+                training_pairs.append({
+                    'input': code[start:start + context_size],
+                    'target': code[start + 1:start + context_size + 1],
+                    'language': language
+                })
+        print(f"✅ Created {len(training_pairs)} training pairs")
+        return training_pairs
+if __name__ == "__main__":
+    # Example usage: load the Python split, train and save a tokenizer,
+    # then pickle character-level training pairs.
+    language = 'python'
+    preprocessor = PyPilotDataPreprocessor()
+    dataset = preprocessor.load_github_dataset(language)
+    # load_github_dataset() returns None when loading failed.
+    if dataset:
+        # build_tokenizer() writes the tokenizer to disk itself; its return
+        # value is not needed here.
+        preprocessor.build_tokenizer(dataset)
+        # build_tokenizer() reads each row's text from 'code', whereas
+        # create_training_pairs() looks it up under 'content'. Adapt the
+        # rows lazily so the pairs are not silently empty.
+        samples = (
+            {'content': row['code'], 'language': language}
+            for row in dataset
+        )
+        training_data = preprocessor.create_training_pairs(samples)
+        # Save processed data
+        with open('processed_training_data.pkl', 'wb') as f:
+            pickle.dump(training_data, f)
+        print("💾 Training data saved!")