#!/usr/bin/env python3
"""Patch n.py to support scraper streaming.

The script rewrites the file at ``SRC`` in place:

1. adds ``from stream_loader import ScraperStreamDataset`` directly after the
   first ``from datasets import ...`` statement, and
2. prepends a scraper branch to the body of ``_open_stream_one``.

A timestamped backup of ``SRC`` is written immediately before the file is
modified. If either anchor cannot be found the file is left untouched and the
script exits with status 1.
"""
import re
import shutil
import sys
from datetime import datetime

SRC = "/workspace/n.py"

# Presence of this name in the target means the patch was already applied.
PATCH_MARKER = "ScraperStreamDataset"

IMPORT_LINE = "from stream_loader import ScraperStreamDataset\n"

# First ``from datasets import ...`` statement. The first alternative covers a
# parenthesized (possibly multi-line) import so the new import is inserted
# after the closing parenthesis rather than inside the parentheses.
DATASETS_IMPORT_RE = re.compile(
    r"(from datasets import(?:[ \t]*\([^)]*\)[^\n]*|[^\n]+)\n)"
)

# NOTE(review): the anchor below assumes n.py indents the function body with
# four spaces and has nothing between the ``def`` line and the ``dc = ...``
# line -- confirm against the real file.
OLD_FUNC = '''def _open_stream_one(ds_name: str, seed: int, streaming: bool = True):
    dc = DownloadConfig(max_retries=5, use_etag=True, resume_download=True)'''

NEW_FUNC = '''def _open_stream_one(ds_name: str, seed: int, streaming: bool = True):
    # Custom scraper streaming support
    if ds_name == "scraper" or ds_name.startswith("http://"):
        url = ds_name if ds_name.startswith("http://") else "http://localhost:8888"
        print(f"[stream] Using scraper: {url}")
        return iter(ScraperStreamDataset(server_url=url, batch_size=100))
    dc = DownloadConfig(max_retries=5, use_etag=True, resume_download=True)'''


class PatchError(RuntimeError):
    """Raised when an anchor required by the patch is missing from the source."""


def apply_patch(code: str) -> str:
    """Return *code* with the scraper import and ``_open_stream_one`` branch added.

    Args:
        code: Full text of the module to patch.

    Returns:
        The patched module text.

    Raises:
        PatchError: If ``_open_stream_one``'s expected opening lines or a
            ``from datasets import`` statement cannot be found. Nothing is
            partially applied in that case because the input string is never
            mutated.
    """
    # Verify the function anchor first so a failure is reported before any
    # other transformation is attempted.
    if OLD_FUNC not in code:
        raise PatchError("could not find the expected _open_stream_one definition")

    if IMPORT_LINE.strip() not in code:
        # A callable replacement avoids backslash/group-escape processing of
        # the inserted text.
        code, inserted = DATASETS_IMPORT_RE.subn(
            lambda match: match.group(1) + IMPORT_LINE, code, count=1
        )
        if not inserted:
            raise PatchError("could not find a 'from datasets import' line")

    return code.replace(OLD_FUNC, NEW_FUNC)


def main() -> int:
    """Patch the file at ``SRC`` in place and return a process exit status."""
    with open(SRC, "r", encoding="utf-8") as f:
        code = f.read()

    # Check if already patched
    if PATCH_MARKER in code:
        print("Already patched!")
        return 0

    try:
        patched = apply_patch(code)
    except PatchError as err:
        print(f"Patch failed: {err}", file=sys.stderr)
        return 1

    # Back up only when the file is really about to change, so repeated runs
    # on an already-patched or unpatchable file do not pile up backups.
    shutil.copy(SRC, f"{SRC}.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}")

    with open(SRC, "w", encoding="utf-8") as f:
        f.write(patched)

    print("Patched successfully!")
    print("Use --source scraper or --source http://localhost:8888 to use scraped data")
    return 0


if __name__ == "__main__":
    sys.exit(main())