"""Patch n.py to support scraper streaming.

Idempotent: exits early (status 0) if n.py already references
ScraperStreamDataset. Otherwise it backs up n.py, inserts the
stream_loader import, and splices a scraper short-circuit into
_open_stream_one().
"""
import re
import shutil
import sys
from datetime import datetime

SRC = "/workspace/n.py"

# Read the target first so we can bail out — and skip making a useless
# backup — when the patch has already been applied.
with open(SRC, 'r') as f:
    code = f.read()

if 'ScraperStreamDataset' in code:
    print("Already patched!")
    sys.exit(0)

# Timestamped backup before any modification.
shutil.copy(SRC, f"{SRC}.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}")

# Insert the stream_loader import directly after the `datasets` import.
# re.sub is checked: if the anchor line is missing we must not silently
# write out an unpatched (or half-patched) file.
import_line = "from stream_loader import ScraperStreamDataset\n"
patched = re.sub(
    r'(from datasets import[^\n]+\n)',
    r'\1' + import_line,
    code,
    count=1,
)
if patched == code:
    # sys.exit with a string prints it to stderr and exits with status 1.
    sys.exit(f"ERROR: no 'from datasets import ...' line found in {SRC}; aborting")
code = patched

# Anchor on the function header plus its first statement so the scraper
# branch is spliced in at the very top of _open_stream_one().
old_func = '''def _open_stream_one(ds_name: str, seed: int, streaming: bool = True):
    dc = DownloadConfig(max_retries=5, use_etag=True, resume_download=True)'''

new_func = '''def _open_stream_one(ds_name: str, seed: int, streaming: bool = True):
    # Custom scraper streaming support
    if ds_name == "scraper" or ds_name.startswith("http://"):
        url = ds_name if ds_name.startswith("http://") else "http://localhost:8888"
        print(f"[stream] Using scraper: {url}")
        return iter(ScraperStreamDataset(server_url=url, batch_size=100))
    dc = DownloadConfig(max_retries=5, use_etag=True, resume_download=True)'''

# str.replace is a silent no-op on a miss, so verify the anchor exists
# before claiming success.
if old_func not in code:
    sys.exit("ERROR: _open_stream_one() anchor not found; n.py may have changed — aborting")
code = code.replace(old_func, new_func)

with open(SRC, 'w') as f:
    f.write(code)

print("Patched successfully!")
print("Use --source scraper or --source http://localhost:8888 to use scraped data")