"""Patch n.py to support scraper streaming.

The patch makes two textual edits to the file at ``SRC``:

1. adds ``from stream_loader import ScraperStreamDataset`` right after the
   first ``from datasets import ...`` statement, and
2. prepends a scraper branch to the body of ``_open_stream_one``.

The file is only rewritten (and a timestamped backup only created) when
both edits can be applied; otherwise the script exits with an error and
leaves the target untouched.
"""
import re
import shutil
import sys
from datetime import datetime

SRC = "/workspace/n.py"

# Marker whose presence in the target means the patch was already applied.
PATCH_MARKER = "ScraperStreamDataset"

IMPORT_LINE = "from stream_loader import ScraperStreamDataset\n"

# First physical line of a ``from datasets import ...`` statement.  Anchored
# to the start of a line so commented-out imports are ignored; the captured
# indentation is reused for the inserted import.
_DATASETS_IMPORT = re.compile(
    r'^([ \t]*)from datasets import[^\n]+\n', re.MULTILINE
)

# Text that must be present verbatim in the target file.
# NOTE(review): the body indentation is assumed to be 4 spaces -- confirm
# against the real n.py, since the match is byte-for-byte.
OLD_FUNC = '''def _open_stream_one(ds_name: str, seed: int, streaming: bool = True):
    dc = DownloadConfig(max_retries=5, use_etag=True, resume_download=True)'''

NEW_FUNC = '''def _open_stream_one(ds_name: str, seed: int, streaming: bool = True):
    # Custom scraper streaming support
    if ds_name == "scraper" or ds_name.startswith("http://"):
        url = ds_name if ds_name.startswith("http://") else "http://localhost:8888"
        print(f"[stream] Using scraper: {url}")
        return iter(ScraperStreamDataset(server_url=url, batch_size=100))
    dc = DownloadConfig(max_retries=5, use_etag=True, resume_download=True)'''


class PatchError(RuntimeError):
    """Raised when the target source lacks an anchor the patch relies on."""


def is_patched(code: str) -> bool:
    """Return True when *code* already mentions ``ScraperStreamDataset``."""
    return PATCH_MARKER in code


def _insert_import(code: str) -> str:
    """Return *code* with ``IMPORT_LINE`` added after the datasets import.

    Raises:
        PatchError: if no ``from datasets import`` statement is found, or a
            parenthesised import is never closed.
    """
    match = _DATASETS_IMPORT.search(code)
    if match is None:
        raise PatchError("no 'from datasets import ...' line to anchor the import")

    end = match.end()
    # Ignore a trailing comment when looking for parentheses.
    statement = match.group(0).split("#", 1)[0]
    if "(" in statement and ")" not in statement:
        # Multi-line parenthesised import: insert after the closing line,
        # never inside the parentheses.
        close = code.find(")", end)
        if close == -1:
            raise PatchError("unterminated 'from datasets import (' statement")
        newline = code.find("\n", close)
        end = len(code) if newline == -1 else newline + 1

    # Only needed when the closing parenthesis is the last byte of the file.
    separator = "" if code[:end].endswith("\n") else "\n"
    return code[:end] + separator + match.group(1) + IMPORT_LINE + code[end:]


def patch_source(code: str) -> str:
    """Return *code* with the scraper import and scraper branch applied.

    Args:
        code: Full text of the module to patch.

    Returns:
        The patched module text.

    Raises:
        PatchError: if the ``_open_stream_one`` prologue or the
            ``from datasets import`` anchor is missing, so that a
            half-applied patch is never produced.
    """
    if OLD_FUNC not in code:
        raise PatchError("'_open_stream_one' prologue not found; nothing to replace")
    # The "already patched" check normally guarantees the import is absent,
    # but stay idempotent if this function is called directly.
    if IMPORT_LINE.strip() not in code:
        code = _insert_import(code)
    return code.replace(OLD_FUNC, NEW_FUNC)


def main() -> int:
    """Patch ``SRC`` in place and return a process exit code."""
    with open(SRC, "r", encoding="utf-8") as f:
        code = f.read()

    if is_patched(code):
        print("Already patched!")
        return 0

    try:
        patched = patch_source(code)
    except PatchError as err:
        print(f"Patch failed: {err}", file=sys.stderr)
        return 1

    # Back up only once we know the file is really going to be rewritten.
    shutil.copy(SRC, f"{SRC}.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    with open(SRC, "w", encoding="utf-8") as f:
        f.write(patched)

    print("Patched successfully!")
    print("Use --source scraper or --source http://localhost:8888 to use scraped data")
    return 0


if __name__ == "__main__":
    sys.exit(main())