OpenTransformer committed on
Commit
7e616ea
·
verified ·
1 Parent(s): 93b6ddd

Backup script patcher.py

Browse files
Files changed (1) hide show
  1. scripts/patcher.py +49 -0
scripts/patcher.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Patch n.py to support scraper streaming"""
3
+ import re
4
+ import shutil
5
+ from datetime import datetime
6
+
7
+ SRC = "/workspace/n.py"
8
+
9
+ # Backup
10
+ shutil.copy(SRC, f"{SRC}.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}")
11
+
12
+ with open(SRC, 'r') as f:
13
+ code = f.read()
14
+
15
+ # Check if already patched
16
+ if 'ScraperStreamDataset' in code:
17
+ print("Already patched!")
18
+ exit(0)
19
+
20
+ # Add import after first import block
21
+ import_line = "from stream_loader import ScraperStreamDataset\n"
22
+ if import_line.strip() not in code:
23
+ # Insert after 'from datasets import' line
24
+ code = re.sub(
25
+ r'(from datasets import[^\n]+\n)',
26
+ r'\1' + import_line,
27
+ code,
28
+ count=1
29
+ )
30
+
31
+ # Patch _open_stream_one function
32
+ old_func = '''def _open_stream_one(ds_name: str, seed: int, streaming: bool = True):
33
+ dc = DownloadConfig(max_retries=5, use_etag=True, resume_download=True)'''
34
+
35
+ new_func = '''def _open_stream_one(ds_name: str, seed: int, streaming: bool = True):
36
+ # Custom scraper streaming support
37
+ if ds_name == "scraper" or ds_name.startswith("http://"):
38
+ url = ds_name if ds_name.startswith("http://") else "http://localhost:8888"
39
+ print(f"[stream] Using scraper: {url}")
40
+ return iter(ScraperStreamDataset(server_url=url, batch_size=100))
41
+ dc = DownloadConfig(max_retries=5, use_etag=True, resume_download=True)'''
42
+
43
+ code = code.replace(old_func, new_func)
44
+
45
+ with open(SRC, 'w') as f:
46
+ f.write(code)
47
+
48
+ print("Patched successfully!")
49
+ print("Use --source scraper or --source http://localhost:8888 to use scraped data")