attendantelectro committed on
Commit
7c2ff90
·
verified ·
1 Parent(s): 7ce3d31

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +9 -8
scraper.py CHANGED
@@ -9,30 +9,31 @@ def scrape_pages(base_url, start_page, end_page):
9
  chrome_options.add_argument("--no-sandbox")
10
  chrome_options.add_argument("--disable-dev-shm-usage")
11
 
12
- # استفاده مستقیم از ChromeDriver داخل کانتینر
13
  driver = webdriver.Chrome(options=chrome_options)
14
 
15
- if not os.path.exists('pages'):
16
- os.makedirs('pages')
 
 
17
 
18
  for page in range(start_page, end_page + 1):
19
  url = f"{base_url}?page={page}"
20
  driver.get(url)
21
  html = driver.page_source
22
 
23
- with open(f'pages/t{page}.html', 'w', encoding='utf-8') as f:
24
  f.write(html)
25
  print(f"Saved: t{page}.html")
26
 
27
  driver.quit()
28
 
29
- # ایجاد فایل ZIP
30
- with zipfile.ZipFile('html.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
31
- for root, dirs, files in os.walk('pages'):
32
  for file in files:
33
  zipf.write(os.path.join(root, file), file)
34
 
35
- print("All pages saved and zipped as html.zip")
36
 
37
  if __name__ == "__main__":
38
  base_url = "https://shahvani.com/dastans"
 
9
  chrome_options.add_argument("--no-sandbox")
10
  chrome_options.add_argument("--disable-dev-shm-usage")
11
 
 
12
  driver = webdriver.Chrome(options=chrome_options)
13
 
14
+ # مسیر ذخیره‌سازی در داخل کانتینر
15
+ output_dir = '/app/pages'
16
+ if not os.path.exists(output_dir):
17
+ os.makedirs(output_dir)
18
 
19
  for page in range(start_page, end_page + 1):
20
  url = f"{base_url}?page={page}"
21
  driver.get(url)
22
  html = driver.page_source
23
 
24
+ with open(f'{output_dir}/t{page}.html', 'w', encoding='utf-8') as f:
25
  f.write(html)
26
  print(f"Saved: t{page}.html")
27
 
28
  driver.quit()
29
 
30
+ # ایجاد فایل ZIP در مسیر متصل
31
+ with zipfile.ZipFile('/app/html.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
32
+ for root, dirs, files in os.walk(output_dir):
33
  for file in files:
34
  zipf.write(os.path.join(root, file), file)
35
 
36
+ print("All pages saved and zipped as /app/html.zip")
37
 
38
  if __name__ == "__main__":
39
  base_url = "https://shahvani.com/dastans"