tst / scraper.py
attendantelectro's picture
Update scraper.py
6718a3f verified
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import zipfile
import os
def scrape_pages(base_url, start_page, end_page):
    """Save a range of paginated pages as HTML files and bundle them in a ZIP.

    For every page number from ``start_page`` to ``end_page`` (inclusive) the
    URL ``{base_url}?page={n}`` is loaded in headless Chrome and the rendered
    page source is written to ``/home/seluser/output/pages/t{n}.html``.
    Afterwards every file found under that directory is packed into
    ``/home/seluser/output/html.zip``.

    Args:
        base_url: URL to which ``?page=<n>`` is appended for each page.
        start_page: First page number to fetch (inclusive).
        end_page: Last page number to fetch (inclusive).
    """
    # Pages are stored under /home/seluser/output/pages.
    output_dir = '/home/seluser/output/pages'
    zip_path = '/home/seluser/output/html.zip'

    # Create the directory before starting the browser so that a filesystem
    # error cannot leave a Chrome process running. exist_ok=True already
    # covers the "directory exists" case, so no separate existence check.
    os.makedirs(output_dir, exist_ok=True)

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=chrome_options)
    try:
        for page in range(start_page, end_page + 1):
            url = f"{base_url}?page={page}"
            driver.get(url)
            html = driver.page_source

            page_path = f'{output_dir}/t{page}.html'
            with open(page_path, 'w', encoding='utf-8') as f:
                f.write(html)
            print(f"Saved: {page_path}")
    finally:
        # Always shut the browser down, even if a page load or write failed.
        driver.quit()

    # Build the ZIP archive in /home/seluser/output.
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(output_dir):
            for file in files:
                full_path = os.path.join(root, file)
                # Store paths relative to output_dir: identical to the bare
                # file name for top-level files, but keeps same-named files
                # from nested directories from colliding inside the archive.
                zipf.write(full_path, os.path.relpath(full_path, output_dir))
    print(f"All pages saved and zipped as {zip_path}")
if __name__ == "__main__":
    # Script entry point: fetch pages 1 through 10 of the listing.
    # For a quick test run, choose a smaller end_page.
    base_url, start_page, end_page = "https://shahvani.com/dastans", 1, 10
    scrape_pages(base_url=base_url, start_page=start_page, end_page=end_page)