File size: 2,900 Bytes
144669d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from random import randint
# Seconds to wait for a server response before giving up on a single request.
_REQUEST_TIMEOUT = 10
# Number of additional attempts made after the first request to a URL fails.
_MAX_RETRIES = 3


def _parse_tariff_rows(html):
    """Extract tariff records from the table rows of an HTML document.

    Args:
        html: Raw HTML (bytes or str) handed to BeautifulSoup.

    Returns:
        A list of ``{"category": str, "rate": float}`` dicts, one for each
        ``<tr>`` that has at least two ``<td>`` cells and whose second cell
        parses as a float once commas are removed.
    """
    records = []
    for row in BeautifulSoup(html, "html.parser").find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue
        try:
            rate = float(cells[1].text.strip().replace(",", ""))
        except ValueError:
            # Second cell is not numeric; skip the row.
            continue
        records.append({"category": cells[0].text.strip(), "rate": rate})
    return records


def _fetch_tariff_rows(url):
    """Download *url* once and return the tariff records found in it.

    Returns an empty list when the response passes ``raise_for_status()``
    but its status code is not exactly 200.

    Raises:
        requests.exceptions.RequestException: On connection problems,
            timeouts, or a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    if response.status_code != 200:
        return []
    return _parse_tariff_rows(response.content)


def scrape_tariffs(urls):
    """Scrape tariff tables from *urls* and save them to ``data/tariffs.csv``.

    Each URL is requested once and, if the request fails, retried up to
    ``_MAX_RETRIES`` more times with a random 1-3 second pause before each
    retry. A random 2-5 second pause follows every URL to avoid hitting the
    servers too quickly. URLs that still fail after all retries are skipped.

    Args:
        urls: Iterable of page URLs to scrape.

    Returns:
        None. When at least one record was collected, the records are written
        to ``data/tariffs.csv`` (the ``data`` directory is created if needed);
        otherwise only a message is printed.
    """
    data = []
    # Ensure the 'data' directory exists before saving the CSV.
    os.makedirs("data", exist_ok=True)

    for url in urls:
        for attempt in range(_MAX_RETRIES + 1):
            if attempt:
                # Back off for a random time before every retry.
                time.sleep(randint(1, 3))
            try:
                rows = _fetch_tariff_rows(url)
            except requests.exceptions.RequestException as err:
                if attempt == 0:
                    print(f"Error fetching data from {url}: {err}")
                    print("Retrying...")
                else:
                    # Report the error of THIS retry, not the first failure.
                    print(f"Retry failed: {err}")
                continue
            data.extend(rows)
            break
        # Sleep between requests to avoid hitting the servers too quickly.
        time.sleep(randint(2, 5))

    if data:
        df = pd.DataFrame(data)
        # Save the scraped data to the 'data' directory.
        df.to_csv("data/tariffs.csv", index=False)
        print("Tariff data saved successfully.")
    else:
        print("No tariff data found.")