|
|
import os |
|
|
import requests |
|
|
from bs4 import BeautifulSoup |
|
|
import pandas as pd |
|
|
import time |
|
|
from random import randint |
|
|
|
|
|
def scrape_tariffs(urls):
    """Scrape tariff tables from each URL and save them to data/tariffs.csv.

    Each page is expected to contain HTML table rows (<tr>) whose first two
    cells are a category name and a numeric rate.  Rows whose rate cell is
    not numeric are skipped.  Fetch failures are retried up to 3 times per
    URL; URLs that still fail are skipped.

    Args:
        urls: Iterable of page URLs to scrape.

    Side effects:
        Creates the ``data/`` directory if missing, writes
        ``data/tariffs.csv`` when any rows were collected, and prints
        progress/error messages.  Returns None.
    """
    data = []
    os.makedirs("data", exist_ok=True)

    for url in urls:
        data.extend(_fetch_tariff_rows(url))
        # Polite randomized delay between pages so we don't hammer the host.
        time.sleep(randint(2, 5))

    if data:
        df = pd.DataFrame(data)
        df.to_csv("data/tariffs.csv", index=False)
        print("Tariff data saved successfully.")
    else:
        print("No tariff data found.")


def _fetch_tariff_rows(url, max_retries=3):
    """Fetch one URL (with retries) and return its parsed tariff rows.

    Makes an initial attempt plus up to ``max_retries`` retries, sleeping a
    random 1-3 s before each retry.  Returns [] if every attempt fails.
    """
    for attempt in range(1 + max_retries):
        if attempt:
            # Back off briefly before retrying, as the original code did.
            time.sleep(randint(1, 3))
        try:
            response = requests.get(url, timeout=10)
            # raise_for_status() throws on any non-2xx response, so no
            # separate status_code check is needed afterwards.
            response.raise_for_status()
            return _parse_tariff_rows(response.content)
        except requests.exceptions.RequestException as e:
            # Bind the *current* attempt's exception so the message is
            # accurate (the old retry loop printed a stale outer exception).
            print(f"Error fetching data from {url}: {e}")
            if attempt < max_retries:
                print("Retrying...")
    return []


def _parse_tariff_rows(html):
    """Parse HTML and return a list of {'category', 'rate'} dicts.

    Rows need at least two <td> cells; the second must parse as a float
    (thousands separators are stripped), otherwise the row is skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    parsed = []
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue
        try:
            parsed.append({
                "category": cells[0].text.strip(),
                "rate": float(cells[1].text.strip().replace(",", "")),
            })
        except ValueError:
            # Non-numeric rate cell (e.g. a header or footnote row): skip.
            continue
    return parsed