Spaces:
Build error
Build error
| import os | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
| import time | |
| from random import randint | |
def scrape_tariffs(urls):
    """Scrape tariff categories and rates from HTML tables at the given URLs.

    For each URL: fetch the page (retrying up to 3 extra times on request
    failure), extract (category, rate) pairs from ``<td>`` cells of every
    ``<tr>``, and finally write the combined records to ``data/tariffs.csv``.

    Args:
        urls: Iterable of page URLs whose HTML contains ``<tr>``/``<td>``
            tariff tables. Rate cells may contain thousands separators
            (commas), which are stripped before conversion to float.

    Side effects:
        Creates the ``data`` directory if missing, writes
        ``data/tariffs.csv`` (when any rows were scraped), and sleeps
        between requests to avoid hammering the servers.
    """
    data = []
    # Ensure the 'data' directory exists before saving the CSV.
    os.makedirs("data", exist_ok=True)

    for url in urls:
        response = _fetch_with_retries(url)
        if response is not None:
            data.extend(_parse_tariff_rows(response.content))
        # Sleep between requests to avoid hitting the servers too quickly.
        time.sleep(randint(2, 5))

    if data:
        df = pd.DataFrame(data)
        # Save the scraped data to the 'data' directory.
        df.to_csv("data/tariffs.csv", index=False)
        print("Tariff data saved successfully.")
    else:
        print("No tariff data found.")


def _fetch_with_retries(url, max_retries=3):
    """Fetch *url* once, retrying up to *max_retries* more times on failure.

    A random 1-3 second delay is slept before each retry. Returns the
    successful ``requests.Response``, or ``None`` if every attempt failed.
    """
    # One initial attempt plus max_retries retries (4 total), matching the
    # original control flow of "try once, then retry up to 3 times".
    for attempt in range(max_retries + 1):
        try:
            response = requests.get(url, timeout=10)
            # Raise RequestException (HTTPError) for 4xx/5xx responses, so a
            # separate status_code check is unnecessary.
            response.raise_for_status()
            return response
        # BUG FIX: the original retry handler omitted "as e" and printed a
        # stale exception captured by the outer handler; bind the current one.
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data from {url}: {e}")
            if attempt < max_retries:
                print("Retrying...")
                time.sleep(randint(1, 3))  # random back-off before retrying
    return None


def _parse_tariff_rows(html):
    """Extract tariff records from *html*.

    Returns a list of ``{"category": str, "rate": float}`` dicts, one per
    ``<tr>`` that has at least two ``<td>`` cells. Rows whose rate cell is
    not numeric (after stripping commas) are silently skipped, preserving
    the original ValueError handling.
    """
    soup = BeautifulSoup(html, "html.parser")
    records = []
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) >= 2:
            try:
                records.append({
                    "category": cells[0].text.strip(),
                    "rate": float(cells[1].text.strip().replace(",", "")),
                })
            except ValueError:
                # Non-numeric rate cell (e.g. a header-like row); skip it.
                continue
    return records