Spaces:
Build error
Build error
Update tariff_scraper.py
Browse files- tariff_scraper.py +19 -38
tariff_scraper.py
CHANGED
|
@@ -4,12 +4,12 @@ import pandas as pd
|
|
| 4 |
|
| 5 |
# Define the URL for PESCO tariff rates
|
| 6 |
TARIFF_URLS = {
|
| 7 |
-
"
|
| 8 |
}
|
| 9 |
|
| 10 |
-
def
|
| 11 |
"""
|
| 12 |
-
Scrape tariff data from
|
| 13 |
|
| 14 |
Args:
|
| 15 |
url (str): The URL of the tariff page to scrape.
|
|
@@ -26,40 +26,21 @@ def scrape_multiple_sections_to_csv(url, output_file="pesco_tariff_data.csv"):
|
|
| 26 |
# Parse the webpage content using BeautifulSoup
|
| 27 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
|
| 31 |
-
if not
|
| 32 |
-
return "No
|
| 33 |
|
| 34 |
-
#
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
table_rows = table.find_all('tr')
|
| 41 |
-
for row in table_rows:
|
| 42 |
-
cols = [col.get_text(strip=True) for col in row.find_all(['th', 'td'])]
|
| 43 |
-
data.append(cols)
|
| 44 |
-
|
| 45 |
-
# Create a dataframe for the current table
|
| 46 |
-
if len(data) > 1:
|
| 47 |
-
df = pd.DataFrame(data[1:], columns=data[0])
|
| 48 |
-
else:
|
| 49 |
-
df = pd.DataFrame(data)
|
| 50 |
-
|
| 51 |
-
# Deduplicate column names if necessary
|
| 52 |
-
df.columns = pd.io.parsers._deduplicate(df.columns)
|
| 53 |
-
|
| 54 |
-
# Add a section identifier
|
| 55 |
-
df["Section"] = f"Section {i}"
|
| 56 |
-
all_data.append(df)
|
| 57 |
-
|
| 58 |
-
# Combine all dataframes into one
|
| 59 |
-
combined_data = pd.concat(all_data, ignore_index=True)
|
| 60 |
-
|
| 61 |
-
# Save the combined data to a CSV file
|
| 62 |
-
combined_data.to_csv(output_file, index=False)
|
| 63 |
|
| 64 |
return output_file
|
| 65 |
except requests.exceptions.RequestException as e:
|
|
@@ -71,9 +52,9 @@ def scrape_multiple_sections_to_csv(url, output_file="pesco_tariff_data.csv"):
|
|
| 71 |
|
| 72 |
if __name__ == "__main__":
|
| 73 |
# Test the scraper and save data to a CSV file
|
| 74 |
-
url = TARIFF_URLS["
|
| 75 |
-
output_file = "
|
| 76 |
-
result =
|
| 77 |
if result.endswith(".csv"):
|
| 78 |
print(f"Data successfully saved to {output_file}")
|
| 79 |
else:
|
|
|
|
| 4 |
|
| 5 |
# Define the URL for PESCO tariff rates
|
| 6 |
# Map of distribution-company (DISCO) name -> URL of its public tariff page.
# Only IESCO is configured here; the scraper below pulls the first HTML
# <table> from this page. NOTE(review): file/default names still say
# "pesco_" elsewhere — presumably a leftover from an earlier PESCO target;
# verify which DISCO is actually intended.
TARIFF_URLS = {
    "IESCO": "https://iesco.com.pk/index.php/customer-services/tariff-guide"
}
|
| 9 |
|
| 10 |
+
def scrape_tariff_data_to_csv(url, output_file="pesco_tariff_data.csv"):
|
| 11 |
"""
|
| 12 |
+
Scrape tariff data from the given URL and save it to a CSV file.
|
| 13 |
|
| 14 |
Args:
|
| 15 |
url (str): The URL of the tariff page to scrape.
|
|
|
|
| 26 |
# Parse the webpage content using BeautifulSoup
|
| 27 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 28 |
|
| 29 |
+
# Extract table rows
|
| 30 |
+
tariff_table = soup.find('table')
|
| 31 |
+
if not tariff_table:
|
| 32 |
+
return "No table found on the webpage."
|
| 33 |
|
| 34 |
+
# Extract data and convert it into a structured format
|
| 35 |
+
data = []
|
| 36 |
+
table_rows = tariff_table.find_all('tr')
|
| 37 |
+
for row in table_rows:
|
| 38 |
+
cols = [col.get_text(strip=True) for col in row.find_all(['th', 'td'])]
|
| 39 |
+
data.append(cols)
|
| 40 |
|
| 41 |
+
# Save the data to a CSV file
|
| 42 |
+
df = pd.DataFrame(data[1:], columns=data[0]) # Use the first row as headers
|
| 43 |
+
df.to_csv(output_file, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
return output_file
|
| 46 |
except requests.exceptions.RequestException as e:
|
|
|
|
| 52 |
|
| 53 |
if __name__ == "__main__":
|
| 54 |
# Test the scraper and save data to a CSV file
|
| 55 |
+
url = TARIFF_URLS["IESCO"]
|
| 56 |
+
output_file = "iesco_tariff_data.csv"
|
| 57 |
+
result = scrape_tariff_data_to_csv(url, output_file)
|
| 58 |
if result.endswith(".csv"):
|
| 59 |
print(f"Data successfully saved to {output_file}")
|
| 60 |
else:
|