Spaces:
Sleeping
Sleeping
Update scraper.py
Browse files- scraper.py +10 -9
scraper.py
CHANGED
|
@@ -2,21 +2,23 @@ import requests
|
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
|
| 4 |
def fetch_tariff_data(url):
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
|
| 8 |
if response.status_code != 200:
|
| 9 |
print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
|
| 10 |
return None
|
| 11 |
|
| 12 |
-
# Print the first 500 characters of the response content for debugging
|
| 13 |
print(f"Successfully fetched data from {url}. First 500 characters of HTML:")
|
| 14 |
print(response.text[:500]) # Print the first 500 characters of HTML to inspect
|
| 15 |
-
|
| 16 |
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
|
|
|
|
|
|
| 17 |
tariff_data = parse_tariff_data(soup)
|
| 18 |
-
|
| 19 |
-
# Return the parsed tariff data
|
| 20 |
return tariff_data
|
| 21 |
|
| 22 |
def parse_tariff_data(soup):
|
|
@@ -27,10 +29,10 @@ def parse_tariff_data(soup):
|
|
| 27 |
print("No table found on this page.")
|
| 28 |
return []
|
| 29 |
|
|
|
|
| 30 |
tariff_data = []
|
| 31 |
-
|
| 32 |
-
# Check if the table has rows and parse accordingly
|
| 33 |
rows = table.find_all('tr')[1:] # Skipping the header row
|
|
|
|
| 34 |
if not rows:
|
| 35 |
print("No data rows found in the table.")
|
| 36 |
return []
|
|
@@ -42,7 +44,6 @@ def parse_tariff_data(soup):
|
|
| 42 |
if len(columns) >= 2: # Ensure there are at least two columns (slab and rate)
|
| 43 |
slab = columns[0].text.strip() # Slab details (e.g., 0-50 kWh)
|
| 44 |
try:
|
| 45 |
-
# Attempt to clean and convert the rate to a float
|
| 46 |
rate = float(columns[1].text.strip().replace(',', '').replace('Rs.', ''))
|
| 47 |
except ValueError:
|
| 48 |
print(f"Skipping row with invalid rate: {columns[1].text.strip()}")
|
|
|
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
|
| 4 |
def fetch_tariff_data(url):
|
| 5 |
+
headers = {
|
| 6 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
|
| 7 |
+
}
|
| 8 |
+
response = requests.get(url, headers=headers)
|
| 9 |
|
| 10 |
if response.status_code != 200:
|
| 11 |
print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
|
| 12 |
return None
|
| 13 |
|
|
|
|
| 14 |
print(f"Successfully fetched data from {url}. First 500 characters of HTML:")
|
| 15 |
print(response.text[:500]) # Print the first 500 characters of HTML to inspect
|
| 16 |
+
|
| 17 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 18 |
+
print("Parsed HTML soup object:")
|
| 19 |
+
print(soup.prettify()[:500]) # Print the first 500 characters of the prettified soup for inspection
|
| 20 |
+
|
| 21 |
tariff_data = parse_tariff_data(soup)
|
|
|
|
|
|
|
| 22 |
return tariff_data
|
| 23 |
|
| 24 |
def parse_tariff_data(soup):
|
|
|
|
| 29 |
print("No table found on this page.")
|
| 30 |
return []
|
| 31 |
|
| 32 |
+
print("Table found! Now parsing rows...")
|
| 33 |
tariff_data = []
|
|
|
|
|
|
|
| 34 |
rows = table.find_all('tr')[1:] # Skipping the header row
|
| 35 |
+
|
| 36 |
if not rows:
|
| 37 |
print("No data rows found in the table.")
|
| 38 |
return []
|
|
|
|
| 44 |
if len(columns) >= 2: # Ensure there are at least two columns (slab and rate)
|
| 45 |
slab = columns[0].text.strip() # Slab details (e.g., 0-50 kWh)
|
| 46 |
try:
|
|
|
|
| 47 |
rate = float(columns[1].text.strip().replace(',', '').replace('Rs.', ''))
|
| 48 |
except ValueError:
|
| 49 |
print(f"Skipping row with invalid rate: {columns[1].text.strip()}")
|