Spaces:

CoderHassan
/

EnergyGuru_test_app

Sleeping

App Files Files Community

CoderHassan committed on Jan 4, 2025

Commit

866dcf1

verified ·

1 Parent(s): 9943ae4

Update scraper.py

Browse files

Files changed (1) hide show

scraper.py +10 -9

scraper.py CHANGED Viewed

@@ -2,21 +2,23 @@ import requests
 from bs4 import BeautifulSoup
 def fetch_tariff_data(url):
-    # Send a GET request to fetch the raw HTML content
-    response = requests.get(url)
     if response.status_code != 200:
         print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
         return None
-    # Print the first 500 characters of the response content for debugging
     print(f"Successfully fetched data from {url}. First 500 characters of HTML:")
     print(response.text[:500])  # Print the first 500 characters of HTML to inspect
     soup = BeautifulSoup(response.text, 'html.parser')
     tariff_data = parse_tariff_data(soup)
-    # Return the parsed tariff data
     return tariff_data
 def parse_tariff_data(soup):
@@ -27,10 +29,10 @@ def parse_tariff_data(soup):
         print("No table found on this page.")
         return []
     tariff_data = []
-    # Check if the table has rows and parse accordingly
     rows = table.find_all('tr')[1:]  # Skipping the header row
     if not rows:
         print("No data rows found in the table.")
         return []
@@ -42,7 +44,6 @@ def parse_tariff_data(soup):
         if len(columns) >= 2:  # Ensure there are at least two columns (slab and rate)
             slab = columns[0].text.strip()  # Slab details (e.g., 0-50 kWh)
             try:
-                # Attempt to clean and convert the rate to a float
                 rate = float(columns[1].text.strip().replace(',', '').replace('Rs.', ''))
             except ValueError:
                 print(f"Skipping row with invalid rate: {columns[1].text.strip()}")

 from bs4 import BeautifulSoup
 def fetch_tariff_data(url):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
+    }
+    response = requests.get(url, headers=headers)
     if response.status_code != 200:
         print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
         return None
     print(f"Successfully fetched data from {url}. First 500 characters of HTML:")
     print(response.text[:500])  # Print the first 500 characters of HTML to inspect
     soup = BeautifulSoup(response.text, 'html.parser')
+    print("Parsed HTML soup object:")
+    print(soup.prettify()[:500])  # Print the first 500 characters of the prettified soup for inspection
     tariff_data = parse_tariff_data(soup)
     return tariff_data
 def parse_tariff_data(soup):
         print("No table found on this page.")
         return []
+    print("Table found! Now parsing rows...")
     tariff_data = []
     rows = table.find_all('tr')[1:]  # Skipping the header row
     if not rows:
         print("No data rows found in the table.")
         return []
         if len(columns) >= 2:  # Ensure there are at least two columns (slab and rate)
             slab = columns[0].text.strip()  # Slab details (e.g., 0-50 kWh)
             try:
                 rate = float(columns[1].text.strip().replace(',', '').replace('Rs.', ''))
             except ValueError:
                 print(f"Skipping row with invalid rate: {columns[1].text.strip()}")