Spaces:

CoderHassan
/

EnergyGuru_test_app

Sleeping

App Files Files Community

CoderHassan committed on Jan 5, 2025

Commit

71520e8

verified ·

1 Parent(s): d57640e

Update scraper.py

Browse files

Files changed (1) hide show

scraper.py +31 -62

scraper.py CHANGED Viewed

@@ -1,69 +1,38 @@
 import requests
 from bs4 import BeautifulSoup
 def fetch_tariff_data(url):
-    headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
-    }
-    response = requests.get(url, headers=headers)
-    if response.status_code != 200:
-        print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
         return None
-    print(f"Successfully fetched data from {url}. First 500 characters of HTML:")
-    print(response.text[:500])  # Print the first 500 characters of HTML to inspect
-    soup = BeautifulSoup(response.text, 'html.parser')
-    print("Parsed HTML soup object:")
-    print(soup.prettify()[:500])  # Print the first 500 characters of the prettified soup for inspection
-    tariff_data = parse_tariff_data(soup)
-    return tariff_data
-def parse_tariff_data(soup):
-    # Try to find the table that contains tariff data
-    table = soup.find('table')
-    if not table:
-        print("No table found on this page.")
-        return []
-    print("Table found! Now parsing rows...")
-    tariff_data = []
-    rows = table.find_all('tr')[1:]  # Skipping the header row
-    if not rows:
-        print("No data rows found in the table.")
-        return []
-    # Loop through each row and parse the tariff details
-    for row in rows:
-        columns = row.find_all('td')
-        if len(columns) >= 2:  # Ensure there are at least two columns (slab and rate)
-            slab = columns[0].text.strip()  # Slab details (e.g., 0-50 kWh)
-            try:
-                rate = float(columns[1].text.strip().replace(',', '').replace('Rs.', ''))
-            except ValueError:
-                print(f"Skipping row with invalid rate: {columns[1].text.strip()}")
-                continue  # Skip rows that don’t have a valid rate
-            # Handle the case where slab is not a valid numeric range
-            slab_range = slab.split('-')
-            if len(slab_range) == 2:
-                try:
-                    lower_limit = float(slab_range[0])
-                    upper_limit = float(slab_range[1])
-                    tariff_data.append({'slab': slab, 'rate': rate, 'lower_limit': lower_limit, 'upper_limit': upper_limit})
-                except ValueError:
-                    print(f"Skipping invalid slab range: {slab}")
-                    continue  # Skip slabs that are not valid numeric ranges
-            elif len(slab_range) == 1:
-                try:
-                    lower_limit = float(slab_range[0])
-                    tariff_data.append({'slab': slab, 'rate': rate, 'lower_limit': lower_limit, 'upper_limit': float('inf')})
-                except ValueError:
-                    print(f"Skipping invalid slab range: {slab}")
-                    continue  # Skip slabs that are not valid numeric values
-    return tariff_data

+# scraper.py
 import requests
 from bs4 import BeautifulSoup
 def fetch_tariff_data(url):
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+        tariff_data = []
+        table = soup.find('table')  # Assuming the first table on the page is the tariff table
+        rows = table.find_all('tr')[1:]  # Skipping the header row
+        for row in rows:
+            cols = row.find_all('td')
+            if len(cols) >= 2:
+                lower_limit = parse_float(cols[0].text.strip().split()[0])
+                upper_limit = parse_float(cols[0].text.strip().split()[2] if 'to' in cols[0].text else 'inf')
+                rate = parse_float(cols[-1].text.strip())
+                tariff_data.append({
+                    'lower_limit': lower_limit,
+                    'upper_limit': upper_limit,
+                    'rate': rate
+                })
+        return tariff_data
+    except Exception as e:
+        print(f"Error fetching tariff data: {e}")
         return None
def parse_float(value):
    """Convert scraped numeric text to a float.

    Thousands separators (``,``) and a ``Rs.`` currency marker are removed
    before conversion, so ``"Rs. 1,234.50"`` becomes ``1234.5``. Surrounding
    whitespace is ignored.

    Args:
        value: Text to convert. Non-string values are converted with
            ``str()`` first.

    Returns:
        The parsed float, or ``float('inf')`` when the text is not a number.
    """
    cleaned = str(value).replace(',', '').replace('Rs.', '')
    try:
        return float(cleaned)
    except ValueError:
        # Unparseable text is reported as infinity rather than raising.
        return float('inf')