CoderHassan committed on
Commit
9943ae4
·
verified ·
1 Parent(s): f255e92

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +31 -4
scraper.py CHANGED
@@ -1,26 +1,51 @@
1
- # scraper.py
2
  import requests
3
  from bs4 import BeautifulSoup
4
 
5
  def fetch_tariff_data(url):
 
6
  response = requests.get(url)
 
 
 
 
 
 
 
 
 
7
  soup = BeautifulSoup(response.text, 'html.parser')
8
  tariff_data = parse_tariff_data(soup)
 
 
9
  return tariff_data
10
 
11
  def parse_tariff_data(soup):
12
- # Find the table that contains tariff data
13
  table = soup.find('table')
 
 
 
 
 
14
  tariff_data = []
15
 
16
- for row in table.find_all('tr')[1:]: # Skipping the header row
 
 
 
 
 
 
 
17
  columns = row.find_all('td')
 
18
  if len(columns) >= 2: # Ensure there are at least two columns (slab and rate)
19
  slab = columns[0].text.strip() # Slab details (e.g., 0-50 kWh)
20
  try:
21
- # Attempt to clean and convert rate to a float
22
  rate = float(columns[1].text.strip().replace(',', '').replace('Rs.', ''))
23
  except ValueError:
 
24
  continue # Skip rows that don’t have a valid rate
25
 
26
  # Handle the case where slab is not a valid numeric range
@@ -31,11 +56,13 @@ def parse_tariff_data(soup):
31
  upper_limit = float(slab_range[1])
32
  tariff_data.append({'slab': slab, 'rate': rate, 'lower_limit': lower_limit, 'upper_limit': upper_limit})
33
  except ValueError:
 
34
  continue # Skip slabs that are not valid numeric ranges
35
  elif len(slab_range) == 1:
36
  try:
37
  lower_limit = float(slab_range[0])
38
  tariff_data.append({'slab': slab, 'rate': rate, 'lower_limit': lower_limit, 'upper_limit': float('inf')})
39
  except ValueError:
 
40
  continue # Skip slabs that are not valid numeric values
41
  return tariff_data
 
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
 
4
  def fetch_tariff_data(url):
5
+ # Send a GET request to fetch the raw HTML content
6
  response = requests.get(url)
7
+
8
+ if response.status_code != 200:
9
+ print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
10
+ return None
11
+
12
+ # Print the first 500 characters of the response content for debugging
13
+ print(f"Successfully fetched data from {url}. First 500 characters of HTML:")
14
+ print(response.text[:500]) # Print the first 500 characters of HTML to inspect
15
+
16
  soup = BeautifulSoup(response.text, 'html.parser')
17
  tariff_data = parse_tariff_data(soup)
18
+
19
+ # Return the parsed tariff data
20
  return tariff_data
21
 
22
  def parse_tariff_data(soup):
23
+ # Try to find the table that contains tariff data
24
  table = soup.find('table')
25
+
26
+ if not table:
27
+ print("No table found on this page.")
28
+ return []
29
+
30
  tariff_data = []
31
 
32
+ # Check if the table has rows and parse accordingly
33
+ rows = table.find_all('tr')[1:] # Skipping the header row
34
+ if not rows:
35
+ print("No data rows found in the table.")
36
+ return []
37
+
38
+ # Loop through each row and parse the tariff details
39
+ for row in rows:
40
  columns = row.find_all('td')
41
+
42
  if len(columns) >= 2: # Ensure there are at least two columns (slab and rate)
43
  slab = columns[0].text.strip() # Slab details (e.g., 0-50 kWh)
44
  try:
45
+ # Attempt to clean and convert the rate to a float
46
  rate = float(columns[1].text.strip().replace(',', '').replace('Rs.', ''))
47
  except ValueError:
48
+ print(f"Skipping row with invalid rate: {columns[1].text.strip()}")
49
  continue # Skip rows that don’t have a valid rate
50
 
51
  # Handle the case where slab is not a valid numeric range
 
56
  upper_limit = float(slab_range[1])
57
  tariff_data.append({'slab': slab, 'rate': rate, 'lower_limit': lower_limit, 'upper_limit': upper_limit})
58
  except ValueError:
59
+ print(f"Skipping invalid slab range: {slab}")
60
  continue # Skip slabs that are not valid numeric ranges
61
  elif len(slab_range) == 1:
62
  try:
63
  lower_limit = float(slab_range[0])
64
  tariff_data.append({'slab': slab, 'rate': rate, 'lower_limit': lower_limit, 'upper_limit': float('inf')})
65
  except ValueError:
66
+ print(f"Skipping invalid slab range: {slab}")
67
  continue # Skip slabs that are not valid numeric values
68
  return tariff_data