CoderHassan committed on
Commit
71520e8
·
verified ·
1 Parent(s): d57640e

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +31 -62
scraper.py CHANGED
@@ -1,69 +1,38 @@
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
 
4
def fetch_tariff_data(url):
    """Download *url* with a browser User-Agent and hand the page to the parser.

    Returns whatever parse_tariff_data extracts from the page, or None when
    the HTTP status is anything other than 200.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    page = requests.get(url, headers=browser_headers)

    # Anything but a plain 200 is treated as a failed fetch.
    if page.status_code != 200:
        print(f"Failed to retrieve data from {url}. Status code: {page.status_code}")
        return None

    print(f"Successfully fetched data from {url}. First 500 characters of HTML:")
    print(page.text[:500])  # Print the first 500 characters of HTML to inspect

    soup = BeautifulSoup(page.text, 'html.parser')
    print("Parsed HTML soup object:")
    print(soup.prettify()[:500])  # Print the first 500 characters of the prettified soup for inspection

    return parse_tariff_data(soup)
23
-
24
def parse_tariff_data(soup):
    """Extract slab/rate entries from the first <table> in *soup*.

    Each entry carries the raw slab text, the numeric rate, and numeric
    lower/upper limits (upper_limit is inf for open-ended slabs).  Rows
    with unparseable rates or slab ranges are skipped with a message.
    """
    table = soup.find('table')
    if not table:
        print("No table found on this page.")
        return []

    print("Table found! Now parsing rows...")
    parsed = []
    body_rows = table.find_all('tr')[1:]  # Skipping the header row
    if not body_rows:
        print("No data rows found in the table.")
        return []

    for tr in body_rows:
        cells = tr.find_all('td')
        # Need at least two columns: slab text and a rate.
        if len(cells) < 2:
            continue

        slab = cells[0].text.strip()  # Slab details (e.g., 0-50 kWh)
        raw_rate = cells[1].text.strip()
        try:
            rate = float(raw_rate.replace(',', '').replace('Rs.', ''))
        except ValueError:
            print(f"Skipping row with invalid rate: {raw_rate}")
            continue  # Skip rows that don't have a valid rate

        # A slab is either "low-high" or a single open-ended lower bound.
        bounds = slab.split('-')
        if len(bounds) == 2:
            try:
                entry = {'slab': slab, 'rate': rate,
                         'lower_limit': float(bounds[0]),
                         'upper_limit': float(bounds[1])}
            except ValueError:
                print(f"Skipping invalid slab range: {slab}")
                continue  # Skip slabs that are not valid numeric ranges
            parsed.append(entry)
        elif len(bounds) == 1:
            try:
                entry = {'slab': slab, 'rate': rate,
                         'lower_limit': float(bounds[0]),
                         'upper_limit': float('inf')}
            except ValueError:
                print(f"Skipping invalid slab range: {slab}")
                continue  # Skip slabs that are not valid numeric values
            parsed.append(entry)

    return parsed
 
1
+ # scraper.py
2
  import requests
3
  from bs4 import BeautifulSoup
4
 
5
def fetch_tariff_data(url):
    """Fetch and parse electricity tariff slabs from *url*.

    Returns a list of dicts with 'lower_limit', 'upper_limit' and 'rate'
    keys, or None when the page cannot be fetched or parsed.
    """
    try:
        # Timeout prevents the scraper from hanging forever on a dead host.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        table = soup.find('table')  # Assuming the first table on the page is the tariff table
        if table is None:
            # Explicit message instead of an opaque AttributeError below.
            print(f"Error fetching tariff data: no table found at {url}")
            return None

        tariff_data = []
        rows = table.find_all('tr')[1:]  # Skipping the header row

        for row in rows:
            cols = row.find_all('td')
            if len(cols) < 2:
                continue

            slab_text = cols[0].text.strip()
            # Support both "X to Y" and "X-Y" slab formats; a bare number is
            # treated as an open-ended lower bound.
            if 'to' in slab_text:
                parts = slab_text.split()
                lower_limit = parse_float(parts[0]) if parts else float('inf')
                upper_limit = parse_float(parts[2]) if len(parts) > 2 else float('inf')
            elif '-' in slab_text:
                low, _, high = slab_text.partition('-')
                lower_limit = parse_float(low)
                upper_limit = parse_float(high)
            else:
                parts = slab_text.split()
                lower_limit = parse_float(parts[0]) if parts else float('inf')
                upper_limit = float('inf')

            rate = parse_float(cols[-1].text.strip())

            tariff_data.append({
                'lower_limit': lower_limit,
                'upper_limit': upper_limit,
                'rate': rate
            })

        return tariff_data
    except Exception as e:
        # Top-level boundary: report any network/parse failure and signal
        # the caller with None rather than crashing the scraper.
        print(f"Error fetching tariff data: {e}")
        return None
 
 
 
32
 
33
def parse_float(value):
    """Convert a scraped numeric string to a float.

    Strips thousands separators and the 'Rs.' currency prefix (matching
    the formatting the tariff tables use) before conversion.  Returns
    float('inf') when the text is not numeric, so malformed cells map to
    an open-ended bound instead of crashing the scraper.
    """
    try:
        return float(value.replace(',', '').replace('Rs.', '').strip())
    except ValueError:
        return float('inf')
38