CoderHassan commited on
Commit
866dcf1
·
verified ·
1 Parent(s): 9943ae4

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +10 -9
scraper.py CHANGED
@@ -2,21 +2,23 @@ import requests
2
  from bs4 import BeautifulSoup
3
 
4
  def fetch_tariff_data(url):
5
- # Send a GET request to fetch the raw HTML content
6
- response = requests.get(url)
 
 
7
 
8
  if response.status_code != 200:
9
  print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
10
  return None
11
 
12
- # Print the first 500 characters of the response content for debugging
13
  print(f"Successfully fetched data from {url}. First 500 characters of HTML:")
14
  print(response.text[:500]) # Print the first 500 characters of HTML to inspect
15
-
16
  soup = BeautifulSoup(response.text, 'html.parser')
 
 
 
17
  tariff_data = parse_tariff_data(soup)
18
-
19
- # Return the parsed tariff data
20
  return tariff_data
21
 
22
  def parse_tariff_data(soup):
@@ -27,10 +29,10 @@ def parse_tariff_data(soup):
27
  print("No table found on this page.")
28
  return []
29
 
 
30
  tariff_data = []
31
-
32
- # Check if the table has rows and parse accordingly
33
  rows = table.find_all('tr')[1:] # Skipping the header row
 
34
  if not rows:
35
  print("No data rows found in the table.")
36
  return []
@@ -42,7 +44,6 @@ def parse_tariff_data(soup):
42
  if len(columns) >= 2: # Ensure there are at least two columns (slab and rate)
43
  slab = columns[0].text.strip() # Slab details (e.g., 0-50 kWh)
44
  try:
45
- # Attempt to clean and convert the rate to a float
46
  rate = float(columns[1].text.strip().replace(',', '').replace('Rs.', ''))
47
  except ValueError:
48
  print(f"Skipping row with invalid rate: {columns[1].text.strip()}")
 
2
  from bs4 import BeautifulSoup
3
 
4
def fetch_tariff_data(url):
    """Fetch a tariff page over HTTP and return its parsed tariff data.

    Sends a GET request with a browser-like User-Agent (some servers
    reject the default python-requests agent), prints short debugging
    snippets of the response, and delegates HTML parsing to
    ``parse_tariff_data``.

    Args:
        url: The page URL to scrape.

    Returns:
        Whatever ``parse_tariff_data`` returns for the fetched page
        (a list of tariff rows), or ``None`` when the request fails or
        does not return HTTP 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    try:
        # timeout keeps the scraper from hanging forever on an
        # unresponsive host; network errors follow the same
        # "print and return None" convention as the non-200 branch.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data from {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
        return None

    print(f"Successfully fetched data from {url}. First 500 characters of HTML:")
    print(response.text[:500])  # Print the first 500 characters of HTML to inspect

    soup = BeautifulSoup(response.text, 'html.parser')
    print("Parsed HTML soup object:")
    print(soup.prettify()[:500])  # Print the first 500 characters of the prettified soup for inspection

    tariff_data = parse_tariff_data(soup)
    return tariff_data
23
 
24
  def parse_tariff_data(soup):
 
29
  print("No table found on this page.")
30
  return []
31
 
32
+ print("Table found! Now parsing rows...")
33
  tariff_data = []
 
 
34
  rows = table.find_all('tr')[1:] # Skipping the header row
35
+
36
  if not rows:
37
  print("No data rows found in the table.")
38
  return []
 
44
  if len(columns) >= 2: # Ensure there are at least two columns (slab and rate)
45
  slab = columns[0].text.strip() # Slab details (e.g., 0-50 kWh)
46
  try:
 
47
  rate = float(columns[1].text.strip().replace(',', '').replace('Rs.', ''))
48
  except ValueError:
49
  print(f"Skipping row with invalid rate: {columns[1].text.strip()}")