rameshmoorthy committed on
Commit
f217168
·
verified ·
1 Parent(s): 66ffdbf

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +287 -0
app.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import gradio as gr
3
+ from bs4 import BeautifulSoup
4
+ from selenium import webdriver
5
+ from selenium.webdriver.common.by import By
6
+ from selenium.webdriver.support.ui import WebDriverWait
7
+ from selenium.webdriver.support import expected_conditions as EC
8
+ from selenium.webdriver.chrome.service import Service as ChromeService
9
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
10
+ from geopy.geocoders import Nominatim, ArcGIS
11
+ from geopy.exc import GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError
12
+ import time
13
+ import pandas as pd
14
+ import re
15
+ import os
16
+ import shutil # For finding chromedriver
17
+
18
def driversetup_huggingface():
    """Create a headless Chrome WebDriver for a Hugging Face Spaces container.

    The chromedriver binary is looked up on PATH first, then at two common
    install locations; if neither exists, Selenium is left to locate the
    driver on its own.

    Returns:
        A ``webdriver.Chrome`` instance, or ``None`` when the service or the
        browser session could not be set up. Failures are printed rather
        than raised, so callers only need to check for ``None``.
    """
    options = ChromeOptions()
    for argument in (
        "--headless",
        "--no-sandbox",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-dev-shm-usage",
        "lang=en",
        "start-maximized",
        "disable-infobars",
        "--disable-extensions",
        "--disable-blink-features=AutomationControlled",
        "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    ):
        options.add_argument(argument)

    # Prefer a chromedriver that is discoverable on PATH.
    chromedriver_path = shutil.which("chromedriver")
    if chromedriver_path:
        print(f"Using chromedriver found at: {chromedriver_path}")
        service = ChromeService(executable_path=chromedriver_path)
    else:
        print("Chromedriver not found in PATH. Attempting to use 'chromedriver' directly (might fail).")
        print("For Hugging Face Spaces, ensure Chrome & Chromedriver are available in the environment.")
        print("You might need to add 'chromium-chromedriver' to a packages.txt file if using a Docker Space.")
        try:
            # Common locations when the driver was installed via apt in a container.
            fallback_path = next(
                (
                    candidate
                    for candidate in ("/usr/bin/chromedriver", "/usr/local/bin/chromedriver")
                    if os.path.exists(candidate)
                ),
                None,
            )
            if fallback_path:
                service = ChromeService(executable_path=fallback_path)
            else:
                # Last resort: let Selenium try to find the driver itself.
                print("Attempting to initialize ChromeService without explicit path...")
                service = ChromeService()  # May fail if chromedriver not in PATH
        except Exception as e:
            print(f"Could not initialize ChromeService: {e}. Ensure chromedriver is installed and in PATH.")
            return None

    try:
        print("Setting up ChromeDriver for Hugging Face environment...")
        driver = webdriver.Chrome(service=service, options=options)
        print("ChromeDriver setup successful.")
    except Exception as e:
        print(f"Error setting up ChromeDriver: {e}")
        return None

    try:
        # Redefine navigator.webdriver so page scripts read `undefined`
        # (presumably to make the automated session less detectable).
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")
    except Exception as e:
        # The browser is already running at this point: shut it down instead
        # of leaking the process, and honour the "None on failure" contract.
        print(f"Error configuring ChromeDriver session: {e}")
        try:
            driver.quit()
        except Exception as quit_error:
            print(f"Error quitting driver: {quit_error}")
        return None

    return driver
85
+
86
def clean_address(address_str):
    """Normalise a scraped address string before it is sent to a geocoder.

    Whitespace runs are collapsed, ``floor-...`` fragments are removed,
    empty comma-separated segments are dropped, and ", India" is appended
    when the text mentions Mumbai or Maharashtra but not India.

    Returns an empty string for any non-string input.
    """
    if not isinstance(address_str, str):
        return ""

    # Collapse every whitespace run (including newlines) into single spaces.
    collapsed = ' '.join(address_str.split())

    # Strip "floor- <words>" fragments, together with a trailing comma if any.
    without_floor = re.sub(r'floor-\s*[\w\s]+,?', '', collapsed, flags=re.IGNORECASE)

    tidied = without_floor.replace(' ,', ',')
    tidied = tidied.replace(',,', ',')

    # Rebuild the address from its non-empty, trimmed segments.
    segments = [segment.strip() for segment in tidied.split(',')]
    result = ', '.join(segment for segment in segments if segment)

    lowered = result.lower()
    mentions_region = "mumbai" in lowered or "maharashtra" in lowered
    if mentions_region and "india" not in lowered:
        result += ", India"
    return result
96
+
97
def geocode_address_with_fallbacks(address_str, attempt_count=0):
    """Geocode an address, trying several services and shorter address forms.

    The address is cleaned with ``clean_address`` and passed to Nominatim
    and then ArcGIS. If both fail on the first attempt, the function retries
    with the leading one (and then two) comma-separated parts removed.

    Args:
        address_str: Raw address text. Non-string or blank values are
            rejected without contacting any service.
        attempt_count: 0 for the initial call; recursive retries pass 1,
            which disables further shortening.

    Returns:
        ``(latitude, longitude)`` on success, otherwise ``(None, None)``.
    """
    if not isinstance(address_str, str) or not address_str.strip():
        print("Address string is empty, cannot geocode.")
        return None, None

    cleaned_address = clean_address(address_str)
    if not cleaned_address:
        # Cleaning can remove everything (e.g. only a "floor-" fragment);
        # do not send an empty query to the remote services.
        print("Address string is empty after cleaning, cannot geocode.")
        return None, None

    print(f"Attempting to geocode cleaned address: '{cleaned_address}' (Attempt {attempt_count + 1})")
    nominatim_user_agent = f"gstin_gradio_app_hf_{int(time.time())}"
    geocoders_to_try = [
        ("Nominatim", Nominatim(user_agent=nominatim_user_agent)),
        ("ArcGIS", ArcGIS(timeout=10)),
    ]
    for name, geolocator in geocoders_to_try:
        try:
            print(f"Trying geocoder: {name}...")
            location = geolocator.geocode(cleaned_address, timeout=15)
            if location:
                print(f"Success with {name}: Lat: {location.latitude}, Lon: {location.longitude}")
                return location.latitude, location.longitude
            print(f"{name} could not geocode the address.")
        except (GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError) as e:
            print(f"{name} geocoding error: {e}")
        except Exception as e:
            # Best effort: any other failure just moves on to the next service.
            print(f"An unexpected error occurred with {name}: {e}")
        # Short pause before contacting the next service.
        time.sleep(1)

    if attempt_count == 0:
        parts = [s.strip() for s in cleaned_address.split(',') if s.strip()]
        if len(parts) > 3:
            # Drop the first (most specific) component and retry.
            generic_address = ', '.join(parts[1:])
            print(f"Trying a more generic address (v1): '{generic_address}'")
            lat, lon = geocode_address_with_fallbacks(generic_address, attempt_count + 1)
            if lat is not None:
                return lat, lon
        if len(parts) > 4:
            # Drop the first two components and retry.
            generic_address_v2 = ', '.join(parts[2:])
            print(f"Trying a more generic address (v2): '{generic_address_v2}'")
            return geocode_address_with_fallbacks(generic_address_v2, attempt_count + 1)

    print("All geocoding attempts failed for the address.")
    return None, None
135
+
136
def get_gstin_details_for_gradio(gstin_number_input):
    """Scrape registration details for a GSTIN and return them as a DataFrame.

    The GSTIN is validated, entered into the Masters India search page via a
    headless browser, and the resulting table is parsed. The principal
    business address, when present, is geocoded.

    Args:
        gstin_number_input: The GSTIN as entered by the user; it is converted
            to ``str``, stripped and upper-cased before validation.

    Returns:
        On success, a two-column DataFrame (``Field``, ``Value``). On any
        failure, a single-column DataFrame (``Error``) with one message.
        No exception is raised to the caller.
    """
    gstin_number = str(gstin_number_input).strip().upper()
    if not (len(gstin_number) == 15 and gstin_number.isalnum()):
        return pd.DataFrame({"Error": ["Invalid GSTIN format. Must be 15 alphanumeric characters."]})

    # Function-scope import: TimeoutException is not among the names
    # imported at the top of this file.
    from selenium.common.exceptions import TimeoutException

    print(f"Initiating scraper for GSTIN: {gstin_number}")
    try:
        driver = driversetup_huggingface()
    except Exception as e:
        # Report setup failures through the same error path as a None driver.
        print(f"WebDriver setup raised an error: {e}")
        driver = None

    if driver is None:
        print("WebDriver not initialized for scraper.")
        return pd.DataFrame({"Error": ["WebDriver initialization failed. Check server logs."]})

    extracted_data = {"GSTIN Queried": gstin_number}
    wait_time = 30
    url = "https://www.mastersindia.co/gst-number-search-and-gstin-verification/"

    try:
        driver.get(url)
        print(f"Navigated to URL: {url}")

        gstin_input_css_selector = 'input[placeholder="XXXAAAYYYYZ01Z5"]'
        gstin_input = WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, gstin_input_css_selector))
        )
        gstin_input.clear()
        gstin_input.send_keys(gstin_number)
        print(f"Entered GSTIN: {gstin_number}")

        search_button_css_selector = 'button[aria-label="Search"]'
        search_button = WebDriverWait(driver, wait_time).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, search_button_css_selector))
        )
        # Click through JavaScript rather than a native click.
        driver.execute_script("arguments[0].click();", search_button)
        print("Clicked Search button.")

        results_table_container_css_selector_for_wait = "div.eaKoeQ table"
        try:
            WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, results_table_container_css_selector_for_wait))
            )
            print("Results table container found.")
            # Extra pause before reading the page source.
            time.sleep(4)
        except TimeoutException:
            # Do not abort here: the page may show a CAPTCHA or a
            # "no details" notice, which the checks below turn into a
            # specific error message.
            print("Results table did not appear within the wait time; inspecting page content.")

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        table_container_div = soup.select_one("div.eaKoeQ")
        table = table_container_div.find('table') if table_container_div else None
        if not table:
            # Fall back to the first table anywhere on the page.
            table = soup.find('table')

        if not table:
            msg = "No data table found on the page after search."
            if "captcha" in page_source.lower():
                msg = "CAPTCHA detected during scraping."
            elif "No details found" in page_source or "Invalid GSTIN" in page_source:
                msg = f"No details found for GSTIN {gstin_number} or invalid GSTIN."
            print(msg)
            return pd.DataFrame({"Error": [msg]})

        raw_data = {}
        for row in table.find_all('tr'):
            header_element = row.find('th', class_=lambda x: x and 'eLVLDP' in x.split())
            value_element = row.find('td', class_=lambda x: x and 'jdgLDg' in x.split())
            if header_element and value_element:
                raw_data[header_element.get_text(strip=True)] = value_element.get_text(strip=True)
                continue
            # Fallback layout: a plain two-cell row read as key / value.
            cells = row.find_all('td')
            if len(cells) == 2:
                key_text = cells[0].get_text(strip=True)
                if key_text:
                    raw_data[key_text] = cells[1].get_text(strip=True)

        if not raw_data:
            print("Could not parse any data from the table rows.")
            return pd.DataFrame({"Error": ["Failed to parse data from table."]})

        # Label on the web page -> label shown in the output table.
        fields_to_extract_map = {
            "Principal Place of Business": "Principal Business Address",
            "Additional Place of Business": "Additional Business Address(es)",
            "State Jurisdiction": "State Jurisdiction",
            "Centre Jurisdiction": "Centre Jurisdiction",
            "Date of Registration": "Registration Date",
            "Constitution of Business": "Business Constitution",
            "Taxpayer Type": "Taxpayer Type",
            "GSTIN Status": "GSTIN Status",
        }
        for web_key, display_key in fields_to_extract_map.items():
            extracted_data[display_key] = raw_data.get(web_key, "Not Found")

        address_to_geocode = extracted_data.get("Principal Business Address")
        if address_to_geocode not in [None, "Not Found", ""]:
            lat, lon = geocode_address_with_fallbacks(address_to_geocode)
            extracted_data["Address Latitude"] = lat if lat is not None else "N/A"
            extracted_data["Address Longitude"] = lon if lon is not None else "N/A"
        else:
            extracted_data["Address Latitude"] = "N/A"
            extracted_data["Address Longitude"] = "N/A"
            # Log for every skipped case, including an empty address string.
            print("Principal Place of Business not found or empty, skipping geocoding.")

        print(f"Successfully scraped data for {gstin_number}")
        # Convert dictionary to a 2-column DataFrame for Gradio
        return pd.DataFrame(list(extracted_data.items()), columns=["Field", "Value"])

    except Exception as e:
        print(f"An error occurred during scraping process for {gstin_number}: {e}")
        return pd.DataFrame({"Error": [f"Scraping process failed: {str(e)}"]})
    finally:
        # The driver is always set and non-None once this try block is entered.
        try:
            driver.quit()
            print("Browser closed.")
        except Exception as e:
            print(f"Error quitting driver: {e}")
256
+
257
+ # --- Gradio Interface ---
258
# Single-input, single-output Gradio UI: a GSTIN text box feeding the scraper,
# with the result rendered as a two-column table.
iface = gr.Interface(
    fn=get_gstin_details_for_gradio,
    inputs=gr.Textbox(
        max_lines=1,
        label="Enter GSTIN",
        info="The scraper will fetch details for the provided GSTIN from Masters India.",
        placeholder="Enter 15-character GSTIN (e.g., 27AAFCD5562R1Z5)",
    ),
    outputs=gr.DataFrame(
        wrap=True,
        headers=["Field", "Value"],
        label="GSTIN Details",
    ),
    # Sample GSTINs offered as clickable examples in the UI.
    examples=[["27AAFCD5562R1Z5"], ["07AAFCM6072R1Z8"]],
    allow_flagging="never",
    theme=gr.themes.Soft(),
    title="🧾 GSTIN Details Scraper & Verifier",
    description=(
        "Enter a valid 15-character Indian GSTIN to fetch its registration details "
        "and attempt to geocode the principal place of business. "
        "Uses Masters India for scraping."
    ),
    article=(
        "<p style='text-align: center;'>"
        "Powered by Selenium, BeautifulSoup, Geopy, and Gradio. "
        "<br>Note: Scraping may take 20-40 seconds. Geocoding accuracy may vary."
        "</p>"
    ),
)
278
+
279
if __name__ == '__main__':
    # Default (e.g. `python app.py` locally): debug output plus a temporary
    # public share link.
    launch_options = {"debug": True, "share": True}
    # SYSTEM=spaces is the marker this script checks for Hugging Face Spaces,
    # where the app is launched without debug and without requesting a share link.
    if os.environ.get("SYSTEM") == "spaces":
        launch_options = {"debug": False}
    iface.launch(**launch_options)