Spaces:
Sleeping
Sleeping
| # app.py | |
| import gradio as gr | |
| from bs4 import BeautifulSoup | |
| from selenium import webdriver | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| # Import Firefox specific classes | |
| from selenium.webdriver.firefox.service import Service as FirefoxService | |
| from selenium.webdriver.firefox.options import Options as FirefoxOptions | |
| # from selenium.webdriver.chrome.service import Service as ChromeService # No longer needed | |
| # from selenium.webdriver.chrome.options import Options as ChromeOptions # No longer needed | |
| from geopy.geocoders import Nominatim, ArcGIS | |
| from geopy.exc import GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError | |
| import time | |
| import pandas as pd | |
| import re | |
| import os | |
| import shutil # For finding geckodriver | |
def driversetup_huggingface():
    """Create a headless Firefox WebDriver for a Hugging Face Space.

    Geckodriver is located with ``shutil.which`` first, then in two common
    install locations; as a last resort ``FirefoxService()`` is constructed
    without an explicit path.

    Returns:
        A ``webdriver.Firefox`` instance, or ``None`` when the service or the
        browser could not be started (the reason is printed to the log).
    """
    # Local import: only needed for the diagnostic version probe below.
    import subprocess

    options = FirefoxOptions()
    options.add_argument("--headless")
    # Chrome's --window-size / --disable-gpu switches are not Firefox browser
    # options (Firefox only honours --window-size together with --screenshot).
    # The initial window size is set with --width / --height instead.
    options.add_argument("--width=1920")
    options.add_argument("--height=1080")
    # User agent and other settings
    options.set_preference("intl.accept_languages", "en-US, en")
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0",
    )

    geckodriver_path = shutil.which("geckodriver")
    service = None
    if geckodriver_path:
        print(f"Using geckodriver found at: {geckodriver_path}")
        service = FirefoxService(executable_path=geckodriver_path)
    else:
        print("Geckodriver not found in PATH by shutil.which.")
        for path in ("/usr/bin/geckodriver", "/usr/local/bin/geckodriver"):
            if os.path.exists(path):
                print(f"Found geckodriver at common path: {path}")
                service = FirefoxService(executable_path=path)
                break

    if service is None:
        print("Geckodriver not found in common paths. Attempting to initialize FirefoxService without explicit path...")
        print("Ensure 'firefox-esr' and 'geckodriver' are in packages.txt for HF Spaces.")
        try:
            # This will likely fail if geckodriver isn't installed and in PATH
            service = FirefoxService()
        except Exception as e_service:
            print(f"Could not initialize FirefoxService without explicit path: {e_service}")
            return None

    try:
        print("Setting up GeckoDriver (Firefox) for Hugging Face environment...")
        driver = webdriver.Firefox(service=service, options=options)
    except Exception as e_webdriver:
        print(f"Error setting up GeckoDriver (Firefox): {e_webdriver}")
        if service.path:
            # Best-effort diagnostic: run the driver binary directly to see
            # whether it executes at all. An argument list (no shell) avoids
            # quoting problems and the temporary file the old probe wrote.
            try:
                probe = subprocess.run(
                    [service.path, "--version"],
                    capture_output=True,
                    text=True,
                    timeout=10,
                    check=False,
                )
                print(f"Geckodriver version check output: {probe.stdout}{probe.stderr}")
            except (OSError, subprocess.SubprocessError) as e_ver:
                print(f"Could not execute geckodriver version check: {e_ver}")
        return None
    else:
        print("GeckoDriver (Firefox) setup successful.")

    # No navigator.webdriver override here: that script trick targets Chrome.
    return driver
def clean_address(address_str):
    """Normalise a raw address string before it is sent to a geocoder.

    Whitespace runs are collapsed, any ``floor-...`` fragment is removed,
    comma-separated segments are trimmed and empty ones dropped, and
    ", India" is appended when the text mentions Mumbai or Maharashtra but
    not India.

    Returns:
        The cleaned address, or ``""`` when *address_str* is not a string.
    """
    if not isinstance(address_str, str):
        return ""

    # Collapse every run of whitespace (including newlines) to one space.
    collapsed = ' '.join(address_str.split())
    # Drop "Floor- ..." fragments up to (and including) the next comma.
    without_floor = re.sub(r'floor-\s*[\w\s]+,?', '', collapsed, flags=re.IGNORECASE)
    tidied = without_floor.replace(' ,', ',').replace(',,', ',')

    segments = [piece.strip() for piece in tidied.split(',')]
    result = ', '.join(piece for piece in segments if piece)

    lowered = result.lower()
    mentions_known_place = "mumbai" in lowered or "maharashtra" in lowered
    if mentions_known_place and "india" not in lowered:
        result += ", India"
    return result
def geocode_address_with_fallbacks(address_str, attempt_count=0):
    """Geocode an address, trying Nominatim then ArcGIS, with one fallback.

    The address is first normalised with ``clean_address``. If neither
    geocoder returns a location on the first attempt and the address has more
    than three comma-separated parts, the function retries once with only the
    trailing parts of the address.

    Args:
        address_str: Raw address text. Empty, blank or non-string values are
            rejected without any network call.
        attempt_count: 0 for the initial call; the internal retry passes 1 so
            that the fallback happens only once.

    Returns:
        ``(latitude, longitude)`` on success, otherwise ``(None, None)``.
    """
    # Non-string input is treated like an empty address instead of raising
    # AttributeError on .strip(), mirroring clean_address's tolerance.
    if not isinstance(address_str, str) or not address_str.strip():
        print("Address string is empty, cannot geocode.")
        return None, None

    cleaned_address = clean_address(address_str)
    print(f"Attempting to geocode cleaned address: '{cleaned_address}' (Attempt {attempt_count + 1})")

    nominatim_user_agent = f"gstin_gradio_app_hf_{int(time.time())}"
    geocoders_to_try = [
        ("Nominatim", Nominatim(user_agent=nominatim_user_agent)),
        ("ArcGIS", ArcGIS(timeout=10)),
    ]
    for name, geolocator in geocoders_to_try:
        try:
            print(f"Trying geocoder: {name}...")
            location = geolocator.geocode(cleaned_address, timeout=15)
            if location:
                print(f"Success with {name}: Lat: {location.latitude}, Lon: {location.longitude}")
                return location.latitude, location.longitude
            print(f"{name} could not geocode the address.")
        except (GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError) as e:
            print(f"{name} geocoding error: {e}")
        except Exception as e:
            # Deliberate best-effort: any geocoder failure moves on to the next.
            print(f"An unexpected error occurred with {name}: {e}")
        time.sleep(1)  # Be respectful to APIs

    if attempt_count == 0:  # Try a more generic address only once
        parts = [s.strip() for s in cleaned_address.split(',') if s.strip()]
        if len(parts) > 3:
            # Keep at most the last four parts, but always drop at least one
            # leading part. With exactly four parts the old slice rebuilt the
            # same address and repeated the identical failed queries.
            keep = min(4, len(parts) - 1)
            generic_address = ', '.join(parts[-keep:])
            print(f"Trying a more generic address: '{generic_address}'")
            return geocode_address_with_fallbacks(generic_address, attempt_count + 1)

    print("All geocoding attempts failed for the address.")
    return None, None
def _gstin_error_frame(message):
    """Wrap *message* in the single-column ``Error`` DataFrame shown in the UI."""
    return pd.DataFrame({"Error": [message]})


def _parse_gstin_table_rows(rows):
    """Collect label -> value pairs from the result table's ``<tr>`` elements."""
    raw_data = {}
    for row in rows:
        header_element = row.find('th', class_=lambda x: x and 'eLVLDP' in x.split())
        value_element = row.find('td', class_=lambda x: x and 'jdgLDg' in x.split())
        if header_element and value_element:
            raw_data[header_element.get_text(strip=True)] = value_element.get_text(strip=True)
            continue
        # Fallback layout: a plain two-cell row, label first and value second.
        cells = row.find_all('td')
        if len(cells) == 2:
            key = cells[0].get_text(strip=True)
            if key:
                raw_data[key] = cells[1].get_text(strip=True)
    return raw_data


def get_gstin_details_for_gradio(gstin_number_input):
    """Scrape registration details for a GSTIN and geocode its address.

    The input is validated, the Masters India search page is driven with a
    headless Firefox session, the result table is parsed with BeautifulSoup,
    and the principal business address is geocoded.

    Args:
        gstin_number_input: The GSTIN typed by the user; it is converted with
            ``str()``, stripped and upper-cased before validation.

    Returns:
        On success, a two-column (``Field``, ``Value``) ``pandas.DataFrame``.
        On any failure, a single-column ``Error`` DataFrame with the reason.
        Exceptions raised while scraping are caught and reported that way.
    """
    gstin_number = str(gstin_number_input).strip().upper()
    # ASCII-only check: str.isalnum() also accepts non-Latin letters and
    # digits, which can never form a valid GSTIN and would start a browser
    # session for nothing.
    if not re.fullmatch(r"[0-9A-Z]{15}", gstin_number):
        return _gstin_error_frame("Invalid GSTIN format. Must be 15 alphanumeric characters.")

    print(f"Initiating scraper for GSTIN: {gstin_number}")
    driver = driversetup_huggingface()  # Now uses Firefox setup
    if driver is None:
        print("WebDriver (Firefox) not initialized for scraper.")
        return _gstin_error_frame("WebDriver (Firefox) initialization failed. Check server logs for GeckoDriver errors.")

    extracted_data = {"GSTIN Queried": gstin_number}
    wait_time = 35
    url = "https://www.mastersindia.co/gst-number-search-and-gstin-verification/"
    try:
        print(f"Navigating to URL: {url}")
        driver.get(url)
        time.sleep(1.5)  # Slightly longer pause for Firefox initial page load

        gstin_input_css_selector = 'input[placeholder="XXXAAAYYYYZ01Z5"]'
        print(f"Waiting for GSTIN input box: {gstin_input_css_selector}")
        gstin_input = WebDriverWait(driver, wait_time).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, gstin_input_css_selector))
        )
        print("GSTIN input box visible.")
        gstin_input.clear()
        gstin_input.send_keys(gstin_number)
        print(f"Entered GSTIN: {gstin_number}")
        time.sleep(0.5)

        search_button_css_selector = 'button[aria-label="Search"]'
        print(f"Waiting for Search button: {search_button_css_selector}")
        search_button = WebDriverWait(driver, wait_time).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, search_button_css_selector))
        )
        print("Search button clickable.")
        # Scroll to the button and click it through JavaScript rather than
        # with a native WebDriver click.
        driver.execute_script("arguments[0].scrollIntoView(true);", search_button)
        time.sleep(0.5)
        driver.execute_script("arguments[0].click();", search_button)
        print("Clicked Search button using JavaScript.")

        results_table_css_selector = "div.eaKoeQ table tbody tr"
        print(f"Waiting for results table rows: {results_table_css_selector}")
        WebDriverWait(driver, wait_time).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, results_table_css_selector))
        )
        print("Results table rows are present.")
        time.sleep(3)  # Extra pause after the rows appear, before reading the page source

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        table_container_div = soup.select_one("div.eaKoeQ")
        table = table_container_div.find('table') if table_container_div else None
        if not table:
            # Fall back to the first table anywhere on the page.
            table = soup.find('table')
        if not table:
            msg = "No data table found on the page after search."
            if "captcha" in page_source.lower():
                msg = "CAPTCHA detected during scraping."
            elif "No details found" in page_source or "Invalid GSTIN" in page_source:
                msg = f"No details found for GSTIN {gstin_number} or invalid GSTIN."
            print(msg)
            return _gstin_error_frame(msg)

        rows = table.find_all('tr')
        if not rows:
            print("Table found, but no rows (<tr>) parsed from it.")
            return _gstin_error_frame("Data table found but no rows could be parsed.")

        raw_data = _parse_gstin_table_rows(rows)
        if not raw_data:
            print("Could not parse any key-value data from the table rows.")
            return _gstin_error_frame("Failed to parse key-value data from table rows.")

        # Label as shown on the website -> label shown in the output table.
        fields_to_extract_map = {
            "Principal Place of Business": "Principal Business Address",
            "Additional Place of Business": "Additional Business Address(es)",
            "State Jurisdiction": "State Jurisdiction",
            "Centre Jurisdiction": "Centre Jurisdiction",
            "Date of Registration": "Registration Date",
            "Constitution of Business": "Business Constitution",
            "Taxpayer Type": "Taxpayer Type",
            "GSTIN Status": "GSTIN Status",
        }
        for web_key, display_key in fields_to_extract_map.items():
            extracted_data[display_key] = raw_data.get(web_key, "Not Found")

        address_to_geocode = extracted_data.get("Principal Business Address")
        if address_to_geocode not in [None, "Not Found", ""]:
            lat, lon = geocode_address_with_fallbacks(address_to_geocode)
            extracted_data["Address Latitude"] = lat if lat is not None else "N/A"
            extracted_data["Address Longitude"] = lon if lon is not None else "N/A"
        else:
            extracted_data["Address Latitude"] = "N/A"
            extracted_data["Address Longitude"] = "N/A"
            # Logged for both the "Not Found" and the empty-string case; the
            # previous truthiness guard silently skipped the empty one.
            print("Principal Place of Business not found or empty, skipping geocoding.")

        print(f"Successfully scraped data for {gstin_number}")
        return pd.DataFrame(list(extracted_data.items()), columns=["Field", "Value"])
    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of raising.
        print(f"An error occurred during scraping process for {gstin_number}: {e}")
        return _gstin_error_frame(f"Scraping process failed: {str(e)}")
    finally:
        # driver is always a live WebDriver here (None was handled above).
        try:
            driver.quit()
            print("Browser closed.")
        except Exception as e_quit:
            print(f"Error quitting driver: {e_quit}")
# --- Gradio Interface ---
# Input: a single-line textbox for the GSTIN.
_gstin_textbox = gr.Textbox(
    label="Enter GSTIN",
    placeholder="Enter 15-character GSTIN (e.g., 27AAFCD5562R1Z5)",
    max_lines=1,
    info="The scraper will fetch details for the provided GSTIN from Masters India."
)

# Output: a two-column table of field names and their scraped values.
_details_table = gr.DataFrame(
    label="GSTIN Details",
    headers=["Field", "Value"],
    wrap=True
)

# Wire the scraper function to the components defined above.
iface = gr.Interface(
    fn=get_gstin_details_for_gradio,
    inputs=_gstin_textbox,
    outputs=_details_table,
    title="🧾 GSTIN Details Scraper & Verifier (Firefox Edition)",
    description="Enter a valid 15-character Indian GSTIN to fetch its registration details and attempt to geocode the principal place of business. Uses Masters India for scraping (with Firefox/GeckoDriver).",
    article="<p style='text-align: center;'>Powered by Selenium, BeautifulSoup, Geopy, and Gradio. <br>Note: Scraping may take 20-45 seconds. Geocoding accuracy may vary.</p>",
    examples=[["27AAFCD5562R1Z5"], ["07AAFCM6072R1Z8"]],
    allow_flagging="never",
    theme=gr.themes.Soft()
)
if __name__ == '__main__':
    # Debug mode is switched off only when the SYSTEM environment variable
    # is "spaces"; any other run keeps debug on and creates no share link.
    running_on_spaces = os.environ.get("SYSTEM") == "spaces"
    launch_options = {"debug": False} if running_on_spaces else {"debug": True, "share": False}
    iface.launch(**launch_options)
| # webdriver-manager # Useful for local testing with Firefox too | |
| # # app.py | |
| # import gradio as gr | |
| # from bs4 import BeautifulSoup | |
| # from selenium import webdriver | |
| # from selenium.webdriver.common.by import By | |
| # from selenium.webdriver.support.ui import WebDriverWait | |
| # from selenium.webdriver.support import expected_conditions as EC | |
| # from selenium.webdriver.chrome.service import Service as ChromeService | |
| # from selenium.webdriver.chrome.options import Options as ChromeOptions | |
| # from geopy.geocoders import Nominatim, ArcGIS | |
| # from geopy.exc import GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError | |
| # import time | |
| # import pandas as pd | |
| # import re | |
| # import os | |
| # import shutil # For finding chromedriver | |
| # def driversetup_huggingface(): | |
| # """Custom driver setup for Hugging Face Spaces (headless).""" | |
| # options = ChromeOptions() | |
| # options.add_argument("--headless") | |
| # options.add_argument("--no-sandbox") | |
| # # options.add_argument("--disable-gpu") | |
| # # options.add_argument("--window-size=1920,1080") | |
| # options.add_argument("--disable-dev-shm-usage") | |
| # # options.add_argument("lang=en") | |
| # # options.add_argument("start-maximized") | |
| # # options.add_argument("disable-infobars") | |
| # # options.add_argument("--disable-extensions") | |
| # # options.add_argument("--disable-blink-features=AutomationControlled") | |
| # options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36") | |
| # # Attempt to find chromedriver - Hugging Face Spaces might have it in specific locations | |
| # # or it might need to be installed via packages.txt or a Dockerfile. | |
| # # For Gradio apps on Spaces, it's often pre-configured or easily installable. | |
| # # Let's try common paths or rely on it being in PATH. | |
| # # Check if chromedriver is in PATH or use a common location | |
| # chromedriver_path = shutil.which("chromedriver") | |
| # if chromedriver_path: | |
| # print(f"Using chromedriver found at: {chromedriver_path}") | |
| # service = ChromeService(executable_path=chromedriver_path) | |
| # else: | |
| # # Fallback if not in PATH - this might fail on HF if not installed correctly | |
| # print("Chromedriver not found in PATH. Attempting to use 'chromedriver' directly (might fail).") | |
| # print("For Hugging Face Spaces, ensure Chrome & Chromedriver are available in the environment.") | |
| # print("You might need to add 'chromium-chromedriver' to a packages.txt file if using a Docker Space.") | |
| # # As a last resort, try initializing without explicit path, hoping Selenium finds it. | |
| # # This part is crucial for HF deployment and might need adjustment based on the HF Space environment. | |
| # # For many Gradio spaces, simply having 'selenium' and 'chromedriver-binary' (or similar) | |
| # # in requirements.txt might work if the base image is well-configured. | |
| # # However, for full Chrome, system-level install is better. | |
| # # For now, we'll proceed assuming it might be found or will error out gracefully. | |
| # try: | |
| # # This assumes chromedriver is globally available or Selenium can find it. | |
| # # On Hugging Face, if using default Docker runtime, you might need to specify | |
| # # apt packages like 'chromium-driver' or 'google-chrome-stable' + 'chromedriver' | |
| # # in a packages.txt file or use a custom Dockerfile. | |
| # # For simplicity, let's assume it can be found or will fail here. | |
| # # A common path if installed via apt in a container: | |
| # if os.path.exists("/usr/bin/chromedriver"): | |
| # service = ChromeService(executable_path="/usr/bin/chromedriver") | |
| # elif os.path.exists("/usr/local/bin/chromedriver"): | |
| # service = ChromeService(executable_path="/usr/local/bin/chromedriver") | |
| # else: | |
| # # This will likely fail if chromedriver isn't installed and in PATH | |
| # # On HF Spaces, you typically ensure this via environment setup (e.g. packages.txt) | |
| # print("Attempting to initialize ChromeService without explicit path...") | |
| # service = ChromeService() # May fail if chromedriver not in PATH | |
| # except Exception as e: | |
| # print(f"Could not initialize ChromeService: {e}. Ensure chromedriver is installed and in PATH.") | |
| # return None | |
| # try: | |
| # print("Setting up ChromeDriver for Hugging Face environment...") | |
| # driver = webdriver.Chrome(service=service, options=options) | |
| # print("ChromeDriver setup successful.") | |
| # except Exception as e: | |
| # print(f"Error setting up ChromeDriver: {e}") | |
| # return None | |
| # driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});") | |
| # return driver | |
| # def clean_address(address_str): | |
| # if not isinstance(address_str, str): | |
| # return "" | |
| # cleaned_address = ' '.join(address_str.split()) | |
| # cleaned_address = re.sub(r'floor-\s*[\w\s]+,?', '', cleaned_address, flags=re.IGNORECASE) | |
| # cleaned_address = cleaned_address.replace(' ,', ',').replace(',,', ',') | |
| # cleaned_address = ', '.join(filter(None, (s.strip() for s in cleaned_address.split(',')))) | |
| # if "india" not in cleaned_address.lower() and ("mumbai" in cleaned_address.lower() or "maharashtra" in cleaned_address.lower()): | |
| # cleaned_address += ", India" | |
| # return cleaned_address | |
| # def geocode_address_with_fallbacks(address_str, attempt_count=0): | |
| # if not address_str or not address_str.strip(): | |
| # print("Address string is empty, cannot geocode.") | |
| # return None, None | |
| # cleaned_address = clean_address(address_str) | |
| # print(f"Attempting to geocode cleaned address: '{cleaned_address}' (Attempt {attempt_count + 1})") | |
| # nominatim_user_agent = f"gstin_gradio_app_hf_{int(time.time())}" | |
| # geocoders_to_try = [ | |
| # ("Nominatim", Nominatim(user_agent=nominatim_user_agent)), | |
| # ("ArcGIS", ArcGIS(timeout=10)) | |
| # ] | |
| # for name, geolocator in geocoders_to_try: | |
| # try: | |
| # print(f"Trying geocoder: {name}...") | |
| # location = geolocator.geocode(cleaned_address, timeout=15) | |
| # if location: | |
| # print(f"Success with {name}: Lat: {location.latitude}, Lon: {location.longitude}") | |
| # return location.latitude, location.longitude | |
| # else: | |
| # print(f"{name} could not geocode the address.") | |
| # except (GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError) as e: | |
| # print(f"{name} geocoding error: {e}") | |
| # except Exception as e: | |
| # print(f"An unexpected error occurred with {name}: {e}") | |
| # time.sleep(1) | |
| # if attempt_count == 0: | |
| # parts = [s.strip() for s in cleaned_address.split(',') if s.strip()] | |
| # if len(parts) > 3: | |
| # generic_address = ', '.join(parts[1:]) | |
| # print(f"Trying a more generic address (v1): '{generic_address}'") | |
| # lat, lon = geocode_address_with_fallbacks(generic_address, attempt_count + 1) | |
| # if lat is not None: return lat, lon | |
| # if len(parts) > 4: | |
| # generic_address_v2 = ', '.join(parts[2:]) | |
| # print(f"Trying a more generic address (v2): '{generic_address_v2}'") | |
| # return geocode_address_with_fallbacks(generic_address_v2, attempt_count + 1) | |
| # print("All geocoding attempts failed for the address.") | |
| # return None, None | |
| # def get_gstin_details_for_gradio(gstin_number_input): | |
| # """ | |
| # Main function for Gradio: takes GSTIN, scrapes, and returns data as DataFrame. | |
| # """ | |
| # gstin_number = str(gstin_number_input).strip().upper() | |
| # if not (len(gstin_number) == 15 and gstin_number.isalnum()): | |
| # return pd.DataFrame({"Error": ["Invalid GSTIN format. Must be 15 alphanumeric characters."]}) | |
| # print(f"Initiating scraper for GSTIN: {gstin_number}") | |
| # driver = driversetup_huggingface() | |
| # if driver is None: | |
| # print("WebDriver not initialized for scraper.") | |
| # return pd.DataFrame({"Error": ["WebDriver initialization failed. Check server logs."]}) | |
| # extracted_data = {"GSTIN Queried": gstin_number} | |
| # wait_time = 30 | |
| # url = "https://www.mastersindia.co/gst-number-search-and-gstin-verification/" | |
| # try: | |
| # driver.get(url) | |
| # print(f"Navigated to URL: {url}") | |
| # gstin_input_css_selector = 'input[placeholder="XXXAAAYYYYZ01Z5"]' | |
| # WebDriverWait(driver, wait_time).until( | |
| # EC.presence_of_element_located((By.CSS_SELECTOR, gstin_input_css_selector)) | |
| # ) | |
| # gstin_input = driver.find_element(By.CSS_SELECTOR, gstin_input_css_selector) | |
| # gstin_input.clear() | |
| # gstin_input.send_keys(gstin_number) | |
| # print(f"Entered GSTIN: {gstin_number}") | |
| # search_button_css_selector = 'button[aria-label="Search"]' | |
| # WebDriverWait(driver, wait_time).until( | |
| # EC.element_to_be_clickable((By.CSS_SELECTOR, search_button_css_selector)) | |
| # ) | |
| # search_button = driver.find_element(By.CSS_SELECTOR, search_button_css_selector) | |
| # driver.execute_script("arguments[0].click();", search_button) | |
| # print("Clicked Search button.") | |
| # results_table_container_css_selector_for_wait = "div.eaKoeQ table" | |
| # WebDriverWait(driver, wait_time).until( | |
| # EC.presence_of_element_located((By.CSS_SELECTOR, results_table_container_css_selector_for_wait)) | |
| # ) | |
| # print("Results table container found.") | |
| # time.sleep(4) | |
| # page_source = driver.page_source | |
| # soup = BeautifulSoup(page_source, 'html.parser') | |
| # table_container_div = soup.select_one("div.eaKoeQ") | |
| # table = None | |
| # if table_container_div: table = table_container_div.find('table') | |
| # if not table: table = soup.find('table') | |
| # if not table: | |
| # msg = "No data table found on the page after search." | |
| # if "captcha" in page_source.lower(): msg = "CAPTCHA detected during scraping." | |
| # elif "No details found" in page_source or "Invalid GSTIN" in page_source: | |
| # msg = f"No details found for GSTIN {gstin_number} or invalid GSTIN." | |
| # print(msg) | |
| # return pd.DataFrame({"Error": [msg]}) | |
| # rows = table.find_all('tr') | |
| # raw_data = {} | |
| # for row in rows: | |
| # header_element = row.find('th', class_=lambda x: x and 'eLVLDP' in x.split()) | |
| # value_element = row.find('td', class_=lambda x: x and 'jdgLDg' in x.split()) | |
| # if header_element and value_element: | |
| # raw_data[header_element.get_text(strip=True)] = value_element.get_text(strip=True) | |
| # elif len(row.find_all('td')) == 2: | |
| # cells = row.find_all('td') | |
| # if cells[0].get_text(strip=True): | |
| # raw_data[cells[0].get_text(strip=True)] = cells[1].get_text(strip=True) | |
| # if not raw_data: | |
| # print("Could not parse any data from the table rows.") | |
| # return pd.DataFrame({"Error": ["Failed to parse data from table."]}) | |
| # fields_to_extract_map = { | |
| # "Principal Place of Business": "Principal Business Address", | |
| # "Additional Place of Business": "Additional Business Address(es)", | |
| # "State Jurisdiction": "State Jurisdiction", | |
| # "Centre Jurisdiction": "Centre Jurisdiction", | |
| # "Date of Registration": "Registration Date", | |
| # "Constitution of Business": "Business Constitution", | |
| # "Taxpayer Type": "Taxpayer Type", | |
| # "GSTIN Status": "GSTIN Status" | |
| # } | |
| # for web_key, display_key in fields_to_extract_map.items(): | |
| # extracted_data[display_key] = raw_data.get(web_key, "Not Found") | |
| # address_to_geocode = extracted_data.get("Principal Business Address") | |
| # if address_to_geocode not in [None, "Not Found", ""]: | |
| # lat, lon = geocode_address_with_fallbacks(address_to_geocode) | |
| # extracted_data["Address Latitude"] = lat if lat is not None else "N/A" | |
| # extracted_data["Address Longitude"] = lon if lon is not None else "N/A" | |
| # else: | |
| # extracted_data["Address Latitude"] = "N/A" | |
| # extracted_data["Address Longitude"] = "N/A" | |
| # if extracted_data.get("Principal Business Address"): | |
| # print("Principal Place of Business not found or empty, skipping geocoding.") | |
| # print(f"Successfully scraped data for {gstin_number}") | |
| # # Convert dictionary to a 2-column DataFrame for Gradio | |
| # df_output = pd.DataFrame(list(extracted_data.items()), columns=["Field", "Value"]) | |
| # return df_output | |
| # except Exception as e: | |
| # print(f"An error occurred during scraping process for {gstin_number}: {e}") | |
| # # import traceback | |
| # # traceback.print_exc() | |
| # return pd.DataFrame({"Error": [f"Scraping process failed: {str(e)}"]}) | |
| # finally: | |
| # if 'driver' in locals() and driver is not None: | |
| # try: | |
| # driver.quit() | |
| # print("Browser closed.") | |
| # except Exception as e: | |
| # print(f"Error quitting driver: {e}") | |
| # # --- Gradio Interface --- | |
| # iface = gr.Interface( | |
| # fn=get_gstin_details_for_gradio, | |
| # inputs=gr.Textbox( | |
| # label="Enter GSTIN", | |
| # placeholder="Enter 15-character GSTIN (e.g., 27AAFCD5562R1Z5)", | |
| # max_lines=1, | |
| # info="The scraper will fetch details for the provided GSTIN from Masters India." | |
| # ), | |
| # outputs=gr.DataFrame( | |
| # label="GSTIN Details", | |
| # headers=["Field", "Value"], | |
| # wrap=True | |
| # ), | |
| # title="🧾 GSTIN Details Scraper & Verifier", | |
| # description="Enter a valid 15-character Indian GSTIN to fetch its registration details and attempt to geocode the principal place of business. Uses Masters India for scraping.", | |
| # article="<p style='text-align: center;'>Powered by Selenium, BeautifulSoup, Geopy, and Gradio. <br>Note: Scraping may take 20-40 seconds. Geocoding accuracy may vary.</p>", | |
| # examples=[["27AAFCD5562R1Z5"], ["07AAFCM6072R1Z8"]], # Example GSTINs | |
| # allow_flagging="never", | |
| # theme=gr.themes.Soft() # Using a soft theme | |
| # ) | |
| # if __name__ == '__main__': | |
| # # For Hugging Face Spaces, Gradio typically handles the server. | |
| # # This launch(share=True) is more for local testing if you want a public link temporarily. | |
| # # On HF Spaces, just `iface.launch()` is enough. | |
| # # To run locally: python app.py | |
| # if os.environ.get("SYSTEM") == "spaces": # Check if running in Hugging Face Spaces | |
| # iface.launch(debug=False) | |
| # else: | |
| # iface.launch(debug=True, share=True) |