import re

import requests
from bs4 import BeautifulSoup
import gradio as gr


def fetch_webpage(url: str) -> str:
    """Fetch raw HTML from a webpage.

    Args:
        url: Fully-qualified URL to request.

    Returns:
        The response body as text on success, or a string beginning with
        ``"ERROR:"`` describing the failure. Errors are reported in-band
        because downstream code checks for the ``"ERROR:"`` prefix rather
        than catching exceptions.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        # RequestException covers connection errors, timeouts, invalid URLs
        # and the HTTPError raised by raise_for_status().
        return f"ERROR: Unable to fetch page -> {e}"


def extract_company_signals_from_html(html: str):
    """Extract meaningful company-related fields using simple rule-based patterns.

    Args:
        html: Raw HTML, or an in-band ``"ERROR:"`` string from fetch_webpage.

    NOTE(review): this function is truncated in the reviewed chunk; the
    visible portion gathers emails, phones, addresses and social links.
    """
    # Propagate an upstream fetch failure without attempting to parse it.
    if html.startswith("ERROR:"):
        return html, {}

    soup = BeautifulSoup(html, "html.parser")

    # Extract visible text and collapse all runs of whitespace so the
    # regex extractors below see a single normalized string.
    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)

    # --- Example Extractors ---
    emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    # Loose phone match: leading '+' optional, 8+ chars of digits/dashes/
    # parens/spaces, bounded by digits at both ends.
    phones = re.findall(r"\+?\d[\d\-\(\) ]{6,}\d", text)

    # Detect possible addresses (very simple pattern): a street number
    # followed by words ending in a common street-type suffix.
    address_pattern = r"\d{1,5}\s+[A-Za-z0-9\.\- ]+\s+(Street|St|Rd|Road|Ave|Avenue|Blvd|Lane|Ln|Way|Dr)\b.*"
    addresses = re.findall(address_pattern, text, flags=re.IGNORECASE)

    # Social media profile links, bucketed by platform domain substring.
    links = [a.get("href", "") for a in soup.find_all("a")]
    social = {
        "linkedin": [l for l in links if "linkedin.com" in l],
        "facebook": [l for l in links if "facebook.com" in l],
        "twitter": [l for l in links if "twitter.com" in l],
        "instagram": [l for l in links if "instagram.com" in l],
    }

    # Company name guess (based on