import re

import requests
from bs4 import BeautifulSoup
import gradio as gr


def fetch_webpage(url: str) -> str:
    """Fetch raw HTML from *url*.

    Returns the response body on success. On any request failure a sentinel
    string starting with ``ERROR:`` is returned (not raised) so the Gradio
    pipeline can surface the failure directly in the output textbox.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        # Narrowed from bare `except Exception`: RequestException covers
        # connection errors, timeouts, invalid URLs, and the HTTPError
        # raised by raise_for_status().
        return f"ERROR: Unable to fetch page -> {e}"


def extract_company_signals_from_html(html: str):
    """Extract company-related signals from raw HTML with rule-based patterns.

    Returns a ``(truncated_text, structured)`` tuple where *structured* is a
    dict of emails, phones, candidate addresses, and social-profile links.
    If *html* is an ``ERROR:`` sentinel from :func:`fetch_webpage`, the
    sentinel is passed through unchanged with an empty dict.
    """
    if html.startswith("ERROR:"):
        return html, {}

    soup = BeautifulSoup(html, "html.parser")

    # Extract visible text and collapse all whitespace onto one line.
    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)

    # --- Example Extractors ---
    emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    phones = re.findall(r"\+?\d[\d\-\(\) ]{6,}\d", text)

    # Detect possible addresses (very simple pattern).
    # BUGFIX: the original suffix group was *capturing*, so re.findall
    # returned only "Street"/"Rd"/... instead of the full address; it also
    # ended with a greedy `.*` that swallowed the entire remainder of the
    # newline-free text. Non-capturing group, non-greedy middle, no `.*`.
    address_pattern = (
        r"\d{1,5}\s+[A-Za-z0-9.\- ]+?\s+"
        r"(?:Street|St|Rd|Road|Ave|Avenue|Blvd|Lane|Ln|Way|Dr)\b"
    )
    addresses = re.findall(address_pattern, text, flags=re.IGNORECASE)

    # Social media profile links.
    links = [a.get("href", "") for a in soup.find_all("a")]
    social = {
        "linkedin": [link for link in links if "linkedin.com" in link],
        "facebook": [link for link in links if "facebook.com" in link],
        "twitter": [link for link in links if "twitter.com" in link],
        "instagram": [link for link in links if "instagram.com" in link],
    }

    # Company name guess (based on the <title> tag).
    title_tag = (
        soup.title.string.strip() if soup.title and soup.title.string else "Unknown"
    )

    structured = {
        "company_name_guess": title_tag,
        # sorted() instead of list(set(...)) so output order is deterministic
        # across runs (set iteration order is not).
        "emails_found": sorted(set(emails)),
        "phones_found": sorted(set(phones)),
        "possible_addresses": sorted(set(addresses)),
        "social_profiles": social,
    }

    return text[:2000] + "...", structured  # limit raw text output


def run_extraction(url: str):
    """Pipeline: URL -> HTML -> extracted signals."""
    html = fetch_webpage(url)
    return extract_company_signals_from_html(html)


with gr.Blocks() as demo:
    gr.Markdown(
        "# Web Company Data Extractor\n"
        "Enter a company website URL. The app fetches the page and extracts "
        "useful unstructured company data signals (emails, phones, social "
        "links, etc.).\n"
        "_This is a simple rule-based demo, not a full production parser._"
    )

    with gr.Row():
        url_in = gr.Textbox(
            label="Company Website URL",
            placeholder="https://www.example.com",
        )

    run_btn = gr.Button("Extract Company Data")

    raw_text_out = gr.Textbox(label="Extracted Raw Text (truncated)", lines=8)
    structured_out = gr.JSON(label="Structured Company Data Signals")

    # IMPORTANT: use run_extraction (URL -> HTML -> signals)
    run_btn.click(
        run_extraction,
        inputs=[url_in],
        outputs=[raw_text_out, structured_out],
    )


if __name__ == "__main__":
    demo.launch(share=True)