Spaces:
Sleeping
Sleeping
| import re | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import gradio as gr | |
def fetch_webpage(url: str) -> str:
    """Fetch the raw HTML body of a webpage.

    Args:
        url: Absolute URL to retrieve.

    Returns:
        The response body as text on success, or a string beginning with
        "ERROR:" on failure (callers test for this prefix instead of
        catching exceptions).
    """
    try:
        # 10s timeout so a hung server can't stall the UI indefinitely.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        # Narrowed from bare `Exception`: RequestException covers connection
        # errors, timeouts, and the HTTPError raised by raise_for_status(),
        # without swallowing unrelated programming errors.
        return f"ERROR: Unable to fetch page -> {e}"
def extract_company_signals_from_html(html: str):
    """Extract company-related fields from HTML with simple rule-based patterns.

    Args:
        html: Raw HTML, or an "ERROR:"-prefixed string from fetch_webpage.

    Returns:
        A (truncated_text, structured) tuple: the first 2000 chars of visible
        page text, and a dict of extracted signals (emails, phones, addresses,
        social links, title-based company-name guess). On an "ERROR:" input,
        returns (error_string, {}).
    """
    # fetch_webpage signals failure via this string prefix; pass it through.
    if html.startswith("ERROR:"):
        return html, {}
    soup = BeautifulSoup(html, "html.parser")
    # Extract visible text and collapse runs of whitespace to single spaces.
    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)
    # --- Example Extractors ---
    emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    phones = re.findall(r"\+?\d[\d\-\(\) ]{6,}\d", text)
    # Detect possible addresses (very simple pattern).
    # BUG FIX: the original used a capturing group (Street|St|...), so
    # re.findall returned only the street-type word (e.g. "St"), never the
    # address itself. A non-capturing group (?:...) makes findall return the
    # full match. NOTE(review): the trailing greedy ".*" still swallows the
    # rest of the line of text after the first address — kept as-is to
    # preserve the original matching scope.
    address_pattern = r"\d{1,5}\s+[A-Za-z0-9\.\- ]+\s+(?:Street|St|Rd|Road|Ave|Avenue|Blvd|Lane|Ln|Way|Dr)\b.*"
    addresses = re.findall(address_pattern, text, flags=re.IGNORECASE)
    # Social media profile links, bucketed by host substring.
    links = [a.get("href", "") for a in soup.find_all("a")]
    social = {
        "linkedin": [l for l in links if "linkedin.com" in l],
        "facebook": [l for l in links if "facebook.com" in l],
        "twitter": [l for l in links if "twitter.com" in l],
        "instagram": [l for l in links if "instagram.com" in l],
    }
    # Company name guess (based on <title>); soup.title.string can be None
    # for an empty <title>, hence the double check.
    title_tag = soup.title.string.strip() if soup.title and soup.title.string else "Unknown"
    structured = {
        "company_name_guess": title_tag,
        # sorted() instead of list(set(...)) so output order is deterministic
        # across runs (set iteration order is not).
        "emails_found": sorted(set(emails)),
        "phones_found": sorted(set(phones)),
        "possible_addresses": sorted(set(addresses)),
        "social_profiles": social,
    }
    return text[:2000] + "...", structured  # limit raw text output
def run_extraction(url: str):
    """Full pipeline: fetch the page at *url*, then mine it for signals.

    Returns the (truncated_text, structured_dict) pair produced by
    extract_company_signals_from_html.
    """
    return extract_company_signals_from_html(fetch_webpage(url))
# Build the Gradio UI: a URL input feeding the extraction pipeline, with the
# truncated page text and the structured-signals dict as the two outputs.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Web Company Data Extractor\n"
        "Enter a company website URL. The app fetches the page and extracts useful "
        "unstructured company data signals (emails, phones, social links, etc.).\n"
        "_This is a simple rule-based demo, not a full production parser._"
    )
    with gr.Row():
        url_input = gr.Textbox(
            label="Company Website URL",
            placeholder="https://www.example.com",
        )
    extract_button = gr.Button("Extract Company Data")
    text_output = gr.Textbox(label="Extracted Raw Text (truncated)", lines=8)
    json_output = gr.JSON(label="Structured Company Data Signals")
    # Wire the button to the full pipeline (URL -> HTML -> signals).
    extract_button.click(
        run_extraction,
        inputs=[url_input],
        outputs=[text_output, json_output],
    )
if __name__ == "__main__":
    # share=True asks Gradio to open a temporary public tunnel URL in
    # addition to serving on localhost.
    demo.launch(share=True)