File size: 3,057 Bytes
530143f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import re
import requests
from bs4 import BeautifulSoup
import gradio as gr


def fetch_webpage(url: str) -> str:
    """Download the raw HTML body for *url*.

    Returns the response text on success.  On any failure (DNS, timeout,
    non-2xx status, decode error) returns a string prefixed with
    "ERROR:" — the downstream extractor checks for that sentinel.
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()  # turn 4xx/5xx into an exception
        return resp.text
    except Exception as exc:  # best-effort by design: report, don't raise
        return f"ERROR: Unable to fetch page -> {exc}"


def extract_company_signals_from_html(html: str):
    """Extract company-related signals from raw HTML with rule-based patterns.

    Parameters
    ----------
    html : str
        Raw HTML, or an "ERROR:"-prefixed message produced by
        ``fetch_webpage`` (propagated unchanged).

    Returns
    -------
    tuple[str, dict]
        ``(truncated_visible_text, structured_signals)``.  On an upstream
        fetch error, returns ``(error_message, {})``.
    """
    # Propagate fetch failures without attempting to parse them as HTML.
    if html.startswith("ERROR:"):
        return html, {}

    soup = BeautifulSoup(html, "html.parser")

    # Collapse the page's visible text into one whitespace-normalized string.
    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)

    # --- Example Extractors ---
    emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    phones = re.findall(r"\+?\d[\d\-\(\) ]{6,}\d", text)

    # Detect possible addresses (very simple pattern).
    # BUG FIX: the original pattern wrapped the street suffix in a CAPTURING
    # group, so re.findall returned only the suffix ("Street", "Ave", ...)
    # instead of the whole address.  A non-capturing group (?:...) makes
    # findall return the full match.  The trailing greedy ".*" is dropped:
    # it swallowed the entire remainder of the page text.
    address_pattern = (
        r"\d{1,5}\s+[A-Za-z0-9\.\- ]+\s+"
        r"(?:Street|St|Rd|Road|Ave|Avenue|Blvd|Lane|Ln|Way|Dr)\b"
    )
    addresses = re.findall(address_pattern, text, flags=re.IGNORECASE)

    # Bucket anchor hrefs by social-network domain.
    links = [a.get("href", "") for a in soup.find_all("a")]
    social = {
        "linkedin": [l for l in links if "linkedin.com" in l],
        "facebook": [l for l in links if "facebook.com" in l],
        "twitter": [l for l in links if "twitter.com" in l],
        "instagram": [l for l in links if "instagram.com" in l],
    }

    # Company name guess (based on <title>).
    title_tag = soup.title.string.strip() if soup.title and soup.title.string else "Unknown"

    # sorted() de-duplicates deterministically; set iteration order is not
    # stable across runs, which made the JSON output jitter.
    structured = {
        "company_name_guess": title_tag,
        "emails_found": sorted(set(emails)),
        "phones_found": sorted(set(phones)),
        "possible_addresses": sorted(set(addresses)),
        "social_profiles": social,
    }

    return text[:2000] + "...", structured  # limit raw text output


def run_extraction(url: str):
    """Full pipeline: fetch the URL's HTML, then mine it for signals."""
    return extract_company_signals_from_html(fetch_webpage(url))


# Gradio UI: one URL input feeding the extraction pipeline, with the
# truncated raw text and the structured JSON shown side by side below.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Web Company Data Extractor\n"
        "Enter a company website URL. The app fetches the page and extracts useful "
        "unstructured company data signals (emails, phones, social links, etc.).\n"
        "_This is a simple rule-based demo, not a full production parser._"
    )

    with gr.Row():
        url_box = gr.Textbox(
            label="Company Website URL",
            placeholder="https://www.example.com",
        )
        extract_btn = gr.Button("Extract Company Data")

    text_output = gr.Textbox(
        label="Extracted Raw Text (truncated)",
        lines=8,
    )
    json_output = gr.JSON(label="Structured Company Data Signals")

    # Wire the button to the full pipeline (URL -> HTML -> signals).
    extract_btn.click(
        fn=run_extraction,
        inputs=[url_box],
        outputs=[text_output, json_output],
    )

if __name__ == "__main__":
    demo.launch(share=True)