# LianHP's picture
# Upload folder using huggingface_hub
# 530143f verified
import re
import requests
from bs4 import BeautifulSoup
import gradio as gr
def fetch_webpage(url: str) -> str:
    """Download *url* and return its raw HTML.

    On any failure (network error, timeout, non-2xx status) the problem is
    reported inline as an "ERROR: ..." string rather than raised, so the
    downstream extractor can surface it to the UI.
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        return resp.text
    except Exception as exc:  # deliberate catch-all: errors become UI text
        return f"ERROR: Unable to fetch page -> {exc}"
def extract_company_signals_from_html(html: str):
    """Extract company-related signals from raw HTML with rule-based patterns.

    Returns a 2-tuple:
      * the page's visible text, truncated to 2000 characters,
      * a dict with email/phone/address guesses and social-profile links.

    If *html* is an "ERROR: ..." string from fetch_webpage, it is passed
    through unchanged with an empty dict.
    """
    if html.startswith("ERROR:"):
        return html, {}

    soup = BeautifulSoup(html, "html.parser")

    # Collapse all visible text onto a single whitespace-normalized line so
    # the regexes below can match across original line breaks.
    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)

    # --- Example Extractors ---
    emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    phones = re.findall(r"\+?\d[\d\-\(\) ]{6,}\d", text)

    # Detect possible addresses (very simple pattern).
    # BUG FIX: the original used a capturing group (Street|St|...), so
    # re.findall returned only the street-type word instead of the whole
    # address; (?:...) makes findall return the full match. The original
    # trailing ".*" is also bounded, because on the single-line collapsed
    # text it would swallow everything to the end of the page.
    address_pattern = (
        r"\d{1,5}\s+[A-Za-z0-9\.\- ]+\s+"
        r"(?:Street|St|Rd|Road|Ave|Avenue|Blvd|Lane|Ln|Way|Dr)\b"
        r"[^,;]{0,60}"
    )
    addresses = re.findall(address_pattern, text, flags=re.IGNORECASE)

    # Social media profile links, keyed by platform.
    links = [a.get("href", "") for a in soup.find_all("a")]
    social = {
        "linkedin": [l for l in links if "linkedin.com" in l],
        "facebook": [l for l in links if "facebook.com" in l],
        "twitter": [l for l in links if "twitter.com" in l],
        "instagram": [l for l in links if "instagram.com" in l],
    }

    # Company name guess (based on <title>); "Unknown" when absent/empty.
    title_tag = soup.title.string.strip() if soup.title and soup.title.string else "Unknown"

    structured = {
        "company_name_guess": title_tag,
        "emails_found": list(set(emails)),
        "phones_found": list(set(phones)),
        "possible_addresses": list(set(addresses)),
        "social_profiles": social,
    }
    return text[:2000] + "...", structured  # limit raw text output
def run_extraction(url: str):
    """Full pipeline: fetch the URL's HTML, then mine it for company signals."""
    return extract_company_signals_from_html(fetch_webpage(url))
with gr.Blocks() as demo:
    # Header / usage notes shown above the controls.
    gr.Markdown(
        "# Web Company Data Extractor\n"
        "Enter a company website URL. The app fetches the page and extracts useful "
        "unstructured company data signals (emails, phones, social links, etc.).\n"
        "_This is a simple rule-based demo, not a full production parser._"
    )

    with gr.Row():
        url_box = gr.Textbox(
            label="Company Website URL",
            placeholder="https://www.example.com",
        )

    extract_btn = gr.Button("Extract Company Data")

    # Outputs: a truncated plain-text preview plus the structured signal dict.
    text_preview = gr.Textbox(
        label="Extracted Raw Text (truncated)",
        lines=8,
    )
    signals_json = gr.JSON(label="Structured Company Data Signals")

    # Wire the button to the full pipeline (URL -> HTML -> signals).
    extract_btn.click(
        run_extraction,
        inputs=[url_box],
        outputs=[text_preview, signals_json],
    )

if __name__ == "__main__":
    demo.launch(share=True)