# LianHP's picture
# Upload folder using huggingface_hub
# 530143f verified
import re
import requests
from bs4 import BeautifulSoup
import gradio as gr
def fetch_webpage(url: str) -> str:
    """Download *url* and return its raw HTML.

    On any failure (network error, timeout, non-2xx status) the problem is
    reported inline as an "ERROR: ..." string rather than raised, so the
    downstream extractor can surface it to the UI.
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        return resp.text
    except Exception as exc:  # deliberate catch-all: errors become UI text
        return f"ERROR: Unable to fetch page -> {exc}"
def extract_company_signals_from_html(html: str):
    """Extract company-related signals from raw HTML with rule-based patterns.

    Returns a 2-tuple:
      * the page's visible text, truncated to 2000 characters,
      * a dict with email/phone/address guesses and social-profile links.

    If *html* is an "ERROR: ..." string from fetch_webpage, it is passed
    through unchanged with an empty dict.
    """
    if html.startswith("ERROR:"):
        return html, {}

    soup = BeautifulSoup(html, "html.parser")

    # Collapse all visible text onto a single whitespace-normalized line so
    # the regexes below can match across original line breaks.
    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)

    # --- Example Extractors ---
    emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    phones = re.findall(r"\+?\d[\d\-\(\) ]{6,}\d", text)

    # Detect possible addresses (very simple pattern).
    # BUG FIX: the original used a capturing group (Street|St|...), so
    # re.findall returned only the street-type word instead of the whole
    # address; (?:...) makes findall return the full match. The original
    # trailing ".*" is also bounded, because on the single-line collapsed
    # text it would swallow everything to the end of the page.
    address_pattern = (
        r"\d{1,5}\s+[A-Za-z0-9\.\- ]+\s+"
        r"(?:Street|St|Rd|Road|Ave|Avenue|Blvd|Lane|Ln|Way|Dr)\b"
        r"[^,;]{0,60}"
    )
    addresses = re.findall(address_pattern, text, flags=re.IGNORECASE)

    # Social media profile links, keyed by platform.
    links = [a.get("href", "") for a in soup.find_all("a")]
    social = {
        "linkedin": [l for l in links if "linkedin.com" in l],
        "facebook": [l for l in links if "facebook.com" in l],
        "twitter": [l for l in links if "twitter.com" in l],
        "instagram": [l for l in links if "instagram.com" in l],
    }

    # Company name guess (based on <title>); "Unknown" when absent/empty.
    title_tag = soup.title.string.strip() if soup.title and soup.title.string else "Unknown"

    structured = {
        "company_name_guess": title_tag,
        "emails_found": list(set(emails)),
        "phones_found": list(set(phones)),
        "possible_addresses": list(set(addresses)),
        "social_profiles": social,
    }
    return text[:2000] + "...", structured  # limit raw text output
def run_extraction(url: str):
    """Full pipeline: fetch the URL's HTML, then mine it for company signals."""
    return extract_company_signals_from_html(fetch_webpage(url))
with gr.Blocks() as demo:
    # Header / usage notes shown above the controls.
    gr.Markdown(
        "# Web Company Data Extractor\n"
        "Enter a company website URL. The app fetches the page and extracts useful "
        "unstructured company data signals (emails, phones, social links, etc.).\n"
        "_This is a simple rule-based demo, not a full production parser._"
    )

    with gr.Row():
        url_box = gr.Textbox(
            label="Company Website URL",
            placeholder="https://www.example.com",
        )

    extract_btn = gr.Button("Extract Company Data")

    # Outputs: a truncated plain-text preview plus the structured signal dict.
    text_preview = gr.Textbox(
        label="Extracted Raw Text (truncated)",
        lines=8,
    )
    signals_json = gr.JSON(label="Structured Company Data Signals")

    # Wire the button to the full pipeline (URL -> HTML -> signals).
    extract_btn.click(
        run_extraction,
        inputs=[url_box],
        outputs=[text_preview, signals_json],
    )

if __name__ == "__main__":
    demo.launch(share=True)