Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gradio/certificate.pem +31 -0
- README.md +3 -9
- app.py +95 -0
- requirements.txt +3 -0
.gradio/certificate.pem
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-----BEGIN CERTIFICATE-----
|
| 2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
| 3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
| 4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
| 5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
| 6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
| 7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
| 8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
| 9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
| 10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
| 11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
| 12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
| 13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
| 14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
| 15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
| 16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
| 17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
| 18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
| 19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
| 20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
| 21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
| 22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
| 23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
| 24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
| 25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
| 26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
| 27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
| 28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
| 29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
| 30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
| 31 |
+
-----END CERTIFICATE-----
|
README.md
CHANGED
|
@@ -1,12 +1,6 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji: 🐨
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: green
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 6.0.2
|
| 8 |
app_file: app.py
|
| 9 |
-
|
|
|
|
| 10 |
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Web_page_data_html
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
app_file: app.py
|
| 4 |
+
sdk: gradio
|
| 5 |
+
sdk_version: 5.47.2
|
| 6 |
---
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import requests
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def fetch_webpage(url: str) -> str:
    """Download *url* and return the response body as text.

    The request uses a 10-second timeout. Failures are never raised to the
    caller: any exception (including the HTTP error raised by
    ``raise_for_status`` for an unsuccessful status code) is converted into a
    string that begins with ``"ERROR:"`` and carries the exception message.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        body = page.text
    except Exception as exc:
        # Broad on purpose: this is the boundary to the UI, which displays the
        # message instead of crashing.
        return f"ERROR: Unable to fetch page -> {exc}"
    return body
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def extract_company_signals_from_html(html: str) -> tuple[str, dict]:
    """Extract company-related fields from HTML using simple rule-based patterns.

    Args:
        html: Raw HTML of a page, or an in-band error string starting with
            ``"ERROR:"`` (the format ``fetch_webpage`` produces on failure).

    Returns:
        A ``(raw_text, structured)`` pair. ``raw_text`` is the page's visible
        text with whitespace collapsed, cut to 2000 characters (``"..."`` is
        appended only when something was actually cut). ``structured`` is a
        dict with the keys ``company_name_guess``, ``emails_found``,
        ``phones_found``, ``possible_addresses`` and ``social_profiles``.
        For an error string the result is ``(html, {})``.
    """
    # Errors are signalled in-band, so pass the message straight through.
    # NOTE(review): a real page whose markup literally begins with "ERROR:"
    # would also take this branch.
    if html.startswith("ERROR:"):
        return html, {}

    soup = BeautifulSoup(html, "html.parser")

    # Visible text, flattened to a single line with single spaces.
    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)

    # --- Example extractors ---
    emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    phones = re.findall(r"\+?\d[\d\-\(\) ]{6,}\d", text)

    # Very simple street-address heuristic.
    #  * The suffix alternation is NON-capturing: with a capturing group
    #    ``re.findall`` returns only the group ("Street", "Rd", ...) rather
    #    than the address itself.
    #  * The street name is matched lazily so the match ends at the first
    #    suffix word instead of the last one on the (single-line) text.
    #  * The tail is limited to at most three short comma-separated parts
    #    (e.g. suite, city, state/zip); an unbounded ``.*`` would swallow the
    #    rest of the page because ``text`` contains no newlines.
    address_pattern = (
        r"\d{1,5}\s+[A-Za-z0-9.\- ]+?\s+"
        r"(?:Street|St|Rd|Road|Ave|Avenue|Blvd|Lane|Ln|Way|Dr)\b\.?"
        r"(?:,\s*[A-Za-z0-9\- ]{1,40}){0,3}"
    )
    addresses = [
        match.strip()
        for match in re.findall(address_pattern, text, flags=re.IGNORECASE)
    ]

    # Social media profile links, de-duplicated in document order.
    hrefs = list(dict.fromkeys(a.get("href", "") for a in soup.find_all("a")))
    social_domains = {
        "linkedin": "linkedin.com",
        "facebook": "facebook.com",
        "twitter": "twitter.com",
        "instagram": "instagram.com",
    }
    social = {
        network: [href for href in hrefs if domain in href]
        for network, domain in social_domains.items()
    }

    # Company name guess (based on <title>).
    if soup.title and soup.title.string:
        title_tag = soup.title.string.strip()
    else:
        title_tag = "Unknown"

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output is stable between runs (set() ordering is not).
    structured = {
        "company_name_guess": title_tag,
        "emails_found": list(dict.fromkeys(emails)),
        "phones_found": list(dict.fromkeys(phones)),
        "possible_addresses": list(dict.fromkeys(addresses)),
        "social_profiles": social,
    }

    # Limit the raw text output; mark truncation only when it happened.
    max_chars = 2000
    preview = text if len(text) <= max_chars else text[:max_chars] + "..."

    return preview, structured
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def run_extraction(url: str):
    """Fetch *url* and run the signal extractor on whatever came back.

    The value returned by ``extract_company_signals_from_html`` is passed
    through unchanged.
    """
    return extract_company_signals_from_html(fetch_webpage(url))
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# Gradio UI: one URL input and a button, with a text preview and a JSON view
# of the extracted signals.
# NOTE(review): the original indentation was lost in extraction; this assumes
# only the URL box and button sit in the row and the two outputs sit below it
# -- confirm against the real file.
with gr.Blocks() as demo:
    gr.Markdown(
        value=(
            "# Web Company Data Extractor\n"
            "Enter a company website URL. The app fetches the page and extracts useful "
            "unstructured company data signals (emails, phones, social links, etc.).\n"
            "_This is a simple rule-based demo, not a full production parser._"
        )
    )

    with gr.Row():
        url_in = gr.Textbox(label="Company Website URL", placeholder="https://www.example.com")
        run_btn = gr.Button(value="Extract Company Data")

    raw_text_out = gr.Textbox(label="Extracted Raw Text (truncated)", lines=8)
    structured_out = gr.JSON(label="Structured Company Data Signals")

    # The click handler must be the full pipeline (URL -> HTML -> signals),
    # not the HTML-only extractor.
    run_btn.click(
        fn=run_extraction,
        inputs=[url_in],
        outputs=[raw_text_out, structured_out],
    )
|
| 93 |
+
|
| 94 |
+
if __name__ == "__main__":
    # No share=True: it opens a public tunnel to this process, and the app
    # fetches arbitrary user-supplied URLs server-side. It is also not
    # supported when hosted on Hugging Face Spaces.
    # NOTE(review): recent Gradio versions appear to allow opting in via the
    # GRADIO_SHARE environment variable -- confirm for the pinned version.
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.16.0,<5.0.0
|
| 2 |
+
beautifulsoup4
|
| 3 |
+
requests
|