LianHP committed on
Commit
530143f
·
verified ·
1 Parent(s): eed5224

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. .gradio/certificate.pem +31 -0
  2. README.md +3 -9
  3. app.py +95 -0
  4. requirements.txt +3 -0
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Web Page Data Html
3
- emoji: 🐨
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 6.0.2
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Web_page_data_html
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 5.47.2
6
  ---
 
 
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import gradio as gr
5
+
6
+
7
def fetch_webpage(url: str) -> str:
    """Download a page and return its raw HTML.

    On any failure (network error, HTTP error status, malformed URL) a
    sentinel string beginning with "ERROR:" is returned instead of raising,
    so downstream extractors can detect and report the failure.
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except Exception as exc:  # deliberate best-effort: surface the error as text
        return f"ERROR: Unable to fetch page -> {exc}"
    return resp.text
15
+
16
+
17
def extract_company_signals_from_html(html: str):
    """Extract company-related fields from raw HTML using simple rule-based patterns.

    Parameters
    ----------
    html : str
        Raw HTML, or an "ERROR:"-prefixed sentinel produced by fetch_webpage().

    Returns
    -------
    tuple
        (truncated visible text, structured-signals dict). When *html* is an
        error sentinel, returns (html, {}) unchanged so the UI can display it.
    """
    if html.startswith("ERROR:"):
        return html, {}

    soup = BeautifulSoup(html, "html.parser")

    # Collapse all visible text into one whitespace-normalized string.
    text = soup.get_text(separator=" ", strip=True)
    text = re.sub(r"\s+", " ", text)

    # --- Example Extractors ---
    emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    phones = re.findall(r"\+?\d[\d\-\(\) ]{6,}\d", text)

    # Detect possible addresses (very simple pattern).
    # BUGFIX: the street-type alternation must be NON-capturing — with a
    # capturing group, re.findall returns only the group's match (e.g. "Street"),
    # not the full address string.
    address_pattern = (
        r"\d{1,5}\s+[A-Za-z0-9\.\- ]+\s+"
        r"(?:Street|St|Rd|Road|Ave|Avenue|Blvd|Lane|Ln|Way|Dr)\b.*"
    )
    addresses = re.findall(address_pattern, text, flags=re.IGNORECASE)

    # Social media profile links, bucketed by platform substring.
    links = [a.get("href", "") for a in soup.find_all("a")]
    social = {
        "linkedin": [l for l in links if "linkedin.com" in l],
        "facebook": [l for l in links if "facebook.com" in l],
        "twitter": [l for l in links if "twitter.com" in l],
        "instagram": [l for l in links if "instagram.com" in l],
    }

    # Company name guess (based on <title>); falls back to "Unknown".
    title_tag = soup.title.string.strip() if soup.title and soup.title.string else "Unknown"

    structured = {
        "company_name_guess": title_tag,
        "emails_found": list(set(emails)),
        "phones_found": list(set(phones)),
        "possible_addresses": list(set(addresses)),
        "social_profiles": social,
    }

    return text[:2000] + "...", structured  # limit raw text output
58
+
59
+
60
def run_extraction(url: str):
    """Full pipeline: fetch the URL's HTML, then mine it for company signals."""
    return extract_company_signals_from_html(fetch_webpage(url))
64
+
65
+
66
# --- Gradio UI: one URL input, one button, raw-text + JSON outputs ---
with gr.Blocks() as demo:
    gr.Markdown(
        "# Web Company Data Extractor\n"
        "Enter a company website URL. The app fetches the page and extracts useful "
        "unstructured company data signals (emails, phones, social links, etc.).\n"
        "_This is a simple rule-based demo, not a full production parser._"
    )

    with gr.Row():
        url_in = gr.Textbox(
            label="Company Website URL",
            placeholder="https://www.example.com",
        )
        run_btn = gr.Button("Extract Company Data")

    raw_text_out = gr.Textbox(label="Extracted Raw Text (truncated)", lines=8)
    structured_out = gr.JSON(label="Structured Company Data Signals")

    # Wire the button to the full pipeline (URL -> HTML -> signals).
    run_btn.click(
        run_extraction,
        inputs=[url_in],
        outputs=[raw_text_out, structured_out],
    )

if __name__ == "__main__":
    demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio>=5.47.2,<6.0.0
2
+ beautifulsoup4
3
+ requests