bep40 committed on
Commit
600deca
·
verified ·
1 Parent(s): f0b2c74

Upload src/streamlit_app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +122 -0
src/streamlit_app.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Files: 2 files loaded
2
+
3
+ === app.py ===
4
+ import streamlit as st
5
+ import requests
6
+ from bs4 import BeautifulSoup
7
+ import re
8
+
9
# Scrape only the visible text (page header + paragraphs) from the given URL.
def scrape_visible_text_from_url(url):
    """Fetch *url* and return its visible text, or None on failure.

    Returns the <header> text followed by all <p> text, whitespace-collapsed.
    On any error (network failure, bad status, parse error) the exception is
    shown via st.error and None is returned.
    """
    try:
        # timeout keeps the UI from hanging forever on an unresponsive host
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # BUG FIX: the original removed <header> tags *before* calling
        # soup.find("header"), so header_text was always empty. Capture
        # the header content first, then strip the non-visible tags.
        header_content = soup.find("header")
        header_text = header_content.get_text() if header_content else ""

        # Remove script, style, and other non-visible tags
        for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
            tag.extract()

        # Get the paragraph content
        paragraph_content = soup.find_all("p")
        paragraph_text = " ".join([p.get_text() for p in paragraph_content])

        # Combine header and paragraph text
        visible_text = f"{header_text}\n\n{paragraph_text}"

        # Collapse all runs of whitespace/newlines into single spaces
        visible_text = re.sub(r'\s+', ' ', visible_text)
        return visible_text.strip()
    except Exception as e:
        st.error(f"Error occurred while scraping the data: {e}")
        return None
37
+
38
# Streamlit UI
def main():
    """Draw the Web Data Scraper page and handle the scrape button."""
    st.title("Web Data Scraper")

    # Ask the user for the target page
    url_input = st.text_input("Enter the URL of the web page:", "")

    if st.button("Scrape Visible Text"):
        # Guard clause: nothing to do without a URL
        if not url_input:
            st.warning("Please enter a valid URL.")
        else:
            # Extract visible text from the URL
            data = scrape_visible_text_from_url(url_input)
            if not data:
                st.warning("Failed to scrape visible text from the URL.")
            else:
                st.success("Visible text successfully scraped!")
                st.subheader("Scraped Text:")
                st.write(data)

if __name__ == "__main__":
    main()
60
+
61
+
62
+ === requirements.txt ===
63
+ aiohttp==3.8.5
64
+ aiosignal==1.3.1
65
+ altair==5.0.1
66
+ async-timeout==4.0.2
67
+ attrs==23.1.0
68
+ beautifulsoup4==4.12.2
69
+ blinker==1.6.2
70
+ bs4==0.0.1
71
+ cachetools==5.3.1
72
+ certifi==2023.7.22
73
+ charset-normalizer==3.2.0
74
+ click==8.1.6
75
+ decorator==5.1.1
76
+ frozenlist==1.4.0
77
+ gitdb==4.0.10
78
+ GitPython==3.1.32
79
+ idna==3.4
80
+ importlib-metadata==6.8.0
81
+ Jinja2==3.1.2
82
+ jsonschema==4.18.4
83
+ jsonschema-specifications==2023.7.1
84
+ markdown-it-py==3.0.0
85
+ MarkupSafe==2.1.3
86
+ mdurl==0.1.2
87
+ multidict==6.0.4
88
+ numpy==1.25.2
89
+ openai==0.27.8
90
+ packaging==23.1
91
+ pandas==2.0.3
92
+ Pillow==9.5.0
93
+ protobuf==4.23.4
94
+ pyarrow==12.0.1
95
+ pydeck==0.8.0
96
+ Pygments==2.15.1
97
+ Pympler==1.0.1
98
+ python-dateutil==2.8.2
99
+ python-dotenv==1.0.0
100
+ pytz==2023.3
101
+ pytz-deprecation-shim==0.1.0.post0
102
+ referencing==0.30.0
103
+ requests==2.31.0
104
+ rich==13.5.2
105
+ rpds-py==0.9.2
106
+ six==1.16.0
107
+ smmap==5.0.0
108
+ soupsieve==2.4.1
109
+ streamlit==1.25.0
110
+ tenacity==8.2.2
111
+ toml==0.10.2
112
+ toolz==0.12.0
113
+ tornado==6.3.2
114
+ tqdm==4.65.0
115
+ typing_extensions==4.7.1
116
+ tzdata==2023.3
117
+ tzlocal==4.3.1
118
+ urllib3==2.0.4
119
+ validators==0.20.0
120
+ watchdog==3.0.0
121
+ yarl==1.9.2
122
+ zipp==3.16.2