guifav committed
Commit b1e0be4 · 1 Parent(s): 3c0e696

Update app to Gradio and include specific HF space URL

Files changed (3):
  1. README.md +29 -4
  2. app.py +20 -33
  3. requirements.txt +4 -1
README.md CHANGED
@@ -1,13 +1,38 @@
 ---
-title: Webscrape 12ft
+title: Bypass Badger
 emoji: 💻
 colorFrom: pink
 colorTo: yellow
-sdk: streamlit
-sdk_version: 1.37.0
+sdk: gradio
+sdk_version: 3.47.1
 app_file: app.py
 pinned: false
 license: mit
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Web Data Scraper - Bypass Badger
+
+This is a simple web application that lets users scrape the visible text from a given URL. It is built with Gradio for the interface and BeautifulSoup for the scraping.
+
+## How to Use
+
+1. Visit the application at https://guifav-bypass-badger.hf.space
+2. Enter the URL of the web page you want to scrape in the input box.
+3. Click the "Submit" button.
+4. The visible text from the page is displayed in the output box.
+
+Note: This application adds a short delay between requests to avoid overloading servers, but it does not check robots.txt, so use it responsibly.
+
+## Local Development
+
+To run this application locally:
+
+1. Clone the repository.
+2. Install the requirements: `pip install -r requirements.txt`
+3. Run the app: `python app.py`
+
+## Deployment
+
+This application is deployed on Hugging Face Spaces. Any push to the main branch automatically updates the deployed version.
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
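
With the app now on Gradio, the deployed Space can also be queried programmatically. Below is a minimal sketch (not part of this commit) using `gradio_client`; it assumes the Space id `guifav/bypass-badger`, inferred from the URL above, and the default `/predict` endpoint that a single-function `gr.Interface` exposes.

```python
# Hypothetical usage sketch, not part of this commit.
# Requires: pip install gradio_client
from gradio_client import Client

# Space id inferred from https://guifav-bypass-badger.hf.space
client = Client("guifav/bypass-badger")

# A single-function gr.Interface is exposed under /predict by default.
result = client.predict("https://example.com", api_name="/predict")
print(result)
```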
app.py CHANGED
@@ -1,31 +1,22 @@
-import streamlit as st
+import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 import re
+import time
 
-# Function to scrape only visible text from the given URL
 def scrape_visible_text_from_url(url):
     try:
-        # Modify the user-agent to simulate a browser
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
 
-        # Add a delay to avoid overloading the server
-        import time
         time.sleep(2)
 
-        # Use a session to keep cookies
         session = requests.Session()
-
-        # Make an initial request to obtain cookies
         session.get(url, headers=headers)
-
-        # Make the actual request
         response = session.get(url, headers=headers)
         response.raise_for_status()
 
-        # The rest of the code stays the same
         soup = BeautifulSoup(response.content, 'html.parser')
 
         for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
@@ -38,32 +29,28 @@ def scrape_visible_text_from_url(url):
         paragraph_text = " ".join([p.get_text() for p in paragraph_content])
 
         visible_text = f"{header_text}\n\n{paragraph_text}"
-
         visible_text = re.sub(r'\s+', ' ', visible_text)
         return visible_text.strip()
     except Exception as e:
-        st.error(f"Error occurred while scraping the data: {e}")
-        return None
-
-# Streamlit UI
-def main():
-    st.title("Web Data Scraper")
-
-    # Get the URL from the user
-    url_input = st.text_input("Enter the URL of the web page:", "")
+        return f"Error occurred while scraping the data: {e}"
 
-    if st.button("Scrape Visible Text"):
-        if url_input:
-            # Extract visible text from the URL
-            data = scrape_visible_text_from_url(url_input)
-            if data:
-                st.success("Visible text successfully scraped!")
-                st.subheader("Scraped Text:")
-                st.write(data)
-            else:
-                st.warning("Failed to scrape visible text from the URL.")
+def scrape_and_display(url):
+    if url:
+        data = scrape_visible_text_from_url(url)
+        if data:
+            return f"Scraped Text:\n\n{data}"
         else:
-            st.warning("Please enter a valid URL.")
+            return "Failed to scrape visible text from the URL."
+    else:
+        return "Please enter a valid URL."
+
+iface = gr.Interface(
+    fn=scrape_and_display,
+    inputs=gr.Textbox(label="Enter the URL of the web page:"),
+    outputs=gr.Textbox(label="Scraped Result"),
+    title="Web Data Scraper",
+    description="Enter a URL to scrape visible text from the web page."
+)
 
 if __name__ == "__main__":
-    main()
+    iface.launch()
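
The README above now notes that the app only rate-limits and does not consult robots.txt. For anyone who wants that check, here is a minimal sketch using the standard library's `urllib.robotparser`; it is an illustration, not part of this commit, and `is_allowed` is a hypothetical helper name.

```python
# Illustrative sketch only; this commit does not add a robots.txt check.
from urllib import robotparser
from urllib.parse import urlparse

def is_allowed(url: str, user_agent: str = "*") -> bool:
    """Return True if the host's robots.txt permits fetching this URL."""
    parts = urlparse(url)
    rp = robotparser.RobotFileParser()
    rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    try:
        rp.read()
    except Exception:
        return True  # treat an unreachable robots.txt as permissive
    return rp.can_fetch(user_agent, url)
```

`scrape_visible_text_from_url` could call `is_allowed(url)` before opening the session and return an explanatory message when it is False.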
requirements.txt CHANGED
@@ -29,7 +29,7 @@ packaging==23.1
 pandas==2.0.3
 Pillow==9.5.0
 protobuf==4.23.4
-pyarrow==12.0.1
+pyarrow==12.0.0
 pydeck==0.8.0
 Pygments==2.15.1
 Pympler==1.0.1
@@ -58,3 +58,6 @@ validators==0.20.0
 watchdog==3.0.0
 yarl==1.9.2
 zipp==3.16.2
+gradio==3.47.1
+beautifulsoup4==4.12.2
+requests==2.31.0
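
For a quick local check of the scraper without launching the Gradio UI, something like the sketch below should work once the requirements are installed. It assumes `app.py` is importable from the working directory and uses `https://example.com` as an arbitrary target; importing `app` builds the interface but does not launch it, since `launch()` is guarded by the `__main__` check.

```python
# Hypothetical smoke test, not part of this commit.
from app import scrape_visible_text_from_url

text = scrape_visible_text_from_url("https://example.com")
print(text[:500])  # first 500 characters of the result
```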