Spaces:

guifav
/

bypass_badger

Sleeping

App Files Files Community

guifav committed on Aug 20, 2024

Commit

b1e0be4

1 Parent(s): 3c0e696

Update app to Gradio and include specific HF space URL

Browse files

Files changed (3) hide show

README.md +29 -4
app.py +20 -33
requirements.txt +4 -1

README.md CHANGED Viewed

@@ -1,13 +1,38 @@
 ---
-title: Webscrape 12ft
 emoji: 💻
 colorFrom: pink
 colorTo: yellow
-sdk: streamlit
-sdk_version: 1.37.0
 app_file: app.py
 pinned: false
 license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Bypass Badger
 emoji: 💻
 colorFrom: pink
 colorTo: yellow
+sdk: gradio
+sdk_version: 3.47.1
 app_file: app.py
 pinned: false
 license: mit
 ---
+# Web Data Scraper - Bypass Badger
+This is a simple web application that allows users to scrape visible text from a given URL. It's built using Gradio for the interface and BeautifulSoup for web scraping.
+## How to Use
+1. Visit the application at: https://guifav-bypass-badger.hf.space
+2. Enter the URL of the web page you want to scrape in the input box.
+3. Click the "Submit" button.
+4. The visible text from the webpage will be displayed in the output box.
+Note: This application waits 2 seconds before each request to avoid overloading servers. It does not check robots.txt itself, so confirm that scraping a site is permitted before using it. Use responsibly.
+## Local Development
+To run this application locally:
+1. Clone the repository
+2. Install the requirements: `pip install -r requirements.txt`
+3. Run the app: `python app.py`
+## Deployment
+This application is deployed on Hugging Face Spaces. Any pushes to the main branch will automatically update the deployed version.
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,31 +1,22 @@
-import streamlit as st
 import requests
 from bs4 import BeautifulSoup
 import re
-# Function to scrape only visible text from the given URL
 def scrape_visible_text_from_url(url):
     try:
-        # Modificar o user-agent para simular um navegador
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
-        # Adicionar um atraso para evitar sobrecarregar o servidor
-        import time
         time.sleep(2)
-        # Usar uma sessão para manter cookies
         session = requests.Session()
-        # Fazer uma solicitação inicial para obter cookies
         session.get(url, headers=headers)
-        # Fazer a solicitação real
         response = session.get(url, headers=headers)
         response.raise_for_status()
-        # Resto do código permanece o mesmo
         soup = BeautifulSoup(response.content, 'html.parser')
         for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
@@ -38,32 +29,28 @@ def scrape_visible_text_from_url(url):
         paragraph_text = " ".join([p.get_text() for p in paragraph_content])
         visible_text = f"{header_text}\n\n{paragraph_text}"
         visible_text = re.sub(r'\s+', ' ', visible_text)
         return visible_text.strip()
     except Exception as e:
-        st.error(f"Error occurred while scraping the data: {e}")
-        return None
-# Streamlit UI
-def main():
-    st.title("Web Data Scraper")
-    # Get the URL from the user
-    url_input = st.text_input("Enter the URL of the web page:", "")
-    if st.button("Scrape Visible Text"):
-        if url_input:
-            # Extract visible text from the URL
-            data = scrape_visible_text_from_url(url_input)
-            if data:
-                st.success("Visible text successfully scraped!")
-                st.subheader("Scraped Text:")
-                st.write(data)
-            else:
-                st.warning("Failed to scrape visible text from the URL.")
         else:
-            st.warning("Please enter a valid URL.")
 if __name__ == "__main__":
-    main()

+import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 import re
+import time
 def scrape_visible_text_from_url(url):
     try:
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
         time.sleep(2)
         session = requests.Session()
         session.get(url, headers=headers)
         response = session.get(url, headers=headers)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
         for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "aside", "nav", "img"]):
         paragraph_text = " ".join([p.get_text() for p in paragraph_content])
         visible_text = f"{header_text}\n\n{paragraph_text}"
         visible_text = re.sub(r'\s+', ' ', visible_text)
         return visible_text.strip()
     except Exception as e:
+        return f"Error occurred while scraping the data: {e}"
+def scrape_and_display(url):
+    if url:
+        data = scrape_visible_text_from_url(url)
+        if data:
+            return f"Scraped Text:\n\n{data}"
         else:
+            return "Failed to scrape visible text from the URL."
+    else:
+        return "Please enter a valid URL."
+iface = gr.Interface(
+    fn=scrape_and_display,
+    inputs=gr.Textbox(label="Enter the URL of the web page:"),
+    outputs=gr.Textbox(label="Scraped Result"),
+    title="Web Data Scraper",
+    description="Enter a URL to scrape visible text from the web page."
+)
 if __name__ == "__main__":
+    iface.launch()

requirements.txt CHANGED Viewed

@@ -29,7 +29,7 @@ packaging==23.1
 pandas==2.0.3
 Pillow==9.5.0
 protobuf==4.23.4
-pyarrow==12.0.1
 pydeck==0.8.0
 Pygments==2.15.1
 Pympler==1.0.1
@@ -58,3 +58,6 @@ validators==0.20.0
 watchdog==3.0.0
 yarl==1.9.2
 zipp==3.16.2

 pandas==2.0.3
 Pillow==9.5.0
 protobuf==4.23.4
+pyarrow==12.0.0
 pydeck==0.8.0
 Pygments==2.15.1
 Pympler==1.0.1
 watchdog==3.0.0
 yarl==1.9.2
 zipp==3.16.2
+gradio==3.47.1
+beautifulsoup4==4.12.2
+requests==2.31.0