Create app.py
app.py ADDED
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from random import randint
import gradio as gr


def scrape_data_with_retry(url, retries=3):
    """Fetch a page and extract post metadata, retrying on failure."""
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Pull the fields of interest, falling back to 'N/A' whenever a
            # selector matches nothing (selectors target WordPress-style markup).
            title = soup.find('title').text if soup.find('title') else 'N/A'
            content = soup.find('div', class_='entry-content').text if soup.find('div', class_='entry-content') else 'N/A'
            featured_image = soup.find('meta', property='og:image')['content'] if soup.find('meta', property='og:image') else 'N/A'
            category = ', '.join([cat.text for cat in soup.find_all('a', rel='category tag')]) if soup.find_all('a', rel='category tag') else 'N/A'
            tags = ', '.join([tag.text for tag in soup.find_all('a', rel='tag')]) if soup.find_all('a', rel='tag') else 'N/A'

            return {'title': title, 'content': content, 'featured_image': featured_image, 'category': category, 'tags': tags}
        except (requests.RequestException, ValueError):
            # Exponential backoff with up to one second of random jitter.
            time.sleep(2 ** attempt + randint(0, 1000) / 1000)
    # All retries failed: return a placeholder row so the output stays aligned.
    return {'title': 'N/A', 'content': 'N/A', 'featured_image': 'N/A', 'category': 'N/A', 'tags': 'N/A'}


def scrape_all(urls):
    """Scrape every URL in order, pausing between requests."""
    scraped_data = []
    for url in urls:
        data = scrape_data_with_retry(url)
        scraped_data.append(data)
        time.sleep(randint(1, 3))  # sleep to avoid overloading the server
    return scraped_data


# Gradio interface: take a CSV of URLs, return a CSV of scraped rows.
def scrape_interface(file):
    scrape_links_df = pd.read_csv(file.name)
    urls = scrape_links_df['URL'].tolist()
    scraped_data = scrape_all(urls)
    scraped_df = pd.DataFrame(scraped_data)
    output_file_path = 'scraped_data.csv'
    scraped_df.to_csv(output_file_path, index=False)
    return output_file_path


demo = gr.Interface(
    fn=scrape_interface,
    inputs="file",
    outputs="file",
    title="Web Scraper",
    description="Upload a CSV file containing URLs to scrape."
)

demo.launch(share=True)
```
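As a usage note, here is a minimal sketch of the input contract (not part of the commit): `scrape_interface` reads a `URL` column from the uploaded CSV, so a file shaped like the one generated below is what the interface expects. The placeholder URLs and the direct `scrape_all` call are illustrative assumptions; since `app.py` calls `demo.launch()` at import time, this check is best pasted into the same module or run interactively rather than via `from app import scrape_all`.

```python
# Hypothetical smoke test, assuming app.py's functions are in scope.
import pandas as pd

# The Gradio handler expects a CSV with a 'URL' column header:
sample = pd.DataFrame({'URL': [
    'https://example.com/post-1',  # placeholder URLs, not real targets
    'https://example.com/post-2',
]})
sample.to_csv('scrape_links.csv', index=False)

# Exercise the scraping pipeline directly, bypassing the UI:
rows = scrape_all(sample['URL'].tolist())
print(pd.DataFrame(rows))
```

Each scraped row carries the same five keys (`title`, `content`, `featured_image`, `category`, `tags`), so the resulting DataFrame, and the `scraped_data.csv` the interface returns, always has a fixed schema even when individual fetches fail.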