13ze committed · Commit 94cefa6 · verified · 1 Parent(s): d93bb24

Update app.py

Files changed (1):
  1. app.py +44 -55
app.py CHANGED
@@ -1,60 +1,49 @@
 
  import requests
- import csv
  from bs4 import BeautifulSoup
  import gradio as gr

- def scrape_urls(csv_file):
-     posts = []
-     with open(csv_file, 'r') as f:
-         reader = csv.reader(f)
-         next(reader, None)  # Skip the header row if it exists
-         urls = [row[1] for row in reader]
-
-     for url in urls:
-         print(f"Scraping: {url}")  # Print the URL being scraped for debugging
-         response = requests.get(url)
          soup = BeautifulSoup(response.content, 'html.parser')
-
-         title = soup.find('h1').text.strip() if soup.find('h1') else "N/A"
-         content_div = soup.find('div', {'class': 'entry-content'})
-         content = content_div.text.strip() if content_div else "N/A"
-         featured_image_element = soup.find('img', {'class': 'featured-image'})
-         featured_image_url = featured_image_element['src'] if featured_image_element else "N/A"
-         category_span = soup.find('span', {'class': 'category'})
-         category = category_span.text.strip() if category_span else "N/A"
-         tags = [tag.text.strip() for tag in soup.find_all('span', {'class': 'tag'})]
-
-         post = {
-             'title': title,
-             'content': content,
-             'featured_image_url': featured_image_url,
-             'category': category,
-             'tags': tags
-         }
-
-         posts.append(post)
-
-     return posts
-
- def save_posts_to_csv(posts):
-     with open('posts.csv', 'w', newline='', encoding='utf-8') as f:
-         writer = csv.DictWriter(f, fieldnames=['title', 'content', 'featured_image_url', 'category', 'tags'])
-         writer.writeheader()
-         writer.writerows(posts)
-     return 'posts.csv'
-
- def scrape_and_save(csv_file):
-     posts = scrape_urls(csv_file.name)
-     csv_path = save_posts_to_csv(posts)
-     return csv_path
-
- demo = gr.Interface(
-     fn=scrape_and_save,
-     inputs=gr.File(label="Upload CSV with URLs"),
-     outputs=gr.File(label="Download Posts CSV"),
-     title="Web Scraping to CSV",
-     description="Upload a CSV file with URLs, and this app will scrape the content and provide a CSV with the results."
- )
-
- if __name__ == "__main__":
-     demo.launch(share=True)
 
+ # Import required libraries
  import requests
  from bs4 import BeautifulSoup
+ import pandas as pd
  import gradio as gr

+ # Function to scrape a single post
+ def scrape_post(link):
+     response = requests.get(link)
+     soup = BeautifulSoup(response.content, 'html.parser')
+
+     title = soup.select_one('h1').text.strip()
+     content = soup.select_one('section.entry').text.strip()
+     featured_img = soup.select_one('img.entry-image')['src']
+     category = [a.text for a in soup.select('.entry-cat a')]
+     tags = [a.text for a in soup.select('.entry-tag a')]
+
+     return {'title': title, 'content': content, 'featured_img': featured_img, 'category': category, 'tags': tags}
+
+ # Function to scrape multiple pages
+ def scrape_unwinnable_movies():
+     base_url = 'https://unwinnable.com/category/sections/movies-tv/'
+     next_page = base_url
+     all_posts = []
+
+     while next_page:
+         response = requests.get(next_page)
          soup = BeautifulSoup(response.content, 'html.parser')
+
+         post_links = [a['href'] for a in soup.select('.entry-title a')]
+         for link in post_links:
+             post_data = scrape_post(link)
+             all_posts.append(post_data)
+
+         next_page_elem = soup.select_one('a.next')
+         next_page = next_page_elem['href'] if next_page_elem else None
+
+     return pd.DataFrame(all_posts)
+
+ # Gradio Interface function
+ def scrape_and_display():
+     df = scrape_unwinnable_movies()
+     return gr.DataFrame.update(value=df)
+
+ # Create Gradio Interface
+ demo = gr.Interface(fn=scrape_and_display, inputs=[], outputs="dataframe", title="Unwinnable Movies Scraper")
+
+ # Launch the app
+ demo.launch()
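
Two notes on the committed code. First, the rewritten scrape_post drops the old version's "N/A" fallbacks: a page missing an h1, section.entry, or img.entry-image element raises an AttributeError or TypeError and aborts the whole crawl. A minimal defensive sketch, reusing the commit's selectors; the names safe_text and scrape_post_safe are illustrative, not part of the commit:

import requests
from bs4 import BeautifulSoup

def safe_text(soup, selector, default="N/A"):
    # Return the stripped text of the first match, or `default` when absent.
    el = soup.select_one(selector)
    return el.text.strip() if el else default

def scrape_post_safe(link):
    response = requests.get(link, timeout=10)
    response.raise_for_status()  # Surface HTTP errors instead of parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser')

    img = soup.select_one('img.entry-image')
    return {
        'title': safe_text(soup, 'h1'),
        'content': safe_text(soup, 'section.entry'),
        'featured_img': img['src'] if img and img.has_attr('src') else "N/A",
        'category': [a.text for a in soup.select('.entry-cat a')],
        'tags': [a.text for a in soup.select('.entry-tag a')],
    }

Second, gr.DataFrame.update(value=df) is the Gradio 3.x update idiom; Gradio 4 removed the per-component update() method, so on 4.x scrape_and_display would simply return df (which also works on 3.x, since Interface accepts the raw value for a "dataframe" output).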