|
|
|
|
|
import requests |
|
|
from bs4 import BeautifulSoup |
|
|
import pandas as pd |
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
def scrape_post(link, timeout=30):
    """Download one post page and extract its main fields.

    Args:
        link: URL of the post page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A dict with the keys ``title``, ``content``, ``featured_img``,
        ``category`` and ``tags``. ``title``, ``content`` and
        ``featured_img`` are ``None`` when the matching element (or its
        ``src`` attribute) is absent from the page; ``category`` and
        ``tags`` are possibly-empty lists of link texts.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Without a timeout a stalled server would block the scraper forever.
    response = requests.get(link, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were posts.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # select_one() returns None when nothing matches, so each optional
    # element is checked before it is dereferenced.
    title_elem = soup.select_one('h1')
    content_elem = soup.select_one('section.entry')
    image_elem = soup.select_one('img.entry-image')

    title = title_elem.text.strip() if title_elem is not None else None
    content = content_elem.text.strip() if content_elem is not None else None
    # .get() also covers an <img> tag that carries no src attribute.
    featured_img = image_elem.get('src') if image_elem is not None else None

    category = [a.text for a in soup.select('.entry-cat a')]
    tags = [a.text for a in soup.select('.entry-tag a')]

    return {
        'title': title,
        'content': content,
        'featured_img': featured_img,
        'category': category,
        'tags': tags,
    }
|
|
|
|
|
|
|
|
def scrape_unwinnable_movies():
    """Crawl the Unwinnable Movies/TV category and scrape every listed post.

    Starting from the category's first listing page, each post linked from
    an ``.entry-title`` anchor is scraped with ``scrape_post``; the crawl
    then follows the ``a.next`` pagination link until there is none left.

    Returns:
        A pandas DataFrame with one row per scraped post (the dicts
        returned by ``scrape_post``).

    Raises:
        requests.HTTPError: If a listing page answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Imported locally so this function is self-contained.
    from urllib.parse import urljoin

    base_url = 'https://unwinnable.com/category/sections/movies-tv/'
    request_timeout = 30  # seconds; avoids hanging forever on a stalled server
    next_page = base_url
    visited_pages = set()
    all_posts = []

    # The visited set guards against a pagination link that points back to
    # a page that was already processed, which would otherwise loop forever.
    while next_page and next_page not in visited_pages:
        visited_pages.add(next_page)

        response = requests.get(next_page, timeout=request_timeout)
        # Stop on error pages instead of silently parsing them as listings.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Only anchors that actually carry an href are considered; urljoin
        # leaves absolute URLs untouched and resolves relative ones against
        # the current listing page.
        for anchor in soup.select('.entry-title a[href]'):
            post_url = urljoin(next_page, anchor['href'])
            all_posts.append(scrape_post(post_url))

        next_page_elem = soup.select_one('a.next')
        next_href = next_page_elem.get('href') if next_page_elem is not None else None
        next_page = urljoin(next_page, next_href) if next_href else None

    return pd.DataFrame(all_posts)
|
|
|
|
|
|
|
|
def scrape_and_display():
    """Run the scraper and hand its result to the Gradio dataframe output.

    Returns:
        The pandas DataFrame produced by ``scrape_unwinnable_movies``.
    """
    # A function wired into gr.Interface simply returns the output value.
    # The component-level ``update()`` helper used previously is deprecated
    # in late Gradio 3.x and removed in Gradio 4.x, so returning the
    # DataFrame directly keeps this working across versions.
    return scrape_unwinnable_movies()
|
|
|
|
|
|
|
|
# Build the web UI: no input widgets, a single dataframe output that is
# filled by scrape_and_display.
demo = gr.Interface(
    fn=scrape_and_display,
    inputs=[],
    outputs="dataframe",
    title="Unwinnable Movies Scraper",
)

# Start the Gradio server for the interface defined above.
demo.launch()
|
|
|