13ze committed · Commit 94cefa6 · verified · 1 Parent(s): d93bb24

Update app.py

Files changed (1):
  1. app.py +44 -55
app.py CHANGED
@@ -1,60 +1,49 @@
 
  import requests
- import csv
  from bs4 import BeautifulSoup
  import gradio as gr

- def scrape_urls(csv_file):
-     posts = []
-     with open(csv_file, 'r') as f:
-         reader = csv.reader(f)
-         next(reader, None)  # Skip the header row if it exists
-         urls = [row[1] for row in reader]
-
-     for url in urls:
-         print(f"Scraping: {url}")  # Print the URL being scraped for debugging
-         response = requests.get(url)
          soup = BeautifulSoup(response.content, 'html.parser')
-
-         title = soup.find('h1').text.strip() if soup.find('h1') else "N/A"
-         content_div = soup.find('div', {'class': 'entry-content'})
-         content = content_div.text.strip() if content_div else "N/A"
-         featured_image_element = soup.find('img', {'class': 'featured-image'})
-         featured_image_url = featured_image_element['src'] if featured_image_element else "N/A"
-         category_span = soup.find('span', {'class': 'category'})
-         category = category_span.text.strip() if category_span else "N/A"
-         tags = [tag.text.strip() for tag in soup.find_all('span', {'class': 'tag'})]
-
-         post = {
-             'title': title,
-             'content': content,
-             'featured_image_url': featured_image_url,
-             'category': category,
-             'tags': tags
-         }
-
-         posts.append(post)
-
-     return posts
-
- def save_posts_to_csv(posts):
-     with open('posts.csv', 'w', newline='', encoding='utf-8') as f:
-         writer = csv.DictWriter(f, fieldnames=['title', 'content', 'featured_image_url', 'category', 'tags'])
-         writer.writeheader()
-         writer.writerows(posts)
-     return 'posts.csv'
-
- def scrape_and_save(csv_file):
-     posts = scrape_urls(csv_file.name)
-     csv_path = save_posts_to_csv(posts)
-     return csv_path
-
- demo = gr.Interface(
-     fn=scrape_and_save,
-     inputs=gr.File(label="Upload CSV with URLs"),
-     outputs=gr.File(label="Download Posts CSV"),
-     title="Web Scraping to CSV",
-     description="Upload a CSV file with URLs, and this app will scrape the content and provide a CSV with the results."
- )
-
- if __name__ == "__main__":
-     demo.launch(share=True)
 
+ # Import required libraries
  import requests
  from bs4 import BeautifulSoup
+ import pandas as pd
  import gradio as gr

+ # Function to scrape a single post
+ def scrape_post(link):
+     response = requests.get(link)
+     soup = BeautifulSoup(response.content, 'html.parser')
+
+     title = soup.select_one('h1').text.strip()
+     content = soup.select_one('section.entry').text.strip()
+     featured_img = soup.select_one('img.entry-image')['src']
+     category = [a.text for a in soup.select('.entry-cat a')]
+     tags = [a.text for a in soup.select('.entry-tag a')]
+
+     return {'title': title, 'content': content, 'featured_img': featured_img, 'category': category, 'tags': tags}
+
+ # Function to scrape multiple pages
+ def scrape_unwinnable_movies():
+     base_url = 'https://unwinnable.com/category/sections/movies-tv/'
+     next_page = base_url
+     all_posts = []
+
+     while next_page:
+         response = requests.get(next_page)
          soup = BeautifulSoup(response.content, 'html.parser')
+
+         post_links = [a['href'] for a in soup.select('.entry-title a')]
+         for link in post_links:
+             post_data = scrape_post(link)
+             all_posts.append(post_data)
+
+         next_page_elem = soup.select_one('a.next')
+         next_page = next_page_elem['href'] if next_page_elem else None
+
+     return pd.DataFrame(all_posts)
+
+ # Gradio Interface function
+ def scrape_and_display():
+     df = scrape_unwinnable_movies()
+     return gr.DataFrame.update(value=df)
+
+ # Create Gradio Interface
+ demo = gr.Interface(fn=scrape_and_display, inputs=[], outputs="dataframe", title="Unwinnable Movies Scraper")
+
+ # Launch the app
+ demo.launch()
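
Two notes on the committed code. First, the rewritten scrape_post drops the old version's "N/A" fallbacks: a page missing an h1, section.entry, or img.entry-image element raises an AttributeError or TypeError and aborts the whole crawl. A minimal defensive sketch, reusing the commit's selectors; the names safe_text and scrape_post_safe are illustrative, not part of the commit:

import requests
from bs4 import BeautifulSoup

def safe_text(soup, selector, default="N/A"):
    # Return the stripped text of the first match, or `default` when absent.
    el = soup.select_one(selector)
    return el.text.strip() if el else default

def scrape_post_safe(link):
    response = requests.get(link, timeout=10)
    response.raise_for_status()  # Surface HTTP errors instead of parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser')

    img = soup.select_one('img.entry-image')
    return {
        'title': safe_text(soup, 'h1'),
        'content': safe_text(soup, 'section.entry'),
        'featured_img': img['src'] if img and img.has_attr('src') else "N/A",
        'category': [a.text for a in soup.select('.entry-cat a')],
        'tags': [a.text for a in soup.select('.entry-tag a')],
    }

Second, gr.DataFrame.update(value=df) is the Gradio 3.x update idiom; Gradio 4 removed the per-component update() method, so on 4.x scrape_and_display would simply return df (which also works on 3.x, since Interface accepts the raw value for a "dataframe" output).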