LuisMBA committed on
Commit
173f415
·
verified ·
1 Parent(s): e78dd33

Scraper version 00

Browse files
Files changed (1) hide show
  1. books_scraper.py +38 -0
books_scraper.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+
5
+ # Base URL of the Open Library subjects API
6
+ BASE_URL = "https://openlibrary.org/subjects/"
7
+
8
# Extract the books of a specific genre
def scrape_books(genre, max_books=50, timeout=10):
    """Fetch the works listed under one subject and flatten them into records.

    Args:
        genre: Subject slug appended to ``BASE_URL`` to build the request URL
            (for example ``"science_fiction"``).
        max_books: Value sent as the ``limit`` query parameter.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection fails instead of blocking forever.

    Returns:
        A list of dicts with the keys ``title``, ``author``, ``year``,
        ``genre`` and ``description``. An empty list is returned (after
        printing the status code) when the response status is not 200.

    Raises:
        Nothing is caught here: exceptions raised by ``requests.get`` or
        ``response.json()`` propagate to the caller.
    """
    url = f"{BASE_URL}{genre}.json?limit={max_books}"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Error accesing website: {response.status_code}")
        return []

    # Parse the JSON payload
    data = response.json()
    books = []
    for book in data.get("works", []):
        # The description is either a plain value or a dict wrapping the
        # text under "value"; unwrap the dict form.
        description = book.get("description", "No description")
        if isinstance(description, dict):
            description = description.get("value", "No description")

        # Author entries without a usable name fall back to "Unknown"
        # instead of raising KeyError / breaking str.join.
        author_names = ", ".join(
            author.get("name") or "Unknown"
            for author in book.get("authors", [])
        )

        books.append({
            "title": book.get("title", "Unknown"),
            "author": author_names,
            "year": book.get("first_publish_year", "Unknown"),
            "genre": genre,
            "description": description,
        })
    return books
30
+
31
# Example run: collect up to 100 works for the science-fiction subject
genre = "science_fiction"
books = scrape_books(genre=genre, max_books=100)

# Store the collected records as a UTF-8 CSV file named after the genre
df = pd.DataFrame(data=books)
df.to_csv(path_or_buf=f"books_{genre}.csv", encoding="utf-8", index=False)
print(f"Data saved in books_{genre}.csv")