# my-book-api / scraper.py
# Uploaded by aliSaac510 ("Create scraper.py", commit 486d43d, verified)
import logging
import time
from typing import Optional

import requests
from bs4 import BeautifulSoup
# Configure logging: INFO level, timestamped entries, written to scraper.log.
# NOTE: filemode='w' truncates the log file on every run — only the most
# recent run's output is kept.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='scraper.log',
    filemode='w'
)
# Mapping from the Arabic reading-level labels (as they appear in the
# site's level badges) to the English level names used by this API.
ARABIC_TO_ENGLISH_LEVELS = {
    'مبتدئ': 'Beginner',
    'متوسّط': 'Intermediate',
    'متقدّم': 'Advanced',
    'متقن': 'Proficient'
}
def _parse_books_from_html(soup: BeautifulSoup) -> list:
    """Extract book summaries from a parsed listing page.

    Each "resources-card" div (wrapped in an <a> link) yields one dict with
    keys: title, url, cover_image_url, level. Cards without a wrapping link
    are skipped.
    """
    site_root = "https://3asafeer.com"
    results = []
    for card in soup.find_all("div", class_="resources-card"):
        anchor = card.find_parent("a")
        if anchor is None:
            # A card outside a link has no destination — nothing to record.
            continue
        href = anchor.get("href", '#')
        # Normalize protocol-relative and site-relative hrefs to absolute URLs.
        if href.startswith('//'):
            href = f"https:{href}"
        elif not href.startswith('http'):
            href = f"{site_root}{href}"
        heading = card.find("h3", class_="resources-card-title")
        cover = card.find("img")
        badge = card.find("span", class_="purple-badge")
        arabic_label = badge.text.strip() if badge else 'Unknown'
        results.append({
            "title": heading.text.strip() if heading else "No Title Found",
            "url": href,
            "cover_image_url": cover.get("src") if cover else "",
            "level": ARABIC_TO_ENGLISH_LEVELS.get(arabic_label, 'Unknown'),
        })
    return results
def scrape_books_by_level(level: str) -> list:
    """Scrape all books for a given level from 3asafeer.com, handling pagination.

    Args:
        level: 'Beginner', 'Intermediate', 'Advanced', 'Proficient', or
            'All' to scrape every level in turn.

    Returns:
        A list of book dicts (title, url, cover_image_url, level). Empty
        when the level is unrecognized; partial results are returned if a
        request fails mid-scrape (the failure is logged).
    """
    allowed_levels = ['Beginner', 'Intermediate', 'Advanced', 'Proficient']
    if level != 'All' and level not in allowed_levels:
        return []
    base_url = "https://3asafeer.com"
    api_url = f"{base_url}/caller"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': f'{base_url}/ar/free_books'
    }
    all_books = []
    levels_to_scrape = allowed_levels if level == 'All' else [level]
    # Context manager ensures the session (and its pooled connections) is
    # closed even if an unexpected exception escapes.
    with requests.Session() as session:
        for current_level in levels_to_scrape:
            page_number = 0
            while True:
                params = {'page': 'read', 'task': 'anonbookspaged', 'format': 'html', 'k': '', 'lvl': current_level, 'p': page_number}
                try:
                    # timeout prevents a stalled server from hanging the
                    # scraper indefinitely; Timeout is a RequestException,
                    # so it is handled below.
                    response = session.get(api_url, params=params, headers=headers, timeout=15)
                    response.raise_for_status()
                    # An empty body or a page with no cards marks the end
                    # of pagination for this level.
                    if not response.text.strip():
                        break
                    soup = BeautifulSoup(response.content, "html.parser")
                    books_from_page = _parse_books_from_html(soup)
                    if not books_from_page:
                        break
                    all_books.extend(books_from_page)
                    page_number += 1
                    time.sleep(0.2)  # polite delay between page requests
                except requests.exceptions.RequestException as e:
                    logging.error(f"Request for level '{current_level}' failed: {e}")
                    break  # give up on this level, continue with the next
    return all_books
def scrape_book_content(book_url: str) -> Optional[dict]:
    """Scrape title, publisher, and all pages of a single book.

    Handles the site's differing HTML structures: page text may live in a
    visible ``script-N`` div or in a hidden ``slide-script-N`` input, and
    images may be provided via a webp <source> or a plain <img>.

    Args:
        book_url: Absolute URL of the book's reader page.

    Returns:
        A dict with 'title', 'publisher', 'book_url', and 'pages' (each page
        carrying page_number, image_url, text, audio_urls), or ``None`` if
        the request fails, the page container is missing, or parsing raises.
    """
    logging.info(f"Starting final robust content scrape for URL: {book_url}")
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # timeout keeps a stalled server from hanging the scraper forever;
        # Timeout is a RequestException, handled below.
        response = requests.get(book_url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('h1', class_='viewtitle')
        title = title_tag.text.strip() if title_tag else "Title not found"
        publisher_tag = soup.find('div', id='publisher')
        publisher = publisher_tag.text.replace('الناشر:', '').strip() if publisher_tag else None
        page_container = soup.find('div', id='reel')
        if not page_container:
            return None
        slides = page_container.find_all('div', class_='slide')
        pages_content = []
        for slide in slides:
            order = slide.get('order')
            if order is None:
                continue
            # A malformed (non-numeric) order should skip just this slide,
            # not abort the whole book via the broad except below.
            try:
                page_number = int(order) + 1
            except ValueError:
                logging.warning(f"Skipping slide with non-numeric order {order!r} in {book_url}")
                continue
            # --- Image URL: prefer the webp <source>, fall back to <img> ---
            image_source_tag = slide.find('source', type='image/webp')
            image_url = image_source_tag.get('srcset') if image_source_tag else None
            if not image_url:
                img_tag = slide.find('img')
                image_url = img_tag.get('src') if img_tag else None
            # --- Text: visible div first, hidden input as fallback ---
            page_text = ""
            text_div = slide.find('div', id=f"script-{order}")
            if text_div and text_div.text.strip() not in ['', ' ']:
                page_text = text_div.text.strip()
            else:
                text_input = slide.find('input', id=f"slide-script-{order}")
                if text_input:
                    page_text = text_input.get('value', '').strip()
            # --- Audio URLs: collect every <source src> under <audio> ---
            audio_urls = []
            audio_tag = slide.find('audio')
            if audio_tag:
                source_tags = audio_tag.find_all('source')
                audio_urls = [source.get('src') for source in source_tags if source.get('src')]
            # Drop slides that yielded nothing at all.
            if image_url or page_text or audio_urls:
                pages_content.append({
                    'page_number': page_number,
                    'image_url': image_url,
                    'text': page_text,
                    'audio_urls': audio_urls
                })
        return {
            'title': title,
            'publisher': publisher,
            'book_url': book_url,
            'pages': pages_content
        }
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to fetch book content from {book_url}: {e}")
        return None
    except Exception as e:
        logging.error(f"An unexpected error occurred while parsing {book_url}: {e}")
        return None