# my-book-api / scraper.py
# Uploaded by aliSaac510 ("Create scraper.py", commit 486d43d, verified)
import logging
import time
from typing import Optional

import requests
from bs4 import BeautifulSoup
# Configure logging: INFO level, timestamped entries, written to scraper.log.
# NOTE: filemode='w' truncates the log file on every run — only the most
# recent run's output is kept.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='scraper.log',
    filemode='w'
)
# Mapping from the Arabic reading-level labels (as they appear in the
# site's level badges) to the English level names used by this API.
ARABIC_TO_ENGLISH_LEVELS = {
    'مبتدئ': 'Beginner',
    'متوسّط': 'Intermediate',
    'متقدّم': 'Advanced',
    'متقن': 'Proficient'
}
def _parse_books_from_html(soup: BeautifulSoup) -> list:
    """Extract book summaries from a parsed listing page.

    Each "resources-card" div (wrapped in an <a> link) yields one dict with
    keys: title, url, cover_image_url, level. Cards without a wrapping link
    are skipped.
    """
    site_root = "https://3asafeer.com"
    results = []
    for card in soup.find_all("div", class_="resources-card"):
        anchor = card.find_parent("a")
        if anchor is None:
            # A card outside a link has no destination — nothing to record.
            continue
        href = anchor.get("href", '#')
        # Normalize protocol-relative and site-relative hrefs to absolute URLs.
        if href.startswith('//'):
            href = f"https:{href}"
        elif not href.startswith('http'):
            href = f"{site_root}{href}"
        heading = card.find("h3", class_="resources-card-title")
        cover = card.find("img")
        badge = card.find("span", class_="purple-badge")
        arabic_label = badge.text.strip() if badge else 'Unknown'
        results.append({
            "title": heading.text.strip() if heading else "No Title Found",
            "url": href,
            "cover_image_url": cover.get("src") if cover else "",
            "level": ARABIC_TO_ENGLISH_LEVELS.get(arabic_label, 'Unknown'),
        })
    return results
def scrape_books_by_level(level: str) -> list:
    """Scrape all books for a given level from 3asafeer.com, handling pagination.

    Args:
        level: 'Beginner', 'Intermediate', 'Advanced', 'Proficient', or
            'All' to scrape every level in turn.

    Returns:
        A list of book dicts (title, url, cover_image_url, level). Empty
        when the level is unrecognized; partial results are returned if a
        request fails mid-scrape (the failure is logged).
    """
    allowed_levels = ['Beginner', 'Intermediate', 'Advanced', 'Proficient']
    if level != 'All' and level not in allowed_levels:
        return []
    base_url = "https://3asafeer.com"
    api_url = f"{base_url}/caller"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': f'{base_url}/ar/free_books'
    }
    all_books = []
    levels_to_scrape = allowed_levels if level == 'All' else [level]
    # Context manager ensures the session (and its pooled connections) is
    # closed even if an unexpected exception escapes.
    with requests.Session() as session:
        for current_level in levels_to_scrape:
            page_number = 0
            while True:
                params = {'page': 'read', 'task': 'anonbookspaged', 'format': 'html', 'k': '', 'lvl': current_level, 'p': page_number}
                try:
                    # timeout prevents a stalled server from hanging the
                    # scraper indefinitely; Timeout is a RequestException,
                    # so it is handled below.
                    response = session.get(api_url, params=params, headers=headers, timeout=15)
                    response.raise_for_status()
                    # An empty body or a page with no cards marks the end
                    # of pagination for this level.
                    if not response.text.strip():
                        break
                    soup = BeautifulSoup(response.content, "html.parser")
                    books_from_page = _parse_books_from_html(soup)
                    if not books_from_page:
                        break
                    all_books.extend(books_from_page)
                    page_number += 1
                    time.sleep(0.2)  # polite delay between page requests
                except requests.exceptions.RequestException as e:
                    logging.error(f"Request for level '{current_level}' failed: {e}")
                    break  # give up on this level, continue with the next
    return all_books
def scrape_book_content(book_url: str) -> Optional[dict]:
    """Scrape title, publisher, and all pages of a single book.

    Handles the site's differing HTML structures: page text may live in a
    visible ``script-N`` div or in a hidden ``slide-script-N`` input, and
    images may be provided via a webp <source> or a plain <img>.

    Args:
        book_url: Absolute URL of the book's reader page.

    Returns:
        A dict with 'title', 'publisher', 'book_url', and 'pages' (each page
        carrying page_number, image_url, text, audio_urls), or ``None`` if
        the request fails, the page container is missing, or parsing raises.
    """
    logging.info(f"Starting final robust content scrape for URL: {book_url}")
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # timeout keeps a stalled server from hanging the scraper forever;
        # Timeout is a RequestException, handled below.
        response = requests.get(book_url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('h1', class_='viewtitle')
        title = title_tag.text.strip() if title_tag else "Title not found"
        publisher_tag = soup.find('div', id='publisher')
        publisher = publisher_tag.text.replace('الناشر:', '').strip() if publisher_tag else None
        page_container = soup.find('div', id='reel')
        if not page_container:
            return None
        slides = page_container.find_all('div', class_='slide')
        pages_content = []
        for slide in slides:
            order = slide.get('order')
            if order is None:
                continue
            # A malformed (non-numeric) order should skip just this slide,
            # not abort the whole book via the broad except below.
            try:
                page_number = int(order) + 1
            except ValueError:
                logging.warning(f"Skipping slide with non-numeric order {order!r} in {book_url}")
                continue
            # --- Image URL: prefer the webp <source>, fall back to <img> ---
            image_source_tag = slide.find('source', type='image/webp')
            image_url = image_source_tag.get('srcset') if image_source_tag else None
            if not image_url:
                img_tag = slide.find('img')
                image_url = img_tag.get('src') if img_tag else None
            # --- Text: visible div first, hidden input as fallback ---
            page_text = ""
            text_div = slide.find('div', id=f"script-{order}")
            if text_div and text_div.text.strip() not in ['', ' ']:
                page_text = text_div.text.strip()
            else:
                text_input = slide.find('input', id=f"slide-script-{order}")
                if text_input:
                    page_text = text_input.get('value', '').strip()
            # --- Audio URLs: collect every <source src> under <audio> ---
            audio_urls = []
            audio_tag = slide.find('audio')
            if audio_tag:
                source_tags = audio_tag.find_all('source')
                audio_urls = [source.get('src') for source in source_tags if source.get('src')]
            # Drop slides that yielded nothing at all.
            if image_url or page_text or audio_urls:
                pages_content.append({
                    'page_number': page_number,
                    'image_url': image_url,
                    'text': page_text,
                    'audio_urls': audio_urls
                })
        return {
            'title': title,
            'publisher': publisher,
            'book_url': book_url,
            'pages': pages_content
        }
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to fetch book content from {book_url}: {e}")
        return None
    except Exception as e:
        logging.error(f"An unexpected error occurred while parsing {book_url}: {e}")
        return None