Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| import logging | |
| import time | |
# --- Logging setup: everything goes to scraper.log, truncated on each run ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='scraper.log',
    filemode='w',
)

# Reading-level badges as they appear on the site (Arabic), mapped to the
# English level names used by the rest of the application.
ARABIC_TO_ENGLISH_LEVELS = {
    'مبتدئ': 'Beginner',
    'متوسّط': 'Intermediate',
    'متقدّم': 'Advanced',
    'متقن': 'Proficient',
}
def _absolutize(url: str, base_url: str) -> str:
    """Turn a scheme-relative ('//…') or site-relative ('/…') href into an absolute URL."""
    if url.startswith('//'):
        return f"https:{url}"
    if not url.startswith('http'):
        return f"{base_url}{url}"
    return url


def _parse_books_from_html(soup: BeautifulSoup) -> list:
    """Parse a listing page and extract book details.

    Args:
        soup: Parsed HTML of one paginated listing response.

    Returns:
        A list of dicts, one per book card, with keys 'title', 'url',
        'cover_image_url' (always a str, possibly empty) and 'level'
        (English level name, 'Unknown' when the badge is missing or
        unrecognised).
    """
    base_url = "https://3asafeer.com"
    books_on_page = []
    for card in soup.find_all("div", class_="resources-card"):
        # The card is wrapped in an <a>; without it there is no book link.
        link_tag = card.find_parent("a")
        if not link_tag:
            continue
        book_url = _absolutize(link_tag.get("href", '#'), base_url)
        title_tag = card.find("h3", class_="resources-card-title")
        book_title = title_tag.text.strip() if title_tag else "No Title Found"
        img_tag = card.find("img")
        # FIX: .get("src") can return None when the attribute is absent;
        # default to "" so cover_image_url is always a string, and
        # absolutize it the same way as the book URL.
        book_cover_url = img_tag.get("src", "") if img_tag else ""
        if book_cover_url:
            book_cover_url = _absolutize(book_cover_url, base_url)
        level_tag = card.find("span", class_="purple-badge")
        arabic_level = level_tag.text.strip() if level_tag else 'Unknown'
        api_level = ARABIC_TO_ENGLISH_LEVELS.get(arabic_level, 'Unknown')
        books_on_page.append({
            "title": book_title,
            "url": book_url,
            "cover_image_url": book_cover_url,
            "level": api_level,
        })
    return books_on_page
def scrape_books_by_level(level: str) -> list:
    """Scrape all books for a given level, handling pagination.

    Args:
        level: One of 'Beginner', 'Intermediate', 'Advanced', 'Proficient',
            or 'All' to scrape every level. Any other value (including a
            differently-cased one) returns an empty list.

    Returns:
        A list of book dicts as produced by _parse_books_from_html; empty
        on an unknown level or when every request fails.
    """
    allowed_levels = ['Beginner', 'Intermediate', 'Advanced', 'Proficient']
    if level != 'All' and level not in allowed_levels:
        # FIX: previously this silent-failed; log so bad callers are visible.
        logging.warning(f"scrape_books_by_level: rejected unknown level {level!r}")
        return []
    base_url = "https://3asafeer.com"
    api_url = f"{base_url}/caller"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': f'{base_url}/ar/free_books'
    }
    all_books = []
    levels_to_scrape = allowed_levels if level == 'All' else [level]
    # FIX: use the session as a context manager so its connection pool is
    # always released, and give every request a timeout so a dead server
    # cannot hang the caller forever.
    with requests.Session() as session:
        for current_level in levels_to_scrape:
            page_number = 0
            while True:
                params = {
                    'page': 'read',
                    'task': 'anonbookspaged',
                    'format': 'html',
                    'k': '',
                    'lvl': current_level,
                    'p': page_number,
                }
                try:
                    response = session.get(
                        api_url, params=params, headers=headers, timeout=15
                    )
                    response.raise_for_status()
                    # An empty body means we have run past the last page.
                    if not response.text.strip():
                        break
                    soup = BeautifulSoup(response.content, "html.parser")
                    books_from_page = _parse_books_from_html(soup)
                    if not books_from_page:
                        break
                    all_books.extend(books_from_page)
                    page_number += 1
                    time.sleep(0.2)  # be polite between paginated requests
                except requests.exceptions.RequestException as e:
                    logging.error(f"Request for level '{current_level}' failed: {e}")
                    break  # give up on this level, continue with the next
    return all_books
def _parse_slide(slide):
    """Extract one page dict (image, text, audio) from a reader slide div.

    Returns None when the slide has no 'order' attribute or no usable
    content at all. A non-numeric 'order' raises ValueError, which the
    caller's broad handler converts into a None result for the whole book.
    """
    order = slide.get('order')
    if order is None:
        return None
    # --- Image URL: prefer the webp <source>, fall back to plain <img> ---
    image_source_tag = slide.find('source', type='image/webp')
    image_url = image_source_tag.get('srcset') if image_source_tag else None
    if not image_url:
        img_tag = slide.find('img')
        image_url = img_tag.get('src') if img_tag else None
    # --- Text: visible div first, hidden input as fallback ---
    page_text = ""
    text_div = slide.find('div', id=f"script-{order}")
    # NOTE: the original also excluded ' ', but .strip() can never return
    # a bare space, so a plain truthiness check is equivalent.
    if text_div and text_div.text.strip():
        page_text = text_div.text.strip()
    else:
        text_input = slide.find('input', id=f"slide-script-{order}")
        if text_input:
            page_text = text_input.get('value', '').strip()
    # --- Audio URLs ---
    audio_urls = []
    audio_tag = slide.find('audio')
    if audio_tag:
        audio_urls = [
            source.get('src')
            for source in audio_tag.find_all('source')
            if source.get('src')
        ]
    # Skip slides that carry no content of any kind.
    if not (image_url or page_text or audio_urls):
        return None
    return {
        'page_number': int(order) + 1,  # site orders are 0-based
        'image_url': image_url,
        'text': page_text,
        'audio_urls': audio_urls,
    }


def scrape_book_content(book_url: str) -> dict:
    """Scrape title, publisher, and all pages (image, text, audio) of a single book.

    Args:
        book_url: Absolute URL of the book's reader page.

    Returns:
        A dict with keys 'title', 'publisher' (may be None), 'book_url' and
        'pages', or None when the fetch fails, the page has no reader
        container, or parsing raises unexpectedly.
    """
    logging.info(f"Starting final robust content scrape for URL: {book_url}")
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # FIX: added a timeout so an unresponsive server cannot hang the caller.
        response = requests.get(book_url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('h1', class_='viewtitle')
        title = title_tag.text.strip() if title_tag else "Title not found"
        publisher_tag = soup.find('div', id='publisher')
        publisher = (
            publisher_tag.text.replace('الناشر:', '').strip()
            if publisher_tag else None
        )
        page_container = soup.find('div', id='reel')
        if not page_container:
            return None
        pages_content = []
        for slide in page_container.find_all('div', class_='slide'):
            page = _parse_slide(slide)
            if page:
                pages_content.append(page)
        return {
            'title': title,
            'publisher': publisher,
            'book_url': book_url,
            'pages': pages_content,
        }
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to fetch book content from {book_url}: {e}")
        return None
    except Exception as e:
        # Broad catch is deliberate at this boundary: a malformed page must
        # degrade to None, not crash the caller.
        logging.error(f"An unexpected error occurred while parsing {book_url}: {e}")
        return None