# LightNovel / scraper.py
# Last change: "Update scraper.py" by aliSaac510 (commit fda0afb, verified)
from urllib.parse import quote_plus, urljoin

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with every request — presumably so the
# site does not reject the default requests UA as a bot; confirm.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
def _scrape_novel_list_page(page_url):
    """Fetch a novelfull.net list page and extract its novel entries.

    Works for every page that uses the standard list layout (search
    results, latest-release, hot, completed, most-popular, genre pages).

    Args:
        page_url: Absolute URL of the list page to scrape.

    Returns:
        A list of dicts with 'title', 'url', 'author' and 'image_url'
        keys. Empty list when the expected layout div is missing.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # Explicit timeout: requests has no default and would otherwise
    # hang forever on a stalled connection.
    response = requests.get(page_url, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')
    results = []
    main_content = soup.find('div', class_='col-truyen-main')
    if not main_content:
        return results
    for row in main_content.find_all('div', class_='row'):
        title_element = row.find('h3', class_='truyen-title')
        title_anchor = title_element.find('a') if title_element else None
        if not title_anchor:
            continue
        title = title_anchor.text.strip()
        href = title_anchor['href']
        author_element = row.find('span', class_='author')
        author = author_element.text.strip() if author_element else 'N/A'
        # The cover is usually in a sibling column of the title; take the
        # first <img> in the row. Lazy-loaded images keep the real URL in
        # data-src / data-cover instead of src, so check those in order.
        image_url = None
        img_tag = row.find('img')
        if img_tag:
            for attr in ('src', 'data-src', 'data-cover'):
                if attr in img_tag.attrs:
                    image_url = img_tag[attr]
                    break
        if image_url:
            # urljoin resolves any relative path against the site root
            # and leaves already-absolute URLs untouched (the previous
            # startswith('/') check missed both of those cases).
            image_url = urljoin("https://novelfull.net", image_url)
        else:
            # Placeholder so callers always get a renderable image URL.
            image_url = "https://placehold.co/200x300?text=No+Image"
        results.append({
            'title': title,
            'url': urljoin("https://novelfull.net", href),
            'author': author,
            'image_url': image_url,
        })
    return results
def search_novel(query, page=1):
    """Search novelfull.net for *query* and return matching novels.

    Args:
        query: Free-text search keyword.
        page: 1-based page number of the search results.

    Returns:
        List of novel dicts (see _scrape_novel_list_page).
    """
    # quote_plus encodes spaces as '+' and escapes &, =, # etc. —
    # interpolating the raw query broke the URL for such inputs.
    search_url = (
        f"https://novelfull.net/search?keyword={quote_plus(str(query))}&page={page}"
    )
    return _scrape_novel_list_page(search_url)
def get_latest_release_novels(page=1):
    """Return novels from the latest-release listing (1-based *page*)."""
    return _scrape_novel_list_page(
        f"https://novelfull.net/latest-release-novel?page={page}"
    )
def get_hot_novels(page=1):
    """Return novels from the hot-novel listing (1-based *page*)."""
    return _scrape_novel_list_page(
        f"https://novelfull.net/hot-novel?page={page}"
    )
def get_completed_novels(page=1):
    """Return novels from the completed-novel listing (1-based *page*)."""
    return _scrape_novel_list_page(
        f"https://novelfull.net/completed-novel?page={page}"
    )
def get_most_popular_novels(page=1):
    """Return novels from the most-popular listing (1-based *page*)."""
    return _scrape_novel_list_page(
        f"https://novelfull.net/most-popular?page={page}"
    )
def get_genres():
    """Return the static list of genre names available for browsing.

    No network request is made; the names feed get_novels_by_genre.
    """
    return (
        "Shounen|Horror|Slice of Life|Harem|Drama|Seinen|"
        "Comedy|Tragedy|Lolicon|Martial Arts|Supernatural|Adult|"
        "School Life|Ecchi|Josei|Mystery|Xuanhuan|Sports|"
        "Shoujo|Adventure|Smut|Romance|Action|Mecha|"
        "Sci-fi|Psychological|Yaoi|Gender Bender|Xianxia|Shounen Ai|"
        "Mature|Wuxia|Magical Realism|Fantasy|Historical|Video Games"
    ).split("|")
def get_novels_by_genre(genre_name, page=1):
    """Return novels in *genre_name* (spaces become '-' in the URL slug)."""
    slug = genre_name.replace(' ', '-')
    return _scrape_novel_list_page(
        f"https://novelfull.net/genre/{slug}?page={page}"
    )
def get_novel_content(url):
    """Scrape a novel's detail page: metadata plus its chapter list.

    Args:
        url: Absolute URL of the novel's page on novelfull.net.

    Returns:
        Dict with 'title', 'author', 'image_url' (each 'N/A' when not
        found) and 'chapters', a list of {'title', 'url'} dicts.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # Explicit timeout so a stalled connection cannot hang the caller.
    response = requests.get(url, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')

    title_element = soup.find('h3', class_='title')
    title = title_element.text.strip() if title_element else 'N/A'

    # Author name is the first <a> following the "Author:" heading in
    # the info box. 'string=' replaces bs4's deprecated 'text=' keyword.
    author = 'N/A'
    info_div = soup.find('div', class_='info')
    if info_div:
        author_heading = info_div.find('h3', string='Author:')
        if author_heading and author_heading.find_next_sibling('a'):
            author = author_heading.find_next_sibling('a').text

    image_url = 'N/A'
    book_div = soup.find('div', class_='book')
    image_tag = book_div.find('img') if book_div else None
    if image_tag and 'src' in image_tag.attrs:
        # urljoin resolves relative cover paths against the site root
        # and passes absolute URLs through unchanged.
        image_url = urljoin("https://novelfull.net", image_tag['src'])

    chapters = []
    list_chapter_div = soup.find('div', id='list-chapter')
    if list_chapter_div:
        # All anchors inside any .list-chapter column.
        for chapter_item in list_chapter_div.select('.list-chapter a'):
            chapters.append({
                # Prefer the title attribute (often the full chapter
                # name); fall back to the anchor text.
                'title': chapter_item.get('title', chapter_item.text.strip()),
                'url': urljoin("https://novelfull.net", chapter_item['href']),
            })

    return {
        'title': title,
        'author': author,
        'image_url': image_url,
        'chapters': chapters,
    }
def get_chapter_content(url):
    """Scrape the body text of a single chapter page.

    Args:
        url: Absolute URL of the chapter on novelfull.net.

    Returns:
        Dict with a single 'content' key holding the chapter text,
        or 'N/A' when no content was found.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # Explicit timeout: requests never times out by default, which
    # would hang the caller on a stalled connection.
    response = requests.get(url, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')
    content_element = soup.find('div', id='chapter-content')
    content = ''
    if content_element:
        paragraphs = content_element.find_all('p')
        if paragraphs:
            # Usual layout: one <p> per paragraph.
            content = '\n\n'.join(p.get_text() for p in paragraphs)
        else:
            # Fallback when the chapter body has no <p> tags.
            content = content_element.get_text(separator='\n\n')
    return {
        'content': content.strip() if content else 'N/A'
    }