# LightNovel / scraper.py
# Last change: "Update scraper.py" by aliSaac510 (commit fda0afb, verified)
from urllib.parse import quote_plus, urljoin

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with every request — presumably so the
# site does not reject the default requests UA as a bot; confirm.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
def _scrape_novel_list_page(page_url):
    """Fetch a novelfull.net list page and extract its novel entries.

    Works for every page that uses the standard list layout (search
    results, latest-release, hot, completed, most-popular, genre pages).

    Args:
        page_url: Absolute URL of the list page to scrape.

    Returns:
        A list of dicts with 'title', 'url', 'author' and 'image_url'
        keys. Empty list when the expected layout div is missing.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # Explicit timeout: requests has no default and would otherwise
    # hang forever on a stalled connection.
    response = requests.get(page_url, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')
    results = []
    main_content = soup.find('div', class_='col-truyen-main')
    if not main_content:
        return results
    for row in main_content.find_all('div', class_='row'):
        title_element = row.find('h3', class_='truyen-title')
        title_anchor = title_element.find('a') if title_element else None
        if not title_anchor:
            continue
        title = title_anchor.text.strip()
        href = title_anchor['href']
        author_element = row.find('span', class_='author')
        author = author_element.text.strip() if author_element else 'N/A'
        # The cover is usually in a sibling column of the title; take the
        # first <img> in the row. Lazy-loaded images keep the real URL in
        # data-src / data-cover instead of src, so check those in order.
        image_url = None
        img_tag = row.find('img')
        if img_tag:
            for attr in ('src', 'data-src', 'data-cover'):
                if attr in img_tag.attrs:
                    image_url = img_tag[attr]
                    break
        if image_url:
            # urljoin resolves any relative path against the site root
            # and leaves already-absolute URLs untouched (the previous
            # startswith('/') check missed both of those cases).
            image_url = urljoin("https://novelfull.net", image_url)
        else:
            # Placeholder so callers always get a renderable image URL.
            image_url = "https://placehold.co/200x300?text=No+Image"
        results.append({
            'title': title,
            'url': urljoin("https://novelfull.net", href),
            'author': author,
            'image_url': image_url,
        })
    return results
def search_novel(query, page=1):
    """Search novelfull.net for *query* and return matching novels.

    Args:
        query: Free-text search keyword.
        page: 1-based page number of the search results.

    Returns:
        List of novel dicts (see _scrape_novel_list_page).
    """
    # quote_plus encodes spaces as '+' and escapes &, =, # etc. —
    # interpolating the raw query broke the URL for such inputs.
    search_url = (
        f"https://novelfull.net/search?keyword={quote_plus(str(query))}&page={page}"
    )
    return _scrape_novel_list_page(search_url)
def get_latest_release_novels(page=1):
    """Return novels from the latest-release listing (1-based *page*)."""
    return _scrape_novel_list_page(
        f"https://novelfull.net/latest-release-novel?page={page}"
    )
def get_hot_novels(page=1):
    """Return novels from the hot-novel listing (1-based *page*)."""
    return _scrape_novel_list_page(
        f"https://novelfull.net/hot-novel?page={page}"
    )
def get_completed_novels(page=1):
    """Return novels from the completed-novel listing (1-based *page*)."""
    return _scrape_novel_list_page(
        f"https://novelfull.net/completed-novel?page={page}"
    )
def get_most_popular_novels(page=1):
    """Return novels from the most-popular listing (1-based *page*)."""
    return _scrape_novel_list_page(
        f"https://novelfull.net/most-popular?page={page}"
    )
def get_genres():
    """Return the static list of genre names available for browsing.

    No network request is made; the names feed get_novels_by_genre.
    """
    return (
        "Shounen|Horror|Slice of Life|Harem|Drama|Seinen|"
        "Comedy|Tragedy|Lolicon|Martial Arts|Supernatural|Adult|"
        "School Life|Ecchi|Josei|Mystery|Xuanhuan|Sports|"
        "Shoujo|Adventure|Smut|Romance|Action|Mecha|"
        "Sci-fi|Psychological|Yaoi|Gender Bender|Xianxia|Shounen Ai|"
        "Mature|Wuxia|Magical Realism|Fantasy|Historical|Video Games"
    ).split("|")
def get_novels_by_genre(genre_name, page=1):
    """Return novels in *genre_name* (spaces become '-' in the URL slug)."""
    slug = genre_name.replace(' ', '-')
    return _scrape_novel_list_page(
        f"https://novelfull.net/genre/{slug}?page={page}"
    )
def get_novel_content(url):
    """Scrape a novel's detail page: metadata plus its chapter list.

    Args:
        url: Absolute URL of the novel's page on novelfull.net.

    Returns:
        Dict with 'title', 'author', 'image_url' (each 'N/A' when not
        found) and 'chapters', a list of {'title', 'url'} dicts.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # Explicit timeout so a stalled connection cannot hang the caller.
    response = requests.get(url, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')

    title_element = soup.find('h3', class_='title')
    title = title_element.text.strip() if title_element else 'N/A'

    # Author name is the first <a> following the "Author:" heading in
    # the info box. 'string=' replaces bs4's deprecated 'text=' keyword.
    author = 'N/A'
    info_div = soup.find('div', class_='info')
    if info_div:
        author_heading = info_div.find('h3', string='Author:')
        if author_heading and author_heading.find_next_sibling('a'):
            author = author_heading.find_next_sibling('a').text

    image_url = 'N/A'
    book_div = soup.find('div', class_='book')
    image_tag = book_div.find('img') if book_div else None
    if image_tag and 'src' in image_tag.attrs:
        # urljoin resolves relative cover paths against the site root
        # and passes absolute URLs through unchanged.
        image_url = urljoin("https://novelfull.net", image_tag['src'])

    chapters = []
    list_chapter_div = soup.find('div', id='list-chapter')
    if list_chapter_div:
        # All anchors inside any .list-chapter column.
        for chapter_item in list_chapter_div.select('.list-chapter a'):
            chapters.append({
                # Prefer the title attribute (often the full chapter
                # name); fall back to the anchor text.
                'title': chapter_item.get('title', chapter_item.text.strip()),
                'url': urljoin("https://novelfull.net", chapter_item['href']),
            })

    return {
        'title': title,
        'author': author,
        'image_url': image_url,
        'chapters': chapters,
    }
def get_chapter_content(url):
    """Scrape the body text of a single chapter page.

    Args:
        url: Absolute URL of the chapter on novelfull.net.

    Returns:
        Dict with a single 'content' key holding the chapter text,
        or 'N/A' when no content was found.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # Explicit timeout: requests never times out by default, which
    # would hang the caller on a stalled connection.
    response = requests.get(url, headers=HEADERS, timeout=15)
    soup = BeautifulSoup(response.content, 'html.parser')
    content_element = soup.find('div', id='chapter-content')
    content = ''
    if content_element:
        paragraphs = content_element.find_all('p')
        if paragraphs:
            # Usual layout: one <p> per paragraph.
            content = '\n\n'.join(p.get_text() for p in paragraphs)
        else:
            # Fallback when the chapter body has no <p> tags.
            content = content_element.get_text(separator='\n\n')
    return {
        'content': content.strip() if content else 'N/A'
    }