Spaces:

shreyas-joshi
/

CoreReader

Sleeping

App Files Files Community

CoreReader / backend /scraper.py

shreyas-joshi

Sync backend with LN-TTS: fix Chapter 1 scraper bug, add sentence offsets, improve error handling, update README

ccb0e29 3 months ago

raw

history blame contribute delete

7.61 kB

	import aiohttp
	from bs4 import BeautifulSoup
	import re
	from urllib.parse import urljoin


	class NovelCoolScraper:
	def __init__(self):
	self.headers = {
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
	}

	async def scrape_chapter(self, url: str):
	async with aiohttp.ClientSession() as session:
	async with session.get(url, headers=self.headers) as response:
	if response.status != 200:
	raise Exception(f"Failed to fetch page: {response.status}")
	html = await response.text()

	# NovelCool pages can be large; lxml parser is more reliable here.
	soup = BeautifulSoup(html, 'lxml')

	# Extract Title
	title = "Unknown Chapter"
	title_tag = soup.find('h1')
	if title_tag:
	title = title_tag.get_text(strip=True)
	else:
	page_title = soup.find('title')
	if page_title:
	t = page_title.get_text(strip=True)
	# e.g. "Shadow Slave Chapter 15 - Novel Cool - Best online light novel reading website"
	title = t.split(' - Novel Cool', 1)[0].strip() or t

	# Extract Content
	# In the HTML variant commonly returned to scripted clients, the actual
	# chapter content lives under: div.site-content > div.overflow-hidden
	content_div = soup.select_one('div.site-content div.overflow-hidden')

	if not content_div:
	# Fallback: pick the div with the most <p> tags.
	best = None
	best_count = 0
	for div in soup.find_all('div'):
	ps = div.find_all('p')
	if len(ps) > best_count:
	best_count = len(ps)
	best = div
	content_div = best

	if not content_div:
	raise Exception("Could not find chapter content container")

	paragraphs = []
	for p in content_div.find_all('p'):
	classes = p.get('class') or []
	txt = p.get_text(' ', strip=True)
	if not txt:
	continue
	if 'chapter-end-mark' in classes or txt.lower().strip() == 'chapter end':
	break
	paragraphs.append(txt)

	if not paragraphs:
	raw_text = content_div.get_text(separator='\n', strip=True)
	paragraphs = [line for line in raw_text.split('\n') if line.strip()]

	content = "\n".join(paragraphs)

	# Extract Next/Prev Links
	next_link = None
	prev_link = None

	for a in soup.find_all('a', href=True):
	t = a.get_text(" ", strip=True)
	href = a.get('href')
	if not href:
	continue
	if '/chapter/' not in href:
	continue
	if not next_link and 'Next' in t:
	next_link = href
	if not prev_link and 'Prev' in t:
	prev_link = href
	if next_link and prev_link:
	break

	if next_link:
	next_link = urljoin(url, next_link)
	if prev_link:
	prev_link = urljoin(url, prev_link)

	return {
	"title": title,
	"content": paragraphs, # Return list of paragraphs for easier chunking
	"next_url": next_link,
	"prev_url": prev_link
	}

	async def scrape_novel_index(self, novel_url: str):
	"""Scrape a NovelCool novel page and return a list of chapter links."""
	async with aiohttp.ClientSession() as session:
	async with session.get(novel_url, headers=self.headers) as response:
	if response.status != 200:
	raise Exception(f"Failed to fetch page: {response.status}")
	html = await response.text()

	soup = BeautifulSoup(html, 'lxml')
	links = []
	seen = set()

	def parse_chapter_number(title: str, url: str) -> int \| None:
	t = (title or '').strip()
	# Best-effort chapter number parsing from visible text.
	m = re.search(r"(?:\bChapter\b\|\bCh\.?\b\|\bC\b)\s*(\d+)", t, flags=re.IGNORECASE)
	if m:
	try:
	n = int(m.group(1))
	return n if n > 0 else None
	except Exception:
	pass

	# Fallback: parse from URL, e.g.
	# /chapter/<Novel>-Chapter-15/<id>/ or .../Chapter_15/... etc.
	u = (url or '')
	m = re.search(r"(?:chapter\|ch)[^0-9]{0,12}(\d+)", u, flags=re.IGNORECASE)
	if m:
	try:
	n = int(m.group(1))
	return n if n > 0 else None
	except Exception:
	pass
	return None

	for a in soup.find_all('a', href=True):
	href = a.get('href')
	if not href:
	continue
	if '/chapter/' not in href:
	continue
	abs_url = urljoin(novel_url, href)
	if abs_url in seen:
	continue
	title = a.get_text(' ', strip=True)
	if not title:
	# Some chapter links have empty text (icons). Skip but do NOT
	# mark as seen — the real link with text may appear later.
	continue
	seen.add(abs_url)
	n = parse_chapter_number(title, abs_url)
	links.append({"n": n, "title": title, "url": abs_url})

	# Sort by chapter number when possible, but preserve stable ordering
	# for unknowns (avoid pushing an unparsed Chapter 1 to the end).
	def chapter_key(item):
	n = item.get('n')
	if isinstance(n, int):
	return (0, n)
	return (1, 0)

	links.sort(key=chapter_key)
	return links

	async def scrape_novel_details(self, novel_url: str):
	"""Scrape a NovelCool novel page and return lightweight metadata.

	Currently returns:
	- title: best-effort title
	- cover_url: absolute URL to the cover image, when detectable
	"""
	async with aiohttp.ClientSession() as session:
	async with session.get(novel_url, headers=self.headers) as response:
	if response.status != 200:
	raise Exception(f"Failed to fetch page: {response.status}")
	html = await response.text()

	soup = BeautifulSoup(html, 'lxml')

	title = None
	t = soup.find('title')
	if t:
	raw = t.get_text(strip=True)
	if raw:
	title = raw.split(' - Novel Cool', 1)[0].strip() or raw

	cover_url = None
	img = soup.select_one('img.bookinfo-pic-img')
	if not img:
	img = soup.select_one('img[itemprop="image"]')
	if img:
	src = img.get('src')
	if src:
	cover_url = urljoin(novel_url, src)

	return {
	"title": title,
	"cover_url": cover_url,
	}

	if __name__ == "__main__":
	    # Manual smoke test: fetch one known chapter and print a short summary.
	    import asyncio

	    # Test with user provided URL
	    sample_url = "https://www.novelcool.com/chapter/Shadow-Slave-Chapter-15/7332162/"
	    chapter_scraper = NovelCoolScraper()
	    try:
	        chapter = asyncio.run(chapter_scraper.scrape_chapter(sample_url))
	        print(f"Title: {chapter['title']}")
	        print(f"Paragraphs: {len(chapter['content'])}")
	        print(f"Next: {chapter['next_url']}")
	    except Exception as err:
	        # Top-level boundary of the demo: report any failure instead of a traceback.
	        print(f"Error: {err}")