Spaces:
Sleeping
Sleeping
Sync backend with LN-TTS: fix Chapter 1 scraper bug, add sentence offsets, improve error handling, update README
ccb0e29 | import aiohttp | |
| from bs4 import BeautifulSoup | |
| import re | |
| from urllib.parse import urljoin | |
class NovelCoolScraper:
    """Async scraper for NovelCool chapter, index and detail pages.

    Each public coroutine downloads a single page with its own
    ``aiohttp.ClientSession`` and parses it with BeautifulSoup (``lxml``).
    A non-200 response raises ``Exception("Failed to fetch page: <status>")``.
    """

    # Text that follows the real title inside the page <title> element, e.g.
    # "Shadow Slave Chapter 15 - Novel Cool - Best online light novel reading website".
    _SITE_TITLE_SUFFIX = ' - Novel Cool'

    # Chapter number in visible link text: "Chapter 15", "Ch 15", "Ch.15",
    # "Ch. 15", "C 15". The optional dot sits *after* the word boundary so
    # that "Ch. 15" (dot followed by a space) is recognised as well.
    _TITLE_CHAPTER_RE = re.compile(
        r"(?:\bChapter\b|\bCh\b\.?|\bC\b)\s*(\d+)", flags=re.IGNORECASE
    )

    # Chapter number in a URL. The strict pattern may not cross a "/", so the
    # leading "/chapter/" path segment cannot be glued to a number that merely
    # starts the slug ("/chapter/86-Chapter-3/" must give 3, not 86).
    _URL_CHAPTER_STRICT_RE = re.compile(
        r"(?:chapter|ch)[^0-9/]{0,12}(\d+)", flags=re.IGNORECASE
    )
    # Loose pattern, used only when the strict one finds nothing, so URLs
    # such as "/chapter/15/" still yield a number.
    _URL_CHAPTER_LOOSE_RE = re.compile(
        r"(?:chapter|ch)[^0-9]{0,12}(\d+)", flags=re.IGNORECASE
    )

    def __init__(self):
        """Set the request headers (a desktop browser User-Agent)."""
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    async def _fetch_soup(self, url: str) -> "BeautifulSoup":
        """Download *url* and return the parsed document.

        Raises:
            Exception: if the HTTP status is not 200.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=self.headers) as response:
                if response.status != 200:
                    raise Exception(f"Failed to fetch page: {response.status}")
                html = await response.text()
        # NovelCool pages can be large; lxml parser is more reliable here.
        return BeautifulSoup(html, 'lxml')

    @classmethod
    def _strip_site_suffix(cls, raw: str) -> str:
        """Return *raw* without the " - Novel Cool ..." tail.

        Falls back to *raw* unchanged when stripping would leave nothing.
        """
        return raw.split(cls._SITE_TITLE_SUFFIX, 1)[0].strip() or raw

    @classmethod
    def _parse_chapter_number(cls, title: str, url: str) -> "int | None":
        """Best-effort chapter number from link text, then from the URL.

        The first pattern that matches decides the result; a matched number
        that is not positive yields ``None`` (no further pattern is tried).
        """
        text = (title or '').strip()
        link = url or ''
        candidates = (
            (cls._TITLE_CHAPTER_RE, text),
            # e.g. /chapter/<Novel>-Chapter-15/<id>/ or .../Chapter_15/...
            (cls._URL_CHAPTER_STRICT_RE, link),
            (cls._URL_CHAPTER_LOOSE_RE, link),
        )
        for pattern, haystack in candidates:
            match = pattern.search(haystack)
            if match:
                # The group is \d+, so int() cannot fail here.
                number = int(match.group(1))
                return number if number > 0 else None
        return None

    @staticmethod
    def _chapter_sort_key(item):
        """Sort key: numbered chapters ascending, then unnumbered ones.

        All unnumbered entries share one key, so a stable sort keeps them in
        their original page order, after every numbered entry.
        """
        n = item.get('n')
        if isinstance(n, int):
            return (0, n)
        return (1, 0)

    @classmethod
    def _extract_chapter_title(cls, soup) -> str:
        """Return the first <h1> text, else the cleaned <title>, else a default."""
        heading = soup.find('h1')
        if heading:
            return heading.get_text(strip=True)
        page_title = soup.find('title')
        if page_title:
            return cls._strip_site_suffix(page_title.get_text(strip=True))
        return "Unknown Chapter"

    @staticmethod
    def _find_content_container(soup):
        """Locate the element holding the chapter text, or ``None``.

        In the HTML variant commonly returned to scripted clients, the actual
        chapter content lives under: div.site-content > div.overflow-hidden
        """
        container = soup.select_one('div.site-content div.overflow-hidden')
        if container is not None:
            return container
        # Fallback: pick the div with the most <p> tags. Descendant <p> tags
        # are counted and only a strictly larger count replaces the current
        # best, so among nested divs with equal counts the outermost wins.
        best = None
        best_count = 0
        for div in soup.find_all('div'):
            count = len(div.find_all('p'))
            if count > best_count:
                best, best_count = div, count
        return best

    @staticmethod
    def _extract_paragraphs(content_div) -> "list[str]":
        """Collect non-empty paragraph texts up to the chapter-end marker.

        When no <p> yields text, every non-blank line of the container's
        text is returned instead.
        """
        paragraphs = []
        for p in content_div.find_all('p'):
            classes = p.get('class') or []
            # Test the marker class before the empty-text skip so that an
            # empty end-marker paragraph still terminates the chapter.
            if 'chapter-end-mark' in classes:
                break
            text = p.get_text(' ', strip=True)
            if not text:
                continue
            if text.lower().strip() == 'chapter end':
                break
            paragraphs.append(text)
        if paragraphs:
            return paragraphs
        raw_text = content_div.get_text(separator='\n', strip=True)
        return [line for line in raw_text.split('\n') if line.strip()]

    @staticmethod
    def _extract_nav_links(soup, base_url: str):
        """Return ``(next_url, prev_url)`` as absolute URLs or ``None``.

        Only anchors whose href contains "/chapter/" are considered; the
        first one whose text contains "Next" / "Prev" is used for each.
        """
        next_link = None
        prev_link = None
        for a in soup.find_all('a', href=True):
            href = a.get('href')
            if not href or '/chapter/' not in href:
                continue
            label = a.get_text(" ", strip=True)
            if not next_link and 'Next' in label:
                next_link = href
            if not prev_link and 'Prev' in label:
                prev_link = href
            if next_link and prev_link:
                break
        return (
            urljoin(base_url, next_link) if next_link else None,
            urljoin(base_url, prev_link) if prev_link else None,
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def scrape_chapter(self, url: str):
        """Scrape one chapter page.

        Returns:
            dict with keys ``title`` (str), ``content`` (list of paragraph
            strings), ``next_url`` and ``prev_url`` (absolute URL or None).

        Raises:
            Exception: on a non-200 response, or when no content container
                can be found in the page.
        """
        soup = await self._fetch_soup(url)
        title = self._extract_chapter_title(soup)
        content_div = self._find_content_container(soup)
        if content_div is None:
            raise Exception("Could not find chapter content container")
        paragraphs = self._extract_paragraphs(content_div)
        next_link, prev_link = self._extract_nav_links(soup, url)
        return {
            "title": title,
            "content": paragraphs,  # Return list of paragraphs for easier chunking
            "next_url": next_link,
            "prev_url": prev_link
        }

    async def scrape_novel_index(self, novel_url: str):
        """Scrape a NovelCool novel page and return a list of chapter links.

        Each entry is ``{"n": int | None, "title": str, "url": str}`` with an
        absolute, de-duplicated URL. Entries are sorted by chapter number;
        entries without a parseable number come last, in page order.

        Raises:
            Exception: on a non-200 response.
        """
        soup = await self._fetch_soup(novel_url)
        links = []
        seen = set()
        for a in soup.find_all('a', href=True):
            href = a.get('href')
            if not href or '/chapter/' not in href:
                continue
            abs_url = urljoin(novel_url, href)
            if abs_url in seen:
                continue
            title = a.get_text(' ', strip=True)
            if not title:
                # Some chapter links have empty text (icons). Skip but do NOT
                # mark as seen — the real link with text may appear later.
                continue
            seen.add(abs_url)
            n = self._parse_chapter_number(title, abs_url)
            links.append({"n": n, "title": title, "url": abs_url})
        links.sort(key=self._chapter_sort_key)
        return links

    async def scrape_novel_details(self, novel_url: str):
        """Scrape a NovelCool novel page and return lightweight metadata.

        Returns:
            dict with
            - title: page <title> without the site suffix, or None
            - cover_url: absolute URL to the cover image, when detectable

        Raises:
            Exception: on a non-200 response.
        """
        soup = await self._fetch_soup(novel_url)

        title = None
        title_tag = soup.find('title')
        if title_tag:
            raw = title_tag.get_text(strip=True)
            if raw:
                title = self._strip_site_suffix(raw)

        cover_url = None
        img = soup.select_one('img.bookinfo-pic-img')
        if not img:
            img = soup.select_one('img[itemprop="image"]')
        if img:
            src = img.get('src')
            if src:
                cover_url = urljoin(novel_url, src)

        return {
            "title": title,
            "cover_url": cover_url,
        }
if __name__ == "__main__":
    import asyncio

    # Manual smoke test: fetch one known chapter and print a short summary.
    # Any failure (network, parsing, missing key) is reported on one line.
    sample_url = "https://www.novelcool.com/chapter/Shadow-Slave-Chapter-15/7332162/"
    client = NovelCoolScraper()
    try:
        chapter = asyncio.run(client.scrape_chapter(sample_url))
        print(f"Title: {chapter['title']}")
        print(f"Paragraphs: {len(chapter['content'])}")
        print(f"Next: {chapter['next_url']}")
    except Exception as exc:
        print(f"Error: {exc}")