# Extraction residue from the hosting page (not part of the module):
# "Spaces: Sleeping", "File size: 7,608 Bytes", followed by a commit-hash and
# line-number gutter that had been fused onto the first import line.
import aiohttp
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
class NovelCoolScraper:
def __init__(self):
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
async def scrape_chapter(self, url: str):
async with aiohttp.ClientSession() as session:
async with session.get(url, headers=self.headers) as response:
if response.status != 200:
raise Exception(f"Failed to fetch page: {response.status}")
html = await response.text()
# NovelCool pages can be large; lxml parser is more reliable here.
soup = BeautifulSoup(html, 'lxml')
# Extract Title
title = "Unknown Chapter"
title_tag = soup.find('h1')
if title_tag:
title = title_tag.get_text(strip=True)
else:
page_title = soup.find('title')
if page_title:
t = page_title.get_text(strip=True)
# e.g. "Shadow Slave Chapter 15 - Novel Cool - Best online light novel reading website"
title = t.split(' - Novel Cool', 1)[0].strip() or t
# Extract Content
# In the HTML variant commonly returned to scripted clients, the actual
# chapter content lives under: div.site-content > div.overflow-hidden
content_div = soup.select_one('div.site-content div.overflow-hidden')
if not content_div:
# Fallback: pick the div with the most <p> tags.
best = None
best_count = 0
for div in soup.find_all('div'):
ps = div.find_all('p')
if len(ps) > best_count:
best_count = len(ps)
best = div
content_div = best
if not content_div:
raise Exception("Could not find chapter content container")
paragraphs = []
for p in content_div.find_all('p'):
classes = p.get('class') or []
txt = p.get_text(' ', strip=True)
if not txt:
continue
if 'chapter-end-mark' in classes or txt.lower().strip() == 'chapter end':
break
paragraphs.append(txt)
if not paragraphs:
raw_text = content_div.get_text(separator='\n', strip=True)
paragraphs = [line for line in raw_text.split('\n') if line.strip()]
content = "\n".join(paragraphs)
# Extract Next/Prev Links
next_link = None
prev_link = None
for a in soup.find_all('a', href=True):
t = a.get_text(" ", strip=True)
href = a.get('href')
if not href:
continue
if '/chapter/' not in href:
continue
if not next_link and 'Next' in t:
next_link = href
if not prev_link and 'Prev' in t:
prev_link = href
if next_link and prev_link:
break
if next_link:
next_link = urljoin(url, next_link)
if prev_link:
prev_link = urljoin(url, prev_link)
return {
"title": title,
"content": paragraphs, # Return list of paragraphs for easier chunking
"next_url": next_link,
"prev_url": prev_link
}
async def scrape_novel_index(self, novel_url: str):
"""Scrape a NovelCool novel page and return a list of chapter links."""
async with aiohttp.ClientSession() as session:
async with session.get(novel_url, headers=self.headers) as response:
if response.status != 200:
raise Exception(f"Failed to fetch page: {response.status}")
html = await response.text()
soup = BeautifulSoup(html, 'lxml')
links = []
seen = set()
def parse_chapter_number(title: str, url: str) -> int | None:
t = (title or '').strip()
# Best-effort chapter number parsing from visible text.
m = re.search(r"(?:\bChapter\b|\bCh\.?\b|\bC\b)\s*(\d+)", t, flags=re.IGNORECASE)
if m:
try:
n = int(m.group(1))
return n if n > 0 else None
except Exception:
pass
# Fallback: parse from URL, e.g.
# /chapter/<Novel>-Chapter-15/<id>/ or .../Chapter_15/... etc.
u = (url or '')
m = re.search(r"(?:chapter|ch)[^0-9]{0,12}(\d+)", u, flags=re.IGNORECASE)
if m:
try:
n = int(m.group(1))
return n if n > 0 else None
except Exception:
pass
return None
for a in soup.find_all('a', href=True):
href = a.get('href')
if not href:
continue
if '/chapter/' not in href:
continue
abs_url = urljoin(novel_url, href)
if abs_url in seen:
continue
title = a.get_text(' ', strip=True)
if not title:
# Some chapter links have empty text (icons). Skip but do NOT
# mark as seen — the real link with text may appear later.
continue
seen.add(abs_url)
n = parse_chapter_number(title, abs_url)
links.append({"n": n, "title": title, "url": abs_url})
# Sort by chapter number when possible, but preserve stable ordering
# for unknowns (avoid pushing an unparsed Chapter 1 to the end).
def chapter_key(item):
n = item.get('n')
if isinstance(n, int):
return (0, n)
return (1, 0)
links.sort(key=chapter_key)
return links
async def scrape_novel_details(self, novel_url: str):
"""Scrape a NovelCool novel page and return lightweight metadata.
Currently returns:
- title: best-effort title
- cover_url: absolute URL to the cover image, when detectable
"""
async with aiohttp.ClientSession() as session:
async with session.get(novel_url, headers=self.headers) as response:
if response.status != 200:
raise Exception(f"Failed to fetch page: {response.status}")
html = await response.text()
soup = BeautifulSoup(html, 'lxml')
title = None
t = soup.find('title')
if t:
raw = t.get_text(strip=True)
if raw:
title = raw.split(' - Novel Cool', 1)[0].strip() or raw
cover_url = None
img = soup.select_one('img.bookinfo-pic-img')
if not img:
img = soup.select_one('img[itemprop="image"]')
if img:
src = img.get('src')
if src:
cover_url = urljoin(novel_url, src)
return {
"title": title,
"cover_url": cover_url,
}
if __name__ == "__main__":
    import asyncio

    # Manual smoke test: fetch one known chapter and print a short summary.
    # Any failure (network, parsing, missing key) is reported, not raised.
    sample_url = "https://www.novelcool.com/chapter/Shadow-Slave-Chapter-15/7332162/"
    client = NovelCoolScraper()
    try:
        chapter = asyncio.run(client.scrape_chapter(sample_url))
        print(f"Title: {chapter['title']}")
        print(f"Paragraphs: {len(chapter['content'])}")
        print(f"Next: {chapter['next_url']}")
    except Exception as exc:
        print(f"Error: {exc}")
# (A trailing "|" table delimiter left by the page extraction was removed here.)