# scraper.py
import asyncio
import io
import re
import time
import zipfile
from collections import defaultdict

import aiohttp
import img2pdf
from bs4 import BeautifulSoup

from config import logger, BASE_URL
from database import (
    db_pool, upsert_manga_index, save_manga_genres,
    get_manga_slug, save_manga_queue
)
# =========================
# IN-MEMORY CACHE & RATE LIMIT
# =========================
CACHE = {}  # url -> chapter list, held for the process lifetime (unbounded)
RATE_LIMIT = defaultdict(list)  # user_id -> timestamps of recent calls
def is_rate_limited(user_id, limit=5, per=10):
    """Sliding-window limiter: allow at most `limit` calls per `per` seconds."""
    now = time.time()
RATE_LIMIT[user_id] = [t for t in RATE_LIMIT[user_id] if now - t < per]
if len(RATE_LIMIT[user_id]) >= limit:
return True
RATE_LIMIT[user_id].append(now)
return False
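# A minimal usage sketch of the limiter (the user id below is made up): the
# first `limit` calls inside the window return False and are recorded; the
# next call returns True until old timestamps age out of the `per` window.
def _demo_rate_limit():
    for i in range(6):
        print(i, is_rate_limited(12345, limit=5, per=10))
    # -> calls 0-4 print False (allowed), call 5 prints True (limited)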
# =========================
# MANGA & CHAPTER API
# =========================
async def get_manga_chapters_api(url):
slug = url.strip().rstrip("/").split("/")[-1]
# Check if genres already saved
genres_cached = False
conn = db_pool.getconn()
try:
with conn.cursor() as c:
c.execute("SELECT 1 FROM manga_genres WHERE manga_slug=%s LIMIT 1", (slug,))
genres_cached = c.fetchone() is not None
finally:
db_pool.putconn(conn)
if url in CACHE and genres_cached:
return CACHE[url]
api_url = f"https://utoon.net/wp-json/icmadara/v1/mangas/slug/{slug}/"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "application/json"
}
try:
async with aiohttp.ClientSession(headers=headers) as session:
async with session.get(api_url, timeout=15) as res:
if res.status != 200:
logger.error(f"API Error: {res.status} for {api_url}")
return []
data = await res.json()
mangas_list = data.get("mangas", [])
if not mangas_list:
return []
manga_info = mangas_list[0]
chapters_data = manga_info.get("capitulos", [])
chaps = [
{
"id": str(c.get("id_capitulo")),
"title": c.get("nombre", "Unknown Chapter"),
"slug": c.get("slug", "")
}
for c in chapters_data
]
if chaps:
CACHE[url] = chaps
# Index for search
title = manga_info.get("post_title") or manga_info.get("title") or slug
thumbnail = manga_info.get("thumbnail") or manga_info.get("imagen") or ""
upsert_manga_index(slug, title, url, thumbnail)
# --- Save genres with detailed logging ---
genres = manga_info.get("generos") or manga_info.get("genres") or []
logger.info(f"🔍 Slug {slug}: found {len(genres)} genres in API response.")
                    if genres:
                        logger.info(f"Sample genre: {genres[0]}")
                        save_manga_genres(slug, genres)
# Verify save
conn_verify = db_pool.getconn()
try:
with conn_verify.cursor() as cv:
cv.execute("SELECT COUNT(*) FROM manga_genres WHERE manga_slug=%s", (slug,))
count = cv.fetchone()[0]
logger.info(f" Genres saved to DB: {count} rows.")
except Exception as e:
logger.error(f" Verification failed: {e}")
finally:
db_pool.putconn(conn_verify)
else:
logger.warning(f"⚠️ No genres field for {slug}. Response keys: {list(manga_info.keys())}")
# --- End of genres handling ---
return chaps
except Exception as e:
logger.error(f"Scraper Exception: {e}")
return []
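# Illustrative sketch of resolving a series (the slug in the URL is a made-up
# example): on success a list of {"id", "title", "slug"} dicts comes back,
# on any network or API failure an empty list.
async def _demo_list_chapters():
    chapters = await get_manga_chapters_api("https://utoon.net/manga/example-slug/")
    for ch in chapters[:5]:
        print(ch["id"], ch["title"])
    return chapters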
def images_to_pdf(images_bytes: list) -> bytes:
"""
Build PDF from original image bytes without recompression.
"""
return img2pdf.convert(images_bytes)
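# Sketch of what images_to_pdf expects: raw *encoded* image files (JPEG, PNG,
# ...), not decoded pixel data. Caveat: img2pdf refuses images with an alpha
# channel, so transparent PNGs would need flattening upstream.
def _demo_pdf_from_disk(paths: list) -> bytes:
    pages = []
    for p in paths:
        with open(p, "rb") as f:
            pages.append(f.read())  # original bytes, no re-encode
    return images_to_pdf(pages)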
async def fetch_chapter_images_api(chapter_id: str):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "application/json"
}
try:
api_url = f"https://utoon.net/wp-json/icmadara/v1/capitulo/{chapter_id}/"
async with aiohttp.ClientSession(headers=headers) as session:
async with session.get(api_url, timeout=20) as res:
if res.status != 200:
return []
data = await res.json()
images = []
raw_list = data.get("imagenes") or data.get("images") or []
if isinstance(raw_list, list):
for item in raw_list:
if isinstance(item, dict) and "src" in item:
images.append(item["src"])
elif isinstance(item, str):
images.append(item)
                if not images:
                    # Fall back to scraping the raw body; aiohttp caches the
                    # payload after .json(), so .text() re-reads the cache.
                    text = await res.text()
                    # Match plain and JSON-escaped (https:\/\/...) image URLs,
                    # with the optional query string anchored on an explicit '?'.
                    found = re.findall(
                        r'https?:\\?/\\?/[^\s"\'<>]+?\.(?:jpg|jpeg|png|webp)(?:\?[^\s"\'<>]*)?',
                        text, re.IGNORECASE
                    )
                    # De-duplicate in order and unescape JSON slashes.
                    images = list(dict.fromkeys(img.replace('\\/', '/') for img in found))
final_list = [
img for img in images
if "logo" not in img.lower()
and "icon" not in img.lower()
and "wp-content/uploads" in img.lower()
]
return final_list if final_list else images
except Exception as e:
logger.error(f"Image Fetch Exception: {e}")
return []
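# Sketch: chapter_id is the "id" field returned by get_manga_chapters_api.
# The helper returns absolute page-image URLs, preferring wp-content/uploads
# assets, or [] when the chapter endpoint fails.
async def _demo_list_pages(chapter_id: str):
    urls = await fetch_chapter_images_api(chapter_id)
    logger.info(f"{len(urls)} page images found for chapter {chapter_id}")
    return urls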
async def _fetch_single_image(session, url, index):
try:
async with session.get(url, timeout=15) as resp:
if resp.status == 200:
content = await resp.read()
                # Take the extension from the URL; fall back to jpg for odd suffixes.
                ext = url.split(".")[-1].split('?')[0]
                if len(ext) > 4:
                    ext = "jpg"
return (index, f"{index:03d}.{ext}", content)
except Exception as e:
logger.warning(f"[ZIP] Failed {url}: {e}")
return None
async def download_images_as_zip_async(images: list, slug: str, ch_id: str) -> bytes:
async with aiohttp.ClientSession(headers={"User-Agent": "Mozilla/5.0"}) as session:
tasks = [_fetch_single_image(session, url, i) for i, url in enumerate(images, 1)]
results = await asyncio.gather(*tasks)
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
for res in sorted([r for r in results if r], key=lambda x: x[0]):
zf.writestr(res[1], res[2])
zip_buffer.seek(0)
return zip_buffer.read()
async def download_images_as_pdf_async(images: list, slug: str, ch_id: str) -> bytes:
async with aiohttp.ClientSession(headers={"User-Agent": "Mozilla/5.0"}) as session:
tasks = [_fetch_single_image(session, url, i) for i, url in enumerate(images, 1)]
results = await asyncio.gather(*tasks)
image_bytes = [r[2] for r in sorted([r for r in results if r], key=lambda x: x[0])]
if not image_bytes:
raise RuntimeError("No images downloaded")
return images_to_pdf(image_bytes)
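# End-to-end sketch tying the helpers together (the manga URL is a made-up
# example, and error handling is deliberately minimal): resolve the chapter
# list, fetch the first chapter's pages, and build both download artifacts.
async def _demo_chapter_to_files(manga_url="https://utoon.net/manga/example-slug/"):
    chapters = await get_manga_chapters_api(manga_url)
    if not chapters:
        return
    first = chapters[0]
    images = await fetch_chapter_images_api(first["id"])
    slug = manga_url.strip().rstrip("/").split("/")[-1]
    with open(f"{slug}-{first['id']}.pdf", "wb") as f:
        f.write(await download_images_as_pdf_async(images, slug, first["id"]))
    with open(f"{slug}-{first['id']}.zip", "wb") as f:
        f.write(await download_images_as_zip_async(images, slug, first["id"]))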
# =========================
# PAGE LINK FETCHER
# =========================
async def fetch_all_manga_links_paginated(start_page=1, end_page=71, stop_flag=None):
    """Async generator: after each listing page, yield (page_number, cumulative de-duplicated links)."""
    all_links = []
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
async with aiohttp.ClientSession() as session:
for page_num in range(start_page, end_page + 1):
if stop_flag and stop_flag.get("stop_requested"):
break
url = f"https://utoon.net/manga/page/{page_num}/"
try:
async with session.get(url, headers=headers, timeout=30) as resp:
if resp.status == 200:
html = await resp.text()
soup = BeautifulSoup(html, "html.parser")
for h3 in soup.find_all("h3", class_=["h4", "h5"]):
a = h3.find("a")
if a and a.get("href"):
all_links.append(a.get("href"))
await asyncio.sleep(1)
except Exception as e:
logger.error(f"Error in page {page_num}: {e}")
yield page_num, list(set(all_links))
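# Sketch of consuming the async generator above: iterate with `async for`;
# each step yields the page number and the cumulative de-duplicated link list.
async def _demo_stream_links():
    async for page, links in fetch_all_manga_links_paginated(1, 3):
        logger.info(f"after page {page}: {len(links)} unique links")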
async def collect_manga_links(start_page=1, end_page=71, stop_flag=None,
                              progress_cb=None) -> list:
    """One-shot variant of the paginator: crawl all pages, then return the de-duplicated links."""
    all_links = []
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
async with aiohttp.ClientSession() as session:
for page_num in range(start_page, end_page + 1):
if stop_flag and stop_flag.get("stop_requested"):
break
url = f"https://utoon.net/manga/page/{page_num}/"
try:
async with session.get(url, headers=headers, timeout=30) as resp:
if resp.status == 200:
html = await resp.text()
soup = BeautifulSoup(html, "html.parser")
for h3 in soup.find_all("h3", class_=["h4", "h5"]):
a = h3.find("a")
if a and a.get("href"):
all_links.append(a.get("href"))
await asyncio.sleep(1)
except Exception as e:
logger.error(f"Error in page {page_num}: {e}")
if progress_cb and page_num % 10 == 0:
await progress_cb(page_num, len(set(all_links)))
return list(set(all_links))
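# Sketch of a full crawl with progress reporting: progress_cb must be an async
# callable; it is awaited every 10 pages with (page_number, unique_link_count).
async def _demo_collect_links(start=1, end=71):
    async def report(page, count):
        logger.info(f"page {page}: {count} unique links so far")
    return await collect_manga_links(start, end, progress_cb=report)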
# =========================
# HARDCODED GENRES LIST (fallback / for browsing)
# =========================
GENRES_LIST = [
{"id": 12, "slug": "fantasy", "nombre": "Fantasy", "total": 1220},
{"id": 10, "slug": "drama", "nombre": "Drama", "total": 1194},
{"id": 4, "slug": "adventure", "nombre": "Adventure", "total": 1165},
{"id": 2, "slug": "action", "nombre": "Action", "total": 1048},
{"id": 6, "slug": "comedy", "nombre": "Comedy", "total": 932},
{"id": 34, "slug": "shounen", "nombre": "Shounen", "total": 930},
{"id": 7, "slug": "comic", "nombre": "Comic", "total": 700},
{"id": 636, "slug": "fight", "nombre": "Fight", "total": 678},
{"id": 21, "slug": "manhwa", "nombre": "Manhwa", "total": 632},
{"id": 615, "slug": "magic", "nombre": "Magic", "total": 607},
{"id": 41, "slug": "supernatural", "nombre": "Supernatural", "total": 583},
{"id": 19, "slug": "manga", "nombre": "Manga", "total": 470},
{"id": 25, "slug": "mystery", "nombre": "Mystery", "total": 245},
{"id": 629, "slug": "isekai", "nombre": "Isekai", "total": 223},
{"id": 15, "slug": "historical", "nombre": "Historical", "total": 208},
{"id": 614, "slug": "reincarnation", "nombre": "Reincarnation", "total": 174},
{"id": 572, "slug": "shoujo", "nombre": "Shoujo", "total": 153},
{"id": 639, "slug": "mangatoon", "nombre": "Mangatoon", "total": 136},
{"id": 43, "slug": "webtoon", "nombre": "Webtoon", "total": 125},
{"id": 20, "slug": "manhua", "nombre": "Manhua", "total": 104},
{"id": 36, "slug": "slice-of-life", "nombre": "Slice of Life", "total": 99},
{"id": 633, "slug": "system", "nombre": "System", "total": 77},
{"id": 29, "slug": "school-life", "nombre": "School Life", "total": 68},
{"id": 640, "slug": "crime", "nombre": "Crime", "total": 332},
{"id": 628, "slug": "hunters", "nombre": "Hunters", "total": 310},
{"id": 22, "slug": "martial-arts", "nombre": "Martial Arts", "total": 331},
{"id": 28, "slug": "romance", "nombre": "Romance", "total": 253},
{"id": 486, "slug": "tragedy", "nombre": "Tragedy", "total": 48},
{"id": 581, "slug": "horror", "nombre": "Horror", "total": 54},
{"id": 31, "slug": "seinen", "nombre": "Seinen", "total": 56},
{"id": 40, "slug": "sports", "nombre": "Sports", "total": 37},
{"id": 8, "slug": "cooking", "nombre": "Cooking", "total": 35},
{"id": 637, "slug": "bully", "nombre": "Bully", "total": 32},
{"id": 630, "slug": "business", "nombre": "Business", "total": 53},
{"id": 631, "slug": "zombies", "nombre": "Zombies", "total": 17},
{"id": 526, "slug": "psychological", "nombre": "Psychological", "total": 20},
{"id": 30, "slug": "sci-fi", "nombre": "Sci-fi", "total": 23},
]
def get_genres_list():
return sorted(GENRES_LIST, key=lambda g: g["total"], reverse=True)
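# Sketch: the hardcoded fallback list doubles as a browse menu;
# get_genres_list() returns it ordered by catalogue size, largest genre first.
def _demo_top_genres(n=5):
    return [(g["nombre"], g["total"]) for g in get_genres_list()[:n]]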