import io
import re
import time
import asyncio
import zipfile

import aiohttp
import img2pdf
from bs4 import BeautifulSoup
from collections import defaultdict

from config import logger, BASE_URL
from database import (
    db_pool,
    upsert_manga_index,
    save_manga_genres,
    get_manga_slug,
    save_manga_queue,
)

# =========================
# IN-MEMORY CACHE & RATE LIMIT
# =========================
CACHE = {}
RATE_LIMIT = defaultdict(list)


def is_rate_limited(user_id, limit=5, per=10):
    """Sliding-window limiter: allow at most `limit` calls per `per` seconds."""
    now = time.time()
    RATE_LIMIT[user_id] = [t for t in RATE_LIMIT[user_id] if now - t < per]
    if len(RATE_LIMIT[user_id]) >= limit:
        return True
    RATE_LIMIT[user_id].append(now)
    return False


# =========================
# MANGA & CHAPTER API
# =========================
async def get_manga_chapters_api(url):
    slug = url.strip().rstrip("/").split("/")[-1]

    # Check whether genres for this slug are already saved
    genres_cached = False
    conn = db_pool.getconn()
    try:
        with conn.cursor() as c:
            c.execute("SELECT 1 FROM manga_genres WHERE manga_slug=%s LIMIT 1", (slug,))
            genres_cached = c.fetchone() is not None
    finally:
        db_pool.putconn(conn)

    if url in CACHE and genres_cached:
        return CACHE[url]

    api_url = f"https://utoon.net/wp-json/icmadara/v1/mangas/slug/{slug}/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "application/json",
    }
    try:
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(api_url, timeout=15) as res:
                if res.status != 200:
                    logger.error(f"API Error: {res.status} for {api_url}")
                    return []
                data = await res.json()

                mangas_list = data.get("mangas", [])
                if not mangas_list:
                    return []

                manga_info = mangas_list[0]
                chapters_data = manga_info.get("capitulos", [])
                chaps = [
                    {
                        "id": str(c.get("id_capitulo")),
                        "title": c.get("nombre", "Unknown Chapter"),
                        "slug": c.get("slug", ""),
                    }
                    for c in chapters_data
                ]
                if chaps:
                    CACHE[url] = chaps

                # Index for search
                title = manga_info.get("post_title") or manga_info.get("title") or slug
                thumbnail = manga_info.get("thumbnail") or manga_info.get("imagen") or ""
                upsert_manga_index(slug, title, url, thumbnail)

                # --- Save genres with detailed logging ---
                genres = manga_info.get("generos") or manga_info.get("genres") or []
                logger.info(f"🔍 Slug {slug}: found {len(genres)} genres in API response.")
                if genres:
                    logger.info(f"   Sample genre: {genres[0]}")
                    save_manga_genres(slug, genres)

                    # Verify the save actually landed
                    conn_verify = db_pool.getconn()
                    try:
                        with conn_verify.cursor() as cv:
                            cv.execute(
                                "SELECT COUNT(*) FROM manga_genres WHERE manga_slug=%s",
                                (slug,),
                            )
                            count = cv.fetchone()[0]
                            logger.info(f"   Genres saved to DB: {count} rows.")
                    except Exception as e:
                        logger.error(f"   Verification failed: {e}")
                    finally:
                        db_pool.putconn(conn_verify)
                else:
                    logger.warning(
                        f"⚠️ No genres field for {slug}. Response keys: {list(manga_info.keys())}"
                    )
                # --- End of genres handling ---

                return chaps
    except Exception as e:
        logger.error(f"Scraper Exception: {e}")
        return []


def images_to_pdf(images_bytes: list) -> bytes:
    """Build a PDF from the original image bytes without recompression."""
    return img2pdf.convert(images_bytes)
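
# Illustrative usage of images_to_pdf (a sketch, not part of the bot's API):
# `pages` is assumed to hold raw image bytes in a format img2pdf can embed
# directly, e.g. JPEG or PNG; anything else would need converting first.
def _demo_write_pdf(pages: list, path: str = "chapter.pdf") -> None:
    # Hypothetical helper: persist already-downloaded page bytes as one PDF.
    with open(path, "wb") as f:
        f.write(images_to_pdf(pages))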
""" return img2pdf.convert(images_bytes) async def fetch_chapter_images_api(chapter_id: str): headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", "Accept": "application/json" } try: api_url = f"https://utoon.net/wp-json/icmadara/v1/capitulo/{chapter_id}/" async with aiohttp.ClientSession(headers=headers) as session: async with session.get(api_url, timeout=20) as res: if res.status != 200: return [] data = await res.json() images = [] raw_list = data.get("imagenes") or data.get("images") or [] if isinstance(raw_list, list): for item in raw_list: if isinstance(item, dict) and "src" in item: images.append(item["src"]) elif isinstance(item, str): images.append(item) if not images: text = await res.text() found = re.findall( r'https?://[^\s"\'<>]+?\.(?:jpg|jpeg|png|webp)(?:[^\s"\'<>]*?)', text, re.IGNORECASE ) images = list(dict.fromkeys([img.replace('\\/', '/') for img in found])) final_list = [ img for img in images if "logo" not in img.lower() and "icon" not in img.lower() and "wp-content/uploads" in img.lower() ] return final_list if final_list else images except Exception as e: logger.error(f"Image Fetch Exception: {e}") return [] async def _fetch_single_image(session, url, index): try: async with session.get(url, timeout=15) as resp: if resp.status == 200: content = await resp.read() ext = url.split(".")[-1].split('?')[0] if len(ext) > 4: ext = "jpg" return (index, f"{index:03d}.{ext}", content) except Exception as e: logger.warning(f"[ZIP] Failed {url}: {e}") return None async def download_images_as_zip_async(images: list, slug: str, ch_id: str) -> bytes: async with aiohttp.ClientSession(headers={"User-Agent": "Mozilla/5.0"}) as session: tasks = [_fetch_single_image(session, url, i) for i, url in enumerate(images, 1)] results = await asyncio.gather(*tasks) zip_buffer = io.BytesIO() with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf: for res in sorted([r for r in results if r], key=lambda x: x[0]): zf.writestr(res[1], res[2]) zip_buffer.seek(0) return zip_buffer.read() async def download_images_as_pdf_async(images: list, slug: str, ch_id: str) -> bytes: async with aiohttp.ClientSession(headers={"User-Agent": "Mozilla/5.0"}) as session: tasks = [_fetch_single_image(session, url, i) for i, url in enumerate(images, 1)] results = await asyncio.gather(*tasks) image_bytes = [r[2] for r in sorted([r for r in results if r], key=lambda x: x[0])] if not image_bytes: raise RuntimeError("No images downloaded") return images_to_pdf(image_bytes) # ========================= # PAGE LINK FETCHER # ========================= async def fetch_all_manga_links_paginated(start_page=1, end_page=71, stop_flag=None): all_links = [] headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"} async with aiohttp.ClientSession() as session: for page_num in range(start_page, end_page + 1): if stop_flag and stop_flag.get("stop_requested"): break url = f"https://utoon.net/manga/page/{page_num}/" try: async with session.get(url, headers=headers, timeout=30) as resp: if resp.status == 200: html = await resp.text() soup = BeautifulSoup(html, "html.parser") for h3 in soup.find_all("h3", class_=["h4", "h5"]): a = h3.find("a") if a and a.get("href"): all_links.append(a.get("href")) await asyncio.sleep(1) except Exception as e: logger.error(f"Error in page {page_num}: {e}") yield page_num, list(set(all_links)) async def collect_manga_links(start_page=1, end_page=71, stop_flag=None, progress_cb=None) -> list: all_links = [] headers = {"User-Agent": 
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"} async with aiohttp.ClientSession() as session: for page_num in range(start_page, end_page + 1): if stop_flag and stop_flag.get("stop_requested"): break url = f"https://utoon.net/manga/page/{page_num}/" try: async with session.get(url, headers=headers, timeout=30) as resp: if resp.status == 200: html = await resp.text() soup = BeautifulSoup(html, "html.parser") for h3 in soup.find_all("h3", class_=["h4", "h5"]): a = h3.find("a") if a and a.get("href"): all_links.append(a.get("href")) await asyncio.sleep(1) except Exception as e: logger.error(f"Error in page {page_num}: {e}") if progress_cb and page_num % 10 == 0: await progress_cb(page_num, len(set(all_links))) return list(set(all_links)) # ========================= # GENRES HARDCODED LIST (fallback / for browsing) # ========================= GENRES_LIST = [ {"id": 12, "slug": "fantasy", "nombre": "Fantasy", "total": 1220}, {"id": 10, "slug": "drama", "nombre": "Drama", "total": 1194}, {"id": 4, "slug": "adventure", "nombre": "Adventure", "total": 1165}, {"id": 2, "slug": "action", "nombre": "Action", "total": 1048}, {"id": 6, "slug": "comedy", "nombre": "Comedy", "total": 932}, {"id": 34, "slug": "shounen", "nombre": "Shounen", "total": 930}, {"id": 7, "slug": "comic", "nombre": "Comic", "total": 700}, {"id": 636, "slug": "fight", "nombre": "Fight", "total": 678}, {"id": 21, "slug": "manhwa", "nombre": "Manhwa", "total": 632}, {"id": 615, "slug": "magic", "nombre": "Magic", "total": 607}, {"id": 41, "slug": "supernatural", "nombre": "Supernatural", "total": 583}, {"id": 19, "slug": "manga", "nombre": "Manga", "total": 470}, {"id": 25, "slug": "mystery", "nombre": "Mystery", "total": 245}, {"id": 629, "slug": "isekai", "nombre": "Isekai", "total": 223}, {"id": 15, "slug": "historical", "nombre": "Historical", "total": 208}, {"id": 614, "slug": "reincarnation", "nombre": "Reincarnation", "total": 174}, {"id": 572, "slug": "shoujo", "nombre": "Shoujo", "total": 153}, {"id": 639, "slug": "mangatoon", "nombre": "Mangatoon", "total": 136}, {"id": 43, "slug": "webtoon", "nombre": "Webtoon", "total": 125}, {"id": 20, "slug": "manhua", "nombre": "Manhua", "total": 104}, {"id": 36, "slug": "slice-of-life", "nombre": "Slice of Life", "total": 99}, {"id": 633, "slug": "system", "nombre": "System", "total": 77}, {"id": 29, "slug": "school-life", "nombre": "School Life", "total": 68}, {"id": 640, "slug": "crime", "nombre": "Crime", "total": 332}, {"id": 628, "slug": "hunters", "nombre": "Hunters", "total": 310}, {"id": 22, "slug": "martial-arts", "nombre": "Martial Arts", "total": 331}, {"id": 28, "slug": "romance", "nombre": "Romance", "total": 253}, {"id": 486, "slug": "tragedy", "nombre": "Tragedy", "total": 48}, {"id": 581, "slug": "horror", "nombre": "Horror", "total": 54}, {"id": 31, "slug": "seinen", "nombre": "Seinen", "total": 56}, {"id": 40, "slug": "sports", "nombre": "Sports", "total": 37}, {"id": 8, "slug": "cooking", "nombre": "Cooking", "total": 35}, {"id": 637, "slug": "bully", "nombre": "Bully", "total": 32}, {"id": 630, "slug": "business", "nombre": "Business", "total": 53}, {"id": 631, "slug": "zombies", "nombre": "Zombies", "total": 17}, {"id": 526, "slug": "psychological", "nombre": "Psychological", "total": 20}, {"id": 30, "slug": "sci-fi", "nombre": "Sci-fi", "total": 23}, ] def get_genres_list(): return sorted(GENRES_LIST, key=lambda g: g["total"], reverse=True)