import asyncio
import io
import re
import time
import zipfile
from collections import defaultdict

import aiohttp
import img2pdf
from bs4 import BeautifulSoup

from config import logger, BASE_URL
from database import (
    db_pool, upsert_manga_index, save_manga_genres,
    get_manga_slug, save_manga_queue
)
|
|
| |
| |
| |
# Module-level in-process state:
#   CACHE maps manga page URL -> cached chapter list (unbounded; process-lifetime).
#   RATE_LIMIT maps user id -> list of recent request timestamps (sliding window).
CACHE = {}
RATE_LIMIT = defaultdict(list)


def is_rate_limited(user_id, limit=5, per=10):
    """
    Sliding-window rate limiter backed by the module-level RATE_LIMIT map.

    Args:
        user_id: key identifying the caller (any hashable).
        limit: maximum number of requests allowed within the window.
        per: window length in seconds.

    Returns:
        True if the caller already made `limit` requests in the last `per`
        seconds (the current request is NOT recorded); False otherwise
        (the current request IS recorded).
    """
    # (fix) `import time` was previously a function-level import; it is now
    # hoisted to the module's import block.
    now = time.time()
    # Drop timestamps that have fallen out of the window.
    RATE_LIMIT[user_id] = [t for t in RATE_LIMIT[user_id] if now - t < per]
    if len(RATE_LIMIT[user_id]) >= limit:
        return True
    RATE_LIMIT[user_id].append(now)
    return False
|
|
| |
| |
| |
async def get_manga_chapters_api(url):
    """
    Fetch the chapter list for a manga from the utoon JSON API.

    Args:
        url: public manga page URL; its trailing path segment is the slug.

    Returns:
        List of {"id", "title", "slug"} chapter dicts, or [] on any failure.

    Side effects:
        - Memoizes the chapter list in the module-level CACHE (unbounded).
        - Upserts the manga into the index table via upsert_manga_index.
        - Persists the manga's genres and logs a read-back verification count.
    """
    slug = url.strip().rstrip("/").split("/")[-1]

    # Only honor the in-memory cache when the genres were already persisted,
    # so a cache hit never skips the genre bookkeeping below.
    genres_cached = False
    conn = db_pool.getconn()
    try:
        with conn.cursor() as c:
            c.execute("SELECT 1 FROM manga_genres WHERE manga_slug=%s LIMIT 1", (slug,))
            genres_cached = c.fetchone() is not None
    finally:
        db_pool.putconn(conn)

    if url in CACHE and genres_cached:
        return CACHE[url]

    api_url = f"https://utoon.net/wp-json/icmadara/v1/mangas/slug/{slug}/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "application/json"
    }
    try:
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(api_url, timeout=15) as res:
                if res.status != 200:
                    logger.error(f"API Error: {res.status} for {api_url}")
                    return []
                data = await res.json()
                mangas_list = data.get("mangas", [])
                if not mangas_list:
                    return []
                manga_info = mangas_list[0]
                chapters_data = manga_info.get("capitulos", [])
                chaps = [
                    {
                        "id": str(c.get("id_capitulo")),
                        "title": c.get("nombre", "Unknown Chapter"),
                        "slug": c.get("slug", "")
                    }
                    for c in chapters_data
                ]
                if chaps:
                    CACHE[url] = chaps

                title = manga_info.get("post_title") or manga_info.get("title") or slug
                thumbnail = manga_info.get("thumbnail") or manga_info.get("imagen") or ""
                upsert_manga_index(slug, title, url, thumbnail)

                # Genre field name varies in the API payload — try both spellings.
                genres = manga_info.get("generos") or manga_info.get("genres") or []
                logger.info(f"🔍 Slug {slug}: found {len(genres)} genres in API response.")
                if genres:
                    # (fix) a redundant nested `if len(genres) > 0:` was removed:
                    # `if genres:` already guarantees the list is non-empty.
                    logger.info(f" Sample genre: {genres[0]}")
                    save_manga_genres(slug, genres)

                    # Debug verification: read back how many genre rows landed.
                    conn_verify = db_pool.getconn()
                    try:
                        with conn_verify.cursor() as cv:
                            cv.execute("SELECT COUNT(*) FROM manga_genres WHERE manga_slug=%s", (slug,))
                            count = cv.fetchone()[0]
                            logger.info(f" Genres saved to DB: {count} rows.")
                    except Exception as e:
                        logger.error(f" Verification failed: {e}")
                    finally:
                        db_pool.putconn(conn_verify)
                else:
                    logger.warning(f"⚠️ No genres field for {slug}. Response keys: {list(manga_info.keys())}")

                return chaps
    except Exception as e:
        logger.error(f"Scraper Exception: {e}")
        return []
|
|
def images_to_pdf(images_bytes: list) -> bytes:
    """
    Assemble raw image byte strings into a single PDF document.

    The images are embedded as-is — img2pdf does not recompress them — with
    one image per page, in the order given.
    """
    pdf_document = img2pdf.convert(images_bytes)
    return pdf_document
| |
async def fetch_chapter_images_api(chapter_id: str):
    """
    Fetch the list of page-image URLs for one chapter from the utoon API.

    Args:
        chapter_id: numeric chapter id (as a string) used in the API path.

    Returns:
        A list of image URLs ([] on HTTP error or any exception). If the
        structured fields yield nothing, falls back to regex-scraping image
        URLs out of the raw response text.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "application/json"
    }
    try:
        api_url = f"https://utoon.net/wp-json/icmadara/v1/capitulo/{chapter_id}/"
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(api_url, timeout=20) as res:
                if res.status != 200:
                    return []
                data = await res.json()
                images = []
                # Field name varies in the API payload — try both spellings.
                raw_list = data.get("imagenes") or data.get("images") or []
                if isinstance(raw_list, list):
                    for item in raw_list:
                        # Entries may be {"src": url} dicts or bare URL strings.
                        if isinstance(item, dict) and "src" in item:
                            images.append(item["src"])
                        elif isinstance(item, str):
                            images.append(item)
                if not images:
                    # Fallback: scrape image URLs from the raw body. NOTE(review):
                    # res.text() after res.json() relies on aiohttp caching the
                    # body after the first read — confirm for the pinned version.
                    text = await res.text()
                    found = re.findall(
                        r'https?://[^\s"\'<>]+?\.(?:jpg|jpeg|png|webp)(?:[^\s"\'<>]*?)',
                        text, re.IGNORECASE
                    )
                    # Un-escape JSON slashes and de-duplicate preserving order.
                    images = list(dict.fromkeys([img.replace('\\/', '/') for img in found]))
                # Keep only likely page images (hosted uploads, not site chrome);
                # if the filter removes everything, return the unfiltered list.
                final_list = [
                    img for img in images
                    if "logo" not in img.lower()
                    and "icon" not in img.lower()
                    and "wp-content/uploads" in img.lower()
                ]
                return final_list if final_list else images
    except Exception as e:
        logger.error(f"Image Fetch Exception: {e}")
        return []
|
|
async def _fetch_single_image(session, url, index):
    """
    Download one image; return (index, filename, content) or None on failure.

    The filename embeds the 1-based index zero-padded to three digits so the
    entries sort in reading order inside an archive.
    """
    try:
        async with session.get(url, timeout=15) as resp:
            if resp.status != 200:
                return None
            payload = await resp.read()
            # Derive the extension from the URL, dropping any query string;
            # anything implausibly long is normalized to "jpg".
            extension = url.rsplit(".", 1)[-1].split("?", 1)[0]
            if len(extension) > 4:
                extension = "jpg"
            return (index, f"{index:03d}.{extension}", payload)
    except Exception as e:
        logger.warning(f"[ZIP] Failed {url}: {e}")
    return None
|
|
async def download_images_as_zip_async(images: list, slug: str, ch_id: str) -> bytes:
    """
    Download all page images concurrently and pack them into a ZIP archive.

    Failed downloads are silently skipped; surviving entries are written in
    page order. slug/ch_id are unused here but kept for interface parity.
    Returns the ZIP archive as bytes.
    """
    async with aiohttp.ClientSession(headers={"User-Agent": "Mozilla/5.0"}) as session:
        pending = [
            _fetch_single_image(session, img_url, pos)
            for pos, img_url in enumerate(images, 1)
        ]
        downloaded = await asyncio.gather(*pending)
        archive_buffer = io.BytesIO()
        with zipfile.ZipFile(archive_buffer, "w", zipfile.ZIP_DEFLATED) as archive:
            ordered = sorted((item for item in downloaded if item), key=lambda t: t[0])
            for _, filename, payload in ordered:
                archive.writestr(filename, payload)
        return archive_buffer.getvalue()
|
|
|
|
async def download_images_as_pdf_async(images: list, slug: str, ch_id: str) -> bytes:
    """
    Download all page images concurrently and bind them into a single PDF.

    Pages are ordered by their original position; failed downloads are
    dropped. slug/ch_id are unused here but kept for interface parity.

    Raises:
        RuntimeError: if no image could be downloaded at all.
    """
    async with aiohttp.ClientSession(headers={"User-Agent": "Mozilla/5.0"}) as session:
        pending = [
            _fetch_single_image(session, img_url, pos)
            for pos, img_url in enumerate(images, 1)
        ]
        fetched = await asyncio.gather(*pending)
        ordered = sorted((item for item in fetched if item), key=lambda t: t[0])
        page_bytes = [payload for _, _, payload in ordered]
        if not page_bytes:
            raise RuntimeError("No images downloaded")
        return images_to_pdf(page_bytes)
|
|
| |
| |
| |
async def fetch_all_manga_links_paginated(start_page=1, end_page=71, stop_flag=None):
    """
    Async generator crawling the paginated manga listing.

    After each page (including failed ones) it yields
    (page_number, de-duplicated snapshot of all links collected so far).
    Crawling stops early when stop_flag["stop_requested"] is truthy, and
    sleeps one second after each successful request to be polite.
    """
    collected = []
    ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    async with aiohttp.ClientSession() as session:
        for page in range(start_page, end_page + 1):
            if stop_flag and stop_flag.get("stop_requested"):
                break
            listing_url = f"https://utoon.net/manga/page/{page}/"
            try:
                async with session.get(listing_url, headers=ua_headers, timeout=30) as resp:
                    if resp.status == 200:
                        soup = BeautifulSoup(await resp.text(), "html.parser")
                        for heading in soup.find_all("h3", class_=["h4", "h5"]):
                            anchor = heading.find("a")
                            if anchor and anchor.get("href"):
                                collected.append(anchor.get("href"))
                await asyncio.sleep(1)
            except Exception as e:
                logger.error(f"Error in page {page}: {e}")
            yield page, list(set(collected))
|
|
async def collect_manga_links(start_page=1, end_page=71, stop_flag=None,
                              progress_cb=None) -> list:
    """
    Crawl the paginated manga listing and return a de-duplicated link list.

    Stops early when stop_flag["stop_requested"] is truthy. When progress_cb
    is provided, it is awaited every 10 pages with
    (page_number, unique_link_count). Sleeps one second after each
    successful request to be polite.
    """
    found_links = []
    ua_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    async with aiohttp.ClientSession() as session:
        for page in range(start_page, end_page + 1):
            if stop_flag and stop_flag.get("stop_requested"):
                break
            listing_url = f"https://utoon.net/manga/page/{page}/"
            try:
                async with session.get(listing_url, headers=ua_headers, timeout=30) as resp:
                    if resp.status == 200:
                        soup = BeautifulSoup(await resp.text(), "html.parser")
                        for heading in soup.find_all("h3", class_=["h4", "h5"]):
                            anchor = heading.find("a")
                            if anchor and anchor.get("href"):
                                found_links.append(anchor.get("href"))
                await asyncio.sleep(1)
            except Exception as e:
                logger.error(f"Error in page {page}: {e}")
            if progress_cb and page % 10 == 0:
                await progress_cb(page, len(set(found_links)))
    return list(set(found_links))
|
|
| |
| |
| |
# Static snapshot of the utoon genre taxonomy. Fields per entry:
#   id     - term id used by the site (presumably a WordPress term id — confirm)
#   slug   - URL path segment for the genre archive
#   nombre - display name (Spanish source field name; values are English)
#   total  - manga count at the time this snapshot was taken; likely stale,
#            and NOT consistently sorted — get_genres_list() sorts on demand.
GENRES_LIST = [
    {"id": 12, "slug": "fantasy", "nombre": "Fantasy", "total": 1220},
    {"id": 10, "slug": "drama", "nombre": "Drama", "total": 1194},
    {"id": 4, "slug": "adventure", "nombre": "Adventure", "total": 1165},
    {"id": 2, "slug": "action", "nombre": "Action", "total": 1048},
    {"id": 6, "slug": "comedy", "nombre": "Comedy", "total": 932},
    {"id": 34, "slug": "shounen", "nombre": "Shounen", "total": 930},
    {"id": 7, "slug": "comic", "nombre": "Comic", "total": 700},
    {"id": 636, "slug": "fight", "nombre": "Fight", "total": 678},
    {"id": 21, "slug": "manhwa", "nombre": "Manhwa", "total": 632},
    {"id": 615, "slug": "magic", "nombre": "Magic", "total": 607},
    {"id": 41, "slug": "supernatural", "nombre": "Supernatural", "total": 583},
    {"id": 19, "slug": "manga", "nombre": "Manga", "total": 470},
    {"id": 25, "slug": "mystery", "nombre": "Mystery", "total": 245},
    {"id": 629, "slug": "isekai", "nombre": "Isekai", "total": 223},
    {"id": 15, "slug": "historical", "nombre": "Historical", "total": 208},
    {"id": 614, "slug": "reincarnation", "nombre": "Reincarnation", "total": 174},
    {"id": 572, "slug": "shoujo", "nombre": "Shoujo", "total": 153},
    {"id": 639, "slug": "mangatoon", "nombre": "Mangatoon", "total": 136},
    {"id": 43, "slug": "webtoon", "nombre": "Webtoon", "total": 125},
    {"id": 20, "slug": "manhua", "nombre": "Manhua", "total": 104},
    {"id": 36, "slug": "slice-of-life", "nombre": "Slice of Life", "total": 99},
    {"id": 633, "slug": "system", "nombre": "System", "total": 77},
    {"id": 29, "slug": "school-life", "nombre": "School Life", "total": 68},
    {"id": 640, "slug": "crime", "nombre": "Crime", "total": 332},
    {"id": 628, "slug": "hunters", "nombre": "Hunters", "total": 310},
    {"id": 22, "slug": "martial-arts", "nombre": "Martial Arts", "total": 331},
    {"id": 28, "slug": "romance", "nombre": "Romance", "total": 253},
    {"id": 486, "slug": "tragedy", "nombre": "Tragedy", "total": 48},
    {"id": 581, "slug": "horror", "nombre": "Horror", "total": 54},
    {"id": 31, "slug": "seinen", "nombre": "Seinen", "total": 56},
    {"id": 40, "slug": "sports", "nombre": "Sports", "total": 37},
    {"id": 8, "slug": "cooking", "nombre": "Cooking", "total": 35},
    {"id": 637, "slug": "bully", "nombre": "Bully", "total": 32},
    {"id": 630, "slug": "business", "nombre": "Business", "total": 53},
    {"id": 631, "slug": "zombies", "nombre": "Zombies", "total": 17},
    {"id": 526, "slug": "psychological", "nombre": "Psychological", "total": 20},
    {"id": 30, "slug": "sci-fi", "nombre": "Sci-fi", "total": 23},
]
|
|
def get_genres_list():
    """Return a copy of GENRES_LIST ordered by descending manga count."""
    genres = list(GENRES_LIST)
    genres.sort(key=lambda entry: entry["total"], reverse=True)
    return genres