# Spaces: iq7se2/web1 (Sleeping)
# File size: 13,260 Bytes
# d043a72

import io
import img2pdf
import re
import asyncio
import zipfile
import aiohttp
from bs4 import BeautifulSoup
from collections import defaultdict

from config import logger, BASE_URL
from database import (
    db_pool, upsert_manga_index, save_manga_genres,
    get_manga_slug, save_manga_queue
)

# =========================
# IN-MEMORY CACHE & RATE LIMIT
# =========================
# CACHE maps a manga page URL to the chapter list last fetched for it
# (written by get_manga_chapters_api). Nothing in this file evicts entries.
CACHE      = {}
# RATE_LIMIT maps a user id to the timestamps of that user's recently
# accepted calls (maintained by is_rate_limited).
RATE_LIMIT = defaultdict(list)

def is_rate_limited(user_id, limit=5, per=10):
    """Sliding-window rate limiter backed by the module-level RATE_LIMIT map.

    Args:
        user_id: Hashable key identifying the caller.
        limit: Maximum number of accepted calls inside one window.
        per: Window length in seconds.

    Returns:
        True when the caller already has ``limit`` accepted calls within the
        last ``per`` seconds (this call is rejected and NOT recorded);
        False otherwise (this call is recorded).
    """
    import time

    # Use a monotonic clock: wall-clock time can jump (NTP sync, manual
    # changes), which would stretch or shrink the window arbitrarily.
    now = time.monotonic()

    # Drop timestamps that have aged out of the window before counting.
    recent = [stamp for stamp in RATE_LIMIT[user_id] if now - stamp < per]
    RATE_LIMIT[user_id] = recent

    if len(recent) >= limit:
        return True

    recent.append(now)
    return False

# =========================
# MANGA & CHAPTER API
# =========================
def _genres_already_saved(slug):
    """Return True if the manga_genres table holds at least one row for *slug*."""
    conn = db_pool.getconn()
    try:
        with conn.cursor() as c:
            c.execute("SELECT 1 FROM manga_genres WHERE manga_slug=%s LIMIT 1", (slug,))
            return c.fetchone() is not None
    finally:
        db_pool.putconn(conn)


def _index_manga_metadata(slug, url, manga_info):
    """Upsert the search-index row for one manga and persist its genres."""
    title     = manga_info.get("post_title") or manga_info.get("title") or slug
    thumbnail = manga_info.get("thumbnail") or manga_info.get("imagen") or ""
    upsert_manga_index(slug, title, url, thumbnail)

    genres = manga_info.get("generos") or manga_info.get("genres") or []
    logger.info(f"🔍 Slug {slug}: found {len(genres)} genres in API response.")
    if not genres:
        logger.warning(f"⚠️ No genres field for {slug}. Response keys: {list(manga_info.keys())}")
        return

    logger.info(f"   Sample genre: {genres[0]}")
    save_manga_genres(slug, genres)

    # Read the row count back purely for diagnostics; a failure here is
    # logged and never propagated.
    conn_verify = db_pool.getconn()
    try:
        with conn_verify.cursor() as cv:
            cv.execute("SELECT COUNT(*) FROM manga_genres WHERE manga_slug=%s", (slug,))
            count = cv.fetchone()[0]
            logger.info(f"   Genres saved to DB: {count} rows.")
    except Exception as e:
        logger.error(f"   Verification failed: {e}")
    finally:
        db_pool.putconn(conn_verify)


async def get_manga_chapters_api(url):
    """Fetch the chapter list of a manga from the site's JSON API.

    The slug is the last path segment of ``url``. A cached chapter list is
    returned only when ``url`` is in CACHE *and* genres for the slug are
    already stored in the database; otherwise the API is queried again so the
    genres get saved.

    Args:
        url: Manga page URL; its last path segment is used as the slug.

    Returns:
        A list of ``{"id", "title", "slug"}`` dicts built from the first
        entry of the response's ``"mangas"`` list, or ``[]`` on a non-200
        status, an empty ``"mangas"`` list, or any exception while
        fetching/parsing.

    Notes:
        The database helpers are called without ``await``, so they run
        synchronously inside this coroutine.
        NOTE(review): if they block noticeably, consider off-loading them to
        a thread.
    """
    slug = url.strip().rstrip("/").split("/")[-1]

    # Only hit the database when a cache entry exists; without one the
    # answer cannot change the outcome.
    if url in CACHE and _genres_already_saved(slug):
        return CACHE[url]

    api_url = f"https://utoon.net/wp-json/icmadara/v1/mangas/slug/{slug}/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "application/json"
    }
    try:
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(api_url, timeout=15) as res:
                if res.status != 200:
                    logger.error(f"API Error: {res.status} for {api_url}")
                    return []
                data        = await res.json()
                mangas_list = data.get("mangas", [])
                if not mangas_list:
                    return []
                manga_info    = mangas_list[0]
                chapters_data = manga_info.get("capitulos", [])
                chaps = [
                    {
                        "id":    str(c.get("id_capitulo")),
                        "title": c.get("nombre", "Unknown Chapter"),
                        "slug":  c.get("slug", "")
                    }
                    for c in chapters_data
                ]
                if chaps:
                    CACHE[url] = chaps

                # Indexing and genre persistence are side work: a failure
                # there must not throw away the chapters already fetched.
                try:
                    _index_manga_metadata(slug, url, manga_info)
                except Exception as e:
                    logger.error(f"Indexing failed for {slug}: {e}")

                return chaps
    except Exception as e:
        logger.error(f"Scraper Exception: {e}")
        return []

def images_to_pdf(images_bytes: list) -> bytes:
    """Assemble a PDF from raw image payloads.

    The payloads are handed to ``img2pdf.convert`` as-is and its result is
    returned unchanged.

    Args:
        images_bytes: Image file contents, one item per page, in page order.

    Returns:
        Whatever ``img2pdf.convert`` produces for the given payloads.
    """
    pdf_document = img2pdf.convert(images_bytes)
    return pdf_document
    
# Image URLs as they may appear in a raw JSON body. The scheme separator is
# matched both plain ("://") and JSON-escaped (":\/\/"), because the matches
# are un-escaped afterwards and a literal "://" alone would never match an
# escaped body. The match ends at the first image extension.
_IMAGE_URL_RE = re.compile(
    r'https?:(?:\\?/){2}[^\s"\'<>]+?\.(?:jpg|jpeg|png|webp)',
    re.IGNORECASE,
)


async def fetch_chapter_images_api(chapter_id: str):
    """Return the image URLs of one chapter from the site's JSON API.

    URLs are read from the ``"imagenes"`` / ``"images"`` list of the response
    (items are either strings or dicts with a ``"src"`` key). When that
    yields nothing, the raw response text is scanned for image URLs instead.

    Args:
        chapter_id: Chapter identifier interpolated into the API path.

    Returns:
        The URLs that contain ``wp-content/uploads`` and neither ``logo`` nor
        ``icon``; if that filter removes everything, the unfiltered URLs.
        ``[]`` on a non-200 status or on any exception.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "application/json"
    }
    try:
        api_url = f"https://utoon.net/wp-json/icmadara/v1/capitulo/{chapter_id}/"
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(api_url, timeout=20) as res:
                if res.status != 200:
                    logger.error(f"API Error: {res.status} for {api_url}")
                    return []
                data     = await res.json()
                images   = []
                raw_list = data.get("imagenes") or data.get("images") or []
                if isinstance(raw_list, list):
                    for item in raw_list:
                        if isinstance(item, dict) and "src" in item:
                            images.append(item["src"])
                        elif isinstance(item, str):
                            images.append(item)
                if not images:
                    # Fallback: scrape URLs out of the raw body, un-escape
                    # JSON slashes, and de-duplicate keeping first-seen order.
                    text  = await res.text()
                    found = _IMAGE_URL_RE.findall(text)
                    images = list(dict.fromkeys(img.replace('\\/', '/') for img in found))
                final_list = [
                    img for img in images
                    if "logo" not in img.lower()
                    and "icon" not in img.lower()
                    and "wp-content/uploads" in img.lower()
                ]
                return final_list or images
    except Exception as e:
        logger.error(f"Image Fetch Exception: {e}")
        return []

async def _fetch_single_image(session, url, index):
    """Download one image and name it after its position.

    Args:
        session: Object whose ``get(url, timeout=...)`` returns an async
            context manager yielding a response with ``status`` and
            ``read()`` (the callers in this file pass an aiohttp session).
        url: Image URL.
        index: Position of the image; used for ordering and the file name.

    Returns:
        ``(index, "<index zero-padded to 3>.<ext>", content_bytes)`` on HTTP
        200, otherwise ``None`` (non-200 status or any exception, both
        logged).
    """
    import posixpath
    from urllib.parse import urlsplit

    try:
        async with session.get(url, timeout=15) as resp:
            if resp.status == 200:
                content = await resp.read()
                # Take the extension from the URL *path* only, so dots in the
                # query string or host name cannot be mistaken for it.
                ext = posixpath.splitext(urlsplit(url).path)[1].lstrip(".")
                # Missing, overlong or non-alphanumeric extensions fall back
                # to "jpg" so the archive member name stays a plain file name.
                if not ext.isalnum() or len(ext) > 4:
                    ext = "jpg"
                return (index, f"{index:03d}.{ext}", content)
            logger.warning(f"[ZIP] HTTP {resp.status} for {url}")
    except Exception as e:
        logger.warning(f"[ZIP] Failed {url}: {e}")
    return None

async def download_images_as_zip_async(images: list, slug: str, ch_id: str) -> bytes:
    """Download all images concurrently and pack them into a ZIP archive.

    Args:
        images: Image URLs in page order; numbering starts at 1.
        slug: Not used by this function.
        ch_id: Not used by this function.

    Returns:
        The bytes of a deflate-compressed ZIP whose members are the images
        that downloaded successfully, written in page order.
    """
    ua_headers = {"User-Agent": "Mozilla/5.0"}
    async with aiohttp.ClientSession(headers=ua_headers) as session:
        fetched = await asyncio.gather(
            *(_fetch_single_image(session, link, pos) for pos, link in enumerate(images, 1))
        )

    # Failed downloads come back falsy; keep the rest, ordered by position.
    downloaded = [entry for entry in fetched if entry]
    downloaded.sort(key=lambda entry: entry[0])

    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as zf:
        for _, member_name, payload in downloaded:
            zf.writestr(member_name, payload)
    return archive.getvalue()


async def download_images_as_pdf_async(images: list, slug: str, ch_id: str) -> bytes:
    """Download all images concurrently and combine them into one PDF.

    Args:
        images: Image URLs in page order; numbering starts at 1.
        slug: Not used by this function.
        ch_id: Not used by this function.

    Returns:
        The result of ``images_to_pdf`` for the successfully downloaded
        images, in page order.

    Raises:
        RuntimeError: If no image could be downloaded.
    """
    ua_headers = {"User-Agent": "Mozilla/5.0"}
    async with aiohttp.ClientSession(headers=ua_headers) as session:
        fetched = await asyncio.gather(
            *(_fetch_single_image(session, link, pos) for pos, link in enumerate(images, 1))
        )

    # Failed downloads come back falsy; keep the rest, ordered by position.
    pages = [entry for entry in fetched if entry]
    pages.sort(key=lambda entry: entry[0])
    payloads = [content for _, _, content in pages]

    if not payloads:
        raise RuntimeError("No images downloaded")
    return images_to_pdf(payloads)

# =========================
# PAGE LINK FETCHER
# =========================
async def fetch_all_manga_links_paginated(start_page=1, end_page=71, stop_flag=None):
    """Crawl the listing pages and yield progress after every page.

    Links are taken from the first ``<a href>`` inside each ``<h3>`` whose
    class is ``h4`` or ``h5``.

    Args:
        start_page: First listing page to fetch (inclusive).
        end_page: Last listing page to fetch (inclusive).
        stop_flag: Optional mapping; the crawl stops before the next page
            once ``stop_flag.get("stop_requested")`` is truthy.

    Yields:
        ``(page_num, links)`` after each page, including pages that failed.
        ``links`` is a fresh list of all unique links collected so far, in
        first-seen order.
    """
    # A dict serves as an insertion-ordered set: de-duplication is
    # incremental and the yielded order is stable between pages and runs.
    seen_links = {}
    headers    = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    async with aiohttp.ClientSession() as session:
        for page_num in range(start_page, end_page + 1):
            if stop_flag and stop_flag.get("stop_requested"):
                break
            url = f"https://utoon.net/manga/page/{page_num}/"
            try:
                async with session.get(url, headers=headers, timeout=30) as resp:
                    if resp.status == 200:
                        html = await resp.text()
                        soup = BeautifulSoup(html, "html.parser")
                        for h3 in soup.find_all("h3", class_=["h4", "h5"]):
                            a = h3.find("a")
                            if a and a.get("href"):
                                seen_links.setdefault(a.get("href"), None)
                    else:
                        logger.warning(f"Page {page_num} returned HTTP {resp.status}")
                # Pause between successful requests; skipped if the request raised.
                await asyncio.sleep(1)
            except Exception as e:
                logger.error(f"Error in page {page_num}: {e}")
            yield page_num, list(seen_links)

async def collect_manga_links(start_page=1, end_page=71, stop_flag=None,
                               progress_cb=None) -> list:
    """Crawl the listing pages and return every unique manga link found.

    Links are taken from the first ``<a href>`` inside each ``<h3>`` whose
    class is ``h4`` or ``h5``.

    Args:
        start_page: First listing page to fetch (inclusive).
        end_page: Last listing page to fetch (inclusive).
        stop_flag: Optional mapping; the crawl stops before the next page
            once ``stop_flag.get("stop_requested")`` is truthy.
        progress_cb: Optional async callable awaited as
            ``progress_cb(page_num, unique_link_count)`` whenever
            ``page_num`` is a multiple of 10.

    Returns:
        The unique links collected, in first-seen order.
    """
    # A dict serves as an insertion-ordered set: de-duplication is
    # incremental and the returned order is stable between runs.
    seen_links = {}
    headers    = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    async with aiohttp.ClientSession() as session:
        for page_num in range(start_page, end_page + 1):
            if stop_flag and stop_flag.get("stop_requested"):
                break
            url = f"https://utoon.net/manga/page/{page_num}/"
            try:
                async with session.get(url, headers=headers, timeout=30) as resp:
                    if resp.status == 200:
                        html = await resp.text()
                        soup = BeautifulSoup(html, "html.parser")
                        for h3 in soup.find_all("h3", class_=["h4", "h5"]):
                            a = h3.find("a")
                            if a and a.get("href"):
                                seen_links.setdefault(a.get("href"), None)
                    else:
                        logger.warning(f"Page {page_num} returned HTTP {resp.status}")
                # Pause between successful requests; skipped if the request raised.
                await asyncio.sleep(1)
            except Exception as e:
                logger.error(f"Error in page {page_num}: {e}")
            if progress_cb and page_num % 10 == 0:
                await progress_cb(page_num, len(seen_links))
    return list(seen_links)

# =========================
# GENRES HARDCODED LIST (fallback / for browsing)
# =========================
# Static genre table. Each entry carries:
#   "id"     - numeric genre id (presumably the site's own id - TODO confirm)
#   "slug"   - URL-style identifier
#   "nombre" - display name
#   "total"  - a count used for ranking (presumably titles per genre - TODO confirm)
# The entries are NOT stored in descending "total" order; get_genres_list()
# returns them sorted.
GENRES_LIST = [
    {"id": 12,  "slug": "fantasy",       "nombre": "Fantasy",       "total": 1220},
    {"id": 10,  "slug": "drama",         "nombre": "Drama",         "total": 1194},
    {"id": 4,   "slug": "adventure",     "nombre": "Adventure",     "total": 1165},
    {"id": 2,   "slug": "action",        "nombre": "Action",        "total": 1048},
    {"id": 6,   "slug": "comedy",        "nombre": "Comedy",        "total": 932},
    {"id": 34,  "slug": "shounen",       "nombre": "Shounen",       "total": 930},
    {"id": 7,   "slug": "comic",         "nombre": "Comic",         "total": 700},
    {"id": 636, "slug": "fight",         "nombre": "Fight",         "total": 678},
    {"id": 21,  "slug": "manhwa",        "nombre": "Manhwa",        "total": 632},
    {"id": 615, "slug": "magic",         "nombre": "Magic",         "total": 607},
    {"id": 41,  "slug": "supernatural",  "nombre": "Supernatural",  "total": 583},
    {"id": 19,  "slug": "manga",         "nombre": "Manga",         "total": 470},
    {"id": 25,  "slug": "mystery",       "nombre": "Mystery",       "total": 245},
    {"id": 629, "slug": "isekai",        "nombre": "Isekai",        "total": 223},
    {"id": 15,  "slug": "historical",    "nombre": "Historical",    "total": 208},
    {"id": 614, "slug": "reincarnation", "nombre": "Reincarnation", "total": 174},
    {"id": 572, "slug": "shoujo",        "nombre": "Shoujo",        "total": 153},
    {"id": 639, "slug": "mangatoon",     "nombre": "Mangatoon",     "total": 136},
    {"id": 43,  "slug": "webtoon",       "nombre": "Webtoon",       "total": 125},
    {"id": 20,  "slug": "manhua",        "nombre": "Manhua",        "total": 104},
    {"id": 36,  "slug": "slice-of-life", "nombre": "Slice of Life", "total": 99},
    {"id": 633, "slug": "system",        "nombre": "System",        "total": 77},
    {"id": 29,  "slug": "school-life",   "nombre": "School Life",   "total": 68},
    {"id": 640, "slug": "crime",         "nombre": "Crime",         "total": 332},
    {"id": 628, "slug": "hunters",       "nombre": "Hunters",       "total": 310},
    {"id": 22,  "slug": "martial-arts",  "nombre": "Martial Arts",  "total": 331},
    {"id": 28,  "slug": "romance",       "nombre": "Romance",       "total": 253},
    {"id": 486, "slug": "tragedy",       "nombre": "Tragedy",       "total": 48},
    {"id": 581, "slug": "horror",        "nombre": "Horror",        "total": 54},
    {"id": 31,  "slug": "seinen",        "nombre": "Seinen",        "total": 56},
    {"id": 40,  "slug": "sports",        "nombre": "Sports",        "total": 37},
    {"id": 8,   "slug": "cooking",       "nombre": "Cooking",       "total": 35},
    {"id": 637, "slug": "bully",         "nombre": "Bully",         "total": 32},
    {"id": 630, "slug": "business",      "nombre": "Business",      "total": 53},
    {"id": 631, "slug": "zombies",       "nombre": "Zombies",       "total": 17},
    {"id": 526, "slug": "psychological", "nombre": "Psychological", "total": 20},
    {"id": 30,  "slug": "sci-fi",        "nombre": "Sci-fi",        "total": 23},
]

def get_genres_list():
    """Return the GENRES_LIST entries ordered by "total", largest first.

    A new list is returned; GENRES_LIST itself is left untouched. Entries
    with equal totals keep their relative order from GENRES_LIST.
    """
    ranked = list(GENRES_LIST)
    ranked.sort(key=lambda entry: entry["total"], reverse=True)
    return ranked