| """ |
| ============================================ |
| Core Scraper Engine |
| - Orchestrates the entire scraping process for ONE novel |
| - Login → Navigate → Scrape Chapter → Next Chapter → Repeat |
| - Handles errors, captchas, and database saves |
| - Each novel runs as an independent async task |
| ============================================ |
| """ |
|
|
| import asyncio |
| import logging |
| import random |
| import re |
| from typing import Optional, Dict, Any |
| from datetime import datetime, timezone |
| from playwright.async_api import ( |
| Page, |
| TimeoutError as PlaywrightTimeout, |
| Error as PlaywrightError, |
| ) |
| from sqlalchemy.ext.asyncio import AsyncSession |
| from sqlalchemy.exc import IntegrityError |
|
|
| from app.config import settings |
| from app.scraper.browser_manager import browser_manager |
| from app.scraper.human_simulator import HumanSimulator |
| from app.scraper.captcha_detector import captcha_detector |
| from app.database.connection import get_session_factory |
| from deep_translator import GoogleTranslator |
|
|
| from app.database.crud import ( |
| save_chapter, |
| update_novel_status, |
| increment_chapter_count, |
| get_last_chapter_number, |
| chapter_exists, |
| ) |
| from app.database.models import NovelStatus |
|
|
# Module-level logger, namespaced to this module so log output can be filtered.
logger = logging.getLogger(__name__)
|
|
|
|
| |
| |
| |
| |
# Live, in-memory scraping status per novel ID (read by the UI layer).
scraper_status: Dict[int, Dict[str, Any]] = {}


def update_status(novel_id: int, **kwargs):
    """Update the live status for a novel (read by UI)."""
    entry = scraper_status.setdefault(novel_id, {})
    entry.update(kwargs)
    # Timestamp every update so the UI can detect stale entries.
    entry["last_updated"] = datetime.now(timezone.utc).isoformat()
|
|
|
|
| |
| |
| |
async def attempt_login(
    page: Page,
    human: HumanSimulator,
    novel_id: int,
    url: str,
    email: Optional[str],
    password: Optional[str],
) -> bool:
    """
    Attempt to log in to the novel website.

    Strategy:
    1. Navigate to the URL
    2. Look for common login form selectors
    3. Fill in credentials
    4. Submit the form
    5. Verify login succeeded

    Args:
        page: The Playwright page to drive.
        human: Human-behavior simulator used for typing and delays.
        novel_id: Novel ID, used for logging and status updates.
        url: Page to navigate to before looking for a login form.
        email: Login email/username; login is skipped when falsy.
        password: Login password; login is skipped when falsy.

    Returns True if login succeeded (or wasn't needed); False on a
    detected login failure, navigation timeout, or unexpected error.
    """
    # No credentials provided means the caller expects no login wall.
    if not email or not password:
        logger.info(f"Novel {novel_id}: No credentials provided, skipping login.")
        return True

    update_status(novel_id, phase="logging_in", message="Attempting login...")

    try:
        logger.info(f"Novel {novel_id}: Navigating to {url} for login...")
        await page.goto(url, wait_until="domcontentloaded")
        await human.simulate_page_arrival()

        # Common login-form selectors grouped by field role; kept as a list
        # of groups so more site-specific groups can be appended later.
        login_selectors = [
            {
                "email": [
                    "input[name='email']",
                    "input[type='email']",
                    "input[name='username']",
                    "input[name='log']",
                    "input[id='email']",
                    "input[id='username']",
                    "#login-email",
                    "input[placeholder*='email' i]",
                    "input[placeholder*='username' i]",
                ],
                "password": [
                    "input[name='password']",
                    "input[type='password']",
                    "input[name='pwd']",
                    "input[id='password']",
                    "#login-password",
                ],
                "submit": [
                    "button[type='submit']",
                    "input[type='submit']",
                    "button.login-btn",
                    "button.sign-in",
                    "#login-submit",
                    "button:has-text('Log In')",
                    "button:has-text('Sign In')",
                    "button:has-text('Login')",
                ],
            }
        ]

        email_filled = False
        password_filled = False

        for selector_group in login_selectors:
            # Try each email selector until one matches an element; selector
            # errors (e.g. unsupported syntax) are swallowed and skipped.
            for email_sel in selector_group["email"]:
                try:
                    email_field = await page.query_selector(email_sel)
                    if email_field:
                        await human.type_like_human(email_sel, email)
                        email_filled = True
                        logger.info(f"Novel {novel_id}: Email filled using {email_sel}")
                        break
                except Exception:
                    continue

            # Same search for the password field.
            for pass_sel in selector_group["password"]:
                try:
                    pass_field = await page.query_selector(pass_sel)
                    if pass_field:
                        await human.type_like_human(pass_sel, password)
                        password_filled = True
                        logger.info(f"Novel {novel_id}: Password filled using {pass_sel}")
                        break
                except Exception:
                    continue

            if email_filled and password_filled:
                break

        # If the form couldn't be located, proceed anyway — many sites serve
        # chapters without requiring a login at all.
        if not email_filled or not password_filled:
            logger.warning(
                f"Novel {novel_id}: Could not find login form fields. "
                f"Email found: {email_filled}, Password found: {password_filled}. "
                f"Proceeding without login."
            )
            return True

        await human.medium_delay("before clicking login")

        # Submit via the first clickable submit control we can find.
        submitted = False
        for selector_group in login_selectors:
            for submit_sel in selector_group["submit"]:
                try:
                    submit_btn = await page.query_selector(submit_sel)
                    if submit_btn:
                        await submit_btn.click()
                        submitted = True
                        logger.info(f"Novel {novel_id}: Login form submitted using {submit_sel}")
                        break
                except Exception:
                    continue
            if submitted:
                break

        if not submitted:
            # Fallback: most login forms submit on Enter.
            await page.keyboard.press("Enter")
            logger.info(f"Novel {novel_id}: Login submitted via Enter key")

        # Give the site time to process the login and render the result.
        await human.long_delay("waiting for login response")

        # Heuristic success check: scan the rendered page for known
        # failure phrases (no structured response is available).
        page_content = await page.content()
        page_content_lower = page_content.lower()

        login_failed_indicators = [
            "invalid password",
            "incorrect password",
            "login failed",
            "wrong password",
            "invalid credentials",
            "account not found",
            "error logging in",
        ]

        for indicator in login_failed_indicators:
            if indicator in page_content_lower:
                logger.error(f"Novel {novel_id}: Login appears to have failed ('{indicator}' found)")
                update_status(novel_id, phase="login_failed", message=f"Login failed: {indicator}")
                return False

        # No failure indicator found — assume the login worked.
        logger.info(f"Novel {novel_id}: Login appears successful! ✅")
        update_status(novel_id, phase="logged_in", message="Login successful")
        return True

    except PlaywrightTimeout:
        logger.error(f"Novel {novel_id}: Login page timed out")
        return False
    except Exception as e:
        logger.error(f"Novel {novel_id}: Login error: {e}")
        return False
|
|
|
|
| |
| |
| |
async def extract_chapter_content(
    page: Page,
    novel_id: int,
    content_selector: str,
) -> Optional[Dict[str, str]]:
    """
    Pull clean chapter text out of the currently loaded page.

    The selector argument may contain several comma-separated CSS
    selectors; the first one that matches an element is used.

    Returns:
        {
            "title": "Chapter 123: The Battle",
            "content": "The full chapter text...",
            "url": "https://..."
        }
        or None if extraction failed.
    """
    try:
        candidates = [part.strip() for part in content_selector.split(",")]

        node = None
        matched = None
        for css in candidates:
            try:
                found = await page.query_selector(css)
            except Exception:
                continue
            if found:
                node = found
                matched = css
                break

        if node is None:
            logger.warning(f"Novel {novel_id}: No content element found with selectors: {content_selector}")
            return None

        raw_text = await node.inner_text()

        # Anything under 50 characters is almost certainly not a chapter.
        if not raw_text or len(raw_text.strip()) < 50:
            logger.warning(f"Novel {novel_id}: Content too short ({len(raw_text.strip())} chars)")
            return None

        body = clean_chapter_text(raw_text)
        heading = await extract_chapter_title(page)
        location = page.url

        logger.info(
            f"Novel {novel_id}: Extracted chapter "
            f"(title: '{heading}', {len(body)} chars, selector: {matched})"
        )

        return {
            "title": heading,
            "content": body,
            "url": location,
        }

    except Exception as e:
        logger.error(f"Novel {novel_id}: Content extraction error: {e}")
        return None
|
|
|
|
async def extract_chapter_title(page: Page) -> str:
    """
    Best-effort extraction of the chapter title from the current page.

    Walks a list of common title selectors, then falls back to the
    document <title>, and finally to a placeholder string.
    """
    candidate_selectors = (
        "h1.chapter-title",
        "h1.entry-title",
        ".chapter-title",
        "h1",
        "h2.chapter-title",
        ".text-center h1",
        ".reader-title",
        "#chapter-heading",
        ".chr-title",
        ".title-chapter",
    )

    for css in candidate_selectors:
        try:
            node = await page.query_selector(css)
            if not node:
                continue
            text = (await node.inner_text()).strip()
            # Reject empty matches and absurdly long "titles".
            if text and len(text) < 200:
                return text
        except Exception:
            continue

    # Fall back to the browser tab title, truncated to a sane length.
    try:
        tab_title = await page.title()
        if tab_title:
            return tab_title.strip()[:200]
    except Exception:
        pass

    return "Untitled Chapter"
|
|
|
|
def clean_chapter_text(raw_text: str) -> str:
    """
    Clean raw scraped chapter text.

    Steps:
    1. Remove common translator-credit / ad / junk lines.
    2. Collapse runs of spaces within lines.
    3. Trim each line, THEN collapse runs of blank lines.

    Fix over the previous version: lines were stripped AFTER the
    newline-collapse, so whitespace-only lines (e.g. "   \\n") survived
    the collapse and left 3+ consecutive newlines in the output.
    Stripping first makes the blank-line collapse actually apply.

    Args:
        raw_text: Text as returned by the page's inner_text().

    Returns:
        The cleaned text, or "" for empty/None input.
    """
    if not raw_text:
        return ""

    text = raw_text

    # Boilerplate commonly injected by aggregator sites (credits, ads, CTAs).
    junk_patterns = [
        r"(?i)translator[:\s]*.*?(?=\n|$)",
        r"(?i)editor[:\s]*.*?(?=\n|$)",
        r"(?i)please read at.*?(?=\n|$)",
        r"(?i)support us at.*?(?=\n|$)",
        r"(?i)read (?:the )?latest (?:chapter )?at.*?(?=\n|$)",
        r"(?i)visit.*?for (?:the )?latest.*?(?=\n|$)",
        r"(?i)if you find any errors.*?(?=\n|$)",
        r"(?i)join our discord.*?(?=\n|$)",
        r"(?i)patreon\.com.*?(?=\n|$)",
        r"(?i)ko-fi\.com.*?(?=\n|$)",
        r"(?i)advertisement",
        r"(?i)sponsored content",
    ]
    for pattern in junk_patterns:
        text = re.sub(pattern, "", text)

    # Collapse runs of spaces within lines.
    text = re.sub(r" {2,}", " ", text)

    # Trim every line BEFORE collapsing blank runs (see docstring).
    lines = [line.strip() for line in text.split("\n")]
    text = "\n".join(lines)

    # Collapse 3+ newlines (2+ blank lines) into a single blank line.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()
|
|
|
|
| |
| |
| |
async def click_next_chapter(
    page: Page,
    human: HumanSimulator,
    novel_id: int,
    next_button_selector: str,
) -> bool:
    """
    Find and click the "Next Chapter" button.

    Args:
        page: The Playwright page currently showing a chapter.
        human: Human-behavior simulator (mouse movement, delays).
        novel_id: Novel ID, for logging.
        next_button_selector: Comma-separated CSS selectors to try first;
            falls back to matching links by visible text.

    Returns True if navigation succeeded, False if no next chapter found.
    """
    try:
        # The configured selector may list several comma-separated options.
        selectors = [s.strip() for s in next_button_selector.split(",")]

        next_button = None
        used_selector = None

        for selector in selectors:
            try:
                next_button = await page.query_selector(selector)
                if next_button:
                    # Only accept an element the user could actually click.
                    is_visible = await next_button.is_visible()
                    is_enabled = await next_button.is_enabled()

                    if is_visible and is_enabled:
                        used_selector = selector
                        break
                    else:
                        next_button = None
            except Exception:
                continue

        if next_button is None:
            # Fallback: look for anchor links by their visible text.
            text_patterns = [
                "Next Chapter",
                "Next",
                "Next →",
                "Next >>",
                "→",
                ">>",
                "Continue Reading",
                "NEXT",
            ]

            for pattern in text_patterns:
                try:
                    next_button = await page.query_selector(f"a:has-text('{pattern}')")
                    if next_button:
                        is_visible = await next_button.is_visible()
                        if is_visible:
                            used_selector = f"text='{pattern}'"
                            break
                        next_button = None
                except Exception:
                    continue

        if next_button is None:
            logger.info(f"Novel {novel_id}: No 'Next Chapter' button found. Might be the last chapter.")
            return False

        # Move the simulated mouse toward the button before clicking.
        # Best-effort only — a missing bounding box is not fatal.
        try:
            box = await next_button.bounding_box()
            if box:
                center_x = int(box["x"] + box["width"] / 2)
                center_y = int(box["y"] + box["height"] / 2)
                await human.simulate_before_click(center_x, center_y)
        except Exception:
            pass

        # Remember where we were so we can tell whether the click navigated.
        current_url = page.url

        await next_button.click()
        logger.info(f"Novel {novel_id}: Clicked 'Next Chapter' ({used_selector})")

        try:
            await page.wait_for_load_state("domcontentloaded", timeout=20000)
        except PlaywrightTimeout:
            logger.warning(f"Novel {novel_id}: Page load timeout after clicking next")

        # Some sites navigate asynchronously; give the URL a moment to change.
        new_url = page.url
        if new_url == current_url:
            await asyncio.sleep(3)
            new_url = page.url

            if new_url == current_url:
                # Not treated as fatal — SPA-style readers may swap content
                # without changing the URL, so we still return True below.
                logger.warning(f"Novel {novel_id}: URL didn't change after clicking next")

        await human.simulate_page_arrival()

        return True

    except PlaywrightTimeout:
        logger.warning(f"Novel {novel_id}: Timeout while clicking next chapter")
        return False
    except Exception as e:
        logger.error(f"Novel {novel_id}: Error clicking next chapter: {e}")
        return False
|
|
|
|
| |
| |
| |
|
|
|
|
| |
| |
| |
def _translate_hindi(text: str) -> str:
    """
    Translate English text to Hindi, splitting long text into chunks.

    Returns the translated text, or "" when translation fails (the caller
    then stores the English version only).
    """
    try:
        gt = GoogleTranslator(source='en', target='hi')
        limit = 4000

        # Short enough to send in a single request.
        if len(text) <= limit:
            return gt.translate(text) or text

        # Build word-aligned chunks under the size limit, translating each
        # completed chunk as we go.
        translated, buffer = [], ""
        for word in text.split(' '):
            if len(buffer) + len(word) + 1 <= limit:
                buffer = f"{buffer} {word}" if buffer else word
            else:
                if buffer:
                    translated.append(gt.translate(buffer) or buffer)
                buffer = word
        if buffer:
            translated.append(gt.translate(buffer) or buffer)
        return " ".join(translated)
    except Exception as e:
        logger.warning(f"Translation failed: {e} — saving English only")
        return ""
|
|
async def scrape_novel(novel_id: int, novel_data: Dict[str, Any]):
    """
    Main scraping function for a single novel.
    This runs as an independent async task.

    Flow:
    1. Acquire semaphore slot
    2. Create browser context
    3. Login (if credentials provided)
    4. Navigate to starting URL
    5. Loop:
        a. Check for captcha/protection
        b. Extract chapter content
        c. Save to database
        d. Click next chapter
        e. Repeat
    6. Clean up browser context

    Args:
        novel_id: Primary key of the novel being scraped.
        novel_data: Dict with "url" (required) and optional "title",
            "login_email", "login_password", "next_button_selector",
            "content_selector".
    """
    url = novel_data["url"]
    title = novel_data.get("title", "Unknown Novel")
    email = novel_data.get("login_email")
    password = novel_data.get("login_password")
    # Defaults cover the most common reader-site markup.
    next_btn_selector = novel_data.get(
        "next_button_selector",
        "a.next_page, a[rel='next'], .next-chap, button.next-chapter"
    )
    content_sel = novel_data.get(
        "content_selector",
        ".chapter-content, .reading-content, #chapter-content, .text-left"
    )

    update_status(
        novel_id,
        title=title,
        phase="queued",
        message="Waiting for available browser slot...",
        chapters_scraped=0,
        current_url=url,
    )

    # Concurrency is bounded by the browser manager's semaphore: the task
    # queues here until a browser slot frees up.
    logger.info(f"Novel {novel_id} ({title}): Waiting for browser slot...")

    async with browser_manager.semaphore:
        logger.info(f"Novel {novel_id} ({title}): 🟢 Got browser slot! Starting...")

        context = None
        page = None
        session_factory = get_session_factory()

        try:
            update_status(novel_id, phase="initializing", message="Creating browser...")
            context, page = await browser_manager.create_context_for_novel(novel_id)
            human = HumanSimulator(page)

            # Persist each status transition so the DB and UI stay in sync.
            async with session_factory() as db:
                await update_novel_status(
                    db, novel_id, NovelStatus.LOGGING_IN,
                    current_url=url,
                )
                await db.commit()

            login_success = await attempt_login(
                page, human, novel_id, url, email, password
            )

            if not login_success:
                async with session_factory() as db:
                    await update_novel_status(
                        db, novel_id, NovelStatus.FAILED,
                        error_message="Login failed",
                    )
                    await db.commit()
                update_status(novel_id, phase="failed", message="Login failed!")
                return

            # ---- Navigate to the starting chapter ----
            update_status(novel_id, phase="navigating", message=f"Going to {url}")

            logger.info(f"Novel {novel_id}: Navigating to starting URL: {url}")
            await page.goto(url, wait_until="domcontentloaded")
            await human.simulate_page_arrival()

            async with session_factory() as db:
                await update_novel_status(
                    db, novel_id, NovelStatus.SCRAPING,
                    current_url=url,
                )
                await db.commit()

            # Resume numbering after the last chapter already stored.
            async with session_factory() as db:
                chapter_number = await get_last_chapter_number(db, novel_id)

            chapter_number += 1
            consecutive_failures = 0
            max_consecutive_failures = 5

            logger.info(f"Novel {novel_id}: Starting scraping loop from chapter {chapter_number}")

            while True:
                update_status(
                    novel_id,
                    phase="scraping",
                    message=f"Processing chapter {chapter_number}...",
                    current_url=page.url,
                    chapters_scraped=chapter_number - 1,
                )

                # ---- Bot-protection / captcha check ----
                is_blocked, block_reason = await captcha_detector.check_for_protection(
                    page, novel_id, content_sel
                )

                if is_blocked:
                    # Cloudflare challenges sometimes clear on their own;
                    # wait briefly before escalating to a human.
                    if "cloudflare" in block_reason.lower():
                        auto_resolved = await captcha_detector.wait_for_cloudflare_auto_resolve(
                            page, novel_id, max_wait=15
                        )
                        if auto_resolved:
                            is_blocked = False

                if is_blocked:
                    update_status(
                        novel_id,
                        phase="captcha_detected",
                        message=f"⚠️ Protection detected: {block_reason}",
                    )

                    # Screenshot so the operator can see what needs solving.
                    screenshot_file = await captcha_detector.handle_protection_detected(
                        page, novel_id, block_reason, browser_manager
                    )

                    async with session_factory() as db:
                        await update_novel_status(
                            db, novel_id, NovelStatus.PAUSED_CAPTCHA,
                            error_message=block_reason,
                            screenshot_path=screenshot_file,
                            needs_intervention=True,
                        )
                        await db.commit()

                    update_status(
                        novel_id,
                        phase="waiting_intervention",
                        message="Waiting for you to solve the captcha...",
                        screenshot=screenshot_file,
                    )

                    # Block until a human marks the captcha solved (or time out).
                    intervention_success = await captcha_detector.wait_for_intervention(
                        novel_id, timeout_minutes=30
                    )

                    if not intervention_success:
                        logger.error(f"Novel {novel_id}: Intervention timeout, stopping.")
                        async with session_factory() as db:
                            await update_novel_status(
                                db, novel_id, NovelStatus.FAILED,
                                error_message="Captcha intervention timeout",
                            )
                            await db.commit()
                        update_status(novel_id, phase="failed", message="Captcha timeout!")
                        return

                    async with session_factory() as db:
                        await update_novel_status(
                            db, novel_id, NovelStatus.SCRAPING,
                            needs_intervention=False,
                        )
                        await db.commit()

                    await human.medium_delay("post-intervention")
                    # Restart the loop body so protection is re-checked
                    # before attempting extraction again.
                    continue

                # ---- Extract the chapter from the current page ----
                chapter_data = await extract_chapter_content(
                    page, novel_id, content_sel
                )

                if chapter_data is None:
                    consecutive_failures += 1
                    logger.warning(
                        f"Novel {novel_id}: Failed to extract chapter {chapter_number} "
                        f"(failure {consecutive_failures}/{max_consecutive_failures})"
                    )

                    if consecutive_failures >= max_consecutive_failures:
                        logger.error(
                            f"Novel {novel_id}: Too many consecutive failures, stopping."
                        )
                        async with session_factory() as db:
                            await update_novel_status(
                                db, novel_id, NovelStatus.FAILED,
                                error_message=f"Too many extraction failures at chapter {chapter_number}",
                            )
                            await db.commit()
                        update_status(
                            novel_id, phase="failed",
                            message=f"Failed after {max_consecutive_failures} consecutive extraction failures"
                        )
                        return

                    # Skip this page; note chapter_number is NOT advanced, so
                    # the next extractable page is saved under the same number.
                    has_next = await click_next_chapter(
                        page, human, novel_id, next_btn_selector
                    )
                    if not has_next:
                        break
                    continue

                # A successful extraction resets the failure streak.
                consecutive_failures = 0

                # ---- Translate + save to database ----
                try:
                    async with session_factory() as db:
                        already_exists = await chapter_exists(
                            db, novel_id, chapter_number
                        )

                        if not already_exists:
                            # Translation is blocking network I/O (via
                            # deep_translator), so run it in a worker thread
                            # to keep the event loop responsive.
                            loop = asyncio.get_event_loop()
                            hindi_content = await loop.run_in_executor(
                                None, _translate_hindi, chapter_data["content"]
                            )
                            hindi_title = await loop.run_in_executor(
                                None, _translate_hindi, chapter_data["title"]
                            )
                            logger.info(
                                f"Novel {novel_id}: 🇮🇳 Translated Ch {chapter_number} to Hindi"
                            )

                            await save_chapter(
                                db,
                                novel_id=novel_id,
                                chapter_number=chapter_number,
                                content=chapter_data["content"],
                                title=chapter_data["title"],
                                url=chapter_data["url"],
                                content_hindi=hindi_content or None,
                                title_hindi=hindi_title or None,
                            )
                            await increment_chapter_count(db, novel_id)
                            await db.commit()

                            logger.info(
                                f"Novel {novel_id}: ✅ Saved Chapter {chapter_number} "
                                f"- '{chapter_data['title']}'"
                            )
                        else:
                            logger.info(
                                f"Novel {novel_id}: Chapter {chapter_number} already exists, skipping save"
                            )

                except IntegrityError:
                    # Unique-constraint race: another writer saved it first.
                    logger.warning(
                        f"Novel {novel_id}: Chapter {chapter_number} duplicate, skipping"
                    )
                except Exception as e:
                    # DB errors are logged but don't abort the scrape.
                    logger.error(
                        f"Novel {novel_id}: Database error saving chapter {chapter_number}: {e}"
                    )

                update_status(
                    novel_id,
                    phase="scraping",
                    message=f"✅ Chapter {chapter_number} saved! Moving to next...",
                    chapters_scraped=chapter_number,
                    last_chapter_title=chapter_data["title"],
                )

                # Pretend to read the chapter (anti-bot pacing).
                await human.simulate_reading_chapter()

                has_next = await click_next_chapter(
                    page, human, novel_id, next_btn_selector
                )

                if not has_next:
                    logger.info(
                        f"Novel {novel_id}: No next chapter found after chapter {chapter_number}. "
                        f"Novel might be complete! 🎉"
                    )
                    break

                chapter_number += 1

                # Hard safety cap against infinite next-chapter loops.
                if chapter_number > 10000:
                    logger.warning(f"Novel {novel_id}: Hit 10000 chapter limit, stopping")
                    break

            # ---- Loop ended normally: mark the novel complete ----
            async with session_factory() as db:
                await update_novel_status(
                    db, novel_id, NovelStatus.COMPLETED,
                )
                await db.commit()

            update_status(
                novel_id,
                phase="completed",
                message=f"🎉 Completed! {chapter_number - 1} chapters scraped.",
                chapters_scraped=chapter_number - 1,
            )
            logger.info(
                f"Novel {novel_id} ({title}): 🎉 COMPLETED! "
                f"{chapter_number - 1} chapters scraped."
            )

        except PlaywrightError as e:
            # Browser-level failures (crash, closed context) pause the novel
            # so it can be resumed later rather than marking it failed.
            error_msg = f"Browser error: {str(e)}"
            logger.error(f"Novel {novel_id}: {error_msg}")

            async with session_factory() as db:
                await update_novel_status(
                    db, novel_id, NovelStatus.PAUSED_ERROR,
                    error_message=error_msg,
                )
                await db.commit()

            update_status(novel_id, phase="error", message=error_msg)

        except Exception as e:
            error_msg = f"Unexpected error: {str(e)}"
            logger.error(f"Novel {novel_id}: {error_msg}", exc_info=True)

            # Best-effort status save — the DB itself may be what failed.
            try:
                async with session_factory() as db:
                    await update_novel_status(
                        db, novel_id, NovelStatus.PAUSED_ERROR,
                        error_message=error_msg,
                    )
                    await db.commit()
            except Exception:
                pass

            update_status(novel_id, phase="error", message=error_msg)

        finally:
            # Always release the browser context and clear any pending
            # intervention flag, regardless of how the task ended
            # (including cancellation).
            await browser_manager.close_context(novel_id)
            captcha_detector.clear_intervention(novel_id)
            logger.info(f"Novel {novel_id}: Browser context cleaned up.")
|
|
|
|
| |
| |
| |
| |
# Registry of scraper tasks keyed by novel ID; used to prevent duplicate
# starts and to cancel tasks on stop requests.
active_tasks: Dict[int, asyncio.Task] = {}
|
|
|
|
async def start_scraping_novel(novel_id: int, novel_data: Dict[str, Any]):
    """
    Start scraping a novel as a background task.

    Fix: the done-callback now removes the finished task from
    `active_tasks` so entries no longer accumulate for the lifetime of
    the process.

    Args:
        novel_id: Primary key of the novel to scrape.
        novel_data: Passed through to scrape_novel (must contain "url").

    Returns:
        True if a new task was started, False if one is already running.
    """
    if novel_id in active_tasks:
        task = active_tasks[novel_id]
        if not task.done():
            logger.warning(f"Novel {novel_id}: Already scraping, ignoring duplicate start")
            return False

    task = asyncio.create_task(
        scrape_novel(novel_id, novel_data),
        name=f"scraper-novel-{novel_id}",
    )
    active_tasks[novel_id] = task

    def task_done_callback(t: asyncio.Task):
        # Drop the registry entry so finished tasks don't pile up —
        # but only if it still points at THIS task (a restart may have
        # replaced it already).
        if active_tasks.get(novel_id) is t:
            active_tasks.pop(novel_id, None)
        try:
            exception = t.exception()
            if exception:
                logger.error(
                    f"Novel {novel_id} task failed with exception: {exception}"
                )
        except asyncio.CancelledError:
            # task.exception() raises CancelledError for cancelled tasks.
            logger.info(f"Novel {novel_id} task was cancelled")

    task.add_done_callback(task_done_callback)

    logger.info(f"Novel {novel_id}: Scraping task started! 🚀")
    return True
|
|
|
|
async def stop_scraping_novel(novel_id: int) -> bool:
    """
    Stop scraping a specific novel.

    Cancels the running task, waits for it to unwind, closes its browser
    context, and removes the registry entry (previously stale entries
    were left in `active_tasks` forever).

    Returns:
        True if a running task was stopped, False if there was no task
        or it had already finished.
    """
    task = active_tasks.get(novel_id)
    if task is None:
        return False

    if task.done():
        # Stale entry from an already-finished task — just drop it.
        active_tasks.pop(novel_id, None)
        return False

    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        pass

    active_tasks.pop(novel_id, None)

    # Close the browser context (the task's own finally block also does
    # this, matching the original double-call behavior).
    await browser_manager.close_context(novel_id)

    logger.info(f"Novel {novel_id}: Scraping stopped ✋")

    update_status(novel_id, phase="stopped", message="Scraping stopped by user")
    return True
|
|
|
|
async def stop_all_scraping():
    """Stop all active scraping tasks."""
    # Snapshot the IDs first: stopping a task can mutate the registry.
    pending_ids = list(active_tasks.keys())
    for nid in pending_ids:
        await stop_scraping_novel(nid)
    logger.info("All scraping tasks stopped.")
|
|
|
|
def get_active_task_ids() -> list:
    """Get IDs of all novels currently being scraped."""
    running = []
    for nid, task in active_tasks.items():
        if not task.done():
            running.append(nid)
    return running