Translaterpeed

Sleeping

App Files Files Community

Ruhivig65 committed on Mar 10

Commit

43ec02a

verified ·

1 Parent(s): 237370b

Upload 4 files

Browse files

Files changed (4) hide show

app/api/__init__.py +1 -0
app/api/routes_download.py +178 -0
app/api/routes_intervention.py +257 -0
app/api/routes_scraper.py +341 -0

app/api/__init__.py CHANGED Viewed

	@@ -0,0 +1 @@


1	+ # API Routes Package

app/api/routes_download.py ADDED Viewed

	@@ -0,0 +1,178 @@

+"""
+============================================
+Download Routes
+- Download complete novel as .txt file
+- Get chapter info/counts
+============================================
+"""
+import io
+import logging
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import StreamingResponse
+from sqlalchemy.ext.asyncio import AsyncSession
+from app.database.connection import get_db_session
+from app.database.crud import (
+    get_novel_by_id,
+    get_chapters_for_novel,
+    get_chapter_count,
+    get_total_word_count,
+)
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+router = APIRouter(prefix="/api/download", tags=["Download"])  # every route below is served under /api/download
+@router.get("/{novel_id}/info")
+async def get_download_info(
+    novel_id: int,
+    db: AsyncSession = Depends(get_db_session),
+):
+    """
+    Get info about downloadable content for a novel.
+    Shows chapter count, word count, etc.
+    """
+    novel = await get_novel_by_id(db, novel_id)
+    if not novel:
+        raise HTTPException(status_code=404, detail="Novel not found")
+    chapter_count = await get_chapter_count(db, novel_id)
+    word_count = await get_total_word_count(db, novel_id)
+    return {
+        "novel_id": novel_id,
+        "title": novel.title,
+        "status": novel.status.value,
+        "chapter_count": chapter_count,
+        "word_count": word_count,
+        "estimated_pages": word_count // 250 if word_count else 0,
+        "downloadable": chapter_count > 0,
+    }
+@router.get("/{novel_id}/txt")
+async def download_novel_txt(
+    novel_id: int,
+    db: AsyncSession = Depends(get_db_session),
+):
+    """
+    Download the complete novel as a .txt file.
+    Format:
+    ========================================
+    NOVEL TITLE
+    ========================================
+    --- Chapter 1: Title ---
+    Chapter content...
+    --- Chapter 2: Title ---
+    Chapter content...
+    """
+    novel = await get_novel_by_id(db, novel_id)
+    if not novel:
+        raise HTTPException(status_code=404, detail="Novel not found")
+    chapters = await get_chapters_for_novel(db, novel_id)
+    if not chapters:
+        raise HTTPException(
+            status_code=404,
+            detail="No chapters found for this novel. Scrape some chapters first!",
+        )
+    # --- Build the text file content ---
+    lines = []
+    # Header
+    lines.append("=" * 60)
+    lines.append(f"  {novel.title}")
+    lines.append("=" * 60)
+    lines.append(f"  Source: {novel.url}")
+    lines.append(f"  Chapters: {len(chapters)}")
+    total_words = sum(ch.word_count for ch in chapters)
+    lines.append(f"  Total Words: {total_words:,}")
+    lines.append(f"  Generated by Novel Scraper Pro")
+    lines.append("=" * 60)
+    lines.append("")
+    lines.append("")
+    # Chapters
+    for chapter in chapters:
+        lines.append("-" * 50)
+        lines.append(f"  Chapter {chapter.chapter_number}: {chapter.title}")
+        lines.append("-" * 50)
+        lines.append("")
+        lines.append(chapter.content)
+        lines.append("")
+        lines.append("")
+    # Footer
+    lines.append("=" * 60)
+    lines.append("  END OF NOVEL")
+    lines.append("=" * 60)
+    # --- Create the file as a stream ---
+    content = "\n".join(lines)
+    # Encode to bytes
+    file_bytes = content.encode("utf-8")
+    # Create a streaming response
+    buffer = io.BytesIO(file_bytes)
+    # Clean filename
+    safe_title = "".join(
+        c for c in novel.title if c.isalnum() or c in (' ', '-', '_')
+    ).strip()
+    safe_title = safe_title[:100]  # Max filename length
+    filename = f"{safe_title}.txt"
+    logger.info(
+        f"Download: Novel {novel_id} '{novel.title}' - "
+        f"{len(chapters)} chapters, {total_words:,} words"
+    )
+    return StreamingResponse(
+        buffer,
+        media_type="text/plain; charset=utf-8",
+        headers={
+            "Content-Disposition": f'attachment; filename="{filename}"',
+            "Content-Length": str(len(file_bytes)),
+        },
+    )
+@router.get("/{novel_id}/chapters")
+async def list_chapters(
+    novel_id: int,
+    db: AsyncSession = Depends(get_db_session),
+):
+    """
+    List all saved chapters for a novel (without full content).
+    Useful for the UI to show progress.
+    """
+    novel = await get_novel_by_id(db, novel_id)
+    if not novel:
+        raise HTTPException(status_code=404, detail="Novel not found")
+    chapters = await get_chapters_for_novel(db, novel_id)
+    return {
+        "novel_id": novel_id,
+        "title": novel.title,
+        "total_chapters": len(chapters),
+        "chapters": [
+            {
+                "number": ch.chapter_number,
+                "title": ch.title,
+                "word_count": ch.word_count,
+                "url": ch.url,
+                "scraped_at": ch.scraped_at.isoformat() if ch.scraped_at else None,
+            }
+            for ch in chapters
+        ],
+    }

app/api/routes_intervention.py ADDED Viewed

	@@ -0,0 +1,257 @@

+"""
+============================================
+Manual Intervention Routes
+- View captcha screenshots
+- Click on captcha remotely
+- Type text into fields remotely
+- Mark intervention as resolved
+============================================
+"""
+import os
+import logging
+from typing import Optional
+from fastapi import APIRouter, HTTPException, status
+from fastapi.responses import FileResponse
+from pydantic import BaseModel, Field
+from app.scraper.browser_manager import browser_manager
+from app.scraper.captcha_detector import captcha_detector
+from app.scraper.scraper_engine import scraper_status, update_status
+from app.config import settings
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+router = APIRouter(prefix="/api/intervention", tags=["Intervention"])  # every route below is served under /api/intervention
+# ============================================
+# Request Models
+# ============================================
+class ClickRequest(BaseModel):
+    """Request to click at coordinates on a novel's page."""
+    novel_id: int = Field(..., examples=[1])
+    x: int = Field(..., ge=0, examples=[500])
+    y: int = Field(..., ge=0, examples=[300])
+class TypeRequest(BaseModel):
+    """Request to type text into a field on a novel's page."""
+    novel_id: int = Field(..., examples=[1])
+    selector: str = Field(..., examples=["input#captcha-input"])
+    text: str = Field(..., examples=["abc123"])
+class ResolveRequest(BaseModel):
+    """Request to mark an intervention as resolved."""
+    novel_id: int = Field(..., examples=[1])
+class RefreshScreenshotRequest(BaseModel):
+    """Request a fresh screenshot."""
+    novel_id: int = Field(..., examples=[1])
+# ============================================
+# Routes
+# ============================================
+@router.get("/active")
+async def get_active_interventions():
+    """
+    Get all novels currently waiting for manual intervention.
+    The frontend polls this to show captcha alerts.
+    """
+    interventions = captcha_detector.get_all_interventions()
+    result = {}
+    for novel_id, info in interventions.items():
+        result[novel_id] = {
+            "novel_id": novel_id,
+            "screenshot": info.get("screenshot", ""),
+            "reason": info.get("reason", "Unknown"),
+            "page_url": info.get("page_url", ""),
+            "timestamp": info.get("timestamp", 0),
+            "waiting": info.get("waiting", True),
+        }
+    return {
+        "count": len(result),
+        "interventions": result,
+    }
+@router.get("/screenshot/{filename}")
+async def get_screenshot(filename: str):
+    """
+    Serve a captcha screenshot image.
+    The frontend displays this so the user can see the captcha.
+    """
+    # Security: prevent directory traversal
+    safe_filename = os.path.basename(filename)
+    filepath = os.path.join(settings.SCREENSHOTS_DIR, safe_filename)
+    if not os.path.exists(filepath):
+        raise HTTPException(
+            status_code=404,
+            detail=f"Screenshot not found: {safe_filename}",
+        )
+    return FileResponse(
+        filepath,
+        media_type="image/png",
+        filename=safe_filename,
+    )
+@router.post("/click")
+async def remote_click(request: ClickRequest):
+    """
+    Click at specific coordinates on a novel's browser page.
+    How it works:
+    1. User sees the screenshot in the UI
+    2. User clicks on the captcha in the screenshot
+    3. Frontend sends the click coordinates here
+    4. Backend performs the actual click on the headless browser
+    """
+    novel_id = request.novel_id
+    # Verify the novel has an active page
+    page = browser_manager.get_page(novel_id)
+    if page is None or page.is_closed():
+        raise HTTPException(
+            status_code=404,
+            detail=f"No active browser page for Novel {novel_id}",
+        )
+    try:
+        await browser_manager.click_at_coordinates(novel_id, request.x, request.y)
+        logger.info(f"Remote click at ({request.x}, {request.y}) for Novel {novel_id}")
+        # Take a new screenshot after clicking to show the result
+        import time
+        new_filename = f"novel_{novel_id}_after_click_{int(time.time())}.png"
+        new_screenshot = await browser_manager.take_screenshot(novel_id, new_filename)
+        return {
+            "message": f"Clicked at ({request.x}, {request.y})",
+            "new_screenshot": new_filename if new_screenshot else None,
+        }
+    except Exception as e:
+        logger.error(f"Remote click failed for Novel {novel_id}: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Click failed: {str(e)}",
+        )
+@router.post("/type")
+async def remote_type(request: TypeRequest):
+    """
+    Type text into a field on a novel's browser page.
+    Useful for text-based captchas.
+    """
+    novel_id = request.novel_id
+    page = browser_manager.get_page(novel_id)
+    if page is None or page.is_closed():
+        raise HTTPException(
+            status_code=404,
+            detail=f"No active browser page for Novel {novel_id}",
+        )
+    try:
+        await browser_manager.type_text(novel_id, request.selector, request.text)
+        logger.info(
+            f"Remote type into '{request.selector}' for Novel {novel_id}"
+        )
+        return {
+            "message": f"Typed '{request.text}' into '{request.selector}'",
+        }
+    except Exception as e:
+        logger.error(f"Remote type failed for Novel {novel_id}: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Type failed: {str(e)}",
+        )
+@router.post("/resolve")
+async def resolve_intervention(request: ResolveRequest):
+    """
+    Mark a captcha intervention as resolved.
+    Call this after:
+    1. You've clicked on the captcha via /click
+    2. The captcha appears to be solved
+    3. You want the scraper to continue
+    """
+    novel_id = request.novel_id
+    intervention = captcha_detector.get_intervention_status(novel_id)
+    if not intervention:
+        raise HTTPException(
+            status_code=404,
+            detail=f"No active intervention for Novel {novel_id}",
+        )
+    captcha_detector.mark_intervention_complete(novel_id)
+    update_status(
+        novel_id,
+        phase="resuming",
+        message="Intervention resolved! Resuming scraping...",
+    )
+    logger.info(f"Intervention resolved for Novel {novel_id} ✅")
+    return {
+        "message": f"Intervention for Novel {novel_id} marked as resolved",
+        "novel_id": novel_id,
+    }
+@router.post("/refresh-screenshot")
+async def refresh_screenshot(request: RefreshScreenshotRequest):
+    """
+    Take a fresh screenshot of the novel's current page.
+    Use this to see the current state after clicking.
+    """
+    novel_id = request.novel_id
+    page = browser_manager.get_page(novel_id)
+    if page is None or page.is_closed():
+        raise HTTPException(
+            status_code=404,
+            detail=f"No active browser page for Novel {novel_id}",
+        )
+    try:
+        import time
+        filename = f"novel_{novel_id}_refresh_{int(time.time())}.png"
+        screenshot_path = await browser_manager.take_screenshot(novel_id, filename)
+        if screenshot_path is None:
+            raise HTTPException(
+                status_code=500,
+                detail="Failed to take screenshot",
+            )
+        return {
+            "screenshot": filename,
+            "page_url": page.url,
+            "page_title": await page.title(),
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Refresh screenshot failed for Novel {novel_id}: {e}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Screenshot failed: {str(e)}",
+        )

app/api/routes_scraper.py ADDED Viewed

	@@ -0,0 +1,341 @@

+"""
+============================================
+Scraper API Routes
+- Add novels
+- Start/Stop scraping
+- Get live status
+- Delete novels
+============================================
+"""
+import logging
+from typing import Optional, List
+from fastapi import APIRouter, Depends, HTTPException, status
+from pydantic import BaseModel, Field, HttpUrl
+from sqlalchemy.ext.asyncio import AsyncSession
+from app.database.connection import get_db_session
+from app.database.crud import (
+    create_novel,
+    get_all_novels,
+    get_novel_by_id,
+    delete_novel,
+    update_novel_status,
+)
+from app.database.models import NovelStatus
+from app.scraper.scraper_engine import (
+    start_scraping_novel,
+    stop_scraping_novel,
+    stop_all_scraping,
+    scraper_status,
+    get_active_task_ids,
+)
+from app.scraper.browser_manager import browser_manager
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+router = APIRouter(prefix="/api", tags=["Scraper"])  # every route below is served under /api
+# ============================================
+# Request/Response Models (Pydantic)
+# ============================================
+class NovelAddRequest(BaseModel):
+    """Request body for adding a new novel."""
+    title: str = Field(..., min_length=1, max_length=500, examples=["My Novel"])
+    url: str = Field(..., min_length=10, examples=["https://example.com/novel/chapter-1"])
+    login_email: Optional[str] = Field(None, examples=["user@email.com"])
+    login_password: Optional[str] = Field(None, examples=["password123"])
+    next_button_selector: Optional[str] = Field(
+        None,
+        examples=["a.next_page, a[rel='next'], .next-chap"]
+    )
+    content_selector: Optional[str] = Field(
+        None,
+        examples=[".chapter-content, .reading-content, #chapter-content"]
+    )
+class NovelResponse(BaseModel):
+    """Response model for a novel."""
+    id: int
+    title: str
+    url: str
+    status: str
+    chapters_scraped: int
+    last_error: Optional[str] = None
+    needs_intervention: bool = False
+    screenshot_path: Optional[str] = None
+    current_url: Optional[str] = None
+    class Config:
+        from_attributes = True
+class BatchAddRequest(BaseModel):
+    """Request body for adding multiple novels at once."""
+    novels: List[NovelAddRequest] = Field(..., min_length=1, max_length=15)
+    shared_email: Optional[str] = None
+    shared_password: Optional[str] = None
+class StartScrapeRequest(BaseModel):
+    """Request body for starting scraping."""
+    novel_ids: Optional[List[int]] = Field(
+        None,
+        description="Specific novel IDs to scrape. If None, scrape all queued."
+    )
+class StatusResponse(BaseModel):
+    """Live status of all scraping activities."""
+    active_browsers: int
+    max_browsers: int
+    browser_initialized: bool
+    novels: dict
+    active_task_ids: list
+# ============================================
+# Routes
+# ============================================
+@router.post("/novels", response_model=NovelResponse, status_code=status.HTTP_201_CREATED)
+async def add_novel(
+    request: NovelAddRequest,
+    db: AsyncSession = Depends(get_db_session),
+):
+    """
+    Add a single novel to the database.
+    It will be in 'queued' status until scraping starts.
+    """
+    try:
+        novel = await create_novel(
+            db,
+            title=request.title,
+            url=request.url,
+            login_email=request.login_email,
+            login_password=request.login_password,
+            next_button_selector=request.next_button_selector,
+            content_selector=request.content_selector,
+        )
+        await db.commit()
+        logger.info(f"Novel added: {novel.title} (ID: {novel.id})")
+        return NovelResponse(
+            id=novel.id,
+            title=novel.title,
+            url=novel.url,
+            status=novel.status.value,
+            chapters_scraped=novel.chapters_scraped,
+            last_error=novel.last_error,
+            needs_intervention=novel.needs_intervention,
+            screenshot_path=novel.screenshot_path,
+            current_url=novel.current_url,
+        )
+    except Exception as e:
+        logger.error(f"Error adding novel: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to add novel: {str(e)}",
+        )
+@router.post("/novels/batch", status_code=status.HTTP_201_CREATED)
+async def add_novels_batch(
+    request: BatchAddRequest,
+    db: AsyncSession = Depends(get_db_session),
+):
+    """
+    Add multiple novels at once.
+    Optionally share login credentials across all novels.
+    """
+    added_novels = []
+    errors = []
+    for novel_req in request.novels:
+        try:
+            email = novel_req.login_email or request.shared_email
+            password = novel_req.login_password or request.shared_password
+            novel = await create_novel(
+                db,
+                title=novel_req.title,
+                url=novel_req.url,
+                login_email=email,
+                login_password=password,
+                next_button_selector=novel_req.next_button_selector,
+                content_selector=novel_req.content_selector,
+            )
+            added_novels.append({
+                "id": novel.id,
+                "title": novel.title,
+                "status": "queued",
+            })
+        except Exception as e:
+            errors.append({
+                "title": novel_req.title,
+                "error": str(e),
+            })
+    await db.commit()
+    return {
+        "added": added_novels,
+        "errors": errors,
+        "total_added": len(added_novels),
+        "total_errors": len(errors),
+    }
+@router.get("/novels", response_model=List[NovelResponse])
+async def list_novels(db: AsyncSession = Depends(get_db_session)):
+    """Get all novels with their current status."""
+    novels = await get_all_novels(db)
+    return [
+        NovelResponse(
+            id=n.id,
+            title=n.title,
+            url=n.url,
+            status=n.status.value,
+            chapters_scraped=n.chapters_scraped,
+            last_error=n.last_error,
+            needs_intervention=n.needs_intervention,
+            screenshot_path=n.screenshot_path,
+            current_url=n.current_url,
+        )
+        for n in novels
+    ]
+@router.get("/novels/{novel_id}", response_model=NovelResponse)
+async def get_novel(
+    novel_id: int,
+    db: AsyncSession = Depends(get_db_session),
+):
+    """Get a specific novel by ID."""
+    novel = await get_novel_by_id(db, novel_id)
+    if not novel:
+        raise HTTPException(status_code=404, detail="Novel not found")
+    return NovelResponse(
+        id=novel.id,
+        title=novel.title,
+        url=novel.url,
+        status=novel.status.value,
+        chapters_scraped=novel.chapters_scraped,
+        last_error=novel.last_error,
+        needs_intervention=novel.needs_intervention,
+        screenshot_path=novel.screenshot_path,
+        current_url=novel.current_url,
+    )
+@router.delete("/novels/{novel_id}")
+async def remove_novel(
+    novel_id: int,
+    db: AsyncSession = Depends(get_db_session),
+):
+    """Delete a novel and all its chapters."""
+    # Stop scraping first if active
+    await stop_scraping_novel(novel_id)
+    deleted = await delete_novel(db, novel_id)
+    if not deleted:
+        raise HTTPException(status_code=404, detail="Novel not found")
+    await db.commit()
+    return {"message": f"Novel {novel_id} deleted successfully"}
+@router.post("/scrape/start")
+async def start_scraping(
+    request: StartScrapeRequest = StartScrapeRequest(),
+    db: AsyncSession = Depends(get_db_session),
+):
+    """
+    Start scraping novels.
+    If novel_ids provided, scrape only those.
+    If not, scrape all novels in 'queued' status.
+    """
+    if not browser_manager.is_initialized:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Browser not initialized yet. Please wait and try again.",
+        )
+    novels = await get_all_novels(db)
+    if request.novel_ids:
+        novels_to_scrape = [n for n in novels if n.id in request.novel_ids]
+    else:
+        novels_to_scrape = [
+            n for n in novels
+            if n.status in [NovelStatus.QUEUED, NovelStatus.PAUSED_ERROR]
+        ]
+    if not novels_to_scrape:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="No novels found to scrape.",
+        )
+    started = []
+    for novel in novels_to_scrape:
+        novel_data = {
+            "url": novel.url,
+            "title": novel.title,
+            "login_email": novel.login_email,
+            "login_password": novel.login_password,
+            "next_button_selector": novel.next_button_selector,
+            "content_selector": novel.content_selector,
+        }
+        success = await start_scraping_novel(novel.id, novel_data)
+        if success:
+            started.append({"id": novel.id, "title": novel.title})
+    return {
+        "message": f"Started scraping {len(started)} novel(s)",
+        "started": started,
+        "max_concurrent": browser_manager.semaphore._value,
+    }
+@router.post("/scrape/stop/{novel_id}")
+async def stop_one_scraping(novel_id: int):
+    """Stop scraping a specific novel."""
+    stopped = await stop_scraping_novel(novel_id)
+    if stopped:
+        return {"message": f"Novel {novel_id} scraping stopped"}
+    else:
+        raise HTTPException(
+            status_code=404,
+            detail=f"Novel {novel_id} is not actively scraping",
+        )
+@router.post("/scrape/stop-all")
+async def stop_all():
+    """Stop all active scraping tasks."""
+    await stop_all_scraping()
+    return {"message": "All scraping tasks stopped"}
+@router.get("/status", response_model=StatusResponse)
+async def get_live_status():
+    """
+    Get real-time status of all scraping activities.
+    This endpoint is polled by the frontend every few seconds.
+    """
+    return StatusResponse(
+        active_browsers=browser_manager.active_count,
+        max_browsers=settings.MAX_CONCURRENT_BROWSERS,
+        browser_initialized=browser_manager.is_initialized,
+        novels=scraper_status,
+        active_task_ids=get_active_task_ids(),
+    )
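+# NOTE: `settings` is imported here at the bottom of the module; get_live_status() above reads it at request time, after this import has run.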
+from app.config import settings