# app/api/routes_scraper.py
"""
============================================
Scraper API Routes
- Add novels
- Start/Stop scraping
- Get live status
- Delete novels
============================================
"""
import logging
from typing import List, Optional

from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, ConfigDict, Field, HttpUrl
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import settings
from app.database.connection import get_db_session
from app.database.crud import (
    create_novel,
    get_all_novels,
    get_novel_by_id,
    delete_novel,
    update_novel_status,
)
from app.database.models import NovelStatus
from app.scraper.browser_manager import browser_manager
from app.scraper.scraper_engine import (
    start_scraping_novel,
    stop_scraping_novel,
    stop_all_scraping,
    scraper_status,
    get_active_task_ids,
)
# Module-level logger named after this module so output is filterable.
logger = logging.getLogger(__name__)
# Every route below is mounted under /api and grouped under the "Scraper"
# tag in the generated OpenAPI docs.
router = APIRouter(prefix="/api", tags=["Scraper"])
# ============================================
# Request/Response Models (Pydantic)
# ============================================
class NovelAddRequest(BaseModel):
    """Request body for adding a new novel.

    Only ``title`` and ``url`` are required; credentials and CSS selectors
    are optional per-novel overrides passed through to the scraper engine.
    """
    # Human-readable name for the novel.
    title: str = Field(..., min_length=1, max_length=500, examples=["My Novel"])
    # Starting chapter URL; only its length is validated here.
    url: str = Field(..., min_length=10, examples=["https://example.com/novel/chapter-1"])
    # Optional site credentials for sites that require login.
    login_email: Optional[str] = Field(None, examples=["user@email.com"])
    login_password: Optional[str] = Field(None, examples=["password123"])
    # CSS selector(s) for the "next chapter" link; presumably the scraper
    # falls back to built-in defaults when None — verify in scraper_engine.
    next_button_selector: Optional[str] = Field(
        None,
        examples=["a.next_page, a[rel='next'], .next-chap"]
    )
    # CSS selector(s) for the element containing the chapter text.
    content_selector: Optional[str] = Field(
        None,
        examples=[".chapter-content, .reading-content, #chapter-content"]
    )
class NovelResponse(BaseModel):
    """Response model for a novel.

    ``from_attributes=True`` allows the model to be populated directly
    from the SQLAlchemy ORM object's attributes.
    """
    # Pydantic v2 configuration; replaces the deprecated class-based
    # `class Config` (this file already uses v2-only features such as
    # `from_attributes` and `examples`).
    model_config = ConfigDict(from_attributes=True)

    id: int
    title: str
    url: str
    # Serialized NovelStatus enum value, e.g. "queued".
    status: str
    chapters_scraped: int
    last_error: Optional[str] = None
    needs_intervention: bool = False
    screenshot_path: Optional[str] = None
    current_url: Optional[str] = None
class BatchAddRequest(BaseModel):
    """Request body for adding multiple novels at once.

    ``shared_email``/``shared_password`` are applied to any novel in the
    batch that does not provide its own credentials.
    """
    # Between 1 and 15 novels per batch.
    novels: List[NovelAddRequest] = Field(..., min_length=1, max_length=15)
    # Fallback credentials used when a novel omits its own.
    shared_email: Optional[str] = None
    shared_password: Optional[str] = None
class StartScrapeRequest(BaseModel):
    """Request body for starting scraping."""
    # When None, all novels in 'queued' (or error-paused) status are scraped.
    novel_ids: Optional[List[int]] = Field(
        None,
        description="Specific novel IDs to scrape. If None, scrape all queued."
    )
class StatusResponse(BaseModel):
    """Live status of all scraping activities."""
    # Number of browser instances currently running.
    active_browsers: int
    # Configured upper bound on concurrent browsers.
    max_browsers: int
    browser_initialized: bool
    # Per-novel live status dict maintained by the scraper engine.
    novels: dict
    # IDs of novels with an active scraping task.
    active_task_ids: list
# ============================================
# Routes
# ============================================
@router.post("/novels", response_model=NovelResponse, status_code=status.HTTP_201_CREATED)
async def add_novel(
    request: NovelAddRequest,
    db: AsyncSession = Depends(get_db_session),
):
    """
    Add a single novel to the database.
    It will be in 'queued' status until scraping starts.

    Raises:
        HTTPException: 500 when the database insert fails.
    """
    try:
        novel = await create_novel(
            db,
            title=request.title,
            url=request.url,
            login_email=request.login_email,
            login_password=request.login_password,
            next_button_selector=request.next_button_selector,
            content_selector=request.content_selector,
        )
        await db.commit()
    except Exception as e:
        # Roll back so the session is not left in a failed-transaction
        # state for whoever uses it next.
        await db.rollback()
        logger.error(f"Error adding novel: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to add novel: {str(e)}",
        ) from e
    logger.info(f"Novel added: {novel.title} (ID: {novel.id})")
    return NovelResponse(
        id=novel.id,
        title=novel.title,
        url=novel.url,
        status=novel.status.value,
        chapters_scraped=novel.chapters_scraped,
        last_error=novel.last_error,
        needs_intervention=novel.needs_intervention,
        screenshot_path=novel.screenshot_path,
        current_url=novel.current_url,
    )
@router.post("/novels/batch", status_code=status.HTTP_201_CREATED)
async def add_novels_batch(
    request: BatchAddRequest,
    db: AsyncSession = Depends(get_db_session),
):
    """
    Add multiple novels at once.
    Optionally share login credentials across all novels; per-novel
    credentials take precedence over the shared ones.

    Each insert runs inside a SAVEPOINT so a single bad entry does not
    poison the transaction for the rest of the batch (without it, the
    session would be in a failed state after the first error and every
    later insert plus the final commit would also fail).
    """
    added_novels = []
    errors = []
    for novel_req in request.novels:
        email = novel_req.login_email or request.shared_email
        password = novel_req.login_password or request.shared_password
        try:
            # SAVEPOINT: on error only this novel is rolled back;
            # previously added novels stay pending in the transaction.
            async with db.begin_nested():
                novel = await create_novel(
                    db,
                    title=novel_req.title,
                    url=novel_req.url,
                    login_email=email,
                    login_password=password,
                    next_button_selector=novel_req.next_button_selector,
                    content_selector=novel_req.content_selector,
                )
            added_novels.append({
                "id": novel.id,
                "title": novel.title,
                "status": "queued",
            })
        except Exception as e:
            errors.append({
                "title": novel_req.title,
                "error": str(e),
            })
    await db.commit()
    return {
        "added": added_novels,
        "errors": errors,
        "total_added": len(added_novels),
        "total_errors": len(errors),
    }
@router.get("/novels", response_model=List[NovelResponse])
async def list_novels(db: AsyncSession = Depends(get_db_session)):
    """Return every novel together with its current scraping status."""
    records = await get_all_novels(db)
    responses: List[NovelResponse] = []
    for record in records:
        responses.append(
            NovelResponse(
                id=record.id,
                title=record.title,
                url=record.url,
                status=record.status.value,
                chapters_scraped=record.chapters_scraped,
                last_error=record.last_error,
                needs_intervention=record.needs_intervention,
                screenshot_path=record.screenshot_path,
                current_url=record.current_url,
            )
        )
    return responses
@router.get("/novels/{novel_id}", response_model=NovelResponse)
async def get_novel(
    novel_id: int,
    db: AsyncSession = Depends(get_db_session),
):
    """Fetch one novel by its ID, or raise 404 if it does not exist."""
    record = await get_novel_by_id(db, novel_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Novel not found")
    return NovelResponse(
        id=record.id,
        title=record.title,
        url=record.url,
        status=record.status.value,
        chapters_scraped=record.chapters_scraped,
        last_error=record.last_error,
        needs_intervention=record.needs_intervention,
        screenshot_path=record.screenshot_path,
        current_url=record.current_url,
    )
@router.delete("/novels/{novel_id}")
async def remove_novel(
    novel_id: int,
    db: AsyncSession = Depends(get_db_session),
):
    """Delete a novel and all its chapters, stopping any active scrape first."""
    # Ensure no browser task is still working on this novel before deleting.
    await stop_scraping_novel(novel_id)
    was_deleted = await delete_novel(db, novel_id)
    if not was_deleted:
        raise HTTPException(status_code=404, detail="Novel not found")
    await db.commit()
    return {"message": f"Novel {novel_id} deleted successfully"}
@router.post("/scrape/start")
async def start_scraping(
    request: Optional[StartScrapeRequest] = None,
    db: AsyncSession = Depends(get_db_session),
):
    """
    Start scraping novels.
    If novel_ids provided, scrape only those.
    If not, scrape all novels in 'queued' status.

    Raises:
        HTTPException: 503 if the browser pool is not ready,
            404 if no matching novels are found.
    """
    # Default created per-call: a `= StartScrapeRequest()` default would be
    # a single shared mutable instance (evaluated once at import time).
    if request is None:
        request = StartScrapeRequest()
    if not browser_manager.is_initialized:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Browser not initialized yet. Please wait and try again.",
        )
    novels = await get_all_novels(db)
    if request.novel_ids:
        novels_to_scrape = [n for n in novels if n.id in request.novel_ids]
    else:
        novels_to_scrape = [
            n for n in novels
            if n.status in [NovelStatus.QUEUED, NovelStatus.PAUSED_ERROR]
        ]
    if not novels_to_scrape:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No novels found to scrape.",
        )
    started = []
    for novel in novels_to_scrape:
        novel_data = {
            "url": novel.url,
            "title": novel.title,
            "login_email": novel.login_email,
            "login_password": novel.login_password,
            "next_button_selector": novel.next_button_selector,
            "content_selector": novel.content_selector,
        }
        success = await start_scraping_novel(novel.id, novel_data)
        if success:
            started.append({"id": novel.id, "title": novel.title})
    return {
        "message": f"Started scraping {len(started)} novel(s)",
        "started": started,
        # Report the configured limit (as /status does). The previous
        # `semaphore._value` read a private attribute and returned the
        # *remaining* permits, which shrinks as browsers start.
        "max_concurrent": settings.MAX_CONCURRENT_BROWSERS,
    }
@router.post("/scrape/stop/{novel_id}")
async def stop_one_scraping(novel_id: int):
    """Stop the scraping task for one novel; 404 if it is not running."""
    was_stopped = await stop_scraping_novel(novel_id)
    if not was_stopped:
        raise HTTPException(
            status_code=404,
            detail=f"Novel {novel_id} is not actively scraping",
        )
    return {"message": f"Novel {novel_id} scraping stopped"}
@router.post("/scrape/stop-all")
async def stop_all():
    """Cancel every active scraping task at once."""
    await stop_all_scraping()
    return {"message": "All scraping tasks stopped"}
@router.get("/status", response_model=StatusResponse)
async def get_live_status():
    """
    Get real-time status of all scraping activities.
    This endpoint is polled by the frontend every few seconds.
    """
    # `settings` is imported at the top of the module (previously this
    # relied on a fragile bottom-of-file import executing before the
    # first request).
    return StatusResponse(
        active_browsers=browser_manager.active_count,
        max_browsers=settings.MAX_CONCURRENT_BROWSERS,
        browser_initialized=browser_manager.is_initialized,
        novels=scraper_status,
        active_task_ids=get_active_task_ids(),
    )