# app/api/routes_scraper.py
"""
============================================
Scraper API Routes
- Add novels
- Start/Stop scraping
- Get live status
- Delete novels
============================================
"""
import logging
from typing import List, Optional

from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, ConfigDict, Field, HttpUrl
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import settings
from app.database.connection import get_db_session
from app.database.crud import (
    create_novel,
    get_all_novels,
    get_novel_by_id,
    delete_novel,
    update_novel_status,
)
from app.database.models import NovelStatus
from app.scraper.browser_manager import browser_manager
from app.scraper.scraper_engine import (
    start_scraping_novel,
    stop_scraping_novel,
    stop_all_scraping,
    scraper_status,
    get_active_task_ids,
)
# Module-level logger named after this module so output is filterable.
logger = logging.getLogger(__name__)
# Every route below is mounted under /api and grouped under the "Scraper"
# tag in the generated OpenAPI docs.
router = APIRouter(prefix="/api", tags=["Scraper"])
# ============================================
# Request/Response Models (Pydantic)
# ============================================
class NovelAddRequest(BaseModel):
    """Request body for adding a new novel.

    Only ``title`` and ``url`` are required; credentials and CSS selectors
    are optional per-novel overrides passed through to the scraper engine.
    """
    # Human-readable name for the novel.
    title: str = Field(..., min_length=1, max_length=500, examples=["My Novel"])
    # Starting chapter URL; only its length is validated here.
    url: str = Field(..., min_length=10, examples=["https://example.com/novel/chapter-1"])
    # Optional site credentials for sites that require login.
    login_email: Optional[str] = Field(None, examples=["user@email.com"])
    login_password: Optional[str] = Field(None, examples=["password123"])
    # CSS selector(s) for the "next chapter" link; presumably the scraper
    # falls back to built-in defaults when None — verify in scraper_engine.
    next_button_selector: Optional[str] = Field(
        None,
        examples=["a.next_page, a[rel='next'], .next-chap"]
    )
    # CSS selector(s) for the element containing the chapter text.
    content_selector: Optional[str] = Field(
        None,
        examples=[".chapter-content, .reading-content, #chapter-content"]
    )
class NovelResponse(BaseModel):
    """Response model for a novel.

    ``from_attributes=True`` allows the model to be populated directly
    from the SQLAlchemy ORM object's attributes.
    """
    # Pydantic v2 configuration; replaces the deprecated class-based
    # `class Config` (this file already uses v2-only features such as
    # `from_attributes` and `examples`).
    model_config = ConfigDict(from_attributes=True)

    id: int
    title: str
    url: str
    # Serialized NovelStatus enum value, e.g. "queued".
    status: str
    chapters_scraped: int
    last_error: Optional[str] = None
    needs_intervention: bool = False
    screenshot_path: Optional[str] = None
    current_url: Optional[str] = None
class BatchAddRequest(BaseModel):
    """Request body for adding multiple novels at once.

    ``shared_email``/``shared_password`` are applied to any novel in the
    batch that does not provide its own credentials.
    """
    # Between 1 and 15 novels per batch.
    novels: List[NovelAddRequest] = Field(..., min_length=1, max_length=15)
    # Fallback credentials used when a novel omits its own.
    shared_email: Optional[str] = None
    shared_password: Optional[str] = None
class StartScrapeRequest(BaseModel):
    """Request body for starting scraping."""
    # When None, all novels in 'queued' (or error-paused) status are scraped.
    novel_ids: Optional[List[int]] = Field(
        None,
        description="Specific novel IDs to scrape. If None, scrape all queued."
    )
class StatusResponse(BaseModel):
    """Live status of all scraping activities."""
    # Number of browser instances currently running.
    active_browsers: int
    # Configured upper bound on concurrent browsers.
    max_browsers: int
    browser_initialized: bool
    # Per-novel live status dict maintained by the scraper engine.
    novels: dict
    # IDs of novels with an active scraping task.
    active_task_ids: list
# ============================================
# Routes
# ============================================
@router.post("/novels", response_model=NovelResponse, status_code=status.HTTP_201_CREATED)
async def add_novel(
    request: NovelAddRequest,
    db: AsyncSession = Depends(get_db_session),
):
    """
    Add a single novel to the database.
    It will be in 'queued' status until scraping starts.

    Raises:
        HTTPException: 500 when the database insert fails.
    """
    try:
        novel = await create_novel(
            db,
            title=request.title,
            url=request.url,
            login_email=request.login_email,
            login_password=request.login_password,
            next_button_selector=request.next_button_selector,
            content_selector=request.content_selector,
        )
        await db.commit()
    except Exception as e:
        # Roll back so the session is not left in a failed-transaction
        # state for whoever uses it next.
        await db.rollback()
        logger.error(f"Error adding novel: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to add novel: {str(e)}",
        ) from e
    logger.info(f"Novel added: {novel.title} (ID: {novel.id})")
    return NovelResponse(
        id=novel.id,
        title=novel.title,
        url=novel.url,
        status=novel.status.value,
        chapters_scraped=novel.chapters_scraped,
        last_error=novel.last_error,
        needs_intervention=novel.needs_intervention,
        screenshot_path=novel.screenshot_path,
        current_url=novel.current_url,
    )
@router.post("/novels/batch", status_code=status.HTTP_201_CREATED)
async def add_novels_batch(
    request: BatchAddRequest,
    db: AsyncSession = Depends(get_db_session),
):
    """
    Add multiple novels at once.
    Optionally share login credentials across all novels; per-novel
    credentials take precedence over the shared ones.

    Each insert runs inside a SAVEPOINT so a single bad entry does not
    poison the transaction for the rest of the batch (without it, the
    session would be in a failed state after the first error and every
    later insert plus the final commit would also fail).
    """
    added_novels = []
    errors = []
    for novel_req in request.novels:
        email = novel_req.login_email or request.shared_email
        password = novel_req.login_password or request.shared_password
        try:
            # SAVEPOINT: on error only this novel is rolled back;
            # previously added novels stay pending in the transaction.
            async with db.begin_nested():
                novel = await create_novel(
                    db,
                    title=novel_req.title,
                    url=novel_req.url,
                    login_email=email,
                    login_password=password,
                    next_button_selector=novel_req.next_button_selector,
                    content_selector=novel_req.content_selector,
                )
            added_novels.append({
                "id": novel.id,
                "title": novel.title,
                "status": "queued",
            })
        except Exception as e:
            errors.append({
                "title": novel_req.title,
                "error": str(e),
            })
    await db.commit()
    return {
        "added": added_novels,
        "errors": errors,
        "total_added": len(added_novels),
        "total_errors": len(errors),
    }
@router.get("/novels", response_model=List[NovelResponse])
async def list_novels(db: AsyncSession = Depends(get_db_session)):
    """Return every novel together with its current scraping status."""
    records = await get_all_novels(db)
    responses: List[NovelResponse] = []
    for record in records:
        responses.append(
            NovelResponse(
                id=record.id,
                title=record.title,
                url=record.url,
                status=record.status.value,
                chapters_scraped=record.chapters_scraped,
                last_error=record.last_error,
                needs_intervention=record.needs_intervention,
                screenshot_path=record.screenshot_path,
                current_url=record.current_url,
            )
        )
    return responses
@router.get("/novels/{novel_id}", response_model=NovelResponse)
async def get_novel(
    novel_id: int,
    db: AsyncSession = Depends(get_db_session),
):
    """Fetch one novel by its ID, or raise 404 if it does not exist."""
    record = await get_novel_by_id(db, novel_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Novel not found")
    return NovelResponse(
        id=record.id,
        title=record.title,
        url=record.url,
        status=record.status.value,
        chapters_scraped=record.chapters_scraped,
        last_error=record.last_error,
        needs_intervention=record.needs_intervention,
        screenshot_path=record.screenshot_path,
        current_url=record.current_url,
    )
@router.delete("/novels/{novel_id}")
async def remove_novel(
    novel_id: int,
    db: AsyncSession = Depends(get_db_session),
):
    """Delete a novel and all its chapters, stopping any active scrape first."""
    # Ensure no browser task is still working on this novel before deleting.
    await stop_scraping_novel(novel_id)
    was_deleted = await delete_novel(db, novel_id)
    if not was_deleted:
        raise HTTPException(status_code=404, detail="Novel not found")
    await db.commit()
    return {"message": f"Novel {novel_id} deleted successfully"}
@router.post("/scrape/start")
async def start_scraping(
    request: Optional[StartScrapeRequest] = None,
    db: AsyncSession = Depends(get_db_session),
):
    """
    Start scraping novels.
    If novel_ids provided, scrape only those.
    If not, scrape all novels in 'queued' status.

    Raises:
        HTTPException: 503 if the browser pool is not ready,
            404 if no matching novels are found.
    """
    # Default created per-call: a `= StartScrapeRequest()` default would be
    # a single shared mutable instance (evaluated once at import time).
    if request is None:
        request = StartScrapeRequest()
    if not browser_manager.is_initialized:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Browser not initialized yet. Please wait and try again.",
        )
    novels = await get_all_novels(db)
    if request.novel_ids:
        novels_to_scrape = [n for n in novels if n.id in request.novel_ids]
    else:
        novels_to_scrape = [
            n for n in novels
            if n.status in [NovelStatus.QUEUED, NovelStatus.PAUSED_ERROR]
        ]
    if not novels_to_scrape:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No novels found to scrape.",
        )
    started = []
    for novel in novels_to_scrape:
        novel_data = {
            "url": novel.url,
            "title": novel.title,
            "login_email": novel.login_email,
            "login_password": novel.login_password,
            "next_button_selector": novel.next_button_selector,
            "content_selector": novel.content_selector,
        }
        success = await start_scraping_novel(novel.id, novel_data)
        if success:
            started.append({"id": novel.id, "title": novel.title})
    return {
        "message": f"Started scraping {len(started)} novel(s)",
        "started": started,
        # Report the configured limit (as /status does). The previous
        # `semaphore._value` read a private attribute and returned the
        # *remaining* permits, which shrinks as browsers start.
        "max_concurrent": settings.MAX_CONCURRENT_BROWSERS,
    }
@router.post("/scrape/stop/{novel_id}")
async def stop_one_scraping(novel_id: int):
    """Stop the scraping task for one novel; 404 if it is not running."""
    was_stopped = await stop_scraping_novel(novel_id)
    if not was_stopped:
        raise HTTPException(
            status_code=404,
            detail=f"Novel {novel_id} is not actively scraping",
        )
    return {"message": f"Novel {novel_id} scraping stopped"}
@router.post("/scrape/stop-all")
async def stop_all():
    """Cancel every active scraping task at once."""
    await stop_all_scraping()
    return {"message": "All scraping tasks stopped"}
@router.get("/status", response_model=StatusResponse)
async def get_live_status():
    """
    Get real-time status of all scraping activities.
    This endpoint is polled by the frontend every few seconds.
    """
    # `settings` is imported at the top of the module (previously this
    # relied on a fragile bottom-of-file import executing before the
    # first request).
    return StatusResponse(
        active_browsers=browser_manager.active_count,
        max_browsers=settings.MAX_CONCURRENT_BROWSERS,
        browser_initialized=browser_manager.is_initialized,
        novels=scraper_status,
        active_task_ids=get_active_task_ids(),
    )