Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- app/database/__init__.py +1 -0
- app/database/connection.py +170 -0
- app/database/crud.py +275 -0
- app/database/models.py +156 -0
app/database/__init__.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Database Package
|
app/database/connection.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
============================================
|
| 3 |
+
Database Connection Manager
|
| 4 |
+
- Async SQLAlchemy Engine with Connection Pooling
|
| 5 |
+
- Handles Render's free tier quirks:
|
| 6 |
+
* Sleep mode reconnection
|
| 7 |
+
* Connection drop recovery
|
| 8 |
+
* Pool pre-ping for stale connections
|
| 9 |
+
============================================
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import logging
from collections.abc import AsyncGenerator

from sqlalchemy import text
from sqlalchemy.ext.asyncio import (
    create_async_engine,
    AsyncSession,
    async_sessionmaker,
    AsyncEngine,
)
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.pool import AsyncAdaptedQueuePool

from app.config import get_database_url, settings
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ============================================
|
| 29 |
+
# Base Model Class (all models inherit this)
|
| 30 |
+
# ============================================
|
| 31 |
+
class Base(DeclarativeBase):
    """Base class for all database models (SQLAlchemy 2.0 declarative)."""
    pass
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ============================================
|
| 37 |
+
# Engine Configuration
|
| 38 |
+
# ============================================
|
| 39 |
+
# Lazily-initialized singletons; created on first use, reset by close_database().
_engine: AsyncEngine | None = None
_session_factory: async_sessionmaker[AsyncSession] | None = None
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def get_engine() -> AsyncEngine:
    """
    Create (once) and return the async SQLAlchemy engine.

    Key settings for Render Free Tier:
    - pool_pre_ping=True: tests a connection before handing it out
      (prevents "connection closed" errors after the DB sleeps)
    - pool_size=5 / max_overflow=3: small pool, free tier has few slots
    - pool_recycle=300: recycle connections every 5 minutes
      (prevents stale connections from Render's idle timeout)
    - pool_timeout=30: wait at most 30s for a pooled connection
    """
    global _engine

    if _engine is None:
        database_url = get_database_url()

        logger.info("Creating database engine...")
        # Log only the host:port part of the URL. rsplit('@', 1) guards
        # against passwords that themselves contain '@' (split('@')[1]
        # would pick the wrong segment in that case).
        host = (
            database_url.rsplit("@", 1)[-1].split("/")[0]
            if "@" in database_url
            else "unknown"
        )
        logger.info("Database host: %s", host)

        _engine = create_async_engine(
            database_url,
            # --- Connection Pool Settings ---
            poolclass=AsyncAdaptedQueuePool,
            pool_pre_ping=True,   # CRITICAL: check connection health
            pool_size=5,          # base connections in pool
            max_overflow=3,       # extra connections allowed
            pool_recycle=300,     # recycle every 5 min (300 sec)
            pool_timeout=30,      # wait 30s for an available connection

            # --- Query Settings ---
            echo=settings.DEBUG,  # log SQL queries in debug mode

            # --- Connection Arguments (passed to the asyncpg driver) ---
            connect_args={
                "timeout": 30,          # connection timeout
                "command_timeout": 60,  # query timeout
                "server_settings": {
                    "application_name": "novel_scraper",
                    "idle_in_transaction_session_timeout": "60000",  # 60s
                },
            },
        )
        logger.info("Database engine created successfully.")

    return _engine
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def get_session_factory() -> async_sessionmaker[AsyncSession]:
    """
    Create (once) and return the async session factory.

    Note: the legacy ``autocommit=False`` argument was removed here —
    SQLAlchemy 2.0 dropped that Session parameter entirely, so passing
    it raises ``TypeError`` when the first session is constructed.
    """
    global _session_factory

    if _session_factory is None:
        engine = get_engine()
        _session_factory = async_sessionmaker(
            bind=engine,
            class_=AsyncSession,
            expire_on_commit=False,  # keep ORM objects usable after commit
            autoflush=False,
        )
        logger.info("Session factory created.")

    return _session_factory
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
async def get_db_session() -> AsyncGenerator[AsyncSession, None]:
    """
    FastAPI dependency yielding an async database session.

    Commits after the request handler succeeds, rolls back on any
    exception, and always returns the connection to the pool (the
    ``async with`` block closes the session — the original extra
    ``finally: session.close()`` was redundant).

    Usage in FastAPI:
        @router.get("/something")
        async def my_route(db: AsyncSession = Depends(get_db_session)):
            ...
    """
    factory = get_session_factory()
    async with factory() as session:
        try:
            yield session
            await session.commit()
        except Exception as e:
            await session.rollback()
            logger.error(f"Database session error: {e}")
            raise
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
async def init_database():
    """
    Initialize the database: verify connectivity, then create tables.
    Invoked once at application startup.
    """
    engine = get_engine()

    try:
        # Smoke-test the connection before touching the schema.
        async with engine.begin() as conn:
            await conn.execute(text("SELECT 1"))
            logger.info("✅ Database connection test successful!")

        # Create any tables that do not exist yet (no-op for existing ones).
        async with engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)
            logger.info("✅ Database tables created/verified!")

    except Exception as e:
        logger.error(f"❌ Database initialization failed: {e}")
        logger.error("Check your DATABASE_URL environment variable!")
        raise
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
async def close_database():
    """
    Dispose of the engine and drop the cached session factory.
    Invoked once at application shutdown.
    """
    global _engine, _session_factory

    # Nothing to do if the engine was never created.
    if _engine is None:
        return

    await _engine.dispose()
    _engine = None
    _session_factory = None
    logger.info("Database connections closed.")
|
app/database/crud.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
============================================
|
| 3 |
+
CRUD Operations (Create, Read, Update, Delete)
|
| 4 |
+
All database operations go through here.
|
| 5 |
+
Includes comprehensive error handling.
|
| 6 |
+
============================================
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from typing import Optional, List
|
| 11 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 12 |
+
from sqlalchemy import select, update, func, delete
|
| 13 |
+
from sqlalchemy.exc import (
|
| 14 |
+
IntegrityError,
|
| 15 |
+
OperationalError,
|
| 16 |
+
DisconnectionError,
|
| 17 |
+
TimeoutError as SATimeoutError,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
from app.database.models import Novel, Chapter, NovelStatus
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ============================================
|
| 26 |
+
# Retry Decorator for DB Operations
|
| 27 |
+
# ============================================
|
| 28 |
+
def handle_db_errors(func):
    """
    Decorator that logs common database errors and re-raises them.

    Catches connection drops, operational errors, timeouts, and
    integrity violations so each failure is logged with the name of
    the CRUD function that triggered it.

    Fix: the wrapper is now wrapped with ``functools.wraps`` so the
    decorated function keeps its ``__name__``/``__doc__``/signature
    metadata (previously every CRUD function reported as "wrapper",
    which also breaks introspection-based frameworks).
    """
    from functools import wraps

    @wraps(func)  # preserve func metadata for logging and introspection
    async def wrapper(*args, **kwargs):
        try:
            return await func(*args, **kwargs)
        except DisconnectionError as e:
            logger.error(f"Database disconnected during {func.__name__}: {e}")
            logger.info("This usually happens when Render DB wakes from sleep. Retrying...")
            raise
        except OperationalError as e:
            logger.error(f"Database operational error in {func.__name__}: {e}")
            raise
        except SATimeoutError as e:
            logger.error(f"Database timeout in {func.__name__}: {e}")
            raise
        except IntegrityError as e:
            logger.warning(f"Integrity error in {func.__name__}: {e}")
            raise
        except Exception as e:
            logger.error(f"Unexpected DB error in {func.__name__}: {e}")
            raise
    return wrapper
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ============================================
|
| 56 |
+
# Novel CRUD
|
| 57 |
+
# ============================================
|
| 58 |
+
@handle_db_errors
async def create_novel(
    db: AsyncSession,
    title: str,
    url: str,
    login_email: Optional[str] = None,
    login_password: Optional[str] = None,
    next_button_selector: Optional[str] = None,
    content_selector: Optional[str] = None,
) -> Novel:
    """Insert a new Novel row (status=QUEUED) and return it with its ID set."""
    record = Novel(
        title=title,
        url=url,
        current_url=url,
        login_email=login_email,
        login_password=login_password,
        status=NovelStatus.QUEUED,
    )

    # Only override the model's column defaults when a selector was supplied.
    if next_button_selector:
        record.next_button_selector = next_button_selector
    if content_selector:
        record.content_selector = content_selector

    db.add(record)
    await db.flush()    # assigns the primary key without committing
    await db.refresh(record)

    logger.info(f"Created novel: {record.title} (ID: {record.id})")
    return record
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@handle_db_errors
async def get_novel_by_id(db: AsyncSession, novel_id: int) -> Optional[Novel]:
    """Fetch one novel by primary key, or None if it does not exist."""
    stmt = select(Novel).where(Novel.id == novel_id)
    return (await db.execute(stmt)).scalar_one_or_none()
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@handle_db_errors
async def get_all_novels(db: AsyncSession) -> List[Novel]:
    """Return every novel, newest first."""
    stmt = select(Novel).order_by(Novel.created_at.desc())
    rows = await db.execute(stmt)
    return list(rows.scalars().all())
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
@handle_db_errors
async def get_novels_needing_intervention(db: AsyncSession) -> List[Novel]:
    """Get novels that are paused and need manual captcha solving."""
    result = await db.execute(
        select(Novel).where(
            # .is_(True) is the explicit SQLAlchemy boolean test; the
            # previous `== True` comparison works but trips lint (E712).
            Novel.needs_intervention.is_(True),
            Novel.status == NovelStatus.PAUSED_CAPTCHA,
        )
    )
    return list(result.scalars().all())
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
@handle_db_errors
async def update_novel_status(
    db: AsyncSession,
    novel_id: int,
    status: NovelStatus,
    error_message: Optional[str] = None,
    current_url: Optional[str] = None,
    screenshot_path: Optional[str] = None,
    needs_intervention: Optional[bool] = None,
) -> Optional[Novel]:
    """Update a novel's status plus any optional fields the caller supplied."""
    # Mandatory field first, then layer on the optional values that were
    # actually passed (None means "leave that column unchanged").
    values = {"status": status}
    optional = {
        "last_error": error_message,
        "current_url": current_url,
        "screenshot_path": screenshot_path,
        "needs_intervention": needs_intervention,
    }
    values.update({col: val for col, val in optional.items() if val is not None})

    await db.execute(
        update(Novel).where(Novel.id == novel_id).values(**values)
    )
    await db.flush()

    # Hand back the freshly updated row.
    return await get_novel_by_id(db, novel_id)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
@handle_db_errors
async def increment_chapter_count(db: AsyncSession, novel_id: int) -> None:
    """Bump a novel's chapters_scraped counter by one (computed SQL-side)."""
    stmt = (
        update(Novel)
        .where(Novel.id == novel_id)
        .values(chapters_scraped=Novel.chapters_scraped + 1)
    )
    await db.execute(stmt)
    await db.flush()
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
@handle_db_errors
async def delete_novel(db: AsyncSession, novel_id: int) -> bool:
    """Delete a novel (chapters cascade). Returns True if a row was removed."""
    outcome = await db.execute(
        delete(Novel).where(Novel.id == novel_id)
    )
    await db.flush()

    if outcome.rowcount > 0:
        logger.info(f"Deleted novel ID: {novel_id}")
        return True
    return False
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# ============================================
|
| 179 |
+
# Chapter CRUD
|
| 180 |
+
# ============================================
|
| 181 |
+
@handle_db_errors
async def save_chapter(
    db: AsyncSession,
    novel_id: int,
    chapter_number: int,
    content: str,
    title: Optional[str] = None,
    url: Optional[str] = None,
) -> Chapter:
    """
    Persist one chapter for a novel.

    A duplicate (novel_id, chapter_number) pair violates the table's
    unique constraint and raises IntegrityError for the caller.
    """
    # Whitespace-split word count; empty or None content counts as zero.
    words = len(content.split()) if content else 0

    row = Chapter(
        novel_id=novel_id,
        chapter_number=chapter_number,
        title=title or f"Chapter {chapter_number}",
        content=content,
        url=url,
        word_count=words,
    )

    db.add(row)
    await db.flush()
    await db.refresh(row)

    logger.info(
        f"Saved Chapter {chapter_number} for Novel {novel_id} "
        f"({words} words)"
    )
    return row
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
@handle_db_errors
async def get_chapters_for_novel(
    db: AsyncSession,
    novel_id: int,
) -> List[Chapter]:
    """Return a novel's chapters sorted by ascending chapter number."""
    stmt = (
        select(Chapter)
        .where(Chapter.novel_id == novel_id)
        .order_by(Chapter.chapter_number.asc())
    )
    return list((await db.execute(stmt)).scalars().all())
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
@handle_db_errors
async def get_chapter_count(db: AsyncSession, novel_id: int) -> int:
    """Count the chapters stored for a novel (0 when none)."""
    stmt = select(func.count(Chapter.id)).where(Chapter.novel_id == novel_id)
    total = (await db.execute(stmt)).scalar()
    return total or 0
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
@handle_db_errors
async def get_last_chapter_number(db: AsyncSession, novel_id: int) -> int:
    """Return the highest saved chapter number, or 0 if none exist."""
    stmt = select(func.max(Chapter.chapter_number)).where(
        Chapter.novel_id == novel_id
    )
    highest = (await db.execute(stmt)).scalar()
    return highest or 0
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
@handle_db_errors
async def chapter_exists(
    db: AsyncSession,
    novel_id: int,
    chapter_number: int,
) -> bool:
    """True if this (novel_id, chapter_number) pair is already stored."""
    stmt = select(func.count(Chapter.id)).where(
        Chapter.novel_id == novel_id,
        Chapter.chapter_number == chapter_number,
    )
    matches = (await db.execute(stmt)).scalar() or 0
    return matches > 0
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
@handle_db_errors
async def get_total_word_count(db: AsyncSession, novel_id: int) -> int:
    """Sum word_count over all of a novel's chapters (0 when empty)."""
    stmt = select(func.sum(Chapter.word_count)).where(
        Chapter.novel_id == novel_id
    )
    return (await db.execute(stmt)).scalar() or 0
|
app/database/models.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
============================================
|
| 3 |
+
Database Models (Tables)
|
| 4 |
+
- Novel: Stores novel metadata & scraping status
|
| 5 |
+
- Chapter: Stores individual chapter content
|
| 6 |
+
============================================
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import enum
|
| 10 |
+
from datetime import datetime, timezone
|
| 11 |
+
from sqlalchemy import (
|
| 12 |
+
Column,
|
| 13 |
+
Integer,
|
| 14 |
+
String,
|
| 15 |
+
Text,
|
| 16 |
+
DateTime,
|
| 17 |
+
ForeignKey,
|
| 18 |
+
Enum as SQLEnum,
|
| 19 |
+
Boolean,
|
| 20 |
+
Float,
|
| 21 |
+
Index,
|
| 22 |
+
UniqueConstraint,
|
| 23 |
+
)
|
| 24 |
+
from sqlalchemy.orm import relationship
|
| 25 |
+
from app.database.connection import Base
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ============================================
|
| 29 |
+
# Enums for Status Tracking
|
| 30 |
+
# ============================================
|
| 31 |
+
class NovelStatus(str, enum.Enum):
    """Lifecycle states of a novel's scraping process.

    Inherits ``str`` so values compare/serialize as plain strings.
    """
    QUEUED = "queued"                  # Waiting in queue
    LOGGING_IN = "logging_in"          # Attempting login
    SCRAPING = "scraping"              # Actively scraping chapters
    PAUSED_CAPTCHA = "paused_captcha"  # Waiting for manual intervention
    PAUSED_ERROR = "paused_error"      # Paused due to error
    COMPLETED = "completed"            # All chapters scraped
    FAILED = "failed"                  # Permanently failed
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ============================================
|
| 43 |
+
# Novel Model
|
| 44 |
+
# ============================================
|
| 45 |
+
class Novel(Base):
    """
    Stores metadata about each novel being scraped.

    One row per scrape job: where to start, optional site credentials,
    CSS selectors for navigation/content, live status, and timestamps.
    """
    __tablename__ = "novels"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Basic Info ---
    title = Column(String(500), nullable=False, default="Unknown Novel")
    url = Column(String(2000), nullable=False)  # Starting URL
    current_url = Column(String(2000), nullable=True)  # Current page URL

    # --- Credentials (for login) ---
    login_email = Column(String(500), nullable=True)
    login_password = Column(String(500), nullable=True)  # In production, encrypt this!

    # --- Scraping Config ---
    # Comma-separated CSS selector candidates; how they are tried is up to
    # the scraper — presumably first match wins (TODO confirm against it).
    next_button_selector = Column(
        String(500),
        nullable=False,
        default="a.next_page, a[rel='next'], .next-chap, button.next-chapter"
    )
    content_selector = Column(
        String(500),
        nullable=False,
        default=".chapter-content, .reading-content, #chapter-content, .text-left"
    )

    # --- Status Tracking ---
    status = Column(
        SQLEnum(NovelStatus),
        nullable=False,
        default=NovelStatus.QUEUED
    )
    chapters_scraped = Column(Integer, default=0)
    last_error = Column(Text, nullable=True)

    # --- Captcha/Intervention ---
    screenshot_path = Column(String(1000), nullable=True)
    needs_intervention = Column(Boolean, default=False)

    # --- Timestamps ---
    # Timezone-aware UTC; the lambdas are evaluated per insert/update,
    # not once at import time.
    created_at = Column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc)
    )
    updated_at = Column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        onupdate=lambda: datetime.now(timezone.utc)
    )

    # --- Relationships ---
    # selectin eager loading avoids implicit lazy loads (which fail under
    # asyncio); delete-orphan cascade removes chapters with their novel.
    chapters = relationship(
        "Chapter",
        back_populates="novel",
        cascade="all, delete-orphan",
        order_by="Chapter.chapter_number",
        lazy="selectin",
    )

    def __repr__(self):
        return f"<Novel(id={self.id}, title='{self.title}', status={self.status})>"
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ============================================
|
| 112 |
+
# Chapter Model
|
| 113 |
+
# ============================================
|
| 114 |
+
class Chapter(Base):
    """
    Stores individual chapter content.
    Each chapter belongs to a Novel (FK with DB-level CASCADE delete).
    """
    __tablename__ = "chapters"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Foreign Key ---
    novel_id = Column(
        Integer,
        ForeignKey("novels.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # --- Chapter Data ---
    chapter_number = Column(Integer, nullable=False)
    title = Column(String(1000), nullable=True, default="")
    content = Column(Text, nullable=False)  # The actual chapter text
    url = Column(String(2000), nullable=True)  # URL where this was scraped from
    word_count = Column(Integer, default=0)  # whitespace-split count, set by crud.save_chapter

    # --- Timestamps ---
    # Timezone-aware UTC; lambda evaluated per insert.
    scraped_at = Column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc)
    )

    # --- Relationships ---
    novel = relationship("Novel", back_populates="chapters")

    # --- Constraints ---
    __table_args__ = (
        # Ensure no duplicate chapters for same novel
        UniqueConstraint("novel_id", "chapter_number", name="uq_novel_chapter"),
        # Index for faster queries
        Index("ix_chapter_novel_number", "novel_id", "chapter_number"),
    )

    def __repr__(self):
        return f"<Chapter(id={self.id}, novel_id={self.novel_id}, ch={self.chapter_number})>"
|