Ruhivig65 committed on
Commit
b8907ef
·
verified ·
1 Parent(s): 03ac5aa

Upload 4 files

Browse files
app/database/__init__.py CHANGED
@@ -0,0 +1 @@
 
 
1
+ # Database Package
app/database/connection.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ============================================
3
+ Database Connection Manager
4
+ - Async SQLAlchemy Engine with Connection Pooling
5
+ - Handles Render's free tier quirks:
6
+ * Sleep mode reconnection
7
+ * Connection drop recovery
8
+ * Pool pre-ping for stale connections
9
+ ============================================
10
+ """
11
+
12
+ import logging
13
+ from sqlalchemy.ext.asyncio import (
14
+ create_async_engine,
15
+ AsyncSession,
16
+ async_sessionmaker,
17
+ AsyncEngine,
18
+ )
19
+ from sqlalchemy.orm import DeclarativeBase
20
+ from sqlalchemy.pool import AsyncAdaptedQueuePool
21
+ from sqlalchemy import text
22
+
23
+ from app.config import get_database_url, settings
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
# ============================================
# Base Model Class (all models inherit this)
# ============================================
class Base(DeclarativeBase):
    """Declarative base shared by every ORM model in this package."""
34
+
35
+
36
# ============================================
# Engine Configuration
# ============================================
# Module-level singletons: created lazily by get_engine() /
# get_session_factory() and reset to None by close_database().
_engine: AsyncEngine | None = None
_session_factory: async_sessionmaker[AsyncSession] | None = None
41
+
42
+
43
def get_engine() -> AsyncEngine:
    """
    Create (lazily, on first call) or return the cached async SQLAlchemy engine.

    Key settings for Render Free Tier:
    - pool_pre_ping=True: tests each connection before handing it out
      (prevents "connection closed" errors after the DB wakes from sleep)
    - pool_size=5: small base pool (free tier has limited connections)
    - max_overflow=3: allow 3 extra connections under load
    - pool_recycle=300: recycle connections every 5 minutes
      (prevents stale connections from Render's idle timeout)
    - pool_timeout=30: wait max 30s for a connection from the pool

    Returns:
        The process-wide AsyncEngine singleton.
    """
    global _engine

    if _engine is None:
        database_url = get_database_url()

        logger.info("Creating database engine...")
        # Log only the host portion, never credentials. rsplit("@", 1) is used
        # (instead of split("@")[1]) so a password containing '@' does not
        # break the host extraction.
        host = (
            database_url.rsplit("@", 1)[-1].split("/")[0]
            if "@" in database_url
            else "unknown"
        )
        logger.info(f"Database host: {host}")

        _engine = create_async_engine(
            database_url,
            # --- Connection Pool Settings ---
            poolclass=AsyncAdaptedQueuePool,
            pool_pre_ping=True,   # CRITICAL: check connection health before use
            pool_size=5,          # Base connections in pool
            max_overflow=3,       # Extra connections allowed under load
            pool_recycle=300,     # Recycle every 5 min (300 sec)
            pool_timeout=30,      # Wait max 30s for an available connection

            # --- Query Settings ---
            echo=settings.DEBUG,  # Log SQL queries in debug mode

            # --- Connection Arguments (passed through to the async driver) ---
            connect_args={
                "timeout": 30,           # Connection timeout (seconds)
                "command_timeout": 60,   # Per-query timeout (seconds)
                "server_settings": {
                    "application_name": "novel_scraper",
                    "idle_in_transaction_session_timeout": "60000",  # 60s, in ms
                },
            },
        )
        logger.info("Database engine created successfully.")

    return _engine
90
+
91
+
92
def get_session_factory() -> async_sessionmaker[AsyncSession]:
    """
    Create (lazily) or return the cached async session factory.

    Returns:
        An async_sessionmaker bound to the shared engine.
    """
    global _session_factory

    if _session_factory is None:
        engine = get_engine()
        _session_factory = async_sessionmaker(
            bind=engine,
            class_=AsyncSession,
            expire_on_commit=False,  # Keep ORM objects usable after commit
            autoflush=False,         # Flush explicitly (CRUD helpers call flush())
            # NOTE: `autocommit=False` was removed from this call — the
            # parameter no longer exists in SQLAlchemy 2.0 (non-autocommit is
            # the only behavior), and passing it raises a TypeError when a
            # session is constructed.
        )
        logger.info("Session factory created.")

    return _session_factory
110
+
111
+
112
async def get_db_session() -> AsyncSession:
    """
    FastAPI dependency that yields an async database session.

    Commits automatically when the request handler succeeds; rolls back and
    re-raises on any exception.

    NOTE(review): this is an async generator, so the precise annotation would
    be AsyncGenerator[AsyncSession, None]; FastAPI accepts either form.

    Usage in FastAPI:
        @router.get("/something")
        async def my_route(db: AsyncSession = Depends(get_db_session)):
            ...
    """
    factory = get_session_factory()
    async with factory() as session:
        try:
            yield session
            await session.commit()
        except Exception as e:
            await session.rollback()
            logger.error(f"Database session error: {e}")
            raise
        # No explicit close() needed: `async with factory()` closes the
        # session on exit, so the original `finally: await session.close()`
        # was redundant.
133
+
134
+
135
async def init_database():
    """
    Initialize the database: verify connectivity, then create all tables.
    Called on application startup.
    """
    engine = get_engine()

    try:
        # Probe the connection before touching the schema.
        async with engine.begin() as conn:
            await conn.execute(text("SELECT 1"))
            logger.info("✅ Database connection test successful!")

        # Create any tables that do not exist yet (no-op for existing ones).
        async with engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)
            logger.info("✅ Database tables created/verified!")

    except Exception as e:
        logger.error(f"❌ Database initialization failed: {e}")
        logger.error("Check your DATABASE_URL environment variable!")
        raise
157
+
158
+
159
async def close_database():
    """
    Dispose of the engine's connection pool and forget the cached singletons.
    Called on application shutdown.
    """
    global _engine, _session_factory

    if _engine is None:
        return  # Nothing to clean up

    await _engine.dispose()
    _engine = None
    _session_factory = None
    logger.info("Database connections closed.")
app/database/crud.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ============================================
3
+ CRUD Operations (Create, Read, Update, Delete)
4
+ All database operations go through here.
5
+ Includes comprehensive error handling.
6
+ ============================================
7
+ """
8
+
9
+ import logging
10
+ from typing import Optional, List
11
+ from sqlalchemy.ext.asyncio import AsyncSession
12
+ from sqlalchemy import select, update, func, delete
13
+ from sqlalchemy.exc import (
14
+ IntegrityError,
15
+ OperationalError,
16
+ DisconnectionError,
17
+ TimeoutError as SATimeoutError,
18
+ )
19
+
20
+ from app.database.models import Novel, Chapter, NovelStatus
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
# ============================================
# Error-Logging Decorator for DB Operations
# ============================================
def handle_db_errors(func):
    """
    Decorator that logs common database errors with context, then re-raises.

    Catches connection drops, operational errors, timeouts, and integrity
    violations so every CRUD helper reports failures consistently. The
    exception always propagates to the caller (no retry is performed here).
    """
    from functools import wraps  # local import keeps module imports untouched

    @wraps(func)  # preserve __name__/__doc__ of the wrapped coroutine
    async def wrapper(*args, **kwargs):
        try:
            return await func(*args, **kwargs)
        except DisconnectionError as e:
            logger.error(f"Database disconnected during {func.__name__}: {e}")
            logger.info("This usually happens when Render DB wakes from sleep. Retrying...")
            raise
        except OperationalError as e:
            logger.error(f"Database operational error in {func.__name__}: {e}")
            raise
        except SATimeoutError as e:
            logger.error(f"Database timeout in {func.__name__}: {e}")
            raise
        except IntegrityError as e:
            # Usually a duplicate-key violation; caller decides how to react.
            logger.warning(f"Integrity error in {func.__name__}: {e}")
            raise
        except Exception as e:
            logger.error(f"Unexpected DB error in {func.__name__}: {e}")
            raise
    return wrapper
53
+
54
+
55
# ============================================
# Novel CRUD
# ============================================
@handle_db_errors
async def create_novel(
    db: AsyncSession,
    title: str,
    url: str,
    login_email: Optional[str] = None,
    login_password: Optional[str] = None,
    next_button_selector: Optional[str] = None,
    content_selector: Optional[str] = None,
) -> Novel:
    """Insert a new novel row (status QUEUED) and return it with its ID."""
    novel = Novel(
        title=title,
        url=url,
        current_url=url,
        login_email=login_email,
        login_password=login_password,
        status=NovelStatus.QUEUED,
    )

    # Only override the model's column defaults when custom selectors
    # were actually supplied.
    if next_button_selector:
        novel.next_button_selector = next_button_selector
    if content_selector:
        novel.content_selector = content_selector

    db.add(novel)
    await db.flush()      # Assigns the primary key without committing
    await db.refresh(novel)

    logger.info(f"Created novel: {novel.title} (ID: {novel.id})")
    return novel
89
+
90
+
91
@handle_db_errors
async def get_novel_by_id(db: AsyncSession, novel_id: int) -> Optional[Novel]:
    """Fetch one novel by primary key, or None if it does not exist."""
    stmt = select(Novel).where(Novel.id == novel_id)
    result = await db.execute(stmt)
    return result.scalar_one_or_none()
98
+
99
+
100
@handle_db_errors
async def get_all_novels(db: AsyncSession) -> List[Novel]:
    """Return every novel, newest first."""
    stmt = select(Novel).order_by(Novel.created_at.desc())
    rows = await db.execute(stmt)
    return list(rows.scalars().all())
107
+
108
+
109
@handle_db_errors
async def get_novels_needing_intervention(db: AsyncSession) -> List[Novel]:
    """Return novels paused on a captcha that await manual solving."""
    stmt = select(Novel).where(
        Novel.needs_intervention == True,  # noqa: E712 — SQLAlchemy comparison
        Novel.status == NovelStatus.PAUSED_CAPTCHA,
    )
    rows = await db.execute(stmt)
    return list(rows.scalars().all())
119
+
120
+
121
@handle_db_errors
async def update_novel_status(
    db: AsyncSession,
    novel_id: int,
    status: NovelStatus,
    error_message: Optional[str] = None,
    current_url: Optional[str] = None,
    screenshot_path: Optional[str] = None,
    needs_intervention: Optional[bool] = None,
) -> Optional[Novel]:
    """
    Update a novel's status plus any optional fields that were provided,
    then return the refreshed row (None if the ID is unknown).
    """
    values = {"status": status}

    # Map optional keyword arguments onto their column names;
    # None means "leave the column unchanged".
    optional_fields = {
        "last_error": error_message,
        "current_url": current_url,
        "screenshot_path": screenshot_path,
        "needs_intervention": needs_intervention,
    }
    values.update({col: val for col, val in optional_fields.items() if val is not None})

    await db.execute(
        update(Novel).where(Novel.id == novel_id).values(**values)
    )
    await db.flush()

    # Hand back the freshly updated row.
    return await get_novel_by_id(db, novel_id)
152
+
153
+
154
@handle_db_errors
async def increment_chapter_count(db: AsyncSession, novel_id: int) -> None:
    """Bump a novel's chapters_scraped counter by one (server-side increment)."""
    stmt = (
        update(Novel)
        .where(Novel.id == novel_id)
        .values(chapters_scraped=Novel.chapters_scraped + 1)
    )
    await db.execute(stmt)
    await db.flush()
163
+
164
+
165
@handle_db_errors
async def delete_novel(db: AsyncSession, novel_id: int) -> bool:
    """Delete a novel (its chapters go too via cascade). True if a row was removed."""
    result = await db.execute(delete(Novel).where(Novel.id == novel_id))
    await db.flush()

    if result.rowcount > 0:
        logger.info(f"Deleted novel ID: {novel_id}")
        return True
    return False
176
+
177
+
178
# ============================================
# Chapter CRUD
# ============================================
@handle_db_errors
async def save_chapter(
    db: AsyncSession,
    novel_id: int,
    chapter_number: int,
    content: str,
    title: Optional[str] = None,
    url: Optional[str] = None,
) -> Chapter:
    """
    Persist one chapter for a novel.

    A duplicate (novel_id, chapter_number) pair violates the unique
    constraint and raises IntegrityError for the caller to handle.
    """
    # Whitespace-split word count; empty/None content counts as 0.
    word_count = len(content.split()) if content else 0

    chapter = Chapter(
        novel_id=novel_id,
        chapter_number=chapter_number,
        title=title or f"Chapter {chapter_number}",
        content=content,
        url=url,
        word_count=word_count,
    )

    db.add(chapter)
    await db.flush()       # Assigns the ID; committing is the session owner's job
    await db.refresh(chapter)

    logger.info(
        f"Saved Chapter {chapter_number} for Novel {novel_id} "
        f"({word_count} words)"
    )
    return chapter
215
+
216
+
217
@handle_db_errors
async def get_chapters_for_novel(
    db: AsyncSession,
    novel_id: int,
) -> List[Chapter]:
    """Return every chapter of a novel in ascending chapter order."""
    stmt = (
        select(Chapter)
        .where(Chapter.novel_id == novel_id)
        .order_by(Chapter.chapter_number.asc())
    )
    rows = await db.execute(stmt)
    return list(rows.scalars().all())
229
+
230
+
231
@handle_db_errors
async def get_chapter_count(db: AsyncSession, novel_id: int) -> int:
    """Count how many chapters are stored for a novel (0 if none)."""
    stmt = select(func.count(Chapter.id)).where(Chapter.novel_id == novel_id)
    result = await db.execute(stmt)
    return result.scalar() or 0
239
+
240
+
241
@handle_db_errors
async def get_last_chapter_number(db: AsyncSession, novel_id: int) -> int:
    """Return the highest saved chapter number for a novel (0 if none)."""
    stmt = select(func.max(Chapter.chapter_number)).where(Chapter.novel_id == novel_id)
    result = await db.execute(stmt)
    # MAX over zero rows yields NULL -> scalar() is None -> report 0.
    return result.scalar() or 0
249
+
250
+
251
@handle_db_errors
async def chapter_exists(
    db: AsyncSession,
    novel_id: int,
    chapter_number: int,
) -> bool:
    """True when the given (novel, chapter_number) pair is already stored."""
    stmt = select(func.count(Chapter.id)).where(
        Chapter.novel_id == novel_id,
        Chapter.chapter_number == chapter_number,
    )
    result = await db.execute(stmt)
    return (result.scalar() or 0) > 0
266
+
267
+
268
@handle_db_errors
async def get_total_word_count(db: AsyncSession, novel_id: int) -> int:
    """Sum word_count over all of a novel's chapters (0 if none)."""
    stmt = select(func.sum(Chapter.word_count)).where(Chapter.novel_id == novel_id)
    result = await db.execute(stmt)
    # SUM over zero rows yields NULL -> scalar() is None -> report 0.
    return result.scalar() or 0
app/database/models.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ============================================
3
+ Database Models (Tables)
4
+ - Novel: Stores novel metadata & scraping status
5
+ - Chapter: Stores individual chapter content
6
+ ============================================
7
+ """
8
+
9
+ import enum
10
+ from datetime import datetime, timezone
11
+ from sqlalchemy import (
12
+ Column,
13
+ Integer,
14
+ String,
15
+ Text,
16
+ DateTime,
17
+ ForeignKey,
18
+ Enum as SQLEnum,
19
+ Boolean,
20
+ Float,
21
+ Index,
22
+ UniqueConstraint,
23
+ )
24
+ from sqlalchemy.orm import relationship
25
+ from app.database.connection import Base
26
+
27
+
28
# ============================================
# Enums for Status Tracking
# ============================================
class NovelStatus(str, enum.Enum):
    """Lifecycle states of a novel's scraping job (str-valued for easy JSON/DB use)."""

    QUEUED = "queued"                    # Waiting in queue
    LOGGING_IN = "logging_in"            # Attempting login
    SCRAPING = "scraping"                # Actively scraping chapters
    PAUSED_CAPTCHA = "paused_captcha"    # Waiting for manual intervention
    PAUSED_ERROR = "paused_error"        # Paused due to error
    COMPLETED = "completed"              # All chapters scraped
    FAILED = "failed"                    # Permanently failed
40
+
41
+
42
# ============================================
# Novel Model
# ============================================
class Novel(Base):
    """
    Stores metadata about each novel being scraped.

    One row per scraping job: source URL, optional login credentials,
    CSS selectors the scraper uses, status/progress tracking, and a
    one-to-many relationship to Chapter rows.
    """
    __tablename__ = "novels"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Basic Info ---
    title = Column(String(500), nullable=False, default="Unknown Novel")
    url = Column(String(2000), nullable=False)  # Starting URL the job was created with
    current_url = Column(String(2000), nullable=True)  # Page the scraper last reached

    # --- Credentials (for login) ---
    login_email = Column(String(500), nullable=True)
    # NOTE(review): stored in plaintext — in production, encrypt this!
    login_password = Column(String(500), nullable=True)

    # --- Scraping Config ---
    # Comma-separated CSS selector alternatives; defaults cover common
    # novel-site markup.
    next_button_selector = Column(
        String(500),
        nullable=False,
        default="a.next_page, a[rel='next'], .next-chap, button.next-chapter"
    )
    content_selector = Column(
        String(500),
        nullable=False,
        default=".chapter-content, .reading-content, #chapter-content, .text-left"
    )

    # --- Status Tracking ---
    status = Column(
        SQLEnum(NovelStatus),
        nullable=False,
        default=NovelStatus.QUEUED
    )
    chapters_scraped = Column(Integer, default=0)  # Progress counter
    last_error = Column(Text, nullable=True)       # Most recent failure message

    # --- Captcha/Intervention ---
    # Path to a saved page screenshot shown to the operator for manual solving.
    screenshot_path = Column(String(1000), nullable=True)
    needs_intervention = Column(Boolean, default=False)

    # --- Timestamps (timezone-aware UTC) ---
    created_at = Column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc)
    )
    updated_at = Column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        onupdate=lambda: datetime.now(timezone.utc)
    )

    # --- Relationships ---
    # lazy="selectin" loads chapters in a second query (safe under asyncio,
    # where implicit lazy loading would fail); delete-orphan cascades removal
    # of a novel to its chapters at the ORM level.
    chapters = relationship(
        "Chapter",
        back_populates="novel",
        cascade="all, delete-orphan",
        order_by="Chapter.chapter_number",
        lazy="selectin",
    )

    def __repr__(self):
        return f"<Novel(id={self.id}, title='{self.title}', status={self.status})>"
109
+
110
+
111
# ============================================
# Chapter Model
# ============================================
class Chapter(Base):
    """
    Stores individual chapter content.

    Each chapter belongs to a Novel; (novel_id, chapter_number) is unique,
    so the same chapter cannot be saved twice for one novel.
    """
    __tablename__ = "chapters"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Foreign Key ---
    # DB-level cascade: deleting a novel deletes its chapters even outside the ORM.
    novel_id = Column(
        Integer,
        ForeignKey("novels.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # --- Chapter Data ---
    chapter_number = Column(Integer, nullable=False)
    title = Column(String(1000), nullable=True, default="")
    content = Column(Text, nullable=False)  # The actual chapter text
    url = Column(String(2000), nullable=True)  # URL where this was scraped from
    word_count = Column(Integer, default=0)  # Whitespace-split count, set on save

    # --- Timestamps (timezone-aware UTC) ---
    scraped_at = Column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc)
    )

    # --- Relationships ---
    novel = relationship("Novel", back_populates="chapters")

    # --- Constraints ---
    __table_args__ = (
        # Ensure no duplicate chapters for the same novel
        UniqueConstraint("novel_id", "chapter_number", name="uq_novel_chapter"),
        # Composite index for (novel, chapter) lookups and ordered scans
        Index("ix_chapter_novel_number", "novel_id", "chapter_number"),
    )

    def __repr__(self):
        return f"<Chapter(id={self.id}, novel_id={self.novel_id}, ch={self.chapter_number})>"