Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files
- app/database/crud.py +5 -1
- app/database/models.py +24 -79
app/database/crud.py
CHANGED
|
@@ -186,9 +186,11 @@ async def save_chapter(
|
|
| 186 |
content: str,
|
| 187 |
title: Optional[str] = None,
|
| 188 |
url: Optional[str] = None,
|
|
|
|
|
|
|
| 189 |
) -> Chapter:
|
| 190 |
"""
|
| 191 |
-
Save a chapter to the database.
|
| 192 |
If a chapter with the same number already exists for this novel,
|
| 193 |
it will raise an IntegrityError (caught by caller).
|
| 194 |
"""
|
|
@@ -198,7 +200,9 @@ async def save_chapter(
|
|
| 198 |
novel_id=novel_id,
|
| 199 |
chapter_number=chapter_number,
|
| 200 |
title=title or f"Chapter {chapter_number}",
|
|
|
|
| 201 |
content=content,
|
|
|
|
| 202 |
url=url,
|
| 203 |
word_count=word_count,
|
| 204 |
)
|
|
|
|
| 186 |
content: str,
|
| 187 |
title: Optional[str] = None,
|
| 188 |
url: Optional[str] = None,
|
| 189 |
+
content_hindi: Optional[str] = None,
|
| 190 |
+
title_hindi: Optional[str] = None,
|
| 191 |
) -> Chapter:
|
| 192 |
"""
|
| 193 |
+
Save a chapter to the database (English + Hindi).
|
| 194 |
If a chapter with the same number already exists for this novel,
|
| 195 |
it will raise an IntegrityError (caught by caller).
|
| 196 |
"""
|
|
|
|
| 200 |
novel_id=novel_id,
|
| 201 |
chapter_number=chapter_number,
|
| 202 |
title=title or f"Chapter {chapter_number}",
|
| 203 |
+
title_hindi=title_hindi,
|
| 204 |
content=content,
|
| 205 |
+
content_hindi=content_hindi,
|
| 206 |
url=url,
|
| 207 |
word_count=word_count,
|
| 208 |
)
|
app/database/models.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
============================================
|
| 3 |
Database Models (Tables)
|
| 4 |
- Novel: Stores novel metadata & scraping status
|
| 5 |
-
- Chapter: Stores individual chapter content
|
| 6 |
============================================
|
| 7 |
"""
|
| 8 |
|
|
@@ -25,80 +25,48 @@ from sqlalchemy.orm import relationship
|
|
| 25 |
from app.database.connection import Base
|
| 26 |
|
| 27 |
|
| 28 |
-
# ============================================
|
| 29 |
-
# Enums for Status Tracking
|
| 30 |
-
# ============================================
|
| 31 |
class NovelStatus(str, enum.Enum):
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
FAILED = "failed" # Permanently failed
|
| 40 |
|
| 41 |
|
| 42 |
-
# ============================================
|
| 43 |
-
# Novel Model
|
| 44 |
-
# ============================================
|
| 45 |
class Novel(Base):
|
| 46 |
-
"""
|
| 47 |
-
Stores metadata about each novel being scraped.
|
| 48 |
-
"""
|
| 49 |
__tablename__ = "novels"
|
| 50 |
|
| 51 |
id = Column(Integer, primary_key=True, autoincrement=True)
|
| 52 |
-
|
| 53 |
-
# --- Basic Info ---
|
| 54 |
title = Column(String(500), nullable=False, default="Unknown Novel")
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
# --- Credentials (for login) ---
|
| 59 |
login_email = Column(String(500), nullable=True)
|
| 60 |
-
login_password = Column(String(500), nullable=True)
|
| 61 |
-
|
| 62 |
-
# --- Scraping Config ---
|
| 63 |
next_button_selector = Column(
|
| 64 |
-
String(500),
|
| 65 |
-
nullable=False,
|
| 66 |
default="a.next_page, a[rel='next'], .next-chap, button.next-chapter"
|
| 67 |
)
|
| 68 |
content_selector = Column(
|
| 69 |
-
String(500),
|
| 70 |
-
nullable=False,
|
| 71 |
default=".chapter-content, .reading-content, #chapter-content, .text-left"
|
| 72 |
)
|
| 73 |
-
|
| 74 |
-
# --- Status Tracking ---
|
| 75 |
-
status = Column(
|
| 76 |
-
SQLEnum(NovelStatus),
|
| 77 |
-
nullable=False,
|
| 78 |
-
default=NovelStatus.QUEUED
|
| 79 |
-
)
|
| 80 |
chapters_scraped = Column(Integer, default=0)
|
| 81 |
last_error = Column(Text, nullable=True)
|
| 82 |
-
|
| 83 |
-
# --- Captcha/Intervention ---
|
| 84 |
screenshot_path = Column(String(1000), nullable=True)
|
| 85 |
needs_intervention = Column(Boolean, default=False)
|
| 86 |
-
|
| 87 |
-
# --- Timestamps ---
|
| 88 |
-
created_at = Column(
|
| 89 |
-
DateTime(timezone=True),
|
| 90 |
-
default=lambda: datetime.now(timezone.utc)
|
| 91 |
-
)
|
| 92 |
updated_at = Column(
|
| 93 |
DateTime(timezone=True),
|
| 94 |
default=lambda: datetime.now(timezone.utc),
|
| 95 |
onupdate=lambda: datetime.now(timezone.utc)
|
| 96 |
)
|
| 97 |
|
| 98 |
-
# --- Relationships ---
|
| 99 |
chapters = relationship(
|
| 100 |
-
"Chapter",
|
| 101 |
-
back_populates="novel",
|
| 102 |
cascade="all, delete-orphan",
|
| 103 |
order_by="Chapter.chapter_number",
|
| 104 |
lazy="selectin",
|
|
@@ -108,49 +76,26 @@ class Novel(Base):
|
|
| 108 |
return f"<Novel(id={self.id}, title='{self.title}', status={self.status})>"
|
| 109 |
|
| 110 |
|
| 111 |
-
# ============================================
|
| 112 |
-
# Chapter Model
|
| 113 |
-
# ============================================
|
| 114 |
class Chapter(Base):
|
| 115 |
-
"""
|
| 116 |
-
Stores individual chapter content.
|
| 117 |
-
Each chapter belongs to a Novel.
|
| 118 |
-
"""
|
| 119 |
__tablename__ = "chapters"
|
| 120 |
|
| 121 |
id = Column(Integer, primary_key=True, autoincrement=True)
|
| 122 |
-
|
| 123 |
-
# --- Foreign Key ---
|
| 124 |
-
novel_id = Column(
|
| 125 |
-
Integer,
|
| 126 |
-
ForeignKey("novels.id", ondelete="CASCADE"),
|
| 127 |
-
nullable=False,
|
| 128 |
-
index=True,
|
| 129 |
-
)
|
| 130 |
-
|
| 131 |
-
# --- Chapter Data ---
|
| 132 |
chapter_number = Column(Integer, nullable=False)
|
| 133 |
title = Column(String(1000), nullable=True, default="")
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
| 136 |
word_count = Column(Integer, default=0)
|
| 137 |
-
|
| 138 |
-
# --- Timestamps ---
|
| 139 |
-
scraped_at = Column(
|
| 140 |
-
DateTime(timezone=True),
|
| 141 |
-
default=lambda: datetime.now(timezone.utc)
|
| 142 |
-
)
|
| 143 |
|
| 144 |
-
# --- Relationships ---
|
| 145 |
novel = relationship("Novel", back_populates="chapters")
|
| 146 |
|
| 147 |
-
# --- Constraints ---
|
| 148 |
__table_args__ = (
|
| 149 |
-
# Ensure no duplicate chapters for same novel
|
| 150 |
UniqueConstraint("novel_id", "chapter_number", name="uq_novel_chapter"),
|
| 151 |
-
# Index for faster queries
|
| 152 |
Index("ix_chapter_novel_number", "novel_id", "chapter_number"),
|
| 153 |
)
|
| 154 |
|
| 155 |
def __repr__(self):
|
| 156 |
-
return f"<Chapter(id={self.id}, novel_id={self.novel_id}, ch={self.chapter_number})>"
|
|
|
|
| 2 |
============================================
|
| 3 |
Database Models (Tables)
|
| 4 |
- Novel: Stores novel metadata & scraping status
|
| 5 |
+
- Chapter: Stores individual chapter content (English + Hindi)
|
| 6 |
============================================
|
| 7 |
"""
|
| 8 |
|
|
|
|
| 25 |
from app.database.connection import Base
|
| 26 |
|
| 27 |
|
|
|
|
|
|
|
|
|
|
| 28 |
class NovelStatus(str, enum.Enum):
|
| 29 |
+
QUEUED = "queued"
|
| 30 |
+
LOGGING_IN = "logging_in"
|
| 31 |
+
SCRAPING = "scraping"
|
| 32 |
+
PAUSED_CAPTCHA = "paused_captcha"
|
| 33 |
+
PAUSED_ERROR = "paused_error"
|
| 34 |
+
COMPLETED = "completed"
|
| 35 |
+
FAILED = "failed"
|
|
|
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
|
|
|
| 38 |
class Novel(Base):
|
|
|
|
|
|
|
|
|
|
| 39 |
__tablename__ = "novels"
|
| 40 |
|
| 41 |
id = Column(Integer, primary_key=True, autoincrement=True)
|
|
|
|
|
|
|
| 42 |
title = Column(String(500), nullable=False, default="Unknown Novel")
|
| 43 |
+
title_hindi = Column(String(500), nullable=True) # Hindi translated title
|
| 44 |
+
url = Column(String(2000), nullable=False)
|
| 45 |
+
current_url = Column(String(2000), nullable=True)
|
|
|
|
| 46 |
login_email = Column(String(500), nullable=True)
|
| 47 |
+
login_password = Column(String(500), nullable=True)
|
|
|
|
|
|
|
| 48 |
next_button_selector = Column(
|
| 49 |
+
String(500), nullable=False,
|
|
|
|
| 50 |
default="a.next_page, a[rel='next'], .next-chap, button.next-chapter"
|
| 51 |
)
|
| 52 |
content_selector = Column(
|
| 53 |
+
String(500), nullable=False,
|
|
|
|
| 54 |
default=".chapter-content, .reading-content, #chapter-content, .text-left"
|
| 55 |
)
|
| 56 |
+
status = Column(SQLEnum(NovelStatus), nullable=False, default=NovelStatus.QUEUED)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
chapters_scraped = Column(Integer, default=0)
|
| 58 |
last_error = Column(Text, nullable=True)
|
|
|
|
|
|
|
| 59 |
screenshot_path = Column(String(1000), nullable=True)
|
| 60 |
needs_intervention = Column(Boolean, default=False)
|
| 61 |
+
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
updated_at = Column(
|
| 63 |
DateTime(timezone=True),
|
| 64 |
default=lambda: datetime.now(timezone.utc),
|
| 65 |
onupdate=lambda: datetime.now(timezone.utc)
|
| 66 |
)
|
| 67 |
|
|
|
|
| 68 |
chapters = relationship(
|
| 69 |
+
"Chapter", back_populates="novel",
|
|
|
|
| 70 |
cascade="all, delete-orphan",
|
| 71 |
order_by="Chapter.chapter_number",
|
| 72 |
lazy="selectin",
|
|
|
|
| 76 |
return f"<Novel(id={self.id}, title='{self.title}', status={self.status})>"
|
| 77 |
|
| 78 |
|
|
|
|
|
|
|
|
|
|
| 79 |
class Chapter(Base):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
__tablename__ = "chapters"
|
| 81 |
|
| 82 |
id = Column(Integer, primary_key=True, autoincrement=True)
|
| 83 |
+
novel_id = Column(Integer, ForeignKey("novels.id", ondelete="CASCADE"), nullable=False, index=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
chapter_number = Column(Integer, nullable=False)
|
| 85 |
title = Column(String(1000), nullable=True, default="")
|
| 86 |
+
title_hindi = Column(String(1000), nullable=True) # ✅ Hindi title
|
| 87 |
+
content = Column(Text, nullable=False) # English content
|
| 88 |
+
content_hindi = Column(Text, nullable=True) # ✅ Hindi content
|
| 89 |
+
url = Column(String(2000), nullable=True)
|
| 90 |
word_count = Column(Integer, default=0)
|
| 91 |
+
scraped_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
|
|
|
| 93 |
novel = relationship("Novel", back_populates="chapters")
|
| 94 |
|
|
|
|
| 95 |
__table_args__ = (
|
|
|
|
| 96 |
UniqueConstraint("novel_id", "chapter_number", name="uq_novel_chapter"),
|
|
|
|
| 97 |
Index("ix_chapter_novel_number", "novel_id", "chapter_number"),
|
| 98 |
)
|
| 99 |
|
| 100 |
def __repr__(self):
|
| 101 |
+
return f"<Chapter(id={self.id}, novel_id={self.novel_id}, ch={self.chapter_number})>"
|