Ruhivig65 committed on
Commit
e29427f
·
verified ·
1 Parent(s): 5d1e634

Upload 4 files

Browse files
Files changed (2) hide show
  1. app/database/crud.py +5 -1
  2. app/database/models.py +24 -79
app/database/crud.py CHANGED
@@ -186,9 +186,11 @@ async def save_chapter(
186
  content: str,
187
  title: Optional[str] = None,
188
  url: Optional[str] = None,
 
 
189
  ) -> Chapter:
190
  """
191
- Save a chapter to the database.
192
  If a chapter with the same number already exists for this novel,
193
  it will raise an IntegrityError (caught by caller).
194
  """
@@ -198,7 +200,9 @@ async def save_chapter(
198
  novel_id=novel_id,
199
  chapter_number=chapter_number,
200
  title=title or f"Chapter {chapter_number}",
 
201
  content=content,
 
202
  url=url,
203
  word_count=word_count,
204
  )
 
186
  content: str,
187
  title: Optional[str] = None,
188
  url: Optional[str] = None,
189
+ content_hindi: Optional[str] = None,
190
+ title_hindi: Optional[str] = None,
191
  ) -> Chapter:
192
  """
193
+ Save a chapter to the database (English + Hindi).
194
  If a chapter with the same number already exists for this novel,
195
  it will raise an IntegrityError (caught by caller).
196
  """
 
200
  novel_id=novel_id,
201
  chapter_number=chapter_number,
202
  title=title or f"Chapter {chapter_number}",
203
+ title_hindi=title_hindi,
204
  content=content,
205
+ content_hindi=content_hindi,
206
  url=url,
207
  word_count=word_count,
208
  )
app/database/models.py CHANGED
@@ -2,7 +2,7 @@
2
  ============================================
3
  Database Models (Tables)
4
  - Novel: Stores novel metadata & scraping status
5
- - Chapter: Stores individual chapter content
6
  ============================================
7
  """
8
 
@@ -25,80 +25,48 @@ from sqlalchemy.orm import relationship
25
  from app.database.connection import Base
26
 
27
 
28
- # ============================================
29
- # Enums for Status Tracking
30
- # ============================================
31
  class NovelStatus(str, enum.Enum):
32
- """Status of a novel's scraping process."""
33
- QUEUED = "queued" # Waiting in queue
34
- LOGGING_IN = "logging_in" # Attempting login
35
- SCRAPING = "scraping" # Actively scraping chapters
36
- PAUSED_CAPTCHA = "paused_captcha" # Waiting for manual intervention
37
- PAUSED_ERROR = "paused_error" # Paused due to error
38
- COMPLETED = "completed" # All chapters scraped
39
- FAILED = "failed" # Permanently failed
40
 
41
 
42
- # ============================================
43
- # Novel Model
44
- # ============================================
45
  class Novel(Base):
46
- """
47
- Stores metadata about each novel being scraped.
48
- """
49
  __tablename__ = "novels"
50
 
51
  id = Column(Integer, primary_key=True, autoincrement=True)
52
-
53
- # --- Basic Info ---
54
  title = Column(String(500), nullable=False, default="Unknown Novel")
55
- url = Column(String(2000), nullable=False) # Starting URL
56
- current_url = Column(String(2000), nullable=True) # Current page URL
57
-
58
- # --- Credentials (for login) ---
59
  login_email = Column(String(500), nullable=True)
60
- login_password = Column(String(500), nullable=True) # In production, encrypt this!
61
-
62
- # --- Scraping Config ---
63
  next_button_selector = Column(
64
- String(500),
65
- nullable=False,
66
  default="a.next_page, a[rel='next'], .next-chap, button.next-chapter"
67
  )
68
  content_selector = Column(
69
- String(500),
70
- nullable=False,
71
  default=".chapter-content, .reading-content, #chapter-content, .text-left"
72
  )
73
-
74
- # --- Status Tracking ---
75
- status = Column(
76
- SQLEnum(NovelStatus),
77
- nullable=False,
78
- default=NovelStatus.QUEUED
79
- )
80
  chapters_scraped = Column(Integer, default=0)
81
  last_error = Column(Text, nullable=True)
82
-
83
- # --- Captcha/Intervention ---
84
  screenshot_path = Column(String(1000), nullable=True)
85
  needs_intervention = Column(Boolean, default=False)
86
-
87
- # --- Timestamps ---
88
- created_at = Column(
89
- DateTime(timezone=True),
90
- default=lambda: datetime.now(timezone.utc)
91
- )
92
  updated_at = Column(
93
  DateTime(timezone=True),
94
  default=lambda: datetime.now(timezone.utc),
95
  onupdate=lambda: datetime.now(timezone.utc)
96
  )
97
 
98
- # --- Relationships ---
99
  chapters = relationship(
100
- "Chapter",
101
- back_populates="novel",
102
  cascade="all, delete-orphan",
103
  order_by="Chapter.chapter_number",
104
  lazy="selectin",
@@ -108,49 +76,26 @@ class Novel(Base):
108
  return f"<Novel(id={self.id}, title='{self.title}', status={self.status})>"
109
 
110
 
111
- # ============================================
112
- # Chapter Model
113
- # ============================================
114
  class Chapter(Base):
115
- """
116
- Stores individual chapter content.
117
- Each chapter belongs to a Novel.
118
- """
119
  __tablename__ = "chapters"
120
 
121
  id = Column(Integer, primary_key=True, autoincrement=True)
122
-
123
- # --- Foreign Key ---
124
- novel_id = Column(
125
- Integer,
126
- ForeignKey("novels.id", ondelete="CASCADE"),
127
- nullable=False,
128
- index=True,
129
- )
130
-
131
- # --- Chapter Data ---
132
  chapter_number = Column(Integer, nullable=False)
133
  title = Column(String(1000), nullable=True, default="")
134
- content = Column(Text, nullable=False) # The actual chapter text
135
- url = Column(String(2000), nullable=True) # URL where this was scraped from
 
 
136
  word_count = Column(Integer, default=0)
137
-
138
- # --- Timestamps ---
139
- scraped_at = Column(
140
- DateTime(timezone=True),
141
- default=lambda: datetime.now(timezone.utc)
142
- )
143
 
144
- # --- Relationships ---
145
  novel = relationship("Novel", back_populates="chapters")
146
 
147
- # --- Constraints ---
148
  __table_args__ = (
149
- # Ensure no duplicate chapters for same novel
150
  UniqueConstraint("novel_id", "chapter_number", name="uq_novel_chapter"),
151
- # Index for faster queries
152
  Index("ix_chapter_novel_number", "novel_id", "chapter_number"),
153
  )
154
 
155
  def __repr__(self):
156
- return f"<Chapter(id={self.id}, novel_id={self.novel_id}, ch={self.chapter_number})>"
 
2
  ============================================
3
  Database Models (Tables)
4
  - Novel: Stores novel metadata & scraping status
5
+ - Chapter: Stores individual chapter content (English + Hindi)
6
  ============================================
7
  """
8
 
 
25
  from app.database.connection import Base
26
 
27
 
 
 
 
28
  class NovelStatus(str, enum.Enum):
29
+ QUEUED = "queued"
30
+ LOGGING_IN = "logging_in"
31
+ SCRAPING = "scraping"
32
+ PAUSED_CAPTCHA = "paused_captcha"
33
+ PAUSED_ERROR = "paused_error"
34
+ COMPLETED = "completed"
35
+ FAILED = "failed"
 
36
 
37
 
 
 
 
38
  class Novel(Base):
 
 
 
39
  __tablename__ = "novels"
40
 
41
  id = Column(Integer, primary_key=True, autoincrement=True)
 
 
42
  title = Column(String(500), nullable=False, default="Unknown Novel")
43
+ title_hindi = Column(String(500), nullable=True) # Hindi translated title
44
+ url = Column(String(2000), nullable=False)
45
+ current_url = Column(String(2000), nullable=True)
 
46
  login_email = Column(String(500), nullable=True)
47
+ login_password = Column(String(500), nullable=True)
 
 
48
  next_button_selector = Column(
49
+ String(500), nullable=False,
 
50
  default="a.next_page, a[rel='next'], .next-chap, button.next-chapter"
51
  )
52
  content_selector = Column(
53
+ String(500), nullable=False,
 
54
  default=".chapter-content, .reading-content, #chapter-content, .text-left"
55
  )
56
+ status = Column(SQLEnum(NovelStatus), nullable=False, default=NovelStatus.QUEUED)
 
 
 
 
 
 
57
  chapters_scraped = Column(Integer, default=0)
58
  last_error = Column(Text, nullable=True)
 
 
59
  screenshot_path = Column(String(1000), nullable=True)
60
  needs_intervention = Column(Boolean, default=False)
61
+ created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
 
 
 
 
 
62
  updated_at = Column(
63
  DateTime(timezone=True),
64
  default=lambda: datetime.now(timezone.utc),
65
  onupdate=lambda: datetime.now(timezone.utc)
66
  )
67
 
 
68
  chapters = relationship(
69
+ "Chapter", back_populates="novel",
 
70
  cascade="all, delete-orphan",
71
  order_by="Chapter.chapter_number",
72
  lazy="selectin",
 
76
  return f"<Novel(id={self.id}, title='{self.title}', status={self.status})>"
77
 
78
 
 
 
 
79
  class Chapter(Base):
 
 
 
 
80
  __tablename__ = "chapters"
81
 
82
  id = Column(Integer, primary_key=True, autoincrement=True)
83
+ novel_id = Column(Integer, ForeignKey("novels.id", ondelete="CASCADE"), nullable=False, index=True)
 
 
 
 
 
 
 
 
 
84
  chapter_number = Column(Integer, nullable=False)
85
  title = Column(String(1000), nullable=True, default="")
86
+ title_hindi = Column(String(1000), nullable=True) # Hindi title
87
+ content = Column(Text, nullable=False) # English content
88
+ content_hindi = Column(Text, nullable=True) # ✅ Hindi content
89
+ url = Column(String(2000), nullable=True)
90
  word_count = Column(Integer, default=0)
91
+ scraped_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
 
 
 
 
 
92
 
 
93
  novel = relationship("Novel", back_populates="chapters")
94
 
 
95
  __table_args__ = (
 
96
  UniqueConstraint("novel_id", "chapter_number", name="uq_novel_chapter"),
 
97
  Index("ix_chapter_novel_number", "novel_id", "chapter_number"),
98
  )
99
 
100
  def __repr__(self):
101
+ return f"<Chapter(id={self.id}, novel_id={self.novel_id}, ch={self.chapter_number})>"