Commit 804054e by IndraneelKumar (1 parent: dc59d80)

Added RSS Feeds for Medium Articles and Individual Publications

.env.example CHANGED
@@ -1,4 +1,4 @@
-SUPABASE_DB__TABLE_NAME=substack_articles
+SUPABASE_DB__TABLE_NAME=feed_articles
 SUPABASE_DB__HOST=your_supabase_db_host_here
 SUPABASE_DB__NAME=postgres
 SUPABASE_DB__USER=your_supabase_db_user_here
@@ -12,7 +12,7 @@ RSS__BATCH_SIZE=30
 # Qdrant configuration
 QDRANT__API_KEY=your_qdrant_api_key_here
 QDRANT__URL=your_qdrant_url_here
-QDRANT__COLLECTION_NAME=substack_collection
+QDRANT__COLLECTION_NAME=feed_collection
 QDRANT__DENSE_MODEL_NAME=BAAI/bge-base-en-v1.5 # BAAI/bge-large-en-v1.5 (1024), BAAI/bge-base-en-v1.5 (HF, 768), BAAI/bge-base-en (Fastembed, 768)
 QDRANT__SPARSE_MODEL_NAME=Qdrant/bm25 # prithivida/Splade_PP_en_v1, Qdrant/bm25
 QDRANT__VECTOR_DIM=768 # 768, 1024
@@ -49,7 +49,7 @@ OPENROUTER__API_URL=https://openrouter.ai/api/v1
 
 # OPIK OBSERVABILITY
 OPIK__API_KEY=your_opik_api_key_here
-OPIK__PROJECT_NAME=substack-pipeline
+OPIK__PROJECT_NAME=feed-pipeline
 
 # FastAPI Endpoint
 BACKEND_URL=your_fastapi_backend_url_here
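The double underscores in these names are not decoration: `src/config.py` reads them through pydantic-settings' nested delimiter, so `SUPABASE_DB__TABLE_NAME` lands on `settings.supabase_db.table_name`. A minimal sketch of that wiring, assuming the project configures it roughly like this:

```python
# Minimal sketch of the env-to-settings mapping, assuming pydantic-settings
# with env_nested_delimiter="__" (the real wiring lives in src/config.py).
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class SupabaseDBSettings(BaseModel):
    table_name: str = Field(default="feed_articles")
    host: str = Field(default="localhost")


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", env_nested_delimiter="__")

    supabase_db: SupabaseDBSettings = SupabaseDBSettings()


settings = Settings()
# SUPABASE_DB__TABLE_NAME=feed_articles in .env populates this field:
print(settings.supabase_db.table_name)
```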
README.md CHANGED
@@ -12,7 +12,7 @@ pinned: false
 
 # Articles Search Engine
 
-A compact, production-style RAG pipeline. It ingests Substack RSS articles, stores them in Postgres (Supabase), creates dense/sparse embeddings in Qdrant, and exposes search and answer endpoints via FastAPI with a simple Gradio UI.
+A compact, production-style RAG pipeline. It ingests RSS articles from Substack, Medium, and top publications, stores them in Postgres (Supabase), creates dense/sparse embeddings in Qdrant, and exposes search and answer endpoints via FastAPI with a simple Gradio UI.
 
 ## How it works (brief)
 - Ingest RSS → Supabase:
@@ -23,6 +23,7 @@ A compact, production-style RAG pipeline. It ingests Substack RSS articles, stor
 - Search + generate:
   - FastAPI (`src/api/main.py`) exposes search endpoints (keyword, semantic, hybrid) and assembles answers with citations.
   - LLM providers are pluggable with fallback (OpenRouter, OpenAI, Hugging Face).
+  - Opik is used for LLM evaluation and observability.
 - UI + deploy:
   - Gradio app for quick local search (`frontend/app.py`).
   - Containerization with Docker and optional deploy to Google Cloud Run.
@@ -31,7 +32,7 @@ A compact, production-style RAG pipeline. It ingests Substack RSS articles, stor
 - Python 3.12, FastAPI, Prefect, SQLAlchemy
 - Supabase (Postgres) for articles
 - Qdrant for vector search (dense + sparse/hybrid)
-- OpenRouter / OpenAI / Hugging Face for LLM completion
+- OpenRouter / OpenAI / Hugging Face for LLM completion, Opik for LLM evaluation
 - Gradio UI, Docker, Google Cloud Run
 - Config via Pydantic Settings, `uv` or `pip` for deps
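For orientation, a hedged sketch of calling the search API the README describes. The route name and response shape are assumptions for illustration; the actual endpoints are defined in `src/api/main.py`:

```python
# Hedged sketch: query a hybrid search endpoint like the one the README
# describes. The route "/search/hybrid" and the JSON shape are assumptions;
# check src/api/main.py for the real contract.
import requests

BACKEND_URL = "http://localhost:8000"  # see BACKEND_URL in .env.example

resp = requests.get(
    f"{BACKEND_URL}/search/hybrid",  # hypothetical route name
    params={"query": "vector databases", "limit": 5},
    timeout=30,
)
resp.raise_for_status()
for hit in resp.json():  # assumed: a list of scored article chunks
    print(hit)
```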
cloudbuild_fastapi.yaml CHANGED
@@ -7,6 +7,6 @@ steps:
       export DOCKER_BUILDKIT=1
       docker build -t gcr.io/${PROJECT_ID}/${_SERVICE_NAME} -f Dockerfile .
 substitutions:
-  _SERVICE_NAME: "substack-pipeline-fastapi"
+  _SERVICE_NAME: "feed-pipeline-fastapi"
 images:
   - "gcr.io/${PROJECT_ID}/${_SERVICE_NAME}"
deploy_fastapi.sh CHANGED
@@ -26,7 +26,7 @@ echo "✅ Environment variables loaded."
 # Configuration
 # -----------------------
 PROJECT_ID="personal-projects-477710"
-SERVICE_NAME="substack-pipeline-fastapi"
+SERVICE_NAME="feed-pipeline-fastapi"
 REGION="asia-south2" #europe-west1 "europe-west6"
 IMAGE_NAME="gcr.io/$PROJECT_ID/$SERVICE_NAME"
 
frontend/app.py CHANGED
@@ -551,12 +551,12 @@ def update_model_choices(provider):
 # -----------------------
 # Gradio UI
 # -----------------------
-with gr.Blocks(title="Substack Articles LLM Engine", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
+with gr.Blocks(title="AI Search Engine for Articles", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
     # Header
     gr.HTML(
         "<div id='app-header'>"
-        "  <h1>📰 Substack Articles LLM Engine</h1>"
-        "  <p>Search Substack content or ask an AI across your feeds — fast and delightful.</p>"
+        "  <h1>📰 AI Search Engine for Articles</h1>"
+        "  <p>Search content from Substack, Medium, and top publications, or ask an AI across your feeds — fast and delightful.</p>"
         "</div>"
     )
 
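For context, a stripped-down sketch of the retitled UI shell; `CUSTOM_CSS` and the search/ask widgets from `frontend/app.py` are omitted:

```python
# Stripped-down sketch of the retitled Gradio shell; the real app adds
# CUSTOM_CSS and the search/ask components.
import gradio as gr

with gr.Blocks(title="AI Search Engine for Articles", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        "<div id='app-header'>"
        "  <h1>📰 AI Search Engine for Articles</h1>"
        "  <p>Search content from Substack, Medium, and top publications.</p>"
        "</div>"
    )

if __name__ == "__main__":
    demo.launch()
```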
prefect-cloud.yaml CHANGED
@@ -1,7 +1,7 @@
 pull:
   - prefect.deployments.steps.git_clone:
       id: clone-step
-      repository: https://github.com/Indraneel99/substack-newsletters-search-course
+      repository: https://github.com/Indraneel99/AISearchEngine
       credentials: "{{ prefect.blocks.github-credentials.my-gh-creds }}"
 
   - prefect.deployments.steps.run_shell_script:
prefect-local.yaml CHANGED
@@ -1,7 +1,7 @@
 pull:
   - prefect.deployments.steps.git_clone:
       id: clone-step
-      repository: https://github.com/Indraneel99/substack-newsletters-search-course
+      repository: https://github.com/Indraneel99/AISearchEngine
       credentials: "{{ prefect.blocks.github-credentials.my-gh-creds }}"
 
 # This function ensures pip is installed in the environment (Only needed for Prefect Server)
pyproject.toml CHANGED
@@ -1,7 +1,7 @@
1
  [project]
2
- name = "substack-newsletters-search-course"
3
  version = "1.0.0"
4
- description = "A pipeline to retrieve Newsletters from Substack"
5
  readme = "README.md"
6
  authors = [
7
  {name = "Benito Martin"}
 
1
  [project]
2
+ name = "AISearchEngine"
3
  version = "1.0.0"
4
+ description = "A pipeline to retrieve Newsletters from Substack, Medium and Top publications"
5
  readme = "README.md"
6
  authors = [
7
  {name = "Benito Martin"}
src/api/main.py CHANGED
@@ -73,9 +73,9 @@ async def lifespan(app: FastAPI):
 # -----------------------
 
 app = FastAPI(
-    title="Substack RAG API",
+    title="Search Engine RAG API",
     version="1.0",
-    description="API for Substack Retrieval-Augmented Generation (RAG) system",
+    description="API for the Articles Search Retrieval-Augmented Generation (RAG) system",
     lifespan=lifespan,
     # root_path=root_path,
 )
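The `lifespan=lifespan` argument follows FastAPI's lifespan pattern: startup work runs before the `yield`, teardown after. A minimal sketch, with illustrative hook bodies (the real hooks live in `src/api/main.py`):

```python
# Minimal sketch of the FastAPI lifespan pattern used above; the startup and
# shutdown bodies are illustrative.
from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    # startup: e.g. open the Qdrant client and DB engine (illustrative)
    yield
    # shutdown: close clients and dispose of the engine


app = FastAPI(
    title="Search Engine RAG API",
    version="1.0",
    description="API for the Articles Search Retrieval-Augmented Generation (RAG) system",
    lifespan=lifespan,
)
```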
src/config.py CHANGED
@@ -12,13 +12,13 @@ from src.models.article_models import FeedItem
 # Supabase database settings
 # -----------------------------
 class SupabaseDBSettings(BaseModel):
-    table_name: str = Field(default="substack_articles", description="Supabase table name")
+    table_name: str = Field(default="feed_articles", description="Supabase table name")
     host: str = Field(default="localhost", description="Database host")
     name: str = Field(default="postgres", description="Database name")
     user: str = Field(default="postgres", description="Database user")
     password: SecretStr = Field(default=SecretStr("password"), description="Database password")
     port: int = Field(default=6543, description="Database port")
-    test_database: str = Field(default="substack_test", description="Test database name")
+    test_database: str = Field(default="feed_test", description="Test database name")
 
 
 # -----------------------------
@@ -43,7 +43,7 @@ class QdrantSettings(BaseModel):
     api_key: str = Field(default="", description="Qdrant API key")
     timeout: int = Field(default=30, description="Qdrant client timeout")
     collection_name: str = Field(
-        default="substack_collection", description="Qdrant collection name"
+        default="feed_collection", description="Qdrant collection name"
     )
     dense_model_name: str = Field(default="BAAI/bge-base-en", description="Dense model name")
     sparse_model_name: str = Field(
@@ -126,7 +126,7 @@ class OpenRouterSettings(BaseModel):
 # -----------------------------
 class OpikObservabilitySettings(BaseModel):
     api_key: str = Field(default="", description="Opik Observability API key")
-    project_name: str = Field(default="substack-pipeline", description="Opik project name")
+    project_name: str = Field(default="feed-pipeline", description="Opik project name")
 
 
 # -----------------------------
src/configs/feeds_rss.yaml CHANGED
@@ -89,3 +89,39 @@ feeds:
   - name: "slys.dev"
     author: "Anna & Jakub Slys"
     url: "https://iam.slys.dev/feed"
+  - name: "Technology Review"
+    author: "Technology Review"
+    url: "https://www.technologyreview.com/feed"
+  - name: "AI Trends"
+    author: "AI Trends"
+    url: "https://www.aitrends.com/feed"
+  - name: "Machine Learning Mastery"
+    author: "Machine Learning Mastery"
+    url: "https://machinelearningmastery.com/feed"
+  - name: "The Gradient"
+    author: "The Gradient"
+    url: "https://thegradient.pub/rss/"
+  - name: "Towards Data Science"
+    author: "Towards Data Science"
+    url: "https://towardsdatascience.com/feed/"
+  - name: "Microsoft AI"
+    author: "Microsoft AI"
+    url: "https://blogs.microsoft.com/ai/feed/"
+  - name: "Marktechpost"
+    author: "Marktechpost"
+    url: "https://www.marktechpost.com/feed/"
+  - name: "Daily AI"
+    author: "Daily AI"
+    url: "https://dailyai.com/feed/"
+  - name: "TopBots"
+    author: "TopBots"
+    url: "https://www.topbots.com/feed/"
+  - name: "Towards AI"
+    author: "Towards AI"
+    url: "https://pub.towardsai.net/feed/"
+  - name: "The Pycoach"
+    author: "Frank Andrade"
+    url: "https://medium.com/feed/@frank-andrade"
+  - name: "Anthony Alcaraz"
+    author: "Anthony Alcaraz"
+    url: "https://medium.com/feed/@alcarazanthony1"
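A quick way to sanity-check the new entries is to load the registry and parse one feed. A sketch, assuming the YAML path above and a FeedItem with exactly these three fields (the project's model in `src/models/article_models.py` may carry more):

```python
# Sketch: load the feed registry and parse one of the newly added feeds.
# FeedItem here mirrors the three YAML keys only.
import feedparser
import yaml
from pydantic import BaseModel


class FeedItem(BaseModel):
    name: str
    author: str
    url: str


with open("src/configs/feeds_rss.yaml") as f:
    feeds = [FeedItem(**item) for item in yaml.safe_load(f)["feeds"]]

gradient = next(feed for feed in feeds if feed.name == "The Gradient")
parsed = feedparser.parse(gradient.url)  # fetch and parse the RSS XML
for entry in parsed.entries[:3]:
    print(entry.get("title"), entry.get("link"))
```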
src/infrastructure/qdrant/qdrant_vectorstore.py CHANGED
@@ -26,7 +26,7 @@ from qdrant_client.models import Batch, Distance, SparseVector, models
 from sqlalchemy.orm import Session
 
 from src.config import settings
-from src.models.sql_models import SubstackArticle
+from src.models.sql_models import FeedArticle
 from src.models.vectorstore_models import ArticleChunkPayload
 from src.utils.logger_util import log_batch_status, setup_logging
 from src.utils.text_splitter import TextSplitter
@@ -522,7 +522,7 @@ class AsyncQdrantVectorStore:
 
     async def _article_batch_generator(
         self, session: Session, from_date: datetime | None = None
-    ) -> AsyncGenerator[list[SubstackArticle], None]:
+    ) -> AsyncGenerator[list[FeedArticle], None]:
         """Yield batches of articles from SQL database.
 
         Args:
@@ -530,7 +530,7 @@ class AsyncQdrantVectorStore:
             from_date (datetime, optional): Filter articles from this date.
 
         Yields:
-            list[SubstackArticle]: Batch of articles.
+            list[FeedArticle]: Batch of articles.
 
         Raises:
             Exception: If database query fails.
@@ -542,9 +542,9 @@ class AsyncQdrantVectorStore:
         try:
             offset = 0
             while True:
-                query = session.query(SubstackArticle).order_by(SubstackArticle.published_at)
+                query = session.query(FeedArticle).order_by(FeedArticle.published_at)
                 if from_date:
-                    query = query.filter(FeedArticle.published_at >= from_date)
+                    query = query.filter(FeedArticle.published_at >= from_date)
                 articles = query.offset(offset).limit(self.article_batch_size).all()
                 if not articles:
                     break
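The renamed model feeds a hybrid collection. For context, a hedged sketch of the dense-plus-sparse layout implied by the .env defaults (768-dim dense vectors, BM25-style sparse vectors); the vector names are assumptions, and the project's actual schema is built in this module:

```python
# Hedged sketch of a hybrid dense+sparse Qdrant collection matching the .env
# defaults. The vector names "dense" and "sparse" are assumptions.
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    PointStruct,
    SparseVector,
    SparseVectorParams,
    VectorParams,
)

client = QdrantClient(url="http://localhost:6333")

client.create_collection(  # fails if the collection already exists
    collection_name="feed_collection",
    vectors_config={"dense": VectorParams(size=768, distance=Distance.COSINE)},
    sparse_vectors_config={"sparse": SparseVectorParams()},
)

client.upsert(
    collection_name="feed_collection",
    points=[
        PointStruct(
            id=1,
            vector={
                "dense": [0.0] * 768,  # placeholder embedding
                "sparse": SparseVector(indices=[7, 42], values=[0.8, 0.3]),
            },
            payload={"title": "example", "published_at": "2025-01-01"},
        )
    ],
)
```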
src/infrastructure/supabase/create_db.py CHANGED
@@ -2,17 +2,17 @@ from sqlalchemy import inspect
 from sqlalchemy.exc import SQLAlchemyError
 
 from src.infrastructure.supabase.init_session import init_engine
-from src.models.sql_models import Base, SubstackArticle
+from src.models.sql_models import Base, FeedArticle
 from src.utils.logger_util import setup_logging
 
 logger = setup_logging()
 
 
 def create_table() -> None:
-    """Create the SubstackArticle table in the Supabase Postgres database if it does not exist.
+    """Create the FeedArticle table in the Supabase Postgres database if it does not exist.
 
     This function initializes a SQLAlchemy engine, checks if the table defined by
-    `SubstackArticle.__tablename__` exists in the database, and creates it if necessary.
+    `FeedArticle.__tablename__` exists in the database, and creates it if necessary.
     The engine is properly disposed of after the operation to prevent resource leaks.
     Errors during table creation are logged and handled gracefully.
 
@@ -33,14 +33,14 @@ def create_table() -> None:
         # Create an inspector to check existing tables
         inspector = inspect(engine)
         existing_tables = inspector.get_table_names()
-        table_name = SubstackArticle.__tablename__
+        table_name = FeedArticle.__tablename__
 
         # Check if the table already exists
         if table_name in existing_tables:
             logger.info(f"Table '{table_name}' already exists. No action needed.")
         else:
             logger.info(f"Table '{table_name}' does not exist. Creating...")
-            # Create all tables defined in Base.metadata (includes SubstackArticle)
+            # Create all tables defined in Base.metadata (includes FeedArticle)
             Base.metadata.create_all(bind=engine)
             logger.info(f"Table '{table_name}' created successfully.")
     except SQLAlchemyError as e:
src/models/sql_models.py CHANGED
@@ -12,7 +12,7 @@ class Base(DeclarativeBase):
     pass
 
 
-class SubstackArticle(Base):
+class FeedArticle(Base):
     __tablename__ = settings.supabase_db.table_name
 
     # Primary internal ID
src/pipelines/flows/rss_ingestion_flow.py CHANGED
@@ -3,7 +3,7 @@ from prefect import flow, unmapped
 from src.config import settings
 from src.infrastructure.supabase.init_session import init_engine
 from src.models.article_models import FeedItem
-from src.models.sql_models import SubstackArticle
+from src.models.sql_models import FeedArticle
 from src.pipelines.tasks.fetch_rss import fetch_rss_entries
 from src.pipelines.tasks.ingest_rss import ingest_from_rss
 from src.utils.logger_util import setup_logging
@@ -16,7 +16,7 @@ from src.utils.logger_util import setup_logging
     retries=2,
     retry_delay_seconds=120,
 )
-def rss_ingest_flow(article_model: type[SubstackArticle] = SubstackArticle) -> None:
+def rss_ingest_flow(article_model: type[FeedArticle] = FeedArticle) -> None:
     """Fetch and ingest articles from configured RSS feeds concurrently.
 
     Each feed is fetched in parallel and ingested into the database
@@ -24,7 +24,7 @@ def rss_ingest_flow(article_model: type[SubstackArticle] = SubstackArticle) -> N
     after completion.
 
     Args:
-        article_model (type[SubstackArticle]): SQLAlchemy model for storing articles.
+        article_model (type[FeedArticle]): SQLAlchemy model for storing articles.
 
     Returns:
         None
@@ -115,4 +115,4 @@ def rss_ingest_flow(article_model: type[SubstackArticle] = SubstackArticle) -> N
 
 
 if __name__ == "__main__":
-    rss_ingest_flow(article_model=SubstackArticle)
+    rss_ingest_flow(article_model=FeedArticle)
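The flow fans out with Prefect's `.map`, passing one shared engine via `unmapped` (both are imported above). A runnable sketch with stub tasks; the real task bodies live in `src/pipelines/tasks/`:

```python
# Sketch of the fan-out pattern: one task run per feed, all sharing a single
# engine via unmapped(). Task bodies are stubs.
from prefect import flow, task, unmapped


@task(retries=2, retry_delay_seconds=120)
def fetch_rss_entries(feed: str, engine: object) -> list[str]:
    return [f"article from {feed}"]  # stub


@flow
def rss_ingest_flow() -> None:
    engine = object()  # stands in for the SQLAlchemy engine from init_engine()
    feeds = ["feed-a", "feed-b"]
    futures = fetch_rss_entries.map(feeds, unmapped(engine))
    for future in futures:
        print(future.result())


if __name__ == "__main__":
    rss_ingest_flow()
```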
src/pipelines/tasks/fetch_rss.py CHANGED
@@ -8,13 +8,13 @@ from sqlalchemy.orm import Session
 
 from src.infrastructure.supabase.init_session import init_session
 from src.models.article_models import ArticleItem, FeedItem
-from src.models.sql_models import SubstackArticle
+from src.models.sql_models import FeedArticle
 from src.utils.logger_util import setup_logging
 
 
 @task(
     task_run_name="fetch_rss_entries-{feed.name}",
-    description="Fetch RSS entries from a Substack feed.",
+    description="Fetch RSS entries from a Substack, Medium, or publication feed.",
     retries=2,
     retry_delay_seconds=120,
     cache_policy=NO_CACHE,
@@ -22,9 +22,9 @@ from src.utils.logger_util import setup_logging
 def fetch_rss_entries(
     feed: FeedItem,
     engine: Engine,
-    article_model: type[SubstackArticle] = SubstackArticle,
+    article_model: type[FeedArticle] = FeedArticle,
 ) -> list[ArticleItem]:
-    """Fetch all RSS items from a Substack feed and convert them to ArticleItem objects.
+    """Fetch all RSS items from a Substack, Medium, or publication feed and convert them to ArticleItem objects.
 
     Each task uses its own SQLAlchemy session. Articles already stored in the database
     or with empty links/content are skipped. Errors during parsing individual items
@@ -33,8 +33,8 @@ def fetch_rss_entries(
     Args:
         feed (FeedItem): Metadata for the feed (name, author, URL).
         engine (Engine): SQLAlchemy engine for database connection.
-        article_model (type[SubstackArticle], optional): Model used to persist articles.
-            Defaults to SubstackArticle.
+        article_model (type[FeedArticle], optional): Model used to persist articles.
+            Defaults to FeedArticle.
 
     Returns:
         list[ArticleItem]: List of new ArticleItem objects ready for parsing/ingestion.
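The docstring's skip rules (already stored, empty link or content) boil down to a set lookup against persisted links. A sketch, assuming the article model has a `link` column:

```python
# Sketch of the skip logic described in the docstring: drop entries whose
# link is empty, whose content is empty, or which are already persisted.
# The `link` column on the model is an assumption.
import feedparser
from sqlalchemy.orm import Session


def new_entries(feed_url: str, session: Session, article_model) -> list[dict]:
    known_links = {row.link for row in session.query(article_model.link).all()}
    fresh = []
    for entry in feedparser.parse(feed_url).entries:
        link = entry.get("link", "")
        content = entry.get("summary", "")
        if not link or not content or link in known_links:
            continue  # unusable or already stored
        fresh.append({"link": link, "content": content})
    return fresh
```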
src/pipelines/tasks/ingest_rss.py CHANGED
@@ -6,7 +6,7 @@ from sqlalchemy.orm import Session
 from src.config import settings
 from src.infrastructure.supabase.init_session import init_session
 from src.models.article_models import ArticleItem, FeedItem
-from src.models.sql_models import SubstackArticle
+from src.models.sql_models import FeedArticle
 from src.utils.logger_util import setup_logging
 
 
@@ -20,7 +20,7 @@ from src.utils.logger_util import setup_logging
 def ingest_from_rss(
     fetched_articles: list[ArticleItem],
     feed: FeedItem,
-    article_model: type[SubstackArticle],
+    article_model: type[FeedArticle],
     engine: Engine,
 ) -> None:
     """Ingest articles fetched from RSS (already Markdownified).
@@ -93,7 +93,7 @@ def ingest_from_rss(
 def _persist_batch(
     session: Session,
     batch: list[ArticleItem],
-    article_model: type[SubstackArticle],
+    article_model: type[FeedArticle],
 ) -> None:
     """Helper to bulk insert a batch of ArticleItems."""
     rows = [
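For context, the `_persist_batch` idea in miniature: turn a batch of items into model rows and commit them in one unit of work (field names are illustrative):

```python
# Miniature of the _persist_batch pattern: one add_all + commit per batch.
from sqlalchemy.orm import Session


def persist_batch(session: Session, batch: list[dict], article_model) -> None:
    rows = [article_model(**item) for item in batch]  # field names must match
    session.add_all(rows)  # stage the whole batch
    session.commit()       # single commit per batch keeps round-trips low
```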
tests/integration/test_db_connection.py CHANGED
@@ -8,7 +8,7 @@ setup_logging()
 
 
 def test_connect_to_test_table(db_session: Connection) -> None:
-    """Test connectivity to the 'substack_test' table and fetch a single row.
+    """Test connectivity to the 'feed_test' table and fetch a single row.
 
     Args:
         db_session (Connection): SQLAlchemy Connection object.
@@ -18,12 +18,12 @@ def test_connect_to_test_table(db_session: Connection) -> None:
         Exception: If the table does not exist or query fails.
 
     """
-    logger.info("Testing connection to 'substack_test' table...")
+    logger.info("Testing connection to 'feed_test' table...")
 
     try:
-        result = db_session.execute(text("SELECT * FROM substack_test LIMIT 1")).fetchall()
+        result = db_session.execute(text("SELECT * FROM feed_test LIMIT 1")).fetchall()
         logger.info(f"Query result: {result}")
         assert isinstance(result, list), "Query result is not a list"
     except Exception as e:
-        logger.error(f"Failed to query 'substack_test' table: {e}")
+        logger.error(f"Failed to query 'feed_test' table: {e}")
         raise
tests/integration/test_rss_pipeline.py CHANGED
@@ -4,7 +4,7 @@ from loguru import logger
 from sqlalchemy import text
 from sqlalchemy.engine import Engine
 from sqlalchemy.orm import Session
-from test_models.test_sql_models import SubstackTestArticle  # Test-specific table model
+from test_models.test_sql_models import FeedTestArticle  # Test-specific table model
 
 from src.models.article_models import FeedItem
 from src.pipelines.tasks.fetch_rss import fetch_rss_entries
@@ -29,12 +29,12 @@ def test_rss_pipeline_end_to_end_mocked(db_session: Session, db_engine: Engine)
     """
 
     # Clear test table
-    logger.info("Clearing test table 'substack_test'")
-    db_session.execute(text("DELETE FROM substack_test"))
+    logger.info("Clearing test table 'feed_test'")
+    db_session.execute(text("DELETE FROM feed_test"))
     db_session.commit()
 
     # Verify table is empty
-    initial_count = db_session.query(SubstackTestArticle).count()
+    initial_count = db_session.query(FeedTestArticle).count()
     logger.info(f"Initial article count in test table: {initial_count}")
     assert initial_count == 0, "Test table was not cleared"
 
@@ -88,7 +88,7 @@ def test_rss_pipeline_end_to_end_mocked(db_session: Session, db_engine: Engine)
     fetched_articles = fetch_rss_entries(
         test_feed,
         engine=db_engine,
-        article_model=SubstackTestArticle,
+        article_model=FeedTestArticle,
     )
     logger.info(f"Fetched {len(fetched_articles)} articles for feed '{test_feed.name}'")
 
@@ -99,14 +99,14 @@ def test_rss_pipeline_end_to_end_mocked(db_session: Session, db_engine: Engine)
     ingest_from_rss(
         fetched_articles,
         feed=test_feed,
-        article_model=SubstackTestArticle,
+        article_model=FeedTestArticle,
         engine=db_engine,
     )
 
     # Verify DB insertion
     articles_in_db = (
-        db_session.query(SubstackTestArticle)
-        .order_by(SubstackTestArticle.published_at.desc())
+        db_session.query(FeedTestArticle)
+        .order_by(FeedTestArticle.published_at.desc())
         .all()
     )
     logger.info(f"Inserted article titles: {[a.title for a in articles_in_db]}")
@@ -122,7 +122,7 @@ def test_rss_pipeline_end_to_end_mocked(db_session: Session, db_engine: Engine)
 
 ################################################################################
 # The code below calls out to live URLs and is not suitable for CI,
-# as Substack may block requests from CI environments.
+# as Substack/Medium may block requests from CI environments.
 # It is left here for reference and can be run manually if desired.
 # Uncomment to enable live integration test
 
@@ -132,7 +132,7 @@ def test_rss_pipeline_end_to_end_mocked(db_session: Session, db_engine: Engine)
 # from sqlalchemy import text
 # from sqlalchemy.engine import Engine
 # from sqlalchemy.orm import Session
-# from test_models.test_sql_models import SubstackTestArticle  # Test-specific table model
+# from test_models.test_sql_models import FeedTestArticle  # Test-specific table model
 
 # from src.models.article_models import FeedItem
 # from src.pipelines.tasks.batch_parse_ingest_articles import parse_and_ingest
@@ -153,12 +153,12 @@ def test_rss_pipeline_end_to_end_mocked(db_session: Session, db_engine: Engine)
 
 #     """
 #     # Clear test table
-#     logger.info("Clearing test table 'substack_test'")
-#     db_session.execute(text("DELETE FROM substack_test"))
+#     logger.info("Clearing test table 'feed_test'")
+#     db_session.execute(text("DELETE FROM feed_test"))
 #     db_session.commit()
 
 #     # Verify table is empty
-#     initial_count = db_session.query(SubstackTestArticle).count()
+#     initial_count = db_session.query(FeedTestArticle).count()
 #     logger.info(f"Initial article count in test table: {initial_count}")
 #     assert initial_count == 0, "Test table was not cleared"
 
@@ -173,7 +173,7 @@ def test_rss_pipeline_end_to_end_mocked(db_session: Session, db_engine: Engine)
 #     fetched_articles = fetch_rss_entries(
 #         test_feed,
 #         engine=db_engine,
-#         article_model=SubstackTestArticle,
+#         article_model=FeedTestArticle,
 #     )
 #     logger.info(f"Fetched {len(fetched_articles)} articles for feed '{test_feed.name}'")
 
@@ -185,14 +185,14 @@ def test_rss_pipeline_end_to_end_mocked(db_session: Session, db_engine: Engine)
 #     parse_and_ingest(
 #         fetched_articles,
 #         feed=test_feed,
-#         article_model=SubstackTestArticle,
+#         article_model=FeedTestArticle,
 #         engine=db_engine,
 #     )
 
 #     # Verify DB insertion
 #     articles_in_db = (
-#         db_session.query(SubstackTestArticle)
-#         .order_by(SubstackTestArticle.published_at.desc())
+#         db_session.query(FeedTestArticle)
+#         .order_by(FeedTestArticle.published_at.desc())
 #         .all()
 #     )
 #     logger.info(f"Inserted article titles: {[a.title for a in articles_in_db]}")
tests/test_models/test_sql_models.py CHANGED
@@ -10,8 +10,8 @@ class Base(DeclarativeBase):
     pass
 
 
-class SubstackTestArticle(Base):
-    __tablename__ = "substack_test"
+class FeedTestArticle(Base):
+    __tablename__ = "feed_test"
 
     # Primary internal ID
     id: Mapped[int] = mapped_column(BigInteger, primary_key=True, index=True)
tests/unit/test_fetch_rss_entries.py CHANGED
@@ -3,7 +3,7 @@ import responses
 from loguru import logger
 from sqlalchemy import text
 from sqlalchemy.orm import Session
-from test_models.test_sql_models import SubstackTestArticle
+from test_models.test_sql_models import FeedTestArticle
 
 from src.infrastructure.supabase.init_session import init_engine
 from src.models.article_models import ArticleItem, FeedItem
@@ -49,8 +49,8 @@ def test_fetch_rss_mocked_feed() -> None:
     try:
         # Clear the test table before running
         session = Session(bind=engine)
-        logger.info("Clearing test table 'substack_test' before test")
-        session.execute(text("DELETE FROM substack_test"))
+        logger.info("Clearing test table 'feed_test' before test")
+        session.execute(text("DELETE FROM feed_test"))
         session.commit()
         logger.info("Test table cleared")
 
@@ -58,7 +58,7 @@ def test_fetch_rss_mocked_feed() -> None:
         articles = fetch_rss_entries(
             feed=test_feed,
             engine=engine,
-            article_model=SubstackTestArticle,
+            article_model=FeedTestArticle,
         )
         logger.info(f"Fetched {len(articles)} articles from {test_feed.url}")
 
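The test mocks the feed over HTTP with the `responses` library (imported at the top), so no live Substack/Medium request is made. A self-contained sketch of the approach; the feed URL and RSS body here are illustrative:

```python
# Sketch of the mocking approach: `responses` intercepts requests-based HTTP,
# so the canned RSS body below is served instead of a live feed.
import feedparser
import requests
import responses

RSS_XML = """<?xml version="1.0"?>
<rss version="2.0"><channel><title>Test Feed</title>
<item><title>Hello</title><link>https://example.com/a</link></item>
</channel></rss>"""


@responses.activate
def test_mocked_feed() -> None:
    responses.add(
        responses.GET,
        "https://example.com/feed",  # hypothetical feed URL
        body=RSS_XML,
        content_type="application/rss+xml",
    )
    resp = requests.get("https://example.com/feed", timeout=10)
    parsed = feedparser.parse(resp.text)  # feedparser accepts a raw string
    assert parsed.entries[0]["link"] == "https://example.com/a"
```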
uv.lock CHANGED
@@ -3746,7 +3746,7 @@ wheels = [
 ]
 
 [[package]]
-name = "substack-newsletters-search-course"
+name = "AISearchEngine"
 version = "1.0.0"
 source = { editable = "." }
 dependencies = [