AlainDeLong commited on
Commit
54c2a57
·
1 Parent(s): a5411db

chore: prepare project for first deployment

Browse files
.gitignore ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ .idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+
209
+ # Kafka Commands
210
+ command.txt
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # use small official image
2
+ FROM python:3.11-slim
3
+
4
+ # set workdir
5
+ WORKDIR /app
6
+
7
+ # copy only requirements first (for cache)
8
+ COPY requirements.txt .
9
+
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # copy code
13
+ COPY . .
14
+
15
+ # uvicorn listen on 0.0.0.0:7860 (Spaces default port)
16
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--proxy-headers"]
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Backend Sentiment System
3
  emoji: 👁
4
  colorFrom: pink
5
  colorTo: yellow
 
1
  ---
2
+ title: Develop Social Sentiment Analysis System
3
  emoji: 👁
4
  colorFrom: pink
5
  colorTo: yellow
app/__init__.py ADDED
File without changes
app/api/__init__.py ADDED
File without changes
app/api/endpoints/__init__.py ADDED
File without changes
app/api/endpoints/analysis.py ADDED
@@ -0,0 +1,548 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, List, Dict
2
+ import uuid
3
+ from datetime import datetime
4
+ from fastapi import APIRouter, HTTPException, status, Request
5
+
6
+ import asyncio
7
+ from motor.motor_asyncio import AsyncIOMotorClient
8
+ from bson import ObjectId
9
+
10
+ import pandas as pd
11
+ from trendspy import Trends
12
+
13
+ from app.core.config import settings
14
+ from app.core.clients import qstash_client, qstash_receiver
15
+ from app.schemas.analysis_schema import (
16
+ WeeklyTrendResponseSchema,
17
+ WeeklyTrendListResponse,
18
+ TrendDetailResponseSchema,
19
+ OnDemandRequestSchema,
20
+ OnDemandResponseSchema,
21
+ JobStatusResponseSchema,
22
+ )
23
+ from app.services.sentiment_service import SentimentService
24
+ from app.services.youtube_service import YouTubeService
25
+
26
+ # Create a router to organize endpoints
27
+ # router = APIRouter(prefix="/trends", tags=["trends"])
28
+ router = APIRouter(prefix=settings.API_PREFIX_TRENDS)
29
+
30
+ # --- MongoDB Connection ---
31
+ client = AsyncIOMotorClient(settings.MONGODB_CONNECTION_STRING)
32
+ db = client[settings.DB_NAME]
33
+
34
+ # Initialize services once when the application starts.
35
+ # This avoids reloading the heavy AI model on every request.
36
+ print("Initializing services...")
37
+ tr = Trends(request_delay=2.0)
38
+ yt_service = YouTubeService(api_key=settings.YT_API_KEY)
39
+ sentiment_service = SentimentService()
40
+
41
+
42
+ async def fetch_repr_comments(entity_id):
43
+ # Find all source videos linked to this entity
44
+ source_docs = await db.sources_youtube.find({"entity_id": entity_id}).to_list(
45
+ length=None
46
+ )
47
+ source_ids = [doc["_id"] for doc in source_docs]
48
+
49
+ if not source_ids:
50
+ return {"positive": [], "neutral": [], "negative": []}
51
+
52
+ # Fetch 2 comments for each sentiment
53
+ sentiments = ["positive", "neutral", "negative"]
54
+ comment_tasks = []
55
+ limit = settings.REPRESENTATIVE_COMMENTS_LIMIT
56
+ for sentiment in sentiments:
57
+ task = (
58
+ db.comments_youtube.find(
59
+ {"source_id": {"$in": source_ids}, "sentiment": sentiment},
60
+ {"text": 1, "author": 1, "publish_date": 1, "_id": 0},
61
+ )
62
+ .sort("publish_date", -1)
63
+ .limit(limit)
64
+ .to_list(length=limit)
65
+ )
66
+
67
+ comment_tasks.append(task)
68
+
69
+ results = await asyncio.gather(*comment_tasks)
70
+ # Convert datetime objects to string format for JSON response
71
+ for sentiment_list in results:
72
+ for comment in sentiment_list:
73
+ if "publish_date" in comment and hasattr(
74
+ comment["publish_date"], "isoformat"
75
+ ):
76
+ comment["publish_date"] = comment["publish_date"].isoformat()
77
+
78
+ return dict(zip(sentiments, results))
79
+
80
+
81
+ async def _get_full_entity_details(
82
+ entity_id: ObjectId, analysis_type: str
83
+ ) -> Dict[str, Any] | None:
84
+ """
85
+ Fetches all detailed data for an entity. It runs the database query,
86
+ interest data fetching, and comment fetching as concurrent, independent tasks.
87
+ """
88
+
89
+ async def fetch_main_data_task():
90
+ """Fetches the main analysis data from the database."""
91
+ pipeline = [
92
+ {"$match": {"entity_id": entity_id, "analysis_type": analysis_type}},
93
+ {"$sort": {"created_at": -1}},
94
+ {"$limit": 1},
95
+ {
96
+ "$lookup": {
97
+ "from": "entities",
98
+ "localField": "entity_id",
99
+ "foreignField": "_id",
100
+ "as": "entity_info",
101
+ }
102
+ },
103
+ {"$unwind": "$entity_info"},
104
+ {
105
+ "$project": {
106
+ "analysis_result_id": "$_id",
107
+ "_id": {"$toString": "$entity_info._id"},
108
+ "keyword": "$entity_info.keyword",
109
+ "thumbnail_url": "$entity_info.thumbnail_url",
110
+ "representative_video_url": "$entity_info.video_url",
111
+ "analysis": "$results",
112
+ "interest_over_time": "$interest_over_time",
113
+ }
114
+ },
115
+ ]
116
+ results = await db.analysis_results.aggregate(pipeline).to_list(length=1)
117
+ return results[0] if results else None
118
+
119
+ # Run the main DB query and comment fetching concurrently
120
+ main_data_task = fetch_main_data_task()
121
+ comments_task = fetch_repr_comments(entity_id)
122
+
123
+ main_data, rep_comments = await asyncio.gather(main_data_task, comments_task)
124
+
125
+ if not main_data:
126
+ # If the main entity/analysis is not found, we can't proceed.
127
+ return None
128
+
129
+ # Now, handle the interest data fetching based on the result of the main query
130
+ if not main_data.get("interest_over_time"):
131
+ print(
132
+ f"Interest data not found in DB for '{main_data['keyword']}'. Fetching live..."
133
+ )
134
+
135
+ def blocking_interest_fetch(keyword: str):
136
+ """Synchronous wrapper for the blocking trendspy call."""
137
+ df = tr.interest_over_time(keywords=[keyword], timeframe="now 7-d")
138
+ if df.empty:
139
+ return []
140
+ daily_df = df[[keyword]].resample("D").mean().round(0).astype(int)
141
+ return [
142
+ {"date": index.strftime("%Y-%m-%d"), "value": int(row.iloc[0])}
143
+ for index, row in daily_df.iterrows()
144
+ ]
145
+
146
+ try:
147
+ # Run the blocking call in a separate thread to not block the server
148
+ interest_data_to_cache = await asyncio.to_thread(
149
+ blocking_interest_fetch, main_data["keyword"]
150
+ )
151
+
152
+ if interest_data_to_cache:
153
+ main_data["interest_over_time"] = interest_data_to_cache
154
+ await db.analysis_results.update_one(
155
+ {"_id": main_data["analysis_result_id"]},
156
+ {"$set": {"interest_over_time": interest_data_to_cache}},
157
+ )
158
+ print(
159
+ f"Successfully cached interest data for '{main_data['keyword']}'."
160
+ )
161
+ except Exception as e:
162
+ print(f"Could not fetch live interest data: {e}")
163
+ main_data["interest_over_time"] = []
164
+
165
+ # Combine all results
166
+ main_data.pop("analysis_result_id", None)
167
+ return {**main_data, "representative_comments": rep_comments}
168
+
169
+
170
+ @router.get("/weekly", response_model=WeeklyTrendListResponse)
171
+ async def get_weekly_trends():
172
+ """
173
+ Retrieves the latest weekly sentiment analysis results.
174
+
175
+ This endpoint fetches data from the 'analysis_results' collection and
176
+ joins it with the 'entities' collection to get keyword and thumbnail details.
177
+ """
178
+ try:
179
+ # MongoDB Aggregation Pipeline to join collections
180
+ pipeline = [
181
+ # 1. Filter for weekly analysis and sort by date to get the latest run
182
+ {"$match": {"analysis_type": "weekly"}},
183
+ {"$sort": {"created_at": -1}},
184
+ {"$limit": settings.HOME_PAGE_ENTITIES_LIMIT},
185
+ # 2. Join with the 'entities' collection
186
+ {
187
+ "$lookup": {
188
+ "from": "entities",
189
+ "localField": "entity_id",
190
+ "foreignField": "_id",
191
+ "as": "entity_info",
192
+ }
193
+ },
194
+ # 3. Deconstruct the entity_info array
195
+ {"$unwind": "$entity_info"},
196
+ # 4. Project the final structure for the API response
197
+ {
198
+ "$project": {
199
+ "_id": {"$toString": "$entity_info._id"},
200
+ "keyword": "$entity_info.keyword",
201
+ "thumbnail_url": "$entity_info.thumbnail_url",
202
+ "analysis": {
203
+ "positive_count": "$results.positive_count",
204
+ "negative_count": "$results.negative_count",
205
+ "neutral_count": "$results.neutral_count",
206
+ "total_comments": "$results.total_comments",
207
+ },
208
+ }
209
+ },
210
+ ]
211
+
212
+ results = await db.analysis_results.aggregate(pipeline).to_list(length=None)
213
+ if not results:
214
+ raise HTTPException(status_code=500, detail="Internal server error")
215
+
216
+ response_data = {"data": results}
217
+ return response_data
218
+
219
+ except Exception as e:
220
+ # Log the error for debugging
221
+ print(f"An error occurred: {e}")
222
+ raise HTTPException(status_code=500, detail="Internal server error")
223
+
224
+
225
+ @router.get("/{analysis_type}/{entity_id}", response_model=TrendDetailResponseSchema)
226
+ async def get_trend_detail_by_type(analysis_type: str, entity_id: str):
227
+ """
228
+ Retrieves detailed information for a single entity, specifying
229
+ whether to fetch the 'weekly' or 'on-demand' analysis result.
230
+ """
231
+ if analysis_type not in ["weekly", "on-demand"]:
232
+ raise HTTPException(
233
+ status_code=400,
234
+ detail="Invalid analysis type. Must be 'weekly' or 'on-demand'.",
235
+ )
236
+
237
+ try:
238
+ entity_obj_id = ObjectId(entity_id)
239
+ except Exception:
240
+ raise HTTPException(status_code=400, detail="Invalid entity ID format.")
241
+
242
+ # Call the helper function with the specified type
243
+ full_details = await _get_full_entity_details(entity_obj_id, analysis_type)
244
+
245
+ if not full_details:
246
+ raise HTTPException(
247
+ status_code=404,
248
+ detail=f"'{analysis_type}' analysis for this entity not found.",
249
+ )
250
+
251
+ return full_details
252
+
253
+
254
+ @router.post(
255
+ "/analysis/on-demand",
256
+ status_code=status.HTTP_202_ACCEPTED,
257
+ response_model=OnDemandResponseSchema,
258
+ )
259
+ async def create_on_demand_analysis(request_data: OnDemandRequestSchema):
260
+ """
261
+ Handles an on-demand analysis request.
262
+ First, it checks if a recent 'weekly' analysis for the keyword exists.
263
+ If yes, it returns a 'found' status with the entity_id for immediate redirection.
264
+ If not, it queues a new analysis job via QStash and returns a 'queued' status.
265
+ """
266
+ if not request_data.keyword or not request_data.keyword.strip():
267
+ raise HTTPException(status_code=400, detail="Keyword cannot be empty.")
268
+
269
+ # Convert incoming keyword to lowercase for consistent matching
270
+ keyword = request_data.keyword.lower().strip()
271
+
272
+ # Check for existing weekly analysis
273
+ entity = await db.entities.find_one({"keyword": keyword})
274
+ if entity:
275
+ analysis = await db.analysis_results.find_one(
276
+ {"entity_id": entity["_id"], "analysis_type": "weekly"}
277
+ )
278
+ if analysis:
279
+ print(
280
+ f"Found existing weekly analysis for '{keyword}'. Returning redirect info."
281
+ )
282
+ # Return a different response if data already exists
283
+ return {"status": "found", "entity_id": str(entity["_id"])}
284
+
285
+ # If no existing analysis, proceed with queuing a new job
286
+ print(f"No weekly analysis found for '{keyword}'. Queuing a new job.")
287
+
288
+ job_id = str(uuid.uuid4())
289
+
290
+ job_document = {
291
+ "_id": job_id,
292
+ "keyword": keyword,
293
+ "status": "pending",
294
+ "created_at": datetime.now(),
295
+ "updated_at": datetime.now(),
296
+ "result_id": None,
297
+ }
298
+ await db.on_demand_jobs.insert_one(job_document)
299
+
300
+ # callback_url = f"{settings.BASE_URL}/api/v1/trends/analysis/process-job"
301
+ callback_url = f"{settings.BASE_URL}{settings.API_PREFIX}{settings.API_VERSION}{settings.API_PREFIX_TRENDS}/analysis/process-job"
302
+
303
+ print(
304
+ f"Queuing job {job_id} for keyword '{keyword}' with callback to {callback_url}"
305
+ )
306
+
307
+ try:
308
+ qstash_client.message.publish_json(
309
+ url=callback_url, body={"keyword": keyword, "job_id": job_id}, retries=3
310
+ )
311
+ except Exception as e:
312
+ # If publishing fails, update the job status to 'failed'
313
+ await db.on_demand_jobs.update_one(
314
+ {"_id": job_id}, {"$set": {"status": "failed"}}
315
+ )
316
+ print(f"Error publishing to QStash: {e}")
317
+ raise HTTPException(status_code=500, detail="Failed to queue analysis job.")
318
+
319
+ return {"status": "queued", "job_id": job_id}
320
+
321
+
322
+ @router.get("/analysis/status/{job_id}", response_model=JobStatusResponseSchema)
323
+ async def get_analysis_status(job_id: str):
324
+ """
325
+ Checks the status of an on-demand analysis job from the 'on_demand_jobs' collection.
326
+ """
327
+ job = await db.on_demand_jobs.find_one({"_id": job_id})
328
+
329
+ if not job:
330
+ raise HTTPException(status_code=404, detail="Job not found.")
331
+
332
+ response_data = {
333
+ "_id": job["_id"],
334
+ "status": job["status"],
335
+ "keyword": job["keyword"],
336
+ "result": None,
337
+ }
338
+
339
+ # If job is completed, fetch the full result data
340
+ if job["status"] == "completed" and job.get("result_id"):
341
+ analysis_doc = await db.analysis_results.find_one({"_id": job["result_id"]})
342
+
343
+ # Check if the analysis document exists and contains an entity_id
344
+ if analysis_doc and analysis_doc.get("entity_id"):
345
+ # Get the correct entity_id from the analysis document
346
+ entity_id = analysis_doc["entity_id"]
347
+
348
+ # Call the helper with the correct entity_id and type
349
+ full_details = await _get_full_entity_details(entity_id, "on-demand")
350
+ response_data["result"] = full_details
351
+
352
+ return response_data
353
+
354
+
355
+ @router.post("/analysis/process-job", include_in_schema=False)
356
+ async def process_on_demand_job(request: Request):
357
+ """
358
+ A webhook endpoint called by QStash to perform the full analysis for a
359
+ single keyword. It fetches data, runs sentiment analysis, and saves all
360
+ results to the database.
361
+ """
362
+ # 1. Initialization
363
+ job_data = await request.json()
364
+ keyword = job_data.get("keyword")
365
+ job_id = job_data.get("job_id")
366
+
367
+ if not keyword:
368
+ # Acknowledge the request but do nothing if keyword is missing
369
+ return {"message": "Keyword is missing, job ignored."}
370
+
371
+ print(f"Processing job {job_id} for keyword: {keyword}")
372
+ # Update job status to 'processing'
373
+ await db.on_demand_jobs.update_one(
374
+ {"_id": job_id},
375
+ {"$set": {"status": "processing", "updated_at": datetime.now()}},
376
+ )
377
+
378
+ # 2. Fetch data (similar to a mini-producer)
379
+ # Note: For on-demand, I might use a smaller fetching strategy
380
+ videos = yt_service.search_videos(query_string=keyword)
381
+ if not videos:
382
+ print(f"No videos found for on-demand keyword: {keyword}")
383
+ return {"message": "No videos found."}
384
+
385
+ comments_for_entity: List[Dict[str, Any]] = []
386
+ for video in videos:
387
+ video_id = video.get("id", {}).get("videoId")
388
+ snippet = video.get("snippet", {})
389
+ if not video_id or not snippet:
390
+ continue
391
+
392
+ comments = yt_service.fetch_comments(
393
+ video_id=video_id, limit=settings.ON_DEMAND_COMMENTS_PER_VIDEO
394
+ ) # Smaller limit for on-demand
395
+
396
+ for comment in comments:
397
+ comment["video_id"] = video_id
398
+ comment["video_title"] = snippet.get("title")
399
+ comment["video_publish_date"] = snippet.get("publishedAt")
400
+ comment["video_url"] = f"https://www.youtube.com/watch?v={video_id}"
401
+ comments_for_entity.extend(comments)
402
+
403
+ if (
404
+ len(comments_for_entity) >= settings.ON_DEMAND_TOTAL_COMMENTS
405
+ ): # Smaller total limit for on-demand
406
+ break
407
+
408
+ final_comments = comments_for_entity[: settings.ON_DEMAND_TOTAL_COMMENTS]
409
+ if not final_comments:
410
+ print(f"No comments found for on-demand keyword: {keyword}")
411
+ return {"message": "No comments found."}
412
+
413
+ # 3. Perform Sentiment Analysis IN BATCHES
414
+ print(f"Analyzing {len(final_comments)} comments in batches...")
415
+ batch_size = settings.CONSUMER_BATCH_SIZE
416
+ all_predictions = []
417
+
418
+ # Loop through comments in chunks of batch_size
419
+ for i in range(0, len(final_comments), batch_size):
420
+ batch_comments = final_comments[i : i + batch_size]
421
+ texts_to_predict = [comment.get("text", "") for comment in batch_comments]
422
+
423
+ # Process one small batch at a time
424
+ batch_predictions = sentiment_service.predict_batch(texts_to_predict)
425
+ all_predictions.extend(batch_predictions)
426
+ print(f" - Processed batch {i // batch_size + 1}...")
427
+
428
+ # 4. Save results to Database (similar to a mini-consumer)
429
+ video_id_cache: Dict[str, ObjectId] = {}
430
+ comments_to_insert: List[Dict[str, Any]] = []
431
+
432
+ # 4a. Upsert Entity first to get a stable entity_id
433
+ video_id = None
434
+ for video in videos:
435
+ video_id = video.get("id", {}).get("videoId", "")
436
+ if video_id:
437
+ break
438
+
439
+ entity_thumbnail_url = (
440
+ videos[0].get("snippet", {}).get("thumbnails", {}).get("high", {}).get("url")
441
+ )
442
+ entity_video_url = f"https://www.youtube.com/watch?v={video_id}"
443
+
444
+ entity_doc = await db.entities.find_one_and_update(
445
+ {"keyword": keyword},
446
+ {
447
+ "$set": {
448
+ "thumbnail_url": entity_thumbnail_url,
449
+ "video_url": entity_video_url,
450
+ },
451
+ "$setOnInsert": {
452
+ "keyword": keyword,
453
+ "geo": settings.FETCH_TRENDS_GEO,
454
+ "volume": 0, # Placeholder values
455
+ # "thumbnail_url": entity_thumbnail_url,
456
+ # "video_url": entity_video_url,
457
+ "start_date": datetime.now(),
458
+ },
459
+ },
460
+ upsert=True,
461
+ return_document=True,
462
+ )
463
+ entity_id = entity_doc["_id"]
464
+
465
+ # 4b. Process and save each comment
466
+ for comment_data, prediction in zip(final_comments, all_predictions):
467
+ sentiment_label = prediction["label"].lower()
468
+
469
+ # Upsert Source Video
470
+ video_id = comment_data.get("video_id")
471
+ source_id: ObjectId | None = video_id_cache.get(video_id)
472
+ if not source_id:
473
+ source_doc = await db.sources_youtube.find_one_and_update(
474
+ {"video_id": video_id},
475
+ {
476
+ "$set": {"entity_id": entity_id},
477
+ "$setOnInsert": {
478
+ # "entity_id": entity_id,
479
+ "video_id": video_id,
480
+ "url": comment_data.get("video_url"),
481
+ "title": comment_data.get("video_title"),
482
+ "publish_date": datetime.strptime(
483
+ comment_data.get("video_publish_date"), "%Y-%m-%dT%H:%M:%SZ"
484
+ ),
485
+ },
486
+ },
487
+ upsert=True,
488
+ return_document=True,
489
+ )
490
+ source_id = source_doc["_id"]
491
+ video_id_cache[video_id] = source_id
492
+
493
+ # Prepare comment for bulk insertion
494
+ comments_to_insert.append(
495
+ {
496
+ "source_id": source_id,
497
+ "comment_id": comment_data.get("comment_id"),
498
+ "text": comment_data.get("text"),
499
+ "author": comment_data.get("author"),
500
+ "publish_date": datetime.strptime(
501
+ comment_data.get("publish_date"), "%Y-%m-%dT%H:%M:%SZ"
502
+ ),
503
+ "sentiment": sentiment_label,
504
+ }
505
+ )
506
+
507
+ # Update aggregated results in real-time
508
+ await db.analysis_results.update_one(
509
+ {"entity_id": entity_id},
510
+ {
511
+ "$inc": {
512
+ f"results.{sentiment_label}_count": 1,
513
+ "results.total_comments": 1,
514
+ },
515
+ "$setOnInsert": {
516
+ "entity_id": entity_id,
517
+ "analysis_type": "on-demand", # Note the type
518
+ "created_at": datetime.now(),
519
+ "status": "completed",
520
+ "interest_over_time": [],
521
+ },
522
+ },
523
+ upsert=True,
524
+ )
525
+
526
+ # 4c. Bulk insert all comments after the loop
527
+ if comments_to_insert:
528
+ await db.comments_youtube.insert_many(comments_to_insert)
529
+
530
+ # 4d. Final update to job status
531
+ analysis_result_doc = await db.analysis_results.find_one(
532
+ {"entity_id": entity_id, "analysis_type": "on-demand"}
533
+ )
534
+ result_id = analysis_result_doc["_id"] if analysis_result_doc else None
535
+
536
+ await db.on_demand_jobs.update_one(
537
+ {"_id": job_id},
538
+ {
539
+ "$set": {
540
+ "status": "completed",
541
+ "result_id": result_id,
542
+ "updated_at": datetime.now(),
543
+ }
544
+ },
545
+ )
546
+
547
+ print(f"Successfully processed and saved analysis for job {job_id}")
548
+ return {"message": f"Job {job_id} for '{keyword}' processed successfully."}
app/cli.py ADDED
File without changes
app/core/__init__.py ADDED
File without changes
app/core/clients.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from qstash import QStash, Receiver
2
+ from app.core.config import settings
3
+
4
+ # Initialize the QStash client for publishing messages
5
+ # qstash_client = QStash(token=settings.QSTASH_TOKEN, base_url=settings.QSTASH_URL)
6
+ qstash_client = QStash(token=settings.QSTASH_TOKEN)
7
+
8
+ # Initialize the QStash receiver for verifying incoming messages
9
+ qstash_receiver = Receiver(
10
+ current_signing_key=settings.QSTASH_CURRENT_SIGNING_KEY,
11
+ next_signing_key=settings.QSTASH_NEXT_SIGNING_KEY,
12
+ )
app/core/config.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings, SettingsConfigDict
2
+ from urllib.parse import quote_plus
3
+
4
+
5
+ class Settings(BaseSettings):
6
+ """
7
+ Manages application settings and loads environment variables from a .env file.
8
+ It constructs the complete MongoDB connection string from individual components.
9
+ """
10
+
11
+ # Database name
12
+ DB_NAME: str
13
+
14
+ # AI Model name
15
+ SENTIMENT_MODEL: str
16
+
17
+ # YouTube API Key
18
+ YT_API_KEY: str
19
+
20
+ # MongoDB connection details
21
+ MONGODB_URI: str
22
+ MONGODB_USR: str
23
+ MONGODB_PWD: str
24
+
25
+ # Kafka Name Topic
26
+ KAFKA_TOPIC: str
27
+
28
+ # Data Fetching Strategy Settings
29
+ FETCH_NUM_ENTITIES: int = 20
30
+ FETCH_VIDEOS_PER_ENTITY: int = 5
31
+ FETCH_COMMENTS_PER_VIDEO: int = 100
32
+ FETCH_TOTAL_COMMENTS_PER_ENTITY: int = 500
33
+ FETCH_TRENDS_GEO: str = "US"
34
+ FETCH_TRENDS_WITHIN_DAYS: int = 7
35
+
36
+ # Consumer Performance Settings
37
+ CONSUMER_BATCH_SIZE: int = 32
38
+ CONSUMER_BATCH_TIMEOUT_SECONDS: int = 5
39
+
40
+ # FastAPI
41
+ API_PREFIX: str = "/api"
42
+ API_VERSION: str = "/v1"
43
+ API_PREFIX_TRENDS: str
44
+ DEBUG: bool = False
45
+
46
+ # QStash Settings
47
+ QSTASH_URL: str = "https://qstash.upstash.io"
48
+ QSTASH_TOKEN: str
49
+ QSTASH_CURRENT_SIGNING_KEY: str
50
+ QSTASH_NEXT_SIGNING_KEY: str
51
+
52
+ # Base URL for the application, used for constructing callback URLs
53
+ BASE_URL: str = "http://localhost:8000"
54
+
55
+ # Display Settings
56
+ HOME_PAGE_ENTITIES_LIMIT: int = 10
57
+ REPRESENTATIVE_COMMENTS_LIMIT: int = 3
58
+
59
+ # On-Demand Use Case
60
+ ON_DEMAND_COMMENTS_PER_VIDEO: int = 100
61
+ ON_DEMAND_TOTAL_COMMENTS: int = 500
62
+
63
+ # Pydantic model configuration to load from .env file
64
+ model_config = SettingsConfigDict(
65
+ env_file=".env", env_file_encoding="utf-8", extra="ignore"
66
+ )
67
+
68
+ @property
69
+ def MONGODB_CONNECTION_STRING(self) -> str:
70
+ """
71
+ Constructs and returns the full MongoDB connection string,
72
+ properly escaping the username and password.
73
+ """
74
+ # Escape username and password to handle special characters
75
+ escaped_usr = quote_plus(self.MONGODB_USR)
76
+ escaped_pwd = quote_plus(self.MONGODB_PWD)
77
+
78
+ # Replace placeholders in the URI
79
+ return self.MONGODB_URI.replace("<username>", escaped_usr).replace(
80
+ "<password>", escaped_pwd
81
+ )
82
+
83
+
84
+ # Create a single instance of the Settings class
85
+ settings = Settings()
app/core/db.py ADDED
File without changes
app/main.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+
4
+ from app.core.config import settings
5
+ from app.api.endpoints import analysis
6
+
7
+ app = FastAPI(
8
+ title="Sentiment Analysis API",
9
+ description="API for analyzing sentiment of trending topics.",
10
+ version="1.0.0",
11
+ docs_url="/docs",
12
+ redoc_url="/redoc"
13
+ )
14
+
15
+ app.add_middleware(
16
+ CORSMiddleware,
17
+ allow_origins=["*"],
18
+ allow_credentials=True,
19
+ allow_methods=["*"],
20
+ allow_headers=["*"]
21
+ )
22
+
23
+ app.include_router(analysis.router, prefix=f"{settings.API_PREFIX}{settings.API_VERSION}", tags=["Analysis"])
24
+
25
+
26
+ @app.get("/", tags=["Root"])
27
+ def read_root() -> dict:
28
+ """
29
+ Root endpoint to check if the API is running.
30
+ """
31
+ return {"message": "Welcome to the Sentiment Analysis API"}
32
+
33
+
34
+ if __name__ == '__main__':
35
+ import uvicorn
36
+
37
+ uvicorn.run("app.main:app", host='0.0.0.0', port=8000, reload=True)
app/schemas/__init__.py ADDED
File without changes
app/schemas/analysis_schema.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Literal, Union
2
+ from pydantic import BaseModel, Field
3
+
4
+
5
# --- Schemas for Weekly Trends ---
class AnalysisResultSchema(BaseModel):
    """
    Aggregated sentiment counts for a single entity.

    All counters default to 0 so a freshly discovered entity can be
    represented before any comments have been analyzed.
    """

    # Overall number of analyzed comments for the entity.
    total_comments: int = 0
    positive_count: int = 0
    neutral_count: int = 0
    negative_count: int = 0
15
+
16
+
17
class WeeklyTrendResponseSchema(BaseModel):
    """
    A single trending entity as returned by the weekly-trends endpoint.
    """

    # Populated from MongoDB's "_id" key via the alias. NOTE(review): without
    # populate_by_name enabled, input data must use the "_id" key, not
    # "entity_id" — confirm this matches how the endpoint builds responses.
    entity_id: str = Field(..., alias="_id")
    keyword: str
    thumbnail_url: str | None = None
    # Aggregated sentiment counters for this entity.
    analysis: AnalysisResultSchema
26
+
27
+
28
class WeeklyTrendListResponse(BaseModel):
    """
    Top-level envelope for the weekly trends endpoint response.
    """

    # List of trending entities; ordering is decided by the API layer —
    # TODO confirm (presumably by volume or comment count).
    data: List[WeeklyTrendResponseSchema]
34
+
35
+
36
# --- Schemas for Trend Detail ---
class InterestOverTimePoint(BaseModel):
    """
    One data point of the interest-over-time chart.
    """

    # Calendar day, formatted "YYYY-MM-DD" by the producer job.
    date: str
    # Interest value for that day (Google-Trends-style relative score).
    value: int
44
+
45
+
46
class CommentSchema(BaseModel):
    """
    A single representative comment shown in the trend detail view.
    """

    publish_date: str
    text: str
    author: str
54
+
55
+
56
class RepresentativeCommentsSchema(BaseModel):
    """
    Representative comments bucketed by predicted sentiment.
    """

    positive: List[CommentSchema]
    neutral: List[CommentSchema]
    negative: List[CommentSchema]
64
+
65
+
66
class TrendDetailResponseSchema(WeeklyTrendResponseSchema):
    """
    Full detail response for a single entity.

    Inherits the basic fields (entity_id, keyword, thumbnail_url, analysis)
    from WeeklyTrendResponseSchema and adds the chart data and comments.
    """

    representative_video_url: str | None = None
    interest_over_time: List[InterestOverTimePoint]
    representative_comments: RepresentativeCommentsSchema
75
+
76
+
77
# --- Schemas for On-Demand Analysis ---
class OnDemandRequestSchema(BaseModel):
    """
    Request body for a user-initiated on-demand analysis.
    """

    # Free-text keyword to analyze.
    keyword: str
84
+
85
+
86
# Define two distinct response schemas
class QueuedResponseSchema(BaseModel):
    """
    Response returned when a new analysis job was successfully queued.
    """

    # Literal discriminator distinguishing this variant in the Union below.
    status: Literal["queued"]
    job_id: str
94
+
95
+
96
class FoundResponseSchema(BaseModel):
    """
    Response returned when an existing analysis already covers the keyword.
    """

    # Literal discriminator distinguishing this variant in the Union below.
    status: Literal["found"]
    entity_id: str
103
+
104
+
105
# The final response model is a Union of the two possibilities; the distinct
# Literal "status" values keep the two members unambiguous during validation.
OnDemandResponseSchema = Union[QueuedResponseSchema, FoundResponseSchema]
107
+
108
+
109
class JobStatusResponseSchema(BaseModel):
    """
    Status of an on-demand analysis job, polled by the client.
    """

    # Mongo document id, exposed as job_id (populated via the "_id" alias).
    job_id: str = Field(..., alias="_id")
    status: str
    keyword: str
    # Full analysis payload; None until the job has completed.
    result: TrendDetailResponseSchema | None = None
app/scripts/__init__.py ADDED
File without changes
app/scripts/consumer_job.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, List, Dict
2
+ import json
3
+ import time
4
+ from datetime import datetime
5
+ from confluent_kafka import Consumer, KafkaError, Message
6
+
7
+ from pymongo import MongoClient
8
+ from pymongo.database import Database
9
+ from bson import ObjectId
10
+
11
+ from app.core.config import settings
12
+ from app.services.sentiment_service import SentimentService
13
+
14
+
15
def process_message_batch(
    batch: List[Message],
    sentiment_service: SentimentService,
    db: Database,
) -> None:
    """
    Process one batch of Kafka messages end-to-end.

    For each message: run sentiment analysis on the comment text, upsert the
    trending entity and its source video, bulk-insert the raw comments, and
    increment the aggregated per-sentiment counters.

    NOTE(review): json.loads / strptime below assume well-formed messages; a
    single malformed payload raises and aborts the whole batch — TODO confirm
    whether the producer guarantees the schema or a try/except is needed.
    """
    if not batch:
        return

    print(f"Processing a batch of {len(batch)} messages...")

    # --- 1. Prepare data for the model ---
    messages_data: List[Dict[str, Any]] = []
    texts_to_predict: List[str] = []

    for msg in batch:
        # Assumes the payload is UTF-8 JSON as produced by producer_job.
        message_data = json.loads(msg.value().decode("utf-8"))
        messages_data.append(message_data)
        texts_to_predict.append(
            message_data.get("video_and_comment_data", {}).get("text", "")
        )

    # NOTE(review): this guard can never fire — one (possibly empty) string
    # is appended per message, so the list is empty only when batch is empty,
    # which was already handled above.
    if not texts_to_predict:
        print("Batch contains only empty comments after preprocessing. Skipping.")
        return

    # --- 2. Perform Batch Sentiment Analysis ---
    # predict_batch is expected to return one prediction per input text, in
    # order, so the zip below pairs them 1:1 with messages_data.
    predictions = sentiment_service.predict_batch(texts_to_predict)

    # --- 3. Save data to Database ---
    # Cache video_id -> Mongo _id to avoid re-upserting the same video
    # multiple times within one batch.
    video_id_cache: Dict[str, ObjectId] = {}
    comments_to_insert: List[Dict[str, Any]] = []

    for message_data, prediction in zip(messages_data, predictions):
        entity_keyword = message_data.get("entity_keyword")
        entity_thumbnail = message_data.get("entity_thumbnail_url")
        entity_video_url = message_data.get("entity_video_url")
        entity_volume = message_data.get("entity_volume")
        interest_data = message_data.get("interest_over_time")
        data = message_data.get("video_and_comment_data", {})

        video_id = data.get("video_id")
        video_title = data.get("video_title")
        video_publish_date_str = data.get("video_publish_date")
        video_url = data.get("video_url")

        # Lower-cased label is used both on the comment document and in the
        # dynamic "results.<label>_count" field name below.
        sentiment_label = prediction["label"].lower()

        # Skip messages missing the minimum identifying fields.
        if not all([entity_keyword, entity_volume is not None, video_id]):
            continue

        # 3a. Upsert Entity and get its ID
        entity_doc = db.entities.find_one_and_update(
            {"keyword": entity_keyword},
            {
                "$set": {
                    "volume": entity_volume,
                    "thumbnail_url": entity_thumbnail,
                    "video_url": entity_video_url,
                },
                "$setOnInsert": {
                    "keyword": entity_keyword,
                    "geo": settings.FETCH_TRENDS_GEO,
                    # "volume": entity_volume,
                    # "thumbnail_url": entity_thumbnail,
                    # "video_url": entity_video_url,
                    # NOTE(review): naive local time; consider a tz-aware UTC
                    # timestamp for consistency across hosts.
                    "start_date": datetime.now(),
                },
            },
            upsert=True,
            return_document=True,
        )
        entity_id = entity_doc["_id"]

        # 3b. Upsert Source Video and get its ID (with in-batch caching)
        source_id: ObjectId | None = video_id_cache.get(video_id)
        if not source_id:
            source_doc = db.sources_youtube.find_one_and_update(
                {"video_id": video_id},
                {
                    "$set": {"entity_id": entity_id},
                    "$setOnInsert": {
                        "entity_id": entity_id,
                        "video_id": video_id,
                        "url": video_url,
                        "title": video_title,
                        # Assumes RFC 3339 "…Z" format from the YouTube API —
                        # raises ValueError (or TypeError on None) otherwise.
                        "publish_date": datetime.strptime(
                            video_publish_date_str, "%Y-%m-%dT%H:%M:%SZ"
                        ),
                    },
                },
                upsert=True,
                return_document=True,
            )
            source_id = source_doc["_id"]
            video_id_cache[video_id] = source_id

        # 3c. Prepare comment for bulk insertion
        comments_to_insert.append(
            {
                "source_id": source_id,
                "comment_id": data.get("comment_id"),
                "text": data.get("text"),
                "author": data.get("author"),
                "publish_date": datetime.strptime(
                    data.get("publish_date"), "%Y-%m-%dT%H:%M:%SZ"
                ),
                "sentiment": sentiment_label,
            }
        )

        # 3d. Update Aggregated Analysis Result
        db.analysis_results.update_one(
            {"entity_id": entity_id},
            {
                "$inc": {
                    f"results.{sentiment_label}_count": 1,
                    "results.total_comments": 1,
                },
                "$setOnInsert": {
                    "entity_id": entity_id,
                    "analysis_type": "weekly",
                    "created_at": datetime.now(),
                    "status": "completed",
                    # Interest series is frozen at first insert; later
                    # messages for the same entity do not refresh it.
                    "interest_over_time": interest_data,
                },
            },
            upsert=True,
        )

    # 3e. Bulk insert all comments from the batch
    if comments_to_insert:
        db.comments_youtube.insert_many(comments_to_insert)
        print(f"Inserted {len(comments_to_insert)} raw comments into database.")
151
+
152
+
153
def run_consumer_job() -> None:
    """
    Consume raw comment messages from Kafka in batches, run sentiment
    analysis, and persist the results to MongoDB.

    A batch is flushed when it reaches ``CONSUMER_BATCH_SIZE`` messages or
    when a partially filled batch has been idle longer than
    ``CONSUMER_BATCH_TIMEOUT_SECONDS``. Offsets are committed synchronously
    only after a batch has been fully processed (at-least-once delivery).
    """
    # --- 1. Initialization ---
    print("Initializing services...")
    sentiment_service = SentimentService()
    mongo_client = MongoClient(settings.MONGODB_CONNECTION_STRING)
    db = mongo_client[settings.DB_NAME]

    kafka_conf = {
        "bootstrap.servers": "localhost:9092",
        "group.id": "sentiment_analyzer_group",
        "auto.offset.reset": "earliest",
        # Manual commits: offsets only advance after a batch is persisted.
        "enable.auto.commit": False,
    }
    consumer = Consumer(kafka_conf)
    consumer.subscribe([settings.KAFKA_TOPIC])

    print("Consumer job started. Waiting for messages...")

    # --- 2. Batch Processing Loop ---
    message_batch: List[Message] = []
    last_process_time = time.time()

    def _flush_batch() -> None:
        """Process the current batch, commit its offsets, reset the timer."""
        nonlocal last_process_time
        process_message_batch(message_batch, sentiment_service, db)
        # Commit the consumed offsets for the whole assignment. The previous
        # code passed the last poll() result as `message=`, which was None in
        # the timeout branch — committing without a message is the intended,
        # unambiguous form.
        consumer.commit(asynchronous=False)
        message_batch.clear()
        last_process_time = time.time()

    try:
        while True:
            msg = consumer.poll(timeout=1.0)

            if msg is None:
                # No new message: flush a partial batch that has been
                # waiting longer than the configured timeout.
                if message_batch and (
                    time.time() - last_process_time
                    > settings.CONSUMER_BATCH_TIMEOUT_SECONDS
                ):
                    _flush_batch()
                continue

            if msg.error():
                # Reaching the end of a partition is expected; report
                # anything else but keep consuming.
                if msg.error().code() != KafkaError._PARTITION_EOF:
                    print(f"Kafka error: {msg.error()}")
                continue

            # Add message to the batch and flush once it is full.
            message_batch.append(msg)
            if len(message_batch) >= settings.CONSUMER_BATCH_SIZE:
                _flush_batch()

    except KeyboardInterrupt:
        print("Stopping consumer job...")
        # Fix: process AND commit the final partial batch. Previously the
        # batch was processed but its offsets were never committed, so the
        # same messages were re-consumed (and re-inserted) on the next start.
        if message_batch:
            _flush_batch()
    finally:
        consumer.close()
        mongo_client.close()
        print("Consumer and DB connection closed.")
219
+
220
+
221
+ if __name__ == "__main__":
222
+ run_consumer_job()
app/scripts/producer_job.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ from typing import Any, List, Dict
4
+ from datetime import datetime as dt, timedelta
5
+ from trendspy import Trends
6
+ from confluent_kafka import Producer
7
+
8
+ from app.core.config import settings
9
+ from app.services.youtube_service import YouTubeService
10
+
11
+
12
def get_rfc_time_ago(days: int) -> str:
    """
    Return the RFC 3339 timestamp for `days` days before now, in UTC.

    The returned string carries a trailing "Z" (the UTC/Zulu designator), so
    the instant must be computed from the UTC clock. The previous version
    used the naive local clock (``dt.now()``), which yielded a wrong instant
    on any non-UTC host; the YouTube API's ``publishedAfter`` filter expects
    a genuine UTC timestamp.
    """
    from datetime import timezone  # local import: module top only imports datetime/timedelta

    time_ago = dt.now(timezone.utc) - timedelta(days=days)
    return time_ago.strftime("%Y-%m-%dT%H:%M:%SZ")
18
+
19
+
20
def run_producer_job() -> None:
    """
    Fetch trending topics, search related YouTube videos, collect comments,
    and publish them to Kafka for the consumer job.

    Each Kafka message carries the full entity context (keyword, volume,
    thumbnail, representative video URL, interest-over-time series) plus one
    comment; the entity keyword is used as the message key so all comments
    for an entity land on the same partition.
    """
    # --- 1. Initialization ---
    yt_service = YouTubeService(api_key=settings.YT_API_KEY)

    # NOTE(review): broker address is hard-coded here (and in the consumer);
    # consider moving it into Settings.
    kafka_conf = {"bootstrap.servers": "localhost:9092"}
    producer = Producer(kafka_conf)

    kafka_topic = settings.KAFKA_TOPIC
    print("Producer job started...")

    # --- 2. Get Trending Entities ---
    try:
        trends_client = Trends()
        trends = trends_client.trending_now(
            geo=settings.FETCH_TRENDS_GEO, hours=24 * settings.FETCH_TRENDS_WITHIN_DAYS
        )
        # Highest-volume trends first; keep only the configured top N.
        trends.sort(key=lambda item: item.volume, reverse=True)
        top_trends = trends[: settings.FETCH_NUM_ENTITIES]
        print(f"Successfully fetched {len(top_trends)} trending entities.")
    except Exception as e:
        print(f"Failed to fetch trends: {e}")
        return

    # Only consider videos published within the trend window.
    time_filter = get_rfc_time_ago(days=settings.FETCH_TRENDS_WITHIN_DAYS)

    # --- 3. Process Each Entity ---
    for trend in top_trends:
        entity_keyword = trend.keyword
        print(f"\n--- Processing entity: {entity_keyword} ---")

        # --- Fetch and process interest over time data ---
        interest_data = []
        try:
            # A fresh client per entity — presumably to avoid shared session
            # state in trendspy; TODO confirm one client could be reused.
            trends_client_for_interest = Trends()
            df = trends_client_for_interest.interest_over_time(
                keywords=[entity_keyword], timeframe=f"now {settings.FETCH_TRENDS_WITHIN_DAYS}-d"
            )
            if not df.empty:
                # Downsample to one value per day; assumes the frame has a
                # DatetimeIndex — TODO confirm trendspy's return shape.
                daily_df = df[[entity_keyword]].resample('D').mean().round(0).astype(int)
                interest_data = [
                    {"date": index.strftime('%Y-%m-%d'), "value": int(row.iloc[0])}
                    for index, row in daily_df.iterrows()
                ]
                print(f"Successfully fetched interest over time data for '{entity_keyword}'.")
        except Exception as e:
            print(f"Could not fetch interest over time data for '{entity_keyword}': {e}")

        # --- 3a. Contextual Query Building ---
        # Sort related keywords by length to prioritize more specific ones
        related_keywords = sorted(trend.trend_keywords, key=len, reverse=True)
        # Combine the main keyword with the top 2 longest related keywords
        keywords_to_search = [entity_keyword] + related_keywords[:2]
        # Remove duplicates while preserving order (dict keeps insertion order)
        keywords_to_search = list(dict.fromkeys(keywords_to_search))
        # Create a query like: '"long keyword" OR "main keyword"'
        query_string = " OR ".join([f'"{k}"' for k in keywords_to_search])

        print(f"Constructed query: {query_string}")

        # --- 3b. Search and Get Thumbnail ---
        videos = yt_service.search_videos(
            query_string=query_string, published_after=time_filter
        )

        if not videos:
            print(f"No videos found for '{entity_keyword}'. Skipping...")
            continue

        # The top search result supplies the entity's thumbnail and
        # representative video link.
        first_video = videos[0]
        entity_thumbnail_url = first_video.get("snippet", {}).get("thumbnails", {}).get("high", {}).get("url")
        # Construct the representative video URL
        video_id = first_video.get("id", {}).get("videoId", "")
        entity_video_url = f"https://www.youtube.com/watch?v={video_id}" if video_id else None

        # --- 3c. Fetch Comments with Smart Sampling ---
        comments_for_entity: List[Dict[str, Any]] = []
        for video in videos:
            video_id = video.get("id", {}).get("videoId")
            snippet = video.get("snippet", {})
            if not video_id or not snippet:
                continue

            print(
                f"Fetching comments from video: {snippet.get('title', 'N/A')[:50]}..."
            )
            comments = yt_service.fetch_comments(
                video_id=video_id, limit=settings.FETCH_COMMENTS_PER_VIDEO
            )

            # Attach video context so the consumer can upsert the source doc.
            for comment in comments:
                comment["video_id"] = video_id
                comment["video_title"] = snippet.get("title")
                comment["video_publish_date"] = snippet.get("publishedAt")
                comment["video_url"] = f"https://www.youtube.com/watch?v={video_id}"

            comments_for_entity.extend(comments)

            # Stop early once enough comments were gathered for this entity.
            if len(comments_for_entity) >= settings.FETCH_TOTAL_COMMENTS_PER_ENTITY:
                break

        final_comments = comments_for_entity[: settings.FETCH_TOTAL_COMMENTS_PER_ENTITY]

        # --- 4. Produce Messages to Kafka ---
        if not final_comments:
            print(f"No comments collected for '{entity_keyword}'.")
            continue

        print(
            f"Producing {len(final_comments)} comments for '{entity_keyword}' to Kafka topic '{kafka_topic}'..."
        )
        for comment in final_comments:
            message_payload = {
                "entity_keyword": entity_keyword,
                "entity_thumbnail_url": entity_thumbnail_url,
                "entity_video_url": entity_video_url,
                "entity_volume": trend.volume,
                "interest_over_time": interest_data,
                "video_and_comment_data": comment,
            }
            producer.produce(
                kafka_topic,
                # Keyword as key: comments of one entity share a partition.
                key=entity_keyword.encode("utf-8"),
                value=json.dumps(message_payload).encode("utf-8"),
            )

    # Block until every queued message is delivered (or fails).
    producer.flush()
    print("\nProducer job finished. All messages flushed to Kafka.")
152
+
153
+
154
+ if __name__ == "__main__":
155
+ run_producer_job()
app/services/__init__.py ADDED
File without changes
app/services/sentiment_service.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any
2
+ from app.core.config import settings
3
+
4
+ import torch
5
+ import numpy as np
6
+ from scipy.special import softmax
7
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
8
+
9
+
10
class SentimentService:
    """
    Loads the sentiment analysis model once and serves batched predictions.

    The model is a Hugging Face sequence-classification checkpoint named by
    ``settings.SENTIMENT_MODEL``. Predictions are dicts of the form
    ``{"label": <id2label name>, "score": <max softmax probability>}``.
    """

    def __init__(self) -> None:
        """
        Load tokenizer, config, and model onto the GPU when available,
        otherwise the CPU, and switch the model to inference mode.
        """
        # Select device (GPU if available, otherwise CPU)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device.type == "cuda":
            print(
                f"GPU found: {torch.cuda.get_device_name(0)}. Loading model onto GPU."
            )
        else:
            print("GPU not found. Loading model onto CPU.")

        # Config is loaded separately to obtain the id2label mapping.
        model_name = settings.SENTIMENT_MODEL
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(
            self.device
        )
        self.model.eval()  # inference mode (disables dropout etc.)
        print("Sentiment model loaded successfully.")

    def _preprocess_text(self, text: str) -> str:
        """
        Normalize a comment: @mentions become "@user" and links become
        "http" (the convention Twitter-style sentiment models are trained
        on). Non-string input yields an empty string.
        """
        if not isinstance(text, str):
            return ""
        tokens: List[str] = []
        for t in text.split(" "):
            t = "@user" if t.startswith("@") and len(t) > 1 else t
            t = "http" if t.startswith("http") else t
            tokens.append(t)
        return " ".join(tokens)

    def predict_batch(self, texts: List[str]) -> List[Dict[str, Any]]:
        """
        Predict sentiment for a batch of texts.

        Always returns exactly ``len(texts)`` predictions, in input order.
        Texts that are empty after preprocessing receive a default neutral
        prediction instead of being dropped — callers zip the result against
        their inputs, so the lengths must match.
        """
        default_prediction = {"label": "neutral", "score": 1.0}

        # Preprocess all texts
        preprocessed_texts = [self._preprocess_text(text) for text in texts]

        # Keep only non-empty texts and remember their original indices
        non_empty_texts_with_indices = [
            (i, text) for i, text in enumerate(preprocessed_texts) if text.strip()
        ]
        if not non_empty_texts_with_indices:
            # Fix: previously returned [], which silently broke callers that
            # expect one prediction per input text.
            return [dict(default_prediction) for _ in texts]

        indices, texts_to_predict = zip(*non_empty_texts_with_indices)

        # Tokenize the batch
        encoded_inputs = self.tokenizer(
            list(texts_to_predict),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(self.device)

        # Run inference
        with torch.no_grad():
            outputs = self.model(**encoded_inputs)
            logits = outputs.logits.detach().cpu().numpy()

        # Explicitly release intermediate tensors; only touch the CUDA
        # allocator when we are actually running on a GPU.
        del encoded_inputs, outputs
        if self.device.type == "cuda":
            torch.cuda.empty_cache()

        # Apply softmax to get probabilities
        probs = softmax(logits, axis=1)

        # Map each row to its highest-probability label
        predictions = []
        for prob in probs:
            max_idx = int(np.argmax(prob))
            predictions.append(
                {"label": self.config.id2label[max_idx], "score": float(prob[max_idx])}
            )

        # Scatter predictions back to their original positions
        final_results: List[Dict[str, Any] | None] = [None] * len(texts)
        for original_index, prediction in zip(indices, predictions):
            final_results[original_index] = prediction

        # Fill slots for empty texts with the default neutral prediction
        return [
            res if res is not None else dict(default_prediction)
            for res in final_results
        ]
app/services/youtube_service.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, List, Dict
2
+ from app.core.config import settings
3
+
4
+ from googleapiclient.discovery import build, Resource
5
+ from googleapiclient.errors import HttpError
6
+
7
+
8
class YouTubeService:
    """
    A service class for interacting with the YouTube Data API (v3).

    Errors from the API are logged and swallowed: search failures return an
    empty list, and comment-fetch failures return whatever was collected so
    far, so callers never need to catch HttpError themselves.
    """

    def __init__(self, api_key: str) -> None:
        # Build the API client eagerly; construction failures are re-raised
        # because the service is unusable without a client.
        self.api_key: str = api_key
        try:
            self.youtube: Resource = build("youtube", "v3", developerKey=api_key)
        except Exception as e:
            print(f"Failed to build Youtube service: {e}")
            raise

    def search_videos(
        self, query_string: str, published_after: str | None = None
    ) -> List[Dict[str, Any]]:
        """
        Search for videos matching `query_string`.

        `published_after` is an RFC 3339 timestamp ("...Z") restricting
        results; None means no lower bound. Returns the raw API "items"
        list (each item has "id" and "snippet"), or [] on any error.
        """
        try:
            search_request = self.youtube.search().list(
                q=query_string,
                part="snippet",
                type="video",
                maxResults=settings.FETCH_VIDEOS_PER_ENTITY,
                regionCode="US",
                relevanceLanguage="en",
                publishedAfter=published_after,
                order="relevance",
            )

            response = search_request.execute()
            return response.get("items", [])

        except HttpError as e:
            print(
                f"An HTTP error {e.resp.status} occurred during video search: {e.content}"
            )
            return []
        except Exception as e:
            print(f"An unexpected error occurred during video search: {e}")
            return []

    def fetch_comments(self, video_id: str, limit: int = 100) -> List[Dict[str, Any]]:
        """
        Fetch top-level comments for `video_id`, paginating until `limit`
        comments are collected or no more pages remain.

        Returns a list of dicts with keys: comment_id, text, author,
        publish_date. On error (including disabled comments), returns the
        comments collected so far rather than raising.
        """
        comments: List[Dict[str, Any]] = []
        next_page_token: str | None = None

        try:
            while len(comments) < limit:
                # Each page requests the API maximum (100); the final result
                # is truncated to `limit` below.
                response = (
                    self.youtube.commentThreads()
                    .list(
                        part="snippet",
                        videoId=video_id,
                        maxResults=100,
                        textFormat="plainText",
                        order="relevance",  # Fetch most relevant comments first
                        pageToken=next_page_token,
                    )
                    .execute()
                )

                for item in response.get("items", []):
                    # Only top-level comments are collected; replies are ignored.
                    snippet = (
                        item.get("snippet", {})
                        .get("topLevelComment", {})
                        .get("snippet", {})
                    )
                    if snippet:
                        comment_data = {
                            "comment_id": item.get("id"),
                            "text": snippet.get("textDisplay"),
                            "author": snippet.get("authorDisplayName"),
                            "publish_date": snippet.get("publishedAt"),
                        }
                        comments.append(comment_data)

                next_page_token = response.get("nextPageToken")
                if not next_page_token:
                    # No more pages of comments
                    break

        except HttpError as e:
            # It's common for comments to be disabled, so we'll log it but not treat as a fatal error.
            if "commentsDisabled" in str(e.content):
                print(f"Comments are disabled for video {video_id}.")
            else:
                print(
                    f"An HTTP error {e.resp.status} occurred fetching comments: {e.content}"
                )
        except Exception as e:
            print(f"An unexpected error occurred fetching comments: {e}")

        # Always return the comments collected so far, truncated to the limit
        return comments[:limit]
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiokafka==0.12.0
2
+ confluent-kafka==2.11.1
3
+ datasets==4.0.0
4
+ fastapi[all]==0.116.1
5
+ google-api-python-client==2.181.0
6
+ google-auth-oauthlib==1.2.2
7
+ httpx==0.28.1
8
+ motor==3.7.1
9
+ numpy==2.2.6
10
+ pydantic==2.11.8
+ pydantic-settings  # NOTE: app.core.config uses SettingsConfigDict (pydantic_settings); pin a version before deploying
11
+ pymongo==4.14.0
12
+ pyngrok==7.3.0
13
+ python-dotenv==1.1.1
14
+ qstash==3.0.0
15
+ requests==2.32.5
16
+ scipy==1.16.2
17
+ transformers==4.56.1
18
+ trendspy==0.1.6
19
+ uvicorn==0.35.0
20
+ torch==2.6.0