hblim commited on
Commit
a6576f0
·
0 Parent(s):

Clean codebase for HF Space (drop Prometheus binary data)

Browse files
Files changed (40) hide show
  1. .github/workflows/daily.yml +63 -0
  2. .gitignore +62 -0
  3. Dockerfile +18 -0
  4. LICENSE +21 -0
  5. README.md +256 -0
  6. app.py +11 -0
  7. config.yaml +29 -0
  8. frontend/__init__.py +1 -0
  9. frontend/app.py +287 -0
  10. frontend/data_utils.py +143 -0
  11. frontend/text_analysis.py +62 -0
  12. notebooks/keyword_extraction.ipynb +867 -0
  13. notebooks/loading_data.ipynb +785 -0
  14. notebooks/post_analysis.ipynb +567 -0
  15. notebooks/split_data_scored.ipynb +2798 -0
  16. pyproject.toml +32 -0
  17. reddit_analysis/__init__.py +7 -0
  18. reddit_analysis/common_metrics.py +40 -0
  19. reddit_analysis/config_utils.py +114 -0
  20. reddit_analysis/inference/__init__.py +5 -0
  21. reddit_analysis/inference/score.py +327 -0
  22. reddit_analysis/monitoring/dashboard.json +309 -0
  23. reddit_analysis/monitoring/dashboard_failure.png +0 -0
  24. reddit_analysis/monitoring/dashboard_success.png +0 -0
  25. reddit_analysis/monitoring/docker-compose.yml +21 -0
  26. reddit_analysis/monitoring/prometheus.yml +8 -0
  27. reddit_analysis/scraper/__init__.py +5 -0
  28. reddit_analysis/scraper/scrape.py +310 -0
  29. reddit_analysis/summarizer/__init__.py +5 -0
  30. reddit_analysis/summarizer/aggregator.py +68 -0
  31. reddit_analysis/summarizer/summarize.py +274 -0
  32. reddit_analysis/test_config.py +72 -0
  33. reddit_analysis/tests/README.md +78 -0
  34. reddit_analysis/tests/inference/test_score.py +282 -0
  35. reddit_analysis/tests/scraper/test_scrape.py +187 -0
  36. reddit_analysis/tests/summarizer/test_summarize.py +127 -0
  37. reddit_analysis/tests/test_config_utils.py +101 -0
  38. requirements-dev.txt +11 -0
  39. requirements.txt +22 -0
  40. subreddit_daily_summary.csv +213 -0
.github/workflows/daily.yml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # .github/workflows/daily.yml
2
+ name: Daily ETL & CI
3
+
4
+ on:
5
+ push:
6
+ branches: [main]
7
+ workflow_dispatch:
8
+ schedule:
9
+ - cron: '0 23 * * *'
10
+
11
+ jobs:
12
+ build:
13
+ # This is the GitHub‑hosted runner’s OS.
14
+ # You can change to macos-latest if you really need a macOS VM,
15
+ # but ubuntu-latest is faster and usually all you need.
16
+ runs-on: ubuntu-latest
17
+
18
+ env:
19
+ # These come from your repository settings → Secrets → Actions.
20
+ # Add HF_TOKEN, REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT there.
21
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
22
+ REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }}
23
+ REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }}
24
+ REDDIT_USER_AGENT: ${{ secrets.REDDIT_USER_AGENT }}
25
+ REPLICATE_API_TOKEN: ${{ secrets.REPLICATE_API_TOKEN }}
26
+
27
+ steps:
28
+ - name: Check out code
29
+ uses: actions/checkout@v3
30
+
31
+ - name: Set up Python 3.12
32
+ uses: actions/setup-python@v4
33
+ with:
34
+ python-version: "3.12"
35
+
36
+ - name: Cache pip
37
+ uses: actions/cache@v3
38
+ with:
39
+ path: ~/.cache/pip
40
+ key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements-dev.txt') }}
41
+ restore-keys: |
42
+ ${{ runner.os }}-pip-
43
+
44
+ - name: Install dependencies
45
+ run: |
46
+ python -m pip install --upgrade pip
47
+ pip install -r requirements-dev.txt
48
+
49
+ - name: Run unit tests
50
+ run: pytest --maxfail=1 --disable-warnings -q
51
+
52
+ - name: Compute DATE (UTC)
53
+ id: set-date
54
+ run: echo "DATE=$(date -u +'%Y-%m-%d')" >> $GITHUB_ENV
55
+
56
+ - name: Scrape
57
+ run: python -m reddit_analysis.scraper.scrape --date "$DATE"
58
+
59
+ - name: Score
60
+ run: python -m reddit_analysis.inference.score --date "$DATE" --overwrite
61
+
62
+ - name: Summarize
63
+ run: python -m reddit_analysis.summarizer.summarize --date "$DATE" --overwrite
.gitignore ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ scratch/
18
+ var/
19
+ wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+ .pytest_cache/
25
+ .coverage
26
+ htmlcov/
27
+ .tox/
28
+ .nox/
29
+ .hypothesis/
30
+
31
+ # Virtual Environment
32
+ venv/
33
+ env/
34
+ ENV/
35
+ .env
36
+
37
+ # IDE
38
+ .idea/
39
+ .vscode/
40
+ *.swp
41
+ *.swo
42
+ .ipynb_checkpoints/
43
+ **/.ipynb_checkpoints/
44
+
45
+ # Data directories
46
+ data/
47
+ data_raw/
48
+ data_scored/
49
+
50
+ # Docker
51
+ .docker/
52
+ docker-compose.override.yml
53
+
54
+ # Logs
55
+ *.log
56
+ logs/
57
+
58
+ # OS
59
+ .DS_Store
60
+ Thumbs.db
61
+ # Prometheus binary data (dropped from HF Space)
62
+ reddit_analysis/monitoring/prometheus-data/
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use the official lightweight Python image
2
+ FROM python:3.12-slim
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Copy and install dependencies
8
+ COPY requirements.txt .
9
+ RUN pip install --no-cache-dir -r requirements.txt
10
+
11
+ # Copy your Streamlit app
12
+ COPY . .
13
+
14
+ # Expose Streamlit’s default port
15
+ EXPOSE 8502
16
+
17
+ # Launch the app
18
+ ENTRYPOINT ["streamlit", "run", "frontend/app.py", "--server.port=8502", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Halston Lim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reddit Sentiment Pipeline
2
+
3
+ [![CI Status](https://github.com/halstonblim/reddit_sentiment_pipeline/actions/workflows/daily.yml/badge.svg)](https://github.com/halstonblim/reddit_sentiment_pipeline/actions/workflows/daily.yml)
4
+ [![Streamlit App](https://img.shields.io/badge/demo-streamlit-ff4b4b?logo=streamlit)](https://redditsentimentpipeline.streamlit.app/)
5
+
6
+ A fully‑automated **end‑to‑end MLOps** pipeline that tracks daily sentiment trends on Reddit, scores posts with a transformer‑based model served from Replicate, summarizes the results, and publishes an interactive Streamlit dashboard—all orchestrated by GitHub Actions.
7
+
8
+ ***Analyzing the Public Discourse of AI News***
9
+
10
+ The pipeline is currently configured in `config.yaml` to analyze AI news discourse across `r/artificial`, `r/LocalLLaMA`, `r/singularity`, and `r/OpenAI`. The data is persisted across three steps
11
+ 1. **Scrapes** new submissions from a configurable list of subreddits (→ `data_raw/`)
12
+ 2. **Classifies** each post with a sentiment model served on Replicate (→ `data_scored/`)
13
+ 3. **Summarises** daily trends for lightweight front-end consumption (→ `daily_summary/`)
14
+
15
+ More information on the data can be found on the Hugging Face Dataset repo [hblim/top_reddit_posts_daily](https://huggingface.co/datasets/hblim/top_reddit_posts_daily)
16
+
17
+ ***Sentiment Analysis***
18
+
19
+ We use the [DistilBERT sentiment analysis model](https://github.com/halstonblim/batch-bert-sentiment), which is wrapped with Cog for easy deployment on Replicate. The model handles batched input texts in a single API call, which improves performance by parallelizing computation on the GPU.
20
+
21
+ ---
22
+
23
+ ## Table of Contents
24
+ 1. [Project Structure](#project-structure)
25
+ 2. [Installation & Quick start](#installation)
26
+ 3. [Configuration](#configuration)
27
+ 4. [Back-end reddit_analysis](#backend-reddit-analysis)
28
+ 5. [Unit tests](#unit-tests)
29
+ 6. [Front-end Streamlit](#front-end-streamlit)
30
+ 7. [CI/CD & GitHub Actions](#cicd-github-actions)
31
+ 8. [Monitoring with Grafana/Prometheus](#monitoring-with-grafanaprometheus)
32
+ 9. [Extending / Customising](#extending--customizing)
33
+
34
+ ---
35
+
36
+ ## Project Structure
37
+
38
+ ````text
39
+ reddit_sentiment_pipeline/
40
+ ├── reddit_analysis/   # Back‑end
41
+ │   ├── __init__.py
42
+ │   ├── scraper/
43
+ │   │   └── scrape.py   # Collect raw posts → HF dataset (data_raw)
44
+ │   ├── inference/
45
+ │   │   └── score.py   # Call Replicate model → adds sentiment scores
46
+ │   ├── summarizer/
47
+ │   │   └── summarize.py   # Aggregate + export CSV summaries (data_scored)
48
+ │   ├── config_utils.py   # Secrets & YAML helper
49
+ │   ├── tests/ # Pytest test-suite
50
+ |
51
+ ├── frontend/   # Front‑end
52
+ │   └── app.py
53
+
54
+ ├── .github/
55
+ │   └── workflows/
56
+ │   ├── daily.yml   # Cron‑triggered ETL + summarize
57
+
58
+ ├── config.yaml   # Default runtime config (subreddits, models …)
59
+ ├── requirements.txt # requirements for front end only
60
+ ├── requirements-dev.txt # requirements for local development
61
+ └── README.md
62
+ ````
63
+
64
+ ### Automated Workflow
65
+ ```
66
+ [GitHub Actions Cron @ 23:00 UTC]
67
+ |
68
+ v
69
+ +-------+-------------+
70
+ | Scrape Reddit | ← `scraper/scrape.py --date $DATE`
71
+ +-------+-------------+
72
+ |
73
+ v
74
+ +-------+-------------+
75
+ | Sentiment Analysis | ← `inference/score.py --date $DATE`
76
+ +-------+-------------+
77
+ |
78
+ v
79
+ +-------+-------------+
80
+ | Summarize | ← `summarizer/summarize.py --date $DATE`
81
+ +-------+-------------+
82
+ |
83
+ v
84
+ [HF Dataset: data files]
85
+ |
86
+ Frontend (Streamlit app)
87
+ |
88
+ Public URL (Streamlit Cloud)
89
+ ```
90
+
91
+ ---
92
+
93
+ ## Installation
94
+
95
+ To run the frontend streamlit app locally
96
+
97
+ ```bash
98
+ git clone https://github.com/halstonblim/reddit_sentiment_pipeline.git
99
+ cd reddit_sentiment_pipeline
100
+ pip install -r requirements.txt
101
+ streamlit run frontend/app.py
102
+ ```
103
+
104
+ To run the backend reddit analysis locally and set up your own scraper, sentiment analysis, and export pipeline, steps are roughly
105
+ - Get Reddit/Hugging Face/Replicate accounts and API tokens
106
+ - You must configure a .env with the secrets (HF, Replicate, Reddit tokens)
107
+ - Configure the .yaml file to point to the proper Hugging Face repository and Replicate models, and subreddits to scrape
108
+
109
+ Once those are configured you can run the following which should scrape Reddit, analyze text remotely with a Replicate model, and export results to Hugging Face
110
+
111
+ ```bash
112
+ pip install -r requirements-dev.txt
113
+
114
+ # Run the full pipeline for today
115
+ $ python -m reddit_analysis.scraper.scrape   --date $(date +%F)
116
+ $ python -m reddit_analysis.inference.score  --date $(date +%F)
117
+ $ python -m reddit_analysis.summarizer.summarize --date $(date +%F)
118
+ ```
119
+ ---
120
+
121
+ ## Configuration
122
+
123
+ All non‑secret settings live in **`config.yaml`**; sensitive tokens are supplied via environment variables or a `.env` file.
124
+
125
+ ```yaml
126
+ # config.yaml (excerpt)
127
+ repo_id: hblim/top_reddit_posts_daily
128
+ push_to_hf: true
129
+ subreddits:
130
+ - name: apple
131
+ post_limit: 100
132
+ comment_limit: 5
133
+ ```
134
+
135
+ | Variable | Where to set | Description |
136
+ |----------|-------------|-------------|
137
+ | `HF_TOKEN` | GitHub → *Settings › Secrets and variables* <br>or local `.env` | Personal access token with **write** permission to the HF dataset |
138
+ | `REPLICATE_API_TOKEN` | same | Token to invoke the Replicate model |
139
+ | `ENV` | optional | `local`, `ci`, `prod` – toggles logging & Streamlit behaviour |
140
+
141
+ ---
142
+
143
+ ## Backend reddit analysis
144
+
145
+ ### 1. `scraper.scrape`
146
+ Collects the top *N* daily posts from each configured subreddit and appends them to a [Hugging Face **Parquet** dataset](https://huggingface.co/datasets/hblim/top_reddit_posts_daily/tree/main/data_raw) (`data_raw`).
147
+
148
+ ```bash
149
+ python -m reddit_analysis.scraper.scrape \
150
+ --date 2025-04-22 # YYYY‑MM‑DD (defaults to today)
151
+ --limit 100 # optional, posts/subreddit
152
+ --overwrite # re‑upload if already exists
153
+ ```
154
+
155
+ * **Dependencies:** [`praw`](https://praw.readthedocs.io/), `huggingface‑hub`
156
+ * **De‑duplication:** handled server‑side via dataset row `post_id` as primary key—**no local state needed**.
157
+
158
+ ---
159
+
160
+ ### 2. `inference.score`
161
+ Downloads one day of raw posts, sends raw text consisting of `title + selftext` to the **Replicate** hosted model in batches for optimized parallel computation, and pushes a scored Parquet file to a separate [Hugging Face **Parquet** dataset](https://huggingface.co/datasets/hblim/top_reddit_posts_daily/tree/main/data_scored) `data_scored`.
162
+
163
+ ```bash
164
+ python -m reddit_analysis.inference.score \
165
+ --date 2025-04-22 \
166
+ --model your‑org/sentiment‑model:latest \
167
+ --batch_size 64 # Replicate parallelism
168
+ ```
169
+
170
+ * **Retry logic:** automatic back‑off for `httpx.RemoteProtocolError`.
171
+ ---
172
+
173
+ ### 3. `summarizer.summarize`
174
+ Aggregates daily sentiment by subreddit (mean & weighted means) and writes a compact CSV plus a Parquet summary.
175
+
176
+ ```bash
177
+ python -m reddit_analysis.summarizer.summarize \
178
+ --date 2025-04-22 \
179
+ --output_format csv parquet
180
+ ```
181
+
182
+ * **Uses `pandas` `groupby` (no default sorting—explicitly sorts by date + subreddit).**
183
+ * **Exports** are placed under `data_summary/` in the same HF dataset repo.
184
+
185
+ ---
186
+
187
+ ## Unit tests
188
+
189
+ The backend test‑suite lives in `reddit_analysis/tests/` and can be executed with **pytest**:
190
+
191
+ ```bash
192
+ pytest -q
193
+ ```
194
+
195
+ | File | What it tests | Key fixtures / mocks |
196
+ |------|--------------|----------------------|
197
+ | `tests/scraper/test_scrape.py` | Reddit fetch logic, de‑duplication rules | `praw.Reddit`, `huggingface_hub.HfApi` mocked via `monkeypatch` |
198
+ | `tests/inference/test_score.py` | Batching, error handling when HF token missing | Fake Replicate API via `httpx.MockTransport` |
199
+ | `tests/summarizer/test_summarize.py` | Correct aggregation & sorting | `pandas` dummy frames |
200
+
201
+ CI runs the tests on every push (see [daily.yml](#cicd--github-actions)).
202
+
203
+ ---
204
+
205
+ ## Front end (Streamlit)
206
+
207
+ `frontend/app.py` provides an interactive dashboard that:
208
+ 1. Downloads the daily summary CSVs from HF.
209
+ 2. Displays time‑series sentiment trends, top posts tables, and subreddit post counts.
210
+ 3. Allows filtering by date range or subreddit with responsive Altair charts.
211
+
212
+ ```bash
213
+ # Local preview
214
+ streamlit run frontend/app.py
215
+ ```
216
+ ---
217
+
218
+ ## CI/CD Github Actions
219
+
220
+ ### `.github/workflows/daily.yml`
221
+
222
+
223
+ | Step | What it does |
224
+ |------|--------------|
225
+ | **Setup** | Checkout repo, install Python 3.12, cache pip deps |
226
+ | **Tests** | `pytest --maxfail=1 --disable-warnings -q` |
227
+ | **Scrape** | `python -m reddit_analysis.scraper.scrape --date $DATE` |
228
+ | **Score** | `python -m reddit_analysis.inference.score --date $DATE` |
229
+ | **Summarize** | `python -m reddit_analysis.summarizer.summarize --date $DATE` |
230
+
231
+ *Trigger:* `cron: "0 23 * * *"` → 23:00 UTC (6 pm America/Chicago during daylight saving time) every day.
232
+
233
+ Secrets (`HF_TOKEN`, `REPLICATE_API_TOKEN`) are injected via **repository secrets** so the workflow can push to Hugging Face and call Replicate. The runner is completely stateless—every job starts on a fresh VM and writes data only to external storage (HF dataset).
234
+
235
+ ---
236
+
237
+ ## Monitoring with Grafana/Prometheus
238
+
239
+ Implemented a local lightweight Prometheus + Grafana stack; each pipeline stage pushes job_success and job_duration_seconds metrics. Dashboard surfaces run health & latency trends.
240
+
241
+ Example of success state:
242
+
243
+ ![Success](reddit_analysis/monitoring/dashboard_success.png)
244
+
245
+ Example of failure state:
246
+
247
+ ![Failure](reddit_analysis/monitoring/dashboard_failure.png)
248
+
249
+ ---
250
+
251
+ ## Extending / Customizing
252
+
253
+ * **Change subreddits** – edit the list in `config.yaml` or pass `--subreddits` to the scraper.
254
+ * **Swap sentiment models** – point `replicate_model` to any text‑classification model on Replicate with single‑sentence input.
255
+ * **Augment summaries** – create additional aggregator modules (e.g. keyword extraction) and add a new step in `daily.yml`.
256
+
app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Entry point for the Hugging Face Spaces application.
2
+
3
+ This tiny wrapper simply imports the Streamlit app defined in
4
+ `frontend/app.py`. Importing that module is enough to launch the UI
5
+ because the Streamlit code executes at import time.
6
+ """
7
+
8
+ # Importing `frontend.app` is sufficient to start the Streamlit app.
9
+ # The import is otherwise unused; the `noqa: F401` marker suppresses
10
+ # linters complaining about the unused import.
11
+ import frontend.app # noqa: F401 # pragma: no cover
config.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face repository configuration
2
+ repo_id: hblim/top_reddit_posts_daily
3
+ repo_type: dataset
4
+
5
+ # Inference configuration
6
+ replicate_model: halstonblim/distilbert-base-uncased-finetuned-sst-2-english:d1a897bcd8ebb23c5aab87317eee2d6c919cdc5cfbf9154140c5c2fb47344b8c
7
+ scored_dir: reddit_analysis/data/data_scored
8
+ hf_scored_dir: data_scored_subreddit
9
+ batch_size: 1024
10
+
11
+ # Scraper configuration
12
+ timezone: US/Central
13
+ raw_dir: reddit_analysis/data/data_raw
14
+ logs_dir: reddit_analysis/data/logs
15
+ hf_raw_dir: data_raw
16
+ push_to_hf: true
17
+ subreddits:
18
+ - name: artificial
19
+ post_limit: 100
20
+ comment_limit: 10
21
+ - name: LocalLLaMA
22
+ post_limit: 100
23
+ comment_limit: 10
24
+ - name: singularity
25
+ post_limit: 100
26
+ comment_limit: 10
27
+ - name: OpenAI
28
+ post_limit: 100
29
+ comment_limit: 10
frontend/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Frontend package exposing the Streamlit app for Hugging Face Spaces."""
frontend/app.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ import altair as alt
6
+ from datetime import date, timedelta, datetime
7
+
8
+ # Import from local modules
9
+ from data_utils import load_summary, load_day, get_subreddit_colors, get_last_updated_hf_caption
10
+ from text_analysis import keywords_for_df
11
+
12
+ st.set_page_config(page_title="Reddit Sentiment Trends", layout="wide")
13
+ st.title("Reddit Sentiment Monitor")
14
+
15
+
16
+ # ── Load & transform data ────────────────────────────────────────────────────
17
+ df = load_summary()
18
+ last_update_caption = get_last_updated_hf_caption()
19
+
20
+ # Get colors for each subreddit
21
+ subreddits = df["subreddit"].unique()
22
+ subreddit_colors = get_subreddit_colors(subreddits)
23
+
24
+ # Define time format to use across all charts
25
+ time_format = "%m/%d/%Y"
26
+
27
+ # Get date range from the dataset for the form
28
+ min_date = df["date"].min().date()
29
+ max_date = df["date"].max().date()
30
+
31
+ # ── Community weighted sentiment line chart for all subreddits ───────────────
32
+ st.subheader("Community Weighted Sentiment by Subreddit")
33
+
34
+ # Add date range selector for the time series
35
+ date_range = st.date_input(
36
+ "Select date range for time series",
37
+ (min_date, max_date),
38
+ min_value=min_date,
39
+ max_value=max_date
40
+ )
41
+ start_date, end_date = date_range
42
+ filtered_df = df[(df["date"].dt.date >= start_date) & (df["date"].dt.date <= end_date)]
43
+
44
+ # Add a multiselect widget for choosing which subreddits to display
45
+ selected_subs = st.multiselect(
46
+ "Select subreddits to display",
47
+ options=list(subreddits),
48
+ default=list(subreddits)
49
+ )
50
+ plot_df = filtered_df[filtered_df["subreddit"].isin(selected_subs)]
51
+
52
+ # Define hover selection for nearest point
53
+ nearest = alt.selection_single(
54
+ name="nearest",
55
+ on="mouseover",
56
+ nearest=True,
57
+ fields=["date"],
58
+ empty="none"
59
+ )
60
+
61
+ # Base chart for DRY encoding
62
+ base = alt.Chart(plot_df).encode(
63
+ x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format)),
64
+ y=alt.Y("community_weighted_sentiment:Q", title="Community Weighted Sentiment"),
65
+ color=alt.Color(
66
+ "subreddit:N",
67
+ scale=alt.Scale(domain=list(subreddits), range=list(subreddit_colors.values())),
68
+ legend=alt.Legend(
69
+ title="Subreddit",
70
+ orient="top",
71
+ direction="vertical",
72
+ columns=1
73
+ )
74
+ )
75
+ )
76
+
77
+ # Draw lines
78
+ line = base.mark_line()
79
+
80
+ # Invisible selectors to capture hover events
81
+ selectors = base.mark_point(opacity=0).add_selection(nearest)
82
+
83
+ # Draw highlighted points on hover
84
+ points_hover = base.mark_point(size=60).encode(
85
+ opacity=alt.condition(nearest, alt.value(1), alt.value(0))
86
+ )
87
+
88
+ # Tooltip rule and popup
89
+ tooltips = base.mark_rule(color="gray").encode(
90
+ tooltip=[
91
+ alt.Tooltip("subreddit:N", title="Subreddit"),
92
+ alt.Tooltip("date:T", title="Date", format=time_format),
93
+ alt.Tooltip("community_weighted_sentiment:Q", title="Sentiment", format=".2f")
94
+ ]
95
+ ).transform_filter(nearest)
96
+
97
+ # Layer everything and make interactive
98
+ hover_chart = alt.layer(line, selectors, points_hover, tooltips).properties(
99
+ height=300
100
+ ).interactive()
101
+
102
+ st.altair_chart(hover_chart, use_container_width=True)
103
+
104
+ # ── Bar chart for post counts by subreddit (side-by-side) ────────────────────
105
+ st.subheader("Daily Post Counts by Subreddit")
106
+
107
+ # Create grouped bar chart for post counts by date and subreddit
108
+ bar_chart = alt.Chart(df).mark_bar().encode(
109
+ x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format)),
110
+ y=alt.Y("count:Q", title="Post Count"),
111
+ xOffset="subreddit:N", # This creates the side-by-side grouping
112
+ color=alt.Color(
113
+ "subreddit:N",
114
+ scale=alt.Scale(domain=list(subreddits), range=list(subreddit_colors.values())),
115
+ legend=alt.Legend(title="Subreddit")
116
+ ),
117
+ tooltip=["date", "subreddit", "count"]
118
+ ).properties(height=300).interactive()
119
+
120
+ st.altair_chart(bar_chart, use_container_width=True)
121
+
122
+ # ── Latest metrics for each subreddit ─────────────────────────────────────────
123
+ st.subheader("Latest Metrics")
124
+
125
+ # Get the most recent data for each subreddit
126
+ latest_by_subreddit = df.sort_values("date").groupby("subreddit").last().reset_index()
127
+
128
+ # Display metrics in columns
129
+ cols = st.columns(len(latest_by_subreddit))
130
+ for i, (_, row) in enumerate(latest_by_subreddit.iterrows()):
131
+ with cols[i]:
132
+ st.markdown(f"**{row['subreddit']}**")
133
+ st.metric("Community Weighted", f"{row['community_weighted_sentiment']:.2f}")
134
+ st.metric("Posts", int(row["count"]))
135
+
136
+ # ── Analyze sentiment driving posts ─────────────────────────────────────
137
+ st.header("Analyze sentiment driving posts")
138
+ with st.form("analysis_form"):
139
+ col1, col2 = st.columns(2)
140
+ with col1:
141
+ selected_subreddit = st.selectbox("Select Subreddit", options=subreddits)
142
+ with col2:
143
+ selected_date = st.date_input(
144
+ "Select Date",
145
+ value=max_date,
146
+ min_value=min_date,
147
+ max_value=max_date
148
+ )
149
+ submit_button = st.form_submit_button("Analyze Posts")
150
+
151
+ if submit_button:
152
+ date_str = selected_date.strftime("%Y-%m-%d")
153
+ with st.spinner(f"Loading data for r/{selected_subreddit} on {date_str}..."):
154
+ posts_df = load_day(date_str, selected_subreddit)
155
+
156
+ if posts_df.empty:
157
+ st.error(f"No posts found for r/{selected_subreddit} on {date_str}")
158
+ else:
159
+ # Separate posts and comments
160
+ posts = posts_df[posts_df["type"] == "post"]
161
+ comments = posts_df[posts_df["type"] == "comment"]
162
+
163
+ # Overall summary metrics using engagement-adjusted sentiment (EAS)
164
+ n_posts = len(posts)
165
+ df_day = posts_df.copy()
166
+ df_day["score_num"] = pd.to_numeric(df_day["score"], errors="coerce").fillna(0)
167
+ weights_base_day = 1 + np.log1p(df_day["score_num"].clip(lower=0))
168
+ gamma_post = 0.3
169
+ weights_day = weights_base_day * np.where(df_day["type"] == "post", gamma_post, 1.0)
170
+ total_weight_day = weights_day.sum()
171
+ overall_eas = (weights_day * df_day["sentiment"]).sum() / weights_day.sum() if weights_day.sum() > 0 else 0
172
+ # Normalize daily weighted sentiment to range [-1,1]
173
+ overall_eas = 2 * overall_eas - 1
174
+ overall_score = df_day["score"].sum()
175
+
176
+ st.subheader(f"r/{selected_subreddit} on {date_str}")
177
+ c1, c2, c3 = st.columns(3)
178
+ c1.metric("Posts", n_posts)
179
+ c2.metric("Daily Weighted Sentiment, All Posts", f"{overall_eas:.2f}")
180
+ c3.metric("Total Score, All Posts", f"{overall_score:,}")
181
+
182
+ # Wrap analysis and rendering of top posts in a spinner
183
+ with st.spinner("Analyzing sentiment and rendering top posts..."):
184
+ # Build per-post analysis
185
+ analysis_rows = []
186
+ for _, post in posts.iterrows():
187
+ pid = post["post_id"]
188
+ text = post["text"]
189
+ # Gather comments for this post
190
+ post_comments = comments[comments["parent_id"] == f"t3_{pid}"]
191
+
192
+ # Combine post and comments for calculations
193
+ segment = pd.concat([pd.DataFrame([post]), post_comments], ignore_index=True)
194
+ # Compute engagement-adjusted sentiment for this post thread
195
+ segment_score_num = pd.to_numeric(segment["score"], errors="coerce").fillna(0)
196
+ weights_base = 1 + np.log1p(segment_score_num.clip(lower=0))
197
+ gamma_post = 0.3
198
+ weights_seg = weights_base * np.where(segment["type"] == "post", gamma_post, 1.0)
199
+ ws = (weights_seg * segment["sentiment"]).sum() / weights_seg.sum() if weights_seg.sum() > 0 else 0
200
+ # Normalize weighted sentiment of thread to range [-1,1]
201
+ ws = 2 * ws - 1
202
+ ts = segment["score"].sum()
203
+ nc = len(post_comments)
204
+
205
+ thread_weight_sum = weights_seg.sum()
206
+ contrib_weight = thread_weight_sum / total_weight_day if total_weight_day > 0 else 0
207
+ total_contribution = contrib_weight * ws
208
+
209
+ analysis_rows.append({
210
+ "post_id": pid,
211
+ "Post Keywords": "", # placeholder; will compute for top posts only
212
+ "Weighted Sentiment of Thread": ws,
213
+ "Contribution Weight": contrib_weight,
214
+ "Total Sentiment Contribution": total_contribution,
215
+ "# Comments": nc,
216
+ "Total Score": ts
217
+ })
218
+
219
+ analysis_df = pd.DataFrame(analysis_rows)
220
+ # Determine top 5 posts by contribution weight
221
+ top5 = analysis_df.sort_values("Contribution Weight", ascending=False).head(5).copy()
222
+ top5.reset_index(drop=True, inplace=True)
223
+
224
+ # Compute keywords only for top posts
225
+ for idx, row in top5.iterrows():
226
+ pid = row["post_id"]
227
+ post_text = posts[posts["post_id"] == pid].iloc[0]["text"]
228
+ kw = keywords_for_df(pd.DataFrame({"text": [post_text]}), top_n=2)
229
+ keywords_list = [k for k, _ in kw][:2]
230
+ top5.at[idx, "Post Keywords"] = ", ".join(keywords_list)
231
+
232
+ # Format numeric columns
233
+ for df_part in (top5,):
234
+ df_part["Weighted Sentiment of Thread"] = df_part["Weighted Sentiment of Thread"].map("{:.2f}".format)
235
+ df_part["Total Score"] = df_part["Total Score"].map("{:,}".format)
236
+ df_part["Contribution Weight"] = df_part["Contribution Weight"].map("{:.2%}".format)
237
+ df_part["Total Sentiment Contribution"] = df_part["Total Sentiment Contribution"].map("{:.4f}".format)
238
+
239
+ st.subheader("Top 5 Posts by Contribution Weight")
240
+ st.dataframe(
241
+ top5[["Post Keywords", "Weighted Sentiment of Thread", "Contribution Weight", "Total Sentiment Contribution", "# Comments", "Total Score"]],
242
+ use_container_width=True
243
+ )
244
+
245
+ st.subheader("Post Details")
246
+ for idx, row in top5.reset_index(drop=True).iterrows():
247
+ pid = row["post_id"]
248
+ post_obj = posts[posts["post_id"] == pid].iloc[0]
249
+ post_text = post_obj["text"]
250
+
251
+ with st.expander(f"{idx} - {post_text.split('\\n')[0][:50]}..."):
252
+ # Post Metrics
253
+ post_sent = post_obj["sentiment"]
254
+ # Normalize post sentiment to [-1,1]
255
+ post_sent_norm = 2 * post_sent - 1
256
+ post_score = post_obj["score"]
257
+ ps = pd.to_numeric(post_score, errors="coerce")
258
+ post_score_num = ps if (not np.isnan(ps) and ps >= 0) else 0
259
+ # Compute post weight
260
+ post_weight = (1 + np.log1p(post_score_num)) * gamma_post
261
+ st.markdown("**Post:**")
262
+ st.markdown(f"{post_text[:300]}{'...' if len(post_text) > 300 else ''}"
263
+ f"(Sentiment: {post_sent_norm:.2f}, Weight: {post_weight:.2f}, Score: {post_score:,})"
264
+ )
265
+ st.markdown("---")
266
+ # Display top 5 comments with metrics
267
+ top_comments = (
268
+ comments[comments["parent_id"] == f"t3_{pid}"]
269
+ .sort_values("score", ascending=False)
270
+ .head(5)
271
+ )
272
+ st.markdown("**Top Comments:**")
273
+ for c_idx, comment in top_comments.iterrows():
274
+ c_text = comment["text"]
275
+ # Normalize comment sentiment and compute weight
276
+ c_sent_norm = 2 * comment["sentiment"] - 1
277
+ c_score = comment["score"]
278
+ cs = pd.to_numeric(c_score, errors="coerce")
279
+ c_score_num = cs if (not np.isnan(cs) and cs >= 0) else 0
280
+ c_weight = 1 + np.log1p(c_score_num)
281
+ st.markdown(
282
+ f"{c_idx}. {c_text[:200]}{'...' if len(c_text) > 200 else ''} "
283
+ f"(Sentiment: {c_sent_norm:.2f}, Weight: {c_weight:.2f}, Score: {c_score:,})"
284
+ )
285
+
286
+ # Display the data source attribution
287
+ st.markdown(last_update_caption, unsafe_allow_html=True)
frontend/data_utils.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations
from pathlib import Path
import os
import yaml
import pandas as pd
import numpy as np
from huggingface_hub import HfApi
from datetime import datetime, timezone
import re

# Root directory of the project (two levels up from this file: frontend/..)
ROOT = Path(__file__).resolve().parent.parent

# Detect Streamlit runtime: streamlit is optional at import time and the
# flag below gates secret lookup and dotenv loading.
try:
    import streamlit as st
    has_streamlit = True
except ImportError:
    has_streamlit = False

# Load environment variables when running locally (explicit ENV=local or
# any non-Streamlit context).
if os.getenv("ENV") == "local" or not has_streamlit:
    from dotenv import load_dotenv
    load_dotenv(ROOT / ".env")

# Read Hugging Face dataset repo ID from config.yaml at the project root.
with open(ROOT / "config.yaml") as f:
    cfg = yaml.safe_load(f)
REPO_ID: str = cfg["repo_id"]

# Initialize Hugging Face API client (unauthenticated by default)
api = HfApi()

# URL for the summary CSV in the dataset.
# NOTE(review): not referenced elsewhere in this module — load_summary()
# downloads via the Hub API instead; confirm whether this is still needed.
CSV_URL = (
    f"https://huggingface.co/datasets/{REPO_ID}/resolve/main/subreddit_daily_summary.csv"
)
38
+
39
+
40
+ def get_secret(key: str, default=None) -> str | None:
41
+ """Fetch a secret from environment variables or Streamlit secrets."""
42
+ val = os.getenv(key)
43
+ if val is None and has_streamlit:
44
+ val = st.secrets.get(key, default)
45
+ return val
46
+
47
+
48
# NOTE(review): redundant re-import — streamlit was already imported inside
# the guarded try/except above. This unconditional import defeats the
# has_streamlit fallback: if streamlit is absent the module now crashes here
# (and again at the @st.cache_data decorators below). Confirm whether it can
# simply be removed.
import streamlit as st
49
+
50
@st.cache_data(ttl=6000, show_spinner=False)
def load_summary() -> pd.DataFrame:
    """Download the subreddit daily summary via the HF Hub API.

    Cached by Streamlit (ttl=6000 seconds).

    Returns:
        DataFrame with at least the columns date, subreddit, mean_sentiment,
        community_weighted_sentiment and count.

    Raises:
        ValueError: if any of the required columns is absent from the CSV.
    """
    # Download through the Hub API rather than the raw resolve URL.
    csv_path = api.hf_hub_download(
        repo_id=REPO_ID,
        filename="subreddit_daily_summary.csv",
        repo_type="dataset"
    )
    frame = pd.read_csv(csv_path, parse_dates=["date"])

    # Validate the schema before handing the frame to the UI layer.
    required = {"date", "subreddit", "mean_sentiment", "community_weighted_sentiment", "count"}
    missing = required - set(frame.columns)
    if missing:
        raise ValueError(f"Missing columns in summary CSV: {missing}")
    return frame
65
+
66
+
67
+ def _sanitize(name: str) -> str:
68
+ """
69
+ Make subreddit safe for filenames (removes slashes, spaces, etc.).
70
+ """
71
+ name = name.strip().lower()
72
+ name = re.sub(r"[^\w\-\.]", "_", name)
73
+ return name
74
+
75
+
76
@st.cache_data(show_spinner=False, ttl=60*60)
def load_day(date: str, subreddit: str) -> pd.DataFrame:
    """Lazy-download the parquet shard for one YYYY-MM-DD and return df slice.

    Args:
        date: Date string in YYYY-MM-DD format
        subreddit: Subreddit name to filter by

    Returns:
        DataFrame containing posts from the specified subreddit on the given day
    """
    # Shards are partitioned per (day, subreddit) with sanitized names,
    # so the downloaded file already holds only the requested slice.
    shard = f"data_scored_subreddit/{date}__{_sanitize(subreddit)}.parquet"
    local_path = api.hf_hub_download(REPO_ID, shard, repo_type="dataset")
    frame = pd.read_parquet(local_path)
    return frame.reset_index(drop=True)
94
+
95
+
96
def get_last_updated_hf(repo_id: str) -> datetime:
    """
    Retrieve the dataset repo's last modified datetime via HF Hub API.
    Returns a timezone-aware datetime in UTC.

    Args:
        repo_id: Hugging Face dataset repository ID.

    Returns:
        The repo's lastModified timestamp, always timezone-aware in UTC.
    """
    info = api.repo_info(repo_id=repo_id, repo_type="dataset")
    dt: datetime = info.lastModified  # already a datetime object
    if dt.tzinfo is None:
        # Bug fix: a naive timestamp was previously returned unchanged,
        # contradicting the documented contract. Naive values from the API
        # are assumed to be UTC — TODO confirm against huggingface_hub docs.
        dt = dt.replace(tzinfo=timezone.utc)
    else:
        dt = dt.astimezone(timezone.utc)
    return dt
106
+
107
+
108
def get_last_updated_hf_caption() -> str:
    """
    Build a markdown-formatted caption string showing the dataset source and last update.
    Uses REPO_ID and the HF Hub API to fetch the timestamp.
    """
    # Link target and human-readable refresh time
    url = f"https://huggingface.co/datasets/{REPO_ID}"
    stamp = get_last_updated_hf(REPO_ID).strftime("%Y-%m-%d %H:%M:%S UTC")

    # Small-print HTML caption rendered via st.markdown(unsafe_allow_html=True)
    return (
        "<small>"
        f"Data source: <a href='{url}' target='_blank'>{REPO_ID}</a> &bull; "
        f"Last updated: {stamp}"
        "</small>"
    )
125
+
126
+
127
def add_rolling(df: pd.DataFrame, window: int = 7) -> pd.DataFrame:
    """Add a rolling mean for community_weighted_sentiment over the specified window.

    The mean is computed per subreddit over date-sorted rows and written to
    a new column named roll_<window>. The input frame is not modified.
    """
    result = df.copy()
    column = f"roll_{window}"
    for _, group in result.groupby("subreddit"):
        ordered = group.sort_values("date")
        rolling_mean = ordered["community_weighted_sentiment"].rolling(window).mean()
        result.loc[ordered.index, column] = rolling_mean
    return result
135
+
136
+
137
def get_subreddit_colors(subreddits: list[str]) -> dict[str, str]:
    """Provide a consistent color map for each subreddit.

    Subreddits are sorted so the assignment is stable across calls; the
    palette wraps around when more than eight subreddits are given.
    """
    colors = [
        "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
        "#9467bd", "#8c564b", "#e377c2", "#7f7f7f",
    ]
    mapping: dict[str, str] = {}
    for position, name in enumerate(sorted(subreddits)):
        mapping[name] = colors[position % len(colors)]
    return mapping
frontend/text_analysis.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text analysis utilities for Reddit content insights.
3
+ Provides keyword extraction and similarity matching functions.
4
+ """
5
+ import pandas as pd
6
+ import spacy
7
+ from sentence_transformers import SentenceTransformer
8
+ from keybert import KeyBERT
9
+
10
+ # Initialize spaCy and sentence transformer models
11
+ try:
12
+ nlp = spacy.load("en_core_web_sm")
13
+ except OSError:
14
+ import streamlit as st
15
+ with st.spinner("Downloading NLP model (first run only)..."):
16
+ from spacy.cli import download
17
+ download("en_core_web_sm")
18
+ nlp = spacy.load("en_core_web_sm")
19
+
20
+ # Cache models at module scope for reuse
21
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
22
+ kw_model = KeyBERT(embedder)
23
+
24
def keywords_for_df(df: pd.DataFrame, top_n=5):
    """
    Extract keywords from a DataFrame containing Reddit posts.

    Args:
        df: DataFrame with a 'text' column containing post content
        top_n: Number of top keywords to return

    Returns:
        List of (keyword, score) tuples
    """
    if df.empty:
        return []

    # Concatenate every post's text into one lowercase document
    corpus = " ".join(df["text"].astype(str))
    parsed = nlp(corpus.lower())

    # Candidate phrases: noun chunks plus a few entity types of interest
    chunks = [chunk.text for chunk in parsed.noun_chunks]
    entities = [
        ent.text
        for ent in parsed.ents
        if ent.label_ in {"PRODUCT", "EVENT", "ORG", "GPE"}
    ]
    candidates = " ".join(chunks + entities)

    # Blank out common moderator/boilerplate terms before extraction.
    # Note: plain substring replacement, so these also blank matches
    # inside longer words — intentional quick filter, kept as-is.
    for term in ['blog','topic','locked','author','moderator','error','bot','comments','archive','support','discord']:
        candidates = candidates.replace(term, " ")

    # KeyBERT extraction with MMR to keep the phrases diverse
    return kw_model.extract_keywords(
        candidates,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        use_mmr=True,
        diversity=0.8,
        top_n=top_n
    )
+ )
notebooks/keyword_extraction.ipynb ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9f7cc561-f375-4cd6-953f-65af221bc1ae",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Keyword Extraction Analysis\n",
9
+ "Analyze buzzwords driving sentiment on any given day"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 461,
15
+ "id": "59e2493c-10e5-402f-9875-d07d989cd451",
16
+ "metadata": {
17
+ "execution": {
18
+ "iopub.execute_input": "2025-04-30T02:14:47.665748Z",
19
+ "iopub.status.busy": "2025-04-30T02:14:47.665748Z",
20
+ "iopub.status.idle": "2025-04-30T02:14:47.674765Z",
21
+ "shell.execute_reply": "2025-04-30T02:14:47.673749Z",
22
+ "shell.execute_reply.started": "2025-04-30T02:14:47.665748Z"
23
+ }
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "import os\n",
28
+ "import pandas as pd\n",
29
+ "import numpy as np"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "markdown",
34
+ "id": "d5b34940-8c46-421b-b00b-0badaca194fc",
35
+ "metadata": {},
36
+ "source": [
37
+ "### Download data from HF Hub"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 462,
43
+ "id": "03be6fde-e68b-4026-8f90-cd6f5d5f21db",
44
+ "metadata": {
45
+ "execution": {
46
+ "iopub.execute_input": "2025-04-30T02:14:49.167723Z",
47
+ "iopub.status.busy": "2025-04-30T02:14:49.167723Z",
48
+ "iopub.status.idle": "2025-04-30T02:14:50.725451Z",
49
+ "shell.execute_reply": "2025-04-30T02:14:50.725451Z",
50
+ "shell.execute_reply.started": "2025-04-30T02:14:49.167723Z"
51
+ }
52
+ },
53
+ "outputs": [
54
+ {
55
+ "name": "stdout",
56
+ "output_type": "stream",
57
+ "text": [
58
+ "Total records across 16 days: 4520\n"
59
+ ]
60
+ }
61
+ ],
62
+ "source": [
63
+ "from huggingface_hub import HfApi\n",
64
+ "\n",
65
+ "api = HfApi()\n",
66
+ "all_files = api.list_repo_files(\"hblim/top_reddit_posts_daily\", repo_type=\"dataset\")\n",
67
+ "parquet_files = sorted([f for f in all_files if f.startswith('data_scored') and f.endswith(\".parquet\")])\n",
68
+ "\n",
69
+ "df = []\n",
70
+ "for shard in parquet_files:\n",
71
+ " local_path = api.hf_hub_download(repo_id=\"hblim/top_reddit_posts_daily\", filename=shard, repo_type=\"dataset\")\n",
72
+ " file_date = os.path.splitext(os.path.basename(local_path))[0]\n",
73
+ " df.append(pd.read_parquet(local_path).assign(filedate=file_date))\n",
74
+ "df = pd.concat(df, ignore_index=True)\n",
75
+ "print(f\"Total records across {df.filedate.nunique()} days: {len(df)}\")"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 464,
81
+ "id": "dbcaf06b-9c29-4913-b3a9-938017eb6ffd",
82
+ "metadata": {
83
+ "execution": {
84
+ "iopub.execute_input": "2025-04-30T02:14:51.655177Z",
85
+ "iopub.status.busy": "2025-04-30T02:14:51.654664Z",
86
+ "iopub.status.idle": "2025-04-30T02:14:51.669190Z",
87
+ "shell.execute_reply": "2025-04-30T02:14:51.669190Z",
88
+ "shell.execute_reply.started": "2025-04-30T02:14:51.655177Z"
89
+ }
90
+ },
91
+ "outputs": [
92
+ {
93
+ "data": {
94
+ "text/html": [
95
+ "<div>\n",
96
+ "<style scoped>\n",
97
+ " .dataframe tbody tr th:only-of-type {\n",
98
+ " vertical-align: middle;\n",
99
+ " }\n",
100
+ "\n",
101
+ " .dataframe tbody tr th {\n",
102
+ " vertical-align: top;\n",
103
+ " }\n",
104
+ "\n",
105
+ " .dataframe thead th {\n",
106
+ " text-align: right;\n",
107
+ " }\n",
108
+ "</style>\n",
109
+ "<table border=\"1\" class=\"dataframe\">\n",
110
+ " <thead>\n",
111
+ " <tr style=\"text-align: right;\">\n",
112
+ " <th></th>\n",
113
+ " <th>subreddit</th>\n",
114
+ " <th>created_at</th>\n",
115
+ " <th>retrieved_at</th>\n",
116
+ " <th>type</th>\n",
117
+ " <th>text</th>\n",
118
+ " <th>score</th>\n",
119
+ " <th>post_id</th>\n",
120
+ " <th>parent_id</th>\n",
121
+ " <th>sentiment</th>\n",
122
+ " <th>confidence</th>\n",
123
+ " <th>filedate</th>\n",
124
+ " </tr>\n",
125
+ " </thead>\n",
126
+ " <tbody>\n",
127
+ " <tr>\n",
128
+ " <th>0</th>\n",
129
+ " <td>apple</td>\n",
130
+ " <td>2025-04-14 11:19:50-05:00</td>\n",
131
+ " <td>2025-04-14 23:44:27.136181-05:00</td>\n",
132
+ " <td>post</td>\n",
133
+ " <td>iPhone 16e Helps Apple Take Q1 2025 Top Spot in Global Smartphone Market\\n\\n</td>\n",
134
+ " <td>655</td>\n",
135
+ " <td>1jz2xrw</td>\n",
136
+ " <td>None</td>\n",
137
+ " <td>1</td>\n",
138
+ " <td>0.9971</td>\n",
139
+ " <td>2025-04-14</td>\n",
140
+ " </tr>\n",
141
+ " <tr>\n",
142
+ " <th>1</th>\n",
143
+ " <td>apple</td>\n",
144
+ " <td>2025-04-14 11:00:16-05:00</td>\n",
145
+ " <td>2025-04-14 23:44:27.136181-05:00</td>\n",
146
+ " <td>comment</td>\n",
147
+ " <td>I've closed all rings every day starting on June 19 2015. This won't be a problem as long as I don't get run over or die.</td>\n",
148
+ " <td>9</td>\n",
149
+ " <td>mn2wpoi</td>\n",
150
+ " <td>t3_1jyzp05</td>\n",
151
+ " <td>1</td>\n",
152
+ " <td>0.9965</td>\n",
153
+ " <td>2025-04-14</td>\n",
154
+ " </tr>\n",
155
+ " <tr>\n",
156
+ " <th>2</th>\n",
157
+ " <td>apple</td>\n",
158
+ " <td>2025-04-14 11:59:56-05:00</td>\n",
159
+ " <td>2025-04-14 23:44:27.136181-05:00</td>\n",
160
+ " <td>post</td>\n",
161
+ " <td>Smartphone tariffs are coming back in ‘a month or two,’ says Trump admin\\n\\n</td>\n",
162
+ " <td>194</td>\n",
163
+ " <td>1jz3wsi</td>\n",
164
+ " <td>None</td>\n",
165
+ " <td>0</td>\n",
166
+ " <td>0.9829</td>\n",
167
+ " <td>2025-04-14</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>3</th>\n",
171
+ " <td>apple</td>\n",
172
+ " <td>2025-04-14 11:59:56-05:00</td>\n",
173
+ " <td>2025-04-14 23:44:27.136181-05:00</td>\n",
174
+ " <td>comment</td>\n",
175
+ " <td>This topic has been automatically locked due to being controversial and/or political by nature. However, the submission itself will remain accessible as long as it is related to Apple.\\n\\n\\nThis decision was made by a bot based on specific keywords. If you feel that this was in error, please report it to the moderators so that it can be reviewed.\\n \\n\\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/apple) if you have any questions or concerns.*</td>\n",
176
+ " <td>1</td>\n",
177
+ " <td>mn38mac</td>\n",
178
+ " <td>t3_1jz3wsi</td>\n",
179
+ " <td>0</td>\n",
180
+ " <td>0.9972</td>\n",
181
+ " <td>2025-04-14</td>\n",
182
+ " </tr>\n",
183
+ " <tr>\n",
184
+ " <th>4</th>\n",
185
+ " <td>apple</td>\n",
186
+ " <td>2025-04-14 18:04:42-05:00</td>\n",
187
+ " <td>2025-04-14 23:44:27.136181-05:00</td>\n",
188
+ " <td>post</td>\n",
189
+ " <td>Apple to Analyze User Data on Devices to Bolster AI Technology\\n\\n</td>\n",
190
+ " <td>69</td>\n",
191
+ " <td>1jzcpwz</td>\n",
192
+ " <td>None</td>\n",
193
+ " <td>1</td>\n",
194
+ " <td>0.9976</td>\n",
195
+ " <td>2025-04-14</td>\n",
196
+ " </tr>\n",
197
+ " </tbody>\n",
198
+ "</table>\n",
199
+ "</div>"
200
+ ],
201
+ "text/plain": [
202
+ " subreddit created_at retrieved_at \\\n",
203
+ "0 apple 2025-04-14 11:19:50-05:00 2025-04-14 23:44:27.136181-05:00 \n",
204
+ "1 apple 2025-04-14 11:00:16-05:00 2025-04-14 23:44:27.136181-05:00 \n",
205
+ "2 apple 2025-04-14 11:59:56-05:00 2025-04-14 23:44:27.136181-05:00 \n",
206
+ "3 apple 2025-04-14 11:59:56-05:00 2025-04-14 23:44:27.136181-05:00 \n",
207
+ "4 apple 2025-04-14 18:04:42-05:00 2025-04-14 23:44:27.136181-05:00 \n",
208
+ "\n",
209
+ " type \\\n",
210
+ "0 post \n",
211
+ "1 comment \n",
212
+ "2 post \n",
213
+ "3 comment \n",
214
+ "4 post \n",
215
+ "\n",
216
+ " text \\\n",
217
+ "0 iPhone 16e Helps Apple Take Q1 2025 Top Spot in Global Smartphone Market\\n\\n \n",
218
+ "1 I've closed all rings every day starting on June 19 2015. This won't be a problem as long as I don't get run over or die. \n",
219
+ "2 Smartphone tariffs are coming back in ‘a month or two,’ says Trump admin\\n\\n \n",
220
+ "3 This topic has been automatically locked due to being controversial and/or political by nature. However, the submission itself will remain accessible as long as it is related to Apple.\\n\\n\\nThis decision was made by a bot based on specific keywords. If you feel that this was in error, please report it to the moderators so that it can be reviewed.\\n \\n\\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/apple) if you have any questions or concerns.* \n",
221
+ "4 Apple to Analyze User Data on Devices to Bolster AI Technology\\n\\n \n",
222
+ "\n",
223
+ " score post_id parent_id sentiment confidence filedate \n",
224
+ "0 655 1jz2xrw None 1 0.9971 2025-04-14 \n",
225
+ "1 9 mn2wpoi t3_1jyzp05 1 0.9965 2025-04-14 \n",
226
+ "2 194 1jz3wsi None 0 0.9829 2025-04-14 \n",
227
+ "3 1 mn38mac t3_1jz3wsi 0 0.9972 2025-04-14 \n",
228
+ "4 69 1jzcpwz None 1 0.9976 2025-04-14 "
229
+ ]
230
+ },
231
+ "execution_count": 464,
232
+ "metadata": {},
233
+ "output_type": "execute_result"
234
+ }
235
+ ],
236
+ "source": [
237
+ "df.head()"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "markdown",
242
+ "id": "958f8e47-37c9-4d53-9d20-41a29a0c2714",
243
+ "metadata": {},
244
+ "source": [
245
+ "### Look at specific subreddit, date"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 562,
251
+ "id": "9f20145f-1ae0-4fa5-ac99-d00d9909cd76",
252
+ "metadata": {
253
+ "execution": {
254
+ "iopub.execute_input": "2025-04-30T02:54:23.431576Z",
255
+ "iopub.status.busy": "2025-04-30T02:54:23.431576Z",
256
+ "iopub.status.idle": "2025-04-30T02:54:23.436791Z",
257
+ "shell.execute_reply": "2025-04-30T02:54:23.436791Z",
258
+ "shell.execute_reply.started": "2025-04-30T02:54:23.431576Z"
259
+ }
260
+ },
261
+ "outputs": [],
262
+ "source": [
263
+ "# 1. Filter your dataframe\n",
264
+ "date = '2025-04-14'\n",
265
+ "subreddit = 'apple'\n",
266
+ "day_sub = (df['filedate'] == date) & (df['subreddit'] == subreddit) "
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": 589,
272
+ "id": "2c286057-13db-49f4-a5d0-178a5d004b53",
273
+ "metadata": {
274
+ "execution": {
275
+ "iopub.execute_input": "2025-04-30T03:24:38.658484Z",
276
+ "iopub.status.busy": "2025-04-30T03:24:38.658484Z",
277
+ "iopub.status.idle": "2025-04-30T03:24:38.669335Z",
278
+ "shell.execute_reply": "2025-04-30T03:24:38.669335Z",
279
+ "shell.execute_reply.started": "2025-04-30T03:24:38.658484Z"
280
+ }
281
+ },
282
+ "outputs": [
283
+ {
284
+ "name": "stdout",
285
+ "output_type": "stream",
286
+ "text": [
287
+ "Daily aggregated sentiment stats\n",
288
+ "Community Weighted Sentiment = 0.3353147742165998\n",
289
+ "Average Sentiment = 0.43902439024390244\n"
290
+ ]
291
+ }
292
+ ],
293
+ "source": [
294
+ "dftest = df[day_sub]\n",
295
+ "print(\"Daily aggregated sentiment stats\")\n",
296
+ "print(\"Community Weighted Sentiment =\",((2 * dftest['sentiment'] - 1) * np.log1p(dftest['score'].clip(0,None))).mean())\n",
297
+ "print(\"Average Sentiment =\",dftest['sentiment'].mean())\n",
298
+ "# dftest.sort_values('score',ascending=False)\n",
299
+ "# dftest.groupby('parent_id').agg({'sentiment': ['mean','sum','count']})"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "markdown",
304
+ "id": "85d4797e-c9dd-45b0-b72f-cb8d279f5ebc",
305
+ "metadata": {},
306
+ "source": [
307
+ "### Use KeyBERT and sentiment transformers model to extract keywords"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": 587,
313
+ "id": "8c0a4f6a-d34a-4397-add9-1d68e670eaf7",
314
+ "metadata": {
315
+ "execution": {
316
+ "iopub.execute_input": "2025-04-30T02:59:03.237586Z",
317
+ "iopub.status.busy": "2025-04-30T02:59:03.237586Z",
318
+ "iopub.status.idle": "2025-04-30T02:59:05.388763Z",
319
+ "shell.execute_reply": "2025-04-30T02:59:05.388763Z",
320
+ "shell.execute_reply.started": "2025-04-30T02:59:03.237586Z"
321
+ },
322
+ "scrolled": true
323
+ },
324
+ "outputs": [
325
+ {
326
+ "name": "stdout",
327
+ "output_type": "stream",
328
+ "text": [
329
+ "[('smartphone market', 0.5024), ('command thanks universal', 0.0662), ('dimensions leak years', 0.0196), ('wwdc non paywall', -0.0052), ('animations new techniques', -0.0178)]\n"
330
+ ]
331
+ }
332
+ ],
333
+ "source": [
334
+ "from keybert import KeyBERT\n",
335
+ "from sentence_transformers import SentenceTransformer\n",
336
+ "import spacy\n",
337
+ "\n",
338
+ "raw_text = \" \".join(df.loc[day_sub, 'text'].astype(str))\n",
339
+ "\n",
340
+ "# 2. Load spaCy with parser enabled for noun_chunks\n",
341
+ "nlp = spacy.load(\"en_core_web_sm\") # keep the parser on\n",
342
+ "doc = nlp(raw_text.lower())\n",
343
+ "\n",
344
+ "# 3. Build candidate phrases\n",
345
+ "candidates = \" \".join(\n",
346
+ " [chunk.text for chunk in doc.noun_chunks]\n",
347
+ " + [ent.text for ent in doc.ents if ent.label_ in {\"PRODUCT\",\"EVENT\",}]\n",
348
+ ")\n",
349
+ "\n",
350
+ "for exclude in ['google','pixel','android','apple','rationale','advice','blog','topic','locked','author','moderator','error','bot','comments','archive','support','discord']:\n",
351
+ " candidates = candidates.replace(exclude,' ')\n",
352
+ "\n",
353
+ "# 4. Keyword extraction with local embeddings\n",
354
+ "model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
355
+ "kw_model = KeyBERT(model)\n",
356
+ "keywords = kw_model.extract_keywords(\n",
357
+ " candidates,\n",
358
+ " keyphrase_ngram_range=(1, 3),\n",
359
+ " stop_words=\"english\",\n",
360
+ " use_mmr=True,\n",
361
+ " diversity=0.9,\n",
362
+ " top_n=5\n",
363
+ ")\n",
364
+ "\n",
365
+ "print(keywords)\n"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "markdown",
370
+ "id": "3ea178ad-676c-4362-b817-e66563dce6de",
371
+ "metadata": {},
372
+ "source": [
373
+ "### Ensure keywords actually match to posts or comments based on cosine similarity"
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "code",
378
+ "execution_count": 591,
379
+ "id": "63e1b61f-cdcb-4c5a-9024-531e26eee495",
380
+ "metadata": {
381
+ "execution": {
382
+ "iopub.execute_input": "2025-04-30T03:27:16.957826Z",
383
+ "iopub.status.busy": "2025-04-30T03:27:16.957826Z",
384
+ "iopub.status.idle": "2025-04-30T03:27:17.668631Z",
385
+ "shell.execute_reply": "2025-04-30T03:27:17.667592Z",
386
+ "shell.execute_reply.started": "2025-04-30T03:27:16.957826Z"
387
+ }
388
+ },
389
+ "outputs": [
390
+ {
391
+ "data": {
392
+ "text/html": [
393
+ "<div>\n",
394
+ "<style scoped>\n",
395
+ " .dataframe tbody tr th:only-of-type {\n",
396
+ " vertical-align: middle;\n",
397
+ " }\n",
398
+ "\n",
399
+ " .dataframe tbody tr th {\n",
400
+ " vertical-align: top;\n",
401
+ " }\n",
402
+ "\n",
403
+ " .dataframe thead th {\n",
404
+ " text-align: right;\n",
405
+ " }\n",
406
+ "</style>\n",
407
+ "<table border=\"1\" class=\"dataframe\">\n",
408
+ " <thead>\n",
409
+ " <tr style=\"text-align: right;\">\n",
410
+ " <th></th>\n",
411
+ " <th>keyword</th>\n",
412
+ " <th>mean_sentiment</th>\n",
413
+ " <th>community_weighted_sentiment</th>\n",
414
+ " <th>n_posts</th>\n",
415
+ " <th>total_score</th>\n",
416
+ " </tr>\n",
417
+ " </thead>\n",
418
+ " <tbody>\n",
419
+ " <tr>\n",
420
+ " <th>0</th>\n",
421
+ " <td>smartphone market</td>\n",
422
+ " <td>-0.076923</td>\n",
423
+ " <td>0.841451</td>\n",
424
+ " <td>13</td>\n",
425
+ " <td>2798</td>\n",
426
+ " </tr>\n",
427
+ " <tr>\n",
428
+ " <th>1</th>\n",
429
+ " <td>dimensions leak years</td>\n",
430
+ " <td>-1.000000</td>\n",
431
+ " <td>-5.939423</td>\n",
432
+ " <td>2</td>\n",
433
+ " <td>804</td>\n",
434
+ " </tr>\n",
435
+ " <tr>\n",
436
+ " <th>2</th>\n",
437
+ " <td>animations new techniques</td>\n",
438
+ " <td>1.000000</td>\n",
439
+ " <td>2.944439</td>\n",
440
+ " <td>1</td>\n",
441
+ " <td>18</td>\n",
442
+ " </tr>\n",
443
+ " <tr>\n",
444
+ " <th>3</th>\n",
445
+ " <td>wwdc non paywall</td>\n",
446
+ " <td>-1.000000</td>\n",
447
+ " <td>-2.397895</td>\n",
448
+ " <td>1</td>\n",
449
+ " <td>10</td>\n",
450
+ " </tr>\n",
451
+ " </tbody>\n",
452
+ "</table>\n",
453
+ "</div>"
454
+ ],
455
+ "text/plain": [
456
+ " keyword mean_sentiment community_weighted_sentiment \\\n",
457
+ "0 smartphone market -0.076923 0.841451 \n",
458
+ "1 dimensions leak years -1.000000 -5.939423 \n",
459
+ "2 animations new techniques 1.000000 2.944439 \n",
460
+ "3 wwdc non paywall -1.000000 -2.397895 \n",
461
+ "\n",
462
+ " n_posts total_score \n",
463
+ "0 13 2798 \n",
464
+ "1 2 804 \n",
465
+ "2 1 18 \n",
466
+ "3 1 10 "
467
+ ]
468
+ },
469
+ "execution_count": 591,
470
+ "metadata": {},
471
+ "output_type": "execute_result"
472
+ }
473
+ ],
474
+ "source": [
475
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
476
+ "\n",
477
+ "# 1) Precompute embeddings for all texts in your day/subreddit slice\n",
478
+ "texts = df.loc[day_sub, 'text'].tolist()\n",
479
+ "text_embs = model.encode(texts, convert_to_tensor=False) # shape: (n_texts, 384)\n",
480
+ "\n",
481
+ "results = []\n",
482
+ "subsets = {}\n",
483
+ "# if you only want to test on a single kw, iterate keywords_test instead\n",
484
+ "for kw, _score in keywords: \n",
485
+ " # kw is now a string\n",
486
+ " kw_emb = model.encode(kw, convert_to_tensor=False) # shape: (384,)\n",
487
+ " kw_emb = kw_emb.reshape(1, -1) # shape: (1, 384)\n",
488
+ " \n",
489
+ " sims = cosine_similarity(text_embs, kw_emb).flatten() # OK: (n_texts,) vs (1,384)\n",
490
+ " \n",
491
+ " # rank or threshold as before\n",
492
+ " hits = df.loc[day_sub].iloc[sims.argsort()[::-1]]\n",
493
+ " mask = sims >= 0.3\n",
494
+ " \n",
495
+ " subset = df.loc[day_sub].iloc[mask]\n",
496
+ " if subset.empty:\n",
497
+ " continue\n",
498
+ " subsets[kw] = subset\n",
499
+ " \n",
500
+ " # compute sentiment stats on subset…\n",
501
+ " mean_sent = 2 * subset['sentiment'].mean() - 1\n",
502
+ " weighted = ((2 * subset['sentiment'] - 1) * np.log1p(subset['score'].clip(0,None))).mean()\n",
503
+ " total_score = subset['score'].sum()\n",
504
+ " results.append((kw, mean_sent, weighted, len(subset), total_score))\n",
505
+ "\n",
506
+ "summary = pd.DataFrame(results, columns=[\n",
507
+ " 'keyword', 'mean_sentiment', 'community_weighted_sentiment', 'n_posts' , 'total_score'\n",
508
+ "]).sort_values('total_score', ascending=False).reset_index(drop=True)\n",
509
+ "\n",
510
+ "summary"
511
+ ]
512
+ },
513
+ {
514
+ "cell_type": "markdown",
515
+ "id": "fbcbd745-8bbf-49f3-9fac-a4c9a056de6f",
516
+ "metadata": {},
517
+ "source": [
518
+ "### Manually inspect posts and comments associated with the keyword"
519
+ ]
520
+ },
521
+ {
522
+ "cell_type": "code",
523
+ "execution_count": 593,
524
+ "id": "1b2dadf2-c2b3-447a-bcdf-aec627635f49",
525
+ "metadata": {
526
+ "execution": {
527
+ "iopub.execute_input": "2025-04-30T03:27:33.238178Z",
528
+ "iopub.status.busy": "2025-04-30T03:27:33.237146Z",
529
+ "iopub.status.idle": "2025-04-30T03:27:33.245143Z",
530
+ "shell.execute_reply": "2025-04-30T03:27:33.245143Z",
531
+ "shell.execute_reply.started": "2025-04-30T03:27:33.238178Z"
532
+ }
533
+ },
534
+ "outputs": [
535
+ {
536
+ "data": {
537
+ "text/html": [
538
+ "<style type=\"text/css\">\n",
539
+ "</style>\n",
540
+ "<table id=\"T_d6ecc\">\n",
541
+ " <caption>KEYWORD = smartphone market</caption>\n",
542
+ " <thead>\n",
543
+ " <tr>\n",
544
+ " <th class=\"blank level0\" >&nbsp;</th>\n",
545
+ " <th id=\"T_d6ecc_level0_col0\" class=\"col_heading level0 col0\" >subreddit</th>\n",
546
+ " <th id=\"T_d6ecc_level0_col1\" class=\"col_heading level0 col1\" >created_at</th>\n",
547
+ " <th id=\"T_d6ecc_level0_col2\" class=\"col_heading level0 col2\" >retrieved_at</th>\n",
548
+ " <th id=\"T_d6ecc_level0_col3\" class=\"col_heading level0 col3\" >type</th>\n",
549
+ " <th id=\"T_d6ecc_level0_col4\" class=\"col_heading level0 col4\" >text</th>\n",
550
+ " <th id=\"T_d6ecc_level0_col5\" class=\"col_heading level0 col5\" >score</th>\n",
551
+ " <th id=\"T_d6ecc_level0_col6\" class=\"col_heading level0 col6\" >post_id</th>\n",
552
+ " <th id=\"T_d6ecc_level0_col7\" class=\"col_heading level0 col7\" >parent_id</th>\n",
553
+ " <th id=\"T_d6ecc_level0_col8\" class=\"col_heading level0 col8\" >sentiment</th>\n",
554
+ " <th id=\"T_d6ecc_level0_col9\" class=\"col_heading level0 col9\" >confidence</th>\n",
555
+ " <th id=\"T_d6ecc_level0_col10\" class=\"col_heading level0 col10\" >filedate</th>\n",
556
+ " </tr>\n",
557
+ " </thead>\n",
558
+ " <tbody>\n",
559
+ " <tr>\n",
560
+ " <th id=\"T_d6ecc_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
561
+ " <td id=\"T_d6ecc_row0_col0\" class=\"data row0 col0\" >apple</td>\n",
562
+ " <td id=\"T_d6ecc_row0_col1\" class=\"data row0 col1\" >2025-04-14 11:19:50-05:00</td>\n",
563
+ " <td id=\"T_d6ecc_row0_col2\" class=\"data row0 col2\" >2025-04-14 23:44:27.136181-05:00</td>\n",
564
+ " <td id=\"T_d6ecc_row0_col3\" class=\"data row0 col3\" >post</td>\n",
565
+ " <td id=\"T_d6ecc_row0_col4\" class=\"data row0 col4\" >iPhone 16e Helps Apple Take Q1 2025 Top Spot in Global Smartphone Market\n",
566
+ "\n",
567
+ "</td>\n",
568
+ " <td id=\"T_d6ecc_row0_col5\" class=\"data row0 col5\" >655</td>\n",
569
+ " <td id=\"T_d6ecc_row0_col6\" class=\"data row0 col6\" >1jz2xrw</td>\n",
570
+ " <td id=\"T_d6ecc_row0_col7\" class=\"data row0 col7\" >None</td>\n",
571
+ " <td id=\"T_d6ecc_row0_col8\" class=\"data row0 col8\" >1</td>\n",
572
+ " <td id=\"T_d6ecc_row0_col9\" class=\"data row0 col9\" >0.997100</td>\n",
573
+ " <td id=\"T_d6ecc_row0_col10\" class=\"data row0 col10\" >2025-04-14</td>\n",
574
+ " </tr>\n",
575
+ " <tr>\n",
576
+ " <th id=\"T_d6ecc_level0_row1\" class=\"row_heading level0 row1\" >2</th>\n",
577
+ " <td id=\"T_d6ecc_row1_col0\" class=\"data row1 col0\" >apple</td>\n",
578
+ " <td id=\"T_d6ecc_row1_col1\" class=\"data row1 col1\" >2025-04-14 11:59:56-05:00</td>\n",
579
+ " <td id=\"T_d6ecc_row1_col2\" class=\"data row1 col2\" >2025-04-14 23:44:27.136181-05:00</td>\n",
580
+ " <td id=\"T_d6ecc_row1_col3\" class=\"data row1 col3\" >post</td>\n",
581
+ " <td id=\"T_d6ecc_row1_col4\" class=\"data row1 col4\" >Smartphone tariffs are coming back in ‘a month or two,’ says Trump admin\n",
582
+ "\n",
583
+ "</td>\n",
584
+ " <td id=\"T_d6ecc_row1_col5\" class=\"data row1 col5\" >194</td>\n",
585
+ " <td id=\"T_d6ecc_row1_col6\" class=\"data row1 col6\" >1jz3wsi</td>\n",
586
+ " <td id=\"T_d6ecc_row1_col7\" class=\"data row1 col7\" >None</td>\n",
587
+ " <td id=\"T_d6ecc_row1_col8\" class=\"data row1 col8\" >0</td>\n",
588
+ " <td id=\"T_d6ecc_row1_col9\" class=\"data row1 col9\" >0.982900</td>\n",
589
+ " <td id=\"T_d6ecc_row1_col10\" class=\"data row1 col10\" >2025-04-14</td>\n",
590
+ " </tr>\n",
591
+ " <tr>\n",
592
+ " <th id=\"T_d6ecc_level0_row2\" class=\"row_heading level0 row2\" >3</th>\n",
593
+ " <td id=\"T_d6ecc_row2_col0\" class=\"data row2 col0\" >apple</td>\n",
594
+ " <td id=\"T_d6ecc_row2_col1\" class=\"data row2 col1\" >2025-04-14 11:59:56-05:00</td>\n",
595
+ " <td id=\"T_d6ecc_row2_col2\" class=\"data row2 col2\" >2025-04-14 23:44:27.136181-05:00</td>\n",
596
+ " <td id=\"T_d6ecc_row2_col3\" class=\"data row2 col3\" >comment</td>\n",
597
+ " <td id=\"T_d6ecc_row2_col4\" class=\"data row2 col4\" >This topic has been automatically locked due to being controversial and/or political by nature. However, the submission itself will remain accessible as long as it is related to Apple.\n",
598
+ "\n",
599
+ "\n",
600
+ "This decision was made by a bot based on specific keywords. If you feel that this was in error, please report it to the moderators so that it can be reviewed.\n",
601
+ " \n",
602
+ "\n",
603
+ "*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/apple) if you have any questions or concerns.*</td>\n",
604
+ " <td id=\"T_d6ecc_row2_col5\" class=\"data row2 col5\" >1</td>\n",
605
+ " <td id=\"T_d6ecc_row2_col6\" class=\"data row2 col6\" >mn38mac</td>\n",
606
+ " <td id=\"T_d6ecc_row2_col7\" class=\"data row2 col7\" >t3_1jz3wsi</td>\n",
607
+ " <td id=\"T_d6ecc_row2_col8\" class=\"data row2 col8\" >0</td>\n",
608
+ " <td id=\"T_d6ecc_row2_col9\" class=\"data row2 col9\" >0.997200</td>\n",
609
+ " <td id=\"T_d6ecc_row2_col10\" class=\"data row2 col10\" >2025-04-14</td>\n",
610
+ " </tr>\n",
611
+ " <tr>\n",
612
+ " <th id=\"T_d6ecc_level0_row3\" class=\"row_heading level0 row3\" >23</th>\n",
613
+ " <td id=\"T_d6ecc_row3_col0\" class=\"data row3 col0\" >apple</td>\n",
614
+ " <td id=\"T_d6ecc_row3_col1\" class=\"data row3 col1\" >2025-04-14 11:43:39-05:00</td>\n",
615
+ " <td id=\"T_d6ecc_row3_col2\" class=\"data row3 col2\" >2025-04-14 23:44:27.136181-05:00</td>\n",
616
+ " <td id=\"T_d6ecc_row3_col3\" class=\"data row3 col3\" >comment</td>\n",
617
+ " <td id=\"T_d6ecc_row3_col4\" class=\"data row3 col4\" >My boss purchased me a 16e to use for iOS development. It may not make much sense for end users but it is a nearly perfect corporate phone.</td>\n",
618
+ " <td id=\"T_d6ecc_row3_col5\" class=\"data row3 col5\" >309</td>\n",
619
+ " <td id=\"T_d6ecc_row3_col6\" class=\"data row3 col6\" >mn35a6r</td>\n",
620
+ " <td id=\"T_d6ecc_row3_col7\" class=\"data row3 col7\" >t3_1jz2xrw</td>\n",
621
+ " <td id=\"T_d6ecc_row3_col8\" class=\"data row3 col8\" >1</td>\n",
622
+ " <td id=\"T_d6ecc_row3_col9\" class=\"data row3 col9\" >0.998600</td>\n",
623
+ " <td id=\"T_d6ecc_row3_col10\" class=\"data row3 col10\" >2025-04-14</td>\n",
624
+ " </tr>\n",
625
+ " <tr>\n",
626
+ " <th id=\"T_d6ecc_level0_row4\" class=\"row_heading level0 row4\" >24</th>\n",
627
+ " <td id=\"T_d6ecc_row4_col0\" class=\"data row4 col0\" >apple</td>\n",
628
+ " <td id=\"T_d6ecc_row4_col1\" class=\"data row4 col1\" >2025-04-14 11:24:30-05:00</td>\n",
629
+ " <td id=\"T_d6ecc_row4_col2\" class=\"data row4 col2\" >2025-04-14 23:44:27.136181-05:00</td>\n",
630
+ " <td id=\"T_d6ecc_row4_col3\" class=\"data row4 col3\" >comment</td>\n",
631
+ " <td id=\"T_d6ecc_row4_col4\" class=\"data row4 col4\" >Despite what the tech influencers might have said, Apple clearly knows what they’re doing.</td>\n",
632
+ " <td id=\"T_d6ecc_row4_col5\" class=\"data row4 col5\" >283</td>\n",
633
+ " <td id=\"T_d6ecc_row4_col6\" class=\"data row4 col6\" >mn31i1a</td>\n",
634
+ " <td id=\"T_d6ecc_row4_col7\" class=\"data row4 col7\" >t3_1jz2xrw</td>\n",
635
+ " <td id=\"T_d6ecc_row4_col8\" class=\"data row4 col8\" >1</td>\n",
636
+ " <td id=\"T_d6ecc_row4_col9\" class=\"data row4 col9\" >0.999600</td>\n",
637
+ " <td id=\"T_d6ecc_row4_col10\" class=\"data row4 col10\" >2025-04-14</td>\n",
638
+ " </tr>\n",
639
+ " </tbody>\n",
640
+ "</table>\n"
641
+ ],
642
+ "text/plain": [
643
+ "<pandas.io.formats.style.Styler at 0x26d025bdb80>"
644
+ ]
645
+ },
646
+ "execution_count": 593,
647
+ "metadata": {},
648
+ "output_type": "execute_result"
649
+ }
650
+ ],
651
+ "source": [
652
+ "keyword_index = 0\n",
653
+ "subsets[summary.keyword[keyword_index]].head().style.set_caption(f\"KEYWORD = {summary.keyword[keyword_index]}\")"
654
+ ]
655
+ },
656
+ {
657
+ "cell_type": "markdown",
658
+ "id": "1f8adbfe-3141-417f-bc13-922b7f1098a7",
659
+ "metadata": {},
660
+ "source": [
661
+ "### Helper tool: Retrieve post and comments by post_id"
662
+ ]
663
+ },
664
+ {
665
+ "cell_type": "code",
666
+ "execution_count": 534,
667
+ "id": "ea794d80-bf98-47c7-877d-1b2e6a626b12",
668
+ "metadata": {
669
+ "execution": {
670
+ "iopub.execute_input": "2025-04-30T02:46:15.966447Z",
671
+ "iopub.status.busy": "2025-04-30T02:46:15.966447Z",
672
+ "iopub.status.idle": "2025-04-30T02:46:15.979590Z",
673
+ "shell.execute_reply": "2025-04-30T02:46:15.979590Z",
674
+ "shell.execute_reply.started": "2025-04-30T02:46:15.966447Z"
675
+ }
676
+ },
677
+ "outputs": [
678
+ {
679
+ "data": {
680
+ "text/html": [
681
+ "<div>\n",
682
+ "<style scoped>\n",
683
+ " .dataframe tbody tr th:only-of-type {\n",
684
+ " vertical-align: middle;\n",
685
+ " }\n",
686
+ "\n",
687
+ " .dataframe tbody tr th {\n",
688
+ " vertical-align: top;\n",
689
+ " }\n",
690
+ "\n",
691
+ " .dataframe thead th {\n",
692
+ " text-align: right;\n",
693
+ " }\n",
694
+ "</style>\n",
695
+ "<table border=\"1\" class=\"dataframe\">\n",
696
+ " <thead>\n",
697
+ " <tr style=\"text-align: right;\">\n",
698
+ " <th></th>\n",
699
+ " <th>subreddit</th>\n",
700
+ " <th>created_at</th>\n",
701
+ " <th>retrieved_at</th>\n",
702
+ " <th>type</th>\n",
703
+ " <th>text</th>\n",
704
+ " <th>score</th>\n",
705
+ " <th>post_id</th>\n",
706
+ " <th>parent_id</th>\n",
707
+ " <th>sentiment</th>\n",
708
+ " <th>confidence</th>\n",
709
+ " <th>filedate</th>\n",
710
+ " </tr>\n",
711
+ " </thead>\n",
712
+ " <tbody>\n",
713
+ " <tr>\n",
714
+ " <th>2748</th>\n",
715
+ " <td>Android</td>\n",
716
+ " <td>2025-04-23 08:15:55-05:00</td>\n",
717
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
718
+ " <td>post</td>\n",
719
+ " <td>The new feature that gives higher memory priority to background tabs containing user edits, such as fillable forms or drafts (reducing the chance of them being killed and thus not losing your progress) is now available in Chrome Canary for Android.\\n\\n</td>\n",
720
+ " <td>224</td>\n",
721
+ " <td>1k5ywd6</td>\n",
722
+ " <td>None</td>\n",
723
+ " <td>0</td>\n",
724
+ " <td>0.9717</td>\n",
725
+ " <td>2025-04-23</td>\n",
726
+ " </tr>\n",
727
+ " <tr>\n",
728
+ " <th>2749</th>\n",
729
+ " <td>Android</td>\n",
730
+ " <td>2025-04-23 08:43:37-05:00</td>\n",
731
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
732
+ " <td>comment</td>\n",
733
+ " <td>Android's task refreshing is so bad and random that I've adapted my whole workflow around it by simply never trusting it and constantly copying whatever I input. If I write something and need switch away to another app even for a second, I copy the text before I do it. \\n\\nAndroid still does this even if you have 16GB of RAM!</td>\n",
734
+ " <td>1</td>\n",
735
+ " <td>molv84l</td>\n",
736
+ " <td>t3_1k5ywd6</td>\n",
737
+ " <td>0</td>\n",
738
+ " <td>0.9996</td>\n",
739
+ " <td>2025-04-23</td>\n",
740
+ " </tr>\n",
741
+ " <tr>\n",
742
+ " <th>2750</th>\n",
743
+ " <td>Android</td>\n",
744
+ " <td>2025-04-23 08:19:42-05:00</td>\n",
745
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
746
+ " <td>comment</td>\n",
747
+ " <td>I love that \"it reduces the chance\" but it doesn't eliminate the chance something I am working on it is killed...</td>\n",
748
+ " <td>1</td>\n",
749
+ " <td>molr08u</td>\n",
750
+ " <td>t3_1k5ywd6</td>\n",
751
+ " <td>1</td>\n",
752
+ " <td>0.9835</td>\n",
753
+ " <td>2025-04-23</td>\n",
754
+ " </tr>\n",
755
+ " <tr>\n",
756
+ " <th>2751</th>\n",
757
+ " <td>Android</td>\n",
758
+ " <td>2025-04-23 08:17:05-05:00</td>\n",
759
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
760
+ " <td>comment</td>\n",
761
+ " <td>Context: [**Background tabs containing user edits, such as filled forms or drafts, will soon have a higher memory priority in Chrome for Android, this will reduce the likelihood of these tabs been killed prematurely.**](https://old.reddit.com/r/Android/comments/1j3ktpg/background_tabs_containing_user_edits_such_as/)\\n\\n.\\n\\nThe patch responsible for this change [**was merged yesterday.**](https://chromium-review.googlesource.com/c/chromium/src/+/6321765)</td>\n",
762
+ " <td>1</td>\n",
763
+ " <td>molqjut</td>\n",
764
+ " <td>t3_1k5ywd6</td>\n",
765
+ " <td>0</td>\n",
766
+ " <td>0.9996</td>\n",
767
+ " <td>2025-04-23</td>\n",
768
+ " </tr>\n",
769
+ " <tr>\n",
770
+ " <th>2752</th>\n",
771
+ " <td>Android</td>\n",
772
+ " <td>2025-04-23 12:13:43-05:00</td>\n",
773
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
774
+ " <td>comment</td>\n",
775
+ " <td>Would love it if Android would let me \"pin\" apps by default that I didn't want to come out of memory. Would be amazing for apps that are slow to re-open.</td>\n",
776
+ " <td>1</td>\n",
777
+ " <td>mon1t9v</td>\n",
778
+ " <td>t3_1k5ywd6</td>\n",
779
+ " <td>0</td>\n",
780
+ " <td>0.9928</td>\n",
781
+ " <td>2025-04-23</td>\n",
782
+ " </tr>\n",
783
+ " <tr>\n",
784
+ " <th>2753</th>\n",
785
+ " <td>Android</td>\n",
786
+ " <td>2025-04-23 11:59:23-05:00</td>\n",
787
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
788
+ " <td>comment</td>\n",
789
+ " <td>Solid upgrade for Android users.</td>\n",
790
+ " <td>1</td>\n",
791
+ " <td>momytb8</td>\n",
792
+ " <td>t3_1k5ywd6</td>\n",
793
+ " <td>1</td>\n",
794
+ " <td>0.9996</td>\n",
795
+ " <td>2025-04-23</td>\n",
796
+ " </tr>\n",
797
+ " </tbody>\n",
798
+ "</table>\n",
799
+ "</div>"
800
+ ],
801
+ "text/plain": [
802
+ " subreddit created_at retrieved_at \\\n",
803
+ "2748 Android 2025-04-23 08:15:55-05:00 2025-04-23 19:03:18.888116-05:00 \n",
804
+ "2749 Android 2025-04-23 08:43:37-05:00 2025-04-23 19:03:18.888116-05:00 \n",
805
+ "2750 Android 2025-04-23 08:19:42-05:00 2025-04-23 19:03:18.888116-05:00 \n",
806
+ "2751 Android 2025-04-23 08:17:05-05:00 2025-04-23 19:03:18.888116-05:00 \n",
807
+ "2752 Android 2025-04-23 12:13:43-05:00 2025-04-23 19:03:18.888116-05:00 \n",
808
+ "2753 Android 2025-04-23 11:59:23-05:00 2025-04-23 19:03:18.888116-05:00 \n",
809
+ "\n",
810
+ " type \\\n",
811
+ "2748 post \n",
812
+ "2749 comment \n",
813
+ "2750 comment \n",
814
+ "2751 comment \n",
815
+ "2752 comment \n",
816
+ "2753 comment \n",
817
+ "\n",
818
+ " text \\\n",
819
+ "2748 The new feature that gives higher memory priority to background tabs containing user edits, such as fillable forms or drafts (reducing the chance of them being killed and thus not losing your progress) is now available in Chrome Canary for Android.\\n\\n \n",
820
+ "2749 Android's task refreshing is so bad and random that I've adapted my whole workflow around it by simply never trusting it and constantly copying whatever I input. If I write something and need switch away to another app even for a second, I copy the text before I do it. \\n\\nAndroid still does this even if you have 16GB of RAM! \n",
821
+ "2750 I love that \"it reduces the chance\" but it doesn't eliminate the chance something I am working on it is killed... \n",
822
+ "2751 Context: [**Background tabs containing user edits, such as filled forms or drafts, will soon have a higher memory priority in Chrome for Android, this will reduce the likelihood of these tabs been killed prematurely.**](https://old.reddit.com/r/Android/comments/1j3ktpg/background_tabs_containing_user_edits_such_as/)\\n\\n.\\n\\nThe patch responsible for this change [**was merged yesterday.**](https://chromium-review.googlesource.com/c/chromium/src/+/6321765) \n",
823
+ "2752 Would love it if Android would let me \"pin\" apps by default that I didn't want to come out of memory. Would be amazing for apps that are slow to re-open. \n",
824
+ "2753 Solid upgrade for Android users. \n",
825
+ "\n",
826
+ " score post_id parent_id sentiment confidence filedate \n",
827
+ "2748 224 1k5ywd6 None 0 0.9717 2025-04-23 \n",
828
+ "2749 1 molv84l t3_1k5ywd6 0 0.9996 2025-04-23 \n",
829
+ "2750 1 molr08u t3_1k5ywd6 1 0.9835 2025-04-23 \n",
830
+ "2751 1 molqjut t3_1k5ywd6 0 0.9996 2025-04-23 \n",
831
+ "2752 1 mon1t9v t3_1k5ywd6 0 0.9928 2025-04-23 \n",
832
+ "2753 1 momytb8 t3_1k5ywd6 1 0.9996 2025-04-23 "
833
+ ]
834
+ },
835
+ "execution_count": 534,
836
+ "metadata": {},
837
+ "output_type": "execute_result"
838
+ }
839
+ ],
840
+ "source": [
841
+ "postid = '1k5ywd6'\n",
842
+ "df[lambda x: ((x.post_id == postid) | (x.parent_id == f't3_{postid}'))]"
843
+ ]
844
+ }
845
+ ],
846
+ "metadata": {
847
+ "kernelspec": {
848
+ "display_name": "Python [conda env:reddit_streamlit]",
849
+ "language": "python",
850
+ "name": "conda-env-reddit_streamlit-py"
851
+ },
852
+ "language_info": {
853
+ "codemirror_mode": {
854
+ "name": "ipython",
855
+ "version": 3
856
+ },
857
+ "file_extension": ".py",
858
+ "mimetype": "text/x-python",
859
+ "name": "python",
860
+ "nbconvert_exporter": "python",
861
+ "pygments_lexer": "ipython3",
862
+ "version": "3.12.10"
863
+ }
864
+ },
865
+ "nbformat": 4,
866
+ "nbformat_minor": 5
867
+ }
notebooks/loading_data.ipynb ADDED
@@ -0,0 +1,785 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "dd911245-61d7-49db-9960-570f7feefe2b",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Loading Reddit Data\n",
9
+ "\n",
10
+ "This notebook has functions to read in parquet data from Hugging Face dataset [hblim/top_reddit_posts_daily](https://huggingface.co/datasets/hblim/top_reddit_posts_daily)\n",
11
+ "\n",
12
+ "I created this notebook to help me fix an issue where I overwrote `data_raw/2025-04-20.parquet` with 2025-04-23 data during testing\n",
13
+ "- I went to Hugging Face version history to see when the 2025-04-20 file was overwritten erroneously, and saw newer commits on 2025-04-23\n",
14
+ "- I cloned the Hugging Face dataset repo locally and checked out last correct version for the corrupted 2025-04-20.parquet file\n",
15
+ "- Verified that the data looked good (e.g. retrieved date > created date), and not duplicated anywhere else, and then pushed the correct version back to the main remote"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 97,
21
+ "id": "d5071073-274b-480d-8503-28d2292422d3",
22
+ "metadata": {
23
+ "execution": {
24
+ "iopub.execute_input": "2025-04-26T20:34:47.775286Z",
25
+ "iopub.status.busy": "2025-04-26T20:34:47.775286Z",
26
+ "iopub.status.idle": "2025-04-26T20:34:47.779253Z",
27
+ "shell.execute_reply": "2025-04-26T20:34:47.779253Z",
28
+ "shell.execute_reply.started": "2025-04-26T20:34:47.775286Z"
29
+ }
30
+ },
31
+ "outputs": [],
32
+ "source": [
33
+ "import os\n",
34
+ "import glob\n",
35
+ "import datetime\n",
36
+ "from pathlib import Path\n",
37
+ "from dotenv import load_dotenv\n",
38
+ "import pandas as pd\n",
39
+ "import pyarrow\n",
40
+ "\n",
41
+ "from huggingface_hub import HfApi"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "markdown",
46
+ "id": "9085f661-ba37-4715-b87b-3699cb75db2f",
47
+ "metadata": {},
48
+ "source": [
49
+ "Download all historical data"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 167,
55
+ "id": "0c14356b-721c-4048-8efb-f69d8eae4900",
56
+ "metadata": {
57
+ "execution": {
58
+ "iopub.execute_input": "2025-04-26T21:37:43.533282Z",
59
+ "iopub.status.busy": "2025-04-26T21:37:43.533282Z",
60
+ "iopub.status.idle": "2025-04-26T21:37:43.539908Z",
61
+ "shell.execute_reply": "2025-04-26T21:37:43.539908Z",
62
+ "shell.execute_reply.started": "2025-04-26T21:37:43.533282Z"
63
+ }
64
+ },
65
+ "outputs": [],
66
+ "source": [
67
+ "def load_reddit_data(source,folder='data_raw'):\n",
68
+ " \"\"\"\n",
69
+ " Load Reddit daily posts data either from Hugging Face Hub or local files.\n",
70
+ "\n",
71
+ " Args:\n",
72
+ "        source (str): \"hub\" to load from Hugging Face, \"local\" to load from local files\n",
73
+ "        folder (str): subfolder containing the parquet shards (default \"data_raw\")\n",
74
+ " Returns:\n",
75
+ " pd.DataFrame: Combined DataFrame of all posts\n",
76
+ " \"\"\"\n",
77
+ " df = []\n",
78
+ " \n",
79
+ " if source == \"hub\":\n",
80
+ " api = HfApi()\n",
81
+ " all_files = api.list_repo_files(\"hblim/top_reddit_posts_daily\", repo_type=\"dataset\")\n",
82
+ " parquet_files = sorted([f for f in all_files if f.startswith(folder) and f.endswith(\".parquet\")])\n",
83
+ "\n",
84
+ " for shard in parquet_files:\n",
85
+ "            local_path = api.hf_hub_download(repo_id=\"hblim/top_reddit_posts_daily\", filename=shard, repo_type=\"dataset\")\n",
86
+ " file_date = os.path.splitext(os.path.basename(local_path))[0]\n",
87
+ " df.append(pd.read_parquet(local_path).assign(filedate=file_date))\n",
88
+ "\n",
89
+ " elif source == \"local\":\n",
90
+ " cwd = os.getcwd()\n",
91
+ " local_folder = os.path.join(cwd, \"top_reddit_posts_daily\", folder)\n",
92
+ " local_files = sorted(glob.glob(os.path.join(local_folder, \"*.parquet\")))\n",
93
+ "\n",
94
+ " for local_path in local_files:\n",
95
+ " file_date = os.path.splitext(os.path.basename(local_path))[0]\n",
96
+ " df.append(pd.read_parquet(local_path).assign(filedate=file_date))\n",
97
+ "\n",
98
+ " else:\n",
99
+ " raise ValueError(f\"Unknown source: {source}. Use 'hub' or 'local'.\")\n",
100
+ "\n",
101
+ " df = pd.concat(df, ignore_index=True)\n",
102
+ " print(f\"Total records across {df.filedate.nunique()} days: {len(df)}\")\n",
103
+ " return df"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "markdown",
108
+ "id": "435fe428-0c99-4d77-9c9d-e9c9a974b16e",
109
+ "metadata": {},
110
+ "source": [
111
+ "### Check if raw and raw-deduplicated data line up\n",
112
+ "Is the raw data duplicated anywhere? Then the de-duplicated data should have fewer posts"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 164,
118
+ "id": "e5f35dcd-4267-4bef-bc6a-0e89937441c3",
119
+ "metadata": {
120
+ "execution": {
121
+ "iopub.execute_input": "2025-04-26T21:32:17.106045Z",
122
+ "iopub.status.busy": "2025-04-26T21:32:17.106045Z",
123
+ "iopub.status.idle": "2025-04-26T21:32:18.082758Z",
124
+ "shell.execute_reply": "2025-04-26T21:32:18.082758Z",
125
+ "shell.execute_reply.started": "2025-04-26T21:32:17.106045Z"
126
+ }
127
+ },
128
+ "outputs": [
129
+ {
130
+ "name": "stdout",
131
+ "output_type": "stream",
132
+ "text": [
133
+ "Total records across 13 days: 3666\n"
134
+ ]
135
+ }
136
+ ],
137
+ "source": [
138
+ "df = load_reddit_data(\"hub\")"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": 158,
144
+ "id": "ff824b99-c38f-4519-87df-54f9946cc20b",
145
+ "metadata": {
146
+ "execution": {
147
+ "iopub.execute_input": "2025-04-26T21:24:44.078709Z",
148
+ "iopub.status.busy": "2025-04-26T21:24:44.078709Z",
149
+ "iopub.status.idle": "2025-04-26T21:24:44.086147Z",
150
+ "shell.execute_reply": "2025-04-26T21:24:44.086147Z",
151
+ "shell.execute_reply.started": "2025-04-26T21:24:44.078709Z"
152
+ }
153
+ },
154
+ "outputs": [
155
+ {
156
+ "data": {
157
+ "text/plain": [
158
+ "filedate\n",
159
+ "2025-04-14 312\n",
160
+ "2025-04-15 258\n",
161
+ "2025-04-16 330\n",
162
+ "2025-04-17 324\n",
163
+ "2025-04-18 255\n",
164
+ "2025-04-19 232\n",
165
+ "2025-04-20 251\n",
166
+ "2025-04-21 233\n",
167
+ "2025-04-22 268\n",
168
+ "2025-04-23 331\n",
169
+ "2025-04-24 332\n",
170
+ "2025-04-25 309\n",
171
+ "2025-04-26 231\n",
172
+ "Name: subreddit, dtype: int64"
173
+ ]
174
+ },
175
+ "execution_count": 158,
176
+ "metadata": {},
177
+ "output_type": "execute_result"
178
+ }
179
+ ],
180
+ "source": [
181
+ "df.groupby('filedate').subreddit.count()"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 150,
187
+ "id": "6179d986-471b-40cd-bcf5-529f582315ee",
188
+ "metadata": {
189
+ "execution": {
190
+ "iopub.execute_input": "2025-04-26T21:16:54.824986Z",
191
+ "iopub.status.busy": "2025-04-26T21:16:54.824986Z",
192
+ "iopub.status.idle": "2025-04-26T21:16:54.842945Z",
193
+ "shell.execute_reply": "2025-04-26T21:16:54.842945Z",
194
+ "shell.execute_reply.started": "2025-04-26T21:16:54.824986Z"
195
+ }
196
+ },
197
+ "outputs": [
198
+ {
199
+ "name": "stdout",
200
+ "output_type": "stream",
201
+ "text": [
202
+ "Total records across 13 days: 3666\n"
203
+ ]
204
+ }
205
+ ],
206
+ "source": [
207
+ "df[\"retrieved_at\"] = pd.to_datetime(df[\"retrieved_at\"],utc=True)\n",
208
+ "\n",
209
+ "# Step 1: Find duplicate post_ids\n",
210
+ "duplicates = df[df.duplicated(subset=[\"post_id\"], keep=False)]\n",
211
+ "\n",
212
+ "# Report duplicates and their retrieved_at dates\n",
213
+ "duplicate_report = duplicates[[\"post_id\", \"retrieved_at\"]]\n",
214
+ "\n",
215
+ "# Step 2: De-duplicate keeping the first occurrence, sorted by retrieved_at\n",
216
+ "df_sorted = df.sort_values(by=\"retrieved_at\").reset_index(drop=True)\n",
217
+ "df_deduplicated = df_sorted.drop_duplicates(subset=[\"post_id\"], keep=\"first\")\n",
218
+ "\n",
219
+ "print(f\"Total records across {df_deduplicated.filedate.nunique()} days: {len(df_deduplicated)}\")"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": 153,
225
+ "id": "67a5fd89-8ddc-4247-ba22-8f411169487f",
226
+ "metadata": {
227
+ "execution": {
228
+ "iopub.execute_input": "2025-04-26T21:17:16.136315Z",
229
+ "iopub.status.busy": "2025-04-26T21:17:16.136315Z",
230
+ "iopub.status.idle": "2025-04-26T21:17:16.146070Z",
231
+ "shell.execute_reply": "2025-04-26T21:17:16.146070Z",
232
+ "shell.execute_reply.started": "2025-04-26T21:17:16.136315Z"
233
+ }
234
+ },
235
+ "outputs": [
236
+ {
237
+ "data": {
238
+ "text/html": [
239
+ "<div>\n",
240
+ "<style scoped>\n",
241
+ " .dataframe tbody tr th:only-of-type {\n",
242
+ " vertical-align: middle;\n",
243
+ " }\n",
244
+ "\n",
245
+ " .dataframe tbody tr th {\n",
246
+ " vertical-align: top;\n",
247
+ " }\n",
248
+ "\n",
249
+ " .dataframe thead th {\n",
250
+ " text-align: right;\n",
251
+ " }\n",
252
+ "</style>\n",
253
+ "<table border=\"1\" class=\"dataframe\">\n",
254
+ " <thead>\n",
255
+ " <tr style=\"text-align: right;\">\n",
256
+ " <th></th>\n",
257
+ " <th>original</th>\n",
258
+ " <th>deduplicated</th>\n",
259
+ " <th>matching</th>\n",
260
+ " </tr>\n",
261
+ " <tr>\n",
262
+ " <th>filedate</th>\n",
263
+ " <th></th>\n",
264
+ " <th></th>\n",
265
+ " <th></th>\n",
266
+ " </tr>\n",
267
+ " </thead>\n",
268
+ " <tbody>\n",
269
+ " <tr>\n",
270
+ " <th>2025-04-14</th>\n",
271
+ " <td>312</td>\n",
272
+ " <td>312</td>\n",
273
+ " <td>True</td>\n",
274
+ " </tr>\n",
275
+ " <tr>\n",
276
+ " <th>2025-04-15</th>\n",
277
+ " <td>258</td>\n",
278
+ " <td>258</td>\n",
279
+ " <td>True</td>\n",
280
+ " </tr>\n",
281
+ " <tr>\n",
282
+ " <th>2025-04-16</th>\n",
283
+ " <td>330</td>\n",
284
+ " <td>330</td>\n",
285
+ " <td>True</td>\n",
286
+ " </tr>\n",
287
+ " <tr>\n",
288
+ " <th>2025-04-17</th>\n",
289
+ " <td>324</td>\n",
290
+ " <td>324</td>\n",
291
+ " <td>True</td>\n",
292
+ " </tr>\n",
293
+ " <tr>\n",
294
+ " <th>2025-04-18</th>\n",
295
+ " <td>255</td>\n",
296
+ " <td>255</td>\n",
297
+ " <td>True</td>\n",
298
+ " </tr>\n",
299
+ " <tr>\n",
300
+ " <th>2025-04-19</th>\n",
301
+ " <td>232</td>\n",
302
+ " <td>232</td>\n",
303
+ " <td>True</td>\n",
304
+ " </tr>\n",
305
+ " <tr>\n",
306
+ " <th>2025-04-20</th>\n",
307
+ " <td>251</td>\n",
308
+ " <td>251</td>\n",
309
+ " <td>True</td>\n",
310
+ " </tr>\n",
311
+ " <tr>\n",
312
+ " <th>2025-04-21</th>\n",
313
+ " <td>233</td>\n",
314
+ " <td>233</td>\n",
315
+ " <td>True</td>\n",
316
+ " </tr>\n",
317
+ " <tr>\n",
318
+ " <th>2025-04-22</th>\n",
319
+ " <td>268</td>\n",
320
+ " <td>268</td>\n",
321
+ " <td>True</td>\n",
322
+ " </tr>\n",
323
+ " <tr>\n",
324
+ " <th>2025-04-23</th>\n",
325
+ " <td>331</td>\n",
326
+ " <td>331</td>\n",
327
+ " <td>True</td>\n",
328
+ " </tr>\n",
329
+ " <tr>\n",
330
+ " <th>2025-04-24</th>\n",
331
+ " <td>332</td>\n",
332
+ " <td>332</td>\n",
333
+ " <td>True</td>\n",
334
+ " </tr>\n",
335
+ " <tr>\n",
336
+ " <th>2025-04-25</th>\n",
337
+ " <td>309</td>\n",
338
+ " <td>309</td>\n",
339
+ " <td>True</td>\n",
340
+ " </tr>\n",
341
+ " <tr>\n",
342
+ " <th>2025-04-26</th>\n",
343
+ " <td>231</td>\n",
344
+ " <td>231</td>\n",
345
+ " <td>True</td>\n",
346
+ " </tr>\n",
347
+ " </tbody>\n",
348
+ "</table>\n",
349
+ "</div>"
350
+ ],
351
+ "text/plain": [
352
+ " original deduplicated matching\n",
353
+ "filedate \n",
354
+ "2025-04-14 312 312 True\n",
355
+ "2025-04-15 258 258 True\n",
356
+ "2025-04-16 330 330 True\n",
357
+ "2025-04-17 324 324 True\n",
358
+ "2025-04-18 255 255 True\n",
359
+ "2025-04-19 232 232 True\n",
360
+ "2025-04-20 251 251 True\n",
361
+ "2025-04-21 233 233 True\n",
362
+ "2025-04-22 268 268 True\n",
363
+ "2025-04-23 331 331 True\n",
364
+ "2025-04-24 332 332 True\n",
365
+ "2025-04-25 309 309 True\n",
366
+ "2025-04-26 231 231 True"
367
+ ]
368
+ },
369
+ "execution_count": 153,
370
+ "metadata": {},
371
+ "output_type": "execute_result"
372
+ }
373
+ ],
374
+ "source": [
375
+ "summary1 = df.groupby('filedate').subreddit.count()\n",
376
+ "summary2 = df_deduplicated.groupby('filedate').subreddit.count().loc[summary1.index]\n",
377
+ "\n",
378
+ "comparison = pd.DataFrame({\n",
379
+ " 'original': summary1,\n",
380
+ " 'deduplicated': summary2\n",
381
+ "})\n",
382
+ "\n",
383
+ "comparison['matching'] = comparison['original'] == comparison['deduplicated']\n",
384
+ "comparison"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "markdown",
389
+ "id": "e566f098-1402-41cd-a7ec-83900f91e6fb",
390
+ "metadata": {},
391
+ "source": [
392
+ "### Check if raw and summary data line up"
393
+ ]
394
+ },
395
+ {
396
+ "cell_type": "code",
397
+ "execution_count": 154,
398
+ "id": "056e51ff-dabd-474a-84c8-6a76f82a4488",
399
+ "metadata": {
400
+ "execution": {
401
+ "iopub.execute_input": "2025-04-26T21:17:19.652014Z",
402
+ "iopub.status.busy": "2025-04-26T21:17:19.645961Z",
403
+ "iopub.status.idle": "2025-04-26T21:17:19.790646Z",
404
+ "shell.execute_reply": "2025-04-26T21:17:19.790646Z",
405
+ "shell.execute_reply.started": "2025-04-26T21:17:19.652014Z"
406
+ }
407
+ },
408
+ "outputs": [],
409
+ "source": [
410
+ "df_summary = pd.read_csv(api.hf_hub_download(repo_id=\"hblim/top_reddit_posts_daily\", filename='subreddit_daily_summary.csv', repo_type=\"dataset\"))"
411
+ ]
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": 155,
416
+ "id": "321eb761-6278-47e8-89f4-24b06f5ddeb3",
417
+ "metadata": {
418
+ "execution": {
419
+ "iopub.execute_input": "2025-04-26T21:17:20.795827Z",
420
+ "iopub.status.busy": "2025-04-26T21:17:20.795309Z",
421
+ "iopub.status.idle": "2025-04-26T21:17:20.805781Z",
422
+ "shell.execute_reply": "2025-04-26T21:17:20.804717Z",
423
+ "shell.execute_reply.started": "2025-04-26T21:17:20.795827Z"
424
+ }
425
+ },
426
+ "outputs": [
427
+ {
428
+ "data": {
429
+ "text/html": [
430
+ "<div>\n",
431
+ "<style scoped>\n",
432
+ " .dataframe tbody tr th:only-of-type {\n",
433
+ " vertical-align: middle;\n",
434
+ " }\n",
435
+ "\n",
436
+ " .dataframe tbody tr th {\n",
437
+ " vertical-align: top;\n",
438
+ " }\n",
439
+ "\n",
440
+ " .dataframe thead th {\n",
441
+ " text-align: right;\n",
442
+ " }\n",
443
+ "</style>\n",
444
+ "<table border=\"1\" class=\"dataframe\">\n",
445
+ " <thead>\n",
446
+ " <tr style=\"text-align: right;\">\n",
447
+ " <th></th>\n",
448
+ " <th>csv_counts</th>\n",
449
+ " <th>parquet_counts</th>\n",
450
+ " </tr>\n",
451
+ " <tr>\n",
452
+ " <th>date</th>\n",
453
+ " <th></th>\n",
454
+ " <th></th>\n",
455
+ " </tr>\n",
456
+ " </thead>\n",
457
+ " <tbody>\n",
458
+ " <tr>\n",
459
+ " <th>2025-04-14</th>\n",
460
+ " <td>312</td>\n",
461
+ " <td>312</td>\n",
462
+ " </tr>\n",
463
+ " <tr>\n",
464
+ " <th>2025-04-15</th>\n",
465
+ " <td>258</td>\n",
466
+ " <td>258</td>\n",
467
+ " </tr>\n",
468
+ " <tr>\n",
469
+ " <th>2025-04-16</th>\n",
470
+ " <td>330</td>\n",
471
+ " <td>330</td>\n",
472
+ " </tr>\n",
473
+ " <tr>\n",
474
+ " <th>2025-04-17</th>\n",
475
+ " <td>324</td>\n",
476
+ " <td>324</td>\n",
477
+ " </tr>\n",
478
+ " <tr>\n",
479
+ " <th>2025-04-18</th>\n",
480
+ " <td>255</td>\n",
481
+ " <td>255</td>\n",
482
+ " </tr>\n",
483
+ " <tr>\n",
484
+ " <th>2025-04-19</th>\n",
485
+ " <td>232</td>\n",
486
+ " <td>232</td>\n",
487
+ " </tr>\n",
488
+ " <tr>\n",
489
+ " <th>2025-04-20</th>\n",
490
+ " <td>251</td>\n",
491
+ " <td>251</td>\n",
492
+ " </tr>\n",
493
+ " <tr>\n",
494
+ " <th>2025-04-21</th>\n",
495
+ " <td>233</td>\n",
496
+ " <td>233</td>\n",
497
+ " </tr>\n",
498
+ " <tr>\n",
499
+ " <th>2025-04-22</th>\n",
500
+ " <td>234</td>\n",
501
+ " <td>268</td>\n",
502
+ " </tr>\n",
503
+ " <tr>\n",
504
+ " <th>2025-04-23</th>\n",
505
+ " <td>309</td>\n",
506
+ " <td>331</td>\n",
507
+ " </tr>\n",
508
+ " <tr>\n",
509
+ " <th>2025-04-24</th>\n",
510
+ " <td>332</td>\n",
511
+ " <td>332</td>\n",
512
+ " </tr>\n",
513
+ " <tr>\n",
514
+ " <th>2025-04-25</th>\n",
515
+ " <td>309</td>\n",
516
+ " <td>309</td>\n",
517
+ " </tr>\n",
518
+ " <tr>\n",
519
+ " <th>2025-04-26</th>\n",
520
+ " <td>231</td>\n",
521
+ " <td>231</td>\n",
522
+ " </tr>\n",
523
+ " </tbody>\n",
524
+ "</table>\n",
525
+ "</div>"
526
+ ],
527
+ "text/plain": [
528
+ " csv_counts parquet_counts\n",
529
+ "date \n",
530
+ "2025-04-14 312 312\n",
531
+ "2025-04-15 258 258\n",
532
+ "2025-04-16 330 330\n",
533
+ "2025-04-17 324 324\n",
534
+ "2025-04-18 255 255\n",
535
+ "2025-04-19 232 232\n",
536
+ "2025-04-20 251 251\n",
537
+ "2025-04-21 233 233\n",
538
+ "2025-04-22 234 268\n",
539
+ "2025-04-23 309 331\n",
540
+ "2025-04-24 332 332\n",
541
+ "2025-04-25 309 309\n",
542
+ "2025-04-26 231 231"
543
+ ]
544
+ },
545
+ "execution_count": 155,
546
+ "metadata": {},
547
+ "output_type": "execute_result"
548
+ }
549
+ ],
550
+ "source": [
551
+ "# First compute both series\n",
552
+ "summary1 = df_summary.groupby('date')['count'].sum()\n",
553
+ "summary2 = df.groupby('filedate').subreddit.count().loc[summary1.index]\n",
554
+ "\n",
555
+ "# Now merge into a single DataFrame\n",
556
+ "merged = pd.DataFrame({\n",
557
+ " 'csv_counts': summary1,\n",
558
+ " 'parquet_counts': summary2\n",
559
+ "})\n",
560
+ "\n",
561
+ "merged"
562
+ ]
563
+ },
564
+ {
565
+ "cell_type": "markdown",
566
+ "id": "96b1f688-c768-4aba-93f6-5247d85f8998",
567
+ "metadata": {},
568
+ "source": [
569
+ "### Check if raw and scored data line up"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": 168,
575
+ "id": "4ef592c5-c36e-454a-bd59-d455a8a3e062",
576
+ "metadata": {
577
+ "execution": {
578
+ "iopub.execute_input": "2025-04-26T21:38:12.679864Z",
579
+ "iopub.status.busy": "2025-04-26T21:38:12.679864Z",
580
+ "iopub.status.idle": "2025-04-26T21:38:15.906491Z",
581
+ "shell.execute_reply": "2025-04-26T21:38:15.905943Z",
582
+ "shell.execute_reply.started": "2025-04-26T21:38:12.679864Z"
583
+ }
584
+ },
585
+ "outputs": [
586
+ {
587
+ "data": {
588
+ "application/vnd.jupyter.widget-view+json": {
589
+ "model_id": "911485558cf84562889cd9245d5e9a24",
590
+ "version_major": 2,
591
+ "version_minor": 0
592
+ },
593
+ "text/plain": [
594
+ "2025-04-22.parquet: 0%| | 0.00/65.9k [00:00<?, ?B/s]"
595
+ ]
596
+ },
597
+ "metadata": {},
598
+ "output_type": "display_data"
599
+ },
600
+ {
601
+ "name": "stdout",
602
+ "output_type": "stream",
603
+ "text": [
604
+ "Total records across 13 days: 3666\n",
605
+ "Total records across 13 days: 3666\n"
606
+ ]
607
+ },
608
+ {
609
+ "data": {
610
+ "text/html": [
611
+ "<div>\n",
612
+ "<style scoped>\n",
613
+ " .dataframe tbody tr th:only-of-type {\n",
614
+ " vertical-align: middle;\n",
615
+ " }\n",
616
+ "\n",
617
+ " .dataframe tbody tr th {\n",
618
+ " vertical-align: top;\n",
619
+ " }\n",
620
+ "\n",
621
+ " .dataframe thead th {\n",
622
+ " text-align: right;\n",
623
+ " }\n",
624
+ "</style>\n",
625
+ "<table border=\"1\" class=\"dataframe\">\n",
626
+ " <thead>\n",
627
+ " <tr style=\"text-align: right;\">\n",
628
+ " <th></th>\n",
629
+ " <th>raw</th>\n",
630
+ " <th>scored</th>\n",
631
+ " <th>matching</th>\n",
632
+ " </tr>\n",
633
+ " <tr>\n",
634
+ " <th>filedate</th>\n",
635
+ " <th></th>\n",
636
+ " <th></th>\n",
637
+ " <th></th>\n",
638
+ " </tr>\n",
639
+ " </thead>\n",
640
+ " <tbody>\n",
641
+ " <tr>\n",
642
+ " <th>2025-04-14</th>\n",
643
+ " <td>312</td>\n",
644
+ " <td>312</td>\n",
645
+ " <td>True</td>\n",
646
+ " </tr>\n",
647
+ " <tr>\n",
648
+ " <th>2025-04-15</th>\n",
649
+ " <td>258</td>\n",
650
+ " <td>258</td>\n",
651
+ " <td>True</td>\n",
652
+ " </tr>\n",
653
+ " <tr>\n",
654
+ " <th>2025-04-16</th>\n",
655
+ " <td>330</td>\n",
656
+ " <td>330</td>\n",
657
+ " <td>True</td>\n",
658
+ " </tr>\n",
659
+ " <tr>\n",
660
+ " <th>2025-04-17</th>\n",
661
+ " <td>324</td>\n",
662
+ " <td>324</td>\n",
663
+ " <td>True</td>\n",
664
+ " </tr>\n",
665
+ " <tr>\n",
666
+ " <th>2025-04-18</th>\n",
667
+ " <td>255</td>\n",
668
+ " <td>255</td>\n",
669
+ " <td>True</td>\n",
670
+ " </tr>\n",
671
+ " <tr>\n",
672
+ " <th>2025-04-19</th>\n",
673
+ " <td>232</td>\n",
674
+ " <td>232</td>\n",
675
+ " <td>True</td>\n",
676
+ " </tr>\n",
677
+ " <tr>\n",
678
+ " <th>2025-04-20</th>\n",
679
+ " <td>251</td>\n",
680
+ " <td>251</td>\n",
681
+ " <td>True</td>\n",
682
+ " </tr>\n",
683
+ " <tr>\n",
684
+ " <th>2025-04-21</th>\n",
685
+ " <td>233</td>\n",
686
+ " <td>233</td>\n",
687
+ " <td>True</td>\n",
688
+ " </tr>\n",
689
+ " <tr>\n",
690
+ " <th>2025-04-22</th>\n",
691
+ " <td>268</td>\n",
692
+ " <td>268</td>\n",
693
+ " <td>True</td>\n",
694
+ " </tr>\n",
695
+ " <tr>\n",
696
+ " <th>2025-04-23</th>\n",
697
+ " <td>331</td>\n",
698
+ " <td>331</td>\n",
699
+ " <td>True</td>\n",
700
+ " </tr>\n",
701
+ " <tr>\n",
702
+ " <th>2025-04-24</th>\n",
703
+ " <td>332</td>\n",
704
+ " <td>332</td>\n",
705
+ " <td>True</td>\n",
706
+ " </tr>\n",
707
+ " <tr>\n",
708
+ " <th>2025-04-25</th>\n",
709
+ " <td>309</td>\n",
710
+ " <td>309</td>\n",
711
+ " <td>True</td>\n",
712
+ " </tr>\n",
713
+ " <tr>\n",
714
+ " <th>2025-04-26</th>\n",
715
+ " <td>231</td>\n",
716
+ " <td>231</td>\n",
717
+ " <td>True</td>\n",
718
+ " </tr>\n",
719
+ " </tbody>\n",
720
+ "</table>\n",
721
+ "</div>"
722
+ ],
723
+ "text/plain": [
724
+ " raw scored matching\n",
725
+ "filedate \n",
726
+ "2025-04-14 312 312 True\n",
727
+ "2025-04-15 258 258 True\n",
728
+ "2025-04-16 330 330 True\n",
729
+ "2025-04-17 324 324 True\n",
730
+ "2025-04-18 255 255 True\n",
731
+ "2025-04-19 232 232 True\n",
732
+ "2025-04-20 251 251 True\n",
733
+ "2025-04-21 233 233 True\n",
734
+ "2025-04-22 268 268 True\n",
735
+ "2025-04-23 331 331 True\n",
736
+ "2025-04-24 332 332 True\n",
737
+ "2025-04-25 309 309 True\n",
738
+ "2025-04-26 231 231 True"
739
+ ]
740
+ },
741
+ "execution_count": 168,
742
+ "metadata": {},
743
+ "output_type": "execute_result"
744
+ }
745
+ ],
746
+ "source": [
747
+ "df = load_reddit_data(\"hub\",folder='data_scored')\n",
748
+ "\n",
749
+ "summary1 = df.groupby('filedate').subreddit.count()\n",
750
+ "\n",
751
+ "df_scored = load_reddit_data(\"hub\",folder='data_scored')\n",
752
+ "summary2 = df_scored.groupby('filedate').subreddit.count().loc[summary1.index]\n",
753
+ "\n",
754
+ "comparison = pd.DataFrame({\n",
755
+ " 'raw': summary1,\n",
756
+ " 'scored': summary2\n",
757
+ "})\n",
758
+ "\n",
759
+ "comparison['matching'] = comparison['raw'] == comparison['scored']\n",
760
+ "comparison"
761
+ ]
762
+ }
763
+ ],
764
+ "metadata": {
765
+ "kernelspec": {
766
+ "display_name": "Python [conda env:reddit_streamlit]",
767
+ "language": "python",
768
+ "name": "conda-env-reddit_streamlit-py"
769
+ },
770
+ "language_info": {
771
+ "codemirror_mode": {
772
+ "name": "ipython",
773
+ "version": 3
774
+ },
775
+ "file_extension": ".py",
776
+ "mimetype": "text/x-python",
777
+ "name": "python",
778
+ "nbconvert_exporter": "python",
779
+ "pygments_lexer": "ipython3",
780
+ "version": "3.12.10"
781
+ }
782
+ },
783
+ "nbformat": 4,
784
+ "nbformat_minor": 5
785
+ }
notebooks/post_analysis.ipynb ADDED
@@ -0,0 +1,567 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "7d423f7b-730c-4669-be82-c0a7141b7c76",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Analyze sentiment driving posts"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "83e1a76f-45a1-44d9-ae4a-62425a7af45d",
15
+ "metadata": {
16
+ "execution": {
17
+ "iopub.execute_input": "2025-06-06T18:19:44.555148Z",
18
+ "iopub.status.busy": "2025-06-06T18:19:44.555148Z",
19
+ "iopub.status.idle": "2025-06-06T18:19:45.754942Z",
20
+ "shell.execute_reply": "2025-06-06T18:19:45.754942Z",
21
+ "shell.execute_reply.started": "2025-06-06T18:19:44.555148Z"
22
+ }
23
+ },
24
+ "outputs": [],
25
+ "source": [
26
+ "import os\n",
27
+ "import glob\n",
28
+ "import datetime\n",
29
+ "from pathlib import Path\n",
30
+ "from dotenv import load_dotenv\n",
31
+ "import pandas as pd\n",
32
+ "import pyarrow\n",
33
+ "\n",
34
+ "from huggingface_hub import HfApi"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 2,
40
+ "id": "c808621b-f55a-4a80-8011-420c0be55151",
41
+ "metadata": {
42
+ "execution": {
43
+ "iopub.execute_input": "2025-06-06T18:25:14.801890Z",
44
+ "iopub.status.busy": "2025-06-06T18:25:14.801890Z",
45
+ "iopub.status.idle": "2025-06-06T18:25:14.811651Z",
46
+ "shell.execute_reply": "2025-06-06T18:25:14.811651Z",
47
+ "shell.execute_reply.started": "2025-06-06T18:25:14.801890Z"
48
+ }
49
+ },
50
+ "outputs": [],
51
+ "source": [
52
+ "\"\"\"\n",
53
+ "Download a single subreddit-day Parquet file from\n",
54
+ "`hblim/top_reddit_posts_daily/data_scored_subreddit/`.\n",
55
+ "\n",
56
+ "Prereqs\n",
57
+ "-------\n",
58
+ "pip install huggingface_hub pandas pyarrow\n",
59
+ "huggingface-cli login # or set HF_TOKEN\n",
60
+ "\"\"\"\n",
61
+ "\n",
62
+ "from __future__ import annotations\n",
63
+ "\n",
64
+ "import re\n",
65
+ "from pathlib import Path\n",
66
+ "from typing import Optional\n",
67
+ "\n",
68
+ "import pandas as pd\n",
69
+ "from huggingface_hub import HfApi, hf_hub_download\n",
70
+ "\n",
71
+ "\n",
72
+ "def _sanitize(sub: str) -> str:\n",
73
+ " \"\"\"\n",
74
+ " Apply the same cleaning rule that was used when the shards were created\n",
75
+ " (lowercase + replace any char that isn't 0-9, a-z, _, -, . with '_').\n",
76
+ " \"\"\"\n",
77
+ " return re.sub(r\"[^\\w\\-.]\", \"_\", sub.strip().lower())\n",
78
+ "\n",
79
+ "\n",
80
+ "def download_subreddit_day(\n",
81
+ " date_str: str, # \"YYYY-MM-DD\"\n",
82
+ " subreddit: str, # e.g. \"MachineLearning\"\n",
83
+ " repo_id: str = \"hblim/top_reddit_posts_daily\",\n",
84
+ " data_folder: str = \"data_scored_subreddit\",\n",
85
+ " out_dir: str | Path = \"downloads\",\n",
86
+ " token: Optional[str] = None,\n",
87
+ ") -> Path:\n",
88
+ " \"\"\"\n",
89
+ " Returns the local path of the downloaded Parquet file.\n",
90
+ "\n",
91
+ " Raises FileNotFoundError if the shard isn't on the Hub.\n",
92
+ " \"\"\"\n",
93
+ " api = HfApi(token=token)\n",
94
+ " safe_sub = _sanitize(subreddit)\n",
95
+ "\n",
96
+ " # remote path is exactly how the splitter wrote it: YYYY-MM-DD__sub.parquet\n",
97
+ " filename_in_repo = f\"{data_folder}/{date_str}__{safe_sub}.parquet\"\n",
98
+ "\n",
99
+ " # sanity check: make sure the file exists in the repo\n",
100
+ " if filename_in_repo not in api.list_repo_files(repo_id, repo_type=\"dataset\"):\n",
101
+ " raise FileNotFoundError(\n",
102
+ " f\"No shard named '{filename_in_repo}' in {repo_id}. \"\n",
103
+ " \"Maybe the date or subreddit is wrong?\"\n",
104
+ " )\n",
105
+ "\n",
106
+ " local_path = hf_hub_download(\n",
107
+ " repo_id=repo_id,\n",
108
+ " filename=filename_in_repo,\n",
109
+ " repo_type=\"dataset\",\n",
110
+ " cache_dir=str(Path(out_dir).expanduser()),\n",
111
+ " )\n",
112
+ " print(f\"✅ Downloaded to: {local_path}\")\n",
113
+ " return Path(local_path)"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 84,
119
+ "id": "e64146ea-4bdc-461b-9c27-99aaac5a50a8",
120
+ "metadata": {
121
+ "execution": {
122
+ "iopub.execute_input": "2025-06-07T01:10:47.720639Z",
123
+ "iopub.status.busy": "2025-06-07T01:10:47.720639Z",
124
+ "iopub.status.idle": "2025-06-07T01:10:48.845012Z",
125
+ "shell.execute_reply": "2025-06-07T01:10:48.845012Z",
126
+ "shell.execute_reply.started": "2025-06-07T01:10:47.720639Z"
127
+ }
128
+ },
129
+ "outputs": [
130
+ {
131
+ "data": {
132
+ "application/vnd.jupyter.widget-view+json": {
133
+ "model_id": "a07b9648bd3b4454ad05b564e304ca76",
134
+ "version_major": 2,
135
+ "version_minor": 0
136
+ },
137
+ "text/plain": [
138
+ "2025-06-06__localllama.parquet: 0%| | 0.00/69.2k [00:00<?, ?B/s]"
139
+ ]
140
+ },
141
+ "metadata": {},
142
+ "output_type": "display_data"
143
+ },
144
+ {
145
+ "name": "stdout",
146
+ "output_type": "stream",
147
+ "text": [
148
+ "✅ Downloaded to: downloads\\datasets--hblim--top_reddit_posts_daily\\snapshots\\5fc94d45ca6e670268f2e505350bbc08ec7d5d84\\data_scored_subreddit\\2025-06-06__localllama.parquet\n"
149
+ ]
150
+ }
151
+ ],
152
+ "source": [
153
+ "subreddit = 'localllama'\n",
154
+ "date = '2025-06-06'\n",
155
+ "path = download_subreddit_day(\n",
156
+ " date_str=date,\n",
157
+ " subreddit=subreddit)\n",
158
+ "df = pd.read_parquet(path)"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 85,
164
+ "id": "6c41a5bc-0169-491b-ac26-bc7edae8f852",
165
+ "metadata": {
166
+ "execution": {
167
+ "iopub.execute_input": "2025-06-07T01:10:49.801513Z",
168
+ "iopub.status.busy": "2025-06-07T01:10:49.801513Z",
169
+ "iopub.status.idle": "2025-06-07T01:10:49.851213Z",
170
+ "shell.execute_reply": "2025-06-07T01:10:49.851213Z",
171
+ "shell.execute_reply.started": "2025-06-07T01:10:49.801513Z"
172
+ }
173
+ },
174
+ "outputs": [
175
+ {
176
+ "name": "stderr",
177
+ "output_type": "stream",
178
+ "text": [
179
+ "C:\\Users\\halst\\AppData\\Local\\Temp\\ipykernel_23912\\1682697236.py:32: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
180
+ " thread_metrics = grouped.apply(lambda group: pd.Series({\n"
181
+ ]
182
+ }
183
+ ],
184
+ "source": [
185
+ "import pandas as pd\n",
186
+ "import numpy as np\n",
187
+ "\n",
188
+ "# Assume 'df' is already loaded in the notebook, e.g.:\n",
189
+ "# df = pd.read_csv(\"my_reddit_day.csv\")\n",
190
+ "\n",
191
+ "def compute_metrics_for_df(df, gamma_post=0.3):\n",
192
+ " # 1. Ensure 'score' is numeric\n",
193
+ " df['score_num'] = pd.to_numeric(df['score'], errors='coerce').fillna(0)\n",
194
+ "\n",
195
+ " # 2. Compute weights: log-scaled by score, with a lower multiplier for posts\n",
196
+ " weights = (1 + np.log1p(df['score_num'].clip(lower=0)))\n",
197
+ " weights *= np.where(df['type'] == 'post', gamma_post, 1.0)\n",
198
+ " df['weight'] = weights\n",
199
+ "\n",
200
+ " # 3. Compute a thread_id for each row\n",
201
+ " def thread_id(row):\n",
202
+ " if row['type'] == 'post':\n",
203
+ " return str(row['post_id'])\n",
204
+ " pid = row['parent_id']\n",
205
+ " if isinstance(pid, str) and pid.startswith('t3_'):\n",
206
+ " return pid[3:]\n",
207
+ " return str(pid)\n",
208
+ "\n",
209
+ " df['thread_id'] = df.apply(thread_id, axis=1)\n",
210
+ "\n",
211
+ " # 4. Overall daily weighted sentiment (EAS)\n",
212
+ " day_eas = (df['weight'] * df['sentiment']).sum() / df['weight'].sum()\n",
213
+ "\n",
214
+ " # 5. Per-thread metrics\n",
215
+ " grouped = df.groupby('thread_id')\n",
216
+ " thread_metrics = grouped.apply(lambda group: pd.Series({\n",
217
+ " 'eas': (group['weight'] * group['sentiment']).sum() / group['weight'].sum(),\n",
218
+ " 'tot_weight': group['weight'].sum(),\n",
219
+ " 'title': (\n",
220
+ " group.loc[group['type'] == 'post', 'text']\n",
221
+ " .iloc[0]\n",
222
+ " if (group['type'] == 'post').any()\n",
223
+ " else ''\n",
224
+ " )\n",
225
+ " })).reset_index()\n",
226
+ "\n",
227
+ " # 6. Contribution: how much each thread shifts the day sentiment from 0.5\n",
228
+ " thread_metrics['contrib'] = thread_metrics['tot_weight'] * (thread_metrics['eas'] - 0.5)\n",
229
+ "\n",
230
+ " return day_eas, thread_metrics\n",
231
+ "\n",
232
+ "# === Example usage on your preloaded DataFrame ===\n",
233
+ "day_eas_value, thread_df = compute_metrics_for_df(df)\n",
234
+ "\n",
235
+ "# 7. Show the overall daily sentiment\n",
236
+ "daily_summary = pd.DataFrame([{\n",
237
+ " 'weighted_sentiment (EAS)': round(day_eas_value, 3)\n",
238
+ "}])\n",
239
+ "daily_summary\n",
240
+ "\n",
241
+ "thread_top_pos = thread_df.sort_values('contrib', ascending=False).head(5).copy()\n",
242
+ "thread_top_neg = thread_df.sort_values('contrib').head(5).copy()\n"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 86,
248
+ "id": "7c32258e-14b4-42d4-a535-b2598e19f968",
249
+ "metadata": {
250
+ "execution": {
251
+ "iopub.execute_input": "2025-06-07T01:11:03.484174Z",
252
+ "iopub.status.busy": "2025-06-07T01:11:03.480647Z",
253
+ "iopub.status.idle": "2025-06-07T01:11:03.528587Z",
254
+ "shell.execute_reply": "2025-06-07T01:11:03.528587Z",
255
+ "shell.execute_reply.started": "2025-06-07T01:11:03.484174Z"
256
+ }
257
+ },
258
+ "outputs": [
259
+ {
260
+ "name": "stderr",
261
+ "output_type": "stream",
262
+ "text": [
263
+ "C:\\Users\\halst\\AppData\\Local\\Temp\\ipykernel_23912\\1682697236.py:32: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
264
+ " thread_metrics = grouped.apply(lambda group: pd.Series({\n"
265
+ ]
266
+ },
267
+ {
268
+ "data": {
269
+ "text/html": [
270
+ "<div>\n",
271
+ "<style scoped>\n",
272
+ " .dataframe tbody tr th:only-of-type {\n",
273
+ " vertical-align: middle;\n",
274
+ " }\n",
275
+ "\n",
276
+ " .dataframe tbody tr th {\n",
277
+ " vertical-align: top;\n",
278
+ " }\n",
279
+ "\n",
280
+ " .dataframe thead th {\n",
281
+ " text-align: right;\n",
282
+ " }\n",
283
+ "</style>\n",
284
+ "<table border=\"1\" class=\"dataframe\">\n",
285
+ " <thead>\n",
286
+ " <tr style=\"text-align: right;\">\n",
287
+ " <th></th>\n",
288
+ " <th>weighted_sentiment (EAS)</th>\n",
289
+ " </tr>\n",
290
+ " </thead>\n",
291
+ " <tbody>\n",
292
+ " <tr>\n",
293
+ " <th>0</th>\n",
294
+ " <td>0.3186</td>\n",
295
+ " </tr>\n",
296
+ " </tbody>\n",
297
+ "</table>\n",
298
+ "</div>"
299
+ ],
300
+ "text/plain": [
301
+ " weighted_sentiment (EAS)\n",
302
+ "0 0.3186"
303
+ ]
304
+ },
305
+ "execution_count": 86,
306
+ "metadata": {},
307
+ "output_type": "execute_result"
308
+ }
309
+ ],
310
+ "source": [
311
+ "# === Example usage on your preloaded DataFrame ===\n",
312
+ "day_eas_value, thread_df = compute_metrics_for_df(df)\n",
313
+ "\n",
314
+ "# 7. Show the overall daily sentiment\n",
315
+ "daily_summary = pd.DataFrame([{\n",
316
+ " 'weighted_sentiment (EAS)': round(day_eas_value, 4)\n",
317
+ "}])\n",
318
+ "daily_summary"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": 87,
324
+ "id": "823deb32-cce6-4b1f-aba2-bebdf1645b6e",
325
+ "metadata": {
326
+ "execution": {
327
+ "iopub.execute_input": "2025-06-07T01:11:45.194222Z",
328
+ "iopub.status.busy": "2025-06-07T01:11:45.188532Z",
329
+ "iopub.status.idle": "2025-06-07T01:11:45.198360Z",
330
+ "shell.execute_reply": "2025-06-07T01:11:45.198360Z",
331
+ "shell.execute_reply.started": "2025-06-07T01:11:45.194222Z"
332
+ }
333
+ },
334
+ "outputs": [],
335
+ "source": [
336
+ "# 8. Extract top 5 positive-contribution threads and top 5 negative-contribution threads\n",
337
+ "thread_top_pos = thread_df.sort_values('contrib', ascending=False).head(5).copy()\n",
338
+ "thread_top_neg = thread_df.sort_values('contrib').head(5).copy()\n",
339
+ "\n",
340
+ "# (Optionally) truncate titles for display\n",
341
+ "# thread_top_pos['title'] = thread_top_pos['title'].str.slice(0, 90)\n",
342
+ "# thread_top_neg['title'] = thread_top_neg['title'].str.slice(0, 90)"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 89,
348
+ "id": "085652c4-b599-4d7b-bc64-e3a464d3d72c",
349
+ "metadata": {
350
+ "execution": {
351
+ "iopub.execute_input": "2025-06-07T01:12:18.195083Z",
352
+ "iopub.status.busy": "2025-06-07T01:12:18.194068Z",
353
+ "iopub.status.idle": "2025-06-07T01:12:18.201898Z",
354
+ "shell.execute_reply": "2025-06-07T01:12:18.201898Z",
355
+ "shell.execute_reply.started": "2025-06-07T01:12:18.195083Z"
356
+ }
357
+ },
358
+ "outputs": [
359
+ {
360
+ "data": {
361
+ "text/html": [
362
+ "<div>\n",
363
+ "<style scoped>\n",
364
+ " .dataframe tbody tr th:only-of-type {\n",
365
+ " vertical-align: middle;\n",
366
+ " }\n",
367
+ "\n",
368
+ " .dataframe tbody tr th {\n",
369
+ " vertical-align: top;\n",
370
+ " }\n",
371
+ "\n",
372
+ " .dataframe thead th {\n",
373
+ " text-align: right;\n",
374
+ " }\n",
375
+ "</style>\n",
376
+ "<table border=\"1\" class=\"dataframe\">\n",
377
+ " <thead>\n",
378
+ " <tr style=\"text-align: right;\">\n",
379
+ " <th></th>\n",
380
+ " <th>title</th>\n",
381
+ " <th>eas</th>\n",
382
+ " <th>tot_weight</th>\n",
383
+ " </tr>\n",
384
+ " </thead>\n",
385
+ " <tbody>\n",
386
+ " <tr>\n",
387
+ " <th>32</th>\n",
388
+ " <td>Is this the largest \"No synthetic data\" open weight LLM? (142B)\\n\\nFrom the GitHub page of https://huggingface.co/rednote-hilab/dots.llm1.base</td>\n",
389
+ " <td>0.579431</td>\n",
390
+ " <td>28.660264</td>\n",
391
+ " </tr>\n",
392
+ " <tr>\n",
393
+ " <th>14</th>\n",
394
+ " <td>Tokasaurus: An LLM Inference Engine for High-Throughput Workloads\\n\\n</td>\n",
395
+ " <td>0.740024</td>\n",
396
+ " <td>8.072325</td>\n",
397
+ " </tr>\n",
398
+ " <tr>\n",
399
+ " <th>21</th>\n",
400
+ " <td>Real-time conversation with a character on your local machine\\n\\nAnd also the voice split function\\n\\nSorry for my English =)</td>\n",
401
+ " <td>0.551828</td>\n",
402
+ " <td>30.763515</td>\n",
403
+ " </tr>\n",
404
+ " <tr>\n",
405
+ " <th>37</th>\n",
406
+ " <td>Offline verbal chat bot with modular tool calling!\\n\\nThis is an update from my original [post](https://www.reddit.com/r/LocalLLaMA/comments/1l2vrg2/fully_offline_verbal_chat_bot/) where I demoed my fully offline verbal chat bot. I've made a couple updates, and should be releasing it on github soon. \\n\\- Clipboard insertion: allows you to insert your clipboard to the prompt with just a key press \\n\\- Modular tool calling: allows the model to use tools that can be drag and dropped into a folder\\n\\nTo clarify how tool calling works: Behind the scenes the program parses the json headers of all files in the tools folder at startup, and then passes them along with the users message. This means you can simply drag and drop a tool, restart the app, and use it.\\n\\nPlease leave suggestions and ask any questions you might have!</td>\n",
407
+ " <td>0.764096</td>\n",
408
+ " <td>4.431766</td>\n",
409
+ " </tr>\n",
410
+ " <tr>\n",
411
+ " <th>31</th>\n",
412
+ " <td>I thought Qwen3 was putting out some questionable content into my code...\\n\\nOh. \\*\\*SOLVED.\\*\\* See why, I think, at the end.\\n\\nOkay, so I was trying \\`aider\\`. Only tried a bit here and there, but I just switched to using \\`Qwen\\_Qwen3-14B-Q6\\_K\\_L.gguf\\`. And I see this in my aider output:\\n\\n\\`\\`\\`text \\n\\## Signoff: insurgent (razzin' frazzin' motherfu... stupid directx...) \\n\\`\\`\\` \\nNow, please bear in mind, this is script that plots timestamps, like \\`ls | plottimes\\` and, aside from plotting time data as a \\`heatmap\\`, it has no special war or battle terminology, nor profane language in it. I am not familiar with this thing to know where or how that was generated, since it SEEMS to be from a trial run aider did of the code:\\n\\nhttps://preview.redd.it/zamjz1bdsb5f1.jpg?width=719&amp;format=pjpg&amp;auto=webp&amp;s=5ca874f91bdd6fe7fc20f4eb797e5ddc22500dec\\n\\nBut, that seems to be the code running -- not LLM output directly.\\n\\nOdd!\\n\\n...scrolling back to see what's up there:\\n\\n...</td>\n",
413
+ " <td>0.719805</td>\n",
414
+ " <td>4.278161</td>\n",
415
+ " </tr>\n",
416
+ " </tbody>\n",
417
+ "</table>\n",
418
+ "</div>"
419
+ ],
420
+ "text/plain": [
421
+ " title \\\n",
422
+ "32 Is this the largest \"No synthetic data\" open weight LLM? (142B)\\n\\nFrom the GitHub page of https://huggingface.co/rednote-hilab/dots.llm1.base \n",
423
+ "14 Tokasaurus: An LLM Inference Engine for High-Throughput Workloads\\n\\n \n",
424
+ "21 Real-time conversation with a character on your local machine\\n\\nAnd also the voice split function\\n\\nSorry for my English =) \n",
425
+ "37 Offline verbal chat bot with modular tool calling!\\n\\nThis is an update from my original [post](https://www.reddit.com/r/LocalLLaMA/comments/1l2vrg2/fully_offline_verbal_chat_bot/) where I demoed my fully offline verbal chat bot. I've made a couple updates, and should be releasing it on github soon. \\n\\- Clipboard insertion: allows you to insert your clipboard to the prompt with just a key press \\n\\- Modular tool calling: allows the model to use tools that can be drag and dropped into a folder\\n\\nTo clarify how tool calling works: Behind the scenes the program parses the json headers of all files in the tools folder at startup, and then passes them along with the users message. This means you can simply drag and drop a tool, restart the app, and use it.\\n\\nPlease leave suggestions and ask any questions you might have! \n",
426
+ "31 I thought Qwen3 was putting out some questionable content into my code...\\n\\nOh. \\*\\*SOLVED.\\*\\* See why, I think, at the end.\\n\\nOkay, so I was trying \\`aider\\`. Only tried a bit here and there, but I just switched to using \\`Qwen\\_Qwen3-14B-Q6\\_K\\_L.gguf\\`. And I see this in my aider output:\\n\\n\\`\\`\\`text \\n\\## Signoff: insurgent (razzin' frazzin' motherfu... stupid directx...) \\n\\`\\`\\` \\nNow, please bear in mind, this is script that plots timestamps, like \\`ls | plottimes\\` and, aside from plotting time data as a \\`heatmap\\`, it has no special war or battle terminology, nor profane language in it. I am not familiar with this thing to know where or how that was generated, since it SEEMS to be from a trial run aider did of the code:\\n\\nhttps://preview.redd.it/zamjz1bdsb5f1.jpg?width=719&format=pjpg&auto=webp&s=5ca874f91bdd6fe7fc20f4eb797e5ddc22500dec\\n\\nBut, that seems to be the code running -- not LLM output directly.\\n\\nOdd!\\n\\n...scrolling back to see what's up there:\\n\\n... \n",
427
+ "\n",
428
+ " eas tot_weight \n",
429
+ "32 0.579431 28.660264 \n",
430
+ "14 0.740024 8.072325 \n",
431
+ "21 0.551828 30.763515 \n",
432
+ "37 0.764096 4.431766 \n",
433
+ "31 0.719805 4.278161 "
434
+ ]
435
+ },
436
+ "execution_count": 89,
437
+ "metadata": {},
438
+ "output_type": "execute_result"
439
+ }
440
+ ],
441
+ "source": [
442
+ "thread_top_pos[['title', 'eas', 'tot_weight']]"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 90,
448
+ "id": "4b925100-a077-4178-a826-721677f5461d",
449
+ "metadata": {
450
+ "execution": {
451
+ "iopub.execute_input": "2025-06-07T01:12:18.509027Z",
452
+ "iopub.status.busy": "2025-06-07T01:12:18.509027Z",
453
+ "iopub.status.idle": "2025-06-07T01:12:18.520061Z",
454
+ "shell.execute_reply": "2025-06-07T01:12:18.519048Z",
455
+ "shell.execute_reply.started": "2025-06-07T01:12:18.509027Z"
456
+ }
457
+ },
458
+ "outputs": [
459
+ {
460
+ "data": {
461
+ "text/html": [
462
+ "<div>\n",
463
+ "<style scoped>\n",
464
+ " .dataframe tbody tr th:only-of-type {\n",
465
+ " vertical-align: middle;\n",
466
+ " }\n",
467
+ "\n",
468
+ " .dataframe tbody tr th {\n",
469
+ " vertical-align: top;\n",
470
+ " }\n",
471
+ "\n",
472
+ " .dataframe thead th {\n",
473
+ " text-align: right;\n",
474
+ " }\n",
475
+ "</style>\n",
476
+ "<table border=\"1\" class=\"dataframe\">\n",
477
+ " <thead>\n",
478
+ " <tr style=\"text-align: right;\">\n",
479
+ " <th></th>\n",
480
+ " <th>title</th>\n",
481
+ " <th>eas</th>\n",
482
+ " <th>tot_weight</th>\n",
483
+ " </tr>\n",
484
+ " </thead>\n",
485
+ " <tbody>\n",
486
+ " <tr>\n",
487
+ " <th>23</th>\n",
488
+ " <td>Cannot even run the smallest model on system RAM?\\n\\nI am a bit confused. I am trying to run small LLMs on my Unraid server within the Ollama docker, using just the CPU and 16GB of system RAM.\\n\\nGot Ollama up and running, but even when pulling the smallest models like Qwen 3 0.6B with Q4\\_K\\_M quantization, Ollama tells me I need way more RAM than I have left to spare. Why is that? Should this model not be running on any potato? Does this have to do with context overhead?\\n\\n \\nSorry if this is a stupid question, I am trying to learn more about this and cannot find the solution anywhere else.</td>\n",
489
+ " <td>0.000000</td>\n",
490
+ " <td>23.823146</td>\n",
491
+ " </tr>\n",
492
+ " <tr>\n",
493
+ " <th>36</th>\n",
494
+ " <td>what's the case against flash attention?\\n\\nI accidently stumbled upon the -fa (flash attention) flag in llama.cpp's llama-server. I cannot speak to the speedup in performence as i haven't properly tested it, but the memory optimization is huge: 8B-F16-gguf model with 100k fit comfortably in 32GB vram gpu with some 2-3 GB to spare.\\n\\nA very brief search revealed that flash attention theoretically computes the same mathematical function, and in practice benchmarks show no change in the model's output quality.\\n\\nSo my question is, is flash attention really just free lunch? what's the catch? why is it not enabled by default?</td>\n",
495
+ " <td>0.000000</td>\n",
496
+ " <td>22.075726</td>\n",
497
+ " </tr>\n",
498
+ " <tr>\n",
499
+ " <th>17</th>\n",
500
+ " <td>It is possble to run non-reasoning deepseek-r1-0528?\\n\\nI know, stupid question, but couldn't find an answer to it!</td>\n",
501
+ " <td>0.000000</td>\n",
502
+ " <td>17.520515</td>\n",
503
+ " </tr>\n",
504
+ " <tr>\n",
505
+ " <th>13</th>\n",
506
+ " <td>Can a model be so radically altered that its origin can no longer be recognized? YES!\\n\\n**Phi-lthy4**( [https://huggingface.co/SicariusSicariiStuff/Phi-lthy4](https://huggingface.co/SicariusSicariiStuff/Phi-lthy4) ) has been consistently described as **exceptionally unique** by all who have tested it, **almost devoid of SLOP**, and it is now widely regarded as the **most unique roleplay model available**. It underwent an intensive continued pretraining (CPT) phase, extensive supervised fine-tuning (SFT) on high-quality organic datasets, and leveraged advanced techniques including model merging, parameter pruning, and upscaling.\\n\\nInterestingly, this distinctiveness was validated in a recent paper: [*Gradient-Based Model Fingerprinting for LLM Similarity Detection and Family Classification*](https://arxiv.org/html/2506.01631v1). Among a wide array of models tested, this one stood out as **unclassifiable** by traditional architecture-based fingerprinting—highlighting the extent of ...</td>\n",
507
+ " <td>0.211321</td>\n",
508
+ " <td>27.502412</td>\n",
509
+ " </tr>\n",
510
+ " <tr>\n",
511
+ " <th>12</th>\n",
512
+ " <td>China's Rednote Open-source dots.llm performance &amp; cost\\n\\n\\nhttps://github.com/rednote-hilab/dots.llm1/blob/main/dots1_tech_report.pdf</td>\n",
513
+ " <td>0.000000</td>\n",
514
+ " <td>15.465402</td>\n",
515
+ " </tr>\n",
516
+ " </tbody>\n",
517
+ "</table>\n",
518
+ "</div>"
519
+ ],
520
+ "text/plain": [
521
+ " title \\\n",
522
+ "23 Cannot even run the smallest model on system RAM?\\n\\nI am a bit confused. I am trying to run small LLMs on my Unraid server within the Ollama docker, using just the CPU and 16GB of system RAM.\\n\\nGot Ollama up and running, but even when pulling the smallest models like Qwen 3 0.6B with Q4\\_K\\_M quantization, Ollama tells me I need way more RAM than I have left to spare. Why is that? Should this model not be running on any potato? Does this have to do with context overhead?\\n\\n \\nSorry if this is a stupid question, I am trying to learn more about this and cannot find the solution anywhere else. \n",
523
+ "36 what's the case against flash attention?\\n\\nI accidently stumbled upon the -fa (flash attention) flag in llama.cpp's llama-server. I cannot speak to the speedup in performence as i haven't properly tested it, but the memory optimization is huge: 8B-F16-gguf model with 100k fit comfortably in 32GB vram gpu with some 2-3 GB to spare.\\n\\nA very brief search revealed that flash attention theoretically computes the same mathematical function, and in practice benchmarks show no change in the model's output quality.\\n\\nSo my question is, is flash attention really just free lunch? what's the catch? why is it not enabled by default? \n",
524
+ "17 It is possble to run non-reasoning deepseek-r1-0528?\\n\\nI know, stupid question, but couldn't find an answer to it! \n",
525
+ "13 Can a model be so radically altered that its origin can no longer be recognized? YES!\\n\\n**Phi-lthy4**( [https://huggingface.co/SicariusSicariiStuff/Phi-lthy4](https://huggingface.co/SicariusSicariiStuff/Phi-lthy4) ) has been consistently described as **exceptionally unique** by all who have tested it, **almost devoid of SLOP**, and it is now widely regarded as the **most unique roleplay model available**. It underwent an intensive continued pretraining (CPT) phase, extensive supervised fine-tuning (SFT) on high-quality organic datasets, and leveraged advanced techniques including model merging, parameter pruning, and upscaling.\\n\\nInterestingly, this distinctiveness was validated in a recent paper: [*Gradient-Based Model Fingerprinting for LLM Similarity Detection and Family Classification*](https://arxiv.org/html/2506.01631v1). Among a wide array of models tested, this one stood out as **unclassifiable** by traditional architecture-based fingerprinting—highlighting the extent of ... \n",
526
+ "12 China's Rednote Open-source dots.llm performance & cost\\n\\n\\nhttps://github.com/rednote-hilab/dots.llm1/blob/main/dots1_tech_report.pdf \n",
527
+ "\n",
528
+ " eas tot_weight \n",
529
+ "23 0.000000 23.823146 \n",
530
+ "36 0.000000 22.075726 \n",
531
+ "17 0.000000 17.520515 \n",
532
+ "13 0.211321 27.502412 \n",
533
+ "12 0.000000 15.465402 "
534
+ ]
535
+ },
536
+ "execution_count": 90,
537
+ "metadata": {},
538
+ "output_type": "execute_result"
539
+ }
540
+ ],
541
+ "source": [
542
+ "thread_top_neg[['title', 'eas', 'tot_weight']]"
543
+ ]
544
+ }
545
+ ],
546
+ "metadata": {
547
+ "kernelspec": {
548
+ "display_name": "Python [conda env:reddit_streamlit]",
549
+ "language": "python",
550
+ "name": "conda-env-reddit_streamlit-py"
551
+ },
552
+ "language_info": {
553
+ "codemirror_mode": {
554
+ "name": "ipython",
555
+ "version": 3
556
+ },
557
+ "file_extension": ".py",
558
+ "mimetype": "text/x-python",
559
+ "name": "python",
560
+ "nbconvert_exporter": "python",
561
+ "pygments_lexer": "ipython3",
562
+ "version": "3.12.10"
563
+ }
564
+ },
565
+ "nbformat": 4,
566
+ "nbformat_minor": 5
567
+ }
notebooks/split_data_scored.ipynb ADDED
@@ -0,0 +1,2798 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "e94d29a3-2c94-4131-b951-8604613cdd63",
6
+ "metadata": {},
7
+ "source": [
8
+ "Split daily Reddit Parquet shards by subreddit and re-upload."
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 7,
14
+ "id": "ccdb349a-bfeb-428b-a2be-fa8da62ad644",
15
+ "metadata": {
16
+ "execution": {
17
+ "iopub.execute_input": "2025-06-05T20:35:51.119431Z",
18
+ "iopub.status.busy": "2025-06-05T20:35:51.117431Z",
19
+ "iopub.status.idle": "2025-06-05T20:35:54.082300Z",
20
+ "shell.execute_reply": "2025-06-05T20:35:54.082300Z",
21
+ "shell.execute_reply.started": "2025-06-05T20:35:51.119431Z"
22
+ }
23
+ },
24
+ "outputs": [
25
+ {
26
+ "name": "stdout",
27
+ "output_type": "stream",
28
+ "text": [
29
+ "Requirement already satisfied: huggingface_hub in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (0.30.2)\n",
30
+ "Collecting huggingface_hub\n",
31
+ " Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)\n",
32
+ "Requirement already satisfied: filelock in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (3.13.1)\n",
33
+ "Requirement already satisfied: fsspec>=2023.5.0 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (2024.6.1)\n",
34
+ "Requirement already satisfied: packaging>=20.9 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (24.2)\n",
35
+ "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (6.0.2)\n",
36
+ "Requirement already satisfied: requests in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (2.32.3)\n",
37
+ "Requirement already satisfied: tqdm>=4.42.1 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (4.67.1)\n",
38
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (4.12.2)\n",
39
+ "Requirement already satisfied: colorama in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from tqdm>=4.42.1->huggingface_hub) (0.4.6)\n",
40
+ "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from requests->huggingface_hub) (3.4.1)\n",
41
+ "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from requests->huggingface_hub) (3.10)\n",
42
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from requests->huggingface_hub) (2.4.0)\n",
43
+ "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from requests->huggingface_hub) (2025.1.31)\n",
44
+ "Downloading huggingface_hub-0.32.4-py3-none-any.whl (512 kB)\n",
45
+ "Installing collected packages: huggingface_hub\n",
46
+ " Attempting uninstall: huggingface_hub\n",
47
+ " Found existing installation: huggingface-hub 0.30.2\n",
48
+ " Uninstalling huggingface-hub-0.30.2:\n",
49
+ " Successfully uninstalled huggingface-hub-0.30.2\n",
50
+ "Successfully installed huggingface_hub-0.32.4\n"
51
+ ]
52
+ }
53
+ ],
54
+ "source": [
55
+ "!pip install -q pyarrow fastparquet\n",
56
+ "!pip install -U huggingface_hub"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 11,
62
+ "id": "8fe6bfff-770f-4237-868b-10099ab9468c",
63
+ "metadata": {
64
+ "execution": {
65
+ "iopub.execute_input": "2025-06-05T20:41:40.658262Z",
66
+ "iopub.status.busy": "2025-06-05T20:41:40.658262Z",
67
+ "iopub.status.idle": "2025-06-05T20:41:40.667028Z",
68
+ "shell.execute_reply": "2025-06-05T20:41:40.667028Z",
69
+ "shell.execute_reply.started": "2025-06-05T20:41:40.658262Z"
70
+ }
71
+ },
72
+ "outputs": [
73
+ {
74
+ "data": {
75
+ "text/plain": [
76
+ "True"
77
+ ]
78
+ },
79
+ "execution_count": 11,
80
+ "metadata": {},
81
+ "output_type": "execute_result"
82
+ }
83
+ ],
84
+ "source": [
85
+ "from __future__ import annotations\n",
86
+ "\n",
87
+ "import os\n",
88
+ "import re\n",
89
+ "import shutil\n",
90
+ "import tempfile\n",
91
+ "from pathlib import Path\n",
92
+ "from typing import Iterable\n",
93
+ "\n",
94
+ "import pandas as pd\n",
95
+ "from huggingface_hub import HfApi, hf_hub_download, CommitOperationAdd\n",
96
+ "from tqdm.auto import tqdm\n",
97
+ "\n",
98
+ "from dotenv import load_dotenv\n",
99
+ "\n",
100
+ "load_dotenv()"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 12,
106
+ "id": "3fb82f27-d6ee-4f18-b2eb-86edcdc505db",
107
+ "metadata": {
108
+ "execution": {
109
+ "iopub.execute_input": "2025-06-05T20:41:41.607705Z",
110
+ "iopub.status.busy": "2025-06-05T20:41:41.607705Z",
111
+ "iopub.status.idle": "2025-06-05T20:41:41.625213Z",
112
+ "shell.execute_reply": "2025-06-05T20:41:41.625213Z",
113
+ "shell.execute_reply.started": "2025-06-05T20:41:41.607705Z"
114
+ }
115
+ },
116
+ "outputs": [],
117
+ "source": [
118
+ "def _sanitize(name: str) -> str:\n",
119
+ " \"\"\"\n",
120
+ " Make subreddit safe for filenames (removes slashes, spaces, etc.).\n",
121
+ " \"\"\"\n",
122
+ " name = name.strip().lower()\n",
123
+ " name = re.sub(r\"[^\\w\\-\\.]\", \"_\", name) # keep letters, numbers, _, -, .\n",
124
+ " return name\n",
125
+ " \n",
126
+ "def split_and_upload_by_subreddit(\n",
127
+ " repo_id: str = \"hblim/top_reddit_posts_daily\",\n",
128
+ " source_folder: str = \"data_scored\",\n",
129
+ " target_folder: str = \"data_scored_subreddit\",\n",
130
+ " overwrite: bool = False,\n",
131
+ " batch_size: int = 20,\n",
132
+ " token: str | None = None,\n",
133
+ ") -> None:\n",
134
+ " \"\"\"\n",
135
+ " For every Parquet in `source_folder`, create one Parquet per subreddit\n",
136
+ " and upload to `target_folder`.\n",
137
+ "\n",
138
+ " Parameters\n",
139
+ " ----------\n",
140
+ " repo_id : str\n",
141
+ " Hugging Face dataset repo id.\n",
142
+ " source_folder : str\n",
143
+ " Folder that already contains the daily Parquet files.\n",
144
+ " target_folder : str\n",
145
+ " New folder to hold subreddit-level Parquet shards.\n",
146
+ " overwrite : bool\n",
147
+ " Re-process / re-upload even if the target file already exists.\n",
148
+ " batch_size : int\n",
149
+ " Upload this many files per commit (reduces commit spam).\n",
150
+ " token : str | None\n",
151
+ " HF token; if None, uses the one stored by `huggingface-cli login`.\n",
152
+ " \"\"\"\n",
153
+ " api = HfApi(token=token)\n",
154
+ "\n",
155
+ " # 1. discover daily Parquet files in the repo\n",
156
+ " files_in_repo: Iterable[str] = api.list_repo_files(repo_id, repo_type=\"dataset\")\n",
157
+ " daily_files = sorted(\n",
158
+ " f for f in files_in_repo if f.startswith(source_folder) and f.endswith(\".parquet\")\n",
159
+ " )\n",
160
+ " if not daily_files:\n",
161
+ " raise RuntimeError(f\"No Parquet files found in {source_folder}\")\n",
162
+ "\n",
163
+ " print(f\"Found {len(daily_files)} daily shards in {source_folder}\")\n",
164
+ "\n",
165
+ " with tempfile.TemporaryDirectory() as tmp_dir:\n",
166
+ " tmp_dir = Path(tmp_dir)\n",
167
+ "\n",
168
+ " upload_queue: list[tuple[Path, str]] = []\n",
169
+ " pbar = tqdm(daily_files, desc=\"processing days\", unit=\"file\")\n",
170
+ "\n",
171
+ " for remote_path in pbar:\n",
172
+ " file_date = Path(remote_path).stem # e.g. 2025-05-31\n",
173
+ " local_path = hf_hub_download(\n",
174
+ " repo_id=repo_id,\n",
175
+ " filename=remote_path,\n",
176
+ " repo_type=\"dataset\",\n",
177
+ " cache_dir=tmp_dir, # keep inside temp dir\n",
178
+ " )\n",
179
+ " df = pd.read_parquet(local_path)\n",
180
+ "\n",
181
+ " # 2. split by subreddit\n",
182
+ " for subreddit, sub_df in df.groupby(\"subreddit\", sort=False):\n",
183
+ " safe_sub = _sanitize(subreddit)\n",
184
+ " out_fname = f\"{file_date}__{safe_sub}.parquet\"\n",
185
+ " out_repo_path = f\"{target_folder}/{out_fname}\"\n",
186
+ "\n",
187
+ " # skip if already in repo and not overwriting\n",
188
+ " if not overwrite and out_repo_path in files_in_repo:\n",
189
+ " continue\n",
190
+ "\n",
191
+ " out_local = tmp_dir / out_fname\n",
192
+ " sub_df.to_parquet(out_local, index=False)\n",
193
+ " upload_queue.append((out_local, out_repo_path))\n",
194
+ "\n",
195
+ " # upload in batches to reduce commit churn\n",
196
+ " if len(upload_queue) >= batch_size:\n",
197
+ " _flush_upload_queue(api, repo_id, upload_queue)\n",
198
+ " upload_queue.clear()\n",
199
+ "\n",
200
+ " # flush any leftovers\n",
201
+ " if upload_queue:\n",
202
+ " _flush_upload_queue(api, repo_id, upload_queue)\n",
203
+ "\n",
204
+ " print(\"✅ Done – all subreddit shards uploaded.\")\n",
205
+ "\n",
206
+ "\n",
207
+ "def _flush_upload_queue(api: HfApi, repo_id: str,\n",
208
+ " queue: list[tuple[Path, str]]) -> None:\n",
209
+ " \"\"\"Upload a batch of files in one commit (works on ≥0.28).\"\"\"\n",
210
+ " if not queue:\n",
211
+ " return\n",
212
+ "\n",
213
+ " ops = [\n",
214
+ " CommitOperationAdd(\n",
215
+ " path_in_repo=dst, # where the file will live in the repo\n",
216
+ " path_or_fileobj=str(src) # local temp file\n",
217
+ " )\n",
218
+ " for src, dst in queue\n",
219
+ " ]\n",
220
+ "\n",
221
+ " api.create_commit(\n",
222
+ " repo_id=repo_id,\n",
223
+ " repo_type=\"dataset\",\n",
224
+ " operations=ops,\n",
225
+ " commit_message=f\"Add {len(queue)} subreddit parquet file(s)\",\n",
226
+ " )"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 13,
232
+ "id": "d8f29912-98b1-4e37-bff5-f3cfff2170d3",
233
+ "metadata": {
234
+ "execution": {
235
+ "iopub.execute_input": "2025-06-05T20:41:42.177374Z",
236
+ "iopub.status.busy": "2025-06-05T20:41:42.177374Z",
237
+ "iopub.status.idle": "2025-06-05T20:42:06.076189Z",
238
+ "shell.execute_reply": "2025-06-05T20:42:06.075678Z",
239
+ "shell.execute_reply.started": "2025-06-05T20:41:42.177374Z"
240
+ },
241
+ "scrolled": true
242
+ },
243
+ "outputs": [
244
+ {
245
+ "name": "stdout",
246
+ "output_type": "stream",
247
+ "text": [
248
+ "Found 35 daily shards in data_scored\n"
249
+ ]
250
+ },
251
+ {
252
+ "data": {
253
+ "application/vnd.jupyter.widget-view+json": {
254
+ "model_id": "68642a82f2ef41c6b663a455d3781374",
255
+ "version_major": 2,
256
+ "version_minor": 0
257
+ },
258
+ "text/plain": [
259
+ "processing days: 0%| | 0/35 [00:00<?, ?file/s]"
260
+ ]
261
+ },
262
+ "metadata": {},
263
+ "output_type": "display_data"
264
+ },
265
+ {
266
+ "data": {
267
+ "application/vnd.jupyter.widget-view+json": {
268
+ "model_id": "1961d119f4254c6a9f4af4139b0dbda5",
269
+ "version_major": 2,
270
+ "version_minor": 0
271
+ },
272
+ "text/plain": [
273
+ "2025-05-01.parquet: 0%| | 0.00/271k [00:00<?, ?B/s]"
274
+ ]
275
+ },
276
+ "metadata": {},
277
+ "output_type": "display_data"
278
+ },
279
+ {
280
+ "data": {
281
+ "application/vnd.jupyter.widget-view+json": {
282
+ "model_id": "a5bfdcb06e4541698ede37f5bf845662",
283
+ "version_major": 2,
284
+ "version_minor": 0
285
+ },
286
+ "text/plain": [
287
+ "2025-05-02.parquet: 0%| | 0.00/202k [00:00<?, ?B/s]"
288
+ ]
289
+ },
290
+ "metadata": {},
291
+ "output_type": "display_data"
292
+ },
293
+ {
294
+ "data": {
295
+ "application/vnd.jupyter.widget-view+json": {
296
+ "model_id": "75929f953cef4b1d949471a9b887fa64",
297
+ "version_major": 2,
298
+ "version_minor": 0
299
+ },
300
+ "text/plain": [
301
+ "2025-05-03.parquet: 0%| | 0.00/231k [00:00<?, ?B/s]"
302
+ ]
303
+ },
304
+ "metadata": {},
305
+ "output_type": "display_data"
306
+ },
307
+ {
308
+ "data": {
309
+ "application/vnd.jupyter.widget-view+json": {
310
+ "model_id": "64947704c3954ddca1e181a905da4ddd",
311
+ "version_major": 2,
312
+ "version_minor": 0
313
+ },
314
+ "text/plain": [
315
+ "2025-05-04.parquet: 0%| | 0.00/195k [00:00<?, ?B/s]"
316
+ ]
317
+ },
318
+ "metadata": {},
319
+ "output_type": "display_data"
320
+ },
321
+ {
322
+ "data": {
323
+ "application/vnd.jupyter.widget-view+json": {
324
+ "model_id": "edf2c664cfd2445f8fdca3eb6c0ee1fb",
325
+ "version_major": 2,
326
+ "version_minor": 0
327
+ },
328
+ "text/plain": [
329
+ "2025-05-05.parquet: 0%| | 0.00/225k [00:00<?, ?B/s]"
330
+ ]
331
+ },
332
+ "metadata": {},
333
+ "output_type": "display_data"
334
+ },
335
+ {
336
+ "data": {
337
+ "application/vnd.jupyter.widget-view+json": {
338
+ "model_id": "a46c2e19563045fd8230a10dec756890",
339
+ "version_major": 2,
340
+ "version_minor": 0
341
+ },
342
+ "text/plain": [
343
+ "2025-05-06.parquet: 0%| | 0.00/225k [00:00<?, ?B/s]"
344
+ ]
345
+ },
346
+ "metadata": {},
347
+ "output_type": "display_data"
348
+ },
349
+ {
350
+ "data": {
351
+ "application/vnd.jupyter.widget-view+json": {
352
+ "model_id": "fe0f07d515e0455391b0a0661196df87",
353
+ "version_major": 2,
354
+ "version_minor": 0
355
+ },
356
+ "text/plain": [
357
+ "2025-05-07.parquet: 0%| | 0.00/188k [00:00<?, ?B/s]"
358
+ ]
359
+ },
360
+ "metadata": {},
361
+ "output_type": "display_data"
362
+ },
363
+ {
364
+ "data": {
365
+ "application/vnd.jupyter.widget-view+json": {
366
+ "model_id": "ade879eedd8a4016a44da6a5b8caf631",
367
+ "version_major": 2,
368
+ "version_minor": 0
369
+ },
370
+ "text/plain": [
371
+ "2025-05-08.parquet: 0%| | 0.00/228k [00:00<?, ?B/s]"
372
+ ]
373
+ },
374
+ "metadata": {},
375
+ "output_type": "display_data"
376
+ },
377
+ {
378
+ "data": {
379
+ "application/vnd.jupyter.widget-view+json": {
380
+ "model_id": "16f30a3d35654112a93d667ba400c0bb",
381
+ "version_major": 2,
382
+ "version_minor": 0
383
+ },
384
+ "text/plain": [
385
+ "2025-05-09.parquet: 0%| | 0.00/221k [00:00<?, ?B/s]"
386
+ ]
387
+ },
388
+ "metadata": {},
389
+ "output_type": "display_data"
390
+ },
391
+ {
392
+ "data": {
393
+ "application/vnd.jupyter.widget-view+json": {
394
+ "model_id": "820260817bf74db78da330a3fbdcf449",
395
+ "version_major": 2,
396
+ "version_minor": 0
397
+ },
398
+ "text/plain": [
399
+ "2025-05-10.parquet: 0%| | 0.00/190k [00:00<?, ?B/s]"
400
+ ]
401
+ },
402
+ "metadata": {},
403
+ "output_type": "display_data"
404
+ },
405
+ {
406
+ "data": {
407
+ "application/vnd.jupyter.widget-view+json": {
408
+ "model_id": "d2721d0d407248f8b488e0176b4a1cf6",
409
+ "version_major": 2,
410
+ "version_minor": 0
411
+ },
412
+ "text/plain": [
413
+ "2025-05-11.parquet: 0%| | 0.00/193k [00:00<?, ?B/s]"
414
+ ]
415
+ },
416
+ "metadata": {},
417
+ "output_type": "display_data"
418
+ },
419
+ {
420
+ "data": {
421
+ "application/vnd.jupyter.widget-view+json": {
422
+ "model_id": "3472b2ab7af24c728ec70ccc8978f8bd",
423
+ "version_major": 2,
424
+ "version_minor": 0
425
+ },
426
+ "text/plain": [
427
+ "2025-05-12.parquet: 0%| | 0.00/230k [00:00<?, ?B/s]"
428
+ ]
429
+ },
430
+ "metadata": {},
431
+ "output_type": "display_data"
432
+ },
433
+ {
434
+ "data": {
435
+ "application/vnd.jupyter.widget-view+json": {
436
+ "model_id": "09ec3ac45f5748f0bc04221d3c19dc5b",
437
+ "version_major": 2,
438
+ "version_minor": 0
439
+ },
440
+ "text/plain": [
441
+ "2025-05-13.parquet: 0%| | 0.00/221k [00:00<?, ?B/s]"
442
+ ]
443
+ },
444
+ "metadata": {},
445
+ "output_type": "display_data"
446
+ },
447
+ {
448
+ "data": {
449
+ "application/vnd.jupyter.widget-view+json": {
450
+ "model_id": "aa4bb60b6c614e61ba03b91e624e2552",
451
+ "version_major": 2,
452
+ "version_minor": 0
453
+ },
454
+ "text/plain": [
455
+ "2025-05-01__localllama.parquet: 0%| | 0.00/151k [00:00<?, ?B/s]"
456
+ ]
457
+ },
458
+ "metadata": {},
459
+ "output_type": "display_data"
460
+ },
461
+ {
462
+ "data": {
463
+ "application/vnd.jupyter.widget-view+json": {
464
+ "model_id": "473044c36385426195e88caf02182cb2",
465
+ "version_major": 2,
466
+ "version_minor": 0
467
+ },
468
+ "text/plain": [
469
+ "2025-05-01__artificial.parquet: 0%| | 0.00/36.2k [00:00<?, ?B/s]"
470
+ ]
471
+ },
472
+ "metadata": {},
473
+ "output_type": "display_data"
474
+ },
475
+ {
476
+ "data": {
477
+ "application/vnd.jupyter.widget-view+json": {
478
+ "model_id": "c870917db331495781261caad218c974",
479
+ "version_major": 2,
480
+ "version_minor": 0
481
+ },
482
+ "text/plain": [
483
+ "2025-05-01__singularity.parquet: 0%| | 0.00/51.6k [00:00<?, ?B/s]"
484
+ ]
485
+ },
486
+ "metadata": {},
487
+ "output_type": "display_data"
488
+ },
489
+ {
490
+ "data": {
491
+ "application/vnd.jupyter.widget-view+json": {
492
+ "model_id": "a4c64168eeb247c695cd3957b37b33a2",
493
+ "version_major": 2,
494
+ "version_minor": 0
495
+ },
496
+ "text/plain": [
497
+ "2025-05-01__openai.parquet: 0%| | 0.00/59.1k [00:00<?, ?B/s]"
498
+ ]
499
+ },
500
+ "metadata": {},
501
+ "output_type": "display_data"
502
+ },
503
+ {
504
+ "data": {
505
+ "application/vnd.jupyter.widget-view+json": {
506
+ "model_id": "d7c6be3476204b6b94f6e4abeea8d64c",
507
+ "version_major": 2,
508
+ "version_minor": 0
509
+ },
510
+ "text/plain": [
511
+ "Upload 52 LFS files: 0%| | 0/52 [00:00<?, ?it/s]"
512
+ ]
513
+ },
514
+ "metadata": {},
515
+ "output_type": "display_data"
516
+ },
517
+ {
518
+ "data": {
519
+ "application/vnd.jupyter.widget-view+json": {
520
+ "model_id": "ce3c5fcbf7e041a0aa780ee2aea950bf",
521
+ "version_major": 2,
522
+ "version_minor": 0
523
+ },
524
+ "text/plain": [
525
+ "2025-05-02__artificial.parquet: 0%| | 0.00/25.5k [00:00<?, ?B/s]"
526
+ ]
527
+ },
528
+ "metadata": {},
529
+ "output_type": "display_data"
530
+ },
531
+ {
532
+ "data": {
533
+ "application/vnd.jupyter.widget-view+json": {
534
+ "model_id": "ff8e4f095ee54929b4b4e2809629ee73",
535
+ "version_major": 2,
536
+ "version_minor": 0
537
+ },
538
+ "text/plain": [
539
+ "2025-05-02__localllama.parquet: 0%| | 0.00/89.5k [00:00<?, ?B/s]"
540
+ ]
541
+ },
542
+ "metadata": {},
543
+ "output_type": "display_data"
544
+ },
545
+ {
546
+ "data": {
547
+ "application/vnd.jupyter.widget-view+json": {
548
+ "model_id": "97147dad0d5442d9a423c69547d10a2b",
549
+ "version_major": 2,
550
+ "version_minor": 0
551
+ },
552
+ "text/plain": [
553
+ "2025-05-02__singularity.parquet: 0%| | 0.00/44.8k [00:00<?, ?B/s]"
554
+ ]
555
+ },
556
+ "metadata": {},
557
+ "output_type": "display_data"
558
+ },
559
+ {
560
+ "data": {
561
+ "application/vnd.jupyter.widget-view+json": {
562
+ "model_id": "820fda08019a45ea8b9147026711446c",
563
+ "version_major": 2,
564
+ "version_minor": 0
565
+ },
566
+ "text/plain": [
567
+ "2025-05-02__openai.parquet: 0%| | 0.00/66.8k [00:00<?, ?B/s]"
568
+ ]
569
+ },
570
+ "metadata": {},
571
+ "output_type": "display_data"
572
+ },
573
+ {
574
+ "data": {
575
+ "application/vnd.jupyter.widget-view+json": {
576
+ "model_id": "319c3e6b6ccb4e4db46b2b3053d62bff",
577
+ "version_major": 2,
578
+ "version_minor": 0
579
+ },
580
+ "text/plain": [
581
+ "2025-05-03__artificial.parquet: 0%| | 0.00/25.3k [00:00<?, ?B/s]"
582
+ ]
583
+ },
584
+ "metadata": {},
585
+ "output_type": "display_data"
586
+ },
587
+ {
588
+ "data": {
589
+ "application/vnd.jupyter.widget-view+json": {
590
+ "model_id": "2ac26f7d96614bd3b9ed109487e63ca9",
591
+ "version_major": 2,
592
+ "version_minor": 0
593
+ },
594
+ "text/plain": [
595
+ "2025-05-03__localllama.parquet: 0%| | 0.00/113k [00:00<?, ?B/s]"
596
+ ]
597
+ },
598
+ "metadata": {},
599
+ "output_type": "display_data"
600
+ },
601
+ {
602
+ "data": {
603
+ "application/vnd.jupyter.widget-view+json": {
604
+ "model_id": "2b1ed4648fb94e6db93f833dccbb047b",
605
+ "version_major": 2,
606
+ "version_minor": 0
607
+ },
608
+ "text/plain": [
609
+ "2025-05-03__singularity.parquet: 0%| | 0.00/57.9k [00:00<?, ?B/s]"
610
+ ]
611
+ },
612
+ "metadata": {},
613
+ "output_type": "display_data"
614
+ },
615
+ {
616
+ "data": {
617
+ "application/vnd.jupyter.widget-view+json": {
618
+ "model_id": "b33518a7498c4a11b859cd6806373dac",
619
+ "version_major": 2,
620
+ "version_minor": 0
621
+ },
622
+ "text/plain": [
623
+ "2025-05-03__openai.parquet: 0%| | 0.00/60.2k [00:00<?, ?B/s]"
624
+ ]
625
+ },
626
+ "metadata": {},
627
+ "output_type": "display_data"
628
+ },
629
+ {
630
+ "data": {
631
+ "application/vnd.jupyter.widget-view+json": {
632
+ "model_id": "983964c1dd7f400a861503208251b6f3",
633
+ "version_major": 2,
634
+ "version_minor": 0
635
+ },
636
+ "text/plain": [
637
+ "2025-05-04__artificial.parquet: 0%| | 0.00/23.6k [00:00<?, ?B/s]"
638
+ ]
639
+ },
640
+ "metadata": {},
641
+ "output_type": "display_data"
642
+ },
643
+ {
644
+ "data": {
645
+ "application/vnd.jupyter.widget-view+json": {
646
+ "model_id": "8258e1f3990a4deeb53f6cb48953f39b",
647
+ "version_major": 2,
648
+ "version_minor": 0
649
+ },
650
+ "text/plain": [
651
+ "2025-05-04__localllama.parquet: 0%| | 0.00/83.6k [00:00<?, ?B/s]"
652
+ ]
653
+ },
654
+ "metadata": {},
655
+ "output_type": "display_data"
656
+ },
657
+ {
658
+ "data": {
659
+ "application/vnd.jupyter.widget-view+json": {
660
+ "model_id": "2da678108a244b809cf3c4ab11164afe",
661
+ "version_major": 2,
662
+ "version_minor": 0
663
+ },
664
+ "text/plain": [
665
+ "2025-05-04__singularity.parquet: 0%| | 0.00/42.0k [00:00<?, ?B/s]"
666
+ ]
667
+ },
668
+ "metadata": {},
669
+ "output_type": "display_data"
670
+ },
671
+ {
672
+ "data": {
673
+ "application/vnd.jupyter.widget-view+json": {
674
+ "model_id": "7d276ff05077468f910033029d6f4d8c",
675
+ "version_major": 2,
676
+ "version_minor": 0
677
+ },
678
+ "text/plain": [
679
+ "2025-05-04__openai.parquet: 0%| | 0.00/68.2k [00:00<?, ?B/s]"
680
+ ]
681
+ },
682
+ "metadata": {},
683
+ "output_type": "display_data"
684
+ },
685
+ {
686
+ "data": {
687
+ "application/vnd.jupyter.widget-view+json": {
688
+ "model_id": "675a18a2606243a5a03437679bbd9baa",
689
+ "version_major": 2,
690
+ "version_minor": 0
691
+ },
692
+ "text/plain": [
693
+ "2025-05-05__artificial.parquet: 0%| | 0.00/12.2k [00:00<?, ?B/s]"
694
+ ]
695
+ },
696
+ "metadata": {},
697
+ "output_type": "display_data"
698
+ },
699
+ {
700
+ "data": {
701
+ "application/vnd.jupyter.widget-view+json": {
702
+ "model_id": "e28f11283a694ded8a360d536b109f14",
703
+ "version_major": 2,
704
+ "version_minor": 0
705
+ },
706
+ "text/plain": [
707
+ "2025-05-05__localllama.parquet: 0%| | 0.00/108k [00:00<?, ?B/s]"
708
+ ]
709
+ },
710
+ "metadata": {},
711
+ "output_type": "display_data"
712
+ },
713
+ {
714
+ "data": {
715
+ "application/vnd.jupyter.widget-view+json": {
716
+ "model_id": "0418075a84ed4154a0a7ab3227bfde91",
717
+ "version_major": 2,
718
+ "version_minor": 0
719
+ },
720
+ "text/plain": [
721
+ "2025-05-05__singularity.parquet: 0%| | 0.00/62.5k [00:00<?, ?B/s]"
722
+ ]
723
+ },
724
+ "metadata": {},
725
+ "output_type": "display_data"
726
+ },
727
+ {
728
+ "data": {
729
+ "application/vnd.jupyter.widget-view+json": {
730
+ "model_id": "be03cf1503af4a7fb8a40024ef4ea61b",
731
+ "version_major": 2,
732
+ "version_minor": 0
733
+ },
734
+ "text/plain": [
735
+ "2025-05-05__openai.parquet: 0%| | 0.00/65.9k [00:00<?, ?B/s]"
736
+ ]
737
+ },
738
+ "metadata": {},
739
+ "output_type": "display_data"
740
+ },
741
+ {
742
+ "data": {
743
+ "application/vnd.jupyter.widget-view+json": {
744
+ "model_id": "49213e6b8e0243da8e9e2cab6c5a7d9e",
745
+ "version_major": 2,
746
+ "version_minor": 0
747
+ },
748
+ "text/plain": [
749
+ "2025-05-06__artificial.parquet: 0%| | 0.00/32.2k [00:00<?, ?B/s]"
750
+ ]
751
+ },
752
+ "metadata": {},
753
+ "output_type": "display_data"
754
+ },
755
+ {
756
+ "data": {
757
+ "application/vnd.jupyter.widget-view+json": {
758
+ "model_id": "af77232ad564422ea9d7fb1fb034666f",
759
+ "version_major": 2,
760
+ "version_minor": 0
761
+ },
762
+ "text/plain": [
763
+ "2025-05-06__localllama.parquet: 0%| | 0.00/107k [00:00<?, ?B/s]"
764
+ ]
765
+ },
766
+ "metadata": {},
767
+ "output_type": "display_data"
768
+ },
769
+ {
770
+ "data": {
771
+ "application/vnd.jupyter.widget-view+json": {
772
+ "model_id": "b71f5d092d6e46409cb2d790b6cdc840",
773
+ "version_major": 2,
774
+ "version_minor": 0
775
+ },
776
+ "text/plain": [
777
+ "2025-05-06__singularity.parquet: 0%| | 0.00/41.9k [00:00<?, ?B/s]"
778
+ ]
779
+ },
780
+ "metadata": {},
781
+ "output_type": "display_data"
782
+ },
783
+ {
784
+ "data": {
785
+ "application/vnd.jupyter.widget-view+json": {
786
+ "model_id": "610bb4f3ccb24cde9e43989cf5e29c68",
787
+ "version_major": 2,
788
+ "version_minor": 0
789
+ },
790
+ "text/plain": [
791
+ "2025-05-06__openai.parquet: 0%| | 0.00/68.4k [00:00<?, ?B/s]"
792
+ ]
793
+ },
794
+ "metadata": {},
795
+ "output_type": "display_data"
796
+ },
797
+ {
798
+ "data": {
799
+ "application/vnd.jupyter.widget-view+json": {
800
+ "model_id": "ca82377832da497e8e55fb31c124d5c4",
801
+ "version_major": 2,
802
+ "version_minor": 0
803
+ },
804
+ "text/plain": [
805
+ "2025-05-07__artificial.parquet: 0%| | 0.00/32.8k [00:00<?, ?B/s]"
806
+ ]
807
+ },
808
+ "metadata": {},
809
+ "output_type": "display_data"
810
+ },
811
+ {
812
+ "data": {
813
+ "application/vnd.jupyter.widget-view+json": {
814
+ "model_id": "dacc1f40174d4872b7f19f9965e5e2e4",
815
+ "version_major": 2,
816
+ "version_minor": 0
817
+ },
818
+ "text/plain": [
819
+ "2025-05-07__localllama.parquet: 0%| | 0.00/89.0k [00:00<?, ?B/s]"
820
+ ]
821
+ },
822
+ "metadata": {},
823
+ "output_type": "display_data"
824
+ },
825
+ {
826
+ "data": {
827
+ "application/vnd.jupyter.widget-view+json": {
828
+ "model_id": "90eb047b0a144558ab6d035dc4735abb",
829
+ "version_major": 2,
830
+ "version_minor": 0
831
+ },
832
+ "text/plain": [
833
+ "2025-05-07__singularity.parquet: 0%| | 0.00/45.4k [00:00<?, ?B/s]"
834
+ ]
835
+ },
836
+ "metadata": {},
837
+ "output_type": "display_data"
838
+ },
839
+ {
840
+ "data": {
841
+ "application/vnd.jupyter.widget-view+json": {
842
+ "model_id": "c135843a057a4273ad027c1754281ca0",
843
+ "version_major": 2,
844
+ "version_minor": 0
845
+ },
846
+ "text/plain": [
847
+ "2025-05-07__openai.parquet: 0%| | 0.00/46.6k [00:00<?, ?B/s]"
848
+ ]
849
+ },
850
+ "metadata": {},
851
+ "output_type": "display_data"
852
+ },
853
+ {
854
+ "data": {
855
+ "application/vnd.jupyter.widget-view+json": {
856
+ "model_id": "989aad4accb640c6bdc3c7357475247b",
857
+ "version_major": 2,
858
+ "version_minor": 0
859
+ },
860
+ "text/plain": [
861
+ "2025-05-08__artificial.parquet: 0%| | 0.00/21.3k [00:00<?, ?B/s]"
862
+ ]
863
+ },
864
+ "metadata": {},
865
+ "output_type": "display_data"
866
+ },
867
+ {
868
+ "data": {
869
+ "application/vnd.jupyter.widget-view+json": {
870
+ "model_id": "51effc38ee62422a8d46bd4226043d01",
871
+ "version_major": 2,
872
+ "version_minor": 0
873
+ },
874
+ "text/plain": [
875
+ "2025-05-08__localllama.parquet: 0%| | 0.00/96.9k [00:00<?, ?B/s]"
876
+ ]
877
+ },
878
+ "metadata": {},
879
+ "output_type": "display_data"
880
+ },
881
+ {
882
+ "data": {
883
+ "application/vnd.jupyter.widget-view+json": {
884
+ "model_id": "a4e3873e7fa241208d0b3bad78e244c6",
885
+ "version_major": 2,
886
+ "version_minor": 0
887
+ },
888
+ "text/plain": [
889
+ "2025-05-08__singularity.parquet: 0%| | 0.00/61.1k [00:00<?, ?B/s]"
890
+ ]
891
+ },
892
+ "metadata": {},
893
+ "output_type": "display_data"
894
+ },
895
+ {
896
+ "data": {
897
+ "application/vnd.jupyter.widget-view+json": {
898
+ "model_id": "488994c7f63e4fd3925f734871bf6344",
899
+ "version_major": 2,
900
+ "version_minor": 0
901
+ },
902
+ "text/plain": [
903
+ "2025-05-08__openai.parquet: 0%| | 0.00/72.3k [00:00<?, ?B/s]"
904
+ ]
905
+ },
906
+ "metadata": {},
907
+ "output_type": "display_data"
908
+ },
909
+ {
910
+ "data": {
911
+ "application/vnd.jupyter.widget-view+json": {
912
+ "model_id": "2c7ad2d13b2e4f9cb0f87ac2ce2996c6",
913
+ "version_major": 2,
914
+ "version_minor": 0
915
+ },
916
+ "text/plain": [
917
+ "2025-05-09__artificial.parquet: 0%| | 0.00/18.5k [00:00<?, ?B/s]"
918
+ ]
919
+ },
920
+ "metadata": {},
921
+ "output_type": "display_data"
922
+ },
923
+ {
924
+ "data": {
925
+ "application/vnd.jupyter.widget-view+json": {
926
+ "model_id": "4cea5e6449a94f96ba138cca713bbaaa",
927
+ "version_major": 2,
928
+ "version_minor": 0
929
+ },
930
+ "text/plain": [
931
+ "2025-05-09__localllama.parquet: 0%| | 0.00/95.1k [00:00<?, ?B/s]"
932
+ ]
933
+ },
934
+ "metadata": {},
935
+ "output_type": "display_data"
936
+ },
937
+ {
938
+ "data": {
939
+ "application/vnd.jupyter.widget-view+json": {
940
+ "model_id": "ab828b861453476d8037c13d53deaf05",
941
+ "version_major": 2,
942
+ "version_minor": 0
943
+ },
944
+ "text/plain": [
945
+ "2025-05-09__singularity.parquet: 0%| | 0.00/64.4k [00:00<?, ?B/s]"
946
+ ]
947
+ },
948
+ "metadata": {},
949
+ "output_type": "display_data"
950
+ },
951
+ {
952
+ "data": {
953
+ "application/vnd.jupyter.widget-view+json": {
954
+ "model_id": "57ddc20f7169441b8eaa53c12921fb71",
955
+ "version_major": 2,
956
+ "version_minor": 0
957
+ },
958
+ "text/plain": [
959
+ "2025-05-09__openai.parquet: 0%| | 0.00/66.6k [00:00<?, ?B/s]"
960
+ ]
961
+ },
962
+ "metadata": {},
963
+ "output_type": "display_data"
964
+ },
965
+ {
966
+ "data": {
967
+ "application/vnd.jupyter.widget-view+json": {
968
+ "model_id": "66679d74053b471799fdb430ed89f477",
969
+ "version_major": 2,
970
+ "version_minor": 0
971
+ },
972
+ "text/plain": [
973
+ "2025-05-10__artificial.parquet: 0%| | 0.00/27.6k [00:00<?, ?B/s]"
974
+ ]
975
+ },
976
+ "metadata": {},
977
+ "output_type": "display_data"
978
+ },
979
+ {
980
+ "data": {
981
+ "application/vnd.jupyter.widget-view+json": {
982
+ "model_id": "6d623dd7af7f466a85175e92dbc5170c",
983
+ "version_major": 2,
984
+ "version_minor": 0
985
+ },
986
+ "text/plain": [
987
+ "2025-05-10__localllama.parquet: 0%| | 0.00/74.8k [00:00<?, ?B/s]"
988
+ ]
989
+ },
990
+ "metadata": {},
991
+ "output_type": "display_data"
992
+ },
993
+ {
994
+ "data": {
995
+ "application/vnd.jupyter.widget-view+json": {
996
+ "model_id": "19b4c4f96ce745d88b00a49a45514312",
997
+ "version_major": 2,
998
+ "version_minor": 0
999
+ },
1000
+ "text/plain": [
1001
+ "2025-05-10__singularity.parquet: 0%| | 0.00/49.6k [00:00<?, ?B/s]"
1002
+ ]
1003
+ },
1004
+ "metadata": {},
1005
+ "output_type": "display_data"
1006
+ },
1007
+ {
1008
+ "data": {
1009
+ "application/vnd.jupyter.widget-view+json": {
1010
+ "model_id": "cea15455d7ad4bc3b2f9229a349a1e97",
1011
+ "version_major": 2,
1012
+ "version_minor": 0
1013
+ },
1014
+ "text/plain": [
1015
+ "2025-05-10__openai.parquet: 0%| | 0.00/62.1k [00:00<?, ?B/s]"
1016
+ ]
1017
+ },
1018
+ "metadata": {},
1019
+ "output_type": "display_data"
1020
+ },
1021
+ {
1022
+ "data": {
1023
+ "application/vnd.jupyter.widget-view+json": {
1024
+ "model_id": "c78bd8a82ef14d7ea479bf9d67e6b928",
1025
+ "version_major": 2,
1026
+ "version_minor": 0
1027
+ },
1028
+ "text/plain": [
1029
+ "2025-05-11__artificial.parquet: 0%| | 0.00/24.4k [00:00<?, ?B/s]"
1030
+ ]
1031
+ },
1032
+ "metadata": {},
1033
+ "output_type": "display_data"
1034
+ },
1035
+ {
1036
+ "data": {
1037
+ "application/vnd.jupyter.widget-view+json": {
1038
+ "model_id": "6be2a2815a014ad8909c8b3527deac85",
1039
+ "version_major": 2,
1040
+ "version_minor": 0
1041
+ },
1042
+ "text/plain": [
1043
+ "2025-05-11__localllama.parquet: 0%| | 0.00/87.3k [00:00<?, ?B/s]"
1044
+ ]
1045
+ },
1046
+ "metadata": {},
1047
+ "output_type": "display_data"
1048
+ },
1049
+ {
1050
+ "data": {
1051
+ "application/vnd.jupyter.widget-view+json": {
1052
+ "model_id": "3ba58801c184431fa691121b89a8e9f2",
1053
+ "version_major": 2,
1054
+ "version_minor": 0
1055
+ },
1056
+ "text/plain": [
1057
+ "2025-05-11__singularity.parquet: 0%| | 0.00/43.2k [00:00<?, ?B/s]"
1058
+ ]
1059
+ },
1060
+ "metadata": {},
1061
+ "output_type": "display_data"
1062
+ },
1063
+ {
1064
+ "data": {
1065
+ "application/vnd.jupyter.widget-view+json": {
1066
+ "model_id": "94c247807ad84aecac35b034f9ac656d",
1067
+ "version_major": 2,
1068
+ "version_minor": 0
1069
+ },
1070
+ "text/plain": [
1071
+ "2025-05-11__openai.parquet: 0%| | 0.00/61.2k [00:00<?, ?B/s]"
1072
+ ]
1073
+ },
1074
+ "metadata": {},
1075
+ "output_type": "display_data"
1076
+ },
1077
+ {
1078
+ "data": {
1079
+ "application/vnd.jupyter.widget-view+json": {
1080
+ "model_id": "a6f81be0b0f34f6db9af3dfdd36ce4d8",
1081
+ "version_major": 2,
1082
+ "version_minor": 0
1083
+ },
1084
+ "text/plain": [
1085
+ "2025-05-12__artificial.parquet: 0%| | 0.00/34.5k [00:00<?, ?B/s]"
1086
+ ]
1087
+ },
1088
+ "metadata": {},
1089
+ "output_type": "display_data"
1090
+ },
1091
+ {
1092
+ "data": {
1093
+ "application/vnd.jupyter.widget-view+json": {
1094
+ "model_id": "22be492306e941c0b9e7a8b4c3841bfe",
1095
+ "version_major": 2,
1096
+ "version_minor": 0
1097
+ },
1098
+ "text/plain": [
1099
+ "2025-05-12__localllama.parquet: 0%| | 0.00/91.9k [00:00<?, ?B/s]"
1100
+ ]
1101
+ },
1102
+ "metadata": {},
1103
+ "output_type": "display_data"
1104
+ },
1105
+ {
1106
+ "data": {
1107
+ "application/vnd.jupyter.widget-view+json": {
1108
+ "model_id": "7909955c0dd34184b58e4b53a8ab30c6",
1109
+ "version_major": 2,
1110
+ "version_minor": 0
1111
+ },
1112
+ "text/plain": [
1113
+ "2025-05-12__singularity.parquet: 0%| | 0.00/67.2k [00:00<?, ?B/s]"
1114
+ ]
1115
+ },
1116
+ "metadata": {},
1117
+ "output_type": "display_data"
1118
+ },
1119
+ {
1120
+ "data": {
1121
+ "application/vnd.jupyter.widget-view+json": {
1122
+ "model_id": "784d09168b7a449f97dddd7e89ee3402",
1123
+ "version_major": 2,
1124
+ "version_minor": 0
1125
+ },
1126
+ "text/plain": [
1127
+ "2025-05-12__openai.parquet: 0%| | 0.00/63.9k [00:00<?, ?B/s]"
1128
+ ]
1129
+ },
1130
+ "metadata": {},
1131
+ "output_type": "display_data"
1132
+ },
1133
+ {
1134
+ "data": {
1135
+ "application/vnd.jupyter.widget-view+json": {
1136
+ "model_id": "a02faa359c034d9b96c2a73e04987551",
1137
+ "version_major": 2,
1138
+ "version_minor": 0
1139
+ },
1140
+ "text/plain": [
1141
+ "2025-05-13__artificial.parquet: 0%| | 0.00/31.6k [00:00<?, ?B/s]"
1142
+ ]
1143
+ },
1144
+ "metadata": {},
1145
+ "output_type": "display_data"
1146
+ },
1147
+ {
1148
+ "data": {
1149
+ "application/vnd.jupyter.widget-view+json": {
1150
+ "model_id": "6331ecc735ff451faa8785eb9986931e",
1151
+ "version_major": 2,
1152
+ "version_minor": 0
1153
+ },
1154
+ "text/plain": [
1155
+ "2025-05-13__localllama.parquet: 0%| | 0.00/110k [00:00<?, ?B/s]"
1156
+ ]
1157
+ },
1158
+ "metadata": {},
1159
+ "output_type": "display_data"
1160
+ },
1161
+ {
1162
+ "data": {
1163
+ "application/vnd.jupyter.widget-view+json": {
1164
+ "model_id": "580144158c4b423a835111eb71c332e6",
1165
+ "version_major": 2,
1166
+ "version_minor": 0
1167
+ },
1168
+ "text/plain": [
1169
+ "2025-05-13__singularity.parquet: 0%| | 0.00/34.1k [00:00<?, ?B/s]"
1170
+ ]
1171
+ },
1172
+ "metadata": {},
1173
+ "output_type": "display_data"
1174
+ },
1175
+ {
1176
+ "data": {
1177
+ "application/vnd.jupyter.widget-view+json": {
1178
+ "model_id": "b0d466b9c969469c8ca1fd602f306120",
1179
+ "version_major": 2,
1180
+ "version_minor": 0
1181
+ },
1182
+ "text/plain": [
1183
+ "2025-05-13__openai.parquet: 0%| | 0.00/77.2k [00:00<?, ?B/s]"
1184
+ ]
1185
+ },
1186
+ "metadata": {},
1187
+ "output_type": "display_data"
1188
+ },
1189
+ {
1190
+ "data": {
1191
+ "application/vnd.jupyter.widget-view+json": {
1192
+ "model_id": "56bea313ab98472389be2d309c49a016",
1193
+ "version_major": 2,
1194
+ "version_minor": 0
1195
+ },
1196
+ "text/plain": [
1197
+ "2025-05-14.parquet: 0%| | 0.00/252k [00:00<?, ?B/s]"
1198
+ ]
1199
+ },
1200
+ "metadata": {},
1201
+ "output_type": "display_data"
1202
+ },
1203
+ {
1204
+ "data": {
1205
+ "application/vnd.jupyter.widget-view+json": {
1206
+ "model_id": "bb79c1aeb9274fdba63b219b2d88b55e",
1207
+ "version_major": 2,
1208
+ "version_minor": 0
1209
+ },
1210
+ "text/plain": [
1211
+ "2025-05-15.parquet: 0%| | 0.00/238k [00:00<?, ?B/s]"
1212
+ ]
1213
+ },
1214
+ "metadata": {},
1215
+ "output_type": "display_data"
1216
+ },
1217
+ {
1218
+ "data": {
1219
+ "application/vnd.jupyter.widget-view+json": {
1220
+ "model_id": "71b8b91c4b114b0f813e99891354f5ef",
1221
+ "version_major": 2,
1222
+ "version_minor": 0
1223
+ },
1224
+ "text/plain": [
1225
+ "2025-05-16.parquet: 0%| | 0.00/215k [00:00<?, ?B/s]"
1226
+ ]
1227
+ },
1228
+ "metadata": {},
1229
+ "output_type": "display_data"
1230
+ },
1231
+ {
1232
+ "data": {
1233
+ "application/vnd.jupyter.widget-view+json": {
1234
+ "model_id": "ef1a788d1759479f91af32b8ec1cc564",
1235
+ "version_major": 2,
1236
+ "version_minor": 0
1237
+ },
1238
+ "text/plain": [
1239
+ "2025-05-17.parquet: 0%| | 0.00/211k [00:00<?, ?B/s]"
1240
+ ]
1241
+ },
1242
+ "metadata": {},
1243
+ "output_type": "display_data"
1244
+ },
1245
+ {
1246
+ "data": {
1247
+ "application/vnd.jupyter.widget-view+json": {
1248
+ "model_id": "399c257195bd4e58be5c5e967d08b92d",
1249
+ "version_major": 2,
1250
+ "version_minor": 0
1251
+ },
1252
+ "text/plain": [
1253
+ "2025-05-18.parquet: 0%| | 0.00/181k [00:00<?, ?B/s]"
1254
+ ]
1255
+ },
1256
+ "metadata": {},
1257
+ "output_type": "display_data"
1258
+ },
1259
+ {
1260
+ "data": {
1261
+ "application/vnd.jupyter.widget-view+json": {
1262
+ "model_id": "f5e5497475a644ebb329022fd10112ff",
1263
+ "version_major": 2,
1264
+ "version_minor": 0
1265
+ },
1266
+ "text/plain": [
1267
+ "2025-05-19.parquet: 0%| | 0.00/203k [00:00<?, ?B/s]"
1268
+ ]
1269
+ },
1270
+ "metadata": {},
1271
+ "output_type": "display_data"
1272
+ },
1273
+ {
1274
+ "data": {
1275
+ "application/vnd.jupyter.widget-view+json": {
1276
+ "model_id": "e12150fd686449e4bbebecff846dc0eb",
1277
+ "version_major": 2,
1278
+ "version_minor": 0
1279
+ },
1280
+ "text/plain": [
1281
+ "2025-05-20.parquet: 0%| | 0.00/200k [00:00<?, ?B/s]"
1282
+ ]
1283
+ },
1284
+ "metadata": {},
1285
+ "output_type": "display_data"
1286
+ },
1287
+ {
1288
+ "data": {
1289
+ "application/vnd.jupyter.widget-view+json": {
1290
+ "model_id": "939ab5b309bd4551b8b81c0d1bc8b8b9",
1291
+ "version_major": 2,
1292
+ "version_minor": 0
1293
+ },
1294
+ "text/plain": [
1295
+ "2025-05-21.parquet: 0%| | 0.00/305k [00:00<?, ?B/s]"
1296
+ ]
1297
+ },
1298
+ "metadata": {},
1299
+ "output_type": "display_data"
1300
+ },
1301
+ {
1302
+ "data": {
1303
+ "application/vnd.jupyter.widget-view+json": {
1304
+ "model_id": "cb043c5ee7bc4d268e14c8502ab38dd4",
1305
+ "version_major": 2,
1306
+ "version_minor": 0
1307
+ },
1308
+ "text/plain": [
1309
+ "2025-05-22.parquet: 0%| | 0.00/268k [00:00<?, ?B/s]"
1310
+ ]
1311
+ },
1312
+ "metadata": {},
1313
+ "output_type": "display_data"
1314
+ },
1315
+ {
1316
+ "data": {
1317
+ "application/vnd.jupyter.widget-view+json": {
1318
+ "model_id": "725ba9d1636e45b59cdf0c1e049125ac",
1319
+ "version_major": 2,
1320
+ "version_minor": 0
1321
+ },
1322
+ "text/plain": [
1323
+ "2025-05-23.parquet: 0%| | 0.00/245k [00:00<?, ?B/s]"
1324
+ ]
1325
+ },
1326
+ "metadata": {},
1327
+ "output_type": "display_data"
1328
+ },
1329
+ {
1330
+ "data": {
1331
+ "application/vnd.jupyter.widget-view+json": {
1332
+ "model_id": "2eb616fa7bf546af8d6905746d2d22e1",
1333
+ "version_major": 2,
1334
+ "version_minor": 0
1335
+ },
1336
+ "text/plain": [
1337
+ "2025-05-24.parquet: 0%| | 0.00/255k [00:00<?, ?B/s]"
1338
+ ]
1339
+ },
1340
+ "metadata": {},
1341
+ "output_type": "display_data"
1342
+ },
1343
+ {
1344
+ "data": {
1345
+ "application/vnd.jupyter.widget-view+json": {
1346
+ "model_id": "408b5ba48b1e440695e075461696db44",
1347
+ "version_major": 2,
1348
+ "version_minor": 0
1349
+ },
1350
+ "text/plain": [
1351
+ "2025-05-25.parquet: 0%| | 0.00/232k [00:00<?, ?B/s]"
1352
+ ]
1353
+ },
1354
+ "metadata": {},
1355
+ "output_type": "display_data"
1356
+ },
1357
+ {
1358
+ "data": {
1359
+ "application/vnd.jupyter.widget-view+json": {
1360
+ "model_id": "74060deaa13a4cd394edee6a6221fb24",
1361
+ "version_major": 2,
1362
+ "version_minor": 0
1363
+ },
1364
+ "text/plain": [
1365
+ "2025-05-26.parquet: 0%| | 0.00/229k [00:00<?, ?B/s]"
1366
+ ]
1367
+ },
1368
+ "metadata": {},
1369
+ "output_type": "display_data"
1370
+ },
1371
+ {
1372
+ "data": {
1373
+ "application/vnd.jupyter.widget-view+json": {
1374
+ "model_id": "54098d40c04740f3bfb52e0ad118f0c9",
1375
+ "version_major": 2,
1376
+ "version_minor": 0
1377
+ },
1378
+ "text/plain": [
1379
+ "2025-05-14__singularity.parquet: 0%| | 0.00/67.7k [00:00<?, ?B/s]"
1380
+ ]
1381
+ },
1382
+ "metadata": {},
1383
+ "output_type": "display_data"
1384
+ },
1385
+ {
1386
+ "data": {
1387
+ "application/vnd.jupyter.widget-view+json": {
1388
+ "model_id": "ed212abb01cf4f3e8c7795966354e763",
1389
+ "version_major": 2,
1390
+ "version_minor": 0
1391
+ },
1392
+ "text/plain": [
1393
+ "2025-05-14__openai.parquet: 0%| | 0.00/77.2k [00:00<?, ?B/s]"
1394
+ ]
1395
+ },
1396
+ "metadata": {},
1397
+ "output_type": "display_data"
1398
+ },
1399
+ {
1400
+ "data": {
1401
+ "application/vnd.jupyter.widget-view+json": {
1402
+ "model_id": "1eda7ad8367343ff9b1aab3af9c0e23e",
1403
+ "version_major": 2,
1404
+ "version_minor": 0
1405
+ },
1406
+ "text/plain": [
1407
+ "Upload 52 LFS files: 0%| | 0/52 [00:00<?, ?it/s]"
1408
+ ]
1409
+ },
1410
+ "metadata": {},
1411
+ "output_type": "display_data"
1412
+ },
1413
+ {
1414
+ "data": {
1415
+ "application/vnd.jupyter.widget-view+json": {
1416
+ "model_id": "e8fdc3d100074276bcf3ef806e816379",
1417
+ "version_major": 2,
1418
+ "version_minor": 0
1419
+ },
1420
+ "text/plain": [
1421
+ "2025-05-14__artificial.parquet: 0%| | 0.00/44.0k [00:00<?, ?B/s]"
1422
+ ]
1423
+ },
1424
+ "metadata": {},
1425
+ "output_type": "display_data"
1426
+ },
1427
+ {
1428
+ "data": {
1429
+ "application/vnd.jupyter.widget-view+json": {
1430
+ "model_id": "5cd6a5e66b1e469badfe3b0eb302e18f",
1431
+ "version_major": 2,
1432
+ "version_minor": 0
1433
+ },
1434
+ "text/plain": [
1435
+ "2025-05-14__localllama.parquet: 0%| | 0.00/86.7k [00:00<?, ?B/s]"
1436
+ ]
1437
+ },
1438
+ "metadata": {},
1439
+ "output_type": "display_data"
1440
+ },
1441
+ {
1442
+ "data": {
1443
+ "application/vnd.jupyter.widget-view+json": {
1444
+ "model_id": "c38e5e4579c04b5faa6a5c1ed195d9ac",
1445
+ "version_major": 2,
1446
+ "version_minor": 0
1447
+ },
1448
+ "text/plain": [
1449
+ "2025-05-15__artificial.parquet: 0%| | 0.00/27.5k [00:00<?, ?B/s]"
1450
+ ]
1451
+ },
1452
+ "metadata": {},
1453
+ "output_type": "display_data"
1454
+ },
1455
+ {
1456
+ "data": {
1457
+ "application/vnd.jupyter.widget-view+json": {
1458
+ "model_id": "10949fbbcab440aea1518bf44738ef76",
1459
+ "version_major": 2,
1460
+ "version_minor": 0
1461
+ },
1462
+ "text/plain": [
1463
+ "2025-05-15__localllama.parquet: 0%| | 0.00/91.4k [00:00<?, ?B/s]"
1464
+ ]
1465
+ },
1466
+ "metadata": {},
1467
+ "output_type": "display_data"
1468
+ },
1469
+ {
1470
+ "data": {
1471
+ "application/vnd.jupyter.widget-view+json": {
1472
+ "model_id": "ab54f1bfa30f4c9fb6fa0ba7b151e172",
1473
+ "version_major": 2,
1474
+ "version_minor": 0
1475
+ },
1476
+ "text/plain": [
1477
+ "2025-05-15__singularity.parquet: 0%| | 0.00/87.4k [00:00<?, ?B/s]"
1478
+ ]
1479
+ },
1480
+ "metadata": {},
1481
+ "output_type": "display_data"
1482
+ },
1483
+ {
1484
+ "data": {
1485
+ "application/vnd.jupyter.widget-view+json": {
1486
+ "model_id": "f39123da09d440b1bbcac24baba73f74",
1487
+ "version_major": 2,
1488
+ "version_minor": 0
1489
+ },
1490
+ "text/plain": [
1491
+ "2025-05-16__artificial.parquet: 0%| | 0.00/28.3k [00:00<?, ?B/s]"
1492
+ ]
1493
+ },
1494
+ "metadata": {},
1495
+ "output_type": "display_data"
1496
+ },
1497
+ {
1498
+ "data": {
1499
+ "application/vnd.jupyter.widget-view+json": {
1500
+ "model_id": "e8fdd23d2d33453a9625e8f964ad5ccf",
1501
+ "version_major": 2,
1502
+ "version_minor": 0
1503
+ },
1504
+ "text/plain": [
1505
+ "2025-05-15__openai.parquet: 0%| | 0.00/62.5k [00:00<?, ?B/s]"
1506
+ ]
1507
+ },
1508
+ "metadata": {},
1509
+ "output_type": "display_data"
1510
+ },
1511
+ {
1512
+ "data": {
1513
+ "application/vnd.jupyter.widget-view+json": {
1514
+ "model_id": "a9cd7f7663c0483d9b884efb7ddf9461",
1515
+ "version_major": 2,
1516
+ "version_minor": 0
1517
+ },
1518
+ "text/plain": [
1519
+ "2025-05-16__localllama.parquet: 0%| | 0.00/88.7k [00:00<?, ?B/s]"
1520
+ ]
1521
+ },
1522
+ "metadata": {},
1523
+ "output_type": "display_data"
1524
+ },
1525
+ {
1526
+ "data": {
1527
+ "application/vnd.jupyter.widget-view+json": {
1528
+ "model_id": "2cad07640f3a426d90ec230979606662",
1529
+ "version_major": 2,
1530
+ "version_minor": 0
1531
+ },
1532
+ "text/plain": [
1533
+ "2025-05-16__singularity.parquet: 0%| | 0.00/61.4k [00:00<?, ?B/s]"
1534
+ ]
1535
+ },
1536
+ "metadata": {},
1537
+ "output_type": "display_data"
1538
+ },
1539
+ {
1540
+ "data": {
1541
+ "application/vnd.jupyter.widget-view+json": {
1542
+ "model_id": "f417ed44a35249709bcc6aa12d2f00fd",
1543
+ "version_major": 2,
1544
+ "version_minor": 0
1545
+ },
1546
+ "text/plain": [
1547
+ "2025-05-16__openai.parquet: 0%| | 0.00/61.3k [00:00<?, ?B/s]"
1548
+ ]
1549
+ },
1550
+ "metadata": {},
1551
+ "output_type": "display_data"
1552
+ },
1553
+ {
1554
+ "data": {
1555
+ "application/vnd.jupyter.widget-view+json": {
1556
+ "model_id": "c90b234944fa4e48928f80a8fcad1b11",
1557
+ "version_major": 2,
1558
+ "version_minor": 0
1559
+ },
1560
+ "text/plain": [
1561
+ "2025-05-17__artificial.parquet: 0%| | 0.00/31.0k [00:00<?, ?B/s]"
1562
+ ]
1563
+ },
1564
+ "metadata": {},
1565
+ "output_type": "display_data"
1566
+ },
1567
+ {
1568
+ "data": {
1569
+ "application/vnd.jupyter.widget-view+json": {
1570
+ "model_id": "b08c9e2a792844c283e409b4c53b311d",
1571
+ "version_major": 2,
1572
+ "version_minor": 0
1573
+ },
1574
+ "text/plain": [
1575
+ "2025-05-17__localllama.parquet: 0%| | 0.00/83.0k [00:00<?, ?B/s]"
1576
+ ]
1577
+ },
1578
+ "metadata": {},
1579
+ "output_type": "display_data"
1580
+ },
1581
+ {
1582
+ "data": {
1583
+ "application/vnd.jupyter.widget-view+json": {
1584
+ "model_id": "cd3aeda6025c4253806583b881f4bde0",
1585
+ "version_major": 2,
1586
+ "version_minor": 0
1587
+ },
1588
+ "text/plain": [
1589
+ "2025-05-17__singularity.parquet: 0%| | 0.00/55.8k [00:00<?, ?B/s]"
1590
+ ]
1591
+ },
1592
+ "metadata": {},
1593
+ "output_type": "display_data"
1594
+ },
1595
+ {
1596
+ "data": {
1597
+ "application/vnd.jupyter.widget-view+json": {
1598
+ "model_id": "c749b4e239f14c14946af293bd5878ec",
1599
+ "version_major": 2,
1600
+ "version_minor": 0
1601
+ },
1602
+ "text/plain": [
1603
+ "2025-05-17__openai.parquet: 0%| | 0.00/75.9k [00:00<?, ?B/s]"
1604
+ ]
1605
+ },
1606
+ "metadata": {},
1607
+ "output_type": "display_data"
1608
+ },
1609
+ {
1610
+ "data": {
1611
+ "application/vnd.jupyter.widget-view+json": {
1612
+ "model_id": "f35850eb833f470eb34a7bb7af90e7bb",
1613
+ "version_major": 2,
1614
+ "version_minor": 0
1615
+ },
1616
+ "text/plain": [
1617
+ "2025-05-18__artificial.parquet: 0%| | 0.00/20.8k [00:00<?, ?B/s]"
1618
+ ]
1619
+ },
1620
+ "metadata": {},
1621
+ "output_type": "display_data"
1622
+ },
1623
+ {
1624
+ "data": {
1625
+ "application/vnd.jupyter.widget-view+json": {
1626
+ "model_id": "8e3a317077b447828b5f188af62e495a",
1627
+ "version_major": 2,
1628
+ "version_minor": 0
1629
+ },
1630
+ "text/plain": [
1631
+ "2025-05-18__localllama.parquet: 0%| | 0.00/89.1k [00:00<?, ?B/s]"
1632
+ ]
1633
+ },
1634
+ "metadata": {},
1635
+ "output_type": "display_data"
1636
+ },
1637
+ {
1638
+ "data": {
1639
+ "application/vnd.jupyter.widget-view+json": {
1640
+ "model_id": "818aeb4d423a44b5bb58042c1631609c",
1641
+ "version_major": 2,
1642
+ "version_minor": 0
1643
+ },
1644
+ "text/plain": [
1645
+ "2025-05-18__singularity.parquet: 0%| | 0.00/37.0k [00:00<?, ?B/s]"
1646
+ ]
1647
+ },
1648
+ "metadata": {},
1649
+ "output_type": "display_data"
1650
+ },
1651
+ {
1652
+ "data": {
1653
+ "application/vnd.jupyter.widget-view+json": {
1654
+ "model_id": "ac8350638ff64429ab8537211b46e349",
1655
+ "version_major": 2,
1656
+ "version_minor": 0
1657
+ },
1658
+ "text/plain": [
1659
+ "2025-05-18__openai.parquet: 0%| | 0.00/59.9k [00:00<?, ?B/s]"
1660
+ ]
1661
+ },
1662
+ "metadata": {},
1663
+ "output_type": "display_data"
1664
+ },
1665
+ {
1666
+ "data": {
1667
+ "application/vnd.jupyter.widget-view+json": {
1668
+ "model_id": "c8ac2c9af38341e988c1cc110a37c0aa",
1669
+ "version_major": 2,
1670
+ "version_minor": 0
1671
+ },
1672
+ "text/plain": [
1673
+ "2025-05-19__artificial.parquet: 0%| | 0.00/34.9k [00:00<?, ?B/s]"
1674
+ ]
1675
+ },
1676
+ "metadata": {},
1677
+ "output_type": "display_data"
1678
+ },
1679
+ {
1680
+ "data": {
1681
+ "application/vnd.jupyter.widget-view+json": {
1682
+ "model_id": "d1b8dfcf11084da182ca8a30b121d6d4",
1683
+ "version_major": 2,
1684
+ "version_minor": 0
1685
+ },
1686
+ "text/plain": [
1687
+ "2025-05-19__localllama.parquet: 0%| | 0.00/83.4k [00:00<?, ?B/s]"
1688
+ ]
1689
+ },
1690
+ "metadata": {},
1691
+ "output_type": "display_data"
1692
+ },
1693
+ {
1694
+ "data": {
1695
+ "application/vnd.jupyter.widget-view+json": {
1696
+ "model_id": "66d7d49bf1cc407d803805624cf5b4e4",
1697
+ "version_major": 2,
1698
+ "version_minor": 0
1699
+ },
1700
+ "text/plain": [
1701
+ "2025-05-19__singularity.parquet: 0%| | 0.00/74.1k [00:00<?, ?B/s]"
1702
+ ]
1703
+ },
1704
+ "metadata": {},
1705
+ "output_type": "display_data"
1706
+ },
1707
+ {
1708
+ "data": {
1709
+ "application/vnd.jupyter.widget-view+json": {
1710
+ "model_id": "e6a5d864b9ae4a47b3aaf6c090f650e0",
1711
+ "version_major": 2,
1712
+ "version_minor": 0
1713
+ },
1714
+ "text/plain": [
1715
+ "2025-05-19__openai.parquet: 0%| | 0.00/39.2k [00:00<?, ?B/s]"
1716
+ ]
1717
+ },
1718
+ "metadata": {},
1719
+ "output_type": "display_data"
1720
+ },
1721
+ {
1722
+ "data": {
1723
+ "application/vnd.jupyter.widget-view+json": {
1724
+ "model_id": "6e6ba9dcefa94b00bd2b9e788cce50c7",
1725
+ "version_major": 2,
1726
+ "version_minor": 0
1727
+ },
1728
+ "text/plain": [
1729
+ "2025-05-20__artificial.parquet: 0%| | 0.00/29.8k [00:00<?, ?B/s]"
1730
+ ]
1731
+ },
1732
+ "metadata": {},
1733
+ "output_type": "display_data"
1734
+ },
1735
+ {
1736
+ "data": {
1737
+ "application/vnd.jupyter.widget-view+json": {
1738
+ "model_id": "73cb4e099ba746ffaf3ce037ca9321b3",
1739
+ "version_major": 2,
1740
+ "version_minor": 0
1741
+ },
1742
+ "text/plain": [
1743
+ "2025-05-20__localllama.parquet: 0%| | 0.00/76.2k [00:00<?, ?B/s]"
1744
+ ]
1745
+ },
1746
+ "metadata": {},
1747
+ "output_type": "display_data"
1748
+ },
1749
+ {
1750
+ "data": {
1751
+ "application/vnd.jupyter.widget-view+json": {
1752
+ "model_id": "1aa484c8a9c64f479b6c00db725e2b08",
1753
+ "version_major": 2,
1754
+ "version_minor": 0
1755
+ },
1756
+ "text/plain": [
1757
+ "2025-05-20__singularity.parquet: 0%| | 0.00/74.3k [00:00<?, ?B/s]"
1758
+ ]
1759
+ },
1760
+ "metadata": {},
1761
+ "output_type": "display_data"
1762
+ },
1763
+ {
1764
+ "data": {
1765
+ "application/vnd.jupyter.widget-view+json": {
1766
+ "model_id": "9bedec9db179405193a5174ae2eaf84c",
1767
+ "version_major": 2,
1768
+ "version_minor": 0
1769
+ },
1770
+ "text/plain": [
1771
+ "2025-05-20__openai.parquet: 0%| | 0.00/44.1k [00:00<?, ?B/s]"
1772
+ ]
1773
+ },
1774
+ "metadata": {},
1775
+ "output_type": "display_data"
1776
+ },
1777
+ {
1778
+ "data": {
1779
+ "application/vnd.jupyter.widget-view+json": {
1780
+ "model_id": "d36d86fa830a4afcb50fa4d7cd6384cb",
1781
+ "version_major": 2,
1782
+ "version_minor": 0
1783
+ },
1784
+ "text/plain": [
1785
+ "2025-05-21__artificial.parquet: 0%| | 0.00/30.5k [00:00<?, ?B/s]"
1786
+ ]
1787
+ },
1788
+ "metadata": {},
1789
+ "output_type": "display_data"
1790
+ },
1791
+ {
1792
+ "data": {
1793
+ "application/vnd.jupyter.widget-view+json": {
1794
+ "model_id": "8c32cb9728e8486a836dd2c7be037eb5",
1795
+ "version_major": 2,
1796
+ "version_minor": 0
1797
+ },
1798
+ "text/plain": [
1799
+ "2025-05-21__localllama.parquet: 0%| | 0.00/103k [00:00<?, ?B/s]"
1800
+ ]
1801
+ },
1802
+ "metadata": {},
1803
+ "output_type": "display_data"
1804
+ },
1805
+ {
1806
+ "data": {
1807
+ "application/vnd.jupyter.widget-view+json": {
1808
+ "model_id": "a63b4c0871e04f9eb20ed25cc689806f",
1809
+ "version_major": 2,
1810
+ "version_minor": 0
1811
+ },
1812
+ "text/plain": [
1813
+ "2025-05-21__singularity.parquet: 0%| | 0.00/134k [00:00<?, ?B/s]"
1814
+ ]
1815
+ },
1816
+ "metadata": {},
1817
+ "output_type": "display_data"
1818
+ },
1819
+ {
1820
+ "data": {
1821
+ "application/vnd.jupyter.widget-view+json": {
1822
+ "model_id": "a5a36a0ecafc49f78455368f8bdacffa",
1823
+ "version_major": 2,
1824
+ "version_minor": 0
1825
+ },
1826
+ "text/plain": [
1827
+ "2025-05-21__openai.parquet: 0%| | 0.00/63.5k [00:00<?, ?B/s]"
1828
+ ]
1829
+ },
1830
+ "metadata": {},
1831
+ "output_type": "display_data"
1832
+ },
1833
+ {
1834
+ "data": {
1835
+ "application/vnd.jupyter.widget-view+json": {
1836
+ "model_id": "edb6712fdac44a57aad37d6960e65c5d",
1837
+ "version_major": 2,
1838
+ "version_minor": 0
1839
+ },
1840
+ "text/plain": [
1841
+ "2025-05-22__artificial.parquet: 0%| | 0.00/29.3k [00:00<?, ?B/s]"
1842
+ ]
1843
+ },
1844
+ "metadata": {},
1845
+ "output_type": "display_data"
1846
+ },
1847
+ {
1848
+ "data": {
1849
+ "application/vnd.jupyter.widget-view+json": {
1850
+ "model_id": "60327c547d1d4970bc93ba721e017401",
1851
+ "version_major": 2,
1852
+ "version_minor": 0
1853
+ },
1854
+ "text/plain": [
1855
+ "2025-05-22__localllama.parquet: 0%| | 0.00/107k [00:00<?, ?B/s]"
1856
+ ]
1857
+ },
1858
+ "metadata": {},
1859
+ "output_type": "display_data"
1860
+ },
1861
+ {
1862
+ "data": {
1863
+ "application/vnd.jupyter.widget-view+json": {
1864
+ "model_id": "51aee3296d81463da75209b20414fad0",
1865
+ "version_major": 2,
1866
+ "version_minor": 0
1867
+ },
1868
+ "text/plain": [
1869
+ "2025-05-22__singularity.parquet: 0%| | 0.00/84.4k [00:00<?, ?B/s]"
1870
+ ]
1871
+ },
1872
+ "metadata": {},
1873
+ "output_type": "display_data"
1874
+ },
1875
+ {
1876
+ "data": {
1877
+ "application/vnd.jupyter.widget-view+json": {
1878
+ "model_id": "147cfa0d1a4a4bd2a026f44f9a21ef4d",
1879
+ "version_major": 2,
1880
+ "version_minor": 0
1881
+ },
1882
+ "text/plain": [
1883
+ "2025-05-22__openai.parquet: 0%| | 0.00/72.0k [00:00<?, ?B/s]"
1884
+ ]
1885
+ },
1886
+ "metadata": {},
1887
+ "output_type": "display_data"
1888
+ },
1889
+ {
1890
+ "data": {
1891
+ "application/vnd.jupyter.widget-view+json": {
1892
+ "model_id": "d6ed6504fe12452a8530429ec72497e7",
1893
+ "version_major": 2,
1894
+ "version_minor": 0
1895
+ },
1896
+ "text/plain": [
1897
+ "2025-05-23__artificial.parquet: 0%| | 0.00/44.2k [00:00<?, ?B/s]"
1898
+ ]
1899
+ },
1900
+ "metadata": {},
1901
+ "output_type": "display_data"
1902
+ },
1903
+ {
1904
+ "data": {
1905
+ "application/vnd.jupyter.widget-view+json": {
1906
+ "model_id": "7ff04044c7a74935a8c7fa06873de5b1",
1907
+ "version_major": 2,
1908
+ "version_minor": 0
1909
+ },
1910
+ "text/plain": [
1911
+ "2025-05-23__localllama.parquet: 0%| | 0.00/97.4k [00:00<?, ?B/s]"
1912
+ ]
1913
+ },
1914
+ "metadata": {},
1915
+ "output_type": "display_data"
1916
+ },
1917
+ {
1918
+ "data": {
1919
+ "application/vnd.jupyter.widget-view+json": {
1920
+ "model_id": "2a0d0d2b3cac40cc9f030b4c9bea1b17",
1921
+ "version_major": 2,
1922
+ "version_minor": 0
1923
+ },
1924
+ "text/plain": [
1925
+ "2025-05-23__singularity.parquet: 0%| | 0.00/80.9k [00:00<?, ?B/s]"
1926
+ ]
1927
+ },
1928
+ "metadata": {},
1929
+ "output_type": "display_data"
1930
+ },
1931
+ {
1932
+ "data": {
1933
+ "application/vnd.jupyter.widget-view+json": {
1934
+ "model_id": "293ac26617d043e9a6b089b6c41a1c24",
1935
+ "version_major": 2,
1936
+ "version_minor": 0
1937
+ },
1938
+ "text/plain": [
1939
+ "2025-05-23__openai.parquet: 0%| | 0.00/53.4k [00:00<?, ?B/s]"
1940
+ ]
1941
+ },
1942
+ "metadata": {},
1943
+ "output_type": "display_data"
1944
+ },
1945
+ {
1946
+ "data": {
1947
+ "application/vnd.jupyter.widget-view+json": {
1948
+ "model_id": "7d4a32215f294ced838440086a2222ed",
1949
+ "version_major": 2,
1950
+ "version_minor": 0
1951
+ },
1952
+ "text/plain": [
1953
+ "2025-05-24__artificial.parquet: 0%| | 0.00/36.4k [00:00<?, ?B/s]"
1954
+ ]
1955
+ },
1956
+ "metadata": {},
1957
+ "output_type": "display_data"
1958
+ },
1959
+ {
1960
+ "data": {
1961
+ "application/vnd.jupyter.widget-view+json": {
1962
+ "model_id": "31ae291a73ea4b54b6119b4917b4028f",
1963
+ "version_major": 2,
1964
+ "version_minor": 0
1965
+ },
1966
+ "text/plain": [
1967
+ "2025-05-24__localllama.parquet: 0%| | 0.00/88.0k [00:00<?, ?B/s]"
1968
+ ]
1969
+ },
1970
+ "metadata": {},
1971
+ "output_type": "display_data"
1972
+ },
1973
+ {
1974
+ "data": {
1975
+ "application/vnd.jupyter.widget-view+json": {
1976
+ "model_id": "2ecffec0359c4c289e22c86feafd5316",
1977
+ "version_major": 2,
1978
+ "version_minor": 0
1979
+ },
1980
+ "text/plain": [
1981
+ "2025-05-24__singularity.parquet: 0%| | 0.00/102k [00:00<?, ?B/s]"
1982
+ ]
1983
+ },
1984
+ "metadata": {},
1985
+ "output_type": "display_data"
1986
+ },
1987
+ {
1988
+ "data": {
1989
+ "application/vnd.jupyter.widget-view+json": {
1990
+ "model_id": "0562f20172df437fb2ab5a2931b27876",
1991
+ "version_major": 2,
1992
+ "version_minor": 0
1993
+ },
1994
+ "text/plain": [
1995
+ "2025-05-24__openai.parquet: 0%| | 0.00/66.3k [00:00<?, ?B/s]"
1996
+ ]
1997
+ },
1998
+ "metadata": {},
1999
+ "output_type": "display_data"
2000
+ },
2001
+ {
2002
+ "data": {
2003
+ "application/vnd.jupyter.widget-view+json": {
2004
+ "model_id": "5cb6500f4252418393e74d5eaaf3c507",
2005
+ "version_major": 2,
2006
+ "version_minor": 0
2007
+ },
2008
+ "text/plain": [
2009
+ "2025-05-25__artificial.parquet: 0%| | 0.00/46.3k [00:00<?, ?B/s]"
2010
+ ]
2011
+ },
2012
+ "metadata": {},
2013
+ "output_type": "display_data"
2014
+ },
2015
+ {
2016
+ "data": {
2017
+ "application/vnd.jupyter.widget-view+json": {
2018
+ "model_id": "89e9eddfd25348f89d045b00dee77e07",
2019
+ "version_major": 2,
2020
+ "version_minor": 0
2021
+ },
2022
+ "text/plain": [
2023
+ "2025-05-25__localllama.parquet: 0%| | 0.00/77.1k [00:00<?, ?B/s]"
2024
+ ]
2025
+ },
2026
+ "metadata": {},
2027
+ "output_type": "display_data"
2028
+ },
2029
+ {
2030
+ "data": {
2031
+ "application/vnd.jupyter.widget-view+json": {
2032
+ "model_id": "2e6d25bafd1042dd8e86c441d13a4c00",
2033
+ "version_major": 2,
2034
+ "version_minor": 0
2035
+ },
2036
+ "text/plain": [
2037
+ "2025-05-25__singularity.parquet: 0%| | 0.00/71.1k [00:00<?, ?B/s]"
2038
+ ]
2039
+ },
2040
+ "metadata": {},
2041
+ "output_type": "display_data"
2042
+ },
2043
+ {
2044
+ "data": {
2045
+ "application/vnd.jupyter.widget-view+json": {
2046
+ "model_id": "10c8dd04e5e74cffb3ca466dfad8c188",
2047
+ "version_major": 2,
2048
+ "version_minor": 0
2049
+ },
2050
+ "text/plain": [
2051
+ "2025-05-25__openai.parquet: 0%| | 0.00/63.2k [00:00<?, ?B/s]"
2052
+ ]
2053
+ },
2054
+ "metadata": {},
2055
+ "output_type": "display_data"
2056
+ },
2057
+ {
2058
+ "data": {
2059
+ "application/vnd.jupyter.widget-view+json": {
2060
+ "model_id": "9c9251a1a90c47beace49fedb97ae176",
2061
+ "version_major": 2,
2062
+ "version_minor": 0
2063
+ },
2064
+ "text/plain": [
2065
+ "2025-05-26__artificial.parquet: 0%| | 0.00/29.4k [00:00<?, ?B/s]"
2066
+ ]
2067
+ },
2068
+ "metadata": {},
2069
+ "output_type": "display_data"
2070
+ },
2071
+ {
2072
+ "data": {
2073
+ "application/vnd.jupyter.widget-view+json": {
2074
+ "model_id": "12d335dfe3b24f1c802bdf8e1c2c89a8",
2075
+ "version_major": 2,
2076
+ "version_minor": 0
2077
+ },
2078
+ "text/plain": [
2079
+ "2025-05-26__localllama.parquet: 0%| | 0.00/103k [00:00<?, ?B/s]"
2080
+ ]
2081
+ },
2082
+ "metadata": {},
2083
+ "output_type": "display_data"
2084
+ },
2085
+ {
2086
+ "data": {
2087
+ "application/vnd.jupyter.widget-view+json": {
2088
+ "model_id": "964f131d58164bcea62374320f3be24b",
2089
+ "version_major": 2,
2090
+ "version_minor": 0
2091
+ },
2092
+ "text/plain": [
2093
+ "2025-05-26__singularity.parquet: 0%| | 0.00/64.8k [00:00<?, ?B/s]"
2094
+ ]
2095
+ },
2096
+ "metadata": {},
2097
+ "output_type": "display_data"
2098
+ },
2099
+ {
2100
+ "data": {
2101
+ "application/vnd.jupyter.widget-view+json": {
2102
+ "model_id": "932b3d54b55a4b2e8419a90af68c2ef8",
2103
+ "version_major": 2,
2104
+ "version_minor": 0
2105
+ },
2106
+ "text/plain": [
2107
+ "2025-05-26__openai.parquet: 0%| | 0.00/59.3k [00:00<?, ?B/s]"
2108
+ ]
2109
+ },
2110
+ "metadata": {},
2111
+ "output_type": "display_data"
2112
+ },
2113
+ {
2114
+ "data": {
2115
+ "application/vnd.jupyter.widget-view+json": {
2116
+ "model_id": "93a7910bb2574a2a92ad5c3f00e72352",
2117
+ "version_major": 2,
2118
+ "version_minor": 0
2119
+ },
2120
+ "text/plain": [
2121
+ "2025-05-27.parquet: 0%| | 0.00/232k [00:00<?, ?B/s]"
2122
+ ]
2123
+ },
2124
+ "metadata": {},
2125
+ "output_type": "display_data"
2126
+ },
2127
+ {
2128
+ "data": {
2129
+ "application/vnd.jupyter.widget-view+json": {
2130
+ "model_id": "e9de3761367f4fcf80fc5978b5ef9d08",
2131
+ "version_major": 2,
2132
+ "version_minor": 0
2133
+ },
2134
+ "text/plain": [
2135
+ "2025-05-28.parquet: 0%| | 0.00/270k [00:00<?, ?B/s]"
2136
+ ]
2137
+ },
2138
+ "metadata": {},
2139
+ "output_type": "display_data"
2140
+ },
2141
+ {
2142
+ "data": {
2143
+ "application/vnd.jupyter.widget-view+json": {
2144
+ "model_id": "55b638eb2ae3431ba0facd50aac5146e",
2145
+ "version_major": 2,
2146
+ "version_minor": 0
2147
+ },
2148
+ "text/plain": [
2149
+ "2025-05-29.parquet: 0%| | 0.00/262k [00:00<?, ?B/s]"
2150
+ ]
2151
+ },
2152
+ "metadata": {},
2153
+ "output_type": "display_data"
2154
+ },
2155
+ {
2156
+ "data": {
2157
+ "application/vnd.jupyter.widget-view+json": {
2158
+ "model_id": "83a2c27a2bc14de787b45736894bf9b8",
2159
+ "version_major": 2,
2160
+ "version_minor": 0
2161
+ },
2162
+ "text/plain": [
2163
+ "2025-05-30.parquet: 0%| | 0.00/240k [00:00<?, ?B/s]"
2164
+ ]
2165
+ },
2166
+ "metadata": {},
2167
+ "output_type": "display_data"
2168
+ },
2169
+ {
2170
+ "data": {
2171
+ "application/vnd.jupyter.widget-view+json": {
2172
+ "model_id": "5a90b1ee7ad14a2a804990f948f575c7",
2173
+ "version_major": 2,
2174
+ "version_minor": 0
2175
+ },
2176
+ "text/plain": [
2177
+ "2025-05-31.parquet: 0%| | 0.00/231k [00:00<?, ?B/s]"
2178
+ ]
2179
+ },
2180
+ "metadata": {},
2181
+ "output_type": "display_data"
2182
+ },
2183
+ {
2184
+ "data": {
2185
+ "application/vnd.jupyter.widget-view+json": {
2186
+ "model_id": "f703a9d565b54fe992e457891077635b",
2187
+ "version_major": 2,
2188
+ "version_minor": 0
2189
+ },
2190
+ "text/plain": [
2191
+ "2025-06-01.parquet: 0%| | 0.00/167k [00:00<?, ?B/s]"
2192
+ ]
2193
+ },
2194
+ "metadata": {},
2195
+ "output_type": "display_data"
2196
+ },
2197
+ {
2198
+ "data": {
2199
+ "application/vnd.jupyter.widget-view+json": {
2200
+ "model_id": "8c79327d81a14ce6a526ee88c5bdb9e5",
2201
+ "version_major": 2,
2202
+ "version_minor": 0
2203
+ },
2204
+ "text/plain": [
2205
+ "2025-06-02.parquet: 0%| | 0.00/250k [00:00<?, ?B/s]"
2206
+ ]
2207
+ },
2208
+ "metadata": {},
2209
+ "output_type": "display_data"
2210
+ },
2211
+ {
2212
+ "data": {
2213
+ "application/vnd.jupyter.widget-view+json": {
2214
+ "model_id": "1cf197678ff842d2bfe2fe30125cb924",
2215
+ "version_major": 2,
2216
+ "version_minor": 0
2217
+ },
2218
+ "text/plain": [
2219
+ "2025-06-03.parquet: 0%| | 0.00/206k [00:00<?, ?B/s]"
2220
+ ]
2221
+ },
2222
+ "metadata": {},
2223
+ "output_type": "display_data"
2224
+ },
2225
+ {
2226
+ "data": {
2227
+ "application/vnd.jupyter.widget-view+json": {
2228
+ "model_id": "28cda701d8174e00966856b4d32a74b6",
2229
+ "version_major": 2,
2230
+ "version_minor": 0
2231
+ },
2232
+ "text/plain": [
2233
+ "2025-06-04.parquet: 0%| | 0.00/269k [00:00<?, ?B/s]"
2234
+ ]
2235
+ },
2236
+ "metadata": {},
2237
+ "output_type": "display_data"
2238
+ },
2239
+ {
2240
+ "data": {
2241
+ "application/vnd.jupyter.widget-view+json": {
2242
+ "model_id": "0105914388e94b5289036a1415360afa",
2243
+ "version_major": 2,
2244
+ "version_minor": 0
2245
+ },
2246
+ "text/plain": [
2247
+ "2025-05-27__artificial.parquet: 0%| | 0.00/32.8k [00:00<?, ?B/s]"
2248
+ ]
2249
+ },
2250
+ "metadata": {},
2251
+ "output_type": "display_data"
2252
+ },
2253
+ {
2254
+ "data": {
2255
+ "application/vnd.jupyter.widget-view+json": {
2256
+ "model_id": "14874b9c03024c2d863c285345810028",
2257
+ "version_major": 2,
2258
+ "version_minor": 0
2259
+ },
2260
+ "text/plain": [
2261
+ "2025-05-27__singularity.parquet: 0%| | 0.00/85.7k [00:00<?, ?B/s]"
2262
+ ]
2263
+ },
2264
+ "metadata": {},
2265
+ "output_type": "display_data"
2266
+ },
2267
+ {
2268
+ "data": {
2269
+ "application/vnd.jupyter.widget-view+json": {
2270
+ "model_id": "4ac23fb5af124e04b64ff29059912298",
2271
+ "version_major": 2,
2272
+ "version_minor": 0
2273
+ },
2274
+ "text/plain": [
2275
+ "Upload 36 LFS files: 0%| | 0/36 [00:00<?, ?it/s]"
2276
+ ]
2277
+ },
2278
+ "metadata": {},
2279
+ "output_type": "display_data"
2280
+ },
2281
+ {
2282
+ "data": {
2283
+ "application/vnd.jupyter.widget-view+json": {
2284
+ "model_id": "cd4317b62cdb4be1a4ff725f12c6176c",
2285
+ "version_major": 2,
2286
+ "version_minor": 0
2287
+ },
2288
+ "text/plain": [
2289
+ "2025-05-27__localllama.parquet: 0%| | 0.00/86.4k [00:00<?, ?B/s]"
2290
+ ]
2291
+ },
2292
+ "metadata": {},
2293
+ "output_type": "display_data"
2294
+ },
2295
+ {
2296
+ "data": {
2297
+ "application/vnd.jupyter.widget-view+json": {
2298
+ "model_id": "c87ab7ac7f414fd89a5e73464dcb3de1",
2299
+ "version_major": 2,
2300
+ "version_minor": 0
2301
+ },
2302
+ "text/plain": [
2303
+ "2025-05-27__openai.parquet: 0%| | 0.00/50.3k [00:00<?, ?B/s]"
2304
+ ]
2305
+ },
2306
+ "metadata": {},
2307
+ "output_type": "display_data"
2308
+ },
2309
+ {
2310
+ "data": {
2311
+ "application/vnd.jupyter.widget-view+json": {
2312
+ "model_id": "b0f9464f6d01470585b06fb3a51bb702",
2313
+ "version_major": 2,
2314
+ "version_minor": 0
2315
+ },
2316
+ "text/plain": [
2317
+ "2025-05-28__artificial.parquet: 0%| | 0.00/27.6k [00:00<?, ?B/s]"
2318
+ ]
2319
+ },
2320
+ "metadata": {},
2321
+ "output_type": "display_data"
2322
+ },
2323
+ {
2324
+ "data": {
2325
+ "application/vnd.jupyter.widget-view+json": {
2326
+ "model_id": "203e32d6504344708541a4d304496100",
2327
+ "version_major": 2,
2328
+ "version_minor": 0
2329
+ },
2330
+ "text/plain": [
2331
+ "2025-05-28__localllama.parquet: 0%| | 0.00/93.1k [00:00<?, ?B/s]"
2332
+ ]
2333
+ },
2334
+ "metadata": {},
2335
+ "output_type": "display_data"
2336
+ },
2337
+ {
2338
+ "data": {
2339
+ "application/vnd.jupyter.widget-view+json": {
2340
+ "model_id": "c8dcbc994b294834bb073eb194053f9e",
2341
+ "version_major": 2,
2342
+ "version_minor": 0
2343
+ },
2344
+ "text/plain": [
2345
+ "2025-05-28__singularity.parquet: 0%| | 0.00/115k [00:00<?, ?B/s]"
2346
+ ]
2347
+ },
2348
+ "metadata": {},
2349
+ "output_type": "display_data"
2350
+ },
2351
+ {
2352
+ "data": {
2353
+ "application/vnd.jupyter.widget-view+json": {
2354
+ "model_id": "853063d0703141dfbd81a3e2540a569d",
2355
+ "version_major": 2,
2356
+ "version_minor": 0
2357
+ },
2358
+ "text/plain": [
2359
+ "2025-05-28__openai.parquet: 0%| | 0.00/62.4k [00:00<?, ?B/s]"
2360
+ ]
2361
+ },
2362
+ "metadata": {},
2363
+ "output_type": "display_data"
2364
+ },
2365
+ {
2366
+ "data": {
2367
+ "application/vnd.jupyter.widget-view+json": {
2368
+ "model_id": "2e24f6a4094d4196989db9cf4e333ee4",
2369
+ "version_major": 2,
2370
+ "version_minor": 0
2371
+ },
2372
+ "text/plain": [
2373
+ "2025-05-29__artificial.parquet: 0%| | 0.00/26.9k [00:00<?, ?B/s]"
2374
+ ]
2375
+ },
2376
+ "metadata": {},
2377
+ "output_type": "display_data"
2378
+ },
2379
+ {
2380
+ "data": {
2381
+ "application/vnd.jupyter.widget-view+json": {
2382
+ "model_id": "3d9974e8ab9f4796beb67915b338f59b",
2383
+ "version_major": 2,
2384
+ "version_minor": 0
2385
+ },
2386
+ "text/plain": [
2387
+ "2025-05-29__localllama.parquet: 0%| | 0.00/123k [00:00<?, ?B/s]"
2388
+ ]
2389
+ },
2390
+ "metadata": {},
2391
+ "output_type": "display_data"
2392
+ },
2393
+ {
2394
+ "data": {
2395
+ "application/vnd.jupyter.widget-view+json": {
2396
+ "model_id": "6b8de7975297472a8ef7b8daf904ca0c",
2397
+ "version_major": 2,
2398
+ "version_minor": 0
2399
+ },
2400
+ "text/plain": [
2401
+ "2025-05-29__singularity.parquet: 0%| | 0.00/100k [00:00<?, ?B/s]"
2402
+ ]
2403
+ },
2404
+ "metadata": {},
2405
+ "output_type": "display_data"
2406
+ },
2407
+ {
2408
+ "data": {
2409
+ "application/vnd.jupyter.widget-view+json": {
2410
+ "model_id": "0c17c89f875746759f2832fc3d64f3e1",
2411
+ "version_major": 2,
2412
+ "version_minor": 0
2413
+ },
2414
+ "text/plain": [
2415
+ "2025-05-29__openai.parquet: 0%| | 0.00/42.5k [00:00<?, ?B/s]"
2416
+ ]
2417
+ },
2418
+ "metadata": {},
2419
+ "output_type": "display_data"
2420
+ },
2421
+ {
2422
+ "data": {
2423
+ "application/vnd.jupyter.widget-view+json": {
2424
+ "model_id": "2bb8c8922cab406e86de439df0cb9154",
2425
+ "version_major": 2,
2426
+ "version_minor": 0
2427
+ },
2428
+ "text/plain": [
2429
+ "2025-05-30__artificial.parquet: 0%| | 0.00/29.4k [00:00<?, ?B/s]"
2430
+ ]
2431
+ },
2432
+ "metadata": {},
2433
+ "output_type": "display_data"
2434
+ },
2435
+ {
2436
+ "data": {
2437
+ "application/vnd.jupyter.widget-view+json": {
2438
+ "model_id": "6336539762264fca8d65f34d57a73632",
2439
+ "version_major": 2,
2440
+ "version_minor": 0
2441
+ },
2442
+ "text/plain": [
2443
+ "2025-05-30__localllama.parquet: 0%| | 0.00/94.9k [00:00<?, ?B/s]"
2444
+ ]
2445
+ },
2446
+ "metadata": {},
2447
+ "output_type": "display_data"
2448
+ },
2449
+ {
2450
+ "data": {
2451
+ "application/vnd.jupyter.widget-view+json": {
2452
+ "model_id": "fdffb60713f041379a0310ea550b217d",
2453
+ "version_major": 2,
2454
+ "version_minor": 0
2455
+ },
2456
+ "text/plain": [
2457
+ "2025-05-30__singularity.parquet: 0%| | 0.00/88.7k [00:00<?, ?B/s]"
2458
+ ]
2459
+ },
2460
+ "metadata": {},
2461
+ "output_type": "display_data"
2462
+ },
2463
+ {
2464
+ "data": {
2465
+ "application/vnd.jupyter.widget-view+json": {
2466
+ "model_id": "c569149b6e0a4b039c7bd942afb0c0e4",
2467
+ "version_major": 2,
2468
+ "version_minor": 0
2469
+ },
2470
+ "text/plain": [
2471
+ "2025-05-30__openai.parquet: 0%| | 0.00/50.6k [00:00<?, ?B/s]"
2472
+ ]
2473
+ },
2474
+ "metadata": {},
2475
+ "output_type": "display_data"
2476
+ },
2477
+ {
2478
+ "data": {
2479
+ "application/vnd.jupyter.widget-view+json": {
2480
+ "model_id": "c6862669361e42048b862f4bd314e700",
2481
+ "version_major": 2,
2482
+ "version_minor": 0
2483
+ },
2484
+ "text/plain": [
2485
+ "2025-05-31__artificial.parquet: 0%| | 0.00/34.1k [00:00<?, ?B/s]"
2486
+ ]
2487
+ },
2488
+ "metadata": {},
2489
+ "output_type": "display_data"
2490
+ },
2491
+ {
2492
+ "data": {
2493
+ "application/vnd.jupyter.widget-view+json": {
2494
+ "model_id": "98436288eb734934a80746c56f46e3f3",
2495
+ "version_major": 2,
2496
+ "version_minor": 0
2497
+ },
2498
+ "text/plain": [
2499
+ "2025-05-31__localllama.parquet: 0%| | 0.00/82.8k [00:00<?, ?B/s]"
2500
+ ]
2501
+ },
2502
+ "metadata": {},
2503
+ "output_type": "display_data"
2504
+ },
2505
+ {
2506
+ "data": {
2507
+ "application/vnd.jupyter.widget-view+json": {
2508
+ "model_id": "4273f1d1d0e34eb5b9495b552c132d0d",
2509
+ "version_major": 2,
2510
+ "version_minor": 0
2511
+ },
2512
+ "text/plain": [
2513
+ "2025-05-31__singularity.parquet: 0%| | 0.00/82.2k [00:00<?, ?B/s]"
2514
+ ]
2515
+ },
2516
+ "metadata": {},
2517
+ "output_type": "display_data"
2518
+ },
2519
+ {
2520
+ "data": {
2521
+ "application/vnd.jupyter.widget-view+json": {
2522
+ "model_id": "e12cda59fafe43db8d005655a913da40",
2523
+ "version_major": 2,
2524
+ "version_minor": 0
2525
+ },
2526
+ "text/plain": [
2527
+ "2025-05-31__openai.parquet: 0%| | 0.00/58.9k [00:00<?, ?B/s]"
2528
+ ]
2529
+ },
2530
+ "metadata": {},
2531
+ "output_type": "display_data"
2532
+ },
2533
+ {
2534
+ "data": {
2535
+ "application/vnd.jupyter.widget-view+json": {
2536
+ "model_id": "c0254fe8ee8743688db50c519208007c",
2537
+ "version_major": 2,
2538
+ "version_minor": 0
2539
+ },
2540
+ "text/plain": [
2541
+ "2025-06-01__artificial.parquet: 0%| | 0.00/12.9k [00:00<?, ?B/s]"
2542
+ ]
2543
+ },
2544
+ "metadata": {},
2545
+ "output_type": "display_data"
2546
+ },
2547
+ {
2548
+ "data": {
2549
+ "application/vnd.jupyter.widget-view+json": {
2550
+ "model_id": "2b54a17e08034a4387a70a6042ffea2c",
2551
+ "version_major": 2,
2552
+ "version_minor": 0
2553
+ },
2554
+ "text/plain": [
2555
+ "2025-06-01__localllama.parquet: 0%| | 0.00/71.4k [00:00<?, ?B/s]"
2556
+ ]
2557
+ },
2558
+ "metadata": {},
2559
+ "output_type": "display_data"
2560
+ },
2561
+ {
2562
+ "data": {
2563
+ "application/vnd.jupyter.widget-view+json": {
2564
+ "model_id": "c636b07f5f85442ea7cd7c9de3a92988",
2565
+ "version_major": 2,
2566
+ "version_minor": 0
2567
+ },
2568
+ "text/plain": [
2569
+ "2025-06-01__singularity.parquet: 0%| | 0.00/50.8k [00:00<?, ?B/s]"
2570
+ ]
2571
+ },
2572
+ "metadata": {},
2573
+ "output_type": "display_data"
2574
+ },
2575
+ {
2576
+ "data": {
2577
+ "application/vnd.jupyter.widget-view+json": {
2578
+ "model_id": "a88a12a5548949e69285c8fb32cb6c8f",
2579
+ "version_major": 2,
2580
+ "version_minor": 0
2581
+ },
2582
+ "text/plain": [
2583
+ "2025-06-01__openai.parquet: 0%| | 0.00/52.1k [00:00<?, ?B/s]"
2584
+ ]
2585
+ },
2586
+ "metadata": {},
2587
+ "output_type": "display_data"
2588
+ },
2589
+ {
2590
+ "data": {
2591
+ "application/vnd.jupyter.widget-view+json": {
2592
+ "model_id": "ec5a3ca174944a679bf80805d5981e79",
2593
+ "version_major": 2,
2594
+ "version_minor": 0
2595
+ },
2596
+ "text/plain": [
2597
+ "2025-06-02__artificial.parquet: 0%| | 0.00/30.2k [00:00<?, ?B/s]"
2598
+ ]
2599
+ },
2600
+ "metadata": {},
2601
+ "output_type": "display_data"
2602
+ },
2603
+ {
2604
+ "data": {
2605
+ "application/vnd.jupyter.widget-view+json": {
2606
+ "model_id": "ad5f8a6c6a014fd4b402d7f331212ef2",
2607
+ "version_major": 2,
2608
+ "version_minor": 0
2609
+ },
2610
+ "text/plain": [
2611
+ "2025-06-02__localllama.parquet: 0%| | 0.00/102k [00:00<?, ?B/s]"
2612
+ ]
2613
+ },
2614
+ "metadata": {},
2615
+ "output_type": "display_data"
2616
+ },
2617
+ {
2618
+ "data": {
2619
+ "application/vnd.jupyter.widget-view+json": {
2620
+ "model_id": "744563bd5cbd485aa7bb55794d7fbfbb",
2621
+ "version_major": 2,
2622
+ "version_minor": 0
2623
+ },
2624
+ "text/plain": [
2625
+ "2025-06-02__singularity.parquet: 0%| | 0.00/81.2k [00:00<?, ?B/s]"
2626
+ ]
2627
+ },
2628
+ "metadata": {},
2629
+ "output_type": "display_data"
2630
+ },
2631
+ {
2632
+ "data": {
2633
+ "application/vnd.jupyter.widget-view+json": {
2634
+ "model_id": "54b2c7fab6ec45b78bb9da01048b6fe3",
2635
+ "version_major": 2,
2636
+ "version_minor": 0
2637
+ },
2638
+ "text/plain": [
2639
+ "2025-06-02__openai.parquet: 0%| | 0.00/59.5k [00:00<?, ?B/s]"
2640
+ ]
2641
+ },
2642
+ "metadata": {},
2643
+ "output_type": "display_data"
2644
+ },
2645
+ {
2646
+ "data": {
2647
+ "application/vnd.jupyter.widget-view+json": {
2648
+ "model_id": "9d2b23456fcf404fb91413d2f28ced9d",
2649
+ "version_major": 2,
2650
+ "version_minor": 0
2651
+ },
2652
+ "text/plain": [
2653
+ "2025-06-03__artificial.parquet: 0%| | 0.00/37.6k [00:00<?, ?B/s]"
2654
+ ]
2655
+ },
2656
+ "metadata": {},
2657
+ "output_type": "display_data"
2658
+ },
2659
+ {
2660
+ "data": {
2661
+ "application/vnd.jupyter.widget-view+json": {
2662
+ "model_id": "674defc04a144c4fbb2a450992a1b2f2",
2663
+ "version_major": 2,
2664
+ "version_minor": 0
2665
+ },
2666
+ "text/plain": [
2667
+ "2025-06-03__localllama.parquet: 0%| | 0.00/76.1k [00:00<?, ?B/s]"
2668
+ ]
2669
+ },
2670
+ "metadata": {},
2671
+ "output_type": "display_data"
2672
+ },
2673
+ {
2674
+ "data": {
2675
+ "application/vnd.jupyter.widget-view+json": {
2676
+ "model_id": "e84930c46a5c422b9fb7a2689b8560a5",
2677
+ "version_major": 2,
2678
+ "version_minor": 0
2679
+ },
2680
+ "text/plain": [
2681
+ "2025-06-03__singularity.parquet: 0%| | 0.00/57.1k [00:00<?, ?B/s]"
2682
+ ]
2683
+ },
2684
+ "metadata": {},
2685
+ "output_type": "display_data"
2686
+ },
2687
+ {
2688
+ "data": {
2689
+ "application/vnd.jupyter.widget-view+json": {
2690
+ "model_id": "73626e3ab52a4f0084c4b95305a0685a",
2691
+ "version_major": 2,
2692
+ "version_minor": 0
2693
+ },
2694
+ "text/plain": [
2695
+ "2025-06-03__openai.parquet: 0%| | 0.00/59.1k [00:00<?, ?B/s]"
2696
+ ]
2697
+ },
2698
+ "metadata": {},
2699
+ "output_type": "display_data"
2700
+ },
2701
+ {
2702
+ "data": {
2703
+ "application/vnd.jupyter.widget-view+json": {
2704
+ "model_id": "0310161f060647ac9468e630615a23b2",
2705
+ "version_major": 2,
2706
+ "version_minor": 0
2707
+ },
2708
+ "text/plain": [
2709
+ "2025-06-04__artificial.parquet: 0%| | 0.00/35.2k [00:00<?, ?B/s]"
2710
+ ]
2711
+ },
2712
+ "metadata": {},
2713
+ "output_type": "display_data"
2714
+ },
2715
+ {
2716
+ "data": {
2717
+ "application/vnd.jupyter.widget-view+json": {
2718
+ "model_id": "cf77892cf2ad4609aeb14b3ff8619c22",
2719
+ "version_major": 2,
2720
+ "version_minor": 0
2721
+ },
2722
+ "text/plain": [
2723
+ "2025-06-04__localllama.parquet: 0%| | 0.00/84.6k [00:00<?, ?B/s]"
2724
+ ]
2725
+ },
2726
+ "metadata": {},
2727
+ "output_type": "display_data"
2728
+ },
2729
+ {
2730
+ "data": {
2731
+ "application/vnd.jupyter.widget-view+json": {
2732
+ "model_id": "6a661e7c21d74c468450b419f8a04bf8",
2733
+ "version_major": 2,
2734
+ "version_minor": 0
2735
+ },
2736
+ "text/plain": [
2737
+ "2025-06-04__singularity.parquet: 0%| | 0.00/84.3k [00:00<?, ?B/s]"
2738
+ ]
2739
+ },
2740
+ "metadata": {},
2741
+ "output_type": "display_data"
2742
+ },
2743
+ {
2744
+ "data": {
2745
+ "application/vnd.jupyter.widget-view+json": {
2746
+ "model_id": "f0d0f2d06033432e86853f24aa427af2",
2747
+ "version_major": 2,
2748
+ "version_minor": 0
2749
+ },
2750
+ "text/plain": [
2751
+ "2025-06-04__openai.parquet: 0%| | 0.00/88.2k [00:00<?, ?B/s]"
2752
+ ]
2753
+ },
2754
+ "metadata": {},
2755
+ "output_type": "display_data"
2756
+ },
2757
+ {
2758
+ "name": "stdout",
2759
+ "output_type": "stream",
2760
+ "text": [
2761
+ "✅ Done – all subreddit shards uploaded.\n"
2762
+ ]
2763
+ }
2764
+ ],
2765
+ "source": [
2766
+ "# Example call – adjust repo_id / token as needed\n",
2767
+ "split_and_upload_by_subreddit(\n",
2768
+ " repo_id=\"hblim/top_reddit_posts_daily\",\n",
2769
+ " source_folder=\"data_scored\",\n",
2770
+ " target_folder=\"data_scored_subreddit\",\n",
2771
+ " overwrite=False, # set True if you need to regenerate everything\n",
2772
+ " batch_size=50, # tweak for faster / slower commits\n",
2773
+ ")"
2774
+ ]
2775
+ }
2776
+ ],
2777
+ "metadata": {
2778
+ "kernelspec": {
2779
+ "display_name": "Python [conda env:reddit]",
2780
+ "language": "python",
2781
+ "name": "conda-env-reddit-py"
2782
+ },
2783
+ "language_info": {
2784
+ "codemirror_mode": {
2785
+ "name": "ipython",
2786
+ "version": 3
2787
+ },
2788
+ "file_extension": ".py",
2789
+ "mimetype": "text/x-python",
2790
+ "name": "python",
2791
+ "nbconvert_exporter": "python",
2792
+ "pygments_lexer": "ipython3",
2793
+ "version": "3.11.12"
2794
+ }
2795
+ },
2796
+ "nbformat": 4,
2797
+ "nbformat_minor": 5
2798
+ }
pyproject.toml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "reddit_analysis"
7
+ version = "0.1.0"
8
+ authors = [
9
+ { name = "Halston Lim", email = "halstonblim@gmail.com" },
10
+ ]
11
+ description = "A pipeline for scraping, analyzing, and summarizing Reddit data"
12
+ readme = "README.md"
13
+ requires-python = ">=3.8"
14
+ dependencies = [
15
+ "pandas",
16
+ "praw",
17
+ "pyarrow",
18
+ "huggingface-hub",
19
+ "replicate",
20
+ "python-dotenv",
21
+ "pyyaml",
22
+ ]
23
+
24
+ [project.optional-dependencies]
25
+ dev = [
26
+ "pytest",
27
+ "black",
28
+ "isort",
29
+ ]
30
+
31
+ [tool.setuptools]
32
+ packages = ["reddit_analysis"]
reddit_analysis/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """
2
+ Reddit Analysis Pipeline
3
+
4
+ A package for scraping, analyzing, and summarizing Reddit data.
5
+ """
6
+
7
+ __version__ = "0.1.0"
reddit_analysis/common_metrics.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from prometheus_client import CollectorRegistry, Histogram, Gauge, push_to_gateway
2
+ import time, os, sys
3
+ from reddit_analysis.config_utils import load_environment, get_secret
4
+
5
+ REGISTRY = CollectorRegistry()
6
+ EXEC_DURATION = Gauge(
7
+ "job_duration_seconds",
8
+ "Wall-clock duration of the most recent job run",
9
+ ["job"],
10
+ registry=REGISTRY
11
+ )
12
+ SUCCESS = Gauge("job_success", "Did the job finish without exception? (1/0)", ["job"], registry=REGISTRY)
13
+ load_environment()
14
+
15
def get_gateway():
    """Return the Prometheus Pushgateway host, or None when not configured.

    A missing PROM_PUSHGW_HOST secret disables metrics instead of failing,
    so callers can treat a None return as "metrics off".
    """
    try:
        host = get_secret("PROM_PUSHGW_HOST")
    except Exception:
        host = None
    return host
20
+
21
def run_with_metrics(job_name, func, *args, **kwargs):
    """Execute *func*, recording duration and success to the Pushgateway.

    When no gateway is configured (get_gateway returns a falsy value), the
    wrapped function runs directly with no instrumentation. The metrics push
    itself is best-effort: a failed push is printed, never raised, and the
    gauges are updated in a finally block so failures are recorded too.
    """
    gateway = get_gateway()
    if not gateway:
        # Metrics disabled, just run the function
        return func(*args, **kwargs)

    t0 = time.time()
    succeeded = 0
    try:
        outcome = func(*args, **kwargs)
        succeeded = 1
        return outcome
    finally:
        EXEC_DURATION.labels(job=job_name).set(time.time() - t0)
        SUCCESS.labels(job=job_name).set(succeeded)
        try:
            print("Pushing to gateway")
            push_to_gateway(gateway, job=job_name, registry=REGISTRY)
        except Exception as e:
            print(f"[metrics] WARNING: push to {gateway} failed: {e}")
reddit_analysis/config_utils.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration utilities for Reddit analysis tools.
3
+ Handles loading of config from YAML and secrets from environment or Streamlit.
4
+ """
5
+ import os
6
+ from pathlib import Path
7
+ import yaml
8
+
9
+ # Determine if Streamlit is available
10
+ try:
11
+ import streamlit as st
12
+ HAS_STREAMLIT = True
13
+ except ImportError:
14
+ HAS_STREAMLIT = False
15
+
16
+ # Project root - now points to the project root directory
17
+ ROOT = Path(__file__).resolve().parent.parent
18
+
19
def is_running_streamlit():
    """Report whether this process is executing inside a Streamlit server.

    Streamlit exports STREAMLIT_SERVER_PORT into the app's environment, so
    the presence of that variable is used as the detection signal.
    """
    return "STREAMLIT_SERVER_PORT" in os.environ
22
+
23
def load_environment():
    """Populate os.environ from the project-level .env file.

    Skipped inside a Streamlit app, where secrets are expected to come from
    st.secrets rather than a dotenv file.
    """
    if is_running_streamlit():
        return
    # Imported lazily so Streamlit deployments do not need python-dotenv.
    from dotenv import load_dotenv
    load_dotenv(dotenv_path=ROOT / '.env')
28
+
29
def get_secret(key, default=None):
    """Get a secret from environment variables or Streamlit secrets.

    Lookup order: os.environ first, then st.secrets when running inside a
    Streamlit app, then the caller-supplied *default*.

    Args:
        key: name of the secret to look up.
        default: value to return when the key is found nowhere.

    Raises:
        ValueError: if the key is found nowhere and no default was given.
    """
    value = os.getenv(key)
    if value is None and HAS_STREAMLIT and is_running_streamlit():
        value = st.secrets.get(key, default)
    if value is None:
        # Bug fix: previously a caller-supplied default was silently ignored
        # outside Streamlit and None was returned instead of the default.
        value = default
    if value is None:
        raise ValueError(f"Required secret {key} not found in environment or Streamlit secrets")
    return value
37
+
38
def load_config(config_path=None):
    """Read the YAML configuration file and return it as a dict.

    Args:
        config_path: optional explicit path; defaults to <root>/config.yaml.
    """
    path = Path(config_path) if config_path is not None else ROOT / "config.yaml"

    with open(path, 'r') as fh:
        return yaml.safe_load(fh)
49
+
50
def get_project_root():
    """Return the project root directory (ROOT: two levels above this file)."""
    return ROOT
53
+
54
def setup_config():
    """
    Set up and return configuration and commonly used values.

    Returns:
        A dictionary containing configuration and common values:
        - config: The parsed YAML config
        - secrets: A dictionary of required secrets (e.g., HF_TOKEN)
        - paths: Common file paths (all relative to project root)

    Raises:
        ValueError: if the mandatory HF_TOKEN secret cannot be found.
    """
    # Load environment variables
    load_environment()

    # Load config
    config = load_config()

    # HF_TOKEN is mandatory; get_secret raises ValueError if it is missing.
    secrets = {
        'HF_TOKEN': get_secret('HF_TOKEN')
    }

    # Optional secrets, consolidated into one loop (the original duplicated
    # this try/except per key). REPLICATE_API_TOKEN is only needed by
    # score.py; the Reddit credentials are validated by scrape.py itself.
    for key in ('REPLICATE_API_TOKEN',
                'REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET', 'REDDIT_USER_AGENT'):
        try:
            secrets[key] = get_secret(key)
        except ValueError:
            pass

    # Local directories (relative to project root) plus the corresponding
    # directory names inside the HF dataset repo.
    paths = {
        'root': ROOT,
        'raw_dir': ROOT / config.get('raw_dir', 'data_raw'),
        'scored_dir': ROOT / config.get('scored_dir', 'data_scored'),
        'logs_dir': ROOT / config.get('logs_dir', 'logs'),
        'summary_file': ROOT / config.get('summary_file', 'subreddit_daily_summary.csv'),
        'hf_raw_dir': config.get('hf_raw_dir', 'data_raw'),
        'hf_scored_dir': config.get('hf_scored_dir', 'data_scored')
    }

    return {
        'config': config,
        'secrets': secrets,
        'paths': paths
    }
reddit_analysis/inference/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """
2
+ Inference subpackage for Reddit Analysis Pipeline.
3
+
4
+ Contains functionality for sentiment analysis of Reddit data.
5
+ """
reddit_analysis/inference/score.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Score Reddit posts and comments using Replicate.
4
+ CLI examples
5
+ ------------
6
+ # Score data for a specific date
7
+ python -m reddit_analysis.inference.score --date 2025-04-20
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import logging
13
+ from datetime import date, timedelta
14
+ from pathlib import Path
15
+ from typing import Optional, List, Dict, Any
16
+ import pandas as pd
17
+ import pyarrow.parquet as pq
18
+ from huggingface_hub import (
19
+ hf_hub_download,
20
+ list_repo_files,
21
+ login,
22
+ upload_file,
23
+ HfApi
24
+ )
25
+ import replicate
26
+ import json
27
+ import httpx
28
+ import re
29
+
30
+ from reddit_analysis.config_utils import setup_config
31
+
32
+ import json
33
+ import time
34
+ from typing import List, Dict
35
+
36
+ import httpx
37
+ import replicate
38
+
39
+
40
def setup_logging(logs_dir: Path) -> logging.Logger:
    """Set up logging configuration using logs_dir from config.

    Ensures logs_dir exists and routes root logging to a per-day file
    named reddit_scorer_YYYY-MM-DD.log. Returns this module's logger.
    """
    logs_dir.mkdir(parents=True, exist_ok=True)

    # One log file per calendar day.
    today = date.today().strftime('%Y-%m-%d')
    log_file = logs_dir / f"reddit_scorer_{today}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[logging.FileHandler(log_file, encoding="utf-8")],
    )

    logger = logging.getLogger(__name__)
    logger.info(f"Logging initialized. Log file: {log_file}")
    return logger
59
+
60
+
61
class ReplicateAPI:
    """Wrapper class for Replicate API interactions."""

    def __init__(self, api_token: str, model: str, timeout_s: int = 1200):
        # Replicate accepts an httpx.Timeout via the `timeout=` kwarg;
        # the same limit applies to connect/read/write/pool.
        self.client = replicate.Client(
            api_token=api_token,
            timeout=httpx.Timeout(timeout_s)
        )
        self.model = model
        self.retries = 3  # total attempts per batch
        self.logger = logging.getLogger(__name__)

    def predict(self, texts: List[str]) -> Dict[str, List[float]]:
        """Run sentiment analysis on a batch of texts.

        The payload is sent as a *JSON string*, and transient HTTP/1.1
        disconnects or read timeouts are retried with exponential backoff
        (1 s, 2 s, 4 s ...). The last failure is re-raised.
        """
        payload = {"texts": json.dumps(texts)}  # keep JSON string

        attempt = 0
        while True:
            try:
                output = self.client.run(self.model, input=payload)
                # Expected Replicate output structure
                return {
                    "predicted_labels": output.get("predicted_labels", []),
                    "confidences": output.get("confidences", []),
                }
            except (httpx.RemoteProtocolError, httpx.ReadTimeout) as err:
                if attempt == self.retries - 1:
                    raise  # re-raise on final failure
                backoff = 2 ** attempt
                self.logger.warning(f"{err!s} – retrying in {backoff}s")
                time.sleep(backoff)
                attempt += 1
97
+
98
+
99
class FileManager:
    """Wrapper class for file operations that can be mocked for testing."""

    def __init__(self, base_dir: Path):
        # Directory that holds all scored parquet files; created eagerly.
        self.base_dir = base_dir
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def save_parquet(self, df: pd.DataFrame, filename: str) -> Path:
        """Write *df* under base_dir as <filename>.parquet and return the path.

        Bug fix: the filename argument was previously not interpolated into
        the f-string, so every DataFrame was written to the same file.
        """
        path = self.base_dir / f"{filename}.parquet"
        df.to_parquet(path, index=False)
        return path

    def read_parquet(self, filename: str) -> pd.DataFrame:
        """Load base_dir/<filename> as a DataFrame (filename includes extension)."""
        path = self.base_dir / filename
        return pd.read_parquet(path)
113
+
114
class HuggingFaceManager:
    """Wrapper class for HuggingFace Hub operations that can be mocked for testing."""

    def __init__(self, token: str, repo_id: str, repo_type: str = "dataset"):
        self.token = token
        self.repo_id = repo_id
        self.repo_type = repo_type
        self.api = HfApi(token=token)

    def download_file(self, path_in_repo: str) -> Path:
        """Download one file from the repo and return its local cache path."""
        local = hf_hub_download(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            filename=path_in_repo,
            token=self.token
        )
        return Path(local)

    def upload_file(self, local_path: str, path_in_repo: str):
        """Upload a local file into the repo at *path_in_repo*."""
        self.api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=path_in_repo,
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            token=self.token
        )

    def list_files(self, prefix: str) -> List[str]:
        """List all files in the repo whose path starts with *prefix*."""
        all_files = self.api.list_repo_files(
            repo_id=self.repo_id,
            repo_type=self.repo_type
        )
        return [name for name in all_files if name.startswith(prefix)]
145
+
146
+
147
class SentimentScorer:
    """Scores one day's raw Reddit text via Replicate and publishes per-subreddit
    parquet shards to the HuggingFace Hub.

    Collaborators are injectable for testing: ReplicateAPI (model calls),
    FileManager (local parquet I/O), HuggingFaceManager (Hub transfers).
    """
    def __init__(
        self,
        cfg: Dict[str, Any],
        replicate_api: Optional[ReplicateAPI] = None,
        file_manager: Optional[FileManager] = None,
        hf_manager: Optional[HuggingFaceManager] = None
    ):
        # cfg is the dict produced by setup_config(): 'config' (settings),
        # 'secrets' (tokens), 'paths' (directories).
        self.config = cfg['config']
        self.secrets = cfg['secrets']
        self.paths = cfg['paths']
        self.logger = logging.getLogger(__name__)

        # Initialize services with dependency injection
        self.replicate_api = replicate_api or ReplicateAPI(
            api_token=self.secrets['REPLICATE_API_TOKEN'],
            model=self.config['replicate_model']
        )

        self.file_manager = file_manager or FileManager(self.paths['scored_dir'])

        self.hf_manager = hf_manager or HuggingFaceManager(
            token=self.secrets['HF_TOKEN'],
            repo_id=self.config['repo_id'],
            repo_type=self.config.get('repo_type', 'dataset')
        )

    def process_batch(self, texts: List[str]) -> tuple[List[float], List[float]]:
        """Process a batch of texts through the sentiment model."""
        result = self.replicate_api.predict(texts)
        return result['predicted_labels'], result['confidences']

    def get_existing_subreddits(self, date_str: str) -> set:
        """Get set of subreddits that already have scored files for the given date."""
        scored_files = self.hf_manager.list_files("data_scored_subreddit/")
        existing_subreddits = set()
        for fn in scored_files:
            if fn.startswith(f"data_scored_subreddit/{date_str}__") and fn.endswith('.parquet'):
                # Extract subreddit from filename: data_scored_subreddit/{date}__{subreddit}.parquet
                subreddit = Path(fn).stem.split('__', 1)[1]
                existing_subreddits.add(subreddit)
        return existing_subreddits

    def _sanitize(self, name: str) -> str:
        """
        Make subreddit safe for filenames (removes slashes, spaces, etc.).
        """
        # Lowercase + replace anything outside [word, '-', '.'] with '_'.
        name = name.strip().lower()
        name = re.sub(r"[^\w\-\.]", "_", name)
        return name

    def score_date(self, date_str: str, overwrite: bool = False) -> None:
        """Process a single date: download, score, save, and upload separate files per subreddit."""
        self.logger.info(f"Scoring date: {date_str}")

        # Get existing subreddits if not overwriting
        existing_subreddits = set()
        if not overwrite:
            existing_subreddits = self.get_existing_subreddits(date_str)
            if existing_subreddits:
                self.logger.info(f"Found {len(existing_subreddits)} existing subreddit files for {date_str}")

        # Download raw file
        raw_path = f"{self.paths['hf_raw_dir']}/{date_str}.parquet"
        local_path = self.hf_manager.download_file(raw_path)
        df = self.file_manager.read_parquet(str(local_path))

        # Validate required columns
        required_columns = {'text', 'score', 'post_id', 'subreddit'}
        missing_columns = required_columns - set(df.columns)
        if missing_columns:
            raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")

        # Filter out existing subreddits if not overwriting
        subreddits_to_process = df['subreddit'].unique()
        if not overwrite and existing_subreddits:
            subreddits_to_process = [s for s in subreddits_to_process if s not in existing_subreddits]
            if not subreddits_to_process:
                self.logger.info(f"All subreddits already processed for {date_str}")
                return
            df = df[df['subreddit'].isin(subreddits_to_process)].copy()
            self.logger.info(f"Processing {len(subreddits_to_process)} new subreddits for {date_str}")

        # Process in batches
        batch_size = self.config.get('batch_size', 16)
        texts = df['text'].tolist()
        sentiments = []
        confidences = []

        # NOTE(review): the truncation below assumes the API may return MORE
        # results than inputs; if it ever returns FEWER, the column assignment
        # further down will fail with a length mismatch — confirm upstream
        # contract of ReplicateAPI.predict.
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_sentiments, batch_confidences = self.process_batch(batch)
            sentiments.extend(batch_sentiments[:len(batch)])  # Only take as many results as input texts
            confidences.extend(batch_confidences[:len(batch)])  # Only take as many results as input texts

        # Add results to DataFrame
        df['sentiment'] = sentiments
        df['confidence'] = confidences

        # Group by subreddit and save separate files
        subreddits = df['subreddit'].unique()
        self.logger.info(f"Found {len(subreddits)} subreddits to process for {date_str}")

        for subreddit in subreddits:
            subreddit_df = df[df['subreddit'] == subreddit].copy()

            # Save scored file per subreddit using sanitized subreddit
            safe_sub = self._sanitize(subreddit)
            filename = f"{date_str}__{safe_sub}"
            scored_path = self.file_manager.save_parquet(subreddit_df, filename)

            # Upload to HuggingFace with new path structure
            path_in_repo = f"data_scored_subreddit/{date_str}__{safe_sub}.parquet"
            self.hf_manager.upload_file(str(scored_path), path_in_repo)
            self.logger.info(f"Uploaded scored file for {date_str}/{subreddit} ({len(subreddit_df)} rows) to {self.config['repo_id']}/{path_in_repo}")
262
+
263
def main(date_arg: Optional[str] = None, overwrite: bool = False) -> None:
    """CLI entry point: score the raw parquet shard for one date.

    Skips work when every subreddit present in the raw data already has a
    scored shard on the Hub (unless ``overwrite`` is set).

    Raises:
        ValueError: if ``date_arg`` is missing or REPLICATE_API_TOKEN is absent.
    """
    if date_arg is None:
        raise ValueError("Date argument is required")

    # Load configuration
    cfg = setup_config()

    # Initialize logging
    logger = setup_logging(cfg['paths']['logs_dir'])

    # Check if REPLICATE_API_TOKEN is available
    if 'REPLICATE_API_TOKEN' not in cfg['secrets']:
        raise ValueError("REPLICATE_API_TOKEN is required for scoring")

    # Initialize scorer
    scorer = SentimentScorer(cfg)

    # Check if date exists in raw files
    raw_dates = set()
    for fn in scorer.hf_manager.list_files(scorer.paths['hf_raw_dir']):
        if fn.endswith('.parquet'):
            raw_dates.add(Path(fn).stem)

    if date_arg not in raw_dates:
        logger.warning(f"No raw file found for date {date_arg}")
        return

    # Check if date already exists in scored files (check subreddit files)
    if not overwrite:
        # Get existing scored files for this date
        scored_files = scorer.hf_manager.list_files("data_scored_subreddit/")
        existing_subreddits = set()
        for fn in scored_files:
            if fn.startswith(f"data_scored_subreddit/{date_arg}__") and fn.endswith('.parquet'):
                # Extract subreddit from filename: data_scored_subreddit/{date}__{subreddit}.parquet
                subreddit = Path(fn).stem.split('__', 1)[1]
                existing_subreddits.add(subreddit)

        # Check what subreddits are in the raw data
        raw_path = f"{scorer.paths['hf_raw_dir']}/{date_arg}.parquet"
        try:
            local_path = scorer.hf_manager.download_file(raw_path)
            df = scorer.file_manager.read_parquet(str(local_path))
            raw_subreddits = set(df['subreddit'].unique())

            # If all subreddits already exist, skip processing
            if raw_subreddits.issubset(existing_subreddits):
                logger.info(f"All subreddits for date {date_arg} already scored ({len(existing_subreddits)} files)")
                return
            else:
                missing_subreddits = raw_subreddits - existing_subreddits
                logger.info(f"Some subreddits missing for {date_arg}: {missing_subreddits}")
        except Exception as e:
            # Best effort: if the pre-check fails we fall through and let
            # score_date do its own (equivalent) skip logic.
            logger.warning(f"Could not check existing subreddits for {date_arg}: {e}")

    # Score the specified date
    scorer.score_date(date_arg, overwrite)
320
+
321
if __name__ == '__main__':
    # Imported lazily so the metrics dependency is only needed for CLI runs.
    from reddit_analysis.common_metrics import run_with_metrics
    parser = argparse.ArgumentParser(description='Score raw HF dataset files via Replicate.')
    parser.add_argument('--date', type=str, required=True, help='YYYY-MM-DD date to process')
    parser.add_argument('--overwrite', action='store_true', help='Overwrite existing scored file')
    args = parser.parse_args()
    # NOTE(review): run_with_metrics presumably records job success/duration
    # under the "score" job label — see reddit_analysis.common_metrics.
    run_with_metrics("score", main, args.date, args.overwrite)
reddit_analysis/monitoring/dashboard.json ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "annotations": {
3
+ "list": [
4
+ {
5
+ "builtIn": 1,
6
+ "datasource": {
7
+ "type": "grafana",
8
+ "uid": "-- Grafana --"
9
+ },
10
+ "enable": true,
11
+ "hide": true,
12
+ "iconColor": "rgba(0, 211, 255, 1)",
13
+ "name": "Annotations & Alerts",
14
+ "type": "dashboard"
15
+ }
16
+ ]
17
+ },
18
+ "editable": true,
19
+ "fiscalYearStartMonth": 0,
20
+ "graphTooltip": 0,
21
+ "id": 1,
22
+ "links": [],
23
+ "panels": [
24
+ {
25
+ "datasource": {
26
+ "type": "prometheus",
27
+ "uid": "fejtp15n071moe"
28
+ },
29
+ "fieldConfig": {
30
+ "defaults": {
31
+ "color": {
32
+ "mode": "thresholds"
33
+ },
34
+ "mappings": [
35
+ {
36
+ "options": {
37
+ "0": {
38
+ "index": 0,
39
+ "text": "Failure"
40
+ },
41
+ "1": {
42
+ "index": 1,
43
+ "text": "Success"
44
+ }
45
+ },
46
+ "type": "value"
47
+ }
48
+ ],
49
+ "max": 1,
50
+ "min": 0,
51
+ "thresholds": {
52
+ "mode": "absolute",
53
+ "steps": [
54
+ {
55
+ "color": "red"
56
+ },
57
+ {
58
+ "color": "green",
59
+ "value": 0.5
60
+ }
61
+ ]
62
+ }
63
+ },
64
+ "overrides": []
65
+ },
66
+ "gridPos": {
67
+ "h": 6,
68
+ "w": 24,
69
+ "x": 0,
70
+ "y": 0
71
+ },
72
+ "id": 1,
73
+ "options": {
74
+ "minVizHeight": 75,
75
+ "minVizWidth": 75,
76
+ "orientation": "auto",
77
+ "reduceOptions": {
78
+ "calcs": [
79
+ "lastNotNull"
80
+ ],
81
+ "fields": "",
82
+ "values": false
83
+ },
84
+ "showThresholdLabels": false,
85
+ "showThresholdMarkers": false,
86
+ "sizing": "auto",
87
+ "text": {}
88
+ },
89
+ "pluginVersion": "11.6.1",
90
+ "targets": [
91
+ {
92
+ "datasource": {
93
+ "type": "prometheus",
94
+ "uid": "fejtp15n071moe"
95
+ },
96
+ "editorMode": "code",
97
+ "expr": "job_success{job=~\"scrape|score|summarize\"}",
98
+ "interval": "",
99
+ "legendFormat": "{{job}}",
100
+ "range": true,
101
+ "refId": "A"
102
+ }
103
+ ],
104
+ "title": "LAST RUN STATUS (Live)",
105
+ "type": "gauge"
106
+ },
107
+ {
108
+ "datasource": {
109
+ "type": "prometheus",
110
+ "uid": "fejtp15n071moe"
111
+ },
112
+ "fieldConfig": {
113
+ "defaults": {
114
+ "color": {
115
+ "mode": "palette-classic"
116
+ },
117
+ "custom": {
118
+ "axisBorderShow": false,
119
+ "axisCenteredZero": false,
120
+ "axisColorMode": "text",
121
+ "axisLabel": "",
122
+ "axisPlacement": "auto",
123
+ "barAlignment": 0,
124
+ "barWidthFactor": 0.6,
125
+ "drawStyle": "line",
126
+ "fillOpacity": 0,
127
+ "gradientMode": "none",
128
+ "hideFrom": {
129
+ "legend": false,
130
+ "tooltip": false,
131
+ "viz": false
132
+ },
133
+ "insertNulls": false,
134
+ "lineInterpolation": "linear",
135
+ "lineWidth": 1,
136
+ "pointSize": 5,
137
+ "scaleDistribution": {
138
+ "type": "linear"
139
+ },
140
+ "showPoints": "auto",
141
+ "spanNulls": false,
142
+ "stacking": {
143
+ "group": "A",
144
+ "mode": "none"
145
+ },
146
+ "thresholdsStyle": {
147
+ "mode": "off"
148
+ }
149
+ },
150
+ "mappings": [],
151
+ "thresholds": {
152
+ "mode": "absolute",
153
+ "steps": [
154
+ {
155
+ "color": "green"
156
+ },
157
+ {
158
+ "color": "red",
159
+ "value": 80
160
+ }
161
+ ]
162
+ }
163
+ },
164
+ "overrides": []
165
+ },
166
+ "gridPos": {
167
+ "h": 8,
168
+ "w": 12,
169
+ "x": 0,
170
+ "y": 6
171
+ },
172
+ "id": 3,
173
+ "options": {
174
+ "legend": {
175
+ "calcs": [],
176
+ "displayMode": "list",
177
+ "placement": "bottom",
178
+ "showLegend": true
179
+ },
180
+ "tooltip": {
181
+ "hideZeros": false,
182
+ "mode": "single",
183
+ "sort": "none"
184
+ }
185
+ },
186
+ "pluginVersion": "11.6.1",
187
+ "targets": [
188
+ {
189
+ "editorMode": "code",
190
+ "expr": "job_success{job=~\"scrape|score|summarize\"}",
191
+ "interval": "",
192
+ "legendFormat": "{{job}} state",
193
+ "range": true,
194
+ "refId": "A"
195
+ }
196
+ ],
197
+ "title": "Job Success Monitor",
198
+ "type": "timeseries"
199
+ },
200
+ {
201
+ "datasource": {
202
+ "type": "prometheus",
203
+ "uid": "fejtp15n071moe"
204
+ },
205
+ "fieldConfig": {
206
+ "defaults": {
207
+ "color": {
208
+ "mode": "palette-classic"
209
+ },
210
+ "custom": {
211
+ "axisBorderShow": false,
212
+ "axisCenteredZero": false,
213
+ "axisColorMode": "text",
214
+ "axisLabel": "",
215
+ "axisPlacement": "auto",
216
+ "barAlignment": 0,
217
+ "barWidthFactor": 0.6,
218
+ "drawStyle": "line",
219
+ "fillOpacity": 0,
220
+ "gradientMode": "none",
221
+ "hideFrom": {
222
+ "legend": false,
223
+ "tooltip": false,
224
+ "viz": false
225
+ },
226
+ "insertNulls": false,
227
+ "lineInterpolation": "linear",
228
+ "lineWidth": 1,
229
+ "pointSize": 5,
230
+ "scaleDistribution": {
231
+ "type": "linear"
232
+ },
233
+ "showPoints": "auto",
234
+ "spanNulls": false,
235
+ "stacking": {
236
+ "group": "A",
237
+ "mode": "none"
238
+ },
239
+ "thresholdsStyle": {
240
+ "mode": "off"
241
+ }
242
+ },
243
+ "mappings": [],
244
+ "thresholds": {
245
+ "mode": "absolute",
246
+ "steps": [
247
+ {
248
+ "color": "green"
249
+ },
250
+ {
251
+ "color": "red",
252
+ "value": 80
253
+ }
254
+ ]
255
+ }
256
+ },
257
+ "overrides": []
258
+ },
259
+ "gridPos": {
260
+ "h": 8,
261
+ "w": 12,
262
+ "x": 12,
263
+ "y": 6
264
+ },
265
+ "id": 2,
266
+ "options": {
267
+ "legend": {
268
+ "calcs": [],
269
+ "displayMode": "list",
270
+ "placement": "bottom",
271
+ "showLegend": true
272
+ },
273
+ "tooltip": {
274
+ "hideZeros": false,
275
+ "mode": "single",
276
+ "sort": "none"
277
+ }
278
+ },
279
+ "pluginVersion": "11.6.1",
280
+ "targets": [
281
+ {
282
+ "editorMode": "code",
283
+ "expr": "job_duration_seconds{job=~\"scrape|score|summarize\"}",
284
+ "interval": "",
285
+ "legendFormat": "{{job}} duration (seconds)",
286
+ "range": true,
287
+ "refId": "A"
288
+ }
289
+ ],
290
+ "title": "Wall Clock Time Monitor",
291
+ "type": "timeseries"
292
+ }
293
+ ],
294
+ "preload": false,
295
+ "schemaVersion": 41,
296
+ "tags": [],
297
+ "templating": {
298
+ "list": []
299
+ },
300
+ "time": {
301
+ "from": "now-20m",
302
+ "to": "now"
303
+ },
304
+ "timepicker": {},
305
+ "timezone": "browser",
306
+ "title": "Pipeline Health",
307
+ "uid": "aejtpwaxibk00d",
308
+ "version": 11
309
+ }
reddit_analysis/monitoring/dashboard_failure.png ADDED
reddit_analysis/monitoring/dashboard_success.png ADDED
reddit_analysis/monitoring/docker-compose.yml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ pushgateway:
3
+ image: prom/pushgateway
4
+ ports: ["9091:9091"]
5
+
6
+ prometheus:
7
+ image: prom/prometheus
8
+ command:
9
+ - "--config.file=/etc/prometheus/prometheus.yml"
10
+ volumes:
11
+ - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
12
+ ports: ["9090:9090"]
13
+ depends_on: [pushgateway]
14
+
15
+ grafana:
16
+ image: grafana/grafana
17
+ environment:
18
+ - GF_SECURITY_ADMIN_PASSWORD=admin
19
+ - GF_USERS_ALLOW_SIGN_UP=false
20
+ ports: ["3000:3000"]
21
+ depends_on: [prometheus]
reddit_analysis/monitoring/prometheus.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ global:
2
+ scrape_interval: 15s
3
+
4
+ scrape_configs:
5
+ - job_name: pushgateway
6
+ honor_labels: true
7
+ static_configs:
8
+ - targets: ['pushgateway:9091']
reddit_analysis/scraper/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """
2
+ Scraper subpackage for Reddit Analysis Pipeline.
3
+
4
+ Contains functionality for scraping Reddit data.
5
+ """
reddit_analysis/scraper/scrape.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Scrape Reddit posts and comments.
4
+ CLI examples
5
+ ------------
6
+ # Scrape data for a specific date
7
+ python -m reddit_analysis.scraper.scrape --date 2025-04-20
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import os
13
+ import sys
14
+ from datetime import datetime, timedelta
15
+ from pathlib import Path
16
+ from typing import Optional, List, Dict, Any
17
+
18
+ import pandas as pd
19
+ import pyarrow.parquet as pq
20
+ from huggingface_hub import (
21
+ hf_hub_download,
22
+ list_repo_files,
23
+ login,
24
+ upload_file,
25
+ HfApi
26
+ )
27
+ import praw
28
+ import logging
29
+ import pytz
30
+ from tqdm import tqdm
31
+
32
+ from reddit_analysis.config_utils import setup_config
33
+
34
class RedditAPI:
    """Wrapper class for Reddit API interactions that can be mocked for testing."""
    def __init__(self, client_id: str, client_secret: str, user_agent: str):
        # praw handles OAuth and request pacing internally.
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )

    def get_subreddit(self, name: str):
        """Return the praw Subreddit handle for ``name``."""
        return self.reddit.subreddit(name)

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Snapshot praw's rate-limit bookkeeping.

        NOTE(review): values may be None before the first authenticated
        request — callers should treat missing entries as "Unknown".
        """
        return {
            'used': self.reddit.auth.limits.get('used'),
            'remaining': self.reddit.auth.limits.get('remaining'),
            'reset_timestamp': self.reddit.auth.limits.get('reset_timestamp')
        }
52
+
53
class FileManager:
    """Wrapper class for file operations that can be mocked for testing."""

    def __init__(self, base_dir: Path):
        # Create the output directory eagerly so writes cannot fail on a
        # missing parent.
        self.base_dir = base_dir
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def save_csv(self, df: pd.DataFrame, filename: str) -> Path:
        """Write ``df`` as ``<filename>.csv`` under ``base_dir``; return the path.

        BUG FIX: the filename placeholder had been lost ("(unknown)"), so every
        DataFrame collided on one literal file name. Interpolate the stem.
        """
        path = self.base_dir / f"{filename}.csv"
        df.to_csv(path, index=False)
        return path

    def save_parquet(self, df: pd.DataFrame, filename: str) -> Path:
        """Write ``df`` as ``<filename>.parquet`` under ``base_dir``; return the path."""
        path = self.base_dir / f"{filename}.parquet"
        df.to_parquet(path, index=False)
        return path

    def read_parquet(self, filename: str) -> pd.DataFrame:
        """Read ``<filename>.parquet`` from ``base_dir``."""
        path = self.base_dir / f"{filename}.parquet"
        return pd.read_parquet(path)
72
+
73
class HuggingFaceManager:
    """Wrapper class for HuggingFace Hub operations that can be mocked for testing."""

    def __init__(self, token: str, repo_id: str, repo_type: str = "dataset"):
        self.token = token
        self.repo_id = repo_id
        self.repo_type = repo_type
        self.api = HfApi(token=token)

    def download_file(self, path_in_repo: str) -> Path:
        """Download one file from the configured repo; returns the local cache path."""
        return Path(hf_hub_download(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            filename=path_in_repo,
            token=self.token
        ))

    def upload_file(self, local_path: str, path_in_repo: str):
        """Upload ``local_path`` to ``path_in_repo`` inside the configured repo."""
        self.api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=path_in_repo,
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            token=self.token
        )

    def list_files(self, prefix: str) -> List[str]:
        """Return repo file paths that start with ``prefix``.

        BUG FIX: the previous implementation ignored ``prefix`` and returned
        every file in the repo, unlike the identically-named helper in
        ``reddit_analysis/inference/score.py``. Filter client-side to honour
        the declared contract.
        """
        files = self.api.list_repo_files(
            repo_id=self.repo_id,
            repo_type=self.repo_type
        )
        return [file for file in files if file.startswith(prefix)]
103
+
104
class RedditScraper:
    """Scrapes configured subreddits, saves a daily CSV locally and optionally
    uploads a deduplicated parquet shard to the HuggingFace Hub.

    Collaborators are injectable for testing: RedditAPI (praw), FileManager
    (local I/O), HuggingFaceManager (Hub transfers).
    """
    def __init__(
        self,
        cfg: Dict[str, Any],
        reddit_api: Optional[RedditAPI] = None,
        file_manager: Optional[FileManager] = None,
        hf_manager: Optional[HuggingFaceManager] = None
    ):
        # cfg is the dict produced by setup_config(): 'config' (settings),
        # 'secrets' (tokens), 'paths' (directories).
        self.config = cfg['config']
        self.secrets = cfg['secrets']
        self.paths = cfg['paths']
        self.logger = logging.getLogger(__name__)

        # Initialize services with dependency injection
        self.reddit_api = reddit_api or RedditAPI(
            client_id=self.secrets.get('REDDIT_CLIENT_ID'),
            client_secret=self.secrets.get('REDDIT_CLIENT_SECRET'),
            user_agent=self.secrets.get('REDDIT_USER_AGENT')
        )

        self.file_manager = file_manager or FileManager(self.paths['raw_dir'])

        # The HF manager is only constructed when uploads are enabled;
        # otherwise whatever was injected (possibly None) is kept.
        if self.config.get('push_to_hf', False):
            self.hf_manager = hf_manager or HuggingFaceManager(
                token=self.secrets.get('HF_TOKEN'),
                repo_id=self.config.get('repo_id'),
                repo_type=self.config.get('repo_type', 'dataset')
            )
        else:
            self.hf_manager = hf_manager

        self.timezone = pytz.timezone(self.config['timezone'])
        self.logger.info(f"Output directory set to: {self.paths['raw_dir']}")

    def get_posts(self, subreddit_config: Dict[str, Any]) -> pd.DataFrame:
        """Fetch posts and comments from a subreddit based on configuration.

        Returns a DataFrame with columns: subreddit, created_at, retrieved_at,
        type ("post"/"comment"), text, score, post_id, parent_id.
        """
        subreddit_name = subreddit_config['name']
        post_limit = subreddit_config['post_limit']
        comment_limit = subreddit_config['comment_limit']
        retrieved_at = datetime.now(self.timezone)
        records = []

        subreddit = self.reddit_api.get_subreddit(subreddit_name)

        self.logger.info(f"Fetching {post_limit} posts from r/{subreddit_name}")

        for submission in tqdm(
            subreddit.top(time_filter="day", limit=post_limit),
            total=post_limit,
            desc=f"Processing r/{subreddit_name}"
        ):
            # Add post record
            records.append({
                "subreddit": subreddit_name,
                "created_at": datetime.fromtimestamp(submission.created_utc, tz=self.timezone),
                "retrieved_at": retrieved_at,
                "type": "post",
                "text": submission.title + "\n\n" + submission.selftext,
                "score": submission.score,
                "post_id": submission.id,
                "parent_id": None
            })

            # Get top comments if comment_limit > 0
            if comment_limit > 0:
                submission.comment_sort = 'top'
                submission.comments.replace_more(limit=0)
                # NOTE(review): '_comments' is a private praw attribute; this
                # avoids flattening the forest but may break on praw upgrades —
                # confirm against the pinned praw version.
                comments = getattr(submission.comments, '_comments', [])[:comment_limit]
                for comment in comments:
                    records.append({
                        "subreddit": subreddit_name,
                        "created_at": datetime.fromtimestamp(comment.created_utc, tz=self.timezone),
                        "retrieved_at": retrieved_at,
                        "type": "comment",
                        "text": comment.body,
                        "score": comment.score,
                        "post_id": comment.id,
                        "parent_id": comment.parent_id
                    })

        return pd.DataFrame(records)

    def print_rate_limit_info(self):
        """Print current Reddit API rate limit information."""
        limits = self.reddit_api.get_rate_limit_info()
        reset_ts = limits.get('reset_timestamp')
        reset_time = (
            datetime.fromtimestamp(reset_ts, tz=self.timezone)
            .strftime("%Y-%m-%d %I:%M:%S %p %Z")
            if reset_ts else "Unknown"
        )

        self.logger.info("Reddit API Rate Limit Info")
        self.logger.info(f"Requests used: {limits.get('used')}")
        self.logger.info(f"Requests remaining: {limits.get('remaining')}")
        self.logger.info(f"Resets at: {reset_time}")

    def process_date(self, date_str: str) -> None:
        """Process data for a specific date: scrape all configured subreddits,
        save a combined CSV, and optionally upload to the Hub."""
        self.logger.info(f"Processing data for date: {date_str}")

        all_records = []
        for sub_cfg in self.config['subreddits']:
            self.logger.info(f"Processing subreddit: {sub_cfg['name']}")
            df = self.get_posts(sub_cfg)
            all_records.append(df)

        combined_df = pd.concat(all_records, ignore_index=True)
        self.logger.info(f"Total records collected: {len(combined_df)}")

        # Save to CSV
        self.file_manager.save_csv(combined_df, date_str)

        # Upload to HuggingFace if configured
        if self.config.get('push_to_hf', False):
            self._upload_to_hf(combined_df, date_str)

        self.print_rate_limit_info()
        self.logger.info("Reddit scraper completed successfully")

    def _upload_to_hf(self, df: pd.DataFrame, date_str: str) -> None:
        """Upload data to HuggingFace Hub, dropping rows whose post_id already
        appears in the previous day's shard (best-effort dedup)."""
        try:
            current_date = datetime.strptime(date_str, "%Y-%m-%d")
            prev_date = (current_date - timedelta(days=1)).strftime("%Y-%m-%d")
            prev_file_path = f"{self.paths['hf_raw_dir']}/{prev_date}.parquet"

            self.logger.info(f"Checking for previous day's file: {prev_file_path}")
            try:
                downloaded_path = self.hf_manager.download_file(prev_file_path)
                existing_df = pd.read_parquet(downloaded_path)
                existing_ids = set(existing_df["post_id"].tolist())
                # Remove the cached download once the IDs have been extracted.
                Path(downloaded_path).unlink()

                original_count = len(df)
                df = df[~df["post_id"].isin(existing_ids)]
                filtered_count = len(df)
                self.logger.info(f"Filtered {original_count - filtered_count} duplicates")

                if df.empty:
                    self.logger.info("No new posts to upload after deduplication")
                    return

            except Exception as e:
                # Missing previous shard (e.g. first run) is expected; upload
                # proceeds without dedup.
                self.logger.warning(f"Could not fetch/process previous file: {e}")

            parquet_path = self.file_manager.save_parquet(df, date_str)
            path_in_repo = f"{self.paths['hf_raw_dir']}/{date_str}.parquet"
            self.hf_manager.upload_file(str(parquet_path), path_in_repo)
            self.logger.info(f"Uploaded {len(df)} rows for {date_str} → {path_in_repo}")
        except Exception as e:
            self.logger.error(f"Failed to upload to Hugging Face: {e}")
            raise
257
+
258
def setup_logging(logs_dir: Path) -> logging.Logger:
    """Configure root logging to a dated file under ``logs_dir`` and return a logger."""
    # The file handler needs an existing directory before it can open the file.
    logs_dir.mkdir(parents=True, exist_ok=True)

    # One log file per calendar day.
    today = datetime.now().strftime('%Y-%m-%d')
    log_file = logs_dir / f"reddit_scraper_{today}.log"

    file_handler = logging.FileHandler(log_file, encoding="utf-8")
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[file_handler],
    )

    logger = logging.getLogger(__name__)
    logger.info(f"Logging initialized. Log file: {log_file}")
    return logger
277
+
278
def main(date_str: Optional[str] = None) -> None:
    """CLI entry point: validate credentials, then scrape ``date_str``
    (defaults to today in the configured timezone).

    Raises:
        ValueError: if any required credential is missing.
    """
    # Load configuration first
    cfg = setup_config()

    # Initialize logging with configured logs_dir
    logs_dir = cfg['paths']['logs_dir']
    logger = setup_logging(logs_dir)
    logger.info("Starting Reddit scraper...")

    # Validate environment variables
    required_env_vars = ["REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT"]
    if cfg['config'].get('push_to_hf', False):
        required_env_vars.append("HF_TOKEN")
    # A credential may come either from the secrets file or the environment.
    missing = [v for v in required_env_vars if not cfg['secrets'].get(v) and not os.getenv(v)]
    if missing:
        logger.error(f"Missing required environment variables: {', '.join(missing)}")
        raise ValueError(f"Missing required environment variables: {', '.join(missing)}")

    # Instantiate and run
    logger.info("Initializing Reddit scraper...")
    scraper = RedditScraper(cfg)

    if date_str is None:
        date_str = datetime.now(pytz.timezone(cfg['config']['timezone'])).strftime("%Y-%m-%d")

    scraper.process_date(date_str)
304
+
305
if __name__ == "__main__":
    # Imported lazily so the metrics dependency is only needed for CLI runs.
    from reddit_analysis.common_metrics import run_with_metrics
    parser = argparse.ArgumentParser(description='Scrape Reddit posts and comments.')
    parser.add_argument('--date', type=str, help='YYYY-MM-DD date to process')
    args = parser.parse_args()
    # NOTE(review): run_with_metrics presumably records job success/duration
    # under the "scrape" job label — see reddit_analysis.common_metrics.
    run_with_metrics("scrape", main, args.date)
reddit_analysis/summarizer/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """
2
+ Summarizer subpackage for Reddit Analysis Pipeline.
3
+
4
+ Contains functionality for summarizing Reddit data analysis.
5
+ """
reddit_analysis/summarizer/aggregator.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pure‑function helpers for daily aggregation."""
2
+
3
+ from __future__ import annotations
4
+ import pandas as pd
5
+ import numpy as np
6
+
7
+
8
+ def summary_from_df(df: pd.DataFrame, gamma_post: float = 0.3) -> pd.DataFrame:
9
+ """
10
+ Return a DataFrame with daily & subreddit aggregates.
11
+
12
+ Expects columns:
13
+ retrieved_at - UTC timestamp or ISO-date string
14
+ subreddit - subreddit name
15
+ sentiment - numeric score (e.g. −1 … 1)
16
+ score - numeric weight / post score
17
+
18
+ Output columns:
19
+ date (datetime.date)
20
+ subreddit (string)
21
+ mean_sentiment
22
+ community_weighted_sentiment
23
+ count
24
+ """
25
+ # Normalize retrieved_at to datetime and extract calendar day
26
+ df = df.copy()
27
+ df["date"] = pd.to_datetime(df["retrieved_at"]).dt.date
28
+
29
+ # Group by date and subreddit
30
+ grouped = df.groupby(["date", "subreddit"])
31
+
32
+ # Aggregate metrics
33
+ result = grouped.agg(
34
+ # First calculate raw mean_sentiment
35
+ raw_mean_sentiment=("sentiment", "mean"),
36
+ count=("sentiment", "count"),
37
+ ).reset_index()
38
+
39
+ # Apply transformation to raw_mean_sentiment to get values in range [-1, 1] instead of [0, 1]
40
+ result["mean_sentiment"] = 2 * result["raw_mean_sentiment"] - 1
41
+
42
+ # Remove the raw mean column
43
+ result = result.drop(columns="raw_mean_sentiment")
44
+
45
+ # Calculate engagement-adjusted sentiment (EAS) for each group
46
+ # 1. Ensure 'score' is numeric
47
+ df["score_num"] = pd.to_numeric(df["score"], errors="coerce").fillna(0)
48
+ # 2. Compute base weights (1 + log1p(score))
49
+ weights_base = 1 + np.log1p(df["score_num"].clip(lower=0))
50
+ # 3. Apply post weight multiplier
51
+ weights = weights_base * np.where(df.get("type", None) == "post", gamma_post, 1.0)
52
+ df["weight"] = weights
53
+ # 4. Compute EAS per group: weighted average of sentiment
54
+ community_weighted_sentiments = []
55
+ for (date, subreddit), group in grouped:
56
+ w = group["weight"]
57
+ s = group["sentiment"]
58
+ eas = (w * s).sum() / w.sum() if w.sum() > 0 else 0
59
+ community_weighted_sentiments.append(eas)
60
+ result["community_weighted_sentiment"] = community_weighted_sentiments
61
+
62
+ # Normalize community_weighted_sentiment to range [-1,1]
63
+ result["community_weighted_sentiment"] = 2 * result["community_weighted_sentiment"] - 1
64
+
65
+ # Ensure consistent column order
66
+ result = result[["date", "subreddit", "mean_sentiment", "community_weighted_sentiment", "count"]]
67
+
68
+ return result
reddit_analysis/summarizer/summarize.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Summarise scored shards into one daily_summary.csv
4
+
5
+ CLI examples
6
+ ------------
7
+ # Summarize data for a specific date
8
+ python -m reddit_analysis.summarizer.summarize --date 2025-04-20
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ from datetime import date
14
+ from pathlib import Path
15
+ from typing import Optional, List, Dict, Any, Set, Tuple
16
+
17
+ import pandas as pd
18
+ from huggingface_hub import hf_hub_download, HfApi
19
+
20
+ from reddit_analysis.config_utils import setup_config
21
+ from reddit_analysis.summarizer.aggregator import summary_from_df
22
+
23
+
24
+ # --------------------------------------------------------------------------- #
25
+ # Utilities #
26
+ # --------------------------------------------------------------------------- #
27
class FileManager:
    """Local file I/O helper; kept thin so tests can swap in a mock."""

    def __init__(self, base_dir: Path):
        # Remember the working directory and guarantee it exists up front.
        self.base_dir = base_dir
        self.base_dir.mkdir(parents=True, exist_ok=True)

    # ---------- CSV helpers ------------------------------------------------- #
    def read_csv(self, path: Path) -> pd.DataFrame:
        """Read a summary CSV; return an empty, correctly-shaped frame when the
        file is absent or zero bytes long."""
        file_is_usable = path.exists() and path.stat().st_size > 0
        if not file_is_usable:
            summary_columns = ["date", "subreddit",
                               "mean_sentiment", "community_weighted_sentiment", "count"]
            return pd.DataFrame(columns=summary_columns)
        return pd.read_csv(path)

    def write_csv(self, df: pd.DataFrame, path: Path) -> Path:
        """Write `df` to `path` (no index column) and hand the path back."""
        df.to_csv(path, index=False)
        return path

    # ---------- Parquet helper --------------------------------------------- #
    @staticmethod
    def read_parquet(path: Path) -> pd.DataFrame:
        """Load one parquet shard into a DataFrame."""
        return pd.read_parquet(path)
50
+
51
+
52
class HuggingFaceManager:
    """Minimal facade over Hugging Face Hub file operations (easy to mock)."""

    def __init__(self, token: str, repo_id: str, repo_type: str = "dataset"):
        # Keep the credentials/repo coordinates around for every call.
        self.token = token
        self.repo_id = repo_id
        self.repo_type = repo_type
        self.api = HfApi(token=token)

    def download_file(self, path_in_repo: str) -> Path:
        """Fetch a single repo file and return its local cache path."""
        cached = hf_hub_download(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            filename=path_in_repo,
            token=self.token,
        )
        return Path(cached)

    def upload_file(self, local_path: str, path_in_repo: str):
        """Push a local file to `path_in_repo` inside the configured repo."""
        self.api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=path_in_repo,
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            token=self.token,
        )

    def list_files(self, prefix: str) -> List[str]:
        """Return every file in the repo whose path starts with `prefix`."""
        repo_contents = self.api.list_repo_files(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
        )
        return [name for name in repo_contents if name.startswith(prefix)]
86
+
87
+
88
+ # --------------------------------------------------------------------------- #
89
+ # Core manager #
90
+ # --------------------------------------------------------------------------- #
91
class SummaryManager:
    """End-to-end workflow for building the daily sentiment summary.

    Pulls scored parquet shards for one calendar date from the HF Hub,
    aggregates them via `summary_from_df`, merges the result with the
    historical `daily_summary.csv`, and pushes the updated file back.
    """

    def __init__(
        self,
        cfg: Dict[str, Any],
        file_manager: Optional[FileManager] = None,
        hf_manager: Optional[HuggingFaceManager] = None
    ):
        """Split `cfg` into config/secrets/paths and build (or accept
        injected) I/O helpers."""
        self.config = cfg["config"]
        self.secrets = cfg["secrets"]
        self.paths = cfg["paths"]

        # I/O helpers — injectable so tests can supply mocks.
        self.file_manager = file_manager or FileManager(self.paths["root"])
        self.hf_manager = hf_manager or HuggingFaceManager(
            token=self.secrets["HF_TOKEN"],
            repo_id=self.config["repo_id"],
            repo_type=self.config.get("repo_type", "dataset"),
        )

        # Local cache location of the combined summary file.
        self.local_summary_path: Path = self.paths["summary_file"]

    # --------------------------------------------------------------------- #
    # Remote summary helpers                                                #
    # --------------------------------------------------------------------- #
    def _load_remote_summary(self) -> pd.DataFrame:
        """
        Download the latest `daily_summary.csv` from the HF Hub (if it
        exists) and return it as a DataFrame; on the first run — when no
        remote file exists yet — return an empty, correctly-shaped frame.
        """
        remote_name = self.paths["summary_file"].name

        try:
            cached_path = self.hf_manager.download_file(remote_name)
        except Exception:
            # first run – file doesn't exist yet on the Hub
            return pd.DataFrame(
                columns=["date", "subreddit",
                         "mean_sentiment", "community_weighted_sentiment", "count"]
            )

        return pd.read_csv(cached_path)

    def _save_and_push_summary(self, df: pd.DataFrame):
        """Persist the updated summary both locally and back to HF Hub."""
        self.file_manager.write_csv(df, self.local_summary_path)
        self.hf_manager.upload_file(str(self.local_summary_path),
                                    self.local_summary_path.name)

    # --------------------------------------------------------------------- #
    # Public helpers                                                        #
    # --------------------------------------------------------------------- #
    def get_processed_combinations(self) -> Set[Tuple[date, str]]:
        """
        Return the set of (date, subreddit) pairs already present in the
        remote summary, used for de-duplication.
        """
        df_summary = self._load_remote_summary()
        if df_summary.empty:
            return set()

        df_summary["date"] = pd.to_datetime(df_summary["date"]).dt.date
        # zip over the two columns is O(n) without the per-row overhead of
        # iterrows().
        return set(zip(df_summary["date"], df_summary["subreddit"]))

    # --------------------------------------------------------------------- #
    # Main workflow                                                         #
    # --------------------------------------------------------------------- #
    def process_date(self, date_str: str, overwrite: bool = False) -> None:
        """Download scored data for `date_str`, aggregate, and append/upload.

        Raises:
            ValueError: if the concatenated scored data lacks any of the
                columns the aggregator needs.
        """
        # ---------- Pull scored shards for the given date ------------------ #
        prefix = f"{self.paths['hf_scored_dir']}/{date_str}__"
        # List all remote shards
        try:
            all_files = self.hf_manager.list_files(self.paths['hf_scored_dir'])
        except Exception as err:
            print(f"Error: could not list scored shards in {self.paths['hf_scored_dir']}: {err}")
            return

        # Filter to shards matching this date
        try:
            shards = [fn for fn in all_files if fn.startswith(prefix) and fn.endswith('.parquet')]
        except TypeError:
            # fall back in case list_files returned a non-iterable (e.g., a mock)
            shards = [all_files]

        if not shards:
            print(f"No scored shards found for {date_str} under {self.paths['hf_scored_dir']}")
            return

        # Download and concatenate all shards
        dfs: List[pd.DataFrame] = []
        for shard in shards:
            try:
                local_path = self.hf_manager.download_file(shard)
            except Exception as err:
                print(f"Error: could not download scored shard {shard}: {err}")
                return
            dfs.append(self.file_manager.read_parquet(local_path))
        df_day = pd.concat(dfs, ignore_index=True)

        # sanity-check: the aggregator needs all of these columns
        required_cols = {"retrieved_at", "subreddit", "sentiment", "score"}
        missing_cols = required_cols - set(df_day.columns)
        if missing_cols:
            # Report only the columns that are actually absent. (The previous
            # message named shards[0] and printed the full required set, which
            # was misleading for multi-shard days.)
            raise ValueError(
                f"Scored data for {date_str} missing columns {sorted(missing_cols)}"
            )

        # ---------- Aggregate ------------------------------------------------ #
        df_summary_day = summary_from_df(df_day)

        # ---------- De-duplication / overwrite ------------------------------ #
        existing_pairs = self.get_processed_combinations()
        if not overwrite:
            df_summary_day = df_summary_day[
                ~df_summary_day.apply(
                    lambda r: (r["date"], r["subreddit"]) in existing_pairs,
                    axis=1,
                )
            ]
            if df_summary_day.empty:
                print("Nothing new to summarise for this date.")
                return

        # ---------- Combine with historical summary ------------------------- #
        df_summary = self._load_remote_summary()
        if overwrite:
            df_summary = df_summary[df_summary["date"] != date_str]

        # Remove legacy weighted_sentiment column if it exists
        if "weighted_sentiment" in df_summary.columns:
            df_summary = df_summary.drop(columns=["weighted_sentiment"])

        df_out = (
            pd.concat([df_summary, df_summary_day], ignore_index=True)
            if not df_summary.empty
            else df_summary_day
        )
        df_out["date"] = pd.to_datetime(df_out["date"]).dt.date
        df_out.sort_values(["date", "subreddit"], inplace=True)

        # Ensure the weighted_sentiment column is dropped from final output
        if "weighted_sentiment" in df_out.columns:
            df_out = df_out.drop(columns=["weighted_sentiment"])

        # Round floating point columns to 4 decimal places
        if "mean_sentiment" in df_out.columns:
            df_out["mean_sentiment"] = df_out["mean_sentiment"].round(4)
        if "community_weighted_sentiment" in df_out.columns:
            df_out["community_weighted_sentiment"] = df_out["community_weighted_sentiment"].round(4)

        # ---------- Save & upload ------------------------------------------- #
        self._save_and_push_summary(df_out)
        print(f"Updated {self.local_summary_path.name} → {len(df_out)} rows")
245
+
246
+
247
+ # --------------------------------------------------------------------------- #
248
+ # CLI entry‑point #
249
+ # --------------------------------------------------------------------------- #
250
def main(date_str: str, overwrite: bool = False) -> None:
    """Validate `date_str` and run the summary pipeline for that day.

    Args:
        date_str: calendar date in ISO ``YYYY-MM-DD`` form.
        overwrite: replace any existing summary rows for that date.

    Raises:
        ValueError: if `date_str` is empty or not a valid ISO date.
    """
    if not date_str:
        raise ValueError("--date is required (YYYY-MM-DD)")

    # Reject anything that is not a real calendar date before doing any I/O.
    try:
        date.fromisoformat(date_str)
    except ValueError:
        # ASCII hyphens here: the previous message used U+2011 non-breaking
        # hyphens, inconsistent with the format actually accepted (and with
        # the --date help text).
        raise ValueError(f"Invalid date: {date_str} (expected YYYY-MM-DD)")

    cfg = setup_config()
    SummaryManager(cfg).process_date(date_str, overwrite)


if __name__ == "__main__":
    from reddit_analysis.common_metrics import run_with_metrics
    parser = argparse.ArgumentParser(
        description="Summarize scored Reddit data for a specific date."
    )
    parser.add_argument("--date", required=True,
                        help="YYYY-MM-DD date to process")
    parser.add_argument("--overwrite", action="store_true",
                        help="Replace any existing rows for this date")
    args = parser.parse_args()
    run_with_metrics("summarize", main, args.date, args.overwrite)
reddit_analysis/test_config.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Test script for config_utils module.
4
+ This allows us to verify that our common configuration loading works properly.
5
+ """
6
+ import argparse
7
+ import os
8
+ from pprint import pprint
9
+ import reddit_analysis.config_utils as config_utils
10
+
11
def main():
    """Smoke-test config_utils: load the configuration and report, section by
    section, what it contains and which secrets/directories are available."""
    print("Testing config_utils.py")

    # Load everything once and alias the three sections for brevity.
    settings = config_utils.setup_config()
    paths = settings['paths']
    conf = settings['config']
    secrets = settings['secrets']

    print("\nConfiguration:")
    print("--------------")
    print(f"Project root: {paths['root']}")
    print(f"Repo ID: {conf.get('repo_id', 'Not specified')}")

    print("\nLocal Directory Paths:")
    print("--------------------")
    for label, key in [("Raw data directory", 'raw_dir'),
                       ("Scored data directory", 'scored_dir'),
                       ("Logs directory", 'logs_dir'),
                       ("Summary file", 'summary_file')]:
        print(f"{label}: {paths[key]}")

    print("\nHugging Face Repository Paths:")
    print("---------------------------")
    print(f"HF Raw data directory: {paths['hf_raw_dir']}")
    print(f"HF Scored data directory: {paths['hf_scored_dir']}")

    print("\nDirectory Status:")
    print("----------------")
    for dir_name, dir_path in [('Raw data', paths['raw_dir']),
                               ('Scored data', paths['scored_dir']),
                               ('Logs', paths['logs_dir'])]:
        status = "Exists" if os.path.exists(dir_path) else "Does not exist"
        print(f"{dir_name} directory ({dir_path}): {status}")

    summary_status = "Exists" if os.path.exists(paths['summary_file']) else "Does not exist"
    print(f"Summary file ({paths['summary_file']}): {summary_status}")

    # Report secret presence only — never the values themselves.
    print("\nSecrets available:")
    print("-----------------")
    for name in ('HF_TOKEN', 'REPLICATE_API_TOKEN'):
        print(f"{name}: {'Present' if name in secrets else 'Missing'}")

    # Reddit credentials may come from the secrets store OR the environment.
    for key in ['REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET', 'REDDIT_USER_AGENT']:
        present = key in secrets or os.getenv(key)
        print(f"{key}: {'Present' if present else 'Missing'}")

    if 'subreddits' in conf:
        print("\nConfigured subreddits:")
        print("---------------------")
        for sub in conf['subreddits']:
            print(f"- {sub.get('name', 'unnamed')}: {sub.get('post_limit', 'N/A')} posts, "
                  f"{sub.get('comment_limit', 'N/A')} comments")

if __name__ == "__main__":
    main()
reddit_analysis/tests/README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### `test_config_utils.py`
2
+ - **Functions under test**
3
+ - `load_config(path)` — reads settings from a YAML file.
4
+ - `get_secret(key)` — retrieves a secret first from `os.environ`, then from `streamlit.secrets`, else raises.
5
+ - **Patching & mocking**
6
+ - Environment variables via `os.environ` or `monkeypatch.setenv()` / `monkeypatch.delenv()`.
7
+ - `reddit_analysis.config_utils.HAS_STREAMLIT` toggled to simulate presence of Streamlit.
8
+ - `streamlit.secrets` replaced with a `MockSecrets` object exposing a `.get(key)` method.
9
+ - **Example inputs**
10
+ - A temporary `config.yaml` with keys like `repo_id: test/repo`, `batch_size: 16`, `replicate_model: test/model`.
11
+ - Secret key `"TEST_SECRET"` set in `os.environ` or returned by `MockSecrets.get()`.
12
+ - Missing secret scenario triggers `ValueError("Required secret TEST_SECRET not found…")`.
13
+
14
+ ---
15
+
16
+ ### `test_scrape.py`
17
+ - **Methods under test**
18
+ - `RedditScraper.get_posts(subreddit)` — calls PRAW client’s `.subreddit(...).top()` and returns a DataFrame with columns `post_id, title, text, score, subreddit, created_utc, url, num_comments`.
19
+ - `RedditScraper.upload_to_hf(df, date)` — downloads existing parquet via `hf_hub_download`, deduplicates by `post_id`, then calls `hf_api.upload_file(...)`.
20
+ - `main(date)` CLI — loads config, checks for Reddit credentials, raises if missing.
21
+ - **Patching & mocking**
22
+ - A fake PRAW client (`mock_reddit_client`) whose `.subreddit().top()` yields two `Mock` submissions (ids `post0`, `post1`).
23
+ - `hf_hub_download` patched to return a path for a “previous” parquet file containing `prev_df`.
24
+ - `mock_hf_api.upload_file` to capture the uploaded parquet path.
25
+ - Environment via `monkeypatch` and `reddit_analysis.config_utils.HAS_STREAMLIT` + `streamlit.secrets`.
26
+ - **Example inputs**
27
+ - **`get_posts`** uses two submissions with `id='post0'`, `title='Test Post 0'`, etc., expecting a 2‑row DataFrame.
28
+ - **`upload_to_hf`** combines `prev_df` (posts 0 & 1) with `new_df` (posts 1 & 2), resulting in only `post1` & `post2` uploaded.
29
+ - **CLI** invoked with no Reddit env vars, raising `ValueError("Missing required Reddit API credentials")`.
30
+
31
+ ---
32
+
33
+ ### `test_summarize.py`
34
+ - **Methods under test**
35
+ - `RedditSummarizer.summarize_date(date)` — downloads scored parquet, groups by `subreddit`, and computes `mean_sentiment`, `count`, `total_score`, `weighted_sentiment`, plus `date`.
36
+ - `RedditSummarizer.update_summary(df)` — appends to or creates `summary_file`, preserving chronological order.
37
+ - CLI entrypoint in `main(date)` — validates date format or scored-file existence.
38
+ - **Patching & mocking**
39
+ - `hf_hub_download` patched to return a temp parquet containing `sample_scored_data` (4 rows for two subreddits).
40
+ - `reddit_analysis.config_utils.HAS_STREAMLIT` and `streamlit.secrets.get(...)` for missing-file tests.
41
+ - **Example inputs & expectations**
42
+ - **`summarize_date`**:
43
+ ```python
44
+ sample_scored_data = pd.DataFrame({
45
+ 'subreddit': ['test1','test1','test2','test2'],
46
+ 'sentiment': [0.8,0.6,0.4,0.2],
47
+ 'score': [10,20,30,40],
48
+ })
50
+ ```
51
+ – Expect two summary rows:
52
+ - test1: `mean_sentiment≈0.7`, `count=2`, `total_score=30`, `weighted_sentiment≈0.6667`
53
+ - test2: `mean_sentiment≈0.3`, `count=2`, `total_score=70`, `weighted_sentiment≈0.2857`
54
+ - **`update_summary`**: merges an initial 2‑row file for `2025-04-19` with a new 2‑row file for `2025-04-20`, ending with 4 total rows.
55
+ - **CLI invalid date**: `main('2025-04-20-invalid')` → `ValueError("Invalid date format")`.
56
+ - **Missing scored file**: patched `hf_hub_download` raises → `ValueError("Failed to download scored file…")`.
57
+
58
+ ---
59
+
60
+ ### `test_score.py`
61
+ - **Class & functions under test**
62
+ - `RedditScorer.score_date(date)` — downloads input parquet, asserts required columns (`text, score, post_id, subreddit`), splits into batches, calls `replicate_client.run()`, injects `sentiment` & `confidence`, writes parquet, then calls `hf_api.upload_file()`.
63
+ - CLI `main(date)` — reads `.env` or `streamlit.secrets`, requires `REPLICATE_API_TOKEN`, else raises.
64
+ - **Patching & mocking**
65
+ - `hf_hub_download` patched to return a temp parquet for the “input” DataFrame.
66
+ - `mock_hf_api` supplying a stubbed `upload_file` method.
67
+ - `mock_replicate_client.run` side‑effect that:
68
+ ```python
69
+ texts = json.loads(input['texts'])
70
+ sentiments = ['positive' if i%2==0 else 'negative' for i in range(len(texts))]
71
+ confidences = [0.9 if i%2==0 else 0.8 for i in range(len(texts))]
72
+ ```
73
+ - `reddit_analysis.config_utils.HAS_STREAMLIT` + `streamlit.secrets.get(...)` for the CLI missing‑token test.
74
+ - **Example inputs & expectations**
75
+ - **`test_score_date`**: input DataFrame with two rows (`'Test text 1'`, `'Test text 2'`), expects uploaded parquet to have `sentiment=['positive','negative']`, `confidence=[0.9,0.8]` and all six columns present.
76
+ - **`test_score_date_missing_columns`**: input missing `post_id`/`subreddit` → `ValueError("missing expected columns")`.
77
+ - **`test_score_date_batch_processing`**: input of 5 texts, `batch_size=2` → `replicate_client.run` called 3 times, final uploaded file contains all 5 rows.
78
+ - **`test_cli_missing_token`**: no `REPLICATE_API_TOKEN` in env or secrets → `ValueError("REPLICATE_API_TOKEN is required for scoring")`.
reddit_analysis/tests/inference/test_score.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import pytest
4
+ import pandas as pd
5
+ from datetime import datetime
6
+ import pytz
7
+ from unittest.mock import Mock, patch
8
+ import json
9
+
10
+ from reddit_analysis.inference.score import SentimentScorer, ReplicateAPI, FileManager, HuggingFaceManager
11
+
12
@pytest.fixture
def mock_config():
    """Assemble the configuration dict consumed by SentimentScorer."""
    settings = {
        'repo_id': 'test/repo',
        'repo_type': 'dataset',
        'batch_size': 2,
        'replicate_model': 'test/model',
    }
    locations = {
        'raw_dir': Path('data/raw'),
        'scored_dir': Path('data/scored'),
        'hf_raw_dir': 'data/raw',
        'hf_scored_dir': 'data/scored',
    }
    credentials = {
        'HF_TOKEN': 'test_token',
        'REPLICATE_API_TOKEN': 'test_token',
    }
    return {'config': settings, 'paths': locations, 'secrets': credentials}
33
+
34
@pytest.fixture
def mock_replicate_api():
    """Stubbed ReplicateAPI whose predict() returns one positive/negative pair."""
    api = Mock(spec=ReplicateAPI)
    api.predict.return_value = {
        'predicted_labels': ['positive', 'negative'],
        'confidences': [0.9, 0.8],
    }
    return api
43
+
44
@pytest.fixture
def mock_file_manager():
    """Bare FileManager mock; individual tests configure its behaviour."""
    return Mock(spec=FileManager)
49
+
50
@pytest.fixture
def mock_hf_manager():
    """Bare HuggingFaceManager mock; individual tests configure its behaviour."""
    return Mock(spec=HuggingFaceManager)
55
+
56
def test_score_date(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """Happy path: one batch is scored, saved, and uploaded exactly once."""
    frame = pd.DataFrame({
        'text': ['Test text 1', 'Test text 2'],
        'score': [1, 2],
        'post_id': ['post1', 'post2'],
        'subreddit': ['test1', 'test1'],
    })

    # Wire the mocked I/O layer: the raw shard "exists" and nothing has been
    # scored for this date yet.
    mock_hf_manager.download_file.return_value = Path('test.parquet')
    mock_file_manager.read_parquet.return_value = frame
    mock_hf_manager.list_files.return_value = []

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )
    scorer.score_date('2025-04-20')

    # One batch -> one prediction, one parquet written, one upload.
    mock_replicate_api.predict.assert_called_once()
    mock_file_manager.save_parquet.assert_called_once()
    mock_hf_manager.upload_file.assert_called_once()
86
+
87
def test_score_date_missing_columns(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """A raw shard without post_id/subreddit must be rejected up front."""
    partial_frame = pd.DataFrame({
        'text': ['Test text 1', 'Test text 2'],
        'score': [1, 2],
    })

    mock_hf_manager.download_file.return_value = Path('test.parquet')
    mock_file_manager.read_parquet.return_value = partial_frame
    mock_hf_manager.list_files.return_value = []

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )

    with pytest.raises(ValueError) as exc_info:
        scorer.score_date('2025-04-20')
    assert "Missing required columns" in str(exc_info.value)
112
+
113
def test_score_date_batch_processing(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """With batch_size=2, five rows must be scored in ceil(5/2) = 3 API calls."""
    five_rows = pd.DataFrame({
        'text': [f'Test text {i}' for i in range(5)],
        'score': list(range(1, 6)),
        'post_id': [f'post{i}' for i in range(5)],
        'subreddit': ['test1'] * 5,
    })

    mock_hf_manager.download_file.return_value = Path('test.parquet')
    mock_file_manager.read_parquet.return_value = five_rows
    mock_hf_manager.list_files.return_value = []

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )
    scorer.score_date('2025-04-20')

    # 5 rows at batch_size=2 -> three prediction calls, one combined output.
    assert mock_replicate_api.predict.call_count == 3
    mock_file_manager.save_parquet.assert_called_once()
    mock_hf_manager.upload_file.assert_called_once()
145
+
146
def test_score_date_multiple_subreddits(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """Each subreddit gets its own scored parquet and its own upload."""
    two_subs = pd.DataFrame({
        'text': ['Test text 1', 'Test text 2', 'Test text 3', 'Test text 4'],
        'score': [1, 2, 3, 4],
        'post_id': ['post1', 'post2', 'post3', 'post4'],
        'subreddit': ['test1', 'test1', 'test2', 'test2'],
    })

    mock_hf_manager.download_file.return_value = Path('test.parquet')
    mock_file_manager.read_parquet.return_value = two_subs
    mock_hf_manager.list_files.return_value = []

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )
    scorer.score_date('2025-04-20')

    mock_replicate_api.predict.assert_called()

    # One save + one upload per subreddit.
    assert mock_file_manager.save_parquet.call_count == 2
    assert mock_hf_manager.upload_file.call_count == 2

    # Second positional argument of upload_file is path_in_repo.
    destinations = [call[0][1] for call in mock_hf_manager.upload_file.call_args_list]
    assert 'data_scored_subreddit/2025-04-20__test1.parquet' in destinations
    assert 'data_scored_subreddit/2025-04-20__test2.parquet' in destinations
186
+
187
def test_score_date_with_existing_subreddits(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """Already-scored subreddits are skipped when overwrite=False."""
    two_subs = pd.DataFrame({
        'text': ['Test text 1', 'Test text 2', 'Test text 3', 'Test text 4'],
        'score': [1, 2, 3, 4],
        'post_id': ['post1', 'post2', 'post3', 'post4'],
        'subreddit': ['test1', 'test1', 'test2', 'test2'],
    })

    mock_hf_manager.download_file.return_value = Path('test.parquet')
    mock_file_manager.read_parquet.return_value = two_subs
    # test1 has already been scored for this date.
    mock_hf_manager.list_files.return_value = ['data_scored_subreddit/2025-04-20__test1.parquet']

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )
    scorer.score_date('2025-04-20', overwrite=False)

    # Only test2 should have been scored, saved, and uploaded.
    mock_replicate_api.predict.assert_called()
    assert mock_file_manager.save_parquet.call_count == 1
    assert mock_hf_manager.upload_file.call_count == 1

    # Second positional argument of upload_file is path_in_repo.
    destinations = [call[0][1] for call in mock_hf_manager.upload_file.call_args_list]
    assert 'data_scored_subreddit/2025-04-20__test2.parquet' in destinations
    assert 'data_scored_subreddit/2025-04-20__test1.parquet' not in destinations
228
+
229
def test_get_existing_subreddits(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """Only files in the scored folder for the requested date should count."""
    mock_hf_manager.list_files.return_value = [
        'data_scored_subreddit/2025-04-20__test1.parquet',
        'data_scored_subreddit/2025-04-20__test2.parquet',
        'data_scored_subreddit/2025-04-21__test1.parquet',  # different date
        'other_folder/2025-04-20__test3.parquet',           # different folder
    ]

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )

    # The 04-21 shard and the foreign-folder shard must both be excluded.
    assert scorer.get_existing_subreddits('2025-04-20') == {'test1', 'test2'}
252
+
253
def test_cli_missing_token(monkeypatch, tmp_path):
    """main() must fail fast when REPLICATE_API_TOKEN is nowhere to be found.

    Cleanup over the original: the debug print of the token value and the
    redundant function-local `import os` (os is already imported at module
    level) have been removed.
    """
    # Point the app at an empty .env so no token can be read from disk.
    env_path = tmp_path / '.env'
    env_path.write_text('')
    monkeypatch.setenv('REDDIT_ANALYSIS_ENV', str(env_path))

    # Drop the token from the environment but keep HF_TOKEN, so that
    # REPLICATE_API_TOKEN is the only missing credential.
    monkeypatch.delenv('REPLICATE_API_TOKEN', raising=False)
    monkeypatch.setenv('HF_TOKEN', 'dummy_hf_token')

    # Simulate a Streamlit runtime whose secrets store is also empty.
    monkeypatch.setattr('reddit_analysis.config_utils.HAS_STREAMLIT', True)
    monkeypatch.setattr('reddit_analysis.config_utils.is_running_streamlit', lambda: True)
    mock_secrets = Mock()
    mock_secrets.get.return_value = None
    monkeypatch.setattr('streamlit.secrets', mock_secrets)

    # Run the CLI entry point; it should refuse to start scoring.
    with pytest.raises(ValueError) as exc_info:
        from reddit_analysis.inference.score import main
        main('2025-04-20')
    assert "REPLICATE_API_TOKEN is required for scoring" in str(exc_info.value)
reddit_analysis/tests/scraper/test_scrape.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import pytest
4
+ import pandas as pd
5
+ from datetime import datetime, date
6
+ import pytz
7
+ from unittest.mock import Mock, patch
8
+
9
+ from reddit_analysis.scraper.scrape import RedditScraper, RedditAPI, FileManager, HuggingFaceManager
10
+
11
+ @pytest.fixture
12
+ def mock_config():
13
+ """Create a mock configuration dictionary."""
14
+ return {
15
+ 'config': {
16
+ 'repo_id': 'test/repo',
17
+ 'repo_type': 'dataset',
18
+ 'subreddits': [
19
+ {'name': 'test1', 'post_limit': 2, 'comment_limit': 2},
20
+ {'name': 'test2', 'post_limit': 2, 'comment_limit': 2}
21
+ ],
22
+ 'post_limit': 100,
23
+ 'timezone': 'UTC'
24
+ },
25
+ 'paths': {
26
+ 'raw_dir': Path('data/raw'),
27
+ 'logs_dir': Path('logs'),
28
+ 'hf_raw_dir': 'data/raw'
29
+ },
30
+ 'secrets': {
31
+ 'HF_TOKEN': 'test_token',
32
+ 'REDDIT_CLIENT_ID': 'test_id',
33
+ 'REDDIT_CLIENT_SECRET': 'test_secret',
34
+ 'REDDIT_USER_AGENT': 'test_agent'
35
+ }
36
+ }
37
+
38
+ @pytest.fixture
39
+ def mock_reddit_api():
40
+ """Create a mock RedditAPI."""
41
+ mock = Mock(spec=RedditAPI)
42
+
43
+ # Create mock submission objects
44
+ mock_submissions = []
45
+ for i in range(2):
46
+ submission = Mock()
47
+ submission.id = f'post{i}'
48
+ submission.title = f'Test Post {i}'
49
+ submission.selftext = f'Test content {i}'
50
+ submission.score = i + 1
51
+ submission.created_utc = datetime.now(pytz.UTC).timestamp()
52
+ submission.url = f'https://reddit.com/test{i}'
53
+ submission.num_comments = i * 10
54
+
55
+ # Mock the comments
56
+ comment = Mock()
57
+ comment.id = f'comment{i}'
58
+ comment.body = f'Test comment {i}'
59
+ comment.score = i + 5
60
+ comment.created_utc = datetime.now(pytz.UTC).timestamp()
61
+ comment.parent_id = submission.id
62
+
63
+ # Set up comment attributes
64
+ submission.comments = Mock()
65
+ submission.comments._comments = [comment]
66
+ submission.comments.replace_more = Mock(return_value=None)
67
+
68
+ mock_submissions.append(submission)
69
+
70
+ # Set up the mock subreddit
71
+ mock_subreddit = Mock()
72
+ mock_subreddit.top.return_value = mock_submissions
73
+ mock.get_subreddit.return_value = mock_subreddit
74
+
75
+ return mock
76
+
77
+ @pytest.fixture
78
+ def mock_file_manager():
79
+ """Create a mock FileManager."""
80
+ mock = Mock(spec=FileManager)
81
+ return mock
82
+
83
+ @pytest.fixture
84
+ def mock_hf_manager():
85
+ """Create a mock HuggingFaceManager."""
86
+ mock = Mock(spec=HuggingFaceManager)
87
+ return mock
88
+
89
+ def test_get_posts(mock_config, mock_reddit_api):
90
+ """Test the get_posts method."""
91
+ # Initialize scraper with mocked RedditAPI
92
+ scraper = RedditScraper(mock_config, reddit_api=mock_reddit_api)
93
+
94
+ # Get posts for a test subreddit
95
+ df = scraper.get_posts({'name': 'test1', 'post_limit': 2, 'comment_limit': 2})
96
+
97
+ # Verify DataFrame structure and content
98
+ assert isinstance(df, pd.DataFrame)
99
+ assert len(df) == 4 # 2 posts + 2 comments
100
+
101
+ # Verify posts
102
+ posts_df = df[df['type'] == 'post']
103
+ assert len(posts_df) == 2
104
+ assert posts_df['subreddit'].iloc[0] == 'test1'
105
+ assert posts_df['post_id'].iloc[0] == 'post0'
106
+ assert posts_df['post_id'].iloc[1] == 'post1'
107
+
108
+ # Verify comments
109
+ comments_df = df[df['type'] == 'comment']
110
+ assert len(comments_df) == 2
111
+ assert comments_df['subreddit'].iloc[0] == 'test1'
112
+ assert comments_df['post_id'].iloc[0] == 'comment0'
113
+ assert comments_df['parent_id'].iloc[0] == 'post0'
114
+
115
+ def test_upload_to_hf_deduplication(mock_config, mock_file_manager, mock_hf_manager):
116
+ """Test the upload_to_hf method with deduplication."""
117
+ # Create test DataFrames
118
+ prev_df = pd.DataFrame({
119
+ 'post_id': ['post0', 'post1'],
120
+ 'title': ['Old Post 0', 'Old Post 1'],
121
+ 'text': ['Old content 0', 'Old content 1'],
122
+ 'score': [1, 2],
123
+ 'subreddit': ['test1', 'test1'],
124
+ 'created_utc': [datetime.now(pytz.UTC)] * 2,
125
+ 'url': ['https://reddit.com/old0', 'https://reddit.com/old1'],
126
+ 'num_comments': [10, 20]
127
+ })
128
+
129
+ new_df = pd.DataFrame({
130
+ 'post_id': ['post1', 'post2'],
131
+ 'title': ['New Post 1', 'New Post 2'],
132
+ 'text': ['New content 1', 'New content 2'],
133
+ 'score': [3, 4],
134
+ 'subreddit': ['test1', 'test1'],
135
+ 'created_utc': [datetime.now(pytz.UTC)] * 2,
136
+ 'url': ['https://reddit.com/new1', 'https://reddit.com/new2'],
137
+ 'num_comments': [30, 40]
138
+ })
139
+
140
+ # Mock file operations
141
+ mock_hf_manager.download_file.return_value = Path('test.parquet')
142
+ mock_file_manager.read_parquet.return_value = prev_df
143
+
144
+ # Initialize scraper with mocked dependencies
145
+ scraper = RedditScraper(
146
+ mock_config,
147
+ file_manager=mock_file_manager,
148
+ hf_manager=mock_hf_manager
149
+ )
150
+
151
+ # Upload new data
152
+ scraper._upload_to_hf(new_df, '2025-04-20')
153
+
154
+ # Verify file operations
155
+ mock_file_manager.save_parquet.assert_called_once()
156
+ mock_hf_manager.upload_file.assert_called_once()
157
+
158
+ def test_cli_missing_env(monkeypatch, tmp_path):
159
+ """Test CLI with missing environment variables."""
160
+ # Create a temporary .env file without required variables
161
+ env_path = tmp_path / '.env'
162
+ env_path.write_text('')
163
+
164
+ # Set environment variable to point to our test .env
165
+ monkeypatch.setenv('REDDIT_ANALYSIS_ENV', str(env_path))
166
+
167
+ # Remove any existing Reddit API credentials from environment
168
+ for key in ['REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET', 'REDDIT_USER_AGENT']:
169
+ monkeypatch.delenv(key, raising=False)
170
+ # Ensure HF_TOKEN is present so only Reddit client vars are missing
171
+ monkeypatch.setenv('HF_TOKEN', 'dummy_hf_token')
172
+ # Mock Streamlit's HAS_STREAMLIT to True
173
+ monkeypatch.setattr('reddit_analysis.config_utils.HAS_STREAMLIT', True)
174
+ # Mock is_running_streamlit to True
175
+ monkeypatch.setattr('reddit_analysis.config_utils.is_running_streamlit', lambda: True)
176
+ # Mock Streamlit secrets to return None
177
+ mock_secrets = Mock()
178
+ mock_secrets.get.return_value = None
179
+ monkeypatch.setattr('streamlit.secrets', mock_secrets)
180
+ # Print for debug
181
+ import os
182
+ print('DEBUG: REDDIT_CLIENT_ID value before main:', os.environ.get('REDDIT_CLIENT_ID'))
183
+ # Run the CLI with --date argument
184
+ with pytest.raises(ValueError) as exc_info:
185
+ from reddit_analysis.scraper.scrape import main
186
+ main('2025-04-20')
187
+ assert "Missing required environment variables: REDDIT_CLIENT_ID" in str(exc_info.value)
reddit_analysis/tests/summarizer/test_summarize.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from datetime import date
5
+ from unittest.mock import Mock, patch
6
+
7
+ from reddit_analysis.summarizer.summarize import (
8
+ SummaryManager,
9
+ FileManager,
10
+ HuggingFaceManager,
11
+ )
12
+
13
+
14
+ # --------------------------------------------------------------------------- #
15
+ # Fixtures #
16
+ # --------------------------------------------------------------------------- #
17
+ @pytest.fixture
18
+ def mock_config(tmp_path):
19
+ """Minimal config dict compatible with SummaryManager."""
20
+ return {
21
+ "config": {
22
+ "repo_id": "test/repo",
23
+ "repo_type": "dataset",
24
+ },
25
+ "paths": {
26
+ "root": tmp_path,
27
+ "scored_dir": tmp_path / "scored",
28
+ "hf_scored_dir": "scored", # relative path in the Hub
29
+ "summary_file": tmp_path / "summary.csv",
30
+ },
31
+ "secrets": {"HF_TOKEN": "fake"},
32
+ }
33
+
34
+
35
+ @pytest.fixture
36
+ def mock_file_manager():
37
+ """FileManager double with just the methods we need."""
38
+ m = Mock(spec=FileManager)
39
+ # read_parquet returns sample data we set in each test
40
+ # write_csv just returns a Path so downstream code is happy
41
+ m.write_csv.return_value = Path("summary.csv")
42
+ return m
43
+
44
+
45
+ @pytest.fixture
46
+ def mock_hf_manager():
47
+ """HuggingFaceManager double."""
48
+ return Mock(spec=HuggingFaceManager)
49
+
50
+
51
+ # --------------------------------------------------------------------------- #
52
+ # Tests #
53
+ # --------------------------------------------------------------------------- #
54
+ def test_process_date(mock_config, mock_file_manager, mock_hf_manager):
55
+ """End‑to‑end happy path."""
56
+ # ---------- sample scored shard --------------------------------------- #
57
+ sample = pd.DataFrame(
58
+ {
59
+ "subreddit": ["a", "a", "b", "b"],
60
+ "sentiment": [0.8, 0.6, 0.4, 0.2],
61
+ "score": [10, 20, 30, 40],
62
+ "post_id": ["p1", "p2", "p3", "p4"],
63
+ "text": ["t1", "t2", "t3", "t4"],
64
+ "retrieved_at": pd.Timestamp.utcnow(),
65
+ }
66
+ )
67
+ mock_file_manager.read_parquet.return_value = sample
68
+ # first call → download scored file, second call (within _save_and_push_summary) unused here
69
+ mock_hf_manager.download_file.return_value = Path("dummy.parquet")
70
+
71
+ with patch.object(
72
+ SummaryManager, "_load_remote_summary", return_value=pd.DataFrame()
73
+ ):
74
+ mgr = SummaryManager(
75
+ mock_config, file_manager=mock_file_manager, hf_manager=mock_hf_manager
76
+ )
77
+ mgr.process_date("2025-04-20")
78
+
79
+ # assertions
80
+ mock_file_manager.read_parquet.assert_called_once()
81
+ mock_file_manager.write_csv.assert_called_once()
82
+ mock_hf_manager.upload_file.assert_called_once()
83
+
84
+
85
+ def test_get_processed_combinations(mock_config, mock_file_manager, mock_hf_manager):
86
+ """The helper should translate the existing CSV into a set of tuples."""
87
+ existing = pd.DataFrame(
88
+ {
89
+ "date": ["2025-04-19", "2025-04-19"],
90
+ "subreddit": ["a", "b"],
91
+ "mean_sentiment": [0.5, 0.3],
92
+ "weighted_sentiment": [0.4, 0.2],
93
+ "count": [1, 1],
94
+ }
95
+ )
96
+
97
+ with patch.object(
98
+ SummaryManager, "_load_remote_summary", return_value=existing
99
+ ):
100
+ mgr = SummaryManager(
101
+ mock_config, file_manager=mock_file_manager, hf_manager=mock_hf_manager
102
+ )
103
+ processed = mgr.get_processed_combinations()
104
+
105
+ assert processed == {(date(2025, 4, 19), "a"), (date(2025, 4, 19), "b")}
106
+
107
+
108
+ def test_cli_invalid_date():
109
+ """main() should raise on malformed dates."""
110
+ from reddit_analysis.summarizer.summarize import main
111
+
112
+ with pytest.raises(ValueError):
113
+ main("bad‑date‑format")
114
+
115
+
116
+ def test_cli_missing_scored_file(mock_config, mock_file_manager, mock_hf_manager):
117
+ """Gracefully handles a missing *_scored.parquet on the Hub."""
118
+ # download of scored file raises, but remote summary loads fine →
119
+ mock_hf_manager.download_file.side_effect = Exception("not found")
120
+ with patch.object(
121
+ SummaryManager, "_load_remote_summary", return_value=pd.DataFrame()
122
+ ):
123
+ mgr = SummaryManager(
124
+ mock_config, file_manager=mock_file_manager, hf_manager=mock_hf_manager
125
+ )
126
+ # Should simply return after printing error, not raise.
127
+ assert mgr.process_date("2025-04-20") is None
reddit_analysis/tests/test_config_utils.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import pytest
4
+ import yaml
5
+ from reddit_analysis.config_utils import load_config, get_secret, ROOT
6
+
7
+ @pytest.fixture
8
+ def temp_config_file(tmp_path):
9
+ """Create a temporary config file with test data."""
10
+ config = {
11
+ 'repo_id': 'test/repo',
12
+ 'repo_type': 'dataset',
13
+ 'raw_dir': 'data/raw',
14
+ 'scored_dir': 'data/scored',
15
+ 'logs_dir': 'logs',
16
+ 'summary_file': 'summary.csv',
17
+ 'hf_raw_dir': 'data/raw',
18
+ 'hf_scored_dir': 'data/scored',
19
+ 'batch_size': 16,
20
+ 'replicate_model': 'test/model',
21
+ 'subreddits': ['test1', 'test2'],
22
+ 'post_limit': 100
23
+ }
24
+
25
+ config_path = tmp_path / 'config.yaml'
26
+ with open(config_path, 'w') as f:
27
+ yaml.dump(config, f)
28
+
29
+ return config_path
30
+
31
+ def test_load_config(temp_config_file, monkeypatch):
32
+ """Test that load_config correctly reads the config file."""
33
+ # Mock the ROOT path to point to our test directory
34
+ monkeypatch.setattr('reddit_analysis.config_utils.ROOT', temp_config_file.parent)
35
+
36
+ # Load the config
37
+ config = load_config() # Should now find config.yaml in the test directory
38
+
39
+ # Verify the values
40
+ assert config['repo_id'] == 'test/repo'
41
+ assert config['repo_type'] == 'dataset'
42
+ assert config['raw_dir'] == 'data/raw'
43
+ assert config['scored_dir'] == 'data/scored'
44
+ assert config['logs_dir'] == 'logs'
45
+ assert config['summary_file'] == 'summary.csv'
46
+ assert config['hf_raw_dir'] == 'data/raw'
47
+ assert config['hf_scored_dir'] == 'data/scored'
48
+ assert config['batch_size'] == 16
49
+ assert config['replicate_model'] == 'test/model'
50
+ assert config['subreddits'] == ['test1', 'test2']
51
+ assert config['post_limit'] == 100
52
+
53
+ def test_get_secret_env_var(monkeypatch):
54
+ """Test get_secret with environment variable."""
55
+ # Set a test environment variable
56
+ monkeypatch.setenv('TEST_SECRET', 'env_value')
57
+
58
+ # Get the secret
59
+ value = get_secret('TEST_SECRET')
60
+
61
+ # Verify it returns the environment variable value
62
+ assert value == 'env_value'
63
+
64
+ def test_get_secret_streamlit(monkeypatch):
65
+ """Test get_secret with Streamlit secrets."""
66
+ # Remove environment variable
67
+ monkeypatch.delenv('TEST_SECRET', raising=False)
68
+
69
+ # Mock Streamlit's HAS_STREAMLIT to True
70
+ monkeypatch.setattr('reddit_analysis.config_utils.HAS_STREAMLIT', True)
71
+ # Mock is_running_streamlit to True
72
+ monkeypatch.setattr('reddit_analysis.config_utils.is_running_streamlit', lambda: True)
73
+ # Mock Streamlit secrets
74
+ class MockSecrets:
75
+ def get(self, key, default=None):
76
+ return 'streamlit_value'
77
+ monkeypatch.setattr('streamlit.secrets', MockSecrets())
78
+ # Get the secret
79
+ value = get_secret('TEST_SECRET')
80
+ # Verify it returns the Streamlit secret value
81
+ assert value == 'streamlit_value'
82
+
83
+ def test_get_secret_missing(monkeypatch):
84
+ """Test get_secret when secret is missing from both sources."""
85
+ # Remove environment variable
86
+ monkeypatch.delenv('TEST_SECRET', raising=False)
87
+
88
+ # Mock Streamlit's HAS_STREAMLIT to True
89
+ monkeypatch.setattr('reddit_analysis.config_utils.HAS_STREAMLIT', True)
90
+
91
+ # Mock Streamlit secrets to return None
92
+ class MockSecrets:
93
+ def get(self, key, default=None):
94
+ return default
95
+
96
+ monkeypatch.setattr('streamlit.secrets', MockSecrets())
97
+
98
+ # Verify it raises ValueError
99
+ with pytest.raises(ValueError) as exc_info:
100
+ get_secret('TEST_SECRET')
101
+ assert "Required secret TEST_SECRET not found" in str(exc_info.value)
requirements-dev.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas==2.2.3
2
+ python-dotenv==1.1.0
3
+ pyyaml==6.0.2
4
+ replicate==1.0.4
5
+ huggingface-hub==0.30.2
6
+ streamlit==1.44.1
7
+ altair==5.5.0
8
+ pyarrow==19.0.1
9
+ pytest==8.3.5
10
+ praw>=7.8.1
11
+ prometheus-client==0.21.1
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ streamlit==1.44.1
3
+ pandas==2.2.3
4
+ numpy==2.2.4
5
+ altair==5.5.0
6
+
7
+ # Data handling
8
+ huggingface-hub==0.30.2
9
+ pyyaml==6.0.2
10
+
11
+ # Text analysis
12
+ spacy==3.8.5
13
+ scikit-learn==1.6.1
14
+ sentence-transformers==4.1.0
15
+ keybert==0.9.0
16
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
17
+
18
+ # Local development
19
+ python-dotenv==1.1.0
20
+
21
+ # Added for parquet reading on Spaces
22
+ pyarrow==16.1.0
subreddit_daily_summary.csv ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,subreddit,mean_sentiment,community_weighted_sentiment,count
2
+ 2025-05-01,LocalLLaMA,-0.4952,-0.4779,523
3
+ 2025-05-01,OpenAI,-0.5242,-0.517,227
4
+ 2025-05-01,artificial,-0.475,-0.5065,80
5
+ 2025-05-01,singularity,-0.3839,-0.4357,211
6
+ 2025-05-02,LocalLLaMA,-0.5027,-0.4347,366
7
+ 2025-05-02,OpenAI,-0.4947,-0.4879,285
8
+ 2025-05-02,artificial,-0.3913,-0.5042,46
9
+ 2025-05-02,singularity,-0.4244,-0.4151,205
10
+ 2025-05-03,LocalLLaMA,-0.5518,-0.5262,415
11
+ 2025-05-03,OpenAI,-0.4688,-0.4546,256
12
+ 2025-05-03,artificial,-0.4286,-0.3653,56
13
+ 2025-05-03,singularity,-0.3534,-0.3939,232
14
+ 2025-05-04,LocalLLaMA,-0.3213,-0.3051,333
15
+ 2025-05-04,OpenAI,-0.4365,-0.4455,252
16
+ 2025-05-04,artificial,-0.5172,-0.5418,58
17
+ 2025-05-04,singularity,-0.6024,-0.6052,166
18
+ 2025-05-05,LocalLLaMA,-0.473,-0.4502,444
19
+ 2025-05-05,OpenAI,-0.4486,-0.426,243
20
+ 2025-05-05,artificial,-0.3714,-0.3691,35
21
+ 2025-05-05,singularity,-0.3362,-0.3324,232
22
+ 2025-05-06,LocalLLaMA,-0.5656,-0.5327,419
23
+ 2025-05-06,OpenAI,-0.5019,-0.5556,269
24
+ 2025-05-06,artificial,-0.4468,-0.4585,94
25
+ 2025-05-06,singularity,-0.3714,-0.3904,245
26
+ 2025-05-07,LocalLLaMA,-0.4633,-0.4799,354
27
+ 2025-05-07,OpenAI,-0.4258,-0.4588,209
28
+ 2025-05-07,artificial,-0.5146,-0.5191,103
29
+ 2025-05-07,singularity,-0.3407,-0.3575,182
30
+ 2025-05-08,LocalLLaMA,-0.4769,-0.4615,325
31
+ 2025-05-08,OpenAI,-0.5182,-0.4833,303
32
+ 2025-05-08,artificial,-0.5,-0.5292,52
33
+ 2025-05-08,singularity,-0.494,-0.5126,249
34
+ 2025-05-09,LocalLLaMA,-0.4595,-0.4251,333
35
+ 2025-05-09,OpenAI,-0.4436,-0.4213,266
36
+ 2025-05-09,artificial,-0.5238,-0.5502,63
37
+ 2025-05-09,singularity,-0.502,-0.5425,253
38
+ 2025-05-10,LocalLLaMA,-0.4552,-0.4159,279
39
+ 2025-05-10,OpenAI,-0.5269,-0.5317,186
40
+ 2025-05-10,artificial,-0.443,-0.4837,79
41
+ 2025-05-10,singularity,-0.5192,-0.5185,208
42
+ 2025-05-11,LocalLLaMA,-0.5655,-0.5272,290
43
+ 2025-05-11,OpenAI,-0.4572,-0.4568,269
44
+ 2025-05-11,artificial,-0.5882,-0.5505,68
45
+ 2025-05-11,singularity,-0.3736,-0.3069,182
46
+ 2025-05-12,LocalLLaMA,-0.562,-0.5701,274
47
+ 2025-05-12,OpenAI,-0.5152,-0.4961,264
48
+ 2025-05-12,artificial,-0.4667,-0.5324,75
49
+ 2025-05-12,singularity,-0.498,-0.5231,247
50
+ 2025-05-13,LocalLLaMA,-0.4971,-0.4682,342
51
+ 2025-05-13,OpenAI,-0.5833,-0.6451,288
52
+ 2025-05-13,artificial,-0.3671,-0.4165,79
53
+ 2025-05-13,singularity,-0.3571,-0.4305,140
54
+ 2025-05-14,LocalLLaMA,-0.3776,-0.2943,286
55
+ 2025-05-14,OpenAI,-0.5369,-0.569,298
56
+ 2025-05-14,artificial,-0.5,-0.5367,88
57
+ 2025-05-14,singularity,-0.36,-0.3138,300
58
+ 2025-05-15,LocalLLaMA,-0.5308,-0.5159,341
59
+ 2025-05-15,OpenAI,-0.533,-0.5087,227
60
+ 2025-05-15,artificial,-0.5942,-0.6274,69
61
+ 2025-05-15,singularity,-0.4604,-0.4808,341
62
+ 2025-05-16,LocalLLaMA,-0.5168,-0.5195,327
63
+ 2025-05-16,OpenAI,-0.5751,-0.5613,273
64
+ 2025-05-16,artificial,-0.5802,-0.5783,81
65
+ 2025-05-16,singularity,-0.4568,-0.4414,324
66
+ 2025-05-17,LocalLLaMA,-0.5658,-0.5476,281
67
+ 2025-05-17,OpenAI,-0.5299,-0.5133,234
68
+ 2025-05-17,artificial,-0.4545,-0.5082,77
69
+ 2025-05-17,singularity,-0.5506,-0.5318,178
70
+ 2025-05-18,LocalLLaMA,-0.4783,-0.4879,230
71
+ 2025-05-18,OpenAI,-0.4545,-0.4629,165
72
+ 2025-05-18,artificial,-0.4902,-0.4985,51
73
+ 2025-05-18,singularity,-0.5461,-0.5584,141
74
+ 2025-05-19,LocalLLaMA,-0.4875,-0.4821,320
75
+ 2025-05-19,OpenAI,-0.6712,-0.6774,146
76
+ 2025-05-19,artificial,-0.4766,-0.5524,107
77
+ 2025-05-19,singularity,-0.391,-0.3832,335
78
+ 2025-05-20,LocalLLaMA,-0.5137,-0.466,329
79
+ 2025-05-20,OpenAI,-0.4822,-0.4133,197
80
+ 2025-05-20,artificial,-0.4,-0.4258,60
81
+ 2025-05-20,singularity,-0.3147,-0.3419,429
82
+ 2025-05-21,LocalLLaMA,-0.5,-0.5178,372
83
+ 2025-05-21,OpenAI,-0.4307,-0.5103,267
84
+ 2025-05-21,artificial,-0.5263,-0.6283,76
85
+ 2025-05-21,singularity,-0.3589,-0.3353,599
86
+ 2025-05-22,LocalLLaMA,-0.4813,-0.4787,374
87
+ 2025-05-22,OpenAI,-0.4939,-0.5255,328
88
+ 2025-05-22,artificial,-0.6667,-0.6903,72
89
+ 2025-05-22,singularity,-0.4947,-0.5288,566
90
+ 2025-05-23,LocalLLaMA,-0.5445,-0.5678,382
91
+ 2025-05-23,OpenAI,-0.4605,-0.4919,215
92
+ 2025-05-23,artificial,-0.3274,-0.3235,113
93
+ 2025-05-23,singularity,-0.393,-0.3799,402
94
+ 2025-05-24,LocalLLaMA,-0.4333,-0.4601,240
95
+ 2025-05-24,OpenAI,-0.344,-0.3161,250
96
+ 2025-05-24,artificial,-0.3488,-0.3326,86
97
+ 2025-05-24,singularity,-0.4379,-0.4447,491
98
+ 2025-05-25,LocalLLaMA,-0.5081,-0.5379,248
99
+ 2025-05-25,OpenAI,-0.345,-0.38,229
100
+ 2025-05-25,artificial,-0.3535,-0.3803,99
101
+ 2025-05-25,singularity,-0.3897,-0.3967,331
102
+ 2025-05-26,LocalLLaMA,-0.481,-0.4868,343
103
+ 2025-05-26,OpenAI,-0.5125,-0.5693,160
104
+ 2025-05-26,artificial,-0.2609,-0.2746,46
105
+ 2025-05-26,singularity,-0.4444,-0.4124,270
106
+ 2025-05-27,LocalLLaMA,-0.5611,-0.5977,319
107
+ 2025-05-27,OpenAI,-0.5197,-0.4877,229
108
+ 2025-05-27,artificial,-0.6436,-0.6834,101
109
+ 2025-05-27,singularity,-0.2628,-0.284,331
110
+ 2025-05-28,LocalLLaMA,-0.5333,-0.4956,360
111
+ 2025-05-28,OpenAI,-0.4729,-0.4775,258
112
+ 2025-05-28,artificial,-0.4186,-0.4808,86
113
+ 2025-05-28,singularity,-0.4292,-0.4194,459
114
+ 2025-05-29,LocalLLaMA,-0.4661,-0.4449,472
115
+ 2025-05-29,OpenAI,-0.5281,-0.5376,178
116
+ 2025-05-29,artificial,-0.0909,-0.0087,66
117
+ 2025-05-29,singularity,-0.3836,-0.4301,464
118
+ 2025-05-30,LocalLLaMA,-0.4895,-0.4555,380
119
+ 2025-05-30,OpenAI,-0.4791,-0.4653,215
120
+ 2025-05-30,artificial,-0.5333,-0.5649,90
121
+ 2025-05-30,singularity,-0.3952,-0.4286,377
122
+ 2025-05-31,LocalLLaMA,-0.5974,-0.6178,313
123
+ 2025-05-31,OpenAI,-0.4913,-0.4578,173
124
+ 2025-05-31,artificial,-0.3077,-0.258,78
125
+ 2025-05-31,singularity,-0.4563,-0.4511,309
126
+ 2025-06-01,LocalLLaMA,-0.4754,-0.4483,244
127
+ 2025-06-01,OpenAI,-0.4286,-0.4016,203
128
+ 2025-06-01,artificial,-0.2941,-0.2128,17
129
+ 2025-06-01,singularity,-0.4667,-0.4858,180
130
+ 2025-06-02,LocalLLaMA,-0.4886,-0.4693,352
131
+ 2025-06-02,OpenAI,-0.5528,-0.5055,246
132
+ 2025-06-02,artificial,-0.4792,-0.6261,96
133
+ 2025-06-02,singularity,-0.5287,-0.5184,314
134
+ 2025-06-03,LocalLLaMA,-0.405,-0.3515,279
135
+ 2025-06-03,OpenAI,-0.545,-0.5749,211
136
+ 2025-06-03,artificial,-0.6,-0.6247,80
137
+ 2025-06-03,singularity,-0.4876,-0.5192,242
138
+ 2025-06-04,LocalLLaMA,-0.4672,-0.4955,274
139
+ 2025-06-04,OpenAI,-0.5962,-0.5539,317
140
+ 2025-06-04,artificial,-0.5435,-0.605,92
141
+ 2025-06-04,singularity,-0.3316,-0.299,377
142
+ 2025-06-05,LocalLLaMA,-0.4882,-0.4796,297
143
+ 2025-06-05,OpenAI,-0.4632,-0.4344,231
144
+ 2025-06-05,artificial,-0.6712,-0.7541,73
145
+ 2025-06-05,singularity,-0.4007,-0.3616,307
146
+ 2025-06-06,LocalLLaMA,-0.438,-0.3628,274
147
+ 2025-06-06,OpenAI,-0.5,-0.4981,148
148
+ 2025-06-06,artificial,-0.5,-0.5466,72
149
+ 2025-06-06,singularity,-0.3361,-0.3444,238
150
+ 2025-06-07,LocalLLaMA,-0.4808,-0.4602,208
151
+ 2025-06-07,OpenAI,-0.4357,-0.4429,241
152
+ 2025-06-07,artificial,-0.4563,-0.4383,103
153
+ 2025-06-07,singularity,-0.373,-0.3527,252
154
+ 2025-06-08,LocalLLaMA,-0.5448,-0.5058,268
155
+ 2025-06-08,OpenAI,-0.5039,-0.4824,254
156
+ 2025-06-08,artificial,-0.6364,-0.678,77
157
+ 2025-06-08,singularity,-0.4054,-0.4938,148
158
+ 2025-06-09,LocalLLaMA,-0.5054,-0.4401,279
159
+ 2025-06-09,OpenAI,-0.4878,-0.4226,246
160
+ 2025-06-09,artificial,-0.4478,-0.4649,134
161
+ 2025-06-09,singularity,-0.4618,-0.4825,249
162
+ 2025-06-10,LocalLLaMA,-0.348,-0.3122,273
163
+ 2025-06-10,OpenAI,-0.4957,-0.5238,349
164
+ 2025-06-10,artificial,-0.5663,-0.5317,83
165
+ 2025-06-10,singularity,-0.383,-0.4165,376
166
+ 2025-06-11,LocalLLaMA,-0.4113,-0.3931,248
167
+ 2025-06-11,OpenAI,-0.4286,-0.3769,217
168
+ 2025-06-11,artificial,-0.4955,-0.5698,111
169
+ 2025-06-11,singularity,-0.368,-0.359,269
170
+ 2025-06-12,LocalLLaMA,-0.4094,-0.4098,254
171
+ 2025-06-12,OpenAI,-0.5276,-0.5785,254
172
+ 2025-06-12,artificial,-0.3735,-0.4439,83
173
+ 2025-06-12,singularity,-0.2961,-0.3084,233
174
+ 2025-06-13,LocalLLaMA,-0.3556,-0.3242,270
175
+ 2025-06-13,OpenAI,-0.4382,-0.4042,178
176
+ 2025-06-13,artificial,-0.3821,-0.3721,123
177
+ 2025-06-13,singularity,-0.2,-0.2585,125
178
+ 2025-06-14,LocalLLaMA,-0.5736,-0.5512,197
179
+ 2025-06-14,OpenAI,-0.3966,-0.4258,179
180
+ 2025-06-14,artificial,-0.4167,-0.4459,96
181
+ 2025-06-14,singularity,-0.1354,-0.1692,192
182
+ 2025-06-15,LocalLLaMA,-0.4684,-0.4165,237
183
+ 2025-06-15,OpenAI,-0.5294,-0.5125,102
184
+ 2025-06-15,artificial,-0.4754,-0.5787,61
185
+ 2025-06-15,singularity,-0.427,-0.392,178
186
+ 2025-06-16,LocalLLaMA,-0.518,-0.5471,278
187
+ 2025-06-16,OpenAI,-0.5169,-0.5528,207
188
+ 2025-06-16,artificial,-0.5696,-0.5846,79
189
+ 2025-06-16,singularity,-0.3418,-0.3892,158
190
+ 2025-06-17,LocalLLaMA,-0.4744,-0.4956,293
191
+ 2025-06-17,OpenAI,-0.426,-0.4405,223
192
+ 2025-06-17,artificial,-0.3608,-0.3481,97
193
+ 2025-06-17,singularity,-0.433,-0.4326,321
194
+ 2025-06-18,LocalLLaMA,-0.4528,-0.4244,307
195
+ 2025-06-18,OpenAI,-0.5152,-0.4993,231
196
+ 2025-06-18,artificial,-0.541,-0.5906,61
197
+ 2025-06-18,singularity,-0.4416,-0.4302,197
198
+ 2025-06-19,LocalLLaMA,-0.528,-0.5063,411
199
+ 2025-06-19,OpenAI,-0.3475,-0.3711,236
200
+ 2025-06-19,artificial,-0.5696,-0.6251,79
201
+ 2025-06-19,singularity,-0.4249,-0.4267,313
202
+ 2025-06-20,LocalLLaMA,-0.4225,-0.3431,374
203
+ 2025-06-20,OpenAI,-0.6126,-0.6681,222
204
+ 2025-06-20,artificial,-0.5238,-0.6093,63
205
+ 2025-06-20,singularity,-0.445,-0.4342,209
206
+ 2025-06-21,LocalLLaMA,-0.5521,-0.5479,317
207
+ 2025-06-21,OpenAI,-0.5932,-0.6164,177
208
+ 2025-06-21,artificial,-0.5579,-0.6365,95
209
+ 2025-06-21,singularity,-0.3566,-0.4154,286
210
+ 2025-06-22,LocalLLaMA,-0.5122,-0.3947,41
211
+ 2025-06-22,OpenAI,-0.3846,-0.4019,130
212
+ 2025-06-22,artificial,-0.28,-0.4088,75
213
+ 2025-06-22,singularity,-0.28,-0.2504,125