hblim commited on
Commit
a6576f0
·
0 Parent(s):

Clean codebase for HF Space (drop Prometheus binary data)

Browse files
Files changed (40) hide show
  1. .github/workflows/daily.yml +63 -0
  2. .gitignore +62 -0
  3. Dockerfile +18 -0
  4. LICENSE +21 -0
  5. README.md +256 -0
  6. app.py +11 -0
  7. config.yaml +29 -0
  8. frontend/__init__.py +1 -0
  9. frontend/app.py +287 -0
  10. frontend/data_utils.py +143 -0
  11. frontend/text_analysis.py +62 -0
  12. notebooks/keyword_extraction.ipynb +867 -0
  13. notebooks/loading_data.ipynb +785 -0
  14. notebooks/post_analysis.ipynb +567 -0
  15. notebooks/split_data_scored.ipynb +2798 -0
  16. pyproject.toml +32 -0
  17. reddit_analysis/__init__.py +7 -0
  18. reddit_analysis/common_metrics.py +40 -0
  19. reddit_analysis/config_utils.py +114 -0
  20. reddit_analysis/inference/__init__.py +5 -0
  21. reddit_analysis/inference/score.py +327 -0
  22. reddit_analysis/monitoring/dashboard.json +309 -0
  23. reddit_analysis/monitoring/dashboard_failure.png +0 -0
  24. reddit_analysis/monitoring/dashboard_success.png +0 -0
  25. reddit_analysis/monitoring/docker-compose.yml +21 -0
  26. reddit_analysis/monitoring/prometheus.yml +8 -0
  27. reddit_analysis/scraper/__init__.py +5 -0
  28. reddit_analysis/scraper/scrape.py +310 -0
  29. reddit_analysis/summarizer/__init__.py +5 -0
  30. reddit_analysis/summarizer/aggregator.py +68 -0
  31. reddit_analysis/summarizer/summarize.py +274 -0
  32. reddit_analysis/test_config.py +72 -0
  33. reddit_analysis/tests/README.md +78 -0
  34. reddit_analysis/tests/inference/test_score.py +282 -0
  35. reddit_analysis/tests/scraper/test_scrape.py +187 -0
  36. reddit_analysis/tests/summarizer/test_summarize.py +127 -0
  37. reddit_analysis/tests/test_config_utils.py +101 -0
  38. requirements-dev.txt +11 -0
  39. requirements.txt +22 -0
  40. subreddit_daily_summary.csv +213 -0
.github/workflows/daily.yml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # .github/workflows/daily.yml
2
+ name: Daily ETL & CI
3
+
4
+ on:
5
+ push:
6
+ branches: [main]
7
+ workflow_dispatch:
8
+ schedule:
9
+ - cron: '0 23 * * *'
10
+
11
+ jobs:
12
+ build:
13
+ # This is the GitHub‑hosted runner’s OS.
14
+ # You can change to macos-latest if you really need a macOS VM,
15
+ # but ubuntu-latest is faster and usually all you need.
16
+ runs-on: ubuntu-latest
17
+
18
+ env:
19
+ # These come from your repository settings → Secrets → Actions.
20
+ # Add HF_TOKEN, REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT there.
21
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
22
+ REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }}
23
+ REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }}
24
+ REDDIT_USER_AGENT: ${{ secrets.REDDIT_USER_AGENT }}
25
+ REPLICATE_API_TOKEN: ${{ secrets.REPLICATE_API_TOKEN }}
26
+
27
+ steps:
28
+ - name: Check out code
29
+ uses: actions/checkout@v3
30
+
31
+ - name: Set up Python 3.12
32
+ uses: actions/setup-python@v4
33
+ with:
34
+ python-version: "3.12"
35
+
36
+ - name: Cache pip
37
+ uses: actions/cache@v3
38
+ with:
39
+ path: ~/.cache/pip
40
+ key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements-dev.txt') }}
41
+ restore-keys: |
42
+ ${{ runner.os }}-pip-
43
+
44
+ - name: Install dependencies
45
+ run: |
46
+ python -m pip install --upgrade pip
47
+ pip install -r requirements-dev.txt
48
+
49
+ - name: Run unit tests
50
+ run: pytest --maxfail=1 --disable-warnings -q
51
+
52
+ - name: Compute DATE (UTC)
53
+ id: set-date
54
+ run: echo "DATE=$(date -u +'%Y-%m-%d')" >> $GITHUB_ENV
55
+
56
+ - name: Scrape
57
+ run: python -m reddit_analysis.scraper.scrape --date "$DATE"
58
+
59
+ - name: Score
60
+ run: python -m reddit_analysis.inference.score --date "$DATE" --overwrite
61
+
62
+ - name: Summarize
63
+ run: python -m reddit_analysis.summarizer.summarize --date "$DATE" --overwrite
.gitignore ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ scratch/
18
+ var/
19
+ wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+ .pytest_cache/
25
+ .coverage
26
+ htmlcov/
27
+ .tox/
28
+ .nox/
29
+ .hypothesis/
30
+
31
+ # Virtual Environment
32
+ venv/
33
+ env/
34
+ ENV/
35
+ .env
36
+
37
+ # IDE
38
+ .idea/
39
+ .vscode/
40
+ *.swp
41
+ *.swo
42
+ .ipynb_checkpoints/
43
+ **/.ipynb_checkpoints/
44
+
45
+ # Data directories
46
+ data/
47
+ data_raw/
48
+ data_scored/
49
+
50
+ # Docker
51
+ .docker/
52
+ docker-compose.override.yml
53
+
54
+ # Logs
55
+ *.log
56
+ logs/
57
+
58
+ # OS
59
+ .DS_Store
60
+ Thumbs.db
61
+ # Prometheus binary data (dropped from HF Space)
62
+ reddit_analysis/monitoring/prometheus-data/
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use the official lightweight Python image
2
+ FROM python:3.12-slim
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Copy and install dependencies
8
+ COPY requirements.txt .
9
+ RUN pip install --no-cache-dir -r requirements.txt
10
+
11
+ # Copy your Streamlit app
12
+ COPY . .
13
+
14
+ # Expose Streamlit’s default port
15
+ EXPOSE 8502
16
+
17
+ # Launch the app
18
+ ENTRYPOINT ["streamlit", "run", "frontend/app.py", "--server.port=8502", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Halston Lim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reddit Sentiment Pipeline
2
+
3
+ [![CI Status](https://github.com/halstonblim/reddit_sentiment_pipeline/actions/workflows/daily.yml/badge.svg)](https://github.com/halstonblim/reddit_sentiment_pipeline/actions/workflows/daily.yml)
4
+ [![Streamlit App](https://img.shields.io/badge/demo-streamlit-ff4b4b?logo=streamlit)](https://redditsentimentpipeline.streamlit.app/)
5
+
6
+ A fully‑automated **end‑to‑end MLOps** pipeline that tracks daily sentiment trends on Reddit, scores posts with a transformer‑based model served from Replicate, summarizes the results, and publishes an interactive Streamlit dashboard—all orchestrated by GitHub Actions.
7
+
8
+ ***Analyzing the Public Discourse of AI News***
9
+
10
+ The pipeline is currently configured in `config.yaml` to analyze AI news discourse across `r/artificial`, `r/LocalLLaMA`, `r/singularity`, and `r/OpenAI`. The data is persisted across three steps
11
+ 1. **Scrapes** new submissions from a configurable list of subreddits (→ `data_raw/`)
12
+ 2. **Classifies** each post with a sentiment model served on Replicate (→ `data_scored/`)
13
+ 3. **Summarises** daily trends for lightweight front-end consumption (→ `daily_summary/`)
14
+
15
+ More information on the data can be found on the Hugging Face Dataset repo [hblim/top_reddit_posts_daily](https://huggingface.co/datasets/hblim/top_reddit_posts_daily)
16
+
17
+ ***Sentiment Analysis***
18
+
19
+ We use the [DistilBERT sentiment analysis model](https://github.com/halstonblim/batch-bert-sentiment), which is wrapped with Cog for easy deployment on Replicate. The model handles batched input texts in a single API call, which improves performance by parallelizing computation on the GPU.
20
+
21
+ ---
22
+
23
+ ## Table of Contents
24
+ 1. [Project Structure](#project-structure)
25
+ 2. [Installation & Quick start](#installation)
26
+ 3. [Configuration](#configuration)
27
+ 4. [Back-end reddit_analysis](#backend-reddit-analysis)
28
+ 5. [Unit tests](#unit-tests)
29
+ 6. [Front-end Streamlit](#front-end-streamlit)
30
+ 7. [CI/CD & GitHub Actions](#cicd-github-actions)
31
+ 8. [Monitoring with Grafana/Prometheus](#monitoring-with-grafanaprometheus)
32
+ 9. [Extending / Customising](#extending--customizing)
33
+
34
+ ---
35
+
36
+ ## Project Structure
37
+
38
+ ````text
39
+ reddit_sentiment_pipeline/
40
+ ├── reddit_analysis/   # Back‑end
41
+ │   ├── __init__.py
42
+ │   ├── scraper/
43
+ │   │   └── scrape.py   # Collect raw posts → HF dataset (data_raw)
44
+ │   ├── inference/
45
+ │   │   └── score.py   # Call Replicate model → adds sentiment scores
46
+ │   ├── summarizer/
47
+ │   │   └── summarize.py   # Aggregate + export CSV summaries (data_scored)
48
+ │   ├── config_utils.py   # Secrets & YAML helper
49
+ │   ├── tests/ # Pytest test-suite
50
+ |
51
+ ├── frontend/   # Front‑end
52
+ │   └── app.py
53
+
54
+ ├── .github/
55
+ │   └── workflows/
56
+ │   ├── daily.yml   # Cron‑triggered ETL + summarize
57
+
58
+ ├── config.yaml   # Default runtime config (subreddits, models …)
59
+ ├── requirements.txt # requirements for front end only
60
+ ├── requirements-dev.txt # requirements for local development
61
+ └── README.md
62
+ ````
63
+
64
+ ### Automated Workflow
65
+ ```
66
+ [GitHub Actions Cron @ 23:00 UTC]
67
+ |
68
+ v
69
+ +-------+-------------+
70
+ | Scrape Reddit | ← `scraper/scrape.py --date $DATE`
71
+ +-------+-------------+
72
+ |
73
+ v
74
+ +-------+-------------+
75
+ | Sentiment Analysis | ← `inference/score.py --date $DATE`
76
+ +-------+-------------+
77
+ |
78
+ v
79
+ +-------+-------------+
80
+ | Summarize | ← `summarizer/summarize.py --date $DATE`
81
+ +-------+-------------+
82
+ |
83
+ v
84
+ [HF Dataset: data files]
85
+ |
86
+ Frontend (Streamlit app)
87
+ |
88
+ Public URL (Streamlit Cloud)
89
+ ```
90
+
91
+ ---
92
+
93
+ ## Installation
94
+
95
+ To run the frontend streamlit app locally
96
+
97
+ ```bash
98
+ git clone https://github.com/halstonblim/reddit_sentiment_pipeline.git
99
+ cd reddit_sentiment_pipeline
100
+ pip install -r requirements.txt
101
+ streamlit run frontend/app.py
102
+ ```
103
+
104
+ To run the backend reddit analysis locally and set up your own scraper, sentiment analysis, and export pipeline, steps are roughly
105
+ - Get Reddit/Hugging Face/Replicate accounts and API tokens
106
+ - You must configure a .env with the secrets (HF, Replicate, Reddit tokens)
107
+ - Configure the .yaml file to point to the proper Hugging Face repository and Replicate models, and subreddits to scrape
108
+
109
+ Once those are configured you can run the following which should scrape Reddit, analyze text remotely with a Replicate model, and export results to Hugging Face
110
+
111
+ ```bash
112
+ pip install -r requirements-dev.txt
113
+
114
+ # Run the full pipeline for today
115
+ $ python -m reddit_analysis.scraper.scrape   --date $(date +%F)
116
+ $ python -m reddit_analysis.inference.score  --date $(date +%F)
117
+ $ python -m reddit_analysis.summarizer.summarize --date $(date +%F)
118
+ ```
119
+ ---
120
+
121
+ ## Configuration
122
+
123
+ All non‑secret settings live in **`config.yaml`**; sensitive tokens are supplied via environment variables or a `.env` file.
124
+
125
+ ```yaml
126
+ # config.yaml (excerpt)
127
+ repo_id: hblim/top_reddit_posts_daily
128
+ push_to_hf: true
129
+ subreddits:
130
+ - name: apple
131
+ post_limit: 100
132
+ comment_limit: 5
133
+ ```
134
+
135
+ | Variable | Where to set | Description |
136
+ |----------|-------------|-------------|
137
+ | `HF_TOKEN` | GitHub → *Settings › Secrets and variables* <br>or local `.env` | Personal access token with **write** permission to the HF dataset |
138
+ | `REPLICATE_API_TOKEN` | same | Token to invoke the Replicate model |
139
+ | `ENV` | optional | `local`, `ci`, `prod` – toggles logging & Streamlit behaviour |
140
+
141
+ ---
142
+
143
+ ## Backend reddit analysis
144
+
145
+ ### 1. `scraper.scrape`
146
+ Collects the top *N* daily posts from each configured subreddit and appends them to a [Hugging Face **Parquet** dataset](https://huggingface.co/datasets/hblim/top_reddit_posts_daily/tree/main/data_raw) (`data_raw`).
147
+
148
+ ```bash
149
+ python -m reddit_analysis.scraper.scrape \
150
+ --date 2025-04-22 # YYYY‑MM‑DD (defaults to today)
151
+ --limit 100 # optional, posts/subreddit
152
+ --overwrite # re‑upload if already exists
153
+ ```
154
+
155
+ * **Dependencies:** [`praw`](https://praw.readthedocs.io/), `huggingface‑hub`
156
+ * **De‑duplication:** handled server‑side via dataset row `post_id` as primary key—**no local state needed**.
157
+
158
+ ---
159
+
160
+ ### 2. `inference.score`
161
+ Downloads one day of raw posts, sends raw text consisting of `title + selftext` to the **Replicate** hosted model in batches for optimized parallel computation, and pushes a scored Parquet file to a separate [Hugging Face **Parquet** dataset](https://huggingface.co/datasets/hblim/top_reddit_posts_daily/tree/main/data_scored) `data_scored`.
162
+
163
+ ```bash
164
+ python -m reddit_analysis.inference.score \
165
+ --date 2025-04-22 \
166
+ --model your‑org/sentiment‑model:latest \
167
+ --batch_size 64 # Replicate parallelism
168
+ ```
169
+
170
+ * **Retry logic:** automatic back‑off for `httpx.RemoteProtocolError`.
171
+ ---
172
+
173
+ ### 3. `summarizer.summarize`
174
+ Aggregates daily sentiment by subreddit (mean & weighted means) and writes a compact CSV plus a Parquet summary.
175
+
176
+ ```bash
177
+ python -m reddit_analysis.summarizer.summarize \
178
+ --date 2025-04-22 \
179
+ --output_format csv parquet
180
+ ```
181
+
182
+ * **Uses `pandas` `groupby` (no default sorting—explicitly sorts by date + subreddit).**
183
+ * **Exports** are placed under `data_summary/` in the same HF dataset repo.
184
+
185
+ ---
186
+
187
+ ## Unit tests
188
+
189
+ The backend test‑suite lives in `reddit_analysis/tests/` and can be executed with **pytest**:
190
+
191
+ ```bash
192
+ pytest -q
193
+ ```
194
+
195
+ | File | What it tests | Key fixtures / mocks |
196
+ |------|--------------|----------------------|
197
+ | `tests/scraper/test_scrape.py` | Reddit fetch logic, de‑duplication rules | `praw.Reddit`, `huggingface_hub.HfApi` mocked via `monkeypatch` |
198
+ | `tests/inference/test_score.py` | Batching, error handling when HF token missing | Fake Replicate API via `httpx.MockTransport` |
199
+ | `tests/summarizer/test_summarize.py` | Correct aggregation & sorting | `pandas` dummy frames |
200
+
201
+ CI runs the tests on every push (see [daily.yml](#cicd--github-actions)).
202
+
203
+ ---
204
+
205
+ ## Front end (Streamlit)
206
+
207
+ `frontend/app.py` provides an interactive dashboard that:
208
+ 1. Downloads the daily summary CSVs from HF.
209
+ 2. Displays time‑series sentiment trends, top posts tables, and subreddit post counts.
210
+ 3. Allows filtering by date range or subreddit with responsive Altair charts.
211
+
212
+ ```bash
213
+ # Local preview
214
+ streamlit run frontend/app.py
215
+ ```
216
+ ---
217
+
218
+ ## CI/CD Github Actions
219
+
220
+ ### `.github/workflows/daily.yml`
221
+
222
+
223
+ | Step | What it does |
224
+ |------|--------------|
225
+ | **Setup** | Checkout repo, install Python 3.12, cache pip deps |
226
+ | **Tests** | `pytest --maxfail=1 --disable-warnings -q` |
227
+ | **Scrape** | `python -m reddit_analysis.scraper.scrape --date $DATE` |
228
+ | **Score** | `python -m reddit_analysis.inference.score --date $DATE` |
229
+ | **Summarize** | `python -m reddit_analysis.summarizer.summarize --date $DATE` |
230
+
231
+ *Trigger:* `cron: "0 23 * * *"` → 23:00 UTC (6 pm America/Chicago during daylight saving time) every day.
232
+
233
+ Secrets (`HF_TOKEN`, `REPLICATE_API_TOKEN`) are injected via **repository secrets** so the workflow can push to Hugging Face and call Replicate. The runner is completely stateless—every job starts on a fresh VM and writes data only to external storage (HF dataset).
234
+
235
+ ---
236
+
237
+ ## Monitoring with Grafana/Prometheus
238
+
239
+ Implemented a local lightweight Prometheus + Grafana stack; each pipeline stage pushes job_success and job_duration_seconds metrics. Dashboard surfaces run health & latency trends.
240
+
241
+ Example of success state:
242
+
243
+ ![Success](reddit_analysis/monitoring/dashboard_success.png)
244
+
245
+ Example of failure state:
246
+
247
+ ![Failure](reddit_analysis/monitoring/dashboard_failure.png)
248
+
249
+ ---
250
+
251
+ ## Extending / Customizing
252
+
253
+ * **Change subreddits** – edit the list in `config.yaml` or pass `--subreddits` to the scraper.
254
+ * **Swap sentiment models** – point `replicate_model` to any text‑classification model on Replicate with single‑sentence input.
255
+ * **Augment summaries** – create additional aggregator modules (e.g. keyword extraction) and add a new step in `daily.yml`.
256
+
app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Entry point for the Hugging Face Spaces application.
2
+
3
+ This tiny wrapper simply imports the Streamlit app defined in
4
+ `frontend/app.py`. Importing that module is enough to launch the UI
5
+ because the Streamlit code executes at import time.
6
+ """
7
+
8
+ # Importing `frontend.app` is sufficient to start the Streamlit app.
9
+ # The import is otherwise unused; the `noqa: F401` marker suppresses
10
+ # linters complaining about the unused import.
11
+ import frontend.app # noqa: F401 # pragma: no cover
config.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face repository configuration
2
+ repo_id: hblim/top_reddit_posts_daily
3
+ repo_type: dataset
4
+
5
+ # Inference configuration
6
+ replicate_model: halstonblim/distilbert-base-uncased-finetuned-sst-2-english:d1a897bcd8ebb23c5aab87317eee2d6c919cdc5cfbf9154140c5c2fb47344b8c
7
+ scored_dir: reddit_analysis/data/data_scored
8
+ hf_scored_dir: data_scored_subreddit
9
+ batch_size: 1024
10
+
11
+ # Scraper configuration
12
+ timezone: US/Central
13
+ raw_dir: reddit_analysis/data/data_raw
14
+ logs_dir: reddit_analysis/data/logs
15
+ hf_raw_dir: data_raw
16
+ push_to_hf: true
17
+ subreddits:
18
+ - name: artificial
19
+ post_limit: 100
20
+ comment_limit: 10
21
+ - name: LocalLLaMA
22
+ post_limit: 100
23
+ comment_limit: 10
24
+ - name: singularity
25
+ post_limit: 100
26
+ comment_limit: 10
27
+ - name: OpenAI
28
+ post_limit: 100
29
+ comment_limit: 10
frontend/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Frontend package exposing the Streamlit app for Hugging Face Spaces."""
frontend/app.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ import altair as alt
6
+ from datetime import date, timedelta, datetime
7
+
8
+ # Import from local modules
9
+ from data_utils import load_summary, load_day, get_subreddit_colors, get_last_updated_hf_caption
10
+ from text_analysis import keywords_for_df
11
+
12
+ st.set_page_config(page_title="Reddit Sentiment Trends", layout="wide")
13
+ st.title("Reddit Sentiment Monitor")
14
+
15
+
16
+ # ── Load & transform data ────────────────────────────────────────────────────
17
+ df = load_summary()
18
+ last_update_caption = get_last_updated_hf_caption()
19
+
20
+ # Get colors for each subreddit
21
+ subreddits = df["subreddit"].unique()
22
+ subreddit_colors = get_subreddit_colors(subreddits)
23
+
24
+ # Define time format to use across all charts
25
+ time_format = "%m/%d/%Y"
26
+
27
+ # Get date range from the dataset for the form
28
+ min_date = df["date"].min().date()
29
+ max_date = df["date"].max().date()
30
+
31
+ # ── Community weighted sentiment line chart for all subreddits ───────────────
32
+ st.subheader("Community Weighted Sentiment by Subreddit")
33
+
34
+ # Add date range selector for the time series
35
+ date_range = st.date_input(
36
+ "Select date range for time series",
37
+ (min_date, max_date),
38
+ min_value=min_date,
39
+ max_value=max_date
40
+ )
41
+ start_date, end_date = date_range
42
+ filtered_df = df[(df["date"].dt.date >= start_date) & (df["date"].dt.date <= end_date)]
43
+
44
+ # Add a multiselect widget for choosing which subreddits to display
45
+ selected_subs = st.multiselect(
46
+ "Select subreddits to display",
47
+ options=list(subreddits),
48
+ default=list(subreddits)
49
+ )
50
+ plot_df = filtered_df[filtered_df["subreddit"].isin(selected_subs)]
51
+
52
+ # Define hover selection for nearest point
53
+ nearest = alt.selection_single(
54
+ name="nearest",
55
+ on="mouseover",
56
+ nearest=True,
57
+ fields=["date"],
58
+ empty="none"
59
+ )
60
+
61
+ # Base chart for DRY encoding
62
+ base = alt.Chart(plot_df).encode(
63
+ x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format)),
64
+ y=alt.Y("community_weighted_sentiment:Q", title="Community Weighted Sentiment"),
65
+ color=alt.Color(
66
+ "subreddit:N",
67
+ scale=alt.Scale(domain=list(subreddits), range=list(subreddit_colors.values())),
68
+ legend=alt.Legend(
69
+ title="Subreddit",
70
+ orient="top",
71
+ direction="vertical",
72
+ columns=1
73
+ )
74
+ )
75
+ )
76
+
77
+ # Draw lines
78
+ line = base.mark_line()
79
+
80
+ # Invisible selectors to capture hover events
81
+ selectors = base.mark_point(opacity=0).add_selection(nearest)
82
+
83
+ # Draw highlighted points on hover
84
+ points_hover = base.mark_point(size=60).encode(
85
+ opacity=alt.condition(nearest, alt.value(1), alt.value(0))
86
+ )
87
+
88
+ # Tooltip rule and popup
89
+ tooltips = base.mark_rule(color="gray").encode(
90
+ tooltip=[
91
+ alt.Tooltip("subreddit:N", title="Subreddit"),
92
+ alt.Tooltip("date:T", title="Date", format=time_format),
93
+ alt.Tooltip("community_weighted_sentiment:Q", title="Sentiment", format=".2f")
94
+ ]
95
+ ).transform_filter(nearest)
96
+
97
+ # Layer everything and make interactive
98
+ hover_chart = alt.layer(line, selectors, points_hover, tooltips).properties(
99
+ height=300
100
+ ).interactive()
101
+
102
+ st.altair_chart(hover_chart, use_container_width=True)
103
+
104
+ # ── Bar chart for post counts by subreddit (side-by-side) ────────────────────
105
+ st.subheader("Daily Post Counts by Subreddit")
106
+
107
+ # Create grouped bar chart for post counts by date and subreddit
108
+ bar_chart = alt.Chart(df).mark_bar().encode(
109
+ x=alt.X("date:T", title="Date", axis=alt.Axis(format=time_format)),
110
+ y=alt.Y("count:Q", title="Post Count"),
111
+ xOffset="subreddit:N", # This creates the side-by-side grouping
112
+ color=alt.Color(
113
+ "subreddit:N",
114
+ scale=alt.Scale(domain=list(subreddits), range=list(subreddit_colors.values())),
115
+ legend=alt.Legend(title="Subreddit")
116
+ ),
117
+ tooltip=["date", "subreddit", "count"]
118
+ ).properties(height=300).interactive()
119
+
120
+ st.altair_chart(bar_chart, use_container_width=True)
121
+
122
+ # ── Latest metrics for each subreddit ─────────────────────────────────────────
123
+ st.subheader("Latest Metrics")
124
+
125
+ # Get the most recent data for each subreddit
126
+ latest_by_subreddit = df.sort_values("date").groupby("subreddit").last().reset_index()
127
+
128
+ # Display metrics in columns
129
+ cols = st.columns(len(latest_by_subreddit))
130
+ for i, (_, row) in enumerate(latest_by_subreddit.iterrows()):
131
+ with cols[i]:
132
+ st.markdown(f"**{row['subreddit']}**")
133
+ st.metric("Community Weighted", f"{row['community_weighted_sentiment']:.2f}")
134
+ st.metric("Posts", int(row["count"]))
135
+
136
+ # ── Analyze sentiment driving posts ─────────────────────────────────────
137
+ st.header("Analyze sentiment driving posts")
138
+ with st.form("analysis_form"):
139
+ col1, col2 = st.columns(2)
140
+ with col1:
141
+ selected_subreddit = st.selectbox("Select Subreddit", options=subreddits)
142
+ with col2:
143
+ selected_date = st.date_input(
144
+ "Select Date",
145
+ value=max_date,
146
+ min_value=min_date,
147
+ max_value=max_date
148
+ )
149
+ submit_button = st.form_submit_button("Analyze Posts")
150
+
151
+ if submit_button:
152
+ date_str = selected_date.strftime("%Y-%m-%d")
153
+ with st.spinner(f"Loading data for r/{selected_subreddit} on {date_str}..."):
154
+ posts_df = load_day(date_str, selected_subreddit)
155
+
156
+ if posts_df.empty:
157
+ st.error(f"No posts found for r/{selected_subreddit} on {date_str}")
158
+ else:
159
+ # Separate posts and comments
160
+ posts = posts_df[posts_df["type"] == "post"]
161
+ comments = posts_df[posts_df["type"] == "comment"]
162
+
163
+ # Overall summary metrics using engagement-adjusted sentiment (EAS)
164
+ n_posts = len(posts)
165
+ df_day = posts_df.copy()
166
+ df_day["score_num"] = pd.to_numeric(df_day["score"], errors="coerce").fillna(0)
167
+ weights_base_day = 1 + np.log1p(df_day["score_num"].clip(lower=0))
168
+ gamma_post = 0.3
169
+ weights_day = weights_base_day * np.where(df_day["type"] == "post", gamma_post, 1.0)
170
+ total_weight_day = weights_day.sum()
171
+ overall_eas = (weights_day * df_day["sentiment"]).sum() / weights_day.sum() if weights_day.sum() > 0 else 0
172
+ # Normalize daily weighted sentiment to range [-1,1]
173
+ overall_eas = 2 * overall_eas - 1
174
+ overall_score = df_day["score"].sum()
175
+
176
+ st.subheader(f"r/{selected_subreddit} on {date_str}")
177
+ c1, c2, c3 = st.columns(3)
178
+ c1.metric("Posts", n_posts)
179
+ c2.metric("Daily Weighted Sentiment, All Posts", f"{overall_eas:.2f}")
180
+ c3.metric("Total Score, All Posts", f"{overall_score:,}")
181
+
182
+ # Wrap analysis and rendering of top posts in a spinner
183
+ with st.spinner("Analyzing sentiment and rendering top posts..."):
184
+ # Build per-post analysis
185
+ analysis_rows = []
186
+ for _, post in posts.iterrows():
187
+ pid = post["post_id"]
188
+ text = post["text"]
189
+ # Gather comments for this post
190
+ post_comments = comments[comments["parent_id"] == f"t3_{pid}"]
191
+
192
+ # Combine post and comments for calculations
193
+ segment = pd.concat([pd.DataFrame([post]), post_comments], ignore_index=True)
194
+ # Compute engagement-adjusted sentiment for this post thread
195
+ segment_score_num = pd.to_numeric(segment["score"], errors="coerce").fillna(0)
196
+ weights_base = 1 + np.log1p(segment_score_num.clip(lower=0))
197
+ gamma_post = 0.3
198
+ weights_seg = weights_base * np.where(segment["type"] == "post", gamma_post, 1.0)
199
+ ws = (weights_seg * segment["sentiment"]).sum() / weights_seg.sum() if weights_seg.sum() > 0 else 0
200
+ # Normalize weighted sentiment of thread to range [-1,1]
201
+ ws = 2 * ws - 1
202
+ ts = segment["score"].sum()
203
+ nc = len(post_comments)
204
+
205
+ thread_weight_sum = weights_seg.sum()
206
+ contrib_weight = thread_weight_sum / total_weight_day if total_weight_day > 0 else 0
207
+ total_contribution = contrib_weight * ws
208
+
209
+ analysis_rows.append({
210
+ "post_id": pid,
211
+ "Post Keywords": "", # placeholder; will compute for top posts only
212
+ "Weighted Sentiment of Thread": ws,
213
+ "Contribution Weight": contrib_weight,
214
+ "Total Sentiment Contribution": total_contribution,
215
+ "# Comments": nc,
216
+ "Total Score": ts
217
+ })
218
+
219
+ analysis_df = pd.DataFrame(analysis_rows)
220
+ # Determine top 5 posts by contribution weight
221
+ top5 = analysis_df.sort_values("Contribution Weight", ascending=False).head(5).copy()
222
+ top5.reset_index(drop=True, inplace=True)
223
+
224
+ # Compute keywords only for top posts
225
+ for idx, row in top5.iterrows():
226
+ pid = row["post_id"]
227
+ post_text = posts[posts["post_id"] == pid].iloc[0]["text"]
228
+ kw = keywords_for_df(pd.DataFrame({"text": [post_text]}), top_n=2)
229
+ keywords_list = [k for k, _ in kw][:2]
230
+ top5.at[idx, "Post Keywords"] = ", ".join(keywords_list)
231
+
232
+ # Format numeric columns
233
+ for df_part in (top5,):
234
+ df_part["Weighted Sentiment of Thread"] = df_part["Weighted Sentiment of Thread"].map("{:.2f}".format)
235
+ df_part["Total Score"] = df_part["Total Score"].map("{:,}".format)
236
+ df_part["Contribution Weight"] = df_part["Contribution Weight"].map("{:.2%}".format)
237
+ df_part["Total Sentiment Contribution"] = df_part["Total Sentiment Contribution"].map("{:.4f}".format)
238
+
239
+ st.subheader("Top 5 Posts by Contribution Weight")
240
+ st.dataframe(
241
+ top5[["Post Keywords", "Weighted Sentiment of Thread", "Contribution Weight", "Total Sentiment Contribution", "# Comments", "Total Score"]],
242
+ use_container_width=True
243
+ )
244
+
245
+ st.subheader("Post Details")
246
+ for idx, row in top5.reset_index(drop=True).iterrows():
247
+ pid = row["post_id"]
248
+ post_obj = posts[posts["post_id"] == pid].iloc[0]
249
+ post_text = post_obj["text"]
250
+
251
+ with st.expander(f"{idx} - {post_text.split('\\n')[0][:50]}..."):
252
+ # Post Metrics
253
+ post_sent = post_obj["sentiment"]
254
+ # Normalize post sentiment to [-1,1]
255
+ post_sent_norm = 2 * post_sent - 1
256
+ post_score = post_obj["score"]
257
+ ps = pd.to_numeric(post_score, errors="coerce")
258
+ post_score_num = ps if (not np.isnan(ps) and ps >= 0) else 0
259
+ # Compute post weight
260
+ post_weight = (1 + np.log1p(post_score_num)) * gamma_post
261
+ st.markdown("**Post:**")
262
+ st.markdown(f"{post_text[:300]}{'...' if len(post_text) > 300 else ''}"
263
+ f"(Sentiment: {post_sent_norm:.2f}, Weight: {post_weight:.2f}, Score: {post_score:,})"
264
+ )
265
+ st.markdown("---")
266
+ # Display top 5 comments with metrics
267
+ top_comments = (
268
+ comments[comments["parent_id"] == f"t3_{pid}"]
269
+ .sort_values("score", ascending=False)
270
+ .head(5)
271
+ )
272
+ st.markdown("**Top Comments:**")
273
+ for c_idx, comment in top_comments.iterrows():
274
+ c_text = comment["text"]
275
+ # Normalize comment sentiment and compute weight
276
+ c_sent_norm = 2 * comment["sentiment"] - 1
277
+ c_score = comment["score"]
278
+ cs = pd.to_numeric(c_score, errors="coerce")
279
+ c_score_num = cs if (not np.isnan(cs) and cs >= 0) else 0
280
+ c_weight = 1 + np.log1p(c_score_num)
281
+ st.markdown(
282
+ f"{c_idx}. {c_text[:200]}{'...' if len(c_text) > 200 else ''} "
283
+ f"(Sentiment: {c_sent_norm:.2f}, Weight: {c_weight:.2f}, Score: {c_score:,})"
284
+ )
285
+
286
+ # Display the data source attribution
287
+ st.markdown(last_update_caption, unsafe_allow_html=True)
frontend/data_utils.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations
from pathlib import Path
import os
import yaml
import pandas as pd
import numpy as np
from huggingface_hub import HfApi
from datetime import datetime, timezone
import re

# Root directory of the project (two levels up from this file: frontend/..)
ROOT = Path(__file__).resolve().parent.parent

# Detect Streamlit runtime: streamlit is optional at import time and the
# flag below gates secret lookup and dotenv loading.
try:
    import streamlit as st
    has_streamlit = True
except ImportError:
    has_streamlit = False

# Load environment variables when running locally (explicit ENV=local or
# any non-Streamlit context).
if os.getenv("ENV") == "local" or not has_streamlit:
    from dotenv import load_dotenv
    load_dotenv(ROOT / ".env")

# Read Hugging Face dataset repo ID from config.yaml at the project root.
with open(ROOT / "config.yaml") as f:
    cfg = yaml.safe_load(f)
REPO_ID: str = cfg["repo_id"]

# Initialize Hugging Face API client (unauthenticated by default)
api = HfApi()

# URL for the summary CSV in the dataset.
# NOTE(review): not referenced elsewhere in this module — load_summary()
# downloads via the Hub API instead; confirm whether this is still needed.
CSV_URL = (
    f"https://huggingface.co/datasets/{REPO_ID}/resolve/main/subreddit_daily_summary.csv"
)
38
+
39
+
40
+ def get_secret(key: str, default=None) -> str | None:
41
+ """Fetch a secret from environment variables or Streamlit secrets."""
42
+ val = os.getenv(key)
43
+ if val is None and has_streamlit:
44
+ val = st.secrets.get(key, default)
45
+ return val
46
+
47
+
48
# NOTE(review): redundant re-import — streamlit was already imported inside
# the guarded try/except above. This unconditional import defeats the
# has_streamlit fallback: if streamlit is absent the module now crashes here
# (and again at the @st.cache_data decorators below). Confirm whether it can
# simply be removed.
import streamlit as st
49
+
50
@st.cache_data(ttl=6000, show_spinner=False)
def load_summary() -> pd.DataFrame:
    """Download the subreddit daily summary via the HF Hub API.

    Cached by Streamlit (ttl=6000 seconds).

    Returns:
        DataFrame with at least the columns date, subreddit, mean_sentiment,
        community_weighted_sentiment and count.

    Raises:
        ValueError: if any of the required columns is absent from the CSV.
    """
    # Download through the Hub API rather than the raw resolve URL.
    csv_path = api.hf_hub_download(
        repo_id=REPO_ID,
        filename="subreddit_daily_summary.csv",
        repo_type="dataset"
    )
    frame = pd.read_csv(csv_path, parse_dates=["date"])

    # Validate the schema before handing the frame to the UI layer.
    required = {"date", "subreddit", "mean_sentiment", "community_weighted_sentiment", "count"}
    missing = required - set(frame.columns)
    if missing:
        raise ValueError(f"Missing columns in summary CSV: {missing}")
    return frame
65
+
66
+
67
+ def _sanitize(name: str) -> str:
68
+ """
69
+ Make subreddit safe for filenames (removes slashes, spaces, etc.).
70
+ """
71
+ name = name.strip().lower()
72
+ name = re.sub(r"[^\w\-\.]", "_", name)
73
+ return name
74
+
75
+
76
@st.cache_data(show_spinner=False, ttl=60*60)
def load_day(date: str, subreddit: str) -> pd.DataFrame:
    """Lazy-download the parquet shard for one YYYY-MM-DD and return df slice.

    Args:
        date: Date string in YYYY-MM-DD format
        subreddit: Subreddit name to filter by

    Returns:
        DataFrame containing posts from the specified subreddit on the given day
    """
    # Shards are partitioned per (day, subreddit) with sanitized names,
    # so the downloaded file already holds only the requested slice.
    shard = f"data_scored_subreddit/{date}__{_sanitize(subreddit)}.parquet"
    local_path = api.hf_hub_download(REPO_ID, shard, repo_type="dataset")
    frame = pd.read_parquet(local_path)
    return frame.reset_index(drop=True)
94
+
95
+
96
def get_last_updated_hf(repo_id: str) -> datetime:
    """
    Retrieve the dataset repo's last modified datetime via HF Hub API.
    Returns a timezone-aware datetime in UTC.

    Args:
        repo_id: Hugging Face dataset repository ID.

    Returns:
        The repo's lastModified timestamp, always timezone-aware in UTC.
    """
    info = api.repo_info(repo_id=repo_id, repo_type="dataset")
    dt: datetime = info.lastModified  # already a datetime object
    if dt.tzinfo is None:
        # Bug fix: a naive timestamp was previously returned unchanged,
        # contradicting the documented contract. Naive values from the API
        # are assumed to be UTC — TODO confirm against huggingface_hub docs.
        dt = dt.replace(tzinfo=timezone.utc)
    else:
        dt = dt.astimezone(timezone.utc)
    return dt
106
+
107
+
108
def get_last_updated_hf_caption() -> str:
    """
    Build a markdown-formatted caption string showing the dataset source and last update.
    Uses REPO_ID and the HF Hub API to fetch the timestamp.
    """
    # Link target and human-readable refresh time
    url = f"https://huggingface.co/datasets/{REPO_ID}"
    stamp = get_last_updated_hf(REPO_ID).strftime("%Y-%m-%d %H:%M:%S UTC")

    # Small-print HTML caption rendered via st.markdown(unsafe_allow_html=True)
    return (
        "<small>"
        f"Data source: <a href='{url}' target='_blank'>{REPO_ID}</a> &bull; "
        f"Last updated: {stamp}"
        "</small>"
    )
125
+
126
+
127
def add_rolling(df: pd.DataFrame, window: int = 7) -> pd.DataFrame:
    """Add a rolling mean for community_weighted_sentiment over the specified window.

    The mean is computed per subreddit over date-sorted rows and written to
    a new column named roll_<window>. The input frame is not modified.
    """
    result = df.copy()
    column = f"roll_{window}"
    for _, group in result.groupby("subreddit"):
        ordered = group.sort_values("date")
        rolling_mean = ordered["community_weighted_sentiment"].rolling(window).mean()
        result.loc[ordered.index, column] = rolling_mean
    return result
135
+
136
+
137
def get_subreddit_colors(subreddits: list[str]) -> dict[str, str]:
    """Provide a consistent color map for each subreddit.

    Subreddits are sorted so the assignment is stable across calls; the
    palette wraps around when more than eight subreddits are given.
    """
    colors = [
        "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
        "#9467bd", "#8c564b", "#e377c2", "#7f7f7f",
    ]
    mapping: dict[str, str] = {}
    for position, name in enumerate(sorted(subreddits)):
        mapping[name] = colors[position % len(colors)]
    return mapping
frontend/text_analysis.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text analysis utilities for Reddit content insights.
3
+ Provides keyword extraction and similarity matching functions.
4
+ """
5
+ import pandas as pd
6
+ import spacy
7
+ from sentence_transformers import SentenceTransformer
8
+ from keybert import KeyBERT
9
+
10
+ # Initialize spaCy and sentence transformer models
11
+ try:
12
+ nlp = spacy.load("en_core_web_sm")
13
+ except OSError:
14
+ import streamlit as st
15
+ with st.spinner("Downloading NLP model (first run only)..."):
16
+ from spacy.cli import download
17
+ download("en_core_web_sm")
18
+ nlp = spacy.load("en_core_web_sm")
19
+
20
+ # Cache models at module scope for reuse
21
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
22
+ kw_model = KeyBERT(embedder)
23
+
24
def keywords_for_df(df: pd.DataFrame, top_n=5):
    """
    Extract keywords from a DataFrame containing Reddit posts.

    Args:
        df: DataFrame with a 'text' column containing post content
        top_n: Number of top keywords to return

    Returns:
        List of (keyword, score) tuples
    """
    if df.empty:
        return []

    # Concatenate every post's text into one lowercase document
    corpus = " ".join(df["text"].astype(str))
    parsed = nlp(corpus.lower())

    # Candidate phrases: noun chunks plus a few entity types of interest
    chunks = [chunk.text for chunk in parsed.noun_chunks]
    entities = [
        ent.text
        for ent in parsed.ents
        if ent.label_ in {"PRODUCT", "EVENT", "ORG", "GPE"}
    ]
    candidates = " ".join(chunks + entities)

    # Blank out common moderator/boilerplate terms before extraction.
    # Note: plain substring replacement, so these also blank matches
    # inside longer words — intentional quick filter, kept as-is.
    for term in ['blog','topic','locked','author','moderator','error','bot','comments','archive','support','discord']:
        candidates = candidates.replace(term, " ")

    # KeyBERT extraction with MMR to keep the phrases diverse
    return kw_model.extract_keywords(
        candidates,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        use_mmr=True,
        diversity=0.8,
        top_n=top_n
    )
+ )
notebooks/keyword_extraction.ipynb ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9f7cc561-f375-4cd6-953f-65af221bc1ae",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Keyword Extraction Analysis\n",
9
+ "Analyze buzzwords driving sentiment on any given day"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 461,
15
+ "id": "59e2493c-10e5-402f-9875-d07d989cd451",
16
+ "metadata": {
17
+ "execution": {
18
+ "iopub.execute_input": "2025-04-30T02:14:47.665748Z",
19
+ "iopub.status.busy": "2025-04-30T02:14:47.665748Z",
20
+ "iopub.status.idle": "2025-04-30T02:14:47.674765Z",
21
+ "shell.execute_reply": "2025-04-30T02:14:47.673749Z",
22
+ "shell.execute_reply.started": "2025-04-30T02:14:47.665748Z"
23
+ }
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "import os\n",
28
+ "import pandas as pd\n",
29
+ "import numpy as np"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "markdown",
34
+ "id": "d5b34940-8c46-421b-b00b-0badaca194fc",
35
+ "metadata": {},
36
+ "source": [
37
+ "### Download data from HF Hub"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 462,
43
+ "id": "03be6fde-e68b-4026-8f90-cd6f5d5f21db",
44
+ "metadata": {
45
+ "execution": {
46
+ "iopub.execute_input": "2025-04-30T02:14:49.167723Z",
47
+ "iopub.status.busy": "2025-04-30T02:14:49.167723Z",
48
+ "iopub.status.idle": "2025-04-30T02:14:50.725451Z",
49
+ "shell.execute_reply": "2025-04-30T02:14:50.725451Z",
50
+ "shell.execute_reply.started": "2025-04-30T02:14:49.167723Z"
51
+ }
52
+ },
53
+ "outputs": [
54
+ {
55
+ "name": "stdout",
56
+ "output_type": "stream",
57
+ "text": [
58
+ "Total records across 16 days: 4520\n"
59
+ ]
60
+ }
61
+ ],
62
+ "source": [
63
+ "from huggingface_hub import HfApi\n",
64
+ "\n",
65
+ "api = HfApi()\n",
66
+ "all_files = api.list_repo_files(\"hblim/top_reddit_posts_daily\", repo_type=\"dataset\")\n",
67
+ "parquet_files = sorted([f for f in all_files if f.startswith('data_scored') and f.endswith(\".parquet\")])\n",
68
+ "\n",
69
+ "df = []\n",
70
+ "for shard in parquet_files:\n",
71
+ " local_path = api.hf_hub_download(repo_id=\"hblim/top_reddit_posts_daily\", filename=shard, repo_type=\"dataset\")\n",
72
+ " file_date = os.path.splitext(os.path.basename(local_path))[0]\n",
73
+ " df.append(pd.read_parquet(local_path).assign(filedate=file_date))\n",
74
+ "df = pd.concat(df, ignore_index=True)\n",
75
+ "print(f\"Total records across {df.filedate.nunique()} days: {len(df)}\")"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 464,
81
+ "id": "dbcaf06b-9c29-4913-b3a9-938017eb6ffd",
82
+ "metadata": {
83
+ "execution": {
84
+ "iopub.execute_input": "2025-04-30T02:14:51.655177Z",
85
+ "iopub.status.busy": "2025-04-30T02:14:51.654664Z",
86
+ "iopub.status.idle": "2025-04-30T02:14:51.669190Z",
87
+ "shell.execute_reply": "2025-04-30T02:14:51.669190Z",
88
+ "shell.execute_reply.started": "2025-04-30T02:14:51.655177Z"
89
+ }
90
+ },
91
+ "outputs": [
92
+ {
93
+ "data": {
94
+ "text/html": [
95
+ "<div>\n",
96
+ "<style scoped>\n",
97
+ " .dataframe tbody tr th:only-of-type {\n",
98
+ " vertical-align: middle;\n",
99
+ " }\n",
100
+ "\n",
101
+ " .dataframe tbody tr th {\n",
102
+ " vertical-align: top;\n",
103
+ " }\n",
104
+ "\n",
105
+ " .dataframe thead th {\n",
106
+ " text-align: right;\n",
107
+ " }\n",
108
+ "</style>\n",
109
+ "<table border=\"1\" class=\"dataframe\">\n",
110
+ " <thead>\n",
111
+ " <tr style=\"text-align: right;\">\n",
112
+ " <th></th>\n",
113
+ " <th>subreddit</th>\n",
114
+ " <th>created_at</th>\n",
115
+ " <th>retrieved_at</th>\n",
116
+ " <th>type</th>\n",
117
+ " <th>text</th>\n",
118
+ " <th>score</th>\n",
119
+ " <th>post_id</th>\n",
120
+ " <th>parent_id</th>\n",
121
+ " <th>sentiment</th>\n",
122
+ " <th>confidence</th>\n",
123
+ " <th>filedate</th>\n",
124
+ " </tr>\n",
125
+ " </thead>\n",
126
+ " <tbody>\n",
127
+ " <tr>\n",
128
+ " <th>0</th>\n",
129
+ " <td>apple</td>\n",
130
+ " <td>2025-04-14 11:19:50-05:00</td>\n",
131
+ " <td>2025-04-14 23:44:27.136181-05:00</td>\n",
132
+ " <td>post</td>\n",
133
+ " <td>iPhone 16e Helps Apple Take Q1 2025 Top Spot in Global Smartphone Market\\n\\n</td>\n",
134
+ " <td>655</td>\n",
135
+ " <td>1jz2xrw</td>\n",
136
+ " <td>None</td>\n",
137
+ " <td>1</td>\n",
138
+ " <td>0.9971</td>\n",
139
+ " <td>2025-04-14</td>\n",
140
+ " </tr>\n",
141
+ " <tr>\n",
142
+ " <th>1</th>\n",
143
+ " <td>apple</td>\n",
144
+ " <td>2025-04-14 11:00:16-05:00</td>\n",
145
+ " <td>2025-04-14 23:44:27.136181-05:00</td>\n",
146
+ " <td>comment</td>\n",
147
+ " <td>I've closed all rings every day starting on June 19 2015. This won't be a problem as long as I don't get run over or die.</td>\n",
148
+ " <td>9</td>\n",
149
+ " <td>mn2wpoi</td>\n",
150
+ " <td>t3_1jyzp05</td>\n",
151
+ " <td>1</td>\n",
152
+ " <td>0.9965</td>\n",
153
+ " <td>2025-04-14</td>\n",
154
+ " </tr>\n",
155
+ " <tr>\n",
156
+ " <th>2</th>\n",
157
+ " <td>apple</td>\n",
158
+ " <td>2025-04-14 11:59:56-05:00</td>\n",
159
+ " <td>2025-04-14 23:44:27.136181-05:00</td>\n",
160
+ " <td>post</td>\n",
161
+ " <td>Smartphone tariffs are coming back in ‘a month or two,’ says Trump admin\\n\\n</td>\n",
162
+ " <td>194</td>\n",
163
+ " <td>1jz3wsi</td>\n",
164
+ " <td>None</td>\n",
165
+ " <td>0</td>\n",
166
+ " <td>0.9829</td>\n",
167
+ " <td>2025-04-14</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>3</th>\n",
171
+ " <td>apple</td>\n",
172
+ " <td>2025-04-14 11:59:56-05:00</td>\n",
173
+ " <td>2025-04-14 23:44:27.136181-05:00</td>\n",
174
+ " <td>comment</td>\n",
175
+ " <td>This topic has been automatically locked due to being controversial and/or political by nature. However, the submission itself will remain accessible as long as it is related to Apple.\\n\\n\\nThis decision was made by a bot based on specific keywords. If you feel that this was in error, please report it to the moderators so that it can be reviewed.\\n \\n\\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/apple) if you have any questions or concerns.*</td>\n",
176
+ " <td>1</td>\n",
177
+ " <td>mn38mac</td>\n",
178
+ " <td>t3_1jz3wsi</td>\n",
179
+ " <td>0</td>\n",
180
+ " <td>0.9972</td>\n",
181
+ " <td>2025-04-14</td>\n",
182
+ " </tr>\n",
183
+ " <tr>\n",
184
+ " <th>4</th>\n",
185
+ " <td>apple</td>\n",
186
+ " <td>2025-04-14 18:04:42-05:00</td>\n",
187
+ " <td>2025-04-14 23:44:27.136181-05:00</td>\n",
188
+ " <td>post</td>\n",
189
+ " <td>Apple to Analyze User Data on Devices to Bolster AI Technology\\n\\n</td>\n",
190
+ " <td>69</td>\n",
191
+ " <td>1jzcpwz</td>\n",
192
+ " <td>None</td>\n",
193
+ " <td>1</td>\n",
194
+ " <td>0.9976</td>\n",
195
+ " <td>2025-04-14</td>\n",
196
+ " </tr>\n",
197
+ " </tbody>\n",
198
+ "</table>\n",
199
+ "</div>"
200
+ ],
201
+ "text/plain": [
202
+ " subreddit created_at retrieved_at \\\n",
203
+ "0 apple 2025-04-14 11:19:50-05:00 2025-04-14 23:44:27.136181-05:00 \n",
204
+ "1 apple 2025-04-14 11:00:16-05:00 2025-04-14 23:44:27.136181-05:00 \n",
205
+ "2 apple 2025-04-14 11:59:56-05:00 2025-04-14 23:44:27.136181-05:00 \n",
206
+ "3 apple 2025-04-14 11:59:56-05:00 2025-04-14 23:44:27.136181-05:00 \n",
207
+ "4 apple 2025-04-14 18:04:42-05:00 2025-04-14 23:44:27.136181-05:00 \n",
208
+ "\n",
209
+ " type \\\n",
210
+ "0 post \n",
211
+ "1 comment \n",
212
+ "2 post \n",
213
+ "3 comment \n",
214
+ "4 post \n",
215
+ "\n",
216
+ " text \\\n",
217
+ "0 iPhone 16e Helps Apple Take Q1 2025 Top Spot in Global Smartphone Market\\n\\n \n",
218
+ "1 I've closed all rings every day starting on June 19 2015. This won't be a problem as long as I don't get run over or die. \n",
219
+ "2 Smartphone tariffs are coming back in ‘a month or two,’ says Trump admin\\n\\n \n",
220
+ "3 This topic has been automatically locked due to being controversial and/or political by nature. However, the submission itself will remain accessible as long as it is related to Apple.\\n\\n\\nThis decision was made by a bot based on specific keywords. If you feel that this was in error, please report it to the moderators so that it can be reviewed.\\n \\n\\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/apple) if you have any questions or concerns.* \n",
221
+ "4 Apple to Analyze User Data on Devices to Bolster AI Technology\\n\\n \n",
222
+ "\n",
223
+ " score post_id parent_id sentiment confidence filedate \n",
224
+ "0 655 1jz2xrw None 1 0.9971 2025-04-14 \n",
225
+ "1 9 mn2wpoi t3_1jyzp05 1 0.9965 2025-04-14 \n",
226
+ "2 194 1jz3wsi None 0 0.9829 2025-04-14 \n",
227
+ "3 1 mn38mac t3_1jz3wsi 0 0.9972 2025-04-14 \n",
228
+ "4 69 1jzcpwz None 1 0.9976 2025-04-14 "
229
+ ]
230
+ },
231
+ "execution_count": 464,
232
+ "metadata": {},
233
+ "output_type": "execute_result"
234
+ }
235
+ ],
236
+ "source": [
237
+ "df.head()"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "markdown",
242
+ "id": "958f8e47-37c9-4d53-9d20-41a29a0c2714",
243
+ "metadata": {},
244
+ "source": [
245
+ "### Look at specific subreddit, date"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 562,
251
+ "id": "9f20145f-1ae0-4fa5-ac99-d00d9909cd76",
252
+ "metadata": {
253
+ "execution": {
254
+ "iopub.execute_input": "2025-04-30T02:54:23.431576Z",
255
+ "iopub.status.busy": "2025-04-30T02:54:23.431576Z",
256
+ "iopub.status.idle": "2025-04-30T02:54:23.436791Z",
257
+ "shell.execute_reply": "2025-04-30T02:54:23.436791Z",
258
+ "shell.execute_reply.started": "2025-04-30T02:54:23.431576Z"
259
+ }
260
+ },
261
+ "outputs": [],
262
+ "source": [
263
+ "# 1. Filter your dataframe\n",
264
+ "date = '2025-04-14'\n",
265
+ "subreddit = 'apple'\n",
266
+ "day_sub = (df['filedate'] == date) & (df['subreddit'] == subreddit) "
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": 589,
272
+ "id": "2c286057-13db-49f4-a5d0-178a5d004b53",
273
+ "metadata": {
274
+ "execution": {
275
+ "iopub.execute_input": "2025-04-30T03:24:38.658484Z",
276
+ "iopub.status.busy": "2025-04-30T03:24:38.658484Z",
277
+ "iopub.status.idle": "2025-04-30T03:24:38.669335Z",
278
+ "shell.execute_reply": "2025-04-30T03:24:38.669335Z",
279
+ "shell.execute_reply.started": "2025-04-30T03:24:38.658484Z"
280
+ }
281
+ },
282
+ "outputs": [
283
+ {
284
+ "name": "stdout",
285
+ "output_type": "stream",
286
+ "text": [
287
+ "Daily aggregated sentiment stats\n",
288
+ "Community Weighted Sentiment = 0.3353147742165998\n",
289
+ "Average Sentiment = 0.43902439024390244\n"
290
+ ]
291
+ }
292
+ ],
293
+ "source": [
294
+ "dftest = df[day_sub]\n",
295
+ "print(\"Daily aggregated sentiment stats\")\n",
296
+ "print(\"Community Weighted Sentiment =\",((2 * dftest['sentiment'] - 1) * np.log1p(dftest['score'].clip(0,None))).mean())\n",
297
+ "print(\"Average Sentiment =\",dftest['sentiment'].mean())\n",
298
+ "# dftest.sort_values('score',ascending=False)\n",
299
+ "# dftest.groupby('parent_id').agg({'sentiment': ['mean','sum','count']})"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "markdown",
304
+ "id": "85d4797e-c9dd-45b0-b72f-cb8d279f5ebc",
305
+ "metadata": {},
306
+ "source": [
307
+ "### Use KeyBERT and sentiment transformers model to extract keywords"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": 587,
313
+ "id": "8c0a4f6a-d34a-4397-add9-1d68e670eaf7",
314
+ "metadata": {
315
+ "execution": {
316
+ "iopub.execute_input": "2025-04-30T02:59:03.237586Z",
317
+ "iopub.status.busy": "2025-04-30T02:59:03.237586Z",
318
+ "iopub.status.idle": "2025-04-30T02:59:05.388763Z",
319
+ "shell.execute_reply": "2025-04-30T02:59:05.388763Z",
320
+ "shell.execute_reply.started": "2025-04-30T02:59:03.237586Z"
321
+ },
322
+ "scrolled": true
323
+ },
324
+ "outputs": [
325
+ {
326
+ "name": "stdout",
327
+ "output_type": "stream",
328
+ "text": [
329
+ "[('smartphone market', 0.5024), ('command thanks universal', 0.0662), ('dimensions leak years', 0.0196), ('wwdc non paywall', -0.0052), ('animations new techniques', -0.0178)]\n"
330
+ ]
331
+ }
332
+ ],
333
+ "source": [
334
+ "from keybert import KeyBERT\n",
335
+ "from sentence_transformers import SentenceTransformer\n",
336
+ "import spacy\n",
337
+ "\n",
338
+ "raw_text = \" \".join(df.loc[day_sub, 'text'].astype(str))\n",
339
+ "\n",
340
+ "# 2. Load spaCy with parser enabled for noun_chunks\n",
341
+ "nlp = spacy.load(\"en_core_web_sm\") # keep the parser on\n",
342
+ "doc = nlp(raw_text.lower())\n",
343
+ "\n",
344
+ "# 3. Build candidate phrases\n",
345
+ "candidates = \" \".join(\n",
346
+ " [chunk.text for chunk in doc.noun_chunks]\n",
347
+ " + [ent.text for ent in doc.ents if ent.label_ in {\"PRODUCT\",\"EVENT\",}]\n",
348
+ ")\n",
349
+ "\n",
350
+ "for exclude in ['google','pixel','android','apple','rationale','advice','blog','topic','locked','author','moderator','error','bot','comments','archive','support','discord']:\n",
351
+ " candidates = candidates.replace(exclude,' ')\n",
352
+ "\n",
353
+ "# 4. Keyword extraction with local embeddings\n",
354
+ "model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
355
+ "kw_model = KeyBERT(model)\n",
356
+ "keywords = kw_model.extract_keywords(\n",
357
+ " candidates,\n",
358
+ " keyphrase_ngram_range=(1, 3),\n",
359
+ " stop_words=\"english\",\n",
360
+ " use_mmr=True,\n",
361
+ " diversity=0.9,\n",
362
+ " top_n=5\n",
363
+ ")\n",
364
+ "\n",
365
+ "print(keywords)\n"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "markdown",
370
+ "id": "3ea178ad-676c-4362-b817-e66563dce6de",
371
+ "metadata": {},
372
+ "source": [
373
+ "### Ensure keywords actually match to posts or comments based on cosine similarity"
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "code",
378
+ "execution_count": 591,
379
+ "id": "63e1b61f-cdcb-4c5a-9024-531e26eee495",
380
+ "metadata": {
381
+ "execution": {
382
+ "iopub.execute_input": "2025-04-30T03:27:16.957826Z",
383
+ "iopub.status.busy": "2025-04-30T03:27:16.957826Z",
384
+ "iopub.status.idle": "2025-04-30T03:27:17.668631Z",
385
+ "shell.execute_reply": "2025-04-30T03:27:17.667592Z",
386
+ "shell.execute_reply.started": "2025-04-30T03:27:16.957826Z"
387
+ }
388
+ },
389
+ "outputs": [
390
+ {
391
+ "data": {
392
+ "text/html": [
393
+ "<div>\n",
394
+ "<style scoped>\n",
395
+ " .dataframe tbody tr th:only-of-type {\n",
396
+ " vertical-align: middle;\n",
397
+ " }\n",
398
+ "\n",
399
+ " .dataframe tbody tr th {\n",
400
+ " vertical-align: top;\n",
401
+ " }\n",
402
+ "\n",
403
+ " .dataframe thead th {\n",
404
+ " text-align: right;\n",
405
+ " }\n",
406
+ "</style>\n",
407
+ "<table border=\"1\" class=\"dataframe\">\n",
408
+ " <thead>\n",
409
+ " <tr style=\"text-align: right;\">\n",
410
+ " <th></th>\n",
411
+ " <th>keyword</th>\n",
412
+ " <th>mean_sentiment</th>\n",
413
+ " <th>community_weighted_sentiment</th>\n",
414
+ " <th>n_posts</th>\n",
415
+ " <th>total_score</th>\n",
416
+ " </tr>\n",
417
+ " </thead>\n",
418
+ " <tbody>\n",
419
+ " <tr>\n",
420
+ " <th>0</th>\n",
421
+ " <td>smartphone market</td>\n",
422
+ " <td>-0.076923</td>\n",
423
+ " <td>0.841451</td>\n",
424
+ " <td>13</td>\n",
425
+ " <td>2798</td>\n",
426
+ " </tr>\n",
427
+ " <tr>\n",
428
+ " <th>1</th>\n",
429
+ " <td>dimensions leak years</td>\n",
430
+ " <td>-1.000000</td>\n",
431
+ " <td>-5.939423</td>\n",
432
+ " <td>2</td>\n",
433
+ " <td>804</td>\n",
434
+ " </tr>\n",
435
+ " <tr>\n",
436
+ " <th>2</th>\n",
437
+ " <td>animations new techniques</td>\n",
438
+ " <td>1.000000</td>\n",
439
+ " <td>2.944439</td>\n",
440
+ " <td>1</td>\n",
441
+ " <td>18</td>\n",
442
+ " </tr>\n",
443
+ " <tr>\n",
444
+ " <th>3</th>\n",
445
+ " <td>wwdc non paywall</td>\n",
446
+ " <td>-1.000000</td>\n",
447
+ " <td>-2.397895</td>\n",
448
+ " <td>1</td>\n",
449
+ " <td>10</td>\n",
450
+ " </tr>\n",
451
+ " </tbody>\n",
452
+ "</table>\n",
453
+ "</div>"
454
+ ],
455
+ "text/plain": [
456
+ " keyword mean_sentiment community_weighted_sentiment \\\n",
457
+ "0 smartphone market -0.076923 0.841451 \n",
458
+ "1 dimensions leak years -1.000000 -5.939423 \n",
459
+ "2 animations new techniques 1.000000 2.944439 \n",
460
+ "3 wwdc non paywall -1.000000 -2.397895 \n",
461
+ "\n",
462
+ " n_posts total_score \n",
463
+ "0 13 2798 \n",
464
+ "1 2 804 \n",
465
+ "2 1 18 \n",
466
+ "3 1 10 "
467
+ ]
468
+ },
469
+ "execution_count": 591,
470
+ "metadata": {},
471
+ "output_type": "execute_result"
472
+ }
473
+ ],
474
+ "source": [
475
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
476
+ "\n",
477
+ "# 1) Precompute embeddings for all texts in your day/subreddit slice\n",
478
+ "texts = df.loc[day_sub, 'text'].tolist()\n",
479
+ "text_embs = model.encode(texts, convert_to_tensor=False) # shape: (n_texts, 384)\n",
480
+ "\n",
481
+ "results = []\n",
482
+ "subsets = {}\n",
483
+ "# if you only want to test on a single kw, iterate keywords_test instead\n",
484
+ "for kw, _score in keywords: \n",
485
+ " # kw is now a string\n",
486
+ " kw_emb = model.encode(kw, convert_to_tensor=False) # shape: (384,)\n",
487
+ " kw_emb = kw_emb.reshape(1, -1) # shape: (1, 384)\n",
488
+ " \n",
489
+ " sims = cosine_similarity(text_embs, kw_emb).flatten() # OK: (n_texts,) vs (1,384)\n",
490
+ " \n",
491
+ " # rank or threshold as before\n",
492
+ " hits = df.loc[day_sub].iloc[sims.argsort()[::-1]]\n",
493
+ " mask = sims >= 0.3\n",
494
+ " \n",
495
+ " subset = df.loc[day_sub].iloc[mask]\n",
496
+ " if subset.empty:\n",
497
+ " continue\n",
498
+ " subsets[kw] = subset\n",
499
+ " \n",
500
+ " # compute sentiment stats on subset…\n",
501
+ " mean_sent = 2 * subset['sentiment'].mean() - 1\n",
502
+ " weighted = ((2 * subset['sentiment'] - 1) * np.log1p(subset['score'].clip(0,None))).mean()\n",
503
+ " total_score = subset['score'].sum()\n",
504
+ " results.append((kw, mean_sent, weighted, len(subset), total_score))\n",
505
+ "\n",
506
+ "summary = pd.DataFrame(results, columns=[\n",
507
+ " 'keyword', 'mean_sentiment', 'community_weighted_sentiment', 'n_posts' , 'total_score'\n",
508
+ "]).sort_values('total_score', ascending=False).reset_index(drop=True)\n",
509
+ "\n",
510
+ "summary"
511
+ ]
512
+ },
513
+ {
514
+ "cell_type": "markdown",
515
+ "id": "fbcbd745-8bbf-49f3-9fac-a4c9a056de6f",
516
+ "metadata": {},
517
+ "source": [
518
+ "### Manually inspect posts and comments associated with the keyword"
519
+ ]
520
+ },
521
+ {
522
+ "cell_type": "code",
523
+ "execution_count": 593,
524
+ "id": "1b2dadf2-c2b3-447a-bcdf-aec627635f49",
525
+ "metadata": {
526
+ "execution": {
527
+ "iopub.execute_input": "2025-04-30T03:27:33.238178Z",
528
+ "iopub.status.busy": "2025-04-30T03:27:33.237146Z",
529
+ "iopub.status.idle": "2025-04-30T03:27:33.245143Z",
530
+ "shell.execute_reply": "2025-04-30T03:27:33.245143Z",
531
+ "shell.execute_reply.started": "2025-04-30T03:27:33.238178Z"
532
+ }
533
+ },
534
+ "outputs": [
535
+ {
536
+ "data": {
537
+ "text/html": [
538
+ "<style type=\"text/css\">\n",
539
+ "</style>\n",
540
+ "<table id=\"T_d6ecc\">\n",
541
+ " <caption>KEYWORD = smartphone market</caption>\n",
542
+ " <thead>\n",
543
+ " <tr>\n",
544
+ " <th class=\"blank level0\" >&nbsp;</th>\n",
545
+ " <th id=\"T_d6ecc_level0_col0\" class=\"col_heading level0 col0\" >subreddit</th>\n",
546
+ " <th id=\"T_d6ecc_level0_col1\" class=\"col_heading level0 col1\" >created_at</th>\n",
547
+ " <th id=\"T_d6ecc_level0_col2\" class=\"col_heading level0 col2\" >retrieved_at</th>\n",
548
+ " <th id=\"T_d6ecc_level0_col3\" class=\"col_heading level0 col3\" >type</th>\n",
549
+ " <th id=\"T_d6ecc_level0_col4\" class=\"col_heading level0 col4\" >text</th>\n",
550
+ " <th id=\"T_d6ecc_level0_col5\" class=\"col_heading level0 col5\" >score</th>\n",
551
+ " <th id=\"T_d6ecc_level0_col6\" class=\"col_heading level0 col6\" >post_id</th>\n",
552
+ " <th id=\"T_d6ecc_level0_col7\" class=\"col_heading level0 col7\" >parent_id</th>\n",
553
+ " <th id=\"T_d6ecc_level0_col8\" class=\"col_heading level0 col8\" >sentiment</th>\n",
554
+ " <th id=\"T_d6ecc_level0_col9\" class=\"col_heading level0 col9\" >confidence</th>\n",
555
+ " <th id=\"T_d6ecc_level0_col10\" class=\"col_heading level0 col10\" >filedate</th>\n",
556
+ " </tr>\n",
557
+ " </thead>\n",
558
+ " <tbody>\n",
559
+ " <tr>\n",
560
+ " <th id=\"T_d6ecc_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
561
+ " <td id=\"T_d6ecc_row0_col0\" class=\"data row0 col0\" >apple</td>\n",
562
+ " <td id=\"T_d6ecc_row0_col1\" class=\"data row0 col1\" >2025-04-14 11:19:50-05:00</td>\n",
563
+ " <td id=\"T_d6ecc_row0_col2\" class=\"data row0 col2\" >2025-04-14 23:44:27.136181-05:00</td>\n",
564
+ " <td id=\"T_d6ecc_row0_col3\" class=\"data row0 col3\" >post</td>\n",
565
+ " <td id=\"T_d6ecc_row0_col4\" class=\"data row0 col4\" >iPhone 16e Helps Apple Take Q1 2025 Top Spot in Global Smartphone Market\n",
566
+ "\n",
567
+ "</td>\n",
568
+ " <td id=\"T_d6ecc_row0_col5\" class=\"data row0 col5\" >655</td>\n",
569
+ " <td id=\"T_d6ecc_row0_col6\" class=\"data row0 col6\" >1jz2xrw</td>\n",
570
+ " <td id=\"T_d6ecc_row0_col7\" class=\"data row0 col7\" >None</td>\n",
571
+ " <td id=\"T_d6ecc_row0_col8\" class=\"data row0 col8\" >1</td>\n",
572
+ " <td id=\"T_d6ecc_row0_col9\" class=\"data row0 col9\" >0.997100</td>\n",
573
+ " <td id=\"T_d6ecc_row0_col10\" class=\"data row0 col10\" >2025-04-14</td>\n",
574
+ " </tr>\n",
575
+ " <tr>\n",
576
+ " <th id=\"T_d6ecc_level0_row1\" class=\"row_heading level0 row1\" >2</th>\n",
577
+ " <td id=\"T_d6ecc_row1_col0\" class=\"data row1 col0\" >apple</td>\n",
578
+ " <td id=\"T_d6ecc_row1_col1\" class=\"data row1 col1\" >2025-04-14 11:59:56-05:00</td>\n",
579
+ " <td id=\"T_d6ecc_row1_col2\" class=\"data row1 col2\" >2025-04-14 23:44:27.136181-05:00</td>\n",
580
+ " <td id=\"T_d6ecc_row1_col3\" class=\"data row1 col3\" >post</td>\n",
581
+ " <td id=\"T_d6ecc_row1_col4\" class=\"data row1 col4\" >Smartphone tariffs are coming back in ‘a month or two,’ says Trump admin\n",
582
+ "\n",
583
+ "</td>\n",
584
+ " <td id=\"T_d6ecc_row1_col5\" class=\"data row1 col5\" >194</td>\n",
585
+ " <td id=\"T_d6ecc_row1_col6\" class=\"data row1 col6\" >1jz3wsi</td>\n",
586
+ " <td id=\"T_d6ecc_row1_col7\" class=\"data row1 col7\" >None</td>\n",
587
+ " <td id=\"T_d6ecc_row1_col8\" class=\"data row1 col8\" >0</td>\n",
588
+ " <td id=\"T_d6ecc_row1_col9\" class=\"data row1 col9\" >0.982900</td>\n",
589
+ " <td id=\"T_d6ecc_row1_col10\" class=\"data row1 col10\" >2025-04-14</td>\n",
590
+ " </tr>\n",
591
+ " <tr>\n",
592
+ " <th id=\"T_d6ecc_level0_row2\" class=\"row_heading level0 row2\" >3</th>\n",
593
+ " <td id=\"T_d6ecc_row2_col0\" class=\"data row2 col0\" >apple</td>\n",
594
+ " <td id=\"T_d6ecc_row2_col1\" class=\"data row2 col1\" >2025-04-14 11:59:56-05:00</td>\n",
595
+ " <td id=\"T_d6ecc_row2_col2\" class=\"data row2 col2\" >2025-04-14 23:44:27.136181-05:00</td>\n",
596
+ " <td id=\"T_d6ecc_row2_col3\" class=\"data row2 col3\" >comment</td>\n",
597
+ " <td id=\"T_d6ecc_row2_col4\" class=\"data row2 col4\" >This topic has been automatically locked due to being controversial and/or political by nature. However, the submission itself will remain accessible as long as it is related to Apple.\n",
598
+ "\n",
599
+ "\n",
600
+ "This decision was made by a bot based on specific keywords. If you feel that this was in error, please report it to the moderators so that it can be reviewed.\n",
601
+ " \n",
602
+ "\n",
603
+ "*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/apple) if you have any questions or concerns.*</td>\n",
604
+ " <td id=\"T_d6ecc_row2_col5\" class=\"data row2 col5\" >1</td>\n",
605
+ " <td id=\"T_d6ecc_row2_col6\" class=\"data row2 col6\" >mn38mac</td>\n",
606
+ " <td id=\"T_d6ecc_row2_col7\" class=\"data row2 col7\" >t3_1jz3wsi</td>\n",
607
+ " <td id=\"T_d6ecc_row2_col8\" class=\"data row2 col8\" >0</td>\n",
608
+ " <td id=\"T_d6ecc_row2_col9\" class=\"data row2 col9\" >0.997200</td>\n",
609
+ " <td id=\"T_d6ecc_row2_col10\" class=\"data row2 col10\" >2025-04-14</td>\n",
610
+ " </tr>\n",
611
+ " <tr>\n",
612
+ " <th id=\"T_d6ecc_level0_row3\" class=\"row_heading level0 row3\" >23</th>\n",
613
+ " <td id=\"T_d6ecc_row3_col0\" class=\"data row3 col0\" >apple</td>\n",
614
+ " <td id=\"T_d6ecc_row3_col1\" class=\"data row3 col1\" >2025-04-14 11:43:39-05:00</td>\n",
615
+ " <td id=\"T_d6ecc_row3_col2\" class=\"data row3 col2\" >2025-04-14 23:44:27.136181-05:00</td>\n",
616
+ " <td id=\"T_d6ecc_row3_col3\" class=\"data row3 col3\" >comment</td>\n",
617
+ " <td id=\"T_d6ecc_row3_col4\" class=\"data row3 col4\" >My boss purchased me a 16e to use for iOS development. It may not make much sense for end users but it is a nearly perfect corporate phone.</td>\n",
618
+ " <td id=\"T_d6ecc_row3_col5\" class=\"data row3 col5\" >309</td>\n",
619
+ " <td id=\"T_d6ecc_row3_col6\" class=\"data row3 col6\" >mn35a6r</td>\n",
620
+ " <td id=\"T_d6ecc_row3_col7\" class=\"data row3 col7\" >t3_1jz2xrw</td>\n",
621
+ " <td id=\"T_d6ecc_row3_col8\" class=\"data row3 col8\" >1</td>\n",
622
+ " <td id=\"T_d6ecc_row3_col9\" class=\"data row3 col9\" >0.998600</td>\n",
623
+ " <td id=\"T_d6ecc_row3_col10\" class=\"data row3 col10\" >2025-04-14</td>\n",
624
+ " </tr>\n",
625
+ " <tr>\n",
626
+ " <th id=\"T_d6ecc_level0_row4\" class=\"row_heading level0 row4\" >24</th>\n",
627
+ " <td id=\"T_d6ecc_row4_col0\" class=\"data row4 col0\" >apple</td>\n",
628
+ " <td id=\"T_d6ecc_row4_col1\" class=\"data row4 col1\" >2025-04-14 11:24:30-05:00</td>\n",
629
+ " <td id=\"T_d6ecc_row4_col2\" class=\"data row4 col2\" >2025-04-14 23:44:27.136181-05:00</td>\n",
630
+ " <td id=\"T_d6ecc_row4_col3\" class=\"data row4 col3\" >comment</td>\n",
631
+ " <td id=\"T_d6ecc_row4_col4\" class=\"data row4 col4\" >Despite what the tech influencers might have said, Apple clearly knows what they’re doing.</td>\n",
632
+ " <td id=\"T_d6ecc_row4_col5\" class=\"data row4 col5\" >283</td>\n",
633
+ " <td id=\"T_d6ecc_row4_col6\" class=\"data row4 col6\" >mn31i1a</td>\n",
634
+ " <td id=\"T_d6ecc_row4_col7\" class=\"data row4 col7\" >t3_1jz2xrw</td>\n",
635
+ " <td id=\"T_d6ecc_row4_col8\" class=\"data row4 col8\" >1</td>\n",
636
+ " <td id=\"T_d6ecc_row4_col9\" class=\"data row4 col9\" >0.999600</td>\n",
637
+ " <td id=\"T_d6ecc_row4_col10\" class=\"data row4 col10\" >2025-04-14</td>\n",
638
+ " </tr>\n",
639
+ " </tbody>\n",
640
+ "</table>\n"
641
+ ],
642
+ "text/plain": [
643
+ "<pandas.io.formats.style.Styler at 0x26d025bdb80>"
644
+ ]
645
+ },
646
+ "execution_count": 593,
647
+ "metadata": {},
648
+ "output_type": "execute_result"
649
+ }
650
+ ],
651
+ "source": [
652
+ "keyword_index = 0\n",
653
+ "subsets[summary.keyword[keyword_index]].head().style.set_caption(f\"KEYWORD = {summary.keyword[keyword_index]}\")"
654
+ ]
655
+ },
656
+ {
657
+ "cell_type": "markdown",
658
+ "id": "1f8adbfe-3141-417f-bc13-922b7f1098a7",
659
+ "metadata": {},
660
+ "source": [
661
+ "### Helper tool: Retrieve post and comments by post_id"
662
+ ]
663
+ },
664
+ {
665
+ "cell_type": "code",
666
+ "execution_count": 534,
667
+ "id": "ea794d80-bf98-47c7-877d-1b2e6a626b12",
668
+ "metadata": {
669
+ "execution": {
670
+ "iopub.execute_input": "2025-04-30T02:46:15.966447Z",
671
+ "iopub.status.busy": "2025-04-30T02:46:15.966447Z",
672
+ "iopub.status.idle": "2025-04-30T02:46:15.979590Z",
673
+ "shell.execute_reply": "2025-04-30T02:46:15.979590Z",
674
+ "shell.execute_reply.started": "2025-04-30T02:46:15.966447Z"
675
+ }
676
+ },
677
+ "outputs": [
678
+ {
679
+ "data": {
680
+ "text/html": [
681
+ "<div>\n",
682
+ "<style scoped>\n",
683
+ " .dataframe tbody tr th:only-of-type {\n",
684
+ " vertical-align: middle;\n",
685
+ " }\n",
686
+ "\n",
687
+ " .dataframe tbody tr th {\n",
688
+ " vertical-align: top;\n",
689
+ " }\n",
690
+ "\n",
691
+ " .dataframe thead th {\n",
692
+ " text-align: right;\n",
693
+ " }\n",
694
+ "</style>\n",
695
+ "<table border=\"1\" class=\"dataframe\">\n",
696
+ " <thead>\n",
697
+ " <tr style=\"text-align: right;\">\n",
698
+ " <th></th>\n",
699
+ " <th>subreddit</th>\n",
700
+ " <th>created_at</th>\n",
701
+ " <th>retrieved_at</th>\n",
702
+ " <th>type</th>\n",
703
+ " <th>text</th>\n",
704
+ " <th>score</th>\n",
705
+ " <th>post_id</th>\n",
706
+ " <th>parent_id</th>\n",
707
+ " <th>sentiment</th>\n",
708
+ " <th>confidence</th>\n",
709
+ " <th>filedate</th>\n",
710
+ " </tr>\n",
711
+ " </thead>\n",
712
+ " <tbody>\n",
713
+ " <tr>\n",
714
+ " <th>2748</th>\n",
715
+ " <td>Android</td>\n",
716
+ " <td>2025-04-23 08:15:55-05:00</td>\n",
717
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
718
+ " <td>post</td>\n",
719
+ " <td>The new feature that gives higher memory priority to background tabs containing user edits, such as fillable forms or drafts (reducing the chance of them being killed and thus not losing your progress) is now available in Chrome Canary for Android.\\n\\n</td>\n",
720
+ " <td>224</td>\n",
721
+ " <td>1k5ywd6</td>\n",
722
+ " <td>None</td>\n",
723
+ " <td>0</td>\n",
724
+ " <td>0.9717</td>\n",
725
+ " <td>2025-04-23</td>\n",
726
+ " </tr>\n",
727
+ " <tr>\n",
728
+ " <th>2749</th>\n",
729
+ " <td>Android</td>\n",
730
+ " <td>2025-04-23 08:43:37-05:00</td>\n",
731
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
732
+ " <td>comment</td>\n",
733
+ " <td>Android's task refreshing is so bad and random that I've adapted my whole workflow around it by simply never trusting it and constantly copying whatever I input. If I write something and need switch away to another app even for a second, I copy the text before I do it. \\n\\nAndroid still does this even if you have 16GB of RAM!</td>\n",
734
+ " <td>1</td>\n",
735
+ " <td>molv84l</td>\n",
736
+ " <td>t3_1k5ywd6</td>\n",
737
+ " <td>0</td>\n",
738
+ " <td>0.9996</td>\n",
739
+ " <td>2025-04-23</td>\n",
740
+ " </tr>\n",
741
+ " <tr>\n",
742
+ " <th>2750</th>\n",
743
+ " <td>Android</td>\n",
744
+ " <td>2025-04-23 08:19:42-05:00</td>\n",
745
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
746
+ " <td>comment</td>\n",
747
+ " <td>I love that \"it reduces the chance\" but it doesn't eliminate the chance something I am working on it is killed...</td>\n",
748
+ " <td>1</td>\n",
749
+ " <td>molr08u</td>\n",
750
+ " <td>t3_1k5ywd6</td>\n",
751
+ " <td>1</td>\n",
752
+ " <td>0.9835</td>\n",
753
+ " <td>2025-04-23</td>\n",
754
+ " </tr>\n",
755
+ " <tr>\n",
756
+ " <th>2751</th>\n",
757
+ " <td>Android</td>\n",
758
+ " <td>2025-04-23 08:17:05-05:00</td>\n",
759
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
760
+ " <td>comment</td>\n",
761
+ " <td>Context: [**Background tabs containing user edits, such as filled forms or drafts, will soon have a higher memory priority in Chrome for Android, this will reduce the likelihood of these tabs been killed prematurely.**](https://old.reddit.com/r/Android/comments/1j3ktpg/background_tabs_containing_user_edits_such_as/)\\n\\n.\\n\\nThe patch responsible for this change [**was merged yesterday.**](https://chromium-review.googlesource.com/c/chromium/src/+/6321765)</td>\n",
762
+ " <td>1</td>\n",
763
+ " <td>molqjut</td>\n",
764
+ " <td>t3_1k5ywd6</td>\n",
765
+ " <td>0</td>\n",
766
+ " <td>0.9996</td>\n",
767
+ " <td>2025-04-23</td>\n",
768
+ " </tr>\n",
769
+ " <tr>\n",
770
+ " <th>2752</th>\n",
771
+ " <td>Android</td>\n",
772
+ " <td>2025-04-23 12:13:43-05:00</td>\n",
773
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
774
+ " <td>comment</td>\n",
775
+ " <td>Would love it if Android would let me \"pin\" apps by default that I didn't want to come out of memory. Would be amazing for apps that are slow to re-open.</td>\n",
776
+ " <td>1</td>\n",
777
+ " <td>mon1t9v</td>\n",
778
+ " <td>t3_1k5ywd6</td>\n",
779
+ " <td>0</td>\n",
780
+ " <td>0.9928</td>\n",
781
+ " <td>2025-04-23</td>\n",
782
+ " </tr>\n",
783
+ " <tr>\n",
784
+ " <th>2753</th>\n",
785
+ " <td>Android</td>\n",
786
+ " <td>2025-04-23 11:59:23-05:00</td>\n",
787
+ " <td>2025-04-23 19:03:18.888116-05:00</td>\n",
788
+ " <td>comment</td>\n",
789
+ " <td>Solid upgrade for Android users.</td>\n",
790
+ " <td>1</td>\n",
791
+ " <td>momytb8</td>\n",
792
+ " <td>t3_1k5ywd6</td>\n",
793
+ " <td>1</td>\n",
794
+ " <td>0.9996</td>\n",
795
+ " <td>2025-04-23</td>\n",
796
+ " </tr>\n",
797
+ " </tbody>\n",
798
+ "</table>\n",
799
+ "</div>"
800
+ ],
801
+ "text/plain": [
802
+ " subreddit created_at retrieved_at \\\n",
803
+ "2748 Android 2025-04-23 08:15:55-05:00 2025-04-23 19:03:18.888116-05:00 \n",
804
+ "2749 Android 2025-04-23 08:43:37-05:00 2025-04-23 19:03:18.888116-05:00 \n",
805
+ "2750 Android 2025-04-23 08:19:42-05:00 2025-04-23 19:03:18.888116-05:00 \n",
806
+ "2751 Android 2025-04-23 08:17:05-05:00 2025-04-23 19:03:18.888116-05:00 \n",
807
+ "2752 Android 2025-04-23 12:13:43-05:00 2025-04-23 19:03:18.888116-05:00 \n",
808
+ "2753 Android 2025-04-23 11:59:23-05:00 2025-04-23 19:03:18.888116-05:00 \n",
809
+ "\n",
810
+ " type \\\n",
811
+ "2748 post \n",
812
+ "2749 comment \n",
813
+ "2750 comment \n",
814
+ "2751 comment \n",
815
+ "2752 comment \n",
816
+ "2753 comment \n",
817
+ "\n",
818
+ " text \\\n",
819
+ "2748 The new feature that gives higher memory priority to background tabs containing user edits, such as fillable forms or drafts (reducing the chance of them being killed and thus not losing your progress) is now available in Chrome Canary for Android.\\n\\n \n",
820
+ "2749 Android's task refreshing is so bad and random that I've adapted my whole workflow around it by simply never trusting it and constantly copying whatever I input. If I write something and need switch away to another app even for a second, I copy the text before I do it. \\n\\nAndroid still does this even if you have 16GB of RAM! \n",
821
+ "2750 I love that \"it reduces the chance\" but it doesn't eliminate the chance something I am working on it is killed... \n",
822
+ "2751 Context: [**Background tabs containing user edits, such as filled forms or drafts, will soon have a higher memory priority in Chrome for Android, this will reduce the likelihood of these tabs been killed prematurely.**](https://old.reddit.com/r/Android/comments/1j3ktpg/background_tabs_containing_user_edits_such_as/)\\n\\n.\\n\\nThe patch responsible for this change [**was merged yesterday.**](https://chromium-review.googlesource.com/c/chromium/src/+/6321765) \n",
823
+ "2752 Would love it if Android would let me \"pin\" apps by default that I didn't want to come out of memory. Would be amazing for apps that are slow to re-open. \n",
824
+ "2753 Solid upgrade for Android users. \n",
825
+ "\n",
826
+ " score post_id parent_id sentiment confidence filedate \n",
827
+ "2748 224 1k5ywd6 None 0 0.9717 2025-04-23 \n",
828
+ "2749 1 molv84l t3_1k5ywd6 0 0.9996 2025-04-23 \n",
829
+ "2750 1 molr08u t3_1k5ywd6 1 0.9835 2025-04-23 \n",
830
+ "2751 1 molqjut t3_1k5ywd6 0 0.9996 2025-04-23 \n",
831
+ "2752 1 mon1t9v t3_1k5ywd6 0 0.9928 2025-04-23 \n",
832
+ "2753 1 momytb8 t3_1k5ywd6 1 0.9996 2025-04-23 "
833
+ ]
834
+ },
835
+ "execution_count": 534,
836
+ "metadata": {},
837
+ "output_type": "execute_result"
838
+ }
839
+ ],
840
+ "source": [
841
+ "postid = '1k5ywd6'\n",
842
+ "df[lambda x: ((x.post_id == postid) | (x.parent_id == f't3_{postid}'))]"
843
+ ]
844
+ }
845
+ ],
846
+ "metadata": {
847
+ "kernelspec": {
848
+ "display_name": "Python [conda env:reddit_streamlit]",
849
+ "language": "python",
850
+ "name": "conda-env-reddit_streamlit-py"
851
+ },
852
+ "language_info": {
853
+ "codemirror_mode": {
854
+ "name": "ipython",
855
+ "version": 3
856
+ },
857
+ "file_extension": ".py",
858
+ "mimetype": "text/x-python",
859
+ "name": "python",
860
+ "nbconvert_exporter": "python",
861
+ "pygments_lexer": "ipython3",
862
+ "version": "3.12.10"
863
+ }
864
+ },
865
+ "nbformat": 4,
866
+ "nbformat_minor": 5
867
+ }
notebooks/loading_data.ipynb ADDED
@@ -0,0 +1,785 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "dd911245-61d7-49db-9960-570f7feefe2b",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Loading Reddit Data\n",
9
+ "\n",
10
+ "This notebook has functions to read in parquet data from Hugging Face dataset [hblim/top_reddit_posts_daily](https://huggingface.co/datasets/hblim/top_reddit_posts_daily)\n",
11
+ "\n",
12
+ "I created this notebook to help me fix an issue where I overwrote `data_raw/2025-04-20.parquet` with 2025-04-23 data during testing\n",
13
+ "- I went to Hugging Face version history to see when the 2025-04-20 file was overwritten erroneously, and saw newer commits on 2025-04-23\n",
14
+ "- I cloned the Hugging Face dataset repo locally and checked out last correct version for the corrupted 2025-04-20.parquet file\n",
15
+ "- Verified that the data looked good (e.g. retrieved date > created date), and not duplicated anywhere else, and then pushed the correct version back to the main remote"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 97,
21
+ "id": "d5071073-274b-480d-8503-28d2292422d3",
22
+ "metadata": {
23
+ "execution": {
24
+ "iopub.execute_input": "2025-04-26T20:34:47.775286Z",
25
+ "iopub.status.busy": "2025-04-26T20:34:47.775286Z",
26
+ "iopub.status.idle": "2025-04-26T20:34:47.779253Z",
27
+ "shell.execute_reply": "2025-04-26T20:34:47.779253Z",
28
+ "shell.execute_reply.started": "2025-04-26T20:34:47.775286Z"
29
+ }
30
+ },
31
+ "outputs": [],
32
+ "source": [
33
+ "import os\n",
34
+ "import glob\n",
35
+ "import datetime\n",
36
+ "from pathlib import Path\n",
37
+ "from dotenv import load_dotenv\n",
38
+ "import pandas as pd\n",
39
+ "import pyarrow\n",
40
+ "\n",
41
+ "from huggingface_hub import HfApi"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "markdown",
46
+ "id": "9085f661-ba37-4715-b87b-3699cb75db2f",
47
+ "metadata": {},
48
+ "source": [
49
+ "Download all historical data"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 167,
55
+ "id": "0c14356b-721c-4048-8efb-f69d8eae4900",
56
+ "metadata": {
57
+ "execution": {
58
+ "iopub.execute_input": "2025-04-26T21:37:43.533282Z",
59
+ "iopub.status.busy": "2025-04-26T21:37:43.533282Z",
60
+ "iopub.status.idle": "2025-04-26T21:37:43.539908Z",
61
+ "shell.execute_reply": "2025-04-26T21:37:43.539908Z",
62
+ "shell.execute_reply.started": "2025-04-26T21:37:43.533282Z"
63
+ }
64
+ },
65
+ "outputs": [],
66
+ "source": [
67
+ "def load_reddit_data(source,folder='data_raw'):\n",
68
+ " \"\"\"\n",
69
+ " Load Reddit daily posts data either from Hugging Face Hub or local files.\n",
70
+ "\n",
71
+ " Args:\n",
72
+ "        source (str): \"hub\" to load from Hugging Face, \"local\" to load from local files\n",
73
+ "        folder (str): subfolder containing the parquet shards (default \"data_raw\")\n",
74
+ " Returns:\n",
75
+ " pd.DataFrame: Combined DataFrame of all posts\n",
76
+ " \"\"\"\n",
77
+ " df = []\n",
78
+ " \n",
79
+ " if source == \"hub\":\n",
80
+ " api = HfApi()\n",
81
+ " all_files = api.list_repo_files(\"hblim/top_reddit_posts_daily\", repo_type=\"dataset\")\n",
82
+ " parquet_files = sorted([f for f in all_files if f.startswith(folder) and f.endswith(\".parquet\")])\n",
83
+ "\n",
84
+ " for shard in parquet_files:\n",
85
+ "            local_path = api.hf_hub_download(repo_id=\"hblim/top_reddit_posts_daily\", filename=shard, repo_type=\"dataset\")\n",
86
+ " file_date = os.path.splitext(os.path.basename(local_path))[0]\n",
87
+ " df.append(pd.read_parquet(local_path).assign(filedate=file_date))\n",
88
+ "\n",
89
+ " elif source == \"local\":\n",
90
+ " cwd = os.getcwd()\n",
91
+ " local_folder = os.path.join(cwd, \"top_reddit_posts_daily\", folder)\n",
92
+ " local_files = sorted(glob.glob(os.path.join(local_folder, \"*.parquet\")))\n",
93
+ "\n",
94
+ " for local_path in local_files:\n",
95
+ " file_date = os.path.splitext(os.path.basename(local_path))[0]\n",
96
+ " df.append(pd.read_parquet(local_path).assign(filedate=file_date))\n",
97
+ "\n",
98
+ " else:\n",
99
+ " raise ValueError(f\"Unknown source: {source}. Use 'hub' or 'local'.\")\n",
100
+ "\n",
101
+ " df = pd.concat(df, ignore_index=True)\n",
102
+ " print(f\"Total records across {df.filedate.nunique()} days: {len(df)}\")\n",
103
+ " return df"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "markdown",
108
+ "id": "435fe428-0c99-4d77-9c9d-e9c9a974b16e",
109
+ "metadata": {},
110
+ "source": [
111
+ "### Check if raw and raw-deduplicated data line up\n",
112
+ "Is the raw data duplicated anywhere? Then the de-duplicated data should have fewer posts"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 164,
118
+ "id": "e5f35dcd-4267-4bef-bc6a-0e89937441c3",
119
+ "metadata": {
120
+ "execution": {
121
+ "iopub.execute_input": "2025-04-26T21:32:17.106045Z",
122
+ "iopub.status.busy": "2025-04-26T21:32:17.106045Z",
123
+ "iopub.status.idle": "2025-04-26T21:32:18.082758Z",
124
+ "shell.execute_reply": "2025-04-26T21:32:18.082758Z",
125
+ "shell.execute_reply.started": "2025-04-26T21:32:17.106045Z"
126
+ }
127
+ },
128
+ "outputs": [
129
+ {
130
+ "name": "stdout",
131
+ "output_type": "stream",
132
+ "text": [
133
+ "Total records across 13 days: 3666\n"
134
+ ]
135
+ }
136
+ ],
137
+ "source": [
138
+ "df = load_reddit_data(\"hub\")"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": 158,
144
+ "id": "ff824b99-c38f-4519-87df-54f9946cc20b",
145
+ "metadata": {
146
+ "execution": {
147
+ "iopub.execute_input": "2025-04-26T21:24:44.078709Z",
148
+ "iopub.status.busy": "2025-04-26T21:24:44.078709Z",
149
+ "iopub.status.idle": "2025-04-26T21:24:44.086147Z",
150
+ "shell.execute_reply": "2025-04-26T21:24:44.086147Z",
151
+ "shell.execute_reply.started": "2025-04-26T21:24:44.078709Z"
152
+ }
153
+ },
154
+ "outputs": [
155
+ {
156
+ "data": {
157
+ "text/plain": [
158
+ "filedate\n",
159
+ "2025-04-14 312\n",
160
+ "2025-04-15 258\n",
161
+ "2025-04-16 330\n",
162
+ "2025-04-17 324\n",
163
+ "2025-04-18 255\n",
164
+ "2025-04-19 232\n",
165
+ "2025-04-20 251\n",
166
+ "2025-04-21 233\n",
167
+ "2025-04-22 268\n",
168
+ "2025-04-23 331\n",
169
+ "2025-04-24 332\n",
170
+ "2025-04-25 309\n",
171
+ "2025-04-26 231\n",
172
+ "Name: subreddit, dtype: int64"
173
+ ]
174
+ },
175
+ "execution_count": 158,
176
+ "metadata": {},
177
+ "output_type": "execute_result"
178
+ }
179
+ ],
180
+ "source": [
181
+ "df.groupby('filedate').subreddit.count()"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 150,
187
+ "id": "6179d986-471b-40cd-bcf5-529f582315ee",
188
+ "metadata": {
189
+ "execution": {
190
+ "iopub.execute_input": "2025-04-26T21:16:54.824986Z",
191
+ "iopub.status.busy": "2025-04-26T21:16:54.824986Z",
192
+ "iopub.status.idle": "2025-04-26T21:16:54.842945Z",
193
+ "shell.execute_reply": "2025-04-26T21:16:54.842945Z",
194
+ "shell.execute_reply.started": "2025-04-26T21:16:54.824986Z"
195
+ }
196
+ },
197
+ "outputs": [
198
+ {
199
+ "name": "stdout",
200
+ "output_type": "stream",
201
+ "text": [
202
+ "Total records across 13 days: 3666\n"
203
+ ]
204
+ }
205
+ ],
206
+ "source": [
207
+ "df[\"retrieved_at\"] = pd.to_datetime(df[\"retrieved_at\"],utc=True)\n",
208
+ "\n",
209
+ "# Step 1: Find duplicate post_ids\n",
210
+ "duplicates = df[df.duplicated(subset=[\"post_id\"], keep=False)]\n",
211
+ "\n",
212
+ "# Report duplicates and their retrieved_at dates\n",
213
+ "duplicate_report = duplicates[[\"post_id\", \"retrieved_at\"]]\n",
214
+ "\n",
215
+ "# Step 2: De-duplicate keeping the first occurrence, sorted by retrieved_at\n",
216
+ "df_sorted = df.sort_values(by=\"retrieved_at\").reset_index(drop=True)\n",
217
+ "df_deduplicated = df_sorted.drop_duplicates(subset=[\"post_id\"], keep=\"first\")\n",
218
+ "\n",
219
+ "print(f\"Total records across {df_deduplicated.filedate.nunique()} days: {len(df_deduplicated)}\")"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": 153,
225
+ "id": "67a5fd89-8ddc-4247-ba22-8f411169487f",
226
+ "metadata": {
227
+ "execution": {
228
+ "iopub.execute_input": "2025-04-26T21:17:16.136315Z",
229
+ "iopub.status.busy": "2025-04-26T21:17:16.136315Z",
230
+ "iopub.status.idle": "2025-04-26T21:17:16.146070Z",
231
+ "shell.execute_reply": "2025-04-26T21:17:16.146070Z",
232
+ "shell.execute_reply.started": "2025-04-26T21:17:16.136315Z"
233
+ }
234
+ },
235
+ "outputs": [
236
+ {
237
+ "data": {
238
+ "text/html": [
239
+ "<div>\n",
240
+ "<style scoped>\n",
241
+ " .dataframe tbody tr th:only-of-type {\n",
242
+ " vertical-align: middle;\n",
243
+ " }\n",
244
+ "\n",
245
+ " .dataframe tbody tr th {\n",
246
+ " vertical-align: top;\n",
247
+ " }\n",
248
+ "\n",
249
+ " .dataframe thead th {\n",
250
+ " text-align: right;\n",
251
+ " }\n",
252
+ "</style>\n",
253
+ "<table border=\"1\" class=\"dataframe\">\n",
254
+ " <thead>\n",
255
+ " <tr style=\"text-align: right;\">\n",
256
+ " <th></th>\n",
257
+ " <th>original</th>\n",
258
+ " <th>deduplicated</th>\n",
259
+ " <th>matching</th>\n",
260
+ " </tr>\n",
261
+ " <tr>\n",
262
+ " <th>filedate</th>\n",
263
+ " <th></th>\n",
264
+ " <th></th>\n",
265
+ " <th></th>\n",
266
+ " </tr>\n",
267
+ " </thead>\n",
268
+ " <tbody>\n",
269
+ " <tr>\n",
270
+ " <th>2025-04-14</th>\n",
271
+ " <td>312</td>\n",
272
+ " <td>312</td>\n",
273
+ " <td>True</td>\n",
274
+ " </tr>\n",
275
+ " <tr>\n",
276
+ " <th>2025-04-15</th>\n",
277
+ " <td>258</td>\n",
278
+ " <td>258</td>\n",
279
+ " <td>True</td>\n",
280
+ " </tr>\n",
281
+ " <tr>\n",
282
+ " <th>2025-04-16</th>\n",
283
+ " <td>330</td>\n",
284
+ " <td>330</td>\n",
285
+ " <td>True</td>\n",
286
+ " </tr>\n",
287
+ " <tr>\n",
288
+ " <th>2025-04-17</th>\n",
289
+ " <td>324</td>\n",
290
+ " <td>324</td>\n",
291
+ " <td>True</td>\n",
292
+ " </tr>\n",
293
+ " <tr>\n",
294
+ " <th>2025-04-18</th>\n",
295
+ " <td>255</td>\n",
296
+ " <td>255</td>\n",
297
+ " <td>True</td>\n",
298
+ " </tr>\n",
299
+ " <tr>\n",
300
+ " <th>2025-04-19</th>\n",
301
+ " <td>232</td>\n",
302
+ " <td>232</td>\n",
303
+ " <td>True</td>\n",
304
+ " </tr>\n",
305
+ " <tr>\n",
306
+ " <th>2025-04-20</th>\n",
307
+ " <td>251</td>\n",
308
+ " <td>251</td>\n",
309
+ " <td>True</td>\n",
310
+ " </tr>\n",
311
+ " <tr>\n",
312
+ " <th>2025-04-21</th>\n",
313
+ " <td>233</td>\n",
314
+ " <td>233</td>\n",
315
+ " <td>True</td>\n",
316
+ " </tr>\n",
317
+ " <tr>\n",
318
+ " <th>2025-04-22</th>\n",
319
+ " <td>268</td>\n",
320
+ " <td>268</td>\n",
321
+ " <td>True</td>\n",
322
+ " </tr>\n",
323
+ " <tr>\n",
324
+ " <th>2025-04-23</th>\n",
325
+ " <td>331</td>\n",
326
+ " <td>331</td>\n",
327
+ " <td>True</td>\n",
328
+ " </tr>\n",
329
+ " <tr>\n",
330
+ " <th>2025-04-24</th>\n",
331
+ " <td>332</td>\n",
332
+ " <td>332</td>\n",
333
+ " <td>True</td>\n",
334
+ " </tr>\n",
335
+ " <tr>\n",
336
+ " <th>2025-04-25</th>\n",
337
+ " <td>309</td>\n",
338
+ " <td>309</td>\n",
339
+ " <td>True</td>\n",
340
+ " </tr>\n",
341
+ " <tr>\n",
342
+ " <th>2025-04-26</th>\n",
343
+ " <td>231</td>\n",
344
+ " <td>231</td>\n",
345
+ " <td>True</td>\n",
346
+ " </tr>\n",
347
+ " </tbody>\n",
348
+ "</table>\n",
349
+ "</div>"
350
+ ],
351
+ "text/plain": [
352
+ " original deduplicated matching\n",
353
+ "filedate \n",
354
+ "2025-04-14 312 312 True\n",
355
+ "2025-04-15 258 258 True\n",
356
+ "2025-04-16 330 330 True\n",
357
+ "2025-04-17 324 324 True\n",
358
+ "2025-04-18 255 255 True\n",
359
+ "2025-04-19 232 232 True\n",
360
+ "2025-04-20 251 251 True\n",
361
+ "2025-04-21 233 233 True\n",
362
+ "2025-04-22 268 268 True\n",
363
+ "2025-04-23 331 331 True\n",
364
+ "2025-04-24 332 332 True\n",
365
+ "2025-04-25 309 309 True\n",
366
+ "2025-04-26 231 231 True"
367
+ ]
368
+ },
369
+ "execution_count": 153,
370
+ "metadata": {},
371
+ "output_type": "execute_result"
372
+ }
373
+ ],
374
+ "source": [
375
+ "summary1 = df.groupby('filedate').subreddit.count()\n",
376
+ "summary2 = df_deduplicated.groupby('filedate').subreddit.count().loc[summary1.index]\n",
377
+ "\n",
378
+ "comparison = pd.DataFrame({\n",
379
+ " 'original': summary1,\n",
380
+ " 'deduplicated': summary2\n",
381
+ "})\n",
382
+ "\n",
383
+ "comparison['matching'] = comparison['original'] == comparison['deduplicated']\n",
384
+ "comparison"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "markdown",
389
+ "id": "e566f098-1402-41cd-a7ec-83900f91e6fb",
390
+ "metadata": {},
391
+ "source": [
392
+ "### Check if raw and summary data line up"
393
+ ]
394
+ },
395
+ {
396
+ "cell_type": "code",
397
+ "execution_count": 154,
398
+ "id": "056e51ff-dabd-474a-84c8-6a76f82a4488",
399
+ "metadata": {
400
+ "execution": {
401
+ "iopub.execute_input": "2025-04-26T21:17:19.652014Z",
402
+ "iopub.status.busy": "2025-04-26T21:17:19.645961Z",
403
+ "iopub.status.idle": "2025-04-26T21:17:19.790646Z",
404
+ "shell.execute_reply": "2025-04-26T21:17:19.790646Z",
405
+ "shell.execute_reply.started": "2025-04-26T21:17:19.652014Z"
406
+ }
407
+ },
408
+ "outputs": [],
409
+ "source": [
410
+ "df_summary = pd.read_csv(api.hf_hub_download(repo_id=\"hblim/top_reddit_posts_daily\", filename='subreddit_daily_summary.csv', repo_type=\"dataset\"))"
411
+ ]
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": 155,
416
+ "id": "321eb761-6278-47e8-89f4-24b06f5ddeb3",
417
+ "metadata": {
418
+ "execution": {
419
+ "iopub.execute_input": "2025-04-26T21:17:20.795827Z",
420
+ "iopub.status.busy": "2025-04-26T21:17:20.795309Z",
421
+ "iopub.status.idle": "2025-04-26T21:17:20.805781Z",
422
+ "shell.execute_reply": "2025-04-26T21:17:20.804717Z",
423
+ "shell.execute_reply.started": "2025-04-26T21:17:20.795827Z"
424
+ }
425
+ },
426
+ "outputs": [
427
+ {
428
+ "data": {
429
+ "text/html": [
430
+ "<div>\n",
431
+ "<style scoped>\n",
432
+ " .dataframe tbody tr th:only-of-type {\n",
433
+ " vertical-align: middle;\n",
434
+ " }\n",
435
+ "\n",
436
+ " .dataframe tbody tr th {\n",
437
+ " vertical-align: top;\n",
438
+ " }\n",
439
+ "\n",
440
+ " .dataframe thead th {\n",
441
+ " text-align: right;\n",
442
+ " }\n",
443
+ "</style>\n",
444
+ "<table border=\"1\" class=\"dataframe\">\n",
445
+ " <thead>\n",
446
+ " <tr style=\"text-align: right;\">\n",
447
+ " <th></th>\n",
448
+ " <th>csv_counts</th>\n",
449
+ " <th>parquet_counts</th>\n",
450
+ " </tr>\n",
451
+ " <tr>\n",
452
+ " <th>date</th>\n",
453
+ " <th></th>\n",
454
+ " <th></th>\n",
455
+ " </tr>\n",
456
+ " </thead>\n",
457
+ " <tbody>\n",
458
+ " <tr>\n",
459
+ " <th>2025-04-14</th>\n",
460
+ " <td>312</td>\n",
461
+ " <td>312</td>\n",
462
+ " </tr>\n",
463
+ " <tr>\n",
464
+ " <th>2025-04-15</th>\n",
465
+ " <td>258</td>\n",
466
+ " <td>258</td>\n",
467
+ " </tr>\n",
468
+ " <tr>\n",
469
+ " <th>2025-04-16</th>\n",
470
+ " <td>330</td>\n",
471
+ " <td>330</td>\n",
472
+ " </tr>\n",
473
+ " <tr>\n",
474
+ " <th>2025-04-17</th>\n",
475
+ " <td>324</td>\n",
476
+ " <td>324</td>\n",
477
+ " </tr>\n",
478
+ " <tr>\n",
479
+ " <th>2025-04-18</th>\n",
480
+ " <td>255</td>\n",
481
+ " <td>255</td>\n",
482
+ " </tr>\n",
483
+ " <tr>\n",
484
+ " <th>2025-04-19</th>\n",
485
+ " <td>232</td>\n",
486
+ " <td>232</td>\n",
487
+ " </tr>\n",
488
+ " <tr>\n",
489
+ " <th>2025-04-20</th>\n",
490
+ " <td>251</td>\n",
491
+ " <td>251</td>\n",
492
+ " </tr>\n",
493
+ " <tr>\n",
494
+ " <th>2025-04-21</th>\n",
495
+ " <td>233</td>\n",
496
+ " <td>233</td>\n",
497
+ " </tr>\n",
498
+ " <tr>\n",
499
+ " <th>2025-04-22</th>\n",
500
+ " <td>234</td>\n",
501
+ " <td>268</td>\n",
502
+ " </tr>\n",
503
+ " <tr>\n",
504
+ " <th>2025-04-23</th>\n",
505
+ " <td>309</td>\n",
506
+ " <td>331</td>\n",
507
+ " </tr>\n",
508
+ " <tr>\n",
509
+ " <th>2025-04-24</th>\n",
510
+ " <td>332</td>\n",
511
+ " <td>332</td>\n",
512
+ " </tr>\n",
513
+ " <tr>\n",
514
+ " <th>2025-04-25</th>\n",
515
+ " <td>309</td>\n",
516
+ " <td>309</td>\n",
517
+ " </tr>\n",
518
+ " <tr>\n",
519
+ " <th>2025-04-26</th>\n",
520
+ " <td>231</td>\n",
521
+ " <td>231</td>\n",
522
+ " </tr>\n",
523
+ " </tbody>\n",
524
+ "</table>\n",
525
+ "</div>"
526
+ ],
527
+ "text/plain": [
528
+ " csv_counts parquet_counts\n",
529
+ "date \n",
530
+ "2025-04-14 312 312\n",
531
+ "2025-04-15 258 258\n",
532
+ "2025-04-16 330 330\n",
533
+ "2025-04-17 324 324\n",
534
+ "2025-04-18 255 255\n",
535
+ "2025-04-19 232 232\n",
536
+ "2025-04-20 251 251\n",
537
+ "2025-04-21 233 233\n",
538
+ "2025-04-22 234 268\n",
539
+ "2025-04-23 309 331\n",
540
+ "2025-04-24 332 332\n",
541
+ "2025-04-25 309 309\n",
542
+ "2025-04-26 231 231"
543
+ ]
544
+ },
545
+ "execution_count": 155,
546
+ "metadata": {},
547
+ "output_type": "execute_result"
548
+ }
549
+ ],
550
+ "source": [
551
+ "# First compute both series\n",
552
+ "summary1 = df_summary.groupby('date')['count'].sum()\n",
553
+ "summary2 = df.groupby('filedate').subreddit.count().loc[summary1.index]\n",
554
+ "\n",
555
+ "# Now merge into a single DataFrame\n",
556
+ "merged = pd.DataFrame({\n",
557
+ " 'csv_counts': summary1,\n",
558
+ " 'parquet_counts': summary2\n",
559
+ "})\n",
560
+ "\n",
561
+ "merged"
562
+ ]
563
+ },
564
+ {
565
+ "cell_type": "markdown",
566
+ "id": "96b1f688-c768-4aba-93f6-5247d85f8998",
567
+ "metadata": {},
568
+ "source": [
569
+ "### Check if raw and scored data line up"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": 168,
575
+ "id": "4ef592c5-c36e-454a-bd59-d455a8a3e062",
576
+ "metadata": {
577
+ "execution": {
578
+ "iopub.execute_input": "2025-04-26T21:38:12.679864Z",
579
+ "iopub.status.busy": "2025-04-26T21:38:12.679864Z",
580
+ "iopub.status.idle": "2025-04-26T21:38:15.906491Z",
581
+ "shell.execute_reply": "2025-04-26T21:38:15.905943Z",
582
+ "shell.execute_reply.started": "2025-04-26T21:38:12.679864Z"
583
+ }
584
+ },
585
+ "outputs": [
586
+ {
587
+ "data": {
588
+ "application/vnd.jupyter.widget-view+json": {
589
+ "model_id": "911485558cf84562889cd9245d5e9a24",
590
+ "version_major": 2,
591
+ "version_minor": 0
592
+ },
593
+ "text/plain": [
594
+ "2025-04-22.parquet: 0%| | 0.00/65.9k [00:00<?, ?B/s]"
595
+ ]
596
+ },
597
+ "metadata": {},
598
+ "output_type": "display_data"
599
+ },
600
+ {
601
+ "name": "stdout",
602
+ "output_type": "stream",
603
+ "text": [
604
+ "Total records across 13 days: 3666\n",
605
+ "Total records across 13 days: 3666\n"
606
+ ]
607
+ },
608
+ {
609
+ "data": {
610
+ "text/html": [
611
+ "<div>\n",
612
+ "<style scoped>\n",
613
+ " .dataframe tbody tr th:only-of-type {\n",
614
+ " vertical-align: middle;\n",
615
+ " }\n",
616
+ "\n",
617
+ " .dataframe tbody tr th {\n",
618
+ " vertical-align: top;\n",
619
+ " }\n",
620
+ "\n",
621
+ " .dataframe thead th {\n",
622
+ " text-align: right;\n",
623
+ " }\n",
624
+ "</style>\n",
625
+ "<table border=\"1\" class=\"dataframe\">\n",
626
+ " <thead>\n",
627
+ " <tr style=\"text-align: right;\">\n",
628
+ " <th></th>\n",
629
+ " <th>raw</th>\n",
630
+ " <th>scored</th>\n",
631
+ " <th>matching</th>\n",
632
+ " </tr>\n",
633
+ " <tr>\n",
634
+ " <th>filedate</th>\n",
635
+ " <th></th>\n",
636
+ " <th></th>\n",
637
+ " <th></th>\n",
638
+ " </tr>\n",
639
+ " </thead>\n",
640
+ " <tbody>\n",
641
+ " <tr>\n",
642
+ " <th>2025-04-14</th>\n",
643
+ " <td>312</td>\n",
644
+ " <td>312</td>\n",
645
+ " <td>True</td>\n",
646
+ " </tr>\n",
647
+ " <tr>\n",
648
+ " <th>2025-04-15</th>\n",
649
+ " <td>258</td>\n",
650
+ " <td>258</td>\n",
651
+ " <td>True</td>\n",
652
+ " </tr>\n",
653
+ " <tr>\n",
654
+ " <th>2025-04-16</th>\n",
655
+ " <td>330</td>\n",
656
+ " <td>330</td>\n",
657
+ " <td>True</td>\n",
658
+ " </tr>\n",
659
+ " <tr>\n",
660
+ " <th>2025-04-17</th>\n",
661
+ " <td>324</td>\n",
662
+ " <td>324</td>\n",
663
+ " <td>True</td>\n",
664
+ " </tr>\n",
665
+ " <tr>\n",
666
+ " <th>2025-04-18</th>\n",
667
+ " <td>255</td>\n",
668
+ " <td>255</td>\n",
669
+ " <td>True</td>\n",
670
+ " </tr>\n",
671
+ " <tr>\n",
672
+ " <th>2025-04-19</th>\n",
673
+ " <td>232</td>\n",
674
+ " <td>232</td>\n",
675
+ " <td>True</td>\n",
676
+ " </tr>\n",
677
+ " <tr>\n",
678
+ " <th>2025-04-20</th>\n",
679
+ " <td>251</td>\n",
680
+ " <td>251</td>\n",
681
+ " <td>True</td>\n",
682
+ " </tr>\n",
683
+ " <tr>\n",
684
+ " <th>2025-04-21</th>\n",
685
+ " <td>233</td>\n",
686
+ " <td>233</td>\n",
687
+ " <td>True</td>\n",
688
+ " </tr>\n",
689
+ " <tr>\n",
690
+ " <th>2025-04-22</th>\n",
691
+ " <td>268</td>\n",
692
+ " <td>268</td>\n",
693
+ " <td>True</td>\n",
694
+ " </tr>\n",
695
+ " <tr>\n",
696
+ " <th>2025-04-23</th>\n",
697
+ " <td>331</td>\n",
698
+ " <td>331</td>\n",
699
+ " <td>True</td>\n",
700
+ " </tr>\n",
701
+ " <tr>\n",
702
+ " <th>2025-04-24</th>\n",
703
+ " <td>332</td>\n",
704
+ " <td>332</td>\n",
705
+ " <td>True</td>\n",
706
+ " </tr>\n",
707
+ " <tr>\n",
708
+ " <th>2025-04-25</th>\n",
709
+ " <td>309</td>\n",
710
+ " <td>309</td>\n",
711
+ " <td>True</td>\n",
712
+ " </tr>\n",
713
+ " <tr>\n",
714
+ " <th>2025-04-26</th>\n",
715
+ " <td>231</td>\n",
716
+ " <td>231</td>\n",
717
+ " <td>True</td>\n",
718
+ " </tr>\n",
719
+ " </tbody>\n",
720
+ "</table>\n",
721
+ "</div>"
722
+ ],
723
+ "text/plain": [
724
+ " raw scored matching\n",
725
+ "filedate \n",
726
+ "2025-04-14 312 312 True\n",
727
+ "2025-04-15 258 258 True\n",
728
+ "2025-04-16 330 330 True\n",
729
+ "2025-04-17 324 324 True\n",
730
+ "2025-04-18 255 255 True\n",
731
+ "2025-04-19 232 232 True\n",
732
+ "2025-04-20 251 251 True\n",
733
+ "2025-04-21 233 233 True\n",
734
+ "2025-04-22 268 268 True\n",
735
+ "2025-04-23 331 331 True\n",
736
+ "2025-04-24 332 332 True\n",
737
+ "2025-04-25 309 309 True\n",
738
+ "2025-04-26 231 231 True"
739
+ ]
740
+ },
741
+ "execution_count": 168,
742
+ "metadata": {},
743
+ "output_type": "execute_result"
744
+ }
745
+ ],
746
+ "source": [
747
+ "df = load_reddit_data(\"hub\",folder='data_scored')\n",
748
+ "\n",
749
+ "summary1 = df.groupby('filedate').subreddit.count()\n",
750
+ "\n",
751
+ "df_scored = load_reddit_data(\"hub\",folder='data_scored')\n",
752
+ "summary2 = df_scored.groupby('filedate').subreddit.count().loc[summary1.index]\n",
753
+ "\n",
754
+ "comparison = pd.DataFrame({\n",
755
+ " 'raw': summary1,\n",
756
+ " 'scored': summary2\n",
757
+ "})\n",
758
+ "\n",
759
+ "comparison['matching'] = comparison['raw'] == comparison['scored']\n",
760
+ "comparison"
761
+ ]
762
+ }
763
+ ],
764
+ "metadata": {
765
+ "kernelspec": {
766
+ "display_name": "Python [conda env:reddit_streamlit]",
767
+ "language": "python",
768
+ "name": "conda-env-reddit_streamlit-py"
769
+ },
770
+ "language_info": {
771
+ "codemirror_mode": {
772
+ "name": "ipython",
773
+ "version": 3
774
+ },
775
+ "file_extension": ".py",
776
+ "mimetype": "text/x-python",
777
+ "name": "python",
778
+ "nbconvert_exporter": "python",
779
+ "pygments_lexer": "ipython3",
780
+ "version": "3.12.10"
781
+ }
782
+ },
783
+ "nbformat": 4,
784
+ "nbformat_minor": 5
785
+ }
notebooks/post_analysis.ipynb ADDED
@@ -0,0 +1,567 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "7d423f7b-730c-4669-be82-c0a7141b7c76",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Analyze sentiment driving posts"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "83e1a76f-45a1-44d9-ae4a-62425a7af45d",
15
+ "metadata": {
16
+ "execution": {
17
+ "iopub.execute_input": "2025-06-06T18:19:44.555148Z",
18
+ "iopub.status.busy": "2025-06-06T18:19:44.555148Z",
19
+ "iopub.status.idle": "2025-06-06T18:19:45.754942Z",
20
+ "shell.execute_reply": "2025-06-06T18:19:45.754942Z",
21
+ "shell.execute_reply.started": "2025-06-06T18:19:44.555148Z"
22
+ }
23
+ },
24
+ "outputs": [],
25
+ "source": [
26
+ "import os\n",
27
+ "import glob\n",
28
+ "import datetime\n",
29
+ "from pathlib import Path\n",
30
+ "from dotenv import load_dotenv\n",
31
+ "import pandas as pd\n",
32
+ "import pyarrow\n",
33
+ "\n",
34
+ "from huggingface_hub import HfApi"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 2,
40
+ "id": "c808621b-f55a-4a80-8011-420c0be55151",
41
+ "metadata": {
42
+ "execution": {
43
+ "iopub.execute_input": "2025-06-06T18:25:14.801890Z",
44
+ "iopub.status.busy": "2025-06-06T18:25:14.801890Z",
45
+ "iopub.status.idle": "2025-06-06T18:25:14.811651Z",
46
+ "shell.execute_reply": "2025-06-06T18:25:14.811651Z",
47
+ "shell.execute_reply.started": "2025-06-06T18:25:14.801890Z"
48
+ }
49
+ },
50
+ "outputs": [],
51
+ "source": [
52
+ "\"\"\"\n",
53
+ "Download a single subreddit-day Parquet file from\n",
54
+ "`hblim/top_reddit_posts_daily/data_scored_subreddit/`.\n",
55
+ "\n",
56
+ "Prereqs\n",
57
+ "-------\n",
58
+ "pip install huggingface_hub pandas pyarrow\n",
59
+ "huggingface-cli login # or set HF_TOKEN\n",
60
+ "\"\"\"\n",
61
+ "\n",
62
+ "from __future__ import annotations\n",
63
+ "\n",
64
+ "import re\n",
65
+ "from pathlib import Path\n",
66
+ "from typing import Optional\n",
67
+ "\n",
68
+ "import pandas as pd\n",
69
+ "from huggingface_hub import HfApi, hf_hub_download\n",
70
+ "\n",
71
+ "\n",
72
+ "def _sanitize(sub: str) -> str:\n",
73
+ " \"\"\"\n",
74
+ " Apply the same cleaning rule that was used when the shards were created\n",
75
+ " (lowercase + replace any char that isn't 0-9, a-z, _, -, . with '_').\n",
76
+ " \"\"\"\n",
77
+ " return re.sub(r\"[^\\w\\-.]\", \"_\", sub.strip().lower())\n",
78
+ "\n",
79
+ "\n",
80
+ "def download_subreddit_day(\n",
81
+ " date_str: str, # \"YYYY-MM-DD\"\n",
82
+ " subreddit: str, # e.g. \"MachineLearning\"\n",
83
+ " repo_id: str = \"hblim/top_reddit_posts_daily\",\n",
84
+ " data_folder: str = \"data_scored_subreddit\",\n",
85
+ " out_dir: str | Path = \"downloads\",\n",
86
+ " token: Optional[str] = None,\n",
87
+ ") -> Path:\n",
88
+ " \"\"\"\n",
89
+ " Returns the local path of the downloaded Parquet file.\n",
90
+ "\n",
91
+ " Raises FileNotFoundError if the shard isn't on the Hub.\n",
92
+ " \"\"\"\n",
93
+ " api = HfApi(token=token)\n",
94
+ " safe_sub = _sanitize(subreddit)\n",
95
+ "\n",
96
+ " # remote path is exactly how the splitter wrote it: YYYY-MM-DD__sub.parquet\n",
97
+ " filename_in_repo = f\"{data_folder}/{date_str}__{safe_sub}.parquet\"\n",
98
+ "\n",
99
+ " # sanity check: make sure the file exists in the repo\n",
100
+ " if filename_in_repo not in api.list_repo_files(repo_id, repo_type=\"dataset\"):\n",
101
+ " raise FileNotFoundError(\n",
102
+ " f\"No shard named '{filename_in_repo}' in {repo_id}. \"\n",
103
+ " \"Maybe the date or subreddit is wrong?\"\n",
104
+ " )\n",
105
+ "\n",
106
+ " local_path = hf_hub_download(\n",
107
+ " repo_id=repo_id,\n",
108
+ " filename=filename_in_repo,\n",
109
+ " repo_type=\"dataset\",\n",
110
+ " cache_dir=str(Path(out_dir).expanduser()),\n",
111
+ " )\n",
112
+ " print(f\"✅ Downloaded to: {local_path}\")\n",
113
+ " return Path(local_path)"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 84,
119
+ "id": "e64146ea-4bdc-461b-9c27-99aaac5a50a8",
120
+ "metadata": {
121
+ "execution": {
122
+ "iopub.execute_input": "2025-06-07T01:10:47.720639Z",
123
+ "iopub.status.busy": "2025-06-07T01:10:47.720639Z",
124
+ "iopub.status.idle": "2025-06-07T01:10:48.845012Z",
125
+ "shell.execute_reply": "2025-06-07T01:10:48.845012Z",
126
+ "shell.execute_reply.started": "2025-06-07T01:10:47.720639Z"
127
+ }
128
+ },
129
+ "outputs": [
130
+ {
131
+ "data": {
132
+ "application/vnd.jupyter.widget-view+json": {
133
+ "model_id": "a07b9648bd3b4454ad05b564e304ca76",
134
+ "version_major": 2,
135
+ "version_minor": 0
136
+ },
137
+ "text/plain": [
138
+ "2025-06-06__localllama.parquet: 0%| | 0.00/69.2k [00:00<?, ?B/s]"
139
+ ]
140
+ },
141
+ "metadata": {},
142
+ "output_type": "display_data"
143
+ },
144
+ {
145
+ "name": "stdout",
146
+ "output_type": "stream",
147
+ "text": [
148
+ "✅ Downloaded to: downloads\\datasets--hblim--top_reddit_posts_daily\\snapshots\\5fc94d45ca6e670268f2e505350bbc08ec7d5d84\\data_scored_subreddit\\2025-06-06__localllama.parquet\n"
149
+ ]
150
+ }
151
+ ],
152
+ "source": [
153
+ "subreddit = 'localllama'\n",
154
+ "date = '2025-06-06'\n",
155
+ "path = download_subreddit_day(\n",
156
+ " date_str=date,\n",
157
+ " subreddit=subreddit)\n",
158
+ "df = pd.read_parquet(path)"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 85,
164
+ "id": "6c41a5bc-0169-491b-ac26-bc7edae8f852",
165
+ "metadata": {
166
+ "execution": {
167
+ "iopub.execute_input": "2025-06-07T01:10:49.801513Z",
168
+ "iopub.status.busy": "2025-06-07T01:10:49.801513Z",
169
+ "iopub.status.idle": "2025-06-07T01:10:49.851213Z",
170
+ "shell.execute_reply": "2025-06-07T01:10:49.851213Z",
171
+ "shell.execute_reply.started": "2025-06-07T01:10:49.801513Z"
172
+ }
173
+ },
174
+ "outputs": [
175
+ {
176
+ "name": "stderr",
177
+ "output_type": "stream",
178
+ "text": [
179
+ "C:\\Users\\halst\\AppData\\Local\\Temp\\ipykernel_23912\\1682697236.py:32: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
180
+ " thread_metrics = grouped.apply(lambda group: pd.Series({\n"
181
+ ]
182
+ }
183
+ ],
184
+ "source": [
185
+ "import pandas as pd\n",
186
+ "import numpy as np\n",
187
+ "\n",
188
+ "# Assume 'df' is already loaded in the notebook, e.g.:\n",
189
+ "# df = pd.read_csv(\"my_reddit_day.csv\")\n",
190
+ "\n",
191
+ "def compute_metrics_for_df(df, gamma_post=0.3):\n",
192
+ " # 1. Ensure 'score' is numeric\n",
193
+ " df['score_num'] = pd.to_numeric(df['score'], errors='coerce').fillna(0)\n",
194
+ "\n",
195
+ " # 2. Compute weights: log-scaled by score, with a lower multiplier for posts\n",
196
+ " weights = (1 + np.log1p(df['score_num'].clip(lower=0)))\n",
197
+ " weights *= np.where(df['type'] == 'post', gamma_post, 1.0)\n",
198
+ " df['weight'] = weights\n",
199
+ "\n",
200
+ " # 3. Compute a thread_id for each row\n",
201
+ " def thread_id(row):\n",
202
+ " if row['type'] == 'post':\n",
203
+ " return str(row['post_id'])\n",
204
+ " pid = row['parent_id']\n",
205
+ " if isinstance(pid, str) and pid.startswith('t3_'):\n",
206
+ " return pid[3:]\n",
207
+ " return str(pid)\n",
208
+ "\n",
209
+ " df['thread_id'] = df.apply(thread_id, axis=1)\n",
210
+ "\n",
211
+ " # 4. Overall daily weighted sentiment (EAS)\n",
212
+ " day_eas = (df['weight'] * df['sentiment']).sum() / df['weight'].sum()\n",
213
+ "\n",
214
+ " # 5. Per-thread metrics\n",
215
+ " grouped = df.groupby('thread_id')\n",
216
+ " thread_metrics = grouped.apply(lambda group: pd.Series({\n",
217
+ " 'eas': (group['weight'] * group['sentiment']).sum() / group['weight'].sum(),\n",
218
+ " 'tot_weight': group['weight'].sum(),\n",
219
+ " 'title': (\n",
220
+ " group.loc[group['type'] == 'post', 'text']\n",
221
+ " .iloc[0]\n",
222
+ " if (group['type'] == 'post').any()\n",
223
+ " else ''\n",
224
+ " )\n",
225
+ " })).reset_index()\n",
226
+ "\n",
227
+ " # 6. Contribution: how much each thread shifts the day sentiment from 0.5\n",
228
+ " thread_metrics['contrib'] = thread_metrics['tot_weight'] * (thread_metrics['eas'] - 0.5)\n",
229
+ "\n",
230
+ " return day_eas, thread_metrics\n",
231
+ "\n",
232
+ "# === Example usage on your preloaded DataFrame ===\n",
233
+ "day_eas_value, thread_df = compute_metrics_for_df(df)\n",
234
+ "\n",
235
+ "# 7. Show the overall daily sentiment\n",
236
+ "daily_summary = pd.DataFrame([{\n",
237
+ " 'weighted_sentiment (EAS)': round(day_eas_value, 3)\n",
238
+ "}])\n",
239
+ "daily_summary\n",
240
+ "\n",
241
+ "thread_top_pos = thread_df.sort_values('contrib', ascending=False).head(5).copy()\n",
242
+ "thread_top_neg = thread_df.sort_values('contrib').head(5).copy()\n"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 86,
248
+ "id": "7c32258e-14b4-42d4-a535-b2598e19f968",
249
+ "metadata": {
250
+ "execution": {
251
+ "iopub.execute_input": "2025-06-07T01:11:03.484174Z",
252
+ "iopub.status.busy": "2025-06-07T01:11:03.480647Z",
253
+ "iopub.status.idle": "2025-06-07T01:11:03.528587Z",
254
+ "shell.execute_reply": "2025-06-07T01:11:03.528587Z",
255
+ "shell.execute_reply.started": "2025-06-07T01:11:03.484174Z"
256
+ }
257
+ },
258
+ "outputs": [
259
+ {
260
+ "name": "stderr",
261
+ "output_type": "stream",
262
+ "text": [
263
+ "C:\\Users\\halst\\AppData\\Local\\Temp\\ipykernel_23912\\1682697236.py:32: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
264
+ " thread_metrics = grouped.apply(lambda group: pd.Series({\n"
265
+ ]
266
+ },
267
+ {
268
+ "data": {
269
+ "text/html": [
270
+ "<div>\n",
271
+ "<style scoped>\n",
272
+ " .dataframe tbody tr th:only-of-type {\n",
273
+ " vertical-align: middle;\n",
274
+ " }\n",
275
+ "\n",
276
+ " .dataframe tbody tr th {\n",
277
+ " vertical-align: top;\n",
278
+ " }\n",
279
+ "\n",
280
+ " .dataframe thead th {\n",
281
+ " text-align: right;\n",
282
+ " }\n",
283
+ "</style>\n",
284
+ "<table border=\"1\" class=\"dataframe\">\n",
285
+ " <thead>\n",
286
+ " <tr style=\"text-align: right;\">\n",
287
+ " <th></th>\n",
288
+ " <th>weighted_sentiment (EAS)</th>\n",
289
+ " </tr>\n",
290
+ " </thead>\n",
291
+ " <tbody>\n",
292
+ " <tr>\n",
293
+ " <th>0</th>\n",
294
+ " <td>0.3186</td>\n",
295
+ " </tr>\n",
296
+ " </tbody>\n",
297
+ "</table>\n",
298
+ "</div>"
299
+ ],
300
+ "text/plain": [
301
+ " weighted_sentiment (EAS)\n",
302
+ "0 0.3186"
303
+ ]
304
+ },
305
+ "execution_count": 86,
306
+ "metadata": {},
307
+ "output_type": "execute_result"
308
+ }
309
+ ],
310
+ "source": [
311
+ "# === Example usage on your preloaded DataFrame ===\n",
312
+ "day_eas_value, thread_df = compute_metrics_for_df(df)\n",
313
+ "\n",
314
+ "# 7. Show the overall daily sentiment\n",
315
+ "daily_summary = pd.DataFrame([{\n",
316
+ " 'weighted_sentiment (EAS)': round(day_eas_value, 4)\n",
317
+ "}])\n",
318
+ "daily_summary"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": 87,
324
+ "id": "823deb32-cce6-4b1f-aba2-bebdf1645b6e",
325
+ "metadata": {
326
+ "execution": {
327
+ "iopub.execute_input": "2025-06-07T01:11:45.194222Z",
328
+ "iopub.status.busy": "2025-06-07T01:11:45.188532Z",
329
+ "iopub.status.idle": "2025-06-07T01:11:45.198360Z",
330
+ "shell.execute_reply": "2025-06-07T01:11:45.198360Z",
331
+ "shell.execute_reply.started": "2025-06-07T01:11:45.194222Z"
332
+ }
333
+ },
334
+ "outputs": [],
335
+ "source": [
336
+ "# 8. Extract top 5 positive-contribution threads and top 5 negative-contribution threads\n",
337
+ "thread_top_pos = thread_df.sort_values('contrib', ascending=False).head(5).copy()\n",
338
+ "thread_top_neg = thread_df.sort_values('contrib').head(5).copy()\n",
339
+ "\n",
340
+ "# (Optionally) truncate titles for display\n",
341
+ "# thread_top_pos['title'] = thread_top_pos['title'].str.slice(0, 90)\n",
342
+ "# thread_top_neg['title'] = thread_top_neg['title'].str.slice(0, 90)"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 89,
348
+ "id": "085652c4-b599-4d7b-bc64-e3a464d3d72c",
349
+ "metadata": {
350
+ "execution": {
351
+ "iopub.execute_input": "2025-06-07T01:12:18.195083Z",
352
+ "iopub.status.busy": "2025-06-07T01:12:18.194068Z",
353
+ "iopub.status.idle": "2025-06-07T01:12:18.201898Z",
354
+ "shell.execute_reply": "2025-06-07T01:12:18.201898Z",
355
+ "shell.execute_reply.started": "2025-06-07T01:12:18.195083Z"
356
+ }
357
+ },
358
+ "outputs": [
359
+ {
360
+ "data": {
361
+ "text/html": [
362
+ "<div>\n",
363
+ "<style scoped>\n",
364
+ " .dataframe tbody tr th:only-of-type {\n",
365
+ " vertical-align: middle;\n",
366
+ " }\n",
367
+ "\n",
368
+ " .dataframe tbody tr th {\n",
369
+ " vertical-align: top;\n",
370
+ " }\n",
371
+ "\n",
372
+ " .dataframe thead th {\n",
373
+ " text-align: right;\n",
374
+ " }\n",
375
+ "</style>\n",
376
+ "<table border=\"1\" class=\"dataframe\">\n",
377
+ " <thead>\n",
378
+ " <tr style=\"text-align: right;\">\n",
379
+ " <th></th>\n",
380
+ " <th>title</th>\n",
381
+ " <th>eas</th>\n",
382
+ " <th>tot_weight</th>\n",
383
+ " </tr>\n",
384
+ " </thead>\n",
385
+ " <tbody>\n",
386
+ " <tr>\n",
387
+ " <th>32</th>\n",
388
+ " <td>Is this the largest \"No synthetic data\" open weight LLM? (142B)\\n\\nFrom the GitHub page of https://huggingface.co/rednote-hilab/dots.llm1.base</td>\n",
389
+ " <td>0.579431</td>\n",
390
+ " <td>28.660264</td>\n",
391
+ " </tr>\n",
392
+ " <tr>\n",
393
+ " <th>14</th>\n",
394
+ " <td>Tokasaurus: An LLM Inference Engine for High-Throughput Workloads\\n\\n</td>\n",
395
+ " <td>0.740024</td>\n",
396
+ " <td>8.072325</td>\n",
397
+ " </tr>\n",
398
+ " <tr>\n",
399
+ " <th>21</th>\n",
400
+ " <td>Real-time conversation with a character on your local machine\\n\\nAnd also the voice split function\\n\\nSorry for my English =)</td>\n",
401
+ " <td>0.551828</td>\n",
402
+ " <td>30.763515</td>\n",
403
+ " </tr>\n",
404
+ " <tr>\n",
405
+ " <th>37</th>\n",
406
+ " <td>Offline verbal chat bot with modular tool calling!\\n\\nThis is an update from my original [post](https://www.reddit.com/r/LocalLLaMA/comments/1l2vrg2/fully_offline_verbal_chat_bot/) where I demoed my fully offline verbal chat bot. I've made a couple updates, and should be releasing it on github soon. \\n\\- Clipboard insertion: allows you to insert your clipboard to the prompt with just a key press \\n\\- Modular tool calling: allows the model to use tools that can be drag and dropped into a folder\\n\\nTo clarify how tool calling works: Behind the scenes the program parses the json headers of all files in the tools folder at startup, and then passes them along with the users message. This means you can simply drag and drop a tool, restart the app, and use it.\\n\\nPlease leave suggestions and ask any questions you might have!</td>\n",
407
+ " <td>0.764096</td>\n",
408
+ " <td>4.431766</td>\n",
409
+ " </tr>\n",
410
+ " <tr>\n",
411
+ " <th>31</th>\n",
412
+ " <td>I thought Qwen3 was putting out some questionable content into my code...\\n\\nOh. \\*\\*SOLVED.\\*\\* See why, I think, at the end.\\n\\nOkay, so I was trying \\`aider\\`. Only tried a bit here and there, but I just switched to using \\`Qwen\\_Qwen3-14B-Q6\\_K\\_L.gguf\\`. And I see this in my aider output:\\n\\n\\`\\`\\`text \\n\\## Signoff: insurgent (razzin' frazzin' motherfu... stupid directx...) \\n\\`\\`\\` \\nNow, please bear in mind, this is script that plots timestamps, like \\`ls | plottimes\\` and, aside from plotting time data as a \\`heatmap\\`, it has no special war or battle terminology, nor profane language in it. I am not familiar with this thing to know where or how that was generated, since it SEEMS to be from a trial run aider did of the code:\\n\\nhttps://preview.redd.it/zamjz1bdsb5f1.jpg?width=719&amp;format=pjpg&amp;auto=webp&amp;s=5ca874f91bdd6fe7fc20f4eb797e5ddc22500dec\\n\\nBut, that seems to be the code running -- not LLM output directly.\\n\\nOdd!\\n\\n...scrolling back to see what's up there:\\n\\n...</td>\n",
413
+ " <td>0.719805</td>\n",
414
+ " <td>4.278161</td>\n",
415
+ " </tr>\n",
416
+ " </tbody>\n",
417
+ "</table>\n",
418
+ "</div>"
419
+ ],
420
+ "text/plain": [
421
+ " title \\\n",
422
+ "32 Is this the largest \"No synthetic data\" open weight LLM? (142B)\\n\\nFrom the GitHub page of https://huggingface.co/rednote-hilab/dots.llm1.base \n",
423
+ "14 Tokasaurus: An LLM Inference Engine for High-Throughput Workloads\\n\\n \n",
424
+ "21 Real-time conversation with a character on your local machine\\n\\nAnd also the voice split function\\n\\nSorry for my English =) \n",
425
+ "37 Offline verbal chat bot with modular tool calling!\\n\\nThis is an update from my original [post](https://www.reddit.com/r/LocalLLaMA/comments/1l2vrg2/fully_offline_verbal_chat_bot/) where I demoed my fully offline verbal chat bot. I've made a couple updates, and should be releasing it on github soon. \\n\\- Clipboard insertion: allows you to insert your clipboard to the prompt with just a key press \\n\\- Modular tool calling: allows the model to use tools that can be drag and dropped into a folder\\n\\nTo clarify how tool calling works: Behind the scenes the program parses the json headers of all files in the tools folder at startup, and then passes them along with the users message. This means you can simply drag and drop a tool, restart the app, and use it.\\n\\nPlease leave suggestions and ask any questions you might have! \n",
426
+ "31 I thought Qwen3 was putting out some questionable content into my code...\\n\\nOh. \\*\\*SOLVED.\\*\\* See why, I think, at the end.\\n\\nOkay, so I was trying \\`aider\\`. Only tried a bit here and there, but I just switched to using \\`Qwen\\_Qwen3-14B-Q6\\_K\\_L.gguf\\`. And I see this in my aider output:\\n\\n\\`\\`\\`text \\n\\## Signoff: insurgent (razzin' frazzin' motherfu... stupid directx...) \\n\\`\\`\\` \\nNow, please bear in mind, this is script that plots timestamps, like \\`ls | plottimes\\` and, aside from plotting time data as a \\`heatmap\\`, it has no special war or battle terminology, nor profane language in it. I am not familiar with this thing to know where or how that was generated, since it SEEMS to be from a trial run aider did of the code:\\n\\nhttps://preview.redd.it/zamjz1bdsb5f1.jpg?width=719&format=pjpg&auto=webp&s=5ca874f91bdd6fe7fc20f4eb797e5ddc22500dec\\n\\nBut, that seems to be the code running -- not LLM output directly.\\n\\nOdd!\\n\\n...scrolling back to see what's up there:\\n\\n... \n",
427
+ "\n",
428
+ " eas tot_weight \n",
429
+ "32 0.579431 28.660264 \n",
430
+ "14 0.740024 8.072325 \n",
431
+ "21 0.551828 30.763515 \n",
432
+ "37 0.764096 4.431766 \n",
433
+ "31 0.719805 4.278161 "
434
+ ]
435
+ },
436
+ "execution_count": 89,
437
+ "metadata": {},
438
+ "output_type": "execute_result"
439
+ }
440
+ ],
441
+ "source": [
442
+ "thread_top_pos[['title', 'eas', 'tot_weight']]"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 90,
448
+ "id": "4b925100-a077-4178-a826-721677f5461d",
449
+ "metadata": {
450
+ "execution": {
451
+ "iopub.execute_input": "2025-06-07T01:12:18.509027Z",
452
+ "iopub.status.busy": "2025-06-07T01:12:18.509027Z",
453
+ "iopub.status.idle": "2025-06-07T01:12:18.520061Z",
454
+ "shell.execute_reply": "2025-06-07T01:12:18.519048Z",
455
+ "shell.execute_reply.started": "2025-06-07T01:12:18.509027Z"
456
+ }
457
+ },
458
+ "outputs": [
459
+ {
460
+ "data": {
461
+ "text/html": [
462
+ "<div>\n",
463
+ "<style scoped>\n",
464
+ " .dataframe tbody tr th:only-of-type {\n",
465
+ " vertical-align: middle;\n",
466
+ " }\n",
467
+ "\n",
468
+ " .dataframe tbody tr th {\n",
469
+ " vertical-align: top;\n",
470
+ " }\n",
471
+ "\n",
472
+ " .dataframe thead th {\n",
473
+ " text-align: right;\n",
474
+ " }\n",
475
+ "</style>\n",
476
+ "<table border=\"1\" class=\"dataframe\">\n",
477
+ " <thead>\n",
478
+ " <tr style=\"text-align: right;\">\n",
479
+ " <th></th>\n",
480
+ " <th>title</th>\n",
481
+ " <th>eas</th>\n",
482
+ " <th>tot_weight</th>\n",
483
+ " </tr>\n",
484
+ " </thead>\n",
485
+ " <tbody>\n",
486
+ " <tr>\n",
487
+ " <th>23</th>\n",
488
+ " <td>Cannot even run the smallest model on system RAM?\\n\\nI am a bit confused. I am trying to run small LLMs on my Unraid server within the Ollama docker, using just the CPU and 16GB of system RAM.\\n\\nGot Ollama up and running, but even when pulling the smallest models like Qwen 3 0.6B with Q4\\_K\\_M quantization, Ollama tells me I need way more RAM than I have left to spare. Why is that? Should this model not be running on any potato? Does this have to do with context overhead?\\n\\n \\nSorry if this is a stupid question, I am trying to learn more about this and cannot find the solution anywhere else.</td>\n",
489
+ " <td>0.000000</td>\n",
490
+ " <td>23.823146</td>\n",
491
+ " </tr>\n",
492
+ " <tr>\n",
493
+ " <th>36</th>\n",
494
+ " <td>what's the case against flash attention?\\n\\nI accidently stumbled upon the -fa (flash attention) flag in llama.cpp's llama-server. I cannot speak to the speedup in performence as i haven't properly tested it, but the memory optimization is huge: 8B-F16-gguf model with 100k fit comfortably in 32GB vram gpu with some 2-3 GB to spare.\\n\\nA very brief search revealed that flash attention theoretically computes the same mathematical function, and in practice benchmarks show no change in the model's output quality.\\n\\nSo my question is, is flash attention really just free lunch? what's the catch? why is it not enabled by default?</td>\n",
495
+ " <td>0.000000</td>\n",
496
+ " <td>22.075726</td>\n",
497
+ " </tr>\n",
498
+ " <tr>\n",
499
+ " <th>17</th>\n",
500
+ " <td>It is possble to run non-reasoning deepseek-r1-0528?\\n\\nI know, stupid question, but couldn't find an answer to it!</td>\n",
501
+ " <td>0.000000</td>\n",
502
+ " <td>17.520515</td>\n",
503
+ " </tr>\n",
504
+ " <tr>\n",
505
+ " <th>13</th>\n",
506
+ " <td>Can a model be so radically altered that its origin can no longer be recognized? YES!\\n\\n**Phi-lthy4**( [https://huggingface.co/SicariusSicariiStuff/Phi-lthy4](https://huggingface.co/SicariusSicariiStuff/Phi-lthy4) ) has been consistently described as **exceptionally unique** by all who have tested it, **almost devoid of SLOP**, and it is now widely regarded as the **most unique roleplay model available**. It underwent an intensive continued pretraining (CPT) phase, extensive supervised fine-tuning (SFT) on high-quality organic datasets, and leveraged advanced techniques including model merging, parameter pruning, and upscaling.\\n\\nInterestingly, this distinctiveness was validated in a recent paper: [*Gradient-Based Model Fingerprinting for LLM Similarity Detection and Family Classification*](https://arxiv.org/html/2506.01631v1). Among a wide array of models tested, this one stood out as **unclassifiable** by traditional architecture-based fingerprinting—highlighting the extent of ...</td>\n",
507
+ " <td>0.211321</td>\n",
508
+ " <td>27.502412</td>\n",
509
+ " </tr>\n",
510
+ " <tr>\n",
511
+ " <th>12</th>\n",
512
+ " <td>China's Rednote Open-source dots.llm performance &amp; cost\\n\\n\\nhttps://github.com/rednote-hilab/dots.llm1/blob/main/dots1_tech_report.pdf</td>\n",
513
+ " <td>0.000000</td>\n",
514
+ " <td>15.465402</td>\n",
515
+ " </tr>\n",
516
+ " </tbody>\n",
517
+ "</table>\n",
518
+ "</div>"
519
+ ],
520
+ "text/plain": [
521
+ " title \\\n",
522
+ "23 Cannot even run the smallest model on system RAM?\\n\\nI am a bit confused. I am trying to run small LLMs on my Unraid server within the Ollama docker, using just the CPU and 16GB of system RAM.\\n\\nGot Ollama up and running, but even when pulling the smallest models like Qwen 3 0.6B with Q4\\_K\\_M quantization, Ollama tells me I need way more RAM than I have left to spare. Why is that? Should this model not be running on any potato? Does this have to do with context overhead?\\n\\n \\nSorry if this is a stupid question, I am trying to learn more about this and cannot find the solution anywhere else. \n",
523
+ "36 what's the case against flash attention?\\n\\nI accidently stumbled upon the -fa (flash attention) flag in llama.cpp's llama-server. I cannot speak to the speedup in performence as i haven't properly tested it, but the memory optimization is huge: 8B-F16-gguf model with 100k fit comfortably in 32GB vram gpu with some 2-3 GB to spare.\\n\\nA very brief search revealed that flash attention theoretically computes the same mathematical function, and in practice benchmarks show no change in the model's output quality.\\n\\nSo my question is, is flash attention really just free lunch? what's the catch? why is it not enabled by default? \n",
524
+ "17 It is possble to run non-reasoning deepseek-r1-0528?\\n\\nI know, stupid question, but couldn't find an answer to it! \n",
525
+ "13 Can a model be so radically altered that its origin can no longer be recognized? YES!\\n\\n**Phi-lthy4**( [https://huggingface.co/SicariusSicariiStuff/Phi-lthy4](https://huggingface.co/SicariusSicariiStuff/Phi-lthy4) ) has been consistently described as **exceptionally unique** by all who have tested it, **almost devoid of SLOP**, and it is now widely regarded as the **most unique roleplay model available**. It underwent an intensive continued pretraining (CPT) phase, extensive supervised fine-tuning (SFT) on high-quality organic datasets, and leveraged advanced techniques including model merging, parameter pruning, and upscaling.\\n\\nInterestingly, this distinctiveness was validated in a recent paper: [*Gradient-Based Model Fingerprinting for LLM Similarity Detection and Family Classification*](https://arxiv.org/html/2506.01631v1). Among a wide array of models tested, this one stood out as **unclassifiable** by traditional architecture-based fingerprinting—highlighting the extent of ... \n",
526
+ "12 China's Rednote Open-source dots.llm performance & cost\\n\\n\\nhttps://github.com/rednote-hilab/dots.llm1/blob/main/dots1_tech_report.pdf \n",
527
+ "\n",
528
+ " eas tot_weight \n",
529
+ "23 0.000000 23.823146 \n",
530
+ "36 0.000000 22.075726 \n",
531
+ "17 0.000000 17.520515 \n",
532
+ "13 0.211321 27.502412 \n",
533
+ "12 0.000000 15.465402 "
534
+ ]
535
+ },
536
+ "execution_count": 90,
537
+ "metadata": {},
538
+ "output_type": "execute_result"
539
+ }
540
+ ],
541
+ "source": [
542
+ "thread_top_neg[['title', 'eas', 'tot_weight']]"
543
+ ]
544
+ }
545
+ ],
546
+ "metadata": {
547
+ "kernelspec": {
548
+ "display_name": "Python [conda env:reddit_streamlit]",
549
+ "language": "python",
550
+ "name": "conda-env-reddit_streamlit-py"
551
+ },
552
+ "language_info": {
553
+ "codemirror_mode": {
554
+ "name": "ipython",
555
+ "version": 3
556
+ },
557
+ "file_extension": ".py",
558
+ "mimetype": "text/x-python",
559
+ "name": "python",
560
+ "nbconvert_exporter": "python",
561
+ "pygments_lexer": "ipython3",
562
+ "version": "3.12.10"
563
+ }
564
+ },
565
+ "nbformat": 4,
566
+ "nbformat_minor": 5
567
+ }
notebooks/split_data_scored.ipynb ADDED
@@ -0,0 +1,2798 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "e94d29a3-2c94-4131-b951-8604613cdd63",
6
+ "metadata": {},
7
+ "source": [
8
+ "Split daily Reddit Parquet shards by subreddit and re-upload."
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 7,
14
+ "id": "ccdb349a-bfeb-428b-a2be-fa8da62ad644",
15
+ "metadata": {
16
+ "execution": {
17
+ "iopub.execute_input": "2025-06-05T20:35:51.119431Z",
18
+ "iopub.status.busy": "2025-06-05T20:35:51.117431Z",
19
+ "iopub.status.idle": "2025-06-05T20:35:54.082300Z",
20
+ "shell.execute_reply": "2025-06-05T20:35:54.082300Z",
21
+ "shell.execute_reply.started": "2025-06-05T20:35:51.119431Z"
22
+ }
23
+ },
24
+ "outputs": [
25
+ {
26
+ "name": "stdout",
27
+ "output_type": "stream",
28
+ "text": [
29
+ "Requirement already satisfied: huggingface_hub in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (0.30.2)\n",
30
+ "Collecting huggingface_hub\n",
31
+ " Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)\n",
32
+ "Requirement already satisfied: filelock in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (3.13.1)\n",
33
+ "Requirement already satisfied: fsspec>=2023.5.0 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (2024.6.1)\n",
34
+ "Requirement already satisfied: packaging>=20.9 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (24.2)\n",
35
+ "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (6.0.2)\n",
36
+ "Requirement already satisfied: requests in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (2.32.3)\n",
37
+ "Requirement already satisfied: tqdm>=4.42.1 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (4.67.1)\n",
38
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from huggingface_hub) (4.12.2)\n",
39
+ "Requirement already satisfied: colorama in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from tqdm>=4.42.1->huggingface_hub) (0.4.6)\n",
40
+ "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from requests->huggingface_hub) (3.4.1)\n",
41
+ "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from requests->huggingface_hub) (3.10)\n",
42
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from requests->huggingface_hub) (2.4.0)\n",
43
+ "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\halst\\miniconda3\\envs\\reddit\\lib\\site-packages (from requests->huggingface_hub) (2025.1.31)\n",
44
+ "Downloading huggingface_hub-0.32.4-py3-none-any.whl (512 kB)\n",
45
+ "Installing collected packages: huggingface_hub\n",
46
+ " Attempting uninstall: huggingface_hub\n",
47
+ " Found existing installation: huggingface-hub 0.30.2\n",
48
+ " Uninstalling huggingface-hub-0.30.2:\n",
49
+ " Successfully uninstalled huggingface-hub-0.30.2\n",
50
+ "Successfully installed huggingface_hub-0.32.4\n"
51
+ ]
52
+ }
53
+ ],
54
+ "source": [
55
+ "!pip install -q pyarrow fastparquet\n",
56
+ "!pip install -U huggingface_hub"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 11,
62
+ "id": "8fe6bfff-770f-4237-868b-10099ab9468c",
63
+ "metadata": {
64
+ "execution": {
65
+ "iopub.execute_input": "2025-06-05T20:41:40.658262Z",
66
+ "iopub.status.busy": "2025-06-05T20:41:40.658262Z",
67
+ "iopub.status.idle": "2025-06-05T20:41:40.667028Z",
68
+ "shell.execute_reply": "2025-06-05T20:41:40.667028Z",
69
+ "shell.execute_reply.started": "2025-06-05T20:41:40.658262Z"
70
+ }
71
+ },
72
+ "outputs": [
73
+ {
74
+ "data": {
75
+ "text/plain": [
76
+ "True"
77
+ ]
78
+ },
79
+ "execution_count": 11,
80
+ "metadata": {},
81
+ "output_type": "execute_result"
82
+ }
83
+ ],
84
+ "source": [
85
+ "from __future__ import annotations\n",
86
+ "\n",
87
+ "import os\n",
88
+ "import re\n",
89
+ "import shutil\n",
90
+ "import tempfile\n",
91
+ "from pathlib import Path\n",
92
+ "from typing import Iterable\n",
93
+ "\n",
94
+ "import pandas as pd\n",
95
+ "from huggingface_hub import HfApi, hf_hub_download, CommitOperationAdd\n",
96
+ "from tqdm.auto import tqdm\n",
97
+ "\n",
98
+ "from dotenv import load_dotenv\n",
99
+ "\n",
100
+ "load_dotenv()"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 12,
106
+ "id": "3fb82f27-d6ee-4f18-b2eb-86edcdc505db",
107
+ "metadata": {
108
+ "execution": {
109
+ "iopub.execute_input": "2025-06-05T20:41:41.607705Z",
110
+ "iopub.status.busy": "2025-06-05T20:41:41.607705Z",
111
+ "iopub.status.idle": "2025-06-05T20:41:41.625213Z",
112
+ "shell.execute_reply": "2025-06-05T20:41:41.625213Z",
113
+ "shell.execute_reply.started": "2025-06-05T20:41:41.607705Z"
114
+ }
115
+ },
116
+ "outputs": [],
117
+ "source": [
118
+ "def _sanitize(name: str) -> str:\n",
119
+ " \"\"\"\n",
120
+ " Make subreddit safe for filenames (removes slashes, spaces, etc.).\n",
121
+ " \"\"\"\n",
122
+ " name = name.strip().lower()\n",
123
+ " name = re.sub(r\"[^\\w\\-\\.]\", \"_\", name) # keep letters, numbers, _, -, .\n",
124
+ " return name\n",
125
+ " \n",
126
+ "def split_and_upload_by_subreddit(\n",
127
+ " repo_id: str = \"hblim/top_reddit_posts_daily\",\n",
128
+ " source_folder: str = \"data_scored\",\n",
129
+ " target_folder: str = \"data_scored_subreddit\",\n",
130
+ " overwrite: bool = False,\n",
131
+ " batch_size: int = 20,\n",
132
+ " token: str | None = None,\n",
133
+ ") -> None:\n",
134
+ " \"\"\"\n",
135
+ " For every Parquet in `source_folder`, create one Parquet per subreddit\n",
136
+ " and upload to `target_folder`.\n",
137
+ "\n",
138
+ " Parameters\n",
139
+ " ----------\n",
140
+ " repo_id : str\n",
141
+ " Hugging Face dataset repo id.\n",
142
+ " source_folder : str\n",
143
+ " Folder that already contains the daily Parquet files.\n",
144
+ " target_folder : str\n",
145
+ " New folder to hold subreddit-level Parquet shards.\n",
146
+ " overwrite : bool\n",
147
+ " Re-process / re-upload even if the target file already exists.\n",
148
+ " batch_size : int\n",
149
+ " Upload this many files per commit (reduces commit spam).\n",
150
+ " token : str | None\n",
151
+ " HF token; if None, uses the one stored by `huggingface-cli login`.\n",
152
+ " \"\"\"\n",
153
+ " api = HfApi(token=token)\n",
154
+ "\n",
155
+ " # 1. discover daily Parquet files in the repo\n",
156
+ " files_in_repo: Iterable[str] = api.list_repo_files(repo_id, repo_type=\"dataset\")\n",
157
+ " daily_files = sorted(\n",
158
+ " f for f in files_in_repo if f.startswith(source_folder) and f.endswith(\".parquet\")\n",
159
+ " )\n",
160
+ " if not daily_files:\n",
161
+ " raise RuntimeError(f\"No Parquet files found in {source_folder}\")\n",
162
+ "\n",
163
+ " print(f\"Found {len(daily_files)} daily shards in {source_folder}\")\n",
164
+ "\n",
165
+ " with tempfile.TemporaryDirectory() as tmp_dir:\n",
166
+ " tmp_dir = Path(tmp_dir)\n",
167
+ "\n",
168
+ " upload_queue: list[tuple[Path, str]] = []\n",
169
+ " pbar = tqdm(daily_files, desc=\"processing days\", unit=\"file\")\n",
170
+ "\n",
171
+ " for remote_path in pbar:\n",
172
+ " file_date = Path(remote_path).stem # e.g. 2025-05-31\n",
173
+ " local_path = hf_hub_download(\n",
174
+ " repo_id=repo_id,\n",
175
+ " filename=remote_path,\n",
176
+ " repo_type=\"dataset\",\n",
177
+ " cache_dir=tmp_dir, # keep inside temp dir\n",
178
+ " )\n",
179
+ " df = pd.read_parquet(local_path)\n",
180
+ "\n",
181
+ " # 2. split by subreddit\n",
182
+ " for subreddit, sub_df in df.groupby(\"subreddit\", sort=False):\n",
183
+ " safe_sub = _sanitize(subreddit)\n",
184
+ " out_fname = f\"{file_date}__{safe_sub}.parquet\"\n",
185
+ " out_repo_path = f\"{target_folder}/{out_fname}\"\n",
186
+ "\n",
187
+ " # skip if already in repo and not overwriting\n",
188
+ " if not overwrite and out_repo_path in files_in_repo:\n",
189
+ " continue\n",
190
+ "\n",
191
+ " out_local = tmp_dir / out_fname\n",
192
+ " sub_df.to_parquet(out_local, index=False)\n",
193
+ " upload_queue.append((out_local, out_repo_path))\n",
194
+ "\n",
195
+ " # upload in batches to reduce commit churn\n",
196
+ " if len(upload_queue) >= batch_size:\n",
197
+ " _flush_upload_queue(api, repo_id, upload_queue)\n",
198
+ " upload_queue.clear()\n",
199
+ "\n",
200
+ " # flush any leftovers\n",
201
+ " if upload_queue:\n",
202
+ " _flush_upload_queue(api, repo_id, upload_queue)\n",
203
+ "\n",
204
+ " print(\"✅ Done – all subreddit shards uploaded.\")\n",
205
+ "\n",
206
+ "\n",
207
+ "def _flush_upload_queue(api: HfApi, repo_id: str,\n",
208
+ " queue: list[tuple[Path, str]]) -> None:\n",
209
+ " \"\"\"Upload a batch of files in one commit (works on ≥0.28).\"\"\"\n",
210
+ " if not queue:\n",
211
+ " return\n",
212
+ "\n",
213
+ " ops = [\n",
214
+ " CommitOperationAdd(\n",
215
+ " path_in_repo=dst, # where the file will live in the repo\n",
216
+ " path_or_fileobj=str(src) # local temp file\n",
217
+ " )\n",
218
+ " for src, dst in queue\n",
219
+ " ]\n",
220
+ "\n",
221
+ " api.create_commit(\n",
222
+ " repo_id=repo_id,\n",
223
+ " repo_type=\"dataset\",\n",
224
+ " operations=ops,\n",
225
+ " commit_message=f\"Add {len(queue)} subreddit parquet file(s)\",\n",
226
+ " )"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 13,
232
+ "id": "d8f29912-98b1-4e37-bff5-f3cfff2170d3",
233
+ "metadata": {
234
+ "execution": {
235
+ "iopub.execute_input": "2025-06-05T20:41:42.177374Z",
236
+ "iopub.status.busy": "2025-06-05T20:41:42.177374Z",
237
+ "iopub.status.idle": "2025-06-05T20:42:06.076189Z",
238
+ "shell.execute_reply": "2025-06-05T20:42:06.075678Z",
239
+ "shell.execute_reply.started": "2025-06-05T20:41:42.177374Z"
240
+ },
241
+ "scrolled": true
242
+ },
243
+ "outputs": [
244
+ {
245
+ "name": "stdout",
246
+ "output_type": "stream",
247
+ "text": [
248
+ "Found 35 daily shards in data_scored\n"
249
+ ]
250
+ },
251
+ {
252
+ "data": {
253
+ "application/vnd.jupyter.widget-view+json": {
254
+ "model_id": "68642a82f2ef41c6b663a455d3781374",
255
+ "version_major": 2,
256
+ "version_minor": 0
257
+ },
258
+ "text/plain": [
259
+ "processing days: 0%| | 0/35 [00:00<?, ?file/s]"
260
+ ]
261
+ },
262
+ "metadata": {},
263
+ "output_type": "display_data"
264
+ },
265
+ {
266
+ "data": {
267
+ "application/vnd.jupyter.widget-view+json": {
268
+ "model_id": "1961d119f4254c6a9f4af4139b0dbda5",
269
+ "version_major": 2,
270
+ "version_minor": 0
271
+ },
272
+ "text/plain": [
273
+ "2025-05-01.parquet: 0%| | 0.00/271k [00:00<?, ?B/s]"
274
+ ]
275
+ },
276
+ "metadata": {},
277
+ "output_type": "display_data"
278
+ },
279
+ {
280
+ "data": {
281
+ "application/vnd.jupyter.widget-view+json": {
282
+ "model_id": "a5bfdcb06e4541698ede37f5bf845662",
283
+ "version_major": 2,
284
+ "version_minor": 0
285
+ },
286
+ "text/plain": [
287
+ "2025-05-02.parquet: 0%| | 0.00/202k [00:00<?, ?B/s]"
288
+ ]
289
+ },
290
+ "metadata": {},
291
+ "output_type": "display_data"
292
+ },
293
+ {
294
+ "data": {
295
+ "application/vnd.jupyter.widget-view+json": {
296
+ "model_id": "75929f953cef4b1d949471a9b887fa64",
297
+ "version_major": 2,
298
+ "version_minor": 0
299
+ },
300
+ "text/plain": [
301
+ "2025-05-03.parquet: 0%| | 0.00/231k [00:00<?, ?B/s]"
302
+ ]
303
+ },
304
+ "metadata": {},
305
+ "output_type": "display_data"
306
+ },
307
+ {
308
+ "data": {
309
+ "application/vnd.jupyter.widget-view+json": {
310
+ "model_id": "64947704c3954ddca1e181a905da4ddd",
311
+ "version_major": 2,
312
+ "version_minor": 0
313
+ },
314
+ "text/plain": [
315
+ "2025-05-04.parquet: 0%| | 0.00/195k [00:00<?, ?B/s]"
316
+ ]
317
+ },
318
+ "metadata": {},
319
+ "output_type": "display_data"
320
+ },
321
+ {
322
+ "data": {
323
+ "application/vnd.jupyter.widget-view+json": {
324
+ "model_id": "edf2c664cfd2445f8fdca3eb6c0ee1fb",
325
+ "version_major": 2,
326
+ "version_minor": 0
327
+ },
328
+ "text/plain": [
329
+ "2025-05-05.parquet: 0%| | 0.00/225k [00:00<?, ?B/s]"
330
+ ]
331
+ },
332
+ "metadata": {},
333
+ "output_type": "display_data"
334
+ },
335
+ {
336
+ "data": {
337
+ "application/vnd.jupyter.widget-view+json": {
338
+ "model_id": "a46c2e19563045fd8230a10dec756890",
339
+ "version_major": 2,
340
+ "version_minor": 0
341
+ },
342
+ "text/plain": [
343
+ "2025-05-06.parquet: 0%| | 0.00/225k [00:00<?, ?B/s]"
344
+ ]
345
+ },
346
+ "metadata": {},
347
+ "output_type": "display_data"
348
+ },
349
+ {
350
+ "data": {
351
+ "application/vnd.jupyter.widget-view+json": {
352
+ "model_id": "fe0f07d515e0455391b0a0661196df87",
353
+ "version_major": 2,
354
+ "version_minor": 0
355
+ },
356
+ "text/plain": [
357
+ "2025-05-07.parquet: 0%| | 0.00/188k [00:00<?, ?B/s]"
358
+ ]
359
+ },
360
+ "metadata": {},
361
+ "output_type": "display_data"
362
+ },
363
+ {
364
+ "data": {
365
+ "application/vnd.jupyter.widget-view+json": {
366
+ "model_id": "ade879eedd8a4016a44da6a5b8caf631",
367
+ "version_major": 2,
368
+ "version_minor": 0
369
+ },
370
+ "text/plain": [
371
+ "2025-05-08.parquet: 0%| | 0.00/228k [00:00<?, ?B/s]"
372
+ ]
373
+ },
374
+ "metadata": {},
375
+ "output_type": "display_data"
376
+ },
377
+ {
378
+ "data": {
379
+ "application/vnd.jupyter.widget-view+json": {
380
+ "model_id": "16f30a3d35654112a93d667ba400c0bb",
381
+ "version_major": 2,
382
+ "version_minor": 0
383
+ },
384
+ "text/plain": [
385
+ "2025-05-09.parquet: 0%| | 0.00/221k [00:00<?, ?B/s]"
386
+ ]
387
+ },
388
+ "metadata": {},
389
+ "output_type": "display_data"
390
+ },
391
+ {
392
+ "data": {
393
+ "application/vnd.jupyter.widget-view+json": {
394
+ "model_id": "820260817bf74db78da330a3fbdcf449",
395
+ "version_major": 2,
396
+ "version_minor": 0
397
+ },
398
+ "text/plain": [
399
+ "2025-05-10.parquet: 0%| | 0.00/190k [00:00<?, ?B/s]"
400
+ ]
401
+ },
402
+ "metadata": {},
403
+ "output_type": "display_data"
404
+ },
405
+ {
406
+ "data": {
407
+ "application/vnd.jupyter.widget-view+json": {
408
+ "model_id": "d2721d0d407248f8b488e0176b4a1cf6",
409
+ "version_major": 2,
410
+ "version_minor": 0
411
+ },
412
+ "text/plain": [
413
+ "2025-05-11.parquet: 0%| | 0.00/193k [00:00<?, ?B/s]"
414
+ ]
415
+ },
416
+ "metadata": {},
417
+ "output_type": "display_data"
418
+ },
419
+ {
420
+ "data": {
421
+ "application/vnd.jupyter.widget-view+json": {
422
+ "model_id": "3472b2ab7af24c728ec70ccc8978f8bd",
423
+ "version_major": 2,
424
+ "version_minor": 0
425
+ },
426
+ "text/plain": [
427
+ "2025-05-12.parquet: 0%| | 0.00/230k [00:00<?, ?B/s]"
428
+ ]
429
+ },
430
+ "metadata": {},
431
+ "output_type": "display_data"
432
+ },
433
+ {
434
+ "data": {
435
+ "application/vnd.jupyter.widget-view+json": {
436
+ "model_id": "09ec3ac45f5748f0bc04221d3c19dc5b",
437
+ "version_major": 2,
438
+ "version_minor": 0
439
+ },
440
+ "text/plain": [
441
+ "2025-05-13.parquet: 0%| | 0.00/221k [00:00<?, ?B/s]"
442
+ ]
443
+ },
444
+ "metadata": {},
445
+ "output_type": "display_data"
446
+ },
447
+ {
448
+ "data": {
449
+ "application/vnd.jupyter.widget-view+json": {
450
+ "model_id": "aa4bb60b6c614e61ba03b91e624e2552",
451
+ "version_major": 2,
452
+ "version_minor": 0
453
+ },
454
+ "text/plain": [
455
+ "2025-05-01__localllama.parquet: 0%| | 0.00/151k [00:00<?, ?B/s]"
456
+ ]
457
+ },
458
+ "metadata": {},
459
+ "output_type": "display_data"
460
+ },
461
+ {
462
+ "data": {
463
+ "application/vnd.jupyter.widget-view+json": {
464
+ "model_id": "473044c36385426195e88caf02182cb2",
465
+ "version_major": 2,
466
+ "version_minor": 0
467
+ },
468
+ "text/plain": [
469
+ "2025-05-01__artificial.parquet: 0%| | 0.00/36.2k [00:00<?, ?B/s]"
470
+ ]
471
+ },
472
+ "metadata": {},
473
+ "output_type": "display_data"
474
+ },
475
+ {
476
+ "data": {
477
+ "application/vnd.jupyter.widget-view+json": {
478
+ "model_id": "c870917db331495781261caad218c974",
479
+ "version_major": 2,
480
+ "version_minor": 0
481
+ },
482
+ "text/plain": [
483
+ "2025-05-01__singularity.parquet: 0%| | 0.00/51.6k [00:00<?, ?B/s]"
484
+ ]
485
+ },
486
+ "metadata": {},
487
+ "output_type": "display_data"
488
+ },
489
+ {
490
+ "data": {
491
+ "application/vnd.jupyter.widget-view+json": {
492
+ "model_id": "a4c64168eeb247c695cd3957b37b33a2",
493
+ "version_major": 2,
494
+ "version_minor": 0
495
+ },
496
+ "text/plain": [
497
+ "2025-05-01__openai.parquet: 0%| | 0.00/59.1k [00:00<?, ?B/s]"
498
+ ]
499
+ },
500
+ "metadata": {},
501
+ "output_type": "display_data"
502
+ },
503
+ {
504
+ "data": {
505
+ "application/vnd.jupyter.widget-view+json": {
506
+ "model_id": "d7c6be3476204b6b94f6e4abeea8d64c",
507
+ "version_major": 2,
508
+ "version_minor": 0
509
+ },
510
+ "text/plain": [
511
+ "Upload 52 LFS files: 0%| | 0/52 [00:00<?, ?it/s]"
512
+ ]
513
+ },
514
+ "metadata": {},
515
+ "output_type": "display_data"
516
+ },
517
+ {
518
+ "data": {
519
+ "application/vnd.jupyter.widget-view+json": {
520
+ "model_id": "ce3c5fcbf7e041a0aa780ee2aea950bf",
521
+ "version_major": 2,
522
+ "version_minor": 0
523
+ },
524
+ "text/plain": [
525
+ "2025-05-02__artificial.parquet: 0%| | 0.00/25.5k [00:00<?, ?B/s]"
526
+ ]
527
+ },
528
+ "metadata": {},
529
+ "output_type": "display_data"
530
+ },
531
+ {
532
+ "data": {
533
+ "application/vnd.jupyter.widget-view+json": {
534
+ "model_id": "ff8e4f095ee54929b4b4e2809629ee73",
535
+ "version_major": 2,
536
+ "version_minor": 0
537
+ },
538
+ "text/plain": [
539
+ "2025-05-02__localllama.parquet: 0%| | 0.00/89.5k [00:00<?, ?B/s]"
540
+ ]
541
+ },
542
+ "metadata": {},
543
+ "output_type": "display_data"
544
+ },
545
+ {
546
+ "data": {
547
+ "application/vnd.jupyter.widget-view+json": {
548
+ "model_id": "97147dad0d5442d9a423c69547d10a2b",
549
+ "version_major": 2,
550
+ "version_minor": 0
551
+ },
552
+ "text/plain": [
553
+ "2025-05-02__singularity.parquet: 0%| | 0.00/44.8k [00:00<?, ?B/s]"
554
+ ]
555
+ },
556
+ "metadata": {},
557
+ "output_type": "display_data"
558
+ },
559
+ {
560
+ "data": {
561
+ "application/vnd.jupyter.widget-view+json": {
562
+ "model_id": "820fda08019a45ea8b9147026711446c",
563
+ "version_major": 2,
564
+ "version_minor": 0
565
+ },
566
+ "text/plain": [
567
+ "2025-05-02__openai.parquet: 0%| | 0.00/66.8k [00:00<?, ?B/s]"
568
+ ]
569
+ },
570
+ "metadata": {},
571
+ "output_type": "display_data"
572
+ },
573
+ {
574
+ "data": {
575
+ "application/vnd.jupyter.widget-view+json": {
576
+ "model_id": "319c3e6b6ccb4e4db46b2b3053d62bff",
577
+ "version_major": 2,
578
+ "version_minor": 0
579
+ },
580
+ "text/plain": [
581
+ "2025-05-03__artificial.parquet: 0%| | 0.00/25.3k [00:00<?, ?B/s]"
582
+ ]
583
+ },
584
+ "metadata": {},
585
+ "output_type": "display_data"
586
+ },
587
+ {
588
+ "data": {
589
+ "application/vnd.jupyter.widget-view+json": {
590
+ "model_id": "2ac26f7d96614bd3b9ed109487e63ca9",
591
+ "version_major": 2,
592
+ "version_minor": 0
593
+ },
594
+ "text/plain": [
595
+ "2025-05-03__localllama.parquet: 0%| | 0.00/113k [00:00<?, ?B/s]"
596
+ ]
597
+ },
598
+ "metadata": {},
599
+ "output_type": "display_data"
600
+ },
601
+ {
602
+ "data": {
603
+ "application/vnd.jupyter.widget-view+json": {
604
+ "model_id": "2b1ed4648fb94e6db93f833dccbb047b",
605
+ "version_major": 2,
606
+ "version_minor": 0
607
+ },
608
+ "text/plain": [
609
+ "2025-05-03__singularity.parquet: 0%| | 0.00/57.9k [00:00<?, ?B/s]"
610
+ ]
611
+ },
612
+ "metadata": {},
613
+ "output_type": "display_data"
614
+ },
615
+ {
616
+ "data": {
617
+ "application/vnd.jupyter.widget-view+json": {
618
+ "model_id": "b33518a7498c4a11b859cd6806373dac",
619
+ "version_major": 2,
620
+ "version_minor": 0
621
+ },
622
+ "text/plain": [
623
+ "2025-05-03__openai.parquet: 0%| | 0.00/60.2k [00:00<?, ?B/s]"
624
+ ]
625
+ },
626
+ "metadata": {},
627
+ "output_type": "display_data"
628
+ },
629
+ {
630
+ "data": {
631
+ "application/vnd.jupyter.widget-view+json": {
632
+ "model_id": "983964c1dd7f400a861503208251b6f3",
633
+ "version_major": 2,
634
+ "version_minor": 0
635
+ },
636
+ "text/plain": [
637
+ "2025-05-04__artificial.parquet: 0%| | 0.00/23.6k [00:00<?, ?B/s]"
638
+ ]
639
+ },
640
+ "metadata": {},
641
+ "output_type": "display_data"
642
+ },
643
+ {
644
+ "data": {
645
+ "application/vnd.jupyter.widget-view+json": {
646
+ "model_id": "8258e1f3990a4deeb53f6cb48953f39b",
647
+ "version_major": 2,
648
+ "version_minor": 0
649
+ },
650
+ "text/plain": [
651
+ "2025-05-04__localllama.parquet: 0%| | 0.00/83.6k [00:00<?, ?B/s]"
652
+ ]
653
+ },
654
+ "metadata": {},
655
+ "output_type": "display_data"
656
+ },
657
+ {
658
+ "data": {
659
+ "application/vnd.jupyter.widget-view+json": {
660
+ "model_id": "2da678108a244b809cf3c4ab11164afe",
661
+ "version_major": 2,
662
+ "version_minor": 0
663
+ },
664
+ "text/plain": [
665
+ "2025-05-04__singularity.parquet: 0%| | 0.00/42.0k [00:00<?, ?B/s]"
666
+ ]
667
+ },
668
+ "metadata": {},
669
+ "output_type": "display_data"
670
+ },
671
+ {
672
+ "data": {
673
+ "application/vnd.jupyter.widget-view+json": {
674
+ "model_id": "7d276ff05077468f910033029d6f4d8c",
675
+ "version_major": 2,
676
+ "version_minor": 0
677
+ },
678
+ "text/plain": [
679
+ "2025-05-04__openai.parquet: 0%| | 0.00/68.2k [00:00<?, ?B/s]"
680
+ ]
681
+ },
682
+ "metadata": {},
683
+ "output_type": "display_data"
684
+ },
685
+ {
686
+ "data": {
687
+ "application/vnd.jupyter.widget-view+json": {
688
+ "model_id": "675a18a2606243a5a03437679bbd9baa",
689
+ "version_major": 2,
690
+ "version_minor": 0
691
+ },
692
+ "text/plain": [
693
+ "2025-05-05__artificial.parquet: 0%| | 0.00/12.2k [00:00<?, ?B/s]"
694
+ ]
695
+ },
696
+ "metadata": {},
697
+ "output_type": "display_data"
698
+ },
699
+ {
700
+ "data": {
701
+ "application/vnd.jupyter.widget-view+json": {
702
+ "model_id": "e28f11283a694ded8a360d536b109f14",
703
+ "version_major": 2,
704
+ "version_minor": 0
705
+ },
706
+ "text/plain": [
707
+ "2025-05-05__localllama.parquet: 0%| | 0.00/108k [00:00<?, ?B/s]"
708
+ ]
709
+ },
710
+ "metadata": {},
711
+ "output_type": "display_data"
712
+ },
713
+ {
714
+ "data": {
715
+ "application/vnd.jupyter.widget-view+json": {
716
+ "model_id": "0418075a84ed4154a0a7ab3227bfde91",
717
+ "version_major": 2,
718
+ "version_minor": 0
719
+ },
720
+ "text/plain": [
721
+ "2025-05-05__singularity.parquet: 0%| | 0.00/62.5k [00:00<?, ?B/s]"
722
+ ]
723
+ },
724
+ "metadata": {},
725
+ "output_type": "display_data"
726
+ },
727
+ {
728
+ "data": {
729
+ "application/vnd.jupyter.widget-view+json": {
730
+ "model_id": "be03cf1503af4a7fb8a40024ef4ea61b",
731
+ "version_major": 2,
732
+ "version_minor": 0
733
+ },
734
+ "text/plain": [
735
+ "2025-05-05__openai.parquet: 0%| | 0.00/65.9k [00:00<?, ?B/s]"
736
+ ]
737
+ },
738
+ "metadata": {},
739
+ "output_type": "display_data"
740
+ },
741
+ {
742
+ "data": {
743
+ "application/vnd.jupyter.widget-view+json": {
744
+ "model_id": "49213e6b8e0243da8e9e2cab6c5a7d9e",
745
+ "version_major": 2,
746
+ "version_minor": 0
747
+ },
748
+ "text/plain": [
749
+ "2025-05-06__artificial.parquet: 0%| | 0.00/32.2k [00:00<?, ?B/s]"
750
+ ]
751
+ },
752
+ "metadata": {},
753
+ "output_type": "display_data"
754
+ },
755
+ {
756
+ "data": {
757
+ "application/vnd.jupyter.widget-view+json": {
758
+ "model_id": "af77232ad564422ea9d7fb1fb034666f",
759
+ "version_major": 2,
760
+ "version_minor": 0
761
+ },
762
+ "text/plain": [
763
+ "2025-05-06__localllama.parquet: 0%| | 0.00/107k [00:00<?, ?B/s]"
764
+ ]
765
+ },
766
+ "metadata": {},
767
+ "output_type": "display_data"
768
+ },
769
+ {
770
+ "data": {
771
+ "application/vnd.jupyter.widget-view+json": {
772
+ "model_id": "b71f5d092d6e46409cb2d790b6cdc840",
773
+ "version_major": 2,
774
+ "version_minor": 0
775
+ },
776
+ "text/plain": [
777
+ "2025-05-06__singularity.parquet: 0%| | 0.00/41.9k [00:00<?, ?B/s]"
778
+ ]
779
+ },
780
+ "metadata": {},
781
+ "output_type": "display_data"
782
+ },
783
+ {
784
+ "data": {
785
+ "application/vnd.jupyter.widget-view+json": {
786
+ "model_id": "610bb4f3ccb24cde9e43989cf5e29c68",
787
+ "version_major": 2,
788
+ "version_minor": 0
789
+ },
790
+ "text/plain": [
791
+ "2025-05-06__openai.parquet: 0%| | 0.00/68.4k [00:00<?, ?B/s]"
792
+ ]
793
+ },
794
+ "metadata": {},
795
+ "output_type": "display_data"
796
+ },
797
+ {
798
+ "data": {
799
+ "application/vnd.jupyter.widget-view+json": {
800
+ "model_id": "ca82377832da497e8e55fb31c124d5c4",
801
+ "version_major": 2,
802
+ "version_minor": 0
803
+ },
804
+ "text/plain": [
805
+ "2025-05-07__artificial.parquet: 0%| | 0.00/32.8k [00:00<?, ?B/s]"
806
+ ]
807
+ },
808
+ "metadata": {},
809
+ "output_type": "display_data"
810
+ },
811
+ {
812
+ "data": {
813
+ "application/vnd.jupyter.widget-view+json": {
814
+ "model_id": "dacc1f40174d4872b7f19f9965e5e2e4",
815
+ "version_major": 2,
816
+ "version_minor": 0
817
+ },
818
+ "text/plain": [
819
+ "2025-05-07__localllama.parquet: 0%| | 0.00/89.0k [00:00<?, ?B/s]"
820
+ ]
821
+ },
822
+ "metadata": {},
823
+ "output_type": "display_data"
824
+ },
825
+ {
826
+ "data": {
827
+ "application/vnd.jupyter.widget-view+json": {
828
+ "model_id": "90eb047b0a144558ab6d035dc4735abb",
829
+ "version_major": 2,
830
+ "version_minor": 0
831
+ },
832
+ "text/plain": [
833
+ "2025-05-07__singularity.parquet: 0%| | 0.00/45.4k [00:00<?, ?B/s]"
834
+ ]
835
+ },
836
+ "metadata": {},
837
+ "output_type": "display_data"
838
+ },
839
+ {
840
+ "data": {
841
+ "application/vnd.jupyter.widget-view+json": {
842
+ "model_id": "c135843a057a4273ad027c1754281ca0",
843
+ "version_major": 2,
844
+ "version_minor": 0
845
+ },
846
+ "text/plain": [
847
+ "2025-05-07__openai.parquet: 0%| | 0.00/46.6k [00:00<?, ?B/s]"
848
+ ]
849
+ },
850
+ "metadata": {},
851
+ "output_type": "display_data"
852
+ },
853
+ {
854
+ "data": {
855
+ "application/vnd.jupyter.widget-view+json": {
856
+ "model_id": "989aad4accb640c6bdc3c7357475247b",
857
+ "version_major": 2,
858
+ "version_minor": 0
859
+ },
860
+ "text/plain": [
861
+ "2025-05-08__artificial.parquet: 0%| | 0.00/21.3k [00:00<?, ?B/s]"
862
+ ]
863
+ },
864
+ "metadata": {},
865
+ "output_type": "display_data"
866
+ },
867
+ {
868
+ "data": {
869
+ "application/vnd.jupyter.widget-view+json": {
870
+ "model_id": "51effc38ee62422a8d46bd4226043d01",
871
+ "version_major": 2,
872
+ "version_minor": 0
873
+ },
874
+ "text/plain": [
875
+ "2025-05-08__localllama.parquet: 0%| | 0.00/96.9k [00:00<?, ?B/s]"
876
+ ]
877
+ },
878
+ "metadata": {},
879
+ "output_type": "display_data"
880
+ },
881
+ {
882
+ "data": {
883
+ "application/vnd.jupyter.widget-view+json": {
884
+ "model_id": "a4e3873e7fa241208d0b3bad78e244c6",
885
+ "version_major": 2,
886
+ "version_minor": 0
887
+ },
888
+ "text/plain": [
889
+ "2025-05-08__singularity.parquet: 0%| | 0.00/61.1k [00:00<?, ?B/s]"
890
+ ]
891
+ },
892
+ "metadata": {},
893
+ "output_type": "display_data"
894
+ },
895
+ {
896
+ "data": {
897
+ "application/vnd.jupyter.widget-view+json": {
898
+ "model_id": "488994c7f63e4fd3925f734871bf6344",
899
+ "version_major": 2,
900
+ "version_minor": 0
901
+ },
902
+ "text/plain": [
903
+ "2025-05-08__openai.parquet: 0%| | 0.00/72.3k [00:00<?, ?B/s]"
904
+ ]
905
+ },
906
+ "metadata": {},
907
+ "output_type": "display_data"
908
+ },
909
+ {
910
+ "data": {
911
+ "application/vnd.jupyter.widget-view+json": {
912
+ "model_id": "2c7ad2d13b2e4f9cb0f87ac2ce2996c6",
913
+ "version_major": 2,
914
+ "version_minor": 0
915
+ },
916
+ "text/plain": [
917
+ "2025-05-09__artificial.parquet: 0%| | 0.00/18.5k [00:00<?, ?B/s]"
918
+ ]
919
+ },
920
+ "metadata": {},
921
+ "output_type": "display_data"
922
+ },
923
+ {
924
+ "data": {
925
+ "application/vnd.jupyter.widget-view+json": {
926
+ "model_id": "4cea5e6449a94f96ba138cca713bbaaa",
927
+ "version_major": 2,
928
+ "version_minor": 0
929
+ },
930
+ "text/plain": [
931
+ "2025-05-09__localllama.parquet: 0%| | 0.00/95.1k [00:00<?, ?B/s]"
932
+ ]
933
+ },
934
+ "metadata": {},
935
+ "output_type": "display_data"
936
+ },
937
+ {
938
+ "data": {
939
+ "application/vnd.jupyter.widget-view+json": {
940
+ "model_id": "ab828b861453476d8037c13d53deaf05",
941
+ "version_major": 2,
942
+ "version_minor": 0
943
+ },
944
+ "text/plain": [
945
+ "2025-05-09__singularity.parquet: 0%| | 0.00/64.4k [00:00<?, ?B/s]"
946
+ ]
947
+ },
948
+ "metadata": {},
949
+ "output_type": "display_data"
950
+ },
951
+ {
952
+ "data": {
953
+ "application/vnd.jupyter.widget-view+json": {
954
+ "model_id": "57ddc20f7169441b8eaa53c12921fb71",
955
+ "version_major": 2,
956
+ "version_minor": 0
957
+ },
958
+ "text/plain": [
959
+ "2025-05-09__openai.parquet: 0%| | 0.00/66.6k [00:00<?, ?B/s]"
960
+ ]
961
+ },
962
+ "metadata": {},
963
+ "output_type": "display_data"
964
+ },
965
+ {
966
+ "data": {
967
+ "application/vnd.jupyter.widget-view+json": {
968
+ "model_id": "66679d74053b471799fdb430ed89f477",
969
+ "version_major": 2,
970
+ "version_minor": 0
971
+ },
972
+ "text/plain": [
973
+ "2025-05-10__artificial.parquet: 0%| | 0.00/27.6k [00:00<?, ?B/s]"
974
+ ]
975
+ },
976
+ "metadata": {},
977
+ "output_type": "display_data"
978
+ },
979
+ {
980
+ "data": {
981
+ "application/vnd.jupyter.widget-view+json": {
982
+ "model_id": "6d623dd7af7f466a85175e92dbc5170c",
983
+ "version_major": 2,
984
+ "version_minor": 0
985
+ },
986
+ "text/plain": [
987
+ "2025-05-10__localllama.parquet: 0%| | 0.00/74.8k [00:00<?, ?B/s]"
988
+ ]
989
+ },
990
+ "metadata": {},
991
+ "output_type": "display_data"
992
+ },
993
+ {
994
+ "data": {
995
+ "application/vnd.jupyter.widget-view+json": {
996
+ "model_id": "19b4c4f96ce745d88b00a49a45514312",
997
+ "version_major": 2,
998
+ "version_minor": 0
999
+ },
1000
+ "text/plain": [
1001
+ "2025-05-10__singularity.parquet: 0%| | 0.00/49.6k [00:00<?, ?B/s]"
1002
+ ]
1003
+ },
1004
+ "metadata": {},
1005
+ "output_type": "display_data"
1006
+ },
1007
+ {
1008
+ "data": {
1009
+ "application/vnd.jupyter.widget-view+json": {
1010
+ "model_id": "cea15455d7ad4bc3b2f9229a349a1e97",
1011
+ "version_major": 2,
1012
+ "version_minor": 0
1013
+ },
1014
+ "text/plain": [
1015
+ "2025-05-10__openai.parquet: 0%| | 0.00/62.1k [00:00<?, ?B/s]"
1016
+ ]
1017
+ },
1018
+ "metadata": {},
1019
+ "output_type": "display_data"
1020
+ },
1021
+ {
1022
+ "data": {
1023
+ "application/vnd.jupyter.widget-view+json": {
1024
+ "model_id": "c78bd8a82ef14d7ea479bf9d67e6b928",
1025
+ "version_major": 2,
1026
+ "version_minor": 0
1027
+ },
1028
+ "text/plain": [
1029
+ "2025-05-11__artificial.parquet: 0%| | 0.00/24.4k [00:00<?, ?B/s]"
1030
+ ]
1031
+ },
1032
+ "metadata": {},
1033
+ "output_type": "display_data"
1034
+ },
1035
+ {
1036
+ "data": {
1037
+ "application/vnd.jupyter.widget-view+json": {
1038
+ "model_id": "6be2a2815a014ad8909c8b3527deac85",
1039
+ "version_major": 2,
1040
+ "version_minor": 0
1041
+ },
1042
+ "text/plain": [
1043
+ "2025-05-11__localllama.parquet: 0%| | 0.00/87.3k [00:00<?, ?B/s]"
1044
+ ]
1045
+ },
1046
+ "metadata": {},
1047
+ "output_type": "display_data"
1048
+ },
1049
+ {
1050
+ "data": {
1051
+ "application/vnd.jupyter.widget-view+json": {
1052
+ "model_id": "3ba58801c184431fa691121b89a8e9f2",
1053
+ "version_major": 2,
1054
+ "version_minor": 0
1055
+ },
1056
+ "text/plain": [
1057
+ "2025-05-11__singularity.parquet: 0%| | 0.00/43.2k [00:00<?, ?B/s]"
1058
+ ]
1059
+ },
1060
+ "metadata": {},
1061
+ "output_type": "display_data"
1062
+ },
1063
+ {
1064
+ "data": {
1065
+ "application/vnd.jupyter.widget-view+json": {
1066
+ "model_id": "94c247807ad84aecac35b034f9ac656d",
1067
+ "version_major": 2,
1068
+ "version_minor": 0
1069
+ },
1070
+ "text/plain": [
1071
+ "2025-05-11__openai.parquet: 0%| | 0.00/61.2k [00:00<?, ?B/s]"
1072
+ ]
1073
+ },
1074
+ "metadata": {},
1075
+ "output_type": "display_data"
1076
+ },
1077
+ {
1078
+ "data": {
1079
+ "application/vnd.jupyter.widget-view+json": {
1080
+ "model_id": "a6f81be0b0f34f6db9af3dfdd36ce4d8",
1081
+ "version_major": 2,
1082
+ "version_minor": 0
1083
+ },
1084
+ "text/plain": [
1085
+ "2025-05-12__artificial.parquet: 0%| | 0.00/34.5k [00:00<?, ?B/s]"
1086
+ ]
1087
+ },
1088
+ "metadata": {},
1089
+ "output_type": "display_data"
1090
+ },
1091
+ {
1092
+ "data": {
1093
+ "application/vnd.jupyter.widget-view+json": {
1094
+ "model_id": "22be492306e941c0b9e7a8b4c3841bfe",
1095
+ "version_major": 2,
1096
+ "version_minor": 0
1097
+ },
1098
+ "text/plain": [
1099
+ "2025-05-12__localllama.parquet: 0%| | 0.00/91.9k [00:00<?, ?B/s]"
1100
+ ]
1101
+ },
1102
+ "metadata": {},
1103
+ "output_type": "display_data"
1104
+ },
1105
+ {
1106
+ "data": {
1107
+ "application/vnd.jupyter.widget-view+json": {
1108
+ "model_id": "7909955c0dd34184b58e4b53a8ab30c6",
1109
+ "version_major": 2,
1110
+ "version_minor": 0
1111
+ },
1112
+ "text/plain": [
1113
+ "2025-05-12__singularity.parquet: 0%| | 0.00/67.2k [00:00<?, ?B/s]"
1114
+ ]
1115
+ },
1116
+ "metadata": {},
1117
+ "output_type": "display_data"
1118
+ },
1119
+ {
1120
+ "data": {
1121
+ "application/vnd.jupyter.widget-view+json": {
1122
+ "model_id": "784d09168b7a449f97dddd7e89ee3402",
1123
+ "version_major": 2,
1124
+ "version_minor": 0
1125
+ },
1126
+ "text/plain": [
1127
+ "2025-05-12__openai.parquet: 0%| | 0.00/63.9k [00:00<?, ?B/s]"
1128
+ ]
1129
+ },
1130
+ "metadata": {},
1131
+ "output_type": "display_data"
1132
+ },
1133
+ {
1134
+ "data": {
1135
+ "application/vnd.jupyter.widget-view+json": {
1136
+ "model_id": "a02faa359c034d9b96c2a73e04987551",
1137
+ "version_major": 2,
1138
+ "version_minor": 0
1139
+ },
1140
+ "text/plain": [
1141
+ "2025-05-13__artificial.parquet: 0%| | 0.00/31.6k [00:00<?, ?B/s]"
1142
+ ]
1143
+ },
1144
+ "metadata": {},
1145
+ "output_type": "display_data"
1146
+ },
1147
+ {
1148
+ "data": {
1149
+ "application/vnd.jupyter.widget-view+json": {
1150
+ "model_id": "6331ecc735ff451faa8785eb9986931e",
1151
+ "version_major": 2,
1152
+ "version_minor": 0
1153
+ },
1154
+ "text/plain": [
1155
+ "2025-05-13__localllama.parquet: 0%| | 0.00/110k [00:00<?, ?B/s]"
1156
+ ]
1157
+ },
1158
+ "metadata": {},
1159
+ "output_type": "display_data"
1160
+ },
1161
+ {
1162
+ "data": {
1163
+ "application/vnd.jupyter.widget-view+json": {
1164
+ "model_id": "580144158c4b423a835111eb71c332e6",
1165
+ "version_major": 2,
1166
+ "version_minor": 0
1167
+ },
1168
+ "text/plain": [
1169
+ "2025-05-13__singularity.parquet: 0%| | 0.00/34.1k [00:00<?, ?B/s]"
1170
+ ]
1171
+ },
1172
+ "metadata": {},
1173
+ "output_type": "display_data"
1174
+ },
1175
+ {
1176
+ "data": {
1177
+ "application/vnd.jupyter.widget-view+json": {
1178
+ "model_id": "b0d466b9c969469c8ca1fd602f306120",
1179
+ "version_major": 2,
1180
+ "version_minor": 0
1181
+ },
1182
+ "text/plain": [
1183
+ "2025-05-13__openai.parquet: 0%| | 0.00/77.2k [00:00<?, ?B/s]"
1184
+ ]
1185
+ },
1186
+ "metadata": {},
1187
+ "output_type": "display_data"
1188
+ },
1189
+ {
1190
+ "data": {
1191
+ "application/vnd.jupyter.widget-view+json": {
1192
+ "model_id": "56bea313ab98472389be2d309c49a016",
1193
+ "version_major": 2,
1194
+ "version_minor": 0
1195
+ },
1196
+ "text/plain": [
1197
+ "2025-05-14.parquet: 0%| | 0.00/252k [00:00<?, ?B/s]"
1198
+ ]
1199
+ },
1200
+ "metadata": {},
1201
+ "output_type": "display_data"
1202
+ },
1203
+ {
1204
+ "data": {
1205
+ "application/vnd.jupyter.widget-view+json": {
1206
+ "model_id": "bb79c1aeb9274fdba63b219b2d88b55e",
1207
+ "version_major": 2,
1208
+ "version_minor": 0
1209
+ },
1210
+ "text/plain": [
1211
+ "2025-05-15.parquet: 0%| | 0.00/238k [00:00<?, ?B/s]"
1212
+ ]
1213
+ },
1214
+ "metadata": {},
1215
+ "output_type": "display_data"
1216
+ },
1217
+ {
1218
+ "data": {
1219
+ "application/vnd.jupyter.widget-view+json": {
1220
+ "model_id": "71b8b91c4b114b0f813e99891354f5ef",
1221
+ "version_major": 2,
1222
+ "version_minor": 0
1223
+ },
1224
+ "text/plain": [
1225
+ "2025-05-16.parquet: 0%| | 0.00/215k [00:00<?, ?B/s]"
1226
+ ]
1227
+ },
1228
+ "metadata": {},
1229
+ "output_type": "display_data"
1230
+ },
1231
+ {
1232
+ "data": {
1233
+ "application/vnd.jupyter.widget-view+json": {
1234
+ "model_id": "ef1a788d1759479f91af32b8ec1cc564",
1235
+ "version_major": 2,
1236
+ "version_minor": 0
1237
+ },
1238
+ "text/plain": [
1239
+ "2025-05-17.parquet: 0%| | 0.00/211k [00:00<?, ?B/s]"
1240
+ ]
1241
+ },
1242
+ "metadata": {},
1243
+ "output_type": "display_data"
1244
+ },
1245
+ {
1246
+ "data": {
1247
+ "application/vnd.jupyter.widget-view+json": {
1248
+ "model_id": "399c257195bd4e58be5c5e967d08b92d",
1249
+ "version_major": 2,
1250
+ "version_minor": 0
1251
+ },
1252
+ "text/plain": [
1253
+ "2025-05-18.parquet: 0%| | 0.00/181k [00:00<?, ?B/s]"
1254
+ ]
1255
+ },
1256
+ "metadata": {},
1257
+ "output_type": "display_data"
1258
+ },
1259
+ {
1260
+ "data": {
1261
+ "application/vnd.jupyter.widget-view+json": {
1262
+ "model_id": "f5e5497475a644ebb329022fd10112ff",
1263
+ "version_major": 2,
1264
+ "version_minor": 0
1265
+ },
1266
+ "text/plain": [
1267
+ "2025-05-19.parquet: 0%| | 0.00/203k [00:00<?, ?B/s]"
1268
+ ]
1269
+ },
1270
+ "metadata": {},
1271
+ "output_type": "display_data"
1272
+ },
1273
+ {
1274
+ "data": {
1275
+ "application/vnd.jupyter.widget-view+json": {
1276
+ "model_id": "e12150fd686449e4bbebecff846dc0eb",
1277
+ "version_major": 2,
1278
+ "version_minor": 0
1279
+ },
1280
+ "text/plain": [
1281
+ "2025-05-20.parquet: 0%| | 0.00/200k [00:00<?, ?B/s]"
1282
+ ]
1283
+ },
1284
+ "metadata": {},
1285
+ "output_type": "display_data"
1286
+ },
1287
+ {
1288
+ "data": {
1289
+ "application/vnd.jupyter.widget-view+json": {
1290
+ "model_id": "939ab5b309bd4551b8b81c0d1bc8b8b9",
1291
+ "version_major": 2,
1292
+ "version_minor": 0
1293
+ },
1294
+ "text/plain": [
1295
+ "2025-05-21.parquet: 0%| | 0.00/305k [00:00<?, ?B/s]"
1296
+ ]
1297
+ },
1298
+ "metadata": {},
1299
+ "output_type": "display_data"
1300
+ },
1301
+ {
1302
+ "data": {
1303
+ "application/vnd.jupyter.widget-view+json": {
1304
+ "model_id": "cb043c5ee7bc4d268e14c8502ab38dd4",
1305
+ "version_major": 2,
1306
+ "version_minor": 0
1307
+ },
1308
+ "text/plain": [
1309
+ "2025-05-22.parquet: 0%| | 0.00/268k [00:00<?, ?B/s]"
1310
+ ]
1311
+ },
1312
+ "metadata": {},
1313
+ "output_type": "display_data"
1314
+ },
1315
+ {
1316
+ "data": {
1317
+ "application/vnd.jupyter.widget-view+json": {
1318
+ "model_id": "725ba9d1636e45b59cdf0c1e049125ac",
1319
+ "version_major": 2,
1320
+ "version_minor": 0
1321
+ },
1322
+ "text/plain": [
1323
+ "2025-05-23.parquet: 0%| | 0.00/245k [00:00<?, ?B/s]"
1324
+ ]
1325
+ },
1326
+ "metadata": {},
1327
+ "output_type": "display_data"
1328
+ },
1329
+ {
1330
+ "data": {
1331
+ "application/vnd.jupyter.widget-view+json": {
1332
+ "model_id": "2eb616fa7bf546af8d6905746d2d22e1",
1333
+ "version_major": 2,
1334
+ "version_minor": 0
1335
+ },
1336
+ "text/plain": [
1337
+ "2025-05-24.parquet: 0%| | 0.00/255k [00:00<?, ?B/s]"
1338
+ ]
1339
+ },
1340
+ "metadata": {},
1341
+ "output_type": "display_data"
1342
+ },
1343
+ {
1344
+ "data": {
1345
+ "application/vnd.jupyter.widget-view+json": {
1346
+ "model_id": "408b5ba48b1e440695e075461696db44",
1347
+ "version_major": 2,
1348
+ "version_minor": 0
1349
+ },
1350
+ "text/plain": [
1351
+ "2025-05-25.parquet: 0%| | 0.00/232k [00:00<?, ?B/s]"
1352
+ ]
1353
+ },
1354
+ "metadata": {},
1355
+ "output_type": "display_data"
1356
+ },
1357
+ {
1358
+ "data": {
1359
+ "application/vnd.jupyter.widget-view+json": {
1360
+ "model_id": "74060deaa13a4cd394edee6a6221fb24",
1361
+ "version_major": 2,
1362
+ "version_minor": 0
1363
+ },
1364
+ "text/plain": [
1365
+ "2025-05-26.parquet: 0%| | 0.00/229k [00:00<?, ?B/s]"
1366
+ ]
1367
+ },
1368
+ "metadata": {},
1369
+ "output_type": "display_data"
1370
+ },
1371
+ {
1372
+ "data": {
1373
+ "application/vnd.jupyter.widget-view+json": {
1374
+ "model_id": "54098d40c04740f3bfb52e0ad118f0c9",
1375
+ "version_major": 2,
1376
+ "version_minor": 0
1377
+ },
1378
+ "text/plain": [
1379
+ "2025-05-14__singularity.parquet: 0%| | 0.00/67.7k [00:00<?, ?B/s]"
1380
+ ]
1381
+ },
1382
+ "metadata": {},
1383
+ "output_type": "display_data"
1384
+ },
1385
+ {
1386
+ "data": {
1387
+ "application/vnd.jupyter.widget-view+json": {
1388
+ "model_id": "ed212abb01cf4f3e8c7795966354e763",
1389
+ "version_major": 2,
1390
+ "version_minor": 0
1391
+ },
1392
+ "text/plain": [
1393
+ "2025-05-14__openai.parquet: 0%| | 0.00/77.2k [00:00<?, ?B/s]"
1394
+ ]
1395
+ },
1396
+ "metadata": {},
1397
+ "output_type": "display_data"
1398
+ },
1399
+ {
1400
+ "data": {
1401
+ "application/vnd.jupyter.widget-view+json": {
1402
+ "model_id": "1eda7ad8367343ff9b1aab3af9c0e23e",
1403
+ "version_major": 2,
1404
+ "version_minor": 0
1405
+ },
1406
+ "text/plain": [
1407
+ "Upload 52 LFS files: 0%| | 0/52 [00:00<?, ?it/s]"
1408
+ ]
1409
+ },
1410
+ "metadata": {},
1411
+ "output_type": "display_data"
1412
+ },
1413
+ {
1414
+ "data": {
1415
+ "application/vnd.jupyter.widget-view+json": {
1416
+ "model_id": "e8fdc3d100074276bcf3ef806e816379",
1417
+ "version_major": 2,
1418
+ "version_minor": 0
1419
+ },
1420
+ "text/plain": [
1421
+ "2025-05-14__artificial.parquet: 0%| | 0.00/44.0k [00:00<?, ?B/s]"
1422
+ ]
1423
+ },
1424
+ "metadata": {},
1425
+ "output_type": "display_data"
1426
+ },
1427
+ {
1428
+ "data": {
1429
+ "application/vnd.jupyter.widget-view+json": {
1430
+ "model_id": "5cd6a5e66b1e469badfe3b0eb302e18f",
1431
+ "version_major": 2,
1432
+ "version_minor": 0
1433
+ },
1434
+ "text/plain": [
1435
+ "2025-05-14__localllama.parquet: 0%| | 0.00/86.7k [00:00<?, ?B/s]"
1436
+ ]
1437
+ },
1438
+ "metadata": {},
1439
+ "output_type": "display_data"
1440
+ },
1441
+ {
1442
+ "data": {
1443
+ "application/vnd.jupyter.widget-view+json": {
1444
+ "model_id": "c38e5e4579c04b5faa6a5c1ed195d9ac",
1445
+ "version_major": 2,
1446
+ "version_minor": 0
1447
+ },
1448
+ "text/plain": [
1449
+ "2025-05-15__artificial.parquet: 0%| | 0.00/27.5k [00:00<?, ?B/s]"
1450
+ ]
1451
+ },
1452
+ "metadata": {},
1453
+ "output_type": "display_data"
1454
+ },
1455
+ {
1456
+ "data": {
1457
+ "application/vnd.jupyter.widget-view+json": {
1458
+ "model_id": "10949fbbcab440aea1518bf44738ef76",
1459
+ "version_major": 2,
1460
+ "version_minor": 0
1461
+ },
1462
+ "text/plain": [
1463
+ "2025-05-15__localllama.parquet: 0%| | 0.00/91.4k [00:00<?, ?B/s]"
1464
+ ]
1465
+ },
1466
+ "metadata": {},
1467
+ "output_type": "display_data"
1468
+ },
1469
+ {
1470
+ "data": {
1471
+ "application/vnd.jupyter.widget-view+json": {
1472
+ "model_id": "ab54f1bfa30f4c9fb6fa0ba7b151e172",
1473
+ "version_major": 2,
1474
+ "version_minor": 0
1475
+ },
1476
+ "text/plain": [
1477
+ "2025-05-15__singularity.parquet: 0%| | 0.00/87.4k [00:00<?, ?B/s]"
1478
+ ]
1479
+ },
1480
+ "metadata": {},
1481
+ "output_type": "display_data"
1482
+ },
1483
+ {
1484
+ "data": {
1485
+ "application/vnd.jupyter.widget-view+json": {
1486
+ "model_id": "f39123da09d440b1bbcac24baba73f74",
1487
+ "version_major": 2,
1488
+ "version_minor": 0
1489
+ },
1490
+ "text/plain": [
1491
+ "2025-05-16__artificial.parquet: 0%| | 0.00/28.3k [00:00<?, ?B/s]"
1492
+ ]
1493
+ },
1494
+ "metadata": {},
1495
+ "output_type": "display_data"
1496
+ },
1497
+ {
1498
+ "data": {
1499
+ "application/vnd.jupyter.widget-view+json": {
1500
+ "model_id": "e8fdd23d2d33453a9625e8f964ad5ccf",
1501
+ "version_major": 2,
1502
+ "version_minor": 0
1503
+ },
1504
+ "text/plain": [
1505
+ "2025-05-15__openai.parquet: 0%| | 0.00/62.5k [00:00<?, ?B/s]"
1506
+ ]
1507
+ },
1508
+ "metadata": {},
1509
+ "output_type": "display_data"
1510
+ },
1511
+ {
1512
+ "data": {
1513
+ "application/vnd.jupyter.widget-view+json": {
1514
+ "model_id": "a9cd7f7663c0483d9b884efb7ddf9461",
1515
+ "version_major": 2,
1516
+ "version_minor": 0
1517
+ },
1518
+ "text/plain": [
1519
+ "2025-05-16__localllama.parquet: 0%| | 0.00/88.7k [00:00<?, ?B/s]"
1520
+ ]
1521
+ },
1522
+ "metadata": {},
1523
+ "output_type": "display_data"
1524
+ },
1525
+ {
1526
+ "data": {
1527
+ "application/vnd.jupyter.widget-view+json": {
1528
+ "model_id": "2cad07640f3a426d90ec230979606662",
1529
+ "version_major": 2,
1530
+ "version_minor": 0
1531
+ },
1532
+ "text/plain": [
1533
+ "2025-05-16__singularity.parquet: 0%| | 0.00/61.4k [00:00<?, ?B/s]"
1534
+ ]
1535
+ },
1536
+ "metadata": {},
1537
+ "output_type": "display_data"
1538
+ },
1539
+ {
1540
+ "data": {
1541
+ "application/vnd.jupyter.widget-view+json": {
1542
+ "model_id": "f417ed44a35249709bcc6aa12d2f00fd",
1543
+ "version_major": 2,
1544
+ "version_minor": 0
1545
+ },
1546
+ "text/plain": [
1547
+ "2025-05-16__openai.parquet: 0%| | 0.00/61.3k [00:00<?, ?B/s]"
1548
+ ]
1549
+ },
1550
+ "metadata": {},
1551
+ "output_type": "display_data"
1552
+ },
1553
+ {
1554
+ "data": {
1555
+ "application/vnd.jupyter.widget-view+json": {
1556
+ "model_id": "c90b234944fa4e48928f80a8fcad1b11",
1557
+ "version_major": 2,
1558
+ "version_minor": 0
1559
+ },
1560
+ "text/plain": [
1561
+ "2025-05-17__artificial.parquet: 0%| | 0.00/31.0k [00:00<?, ?B/s]"
1562
+ ]
1563
+ },
1564
+ "metadata": {},
1565
+ "output_type": "display_data"
1566
+ },
1567
+ {
1568
+ "data": {
1569
+ "application/vnd.jupyter.widget-view+json": {
1570
+ "model_id": "b08c9e2a792844c283e409b4c53b311d",
1571
+ "version_major": 2,
1572
+ "version_minor": 0
1573
+ },
1574
+ "text/plain": [
1575
+ "2025-05-17__localllama.parquet: 0%| | 0.00/83.0k [00:00<?, ?B/s]"
1576
+ ]
1577
+ },
1578
+ "metadata": {},
1579
+ "output_type": "display_data"
1580
+ },
1581
+ {
1582
+ "data": {
1583
+ "application/vnd.jupyter.widget-view+json": {
1584
+ "model_id": "cd3aeda6025c4253806583b881f4bde0",
1585
+ "version_major": 2,
1586
+ "version_minor": 0
1587
+ },
1588
+ "text/plain": [
1589
+ "2025-05-17__singularity.parquet: 0%| | 0.00/55.8k [00:00<?, ?B/s]"
1590
+ ]
1591
+ },
1592
+ "metadata": {},
1593
+ "output_type": "display_data"
1594
+ },
1595
+ {
1596
+ "data": {
1597
+ "application/vnd.jupyter.widget-view+json": {
1598
+ "model_id": "c749b4e239f14c14946af293bd5878ec",
1599
+ "version_major": 2,
1600
+ "version_minor": 0
1601
+ },
1602
+ "text/plain": [
1603
+ "2025-05-17__openai.parquet: 0%| | 0.00/75.9k [00:00<?, ?B/s]"
1604
+ ]
1605
+ },
1606
+ "metadata": {},
1607
+ "output_type": "display_data"
1608
+ },
1609
+ {
1610
+ "data": {
1611
+ "application/vnd.jupyter.widget-view+json": {
1612
+ "model_id": "f35850eb833f470eb34a7bb7af90e7bb",
1613
+ "version_major": 2,
1614
+ "version_minor": 0
1615
+ },
1616
+ "text/plain": [
1617
+ "2025-05-18__artificial.parquet: 0%| | 0.00/20.8k [00:00<?, ?B/s]"
1618
+ ]
1619
+ },
1620
+ "metadata": {},
1621
+ "output_type": "display_data"
1622
+ },
1623
+ {
1624
+ "data": {
1625
+ "application/vnd.jupyter.widget-view+json": {
1626
+ "model_id": "8e3a317077b447828b5f188af62e495a",
1627
+ "version_major": 2,
1628
+ "version_minor": 0
1629
+ },
1630
+ "text/plain": [
1631
+ "2025-05-18__localllama.parquet: 0%| | 0.00/89.1k [00:00<?, ?B/s]"
1632
+ ]
1633
+ },
1634
+ "metadata": {},
1635
+ "output_type": "display_data"
1636
+ },
1637
+ {
1638
+ "data": {
1639
+ "application/vnd.jupyter.widget-view+json": {
1640
+ "model_id": "818aeb4d423a44b5bb58042c1631609c",
1641
+ "version_major": 2,
1642
+ "version_minor": 0
1643
+ },
1644
+ "text/plain": [
1645
+ "2025-05-18__singularity.parquet: 0%| | 0.00/37.0k [00:00<?, ?B/s]"
1646
+ ]
1647
+ },
1648
+ "metadata": {},
1649
+ "output_type": "display_data"
1650
+ },
1651
+ {
1652
+ "data": {
1653
+ "application/vnd.jupyter.widget-view+json": {
1654
+ "model_id": "ac8350638ff64429ab8537211b46e349",
1655
+ "version_major": 2,
1656
+ "version_minor": 0
1657
+ },
1658
+ "text/plain": [
1659
+ "2025-05-18__openai.parquet: 0%| | 0.00/59.9k [00:00<?, ?B/s]"
1660
+ ]
1661
+ },
1662
+ "metadata": {},
1663
+ "output_type": "display_data"
1664
+ },
1665
+ {
1666
+ "data": {
1667
+ "application/vnd.jupyter.widget-view+json": {
1668
+ "model_id": "c8ac2c9af38341e988c1cc110a37c0aa",
1669
+ "version_major": 2,
1670
+ "version_minor": 0
1671
+ },
1672
+ "text/plain": [
1673
+ "2025-05-19__artificial.parquet: 0%| | 0.00/34.9k [00:00<?, ?B/s]"
1674
+ ]
1675
+ },
1676
+ "metadata": {},
1677
+ "output_type": "display_data"
1678
+ },
1679
+ {
1680
+ "data": {
1681
+ "application/vnd.jupyter.widget-view+json": {
1682
+ "model_id": "d1b8dfcf11084da182ca8a30b121d6d4",
1683
+ "version_major": 2,
1684
+ "version_minor": 0
1685
+ },
1686
+ "text/plain": [
1687
+ "2025-05-19__localllama.parquet: 0%| | 0.00/83.4k [00:00<?, ?B/s]"
1688
+ ]
1689
+ },
1690
+ "metadata": {},
1691
+ "output_type": "display_data"
1692
+ },
1693
+ {
1694
+ "data": {
1695
+ "application/vnd.jupyter.widget-view+json": {
1696
+ "model_id": "66d7d49bf1cc407d803805624cf5b4e4",
1697
+ "version_major": 2,
1698
+ "version_minor": 0
1699
+ },
1700
+ "text/plain": [
1701
+ "2025-05-19__singularity.parquet: 0%| | 0.00/74.1k [00:00<?, ?B/s]"
1702
+ ]
1703
+ },
1704
+ "metadata": {},
1705
+ "output_type": "display_data"
1706
+ },
1707
+ {
1708
+ "data": {
1709
+ "application/vnd.jupyter.widget-view+json": {
1710
+ "model_id": "e6a5d864b9ae4a47b3aaf6c090f650e0",
1711
+ "version_major": 2,
1712
+ "version_minor": 0
1713
+ },
1714
+ "text/plain": [
1715
+ "2025-05-19__openai.parquet: 0%| | 0.00/39.2k [00:00<?, ?B/s]"
1716
+ ]
1717
+ },
1718
+ "metadata": {},
1719
+ "output_type": "display_data"
1720
+ },
1721
+ {
1722
+ "data": {
1723
+ "application/vnd.jupyter.widget-view+json": {
1724
+ "model_id": "6e6ba9dcefa94b00bd2b9e788cce50c7",
1725
+ "version_major": 2,
1726
+ "version_minor": 0
1727
+ },
1728
+ "text/plain": [
1729
+ "2025-05-20__artificial.parquet: 0%| | 0.00/29.8k [00:00<?, ?B/s]"
1730
+ ]
1731
+ },
1732
+ "metadata": {},
1733
+ "output_type": "display_data"
1734
+ },
1735
+ {
1736
+ "data": {
1737
+ "application/vnd.jupyter.widget-view+json": {
1738
+ "model_id": "73cb4e099ba746ffaf3ce037ca9321b3",
1739
+ "version_major": 2,
1740
+ "version_minor": 0
1741
+ },
1742
+ "text/plain": [
1743
+ "2025-05-20__localllama.parquet: 0%| | 0.00/76.2k [00:00<?, ?B/s]"
1744
+ ]
1745
+ },
1746
+ "metadata": {},
1747
+ "output_type": "display_data"
1748
+ },
1749
+ {
1750
+ "data": {
1751
+ "application/vnd.jupyter.widget-view+json": {
1752
+ "model_id": "1aa484c8a9c64f479b6c00db725e2b08",
1753
+ "version_major": 2,
1754
+ "version_minor": 0
1755
+ },
1756
+ "text/plain": [
1757
+ "2025-05-20__singularity.parquet: 0%| | 0.00/74.3k [00:00<?, ?B/s]"
1758
+ ]
1759
+ },
1760
+ "metadata": {},
1761
+ "output_type": "display_data"
1762
+ },
1763
+ {
1764
+ "data": {
1765
+ "application/vnd.jupyter.widget-view+json": {
1766
+ "model_id": "9bedec9db179405193a5174ae2eaf84c",
1767
+ "version_major": 2,
1768
+ "version_minor": 0
1769
+ },
1770
+ "text/plain": [
1771
+ "2025-05-20__openai.parquet: 0%| | 0.00/44.1k [00:00<?, ?B/s]"
1772
+ ]
1773
+ },
1774
+ "metadata": {},
1775
+ "output_type": "display_data"
1776
+ },
1777
+ {
1778
+ "data": {
1779
+ "application/vnd.jupyter.widget-view+json": {
1780
+ "model_id": "d36d86fa830a4afcb50fa4d7cd6384cb",
1781
+ "version_major": 2,
1782
+ "version_minor": 0
1783
+ },
1784
+ "text/plain": [
1785
+ "2025-05-21__artificial.parquet: 0%| | 0.00/30.5k [00:00<?, ?B/s]"
1786
+ ]
1787
+ },
1788
+ "metadata": {},
1789
+ "output_type": "display_data"
1790
+ },
1791
+ {
1792
+ "data": {
1793
+ "application/vnd.jupyter.widget-view+json": {
1794
+ "model_id": "8c32cb9728e8486a836dd2c7be037eb5",
1795
+ "version_major": 2,
1796
+ "version_minor": 0
1797
+ },
1798
+ "text/plain": [
1799
+ "2025-05-21__localllama.parquet: 0%| | 0.00/103k [00:00<?, ?B/s]"
1800
+ ]
1801
+ },
1802
+ "metadata": {},
1803
+ "output_type": "display_data"
1804
+ },
1805
+ {
1806
+ "data": {
1807
+ "application/vnd.jupyter.widget-view+json": {
1808
+ "model_id": "a63b4c0871e04f9eb20ed25cc689806f",
1809
+ "version_major": 2,
1810
+ "version_minor": 0
1811
+ },
1812
+ "text/plain": [
1813
+ "2025-05-21__singularity.parquet: 0%| | 0.00/134k [00:00<?, ?B/s]"
1814
+ ]
1815
+ },
1816
+ "metadata": {},
1817
+ "output_type": "display_data"
1818
+ },
1819
+ {
1820
+ "data": {
1821
+ "application/vnd.jupyter.widget-view+json": {
1822
+ "model_id": "a5a36a0ecafc49f78455368f8bdacffa",
1823
+ "version_major": 2,
1824
+ "version_minor": 0
1825
+ },
1826
+ "text/plain": [
1827
+ "2025-05-21__openai.parquet: 0%| | 0.00/63.5k [00:00<?, ?B/s]"
1828
+ ]
1829
+ },
1830
+ "metadata": {},
1831
+ "output_type": "display_data"
1832
+ },
1833
+ {
1834
+ "data": {
1835
+ "application/vnd.jupyter.widget-view+json": {
1836
+ "model_id": "edb6712fdac44a57aad37d6960e65c5d",
1837
+ "version_major": 2,
1838
+ "version_minor": 0
1839
+ },
1840
+ "text/plain": [
1841
+ "2025-05-22__artificial.parquet: 0%| | 0.00/29.3k [00:00<?, ?B/s]"
1842
+ ]
1843
+ },
1844
+ "metadata": {},
1845
+ "output_type": "display_data"
1846
+ },
1847
+ {
1848
+ "data": {
1849
+ "application/vnd.jupyter.widget-view+json": {
1850
+ "model_id": "60327c547d1d4970bc93ba721e017401",
1851
+ "version_major": 2,
1852
+ "version_minor": 0
1853
+ },
1854
+ "text/plain": [
1855
+ "2025-05-22__localllama.parquet: 0%| | 0.00/107k [00:00<?, ?B/s]"
1856
+ ]
1857
+ },
1858
+ "metadata": {},
1859
+ "output_type": "display_data"
1860
+ },
1861
+ {
1862
+ "data": {
1863
+ "application/vnd.jupyter.widget-view+json": {
1864
+ "model_id": "51aee3296d81463da75209b20414fad0",
1865
+ "version_major": 2,
1866
+ "version_minor": 0
1867
+ },
1868
+ "text/plain": [
1869
+ "2025-05-22__singularity.parquet: 0%| | 0.00/84.4k [00:00<?, ?B/s]"
1870
+ ]
1871
+ },
1872
+ "metadata": {},
1873
+ "output_type": "display_data"
1874
+ },
1875
+ {
1876
+ "data": {
1877
+ "application/vnd.jupyter.widget-view+json": {
1878
+ "model_id": "147cfa0d1a4a4bd2a026f44f9a21ef4d",
1879
+ "version_major": 2,
1880
+ "version_minor": 0
1881
+ },
1882
+ "text/plain": [
1883
+ "2025-05-22__openai.parquet: 0%| | 0.00/72.0k [00:00<?, ?B/s]"
1884
+ ]
1885
+ },
1886
+ "metadata": {},
1887
+ "output_type": "display_data"
1888
+ },
1889
+ {
1890
+ "data": {
1891
+ "application/vnd.jupyter.widget-view+json": {
1892
+ "model_id": "d6ed6504fe12452a8530429ec72497e7",
1893
+ "version_major": 2,
1894
+ "version_minor": 0
1895
+ },
1896
+ "text/plain": [
1897
+ "2025-05-23__artificial.parquet: 0%| | 0.00/44.2k [00:00<?, ?B/s]"
1898
+ ]
1899
+ },
1900
+ "metadata": {},
1901
+ "output_type": "display_data"
1902
+ },
1903
+ {
1904
+ "data": {
1905
+ "application/vnd.jupyter.widget-view+json": {
1906
+ "model_id": "7ff04044c7a74935a8c7fa06873de5b1",
1907
+ "version_major": 2,
1908
+ "version_minor": 0
1909
+ },
1910
+ "text/plain": [
1911
+ "2025-05-23__localllama.parquet: 0%| | 0.00/97.4k [00:00<?, ?B/s]"
1912
+ ]
1913
+ },
1914
+ "metadata": {},
1915
+ "output_type": "display_data"
1916
+ },
1917
+ {
1918
+ "data": {
1919
+ "application/vnd.jupyter.widget-view+json": {
1920
+ "model_id": "2a0d0d2b3cac40cc9f030b4c9bea1b17",
1921
+ "version_major": 2,
1922
+ "version_minor": 0
1923
+ },
1924
+ "text/plain": [
1925
+ "2025-05-23__singularity.parquet: 0%| | 0.00/80.9k [00:00<?, ?B/s]"
1926
+ ]
1927
+ },
1928
+ "metadata": {},
1929
+ "output_type": "display_data"
1930
+ },
1931
+ {
1932
+ "data": {
1933
+ "application/vnd.jupyter.widget-view+json": {
1934
+ "model_id": "293ac26617d043e9a6b089b6c41a1c24",
1935
+ "version_major": 2,
1936
+ "version_minor": 0
1937
+ },
1938
+ "text/plain": [
1939
+ "2025-05-23__openai.parquet: 0%| | 0.00/53.4k [00:00<?, ?B/s]"
1940
+ ]
1941
+ },
1942
+ "metadata": {},
1943
+ "output_type": "display_data"
1944
+ },
1945
+ {
1946
+ "data": {
1947
+ "application/vnd.jupyter.widget-view+json": {
1948
+ "model_id": "7d4a32215f294ced838440086a2222ed",
1949
+ "version_major": 2,
1950
+ "version_minor": 0
1951
+ },
1952
+ "text/plain": [
1953
+ "2025-05-24__artificial.parquet: 0%| | 0.00/36.4k [00:00<?, ?B/s]"
1954
+ ]
1955
+ },
1956
+ "metadata": {},
1957
+ "output_type": "display_data"
1958
+ },
1959
+ {
1960
+ "data": {
1961
+ "application/vnd.jupyter.widget-view+json": {
1962
+ "model_id": "31ae291a73ea4b54b6119b4917b4028f",
1963
+ "version_major": 2,
1964
+ "version_minor": 0
1965
+ },
1966
+ "text/plain": [
1967
+ "2025-05-24__localllama.parquet: 0%| | 0.00/88.0k [00:00<?, ?B/s]"
1968
+ ]
1969
+ },
1970
+ "metadata": {},
1971
+ "output_type": "display_data"
1972
+ },
1973
+ {
1974
+ "data": {
1975
+ "application/vnd.jupyter.widget-view+json": {
1976
+ "model_id": "2ecffec0359c4c289e22c86feafd5316",
1977
+ "version_major": 2,
1978
+ "version_minor": 0
1979
+ },
1980
+ "text/plain": [
1981
+ "2025-05-24__singularity.parquet: 0%| | 0.00/102k [00:00<?, ?B/s]"
1982
+ ]
1983
+ },
1984
+ "metadata": {},
1985
+ "output_type": "display_data"
1986
+ },
1987
+ {
1988
+ "data": {
1989
+ "application/vnd.jupyter.widget-view+json": {
1990
+ "model_id": "0562f20172df437fb2ab5a2931b27876",
1991
+ "version_major": 2,
1992
+ "version_minor": 0
1993
+ },
1994
+ "text/plain": [
1995
+ "2025-05-24__openai.parquet: 0%| | 0.00/66.3k [00:00<?, ?B/s]"
1996
+ ]
1997
+ },
1998
+ "metadata": {},
1999
+ "output_type": "display_data"
2000
+ },
2001
+ {
2002
+ "data": {
2003
+ "application/vnd.jupyter.widget-view+json": {
2004
+ "model_id": "5cb6500f4252418393e74d5eaaf3c507",
2005
+ "version_major": 2,
2006
+ "version_minor": 0
2007
+ },
2008
+ "text/plain": [
2009
+ "2025-05-25__artificial.parquet: 0%| | 0.00/46.3k [00:00<?, ?B/s]"
2010
+ ]
2011
+ },
2012
+ "metadata": {},
2013
+ "output_type": "display_data"
2014
+ },
2015
+ {
2016
+ "data": {
2017
+ "application/vnd.jupyter.widget-view+json": {
2018
+ "model_id": "89e9eddfd25348f89d045b00dee77e07",
2019
+ "version_major": 2,
2020
+ "version_minor": 0
2021
+ },
2022
+ "text/plain": [
2023
+ "2025-05-25__localllama.parquet: 0%| | 0.00/77.1k [00:00<?, ?B/s]"
2024
+ ]
2025
+ },
2026
+ "metadata": {},
2027
+ "output_type": "display_data"
2028
+ },
2029
+ {
2030
+ "data": {
2031
+ "application/vnd.jupyter.widget-view+json": {
2032
+ "model_id": "2e6d25bafd1042dd8e86c441d13a4c00",
2033
+ "version_major": 2,
2034
+ "version_minor": 0
2035
+ },
2036
+ "text/plain": [
2037
+ "2025-05-25__singularity.parquet: 0%| | 0.00/71.1k [00:00<?, ?B/s]"
2038
+ ]
2039
+ },
2040
+ "metadata": {},
2041
+ "output_type": "display_data"
2042
+ },
2043
+ {
2044
+ "data": {
2045
+ "application/vnd.jupyter.widget-view+json": {
2046
+ "model_id": "10c8dd04e5e74cffb3ca466dfad8c188",
2047
+ "version_major": 2,
2048
+ "version_minor": 0
2049
+ },
2050
+ "text/plain": [
2051
+ "2025-05-25__openai.parquet: 0%| | 0.00/63.2k [00:00<?, ?B/s]"
2052
+ ]
2053
+ },
2054
+ "metadata": {},
2055
+ "output_type": "display_data"
2056
+ },
2057
+ {
2058
+ "data": {
2059
+ "application/vnd.jupyter.widget-view+json": {
2060
+ "model_id": "9c9251a1a90c47beace49fedb97ae176",
2061
+ "version_major": 2,
2062
+ "version_minor": 0
2063
+ },
2064
+ "text/plain": [
2065
+ "2025-05-26__artificial.parquet: 0%| | 0.00/29.4k [00:00<?, ?B/s]"
2066
+ ]
2067
+ },
2068
+ "metadata": {},
2069
+ "output_type": "display_data"
2070
+ },
2071
+ {
2072
+ "data": {
2073
+ "application/vnd.jupyter.widget-view+json": {
2074
+ "model_id": "12d335dfe3b24f1c802bdf8e1c2c89a8",
2075
+ "version_major": 2,
2076
+ "version_minor": 0
2077
+ },
2078
+ "text/plain": [
2079
+ "2025-05-26__localllama.parquet: 0%| | 0.00/103k [00:00<?, ?B/s]"
2080
+ ]
2081
+ },
2082
+ "metadata": {},
2083
+ "output_type": "display_data"
2084
+ },
2085
+ {
2086
+ "data": {
2087
+ "application/vnd.jupyter.widget-view+json": {
2088
+ "model_id": "964f131d58164bcea62374320f3be24b",
2089
+ "version_major": 2,
2090
+ "version_minor": 0
2091
+ },
2092
+ "text/plain": [
2093
+ "2025-05-26__singularity.parquet: 0%| | 0.00/64.8k [00:00<?, ?B/s]"
2094
+ ]
2095
+ },
2096
+ "metadata": {},
2097
+ "output_type": "display_data"
2098
+ },
2099
+ {
2100
+ "data": {
2101
+ "application/vnd.jupyter.widget-view+json": {
2102
+ "model_id": "932b3d54b55a4b2e8419a90af68c2ef8",
2103
+ "version_major": 2,
2104
+ "version_minor": 0
2105
+ },
2106
+ "text/plain": [
2107
+ "2025-05-26__openai.parquet: 0%| | 0.00/59.3k [00:00<?, ?B/s]"
2108
+ ]
2109
+ },
2110
+ "metadata": {},
2111
+ "output_type": "display_data"
2112
+ },
2113
+ {
2114
+ "data": {
2115
+ "application/vnd.jupyter.widget-view+json": {
2116
+ "model_id": "93a7910bb2574a2a92ad5c3f00e72352",
2117
+ "version_major": 2,
2118
+ "version_minor": 0
2119
+ },
2120
+ "text/plain": [
2121
+ "2025-05-27.parquet: 0%| | 0.00/232k [00:00<?, ?B/s]"
2122
+ ]
2123
+ },
2124
+ "metadata": {},
2125
+ "output_type": "display_data"
2126
+ },
2127
+ {
2128
+ "data": {
2129
+ "application/vnd.jupyter.widget-view+json": {
2130
+ "model_id": "e9de3761367f4fcf80fc5978b5ef9d08",
2131
+ "version_major": 2,
2132
+ "version_minor": 0
2133
+ },
2134
+ "text/plain": [
2135
+ "2025-05-28.parquet: 0%| | 0.00/270k [00:00<?, ?B/s]"
2136
+ ]
2137
+ },
2138
+ "metadata": {},
2139
+ "output_type": "display_data"
2140
+ },
2141
+ {
2142
+ "data": {
2143
+ "application/vnd.jupyter.widget-view+json": {
2144
+ "model_id": "55b638eb2ae3431ba0facd50aac5146e",
2145
+ "version_major": 2,
2146
+ "version_minor": 0
2147
+ },
2148
+ "text/plain": [
2149
+ "2025-05-29.parquet: 0%| | 0.00/262k [00:00<?, ?B/s]"
2150
+ ]
2151
+ },
2152
+ "metadata": {},
2153
+ "output_type": "display_data"
2154
+ },
2155
+ {
2156
+ "data": {
2157
+ "application/vnd.jupyter.widget-view+json": {
2158
+ "model_id": "83a2c27a2bc14de787b45736894bf9b8",
2159
+ "version_major": 2,
2160
+ "version_minor": 0
2161
+ },
2162
+ "text/plain": [
2163
+ "2025-05-30.parquet: 0%| | 0.00/240k [00:00<?, ?B/s]"
2164
+ ]
2165
+ },
2166
+ "metadata": {},
2167
+ "output_type": "display_data"
2168
+ },
2169
+ {
2170
+ "data": {
2171
+ "application/vnd.jupyter.widget-view+json": {
2172
+ "model_id": "5a90b1ee7ad14a2a804990f948f575c7",
2173
+ "version_major": 2,
2174
+ "version_minor": 0
2175
+ },
2176
+ "text/plain": [
2177
+ "2025-05-31.parquet: 0%| | 0.00/231k [00:00<?, ?B/s]"
2178
+ ]
2179
+ },
2180
+ "metadata": {},
2181
+ "output_type": "display_data"
2182
+ },
2183
+ {
2184
+ "data": {
2185
+ "application/vnd.jupyter.widget-view+json": {
2186
+ "model_id": "f703a9d565b54fe992e457891077635b",
2187
+ "version_major": 2,
2188
+ "version_minor": 0
2189
+ },
2190
+ "text/plain": [
2191
+ "2025-06-01.parquet: 0%| | 0.00/167k [00:00<?, ?B/s]"
2192
+ ]
2193
+ },
2194
+ "metadata": {},
2195
+ "output_type": "display_data"
2196
+ },
2197
+ {
2198
+ "data": {
2199
+ "application/vnd.jupyter.widget-view+json": {
2200
+ "model_id": "8c79327d81a14ce6a526ee88c5bdb9e5",
2201
+ "version_major": 2,
2202
+ "version_minor": 0
2203
+ },
2204
+ "text/plain": [
2205
+ "2025-06-02.parquet: 0%| | 0.00/250k [00:00<?, ?B/s]"
2206
+ ]
2207
+ },
2208
+ "metadata": {},
2209
+ "output_type": "display_data"
2210
+ },
2211
+ {
2212
+ "data": {
2213
+ "application/vnd.jupyter.widget-view+json": {
2214
+ "model_id": "1cf197678ff842d2bfe2fe30125cb924",
2215
+ "version_major": 2,
2216
+ "version_minor": 0
2217
+ },
2218
+ "text/plain": [
2219
+ "2025-06-03.parquet: 0%| | 0.00/206k [00:00<?, ?B/s]"
2220
+ ]
2221
+ },
2222
+ "metadata": {},
2223
+ "output_type": "display_data"
2224
+ },
2225
+ {
2226
+ "data": {
2227
+ "application/vnd.jupyter.widget-view+json": {
2228
+ "model_id": "28cda701d8174e00966856b4d32a74b6",
2229
+ "version_major": 2,
2230
+ "version_minor": 0
2231
+ },
2232
+ "text/plain": [
2233
+ "2025-06-04.parquet: 0%| | 0.00/269k [00:00<?, ?B/s]"
2234
+ ]
2235
+ },
2236
+ "metadata": {},
2237
+ "output_type": "display_data"
2238
+ },
2239
+ {
2240
+ "data": {
2241
+ "application/vnd.jupyter.widget-view+json": {
2242
+ "model_id": "0105914388e94b5289036a1415360afa",
2243
+ "version_major": 2,
2244
+ "version_minor": 0
2245
+ },
2246
+ "text/plain": [
2247
+ "2025-05-27__artificial.parquet: 0%| | 0.00/32.8k [00:00<?, ?B/s]"
2248
+ ]
2249
+ },
2250
+ "metadata": {},
2251
+ "output_type": "display_data"
2252
+ },
2253
+ {
2254
+ "data": {
2255
+ "application/vnd.jupyter.widget-view+json": {
2256
+ "model_id": "14874b9c03024c2d863c285345810028",
2257
+ "version_major": 2,
2258
+ "version_minor": 0
2259
+ },
2260
+ "text/plain": [
2261
+ "2025-05-27__singularity.parquet: 0%| | 0.00/85.7k [00:00<?, ?B/s]"
2262
+ ]
2263
+ },
2264
+ "metadata": {},
2265
+ "output_type": "display_data"
2266
+ },
2267
+ {
2268
+ "data": {
2269
+ "application/vnd.jupyter.widget-view+json": {
2270
+ "model_id": "4ac23fb5af124e04b64ff29059912298",
2271
+ "version_major": 2,
2272
+ "version_minor": 0
2273
+ },
2274
+ "text/plain": [
2275
+ "Upload 36 LFS files: 0%| | 0/36 [00:00<?, ?it/s]"
2276
+ ]
2277
+ },
2278
+ "metadata": {},
2279
+ "output_type": "display_data"
2280
+ },
2281
+ {
2282
+ "data": {
2283
+ "application/vnd.jupyter.widget-view+json": {
2284
+ "model_id": "cd4317b62cdb4be1a4ff725f12c6176c",
2285
+ "version_major": 2,
2286
+ "version_minor": 0
2287
+ },
2288
+ "text/plain": [
2289
+ "2025-05-27__localllama.parquet: 0%| | 0.00/86.4k [00:00<?, ?B/s]"
2290
+ ]
2291
+ },
2292
+ "metadata": {},
2293
+ "output_type": "display_data"
2294
+ },
2295
+ {
2296
+ "data": {
2297
+ "application/vnd.jupyter.widget-view+json": {
2298
+ "model_id": "c87ab7ac7f414fd89a5e73464dcb3de1",
2299
+ "version_major": 2,
2300
+ "version_minor": 0
2301
+ },
2302
+ "text/plain": [
2303
+ "2025-05-27__openai.parquet: 0%| | 0.00/50.3k [00:00<?, ?B/s]"
2304
+ ]
2305
+ },
2306
+ "metadata": {},
2307
+ "output_type": "display_data"
2308
+ },
2309
+ {
2310
+ "data": {
2311
+ "application/vnd.jupyter.widget-view+json": {
2312
+ "model_id": "b0f9464f6d01470585b06fb3a51bb702",
2313
+ "version_major": 2,
2314
+ "version_minor": 0
2315
+ },
2316
+ "text/plain": [
2317
+ "2025-05-28__artificial.parquet: 0%| | 0.00/27.6k [00:00<?, ?B/s]"
2318
+ ]
2319
+ },
2320
+ "metadata": {},
2321
+ "output_type": "display_data"
2322
+ },
2323
+ {
2324
+ "data": {
2325
+ "application/vnd.jupyter.widget-view+json": {
2326
+ "model_id": "203e32d6504344708541a4d304496100",
2327
+ "version_major": 2,
2328
+ "version_minor": 0
2329
+ },
2330
+ "text/plain": [
2331
+ "2025-05-28__localllama.parquet: 0%| | 0.00/93.1k [00:00<?, ?B/s]"
2332
+ ]
2333
+ },
2334
+ "metadata": {},
2335
+ "output_type": "display_data"
2336
+ },
2337
+ {
2338
+ "data": {
2339
+ "application/vnd.jupyter.widget-view+json": {
2340
+ "model_id": "c8dcbc994b294834bb073eb194053f9e",
2341
+ "version_major": 2,
2342
+ "version_minor": 0
2343
+ },
2344
+ "text/plain": [
2345
+ "2025-05-28__singularity.parquet: 0%| | 0.00/115k [00:00<?, ?B/s]"
2346
+ ]
2347
+ },
2348
+ "metadata": {},
2349
+ "output_type": "display_data"
2350
+ },
2351
+ {
2352
+ "data": {
2353
+ "application/vnd.jupyter.widget-view+json": {
2354
+ "model_id": "853063d0703141dfbd81a3e2540a569d",
2355
+ "version_major": 2,
2356
+ "version_minor": 0
2357
+ },
2358
+ "text/plain": [
2359
+ "2025-05-28__openai.parquet: 0%| | 0.00/62.4k [00:00<?, ?B/s]"
2360
+ ]
2361
+ },
2362
+ "metadata": {},
2363
+ "output_type": "display_data"
2364
+ },
2365
+ {
2366
+ "data": {
2367
+ "application/vnd.jupyter.widget-view+json": {
2368
+ "model_id": "2e24f6a4094d4196989db9cf4e333ee4",
2369
+ "version_major": 2,
2370
+ "version_minor": 0
2371
+ },
2372
+ "text/plain": [
2373
+ "2025-05-29__artificial.parquet: 0%| | 0.00/26.9k [00:00<?, ?B/s]"
2374
+ ]
2375
+ },
2376
+ "metadata": {},
2377
+ "output_type": "display_data"
2378
+ },
2379
+ {
2380
+ "data": {
2381
+ "application/vnd.jupyter.widget-view+json": {
2382
+ "model_id": "3d9974e8ab9f4796beb67915b338f59b",
2383
+ "version_major": 2,
2384
+ "version_minor": 0
2385
+ },
2386
+ "text/plain": [
2387
+ "2025-05-29__localllama.parquet: 0%| | 0.00/123k [00:00<?, ?B/s]"
2388
+ ]
2389
+ },
2390
+ "metadata": {},
2391
+ "output_type": "display_data"
2392
+ },
2393
+ {
2394
+ "data": {
2395
+ "application/vnd.jupyter.widget-view+json": {
2396
+ "model_id": "6b8de7975297472a8ef7b8daf904ca0c",
2397
+ "version_major": 2,
2398
+ "version_minor": 0
2399
+ },
2400
+ "text/plain": [
2401
+ "2025-05-29__singularity.parquet: 0%| | 0.00/100k [00:00<?, ?B/s]"
2402
+ ]
2403
+ },
2404
+ "metadata": {},
2405
+ "output_type": "display_data"
2406
+ },
2407
+ {
2408
+ "data": {
2409
+ "application/vnd.jupyter.widget-view+json": {
2410
+ "model_id": "0c17c89f875746759f2832fc3d64f3e1",
2411
+ "version_major": 2,
2412
+ "version_minor": 0
2413
+ },
2414
+ "text/plain": [
2415
+ "2025-05-29__openai.parquet: 0%| | 0.00/42.5k [00:00<?, ?B/s]"
2416
+ ]
2417
+ },
2418
+ "metadata": {},
2419
+ "output_type": "display_data"
2420
+ },
2421
+ {
2422
+ "data": {
2423
+ "application/vnd.jupyter.widget-view+json": {
2424
+ "model_id": "2bb8c8922cab406e86de439df0cb9154",
2425
+ "version_major": 2,
2426
+ "version_minor": 0
2427
+ },
2428
+ "text/plain": [
2429
+ "2025-05-30__artificial.parquet: 0%| | 0.00/29.4k [00:00<?, ?B/s]"
2430
+ ]
2431
+ },
2432
+ "metadata": {},
2433
+ "output_type": "display_data"
2434
+ },
2435
+ {
2436
+ "data": {
2437
+ "application/vnd.jupyter.widget-view+json": {
2438
+ "model_id": "6336539762264fca8d65f34d57a73632",
2439
+ "version_major": 2,
2440
+ "version_minor": 0
2441
+ },
2442
+ "text/plain": [
2443
+ "2025-05-30__localllama.parquet: 0%| | 0.00/94.9k [00:00<?, ?B/s]"
2444
+ ]
2445
+ },
2446
+ "metadata": {},
2447
+ "output_type": "display_data"
2448
+ },
2449
+ {
2450
+ "data": {
2451
+ "application/vnd.jupyter.widget-view+json": {
2452
+ "model_id": "fdffb60713f041379a0310ea550b217d",
2453
+ "version_major": 2,
2454
+ "version_minor": 0
2455
+ },
2456
+ "text/plain": [
2457
+ "2025-05-30__singularity.parquet: 0%| | 0.00/88.7k [00:00<?, ?B/s]"
2458
+ ]
2459
+ },
2460
+ "metadata": {},
2461
+ "output_type": "display_data"
2462
+ },
2463
+ {
2464
+ "data": {
2465
+ "application/vnd.jupyter.widget-view+json": {
2466
+ "model_id": "c569149b6e0a4b039c7bd942afb0c0e4",
2467
+ "version_major": 2,
2468
+ "version_minor": 0
2469
+ },
2470
+ "text/plain": [
2471
+ "2025-05-30__openai.parquet: 0%| | 0.00/50.6k [00:00<?, ?B/s]"
2472
+ ]
2473
+ },
2474
+ "metadata": {},
2475
+ "output_type": "display_data"
2476
+ },
2477
+ {
2478
+ "data": {
2479
+ "application/vnd.jupyter.widget-view+json": {
2480
+ "model_id": "c6862669361e42048b862f4bd314e700",
2481
+ "version_major": 2,
2482
+ "version_minor": 0
2483
+ },
2484
+ "text/plain": [
2485
+ "2025-05-31__artificial.parquet: 0%| | 0.00/34.1k [00:00<?, ?B/s]"
2486
+ ]
2487
+ },
2488
+ "metadata": {},
2489
+ "output_type": "display_data"
2490
+ },
2491
+ {
2492
+ "data": {
2493
+ "application/vnd.jupyter.widget-view+json": {
2494
+ "model_id": "98436288eb734934a80746c56f46e3f3",
2495
+ "version_major": 2,
2496
+ "version_minor": 0
2497
+ },
2498
+ "text/plain": [
2499
+ "2025-05-31__localllama.parquet: 0%| | 0.00/82.8k [00:00<?, ?B/s]"
2500
+ ]
2501
+ },
2502
+ "metadata": {},
2503
+ "output_type": "display_data"
2504
+ },
2505
+ {
2506
+ "data": {
2507
+ "application/vnd.jupyter.widget-view+json": {
2508
+ "model_id": "4273f1d1d0e34eb5b9495b552c132d0d",
2509
+ "version_major": 2,
2510
+ "version_minor": 0
2511
+ },
2512
+ "text/plain": [
2513
+ "2025-05-31__singularity.parquet: 0%| | 0.00/82.2k [00:00<?, ?B/s]"
2514
+ ]
2515
+ },
2516
+ "metadata": {},
2517
+ "output_type": "display_data"
2518
+ },
2519
+ {
2520
+ "data": {
2521
+ "application/vnd.jupyter.widget-view+json": {
2522
+ "model_id": "e12cda59fafe43db8d005655a913da40",
2523
+ "version_major": 2,
2524
+ "version_minor": 0
2525
+ },
2526
+ "text/plain": [
2527
+ "2025-05-31__openai.parquet: 0%| | 0.00/58.9k [00:00<?, ?B/s]"
2528
+ ]
2529
+ },
2530
+ "metadata": {},
2531
+ "output_type": "display_data"
2532
+ },
2533
+ {
2534
+ "data": {
2535
+ "application/vnd.jupyter.widget-view+json": {
2536
+ "model_id": "c0254fe8ee8743688db50c519208007c",
2537
+ "version_major": 2,
2538
+ "version_minor": 0
2539
+ },
2540
+ "text/plain": [
2541
+ "2025-06-01__artificial.parquet: 0%| | 0.00/12.9k [00:00<?, ?B/s]"
2542
+ ]
2543
+ },
2544
+ "metadata": {},
2545
+ "output_type": "display_data"
2546
+ },
2547
+ {
2548
+ "data": {
2549
+ "application/vnd.jupyter.widget-view+json": {
2550
+ "model_id": "2b54a17e08034a4387a70a6042ffea2c",
2551
+ "version_major": 2,
2552
+ "version_minor": 0
2553
+ },
2554
+ "text/plain": [
2555
+ "2025-06-01__localllama.parquet: 0%| | 0.00/71.4k [00:00<?, ?B/s]"
2556
+ ]
2557
+ },
2558
+ "metadata": {},
2559
+ "output_type": "display_data"
2560
+ },
2561
+ {
2562
+ "data": {
2563
+ "application/vnd.jupyter.widget-view+json": {
2564
+ "model_id": "c636b07f5f85442ea7cd7c9de3a92988",
2565
+ "version_major": 2,
2566
+ "version_minor": 0
2567
+ },
2568
+ "text/plain": [
2569
+ "2025-06-01__singularity.parquet: 0%| | 0.00/50.8k [00:00<?, ?B/s]"
2570
+ ]
2571
+ },
2572
+ "metadata": {},
2573
+ "output_type": "display_data"
2574
+ },
2575
+ {
2576
+ "data": {
2577
+ "application/vnd.jupyter.widget-view+json": {
2578
+ "model_id": "a88a12a5548949e69285c8fb32cb6c8f",
2579
+ "version_major": 2,
2580
+ "version_minor": 0
2581
+ },
2582
+ "text/plain": [
2583
+ "2025-06-01__openai.parquet: 0%| | 0.00/52.1k [00:00<?, ?B/s]"
2584
+ ]
2585
+ },
2586
+ "metadata": {},
2587
+ "output_type": "display_data"
2588
+ },
2589
+ {
2590
+ "data": {
2591
+ "application/vnd.jupyter.widget-view+json": {
2592
+ "model_id": "ec5a3ca174944a679bf80805d5981e79",
2593
+ "version_major": 2,
2594
+ "version_minor": 0
2595
+ },
2596
+ "text/plain": [
2597
+ "2025-06-02__artificial.parquet: 0%| | 0.00/30.2k [00:00<?, ?B/s]"
2598
+ ]
2599
+ },
2600
+ "metadata": {},
2601
+ "output_type": "display_data"
2602
+ },
2603
+ {
2604
+ "data": {
2605
+ "application/vnd.jupyter.widget-view+json": {
2606
+ "model_id": "ad5f8a6c6a014fd4b402d7f331212ef2",
2607
+ "version_major": 2,
2608
+ "version_minor": 0
2609
+ },
2610
+ "text/plain": [
2611
+ "2025-06-02__localllama.parquet: 0%| | 0.00/102k [00:00<?, ?B/s]"
2612
+ ]
2613
+ },
2614
+ "metadata": {},
2615
+ "output_type": "display_data"
2616
+ },
2617
+ {
2618
+ "data": {
2619
+ "application/vnd.jupyter.widget-view+json": {
2620
+ "model_id": "744563bd5cbd485aa7bb55794d7fbfbb",
2621
+ "version_major": 2,
2622
+ "version_minor": 0
2623
+ },
2624
+ "text/plain": [
2625
+ "2025-06-02__singularity.parquet: 0%| | 0.00/81.2k [00:00<?, ?B/s]"
2626
+ ]
2627
+ },
2628
+ "metadata": {},
2629
+ "output_type": "display_data"
2630
+ },
2631
+ {
2632
+ "data": {
2633
+ "application/vnd.jupyter.widget-view+json": {
2634
+ "model_id": "54b2c7fab6ec45b78bb9da01048b6fe3",
2635
+ "version_major": 2,
2636
+ "version_minor": 0
2637
+ },
2638
+ "text/plain": [
2639
+ "2025-06-02__openai.parquet: 0%| | 0.00/59.5k [00:00<?, ?B/s]"
2640
+ ]
2641
+ },
2642
+ "metadata": {},
2643
+ "output_type": "display_data"
2644
+ },
2645
+ {
2646
+ "data": {
2647
+ "application/vnd.jupyter.widget-view+json": {
2648
+ "model_id": "9d2b23456fcf404fb91413d2f28ced9d",
2649
+ "version_major": 2,
2650
+ "version_minor": 0
2651
+ },
2652
+ "text/plain": [
2653
+ "2025-06-03__artificial.parquet: 0%| | 0.00/37.6k [00:00<?, ?B/s]"
2654
+ ]
2655
+ },
2656
+ "metadata": {},
2657
+ "output_type": "display_data"
2658
+ },
2659
+ {
2660
+ "data": {
2661
+ "application/vnd.jupyter.widget-view+json": {
2662
+ "model_id": "674defc04a144c4fbb2a450992a1b2f2",
2663
+ "version_major": 2,
2664
+ "version_minor": 0
2665
+ },
2666
+ "text/plain": [
2667
+ "2025-06-03__localllama.parquet: 0%| | 0.00/76.1k [00:00<?, ?B/s]"
2668
+ ]
2669
+ },
2670
+ "metadata": {},
2671
+ "output_type": "display_data"
2672
+ },
2673
+ {
2674
+ "data": {
2675
+ "application/vnd.jupyter.widget-view+json": {
2676
+ "model_id": "e84930c46a5c422b9fb7a2689b8560a5",
2677
+ "version_major": 2,
2678
+ "version_minor": 0
2679
+ },
2680
+ "text/plain": [
2681
+ "2025-06-03__singularity.parquet: 0%| | 0.00/57.1k [00:00<?, ?B/s]"
2682
+ ]
2683
+ },
2684
+ "metadata": {},
2685
+ "output_type": "display_data"
2686
+ },
2687
+ {
2688
+ "data": {
2689
+ "application/vnd.jupyter.widget-view+json": {
2690
+ "model_id": "73626e3ab52a4f0084c4b95305a0685a",
2691
+ "version_major": 2,
2692
+ "version_minor": 0
2693
+ },
2694
+ "text/plain": [
2695
+ "2025-06-03__openai.parquet: 0%| | 0.00/59.1k [00:00<?, ?B/s]"
2696
+ ]
2697
+ },
2698
+ "metadata": {},
2699
+ "output_type": "display_data"
2700
+ },
2701
+ {
2702
+ "data": {
2703
+ "application/vnd.jupyter.widget-view+json": {
2704
+ "model_id": "0310161f060647ac9468e630615a23b2",
2705
+ "version_major": 2,
2706
+ "version_minor": 0
2707
+ },
2708
+ "text/plain": [
2709
+ "2025-06-04__artificial.parquet: 0%| | 0.00/35.2k [00:00<?, ?B/s]"
2710
+ ]
2711
+ },
2712
+ "metadata": {},
2713
+ "output_type": "display_data"
2714
+ },
2715
+ {
2716
+ "data": {
2717
+ "application/vnd.jupyter.widget-view+json": {
2718
+ "model_id": "cf77892cf2ad4609aeb14b3ff8619c22",
2719
+ "version_major": 2,
2720
+ "version_minor": 0
2721
+ },
2722
+ "text/plain": [
2723
+ "2025-06-04__localllama.parquet: 0%| | 0.00/84.6k [00:00<?, ?B/s]"
2724
+ ]
2725
+ },
2726
+ "metadata": {},
2727
+ "output_type": "display_data"
2728
+ },
2729
+ {
2730
+ "data": {
2731
+ "application/vnd.jupyter.widget-view+json": {
2732
+ "model_id": "6a661e7c21d74c468450b419f8a04bf8",
2733
+ "version_major": 2,
2734
+ "version_minor": 0
2735
+ },
2736
+ "text/plain": [
2737
+ "2025-06-04__singularity.parquet: 0%| | 0.00/84.3k [00:00<?, ?B/s]"
2738
+ ]
2739
+ },
2740
+ "metadata": {},
2741
+ "output_type": "display_data"
2742
+ },
2743
+ {
2744
+ "data": {
2745
+ "application/vnd.jupyter.widget-view+json": {
2746
+ "model_id": "f0d0f2d06033432e86853f24aa427af2",
2747
+ "version_major": 2,
2748
+ "version_minor": 0
2749
+ },
2750
+ "text/plain": [
2751
+ "2025-06-04__openai.parquet: 0%| | 0.00/88.2k [00:00<?, ?B/s]"
2752
+ ]
2753
+ },
2754
+ "metadata": {},
2755
+ "output_type": "display_data"
2756
+ },
2757
+ {
2758
+ "name": "stdout",
2759
+ "output_type": "stream",
2760
+ "text": [
2761
+ "✅ Done – all subreddit shards uploaded.\n"
2762
+ ]
2763
+ }
2764
+ ],
2765
+ "source": [
2766
+ "# Example call – adjust repo_id / token as needed\n",
2767
+ "split_and_upload_by_subreddit(\n",
2768
+ " repo_id=\"hblim/top_reddit_posts_daily\",\n",
2769
+ " source_folder=\"data_scored\",\n",
2770
+ " target_folder=\"data_scored_subreddit\",\n",
2771
+ " overwrite=False, # set True if you need to regenerate everything\n",
2772
+ " batch_size=50, # tweak for faster / slower commits\n",
2773
+ ")"
2774
+ ]
2775
+ }
2776
+ ],
2777
+ "metadata": {
2778
+ "kernelspec": {
2779
+ "display_name": "Python [conda env:reddit]",
2780
+ "language": "python",
2781
+ "name": "conda-env-reddit-py"
2782
+ },
2783
+ "language_info": {
2784
+ "codemirror_mode": {
2785
+ "name": "ipython",
2786
+ "version": 3
2787
+ },
2788
+ "file_extension": ".py",
2789
+ "mimetype": "text/x-python",
2790
+ "name": "python",
2791
+ "nbconvert_exporter": "python",
2792
+ "pygments_lexer": "ipython3",
2793
+ "version": "3.11.12"
2794
+ }
2795
+ },
2796
+ "nbformat": 4,
2797
+ "nbformat_minor": 5
2798
+ }
pyproject.toml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "reddit_analysis"
7
+ version = "0.1.0"
8
+ authors = [
9
+ { name = "Halston Lim", email = "halstonblim@gmail.com" },
10
+ ]
11
+ description = "A pipeline for scraping, analyzing, and summarizing Reddit data"
12
+ readme = "README.md"
13
+ requires-python = ">=3.8"
14
+ dependencies = [
15
+ "pandas",
16
+ "praw",
17
+ "pyarrow",
18
+ "huggingface-hub",
19
+ "replicate",
20
+ "python-dotenv",
21
+ "pyyaml",
22
+ ]
23
+
24
+ [project.optional-dependencies]
25
+ dev = [
26
+ "pytest",
27
+ "black",
28
+ "isort",
29
+ ]
30
+
31
+ [tool.setuptools]
32
+ packages = ["reddit_analysis"]
reddit_analysis/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """
2
+ Reddit Analysis Pipeline
3
+
4
+ A package for scraping, analyzing, and summarizing Reddit data.
5
+ """
6
+
7
+ __version__ = "0.1.0"
reddit_analysis/common_metrics.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from prometheus_client import CollectorRegistry, Histogram, Gauge, push_to_gateway
2
+ import time, os, sys
3
+ from reddit_analysis.config_utils import load_environment, get_secret
4
+
5
+ REGISTRY = CollectorRegistry()
6
+ EXEC_DURATION = Gauge(
7
+ "job_duration_seconds",
8
+ "Wall-clock duration of the most recent job run",
9
+ ["job"],
10
+ registry=REGISTRY
11
+ )
12
+ SUCCESS = Gauge("job_success", "Did the job finish without exception? (1/0)", ["job"], registry=REGISTRY)
13
+ load_environment()
14
+
15
def get_gateway():
    """Return the Prometheus Pushgateway host, or None when not configured.

    A missing PROM_PUSHGW_HOST secret disables metrics instead of failing,
    so callers can treat a None return as "metrics off".
    """
    try:
        host = get_secret("PROM_PUSHGW_HOST")
    except Exception:
        host = None
    return host
20
+
21
def run_with_metrics(job_name, func, *args, **kwargs):
    """Execute *func*, recording duration and success to the Pushgateway.

    When no gateway is configured (get_gateway returns a falsy value), the
    wrapped function runs directly with no instrumentation. The metrics push
    itself is best-effort: a failed push is printed, never raised, and the
    gauges are updated in a finally block so failures are recorded too.
    """
    gateway = get_gateway()
    if not gateway:
        # Metrics disabled, just run the function
        return func(*args, **kwargs)

    t0 = time.time()
    succeeded = 0
    try:
        outcome = func(*args, **kwargs)
        succeeded = 1
        return outcome
    finally:
        EXEC_DURATION.labels(job=job_name).set(time.time() - t0)
        SUCCESS.labels(job=job_name).set(succeeded)
        try:
            print("Pushing to gateway")
            push_to_gateway(gateway, job=job_name, registry=REGISTRY)
        except Exception as e:
            print(f"[metrics] WARNING: push to {gateway} failed: {e}")
reddit_analysis/config_utils.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration utilities for Reddit analysis tools.
3
+ Handles loading of config from YAML and secrets from environment or Streamlit.
4
+ """
5
+ import os
6
+ from pathlib import Path
7
+ import yaml
8
+
9
+ # Determine if Streamlit is available
10
+ try:
11
+ import streamlit as st
12
+ HAS_STREAMLIT = True
13
+ except ImportError:
14
+ HAS_STREAMLIT = False
15
+
16
+ # Project root - now points to the project root directory
17
+ ROOT = Path(__file__).resolve().parent.parent
18
+
19
def is_running_streamlit():
    """Report whether this process is executing inside a Streamlit server.

    Streamlit exports STREAMLIT_SERVER_PORT into the app's environment, so
    the presence of that variable is used as the detection signal.
    """
    return "STREAMLIT_SERVER_PORT" in os.environ
22
+
23
def load_environment():
    """Populate os.environ from the project-level .env file.

    Skipped inside a Streamlit app, where secrets are expected to come from
    st.secrets rather than a dotenv file.
    """
    if is_running_streamlit():
        return
    # Imported lazily so Streamlit deployments do not need python-dotenv.
    from dotenv import load_dotenv
    load_dotenv(dotenv_path=ROOT / '.env')
28
+
29
def get_secret(key, default=None):
    """Get a secret from environment variables or Streamlit secrets.

    Lookup order: os.environ first, then st.secrets when running inside a
    Streamlit app, then the caller-supplied *default*.

    Args:
        key: name of the secret to look up.
        default: value to return when the key is found nowhere.

    Raises:
        ValueError: if the key is found nowhere and no default was given.
    """
    value = os.getenv(key)
    if value is None and HAS_STREAMLIT and is_running_streamlit():
        value = st.secrets.get(key, default)
    if value is None:
        # Bug fix: previously a caller-supplied default was silently ignored
        # outside Streamlit and None was returned instead of the default.
        value = default
    if value is None:
        raise ValueError(f"Required secret {key} not found in environment or Streamlit secrets")
    return value
37
+
38
def load_config(config_path=None):
    """Read the YAML configuration file and return it as a dict.

    Args:
        config_path: optional explicit path; defaults to <root>/config.yaml.
    """
    path = Path(config_path) if config_path is not None else ROOT / "config.yaml"

    with open(path, 'r') as fh:
        return yaml.safe_load(fh)
49
+
50
def get_project_root():
    """Return the project root directory (ROOT: two levels above this file)."""
    return ROOT
53
+
54
def setup_config():
    """
    Set up and return configuration and commonly used values.

    Returns:
        A dictionary containing configuration and common values:
        - config: The parsed YAML config
        - secrets: A dictionary of required secrets (e.g., HF_TOKEN)
        - paths: Common file paths (all relative to project root)

    Raises:
        ValueError: if the mandatory HF_TOKEN secret cannot be found.
    """
    # Load environment variables
    load_environment()

    # Load config
    config = load_config()

    # HF_TOKEN is mandatory; get_secret raises ValueError if it is missing.
    secrets = {
        'HF_TOKEN': get_secret('HF_TOKEN')
    }

    # Optional secrets, consolidated into one loop (the original duplicated
    # this try/except per key). REPLICATE_API_TOKEN is only needed by
    # score.py; the Reddit credentials are validated by scrape.py itself.
    for key in ('REPLICATE_API_TOKEN',
                'REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET', 'REDDIT_USER_AGENT'):
        try:
            secrets[key] = get_secret(key)
        except ValueError:
            pass

    # Local directories (relative to project root) plus the corresponding
    # directory names inside the HF dataset repo.
    paths = {
        'root': ROOT,
        'raw_dir': ROOT / config.get('raw_dir', 'data_raw'),
        'scored_dir': ROOT / config.get('scored_dir', 'data_scored'),
        'logs_dir': ROOT / config.get('logs_dir', 'logs'),
        'summary_file': ROOT / config.get('summary_file', 'subreddit_daily_summary.csv'),
        'hf_raw_dir': config.get('hf_raw_dir', 'data_raw'),
        'hf_scored_dir': config.get('hf_scored_dir', 'data_scored')
    }

    return {
        'config': config,
        'secrets': secrets,
        'paths': paths
    }
reddit_analysis/inference/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """
2
+ Inference subpackage for Reddit Analysis Pipeline.
3
+
4
+ Contains functionality for sentiment analysis of Reddit data.
5
+ """
reddit_analysis/inference/score.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Score Reddit posts and comments using Replicate.
4
+ CLI examples
5
+ ------------
6
+ # Score data for a specific date
7
+ python -m reddit_analysis.inference.score --date 2025-04-20
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import logging
13
+ from datetime import date, timedelta
14
+ from pathlib import Path
15
+ from typing import Optional, List, Dict, Any
16
+ import pandas as pd
17
+ import pyarrow.parquet as pq
18
+ from huggingface_hub import (
19
+ hf_hub_download,
20
+ list_repo_files,
21
+ login,
22
+ upload_file,
23
+ HfApi
24
+ )
25
+ import replicate
26
+ import json
27
+ import httpx
28
+ import re
29
+
30
+ from reddit_analysis.config_utils import setup_config
31
+
32
+ import json
33
+ import time
34
+ from typing import List, Dict
35
+
36
+ import httpx
37
+ import replicate
38
+
39
+
40
def setup_logging(logs_dir: Path) -> logging.Logger:
    """Set up logging configuration using logs_dir from config.

    Ensures logs_dir exists and routes root logging to a per-day file
    named reddit_scorer_YYYY-MM-DD.log. Returns this module's logger.
    """
    logs_dir.mkdir(parents=True, exist_ok=True)

    # One log file per calendar day.
    today = date.today().strftime('%Y-%m-%d')
    log_file = logs_dir / f"reddit_scorer_{today}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[logging.FileHandler(log_file, encoding="utf-8")],
    )

    logger = logging.getLogger(__name__)
    logger.info(f"Logging initialized. Log file: {log_file}")
    return logger
59
+
60
+
61
class ReplicateAPI:
    """Wrapper class for Replicate API interactions."""

    def __init__(self, api_token: str, model: str, timeout_s: int = 1200):
        # Replicate accepts an httpx.Timeout via the `timeout=` kwarg;
        # the same limit applies to connect/read/write/pool.
        self.client = replicate.Client(
            api_token=api_token,
            timeout=httpx.Timeout(timeout_s)
        )
        self.model = model
        self.retries = 3  # total attempts per batch
        self.logger = logging.getLogger(__name__)

    def predict(self, texts: List[str]) -> Dict[str, List[float]]:
        """Run sentiment analysis on a batch of texts.

        The payload is sent as a *JSON string*, and transient HTTP/1.1
        disconnects or read timeouts are retried with exponential backoff
        (1 s, 2 s, 4 s ...). The last failure is re-raised.
        """
        payload = {"texts": json.dumps(texts)}  # keep JSON string

        attempt = 0
        while True:
            try:
                output = self.client.run(self.model, input=payload)
                # Expected Replicate output structure
                return {
                    "predicted_labels": output.get("predicted_labels", []),
                    "confidences": output.get("confidences", []),
                }
            except (httpx.RemoteProtocolError, httpx.ReadTimeout) as err:
                if attempt == self.retries - 1:
                    raise  # re-raise on final failure
                backoff = 2 ** attempt
                self.logger.warning(f"{err!s} – retrying in {backoff}s")
                time.sleep(backoff)
                attempt += 1
97
+
98
+
99
class FileManager:
    """Wrapper class for file operations that can be mocked for testing."""

    def __init__(self, base_dir: Path):
        # Directory that holds all scored parquet files; created eagerly.
        self.base_dir = base_dir
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def save_parquet(self, df: pd.DataFrame, filename: str) -> Path:
        """Write *df* under base_dir as <filename>.parquet and return the path.

        Bug fix: the filename argument was previously not interpolated into
        the f-string, so every DataFrame was written to the same file.
        """
        path = self.base_dir / f"{filename}.parquet"
        df.to_parquet(path, index=False)
        return path

    def read_parquet(self, filename: str) -> pd.DataFrame:
        """Load base_dir/<filename> as a DataFrame (filename includes extension)."""
        path = self.base_dir / filename
        return pd.read_parquet(path)
113
+
114
class HuggingFaceManager:
    """Wrapper class for HuggingFace Hub operations that can be mocked for testing."""

    def __init__(self, token: str, repo_id: str, repo_type: str = "dataset"):
        self.token = token
        self.repo_id = repo_id
        self.repo_type = repo_type
        self.api = HfApi(token=token)

    def download_file(self, path_in_repo: str) -> Path:
        """Download one file from the repo and return its local cache path."""
        local = hf_hub_download(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            filename=path_in_repo,
            token=self.token
        )
        return Path(local)

    def upload_file(self, local_path: str, path_in_repo: str):
        """Upload a local file into the repo at *path_in_repo*."""
        self.api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=path_in_repo,
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            token=self.token
        )

    def list_files(self, prefix: str) -> List[str]:
        """List all files in the repo whose path starts with *prefix*."""
        all_files = self.api.list_repo_files(
            repo_id=self.repo_id,
            repo_type=self.repo_type
        )
        return [name for name in all_files if name.startswith(prefix)]
145
+
146
+
147
class SentimentScorer:
    """Scores one day's raw Reddit text via Replicate and publishes per-subreddit
    parquet shards to the HuggingFace Hub.

    Collaborators are injectable for testing: ReplicateAPI (model calls),
    FileManager (local parquet I/O), HuggingFaceManager (Hub transfers).
    """
    def __init__(
        self,
        cfg: Dict[str, Any],
        replicate_api: Optional[ReplicateAPI] = None,
        file_manager: Optional[FileManager] = None,
        hf_manager: Optional[HuggingFaceManager] = None
    ):
        # cfg is the dict produced by setup_config(): 'config' (settings),
        # 'secrets' (tokens), 'paths' (directories).
        self.config = cfg['config']
        self.secrets = cfg['secrets']
        self.paths = cfg['paths']
        self.logger = logging.getLogger(__name__)

        # Initialize services with dependency injection
        self.replicate_api = replicate_api or ReplicateAPI(
            api_token=self.secrets['REPLICATE_API_TOKEN'],
            model=self.config['replicate_model']
        )

        self.file_manager = file_manager or FileManager(self.paths['scored_dir'])

        self.hf_manager = hf_manager or HuggingFaceManager(
            token=self.secrets['HF_TOKEN'],
            repo_id=self.config['repo_id'],
            repo_type=self.config.get('repo_type', 'dataset')
        )

    def process_batch(self, texts: List[str]) -> tuple[List[float], List[float]]:
        """Process a batch of texts through the sentiment model."""
        result = self.replicate_api.predict(texts)
        return result['predicted_labels'], result['confidences']

    def get_existing_subreddits(self, date_str: str) -> set:
        """Get set of subreddits that already have scored files for the given date."""
        scored_files = self.hf_manager.list_files("data_scored_subreddit/")
        existing_subreddits = set()
        for fn in scored_files:
            if fn.startswith(f"data_scored_subreddit/{date_str}__") and fn.endswith('.parquet'):
                # Extract subreddit from filename: data_scored_subreddit/{date}__{subreddit}.parquet
                subreddit = Path(fn).stem.split('__', 1)[1]
                existing_subreddits.add(subreddit)
        return existing_subreddits

    def _sanitize(self, name: str) -> str:
        """
        Make subreddit safe for filenames (removes slashes, spaces, etc.).
        """
        # Lowercase + replace anything outside [word, '-', '.'] with '_'.
        name = name.strip().lower()
        name = re.sub(r"[^\w\-\.]", "_", name)
        return name

    def score_date(self, date_str: str, overwrite: bool = False) -> None:
        """Process a single date: download, score, save, and upload separate files per subreddit."""
        self.logger.info(f"Scoring date: {date_str}")

        # Get existing subreddits if not overwriting
        existing_subreddits = set()
        if not overwrite:
            existing_subreddits = self.get_existing_subreddits(date_str)
            if existing_subreddits:
                self.logger.info(f"Found {len(existing_subreddits)} existing subreddit files for {date_str}")

        # Download raw file
        raw_path = f"{self.paths['hf_raw_dir']}/{date_str}.parquet"
        local_path = self.hf_manager.download_file(raw_path)
        df = self.file_manager.read_parquet(str(local_path))

        # Validate required columns
        required_columns = {'text', 'score', 'post_id', 'subreddit'}
        missing_columns = required_columns - set(df.columns)
        if missing_columns:
            raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")

        # Filter out existing subreddits if not overwriting
        subreddits_to_process = df['subreddit'].unique()
        if not overwrite and existing_subreddits:
            subreddits_to_process = [s for s in subreddits_to_process if s not in existing_subreddits]
            if not subreddits_to_process:
                self.logger.info(f"All subreddits already processed for {date_str}")
                return
            df = df[df['subreddit'].isin(subreddits_to_process)].copy()
            self.logger.info(f"Processing {len(subreddits_to_process)} new subreddits for {date_str}")

        # Process in batches
        batch_size = self.config.get('batch_size', 16)
        texts = df['text'].tolist()
        sentiments = []
        confidences = []

        # NOTE(review): the truncation below assumes the API may return MORE
        # results than inputs; if it ever returns FEWER, the column assignment
        # further down will fail with a length mismatch — confirm upstream
        # contract of ReplicateAPI.predict.
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_sentiments, batch_confidences = self.process_batch(batch)
            sentiments.extend(batch_sentiments[:len(batch)])  # Only take as many results as input texts
            confidences.extend(batch_confidences[:len(batch)])  # Only take as many results as input texts

        # Add results to DataFrame
        df['sentiment'] = sentiments
        df['confidence'] = confidences

        # Group by subreddit and save separate files
        subreddits = df['subreddit'].unique()
        self.logger.info(f"Found {len(subreddits)} subreddits to process for {date_str}")

        for subreddit in subreddits:
            subreddit_df = df[df['subreddit'] == subreddit].copy()

            # Save scored file per subreddit using sanitized subreddit
            safe_sub = self._sanitize(subreddit)
            filename = f"{date_str}__{safe_sub}"
            scored_path = self.file_manager.save_parquet(subreddit_df, filename)

            # Upload to HuggingFace with new path structure
            path_in_repo = f"data_scored_subreddit/{date_str}__{safe_sub}.parquet"
            self.hf_manager.upload_file(str(scored_path), path_in_repo)
            self.logger.info(f"Uploaded scored file for {date_str}/{subreddit} ({len(subreddit_df)} rows) to {self.config['repo_id']}/{path_in_repo}")
262
+
263
def main(date_arg: Optional[str] = None, overwrite: bool = False) -> None:
    """CLI entry point: score the raw parquet shard for one date.

    Skips work when every subreddit present in the raw data already has a
    scored shard on the Hub (unless ``overwrite`` is set).

    Raises:
        ValueError: if ``date_arg`` is missing or REPLICATE_API_TOKEN is absent.
    """
    if date_arg is None:
        raise ValueError("Date argument is required")

    # Load configuration
    cfg = setup_config()

    # Initialize logging
    logger = setup_logging(cfg['paths']['logs_dir'])

    # Check if REPLICATE_API_TOKEN is available
    if 'REPLICATE_API_TOKEN' not in cfg['secrets']:
        raise ValueError("REPLICATE_API_TOKEN is required for scoring")

    # Initialize scorer
    scorer = SentimentScorer(cfg)

    # Check if date exists in raw files
    raw_dates = set()
    for fn in scorer.hf_manager.list_files(scorer.paths['hf_raw_dir']):
        if fn.endswith('.parquet'):
            raw_dates.add(Path(fn).stem)

    if date_arg not in raw_dates:
        logger.warning(f"No raw file found for date {date_arg}")
        return

    # Check if date already exists in scored files (check subreddit files)
    if not overwrite:
        # Get existing scored files for this date
        scored_files = scorer.hf_manager.list_files("data_scored_subreddit/")
        existing_subreddits = set()
        for fn in scored_files:
            if fn.startswith(f"data_scored_subreddit/{date_arg}__") and fn.endswith('.parquet'):
                # Extract subreddit from filename: data_scored_subreddit/{date}__{subreddit}.parquet
                subreddit = Path(fn).stem.split('__', 1)[1]
                existing_subreddits.add(subreddit)

        # Check what subreddits are in the raw data
        raw_path = f"{scorer.paths['hf_raw_dir']}/{date_arg}.parquet"
        try:
            local_path = scorer.hf_manager.download_file(raw_path)
            df = scorer.file_manager.read_parquet(str(local_path))
            raw_subreddits = set(df['subreddit'].unique())

            # If all subreddits already exist, skip processing
            if raw_subreddits.issubset(existing_subreddits):
                logger.info(f"All subreddits for date {date_arg} already scored ({len(existing_subreddits)} files)")
                return
            else:
                missing_subreddits = raw_subreddits - existing_subreddits
                logger.info(f"Some subreddits missing for {date_arg}: {missing_subreddits}")
        except Exception as e:
            # Best effort: if the pre-check fails we fall through and let
            # score_date do its own (equivalent) skip logic.
            logger.warning(f"Could not check existing subreddits for {date_arg}: {e}")

    # Score the specified date
    scorer.score_date(date_arg, overwrite)
320
+
321
if __name__ == '__main__':
    # Imported lazily so the metrics dependency is only needed for CLI runs.
    from reddit_analysis.common_metrics import run_with_metrics
    parser = argparse.ArgumentParser(description='Score raw HF dataset files via Replicate.')
    parser.add_argument('--date', type=str, required=True, help='YYYY-MM-DD date to process')
    parser.add_argument('--overwrite', action='store_true', help='Overwrite existing scored file')
    args = parser.parse_args()
    # NOTE(review): run_with_metrics presumably records job success/duration
    # under the "score" job label — see reddit_analysis.common_metrics.
    run_with_metrics("score", main, args.date, args.overwrite)
reddit_analysis/monitoring/dashboard.json ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "annotations": {
3
+ "list": [
4
+ {
5
+ "builtIn": 1,
6
+ "datasource": {
7
+ "type": "grafana",
8
+ "uid": "-- Grafana --"
9
+ },
10
+ "enable": true,
11
+ "hide": true,
12
+ "iconColor": "rgba(0, 211, 255, 1)",
13
+ "name": "Annotations & Alerts",
14
+ "type": "dashboard"
15
+ }
16
+ ]
17
+ },
18
+ "editable": true,
19
+ "fiscalYearStartMonth": 0,
20
+ "graphTooltip": 0,
21
+ "id": 1,
22
+ "links": [],
23
+ "panels": [
24
+ {
25
+ "datasource": {
26
+ "type": "prometheus",
27
+ "uid": "fejtp15n071moe"
28
+ },
29
+ "fieldConfig": {
30
+ "defaults": {
31
+ "color": {
32
+ "mode": "thresholds"
33
+ },
34
+ "mappings": [
35
+ {
36
+ "options": {
37
+ "0": {
38
+ "index": 0,
39
+ "text": "Failure"
40
+ },
41
+ "1": {
42
+ "index": 1,
43
+ "text": "Success"
44
+ }
45
+ },
46
+ "type": "value"
47
+ }
48
+ ],
49
+ "max": 1,
50
+ "min": 0,
51
+ "thresholds": {
52
+ "mode": "absolute",
53
+ "steps": [
54
+ {
55
+ "color": "red"
56
+ },
57
+ {
58
+ "color": "green",
59
+ "value": 0.5
60
+ }
61
+ ]
62
+ }
63
+ },
64
+ "overrides": []
65
+ },
66
+ "gridPos": {
67
+ "h": 6,
68
+ "w": 24,
69
+ "x": 0,
70
+ "y": 0
71
+ },
72
+ "id": 1,
73
+ "options": {
74
+ "minVizHeight": 75,
75
+ "minVizWidth": 75,
76
+ "orientation": "auto",
77
+ "reduceOptions": {
78
+ "calcs": [
79
+ "lastNotNull"
80
+ ],
81
+ "fields": "",
82
+ "values": false
83
+ },
84
+ "showThresholdLabels": false,
85
+ "showThresholdMarkers": false,
86
+ "sizing": "auto",
87
+ "text": {}
88
+ },
89
+ "pluginVersion": "11.6.1",
90
+ "targets": [
91
+ {
92
+ "datasource": {
93
+ "type": "prometheus",
94
+ "uid": "fejtp15n071moe"
95
+ },
96
+ "editorMode": "code",
97
+ "expr": "job_success{job=~\"scrape|score|summarize\"}",
98
+ "interval": "",
99
+ "legendFormat": "{{job}}",
100
+ "range": true,
101
+ "refId": "A"
102
+ }
103
+ ],
104
+ "title": "LAST RUN STATUS (Live)",
105
+ "type": "gauge"
106
+ },
107
+ {
108
+ "datasource": {
109
+ "type": "prometheus",
110
+ "uid": "fejtp15n071moe"
111
+ },
112
+ "fieldConfig": {
113
+ "defaults": {
114
+ "color": {
115
+ "mode": "palette-classic"
116
+ },
117
+ "custom": {
118
+ "axisBorderShow": false,
119
+ "axisCenteredZero": false,
120
+ "axisColorMode": "text",
121
+ "axisLabel": "",
122
+ "axisPlacement": "auto",
123
+ "barAlignment": 0,
124
+ "barWidthFactor": 0.6,
125
+ "drawStyle": "line",
126
+ "fillOpacity": 0,
127
+ "gradientMode": "none",
128
+ "hideFrom": {
129
+ "legend": false,
130
+ "tooltip": false,
131
+ "viz": false
132
+ },
133
+ "insertNulls": false,
134
+ "lineInterpolation": "linear",
135
+ "lineWidth": 1,
136
+ "pointSize": 5,
137
+ "scaleDistribution": {
138
+ "type": "linear"
139
+ },
140
+ "showPoints": "auto",
141
+ "spanNulls": false,
142
+ "stacking": {
143
+ "group": "A",
144
+ "mode": "none"
145
+ },
146
+ "thresholdsStyle": {
147
+ "mode": "off"
148
+ }
149
+ },
150
+ "mappings": [],
151
+ "thresholds": {
152
+ "mode": "absolute",
153
+ "steps": [
154
+ {
155
+ "color": "green"
156
+ },
157
+ {
158
+ "color": "red",
159
+ "value": 80
160
+ }
161
+ ]
162
+ }
163
+ },
164
+ "overrides": []
165
+ },
166
+ "gridPos": {
167
+ "h": 8,
168
+ "w": 12,
169
+ "x": 0,
170
+ "y": 6
171
+ },
172
+ "id": 3,
173
+ "options": {
174
+ "legend": {
175
+ "calcs": [],
176
+ "displayMode": "list",
177
+ "placement": "bottom",
178
+ "showLegend": true
179
+ },
180
+ "tooltip": {
181
+ "hideZeros": false,
182
+ "mode": "single",
183
+ "sort": "none"
184
+ }
185
+ },
186
+ "pluginVersion": "11.6.1",
187
+ "targets": [
188
+ {
189
+ "editorMode": "code",
190
+ "expr": "job_success{job=~\"scrape|score|summarize\"}",
191
+ "interval": "",
192
+ "legendFormat": "{{job}} state",
193
+ "range": true,
194
+ "refId": "A"
195
+ }
196
+ ],
197
+ "title": "Job Success Monitor",
198
+ "type": "timeseries"
199
+ },
200
+ {
201
+ "datasource": {
202
+ "type": "prometheus",
203
+ "uid": "fejtp15n071moe"
204
+ },
205
+ "fieldConfig": {
206
+ "defaults": {
207
+ "color": {
208
+ "mode": "palette-classic"
209
+ },
210
+ "custom": {
211
+ "axisBorderShow": false,
212
+ "axisCenteredZero": false,
213
+ "axisColorMode": "text",
214
+ "axisLabel": "",
215
+ "axisPlacement": "auto",
216
+ "barAlignment": 0,
217
+ "barWidthFactor": 0.6,
218
+ "drawStyle": "line",
219
+ "fillOpacity": 0,
220
+ "gradientMode": "none",
221
+ "hideFrom": {
222
+ "legend": false,
223
+ "tooltip": false,
224
+ "viz": false
225
+ },
226
+ "insertNulls": false,
227
+ "lineInterpolation": "linear",
228
+ "lineWidth": 1,
229
+ "pointSize": 5,
230
+ "scaleDistribution": {
231
+ "type": "linear"
232
+ },
233
+ "showPoints": "auto",
234
+ "spanNulls": false,
235
+ "stacking": {
236
+ "group": "A",
237
+ "mode": "none"
238
+ },
239
+ "thresholdsStyle": {
240
+ "mode": "off"
241
+ }
242
+ },
243
+ "mappings": [],
244
+ "thresholds": {
245
+ "mode": "absolute",
246
+ "steps": [
247
+ {
248
+ "color": "green"
249
+ },
250
+ {
251
+ "color": "red",
252
+ "value": 80
253
+ }
254
+ ]
255
+ }
256
+ },
257
+ "overrides": []
258
+ },
259
+ "gridPos": {
260
+ "h": 8,
261
+ "w": 12,
262
+ "x": 12,
263
+ "y": 6
264
+ },
265
+ "id": 2,
266
+ "options": {
267
+ "legend": {
268
+ "calcs": [],
269
+ "displayMode": "list",
270
+ "placement": "bottom",
271
+ "showLegend": true
272
+ },
273
+ "tooltip": {
274
+ "hideZeros": false,
275
+ "mode": "single",
276
+ "sort": "none"
277
+ }
278
+ },
279
+ "pluginVersion": "11.6.1",
280
+ "targets": [
281
+ {
282
+ "editorMode": "code",
283
+ "expr": "job_duration_seconds{job=~\"scrape|score|summarize\"}",
284
+ "interval": "",
285
+ "legendFormat": "{{job}} duration (seconds)",
286
+ "range": true,
287
+ "refId": "A"
288
+ }
289
+ ],
290
+ "title": "Wall Clock Time Monitor",
291
+ "type": "timeseries"
292
+ }
293
+ ],
294
+ "preload": false,
295
+ "schemaVersion": 41,
296
+ "tags": [],
297
+ "templating": {
298
+ "list": []
299
+ },
300
+ "time": {
301
+ "from": "now-20m",
302
+ "to": "now"
303
+ },
304
+ "timepicker": {},
305
+ "timezone": "browser",
306
+ "title": "Pipeline Health",
307
+ "uid": "aejtpwaxibk00d",
308
+ "version": 11
309
+ }
reddit_analysis/monitoring/dashboard_failure.png ADDED
reddit_analysis/monitoring/dashboard_success.png ADDED
reddit_analysis/monitoring/docker-compose.yml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ pushgateway:
3
+ image: prom/pushgateway
4
+ ports: ["9091:9091"]
5
+
6
+ prometheus:
7
+ image: prom/prometheus
8
+ command:
9
+ - "--config.file=/etc/prometheus/prometheus.yml"
10
+ volumes:
11
+ - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
12
+ ports: ["9090:9090"]
13
+ depends_on: [pushgateway]
14
+
15
+ grafana:
16
+ image: grafana/grafana
17
+ environment:
18
+ - GF_SECURITY_ADMIN_PASSWORD=admin
19
+ - GF_USERS_ALLOW_SIGN_UP=false
20
+ ports: ["3000:3000"]
21
+ depends_on: [prometheus]
reddit_analysis/monitoring/prometheus.yml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ global:
2
+ scrape_interval: 15s
3
+
4
+ scrape_configs:
5
+ - job_name: pushgateway
6
+ honor_labels: true
7
+ static_configs:
8
+ - targets: ['pushgateway:9091']
reddit_analysis/scraper/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """
2
+ Scraper subpackage for Reddit Analysis Pipeline.
3
+
4
+ Contains functionality for scraping Reddit data.
5
+ """
reddit_analysis/scraper/scrape.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Scrape Reddit posts and comments.
4
+ CLI examples
5
+ ------------
6
+ # Scrape data for a specific date
7
+ python -m reddit_analysis.scraper.scrape --date 2025-04-20
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import os
13
+ import sys
14
+ from datetime import datetime, timedelta
15
+ from pathlib import Path
16
+ from typing import Optional, List, Dict, Any
17
+
18
+ import pandas as pd
19
+ import pyarrow.parquet as pq
20
+ from huggingface_hub import (
21
+ hf_hub_download,
22
+ list_repo_files,
23
+ login,
24
+ upload_file,
25
+ HfApi
26
+ )
27
+ import praw
28
+ import logging
29
+ import pytz
30
+ from tqdm import tqdm
31
+
32
+ from reddit_analysis.config_utils import setup_config
33
+
34
class RedditAPI:
    """Wrapper class for Reddit API interactions that can be mocked for testing."""
    def __init__(self, client_id: str, client_secret: str, user_agent: str):
        # praw handles OAuth and request pacing internally.
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )

    def get_subreddit(self, name: str):
        """Return the praw Subreddit handle for ``name``."""
        return self.reddit.subreddit(name)

    def get_rate_limit_info(self) -> Dict[str, Any]:
        """Snapshot praw's rate-limit bookkeeping.

        NOTE(review): values may be None before the first authenticated
        request — callers should treat missing entries as "Unknown".
        """
        return {
            'used': self.reddit.auth.limits.get('used'),
            'remaining': self.reddit.auth.limits.get('remaining'),
            'reset_timestamp': self.reddit.auth.limits.get('reset_timestamp')
        }
52
+
53
class FileManager:
    """Wrapper class for file operations that can be mocked for testing."""

    def __init__(self, base_dir: Path):
        # Create the output directory eagerly so writes cannot fail on a
        # missing parent.
        self.base_dir = base_dir
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def save_csv(self, df: pd.DataFrame, filename: str) -> Path:
        """Write ``df`` as ``<filename>.csv`` under ``base_dir``; return the path.

        BUG FIX: the filename placeholder had been lost ("(unknown)"), so every
        DataFrame collided on one literal file name. Interpolate the stem.
        """
        path = self.base_dir / f"{filename}.csv"
        df.to_csv(path, index=False)
        return path

    def save_parquet(self, df: pd.DataFrame, filename: str) -> Path:
        """Write ``df`` as ``<filename>.parquet`` under ``base_dir``; return the path."""
        path = self.base_dir / f"{filename}.parquet"
        df.to_parquet(path, index=False)
        return path

    def read_parquet(self, filename: str) -> pd.DataFrame:
        """Read ``<filename>.parquet`` from ``base_dir``."""
        path = self.base_dir / f"{filename}.parquet"
        return pd.read_parquet(path)
72
+
73
class HuggingFaceManager:
    """Wrapper class for HuggingFace Hub operations that can be mocked for testing."""

    def __init__(self, token: str, repo_id: str, repo_type: str = "dataset"):
        self.token = token
        self.repo_id = repo_id
        self.repo_type = repo_type
        self.api = HfApi(token=token)

    def download_file(self, path_in_repo: str) -> Path:
        """Download one file from the configured repo; returns the local cache path."""
        return Path(hf_hub_download(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            filename=path_in_repo,
            token=self.token
        ))

    def upload_file(self, local_path: str, path_in_repo: str):
        """Upload ``local_path`` to ``path_in_repo`` inside the configured repo."""
        self.api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=path_in_repo,
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            token=self.token
        )

    def list_files(self, prefix: str) -> List[str]:
        """Return repo file paths that start with ``prefix``.

        BUG FIX: the previous implementation ignored ``prefix`` and returned
        every file in the repo, unlike the identically-named helper in
        ``reddit_analysis/inference/score.py``. Filter client-side to honour
        the declared contract.
        """
        files = self.api.list_repo_files(
            repo_id=self.repo_id,
            repo_type=self.repo_type
        )
        return [file for file in files if file.startswith(prefix)]
103
+
104
class RedditScraper:
    """Scrapes configured subreddits, saves a daily CSV locally and optionally
    uploads a deduplicated parquet shard to the HuggingFace Hub.

    Collaborators are injectable for testing: RedditAPI (praw), FileManager
    (local I/O), HuggingFaceManager (Hub transfers).
    """
    def __init__(
        self,
        cfg: Dict[str, Any],
        reddit_api: Optional[RedditAPI] = None,
        file_manager: Optional[FileManager] = None,
        hf_manager: Optional[HuggingFaceManager] = None
    ):
        # cfg is the dict produced by setup_config(): 'config' (settings),
        # 'secrets' (tokens), 'paths' (directories).
        self.config = cfg['config']
        self.secrets = cfg['secrets']
        self.paths = cfg['paths']
        self.logger = logging.getLogger(__name__)

        # Initialize services with dependency injection
        self.reddit_api = reddit_api or RedditAPI(
            client_id=self.secrets.get('REDDIT_CLIENT_ID'),
            client_secret=self.secrets.get('REDDIT_CLIENT_SECRET'),
            user_agent=self.secrets.get('REDDIT_USER_AGENT')
        )

        self.file_manager = file_manager or FileManager(self.paths['raw_dir'])

        # The HF manager is only constructed when uploads are enabled;
        # otherwise whatever was injected (possibly None) is kept.
        if self.config.get('push_to_hf', False):
            self.hf_manager = hf_manager or HuggingFaceManager(
                token=self.secrets.get('HF_TOKEN'),
                repo_id=self.config.get('repo_id'),
                repo_type=self.config.get('repo_type', 'dataset')
            )
        else:
            self.hf_manager = hf_manager

        self.timezone = pytz.timezone(self.config['timezone'])
        self.logger.info(f"Output directory set to: {self.paths['raw_dir']}")

    def get_posts(self, subreddit_config: Dict[str, Any]) -> pd.DataFrame:
        """Fetch posts and comments from a subreddit based on configuration.

        Returns a DataFrame with columns: subreddit, created_at, retrieved_at,
        type ("post"/"comment"), text, score, post_id, parent_id.
        """
        subreddit_name = subreddit_config['name']
        post_limit = subreddit_config['post_limit']
        comment_limit = subreddit_config['comment_limit']
        retrieved_at = datetime.now(self.timezone)
        records = []

        subreddit = self.reddit_api.get_subreddit(subreddit_name)

        self.logger.info(f"Fetching {post_limit} posts from r/{subreddit_name}")

        for submission in tqdm(
            subreddit.top(time_filter="day", limit=post_limit),
            total=post_limit,
            desc=f"Processing r/{subreddit_name}"
        ):
            # Add post record
            records.append({
                "subreddit": subreddit_name,
                "created_at": datetime.fromtimestamp(submission.created_utc, tz=self.timezone),
                "retrieved_at": retrieved_at,
                "type": "post",
                "text": submission.title + "\n\n" + submission.selftext,
                "score": submission.score,
                "post_id": submission.id,
                "parent_id": None
            })

            # Get top comments if comment_limit > 0
            if comment_limit > 0:
                submission.comment_sort = 'top'
                submission.comments.replace_more(limit=0)
                # NOTE(review): '_comments' is a private praw attribute; this
                # avoids flattening the forest but may break on praw upgrades —
                # confirm against the pinned praw version.
                comments = getattr(submission.comments, '_comments', [])[:comment_limit]
                for comment in comments:
                    records.append({
                        "subreddit": subreddit_name,
                        "created_at": datetime.fromtimestamp(comment.created_utc, tz=self.timezone),
                        "retrieved_at": retrieved_at,
                        "type": "comment",
                        "text": comment.body,
                        "score": comment.score,
                        "post_id": comment.id,
                        "parent_id": comment.parent_id
                    })

        return pd.DataFrame(records)

    def print_rate_limit_info(self):
        """Print current Reddit API rate limit information."""
        limits = self.reddit_api.get_rate_limit_info()
        reset_ts = limits.get('reset_timestamp')
        reset_time = (
            datetime.fromtimestamp(reset_ts, tz=self.timezone)
            .strftime("%Y-%m-%d %I:%M:%S %p %Z")
            if reset_ts else "Unknown"
        )

        self.logger.info("Reddit API Rate Limit Info")
        self.logger.info(f"Requests used: {limits.get('used')}")
        self.logger.info(f"Requests remaining: {limits.get('remaining')}")
        self.logger.info(f"Resets at: {reset_time}")

    def process_date(self, date_str: str) -> None:
        """Process data for a specific date: scrape all configured subreddits,
        save a combined CSV, and optionally upload to the Hub."""
        self.logger.info(f"Processing data for date: {date_str}")

        all_records = []
        for sub_cfg in self.config['subreddits']:
            self.logger.info(f"Processing subreddit: {sub_cfg['name']}")
            df = self.get_posts(sub_cfg)
            all_records.append(df)

        combined_df = pd.concat(all_records, ignore_index=True)
        self.logger.info(f"Total records collected: {len(combined_df)}")

        # Save to CSV
        self.file_manager.save_csv(combined_df, date_str)

        # Upload to HuggingFace if configured
        if self.config.get('push_to_hf', False):
            self._upload_to_hf(combined_df, date_str)

        self.print_rate_limit_info()
        self.logger.info("Reddit scraper completed successfully")

    def _upload_to_hf(self, df: pd.DataFrame, date_str: str) -> None:
        """Upload data to HuggingFace Hub, dropping rows whose post_id already
        appears in the previous day's shard (best-effort dedup)."""
        try:
            current_date = datetime.strptime(date_str, "%Y-%m-%d")
            prev_date = (current_date - timedelta(days=1)).strftime("%Y-%m-%d")
            prev_file_path = f"{self.paths['hf_raw_dir']}/{prev_date}.parquet"

            self.logger.info(f"Checking for previous day's file: {prev_file_path}")
            try:
                downloaded_path = self.hf_manager.download_file(prev_file_path)
                existing_df = pd.read_parquet(downloaded_path)
                existing_ids = set(existing_df["post_id"].tolist())
                # Remove the cached download once the IDs have been extracted.
                Path(downloaded_path).unlink()

                original_count = len(df)
                df = df[~df["post_id"].isin(existing_ids)]
                filtered_count = len(df)
                self.logger.info(f"Filtered {original_count - filtered_count} duplicates")

                if df.empty:
                    self.logger.info("No new posts to upload after deduplication")
                    return

            except Exception as e:
                # Missing previous shard (e.g. first run) is expected; upload
                # proceeds without dedup.
                self.logger.warning(f"Could not fetch/process previous file: {e}")

            parquet_path = self.file_manager.save_parquet(df, date_str)
            path_in_repo = f"{self.paths['hf_raw_dir']}/{date_str}.parquet"
            self.hf_manager.upload_file(str(parquet_path), path_in_repo)
            self.logger.info(f"Uploaded {len(df)} rows for {date_str} → {path_in_repo}")
        except Exception as e:
            self.logger.error(f"Failed to upload to Hugging Face: {e}")
            raise
257
+
258
def setup_logging(logs_dir: Path) -> logging.Logger:
    """Configure root logging to a dated file under ``logs_dir`` and return a logger."""
    # The file handler needs an existing directory before it can open the file.
    logs_dir.mkdir(parents=True, exist_ok=True)

    # One log file per calendar day.
    today = datetime.now().strftime('%Y-%m-%d')
    log_file = logs_dir / f"reddit_scraper_{today}.log"

    file_handler = logging.FileHandler(log_file, encoding="utf-8")
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[file_handler],
    )

    logger = logging.getLogger(__name__)
    logger.info(f"Logging initialized. Log file: {log_file}")
    return logger
277
+
278
def main(date_str: Optional[str] = None) -> None:
    """CLI entry point: validate credentials, then scrape ``date_str``
    (defaults to today in the configured timezone).

    Raises:
        ValueError: if any required credential is missing.
    """
    # Load configuration first
    cfg = setup_config()

    # Initialize logging with configured logs_dir
    logs_dir = cfg['paths']['logs_dir']
    logger = setup_logging(logs_dir)
    logger.info("Starting Reddit scraper...")

    # Validate environment variables
    required_env_vars = ["REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT"]
    if cfg['config'].get('push_to_hf', False):
        required_env_vars.append("HF_TOKEN")
    # A credential may come either from the secrets file or the environment.
    missing = [v for v in required_env_vars if not cfg['secrets'].get(v) and not os.getenv(v)]
    if missing:
        logger.error(f"Missing required environment variables: {', '.join(missing)}")
        raise ValueError(f"Missing required environment variables: {', '.join(missing)}")

    # Instantiate and run
    logger.info("Initializing Reddit scraper...")
    scraper = RedditScraper(cfg)

    if date_str is None:
        date_str = datetime.now(pytz.timezone(cfg['config']['timezone'])).strftime("%Y-%m-%d")

    scraper.process_date(date_str)
304
+
305
if __name__ == "__main__":
    # Imported lazily so the metrics dependency is only needed for CLI runs.
    from reddit_analysis.common_metrics import run_with_metrics
    parser = argparse.ArgumentParser(description='Scrape Reddit posts and comments.')
    parser.add_argument('--date', type=str, help='YYYY-MM-DD date to process')
    args = parser.parse_args()
    # NOTE(review): run_with_metrics presumably records job success/duration
    # under the "scrape" job label — see reddit_analysis.common_metrics.
    run_with_metrics("scrape", main, args.date)
reddit_analysis/summarizer/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """
2
+ Summarizer subpackage for Reddit Analysis Pipeline.
3
+
4
+ Contains functionality for summarizing Reddit data analysis.
5
+ """
reddit_analysis/summarizer/aggregator.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pure‑function helpers for daily aggregation."""
2
+
3
+ from __future__ import annotations
4
+ import pandas as pd
5
+ import numpy as np
6
+
7
+
8
+ def summary_from_df(df: pd.DataFrame, gamma_post: float = 0.3) -> pd.DataFrame:
9
+ """
10
+ Return a DataFrame with daily & subreddit aggregates.
11
+
12
+ Expects columns:
13
+ retrieved_at - UTC timestamp or ISO-date string
14
+ subreddit - subreddit name
15
+ sentiment - numeric score (e.g. −1 … 1)
16
+ score - numeric weight / post score
17
+
18
+ Output columns:
19
+ date (datetime.date)
20
+ subreddit (string)
21
+ mean_sentiment
22
+ community_weighted_sentiment
23
+ count
24
+ """
25
+ # Normalize retrieved_at to datetime and extract calendar day
26
+ df = df.copy()
27
+ df["date"] = pd.to_datetime(df["retrieved_at"]).dt.date
28
+
29
+ # Group by date and subreddit
30
+ grouped = df.groupby(["date", "subreddit"])
31
+
32
+ # Aggregate metrics
33
+ result = grouped.agg(
34
+ # First calculate raw mean_sentiment
35
+ raw_mean_sentiment=("sentiment", "mean"),
36
+ count=("sentiment", "count"),
37
+ ).reset_index()
38
+
39
+ # Apply transformation to raw_mean_sentiment to get values in range [-1, 1] instead of [0, 1]
40
+ result["mean_sentiment"] = 2 * result["raw_mean_sentiment"] - 1
41
+
42
+ # Remove the raw mean column
43
+ result = result.drop(columns="raw_mean_sentiment")
44
+
45
+ # Calculate engagement-adjusted sentiment (EAS) for each group
46
+ # 1. Ensure 'score' is numeric
47
+ df["score_num"] = pd.to_numeric(df["score"], errors="coerce").fillna(0)
48
+ # 2. Compute base weights (1 + log1p(score))
49
+ weights_base = 1 + np.log1p(df["score_num"].clip(lower=0))
50
+ # 3. Apply post weight multiplier
51
+ weights = weights_base * np.where(df.get("type", None) == "post", gamma_post, 1.0)
52
+ df["weight"] = weights
53
+ # 4. Compute EAS per group: weighted average of sentiment
54
+ community_weighted_sentiments = []
55
+ for (date, subreddit), group in grouped:
56
+ w = group["weight"]
57
+ s = group["sentiment"]
58
+ eas = (w * s).sum() / w.sum() if w.sum() > 0 else 0
59
+ community_weighted_sentiments.append(eas)
60
+ result["community_weighted_sentiment"] = community_weighted_sentiments
61
+
62
+ # Normalize community_weighted_sentiment to range [-1,1]
63
+ result["community_weighted_sentiment"] = 2 * result["community_weighted_sentiment"] - 1
64
+
65
+ # Ensure consistent column order
66
+ result = result[["date", "subreddit", "mean_sentiment", "community_weighted_sentiment", "count"]]
67
+
68
+ return result
reddit_analysis/summarizer/summarize.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Summarise scored shards into one daily_summary.csv
4
+
5
+ CLI examples
6
+ ------------
7
+ # Summarize data for a specific date
8
+ python -m reddit_analysis.summarizer.summarize --date 2025-04-20
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ from datetime import date
14
+ from pathlib import Path
15
+ from typing import Optional, List, Dict, Any, Set, Tuple
16
+
17
+ import pandas as pd
18
+ from huggingface_hub import hf_hub_download, HfApi
19
+
20
+ from reddit_analysis.config_utils import setup_config
21
+ from reddit_analysis.summarizer.aggregator import summary_from_df
22
+
23
+
24
+ # --------------------------------------------------------------------------- #
25
+ # Utilities #
26
+ # --------------------------------------------------------------------------- #
27
class FileManager:
    """Local file I/O helper; kept thin so tests can swap in a mock."""

    def __init__(self, base_dir: Path):
        # Remember the working directory and guarantee it exists up front.
        self.base_dir = base_dir
        self.base_dir.mkdir(parents=True, exist_ok=True)

    # ---------- CSV helpers ------------------------------------------------- #
    def read_csv(self, path: Path) -> pd.DataFrame:
        """Read a summary CSV; return an empty, correctly-shaped frame when the
        file is absent or zero bytes long."""
        file_is_usable = path.exists() and path.stat().st_size > 0
        if not file_is_usable:
            summary_columns = ["date", "subreddit",
                               "mean_sentiment", "community_weighted_sentiment", "count"]
            return pd.DataFrame(columns=summary_columns)
        return pd.read_csv(path)

    def write_csv(self, df: pd.DataFrame, path: Path) -> Path:
        """Write `df` to `path` (no index column) and hand the path back."""
        df.to_csv(path, index=False)
        return path

    # ---------- Parquet helper --------------------------------------------- #
    @staticmethod
    def read_parquet(path: Path) -> pd.DataFrame:
        """Load one parquet shard into a DataFrame."""
        return pd.read_parquet(path)
50
+
51
+
52
class HuggingFaceManager:
    """Minimal facade over Hugging Face Hub file operations (easy to mock)."""

    def __init__(self, token: str, repo_id: str, repo_type: str = "dataset"):
        # Keep the credentials/repo coordinates around for every call.
        self.token = token
        self.repo_id = repo_id
        self.repo_type = repo_type
        self.api = HfApi(token=token)

    def download_file(self, path_in_repo: str) -> Path:
        """Fetch a single repo file and return its local cache path."""
        cached = hf_hub_download(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            filename=path_in_repo,
            token=self.token,
        )
        return Path(cached)

    def upload_file(self, local_path: str, path_in_repo: str):
        """Push a local file to `path_in_repo` inside the configured repo."""
        self.api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=path_in_repo,
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            token=self.token,
        )

    def list_files(self, prefix: str) -> List[str]:
        """Return every file in the repo whose path starts with `prefix`."""
        repo_contents = self.api.list_repo_files(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
        )
        return [name for name in repo_contents if name.startswith(prefix)]
86
+
87
+
88
+ # --------------------------------------------------------------------------- #
89
+ # Core manager #
90
+ # --------------------------------------------------------------------------- #
91
class SummaryManager:
    """End-to-end workflow for building the daily sentiment summary.

    Pulls scored parquet shards for one calendar date from the HF Hub,
    aggregates them via `summary_from_df`, merges the result with the
    historical `daily_summary.csv`, and pushes the updated file back.
    """

    def __init__(
        self,
        cfg: Dict[str, Any],
        file_manager: Optional[FileManager] = None,
        hf_manager: Optional[HuggingFaceManager] = None
    ):
        """Split `cfg` into config/secrets/paths and build (or accept
        injected) I/O helpers."""
        self.config = cfg["config"]
        self.secrets = cfg["secrets"]
        self.paths = cfg["paths"]

        # I/O helpers — injectable so tests can supply mocks.
        self.file_manager = file_manager or FileManager(self.paths["root"])
        self.hf_manager = hf_manager or HuggingFaceManager(
            token=self.secrets["HF_TOKEN"],
            repo_id=self.config["repo_id"],
            repo_type=self.config.get("repo_type", "dataset"),
        )

        # Local cache location of the combined summary file.
        self.local_summary_path: Path = self.paths["summary_file"]

    # --------------------------------------------------------------------- #
    # Remote summary helpers                                                #
    # --------------------------------------------------------------------- #
    def _load_remote_summary(self) -> pd.DataFrame:
        """
        Download the latest `daily_summary.csv` from the HF Hub (if it
        exists) and return it as a DataFrame; on the first run — when no
        remote file exists yet — return an empty, correctly-shaped frame.
        """
        remote_name = self.paths["summary_file"].name

        try:
            cached_path = self.hf_manager.download_file(remote_name)
        except Exception:
            # first run – file doesn't exist yet on the Hub
            return pd.DataFrame(
                columns=["date", "subreddit",
                         "mean_sentiment", "community_weighted_sentiment", "count"]
            )

        return pd.read_csv(cached_path)

    def _save_and_push_summary(self, df: pd.DataFrame):
        """Persist the updated summary both locally and back to HF Hub."""
        self.file_manager.write_csv(df, self.local_summary_path)
        self.hf_manager.upload_file(str(self.local_summary_path),
                                    self.local_summary_path.name)

    # --------------------------------------------------------------------- #
    # Public helpers                                                        #
    # --------------------------------------------------------------------- #
    def get_processed_combinations(self) -> Set[Tuple[date, str]]:
        """
        Return the set of (date, subreddit) pairs already present in the
        remote summary, used for de-duplication.
        """
        df_summary = self._load_remote_summary()
        if df_summary.empty:
            return set()

        df_summary["date"] = pd.to_datetime(df_summary["date"]).dt.date
        # zip over the two columns is O(n) without the per-row overhead of
        # iterrows().
        return set(zip(df_summary["date"], df_summary["subreddit"]))

    # --------------------------------------------------------------------- #
    # Main workflow                                                         #
    # --------------------------------------------------------------------- #
    def process_date(self, date_str: str, overwrite: bool = False) -> None:
        """Download scored data for `date_str`, aggregate, and append/upload.

        Raises:
            ValueError: if the concatenated scored data lacks any of the
                columns the aggregator needs.
        """
        # ---------- Pull scored shards for the given date ------------------ #
        prefix = f"{self.paths['hf_scored_dir']}/{date_str}__"
        # List all remote shards
        try:
            all_files = self.hf_manager.list_files(self.paths['hf_scored_dir'])
        except Exception as err:
            print(f"Error: could not list scored shards in {self.paths['hf_scored_dir']}: {err}")
            return

        # Filter to shards matching this date
        try:
            shards = [fn for fn in all_files if fn.startswith(prefix) and fn.endswith('.parquet')]
        except TypeError:
            # fall back in case list_files returned a non-iterable (e.g., a mock)
            shards = [all_files]

        if not shards:
            print(f"No scored shards found for {date_str} under {self.paths['hf_scored_dir']}")
            return

        # Download and concatenate all shards
        dfs: List[pd.DataFrame] = []
        for shard in shards:
            try:
                local_path = self.hf_manager.download_file(shard)
            except Exception as err:
                print(f"Error: could not download scored shard {shard}: {err}")
                return
            dfs.append(self.file_manager.read_parquet(local_path))
        df_day = pd.concat(dfs, ignore_index=True)

        # sanity-check: the aggregator needs all of these columns
        required_cols = {"retrieved_at", "subreddit", "sentiment", "score"}
        missing_cols = required_cols - set(df_day.columns)
        if missing_cols:
            # Report only the columns that are actually absent. (The previous
            # message named shards[0] and printed the full required set, which
            # was misleading for multi-shard days.)
            raise ValueError(
                f"Scored data for {date_str} missing columns {sorted(missing_cols)}"
            )

        # ---------- Aggregate ------------------------------------------------ #
        df_summary_day = summary_from_df(df_day)

        # ---------- De-duplication / overwrite ------------------------------ #
        existing_pairs = self.get_processed_combinations()
        if not overwrite:
            df_summary_day = df_summary_day[
                ~df_summary_day.apply(
                    lambda r: (r["date"], r["subreddit"]) in existing_pairs,
                    axis=1,
                )
            ]
            if df_summary_day.empty:
                print("Nothing new to summarise for this date.")
                return

        # ---------- Combine with historical summary ------------------------- #
        df_summary = self._load_remote_summary()
        if overwrite:
            df_summary = df_summary[df_summary["date"] != date_str]

        # Remove legacy weighted_sentiment column if it exists
        if "weighted_sentiment" in df_summary.columns:
            df_summary = df_summary.drop(columns=["weighted_sentiment"])

        df_out = (
            pd.concat([df_summary, df_summary_day], ignore_index=True)
            if not df_summary.empty
            else df_summary_day
        )
        df_out["date"] = pd.to_datetime(df_out["date"]).dt.date
        df_out.sort_values(["date", "subreddit"], inplace=True)

        # Ensure the weighted_sentiment column is dropped from final output
        if "weighted_sentiment" in df_out.columns:
            df_out = df_out.drop(columns=["weighted_sentiment"])

        # Round floating point columns to 4 decimal places
        if "mean_sentiment" in df_out.columns:
            df_out["mean_sentiment"] = df_out["mean_sentiment"].round(4)
        if "community_weighted_sentiment" in df_out.columns:
            df_out["community_weighted_sentiment"] = df_out["community_weighted_sentiment"].round(4)

        # ---------- Save & upload ------------------------------------------- #
        self._save_and_push_summary(df_out)
        print(f"Updated {self.local_summary_path.name} → {len(df_out)} rows")
245
+
246
+
247
+ # --------------------------------------------------------------------------- #
248
+ # CLI entry‑point #
249
+ # --------------------------------------------------------------------------- #
250
def main(date_str: str, overwrite: bool = False) -> None:
    """Validate `date_str` and run the summary pipeline for that day.

    Args:
        date_str: calendar date in ISO ``YYYY-MM-DD`` form.
        overwrite: replace any existing summary rows for that date.

    Raises:
        ValueError: if `date_str` is empty or not a valid ISO date.
    """
    if not date_str:
        raise ValueError("--date is required (YYYY-MM-DD)")

    # Reject anything that is not a real calendar date before doing any I/O.
    try:
        date.fromisoformat(date_str)
    except ValueError:
        # ASCII hyphens here: the previous message used U+2011 non-breaking
        # hyphens, inconsistent with the format actually accepted (and with
        # the --date help text).
        raise ValueError(f"Invalid date: {date_str} (expected YYYY-MM-DD)")

    cfg = setup_config()
    SummaryManager(cfg).process_date(date_str, overwrite)


if __name__ == "__main__":
    from reddit_analysis.common_metrics import run_with_metrics
    parser = argparse.ArgumentParser(
        description="Summarize scored Reddit data for a specific date."
    )
    parser.add_argument("--date", required=True,
                        help="YYYY-MM-DD date to process")
    parser.add_argument("--overwrite", action="store_true",
                        help="Replace any existing rows for this date")
    args = parser.parse_args()
    run_with_metrics("summarize", main, args.date, args.overwrite)
reddit_analysis/test_config.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Test script for config_utils module.
4
+ This allows us to verify that our common configuration loading works properly.
5
+ """
6
+ import argparse
7
+ import os
8
+ from pprint import pprint
9
+ import reddit_analysis.config_utils as config_utils
10
+
11
def main():
    """Smoke-test config_utils: load the configuration and report, section by
    section, what it contains and which secrets/directories are available."""
    print("Testing config_utils.py")

    # Load everything once and alias the three sections for brevity.
    settings = config_utils.setup_config()
    paths = settings['paths']
    conf = settings['config']
    secrets = settings['secrets']

    print("\nConfiguration:")
    print("--------------")
    print(f"Project root: {paths['root']}")
    print(f"Repo ID: {conf.get('repo_id', 'Not specified')}")

    print("\nLocal Directory Paths:")
    print("--------------------")
    for label, key in [("Raw data directory", 'raw_dir'),
                       ("Scored data directory", 'scored_dir'),
                       ("Logs directory", 'logs_dir'),
                       ("Summary file", 'summary_file')]:
        print(f"{label}: {paths[key]}")

    print("\nHugging Face Repository Paths:")
    print("---------------------------")
    print(f"HF Raw data directory: {paths['hf_raw_dir']}")
    print(f"HF Scored data directory: {paths['hf_scored_dir']}")

    print("\nDirectory Status:")
    print("----------------")
    for dir_name, dir_path in [('Raw data', paths['raw_dir']),
                               ('Scored data', paths['scored_dir']),
                               ('Logs', paths['logs_dir'])]:
        status = "Exists" if os.path.exists(dir_path) else "Does not exist"
        print(f"{dir_name} directory ({dir_path}): {status}")

    summary_status = "Exists" if os.path.exists(paths['summary_file']) else "Does not exist"
    print(f"Summary file ({paths['summary_file']}): {summary_status}")

    # Report secret presence only — never the values themselves.
    print("\nSecrets available:")
    print("-----------------")
    for name in ('HF_TOKEN', 'REPLICATE_API_TOKEN'):
        print(f"{name}: {'Present' if name in secrets else 'Missing'}")

    # Reddit credentials may come from the secrets store OR the environment.
    for key in ['REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET', 'REDDIT_USER_AGENT']:
        present = key in secrets or os.getenv(key)
        print(f"{key}: {'Present' if present else 'Missing'}")

    if 'subreddits' in conf:
        print("\nConfigured subreddits:")
        print("---------------------")
        for sub in conf['subreddits']:
            print(f"- {sub.get('name', 'unnamed')}: {sub.get('post_limit', 'N/A')} posts, "
                  f"{sub.get('comment_limit', 'N/A')} comments")

if __name__ == "__main__":
    main()
reddit_analysis/tests/README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### `test_config_utils.py`
2
+ - **Functions under test**
3
+ - `load_config(path)` — reads settings from a YAML file.
4
+ - `get_secret(key)` — retrieves a secret first from `os.environ`, then from `streamlit.secrets`, else raises.
5
+ - **Patching & mocking**
6
+ - Environment variables via `os.environ` or `monkeypatch.setenv()` / `monkeypatch.delenv()`.
7
+ - `reddit_analysis.config_utils.HAS_STREAMLIT` toggled to simulate presence of Streamlit.
8
+ - `streamlit.secrets` replaced with a `MockSecrets` object exposing a `.get(key)` method.
9
+ - **Example inputs**
10
+ - A temporary `config.yaml` with keys like `repo_id: test/repo`, `batch_size: 16`, `replicate_model: test/model`.
11
+ - Secret key `"TEST_SECRET"` set in `os.environ` or returned by `MockSecrets.get()`.
12
+ - Missing secret scenario triggers `ValueError("Required secret TEST_SECRET not found…")`.
13
+
14
+ ---
15
+
16
+ ### `test_scrape.py`
17
+ - **Methods under test**
18
+ - `RedditScraper.get_posts(subreddit)` — calls PRAW client’s `.subreddit(...).top()` and returns a DataFrame with columns `post_id, title, text, score, subreddit, created_utc, url, num_comments`.
19
+ - `RedditScraper.upload_to_hf(df, date)` — downloads existing parquet via `hf_hub_download`, deduplicates by `post_id`, then calls `hf_api.upload_file(...)`.
20
+ - `main(date)` CLI — loads config, checks for Reddit credentials, raises if missing.
21
+ - **Patching & mocking**
22
+ - A fake PRAW client (`mock_reddit_client`) whose `.subreddit().top()` yields two `Mock` submissions (ids `post0`, `post1`).
23
+ - `hf_hub_download` patched to return a path for a “previous” parquet file containing `prev_df`.
24
+ - `mock_hf_api.upload_file` to capture the uploaded parquet path.
25
+ - Environment via `monkeypatch` and `reddit_analysis.config_utils.HAS_STREAMLIT` + `streamlit.secrets`.
26
+ - **Example inputs**
27
+ - **`get_posts`** uses two submissions with `id='post0'`, `title='Test Post 0'`, etc., expecting a 2‑row DataFrame.
28
+ - **`upload_to_hf`** combines `prev_df` (posts 0 & 1) with `new_df` (posts 1 & 2), resulting in only `post1` & `post2` uploaded.
29
+ - **CLI** invoked with no Reddit env vars, raising `ValueError("Missing required Reddit API credentials")`.
30
+
31
+ ---
32
+
33
+ ### `test_summarize.py`
34
+ - **Methods under test**
35
+ - `RedditSummarizer.summarize_date(date)` — downloads scored parquet, groups by `subreddit`, and computes `mean_sentiment`, `count`, `total_score`, `weighted_sentiment`, plus `date`.
36
+ - `RedditSummarizer.update_summary(df)` — appends to or creates `summary_file`, preserving chronological order.
37
+ - CLI entrypoint in `main(date)` — validates date format or scored-file existence.
38
+ - **Patching & mocking**
39
+ - `hf_hub_download` patched to return a temp parquet containing `sample_scored_data` (4 rows for two subreddits).
40
+ - `reddit_analysis.config_utils.HAS_STREAMLIT` and `streamlit.secrets.get(...)` for missing-file tests.
41
+ - **Example inputs & expectations**
42
+ - **`summarize_date`**:
43
+ ```python
44
+ sample_scored_data = pd.DataFrame({
45
+ 'subreddit': ['test1','test1','test2','test2'],
46
+ 'sentiment': [0.8,0.6,0.4,0.2],
47
+ 'score': [10,20,30,40],
48
+ })
50
+ ```
51
+ – Expect two summary rows:
52
+ - test1: `mean_sentiment≈0.7`, `count=2`, `total_score=30`, `weighted_sentiment≈0.6667`
53
+ - test2: `mean_sentiment≈0.3`, `count=2`, `total_score=70`, `weighted_sentiment≈0.2857`
54
+ - **`update_summary`**: merges an initial 2‑row file for `2025-04-19` with a new 2‑row file for `2025-04-20`, ending with 4 total rows.
55
+ - **CLI invalid date**: `main('2025-04-20-invalid')` → `ValueError("Invalid date format")`.
56
+ - **Missing scored file**: patched `hf_hub_download` raises → `ValueError("Failed to download scored file…")`.
57
+
58
+ ---
59
+
60
+ ### `test_score.py`
61
+ - **Class & functions under test**
62
+ - `RedditScorer.score_date(date)` — downloads input parquet, asserts required columns (`text, score, post_id, subreddit`), splits into batches, calls `replicate_client.run()`, injects `sentiment` & `confidence`, writes parquet, then calls `hf_api.upload_file()`.
63
+ - CLI `main(date)` — reads `.env` or `streamlit.secrets`, requires `REPLICATE_API_TOKEN`, else raises.
64
+ - **Patching & mocking**
65
+ - `hf_hub_download` patched to return a temp parquet for the “input” DataFrame.
66
+ - `mock_hf_api` supplying a stubbed `upload_file` method.
67
+ - `mock_replicate_client.run` side‑effect that:
68
+ ```python
69
+ texts = json.loads(input['texts'])
70
+ sentiments = ['positive' if i%2==0 else 'negative' for i in range(len(texts))]
71
+ confidences = [0.9 if i%2==0 else 0.8 for i in range(len(texts))]
72
+ ```
73
+ - `reddit_analysis.config_utils.HAS_STREAMLIT` + `streamlit.secrets.get(...)` for the CLI missing‑token test.
74
+ - **Example inputs & expectations**
75
+ - **`test_score_date`**: input DataFrame with two rows (`'Test text 1'`, `'Test text 2'`), expects uploaded parquet to have `sentiment=['positive','negative']`, `confidence=[0.9,0.8]` and all six columns present.
76
+ - **`test_score_date_missing_columns`**: input missing `post_id`/`subreddit` → `ValueError("missing expected columns")`.
77
+ - **`test_score_date_batch_processing`**: input of 5 texts, `batch_size=2` → `replicate_client.run` called 3 times, final uploaded file contains all 5 rows.
78
+ - **`test_cli_missing_token`**: no `REPLICATE_API_TOKEN` in env or secrets → `ValueError("REPLICATE_API_TOKEN is required for scoring")`.
reddit_analysis/tests/inference/test_score.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import pytest
4
+ import pandas as pd
5
+ from datetime import datetime
6
+ import pytz
7
+ from unittest.mock import Mock, patch
8
+ import json
9
+
10
+ from reddit_analysis.inference.score import SentimentScorer, ReplicateAPI, FileManager, HuggingFaceManager
11
+
12
@pytest.fixture
def mock_config():
    """Assemble the configuration dict consumed by SentimentScorer."""
    settings = {
        'repo_id': 'test/repo',
        'repo_type': 'dataset',
        'batch_size': 2,
        'replicate_model': 'test/model',
    }
    locations = {
        'raw_dir': Path('data/raw'),
        'scored_dir': Path('data/scored'),
        'hf_raw_dir': 'data/raw',
        'hf_scored_dir': 'data/scored',
    }
    credentials = {
        'HF_TOKEN': 'test_token',
        'REPLICATE_API_TOKEN': 'test_token',
    }
    return {'config': settings, 'paths': locations, 'secrets': credentials}
33
+
34
@pytest.fixture
def mock_replicate_api():
    """Stubbed ReplicateAPI whose predict() returns one positive/negative pair."""
    api = Mock(spec=ReplicateAPI)
    api.predict.return_value = {
        'predicted_labels': ['positive', 'negative'],
        'confidences': [0.9, 0.8],
    }
    return api
43
+
44
@pytest.fixture
def mock_file_manager():
    """Bare FileManager mock; individual tests configure its behaviour."""
    return Mock(spec=FileManager)
49
+
50
@pytest.fixture
def mock_hf_manager():
    """Bare HuggingFaceManager mock; individual tests configure its behaviour."""
    return Mock(spec=HuggingFaceManager)
55
+
56
def test_score_date(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """Happy path: one batch is scored, saved, and uploaded exactly once."""
    frame = pd.DataFrame({
        'text': ['Test text 1', 'Test text 2'],
        'score': [1, 2],
        'post_id': ['post1', 'post2'],
        'subreddit': ['test1', 'test1'],
    })

    # Wire the mocked I/O layer: the raw shard "exists" and nothing has been
    # scored for this date yet.
    mock_hf_manager.download_file.return_value = Path('test.parquet')
    mock_file_manager.read_parquet.return_value = frame
    mock_hf_manager.list_files.return_value = []

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )
    scorer.score_date('2025-04-20')

    # One batch -> one prediction, one parquet written, one upload.
    mock_replicate_api.predict.assert_called_once()
    mock_file_manager.save_parquet.assert_called_once()
    mock_hf_manager.upload_file.assert_called_once()
86
+
87
def test_score_date_missing_columns(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """A raw shard without post_id/subreddit must be rejected up front."""
    partial_frame = pd.DataFrame({
        'text': ['Test text 1', 'Test text 2'],
        'score': [1, 2],
    })

    mock_hf_manager.download_file.return_value = Path('test.parquet')
    mock_file_manager.read_parquet.return_value = partial_frame
    mock_hf_manager.list_files.return_value = []

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )

    with pytest.raises(ValueError) as exc_info:
        scorer.score_date('2025-04-20')
    assert "Missing required columns" in str(exc_info.value)
112
+
113
def test_score_date_batch_processing(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """With batch_size=2, five rows must be scored in ceil(5/2) = 3 API calls."""
    five_rows = pd.DataFrame({
        'text': [f'Test text {i}' for i in range(5)],
        'score': list(range(1, 6)),
        'post_id': [f'post{i}' for i in range(5)],
        'subreddit': ['test1'] * 5,
    })

    mock_hf_manager.download_file.return_value = Path('test.parquet')
    mock_file_manager.read_parquet.return_value = five_rows
    mock_hf_manager.list_files.return_value = []

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )
    scorer.score_date('2025-04-20')

    # 5 rows at batch_size=2 -> three prediction calls, one combined output.
    assert mock_replicate_api.predict.call_count == 3
    mock_file_manager.save_parquet.assert_called_once()
    mock_hf_manager.upload_file.assert_called_once()
145
+
146
def test_score_date_multiple_subreddits(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """Each subreddit gets its own scored parquet and its own upload."""
    two_subs = pd.DataFrame({
        'text': ['Test text 1', 'Test text 2', 'Test text 3', 'Test text 4'],
        'score': [1, 2, 3, 4],
        'post_id': ['post1', 'post2', 'post3', 'post4'],
        'subreddit': ['test1', 'test1', 'test2', 'test2'],
    })

    mock_hf_manager.download_file.return_value = Path('test.parquet')
    mock_file_manager.read_parquet.return_value = two_subs
    mock_hf_manager.list_files.return_value = []

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )
    scorer.score_date('2025-04-20')

    mock_replicate_api.predict.assert_called()

    # One save + one upload per subreddit.
    assert mock_file_manager.save_parquet.call_count == 2
    assert mock_hf_manager.upload_file.call_count == 2

    # Second positional argument of upload_file is path_in_repo.
    destinations = [call[0][1] for call in mock_hf_manager.upload_file.call_args_list]
    assert 'data_scored_subreddit/2025-04-20__test1.parquet' in destinations
    assert 'data_scored_subreddit/2025-04-20__test2.parquet' in destinations
186
+
187
def test_score_date_with_existing_subreddits(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """Already-scored subreddits are skipped when overwrite=False."""
    two_subs = pd.DataFrame({
        'text': ['Test text 1', 'Test text 2', 'Test text 3', 'Test text 4'],
        'score': [1, 2, 3, 4],
        'post_id': ['post1', 'post2', 'post3', 'post4'],
        'subreddit': ['test1', 'test1', 'test2', 'test2'],
    })

    mock_hf_manager.download_file.return_value = Path('test.parquet')
    mock_file_manager.read_parquet.return_value = two_subs
    # test1 has already been scored for this date.
    mock_hf_manager.list_files.return_value = ['data_scored_subreddit/2025-04-20__test1.parquet']

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )
    scorer.score_date('2025-04-20', overwrite=False)

    # Only test2 should have been scored, saved, and uploaded.
    mock_replicate_api.predict.assert_called()
    assert mock_file_manager.save_parquet.call_count == 1
    assert mock_hf_manager.upload_file.call_count == 1

    # Second positional argument of upload_file is path_in_repo.
    destinations = [call[0][1] for call in mock_hf_manager.upload_file.call_args_list]
    assert 'data_scored_subreddit/2025-04-20__test2.parquet' in destinations
    assert 'data_scored_subreddit/2025-04-20__test1.parquet' not in destinations
228
+
229
def test_get_existing_subreddits(mock_config, mock_replicate_api, mock_file_manager, mock_hf_manager):
    """Only files in the scored folder for the requested date should count."""
    mock_hf_manager.list_files.return_value = [
        'data_scored_subreddit/2025-04-20__test1.parquet',
        'data_scored_subreddit/2025-04-20__test2.parquet',
        'data_scored_subreddit/2025-04-21__test1.parquet',  # different date
        'other_folder/2025-04-20__test3.parquet',           # different folder
    ]

    scorer = SentimentScorer(
        mock_config,
        replicate_api=mock_replicate_api,
        file_manager=mock_file_manager,
        hf_manager=mock_hf_manager,
    )

    # The 04-21 shard and the foreign-folder shard must both be excluded.
    assert scorer.get_existing_subreddits('2025-04-20') == {'test1', 'test2'}
252
+
253
def test_cli_missing_token(monkeypatch, tmp_path):
    """main() must fail fast when REPLICATE_API_TOKEN is nowhere to be found.

    Cleanup over the original: the debug print of the token value and the
    redundant function-local `import os` (os is already imported at module
    level) have been removed.
    """
    # Point the app at an empty .env so no token can be read from disk.
    env_path = tmp_path / '.env'
    env_path.write_text('')
    monkeypatch.setenv('REDDIT_ANALYSIS_ENV', str(env_path))

    # Drop the token from the environment but keep HF_TOKEN, so that
    # REPLICATE_API_TOKEN is the only missing credential.
    monkeypatch.delenv('REPLICATE_API_TOKEN', raising=False)
    monkeypatch.setenv('HF_TOKEN', 'dummy_hf_token')

    # Simulate a Streamlit runtime whose secrets store is also empty.
    monkeypatch.setattr('reddit_analysis.config_utils.HAS_STREAMLIT', True)
    monkeypatch.setattr('reddit_analysis.config_utils.is_running_streamlit', lambda: True)
    mock_secrets = Mock()
    mock_secrets.get.return_value = None
    monkeypatch.setattr('streamlit.secrets', mock_secrets)

    # Run the CLI entry point; it should refuse to start scoring.
    with pytest.raises(ValueError) as exc_info:
        from reddit_analysis.inference.score import main
        main('2025-04-20')
    assert "REPLICATE_API_TOKEN is required for scoring" in str(exc_info.value)
reddit_analysis/tests/scraper/test_scrape.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import pytest
4
+ import pandas as pd
5
+ from datetime import datetime, date
6
+ import pytz
7
+ from unittest.mock import Mock, patch
8
+
9
+ from reddit_analysis.scraper.scrape import RedditScraper, RedditAPI, FileManager, HuggingFaceManager
10
+
11
+ @pytest.fixture
12
+ def mock_config():
13
+ """Create a mock configuration dictionary."""
14
+ return {
15
+ 'config': {
16
+ 'repo_id': 'test/repo',
17
+ 'repo_type': 'dataset',
18
+ 'subreddits': [
19
+ {'name': 'test1', 'post_limit': 2, 'comment_limit': 2},
20
+ {'name': 'test2', 'post_limit': 2, 'comment_limit': 2}
21
+ ],
22
+ 'post_limit': 100,
23
+ 'timezone': 'UTC'
24
+ },
25
+ 'paths': {
26
+ 'raw_dir': Path('data/raw'),
27
+ 'logs_dir': Path('logs'),
28
+ 'hf_raw_dir': 'data/raw'
29
+ },
30
+ 'secrets': {
31
+ 'HF_TOKEN': 'test_token',
32
+ 'REDDIT_CLIENT_ID': 'test_id',
33
+ 'REDDIT_CLIENT_SECRET': 'test_secret',
34
+ 'REDDIT_USER_AGENT': 'test_agent'
35
+ }
36
+ }
37
+
38
+ @pytest.fixture
39
+ def mock_reddit_api():
40
+ """Create a mock RedditAPI."""
41
+ mock = Mock(spec=RedditAPI)
42
+
43
+ # Create mock submission objects
44
+ mock_submissions = []
45
+ for i in range(2):
46
+ submission = Mock()
47
+ submission.id = f'post{i}'
48
+ submission.title = f'Test Post {i}'
49
+ submission.selftext = f'Test content {i}'
50
+ submission.score = i + 1
51
+ submission.created_utc = datetime.now(pytz.UTC).timestamp()
52
+ submission.url = f'https://reddit.com/test{i}'
53
+ submission.num_comments = i * 10
54
+
55
+ # Mock the comments
56
+ comment = Mock()
57
+ comment.id = f'comment{i}'
58
+ comment.body = f'Test comment {i}'
59
+ comment.score = i + 5
60
+ comment.created_utc = datetime.now(pytz.UTC).timestamp()
61
+ comment.parent_id = submission.id
62
+
63
+ # Set up comment attributes
64
+ submission.comments = Mock()
65
+ submission.comments._comments = [comment]
66
+ submission.comments.replace_more = Mock(return_value=None)
67
+
68
+ mock_submissions.append(submission)
69
+
70
+ # Set up the mock subreddit
71
+ mock_subreddit = Mock()
72
+ mock_subreddit.top.return_value = mock_submissions
73
+ mock.get_subreddit.return_value = mock_subreddit
74
+
75
+ return mock
76
+
77
+ @pytest.fixture
78
+ def mock_file_manager():
79
+ """Create a mock FileManager."""
80
+ mock = Mock(spec=FileManager)
81
+ return mock
82
+
83
+ @pytest.fixture
84
+ def mock_hf_manager():
85
+ """Create a mock HuggingFaceManager."""
86
+ mock = Mock(spec=HuggingFaceManager)
87
+ return mock
88
+
89
+ def test_get_posts(mock_config, mock_reddit_api):
90
+ """Test the get_posts method."""
91
+ # Initialize scraper with mocked RedditAPI
92
+ scraper = RedditScraper(mock_config, reddit_api=mock_reddit_api)
93
+
94
+ # Get posts for a test subreddit
95
+ df = scraper.get_posts({'name': 'test1', 'post_limit': 2, 'comment_limit': 2})
96
+
97
+ # Verify DataFrame structure and content
98
+ assert isinstance(df, pd.DataFrame)
99
+ assert len(df) == 4 # 2 posts + 2 comments
100
+
101
+ # Verify posts
102
+ posts_df = df[df['type'] == 'post']
103
+ assert len(posts_df) == 2
104
+ assert posts_df['subreddit'].iloc[0] == 'test1'
105
+ assert posts_df['post_id'].iloc[0] == 'post0'
106
+ assert posts_df['post_id'].iloc[1] == 'post1'
107
+
108
+ # Verify comments
109
+ comments_df = df[df['type'] == 'comment']
110
+ assert len(comments_df) == 2
111
+ assert comments_df['subreddit'].iloc[0] == 'test1'
112
+ assert comments_df['post_id'].iloc[0] == 'comment0'
113
+ assert comments_df['parent_id'].iloc[0] == 'post0'
114
+
115
+ def test_upload_to_hf_deduplication(mock_config, mock_file_manager, mock_hf_manager):
116
+ """Test the upload_to_hf method with deduplication."""
117
+ # Create test DataFrames
118
+ prev_df = pd.DataFrame({
119
+ 'post_id': ['post0', 'post1'],
120
+ 'title': ['Old Post 0', 'Old Post 1'],
121
+ 'text': ['Old content 0', 'Old content 1'],
122
+ 'score': [1, 2],
123
+ 'subreddit': ['test1', 'test1'],
124
+ 'created_utc': [datetime.now(pytz.UTC)] * 2,
125
+ 'url': ['https://reddit.com/old0', 'https://reddit.com/old1'],
126
+ 'num_comments': [10, 20]
127
+ })
128
+
129
+ new_df = pd.DataFrame({
130
+ 'post_id': ['post1', 'post2'],
131
+ 'title': ['New Post 1', 'New Post 2'],
132
+ 'text': ['New content 1', 'New content 2'],
133
+ 'score': [3, 4],
134
+ 'subreddit': ['test1', 'test1'],
135
+ 'created_utc': [datetime.now(pytz.UTC)] * 2,
136
+ 'url': ['https://reddit.com/new1', 'https://reddit.com/new2'],
137
+ 'num_comments': [30, 40]
138
+ })
139
+
140
+ # Mock file operations
141
+ mock_hf_manager.download_file.return_value = Path('test.parquet')
142
+ mock_file_manager.read_parquet.return_value = prev_df
143
+
144
+ # Initialize scraper with mocked dependencies
145
+ scraper = RedditScraper(
146
+ mock_config,
147
+ file_manager=mock_file_manager,
148
+ hf_manager=mock_hf_manager
149
+ )
150
+
151
+ # Upload new data
152
+ scraper._upload_to_hf(new_df, '2025-04-20')
153
+
154
+ # Verify file operations
155
+ mock_file_manager.save_parquet.assert_called_once()
156
+ mock_hf_manager.upload_file.assert_called_once()
157
+
158
+ def test_cli_missing_env(monkeypatch, tmp_path):
159
+ """Test CLI with missing environment variables."""
160
+ # Create a temporary .env file without required variables
161
+ env_path = tmp_path / '.env'
162
+ env_path.write_text('')
163
+
164
+ # Set environment variable to point to our test .env
165
+ monkeypatch.setenv('REDDIT_ANALYSIS_ENV', str(env_path))
166
+
167
+ # Remove any existing Reddit API credentials from environment
168
+ for key in ['REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET', 'REDDIT_USER_AGENT']:
169
+ monkeypatch.delenv(key, raising=False)
170
+ # Ensure HF_TOKEN is present so only Reddit client vars are missing
171
+ monkeypatch.setenv('HF_TOKEN', 'dummy_hf_token')
172
+ # Mock Streamlit's HAS_STREAMLIT to True
173
+ monkeypatch.setattr('reddit_analysis.config_utils.HAS_STREAMLIT', True)
174
+ # Mock is_running_streamlit to True
175
+ monkeypatch.setattr('reddit_analysis.config_utils.is_running_streamlit', lambda: True)
176
+ # Mock Streamlit secrets to return None
177
+ mock_secrets = Mock()
178
+ mock_secrets.get.return_value = None
179
+ monkeypatch.setattr('streamlit.secrets', mock_secrets)
180
+ # Print for debug
181
+ import os
182
+ print('DEBUG: REDDIT_CLIENT_ID value before main:', os.environ.get('REDDIT_CLIENT_ID'))
183
+ # Run the CLI with --date argument
184
+ with pytest.raises(ValueError) as exc_info:
185
+ from reddit_analysis.scraper.scrape import main
186
+ main('2025-04-20')
187
+ assert "Missing required environment variables: REDDIT_CLIENT_ID" in str(exc_info.value)
reddit_analysis/tests/summarizer/test_summarize.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from datetime import date
5
+ from unittest.mock import Mock, patch
6
+
7
+ from reddit_analysis.summarizer.summarize import (
8
+ SummaryManager,
9
+ FileManager,
10
+ HuggingFaceManager,
11
+ )
12
+
13
+
14
+ # --------------------------------------------------------------------------- #
15
+ # Fixtures #
16
+ # --------------------------------------------------------------------------- #
17
+ @pytest.fixture
18
+ def mock_config(tmp_path):
19
+ """Minimal config dict compatible with SummaryManager."""
20
+ return {
21
+ "config": {
22
+ "repo_id": "test/repo",
23
+ "repo_type": "dataset",
24
+ },
25
+ "paths": {
26
+ "root": tmp_path,
27
+ "scored_dir": tmp_path / "scored",
28
+ "hf_scored_dir": "scored", # relative path in the Hub
29
+ "summary_file": tmp_path / "summary.csv",
30
+ },
31
+ "secrets": {"HF_TOKEN": "fake"},
32
+ }
33
+
34
+
35
+ @pytest.fixture
36
+ def mock_file_manager():
37
+ """FileManager double with just the methods we need."""
38
+ m = Mock(spec=FileManager)
39
+ # read_parquet returns sample data we set in each test
40
+ # write_csv just returns a Path so downstream code is happy
41
+ m.write_csv.return_value = Path("summary.csv")
42
+ return m
43
+
44
+
45
+ @pytest.fixture
46
+ def mock_hf_manager():
47
+ """HuggingFaceManager double."""
48
+ return Mock(spec=HuggingFaceManager)
49
+
50
+
51
+ # --------------------------------------------------------------------------- #
52
+ # Tests #
53
+ # --------------------------------------------------------------------------- #
54
+ def test_process_date(mock_config, mock_file_manager, mock_hf_manager):
55
+ """End‑to‑end happy path."""
56
+ # ---------- sample scored shard --------------------------------------- #
57
+ sample = pd.DataFrame(
58
+ {
59
+ "subreddit": ["a", "a", "b", "b"],
60
+ "sentiment": [0.8, 0.6, 0.4, 0.2],
61
+ "score": [10, 20, 30, 40],
62
+ "post_id": ["p1", "p2", "p3", "p4"],
63
+ "text": ["t1", "t2", "t3", "t4"],
64
+ "retrieved_at": pd.Timestamp.utcnow(),
65
+ }
66
+ )
67
+ mock_file_manager.read_parquet.return_value = sample
68
+ # first call → download scored file, second call (within _save_and_push_summary) unused here
69
+ mock_hf_manager.download_file.return_value = Path("dummy.parquet")
70
+
71
+ with patch.object(
72
+ SummaryManager, "_load_remote_summary", return_value=pd.DataFrame()
73
+ ):
74
+ mgr = SummaryManager(
75
+ mock_config, file_manager=mock_file_manager, hf_manager=mock_hf_manager
76
+ )
77
+ mgr.process_date("2025-04-20")
78
+
79
+ # assertions
80
+ mock_file_manager.read_parquet.assert_called_once()
81
+ mock_file_manager.write_csv.assert_called_once()
82
+ mock_hf_manager.upload_file.assert_called_once()
83
+
84
+
85
+ def test_get_processed_combinations(mock_config, mock_file_manager, mock_hf_manager):
86
+ """The helper should translate the existing CSV into a set of tuples."""
87
+ existing = pd.DataFrame(
88
+ {
89
+ "date": ["2025-04-19", "2025-04-19"],
90
+ "subreddit": ["a", "b"],
91
+ "mean_sentiment": [0.5, 0.3],
92
+ "weighted_sentiment": [0.4, 0.2],
93
+ "count": [1, 1],
94
+ }
95
+ )
96
+
97
+ with patch.object(
98
+ SummaryManager, "_load_remote_summary", return_value=existing
99
+ ):
100
+ mgr = SummaryManager(
101
+ mock_config, file_manager=mock_file_manager, hf_manager=mock_hf_manager
102
+ )
103
+ processed = mgr.get_processed_combinations()
104
+
105
+ assert processed == {(date(2025, 4, 19), "a"), (date(2025, 4, 19), "b")}
106
+
107
+
108
+ def test_cli_invalid_date():
109
+ """main() should raise on malformed dates."""
110
+ from reddit_analysis.summarizer.summarize import main
111
+
112
+ with pytest.raises(ValueError):
113
+ main("bad‑date‑format")
114
+
115
+
116
+ def test_cli_missing_scored_file(mock_config, mock_file_manager, mock_hf_manager):
117
+ """Gracefully handles a missing *_scored.parquet on the Hub."""
118
+ # download of scored file raises, but remote summary loads fine →
119
+ mock_hf_manager.download_file.side_effect = Exception("not found")
120
+ with patch.object(
121
+ SummaryManager, "_load_remote_summary", return_value=pd.DataFrame()
122
+ ):
123
+ mgr = SummaryManager(
124
+ mock_config, file_manager=mock_file_manager, hf_manager=mock_hf_manager
125
+ )
126
+ # Should simply return after printing error, not raise.
127
+ assert mgr.process_date("2025-04-20") is None
reddit_analysis/tests/test_config_utils.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import pytest
4
+ import yaml
5
+ from reddit_analysis.config_utils import load_config, get_secret, ROOT
6
+
7
+ @pytest.fixture
8
+ def temp_config_file(tmp_path):
9
+ """Create a temporary config file with test data."""
10
+ config = {
11
+ 'repo_id': 'test/repo',
12
+ 'repo_type': 'dataset',
13
+ 'raw_dir': 'data/raw',
14
+ 'scored_dir': 'data/scored',
15
+ 'logs_dir': 'logs',
16
+ 'summary_file': 'summary.csv',
17
+ 'hf_raw_dir': 'data/raw',
18
+ 'hf_scored_dir': 'data/scored',
19
+ 'batch_size': 16,
20
+ 'replicate_model': 'test/model',
21
+ 'subreddits': ['test1', 'test2'],
22
+ 'post_limit': 100
23
+ }
24
+
25
+ config_path = tmp_path / 'config.yaml'
26
+ with open(config_path, 'w') as f:
27
+ yaml.dump(config, f)
28
+
29
+ return config_path
30
+
31
+ def test_load_config(temp_config_file, monkeypatch):
32
+ """Test that load_config correctly reads the config file."""
33
+ # Mock the ROOT path to point to our test directory
34
+ monkeypatch.setattr('reddit_analysis.config_utils.ROOT', temp_config_file.parent)
35
+
36
+ # Load the config
37
+ config = load_config() # Should now find config.yaml in the test directory
38
+
39
+ # Verify the values
40
+ assert config['repo_id'] == 'test/repo'
41
+ assert config['repo_type'] == 'dataset'
42
+ assert config['raw_dir'] == 'data/raw'
43
+ assert config['scored_dir'] == 'data/scored'
44
+ assert config['logs_dir'] == 'logs'
45
+ assert config['summary_file'] == 'summary.csv'
46
+ assert config['hf_raw_dir'] == 'data/raw'
47
+ assert config['hf_scored_dir'] == 'data/scored'
48
+ assert config['batch_size'] == 16
49
+ assert config['replicate_model'] == 'test/model'
50
+ assert config['subreddits'] == ['test1', 'test2']
51
+ assert config['post_limit'] == 100
52
+
53
+ def test_get_secret_env_var(monkeypatch):
54
+ """Test get_secret with environment variable."""
55
+ # Set a test environment variable
56
+ monkeypatch.setenv('TEST_SECRET', 'env_value')
57
+
58
+ # Get the secret
59
+ value = get_secret('TEST_SECRET')
60
+
61
+ # Verify it returns the environment variable value
62
+ assert value == 'env_value'
63
+
64
+ def test_get_secret_streamlit(monkeypatch):
65
+ """Test get_secret with Streamlit secrets."""
66
+ # Remove environment variable
67
+ monkeypatch.delenv('TEST_SECRET', raising=False)
68
+
69
+ # Mock Streamlit's HAS_STREAMLIT to True
70
+ monkeypatch.setattr('reddit_analysis.config_utils.HAS_STREAMLIT', True)
71
+ # Mock is_running_streamlit to True
72
+ monkeypatch.setattr('reddit_analysis.config_utils.is_running_streamlit', lambda: True)
73
+ # Mock Streamlit secrets
74
+ class MockSecrets:
75
+ def get(self, key, default=None):
76
+ return 'streamlit_value'
77
+ monkeypatch.setattr('streamlit.secrets', MockSecrets())
78
+ # Get the secret
79
+ value = get_secret('TEST_SECRET')
80
+ # Verify it returns the Streamlit secret value
81
+ assert value == 'streamlit_value'
82
+
83
+ def test_get_secret_missing(monkeypatch):
84
+ """Test get_secret when secret is missing from both sources."""
85
+ # Remove environment variable
86
+ monkeypatch.delenv('TEST_SECRET', raising=False)
87
+
88
+ # Mock Streamlit's HAS_STREAMLIT to True
89
+ monkeypatch.setattr('reddit_analysis.config_utils.HAS_STREAMLIT', True)
90
+
91
+ # Mock Streamlit secrets to return None
92
+ class MockSecrets:
93
+ def get(self, key, default=None):
94
+ return default
95
+
96
+ monkeypatch.setattr('streamlit.secrets', MockSecrets())
97
+
98
+ # Verify it raises ValueError
99
+ with pytest.raises(ValueError) as exc_info:
100
+ get_secret('TEST_SECRET')
101
+ assert "Required secret TEST_SECRET not found" in str(exc_info.value)
requirements-dev.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas==2.2.3
2
+ python-dotenv==1.1.0
3
+ pyyaml==6.0.2
4
+ replicate==1.0.4
5
+ huggingface-hub==0.30.2
6
+ streamlit==1.44.1
7
+ altair==5.5.0
8
+ pyarrow==19.0.1
9
+ pytest==8.3.5
10
+ praw>=7.8.1
11
+ prometheus-client==0.21.1
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ streamlit==1.44.1
3
+ pandas==2.2.3
4
+ numpy==2.2.4
5
+ altair==5.5.0
6
+
7
+ # Data handling
8
+ huggingface-hub==0.30.2
9
+ pyyaml==6.0.2
10
+
11
+ # Text analysis
12
+ spacy==3.8.5
13
+ scikit-learn==1.6.1
14
+ sentence-transformers==4.1.0
15
+ keybert==0.9.0
16
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
17
+
18
+ # Local development
19
+ python-dotenv==1.1.0
20
+
21
+ # Added for parquet reading on Spaces
22
+ pyarrow==16.1.0
subreddit_daily_summary.csv ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ date,subreddit,mean_sentiment,community_weighted_sentiment,count
2
+ 2025-05-01,LocalLLaMA,-0.4952,-0.4779,523
3
+ 2025-05-01,OpenAI,-0.5242,-0.517,227
4
+ 2025-05-01,artificial,-0.475,-0.5065,80
5
+ 2025-05-01,singularity,-0.3839,-0.4357,211
6
+ 2025-05-02,LocalLLaMA,-0.5027,-0.4347,366
7
+ 2025-05-02,OpenAI,-0.4947,-0.4879,285
8
+ 2025-05-02,artificial,-0.3913,-0.5042,46
9
+ 2025-05-02,singularity,-0.4244,-0.4151,205
10
+ 2025-05-03,LocalLLaMA,-0.5518,-0.5262,415
11
+ 2025-05-03,OpenAI,-0.4688,-0.4546,256
12
+ 2025-05-03,artificial,-0.4286,-0.3653,56
13
+ 2025-05-03,singularity,-0.3534,-0.3939,232
14
+ 2025-05-04,LocalLLaMA,-0.3213,-0.3051,333
15
+ 2025-05-04,OpenAI,-0.4365,-0.4455,252
16
+ 2025-05-04,artificial,-0.5172,-0.5418,58
17
+ 2025-05-04,singularity,-0.6024,-0.6052,166
18
+ 2025-05-05,LocalLLaMA,-0.473,-0.4502,444
19
+ 2025-05-05,OpenAI,-0.4486,-0.426,243
20
+ 2025-05-05,artificial,-0.3714,-0.3691,35
21
+ 2025-05-05,singularity,-0.3362,-0.3324,232
22
+ 2025-05-06,LocalLLaMA,-0.5656,-0.5327,419
23
+ 2025-05-06,OpenAI,-0.5019,-0.5556,269
24
+ 2025-05-06,artificial,-0.4468,-0.4585,94
25
+ 2025-05-06,singularity,-0.3714,-0.3904,245
26
+ 2025-05-07,LocalLLaMA,-0.4633,-0.4799,354
27
+ 2025-05-07,OpenAI,-0.4258,-0.4588,209
28
+ 2025-05-07,artificial,-0.5146,-0.5191,103
29
+ 2025-05-07,singularity,-0.3407,-0.3575,182
30
+ 2025-05-08,LocalLLaMA,-0.4769,-0.4615,325
31
+ 2025-05-08,OpenAI,-0.5182,-0.4833,303
32
+ 2025-05-08,artificial,-0.5,-0.5292,52
33
+ 2025-05-08,singularity,-0.494,-0.5126,249
34
+ 2025-05-09,LocalLLaMA,-0.4595,-0.4251,333
35
+ 2025-05-09,OpenAI,-0.4436,-0.4213,266
36
+ 2025-05-09,artificial,-0.5238,-0.5502,63
37
+ 2025-05-09,singularity,-0.502,-0.5425,253
38
+ 2025-05-10,LocalLLaMA,-0.4552,-0.4159,279
39
+ 2025-05-10,OpenAI,-0.5269,-0.5317,186
40
+ 2025-05-10,artificial,-0.443,-0.4837,79
41
+ 2025-05-10,singularity,-0.5192,-0.5185,208
42
+ 2025-05-11,LocalLLaMA,-0.5655,-0.5272,290
43
+ 2025-05-11,OpenAI,-0.4572,-0.4568,269
44
+ 2025-05-11,artificial,-0.5882,-0.5505,68
45
+ 2025-05-11,singularity,-0.3736,-0.3069,182
46
+ 2025-05-12,LocalLLaMA,-0.562,-0.5701,274
47
+ 2025-05-12,OpenAI,-0.5152,-0.4961,264
48
+ 2025-05-12,artificial,-0.4667,-0.5324,75
49
+ 2025-05-12,singularity,-0.498,-0.5231,247
50
+ 2025-05-13,LocalLLaMA,-0.4971,-0.4682,342
51
+ 2025-05-13,OpenAI,-0.5833,-0.6451,288
52
+ 2025-05-13,artificial,-0.3671,-0.4165,79
53
+ 2025-05-13,singularity,-0.3571,-0.4305,140
54
+ 2025-05-14,LocalLLaMA,-0.3776,-0.2943,286
55
+ 2025-05-14,OpenAI,-0.5369,-0.569,298
56
+ 2025-05-14,artificial,-0.5,-0.5367,88
57
+ 2025-05-14,singularity,-0.36,-0.3138,300
58
+ 2025-05-15,LocalLLaMA,-0.5308,-0.5159,341
59
+ 2025-05-15,OpenAI,-0.533,-0.5087,227
60
+ 2025-05-15,artificial,-0.5942,-0.6274,69
61
+ 2025-05-15,singularity,-0.4604,-0.4808,341
62
+ 2025-05-16,LocalLLaMA,-0.5168,-0.5195,327
63
+ 2025-05-16,OpenAI,-0.5751,-0.5613,273
64
+ 2025-05-16,artificial,-0.5802,-0.5783,81
65
+ 2025-05-16,singularity,-0.4568,-0.4414,324
66
+ 2025-05-17,LocalLLaMA,-0.5658,-0.5476,281
67
+ 2025-05-17,OpenAI,-0.5299,-0.5133,234
68
+ 2025-05-17,artificial,-0.4545,-0.5082,77
69
+ 2025-05-17,singularity,-0.5506,-0.5318,178
70
+ 2025-05-18,LocalLLaMA,-0.4783,-0.4879,230
71
+ 2025-05-18,OpenAI,-0.4545,-0.4629,165
72
+ 2025-05-18,artificial,-0.4902,-0.4985,51
73
+ 2025-05-18,singularity,-0.5461,-0.5584,141
74
+ 2025-05-19,LocalLLaMA,-0.4875,-0.4821,320
75
+ 2025-05-19,OpenAI,-0.6712,-0.6774,146
76
+ 2025-05-19,artificial,-0.4766,-0.5524,107
77
+ 2025-05-19,singularity,-0.391,-0.3832,335
78
+ 2025-05-20,LocalLLaMA,-0.5137,-0.466,329
79
+ 2025-05-20,OpenAI,-0.4822,-0.4133,197
80
+ 2025-05-20,artificial,-0.4,-0.4258,60
81
+ 2025-05-20,singularity,-0.3147,-0.3419,429
82
+ 2025-05-21,LocalLLaMA,-0.5,-0.5178,372
83
+ 2025-05-21,OpenAI,-0.4307,-0.5103,267
84
+ 2025-05-21,artificial,-0.5263,-0.6283,76
85
+ 2025-05-21,singularity,-0.3589,-0.3353,599
86
+ 2025-05-22,LocalLLaMA,-0.4813,-0.4787,374
87
+ 2025-05-22,OpenAI,-0.4939,-0.5255,328
88
+ 2025-05-22,artificial,-0.6667,-0.6903,72
89
+ 2025-05-22,singularity,-0.4947,-0.5288,566
90
+ 2025-05-23,LocalLLaMA,-0.5445,-0.5678,382
91
+ 2025-05-23,OpenAI,-0.4605,-0.4919,215
92
+ 2025-05-23,artificial,-0.3274,-0.3235,113
93
+ 2025-05-23,singularity,-0.393,-0.3799,402
94
+ 2025-05-24,LocalLLaMA,-0.4333,-0.4601,240
95
+ 2025-05-24,OpenAI,-0.344,-0.3161,250
96
+ 2025-05-24,artificial,-0.3488,-0.3326,86
97
+ 2025-05-24,singularity,-0.4379,-0.4447,491
98
+ 2025-05-25,LocalLLaMA,-0.5081,-0.5379,248
99
+ 2025-05-25,OpenAI,-0.345,-0.38,229
100
+ 2025-05-25,artificial,-0.3535,-0.3803,99
101
+ 2025-05-25,singularity,-0.3897,-0.3967,331
102
+ 2025-05-26,LocalLLaMA,-0.481,-0.4868,343
103
+ 2025-05-26,OpenAI,-0.5125,-0.5693,160
104
+ 2025-05-26,artificial,-0.2609,-0.2746,46
105
+ 2025-05-26,singularity,-0.4444,-0.4124,270
106
+ 2025-05-27,LocalLLaMA,-0.5611,-0.5977,319
107
+ 2025-05-27,OpenAI,-0.5197,-0.4877,229
108
+ 2025-05-27,artificial,-0.6436,-0.6834,101
109
+ 2025-05-27,singularity,-0.2628,-0.284,331
110
+ 2025-05-28,LocalLLaMA,-0.5333,-0.4956,360
111
+ 2025-05-28,OpenAI,-0.4729,-0.4775,258
112
+ 2025-05-28,artificial,-0.4186,-0.4808,86
113
+ 2025-05-28,singularity,-0.4292,-0.4194,459
114
+ 2025-05-29,LocalLLaMA,-0.4661,-0.4449,472
115
+ 2025-05-29,OpenAI,-0.5281,-0.5376,178
116
+ 2025-05-29,artificial,-0.0909,-0.0087,66
117
+ 2025-05-29,singularity,-0.3836,-0.4301,464
118
+ 2025-05-30,LocalLLaMA,-0.4895,-0.4555,380
119
+ 2025-05-30,OpenAI,-0.4791,-0.4653,215
120
+ 2025-05-30,artificial,-0.5333,-0.5649,90
121
+ 2025-05-30,singularity,-0.3952,-0.4286,377
122
+ 2025-05-31,LocalLLaMA,-0.5974,-0.6178,313
123
+ 2025-05-31,OpenAI,-0.4913,-0.4578,173
124
+ 2025-05-31,artificial,-0.3077,-0.258,78
125
+ 2025-05-31,singularity,-0.4563,-0.4511,309
126
+ 2025-06-01,LocalLLaMA,-0.4754,-0.4483,244
127
+ 2025-06-01,OpenAI,-0.4286,-0.4016,203
128
+ 2025-06-01,artificial,-0.2941,-0.2128,17
129
+ 2025-06-01,singularity,-0.4667,-0.4858,180
130
+ 2025-06-02,LocalLLaMA,-0.4886,-0.4693,352
131
+ 2025-06-02,OpenAI,-0.5528,-0.5055,246
132
+ 2025-06-02,artificial,-0.4792,-0.6261,96
133
+ 2025-06-02,singularity,-0.5287,-0.5184,314
134
+ 2025-06-03,LocalLLaMA,-0.405,-0.3515,279
135
+ 2025-06-03,OpenAI,-0.545,-0.5749,211
136
+ 2025-06-03,artificial,-0.6,-0.6247,80
137
+ 2025-06-03,singularity,-0.4876,-0.5192,242
138
+ 2025-06-04,LocalLLaMA,-0.4672,-0.4955,274
139
+ 2025-06-04,OpenAI,-0.5962,-0.5539,317
140
+ 2025-06-04,artificial,-0.5435,-0.605,92
141
+ 2025-06-04,singularity,-0.3316,-0.299,377
142
+ 2025-06-05,LocalLLaMA,-0.4882,-0.4796,297
143
+ 2025-06-05,OpenAI,-0.4632,-0.4344,231
144
+ 2025-06-05,artificial,-0.6712,-0.7541,73
145
+ 2025-06-05,singularity,-0.4007,-0.3616,307
146
+ 2025-06-06,LocalLLaMA,-0.438,-0.3628,274
147
+ 2025-06-06,OpenAI,-0.5,-0.4981,148
148
+ 2025-06-06,artificial,-0.5,-0.5466,72
149
+ 2025-06-06,singularity,-0.3361,-0.3444,238
150
+ 2025-06-07,LocalLLaMA,-0.4808,-0.4602,208
151
+ 2025-06-07,OpenAI,-0.4357,-0.4429,241
152
+ 2025-06-07,artificial,-0.4563,-0.4383,103
153
+ 2025-06-07,singularity,-0.373,-0.3527,252
154
+ 2025-06-08,LocalLLaMA,-0.5448,-0.5058,268
155
+ 2025-06-08,OpenAI,-0.5039,-0.4824,254
156
+ 2025-06-08,artificial,-0.6364,-0.678,77
157
+ 2025-06-08,singularity,-0.4054,-0.4938,148
158
+ 2025-06-09,LocalLLaMA,-0.5054,-0.4401,279
159
+ 2025-06-09,OpenAI,-0.4878,-0.4226,246
160
+ 2025-06-09,artificial,-0.4478,-0.4649,134
161
+ 2025-06-09,singularity,-0.4618,-0.4825,249
162
+ 2025-06-10,LocalLLaMA,-0.348,-0.3122,273
163
+ 2025-06-10,OpenAI,-0.4957,-0.5238,349
164
+ 2025-06-10,artificial,-0.5663,-0.5317,83
165
+ 2025-06-10,singularity,-0.383,-0.4165,376
166
+ 2025-06-11,LocalLLaMA,-0.4113,-0.3931,248
167
+ 2025-06-11,OpenAI,-0.4286,-0.3769,217
168
+ 2025-06-11,artificial,-0.4955,-0.5698,111
169
+ 2025-06-11,singularity,-0.368,-0.359,269
170
+ 2025-06-12,LocalLLaMA,-0.4094,-0.4098,254
171
+ 2025-06-12,OpenAI,-0.5276,-0.5785,254
172
+ 2025-06-12,artificial,-0.3735,-0.4439,83
173
+ 2025-06-12,singularity,-0.2961,-0.3084,233
174
+ 2025-06-13,LocalLLaMA,-0.3556,-0.3242,270
175
+ 2025-06-13,OpenAI,-0.4382,-0.4042,178
176
+ 2025-06-13,artificial,-0.3821,-0.3721,123
177
+ 2025-06-13,singularity,-0.2,-0.2585,125
178
+ 2025-06-14,LocalLLaMA,-0.5736,-0.5512,197
179
+ 2025-06-14,OpenAI,-0.3966,-0.4258,179
180
+ 2025-06-14,artificial,-0.4167,-0.4459,96
181
+ 2025-06-14,singularity,-0.1354,-0.1692,192
182
+ 2025-06-15,LocalLLaMA,-0.4684,-0.4165,237
183
+ 2025-06-15,OpenAI,-0.5294,-0.5125,102
184
+ 2025-06-15,artificial,-0.4754,-0.5787,61
185
+ 2025-06-15,singularity,-0.427,-0.392,178
186
+ 2025-06-16,LocalLLaMA,-0.518,-0.5471,278
187
+ 2025-06-16,OpenAI,-0.5169,-0.5528,207
188
+ 2025-06-16,artificial,-0.5696,-0.5846,79
189
+ 2025-06-16,singularity,-0.3418,-0.3892,158
190
+ 2025-06-17,LocalLLaMA,-0.4744,-0.4956,293
191
+ 2025-06-17,OpenAI,-0.426,-0.4405,223
192
+ 2025-06-17,artificial,-0.3608,-0.3481,97
193
+ 2025-06-17,singularity,-0.433,-0.4326,321
194
+ 2025-06-18,LocalLLaMA,-0.4528,-0.4244,307
195
+ 2025-06-18,OpenAI,-0.5152,-0.4993,231
196
+ 2025-06-18,artificial,-0.541,-0.5906,61
197
+ 2025-06-18,singularity,-0.4416,-0.4302,197
198
+ 2025-06-19,LocalLLaMA,-0.528,-0.5063,411
199
+ 2025-06-19,OpenAI,-0.3475,-0.3711,236
200
+ 2025-06-19,artificial,-0.5696,-0.6251,79
201
+ 2025-06-19,singularity,-0.4249,-0.4267,313
202
+ 2025-06-20,LocalLLaMA,-0.4225,-0.3431,374
203
+ 2025-06-20,OpenAI,-0.6126,-0.6681,222
204
+ 2025-06-20,artificial,-0.5238,-0.6093,63
205
+ 2025-06-20,singularity,-0.445,-0.4342,209
206
+ 2025-06-21,LocalLLaMA,-0.5521,-0.5479,317
207
+ 2025-06-21,OpenAI,-0.5932,-0.6164,177
208
+ 2025-06-21,artificial,-0.5579,-0.6365,95
209
+ 2025-06-21,singularity,-0.3566,-0.4154,286
210
+ 2025-06-22,LocalLLaMA,-0.5122,-0.3947,41
211
+ 2025-06-22,OpenAI,-0.3846,-0.4019,130
212
+ 2025-06-22,artificial,-0.28,-0.4088,75
213
+ 2025-06-22,singularity,-0.28,-0.2504,125