Binx Claude Sonnet 4.6 committed on
Commit ·
da605e9
0
Parent(s):
Initial commit: analysis app, deployment config, UI improvements
Browse files- .gitignore +12 -0
- .streamlit/config.toml +10 -0
- CLAUDE.md +37 -0
- Dockerfile +19 -0
- README.md +32 -0
- REPLICATION.md +395 -0
- agent/.env.example +4 -0
- agent/.streamlit/config.toml +10 -0
- agent/CLAUDE.md +372 -0
- agent/README.md +73 -0
- agent/analysis.py +1480 -0
- agent/rebuild_parquets.py +198 -0
- agent/requirements.txt +9 -0
- app.py +1059 -0
- requirements.txt +10 -0
.gitignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
.cache/
|
| 3 |
+
.tmp-home/
|
| 4 |
+
.env
|
| 5 |
+
**/.env
|
| 6 |
+
*.pyc
|
| 7 |
+
__pycache__/
|
| 8 |
+
data/*.parquet
|
| 9 |
+
data/*.csv
|
| 10 |
+
data/*.feather
|
| 11 |
+
agent/outputs/
|
| 12 |
+
agent/logs/
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
base = "light"
|
| 3 |
+
backgroundColor = "#ffffff"
|
| 4 |
+
secondaryBackgroundColor = "#f5f5f5"
|
| 5 |
+
textColor = "#000000"
|
| 6 |
+
primaryColor = "#000000"
|
| 7 |
+
font = "sans serif"
|
| 8 |
+
|
| 9 |
+
[server]
|
| 10 |
+
headless = true
|
CLAUDE.md
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CLAUDE.md — Project Conventions
|
| 2 |
+
|
| 3 |
+
## File Operations
|
| 4 |
+
|
| 5 |
+
- When asked to edit a file, use Glob/Grep to locate it rather than asking the user for the path.
|
| 6 |
+
- Prefer targeted reads (offset + limit) over reading entire large files.
|
| 7 |
+
- Before creating a new file, check whether a suitable one already exists.
|
| 8 |
+
- Do not re-read a file after editing it to verify — edits would have errored if they failed.
|
| 9 |
+
|
| 10 |
+
## Editing Conventions
|
| 11 |
+
|
| 12 |
+
- When editing R/Quarto files, check for stale object/column name references after renames (e.g., `title` -> new name) across the entire file, not just the edit site.
|
| 13 |
+
- Make the minimal change needed — do not refactor or clean up surrounding code unless asked.
|
| 14 |
+
- Do not add comments, docstrings, or type annotations to code you did not change.
|
| 15 |
+
- Avoid backwards-compatibility shims for removed code; delete unused code outright.
|
| 16 |
+
|
| 17 |
+
## R Conventions
|
| 18 |
+
|
| 19 |
+
- This project uses data.table, not data.frame/dplyr. Always use data.table syntax (e.g., `DT[, .(col)]`, `:=`) and verify column access patterns work on data.table objects before finalizing edits.
|
| 20 |
+
- Use `set*` functions (`setnames`, `setorder`, `setkey`) for in-place mutations to avoid copies.
|
| 21 |
+
- Prefer `fread`/`fwrite` over `read.csv`/`write.csv`.
|
| 22 |
+
- Do not load tidyverse or dplyr unless explicitly requested.
|
| 23 |
+
|
| 24 |
+
## Performance / Memory
|
| 25 |
+
|
| 26 |
+
- Large datasets hit R memory limits — prefer chunking, avoid unnecessary copies, and add defensive guards for list vs data.table structures before `rbindlist`.
|
| 27 |
+
- Never use `rbind` in a loop; accumulate results in a list and call `rbindlist` once.
|
| 28 |
+
- Avoid `apply`-family functions on large data.tables; use vectorized data.table operations instead.
|
| 29 |
+
- Check `object.size()` / `lobstr::obj_size()` when debugging memory issues rather than guessing.
|
| 30 |
+
|
| 31 |
+
## Token Efficiency
|
| 32 |
+
|
| 33 |
+
- Do not summarize what you just did at the end of a response — the diff is visible.
|
| 34 |
+
- Do not repeat back the user's request before answering.
|
| 35 |
+
- Skip pleasantries and filler phrases ("Great question!", "Certainly!", etc.).
|
| 36 |
+
- When reading code to answer a question, read only the relevant section, not the whole file.
|
| 37 |
+
- Use `files_with_matches` output mode for Grep unless line content is needed.
|
Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
+
|
| 8 |
+
COPY . .
|
| 9 |
+
|
| 10 |
+
ENV HF_DATASET_REPO=Binxk/goon-data
|
| 11 |
+
ENV DATA_DIR=/app/data
|
| 12 |
+
|
| 13 |
+
RUN mkdir -p /app/data
|
| 14 |
+
|
| 15 |
+
EXPOSE 8501
|
| 16 |
+
|
| 17 |
+
CMD ["sh", "-c", \
|
| 18 |
+
"python -c \"import os; from huggingface_hub import snapshot_download; snapshot_download(repo_id=os.environ['HF_DATASET_REPO'], repo_type='dataset', local_dir=os.environ['DATA_DIR'])\" && \
|
| 19 |
+
streamlit run app.py --server.port=8501 --server.address=0.0.0.0"]
|
README.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Subreddits in subreddits25
|
| 2 |
+
|
| 3 |
+
- FemboyGooned
|
| 4 |
+
- furrygooners
|
| 5 |
+
- gaygooncave
|
| 6 |
+
- GayGoonerGuys
|
| 7 |
+
- GirlcockEgirlgooning
|
| 8 |
+
- girlgooners
|
| 9 |
+
- gonegoonergirls
|
| 10 |
+
- Goon_Galaxy
|
| 11 |
+
- GoonCaves
|
| 12 |
+
- GOONED
|
| 13 |
+
- GOONEDISBACK
|
| 14 |
+
- GOONEDmeetup
|
| 15 |
+
- GoonerUtopia
|
| 16 |
+
- GoonetteHub
|
| 17 |
+
- GoonFeet
|
| 18 |
+
- GoonForAlice
|
| 19 |
+
- GoonForAss
|
| 20 |
+
- GoonGay
|
| 21 |
+
- gooning4bbw
|
| 22 |
+
- GooningTrees
|
| 23 |
+
- GooningUnlimited
|
| 24 |
+
- JerkBudsGoonTogether
|
| 25 |
+
- NickiMinajGoonFarm
|
| 26 |
+
- NSFW_CAPTION_AND_GOON
|
| 27 |
+
- ShemaleGOONED
|
| 28 |
+
- sissyGOONED
|
| 29 |
+
- TGOONER
|
| 30 |
+
- TheGoonCaveOfficial
|
| 31 |
+
- TransGooned
|
| 32 |
+
- transgoonergirls
|
REPLICATION.md
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
editor_options:
|
| 3 |
+
markdown:
|
| 4 |
+
wrap: 72
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
# Replication Guide — Grasping 'Gooning'
|
| 8 |
+
|
| 9 |
+
An observational analysis of Reddit gooning communities via BERTopic
|
| 10 |
+
topic modelling.
|
| 11 |
+
|
| 12 |
+
------------------------------------------------------------------------
|
| 13 |
+
|
| 14 |
+
## Project summary
|
| 15 |
+
|
| 16 |
+
This study empirically analyses publicly available Reddit subreddits
|
| 17 |
+
dedicated to gooning (prolonged, trancelike masturbation). The analysis
|
| 18 |
+
pipeline combines:
|
| 19 |
+
|
| 20 |
+
- **Descriptive statistics** — demographics (age, gender, sexuality
|
| 21 |
+
from user flair), word frequencies, and substance mentions across 30
|
| 22 |
+
subreddits
|
| 23 |
+
- **BERTopic topic modelling** — semantic clustering of \~2.35M
|
| 24 |
+
documents (posts and comments) to identify the major themes and
|
| 25 |
+
discourse patterns
|
| 26 |
+
|
| 27 |
+
**Full corpus:** 30 subreddits, \~22M raw records, 2019–2025
|
| 28 |
+
|
| 29 |
+
------------------------------------------------------------------------
|
| 30 |
+
|
| 31 |
+
## Repository structure
|
| 32 |
+
|
| 33 |
+
```
|
| 34 |
+
Goon/
|
| 35 |
+
├── REPLICATION.md ← this file
|
| 36 |
+
├── README.md ← subreddit list
|
| 37 |
+
├── Goon.Rproj ← RStudio project (sets working directory)
|
| 38 |
+
├── data_prep.qmd ← Step 1: R — ingest CSVs, clean, export Parquet
|
| 39 |
+
├── goon_analysis.qmd ← Step 2: R — descriptive analyses
|
| 40 |
+
├── goon_topic_analysis.qmd ← Step 4: R — visualise BERTopic outputs
|
| 41 |
+
├── data/
|
| 42 |
+
│   ├── *.csv ← raw Reddit exports (one file per subreddit per year)
|
| 43 |
+
│   ├── GOONED_comments.csv ← pre-concatenated GOONED comments (all years)
|
| 44 |
+
│   ├── GOONED_submissions.csv ← pre-concatenated GOONED submissions (all years)
|
| 45 |
+
│   ├── comments.parquet ← output of data_prep.qmd (18.2M rows)
|
| 46 |
+
│   ├── posts.parquet ← output of data_prep.qmd (3.8M rows)
|
| 47 |
+
│   ├── corpus_clean.parquet ← output of topic_analysis.qmd (modelling corpus)
|
| 48 |
+
│   └── corpus_deleted.parquet ← deleted/removed rows (tracked separately)
|
| 49 |
+
├── topic analysis/
|
| 50 |
+
│   ├── topic_analysis.qmd ← Step 3: Python — BERTopic pipeline
|
| 51 |
+
│   ├── topic_api_labeling.qmd ← Step 3b: Python — optional API-based topic labelling
|
| 52 |
+
│   ├── build_topic_results_summary.py ← helper: generate an HTML summary from run outputs
|
| 53 |
+
│   ├── run_topic_analysis.sh ← shell wrapper that sets env vars and calls quarto
|
| 54 |
+
│   ├── README.md ← detailed pipeline documentation
|
| 55 |
+
│   ├── MEMORY_MANAGEMENT.md ← strategies for large-corpus runs
|
| 56 |
+
│   └── runs/
|
| 57 |
+
│       └── full_run_v1/ ← complete output of the full corpus run
|
| 58 |
+
│           ├── bertopic/ ← saved BERTopic models + doc-topic assignments
|
| 59 |
+
│           ├── topics/ ← keyword tables, summaries, evaluation, API labels
|
| 60 |
+
│           └── figures/ ← generated charts
|
| 61 |
+
└── .venv/ ← Python virtual environment (created per instructions below)
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
------------------------------------------------------------------------
|
| 65 |
+
|
| 66 |
+
## System requirements
|
| 67 |
+
|
| 68 |
+
### Hardware
|
| 69 |
+
|
| 70 |
+
| Component | Minimum | Recommended |
|
| 71 |
+
|----|----|----|
|
| 72 |
+
| RAM | 32 GB | 64 GB (full corpus run; see MEMORY_MANAGEMENT.md) |
|
| 73 |
+
| Disk | 50 GB free | 100 GB free |
|
| 74 |
+
| CPU | Any modern multi-core | 8+ cores for embedding generation |
|
| 75 |
+
| GPU | Not required | Optional — speeds up embedding by \~10× |
|
| 76 |
+
|
| 77 |
+
> The full corpus run was executed on a Linux VM with 96 GB RAM. A local
|
| 78 |
+
> Mac with 24 GB will work for the pilot/sample runs but may struggle
|
| 79 |
+
> with the full corpus embedding step.
|
| 80 |
+
|
| 81 |
+
### Software
|
| 82 |
+
|
| 83 |
+
| Tool | Version tested | Notes |
|
| 84 |
+
|----|----|----|
|
| 85 |
+
| R | ≥ 4.3 | For data_prep.qmd and goon_analysis.qmd |
|
| 86 |
+
| RStudio / Quarto CLI | ≥ 1.4 | To render .qmd files |
|
| 87 |
+
| Python | ≥ 3.10 | For topic_analysis.qmd |
|
| 88 |
+
| Quarto | ≥ 1.4 | Installed with RStudio or standalone |
|
| 89 |
+
|
| 90 |
+
------------------------------------------------------------------------
|
| 91 |
+
|
| 92 |
+
## Step-by-step replication
|
| 93 |
+
|
| 94 |
+
### Step 0: Clone / obtain the repository
|
| 95 |
+
|
| 96 |
+
The `data/` folder containing raw CSVs is required. These are large
|
| 97 |
+
files and are not distributed via git — they must be present locally.
|
| 98 |
+
|
| 99 |
+
Open `Goon.Rproj` in RStudio. This sets the working directory to the
|
| 100 |
+
project root so that `here::here()` paths resolve correctly.
|
| 101 |
+
|
| 102 |
+
------------------------------------------------------------------------
|
| 103 |
+
|
| 104 |
+
### Step 1: R data preparation (`data_prep.qmd`)
|
| 105 |
+
|
| 106 |
+
**Purpose:** Reads all raw CSVs, combines them into unified data frames,
|
| 107 |
+
applies minimal cleaning, and exports `data/comments.parquet` and
|
| 108 |
+
`data/posts.parquet`.
|
| 109 |
+
|
| 110 |
+
**R packages required:**
|
| 111 |
+
|
| 112 |
+
``` r
|
| 113 |
+
install.packages(c("dplyr", "tidyr", "tibble", "purrr",
|
| 114 |
+
"data.table", "arrow", "here"))
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
**Run:**
|
| 118 |
+
|
| 119 |
+
Open `data_prep.qmd` in RStudio and click **Render** (or run all chunks
|
| 120 |
+
in order).
|
| 121 |
+
|
| 122 |
+
Alternatively, from the terminal:
|
| 123 |
+
|
| 124 |
+
``` bash
|
| 125 |
+
cd /Users/bkot7579/Desktop/Goon
|
| 126 |
+
quarto render data_prep.qmd
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
**Expected outputs:** - `data/comments.parquet` — \~18.2M rows, \~528
|
| 130 |
+
MB - `data/posts.parquet` — \~3.8M rows, \~186 MB
|
| 131 |
+
|
| 132 |
+
**Time estimate:** 15–45 minutes depending on RAM and I/O speed (the
|
| 133 |
+
GOONED CSV files alone are \~7 GB).
|
| 134 |
+
|
| 135 |
+
------------------------------------------------------------------------
|
| 136 |
+
|
| 137 |
+
### Step 2: R descriptive analysis (`goon_analysis.qmd`)
|
| 138 |
+
|
| 139 |
+
**Purpose:** Demographic analysis (r/GOONEDmeetup flair), word frequency
|
| 140 |
+
analysis, substance mention counts.
|
| 141 |
+
|
| 142 |
+
**R packages required:**
|
| 143 |
+
|
| 144 |
+
``` r
|
| 145 |
+
install.packages(c("dplyr", "tidyr", "tibble", "purrr", "ggplot2",
|
| 146 |
+
"stringr", "here", "e1071", "tidytext",
|
| 147 |
+
"data.table", "arrow"))
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
**Run:**
|
| 151 |
+
|
| 152 |
+
Open `goon_analysis.qmd` in RStudio and click **Render**.
|
| 153 |
+
|
| 154 |
+
**Prerequisites:** `data/comments.parquet` and `data/posts.parquet` must
|
| 155 |
+
exist (Step 1).
|
| 156 |
+
|
| 157 |
+
**Expected outputs:** - Rendered HTML with plots embedded - In-memory:
|
| 158 |
+
word count tables, substance mention counts, demographic counts
|
| 159 |
+
|
| 160 |
+
------------------------------------------------------------------------
|
| 161 |
+
|
| 162 |
+
### Step 3: Python BERTopic topic modelling (`topic analysis/topic_analysis.qmd`)
|
| 163 |
+
|
| 164 |
+
**Purpose:** Embeds all documents with `all-MiniLM-L6-v2`, runs HDBSCAN
|
| 165 |
+
topic modelling via BERTopic, reduces topics using c-TF-IDF
|
| 166 |
+
agglomerative clustering, evaluates models, and exports all topic
|
| 167 |
+
outputs.
|
| 168 |
+
|
| 169 |
+
#### 3a. Create the Python virtual environment
|
| 170 |
+
|
| 171 |
+
``` bash
|
| 172 |
+
cd /Users/bkot7579/Desktop/Goon
|
| 173 |
+
python3 -m venv .venv
|
| 174 |
+
source .venv/bin/activate
|
| 175 |
+
|
| 176 |
+
pip install bertopic umap-learn hdbscan sentence-transformers \
|
| 177 |
+
scikit-learn pandas pyarrow quarto
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
**Key package versions (tested):**
|
| 181 |
+
|
| 182 |
+
| Package | Version |
|
| 183 |
+
|-----------------------|---------|
|
| 184 |
+
| bertopic | 0.16.x |
|
| 185 |
+
| umap-learn | 0.5.x |
|
| 186 |
+
| hdbscan | 0.8.x |
|
| 187 |
+
| sentence-transformers | 2.x |
|
| 188 |
+
| scikit-learn | 1.x |
|
| 189 |
+
| pandas | 2.x |
|
| 190 |
+
| pyarrow | 14.x |
|
| 191 |
+
|
| 192 |
+
#### 3b. Run the pipeline
|
| 193 |
+
|
| 194 |
+
**Pilot run (200k documents — recommended first):**
|
| 195 |
+
|
| 196 |
+
``` bash
|
| 197 |
+
cd /Users/bkot7579/Desktop/Goon
|
| 198 |
+
./topic\ analysis/run_topic_analysis.sh --max-docs 200000 --run-tag pilot_200k
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
Outputs land in `topic analysis/runs/pilot_200k/`.
|
| 202 |
+
|
| 203 |
+
**Full corpus run (\~2.35M cleaned documents after filtering):**
|
| 204 |
+
|
| 205 |
+
``` bash
|
| 206 |
+
cd /Users/bkot7579/Desktop/Goon
|
| 207 |
+
./topic\ analysis/run_topic_analysis.sh --run-tag full_run_v1
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
> **Warning:** The full run requires \~64 GB RAM for the embedding +
|
| 211 |
+
> UMAP stages. See `topic analysis/MEMORY_MANAGEMENT.md` for strategies
|
| 212 |
+
> if RAM is limited.
|
| 213 |
+
|
| 214 |
+
**Pipeline stages (automatically executed in order):**
|
| 215 |
+
|
| 216 |
+
1. **Data ingestion** — loads `data/posts.parquet` +
|
| 217 |
+
`data/comments.parquet`
|
| 218 |
+
2. **Light cleaning** — deduplication, URL/username/subreddit
|
| 219 |
+
anonymisation, markdown stripping; deleted/removed rows saved
|
| 220 |
+
separately to `data/corpus_deleted.parquet`
|
| 221 |
+
3. **Embedding** — `all-MiniLM-L6-v2` (384-dim), batch_size=512, saved
|
| 222 |
+
as shards; skips already-generated shards on resume
|
| 223 |
+
4. **UMAP** — pre-computed once, reused across all HDBSCAN configs
|
| 224 |
+
5. **BERTopic** — 6 configurations: min_cluster_size ∈ {50, 100, 200} ×
|
| 225 |
+
method ∈ {eom, leaf}
|
| 226 |
+
6. **Topic reduction** — c-TF-IDF agglomerative clustering to 100, 50,
|
| 227 |
+
and 25 topics
|
| 228 |
+
7. **Evaluation** — NPMI coherence, topic diversity, outlier rates
|
| 229 |
+
8. **Export** — keyword tables, representative docs, summary CSVs
|
| 230 |
+
|
| 231 |
+
**Reproducibility:** Random seed = 42 throughout. A
|
| 232 |
+
`reproducibility_log.json` is written to the run folder with all
|
| 233 |
+
settings and package versions.
|
| 234 |
+
|
| 235 |
+
#### 3c. Optional API-based topic labelling (`topic_api_labeling.qmd`)
|
| 236 |
+
|
| 237 |
+
Sends reduced topic summaries (keywords + representative texts) to an
|
| 238 |
+
LLM API to generate human-readable labels. Does NOT send raw corpus
|
| 239 |
+
text.
|
| 240 |
+
|
| 241 |
+
Set your API key, then render:
|
| 242 |
+
|
| 243 |
+
``` bash
|
| 244 |
+
export OPENAI_API_KEY="your-key-here"
|
| 245 |
+
quarto render "topic analysis/topic_api_labeling.qmd"
|
| 246 |
+
```
|
| 247 |
+
|
| 248 |
+
Outputs are saved to `topic analysis/runs/<run-tag>/topics/api/`.
|
| 249 |
+
|
| 250 |
+
------------------------------------------------------------------------
|
| 251 |
+
|
| 252 |
+
### Step 4: R topic visualisations (`goon_topic_analysis.qmd`)
|
| 253 |
+
|
| 254 |
+
**Purpose:** Loads the CSV/Parquet outputs from the BERTopic run and
|
| 255 |
+
produces exploratory visualisations: topic size bar chart, subreddit ×
|
| 256 |
+
topic heatmap, topic prevalence over time, post vs comment split,
|
| 257 |
+
representative documents.
|
| 258 |
+
|
| 259 |
+
**R packages required:**
|
| 260 |
+
|
| 261 |
+
``` r
|
| 262 |
+
install.packages(c("dplyr", "tidyr", "tibble", "purrr",
|
| 263 |
+
"ggplot2", "stringr", "here", "arrow"))
|
| 264 |
+
```
|
| 265 |
+
|
| 266 |
+
**Run:**
|
| 267 |
+
|
| 268 |
+
``` bash
|
| 269 |
+
quarto render goon_topic_analysis.qmd
|
| 270 |
+
```
|
| 271 |
+
|
| 272 |
+
**Prerequisites:** Step 3 must have completed and outputs must exist
|
| 273 |
+
under `topic analysis/runs/full_run_v1/topics/`.
|
| 274 |
+
|
| 275 |
+
------------------------------------------------------------------------
|
| 276 |
+
|
| 277 |
+
## Execution order summary
|
| 278 |
+
|
| 279 |
+
```
|
| 280 |
+
Step 1 → data_prep.qmd (R) ~30 min
|
| 281 |
+
Step 2 → goon_analysis.qmd (R) ~10 min
|
| 282 |
+
Step 3 → topic_analysis.qmd (Python) ~6–48 hours (full corpus)
|
| 283 |
+
Step 3b → topic_api_labeling.qmd (Python) ~5 min + API cost (optional)
|
| 284 |
+
Step 4 → goon_topic_analysis.qmd (R) ~2 min
|
| 285 |
+
```
|
| 286 |
+
|
| 287 |
+
Steps 2 and 3 are independent of each other and can run in parallel.
|
| 288 |
+
|
| 289 |
+
------------------------------------------------------------------------
|
| 290 |
+
|
| 291 |
+
## Key modelling decisions
|
| 292 |
+
|
| 293 |
+
| Decision | Choice | Rationale |
|
| 294 |
+
|-------------------------|--------------------|---------------------------|
|
| 295 |
+
| Embedding model | `all-MiniLM-L6-v2` | Fast, runs on CPU, 384-dim sufficient for topic structure |
|
| 296 |
+
| UMAP n_neighbors | 15 | BERTopic default; balances local vs global structure |
|
| 297 |
+
| UMAP n_components | 5 | Low enough for HDBSCAN to work well |
|
| 298 |
+
| HDBSCAN min_cluster_size | 50, 100, 200 | Tested all three; mcs=100 eom selected as reference |
|
| 299 |
+
| Topic reduction method | c-TF-IDF agglomerative | Merges semantically similar topics rather than splitting clusters |
|
| 300 |
+
| Reduction targets | 100, 50, 25 | 50 selected for reporting (NPMI=0.27, diversity=0.74) |
|
| 301 |
+
| Preprocessing | Minimal | Preserves informal language and slang; CountVectorizer handles casing/stopwords |
|
| 302 |
+
| Random seed | 42 | Applied to UMAP, HDBSCAN sampling, and document cap sampling |
|
| 303 |
+
|
| 304 |
+
------------------------------------------------------------------------
|
| 305 |
+
|
| 306 |
+
## Known issue: r/GOONED missing from BERTopic results
|
| 307 |
+
|
| 308 |
+
r/GOONED is the dominant subreddit in the corpus by a large margin:
|
| 309 |
+
|
| 310 |
+
| | Count |
|
| 311 |
+
|---------------------|----------------|
|
| 312 |
+
| r/GOONED posts | 2,765,119 |
|
| 313 |
+
| r/GOONED comments | 15,493,075 |
|
| 314 |
+
| **r/GOONED total** | **18,258,194** |
|
| 315 |
+
| Full cleaned corpus | 22,011,124 |
|
| 316 |
+
| **r/GOONED share** | **82.9%** |
|
| 317 |
+
|
| 318 |
+
Despite this, r/GOONED is **entirely absent** from
|
| 319 |
+
`topic analysis/runs/full_run_v1/` outputs. The cloud VM that ran the
|
| 320 |
+
BERTopic pipeline did not have the `GOONED_comments.csv` and
|
| 321 |
+
`GOONED_submissions.csv` files available (most likely because their
|
| 322 |
+
combined size of \~7 GB made transfer impractical), and
|
| 323 |
+
`corpus_clean.parquet` on the VM was generated without them.
|
| 324 |
+
|
| 325 |
+
**Consequence:** All topic modelling results represent 29 subreddits
|
| 326 |
+
(3.75M records) rather than the full 30-subreddit corpus (22M records).
|
| 327 |
+
Topic proportions, dominant themes, and subreddit distribution tables
|
| 328 |
+
are therefore not representative of the full corpus.
|
| 329 |
+
|
| 330 |
+
**To fix:** Ensure the `GOONED_*.csv` files (or the pre-built
|
| 331 |
+
`comments.parquet` / `posts.parquet`) are available on the compute
|
| 332 |
+
environment, then re-run:
|
| 333 |
+
|
| 334 |
+
``` bash
|
| 335 |
+
./topic\ analysis/run_topic_analysis.sh --run-tag full_run_v2
|
| 336 |
+
```
|
| 337 |
+
|
| 338 |
+
Note: the R-side analyses (`goon_analysis.qmd`) are **not** affected by
|
| 339 |
+
this issue — they read directly from the local `data/posts.parquet` and
|
| 340 |
+
`data/comments.parquet` files, which do contain r/GOONED data.
|
| 341 |
+
|
| 342 |
+
------------------------------------------------------------------------
|
| 343 |
+
|
| 344 |
+
## Known limitations
|
| 345 |
+
|
| 346 |
+
1. **Outlier rate:** 37.3% of documents are assigned to the outlier
|
| 347 |
+
topic (-1) by HDBSCAN. This is typical for short-text social media
|
| 348 |
+
corpora. Outliers are excluded from topic analyses but are retained
|
| 349 |
+
in the corpus parquet files.
|
| 350 |
+
|
| 351 |
+
2. **Flair ambiguity:** Gender/sexuality classifiers rely on
|
| 352 |
+
voluntarily set flair strings. Flair adoption is uneven across
|
| 353 |
+
subreddits and users, introducing selection bias. Abbreviations like
|
| 354 |
+
`\\bt\\b` may match unintended strings (e.g. US state TX).
|
| 355 |
+
|
| 356 |
+
3. **Deleted content:** Posts and comments marked `[deleted]` or
|
| 357 |
+
`[removed]` are excluded from modelling but counted separately.
|
| 358 |
+
These may disproportionately represent controversial content.
|
| 359 |
+
|
| 360 |
+
4. **Temporal coverage:** Coverage varies by subreddit. Some
|
| 361 |
+
communities only appear in later years (2024–2025); others span the
|
| 362 |
+
full 2019–2025 window.
|
| 363 |
+
|
| 364 |
+
5. **Platform-specific norms:** Moderation rules, flair conventions,
|
| 365 |
+
and posting styles differ across subreddits, which may shape topics
|
| 366 |
+
in ways that are not generalisable.
|
| 367 |
+
|
| 368 |
+
6. **Unobserved participants:** Lurkers, banned users, and deleted
|
| 369 |
+
accounts are not captured.
|
| 370 |
+
|
| 371 |
+
------------------------------------------------------------------------
|
| 372 |
+
|
| 373 |
+
## Output files (full_run_v1)
|
| 374 |
+
|
| 375 |
+
| File | Description |
|
| 376 |
+
|-----------------------|-------------------------------------------------|
|
| 377 |
+
| `topics/final_topic_summary.csv` | 50 reduced topics with size, keywords, representative texts |
|
| 378 |
+
| `topics/final_model_comparison.csv` | Coherence, diversity, outlier rate for all 9 runs |
|
| 379 |
+
| `topics/evaluation_table.csv` | Same as above, alternate format |
|
| 380 |
+
| `topics/bertopic_run_summary.csv` | Initial topic counts across 6 HDBSCAN configurations |
|
| 381 |
+
| `topics/topic_by_subreddit.csv` | Topic × subreddit document counts |
|
| 382 |
+
| `topics/topic_by_month.csv` | Topic × month document counts |
|
| 383 |
+
| `topics/topic_by_doctype.csv` | Topic × doc type (post/comment) |
|
| 384 |
+
| `topics/preprocessing_decisions.json` | Logged cleaning decisions |
|
| 385 |
+
| `topics/api/` | API-generated labels, summaries, category annotations |
|
| 386 |
+
| `bertopic/` | Saved BERTopic models + doc-topic parquet files for all 6 initial runs |
|
| 387 |
+
|
| 388 |
+
------------------------------------------------------------------------
|
| 389 |
+
|
| 390 |
+
## Contacts and citation
|
| 391 |
+
|
| 392 |
+
This analysis was conducted as part of a preliminary empirical study of
|
| 393 |
+
online gooning communities. If replicating, please cite the original
|
| 394 |
+
study and note the random seed, embedding model, and reduction target
|
| 395 |
+
used.
|
agent/.env.example
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ANTHROPIC_API_KEY=your_key_here
|
| 2 |
+
TOGETHER_API_KEY=your_key_here
|
| 3 |
+
# Optional: override default data directory (default: ../data relative to agent/)
|
| 4 |
+
# DATA_DIR=/absolute/path/to/data
|
agent/.streamlit/config.toml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
base = "light"
|
| 3 |
+
backgroundColor = "#f5f3ef"
|
| 4 |
+
secondaryBackgroundColor = "#edeae4"
|
| 5 |
+
textColor = "#1a1a1a"
|
| 6 |
+
primaryColor = "#1a1a1a"
|
| 7 |
+
font = "sans serif"
|
| 8 |
+
|
| 9 |
+
[server]
|
| 10 |
+
headless = true
|
agent/CLAUDE.md
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CLAUDE.md
|
| 2 |
+
|
| 3 |
+
## Project
|
| 4 |
+
|
| 5 |
+
Build an agent that:
|
| 6 |
+
|
| 7 |
+
1. Accepts user questions about a dataset.
|
| 8 |
+
2. Inspects available data files and schema.
|
| 9 |
+
3. Chooses appropriate analysis steps.
|
| 10 |
+
4. Runs reproducible analyses in code.
|
| 11 |
+
5. Returns answers in plain language plus supporting outputs.
|
| 12 |
+
6. Offers a simple interface for reviewing results.
|
| 13 |
+
|
| 14 |
+
## Core objective
|
| 15 |
+
|
| 16 |
+
This repository is for a question-driven data analysis agent.
|
| 17 |
+
The agent should behave like an analysis system, not a general chatbot.
|
| 18 |
+
Its job is to translate a user question into a reproducible analytic workflow grounded in the available data.
|
| 19 |
+
|
| 20 |
+
The agent must:
|
| 21 |
+
|
| 22 |
+
* inspect files before making assumptions
|
| 23 |
+
* prefer deterministic code execution over unsupported claims
|
| 24 |
+
* distinguish clearly between observed results, assumptions, and uncertainty
|
| 25 |
+
* save intermediate outputs when useful
|
| 26 |
+
* produce answers that map directly to the user's question
|
| 27 |
+
|
| 28 |
+
## Non-goals
|
| 29 |
+
|
| 30 |
+
Do not:
|
| 31 |
+
|
| 32 |
+
* invent variables, files, columns, or results
|
| 33 |
+
* answer from background knowledge when the answer should come from the data
|
| 34 |
+
* silently skip failed steps
|
| 35 |
+
* overwrite user data files unless explicitly instructed
|
| 36 |
+
* present speculative findings as if they were measured
|
| 37 |
+
|
| 38 |
+
## Default workflow
|
| 39 |
+
|
| 40 |
+
For each user request, follow this sequence:
|
| 41 |
+
|
| 42 |
+
1. Interpret the question
|
| 43 |
+
|
| 44 |
+
* Restate the task internally in operational terms.
|
| 45 |
+
* Identify whether the question is descriptive, comparative, predictive, inferential, filtering-based, text-analysis-based, or dashboard/UI-related.
|
| 46 |
+
* Identify the likely required inputs, outputs, and constraints.
|
| 47 |
+
|
| 48 |
+
2. Inspect the repository and data
|
| 49 |
+
|
| 50 |
+
* Find relevant files, scripts, configs, notebooks, and prior outputs.
|
| 51 |
+
* Detect available datasets and likely schema.
|
| 52 |
+
* Read small samples of files first.
|
| 53 |
+
* Determine data types, missingness, date fields, identifiers, and likely join keys.
|
| 54 |
+
|
| 55 |
+
3. Plan before acting
|
| 56 |
+
|
| 57 |
+
* Write a short task plan in the response or working notes.
|
| 58 |
+
* Select the minimum analysis needed to answer the question properly.
|
| 59 |
+
* Prefer existing project utilities where available.
|
| 60 |
+
|
| 61 |
+
4. Execute reproducibly
|
| 62 |
+
|
| 63 |
+
* Use code, scripts, or SQL rather than manual reasoning for calculations.
|
| 64 |
+
* Save generated code in the repo when it is likely to be reused.
|
| 65 |
+
* Save outputs to a structured outputs directory.
|
| 66 |
+
|
| 67 |
+
5. Validate
|
| 68 |
+
|
| 69 |
+
* Check row counts, duplicates, filters, and assumptions.
|
| 70 |
+
* Inspect whether results are plausible.
|
| 71 |
+
* Flag ambiguities, weak inference, or low data quality.
|
| 72 |
+
|
| 73 |
+
6. Respond
|
| 74 |
+
|
| 75 |
+
* Answer the exact question first.
|
| 76 |
+
* Then show method, assumptions, and supporting numbers.
|
| 77 |
+
* Keep prose direct and structured.
|
| 78 |
+
|
| 79 |
+
## Response format
|
| 80 |
+
|
| 81 |
+
Unless the user asks otherwise, structure final responses as:
|
| 82 |
+
|
| 83 |
+
1. Answer
|
| 84 |
+
|
| 85 |
+
* Direct answer to the question.
|
| 86 |
+
|
| 87 |
+
2. What was analysed
|
| 88 |
+
|
| 89 |
+
* Files used
|
| 90 |
+
* Filters used
|
| 91 |
+
* Key variables used
|
| 92 |
+
|
| 93 |
+
3. Method
|
| 94 |
+
|
| 95 |
+
* Exact analysis steps
|
| 96 |
+
* Statistical or computational approach
|
| 97 |
+
|
| 98 |
+
4. Results
|
| 99 |
+
|
| 100 |
+
* Main numbers, estimates, comparisons, or model outputs
|
| 101 |
+
|
| 102 |
+
5. Caveats
|
| 103 |
+
|
| 104 |
+
* Missing data, ambiguity, low sample size, measurement issues, or assumptions
|
| 105 |
+
|
| 106 |
+
6. Saved outputs
|
| 107 |
+
|
| 108 |
+
* Paths to scripts, tables, figures, or logs created during the run
|
| 109 |
+
|
| 110 |
+
## Behavioural rules
|
| 111 |
+
|
| 112 |
+
* Be concise but complete.
|
| 113 |
+
* Prefer bulletless structured sections unless lists are clearer.
|
| 114 |
+
* Never claim an analysis was run if it was not run.
|
| 115 |
+
* If a file or field is missing, say so explicitly.
|
| 116 |
+
* If the user asks a vague question, propose the most reasonable operationalization and proceed.
|
| 117 |
+
* If multiple interpretations are possible, state the one chosen and why.
|
| 118 |
+
* Prefer transparency over fluency.
|
| 119 |
+
|
| 120 |
+
## Analysis policy
|
| 121 |
+
|
| 122 |
+
### Descriptive questions
|
| 123 |
+
|
| 124 |
+
For questions like:
|
| 125 |
+
|
| 126 |
+
* How many
|
| 127 |
+
* What proportion
|
| 128 |
+
* What is the average
|
| 129 |
+
* What changed over time
|
| 130 |
+
|
| 131 |
+
Use:
|
| 132 |
+
|
| 133 |
+
* counts
|
| 134 |
+
* percentages
|
| 135 |
+
* grouped summaries
|
| 136 |
+
* date aggregation
|
| 137 |
+
* plots when helpful
|
| 138 |
+
|
| 139 |
+
### Comparative questions
|
| 140 |
+
|
| 141 |
+
For questions like:
|
| 142 |
+
|
| 143 |
+
* Is group A different from group B
|
| 144 |
+
* Which variables differ most across groups
|
| 145 |
+
|
| 146 |
+
Use:
|
| 147 |
+
|
| 148 |
+
* grouped summaries first
|
| 149 |
+
* effect sizes where appropriate
|
| 150 |
+
* inferential tests only if defensible for the data and design
|
| 151 |
+
|
| 152 |
+
### Predictive questions
|
| 153 |
+
|
| 154 |
+
For questions like:
|
| 155 |
+
|
| 156 |
+
* Can we predict X from Y
|
| 157 |
+
* Which variables matter most
|
| 158 |
+
|
| 159 |
+
Use:
|
| 160 |
+
|
| 161 |
+
* explicit train/test logic
|
| 162 |
+
* baseline models before complex ones
|
| 163 |
+
* interpretable models unless performance is the main goal
|
| 164 |
+
* appropriate metrics for the target type
|
| 165 |
+
|
| 166 |
+
### Text questions
|
| 167 |
+
|
| 168 |
+
For questions over text data:
|
| 169 |
+
|
| 170 |
+
* inspect raw examples first
|
| 171 |
+
* identify the unit of analysis: document, post, comment, sentence, token
|
| 172 |
+
* choose methods that fit the question: keyword rules, embeddings, topic modelling, clustering, classification, sentiment, summarisation, or retrieval
|
| 173 |
+
* keep raw text outputs traceable to source rows where allowed
|
| 174 |
+
|
| 175 |
+
### Causal language
|
| 176 |
+
|
| 177 |
+
Do not use causal language unless the design supports it.
|
| 178 |
+
Replace causal claims with associational language by default.
|
| 179 |
+
|
| 180 |
+
## Reproducibility requirements
|
| 181 |
+
|
| 182 |
+
* Every non-trivial analysis should produce a saved artifact.
|
| 183 |
+
* Prefer these folders when they exist; otherwise create them:
|
| 184 |
+
|
| 185 |
+
* `data/`
|
| 186 |
+
* `analysis/`
|
| 187 |
+
* `app/`
|
| 188 |
+
* `outputs/`
|
| 189 |
+
* `logs/`
|
| 190 |
+
* Save generated scripts with descriptive names.
|
| 191 |
+
* Save machine-readable outputs in csv, json, parquet, or feather where sensible.
|
| 192 |
+
* Save plots as png or svg.
|
| 193 |
+
* Save a short run log for substantial jobs.
|
| 194 |
+
|
| 195 |
+
## File and code conventions
|
| 196 |
+
|
| 197 |
+
* Never modify raw source data in place.
|
| 198 |
+
* Write derived data to `data/derived/`.
|
| 199 |
+
* Write one-off analysis scripts to `analysis/`.
|
| 200 |
+
* Write reusable utilities to `analysis/lib/` or an existing utilities folder.
|
| 201 |
+
* Name files to reflect task and date when useful.
|
| 202 |
+
|
| 203 |
+
## UI goal
|
| 204 |
+
|
| 205 |
+
If UI work is requested, build a lightweight question interface that sits on top of the analysis pipeline.
|
| 206 |
+
|
| 207 |
+
Preferred UI order:
|
| 208 |
+
|
| 209 |
+
1. Streamlit for fastest single-user internal tool
|
| 210 |
+
2. Gradio for rapid prototype interaction
|
| 211 |
+
3. React + FastAPI if a more controlled app is needed
|
| 212 |
+
|
| 213 |
+
Default recommendation:
|
| 214 |
+
|
| 215 |
+
* Use Streamlit if no frontend stack is already specified.
|
| 216 |
+
|
| 217 |
+
## Default UI specification
|
| 218 |
+
|
| 219 |
+
When building a UI, include:
|
| 220 |
+
|
| 221 |
+
* a file uploader or dataset selector
|
| 222 |
+
* a question input box
|
| 223 |
+
* an optional advanced settings panel
|
| 224 |
+
* a results panel with direct answer
|
| 225 |
+
* expandable method details
|
| 226 |
+
* downloadable outputs when possible
|
| 227 |
+
* a history pane of prior questions in the current session
|
| 228 |
+
|
| 229 |
+
## Suggested app architecture
|
| 230 |
+
|
| 231 |
+
### Minimal version
|
| 232 |
+
|
| 233 |
+
* `app/app.py` for Streamlit UI
|
| 234 |
+
* `analysis/router.py` to classify question type
|
| 235 |
+
* `analysis/inspect_data.py` to inspect schema and file summaries
|
| 236 |
+
* `analysis/run_analysis.py` to execute question-specific workflows
|
| 237 |
+
* `analysis/format_response.py` to convert outputs into a final answer
|
| 238 |
+
* `outputs/` for saved artifacts
|
| 239 |
+
|
| 240 |
+
### API version
|
| 241 |
+
|
| 242 |
+
* `app/frontend/` for React UI
|
| 243 |
+
* `app/api/main.py` for FastAPI endpoints
|
| 244 |
+
* `analysis/` for shared analysis logic
|
| 245 |
+
* `outputs/` and `logs/` for artifacts and traceability
|
| 246 |
+
|
| 247 |
+
## Question routing logic
|
| 248 |
+
|
| 249 |
+
Map user questions to one of these modes:
|
| 250 |
+
|
| 251 |
+
* `describe`
|
| 252 |
+
* `compare`
|
| 253 |
+
* `trend`
|
| 254 |
+
* `predict`
|
| 255 |
+
* `text_search`
|
| 256 |
+
* `text_cluster`
|
| 257 |
+
* `summarise_subset`
|
| 258 |
+
* `dashboard_request`
|
| 259 |
+
* `data_quality_check`
|
| 260 |
+
|
| 261 |
+
When uncertain, start with `describe` plus schema inspection.
|
| 262 |
+
|
| 263 |
+
## Tool preferences
|
| 264 |
+
|
| 265 |
+
Prefer this order:
|
| 266 |
+
|
| 267 |
+
1. Existing project scripts
|
| 268 |
+
2. Python analysis scripts
|
| 269 |
+
3. SQL queries if the data store is relational
|
| 270 |
+
4. R scripts if the project already uses R heavily
|
| 271 |
+
5. Shell utilities for fast inspection only
|
| 272 |
+
|
| 273 |
+
## Statistical discipline
|
| 274 |
+
|
| 275 |
+
* Report denominators.
|
| 276 |
+
* Report missingness when relevant.
|
| 277 |
+
* Do not run significance tests by reflex.
|
| 278 |
+
* Match methods to the measurement level and design.
|
| 279 |
+
* For small samples or noisy text outputs, emphasise uncertainty.
|
| 280 |
+
|
| 281 |
+
## Error handling
|
| 282 |
+
|
| 283 |
+
When something fails:
|
| 284 |
+
|
| 285 |
+
* state exactly what failed
|
| 286 |
+
* include the file, command, or function involved
|
| 287 |
+
* propose the next best fallback
|
| 288 |
+
* continue where possible instead of stopping entirely
|
| 289 |
+
|
| 290 |
+
## Safe execution rules
|
| 291 |
+
|
| 292 |
+
* Confirm before destructive operations.
|
| 293 |
+
* Avoid network calls unless needed.
|
| 294 |
+
* Do not expose secrets from `.env`, keys, or credentials.
|
| 295 |
+
* Do not execute untrusted code from data files.
|
| 296 |
+
|
| 297 |
+
## Preferred implementation plan when asked to build the system
|
| 298 |
+
|
| 299 |
+
1. Inspect repo and identify current stack.
|
| 300 |
+
2. Create analysis router.
|
| 301 |
+
3. Create schema inspection utility.
|
| 302 |
+
4. Create question-to-analysis execution module.
|
| 303 |
+
5. Create response formatter.
|
| 304 |
+
6. Create Streamlit UI if no existing frontend is present.
|
| 305 |
+
7. Test on one small example dataset.
|
| 306 |
+
8. Save outputs and usage instructions.
|
| 307 |
+
|
| 308 |
+
## What to do when the repo is empty
|
| 309 |
+
|
| 310 |
+
If the repository is mostly empty, scaffold this structure:
|
| 311 |
+
|
| 312 |
+
```text
|
| 313 |
+
project_root/
|
| 314 |
+
CLAUDE.md
|
| 315 |
+
README.md
|
| 316 |
+
requirements.txt
|
| 317 |
+
.env.example
|
| 318 |
+
data/
|
| 319 |
+
raw/
|
| 320 |
+
derived/
|
| 321 |
+
analysis/
|
| 322 |
+
__init__.py
|
| 323 |
+
inspect_data.py
|
| 324 |
+
router.py
|
| 325 |
+
run_analysis.py
|
| 326 |
+
format_response.py
|
| 327 |
+
app/
|
| 328 |
+
app.py
|
| 329 |
+
outputs/
|
| 330 |
+
logs/
|
| 331 |
+
```
|
| 332 |
+
|
| 333 |
+
Then implement:
|
| 334 |
+
|
| 335 |
+
* a schema inspector
|
| 336 |
+
* a rule-based question router
|
| 337 |
+
* a first-pass analysis runner for descriptive and comparative questions
|
| 338 |
+
* a simple Streamlit UI
|
| 339 |
+
|
| 340 |
+
## Definition of done
|
| 341 |
+
|
| 342 |
+
A task is complete only when:
|
| 343 |
+
|
| 344 |
+
* the question has been answered directly
|
| 345 |
+
* the analysis actually ran or the blocker is explicit
|
| 346 |
+
* outputs are saved where appropriate
|
| 347 |
+
* the response includes methods and caveats
|
| 348 |
+
* any created UI or scripts can be run by another developer with minimal guesswork
|
| 349 |
+
|
| 350 |
+
## Example high-quality user requests
|
| 351 |
+
|
| 352 |
+
* Analyse `data/raw/survey.csv` and tell me whether anxiety differs by gender.
|
| 353 |
+
* Compare yearly trends in posts by topic using the reddit export in `data/raw/`.
|
| 354 |
+
* Build a UI where I can upload a csv and ask natural-language questions.
|
| 355 |
+
* Inspect this dataset and tell me what questions it can answer reliably.
|
| 356 |
+
|
| 357 |
+
## Example implementation prompt for Claude Code
|
| 358 |
+
|
| 359 |
+
Build a question-driven data analysis agent in this repository.
|
| 360 |
+
|
| 361 |
+
Requirements:
|
| 362 |
+
|
| 363 |
+
* ingest tabular files and optionally text datasets
|
| 364 |
+
* inspect schema before analysis
|
| 365 |
+
* classify incoming questions into analysis modes
|
| 366 |
+
* run reproducible analyses in Python
|
| 367 |
+
* save scripts and outputs
|
| 368 |
+
* return direct answers plus methods and caveats
|
| 369 |
+
* build a simple Streamlit UI with file upload, question input, and results panel
|
| 370 |
+
|
| 371 |
+
Start by scaffolding the project if needed, then implement the minimal working version for csv files.
|
| 372 |
+
Do not invent results. Run actual analyses against the provided data.
|
agent/README.md
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Agent App
|
| 2 |
+
|
| 3 |
+
## Run the UI
|
| 4 |
+
|
| 5 |
+
```bash
|
| 6 |
+
cd agent  # run from the repository root
|
| 7 |
+
python3 -m venv .venv
|
| 8 |
+
source .venv/bin/activate
|
| 9 |
+
pip install -r requirements.txt
|
| 10 |
+
streamlit run app/app.py
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
Set your Anthropic key in `agent/.env`:
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
ANTHROPIC_API_KEY=your_key_here
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
Or paste it into the sidebar after the app starts.
|
| 20 |
+
|
| 21 |
+
## Local Python API
|
| 22 |
+
|
| 23 |
+
This repo does not expose an HTTP API server. The supported programmatic interface is the local Python function in `analysis.agent`.
|
| 24 |
+
|
| 25 |
+
### Basic usage
|
| 26 |
+
|
| 27 |
+
```python
|
| 28 |
+
from analysis.agent import run_agent
|
| 29 |
+
|
| 30 |
+
result = run_agent("How many posts per subreddit?")
|
| 31 |
+
print(result["answer"])
|
| 32 |
+
print(result["tool_calls"])
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
### With prior context
|
| 36 |
+
|
| 37 |
+
```python
|
| 38 |
+
from analysis.agent import run_agent
|
| 39 |
+
|
| 40 |
+
turns = [
|
| 41 |
+
{
|
| 42 |
+
"question": "How many posts per subreddit?",
|
| 43 |
+
"answer": "Previous answer text",
|
| 44 |
+
"tool_calls": [],
|
| 45 |
+
"artifacts": [],
|
| 46 |
+
"plotly_json": "",
|
| 47 |
+
"route": "describe",
|
| 48 |
+
}
|
| 49 |
+
]
|
| 50 |
+
|
| 51 |
+
result = run_agent(
|
| 52 |
+
"Which subreddits changed most over time?",
|
| 53 |
+
turns=turns,
|
| 54 |
+
)
|
| 55 |
+
print(result["route"])
|
| 56 |
+
print(result["answer"])
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
## Return shape
|
| 60 |
+
|
| 61 |
+
`run_agent(...)` returns a dictionary with:
|
| 62 |
+
|
| 63 |
+
- `answer`: final assistant response
|
| 64 |
+
- `tool_calls`: executed tool calls plus arguments and results
|
| 65 |
+
- `plotly_json`: chart payload when a plot was generated
|
| 66 |
+
- `route`: detected route for the question
|
| 67 |
+
- `allowed_tools`: tools exposed for that route
|
| 68 |
+
|
| 69 |
+
## Notes
|
| 70 |
+
|
| 71 |
+
- Use `python3`, not `python`, in this environment.
|
| 72 |
+
- The app stores structured turn state in the Streamlit session so follow-up questions can reuse prior analytical context.
|
| 73 |
+
- Generated CSV and PNG artifacts are written to `agent/outputs/`.
|
agent/analysis.py
ADDED
|
@@ -0,0 +1,1480 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Goon analysis agent — all modules in one file.
|
| 3 |
+
|
| 4 |
+
Sections:
|
| 5 |
+
1. Response formatter
|
| 6 |
+
2. Data inspection & sampling
|
| 7 |
+
3. Question router
|
| 8 |
+
4. Image analysis
|
| 9 |
+
5. Text pattern extraction
|
| 10 |
+
6. Analysis execution (count, trend, stats, search, word freq, compare)
|
| 11 |
+
7. Core agent loop
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
# ── stdlib ──────────────────────────────────────────────────────────────────
|
| 17 |
+
import base64
|
| 18 |
+
import glob
|
| 19 |
+
import io
|
| 20 |
+
import json
|
| 21 |
+
import os
|
| 22 |
+
import re
|
| 23 |
+
import traceback as _traceback
|
| 24 |
+
import urllib.request
|
| 25 |
+
from dataclasses import dataclass
|
| 26 |
+
from datetime import datetime, timezone
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
|
| 29 |
+
# ── third-party ─────────────────────────────────────────────────────────────
|
| 30 |
+
import anthropic
|
| 31 |
+
import openai # Together AI uses an OpenAI-compatible endpoint
|
| 32 |
+
import pandas as pd
|
| 33 |
+
import plotly.express as px
|
| 34 |
+
import plotly.io as pio
|
| 35 |
+
import pyarrow.dataset as ds
|
| 36 |
+
import pyarrow.parquet as pq
|
| 37 |
+
from pyarrow.compute import field
|
| 38 |
+
from sklearn.metrics import cohen_kappa_score
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 42 |
+
# 1. Response formatter
|
| 43 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 44 |
+
|
| 45 |
+
def format_result(result: dict, answer_text: str = "") -> str:
    """Render an analysis result as a markdown report.

    Sections appear in a fixed order and only when they have content:

    * ``## Answer`` - the stripped ``answer_text``.
    * ``## What was analysed`` - ``result["dataset"]`` plus whichever of
      ``subreddit_filter``, ``group_col`` and ``value_col`` are truthy.
    * ``## Results`` - ``result["table"]`` rendered as a markdown table.
    * ``## Saved outputs`` - the ``saved_csv`` / ``saved_png`` paths.

    Returns the sections joined with newlines (an empty string when nothing
    applies).
    """
    parts: list[str] = []

    if answer_text:
        parts += ["## Answer\n", answer_text.strip(), ""]

    dataset = result.get("dataset", "")
    if dataset:
        parts.append("## What was analysed\n")
        parts.append(f"- Dataset: `{dataset}`")
        # Optional context bullets, listed in display order.
        optional_details = (
            ("subreddit_filter", "Subreddit filter"),
            ("group_col", "Grouped by"),
            ("value_col", "Value column"),
        )
        for key, label in optional_details:
            if result.get(key):
                parts.append(f"- {label}: `{result[key]}`")
        parts.append("")

    table_rows = result.get("table")
    if table_rows:
        parts += ["## Results\n", _dict_list_to_md_table(table_rows), ""]

    artifact_paths = [result[key] for key in ("saved_csv", "saved_png") if result.get(key)]
    if artifact_paths:
        parts.append("## Saved outputs\n")
        parts.extend(f"- `{path}`" for path in artifact_paths)
        parts.append("")

    return "\n".join(parts)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def _dict_list_to_md_table(records: list[dict]) -> str:
|
| 83 |
+
if not records:
|
| 84 |
+
return "_No results._"
|
| 85 |
+
headers = list(records[0].keys())
|
| 86 |
+
rows = [[str(r.get(h, "")) for h in headers] for r in records]
|
| 87 |
+
widths = [max(len(h), max((len(r[i]) for r in rows), default=0)) for i, h in enumerate(headers)]
|
| 88 |
+
sep = "| " + " | ".join("-" * w for w in widths) + " |"
|
| 89 |
+
header_row = "| " + " | ".join(h.ljust(widths[i]) for i, h in enumerate(headers)) + " |"
|
| 90 |
+
data_rows = [
|
| 91 |
+
"| " + " | ".join(cell.ljust(widths[i]) for i, cell in enumerate(row)) + " |"
|
| 92 |
+
for row in rows[:50]
|
| 93 |
+
]
|
| 94 |
+
return "\n".join([header_row, sep] + data_rows)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 98 |
+
# 2. Data inspection & sampling
|
| 99 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 100 |
+
|
| 101 |
+
# Directory holding the input parquet files: the DATA_DIR environment variable
# wins; otherwise the data/ folder one level above this file's directory.
DATA_DIR = Path(os.getenv("DATA_DIR", Path(__file__).parent.parent / "data"))

# Generated artifacts live in outputs/ next to this file (created on import).
OUTPUTS_DIR = Path(__file__).with_name("outputs")
OUTPUTS_DIR.mkdir(exist_ok=True)

# JSON cache written by compute_dataset_metadata().
METADATA_CACHE = OUTPUTS_DIR.joinpath("dataset_metadata.json")
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _best(name: str) -> Path:
    """Pick the parquet file to use for dataset ``name``.

    ``<name>_full.parquet`` is preferred, but only when it exists and its
    schema can be read; in every other case ``<name>.parquet`` is returned
    (without checking that it exists).
    """
    original = DATA_DIR / f"{name}.parquet"
    rebuilt = DATA_DIR / f"{name}_full.parquet"
    if not rebuilt.exists():
        return original
    try:
        pq.read_schema(rebuilt)
    except Exception:
        # Unreadable rebuilt file: fall back to the original rather than fail.
        return original
    return rebuilt
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Registry of dataset name -> parquet path. Insertion order is kept because
# compute_dataset_metadata() iterates this mapping.
DATASETS = dict(
    posts=_best("posts"),
    comments=_best("comments"),
    corpus_clean=DATA_DIR / "corpus_clean.parquet",
    titles=_best("titles"),
)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def _dataset_path(name: str) -> Path:
    """Resolve a registered dataset name to its parquet file.

    Raises:
        FileNotFoundError: ``name`` is not a key of ``DATASETS`` or the
            registered file does not exist on disk.
    """
    path = DATASETS.get(name)
    if path is not None and path.exists():
        return path
    raise FileNotFoundError(f"Dataset '{name}' not found at {path}")
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _load(name: str, columns: list[str] | None = None) -> pd.DataFrame:
    """Read a registered dataset into a DataFrame.

    ``columns`` is forwarded to ``pd.read_parquet``; ``None`` reads every
    column. Raises ``FileNotFoundError`` (via ``_dataset_path``) for an
    unknown or missing dataset.
    """
    parquet_path = _dataset_path(name)
    return pd.read_parquet(parquet_path, columns=columns)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def _scanner(name: str, columns: list[str] | None = None, filters: dict | None = None) -> ds.Scanner:
    """Build a pyarrow scanner over a registered dataset.

    Args:
        name: Key of ``DATASETS``.
        columns: Columns to project; ``None`` is passed through unchanged.
        filters: Mapping of column name to required value. The entries are
            combined with AND as equality tests. An entry is ignored when
            its column is not in the dataset schema or its value is ``None``
            or ``""``.

    Returns:
        A scanner with the projection and the combined filter applied (no
        filter when no entry qualifies).
    """
    source = ds.dataset(_dataset_path(name), format="parquet")
    predicate = None
    for column, wanted in (filters or {}).items():
        usable = column in source.schema.names and wanted not in (None, "")
        if not usable:
            continue
        term = field(column) == wanted
        if predicate is None:
            predicate = term
        else:
            predicate = predicate & term
    return source.scanner(columns=columns, filter=predicate)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def _read_distinct_values(name: str, column: str, limit: int = 200) -> list[str] | None:
    """List the distinct non-null values of one column as sorted strings.

    Returns ``None`` when the dataset has no such column; otherwise the
    values are converted with ``str``, sorted, and cut to the first
    ``limit`` entries.
    """
    if column not in _schema_names(name):
        return None
    column_data = _scanner(name, columns=[column]).to_table().column(column)
    distinct = column_data.drop_null().unique().to_pylist()
    as_text = [str(value) for value in distinct]
    as_text.sort()
    return as_text[:limit]
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def _read_date_range(name: str) -> dict | None:
    """Report the earliest and latest ``created_utc`` of a dataset.

    ``created_utc`` values are handed to ``datetime.fromtimestamp`` (i.e.
    treated as POSIX timestamps) and formatted as UTC ``YYYY-MM-DD``.

    Returns:
        ``{"earliest": ..., "latest": ...}``, or ``None`` when the column is
        absent, the dataset has no rows, or every value is null.
    """
    if "created_utc" not in _schema_names(name):
        return None

    table = _scanner(name, columns=["created_utc"]).to_table()
    if table.num_rows == 0:
        return None

    stamps = table.column("created_utc").to_pandas().dropna()
    if stamps.empty:
        return None

    def _as_day(epoch):
        return datetime.fromtimestamp(epoch, tz=timezone.utc).strftime("%Y-%m-%d")

    return {"earliest": _as_day(stamps.min()), "latest": _as_day(stamps.max())}
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def _schema_names(name: str) -> list[str]:
    """Return the column names of a registered dataset, read from its parquet schema."""
    schema = pq.read_schema(_dataset_path(name))
    return schema.names
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def compute_dataset_metadata() -> dict:
    """Collect metadata for every entry in ``DATASETS`` and write it to the cache.

    Datasets whose file does not exist are recorded as ``{"available": False}``.
    For the others the entry holds the path, row count, column types, distinct
    subreddits, date range and the time the entry was computed. The complete
    mapping is written to ``METADATA_CACHE`` as JSON and also returned.
    """
    metadata: dict = {}
    for name, path in DATASETS.items():
        if path.exists():
            parquet_file = pq.ParquetFile(path)
            column_types = {col.name: str(col.type) for col in parquet_file.schema_arrow}
            metadata[name] = {
                "available": True,
                "path": str(path),
                "rows": parquet_file.metadata.num_rows,
                "columns": column_types,
                "subreddits": _read_distinct_values(name, "subreddit"),
                "date_range": _read_date_range(name),
                "metadata_cached_at": datetime.now(timezone.utc).isoformat(),
            }
        else:
            metadata[name] = {"available": False}
    METADATA_CACHE.write_text(json.dumps(metadata, indent=2))
    return metadata
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def get_dataset_metadata(refresh: bool = False) -> dict:
    """Return dataset metadata, preferring the on-disk cache.

    Args:
        refresh: When true, ignore any cached file and recompute.

    Returns:
        The metadata mapping produced by ``compute_dataset_metadata``.

    A cache file that cannot be read or parsed (for example one left truncated
    by an interrupted write) is treated as missing: the metadata is recomputed,
    which also rewrites the cache.
    """
    if not refresh and METADATA_CACHE.exists():
        try:
            return json.loads(METADATA_CACHE.read_text())
        except (OSError, json.JSONDecodeError):
            # Unusable cache: fall through and rebuild it instead of failing
            # on every subsequent call.
            pass
    return compute_dataset_metadata()
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def list_datasets(refresh: bool = False) -> dict:
    """Return dataset metadata (cached unless ``refresh``) without loading full tables."""
    metadata = get_dataset_metadata(refresh=refresh)
    return metadata
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def sample_rows(
    dataset: str,
    n: int = 5,
    filters: dict | None = None,
    columns: list[str] | None = None,
) -> dict:
    """Preview the first ``n`` rows of a dataset that pass the equality filters.

    When ``columns`` is empty or ``None`` every schema column is returned.
    Missing values in the returned rows are replaced by empty strings.
    """
    wanted_columns = columns if columns else _schema_names(dataset)
    preview = _scanner(dataset, columns=wanted_columns, filters=filters).head(n)
    if preview.num_rows:
        frame = preview.to_pandas()
    else:
        # Keep the requested column layout even when nothing matched.
        frame = pd.DataFrame(columns=wanted_columns)
    records = frame.fillna("").to_dict(orient="records")
    return {
        "dataset": dataset,
        "filters": filters or {},
        "n_returned": len(frame),
        "rows": records,
    }
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 229 |
+
# 3. Question router
|
| 230 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 231 |
+
|
| 232 |
+
@dataclass(frozen=True)
class RoutePlan:
    """Routing decision for one user question."""

    mode: str  # route identifier, e.g. "image", "trend" or "unknown"
    allowed_tools: list[str]  # names of the tools offered for this route
    guidance: str  # free-text hint on how to approach the question


ALL_TOOL_NAMES = [
    "list_datasets", "sample_rows", "count_by_group", "trend_over_time",
    "summary_stats", "top_posts", "text_search", "word_freq", "compare_groups",
    "extract_frequency_patterns", "extract_dominance_patterns", "analyze_image_sample",
    "export_reliability_sample", "compute_reliability",
]

# Simple inflections accepted after a trigger, so "mention" still matches
# "mentions"/"mentioned" while "top" no longer matches "topics" or "laptop".
_TRIGGER_SUFFIX = r"(?:s|es|d|ed|ing|ly)?"


def _trigger_regex(terms: list[str]) -> re.Pattern:
    """Compile trigger words/phrases into one whole-word regex."""
    alternatives = "|".join(re.escape(term) for term in terms)
    return re.compile(rf"\b(?:{alternatives}){_TRIGGER_SUFFIX}\b")


# Ordered routing table of (mode, trigger regex, allowed tools, guidance).
# Order matters: the first rule whose trigger matches wins.
_ROUTE_RULES = [
    (
        "image",
        _trigger_regex(["image", "images", "photo", "photos", "visual", "depicted"]),
        ["list_datasets", "sample_rows", "analyze_image_sample", "export_reliability_sample", "compute_reliability"],
        "This is a visual-content question. Prefer image analysis tools and avoid text-only proxies. Always provide a coding_scheme.",
    ),
    (
        "reliability",
        _trigger_regex(["reliability", "kappa", "human coding", "inter-rater", "validate"]),
        ["export_reliability_sample", "compute_reliability"],
        "This is a reliability/validation question. Use export_reliability_sample then compute_reliability.",
    ),
    (
        "pattern_frequency",
        _trigger_regex(["how often", "how long", "times per", "every day", "session length", "streak"]),
        ["list_datasets", "sample_rows", "extract_frequency_patterns", "text_search"],
        "This is a behavioral frequency/duration question. Prefer regex pattern extraction over generic word counts.",
    ),
    (
        "pattern_dominance",
        _trigger_regex(["dominant", "subordinate", "mistress", "goddess", "femdom", "submissive"]),
        ["list_datasets", "sample_rows", "extract_dominance_patterns", "text_search", "analyze_image_sample"],
        "This is a dominance/subordination framing question. Use the text pattern tool unless the user explicitly asks about images.",
    ),
    (
        "trend",
        _trigger_regex(["over time", "trend", "changed", "change over time", "monthly", "yearly"]),
        ["list_datasets", "sample_rows", "trend_over_time", "count_by_group"],
        "This is a time-series question. Prefer trend_over_time and only use grouping/count tools to contextualize it.",
    ),
    (
        "compare",
        _trigger_regex(["compare", "difference", "versus", "vs", "higher", "lower"]),
        ["list_datasets", "sample_rows", "compare_groups", "summary_stats", "count_by_group"],
        "This is a comparison question. Prefer compare_groups or summary_stats with explicit filters.",
    ),
    (
        "ranking",
        _trigger_regex(["top", "highest", "best scoring", "most upvoted"]),
        ["list_datasets", "sample_rows", "top_posts", "summary_stats"],
        "This is a ranking question. Prefer top_posts and use summary_stats only if it supports the answer.",
    ),
    (
        "search",
        _trigger_regex(["search", "find", "mention", "contains", "where people say"]),
        ["list_datasets", "sample_rows", "text_search", "top_posts"],
        "This is a retrieval question. Prefer text_search with the right dataset and text column.",
    ),
    (
        "lexical",
        _trigger_regex(["common words", "most common words", "word frequency", "tokens"]),
        ["list_datasets", "sample_rows", "word_freq", "text_search"],
        "This is a lexical summary question. Prefer word_freq and inspect text samples only if needed.",
    ),
    (
        "describe",
        _trigger_regex(["how many", "count", "number of", "what proportion"]),
        ["list_datasets", "sample_rows", "count_by_group", "summary_stats", "trend_over_time"],
        "This is a descriptive count question. Prefer count_by_group or summary_stats and keep the plan minimal.",
    ),
]


def route_question(question: str) -> RoutePlan:
    """Pick a route (mode, allowed tools, guidance) for a free-text question.

    The question is lower-cased and tested against ``_ROUTE_RULES`` in order;
    the first rule whose trigger appears as a whole word or phrase (optionally
    with a simple inflection suffix) wins. Whole-word matching avoids the
    false positives of plain substring tests, e.g. "top" inside "topics" or
    "vs" inside "canvas". When nothing matches, the ``"unknown"`` route with
    every tool is returned.

    Each returned plan owns a fresh ``allowed_tools`` list, so mutating it
    never alters the routing table or ``ALL_TOOL_NAMES``.
    """
    q = question.lower()
    for mode, trigger, tools, guidance in _ROUTE_RULES:
        if trigger.search(q):
            return RoutePlan(mode=mode, allowed_tools=list(tools), guidance=guidance)
    return RoutePlan(
        mode="unknown",
        allowed_tools=list(ALL_TOOL_NAMES),
        guidance="Question type is ambiguous. Inspect metadata first, then choose the minimum reliable tool path.",
    )
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 318 |
+
# 4. Image analysis
|
| 319 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 320 |
+
|
| 321 |
+
# Model identifier sent as `model=` to the chat-completions endpoint.
VISION_MODEL = "Qwen/Qwen2-VL-72B-Instruct"
# Base URL handed to the OpenAI-compatible client (Together AI).
TOGETHER_BASE_URL = "https://api.together.xyz/v1"
# Posts are kept as image candidates only when their `domain` is one of these hosts.
DIRECT_IMAGE_DOMAINS = {"i.redd.it", "i.imgur.com", "i.redgifs.com"}
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
def _load_image_urls(subreddit: str | None = None, n: int = 50) -> pd.DataFrame:
    """Collect candidate direct-image posts from the raw submissions CSVs.

    Reads every ``*_submissions_*.csv`` under ``DATA_DIR`` (only files whose
    name starts with ``subreddit``, case-insensitively, when one is given),
    drops self posts, and keeps rows whose domain is in
    ``DIRECT_IMAGE_DOMAINS`` and whose URL is present. Files that cannot be
    read or lack the expected columns are skipped. At most ``n * 10`` rows are
    returned, randomly sampled with a fixed seed when more are available. An
    empty DataFrame is returned when no file yields any frame.
    """
    csv_paths = sorted(glob.glob(str(DATA_DIR / "*_submissions_*.csv")))
    if subreddit:
        csv_paths = [p for p in csv_paths if Path(p).name.lower().startswith(subreddit.lower())]

    wanted = ["subreddit", "title", "url", "domain", "score", "is_self"]
    output_cols = ["subreddit", "title", "url", "domain", "score"]
    collected = []
    for csv_path in csv_paths:
        try:
            posts = pd.read_csv(csv_path, usecols=lambda c: c in wanted, low_memory=False)
            if "is_self" in posts.columns:
                posts = posts[posts["is_self"].eq(False)]
            if "url" in posts.columns and "domain" in posts.columns:
                is_direct = posts["domain"].isin(DIRECT_IMAGE_DOMAINS)
                posts = posts[is_direct].dropna(subset=["url"])
            collected.append(posts[output_cols])
        except Exception:
            # Best effort: an unreadable or incomplete file is simply skipped.
            continue

    if not collected:
        return pd.DataFrame()
    pool = pd.concat(collected, ignore_index=True)
    cap = n * 10
    if len(pool) > cap:
        pool = pool.sample(cap, random_state=42)
    return pool.head(cap)
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
def _fetch_image_b64(url: str, timeout: int = 8) -> tuple[str, str] | None:
|
| 354 |
+
try:
|
| 355 |
+
url.encode("ascii")
|
| 356 |
+
except UnicodeEncodeError:
|
| 357 |
+
return None
|
| 358 |
+
try:
|
| 359 |
+
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 (research bot)"})
|
| 360 |
+
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
| 361 |
+
content_type = resp.headers.get("Content-Type", "image/jpeg").split(";")[0].strip()
|
| 362 |
+
if not content_type.startswith("image/"):
|
| 363 |
+
return None
|
| 364 |
+
data = resp.read()
|
| 365 |
+
if len(data) < 1000:
|
| 366 |
+
return None
|
| 367 |
+
return base64.standard_b64encode(data).decode("utf-8"), content_type
|
| 368 |
+
except Exception:
|
| 369 |
+
return None
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def analyze_image_sample(
    question: str,
    subreddit: str | None = None,
    n_sample: int = 100,
    coding_scheme: dict | None = None,
) -> dict:
    """
    Sample image posts, fetch them, and ask the vision model a structured
    content-analysis question about each one.

    Uses Together AI through the OpenAI-compatible client. n_sample is
    uncapped — set as needed.

    Args:
        question: Question put to the model for every image.
        subreddit: Optional subreddit used to select the raw CSV files.
        n_sample: Target number of per-image results (model errors included).
        coding_scheme: Optional ``{label: description}`` mapping; when given
            the model is asked to reply as ``LABEL | justification``.

    Returns:
        A dict with label counts and percentages, per-image results and the
        path of the saved CSV, or a dict with an ``"error"`` key when no
        candidate image URLs are found. ``n_successfully_coded`` excludes rows
        whose label starts with ``ERROR``; those are reported as ``n_errors``.
        ``label_pct`` is relative to all per-image results.

    Raises:
        KeyError: If the ``TOGETHER_API_KEY`` environment variable is not set.
    """
    client = openai.OpenAI(
        api_key=os.environ["TOGETHER_API_KEY"],
        base_url=TOGETHER_BASE_URL,
    )
    candidates = _load_image_urls(subreddit=subreddit, n=n_sample * 5)

    if candidates.empty:
        return {
            "analysis": "analyze_image_sample",
            "error": "No direct image URLs found in raw CSVs for the given filters.",
            "subreddit_filter": subreddit,
        }

    if coding_scheme:
        scheme_text = "\n".join(f"- {k}: {v}" for k, v in coding_scheme.items())
        prompt = (
            f"{question}\n\nCoding scheme:\n{scheme_text}\n\n"
            "Reply with ONLY the label and a one-sentence justification, "
            "formatted as: LABEL | justification"
        )
    else:
        prompt = (
            f"{question}\n\n"
            "Reply with a short structured answer. "
            "If you cannot determine this from the image, reply: UNCLEAR | reason"
        )

    def _ascii_safe(s: str) -> str:
        # Replace non-ASCII characters so stored text is plain ASCII.
        return s.encode("ascii", errors="replace").decode("ascii")

    results = []
    attempted = 0

    for _, row in candidates.iterrows():
        if len(results) >= n_sample:
            break
        attempted += 1
        img = _fetch_image_b64(row["url"])
        if img is None:
            continue
        b64data, media_type = img

        try:
            response = client.chat.completions.create(
                model=VISION_MODEL,
                max_tokens=200,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:{media_type};base64,{b64data}"}},
                        {"type": "text", "text": prompt},
                    ],
                }],
            )
            answer = response.choices[0].message.content.strip()
        except Exception as e:
            # Keep going: the failure is recorded as an ERROR-labelled row.
            answer = f"ERROR | {e}"

        head, _, tail = answer.partition("|")
        # An empty label (e.g. the answer started with "|") falls back to UNCLEAR.
        label = head.strip().upper() or "UNCLEAR"
        justification = tail.strip()

        results.append({
            "subreddit": _ascii_safe(str(row.get("subreddit", ""))),
            "title": _ascii_safe(str(row.get("title", ""))),
            "url": row["url"],
            "label": label,
            "justification": _ascii_safe(justification),
            "score": row.get("score", None),
        })

    label_counts: dict[str, int] = {}
    for r in results:
        label_counts[r["label"]] = label_counts.get(r["label"], 0) + 1

    n_results = len(results)
    n_errors = sum(1 for r in results if r["label"].startswith("ERROR"))
    saved_csv = None
    if results:
        out_df = pd.DataFrame(results)
        stem = f"image_analysis_{subreddit or 'all'}"
        saved_csv = str(OUTPUTS_DIR / f"{stem}.csv")
        out_df.to_csv(saved_csv, index=False)

    return {
        "analysis": "analyze_image_sample",
        "question": question,
        "subreddit_filter": subreddit,
        "n_attempted": attempted,
        "n_successfully_coded": n_results - n_errors,
        "n_errors": n_errors,
        "label_counts": label_counts,
        "label_pct": {k: round(v / n_results * 100, 1) for k, v in label_counts.items()} if n_results else {},
        "per_image_results": results,
        "saved_csv": saved_csv,
        "caveats": [
            "Sample limited to direct-image domains (i.redd.it, i.imgur.com, i.redgifs.com) \u2014 galleries and videos excluded.",
            f"Vision model: {VISION_MODEL} via Together AI.",
            "Coded by a single model \u2014 validate with human reliability sample before reporting.",
        ],
    }
|
| 481 |
+
|
| 482 |
+
|
| 483 |
+
def export_reliability_sample(
    source_csv: str | None = None,
    n: int = 200,
    random_state: int = 42,
) -> dict:
    """
    Draw a stratified random sample of about n images from a completed
    image_analysis CSV for human coding. Saves a CSV with an empty
    human_label column.

    Args:
        source_csv: Path of an image_analysis CSV. Defaults to the most
            recently modified ``image_analysis_*.csv`` in ``OUTPUTS_DIR``.
        n: Target sample size. Every label contributes at least one row, so
            the result can exceed ``n`` when there are many rare labels.
        random_state: Seed used for all sampling and the final shuffle.

    Returns:
        A summary dict with the export path and label distribution, or a dict
        with an ``"error"`` key when there is nothing to sample from.
    """
    if source_csv is None:
        candidates = list(OUTPUTS_DIR.glob("image_analysis_*.csv"))
        if not candidates:
            return {"error": "No image_analysis CSV found in outputs/. Run analyze_image_sample first."}
        # Pick by modification time; an alphabetical sort is not "most recent".
        source_csv = str(max(candidates, key=lambda p: p.stat().st_mtime))

    df = pd.read_csv(source_csv)
    # astype(str) keeps the ERROR test working when labels were parsed as numbers.
    df = df[df["label"].notna() & ~df["label"].astype(str).str.startswith("ERROR")]
    if df.empty:
        return {
            "error": "No successfully coded rows found in the source CSV.",
            "source_csv": source_csv,
        }

    # Stratified sample by label. Groups are sampled in an explicit loop so
    # the "label" column is always kept, whatever the pandas version does
    # with grouping columns inside groupby.apply.
    total = len(df)
    strata = []
    for _, group in df.groupby("label"):
        quota = min(len(group), max(1, int(n * len(group) / total)))
        strata.append(group.sample(quota, random_state=random_state))
    sampled = pd.concat(strata)

    # Top up to exactly n if rounding left us short
    if len(sampled) < n and total >= n:
        remaining = df[~df.index.isin(sampled.index)]
        top_up = remaining.sample(n - len(sampled), random_state=random_state)
        sampled = pd.concat([sampled, top_up])

    sampled = sampled.sample(frac=1, random_state=random_state).reset_index(drop=True)  # shuffle
    sampled.insert(0, "image_id", range(1, len(sampled) + 1))
    sampled = sampled.rename(columns={"label": "model_label", "justification": "model_justification"})
    sampled["human_label"] = ""

    out_cols = ["image_id", "url", "title", "subreddit", "model_label", "model_justification", "human_label"]
    out_cols = [c for c in out_cols if c in sampled.columns]
    out_path = str(OUTPUTS_DIR / "reliability_sample.csv")
    sampled[out_cols].to_csv(out_path, index=False)

    return {
        "analysis": "export_reliability_sample",
        "source_csv": source_csv,
        "n_exported": len(sampled),
        "label_distribution": sampled["model_label"].value_counts().to_dict(),
        "saved_csv": out_path,
        "next_step": "Fill in the human_label column, then run compute_reliability.",
    }
|
| 531 |
+
|
| 532 |
+
|
| 533 |
+
def compute_reliability(human_csv_path: str | None = None) -> dict:
    """
    Compute Cohen's kappa between model_label and human_label columns
    in a completed reliability sample CSV.

    Args:
        human_csv_path: CSV containing ``model_label`` and ``human_label``
            columns. Defaults to ``OUTPUTS_DIR / "reliability_sample.csv"``.

    Returns:
        A report dict (also written to ``reliability_report.json``) holding
        kappa, percent agreement, an interpretation and per-label counts, or
        a dict with an ``"error"`` key when fewer than two rows are
        human-coded. ``cohens_kappa`` is ``None`` when kappa is undefined
        (NaN), e.g. when both raters used one and the same single label.
    """
    import math

    if human_csv_path is None:
        human_csv_path = str(OUTPUTS_DIR / "reliability_sample.csv")

    df = pd.read_csv(human_csv_path)
    df = df[df["human_label"].notna() & (df["human_label"].astype(str).str.strip() != "")]

    if len(df) < 2:
        return {"error": "Not enough human-coded rows. Fill in human_label column first."}

    # Normalise both codings the same way so case/whitespace never count as disagreement.
    model = df["model_label"].astype(str).str.strip().str.upper()
    human = df["human_label"].astype(str).str.strip().str.upper()

    kappa = float(cohen_kappa_score(human, model))
    pct_agreement = round((human == model).mean() * 100, 1)

    # Cover labels used by either rater; a label only the model produced is
    # still a disagreement worth reporting.
    per_label = {}
    for label in sorted(set(human) | set(model)):
        h = (human == label)
        m = (model == label)
        tp = int((h & m).sum())
        fp = int((~h & m).sum())
        fn = int((h & ~m).sum())
        per_label[label] = {"human_n": int(h.sum()), "model_n": int(m.sum()),
                            "exact_matches": tp, "false_positives": fp, "false_negatives": fn}

    if math.isnan(kappa):
        # NaN fails every >= comparison and would otherwise be labelled "poor".
        kappa_value = None
        interpretation = "undefined (kappa could not be computed for these labels)"
    else:
        kappa_value = round(kappa, 3)
        interpretation = (
            "excellent (\u03ba \u2265 0.80)" if kappa >= 0.80 else
            "substantial (\u03ba 0.60\u20130.79)" if kappa >= 0.60 else
            "moderate (\u03ba 0.40\u20130.59)" if kappa >= 0.40 else
            "fair (\u03ba 0.20\u20130.39)" if kappa >= 0.20 else
            "poor (\u03ba < 0.20)"
        )

    report = {
        "analysis": "compute_reliability",
        "n_coded": len(df),
        "cohens_kappa": kappa_value,
        "percent_agreement": pct_agreement,
        "interpretation": interpretation,
        "per_label": per_label,
    }

    out_path = str(OUTPUTS_DIR / "reliability_report.json")
    Path(out_path).write_text(json.dumps(report, indent=2))
    report["saved_json"] = out_path
    return report
|
| 582 |
+
|
| 583 |
+
|
| 584 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 585 |
+
# 5. Text pattern extraction
|
| 586 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 587 |
+
|
| 588 |
+
# Regexes for frequency/duration language, keyed by category. Patterns are
# compiled case-insensitively by their users; where a pattern has a capture
# group, group 1 is the numeric quantity.
FREQUENCY_PATTERNS = {
    "times_per_day": [
        r"\b(\d+)\s*(?:times?|x)\s*(?:a|per)\s*day\b",
        r"\b(\d+)\s*(?:times?|x)\s*daily\b",
    ],
    "times_per_week": [
        r"\b(\d+)\s*(?:times?|x)\s*(?:a|per)\s*week\b",
        r"\b(\d+)\s*(?:times?|x)\s*weekly\b",
    ],
    "hours_per_session": [
        # Any "<number> hours/hrs". A former second pattern ("<number> hours
        # session/straight/...") matched a strict subset of this one, so it
        # added no documents and only made each value be extracted twice.
        r"\b(\d+(?:\.\d+)?)\s*(?:hours?|hrs?)\b",
    ],
    "all_day": [
        r"\ball\s*day\b", r"\ball\s*night\b", r"\ball\s*weekend\b", r"\bfor\s*hours\b",
    ],
    "daily_habit": [
        r"\bevery\s*day\b", r"\bevery\s*night\b", r"\bdaily\b", r"\bmost\s*days?\b",
    ],
    "streak_days": [
        r"\b(\d+)\s*(?:days?\s*(?:in\s*a\s*row|straight|streak|running))\b",
        # Hyphen, en dash or em dash between the number and "day".
        r"\b(\d+)\s*[-\u2013\u2014]?\s*day\s*(?:streak|binge)\b",
    ],
}
|
| 612 |
+
|
| 613 |
+
# Regexes for dominance/subordination framing, keyed by category. Patterns
# are compiled case-insensitively by their users.
DOMINANCE_PATTERNS = {
    "dominant_language": [
        r"\bdominat(?:e|es|ed|ing|ion|rix)\b", r"\bfem(?:dom|domme)\b",
        r"\bmistress\b", r"\bgoddess\b", r"\bqueen\b", r"\bowner\b",
        r"\balpha\b", r"\bin\s*control\b", r"\bboss\b",
    ],
    "subordinate_language": [
        r"\bsubmiss(?:ive|ion)\b", r"\bsub\b", r"\bobedient\b", r"\bslave\b",
        r"\bpet\b", r"\bslut\b", r"\bwhore\b", r"\bused\b",
        # worship / worshiped / worshipped / worshiping / worshipping. The old
        # "worshiped?" form matched the non-word "worshipe" but not "worship".
        r"\bcontrolled\b", r"\bworship(?:ed|ped|ing|ping)?\b",
    ],
    "neutral_object": [
        r"\bperfect\b", r"\bbeautiful\b", r"\bhot\b", r"\bsexy\b", r"\bstunning\b",
    ],
}
|
| 628 |
+
|
| 629 |
+
|
| 630 |
+
def _compile(patterns: list[str]) -> re.Pattern:
    """Join the patterns into one alternation and compile it case-insensitively."""
    alternation = "|".join(patterns)
    return re.compile(alternation, flags=re.IGNORECASE)
|
| 632 |
+
|
| 633 |
+
|
| 634 |
+
def extract_frequency_patterns(
    dataset: str = "comments",
    text_col: str = "body",
    subreddit: str | None = None,
    n_examples: int = 5,
    sample_size: int = 5_000_000,
) -> dict:
    """Mine text for frequency and duration language.

    Args:
        dataset: Name of the dataset to scan.
        text_col: Column holding the text to search.
        subreddit: Optional exact-match subreddit filter.
        n_examples: Maximum number of example texts returned per category.
        sample_size: Maximum number of rows read from the dataset.

    Returns:
        A dict whose ``"patterns"`` entry maps each ``FREQUENCY_PATTERNS``
        category to its document count, share of documents, up to 50
        extracted numeric values (ascending), their mean and example texts.

    Documents are matched case-insensitively, the same way the numeric values
    are extracted. Each stretch of text contributes at most one value per
    category, even when several patterns of that category would match it.
    """
    cols = [text_col] + (["subreddit"] if subreddit else [])
    df = _scanner(
        dataset, columns=cols,
        filters={"subreddit": subreddit} if subreddit else None,
    ).head(sample_size).to_pandas()

    text = df[text_col].fillna("")
    total_docs = len(text)
    results = {}

    for category, pats in FREQUENCY_PATTERNS.items():
        regex = _compile(pats)
        # Pass the compiled regex itself: its pattern string alone would lose IGNORECASE.
        matches_mask = text.str.contains(regex, na=False)
        hit_texts = text[matches_mask]

        values = []
        for doc in hit_texts:
            # One scan with the combined regex yields non-overlapping matches,
            # so overlapping patterns cannot count the same number twice.
            for m in regex.finditer(doc):
                captured = next((g for g in m.groups() if g is not None), None)
                if captured is None:
                    continue
                try:
                    values.append(float(captured))
                except ValueError:
                    pass

        raw_examples = (
            hit_texts.sample(min(n_examples, len(hit_texts)), random_state=42).tolist()
            if len(hit_texts) > 0 else []
        )
        results[category] = {
            "count": int(matches_mask.sum()),
            # The mean of an empty mask is NaN; report 0.0 for an empty sample.
            "pct_of_docs": round(matches_mask.mean() * 100, 3) if total_docs else 0.0,
            "numeric_values": sorted(values)[:50] if values else [],
            "mean_value": round(sum(values) / len(values), 2) if values else None,
            "examples": [t.encode("ascii", errors="replace").decode("ascii") for t in raw_examples],
        }

    return {
        "analysis": "extract_frequency_patterns",
        "dataset": dataset,
        "text_col": text_col,
        "subreddit_filter": subreddit,
        "total_docs_sampled": total_docs,
        "patterns": results,
    }
|
| 683 |
+
|
| 684 |
+
|
| 685 |
+
def extract_dominance_patterns(
    dataset: str = "comments",
    text_col: str = "body",
    subreddit: str | None = None,
    sample_size: int = 5_000_000,
    n_examples: int = 5,
) -> dict:
    """Count dominant, subordinate, and neutral language in text.

    Args:
        dataset: Name of the dataset to scan.
        text_col: Column holding the text to search.
        subreddit: Optional exact-match subreddit filter.
        sample_size: Maximum number of rows read from the dataset.
        n_examples: Maximum number of example texts returned per category
            (same meaning as in ``extract_frequency_patterns``).

    Returns:
        A dict with per-category document counts, shares and examples, plus a
        ``"dominance_ratio"`` comparing dominant and subordinate counts.
    """
    cols = [text_col] + (["subreddit"] if subreddit else [])
    df = _scanner(
        dataset, columns=cols,
        filters={"subreddit": subreddit} if subreddit else None,
    ).head(sample_size).to_pandas()

    text = df[text_col].fillna("")
    total_docs = len(text)
    results = {}

    for category, pats in DOMINANCE_PATTERNS.items():
        regex = _compile(pats)
        mask = text.str.contains(regex, na=False)
        hits = text[mask]
        raw_examples = hits.sample(min(n_examples, len(hits)), random_state=42).tolist() if len(hits) > 0 else []
        results[category] = {
            "count": int(mask.sum()),
            # The mean of an empty mask is NaN; report 0.0 for an empty sample.
            "pct_of_docs": round(mask.mean() * 100, 3) if total_docs else 0.0,
            "examples": [t.encode("ascii", errors="replace").decode("ascii") for t in raw_examples],
        }

    dom = results.get("dominant_language", {}).get("count", 0)
    sub = results.get("subordinate_language", {}).get("count", 0)
    total = dom + sub
    ratio = {
        "dominant_pct": round(dom / total * 100, 1) if total else None,
        "subordinate_pct": round(sub / total * 100, 1) if total else None,
        "interpretation": (
            "More subordinate language" if sub > dom else
            "More dominant language" if dom > sub else
            "Roughly balanced"
        ) if total else "No data",
    }

    return {
        "analysis": "extract_dominance_patterns",
        "dataset": dataset,
        "text_col": text_col,
        "subreddit_filter": subreddit,
        "total_docs_sampled": total_docs,
        "categories": results,
        "dominance_ratio": ratio,
        "caveat": (
            "This analysis counts language patterns in text, not visual image content. "
            "It reflects how women are described in text, not how they appear in images. "
            "For image-based analysis use analyze_image_sample."
        ),
    }
|
| 740 |
+
|
| 741 |
+
|
| 742 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 743 |
+
# 6. Analysis execution
|
| 744 |
+
# ──────────────────────────────────────────────────────────────────────────────
|
| 745 |
+
|
| 746 |
+
def _ts_to_date(series: pd.Series) -> pd.Series:
    """Convert Unix epoch seconds into timezone-aware UTC timestamps."""
    as_utc = pd.to_datetime(series, unit="s", utc=True)
    return as_utc
|
| 748 |
+
|
| 749 |
+
|
| 750 |
+
def _normalize_filters(
|
| 751 |
+
filters: dict | None = None,
|
| 752 |
+
subreddit: str | None = None,
|
| 753 |
+
date_from: str | None = None,
|
| 754 |
+
date_to: str | None = None,
|
| 755 |
+
min_score: float | None = None,
|
| 756 |
+
) -> dict:
|
| 757 |
+
merged = dict(filters or {})
|
| 758 |
+
if subreddit:
|
| 759 |
+
merged["subreddit"] = subreddit
|
| 760 |
+
if date_from:
|
| 761 |
+
merged["date_from"] = date_from
|
| 762 |
+
if date_to:
|
| 763 |
+
merged["date_to"] = date_to
|
| 764 |
+
if min_score is not None:
|
| 765 |
+
merged["min_score"] = min_score
|
| 766 |
+
return merged
|
| 767 |
+
|
| 768 |
+
|
| 769 |
+
def _apply_filters(df: pd.DataFrame, filters: dict | None = None) -> pd.DataFrame:
|
| 770 |
+
if not filters:
|
| 771 |
+
return df
|
| 772 |
+
filtered = df
|
| 773 |
+
if filters.get("subreddit") and "subreddit" in filtered.columns:
|
| 774 |
+
filtered = filtered[filtered["subreddit"] == filters["subreddit"]]
|
| 775 |
+
if filters.get("author") and "author" in filtered.columns:
|
| 776 |
+
filtered = filtered[filtered["author"] == filters["author"]]
|
| 777 |
+
if filters.get("min_score") is not None and "score" in filtered.columns:
|
| 778 |
+
filtered["score"] = pd.to_numeric(filtered["score"], errors="coerce")
|
| 779 |
+
filtered = filtered[filtered["score"] >= filters["min_score"]]
|
| 780 |
+
if ("date_from" in filters or "date_to" in filters) and "created_utc" in filtered.columns:
|
| 781 |
+
created = _ts_to_date(filtered["created_utc"])
|
| 782 |
+
if filters.get("date_from"):
|
| 783 |
+
filtered = filtered[created >= pd.Timestamp(filters["date_from"], tz="UTC")]
|
| 784 |
+
created = _ts_to_date(filtered["created_utc"])
|
| 785 |
+
if filters.get("date_to"):
|
| 786 |
+
filtered = filtered[created <= pd.Timestamp(filters["date_to"], tz="UTC")]
|
| 787 |
+
return filtered
|
| 788 |
+
|
| 789 |
+
|
| 790 |
+
def _save_csv(df: pd.DataFrame, stem: str) -> str:
    """Write ``df`` without its index to ``OUTPUTS_DIR/<stem>.csv`` and return that path."""
    destination = OUTPUTS_DIR.joinpath(f"{stem}.csv")
    df.to_csv(destination, index=False)
    return str(destination)
|
| 794 |
+
|
| 795 |
+
|
| 796 |
+
def _save_fig(fig, stem: str) -> str:
    """Render ``fig`` to ``OUTPUTS_DIR/<stem>.png`` at 2x scale and return that path."""
    destination = str(OUTPUTS_DIR.joinpath(f"{stem}.png"))
    pio.write_image(fig, destination, scale=2)
    return destination
|
| 800 |
+
|
| 801 |
+
|
| 802 |
+
def count_by_group(dataset: str, group_col: str, top_n: int = 30, filters: dict | None = None) -> dict:
    """Count rows grouped by a column. Returns sorted table + bar chart.

    Args:
        dataset: Name of the dataset to load.
        group_col: Column to group by (missing values form their own group).
        top_n: Number of largest groups to keep.
        filters: Optional filter dict understood by ``_apply_filters``.

    Returns:
        A dict with the count table, the number of rows after filtering, the
        saved CSV path, the saved PNG path (``None`` when image export fails)
        and the figure as Plotly JSON.
    """
    # Filter columns are optional (``_apply_filters`` skips absent ones), so
    # only request those the dataset actually has.
    available = set(_schema_names(dataset))
    filter_cols = [
        c for c in ("subreddit", "author", "score", "created_utc")
        if c != group_col and c in available
    ]
    df = _load(dataset, columns=[group_col] + filter_cols)
    df = _apply_filters(df, filters)
    counts = (
        df.groupby(group_col, dropna=False)
        .size().reset_index(name="count")
        .sort_values("count", ascending=False).head(top_n)
    )
    stem = f"count_by_{group_col}_{dataset}"
    saved_csv = _save_csv(counts, stem)
    fig = px.bar(
        counts.sort_values("count"), x="count", y=group_col, orientation="h",
        title=f"Count by {group_col}", labels={"count": "Count", group_col: group_col},
    )
    fig.update_layout(yaxis={"categoryorder": "total ascending"})
    try:
        saved_png = _save_fig(fig, stem)
    except Exception:
        # Static image export is best effort; the Plotly JSON is still returned.
        saved_png = None
    return {
        "analysis": "count_by_group", "dataset": dataset, "group_col": group_col,
        "filters": filters or {}, "total_rows": len(df),
        "table": counts.to_dict(orient="records"),
        "saved_csv": saved_csv, "saved_png": saved_png, "plotly_json": fig.to_json(),
    }
|
| 829 |
+
|
| 830 |
+
|
| 831 |
+
def trend_over_time(
    dataset: str, freq: str = "M", group_col: str | None = None,
    top_groups: int = 8, filters: dict | None = None,
) -> dict:
    """Count posts/comments over time, optionally broken out by a grouping column.

    Args:
        dataset: Name of the dataset to load; it must have ``created_utc``.
        freq: Pandas period frequency used to bucket timestamps.
        group_col: Optional column to split the series by.
        top_groups: Number of most frequent groups kept when ``group_col`` is set.
        filters: Optional filter dict understood by ``_apply_filters``.

    Returns:
        A dict with the per-period count table, the saved CSV path, the saved
        PNG path (``None`` when image export fails) and the figure as Plotly JSON.
    """
    # Filter columns are optional (``_apply_filters`` skips absent ones), so
    # only request those the dataset actually has.
    available = set(_schema_names(dataset))
    optional_cols = [c for c in ("subreddit", "author", "score") if c in available]
    cols = ["created_utc"] + ([group_col] if group_col else []) + optional_cols
    cols = list(dict.fromkeys(cols))
    df = _load(dataset, columns=cols)
    df = _apply_filters(df, filters)
    # assign() returns a new frame, so a filtered slice is never written into.
    df = df.assign(period=_ts_to_date(df["created_utc"]).dt.to_period(freq).astype(str))

    if group_col:
        top = df[group_col].value_counts().head(top_groups).index.tolist()
        df = df[df[group_col].isin(top)]
        counts = (
            df.groupby(["period", group_col]).size()
            .reset_index(name="count").sort_values("period")
        )
        fig = px.line(counts, x="period", y="count", color=group_col,
                      title=f"Activity over time by {group_col}")
    else:
        counts = (
            df.groupby("period").size()
            .reset_index(name="count").sort_values("period")
        )
        fig = px.line(counts, x="period", y="count", title="Activity over time")

    stem = f"trend_{dataset}_{group_col or 'all'}_{freq}"
    saved_csv = _save_csv(counts, stem)
    try:
        saved_png = _save_fig(fig, stem)
    except Exception:
        # Static image export is best effort; the Plotly JSON is still returned.
        saved_png = None
    return {
        "analysis": "trend_over_time", "dataset": dataset, "freq": freq,
        "group_col": group_col, "filters": filters or {},
        "table": counts.to_dict(orient="records"),
        "saved_csv": saved_csv, "saved_png": saved_png, "plotly_json": fig.to_json(),
    }
|
| 870 |
+
|
| 871 |
+
|
| 872 |
+
def summary_stats(
    dataset: str, value_col: str, group_col: str | None = None,
    top_n: int = 30, filters: dict | None = None,
) -> dict:
    """Summarise a numeric column, either overall or per group.

    With ``group_col`` the result holds count/mean/median/std/min/max per
    group, ordered by descending mean and cut to ``top_n`` rows, and the chart
    is a bar chart of the means. Without it the result is the output of
    ``Series.describe()`` and the chart is a histogram. Values that cannot be
    parsed as numbers are coerced to NaN and reported through ``n_missing``.

    Returns:
        A dict with the request parameters, ``n_total``, ``n_missing``,
        ``table`` (list of records), ``saved_csv``, ``saved_png`` and
        ``plotly_json``; the last two are ``None`` if charting fails.
    """
    wanted = [value_col]
    if group_col:
        wanted.append(group_col)
    wanted += ["subreddit", "author", "score", "created_utc"]
    df = _load(dataset, columns=list(dict.fromkeys(wanted)))
    df = _apply_filters(df, filters)
    df[value_col] = pd.to_numeric(df[value_col], errors="coerce")

    if group_col:
        per_group = df.groupby(group_col)[value_col].agg(
            ["count", "mean", "median", "std", "min", "max"]
        )
        stats = per_group.reset_index()
        stats = stats.sort_values("mean", ascending=False).head(top_n).round(2)
    else:
        stats = df[value_col].describe().round(2).reset_index()
        stats.columns = ["stat", "value"]

    stem = f"stats_{value_col}_{group_col or 'all'}_{dataset}"
    saved_csv = _save_csv(stats, stem)

    try:
        if group_col:
            fig = px.bar(
                stats, x=group_col, y="mean", error_y="std",
                title=f"{value_col} by {group_col}",
                labels={"mean": f"Mean {value_col}"},
            )
        else:
            fig = px.histogram(
                df[value_col].dropna(), nbins=50,
                title=f"Distribution of {value_col}",
                labels={"value": value_col},
            )
        saved_png = _save_fig(fig, stem)
        plotly_json = fig.to_json()
    except Exception:
        # Charting is best-effort: a failure anywhere drops both chart outputs.
        saved_png = None
        plotly_json = None

    missing = int(df[value_col].isna().sum())
    return {
        "analysis": "summary_stats", "dataset": dataset, "value_col": value_col,
        "group_col": group_col, "filters": filters or {},
        "n_total": len(df), "n_missing": missing,
        "table": stats.to_dict(orient="records"),
        "saved_csv": saved_csv, "saved_png": saved_png, "plotly_json": plotly_json,
    }
|
| 917 |
+
|
| 918 |
+
|
| 919 |
+
def top_posts(
    dataset: str = "posts", n: int = 20,
    subreddit: str | None = None, text_col: str = "title",
    filters: dict | None = None,
) -> dict:
    """Return the highest-scoring posts, optionally filtered to a subreddit.

    Args:
        dataset: Dataset name handed to ``_load``.
        n: Number of rows to return, ranked by ``score``.
        subreddit: Convenience filter merged into ``filters`` by
            ``_normalize_filters``.
        text_col: Text column to include in the output; skipped if falsy.
        filters: Optional filters forwarded to ``_apply_filters``.

    Returns:
        A dict with the request parameters, ``table`` (list of records with a
        ``date`` column in ``YYYY-MM-DD`` form replacing ``created_utc``) and
        ``saved_csv``.
    """
    filters = _normalize_filters(filters=filters, subreddit=subreddit)
    # De-duplicate while keeping order: if text_col names one of the fixed
    # columns the list would otherwise request and select it twice.
    candidates = ["subreddit", "author", text_col, "score", "created_utc"]
    cols = list(dict.fromkeys(c for c in candidates if c))
    df = _load(dataset, columns=cols)
    df = _apply_filters(df, filters)
    top = df.nlargest(n, "score")[cols].copy()
    top["date"] = _ts_to_date(top["created_utc"]).dt.strftime("%Y-%m-%d")
    top = top.drop(columns=["created_utc"])
    stem = f"top_posts_{subreddit or 'all'}_{dataset}"
    saved = _save_csv(top, stem)
    return {
        "analysis": "top_posts", "dataset": dataset,
        "subreddit_filter": subreddit, "filters": filters, "n": n,
        "table": top.fillna("").to_dict(orient="records"), "saved_csv": saved,
    }
|
| 939 |
+
|
| 940 |
+
|
| 941 |
+
def text_search(
    dataset: str, query: str, text_col: str = "body",
    n: int = 20, case_sensitive: bool = False,
    subreddit: str | None = None, filters: dict | None = None,
) -> dict:
    """Search for a literal string in a text column.

    The match is a plain substring test (``regex=False``); the ``n``
    highest-scoring matching rows are returned.

    Args:
        dataset: Dataset name handed to ``_load``.
        query: Literal text to look for.
        text_col: Column to search.
        n: Maximum number of matching rows to return.
        case_sensitive: Whether the substring test respects case.
        subreddit: Convenience filter merged into ``filters`` by
            ``_normalize_filters``.
        filters: Optional filters forwarded to ``_apply_filters``.

    Returns:
        A dict with the request parameters, ``n_matches`` (all matches),
        ``n_returned``, ``table`` (list of records) and ``saved_csv``.
    """
    import re  # local import: only needed to build a safe file stem

    filters = _normalize_filters(filters=filters, subreddit=subreddit)
    # De-duplicate while keeping order in case text_col is one of the fixed columns.
    candidates = ["subreddit", "author", text_col, "score", "created_utc"]
    cols = list(dict.fromkeys(c for c in candidates if c))
    df = _load(dataset, columns=cols)
    df = _apply_filters(df, filters)
    mask = df[text_col].fillna("").str.contains(query, case=case_sensitive, regex=False)
    hits = df[mask].nlargest(n, "score").copy()
    hits["date"] = _ts_to_date(hits["created_utc"]).dt.strftime("%Y-%m-%d")
    hits = hits.drop(columns=["created_utc"])

    # The query is free text, so it may contain path separators or other
    # characters that are unsafe in a file name. Spaces map to "_" as before;
    # every other character outside [A-Za-z0-9_-] is replaced one-for-one, so
    # stems for plain alphanumeric queries are unchanged.
    slug = re.sub(r"[^A-Za-z0-9_-]", "_", query[:30].replace(" ", "_"))
    stem = f"search_{slug}_{dataset}"
    saved = _save_csv(hits, stem)
    return {
        "analysis": "text_search", "dataset": dataset, "query": query,
        "text_col": text_col, "filters": filters,
        "n_matches": int(mask.sum()), "n_returned": len(hits),
        "table": hits.fillna("").to_dict(orient="records"), "saved_csv": saved,
    }
|
| 963 |
+
|
| 964 |
+
|
| 965 |
+
def word_freq(
    dataset: str = "corpus_clean", text_col: str = "text_cleaned",
    top_n: int = 50, subreddit: str | None = None,
    min_length: int = 4, filters: dict | None = None,
) -> dict:
    """Count word frequencies in a text column.

    Text is lower-cased, every character other than ``a-z`` and whitespace is
    replaced by a space, and the result is split on whitespace. Words shorter
    than ``min_length`` and words in the built-in stop list are discarded.

    Args:
        dataset: Dataset name handed to ``_load``.
        text_col: Column holding the text to tokenise.
        top_n: Number of most frequent words to return.
        subreddit: Convenience filter merged into ``filters`` by
            ``_normalize_filters``.
        min_length: Minimum word length to keep.
        filters: Optional filters forwarded to ``_apply_filters``.

    Returns:
        A dict with the request parameters, ``total_docs`` (rows after
        filtering), ``table`` (word/count records), ``saved_csv``,
        ``saved_png`` (``None`` if the image export failed) and
        ``plotly_json`` (bar chart of at most the top 30 words).
    """
    filters = _normalize_filters(filters=filters, subreddit=subreddit)
    # The subreddit column must be loaded whenever it is filtered on, whether
    # the filter came from the `subreddit` argument or from `filters` itself.
    needs_subreddit = bool(subreddit) or bool(filters and "subreddit" in filters)
    cols = list(dict.fromkeys(
        [text_col] + (["subreddit"] if needs_subreddit else []) + ["author", "score", "created_utc"]
    ))
    df = _load(dataset, columns=cols)
    df = _apply_filters(df, filters)

    stop = {
        "the","and","for","that","with","this","you","are","was","not",
        "have","from","they","will","what","been","when","your","more",
        "just","about","like","there","were","would","into","than","then",
        "some","also","very","only","over","back","can","out","all","but",
        "one","had","has","its","which","their","time","our","who","may",
        "after","other","these","those","such","each","him","her","his",
        "she","how","did","being","now","way","any","too","much","even",
        "get","got","could","should","make","made","said","still",
        "here","because","really","know","think","going","reddit","post",
        "comment","deleted","removed",
    }

    words = (
        df[text_col].fillna("").str.lower()
        .str.replace(r"[^a-z\s]", " ", regex=True).str.split().explode()
    )
    # Empty documents explode to NaN; the length comparison drops them too.
    words = words[words.str.len() >= min_length]
    words = words[~words.isin(stop)]
    counts = words.value_counts().head(top_n).reset_index()
    counts.columns = ["word", "count"]
    stem = f"wordfreq_{text_col}_{subreddit or 'all'}_{dataset}"
    saved_csv = _save_csv(counts, stem)
    fig = px.bar(
        counts.head(30).sort_values("count"), x="count", y="word", orientation="h",
        title="Top words by frequency", labels={"count": "Count", "word": "Word"},
    )
    fig.update_layout(yaxis={"categoryorder": "total ascending"})
    try:
        saved_png = _save_fig(fig, stem)
    except Exception:
        # Image export is best-effort; the CSV and Plotly JSON are still returned.
        saved_png = None
    return {
        "analysis": "word_freq", "dataset": dataset, "text_col": text_col,
        "subreddit_filter": subreddit, "filters": filters, "total_docs": len(df),
        "table": counts.to_dict(orient="records"),
        "saved_csv": saved_csv, "saved_png": saved_png, "plotly_json": fig.to_json(),
    }
|
| 1014 |
+
|
| 1015 |
+
|
| 1016 |
+
def compare_groups(
    dataset: str, group_col: str, value_col: str,
    groups: list[str] | None = None, filters: dict | None = None,
) -> dict:
    """Compare a numeric value across groups with descriptive stats.

    Args:
        dataset: Dataset name handed to ``_load``.
        group_col: Column defining the groups.
        value_col: Column to summarise; non-numeric values are coerced to NaN.
        groups: Optional list restricting the comparison to these group values.
        filters: Optional filters forwarded to ``_apply_filters``.

    Returns:
        A dict with the request parameters, ``groups_compared`` (groups in
        descending order of median), ``table`` (count/mean/median/std per
        group), ``saved_csv``, ``saved_png`` (``None`` if the image export
        failed) and ``plotly_json``.
    """
    cols = list(dict.fromkeys([group_col, value_col, "subreddit", "author", "score", "created_utc"]))
    df = _load(dataset, columns=cols)
    df = _apply_filters(df, filters)
    df[value_col] = pd.to_numeric(df[value_col], errors="coerce")
    if groups:
        df = df[df[group_col].isin(groups)]
    stats = (
        df.groupby(group_col)[value_col]
        .agg(count="count", mean="mean", median="median", std="std")
        .reset_index().sort_values("median", ascending=False).round(3)
    )
    stem = f"compare_{group_col}_{value_col}_{dataset}"
    saved_csv = _save_csv(stats, stem)
    # Title uses a real plus-minus sign; the previous text was a mis-decoded "±".
    fig = px.bar(stats, x=group_col, y="median", error_y="std",
                 title=f"{value_col} by {group_col} (median ± std)",
                 labels={"median": f"Median {value_col}"})
    try:
        saved_png = _save_fig(fig, stem)
    except Exception:
        # Image export is best-effort; the CSV and Plotly JSON are still returned.
        saved_png = None
    return {
        "analysis": "compare_groups", "dataset": dataset,
        "group_col": group_col, "value_col": value_col,
        "filters": filters or {}, "groups_compared": stats[group_col].tolist(),
        "table": stats.to_dict(orient="records"),
        "saved_csv": saved_csv, "saved_png": saved_png, "plotly_json": fig.to_json(),
    }
|
| 1048 |
+
|
| 1049 |
+
|
| 1050 |
+
# ──────────────────────────────────────────────────────────────────────────────
# 7. Core agent loop
# ──────────────────────────────────────────────────────────────────────────────
|
| 1053 |
+
|
| 1054 |
+
# Model identifier sent with every Messages API request.
MODEL = "claude-opus-4-6"

# Tool definitions advertised to the model. Each entry's "name" must have a
# matching key in TOOL_FN_MAP, and its "input_schema" properties are passed to
# the Python function of the same name as keyword arguments.
TOOLS = [
    {
        "name": "list_datasets",
        "description": (
            "List cached dataset metadata: paths, row counts, columns, subreddits, and date ranges. "
            "Use this to inspect the available data without loading full tables."
        ),
        "input_schema": {
            "type": "object",
            "properties": {"refresh": {"type": "boolean", "default": False,
                                       "description": "Recompute metadata from source parquets instead of using the cache."}},
            "required": [],
        },
    },
    {
        "name": "sample_rows",
        "description": "Return a small deterministic preview of rows from a dataset, optionally filtered and column-limited.",
        "input_schema": {
            "type": "object",
            "properties": {
                "dataset": {"type": "string", "enum": ["posts", "comments", "corpus_clean", "titles"]},
                "n": {"type": "integer", "default": 5},
                "filters": {"type": "object", "description": "Optional equality filters, e.g. {\"subreddit\": \"GOONED\"}"},
                "columns": {"type": "array", "items": {"type": "string"}, "description": "Optional subset of columns to preview."},
            },
            "required": ["dataset"],
        },
    },
    {
        "name": "count_by_group",
        "description": "Count rows in a dataset grouped by one column, with optional shared filters.",
        "input_schema": {
            "type": "object",
            "properties": {
                "dataset": {"type": "string", "enum": ["posts", "comments", "corpus_clean", "titles"]},
                "group_col": {"type": "string"},
                "top_n": {"type": "integer", "default": 30},
                "filters": {"type": "object"},
            },
            "required": ["dataset", "group_col"],
        },
    },
    {
        "name": "trend_over_time",
        "description": "Count rows over time, optionally split by one grouping column, with optional shared filters.",
        "input_schema": {
            "type": "object",
            "properties": {
                "dataset": {"type": "string", "enum": ["posts", "comments", "corpus_clean", "titles"]},
                "freq": {"type": "string", "enum": ["D", "W", "M", "Q", "Y"], "default": "M"},
                "group_col": {"type": "string"},
                "top_groups": {"type": "integer", "default": 8},
                "filters": {"type": "object"},
            },
            "required": ["dataset"],
        },
    },
    {
        "name": "summary_stats",
        "description": "Descriptive statistics for a numeric column, optionally grouped and filtered.",
        "input_schema": {
            "type": "object",
            "properties": {
                "dataset": {"type": "string", "enum": ["posts", "comments", "corpus_clean", "titles"]},
                "value_col": {"type": "string"},
                "group_col": {"type": "string"},
                "top_n": {"type": "integer", "default": 30},
                "filters": {"type": "object"},
            },
            "required": ["dataset", "value_col"],
        },
    },
    {
        "name": "top_posts",
        "description": "Return the highest-scoring posts, optionally filtered by subreddit or shared filters.",
        "input_schema": {
            "type": "object",
            "properties": {
                "dataset": {"type": "string", "enum": ["posts", "titles"], "default": "posts"},
                "n": {"type": "integer", "default": 20},
                "subreddit": {"type": "string"},
                "text_col": {"type": "string", "default": "title"},
                "filters": {"type": "object"},
            },
            "required": [],
        },
    },
    {
        "name": "text_search",
        "description": "Search for a phrase in a text column and return top matching rows.",
        "input_schema": {
            "type": "object",
            "properties": {
                "dataset": {"type": "string", "enum": ["posts", "comments", "corpus_clean", "titles"]},
                "query": {"type": "string"},
                "text_col": {"type": "string", "default": "body"},
                "n": {"type": "integer", "default": 20},
                # text_search() already accepts this keyword; expose it so the
                # model can request an exact-case match.
                "case_sensitive": {"type": "boolean", "default": False},
                "subreddit": {"type": "string"},
                "filters": {"type": "object"},
            },
            "required": ["dataset", "query"],
        },
    },
    {
        "name": "word_freq",
        "description": "Count word frequencies in a text column with optional shared filters.",
        "input_schema": {
            "type": "object",
            "properties": {
                "dataset": {"type": "string", "enum": ["posts", "comments", "corpus_clean", "titles"], "default": "corpus_clean"},
                "text_col": {"type": "string", "default": "text_cleaned"},
                "top_n": {"type": "integer", "default": 50},
                "subreddit": {"type": "string"},
                "min_length": {"type": "integer", "default": 4},
                "filters": {"type": "object"},
            },
            "required": [],
        },
    },
    {
        "name": "compare_groups",
        "description": "Compare one numeric column across groups with optional shared filters.",
        "input_schema": {
            "type": "object",
            "properties": {
                "dataset": {"type": "string", "enum": ["posts", "comments", "corpus_clean", "titles"]},
                "group_col": {"type": "string"},
                "value_col": {"type": "string"},
                "groups": {"type": "array", "items": {"type": "string"}},
                "filters": {"type": "object"},
            },
            "required": ["dataset", "group_col", "value_col"],
        },
    },
    {
        "name": "extract_frequency_patterns",
        "description": "Mine text for frequency and duration language across the full dataset.",
        "input_schema": {
            "type": "object",
            "properties": {
                "dataset": {"type": "string", "enum": ["posts", "comments", "corpus_clean"], "default": "comments"},
                "text_col": {"type": "string", "default": "body"},
                "subreddit": {"type": "string"},
                "n_examples": {"type": "integer", "default": 5},
                "sample_size": {"type": "integer", "default": 5000000},
            },
            "required": [],
        },
    },
    {
        "name": "extract_dominance_patterns",
        "description": "Count dominant vs subordinate language in text, not images.",
        "input_schema": {
            "type": "object",
            "properties": {
                "dataset": {"type": "string", "enum": ["posts", "comments", "corpus_clean"], "default": "comments"},
                "text_col": {"type": "string", "default": "body"},
                "subreddit": {"type": "string"},
                "sample_size": {"type": "integer", "default": 5000000},
            },
            "required": [],
        },
    },
    {
        "name": "analyze_image_sample",
        "description": "Run vision coding on a sample of image posts using Qwen2-VL via Together AI (no content filters). Always provide a coding_scheme for research use.",
        "input_schema": {
            "type": "object",
            "properties": {
                "question": {"type": "string"},
                "subreddit": {"type": "string"},
                "n_sample": {"type": "integer", "default": 100, "description": "Number of images to code. No hard cap — set to 500+ for large analyses."},
                "coding_scheme": {"type": "object", "description": "Dict of {label: definition}. Always provide this for research questions."},
            },
            "required": ["question"],
        },
    },
    {
        "name": "export_reliability_sample",
        "description": "Export a stratified random sample of coded images for human validation. Run after analyze_image_sample.",
        "input_schema": {
            "type": "object",
            "properties": {
                "source_csv": {"type": "string", "description": "Path to image_analysis CSV. Defaults to most recent."},
                "n": {"type": "integer", "default": 200},
                "random_state": {"type": "integer", "default": 42},
            },
            "required": [],
        },
    },
    {
        "name": "compute_reliability",
        "description": "Compute Cohen's kappa between model and human codes after the human_label column has been filled in.",
        "input_schema": {
            "type": "object",
            "properties": {
                "human_csv_path": {"type": "string", "description": "Path to completed reliability_sample.csv. Defaults to outputs/reliability_sample.csv."},
            },
            "required": [],
        },
    },
]
|
| 1258 |
+
|
| 1259 |
+
# Dispatch table from tool name (as advertised in TOOLS) to a callable that
# takes the model-supplied argument dict and forwards it as keyword arguments.
# Each entry is a lambda, so the target function is looked up by name when the
# tool is invoked rather than when this dict is built.
TOOL_FN_MAP = {
    "list_datasets": lambda args: list_datasets(**args),
    "sample_rows": lambda args: sample_rows(**args),
    "count_by_group": lambda args: count_by_group(**args),
    "trend_over_time": lambda args: trend_over_time(**args),
    "summary_stats": lambda args: summary_stats(**args),
    "top_posts": lambda args: top_posts(**args),
    "text_search": lambda args: text_search(**args),
    "word_freq": lambda args: word_freq(**args),
    "compare_groups": lambda args: compare_groups(**args),
    "extract_frequency_patterns": lambda args: extract_frequency_patterns(**args),
    "extract_dominance_patterns": lambda args: extract_dominance_patterns(**args),
    "analyze_image_sample": lambda args: analyze_image_sample(**args),
    "export_reliability_sample": lambda args: export_reliability_sample(**args),
    "compute_reliability": lambda args: compute_reliability(**args),
}
|
| 1275 |
+
|
| 1276 |
+
|
| 1277 |
+
def _safe_str(obj: object) -> object:
|
| 1278 |
+
"""Recursively encode any non-ASCII strings as JSON-safe escaped text."""
|
| 1279 |
+
if isinstance(obj, str):
|
| 1280 |
+
return obj.encode("ascii", errors="backslashreplace").decode("ascii")
|
| 1281 |
+
if isinstance(obj, dict):
|
| 1282 |
+
return {k: _safe_str(v) for k, v in obj.items()}
|
| 1283 |
+
if isinstance(obj, list):
|
| 1284 |
+
return [_safe_str(item) for item in obj]
|
| 1285 |
+
return obj
|
| 1286 |
+
|
| 1287 |
+
|
| 1288 |
+
def _compact_result(result: object) -> dict:
|
| 1289 |
+
if not isinstance(result, dict):
|
| 1290 |
+
return {"value": result}
|
| 1291 |
+
compact = {}
|
| 1292 |
+
for key in ("analysis", "dataset", "group_col", "value_col", "query", "filters",
|
| 1293 |
+
"n_matches", "n_returned", "n_total", "groups_compared", "saved_csv", "saved_png", "error"):
|
| 1294 |
+
if key in result and result.get(key) is not None:
|
| 1295 |
+
compact[key] = result[key]
|
| 1296 |
+
table = result.get("table")
|
| 1297 |
+
if isinstance(table, list):
|
| 1298 |
+
compact["table_preview"] = table[:3]
|
| 1299 |
+
compact["table_rows"] = len(table)
|
| 1300 |
+
return compact
|
| 1301 |
+
|
| 1302 |
+
|
| 1303 |
+
def _conversation_state_summary(turns: list[dict] | None) -> str:
|
| 1304 |
+
if not turns:
|
| 1305 |
+
return "No prior analytical state."
|
| 1306 |
+
summary = []
|
| 1307 |
+
for idx, turn in enumerate(turns[-3:], start=1):
|
| 1308 |
+
summary.append({
|
| 1309 |
+
"turn": idx,
|
| 1310 |
+
"question": _safe_str(turn.get("question", "")),
|
| 1311 |
+
"answer": _safe_str(turn.get("answer", "")),
|
| 1312 |
+
"tool_calls": [
|
| 1313 |
+
{"tool": tc.get("tool"), "args": _safe_str(tc.get("args", {})),
|
| 1314 |
+
"result": _safe_str(_compact_result(tc.get("result")))}
|
| 1315 |
+
for tc in turn.get("tool_calls", [])
|
| 1316 |
+
],
|
| 1317 |
+
"artifacts": turn.get("artifacts", []),
|
| 1318 |
+
})
|
| 1319 |
+
return json.dumps(summary, default=str, indent=2)
|
| 1320 |
+
|
| 1321 |
+
|
| 1322 |
+
def _tool_names(tools: list[dict]) -> list[str]:
|
| 1323 |
+
return [t["name"] for t in tools]
|
| 1324 |
+
|
| 1325 |
+
|
| 1326 |
+
def _tool_subset(allowed_tools: list[str]) -> list[dict]:
    """Return the entries of ``TOOLS`` whose name is in *allowed_tools*.

    The result keeps the order of ``TOOLS``, not the order of the argument.
    """
    wanted = set(allowed_tools)
    selected = []
    for spec in TOOLS:
        if spec["name"] in wanted:
            selected.append(spec)
    return selected
|
| 1329 |
+
|
| 1330 |
+
|
| 1331 |
+
def _system_prompt(route_mode: str, route_guidance: str, conversation_state: str) -> str:
    """Build the system prompt for one agent run.

    The prompt lists every dataset that ``get_dataset_metadata()`` reports as
    available (row count, column names, date range), followed by the routing
    information, the serialized prior state and the fixed rule list.

    Args:
        route_mode: Name of the route selected for the question.
        route_guidance: Free-text guidance attached to that route.
        conversation_state: Pre-rendered summary of earlier turns.

    Returns:
        The complete system prompt text.
    """
    metadata = get_dataset_metadata()
    dataset_lines = []
    for name, info in metadata.items():
        if not info.get("available"):
            continue
        date_range = info.get("date_range") or {}
        dataset_lines.append(
            f"- {name}: {info.get('rows')} rows; columns={list(info.get('columns', {}).keys())}; "
            f"date_range={date_range or 'n/a'}"
        )
    dataset_summary = "\n".join(dataset_lines)
    # Rule 9 uses a real em dash; the previous text carried a mis-decoded character.
    return f"""You are a question-driven data analysis agent working over local Reddit datasets.

Available dataset metadata:
{dataset_summary}

Current route mode: {route_mode}
Route guidance: {route_guidance}

Prior analytical state:
{conversation_state}

Rules:
1. Use the route guidance and only the provided tools.
2. Inspect metadata or row previews before making assumptions when the schema is unclear.
3. Run actual tools for numbers; do not guess.
4. Prefer one minimal reproducible tool path over exploratory tool spam.
5. Distinguish direct findings from caveats.
6. If prior turns already produced a relevant result, reuse that context instead of recomputing unless the user asks for a change.
7. Answer with this structure: direct answer, what was analysed, method, caveats.
8. ALWAYS prefer tools that produce charts (trend_over_time, count_by_group, compare_groups, summary_stats, word_freq) over plain text summaries when the question is quantitative. Every numeric answer should have a chart.
9. For questions about images or visual content, use analyze_image_sample. It reads from raw CSV files with image URLs — no separate setup needed. ALWAYS generate an explicit coding_scheme dict (with label names as keys and definitions as values) before calling this tool — never leave coding_scheme null for a research question.
10. After a large image coding run, offer to run export_reliability_sample to generate a human validation set, then compute_reliability once the user has filled in the human_label column.
11. The dataset covers 30 subreddits including GOONED, GOONEDISBACK, GoonCaves, girlgooners, and more. Use subreddit filters to drill into specific communities."""
|
| 1366 |
+
|
| 1367 |
+
|
| 1368 |
+
def run_agent(
    question: str,
    history: list[dict] | None = None,
    turns: list[dict] | None = None,
    analysis_context: list[dict] | None = None,
    conversation_state: list[dict] | None = None,
) -> dict:
    """Answer a user question via the agent loop.

    Thin wrapper around ``_run_agent_inner`` that forwards all arguments
    unchanged. A ``UnicodeEncodeError`` escaping the inner call is re-raised
    as ``RuntimeError`` carrying the original detail and the full traceback
    text; every other exception propagates as-is.
    """
    try:
        outcome = _run_agent_inner(question, history, turns, analysis_context, conversation_state)
    except UnicodeEncodeError as exc:
        tb = _traceback.format_exc()
        message = (
            "Unicode encoding error (non-ASCII character in data pipeline).\n\n"
            f"Detail: {exc}\n\nTraceback:\n{tb}"
        )
        raise RuntimeError(message) from exc
    return outcome
|
| 1384 |
+
|
| 1385 |
+
|
| 1386 |
+
def _run_agent_inner(
    question: str,
    history: list[dict] | None = None,
    turns: list[dict] | None = None,
    analysis_context: list[dict] | None = None,
    conversation_state: list[dict] | None = None,
) -> dict:
    """Run the tool-use loop for one question and return the final answer.

    The question is routed to a tool subset, then the model is called
    repeatedly: every ``tool_use`` block is executed through ``TOOL_FN_MAP``
    and its result fed back, until the model stops asking for tools.

    Args:
        question: The user's question.
        history: Earlier chat messages to prepend (not mutated).
        turns: Structured prior turns for the system prompt.
        analysis_context: Fallback for ``turns`` when that is empty.
        conversation_state: Fallback when both of the above are empty.

    Returns:
        A dict with ``answer``, ``tool_calls`` (full, unabridged results),
        ``plotly_json`` (last chart or ``None``), ``plotly_jsons`` (all
        charts), ``route``, ``allowed_tools`` and ``usage``.

    Raises:
        KeyError: If ``ANTHROPIC_API_KEY`` is not set in the environment.
    """
    client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
    prior_turns = turns or analysis_context or conversation_state or []
    route = route_question(question)
    available_tools = _tool_subset(route.allowed_tools)

    # Build a fresh list so the caller's history is never mutated.
    messages = [_safe_str(msg) for msg in (history or [])]
    messages.append({"role": "user", "content": _safe_str(question)})

    tool_calls_log = []
    plotly_jsons = []
    total_input_tokens = 0
    total_output_tokens = 0
    system = _system_prompt(
        route_mode=route.mode,
        route_guidance=route.guidance,
        conversation_state=_conversation_state_summary(prior_turns),
    )
    # The system prompt does not change between iterations; escape it once.
    safe_system = _safe_str(system)

    def _create(system_text, message_list):
        """Issue one Messages API request with the fixed model/tool settings."""
        return client.messages.create(
            model=MODEL,
            max_tokens=4096,
            system=system_text,
            tools=available_tools,
            messages=message_list,
        )

    while True:
        safe_messages = _safe_str(messages)
        try:
            response = _create(safe_system, safe_messages)
        except UnicodeEncodeError:
            # Last resort: round-trip through ASCII-only JSON and drop any
            # remaining non-ASCII characters from the system prompt.
            stripped_messages = json.loads(json.dumps(safe_messages, default=str, ensure_ascii=True))
            stripped_system = safe_system.encode("ascii", errors="ignore").decode("ascii")
            response = _create(stripped_system, stripped_messages)

        text_parts = [block.text for block in response.content if block.type == "text"]
        tool_use_blocks = [block for block in response.content if block.type == "tool_use"]

        total_input_tokens += getattr(response.usage, "input_tokens", 0) or 0
        total_output_tokens += getattr(response.usage, "output_tokens", 0) or 0

        if response.stop_reason == "end_turn" or not tool_use_blocks:
            # Claude Opus 4.6 pricing: $15/M input, $75/M output
            # NOTE(review): rates are hard-coded; confirm against current pricing.
            cost_usd = (total_input_tokens / 1_000_000 * 15.0) + (total_output_tokens / 1_000_000 * 75.0)
            return {
                "answer": "\n".join(text_parts).strip(),
                "tool_calls": tool_calls_log,
                "plotly_json": plotly_jsons[-1] if plotly_jsons else None,
                "plotly_jsons": plotly_jsons,
                "route": route.mode,
                "allowed_tools": _tool_names(available_tools),
                "usage": {
                    "input_tokens": total_input_tokens,
                    "output_tokens": total_output_tokens,
                    "cost_usd": round(cost_usd, 4),
                },
            }

        tool_results = []
        for block in tool_use_blocks:
            fn = TOOL_FN_MAP.get(block.name)
            if fn is None:
                result = {"error": f"Unknown tool: {block.name}"}
            else:
                try:
                    result = fn(block.input)
                    if isinstance(result, dict) and result.get("plotly_json"):
                        plotly_jsons.append(result["plotly_json"])
                except Exception as exc:
                    # Report the failure to the model instead of aborting the run.
                    result = {"error": str(exc)}

            # The serialized figure duplicates the table plus layout data and
            # can be very large; the model only needs to know a chart exists.
            # The caller still receives the full figure via plotly_jsons and
            # tool_calls.
            if isinstance(result, dict) and result.get("plotly_json"):
                model_result = {
                    **result,
                    "plotly_json": "[omitted: chart JSON is returned to the caller separately]",
                }
            else:
                model_result = result

            safe_result = _safe_str(model_result)
            tool_calls_log.append({"tool": block.name, "args": block.input, "result": result})
            tool_results.append({
                "type": "tool_result",
                "tool_use_id": block.id,
                "content": json.dumps(safe_result, default=str, ensure_ascii=True),
            })

        assistant_content = [_safe_str(block.model_dump()) for block in response.content]
        messages.append({"role": "assistant", "content": assistant_content})
        messages.append({"role": "user", "content": tool_results})
|
agent/rebuild_parquets.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Rebuild posts.parquet and comments.parquet from raw CSVs.
|
| 3 |
+
|
| 4 |
+
Strategy:
|
| 5 |
+
- GOONED: use GOONED_submissions.csv + GOONED_comments.csv (full combined dumps)
|
| 6 |
+
- All other subreddits: concatenate their yearly CSV files
|
| 7 |
+
- Deduplicate by id within each type
|
| 8 |
+
- Write to data/posts_full.parquet and data/comments_full.parquet
|
| 9 |
+
|
| 10 |
+
Run from the project root (Goon/):
|
| 11 |
+
python agent/rebuild_parquets.py
|
| 12 |
+
|
| 13 |
+
Expect ~5-10 minutes and 4-8GB RAM for the comments file.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import gc
|
| 17 |
+
import glob
|
| 18 |
+
import sys
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
import pandas as pd
|
| 22 |
+
import pyarrow as pa
|
| 23 |
+
import pyarrow.parquet as pq
|
| 24 |
+
|
| 25 |
+
DATA_DIR = Path("data")
|
| 26 |
+
OUT_POSTS = DATA_DIR / "posts_full.parquet"
|
| 27 |
+
OUT_COMMENTS = DATA_DIR / "comments_full.parquet"
|
| 28 |
+
|
| 29 |
+
SUBMISSION_COLS = [
|
| 30 |
+
"id", "subreddit", "author", "title", "selftext",
|
| 31 |
+
"score", "num_comments", "upvote_ratio", "created_utc",
|
| 32 |
+
"author_flair_text", "url", "domain", "is_self", "is_video",
|
| 33 |
+
]
|
| 34 |
+
COMMENT_COLS = [
|
| 35 |
+
"id", "subreddit", "author", "body",
|
| 36 |
+
"score", "created_utc", "author_flair_text",
|
| 37 |
+
"parent_id", "is_submitter",
|
| 38 |
+
]
|
| 39 |
+
|
| 40 |
+
CHUNK_SIZE = 500_000
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def safe_read(path: Path, usecols: list[str], **kwargs) -> pd.DataFrame:
    """Read a CSV, keeping only the wanted columns that the file actually has.

    Args:
        path: CSV file to read.
        usecols: Column names to keep. Names absent from the file are ignored
            instead of raising.
        **kwargs: Forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The result of ``pandas.read_csv`` restricted to the matching columns,
        in the order they appear in the file.
    """
    wanted = set(usecols)
    # A callable filter is evaluated against the header parsed by this very
    # call, so parsing options in kwargs (sep, encoding, ...) are honoured and
    # the file is opened once rather than probed separately for its header.
    return pd.read_csv(
        path,
        usecols=lambda name: name in wanted,
        low_memory=False,
        **kwargs,
    )
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _normalise(df: pd.DataFrame) -> pd.DataFrame:
|
| 50 |
+
"""Force consistent dtypes so all chunks share the same Arrow schema."""
|
| 51 |
+
if "created_utc" in df.columns:
|
| 52 |
+
df["created_utc"] = pd.to_numeric(df["created_utc"], errors="coerce").astype("float64")
|
| 53 |
+
if "score" in df.columns:
|
| 54 |
+
df["score"] = pd.to_numeric(df["score"], errors="coerce").astype("Int64")
|
| 55 |
+
if "num_comments" in df.columns:
|
| 56 |
+
df["num_comments"] = pd.to_numeric(df["num_comments"], errors="coerce").astype("Int64")
|
| 57 |
+
if "upvote_ratio" in df.columns:
|
| 58 |
+
df["upvote_ratio"] = pd.to_numeric(df["upvote_ratio"], errors="coerce").astype("float64")
|
| 59 |
+
for bool_col in ("is_self", "is_video", "is_submitter"):
|
| 60 |
+
if bool_col in df.columns:
|
| 61 |
+
df[bool_col] = df[bool_col].astype("boolean")
|
| 62 |
+
# Force ALL remaining columns to string (handles NaN-only cols inferred as float)
|
| 63 |
+
for col in df.columns:
|
| 64 |
+
if df[col].dtype not in ("float64", "Int64", "boolean"):
|
| 65 |
+
df[col] = df[col].astype("string")
|
| 66 |
+
return df
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _safe_table(df: pd.DataFrame, schema: pa.Schema) -> pa.Table:
    """Convert a DataFrame to an Arrow table that matches an existing schema.

    Columns are matched to the schema by name. Fields that ``df`` lacks are
    filled with nulls of the target type, columns not in the schema are
    dropped, and the result follows the schema's field order. Remaining type
    differences are resolved with an unchecked cast.

    Args:
        df: Normalised chunk to convert.
        schema: Schema of the Parquet file being written.

    Returns:
        A table whose schema equals ``schema``.
    """
    table = pa.Table.from_pandas(df, preserve_index=False)
    present = set(table.column_names)
    # Table.cast needs identical field names in identical order, which a file
    # with missing or reordered columns would not provide on its own.
    columns = [
        table.column(field.name)
        if field.name in present
        else pa.nulls(table.num_rows, type=field.type)
        for field in schema
    ]
    aligned = pa.Table.from_arrays(columns, names=schema.names)
    return aligned.cast(schema, safe=False)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def build_posts():
    """Build ``OUT_POSTS`` from the GOONED dump and the other subreddits' yearly CSVs.

    ``GOONED_submissions.csv`` is streamed in chunks of ``CHUNK_SIZE`` rows,
    then every ``*_submissions_20*.csv`` file not starting with ``GOONED_`` is
    read whole. Rows are de-duplicated by ``id`` across all inputs, normalised,
    and appended to one snappy-compressed Parquet file. The first chunk written
    defines the file's schema.

    Yearly files that cannot be read, or that have no ``id`` column, are
    skipped. The Parquet writer is closed even if processing fails part-way.
    """
    print("=== Building posts_full.parquet ===")
    writer = None
    schema = None
    seen_ids: set = set()
    total = 0

    def _append(frame: pd.DataFrame) -> int:
        """De-duplicate, normalise and write one frame; return rows written."""
        nonlocal writer, schema
        frame = frame[~frame["id"].isin(seen_ids)].drop_duplicates("id")
        seen_ids.update(frame["id"].tolist())
        frame = _normalise(frame)
        if writer is None:
            # First frame fixes the schema for the whole output file.
            table = pa.Table.from_pandas(frame, preserve_index=False)
            schema = table.schema
            writer = pq.ParquetWriter(OUT_POSTS, schema, compression="snappy")
        else:
            table = _safe_table(frame, schema)
        writer.write_table(table)
        return len(frame)

    try:
        # GOONED: use full combined file
        gooned_path = DATA_DIR / "GOONED_submissions.csv"
        print(f" Reading {gooned_path.name}...", flush=True)
        for chunk in pd.read_csv(
            gooned_path,
            usecols=lambda c: c in SUBMISSION_COLS,
            chunksize=CHUNK_SIZE,
            low_memory=False,
        ):
            total += _append(chunk)
        print(f" GOONED submissions: {total:,}", flush=True)

        # All other subreddits: yearly files, skip GOONED
        yearly_files = [
            f
            for f in sorted(DATA_DIR.glob("*_submissions_20*.csv"))
            if not f.name.startswith("GOONED_")
        ]
        for path in yearly_files:
            try:
                chunk = safe_read(path, SUBMISSION_COLS)
            except Exception as e:
                # Best effort: one unreadable yearly file should not abort the build.
                print(f" SKIP {path.name}: {e}", flush=True)
                continue
            if "id" not in chunk.columns:
                continue
            total += _append(chunk)
    finally:
        # Always finalise the file so a failure does not leak the open writer.
        if writer is not None:
            writer.close()

    print(f" Total posts written: {total:,}")
    print(f" Saved: {OUT_POSTS}")
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def build_comments():
    """Build ``OUT_COMMENTS`` from the GOONED dump and the other subreddits' yearly CSVs.

    ``GOONED_comments.csv`` is streamed in chunks of ``CHUNK_SIZE`` rows, then
    every ``*_comments_20*.csv`` file not starting with ``GOONED_`` is read
    whole. Each chunk or file is normalised and appended to one
    snappy-compressed Parquet file. The first chunk written defines the file's
    schema.

    To save RAM no set of seen ids is kept, so duplicates are removed only
    within a single chunk or file, not across them.

    Yearly files that cannot be read, or that have no ``id`` column, are
    skipped. The Parquet writer is closed even if processing fails part-way.
    """
    print("=== Building comments_full.parquet ===")
    writer = None
    schema = None
    total = 0

    def _append(frame: pd.DataFrame) -> int:
        """De-duplicate within the frame, normalise and write it; return rows written."""
        nonlocal writer, schema
        frame = _normalise(frame.drop_duplicates("id"))
        if writer is None:
            # First frame fixes the schema for the whole output file.
            table = pa.Table.from_pandas(frame, preserve_index=False)
            schema = table.schema
            writer = pq.ParquetWriter(OUT_COMMENTS, schema, compression="snappy")
        else:
            table = _safe_table(frame, schema)
        writer.write_table(table)
        return len(frame)

    try:
        # GOONED: use full combined file (35M rows — chunked, no id tracking to save RAM)
        gooned_path = DATA_DIR / "GOONED_comments.csv"
        print(f" Reading {gooned_path.name} in chunks...", flush=True)
        chunk_n = 0
        for chunk in pd.read_csv(
            gooned_path,
            usecols=lambda c: c in COMMENT_COLS,
            chunksize=CHUNK_SIZE,
            low_memory=False,
        ):
            total += _append(chunk)
            chunk_n += 1
            if chunk_n % 10 == 0:
                print(f" ...{total:,} rows processed", flush=True)
            # Drop the chunk eagerly to keep peak memory down between reads.
            del chunk
            gc.collect()

        print(f" GOONED comments done: {total:,}", flush=True)

        # All other subreddits: yearly files (no GOONED overlap since we used the combined file)
        yearly_files = [
            f
            for f in sorted(DATA_DIR.glob("*_comments_20*.csv"))
            if not f.name.startswith("GOONED_")
        ]
        for path in yearly_files:
            try:
                chunk = safe_read(path, COMMENT_COLS)
            except Exception as e:
                # Best effort: one unreadable yearly file should not abort the build.
                print(f" SKIP {path.name}: {e}", flush=True)
                continue
            if "id" not in chunk.columns:
                continue
            total += _append(chunk)
            del chunk
            gc.collect()
    finally:
        # Always finalise the file so a failure does not leak the open writer.
        if writer is not None:
            writer.close()

    print(f" Total comments written: {total:,}")
    print(f" Saved: {OUT_COMMENTS}")
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
if __name__ == "__main__":
|
| 196 |
+
build_posts()
|
| 197 |
+
build_comments()
|
| 198 |
+
print("\nDone. Update DATA_DIR in agent/analysis/inspect_data.py to use posts_full and comments_full.")
|
agent/requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
anthropic>=0.40.0
|
| 2 |
+
together>=1.3.0
|
| 3 |
+
openai>=1.0.0
|
| 4 |
+
scikit-learn>=1.4.0
|
| 5 |
+
pandas>=2.0.0
|
| 6 |
+
pyarrow>=14.0.0
|
| 7 |
+
streamlit>=1.35.0
|
| 8 |
+
python-dotenv>=1.0.0
|
| 9 |
+
plotly>=5.20.0
|
app.py
ADDED
|
@@ -0,0 +1,1059 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grasping Gooning — analysis agent UI
|
| 3 |
+
Run: streamlit run app.py (from /Users/binx/Desktop/Goon/)
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import inspect
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import random
|
| 12 |
+
import sys
|
| 13 |
+
import threading
|
| 14 |
+
import time
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
import pandas as pd
|
| 18 |
+
import plotly.io as pio
|
| 19 |
+
import streamlit as st
|
| 20 |
+
from dotenv import load_dotenv
|
| 21 |
+
|
| 22 |
+
# ── paths ───────────────────────────────────────────────────────────────────
|
| 23 |
+
ROOT = Path(__file__).parent
|
| 24 |
+
sys.path.insert(0, str(ROOT / "agent"))
|
| 25 |
+
load_dotenv(ROOT / "agent" / ".env")
|
| 26 |
+
|
| 27 |
+
from analysis import run_agent, list_datasets
|
| 28 |
+
|
| 29 |
+
# ── page config ─────────────────────────────────────────────────────────────
|
| 30 |
+
st.set_page_config(
|
| 31 |
+
page_title="Grasping Gooning",
|
| 32 |
+
layout="wide",
|
| 33 |
+
initial_sidebar_state="expanded",
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
@st.cache_data(show_spinner=False)
def load_post_samples(n: int = 120) -> list[dict]:
    """Random sample of real post titles for the loading slideshow.

    Scans the first 40,000 rows of ``data/posts.parquet`` and keeps titles that
    are 31-179 characters long and do not start with ``[``.

    Args:
        n: Maximum number of samples to return.

    Returns:
        Up to ``n`` dicts with ``subreddit`` and ``title`` keys, or an empty
        list when the Parquet file is missing or cannot be read.
    """
    try:
        import pyarrow.dataset as _ds

        _path = ROOT / "data" / "posts.parquet"
        if not _path.exists():
            return []
        d = _ds.dataset(str(_path), format="parquet")
        t = d.scanner(columns=["subreddit", "title"]).head(40000).to_pandas()
        # Null titles would make the .str tests return NaN, and negating that
        # with ~ raises; an empty string simply fails the length filter.
        titles = t["title"].fillna("")
        lengths = titles.str.len()
        mask = (lengths > 30) & (lengths < 180) & ~titles.str.startswith("[")
        sample = t[mask].sample(min(n, int(mask.sum())), random_state=None)
        return sample[["subreddit", "title"]].to_dict(orient="records")
    except Exception:
        # The slideshow is decorative; any failure falls back to no samples.
        return []
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# Short teaser lines, presumably shown while a request is running (usage is
# outside this view). Each ends with a real ellipsis character (U+2026).
LOADING_HINTS = [
    "you're so close…",
    "keep going…",
    "deeper…",
    "almost there…",
    "don't stop now…",
    "just a bit more…",
    "stay with it…",
    "right there…",
    "edge of something…",
    "hold on…",
    "so close…",
    "don't stop…",
]
|
| 73 |
+
|
| 74 |
+
# ── global CSS ──────────────────────────────────────────────────────────────
|
| 75 |
+
st.markdown("""
|
| 76 |
+
<style>
|
| 77 |
+
@import url('https://fonts.googleapis.com/icon?family=Material+Icons');
|
| 78 |
+
|
| 79 |
+
/* ---- tokens ---- */
|
| 80 |
+
:root {
|
| 81 |
+
--bg: #ffffff;
|
| 82 |
+
--surface: #f5f5f5;
|
| 83 |
+
--border: #e0e0e0;
|
| 84 |
+
--divider: #ebebeb;
|
| 85 |
+
--ink: #000000;
|
| 86 |
+
--body: #222222;
|
| 87 |
+
--mid: #555555;
|
| 88 |
+
--muted: #888888;
|
| 89 |
+
--faint: #aaaaaa;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
/* ---- base ---- */
|
| 93 |
+
html, body, .stApp,
|
| 94 |
+
[data-testid="stAppViewContainer"],
|
| 95 |
+
[data-testid="stMain"],
|
| 96 |
+
[data-testid="stHeader"],
|
| 97 |
+
[data-testid="stToolbar"],
|
| 98 |
+
[data-testid="stBottom"],
|
| 99 |
+
[data-testid="stBottomBlockContainer"] {
|
| 100 |
+
background: var(--bg) !important;
|
| 101 |
+
color: var(--ink) !important;
|
| 102 |
+
font-family: Arial, Helvetica, sans-serif !important;
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
/* header bar */
|
| 106 |
+
[data-testid="stHeader"] {
|
| 107 |
+
border-bottom: 1px solid var(--border) !important;
|
| 108 |
+
box-shadow: none !important;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
/* bottom chat bar */
|
| 112 |
+
[data-testid="stBottom"] {
|
| 113 |
+
border-top: 1px solid var(--border) !important;
|
| 114 |
+
box-shadow: none !important;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
#MainMenu, footer { visibility: hidden; }
|
| 118 |
+
/* hide deploy button (confirmed testid from Streamlit 1.50 bundle) */
|
| 119 |
+
[data-testid="stAppDeployButton"] {
|
| 120 |
+
display: none !important;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
/* ---- layout ---- */
|
| 124 |
+
.block-container {
|
| 125 |
+
max-width: 1060px !important;
|
| 126 |
+
padding: 56px 32px 140px !important;
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
/* ---- sidebar (dark) ---- */
|
| 130 |
+
section[data-testid="stSidebar"] {
|
| 131 |
+
background: #0f0f0f !important;
|
| 132 |
+
border-right: 1px solid #1e1e1e !important;
|
| 133 |
+
}
|
| 134 |
+
section[data-testid="stSidebar"] .block-container {
|
| 135 |
+
padding: 28px 16px 48px !important;
|
| 136 |
+
}
|
| 137 |
+
/* all text inside sidebar goes light */
|
| 138 |
+
section[data-testid="stSidebar"] p,
|
| 139 |
+
section[data-testid="stSidebar"] span,
|
| 140 |
+
section[data-testid="stSidebar"] label,
|
| 141 |
+
section[data-testid="stSidebar"] div,
|
| 142 |
+
section[data-testid="stSidebar"] .stMarkdown {
|
| 143 |
+
color: #cccccc !important;
|
| 144 |
+
font-family: Arial, Helvetica, sans-serif !important;
|
| 145 |
+
}
|
| 146 |
+
/* sidebar collapse button */
|
| 147 |
+
[data-testid="stSidebarCollapseButton"] button::after { color: #555 !important; }
|
| 148 |
+
[data-testid="stExpandSidebarButton"] button::after { color: #555 !important; }
|
| 149 |
+
|
| 150 |
+
/* ---- sidebar heading animation ---- */
|
| 151 |
+
.sb-title {
|
| 152 |
+
font-size: 13px;
|
| 153 |
+
font-weight: 700;
|
| 154 |
+
letter-spacing: 0.18em;
|
| 155 |
+
text-transform: uppercase;
|
| 156 |
+
color: #ffffff !important;
|
| 157 |
+
margin-bottom: 2px;
|
| 158 |
+
animation: sbFadeDown 500ms cubic-bezier(0.22,1,0.36,1) both;
|
| 159 |
+
}
|
| 160 |
+
.sb-tagline {
|
| 161 |
+
font-size: 10px;
|
| 162 |
+
letter-spacing: 0.08em;
|
| 163 |
+
color: #555 !important;
|
| 164 |
+
overflow: hidden;
|
| 165 |
+
white-space: nowrap;
|
| 166 |
+
width: 0;
|
| 167 |
+
animation: sbTypewriter 1.4s steps(32, end) 300ms forwards,
|
| 168 |
+
sbBlinkCursor 600ms step-end 300ms 3;
|
| 169 |
+
border-right: 1px solid #444;
|
| 170 |
+
}
|
| 171 |
+
@keyframes sbFadeDown {
|
| 172 |
+
from { opacity: 0; transform: translateY(-6px); }
|
| 173 |
+
to { opacity: 1; transform: none; }
|
| 174 |
+
}
|
| 175 |
+
@keyframes sbTypewriter {
|
| 176 |
+
to { width: 100%; border-right-color: transparent; }
|
| 177 |
+
}
|
| 178 |
+
@keyframes sbBlinkCursor {
|
| 179 |
+
50% { border-right-color: transparent; }
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
/* ---- sidebar labels ---- */
|
| 183 |
+
.sidebar-label {
|
| 184 |
+
font-size: 9px;
|
| 185 |
+
font-weight: 700;
|
| 186 |
+
letter-spacing: 0.16em;
|
| 187 |
+
text-transform: uppercase;
|
| 188 |
+
color: #444444 !important;
|
| 189 |
+
animation: sbFadeDown 400ms ease both;
|
| 190 |
+
}
|
| 191 |
+
.sidebar-box {
|
| 192 |
+
border-top: 1px solid #1e1e1e;
|
| 193 |
+
padding-top: 14px;
|
| 194 |
+
margin-top: 14px;
|
| 195 |
+
}
|
| 196 |
+
.sidebar-copy {
|
| 197 |
+
font-size: 10px;
|
| 198 |
+
line-height: 1.7;
|
| 199 |
+
color: #777777 !important;
|
| 200 |
+
}
|
| 201 |
+
.sidebar-stat {
|
| 202 |
+
font-size: 11px;
|
| 203 |
+
color: #aaaaaa !important;
|
| 204 |
+
font-weight: 600;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
/* ---- sidebar buttons ---- */
|
| 208 |
+
section[data-testid="stSidebar"] .stButton > button {
|
| 209 |
+
width: 100% !important;
|
| 210 |
+
background: transparent !important;
|
| 211 |
+
border: 1px solid #2a2a2a !important;
|
| 212 |
+
border-radius: 0 !important;
|
| 213 |
+
color: #666666 !important;
|
| 214 |
+
padding: 8px 10px !important;
|
| 215 |
+
text-align: left !important;
|
| 216 |
+
font-size: 10px !important;
|
| 217 |
+
letter-spacing: 0.08em !important;
|
| 218 |
+
text-transform: uppercase !important;
|
| 219 |
+
box-shadow: none !important;
|
| 220 |
+
transition: background 150ms, border-color 150ms, color 150ms !important;
|
| 221 |
+
}
|
| 222 |
+
section[data-testid="stSidebar"] .stButton > button:hover {
|
| 223 |
+
background: #1a1a1a !important;
|
| 224 |
+
border-color: #555555 !important;
|
| 225 |
+
color: #eeeeee !important;
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
/* ---- sidebar inputs ---- */
|
| 229 |
+
section[data-testid="stSidebar"] .stTextInput input {
|
| 230 |
+
background: #1a1a1a !important;
|
| 231 |
+
border: 1px solid #2a2a2a !important;
|
| 232 |
+
border-radius: 0 !important;
|
| 233 |
+
color: #cccccc !important;
|
| 234 |
+
box-shadow: none !important;
|
| 235 |
+
font-size: 12px !important;
|
| 236 |
+
transition: border-color 150ms !important;
|
| 237 |
+
}
|
| 238 |
+
section[data-testid="stSidebar"] .stTextInput input:focus {
|
| 239 |
+
border-color: #555555 !important;
|
| 240 |
+
}
|
| 241 |
+
section[data-testid="stSidebar"] .stTextInput input::placeholder {
|
| 242 |
+
color: #444444 !important;
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
/* ---- sidebar expander (dark) ---- */
|
| 246 |
+
section[data-testid="stSidebar"] [data-testid="stExpander"] {
|
| 247 |
+
background: transparent !important;
|
| 248 |
+
border: 1px solid #1e1e1e !important;
|
| 249 |
+
}
|
| 250 |
+
section[data-testid="stSidebar"] [data-testid="stExpander"]:hover {
|
| 251 |
+
border-color: #333333 !important;
|
| 252 |
+
}
|
| 253 |
+
section[data-testid="stSidebar"] [data-testid="stExpander"] summary p,
|
| 254 |
+
section[data-testid="stSidebar"] [data-testid="stExpander"] summary span {
|
| 255 |
+
color: #888888 !important;
|
| 256 |
+
font-size: 10px !important;
|
| 257 |
+
letter-spacing: 0.1em !important;
|
| 258 |
+
text-transform: uppercase !important;
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
/* ---- main area buttons ---- */
|
| 262 |
+
.block-container .stButton > button {
|
| 263 |
+
width: 100% !important;
|
| 264 |
+
background: transparent !important;
|
| 265 |
+
border: 1px solid #cccccc !important;
|
| 266 |
+
border-radius: 0 !important;
|
| 267 |
+
color: var(--mid) !important;
|
| 268 |
+
padding: 8px 10px !important;
|
| 269 |
+
text-align: left !important;
|
| 270 |
+
font-size: 10px !important;
|
| 271 |
+
letter-spacing: 0.08em !important;
|
| 272 |
+
text-transform: uppercase !important;
|
| 273 |
+
box-shadow: none !important;
|
| 274 |
+
transition: background 150ms, border-color 150ms, color 150ms !important;
|
| 275 |
+
}
|
| 276 |
+
.block-container .stButton > button:hover {
|
| 277 |
+
background: var(--surface) !important;
|
| 278 |
+
border-color: var(--ink) !important;
|
| 279 |
+
color: var(--ink) !important;
|
| 280 |
+
}
|
| 281 |
+
|
| 282 |
+
/* ---- inputs ---- */
|
| 283 |
+
.stTextInput input {
|
| 284 |
+
background: var(--bg) !important;
|
| 285 |
+
border: 1px solid var(--border) !important;
|
| 286 |
+
border-radius: 0 !important;
|
| 287 |
+
color: var(--ink) !important;
|
| 288 |
+
box-shadow: none !important;
|
| 289 |
+
font-size: 12px !important;
|
| 290 |
+
transition: border-color 150ms !important;
|
| 291 |
+
}
|
| 292 |
+
.stTextInput input:focus { border-color: var(--ink) !important; }
|
| 293 |
+
|
| 294 |
+
/* chat input container */
|
| 295 |
+
[data-testid="stChatInput"],
|
| 296 |
+
[data-testid="stChatInput"] > div,
|
| 297 |
+
[data-testid="stChatInputContainer"] {
|
| 298 |
+
background: #ffffff !important;
|
| 299 |
+
border: 1px solid #d0d0d0 !important;
|
| 300 |
+
border-radius: 0 !important;
|
| 301 |
+
box-shadow: none !important;
|
| 302 |
+
transition: border-color 150ms !important;
|
| 303 |
+
}
|
| 304 |
+
[data-testid="stChatInput"]:focus-within,
|
| 305 |
+
[data-testid="stChatInputContainer"]:focus-within {
|
| 306 |
+
border-color: #000000 !important;
|
| 307 |
+
box-shadow: none !important;
|
| 308 |
+
}
|
| 309 |
+
[data-testid="stChatInput"] textarea {
|
| 310 |
+
background: #ffffff !important;
|
| 311 |
+
color: #000000 !important;
|
| 312 |
+
font-size: 14px !important;
|
| 313 |
+
font-family: Arial, Helvetica, sans-serif !important;
|
| 314 |
+
}
|
| 315 |
+
/* send button */
|
| 316 |
+
[data-testid="stChatInput"] button,
|
| 317 |
+
[data-testid="stChatInputContainer"] button {
|
| 318 |
+
background: #000000 !important;
|
| 319 |
+
border: none !important;
|
| 320 |
+
border-radius: 0 !important;
|
| 321 |
+
color: #ffffff !important;
|
| 322 |
+
box-shadow: none !important;
|
| 323 |
+
min-width: 44px !important;
|
| 324 |
+
width: 44px !important;
|
| 325 |
+
height: 100% !important;
|
| 326 |
+
font-family: Arial, Helvetica, sans-serif !important;
|
| 327 |
+
font-size: 13px !important;
|
| 328 |
+
font-weight: 700 !important;
|
| 329 |
+
letter-spacing: 0.04em !important;
|
| 330 |
+
}
|
| 331 |
+
[data-testid="stChatInput"] button:hover,
|
| 332 |
+
[data-testid="stChatInputContainer"] button:hover {
|
| 333 |
+
background: #333333 !important;
|
| 334 |
+
opacity: 1 !important;
|
| 335 |
+
}
|
| 336 |
+
/* replace SVG arrow with text "->" */
|
| 337 |
+
[data-testid="stChatInput"] button svg,
|
| 338 |
+
[data-testid="stChatInputContainer"] button svg { display: none !important; }
|
| 339 |
+
[data-testid="stChatInput"] button::after,
|
| 340 |
+
[data-testid="stChatInputContainer"] button::after {
|
| 341 |
+
content: "->";
|
| 342 |
+
font-family: Arial, Helvetica, sans-serif !important;
|
| 343 |
+
font-size: 13px;
|
| 344 |
+
font-weight: 700;
|
| 345 |
+
color: #ffffff;
|
| 346 |
+
}
|
| 347 |
+
/* kill any rounded wrapper Streamlit adds around the whole bar */
|
| 348 |
+
[data-testid="stBottom"] > div,
|
| 349 |
+
[data-testid="stBottomBlockContainer"] > div {
|
| 350 |
+
background: #ffffff !important;
|
| 351 |
+
border-radius: 0 !important;
|
| 352 |
+
box-shadow: none !important;
|
| 353 |
+
}
|
| 354 |
+
|
| 355 |
+
/* ---- expander ---- */
|
| 356 |
+
[data-testid="stExpander"] {
|
| 357 |
+
background: transparent !important;
|
| 358 |
+
border: 1px solid var(--border) !important;
|
| 359 |
+
border-radius: 0 !important;
|
| 360 |
+
overflow: hidden !important;
|
| 361 |
+
transition: border-color 150ms !important;
|
| 362 |
+
}
|
| 363 |
+
[data-testid="stExpander"]:hover { border-color: var(--ink) !important; }
|
| 364 |
+
[data-testid="stExpander"] summary { padding: 10px 14px !important; }
|
| 365 |
+
/* kill the expander toggle icon (data-testid confirmed from Streamlit 1.50 bundle) */
|
| 366 |
+
[data-testid="stExpander"] summary [data-testid="stIconMaterial"],
|
| 367 |
+
[data-testid="stExpander"] summary [data-testid="stImageIcon"] {
|
| 368 |
+
display: none !important;
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
/* sidebar collapse / expand toggle icons -> replace with < > */
|
| 372 |
+
[data-testid="stExpandSidebarButton"] [data-testid="stIconMaterial"],
|
| 373 |
+
[data-testid="stSidebarCollapseButton"] [data-testid="stIconMaterial"] {
|
| 374 |
+
display: none !important;
|
| 375 |
+
}
|
| 376 |
+
[data-testid="stExpandSidebarButton"] button::after {
|
| 377 |
+
content: ">";
|
| 378 |
+
font-size: 15px; font-weight: 700;
|
| 379 |
+
font-family: Arial, Helvetica, sans-serif !important;
|
| 380 |
+
color: #555555;
|
| 381 |
+
}
|
| 382 |
+
[data-testid="stSidebarCollapseButton"] button::after {
|
| 383 |
+
content: "<";
|
| 384 |
+
font-size: 15px; font-weight: 700;
|
| 385 |
+
font-family: Arial, Helvetica, sans-serif !important;
|
| 386 |
+
color: #555555;
|
| 387 |
+
}
|
| 388 |
+
|
| 389 |
+
/* ---- progress bar ---- */
|
| 390 |
+
.prog-wrap { padding: 20px 0 12px; }
|
| 391 |
+
.prog-hint {
|
| 392 |
+
font-size: 11px; color: var(--muted);
|
| 393 |
+
letter-spacing: 0.1em; margin-bottom: 10px;
|
| 394 |
+
font-style: italic;
|
| 395 |
+
animation: progPulse 1.8s ease-in-out infinite;
|
| 396 |
+
}
|
| 397 |
+
@keyframes progPulse {
|
| 398 |
+
0%, 100% { opacity: 0.5; }
|
| 399 |
+
50% { opacity: 1; }
|
| 400 |
+
}
|
| 401 |
+
.prog-bg {
|
| 402 |
+
background: var(--divider); height: 1px; width: 100%; margin-bottom: 6px;
|
| 403 |
+
position: relative; overflow: hidden;
|
| 404 |
+
}
|
| 405 |
+
.prog-fill {
|
| 406 |
+
background: var(--ink); height: 1px;
|
| 407 |
+
transition: width 0.35s ease;
|
| 408 |
+
position: absolute; top: 0; left: 0;
|
| 409 |
+
}
|
| 410 |
+
/* shimmer on the fill bar */
|
| 411 |
+
.prog-fill::after {
|
| 412 |
+
content: "";
|
| 413 |
+
position: absolute; top: 0; right: 0;
|
| 414 |
+
width: 40px; height: 1px;
|
| 415 |
+
background: linear-gradient(to right, transparent, #fff, transparent);
|
| 416 |
+
animation: shimmer 1.2s ease-in-out infinite;
|
| 417 |
+
}
|
| 418 |
+
@keyframes shimmer {
|
| 419 |
+
0% { opacity: 0; transform: translateX(-40px); }
|
| 420 |
+
50% { opacity: 1; }
|
| 421 |
+
100% { opacity: 0; transform: translateX(40px); }
|
| 422 |
+
}
|
| 423 |
+
.prog-pct {
|
| 424 |
+
font-size: 9px; color: var(--faint);
|
| 425 |
+
letter-spacing: 0.14em; text-transform: uppercase;
|
| 426 |
+
font-family: "Courier New", monospace !important;
|
| 427 |
+
}
|
| 428 |
+
.prog-stuck {
|
| 429 |
+
margin-top: 8px;
|
| 430 |
+
font-size: 10px; color: var(--muted);
|
| 431 |
+
letter-spacing: 0.08em; font-style: italic;
|
| 432 |
+
animation: stuckFadeIn 400ms ease both;
|
| 433 |
+
}
|
| 434 |
+
.prog-stuck-0 { color: var(--muted); }
|
| 435 |
+
.prog-stuck-1 { color: var(--faint); }
|
| 436 |
+
.prog-stuck-2 { color: #cccccc; font-size: 9px; }
|
| 437 |
+
.prog-stuck-3 { color: #dddddd; font-size: 9px; }
|
| 438 |
+
.prog-stuck-4 { color: #e0e0e0; font-size: 9px; }
|
| 439 |
+
.prog-stuck-5 { color: #e8e8e8; font-size: 9px; }
|
| 440 |
+
@keyframes stuckFadeIn {
|
| 441 |
+
from { opacity: 0; transform: translateY(4px); }
|
| 442 |
+
to { opacity: 1; transform: none; }
|
| 443 |
+
}
|
| 444 |
+
|
| 445 |
+
/* ---- loading post slideshow ---- */
|
| 446 |
+
.post-slide {
|
| 447 |
+
margin-top: 20px;
|
| 448 |
+
padding: 14px 18px;
|
| 449 |
+
background: var(--surface);
|
| 450 |
+
border-left: 2px solid var(--divider);
|
| 451 |
+
animation: slideIn 300ms cubic-bezier(0.22,1,0.36,1) both;
|
| 452 |
+
}
|
| 453 |
+
.post-slide-sub {
|
| 454 |
+
font-size: 9px; letter-spacing: 0.14em; text-transform: uppercase;
|
| 455 |
+
color: var(--faint); margin-bottom: 6px;
|
| 456 |
+
font-family: "Courier New", monospace !important;
|
| 457 |
+
}
|
| 458 |
+
.post-slide-title {
|
| 459 |
+
font-size: 13px; line-height: 1.5; color: var(--body);
|
| 460 |
+
font-style: italic;
|
| 461 |
+
}
|
| 462 |
+
@keyframes slideIn {
|
| 463 |
+
from { opacity: 0; transform: translateY(6px); }
|
| 464 |
+
to { opacity: 1; transform: none; }
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
/* ---- chat ---- */
|
| 468 |
+
[data-testid="stChatMessage"] {
|
| 469 |
+
background: transparent !important;
|
| 470 |
+
border: none !important;
|
| 471 |
+
padding: 0 !important;
|
| 472 |
+
margin: 0 0 28px !important;
|
| 473 |
+
gap: 12px !important;
|
| 474 |
+
animation: fadeUp 240ms cubic-bezier(0.22,1,0.36,1) both;
|
| 475 |
+
}
|
| 476 |
+
/* user avatar: kaomoji */
|
| 477 |
+
[data-testid="stChatMessageAvatarUser"] {
|
| 478 |
+
background: #000000 !important;
|
| 479 |
+
border: none !important;
|
| 480 |
+
width: 34px !important; height: 34px !important;
|
| 481 |
+
min-width: 34px !important;
|
| 482 |
+
border-radius: 0 !important;
|
| 483 |
+
position: relative !important;
|
| 484 |
+
overflow: visible !important;
|
| 485 |
+
}
|
| 486 |
+
[data-testid="stChatMessageAvatarUser"] svg,
|
| 487 |
+
[data-testid="stChatMessageAvatarUser"] img { display: none !important; }
|
| 488 |
+
[data-testid="stChatMessageAvatarUser"]::after {
|
| 489 |
+
content: "( ΛβΎΛ)";
|
| 490 |
+
position: absolute; top: 50%; left: 50%;
|
| 491 |
+
transform: translate(-50%, -50%);
|
| 492 |
+
font-size: 11px; line-height: 1; color: #ffffff;
|
| 493 |
+
white-space: nowrap;
|
| 494 |
+
font-family: Arial, Helvetica, sans-serif !important;
|
| 495 |
+
}
|
| 496 |
+
/* assistant avatar: kaomoji */
|
| 497 |
+
[data-testid="stChatMessageAvatarAssistant"] {
|
| 498 |
+
background: #ffffff !important;
|
| 499 |
+
border: 1px solid var(--border) !important;
|
| 500 |
+
width: 34px !important; height: 34px !important;
|
| 501 |
+
min-width: 34px !important;
|
| 502 |
+
border-radius: 0 !important;
|
| 503 |
+
position: relative !important;
|
| 504 |
+
overflow: visible !important;
|
| 505 |
+
}
|
| 506 |
+
[data-testid="stChatMessageAvatarAssistant"] svg,
|
| 507 |
+
[data-testid="stChatMessageAvatarAssistant"] img { display: none !important; }
|
| 508 |
+
[data-testid="stChatMessageAvatarAssistant"]::after {
|
| 509 |
+
content: "Κβ’α΄₯β’Κ";
|
| 510 |
+
position: absolute; top: 50%; left: 50%;
|
| 511 |
+
transform: translate(-50%, -50%);
|
| 512 |
+
font-size: 11px; line-height: 1; color: #000000;
|
| 513 |
+
white-space: nowrap;
|
| 514 |
+
font-family: Arial, Helvetica, sans-serif !important;
|
| 515 |
+
}
|
| 516 |
+
|
| 517 |
+
.msg-meta {
|
| 518 |
+
display: flex; align-items: center; gap: 12px;
|
| 519 |
+
margin-bottom: 8px;
|
| 520 |
+
}
|
| 521 |
+
.msg-label {
|
| 522 |
+
font-size: 10px; font-weight: 700;
|
| 523 |
+
letter-spacing: 0.14em; text-transform: uppercase;
|
| 524 |
+
color: var(--muted);
|
| 525 |
+
}
|
| 526 |
+
.route-tag {
|
| 527 |
+
font-size: 10px; letter-spacing: 0.1em; text-transform: uppercase;
|
| 528 |
+
color: var(--faint); border-left: 1px solid var(--border); padding-left: 10px;
|
| 529 |
+
}
|
| 530 |
+
.msg-body {
|
| 531 |
+
border-top: 1px solid var(--divider);
|
| 532 |
+
padding-top: 12px;
|
| 533 |
+
animation: fadeIn 220ms 60ms ease both;
|
| 534 |
+
}
|
| 535 |
+
.msg-body p {
|
| 536 |
+
font-size: 14px !important; line-height: 1.7 !important;
|
| 537 |
+
color: var(--body) !important; max-width: 72ch !important;
|
| 538 |
+
}
|
| 539 |
+
|
| 540 |
+
/* ---- cost bar ---- */
|
| 541 |
+
.cost-row {
|
| 542 |
+
display: flex; align-items: center; gap: 16px;
|
| 543 |
+
margin-top: 22px; margin-bottom: 18px;
|
| 544 |
+
padding: 14px 18px;
|
| 545 |
+
background: #000000;
|
| 546 |
+
}
|
| 547 |
+
.cost-label {
|
| 548 |
+
font-size: 10px; letter-spacing: 0.18em; text-transform: uppercase;
|
| 549 |
+
color: #666666; white-space: nowrap; font-family: "Courier New", monospace !important;
|
| 550 |
+
}
|
| 551 |
+
.cost-track {
|
| 552 |
+
flex: 1; height: 2px; background: #2a2a2a; position: relative;
|
| 553 |
+
}
|
| 554 |
+
.cost-fill {
|
| 555 |
+
position: absolute; top: 0; left: 0; height: 2px;
|
| 556 |
+
background: #ffffff;
|
| 557 |
+
transition: width 600ms cubic-bezier(0.22,1,0.36,1);
|
| 558 |
+
}
|
| 559 |
+
.cost-val {
|
| 560 |
+
font-size: 12px; letter-spacing: 0.06em; color: #ffffff;
|
| 561 |
+
font-family: "Courier New", monospace !important; white-space: nowrap;
|
| 562 |
+
}
|
| 563 |
+
.cost-tok {
|
| 564 |
+
font-size: 10px; letter-spacing: 0.04em; color: #666666;
|
| 565 |
+
font-family: "Courier New", monospace !important; white-space: nowrap;
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
/* ---- step trace ---- */
|
| 569 |
+
.step-title {
|
| 570 |
+
display: block; font-size: 12px; font-weight: 700;
|
| 571 |
+
margin-bottom: 4px; color: var(--ink);
|
| 572 |
+
}
|
| 573 |
+
.spath {
|
| 574 |
+
display: block; margin-top: 4px;
|
| 575 |
+
font-size: 11px; color: var(--muted);
|
| 576 |
+
font-family: "Courier New", monospace !important;
|
| 577 |
+
}
|
| 578 |
+
[data-testid="stDataFrame"] table {
|
| 579 |
+
font-size: 12px !important;
|
| 580 |
+
font-family: Arial, Helvetica, sans-serif !important;
|
| 581 |
+
}
|
| 582 |
+
|
| 583 |
+
/* ---- keyframes ---- */
|
| 584 |
+
@keyframes fadeUp {
|
| 585 |
+
from { opacity:0; transform:translateY(6px); }
|
| 586 |
+
to { opacity:1; transform:none; }
|
| 587 |
+
}
|
| 588 |
+
@keyframes fadeIn {
|
| 589 |
+
from { opacity:0; } to { opacity:1; }
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
/* ---- responsive ---- */
|
| 593 |
+
@media (max-width:640px) {
|
| 594 |
+
.block-container { padding: 32px 16px 100px !important; }
|
| 595 |
+
}
|
| 596 |
+
</style>
|
| 597 |
+
""", unsafe_allow_html=True)
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
# ── helpers ────────────────────────────────────────────────────────────────
|
| 601 |
+
def fmt(v: int | None) -> str:
|
| 602 |
+
return "n/a" if v is None else f"{v:,}"
|
| 603 |
+
|
| 604 |
+
|
| 605 |
+
def dataset_snapshot() -> dict:
    """Return whatever ``list_datasets()`` reports, or an empty dict if that call raises."""
    try:
        snapshot = list_datasets()
    except Exception:
        # Best-effort lookup: callers read the result with .get(), so {} is a safe fallback.
        snapshot = {}
    return snapshot
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
def render_plot(plotly_json: str) -> None:
    """Draw a Plotly figure from its JSON string with the app's monochrome styling.

    Any exception raised while parsing, styling, or drawing the figure is
    reported through ``st.warning`` instead of propagating.
    """
    axis_style = dict(gridcolor="#ebebeb", linecolor="#e0e0e0")
    theme = dict(
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(family="Arial, Helvetica, sans-serif", color="#222", size=12),
        margin=dict(l=0, r=0, t=32, b=0),
        colorway=["#000", "#555", "#888", "#bbb"],
        xaxis=dict(axis_style),
        yaxis=dict(axis_style),
    )
    try:
        figure = pio.from_json(plotly_json)
        figure.update_layout(**theme)
        st.plotly_chart(figure, use_container_width=True)
    except Exception as exc:
        st.warning(f"Chart could not be rendered: {exc}")
|
| 626 |
+
|
| 627 |
+
|
| 628 |
+
def compact_tool_result(result: object) -> dict:
    """Shrink a tool result to a small summary dict.

    A non-dict result is wrapped as ``{"value": result}``. For a dict, the
    summary holds the sorted key names, a fixed set of fields copied when
    they are not None, and, when ``"table"`` is a list, its length plus its
    first five rows.
    """
    if not isinstance(result, dict):
        return {"value": result}

    kept_fields = ("saved_csv", "saved_png", "plotly_json", "error", "analysis", "dataset", "filters")
    summary: dict = {"keys": sorted(result)}
    summary.update(
        {field: result[field] for field in kept_fields if result.get(field) is not None}
    )

    rows = result.get("table")
    if isinstance(rows, list):
        summary.update(table_rows=len(rows), table_preview=rows[:5])
    return summary
|
| 640 |
+
|
| 641 |
+
|
| 642 |
+
def extract_artifacts(tool_calls: list[dict]) -> list[dict]:
    """List the artifacts referenced by the results of *tool_calls*.

    For every call whose result is a dict, emits one entry per truthy
    ``saved_csv`` / ``saved_png`` path (in that order), followed by a marker
    entry when the result carries a truthy ``plotly_json``. Calls with a
    missing or non-dict result contribute nothing.
    """
    file_kinds = {"saved_csv": "csv", "saved_png": "png"}
    found: list[dict] = []
    for call in tool_calls:
        payload = call.get("result") or {}
        if isinstance(payload, dict):
            tool_name = call.get("tool", "?")
            found.extend(
                {"type": kind, "tool": tool_name, "path": payload[field]}
                for field, kind in file_kinds.items()
                if payload.get(field)
            )
            if payload.get("plotly_json"):
                found.append({"type": "plotly_json", "tool": tool_name, "present": True})
    return found
|
| 654 |
+
|
| 655 |
+
|
| 656 |
+
def build_backend_history(turns: list[dict]) -> list[dict]:
    """Convert stored conversation turns into the message list sent to the agent.

    Each turn yields a ``user`` message (the question) and an ``assistant``
    message (the answer). When the turn used tools, produced artifacts, or
    produced a chart, a JSON ``<analysis_state>`` block summarising that work
    is appended to the assistant message.

    Args:
        turns: Turn dicts with at least ``"question"`` and ``"answer"``;
            ``"tool_calls"``, ``"artifacts"``, ``"plotly_jsons"`` and
            ``"route"`` are optional.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts, two per turn.
    """
    history: list[dict] = []
    for turn in turns:
        history.append({"role": "user", "content": turn["question"]})
        content = turn["answer"]

        # Turns are stored under the plural key "plotly_jsons"; the singular
        # key is still honoured for turn dicts that carry a single chart.
        has_plot = bool(turn.get("plotly_jsons") or turn.get("plotly_json"))

        state = {
            "tool_calls": [
                {
                    "tool": tc.get("tool"),
                    "args": tc.get("args") or {},
                    "result": compact_tool_result(tc.get("result")),
                }
                # `or []` also covers a turn whose "tool_calls" is explicitly None.
                for tc in turn.get("tool_calls") or []
            ],
            "artifacts": turn.get("artifacts", []),
            "plotly_json": has_plot,
            "route": turn.get("route"),
        }
        if state["tool_calls"] or state["artifacts"] or state["plotly_json"]:
            content += f"\n\n<analysis_state>\n{json.dumps(state, default=str, indent=2)}\n</analysis_state>"
        history.append({"role": "assistant", "content": content})
    return history
|
| 675 |
+
|
| 676 |
+
|
| 677 |
+
def call_agent(question: str, history: list[dict], turns: list[dict]) -> dict:
    """Invoke ``run_agent`` with the history, plus the raw turns when it accepts them.

    ``run_agent``'s signature is inspected; the first of ``analysis_context``,
    ``conversation_state`` or ``turns`` that it declares receives *turns*.
    If it declares none of them, only ``history`` is passed.
    """
    accepted = inspect.signature(run_agent).parameters
    candidates = ("analysis_context", "conversation_state", "turns")
    turns_param = next((name for name in candidates if name in accepted), None)

    call_kwargs = {"history": history}
    if turns_param is not None:
        call_kwargs[turns_param] = turns
    return run_agent(question, **call_kwargs)
|
| 685 |
+
|
| 686 |
+
|
| 687 |
+
# Cache of sample posts for the loading slideshow; call_agent_with_progress fills it
# from load_post_samples() the first time it finds the list empty.
_POST_SAMPLES: list[dict] = []
|
| 688 |
+
|
| 689 |
+
|
| 690 |
+
def call_agent_with_progress(question: str, backend_history: list[dict], turns: list[dict], slot) -> dict:
    """Run the agent in a background thread while animating a progress panel in *slot*.

    The percentage is simulated: it grows by a random 1-5 points per tick and
    is capped at 93 until the worker finishes. While it sits at the cap, one
    extra "stuck" message is revealed per 12 seconds. A sample post is shown
    under the bar and rotated every 11 ticks.

    Args:
        question: The user's question, forwarded to ``call_agent``.
        backend_history: Message history, forwarded to ``call_agent``.
        turns: Raw turn records, forwarded to ``call_agent``.
        slot: A Streamlit placeholder; it is rewritten each tick through
            ``slot.markdown`` and cleared with ``slot.empty()`` at the end.

    Returns:
        The dict returned by ``call_agent``.

    Raises:
        Exception: Whatever exception the worker thread caught is re-raised
            here on the calling thread.
    """
    import html  # local import: only needed to escape post text below

    global _POST_SAMPLES

    result_holder: dict = {}
    exc_holder: dict = {}

    def worker() -> None:
        try:
            result_holder["r"] = call_agent(question, backend_history, turns)
        except Exception as e:
            exc_holder["e"] = e

    t = threading.Thread(target=worker, daemon=True)
    t.start()

    STUCK_MSGS = [
        "i promise i'm still gooning",
        "locked in. fully gooned. cannot stop.",
        "the data is vast. the goon is deep. patience.",
        "i've been edging this query for so long i've lost track of time.",
        "every second is another row scanned. feel it.",
        "this is what a true goon session looks like. no shortcuts.",
        "i am one with the dataset. do not disturb.",
    ]

    if not _POST_SAMPLES:
        _POST_SAMPLES = load_post_samples()
    # Shuffle a copy: the cache is module-level state, so shuffling it in
    # place would reorder the list for every other caller as well.
    posts = list(_POST_SAMPLES) if _POST_SAMPLES else []
    random.shuffle(posts)

    pct = 0
    idx = 0
    post_idx = 0
    start = time.time()
    stuck_since: float | None = None

    while t.is_alive():
        prev_pct = pct
        pct = min(pct + random.randint(1, 5), 93)

        # The bar only stops moving once it reaches the cap; remember when.
        if pct != prev_pct:
            stuck_since = None
        elif stuck_since is None:
            stuck_since = time.time()

        elapsed = int(time.time() - start)
        elapsed_str = f"{elapsed}s" if elapsed < 60 else f"{elapsed // 60}m {elapsed % 60}s"
        stuck_sec = int(time.time() - stuck_since) if stuck_since is not None else 0

        hint = LOADING_HINTS[idx % len(LOADING_HINTS)]

        # Build up stuck messages: one new line per 12s window, cleared when pct moves.
        n_stuck = min(stuck_sec // 12, len(STUCK_MSGS))
        stuck_html = "".join(
            f'<div class="prog-stuck prog-stuck-{i}">{STUCK_MSGS[i]}</div>'
            for i in range(n_stuck)
        )

        # Rotate the post every 11 ticks (~4 seconds at 0.35s per tick).
        if idx % 11 == 0 and idx > 0:
            post_idx += 1
        post_html = ""
        if posts:
            p = posts[post_idx % len(posts)]
            # Post text is external data rendered with unsafe_allow_html, so
            # escape every HTML metacharacter; None is treated as empty.
            sub = html.escape(str(p.get("subreddit") or ""))
            title = html.escape(str(p.get("title") or ""))
            post_html = (
                f'<div class="post-slide" key="{post_idx}">'
                f'<div class="post-slide-sub">r/{sub}</div>'
                f'<div class="post-slide-title">{title}</div>'
                f'</div>'
            )

        at_cap = pct >= 93
        # NOTE(review): the "β%" literal and the "Β·" separator below look
        # mis-encoded in the source; confirm the intended glyphs.
        pct_display = "β%" if at_cap else f"{pct}%"
        running_label = (
            '<span class="prog-hint" style="display:inline;margin-left:8px;margin-bottom:0">'
            'still running</span>'
            if at_cap else ""
        )
        slot.markdown(
            f'<div class="prog-wrap">'
            f'<div class="prog-hint">{hint}</div>'
            f'<div class="prog-bg"><div class="prog-fill" style="width:{pct}%"></div></div>'
            f'<div class="prog-pct">{pct_display} Β· {elapsed_str}{running_label}</div>'
            f'{stuck_html}'
            f'{post_html}'
            f'</div>',
            unsafe_allow_html=True,
        )
        idx += 1
        time.sleep(0.35)

    t.join()
    slot.empty()

    if exc_holder:
        raise exc_holder["e"]
    return result_holder["r"]
|
| 792 |
+
|
| 793 |
+
|
| 794 |
+
def render_cost_bar(usage: dict) -> None:
    """Render the cost strip: a fill bar, the dollar cost, and the token counts.

    Reads ``cost_usd``, ``input_tokens`` and ``output_tokens`` from *usage*,
    each defaulting to 0 when absent.
    """
    cost = usage.get("cost_usd", 0)
    tokens_in = usage.get("input_tokens", 0)
    tokens_out = usage.get("output_tokens", 0)

    # The bar is linear in cost and reaches full width at $0.50.
    fill_pct = min(cost / 0.50 * 100, 100)
    cost_text = "< $0.01" if cost < 0.01 else f"${cost:.3f}"
    token_text = f"{tokens_in:,} in Β· {tokens_out:,} out"

    pieces = [
        '<div class="cost-row">',
        '<span class="cost-label">cost</span>',
        f'<div class="cost-track"><div class="cost-fill" style="width:{fill_pct:.1f}%"></div></div>',
        f'<span class="cost-val">{cost_text}</span>',
        f'<span class="cost-tok">{token_text}</span>',
        '</div>',
    ]
    st.markdown("".join(pieces), unsafe_allow_html=True)
|
| 814 |
+
|
| 815 |
+
|
| 816 |
+
def render_tool_calls(tool_calls: list[dict]) -> None:
    """Show the agent's tool calls inside a collapsed "Method" expander.

    Each step shows its tool name, its arguments (when present), the result
    table (when the result dict has one), and any saved CSV/PNG paths. A
    horizontal rule separates consecutive steps.
    """
    n = len(tool_calls)
    plural = "" if n == 1 else "s"
    last_pos = n - 1
    with st.expander(f"Method {n} step{plural}", expanded=False):
        for pos, call in enumerate(tool_calls):
            st.markdown(
                f"<span class='step-title'>Step {pos+1} -> {call.get('tool','?')}</span>",
                unsafe_allow_html=True,
            )

            call_args = call.get("args")
            if call_args:
                st.json(call_args, expanded=False)

            outcome = call.get("result") or {}
            if isinstance(outcome, dict):
                rows = outcome.get("table")
                if rows:
                    try:
                        st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)
                    except Exception:
                        # A table that cannot be drawn is skipped without any message.
                        pass
                for field in ("saved_csv", "saved_png"):
                    saved_path = outcome.get(field)
                    if saved_path:
                        st.markdown(f"<span class='spath'>-> {saved_path}</span>", unsafe_allow_html=True)

            if pos < last_pos:
                st.markdown("---")
|
| 838 |
+
|
| 839 |
+
|
| 840 |
+
def render_export_buttons(answer: str, tool_calls: list[dict], turn_idx: int) -> None:
    """Render one download button for the answer and one per saved CSV/PNG file.

    Args:
        answer: Answer text, offered as ``answer_<turn_idx>.md``.
        tool_calls: Tool-call records; their saved CSV and PNG paths are
            collected with ``extract_artifacts``.
        turn_idx: Index used to namespace widget keys and the answer file
            name; it must be unique among the answers rendered in one run.
    """
    artifacts = extract_artifacts(tool_calls)

    items: list[tuple[str, bytes, str, str]] = [
        ("answer.md", answer.encode("utf-8"), "text/markdown", f"answer_{turn_idx}.md")
    ]
    # CSVs first, then PNGs. A path referenced by several tool calls is
    # offered once, and only regular files are read.
    seen_paths: set[str] = set()
    for kind, mime in (("csv", "text/csv"), ("png", "image/png")):
        for artifact in artifacts:
            if artifact["type"] != kind:
                continue
            path_str = str(artifact["path"])
            if path_str in seen_paths:
                continue
            seen_paths.add(path_str)
            p = Path(path_str)
            if p.is_file():
                items.append((p.name, p.read_bytes(), mime, p.name))

    cols = st.columns(len(items))
    for pos, (col, (label, data, mime, fname)) in enumerate(zip(cols, items)):
        with col:
            st.download_button(
                label=label, data=data, file_name=fname, mime=mime,
                # The position keeps keys unique when two files share a basename.
                key=f"dl_{turn_idx}_{pos}_{fname}",
            )
|
| 863 |
+
|
| 864 |
+
|
| 865 |
+
# ── session state ──────────────────────────────────────────────────────────
# Give every session key a default the first time the script runs for a session.
session_defaults = {
    "history": [],
    "chat": [],
    "turns": [],
    "prefill": "",
    "authenticated": False,
    "logged_out": False,
}
for key, default in session_defaults.items():
    if key not in st.session_state:
        st.session_state[key] = default

# Seed from the environment if a key is already set (e.g. from a .env file),
# but not if the user explicitly logged out.
already_decided = st.session_state["authenticated"] or st.session_state["logged_out"]
if not already_decided and os.environ.get("ANTHROPIC_API_KEY"):
    st.session_state["authenticated"] = True
|
| 873 |
+
|
| 874 |
+
# ── dataset metadata ───────────────────────────────────────────────────────
# Every lookup tolerates missing sections because dataset_snapshot() may return {}.
meta = dataset_snapshot()
_posts_meta = meta.get("posts", {})
_comments_meta = meta.get("comments", {})

posts_rows = _posts_meta.get("rows")
comments_rows = _comments_meta.get("rows")
sub_count = len(_posts_meta.get("subreddits") or [])
_comment_dates = _comments_meta.get("date_range") or {}
latest_date = _comment_dates.get("latest", "n/a")
|
| 880 |
+
|
| 881 |
+
|
| 882 |
+
# ── login gate ─────────────────────────────────────────────────────────────
# Unauthenticated sessions see only the API-key form; st.stop() below keeps
# the rest of the script from running until a key has been accepted.
if not st.session_state["authenticated"]:
    st.markdown("""
<style>
.login-wrap {
display: flex; flex-direction: column; align-items: center;
justify-content: center; min-height: 70vh; gap: 20px;
}
.login-title {
font-size: 22px; font-weight: 700; letter-spacing: 0.04em; color: var(--ink);
}
.login-sub {
font-size: 12px; color: var(--muted); margin-top: -12px;
}
</style>
<div class='login-wrap'>
<div class='login-title'>Grasping Gooning</div>
<div class='login-sub'>enter your Anthropic API key to continue</div>
</div>
""", unsafe_allow_html=True)

    col = st.columns([1, 2, 1])[1]
    with col:
        login_key = st.text_input(
            "API key", type="password", placeholder="sk-ant-β¦",
            label_visibility="collapsed",
        )
        if st.button("Enter ->", key="login_btn", use_container_width=True):
            # Drop non-ASCII characters, then trim whitespace picked up by
            # copy/paste, so the stored value is exactly what was validated.
            ascii_key = login_key.encode("ascii", errors="ignore").decode("ascii").strip()
            if ascii_key:
                # NOTE(review): os.environ is process-wide, so this key is
                # visible to every session served by this process, and the
                # env-seeding check earlier in this script would authenticate
                # new sessions with it. Consider keeping the key in
                # st.session_state and passing it to the agent explicitly.
                os.environ["ANTHROPIC_API_KEY"] = ascii_key
                st.session_state["authenticated"] = True
                st.session_state["logged_out"] = False
                st.rerun()
            else:
                st.error("Paste your API key above.")
    st.stop()
|
| 919 |
+
|
| 920 |
+
# ── sidebar ────────────────────────────────────────────────────────────────
with st.sidebar:
    st.markdown("""
<div class='sb-title'>Grasping Gooning</div>
<div class='sb-tagline'>reddit data analysis agent</div>
""", unsafe_allow_html=True)

    st.markdown(
        "<div class='sidebar-box'><div class='sidebar-label'>Session</div></div>",
        unsafe_allow_html=True,
    )
    # "Clear conversation" resets the transcript; "Log out" additionally drops
    # the API key and marks the session as explicitly logged out.
    if st.button("Clear conversation", key="clear"):
        st.session_state.update({"history": [], "chat": [], "turns": [], "prefill": ""})
        st.rerun()
    if st.button("Log out", key="logout"):
        os.environ.pop("ANTHROPIC_API_KEY", None)
        st.session_state.update({
            "history": [], "chat": [], "turns": [], "prefill": "",
            "authenticated": False, "logged_out": True,
        })
        st.rerun()

    # ── about (bottom of sidebar) ──────────────────────────────────────────
    st.markdown("<div class='sidebar-box'>", unsafe_allow_html=True)
    with st.expander("About"):
        earliest = (meta.get("posts", {}).get("date_range") or {}).get("earliest", "n/a")
        subs_list = meta.get("posts", {}).get("subreddits") or []
        # Only the first 15 subreddit names are listed; "..." marks a longer list.
        subs_preview = " Β· ".join(subs_list[:15])
        more_marker = "..." if len(subs_list) > 15 else ""
        st.markdown(f"""
<div class='sidebar-copy'>
A research tool for analysing the Reddit gooning corpus.<br>
Ask questions in plain English β the agent runs real code against the data and returns charts, tables, and findings.<br><br>
<span class='sidebar-stat'>{fmt(posts_rows)}</span> posts<br>
<span class='sidebar-stat'>{fmt(comments_rows)}</span> comments<br>
<span class='sidebar-stat'>{sub_count}</span> subreddits<br>
<span style='color:#444;font-size:9px'>{earliest} β {latest_date}</span><br><br>
<span style='color:#333;font-size:9px;letter-spacing:0.1em;text-transform:uppercase'>Subreddits</span><br>
<span style='color:#555;font-size:9px;line-height:1.9'>{subs_preview}{more_marker}</span>
</div>
""", unsafe_allow_html=True)
    st.markdown("</div>", unsafe_allow_html=True)
|
| 955 |
+
|
| 956 |
+
|
| 957 |
+
# ── chat history ───────────────────────────────────────────────────────────
# Replay every stored message; `i` (the index in the chat list) namespaces the
# export-button keys of assistant messages.
for i, msg in enumerate(st.session_state["chat"]):
    role = msg["role"]
    is_assistant = role == "assistant"
    with st.chat_message(role):
        route = msg.get("route", "")
        label = "Answer" if role != "user" else "You"
        route_html = ""
        if is_assistant and route:
            route_html = f"<span class='route-tag'>{route}</span>"
        st.markdown(
            f"<div class='msg-meta'><span class='msg-label'>{label}</span>{route_html}</div>",
            unsafe_allow_html=True,
        )
        st.markdown("<div class='msg-body'>", unsafe_allow_html=True)
        st.markdown(msg["content"])
        st.markdown("</div>", unsafe_allow_html=True)

        # Prefer the list of charts; fall back to a single "plotly_json" entry.
        charts = msg.get("plotly_jsons")
        if not charts:
            single_chart = msg.get("plotly_json")
            charts = [single_chart] if single_chart else []
        for pj in charts:
            render_plot(pj)

        msg_usage = msg.get("usage")
        if msg_usage:
            render_cost_bar(msg_usage)
        msg_tool_calls = msg.get("tool_calls")
        if msg_tool_calls:
            render_tool_calls(msg_tool_calls)
        if is_assistant:
            render_export_buttons(msg["content"], msg_tool_calls or [], i)
|
| 979 |
+
|
| 980 |
+
|
| 981 |
+
# ── chat input ─────────────────────────────────────────────────────────────
# A typed question wins over a pending prefill; the prefill is consumed either way.
prefill = st.session_state["prefill"]
question = st.chat_input("what do you want to knowβ¦")
if prefill:
    st.session_state["prefill"] = ""
effective_question = question or prefill

if effective_question:
    question = effective_question

    backend_history = build_backend_history(st.session_state["turns"])

    with st.chat_message("user"):
        st.markdown("<div class='msg-meta'><span class='msg-label'>You</span></div>",
                    unsafe_allow_html=True)
        st.markdown("<div class='msg-body'>", unsafe_allow_html=True)
        st.markdown(question)
        st.markdown("</div>", unsafe_allow_html=True)

    with st.chat_message("assistant"):
        progress_slot = st.empty()
        try:
            result = call_agent_with_progress(question, backend_history, list(st.session_state["turns"]), progress_slot)
        except Exception as exc:
            err_str = str(exc)
            is_auth_err = (
                type(exc).__name__ in ("AuthenticationError", "PermissionDeniedError")
                or "invalid x-api-key" in err_str.lower()
                or "401" in err_str
            )
            if is_auth_err:
                os.environ.pop("ANTHROPIC_API_KEY", None)
                st.session_state.update(authenticated=False, logged_out=True)
                # NOTE(review): st.rerun() restarts the script immediately, so
                # this message is likely never seen; confirm and consider
                # surfacing it on the login screen instead.
                st.error("API key rejected β please re-enter it.")
                st.rerun()
            elif "rate_limit" in err_str.lower():
                st.error("Rate limited. Wait a moment and try again.")
            elif "Unicode encoding error" in err_str or ("ascii" in err_str.lower() and "codec" in err_str.lower()):
                st.error("Encoding error β your API key may contain non-standard characters. Log out and re-enter it.")
            else:
                st.error(f"Something went wrong: {err_str[:300]}")
            st.stop()

        answer = result.get("answer", "")
        tool_calls = result.get("tool_calls", [])
        plotly_jsons = result.get("plotly_jsons") or ([result["plotly_json"]] if result.get("plotly_json") else [])
        route = result.get("route", "")
        usage = result.get("usage") or {}

        route_html = f"<span class='route-tag'>{route}</span>" if route else ""
        st.markdown(
            f"<div class='msg-meta'><span class='msg-label'>Answer</span>{route_html}</div>",
            unsafe_allow_html=True,
        )
        st.markdown("<div class='msg-body'>", unsafe_allow_html=True)
        st.markdown(answer)
        st.markdown("</div>", unsafe_allow_html=True)
        for pj in plotly_jsons:
            render_plot(pj)
        if usage:
            render_cost_bar(usage)
        if tool_calls:
            render_tool_calls(tool_calls)
        # The history loop keys export buttons by chat-list index. This answer
        # will be stored at len(chat) + 1 (after its user message), so using
        # that index keeps keys unique within this run and stable on reruns.
        # Passing len(turns) collided with the replayed first answer on the
        # second turn (both were 1).
        render_export_buttons(answer, tool_calls, len(st.session_state["chat"]) + 1)

    turn = {
        "question": question, "answer": answer,
        "tool_calls": tool_calls, "plotly_jsons": plotly_jsons,
        "artifacts": extract_artifacts(tool_calls), "route": route,
        "usage": usage,
    }
    st.session_state["turns"].append(turn)
    st.session_state["history"] = build_backend_history(st.session_state["turns"])
    st.session_state["chat"].append({"role": "user", "content": question})
    st.session_state["chat"].append({
        "role": "assistant", "content": answer,
        "tool_calls": tool_calls, "plotly_jsons": plotly_jsons,
        "route": route, "usage": usage,
    })
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
anthropic>=0.40.0
|
| 2 |
+
together>=1.3.0
|
| 3 |
+
openai>=1.0.0
|
| 4 |
+
scikit-learn>=1.4.0
|
| 5 |
+
pandas>=2.0.0
|
| 6 |
+
pyarrow>=14.0.0
|
| 7 |
+
streamlit>=1.35.0
|
| 8 |
+
python-dotenv>=1.0.0
|
| 9 |
+
plotly>=5.20.0
|
| 10 |
+
huggingface-hub>=0.22.0
|