Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- .gitattributes +1 -0
- Dockerfile +15 -0
- README.md +12 -0
- app.py +1716 -0
- dockerignore +13 -0
- features_standardized_11_renamed.parquet +3 -0
- requirements.txt +8 -0
- utils.py +188 -0
.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
features_standardized_11_renamed.parquet filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get install -y --no-install-recommends curl \
|
| 6 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 7 |
+
|
| 8 |
+
COPY requirements.txt .
|
| 9 |
+
RUN pip install --no-cache-dir --prefer-binary -r requirements.txt
|
| 10 |
+
|
| 11 |
+
COPY . .
|
| 12 |
+
|
| 13 |
+
EXPOSE 7860
|
| 14 |
+
|
| 15 |
+
CMD ["sh", "-c", "export STREAMLIT_SERVER_PORT='' && PORT_TO_USE=${PORT:-7860} && echo PORT_TO_USE=$PORT_TO_USE && streamlit run app.py --server.address=0.0.0.0 --server.port=$PORT_TO_USE --server.headless=true --server.enableCORS=false --server.enableXsrfProtection=false --server.runOnSave=false --server.fileWatcherType=none"]
|
README.md
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Legislation Dashboard
|
| 3 |
+
emoji: 📈
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# Legislative Trends Dashboard
|
| 11 |
+
|
| 12 |
+
Upload your parquet or CSV file to visualize legislative trends.
|
app.py
ADDED
|
@@ -0,0 +1,1716 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import plotly.express as px
|
| 7 |
+
import plotly.graph_objects as go
|
| 8 |
+
from scipy import stats
|
| 9 |
+
|
| 10 |
+
# Optional (legacy TF-IDF import kept harmlessly)
|
| 11 |
+
try:
|
| 12 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 13 |
+
_HAS_SK = True
|
| 14 |
+
except Exception:
|
| 15 |
+
_HAS_SK = False
|
| 16 |
+
|
| 17 |
+
# -----------------------------
|
| 18 |
+
# Page config
|
| 19 |
+
# -----------------------------
|
| 20 |
+
st.set_page_config(
|
| 21 |
+
page_title="Legislative Trends Dashboard",
|
| 22 |
+
layout="wide",
|
| 23 |
+
initial_sidebar_state="collapsed",
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# -----------------------------
|
| 27 |
+
# Palette
|
| 28 |
+
# -----------------------------
|
| 29 |
+
C_TRAPPED_DARKNESS = "#0F1F38"
|
| 30 |
+
C_CEDAR_PLANK = "#8E7970"
|
| 31 |
+
C_PUMPING_SPICE = "#F55449" # negative
|
| 32 |
+
C_LAZURITE_BLUE = "#1B4B5A" # positive
|
| 33 |
+
C_POSITIVE = C_LAZURITE_BLUE
|
| 34 |
+
C_NEGATIVE = C_PUMPING_SPICE
|
| 35 |
+
C_STABLE = C_CEDAR_PLANK
|
| 36 |
+
|
| 37 |
+
PLOTLY_TEMPLATE = "plotly_white"
|
| 38 |
+
|
| 39 |
+
DEFAULT_CANDIDATES = [
|
| 40 |
+
"features_standardized_11_renamed.parquet",
|
| 41 |
+
]
|
| 42 |
+
|
| 43 |
+
# Full data range (all years available for baseline)
|
| 44 |
+
DATA_START_FULL = pd.to_datetime("2019-01-09").date()
|
| 45 |
+
DATA_END_FULL = pd.to_datetime("2026-02-06").date()
|
| 46 |
+
|
| 47 |
+
# Display/filter range
|
| 48 |
+
DATA_START = pd.to_datetime("2019-01-09").date()
|
| 49 |
+
DATA_END = pd.to_datetime("2026-02-06").date()
|
| 50 |
+
|
| 51 |
+
_SPLIT_RE = re.compile(r"[,\|;/\n\t]+")
|
| 52 |
+
|
| 53 |
+
STOPWORDS = {
|
| 54 |
+
"bill", "bills", "act", "acts", "amend", "amends", "amended", "amendment", "amendments",
|
| 55 |
+
"illinois", "state", "code", "section", "sections", "law", "laws", "new", "provide", "provides",
|
| 56 |
+
"making", "make", "made", "relating", "regarding", "including", "include", "includes", "within",
|
| 57 |
+
"existing", "technical", "resolution", "resolutions", "effective", "date", "public",
|
| 58 |
+
"department", "agency", "program", "programs", "general", "shall", "may", "must", "also",
|
| 59 |
+
"one", "two", "three", "per", "use", "used", "using", "would", "could", "can", "like",
|
| 60 |
+
"not", "no", "yes", "etc", "among", "upon", "require", "requires", "required", "requirement",
|
| 61 |
+
"establish", "establishes", "established", "create", "creates", "created", "implementation",
|
| 62 |
+
"board", "boards", "commission", "commissions", "report", "reports", "reporting",
|
| 63 |
+
"information", "data", "system", "systems", "process", "processes", "administration",
|
| 64 |
+
"student", "students", "education", "educational", "school", "schools",
|
| 65 |
+
"support", "and", "the", "for", "with", "that", "this", "from", "have", "has", "had",
|
| 66 |
+
"be", "been", "being", "are", "is", "was", "were", "will", "would", "should", "could",
|
| 67 |
+
"may", "might", "must", "can", "shall", "need", "needs", "needed", "such", "other",
|
| 68 |
+
"any", "all", "each", "some", "more", "most", "than", "into", "through", "between",
|
| 69 |
+
"under", "over", "about", "against", "during", "after", "before", "above", "below",
|
| 70 |
+
"up", "down", "in", "out", "on", "off", "to", "at", "by", "of", "as", "or", "but", "if",
|
| 71 |
+
"when", "where", "why", "how", "which", "who", "whom", "whose", "what", "whether",
|
| 72 |
+
"there", "their", "they", "them", "these", "those", "then", "than", "only", "just",
|
| 73 |
+
"both", "either", "neither", "nor", "so", "too", "very", "even", "also", "however",
|
| 74 |
+
"therefore", "thus", "hence", "accordingly", "consequently", "furthermore", "moreover",
|
| 75 |
+
"nevertheless", "nonetheless", "otherwise", "rather", "instead", "yet", "still",
|
| 76 |
+
"already", "always", "never", "ever", "often", "sometimes", "usually", "generally",
|
| 77 |
+
"specifically", "particularly", "especially", "mainly", "mostly", "largely",
|
| 78 |
+
"context", "establishment", "legislative", "promoting", "justice", "human", "rights", "protections"
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
GENERIC_PHRASES = {
|
| 82 |
+
"effective date", "public act", "existing law", "state code", "general assembly",
|
| 83 |
+
"relating to", "regarding", "provide that", "provides that", "amend the", "amends the",
|
| 84 |
+
"this act", "the act", "state agency", "support and", "and context", "context establishment",
|
| 85 |
+
"legislative support", "promoting justice", "justice and", "and human", "human rights",
|
| 86 |
+
"rights protections", "and human rights", "justice and human", "human rights protections",
|
| 87 |
+
"support and context", "and context establishment", "legislative support and"
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
TFIDF_BLOCK_WORDS = {
|
| 91 |
+
"likely", "promote", "promotes", "promoting", "desire", "desires",
|
| 92 |
+
"aim", "aims", "without", "specific", "etc", "mentions", "mention",
|
| 93 |
+
"mentioned", "provided", "provides", "appears", "suggests", "suggest",
|
| 94 |
+
"driven", "purpose", "express", "referred", "uses", "use", "introduce",
|
| 95 |
+
"introduced", "unclear", "behind", "text", "motivation", "intent", "strategy"
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
TFIDF_BLOCK_PHRASES = {
|
| 99 |
+
"does specific", "provided text", "mentioned provided text", "appears procedural"
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
# -----------------------------
|
| 103 |
+
# CSS
|
| 104 |
+
# -----------------------------
|
| 105 |
+
st.markdown(
|
| 106 |
+
f"""
|
| 107 |
+
<style>
|
| 108 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap');
|
| 109 |
+
|
| 110 |
+
.block-container {{
|
| 111 |
+
padding-top: 1.2rem !important;
|
| 112 |
+
padding-bottom: 0.35rem !important;
|
| 113 |
+
padding-left: 0.8rem !important;
|
| 114 |
+
padding-right: 0.8rem !important;
|
| 115 |
+
}}
|
| 116 |
+
|
| 117 |
+
/* Hide Streamlit top decoration bar that clips content */
|
| 118 |
+
header[data-testid="stHeader"] {{
|
| 119 |
+
background: transparent !important;
|
| 120 |
+
height: 0rem !important;
|
| 121 |
+
}}
|
| 122 |
+
[data-testid="stToolbar"] {{
|
| 123 |
+
display: none !important;
|
| 124 |
+
}}
|
| 125 |
+
.main {{ background-color: #EEF2F3; }}
|
| 126 |
+
html, body, [class*="css"] {{ font-family: 'Inter', sans-serif; }}
|
| 127 |
+
|
| 128 |
+
.header-wrap {{
|
| 129 |
+
background: linear-gradient(90deg, {C_TRAPPED_DARKNESS} 0%, {C_LAZURITE_BLUE} 60%, {C_CEDAR_PLANK} 100%);
|
| 130 |
+
padding: 10px 14px;
|
| 131 |
+
border-radius: 12px;
|
| 132 |
+
margin: 6px 0 10px 0;
|
| 133 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.10);
|
| 134 |
+
}}
|
| 135 |
+
.header-title {{
|
| 136 |
+
color: #ffffff;
|
| 137 |
+
font-weight: 800;
|
| 138 |
+
font-size: 20px;
|
| 139 |
+
margin: 0;
|
| 140 |
+
line-height: 1.1;
|
| 141 |
+
}}
|
| 142 |
+
.header-sub {{
|
| 143 |
+
color: rgba(255,255,255,0.88);
|
| 144 |
+
font-size: 12px;
|
| 145 |
+
margin-top: 2px;
|
| 146 |
+
line-height: 1.2;
|
| 147 |
+
}}
|
| 148 |
+
|
| 149 |
+
.kpi-grid {{
|
| 150 |
+
display: grid;
|
| 151 |
+
grid-template-columns: 1.0fr 1.0fr 1.0fr 1.0fr 1.0fr;
|
| 152 |
+
gap: 10px;
|
| 153 |
+
margin-bottom: 10px;
|
| 154 |
+
}}
|
| 155 |
+
.kpi-card {{
|
| 156 |
+
background: #ffffff;
|
| 157 |
+
border: 1px solid #D6DEE0;
|
| 158 |
+
border-radius: 12px;
|
| 159 |
+
padding: 10px 12px;
|
| 160 |
+
box-shadow: 0 1px 6px rgba(0,0,0,0.06);
|
| 161 |
+
}}
|
| 162 |
+
.kpi-label {{
|
| 163 |
+
font-size: 11.5px;
|
| 164 |
+
font-weight: 650;
|
| 165 |
+
color: #5b6b71;
|
| 166 |
+
margin-bottom: 6px;
|
| 167 |
+
text-transform: uppercase;
|
| 168 |
+
letter-spacing: 0.2px;
|
| 169 |
+
}}
|
| 170 |
+
.kpi-value {{
|
| 171 |
+
font-size: 24px;
|
| 172 |
+
font-weight: 800;
|
| 173 |
+
color: {C_TRAPPED_DARKNESS};
|
| 174 |
+
line-height: 1.05;
|
| 175 |
+
}}
|
| 176 |
+
|
| 177 |
+
.filter-row {{
|
| 178 |
+
background:#ffffff;
|
| 179 |
+
border: 1px solid #D6DEE0;
|
| 180 |
+
border-radius: 12px;
|
| 181 |
+
padding: 8px 10px;
|
| 182 |
+
box-shadow: 0 1px 6px rgba(0,0,0,0.08);
|
| 183 |
+
margin-bottom: 10px;
|
| 184 |
+
}}
|
| 185 |
+
|
| 186 |
+
div[data-testid="stVerticalBlock"] > div {{ gap: 0.35rem; }}
|
| 187 |
+
</style>
|
| 188 |
+
""",
|
| 189 |
+
unsafe_allow_html=True,
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
# -----------------------------
|
| 193 |
+
# Helpers
|
| 194 |
+
# -----------------------------
|
| 195 |
+
def _find_first_existing(paths):
|
| 196 |
+
for p in paths:
|
| 197 |
+
if os.path.exists(p):
|
| 198 |
+
return p
|
| 199 |
+
return None
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def load_dataset(path: str) -> pd.DataFrame:
    """Load a .parquet or .csv file into a DataFrame.

    Raises ValueError for any other extension (comparison is case-insensitive).
    """
    lowered = path.lower()
    if lowered.endswith(".parquet"):
        return pd.read_parquet(path)
    if lowered.endswith(".csv"):
        return pd.read_csv(path)
    raise ValueError("Unsupported file type")
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def ensure_datetime(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Coerce *col* to datetime (unparseable values become NaT) and drop NaT rows.

    Works on a copy; the input frame is never mutated.
    """
    out = df.copy()
    out[col] = pd.to_datetime(out[col], errors="coerce")
    return out.dropna(subset=[col])
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def add_time_grains(df: pd.DataFrame, date_col: str) -> pd.DataFrame:
    """Return a copy of *df* with helper time-grain columns derived from *date_col*.

    Adds:
      - month: "YYYY-MM" period string
      - week: ISO "YYYY-Www" label (zero-padded week number)
      - calendar_month: 1-12 (used for the seasonal baseline)
      - year: calendar year
    """
    out = df.copy()
    dates = out[date_col]
    iso = dates.dt.isocalendar()
    out["month"] = dates.dt.to_period("M").astype(str)
    out["week"] = iso["year"].astype(str) + "-W" + iso["week"].astype(str).str.zfill(2)
    out["calendar_month"] = dates.dt.month
    out["year"] = dates.dt.year
    return out
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def pct(n, d):
    """Percentage n/d rounded to one decimal place; 0.0 when the denominator is zero."""
    if d == 0:
        return 0.0
    return round(n * 100.0 / d, 1)
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def _split_listlike(x):
|
| 232 |
+
if pd.isna(x):
|
| 233 |
+
return []
|
| 234 |
+
if isinstance(x, list):
|
| 235 |
+
parts = [str(i) for i in x]
|
| 236 |
+
elif isinstance(x, str):
|
| 237 |
+
parts = [p.strip() for p in _SPLIT_RE.split(x) if p.strip()]
|
| 238 |
+
else:
|
| 239 |
+
parts = [str(x).strip()]
|
| 240 |
+
return [p for p in parts if p]
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def safe_col(df, col):
    """True when *col* exists in *df* and holds at least one non-null value."""
    if col not in df.columns:
        return False
    return df[col].notna().any()
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def tight_layout(fig, height=360):
    """Apply the dashboard's compact white styling to a figure and return it.

    Sets the shared template, fixed height, tight 8px margins, and light
    grid/axis lines on both axes. Mutates *fig* in place (presumably a Plotly
    figure, given the update_layout/update_xaxes API).
    """
    fig.update_layout(
        template=PLOTLY_TEMPLATE,
        height=height,
        margin=dict(l=8, r=8, t=8, b=8),
        plot_bgcolor="white",
        paper_bgcolor="white",
    )
    axis_style = dict(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0")
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    return fig
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def build_full_period_order(start_date, end_date, grain: str):
    """Ordered list of period labels spanning [start_date, end_date].

    grain == "month" yields "YYYY-MM" strings; anything else yields ISO week
    labels "YYYY-Www" anchored on Mondays.
    """
    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)
    if grain == "month":
        return [str(p) for p in pd.period_range(start=start, end=end, freq="M")]
    mondays = pd.date_range(start=start, end=end, freq="W-MON")
    if len(mondays) == 0:
        # Range contains no Monday: fall back to the first day so at least
        # one week label is produced.
        mondays = pd.date_range(start=start, end=end, freq="D")[:1]
    iso = mondays.isocalendar()
    labels = iso["year"].astype(str) + "-W" + iso["week"].astype(str).str.zfill(2)
    return labels.tolist()
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def explode_terms(df: pd.DataFrame, col: str, stopwords=None, min_len=3):
    """Explode a list-like text column into one normalized term per row.

    Each term is lowercased, stripped of characters outside [a-z0-9 space -],
    and dropped if shorter than *min_len* or present in *stopwords*. A constant
    ``mentions`` column (1 per row) is added for downstream aggregation.
    Returns an empty DataFrame when *col* is absent.
    """
    if col not in df.columns:
        return pd.DataFrame()
    blocked = stopwords or set()
    exploded = df.copy()
    exploded["_term"] = exploded[col].apply(_split_listlike)
    exploded = exploded.explode("_term").dropna(subset=["_term"])
    cleaned = (
        exploded["_term"].astype(str).str.strip().str.lower()
        .str.replace(r"[^a-z0-9\s\-]", "", regex=True)
        .str.strip()
    )
    exploded["term"] = cleaned
    exploded = exploded[exploded["term"].str.len() >= min_len]
    exploded = exploded[~exploded["term"].isin(blocked)]
    exploded["mentions"] = 1
    return exploded.drop(columns=["_term"], errors="ignore")
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
# ---------- TF-IDF (contrastive) ----------
|
| 289 |
+
def _clean_text_for_tfidf(t: str) -> str:
|
| 290 |
+
t = str(t).lower()
|
| 291 |
+
t = re.sub(r'http\S+|www\.\S+', '', t)
|
| 292 |
+
t = re.sub(r"[^a-z0-9\s\-]", " ", t)
|
| 293 |
+
t = re.sub(r"\s+", " ", t).strip()
|
| 294 |
+
return t
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def _term_is_bad(term: str) -> bool:
    """Heuristic quality filter for contrastive TF-IDF phrases.

    Returns True (reject) when the phrase is empty, in one of the module-level
    block lists, not a 2-3 word n-gram, dominated by stopwords/short tokens,
    or contains a boilerplate legislative word.
    """
    term = term.strip().lower()
    if not term:
        return True
    if term in GENERIC_PHRASES or term in TFIDF_BLOCK_PHRASES:
        return True
    toks = term.split()
    n = len(toks)
    if n < 2 or n > 3:
        return True
    if any(tok in TFIDF_BLOCK_WORDS for tok in toks):
        return True
    if all(tok in STOPWORDS or len(tok) < 3 for tok in toks):
        return True
    generic_words = {"relating", "regarding", "provide", "provides",
                     "amend", "amends", "section", "subsection"}
    if any(tok in generic_words for tok in toks):
        return True
    # Reject phrases that are mostly stopwords.
    if n > 1 and sum(tok in STOPWORDS for tok in toks) / n > 0.5:
        return True
    if n == 2 and any(tok in ["state", "bill", "act", "law"] for tok in toks):
        return True
    return False
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
def _bill_docs(df_slice: pd.DataFrame, bill_id_col: str, text_col: str):
    """Build one cleaned text document per bill id.

    Rows with null id/text are dropped, text is normalized via
    _clean_text_for_tfidf, near-empty snippets (<=15 chars) are discarded,
    and remaining rows are concatenated per bill. Returns a list of
    non-empty document strings (empty list when nothing survives).
    """
    subset = df_slice[[bill_id_col, text_col]].dropna().copy()
    subset[text_col] = subset[text_col].astype(str).map(_clean_text_for_tfidf)
    subset = subset[subset[text_col].str.len() > 15]
    if subset.empty:
        return []
    joined = subset.groupby(bill_id_col)[text_col].apply(lambda s: " ".join(s.tolist()))
    return [doc for doc in joined.tolist() if doc.strip()]
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def build_contrastive_tfidf(df_cat: pd.DataFrame, df_rest: pd.DataFrame, bill_id_col: str, text_col: str, top_k=15):
    """Find 2-3 word phrases that distinguish a category's bills from the rest.

    Fits one TF-IDF model over per-bill documents from both slices, then ranks
    terms by (mean TF-IDF in category) - (mean TF-IDF in rest). Returns up to
    *top_k* (phrase, contrast_score) tuples, or [] when sklearn is unavailable,
    either slice has fewer than 2 documents, or vectorization fails.
    """
    # Graceful no-op when sklearn failed to import at module load.
    if not _HAS_SK:
        return []
    docs_cat = _bill_docs(df_cat, bill_id_col, text_col)
    docs_rest = _bill_docs(df_rest, bill_id_col, text_col)
    # Need at least 2 docs on each side for a meaningful contrast.
    if len(docs_cat) < 2 or len(docs_rest) < 2:
        return []
    vec = TfidfVectorizer(
        stop_words=list(STOPWORDS),
        ngram_range=(2, 3),          # bigrams and trigrams only
        min_df=2,                    # drop one-off phrases
        max_df=0.35,                 # drop near-ubiquitous phrases
        sublinear_tf=True,
        norm="l2",
        token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z\-]{3,}\b",  # words of 4+ letters
        max_features=2000
    )
    all_docs = docs_cat + docs_rest
    try:
        X = vec.fit_transform(all_docs)
    except Exception:
        # e.g. empty vocabulary after pruning — treat as "no phrases".
        return []
    terms = np.array(vec.get_feature_names_out())
    if len(terms) == 0:
        return []
    # First len(docs_cat) rows belong to the category; the rest to the contrast set.
    X_cat = X[:len(docs_cat)]
    X_rest = X[len(docs_cat):]
    mean_cat = np.asarray(X_cat.mean(axis=0)).ravel()
    mean_rest = np.asarray(X_rest.mean(axis=0)).ravel()
    contrast = mean_cat - mean_rest
    idx = np.argsort(contrast)[::-1]  # highest contrast first
    out = []
    seen_sets = []        # word sets of accepted phrases, for redundancy checks
    min_contrast = 0.01   # ignore phrases barely more frequent in-category
    for i in idx:
        if len(out) >= top_k:
            break
        if contrast[i] <= min_contrast:
            # Sorted descending, so everything after this is weaker too.
            break
        t = terms[i]
        if _term_is_bad(t):
            continue
        wset = set(t.split())
        redundant = False
        # Skip phrases sharing all-but-one word with an already accepted one.
        for s in seen_sets:
            if len(wset & s) >= max(2, len(wset) - 1):
                redundant = True
                break
        if redundant:
            continue
        seen_sets.append(wset)
        out.append((t, round(float(contrast[i]), 4)))
    return out
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
# ---------- Direction of change ----------
|
| 390 |
+
def classify_direction(sdf: pd.DataFrame, period_col: str, period_order: list, bill_col: str):
    """Classify a category's trend as ("Rising"|"Declining"|"Stable", slope).

    Counts unique bills per period (missing periods count as 0), fits a
    degree-1 least-squares line, and labels the trend using a +/-0.10
    bills-per-period slope threshold. Empty/degenerate input is "Stable".
    """
    if sdf.empty:
        return ("Stable", 0.0)
    counts = (
        sdf.groupby(period_col)[bill_col]
        .nunique()
        .reindex(period_order, fill_value=0)
        .to_numpy(dtype=float)
    )
    if len(counts) < 2 or counts.sum() == 0:
        return ("Stable", 0.0)
    positions = np.arange(len(counts), dtype=float)
    slope = float(np.polyfit(positions, counts, 1)[0])
    threshold = 0.10
    if slope > threshold:
        return ("Rising", slope)
    if slope < -threshold:
        return ("Declining", slope)
    return ("Stable", slope)
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
def short_list(items, n=3):
    """Join up to *n* truthy items with commas; "-" when none remain,
    with a "..." suffix when items were truncated."""
    kept = [item for item in items if item]
    if not kept:
        return "-"
    suffix = "..." if len(kept) > n else ""
    return ", ".join(kept[:n]) + suffix
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
# =====================================================
|
| 416 |
+
# STEP 1-8: Category Share Baseline & Z-Score Engine
|
| 417 |
+
# =====================================================
|
| 418 |
+
|
| 419 |
+
def compute_monthly_share_series(df_all: pd.DataFrame, category: str,
                                 cat_col: str, bill_id_col: str, date_col: str) -> pd.DataFrame:
    """
    STEP 1: Compute monthly category share = bills_in_cat / total_bills
    (as a percentage) for every month present in *df_all*.

    Parameters
    ----------
    df_all : full dataset; *date_col* must already be datetime64.
    category : category value to measure (compared as str on both sides).
    cat_col, bill_id_col, date_col : column names in *df_all*.

    Returns a DataFrame sorted by month with columns
    [_ym, total_bills, cat_bills, share, year, calendar_month, period_str].

    Fix vs. original: the `_year` and `_cal_month` columns were computed on
    the input copy but never used (year/calendar_month are derived from `_ym`
    on the merged result), so that dead code is removed.
    """
    df_all = df_all.copy()
    df_all["_ym"] = df_all[date_col].dt.to_period("M")

    total_by_month = (
        df_all.groupby("_ym")[bill_id_col].nunique()
        .reset_index(name="total_bills")
    )
    cat_df = df_all[df_all[cat_col].astype(str) == str(category)]
    cat_by_month = (
        cat_df.groupby("_ym")[bill_id_col].nunique()
        .reset_index(name="cat_bills")
    )
    # Left join keeps months with zero category bills (filled with 0).
    merged = pd.merge(total_by_month, cat_by_month, on="_ym", how="left").fillna(0)
    # Guard against divide-by-zero months (no bills at all that month).
    merged["share"] = (merged["cat_bills"] / merged["total_bills"] * 100.0).replace([np.inf, -np.inf], 0).fillna(0)
    merged["year"] = merged["_ym"].dt.year
    merged["calendar_month"] = merged["_ym"].dt.month
    merged["period_str"] = merged["_ym"].astype(str)
    return merged.sort_values("_ym").reset_index(drop=True)
|
| 446 |
+
|
| 447 |
+
|
| 448 |
+
def compute_seasonal_baseline(share_series: pd.DataFrame, lookback_years: int = 5) -> pd.DataFrame:
    """
    STEP 2: For each calendar month, compute the mean and std of share over
    the most recent *lookback_years* years (relative to the latest year in
    the series). Months observed only once get std 0.0 instead of NaN.

    Returns a DataFrame with columns [calendar_month, mean_share, std_share].
    """
    latest_year = share_series["year"].max()
    recent = share_series[share_series["year"] > latest_year - lookback_years].copy()
    baseline = (
        recent.groupby("calendar_month")["share"]
        .agg(mean_share="mean", std_share="std")
        .reset_index()
    )
    # std is NaN when a calendar month appears exactly once in the window.
    baseline["std_share"] = baseline["std_share"].fillna(0.0)
    return baseline
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
def compute_regression_on_share(share_series: pd.DataFrame,
                                total_monthly: pd.DataFrame) -> dict:
    """Fit linear trends to the category share and the overall volume.

    STEP 4 & 5: an OLS line is fit to the category-share series and a
    second line to the total-bills series. Growth is flagged significant
    when the share trend has p < 0.05 AND its slope exceeds the total
    slope.

    NOTE(review): cat_slope is in share-percentage-points per period while
    total_slope is in bills per period; the comparison mixes units by the
    pipeline's original design — confirm intent before changing.

    Returns
    -------
    dict with keys: cat_slope, cat_pvalue, cat_intercept, total_slope,
    significant_growth.
    """
    out = {"cat_slope": 0.0, "cat_pvalue": 1.0, "cat_intercept": 0.0,
           "total_slope": 0.0, "significant_growth": False}

    share_vals = share_series["share"].values
    if len(share_vals) < 3:
        # Too few points for a meaningful fit; keep neutral defaults.
        return out

    t = np.arange(len(share_vals), dtype=float)
    try:
        fit = stats.linregress(t, share_vals)
        out["cat_slope"] = float(fit.slope)
        out["cat_intercept"] = float(fit.intercept)
        out["cat_pvalue"] = float(fit.pvalue)
    except Exception:
        pass  # numerical failure -> keep neutral defaults

    # Trend of the overall bill volume, used as the comparison slope.
    if total_monthly is not None and len(total_monthly) >= 3:
        try:
            totals = total_monthly["total_bills"].values.astype(float)
            tt = np.arange(len(totals), dtype=float)
            out["total_slope"] = float(stats.linregress(tt, totals).slope)
        except Exception:
            pass

    # STEP 5: significant growth if p<0.05 AND cat slope > total slope.
    out["significant_growth"] = (
        out["cat_pvalue"] < 0.05 and out["cat_slope"] > out["total_slope"]
    )
    return out
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
def compute_zscore_and_residuals(share_series: pd.DataFrame,
                                 baseline: pd.DataFrame,
                                 reg_stats: dict) -> pd.DataFrame:
    """Attach Z-scores, regression residuals, and anomaly flags.

    STEP 3 & 6 & 7:
    - Z-score of each period's share vs its calendar-month baseline
      (0.0 wherever the baseline std is zero).
    - Regression-predicted share, residual, and a ±2-residual-SD band.
    - Anomaly label ("High"/"Low"/"Normal") from the seasonal ±2 SD rule.

    Parameters
    ----------
    share_series : per-period shares incl. ["calendar_month", "share"].
    baseline : per-calendar-month ["mean_share", "std_share"].
    reg_stats : dict providing "cat_intercept" and "cat_slope".
    """
    merged = share_series.merge(baseline, on="calendar_month", how="left")

    # STEP 3: standardized deviation from the seasonal mean.
    deviation = merged["share"] - merged["mean_share"]
    merged["z_score"] = np.where(
        merged["std_share"] > 0,
        deviation / merged["std_share"],
        0.0,
    )

    # STEP 6: fitted trend line and detrended residuals.
    idx = np.arange(len(merged), dtype=float)
    merged["predicted_share"] = reg_stats["cat_intercept"] + reg_stats["cat_slope"] * idx
    merged["residual"] = merged["share"] - merged["predicted_share"]

    # Residual spread defines the shaded band around the trend line.
    spread = merged["residual"].std() if len(merged) > 2 else 0.0
    merged["upper_2sd"] = merged["predicted_share"] + 2 * spread
    merged["lower_2sd"] = merged["predicted_share"] - 2 * spread

    # STEP 7: seasonal-baseline thresholds drive the categorical flag.
    merged["upper_thresh"] = merged["mean_share"] + 2 * merged["std_share"]
    merged["lower_thresh"] = merged["mean_share"] - 2 * merged["std_share"]
    too_high = merged["share"] > merged["upper_thresh"]
    too_low = merged["share"] < merged["lower_thresh"]
    merged["anomaly"] = np.where(too_high, "High", np.where(too_low, "Low", "Normal"))
    return merged
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
|
| 533 |
+
# =====================================================
# STEP 9: Subcategory Momentum
# - Short-term: iterative % change over filtered period
# - Long-term: 5-year monthly regression slope from df_full
# =====================================================

def compute_subcategory_momentum(df_cat_filtered: pd.DataFrame,
                                 df_full_cat: pd.DataFrame,
                                 cat_sub_col: str,
                                 bill_id_col: str,
                                 period_col: str,
                                 period_order: list) -> pd.DataFrame:
    """
    STEP 9: combine two momentum signals per subcategory.

    - Short-term: average period-over-period percent change of unique-bill
      counts over the user-selected filtered window.
    - Long-term: slope of a linear fit over monthly unique-bill counts from
      the full dataset (the primary bar value, "momentum strength").

    Returns a DataFrame [Subcategory, Slope, AvgPctChange] sorted by Slope
    ascending; empty DataFrame when the subcategory column is unusable or
    the window has fewer than two periods.

    NOTE(review): the long-term branch hard-codes the "status_date" column
    on df_full_cat instead of taking a date-column parameter — confirm that
    callers always pass frames using that column name.
    """
    if not safe_col(df_cat_filtered, cat_sub_col):
        return pd.DataFrame()

    # --- Short-term: iterative pct change over filtered period ---
    df_m = df_cat_filtered.copy()
    # NOTE(review): astype(str) turns NaN into the literal "nan", so the
    # dropna() calls below never remove missing subcategories — confirm
    # whether "nan" rows are intended to participate.
    df_m[cat_sub_col] = df_m[cat_sub_col].astype(str).str.strip()
    sub_period = (
        df_m.dropna(subset=[cat_sub_col])
        .groupby([period_col, cat_sub_col])[bill_id_col].nunique()
        .reset_index(name="bills")
    )
    subs = sorted(df_m[cat_sub_col].dropna().unique().tolist())
    if len(period_order) < 2 or not subs or sub_period.empty:
        return pd.DataFrame()

    # Subcategory x period matrix; missing combinations count as 0 bills.
    panel_short = (
        sub_period.pivot_table(index=cat_sub_col, columns=period_col, values="bills", aggfunc="sum")
        .reindex(index=subs, columns=period_order, fill_value=0)
    )

    # Average of the consecutive percent changes; a 0 base contributes 0.
    short_term = {}
    for sub in panel_short.index:
        y = panel_short.loc[sub].to_numpy(dtype=float)
        pct_changes = []
        for i in range(1, len(y)):
            prev, curr = y[i - 1], y[i]
            pct_changes.append((curr - prev) / prev * 100.0 if prev > 0 else 0.0)
        short_term[sub] = float(np.mean(pct_changes)) if pct_changes else 0.0

    # --- Long-term: 5-year monthly regression slope from full dataset ---
    long_term = {}
    if df_full_cat is not None and not df_full_cat.empty and cat_sub_col in df_full_cat.columns:
        df_fl = df_full_cat.copy()
        df_fl[cat_sub_col] = df_fl[cat_sub_col].astype(str).str.strip()
        df_fl["_ym"] = df_fl["status_date"].dt.to_period("M")
        full_sub_monthly = (
            df_fl.dropna(subset=[cat_sub_col])
            .groupby(["_ym", cat_sub_col])[bill_id_col].nunique()
            .reset_index(name="bills")
        )
        for sub in subs:
            sub_ts = (
                full_sub_monthly[full_sub_monthly[cat_sub_col] == sub]
                .sort_values("_ym")
            )
            y_full = sub_ts["bills"].to_numpy(dtype=float)
            if len(y_full) >= 3:
                # Degree-1 polyfit: first coefficient is the linear slope.
                x_full = np.arange(len(y_full), dtype=float)
                try:
                    slope_5yr = float(np.polyfit(x_full, y_full, 1)[0])
                except Exception:
                    slope_5yr = 0.0
            else:
                slope_5yr = 0.0
            long_term[sub] = slope_5yr

    # Assemble one row per subcategory; subs missing a signal get 0.0.
    mom_rows = []
    for sub in subs:
        slope_5yr = long_term.get(sub, 0.0)
        avg_pct_chg = short_term.get(sub, 0.0)
        mom_rows.append((sub, slope_5yr, avg_pct_chg))

    return (
        pd.DataFrame(mom_rows, columns=["Subcategory", "Slope", "AvgPctChange"])
        .sort_values("Slope", ascending=True)
        .reset_index(drop=True)
    )
|
| 619 |
+
|
| 620 |
+
|
| 621 |
+
# =====================================================
# STEP 8: Category Share chart with baseline band
# =====================================================

def plot_category_share_with_baseline(analysis_df: pd.DataFrame,
                                      period_order_filter: list,
                                      significant_growth: bool) -> go.Figure:
    """
    STEP 8: build the category-share figure: observed share, seasonal
    baseline, regression trend, ±2 SD band, and anomaly dots
    (red = high, blue = low). Only periods whose ``period_str`` appears
    in ``period_order_filter`` are drawn; an empty selection yields an
    empty Figure.
    """
    plot_df = analysis_df[analysis_df["period_str"].isin(period_order_filter)].copy()
    if plot_df.empty:
        return go.Figure()

    fig = go.Figure()

    # Shaded ±2 SD band (regression-based): a closed polygon traced
    # forward along the upper bound and back along the lower bound.
    fig.add_trace(go.Scatter(
        x=list(plot_df["period_str"]) + list(plot_df["period_str"])[::-1],
        y=list(plot_df["upper_2sd"]) + list(plot_df["lower_2sd"])[::-1],
        fill="toself",
        fillcolor="rgba(27,75,90,0.10)",
        line=dict(color="rgba(255,255,255,0)"),
        hoverinfo="skip",
        name="±2 SD Band",
        showlegend=True,
    ))

    # Seasonal mean baseline (dashed)
    fig.add_trace(go.Scatter(
        x=plot_df["period_str"],
        y=plot_df["mean_share"],
        mode="lines",
        name="Seasonal Baseline (5yr mean)",
        line=dict(color="#90B4BE", dash="dash", width=2),
        hovertemplate="Seasonal Mean: %{y:.2f}%<extra></extra>",
    ))

    # Regression predicted line
    fig.add_trace(go.Scatter(
        x=plot_df["period_str"],
        y=plot_df["predicted_share"],
        mode="lines",
        name="Regression Trend",
        line=dict(color=C_CEDAR_PLANK, dash="dot", width=1.5),
        hovertemplate="Predicted: %{y:.2f}%<extra></extra>",
    ))

    # Observed share (solid dark blue)
    fig.add_trace(go.Scatter(
        x=plot_df["period_str"],
        y=plot_df["share"],
        mode="lines+markers",
        name="Observed Share",
        line=dict(color=C_TRAPPED_DARKNESS, width=3),
        marker=dict(color=C_TRAPPED_DARKNESS, size=6),
        hovertemplate="<b>%{x}</b><br>Share: %{y:.2f}%<extra></extra>",
    ))

    # Anomaly dots — High = red, Low = blue
    high_anom = plot_df[plot_df["anomaly"] == "High"]
    low_anom = plot_df[plot_df["anomaly"] == "Low"]

    if not high_anom.empty:
        fig.add_trace(go.Scatter(
            x=high_anom["period_str"],
            y=high_anom["share"],
            mode="markers",
            name="High Anomaly (>+2 SD)",
            marker=dict(color=C_PUMPING_SPICE, size=12, symbol="circle",
                        line=dict(color="white", width=1.5)),
            # String + Series concatenation yields one template per point,
            # embedding each point's rounded Z-score in the hover text.
            hovertemplate="<b>%{x}</b><br>HIGH anomaly: %{y:.2f}%<br>Z: " +
                          high_anom["z_score"].round(2).astype(str) + "<extra></extra>",
        ))

    if not low_anom.empty:
        fig.add_trace(go.Scatter(
            x=low_anom["period_str"],
            y=low_anom["share"],
            mode="markers",
            name="Low Anomaly (<-2 SD)",
            marker=dict(color=C_LAZURITE_BLUE, size=12, symbol="circle",
                        line=dict(color="white", width=1.5)),
            hovertemplate="<b>%{x}</b><br>LOW anomaly: %{y:.2f}%<br>Z: " +
                          low_anom["z_score"].round(2).astype(str) + "<extra></extra>",
        ))

    # Star badge only when STEP 5 flagged significant structural growth.
    title_suffix = " ★ Significant Structural Growth" if significant_growth else ""
    fig.update_layout(
        template=PLOTLY_TEMPLATE,
        height=420,
        margin=dict(l=8, r=8, t=28, b=8),
        hovermode="x unified",
        yaxis_title="Share (%)",
        xaxis_title="",
        plot_bgcolor="white",
        paper_bgcolor="white",
        legend=dict(orientation="h", yanchor="bottom", y=-0.30, xanchor="center", x=0.5, font=dict(size=10)),
        title=dict(text=title_suffix, font=dict(size=11, color=C_PUMPING_SPICE), x=0.5) if title_suffix else {},
    )
    fig.update_xaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0")
    # Keep headroom above the tallest point; never collapse below 5%.
    fig.update_yaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0",
                     range=[0, max(5, float(plot_df["share"].max()) * 1.25)])
    return fig
|
| 727 |
+
|
| 728 |
+
|
| 729 |
+
# -----------------------------
# Load data
# -----------------------------
# Prefer a bundled dataset; fall back to a user upload when none is found.
default_path = _find_first_existing(DEFAULT_CANDIDATES)

if default_path is None:
    uploaded = st.file_uploader("Upload Dataset", type=["parquet", "csv"])
else:
    uploaded = None

data_path = None
data_sig = "default"

if uploaded:
    # Persist the upload so the cached loader can read it from disk.
    tmp_path = f"/tmp/{uploaded.name}"
    with open(tmp_path, "wb") as f:
        f.write(uploaded.getbuffer())
    data_path = tmp_path
    data_sig = f"{uploaded.name}-{uploaded.size}"
elif default_path:
    data_path = default_path
    try:
        # mtime in the signature busts the cache when the file changes.
        data_sig = f"{default_path}-{os.path.getmtime(default_path)}"
    except Exception:
        data_sig = default_path


@st.cache_data(show_spinner=True)
def _load_cached(path: str, signature: str) -> pd.DataFrame:
    # `signature` is unused in the body on purpose: it is part of the
    # cache key so a new upload / modified file triggers a reload.
    return load_dataset(path)


df_raw = _load_cached(data_path, data_sig) if data_path else None
if df_raw is None or df_raw.empty:
    st.warning("Upload a dataset to begin.")
    st.stop()
|
| 765 |
+
|
| 766 |
+
# -----------------------------
# Columns
# -----------------------------
# Canonical column names expected in the loaded dataset.
DATE_COL = "status_date"
BILL_ID_COL = "bill_id"
CHAMBER_COL = "chamber"
CAT_MAIN = "category_main_label"
CAT_SUB = "category_sub_label"
INC_COL = "increasing_aspects_standardized"
DEC_COL = "decreasing_aspects_standardized"
BENEF_COL = "intended_beneficiaries_standardized"
IMPACT_SCORE_COL = "impact_rating_score"

# Display label -> raw text column used for keyword extraction.
KW_SOURCES = {
    "Motivation": "motivation_raw",
    "Intent": "intent_raw",
    "Legislative Strategy": "legislative_strategy_raw",
}

# Hard requirements: the app cannot render without these three columns.
required = [DATE_COL, BILL_ID_COL, CAT_MAIN]
missing = [c for c in required if c not in df_raw.columns]
if missing:
    st.error(f"Missing required columns: {missing}")
    st.stop()
|
| 790 |
+
|
| 791 |
+
# -----------------------------
# Engineer chamber from bill_number
# e.g. "HB1234" -> "HB", "SB5678" -> "SB"
# -----------------------------
BILL_NUM_COL = "bill_number"
df_raw = df_raw.copy()
# Only derive the chamber when the column is absent or entirely blank/NaN;
# an existing, partially-filled column is left untouched.
if CHAMBER_COL not in df_raw.columns or df_raw[CHAMBER_COL].isna().all() or (df_raw[CHAMBER_COL].astype(str).str.strip() == "").all():
    if BILL_NUM_COL in df_raw.columns:
        # Leading alphabetic prefix of the bill number, uppercased and
        # truncated to two characters (e.g. "HB", "SB").
        df_raw[CHAMBER_COL] = (
            df_raw[BILL_NUM_COL]
            .astype(str)
            .str.strip()
            .str.extract(r"^([A-Za-z]+)", expand=False)
            .str.upper()
            .str[:2]
        )
    else:
        df_raw[CHAMBER_COL] = "Unknown"
|
| 809 |
+
|
| 810 |
+
# Full dataset for baseline (all years) — use pd.Timestamp for reliable datetime comparison
df_full = ensure_datetime(df_raw, DATE_COL)
df_full = add_time_grains(df_full, DATE_COL)
# Wide window used for seasonal baselines and long-term regressions.
df_full = df_full[
    (df_full[DATE_COL] >= pd.Timestamp(DATA_START_FULL)) &
    (df_full[DATE_COL] <= pd.Timestamp(DATA_END_FULL))
].copy()

# Filtered display dataset — strictly clamped to display range
df = df_full[
    (df_full[DATE_COL] >= pd.Timestamp(DATA_START)) &
    (df_full[DATE_COL] <= pd.Timestamp(DATA_END))
].copy()
# Rows without a main category cannot be charted anywhere below.
df = df.dropna(subset=[CAT_MAIN]).copy()

if df.empty:
    st.warning("No data in the display range (2025-01-08 to 2026-02-06).")
    st.stop()
|
| 828 |
+
|
| 829 |
+
# -----------------------------
# Header
# -----------------------------
# Static page banner; classes are styled by the app's injected CSS.
st.markdown(
    """
    <div class="header-wrap">
      <div class="header-title">Legislative Trends Dashboard</div>
      <div class="header-sub">Category share • Subcategory drivers • Policy direction • Subcategory momentum • Beneficiary × chamber distribution</div>
    </div>
    """,
    unsafe_allow_html=True,
)
|
| 841 |
+
|
| 842 |
+
# -----------------------------
# Filters
# -----------------------------
# Counter used to remount every widget (fresh keys) when CLEAR is pressed.
if "clear_filters" not in st.session_state:
    st.session_state.clear_filters = 0

min_date = DATA_START
max_date = DATA_END
cats_all = sorted(df[CAT_MAIN].dropna().astype(str).unique().tolist())

st.markdown("<div class='filter-row'>", unsafe_allow_html=True)
f1, f2, f3, f4, f5, f6 = st.columns([1.6, 0.9, 1.2, 2.4, 1.3, 0.7])

with f1:
    date_range = st.date_input(
        "Date Range",
        value=(min_date, max_date),
        min_value=min_date,
        max_value=max_date,
        key=f"date_{st.session_state.clear_filters}",
    )

# st.date_input returns a single date while the user is mid-selection;
# treat that as a one-day range so downstream comparisons stay valid.
if isinstance(date_range, tuple) and len(date_range) == 2:
    start_date, end_date = date_range
else:
    start_date = date_range
    end_date = date_range

with f2:
    time_grain = st.radio(
        "Time Grain",
        ["month", "week"],
        horizontal=True,
        key=f"tg_{st.session_state.clear_filters}",
    )
with f3:
    chambers_all = ["All"] + sorted(df[CHAMBER_COL].dropna().astype(str).unique().tolist())
    chambers = st.multiselect("Chamber", chambers_all, default=["All"], key=f"ch_{st.session_state.clear_filters}")
with f4:
    selected_cat = st.selectbox(
        "Category",
        cats_all,
        # NOTE(review): `0 if cats_all else 0` is always 0 — likely meant
        # to guard the empty-list case differently; harmless as written.
        index=0 if cats_all else 0,
        key=f"cat_{st.session_state.clear_filters}",
    )
with f5:
    sub_time = st.selectbox(
        "Subcategory Window",
        ["Overall", "Last 30 days", "Last 60 days"],
        key=f"subwin_{st.session_state.clear_filters}",
    )
with f6:
    clear = st.button("CLEAR", use_container_width=True)

st.markdown("</div>", unsafe_allow_html=True)
# CLEAR: drop cached data, bump the key counter (resets widgets), rerun.
if clear:
    st.cache_data.clear()
    st.session_state.clear_filters += 1
    st.rerun()
|
| 901 |
+
|
| 902 |
+
# -----------------------------
# Apply filters
# -----------------------------
df_f = df.copy()
# Compare on .dt.date because the widgets return datetime.date objects.
df_f = df_f[(df_f[DATE_COL].dt.date >= start_date) & (df_f[DATE_COL].dt.date <= end_date)]
# "All" in the multiselect disables chamber filtering entirely.
if "All" not in chambers:
    df_f = df_f[df_f[CHAMBER_COL].astype(str).isin([str(x) for x in chambers])]
df_f = df_f.dropna(subset=[CAT_MAIN])

if df_f.empty:
    st.warning("No rows match your filters.")
    st.stop()

# The chosen grain ("month"/"week") doubles as the period column name.
tg = time_grain
period_col = tg
period_order = build_full_period_order(start_date, end_date, tg)
|
| 918 |
+
|
| 919 |
+
# -----------------------------
# KPI row
# -----------------------------
total_bills = int(df_f[BILL_ID_COL].nunique())
num_main = int(df_f[CAT_MAIN].nunique())

# "-" is rendered when the impact-score column is unavailable.
high_impact_bills = "-"
impact_tooltip = "High Impact = bills in the top quartile of impact_rating_score, when available."
if safe_col(df_f, IMPACT_SCORE_COL):
    tmp = df_f[[BILL_ID_COL, IMPACT_SCORE_COL]].dropna()
    if not tmp.empty:
        # One score per bill (its maximum), then count the top quartile.
        bill_score = tmp.groupby(BILL_ID_COL)[IMPACT_SCORE_COL].max()
        q75 = float(bill_score.quantile(0.75))
        high_impact_bills = int((bill_score >= q75).sum())
    else:
        high_impact_bills = 0

# Chamber split: default to HB/SB labels, otherwise fall back to the
# first one or two chamber values present in the filtered data.
ch_bill_counts = df_f.groupby(df_f[CHAMBER_COL].astype(str))[BILL_ID_COL].nunique()
total_ch_bills = int(ch_bill_counts.sum()) if len(ch_bill_counts) else 0
house_label = "HB" if "HB" in ch_bill_counts.index else (ch_bill_counts.index[0] if len(ch_bill_counts) else "HB")
senate_label = "SB" if "SB" in ch_bill_counts.index else (ch_bill_counts.index[1] if len(ch_bill_counts) > 1 else "SB")
house_pct = pct(int(ch_bill_counts.get(house_label, 0)), total_ch_bills) if total_ch_bills else 0.0
senate_pct = pct(int(ch_bill_counts.get(senate_label, 0)), total_ch_bills) if total_ch_bills else 0.0

st.markdown(
    f"""
<div class="kpi-grid">
  <div class="kpi-card">
    <div class="kpi-label">Bills</div>
    <div class="kpi-value">{total_bills:,}</div>
  </div>
  <div class="kpi-card" title="{impact_tooltip}">
    <div class="kpi-label">High Impact Bills</div>
    <div class="kpi-value">{high_impact_bills if high_impact_bills != "-" else "-"}</div>
  </div>
  <div class="kpi-card">
    <div class="kpi-label">Categories</div>
    <div class="kpi-value">{num_main:,}</div>
  </div>
  <div class="kpi-card">
    <div class="kpi-label">{str(house_label)} Bills %</div>
    <div class="kpi-value">{house_pct:.1f}%</div>
  </div>
  <div class="kpi-card">
    <div class="kpi-label">{str(senate_label)} Bills %</div>
    <div class="kpi-value">{senate_pct:.1f}%</div>
  </div>
</div>
""",
    unsafe_allow_html=True,
)

# =====================================================
# Manager Visual: Significant Category Shifts (Z-score)
# Left: ranked bar chart of categories beyond ±2σ
# Right: ranked interpretation table (directional)
# =====================================================

st.markdown("### Significant Category Shifts (vs Seasonal Baseline)")
|
| 978 |
+
|
| 979 |
+
def compute_all_category_monthly_shares(df_all: pd.DataFrame, cat_col: str, bill_id_col: str, date_col: str) -> pd.DataFrame:
    """Monthly share of unique bills for every category at once.

    share = (unique bills in category that month)
            / (unique bills total that month) * 100

    Returns
    -------
    DataFrame with columns
    [_ym, Category, cat_bills, total_bills, share, period_str, year,
    calendar_month], sorted chronologically.
    """
    frame = df_all.copy()
    frame["_ym"] = frame[date_col].dt.to_period("M")

    # Denominator: distinct bills per month across all categories.
    monthly_totals = (
        frame.groupby("_ym")[bill_id_col].nunique()
        .reset_index(name="total_bills")
    )

    # Numerator: distinct bills per (month, category).
    monthly_by_cat = (
        frame.dropna(subset=[cat_col])
        .groupby(["_ym", cat_col])[bill_id_col].nunique()
        .reset_index(name="cat_bills")
    )

    joined = monthly_by_cat.merge(monthly_totals, on="_ym", how="left")
    raw_share = joined["cat_bills"] / joined["total_bills"] * 100.0
    joined["share"] = raw_share.replace([np.inf, -np.inf], 0).fillna(0)
    joined["period_str"] = joined["_ym"].astype(str)

    # Join keys the seasonal-baseline step relies on.
    joined["year"] = joined["_ym"].dt.year
    joined["calendar_month"] = joined["_ym"].dt.month

    joined = joined.rename(columns={cat_col: "Category"})
    return joined.sort_values("_ym").reset_index(drop=True)
|
| 1009 |
+
|
| 1010 |
+
|
| 1011 |
+
def compute_category_seasonal_baseline_all(share_all: pd.DataFrame, lookback_years: int = 5) -> pd.DataFrame:
    """Seasonal baseline per (Category, calendar_month) over a trailing window.

    Computes mean/std of `share` for every category and calendar month
    using only the last `lookback_years` years. Missing `_ym` /
    `calendar_month` columns are reconstructed from `period_str` when
    possible, so callers can pass partially-prepared frames.

    Returns
    -------
    DataFrame [Category, calendar_month, mean_share, std_share];
    an empty frame with those columns when the input is unusable.
    """
    empty_result = pd.DataFrame(columns=["Category", "calendar_month", "mean_share", "std_share"])
    if share_all is None or share_all.empty:
        return empty_result

    work = share_all.copy()

    # Recover the monthly period key if it is missing.
    if "_ym" not in work.columns:
        if "period_str" not in work.columns:
            return empty_result  # no way to rebuild the monthly key
        work["_ym"] = pd.PeriodIndex(work["period_str"], freq="M")

    if "calendar_month" not in work.columns:
        work["calendar_month"] = work["_ym"].dt.month

    # Derive year from _ym so the lookback cutoff is always reliable.
    work["year"] = work["_ym"].dt.year

    newest = int(work["year"].max()) if not work.empty else 0
    recent = work[work["year"] > newest - lookback_years].copy()

    result = (
        recent.groupby(["Category", "calendar_month"])["share"]
        .agg(mean_share="mean", std_share="std")
        .reset_index()
    )
    # Singleton groups produce NaN std; treat as zero variability.
    result["std_share"] = result["std_share"].fillna(0.0)
    return result
|
| 1046 |
+
|
| 1047 |
+
|
| 1048 |
+
# --- Build monthly shares across FULL data for baseline ---
share_all = compute_all_category_monthly_shares(df_full, CAT_MAIN, BILL_ID_COL, DATE_COL)
baseline_all = compute_category_seasonal_baseline_all(share_all, lookback_years=5)

# --- Choose the "current" month for the selected time window (end_date month) ---
target_ym = pd.to_datetime(end_date).to_period("M")
target_period_str = str(target_ym)

current_month = share_all[share_all["_ym"] == target_ym].copy()
if current_month.empty:
    st.info("No monthly data available for the selected end date month to compute Z-scores.")
else:
    # Ensure calendar_month exists for the merge key
    if "calendar_month" not in current_month.columns:
        if "_ym" in current_month.columns:
            current_month["calendar_month"] = current_month["_ym"].dt.month
        elif "period_str" in current_month.columns:
            current_month["_ym"] = pd.PeriodIndex(current_month["period_str"], freq="M")
            current_month["calendar_month"] = current_month["_ym"].dt.month
        else:
            # Last resort: derive directly from the selected end date.
            current_month["calendar_month"] = int(pd.to_datetime(end_date).month)

    # Join baseline (Category x calendar_month) and compute Z-score
    current_month = current_month.merge(
        baseline_all,
        on=["Category", "calendar_month"],
        how="left"
    )

    # Z = (current share - seasonal mean) / seasonal std; 0 when std==0.
    current_month["z_score"] = np.where(
        current_month["std_share"] > 0,
        (current_month["share"] - current_month["mean_share"]) / current_month["std_share"],
        0.0
    )

    # Optional % change vs baseline mean (relative)
    current_month["pct_change_vs_mean"] = np.where(
        current_month["mean_share"] > 0,
        (current_month["share"] - current_month["mean_share"]) / current_month["mean_share"] * 100.0,
        0.0
    )

    # Only include categories beyond ±2σ threshold
    shifts_sig = current_month[current_month["z_score"].abs() >= 2.0].copy()

    # --- Fallback: if none exceed ±2σ, show the largest movers by |Z| (clearly labeled) ---
    show_fallback = False
    if shifts_sig.empty:
        show_fallback = True
        st.info(f"No categories exceeded ±2σ in {target_period_str}. Showing largest movers instead (not statistically significant).")
        shifts = current_month.copy()
        shifts["abs_z"] = shifts["z_score"].abs()
        shifts = shifts.sort_values("abs_z", ascending=False).head(12)
    else:
        shifts = shifts_sig.copy()
        shifts["abs_z"] = shifts["z_score"].abs()
        shifts = shifts.sort_values("abs_z", ascending=False)

    # Color-coded: Blue above baseline, Red below baseline
    shifts["Color"] = np.where(shifts["z_score"] >= 0, "Above baseline", "Below baseline")
    color_map_shift = {"Above baseline": C_POSITIVE, "Below baseline": C_NEGATIVE}

    left_col, right_col = st.columns([1.55, 1.0])

    with left_col:
        st.markdown("**What’s moving the most?**")
        fig_shift = px.bar(
            shifts.iloc[::-1],  # reverse so biggest appears at top in horizontal bar
            x="z_score",
            y="Category",
            orientation="h",
            color="Color",
            color_discrete_map=color_map_shift,
            template=PLOTLY_TEMPLATE,
            custom_data=["share", "mean_share", "std_share", "pct_change_vs_mean", "period_str"],
            labels={"z_score": "Z-score (σ from baseline)", "Category": ""},
        )
        fig_shift.update_traces(
            hovertemplate=(
                "<b>%{y}</b><br>"
                "Z-score: %{x:.2f}<br>"
                "Current share: %{customdata[0]:.2f}%<br>"
                "Baseline mean: %{customdata[1]:.2f}%<br>"
                "Baseline std: %{customdata[2]:.2f}<br>"
                "% change vs mean: %{customdata[3]:.1f}%<br>"
                "Month: %{customdata[4]}<extra></extra>"
            )
        )
        # Height scales with the number of bars so labels never overlap.
        fig_shift = tight_layout(fig_shift, height=max(420, len(shifts) * 28 + 180))
        fig_shift.update_yaxes(showgrid=False)
        fig_shift.update_xaxes(zeroline=True, zerolinecolor="#C9D3D6")
        st.plotly_chart(fig_shift, use_container_width=True, config={"displayModeBar": False})

        if show_fallback:
            st.caption("Largest movers shown because none crossed the ±2σ significance threshold.")

    with right_col:
        st.markdown("**Current Significant Shifts**" if not show_fallback else "**Largest Movers (Below ±2σ)**")

        # Directional ranking (NOT absolute):
        # - Top: largest positive deviations
        # - Bottom: largest negative deviations
        pos = shifts[shifts["z_score"] > 0].sort_values("z_score", ascending=False).copy()
        neg = shifts[shifts["z_score"] < 0].sort_values("z_score", ascending=True).copy()

        def _mk_panel(df_part: pd.DataFrame, arrow: str):
            # Format one direction's rows for the display table.
            if df_part.empty:
                return pd.DataFrame(columns=["Category", "Direction", "Z-Score", "% Change", "Time Window"])
            out = df_part[["Category", "z_score", "pct_change_vs_mean"]].copy()
            out["Direction"] = arrow
            out["Z-Score"] = out["z_score"].round(2)
            out["% Change"] = out["pct_change_vs_mean"].round(1)
            out["Time Window"] = f"{target_period_str}"
            out = out.drop(columns=["z_score", "pct_change_vs_mean"])
            return out

        panel_pos = _mk_panel(pos, "↑")
        panel_neg = _mk_panel(neg, "↓")

        panel = pd.concat([panel_pos, panel_neg], axis=0).reset_index(drop=True)
        panel.insert(0, "Rank", np.arange(1, len(panel) + 1))

        st.dataframe(panel, use_container_width=True, height=380)
        st.caption("Directional ranking: accelerators (↑) first, contractions (↓) last. Threshold: |Z| ≥ 2 (fallback shows top movers if none qualify).")
|
| 1172 |
+
|
| 1173 |
+
# -----------------------------
|
| 1174 |
+
# Category ranking
|
| 1175 |
+
# -----------------------------
|
| 1176 |
+
st.markdown("### Category Ranking")
|
| 1177 |
+
cat_rank = (
|
| 1178 |
+
df_f.groupby(CAT_MAIN)[BILL_ID_COL].nunique()
|
| 1179 |
+
.sort_values(ascending=False)
|
| 1180 |
+
.reset_index(name="Bills")
|
| 1181 |
+
.head(20)
|
| 1182 |
+
)
|
| 1183 |
+
|
| 1184 |
+
cat_hover_dir = []
|
| 1185 |
+
cat_hover_impact = []
|
| 1186 |
+
for cat in cat_rank[CAT_MAIN].astype(str).tolist():
|
| 1187 |
+
sdf = df_f[df_f[CAT_MAIN].astype(str) == str(cat)].copy()
|
| 1188 |
+
direction, _slope = classify_direction(sdf, period_col, period_order, BILL_ID_COL)
|
| 1189 |
+
if safe_col(sdf, IMPACT_SCORE_COL):
|
| 1190 |
+
bmax = sdf[[BILL_ID_COL, IMPACT_SCORE_COL]].dropna().groupby(BILL_ID_COL)[IMPACT_SCORE_COL].max()
|
| 1191 |
+
avg_imp = float(bmax.mean()) if len(bmax) else float("nan")
|
| 1192 |
+
else:
|
| 1193 |
+
avg_imp = float("nan")
|
| 1194 |
+
cat_hover_dir.append(direction)
|
| 1195 |
+
cat_hover_impact.append(None if np.isnan(avg_imp) else round(avg_imp, 2))
|
| 1196 |
+
|
| 1197 |
+
cat_rank2 = cat_rank.copy()
|
| 1198 |
+
cat_rank2["Direction"] = cat_hover_dir
|
| 1199 |
+
cat_rank2["AvgImpact"] = cat_hover_impact
|
| 1200 |
+
|
| 1201 |
+
fig_rank = px.bar(
|
| 1202 |
+
cat_rank2.iloc[::-1],
|
| 1203 |
+
x="Bills",
|
| 1204 |
+
y=CAT_MAIN,
|
| 1205 |
+
orientation="h",
|
| 1206 |
+
labels={"Bills": "Bills", CAT_MAIN: ""},
|
| 1207 |
+
template=PLOTLY_TEMPLATE,
|
| 1208 |
+
custom_data=["Direction", "AvgImpact"],
|
| 1209 |
+
)
|
| 1210 |
+
fig_rank.update_traces(
|
| 1211 |
+
marker_color=C_LAZURITE_BLUE,
|
| 1212 |
+
hovertemplate=(
|
| 1213 |
+
"<b>%{y}</b><br>"
|
| 1214 |
+
"Bills: %{x}<br>"
|
| 1215 |
+
"Direction: %{customdata[0]}<br>"
|
| 1216 |
+
"Avg Political Impact: %{customdata[1]}<extra></extra>"
|
| 1217 |
+
)
|
| 1218 |
+
)
|
| 1219 |
+
fig_rank = tight_layout(fig_rank, height=420)
|
| 1220 |
+
fig_rank.update_yaxes(showgrid=False)
|
| 1221 |
+
st.plotly_chart(fig_rank, use_container_width=True, config={"displayModeBar": False})
|
| 1222 |
+
|
| 1223 |
+
# =====================================================
|
| 1224 |
+
# Row 1: Category Share (with Baseline) + Subcategory Drivers
|
| 1225 |
+
# =====================================================
|
| 1226 |
+
df_cat = df_f[df_f[CAT_MAIN].astype(str) == str(selected_cat)].copy()
|
| 1227 |
+
|
| 1228 |
+
# --- STEP 1-8: Compute full share series on entire df_full for baseline ---
|
| 1229 |
+
share_series_full = compute_monthly_share_series(
|
| 1230 |
+
df_full, selected_cat, CAT_MAIN, BILL_ID_COL, DATE_COL
|
| 1231 |
+
)
|
| 1232 |
+
seasonal_baseline = compute_seasonal_baseline(share_series_full, lookback_years=5)
|
| 1233 |
+
|
| 1234 |
+
total_monthly_full = (
|
| 1235 |
+
df_full.groupby(df_full[DATE_COL].dt.to_period("M"))[BILL_ID_COL]
|
| 1236 |
+
.nunique()
|
| 1237 |
+
.reset_index(name="total_bills")
|
| 1238 |
+
)
|
| 1239 |
+
reg_stats = compute_regression_on_share(share_series_full, total_monthly_full)
|
| 1240 |
+
analysis_df = compute_zscore_and_residuals(share_series_full, seasonal_baseline, reg_stats)
|
| 1241 |
+
|
| 1242 |
+
# Build period order strings for filter alignment (months only for baseline chart)
|
| 1243 |
+
month_period_order = build_full_period_order(start_date, end_date, "month")
|
| 1244 |
+
|
| 1245 |
+
# --- Subcategory section ---
|
| 1246 |
+
df_sub = df_cat.copy()
|
| 1247 |
+
cutoff = None
|
| 1248 |
+
if sub_time != "Overall":
|
| 1249 |
+
days = 30 if sub_time == "Last 30 days" else 60
|
| 1250 |
+
cutoff = (pd.to_datetime(end_date) - pd.Timedelta(days=days)).date()
|
| 1251 |
+
df_sub = df_sub[df_sub[DATE_COL].dt.date >= cutoff]
|
| 1252 |
+
|
| 1253 |
+
emerging_map = {}
|
| 1254 |
+
if cutoff is not None and safe_col(df_cat, CAT_SUB):
|
| 1255 |
+
before = df_cat[df_cat[DATE_COL].dt.date < cutoff]
|
| 1256 |
+
before_set = set(before[CAT_SUB].dropna().astype(str).unique().tolist())
|
| 1257 |
+
window_set = set(df_sub[CAT_SUB].dropna().astype(str).unique().tolist())
|
| 1258 |
+
for s in window_set:
|
| 1259 |
+
emerging_map[s] = (s not in before_set)
|
| 1260 |
+
|
| 1261 |
+
sub_ct = pd.DataFrame()
|
| 1262 |
+
if CAT_SUB in df_sub.columns and not df_sub[CAT_SUB].isna().all():
|
| 1263 |
+
sub_ct = (
|
| 1264 |
+
df_sub.dropna(subset=[CAT_SUB])
|
| 1265 |
+
.groupby(CAT_SUB)[BILL_ID_COL].nunique()
|
| 1266 |
+
.reset_index(name="Bills")
|
| 1267 |
+
.sort_values("Bills", ascending=False)
|
| 1268 |
+
.head(12)
|
| 1269 |
+
)
|
| 1270 |
+
|
| 1271 |
+
hover_dir = []
|
| 1272 |
+
hover_impact = []
|
| 1273 |
+
for sub in sub_ct[CAT_SUB].astype(str).tolist():
|
| 1274 |
+
sdf = df_sub[df_sub[CAT_SUB].astype(str) == str(sub)].copy()
|
| 1275 |
+
direction, _slope = classify_direction(sdf, period_col, period_order, BILL_ID_COL)
|
| 1276 |
+
if cutoff is not None and emerging_map.get(sub, False):
|
| 1277 |
+
direction = "Emerging"
|
| 1278 |
+
if safe_col(sdf, IMPACT_SCORE_COL):
|
| 1279 |
+
bmax = sdf[[BILL_ID_COL, IMPACT_SCORE_COL]].dropna().groupby(BILL_ID_COL)[IMPACT_SCORE_COL].max()
|
| 1280 |
+
avg_imp = float(bmax.mean()) if len(bmax) else float("nan")
|
| 1281 |
+
else:
|
| 1282 |
+
avg_imp = float("nan")
|
| 1283 |
+
hover_dir.append(direction)
|
| 1284 |
+
hover_impact.append(None if np.isnan(avg_imp) else round(avg_imp, 2))
|
| 1285 |
+
|
| 1286 |
+
sub_ct2 = sub_ct.copy()
|
| 1287 |
+
sub_ct2["Direction"] = hover_dir
|
| 1288 |
+
sub_ct2["AvgImpact"] = hover_impact
|
| 1289 |
+
|
| 1290 |
+
r1a, r1b = st.columns(2)
|
| 1291 |
+
|
| 1292 |
+
with r1a:
|
| 1293 |
+
st.markdown("### Category Share Over Time")
|
| 1294 |
+
fig_share = plot_category_share_with_baseline(
|
| 1295 |
+
analysis_df, month_period_order, reg_stats["significant_growth"]
|
| 1296 |
+
)
|
| 1297 |
+
if fig_share.data:
|
| 1298 |
+
st.plotly_chart(fig_share, use_container_width=True, config={"displayModeBar": False})
|
| 1299 |
+
else:
|
| 1300 |
+
st.info("No share data available for this selection.")
|
| 1301 |
+
|
| 1302 |
+
with r1b:
|
| 1303 |
+
st.markdown("### Subcategory Drivers")
|
| 1304 |
+
if sub_ct2.empty:
|
| 1305 |
+
st.info("No subcategory data available for this selection/window.")
|
| 1306 |
+
else:
|
| 1307 |
+
show = sub_ct2.sort_values("Bills", ascending=True)
|
| 1308 |
+
fig = px.bar(
|
| 1309 |
+
show,
|
| 1310 |
+
x="Bills",
|
| 1311 |
+
y=CAT_SUB,
|
| 1312 |
+
orientation="h",
|
| 1313 |
+
template=PLOTLY_TEMPLATE,
|
| 1314 |
+
labels={"Bills": "Bills", CAT_SUB: ""},
|
| 1315 |
+
custom_data=["Direction", "AvgImpact"],
|
| 1316 |
+
)
|
| 1317 |
+
fig.update_traces(
|
| 1318 |
+
marker_color=C_PUMPING_SPICE,
|
| 1319 |
+
hovertemplate=(
|
| 1320 |
+
"<b>%{y}</b><br>"
|
| 1321 |
+
"Bills: %{x}<br>"
|
| 1322 |
+
"Direction: %{customdata[0]}<br>"
|
| 1323 |
+
"Avg Political Impact: %{customdata[1]}<extra></extra>"
|
| 1324 |
+
),
|
| 1325 |
+
)
|
| 1326 |
+
fig = tight_layout(fig, height=420)
|
| 1327 |
+
fig.update_yaxes(showgrid=False)
|
| 1328 |
+
st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
|
| 1329 |
+
|
| 1330 |
+
# -----------------------------
|
| 1331 |
+
# Row 2: Policy Direction + TF-IDF
|
| 1332 |
+
# -----------------------------
|
| 1333 |
+
r2a, r2b = st.columns([1.55, 1.0])
|
| 1334 |
+
|
| 1335 |
+
inc_terms = explode_terms(df_f, INC_COL, stopwords=STOPWORDS, min_len=3)
|
| 1336 |
+
dec_terms = explode_terms(df_f, DEC_COL, stopwords=STOPWORDS, min_len=3)
|
| 1337 |
+
|
| 1338 |
+
if not inc_terms.empty:
|
| 1339 |
+
inc_terms["_period"] = inc_terms[period_col]
|
| 1340 |
+
if not dec_terms.empty:
|
| 1341 |
+
dec_terms["_period"] = dec_terms[period_col]
|
| 1342 |
+
|
| 1343 |
+
inc_ts = inc_terms.groupby([period_col])["mentions"].sum().reindex(period_order, fill_value=0).reset_index()
|
| 1344 |
+
inc_ts.columns = [period_col, "inc"]
|
| 1345 |
+
dec_ts = dec_terms.groupby([period_col])["mentions"].sum().reindex(period_order, fill_value=0).reset_index()
|
| 1346 |
+
dec_ts.columns = [period_col, "dec"]
|
| 1347 |
+
|
| 1348 |
+
net_ts = pd.merge(inc_ts, dec_ts, on=period_col, how="left").fillna(0)
|
| 1349 |
+
net_ts["net"] = net_ts["inc"] - net_ts["dec"]
|
| 1350 |
+
|
| 1351 |
+
df_inc_rows = df_f[df_f[INC_COL].notna()].copy() if INC_COL in df_f.columns else pd.DataFrame(columns=df_f.columns)
|
| 1352 |
+
df_dec_rows = df_f[df_f[DEC_COL].notna()].copy() if DEC_COL in df_f.columns else pd.DataFrame(columns=df_f.columns)
|
| 1353 |
+
df_inc_rows["_period"] = df_inc_rows[period_col] if not df_inc_rows.empty else []
|
| 1354 |
+
df_dec_rows["_period"] = df_dec_rows[period_col] if not df_dec_rows.empty else []
|
| 1355 |
+
|
| 1356 |
+
|
| 1357 |
+
def top_keywords_for_period(term_df: pd.DataFrame, period_value, k=6) -> pd.DataFrame:
    """Return the top-k most-mentioned keywords within a single period.

    Expects ``term_df`` to carry ``_period`` and ``term`` columns. Yields a
    DataFrame with ``Keyword`` / ``Mentions`` columns, or an empty DataFrame
    when the input is missing, empty, or has no rows for the period.
    """
    if term_df is None or term_df.empty:
        return pd.DataFrame()
    in_period = term_df.loc[term_df["_period"] == period_value]
    if in_period.empty:
        return pd.DataFrame()
    ranked = in_period["term"].value_counts().head(k).reset_index()
    ranked.columns = ["Keyword", "Mentions"]
    return ranked
|
| 1366 |
+
|
| 1367 |
+
|
| 1368 |
+
def top_beneficiaries_for_period(df_rows: pd.DataFrame, period_value, benef_col: str, k=6) -> pd.DataFrame:
    """Count the top-k beneficiaries mentioned in one period's bills.

    Beneficiary cells may hold list-like values: they are split via
    ``_split_listlike``, exploded to one row per beneficiary, and
    whitespace-trimmed before counting. Returns a DataFrame with
    ``Beneficiary`` / ``Mentions`` columns, or an empty DataFrame whenever
    no usable data exists for the period.
    """
    if benef_col not in df_rows.columns or df_rows.empty:
        return pd.DataFrame()
    rows = df_rows.loc[df_rows["_period"] == period_value].copy()
    if rows.empty or rows[benef_col].dropna().empty:
        return pd.DataFrame()
    # Split list-like cells into one beneficiary per row, then clean.
    rows["_b"] = rows[benef_col].apply(_split_listlike)
    rows = rows.explode("_b").dropna(subset=["_b"])
    rows["_b"] = rows["_b"].astype(str).str.strip()
    rows = rows.loc[rows["_b"].str.len() > 0]
    if rows.empty:
        return pd.DataFrame()
    top = rows["_b"].value_counts().head(k).reset_index()
    top.columns = ["Beneficiary", "Mentions"]
    return top
|
| 1383 |
+
|
| 1384 |
+
|
| 1385 |
+
inc_kw_short, dec_kw_short, inc_b_short, dec_b_short = [], [], [], []
|
| 1386 |
+
for p in net_ts[period_col].tolist():
|
| 1387 |
+
inc_kw = top_keywords_for_period(inc_terms, p, k=6)
|
| 1388 |
+
dec_kw = top_keywords_for_period(dec_terms, p, k=6)
|
| 1389 |
+
inc_b = top_beneficiaries_for_period(df_inc_rows, p, BENEF_COL, k=6) if safe_col(df_f, BENEF_COL) else pd.DataFrame()
|
| 1390 |
+
dec_b = top_beneficiaries_for_period(df_dec_rows, p, BENEF_COL, k=6) if safe_col(df_f, BENEF_COL) else pd.DataFrame()
|
| 1391 |
+
inc_kw_short.append(short_list(inc_kw["Keyword"].tolist() if not inc_kw.empty else [], 3))
|
| 1392 |
+
dec_kw_short.append(short_list(dec_kw["Keyword"].tolist() if not dec_kw.empty else [], 3))
|
| 1393 |
+
inc_b_short.append(short_list(inc_b["Beneficiary"].tolist() if not inc_b.empty else [], 2))
|
| 1394 |
+
dec_b_short.append(short_list(dec_b["Beneficiary"].tolist() if not dec_b.empty else [], 2))
|
| 1395 |
+
|
| 1396 |
+
net_ts["inc_kw_short"] = inc_kw_short
|
| 1397 |
+
net_ts["dec_kw_short"] = dec_kw_short
|
| 1398 |
+
net_ts["inc_b_short"] = inc_b_short
|
| 1399 |
+
net_ts["dec_b_short"] = dec_b_short
|
| 1400 |
+
|
| 1401 |
+
with r2a:
|
| 1402 |
+
st.markdown("### Policy Direction Over Time")
|
| 1403 |
+
if net_ts.empty or (net_ts["inc"].sum() == 0 and net_ts["dec"].sum() == 0):
|
| 1404 |
+
st.info("No increasing/decreasing aspects available under current filters.")
|
| 1405 |
+
else:
|
| 1406 |
+
custom = np.stack(
|
| 1407 |
+
[
|
| 1408 |
+
net_ts["inc_kw_short"].astype(str),
|
| 1409 |
+
net_ts["dec_kw_short"].astype(str),
|
| 1410 |
+
net_ts["inc_b_short"].astype(str),
|
| 1411 |
+
net_ts["dec_b_short"].astype(str),
|
| 1412 |
+
],
|
| 1413 |
+
axis=1,
|
| 1414 |
+
)
|
| 1415 |
+
fig = go.Figure()
|
| 1416 |
+
fig.add_trace(go.Bar(
|
| 1417 |
+
x=net_ts[period_col],
|
| 1418 |
+
y=net_ts["inc"],
|
| 1419 |
+
name="Increasing",
|
| 1420 |
+
marker_color=C_POSITIVE,
|
| 1421 |
+
customdata=custom,
|
| 1422 |
+
hovertemplate="<b>%{x}</b><br>Increasing: %{y}<br>Keywords: %{customdata[0]}<br>Beneficiaries: %{customdata[2]}<extra></extra>",
|
| 1423 |
+
))
|
| 1424 |
+
fig.add_trace(go.Bar(
|
| 1425 |
+
x=net_ts[period_col],
|
| 1426 |
+
y=-net_ts["dec"],
|
| 1427 |
+
name="Decreasing",
|
| 1428 |
+
marker_color=C_NEGATIVE,
|
| 1429 |
+
customdata=custom,
|
| 1430 |
+
hovertemplate="<b>%{x}</b><br>Decreasing: %{y:.0f}<br>Keywords: %{customdata[1]}<br>Beneficiaries: %{customdata[3]}<extra></extra>",
|
| 1431 |
+
))
|
| 1432 |
+
fig.add_trace(go.Scatter(
|
| 1433 |
+
x=net_ts[period_col],
|
| 1434 |
+
y=net_ts["net"],
|
| 1435 |
+
mode="lines+markers",
|
| 1436 |
+
name="Net",
|
| 1437 |
+
line=dict(color=C_TRAPPED_DARKNESS, width=2),
|
| 1438 |
+
hovertemplate="<b>%{x}</b><br>Net: %{y}<extra></extra>",
|
| 1439 |
+
))
|
| 1440 |
+
fig.update_layout(
|
| 1441 |
+
template=PLOTLY_TEMPLATE,
|
| 1442 |
+
barmode="relative",
|
| 1443 |
+
height=420,
|
| 1444 |
+
margin=dict(l=8, r=8, t=8, b=8),
|
| 1445 |
+
hovermode="x unified",
|
| 1446 |
+
legend=dict(orientation="h", yanchor="bottom", y=-0.22, xanchor="center", x=0.5),
|
| 1447 |
+
yaxis_title="Mentions",
|
| 1448 |
+
xaxis_title="",
|
| 1449 |
+
plot_bgcolor="white",
|
| 1450 |
+
paper_bgcolor="white",
|
| 1451 |
+
)
|
| 1452 |
+
fig.update_xaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0")
|
| 1453 |
+
fig.update_yaxes(showgrid=True, gridcolor="#EDF2F4", showline=True, linecolor="#D6DEE0",
|
| 1454 |
+
zeroline=True, zerolinecolor="#C9D3D6")
|
| 1455 |
+
st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
|
| 1456 |
+
|
| 1457 |
+
with r2b:
|
| 1458 |
+
st.markdown("### Top Keywords")
|
| 1459 |
+
tfidf_mode = st.selectbox(
|
| 1460 |
+
"TF-IDF Source",
|
| 1461 |
+
["Motivation", "Intent", "Legislative Strategy"],
|
| 1462 |
+
index=0
|
| 1463 |
+
)
|
| 1464 |
+
tfidf_text_col = KW_SOURCES[tfidf_mode]
|
| 1465 |
+
if not safe_col(df_f, tfidf_text_col):
|
| 1466 |
+
st.info(f"Column `{tfidf_text_col}` not available.")
|
| 1467 |
+
else:
|
| 1468 |
+
df_rest = df_f[df_f[CAT_MAIN].astype(str) != str(selected_cat)].copy()
|
| 1469 |
+
tf_phrases = build_contrastive_tfidf(df_cat, df_rest, BILL_ID_COL, tfidf_text_col, top_k=15)
|
| 1470 |
+
if not tf_phrases:
|
| 1471 |
+
st.info("TF-IDF returned no meaningful category-distinct phrases for this slice.")
|
| 1472 |
+
else:
|
| 1473 |
+
kw_tbl = pd.DataFrame(tf_phrases, columns=["Keyword", "Distinctiveness Score"])
|
| 1474 |
+
kw_tbl.index = np.arange(1, len(kw_tbl) + 1)
|
| 1475 |
+
st.dataframe(kw_tbl, use_container_width=True, height=300)
|
| 1476 |
+
|
| 1477 |
+
# =====================================================
|
| 1478 |
+
# Row 3: Subcategory Momentum (STEP 9) + Heatmap (STEP 10)
|
| 1479 |
+
# =====================================================
|
| 1480 |
+
st.markdown("")
|
| 1481 |
+
r3a, r3b = st.columns(2)
|
| 1482 |
+
|
| 1483 |
+
with r3a:
|
| 1484 |
+
st.markdown("### Subcategory Momentum")
|
| 1485 |
+
|
| 1486 |
+
# df_full_cat: full historical data for selected category (for 5-yr slope)
|
| 1487 |
+
df_full_cat = df_full[df_full[CAT_MAIN].astype(str) == str(selected_cat)].copy()
|
| 1488 |
+
|
| 1489 |
+
# STEP 9: pass both filtered df_cat (short-term pct change) and df_full_cat (5-yr slope)
|
| 1490 |
+
sub_momentum = compute_subcategory_momentum(
|
| 1491 |
+
df_cat, df_full_cat, CAT_SUB, BILL_ID_COL, period_col, period_order
|
| 1492 |
+
)
|
| 1493 |
+
|
| 1494 |
+
if sub_momentum.empty:
|
| 1495 |
+
st.info("Not enough data to compute momentum.")
|
| 1496 |
+
else:
|
| 1497 |
+
# SlopeScaled = 5-year regression slope * 100 (momentum strength)
|
| 1498 |
+
sub_momentum["SlopeScaled"] = sub_momentum["Slope"] * 100.0
|
| 1499 |
+
sub_momentum["SlopeScaled"] = pd.to_numeric(sub_momentum["SlopeScaled"], errors="coerce").fillna(0.0)
|
| 1500 |
+
|
| 1501 |
+
eps = 1e-4
|
| 1502 |
+
sub_momentum["Direction"] = np.where(
|
| 1503 |
+
sub_momentum["SlopeScaled"] > eps, "Rising",
|
| 1504 |
+
np.where(sub_momentum["SlopeScaled"] < -eps, "Falling", "Stable")
|
| 1505 |
+
)
|
| 1506 |
+
|
| 1507 |
+
# Show top movers by absolute slope + top stable
|
| 1508 |
+
movers = sub_momentum[sub_momentum["Direction"] != "Stable"].copy()
|
| 1509 |
+
stable = sub_momentum[sub_momentum["Direction"] == "Stable"].copy()
|
| 1510 |
+
movers = movers.reindex(movers["SlopeScaled"].abs().sort_values(ascending=False).index).head(10)
|
| 1511 |
+
stable = stable.head(5)
|
| 1512 |
+
show_df = pd.concat([movers, stable], axis=0).drop_duplicates("Subcategory")
|
| 1513 |
+
if show_df.empty:
|
| 1514 |
+
show_df = sub_momentum.head(12).copy()
|
| 1515 |
+
|
| 1516 |
+
# Ensure bars are always visible (min visible length)
|
| 1517 |
+
min_visible = 0.20
|
| 1518 |
+
show_df = show_df.copy()
|
| 1519 |
+
show_df["DisplaySlope"] = show_df["SlopeScaled"]
|
| 1520 |
+
near_zero = show_df["DisplaySlope"].abs() < min_visible
|
| 1521 |
+
show_df.loc[near_zero & (show_df["Direction"] == "Rising"), "DisplaySlope"] = min_visible
|
| 1522 |
+
show_df.loc[near_zero & (show_df["Direction"] == "Falling"), "DisplaySlope"] = -min_visible
|
| 1523 |
+
show_df.loc[near_zero & (show_df["Direction"] == "Stable"), "DisplaySlope"] = min_visible * 0.6
|
| 1524 |
+
show_df = show_df.sort_values("DisplaySlope", ascending=True)
|
| 1525 |
+
|
| 1526 |
+
show_df["AvgPctChange"] = pd.to_numeric(show_df["AvgPctChange"], errors="coerce").fillna(0.0)
|
| 1527 |
+
show_df["SlopeScaled"] = pd.to_numeric(show_df["SlopeScaled"], errors="coerce").fillna(0.0)
|
| 1528 |
+
|
| 1529 |
+
|
| 1530 |
+
color_map = {"Rising": C_POSITIVE, "Falling": C_NEGATIVE, "Stable": C_STABLE}
|
| 1531 |
+
fig = px.bar(
|
| 1532 |
+
show_df,
|
| 1533 |
+
x="DisplaySlope",
|
| 1534 |
+
y="Subcategory",
|
| 1535 |
+
color="Direction",
|
| 1536 |
+
orientation="h",
|
| 1537 |
+
color_discrete_map=color_map,
|
| 1538 |
+
template=PLOTLY_TEMPLATE,
|
| 1539 |
+
custom_data=["SlopeScaled", "Direction", "AvgPctChange"],
|
| 1540 |
+
labels={"DisplaySlope": "5-Yr Momentum Slope (x100)", "Subcategory": ""},
|
| 1541 |
+
)
|
| 1542 |
+
fig.update_traces(
|
| 1543 |
+
hovertemplate=(
|
| 1544 |
+
"<b>%{y}</b><br>"
|
| 1545 |
+
"Direction: %{customdata[1]}<br>"
|
| 1546 |
+
"5-Yr Regression Slope (x100): %{customdata[0]:.3f}<br>"
|
| 1547 |
+
"Short-Term Avg % Change: %{customdata[2]:.1f}%<extra></extra>"
|
| 1548 |
+
)
|
| 1549 |
+
)
|
| 1550 |
+
max_abs = float(np.nanmax(np.abs(show_df["DisplaySlope"].to_numpy()))) if len(show_df) else 1.0
|
| 1551 |
+
max_abs = max(max_abs, 1.0)
|
| 1552 |
+
fig.update_layout(
|
| 1553 |
+
template=PLOTLY_TEMPLATE,
|
| 1554 |
+
height=520,
|
| 1555 |
+
margin=dict(l=8, r=8, t=8, b=8),
|
| 1556 |
+
xaxis_title="5-Yr Momentum Slope (x100)",
|
| 1557 |
+
yaxis_title="",
|
| 1558 |
+
plot_bgcolor="white",
|
| 1559 |
+
paper_bgcolor="white",
|
| 1560 |
+
legend=dict(orientation="h", yanchor="bottom", y=-0.22, xanchor="center", x=0.5),
|
| 1561 |
+
barmode="relative",
|
| 1562 |
+
)
|
| 1563 |
+
fig.update_xaxes(
|
| 1564 |
+
range=[-max_abs * 1.15, max_abs * 1.15],
|
| 1565 |
+
showgrid=True, gridcolor="#EDF2F4",
|
| 1566 |
+
showline=True, linecolor="#D6DEE0",
|
| 1567 |
+
zeroline=True, zerolinecolor="#C9D3D6",
|
| 1568 |
+
)
|
| 1569 |
+
fig.update_yaxes(showgrid=False)
|
| 1570 |
+
st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
|
| 1571 |
+
|
| 1572 |
+
with r3b:
|
| 1573 |
+
# -------------------------------------------------------
|
| 1574 |
+
# STEP 10: Heatmap — Intended Beneficiaries × Increasing Aspects
|
| 1575 |
+
# Filter: top 10 categories with bill count > K threshold
|
| 1576 |
+
# Remove empty/very low-density cells
|
| 1577 |
+
# Conditional formatting: annotate each cell with actual bill count
|
| 1578 |
+
# -------------------------------------------------------
|
| 1579 |
+
st.markdown("### Beneficiaries × Increasing Aspects")
|
| 1580 |
+
|
| 1581 |
+
hc1, hc2 = st.columns(2)
|
| 1582 |
+
with hc1:
|
| 1583 |
+
cat_bill_thresh = st.slider(
|
| 1584 |
+
"Min Bills per Category", min_value=5, max_value=100, value=20, step=5,
|
| 1585 |
+
help="Only include categories with at least this many unique bills"
|
| 1586 |
+
)
|
| 1587 |
+
with hc2:
|
| 1588 |
+
min_cell_thresh = st.slider(
|
| 1589 |
+
"Min Bills per Cell", min_value=1, max_value=15, value=2, step=1,
|
| 1590 |
+
help="Remove cells with fewer than this many bills (sparse cell filter)"
|
| 1591 |
+
)
|
| 1592 |
+
topN_benef = st.slider("Top N Beneficiaries", min_value=5, max_value=25, value=10, step=1)
|
| 1593 |
+
|
| 1594 |
+
if not safe_col(df_f, BENEF_COL) or not safe_col(df_f, INC_COL):
|
| 1595 |
+
st.caption("Beneficiary or Increasing Aspects data not available.")
|
| 1596 |
+
else:
|
| 1597 |
+
# STEP 10a: Top 10 categories, filtered by bill count > K threshold
|
| 1598 |
+
cat_counts = df_f.groupby(CAT_MAIN)[BILL_ID_COL].nunique()
|
| 1599 |
+
eligible_cats = cat_counts[cat_counts >= cat_bill_thresh].sort_values(ascending=False)
|
| 1600 |
+
|
| 1601 |
+
if eligible_cats.empty:
|
| 1602 |
+
st.caption(f"No categories have ≥ {cat_bill_thresh} bills. Try lowering the threshold.")
|
| 1603 |
+
else:
|
| 1604 |
+
top10_cats = eligible_cats.head(10).index.tolist()
|
| 1605 |
+
df_heat = df_f[df_f[CAT_MAIN].isin(top10_cats)].copy()
|
| 1606 |
+
|
| 1607 |
+
# Explode beneficiaries
|
| 1608 |
+
df_heat["_benef"] = df_heat[BENEF_COL].apply(_split_listlike)
|
| 1609 |
+
df_heat = df_heat.explode("_benef").dropna(subset=["_benef"])
|
| 1610 |
+
df_heat["_benef"] = df_heat["_benef"].astype(str).str.strip()
|
| 1611 |
+
df_heat = df_heat[df_heat["_benef"].str.len() > 0]
|
| 1612 |
+
|
| 1613 |
+
# Explode increasing aspects and clean
|
| 1614 |
+
df_heat["_inc"] = df_heat[INC_COL].apply(_split_listlike)
|
| 1615 |
+
df_heat = df_heat.explode("_inc").dropna(subset=["_inc"])
|
| 1616 |
+
df_heat["_inc"] = df_heat["_inc"].astype(str).str.strip().str.lower()
|
| 1617 |
+
df_heat["_inc"] = df_heat["_inc"].str.replace(r"[^a-z0-9\s\-]", "", regex=True).str.strip()
|
| 1618 |
+
df_heat = df_heat[df_heat["_inc"].str.len() >= 3]
|
| 1619 |
+
df_heat = df_heat[~df_heat["_inc"].isin(STOPWORDS)]
|
| 1620 |
+
|
| 1621 |
+
if df_heat.empty:
|
| 1622 |
+
st.caption("No usable beneficiary × increasing aspects data.")
|
| 1623 |
+
else:
|
| 1624 |
+
# Keep top N beneficiaries and top 15 increasing aspect terms
|
| 1625 |
+
top_benef = df_heat["_benef"].value_counts().head(topN_benef).index.tolist()
|
| 1626 |
+
top_inc = df_heat["_inc"].value_counts().head(15).index.tolist()
|
| 1627 |
+
|
| 1628 |
+
df_heat = df_heat[
|
| 1629 |
+
df_heat["_benef"].isin(top_benef) &
|
| 1630 |
+
df_heat["_inc"].isin(top_inc)
|
| 1631 |
+
].copy()
|
| 1632 |
+
|
| 1633 |
+
benef_heat = (
|
| 1634 |
+
df_heat.groupby(["_benef", "_inc"])[BILL_ID_COL].nunique()
|
| 1635 |
+
.reset_index(name="bills")
|
| 1636 |
+
)
|
| 1637 |
+
|
| 1638 |
+
# STEP 10b: Remove empty / very low-density cells (< min_cell_thresh)
|
| 1639 |
+
benef_heat = benef_heat[benef_heat["bills"] >= min_cell_thresh]
|
| 1640 |
+
|
| 1641 |
+
if benef_heat.empty:
|
| 1642 |
+
st.caption(f"No cells with ≥ {min_cell_thresh} bills. Try lowering the threshold.")
|
| 1643 |
+
else:
|
| 1644 |
+
pivot = benef_heat.pivot(index="_benef", columns="_inc", values="bills").fillna(0)
|
| 1645 |
+
|
| 1646 |
+
# Sort rows and columns by total density (highest at top/left)
|
| 1647 |
+
pivot = pivot.loc[
|
| 1648 |
+
pivot.sum(axis=1).sort_values(ascending=False).index,
|
| 1649 |
+
pivot.sum(axis=0).sort_values(ascending=False).index
|
| 1650 |
+
]
|
| 1651 |
+
|
| 1652 |
+
z_actual = pivot.values.astype(float)
|
| 1653 |
+
# STEP 10c: log-scale for color (handles outliers gracefully)
|
| 1654 |
+
z_scaled = np.log1p(z_actual)
|
| 1655 |
+
|
| 1656 |
+
# STEP 10d: Conditional formatting — annotate each cell with actual count
|
| 1657 |
+
# Only show text for cells above the sparse threshold (already filtered)
|
| 1658 |
+
annotations = []
|
| 1659 |
+
for i, row_label in enumerate(pivot.index):
|
| 1660 |
+
for j, col_label in enumerate(pivot.columns):
|
| 1661 |
+
val = int(z_actual[i, j])
|
| 1662 |
+
if val > 0:
|
| 1663 |
+
# White text for dark cells, dark for light cells
|
| 1664 |
+
max_val = z_scaled.max() if z_scaled.max() > 0 else 1
|
| 1665 |
+
brightness = z_scaled[i, j] / max_val
|
| 1666 |
+
font_color = "white" if brightness > 0.55 else C_TRAPPED_DARKNESS
|
| 1667 |
+
annotations.append(
|
| 1668 |
+
dict(
|
| 1669 |
+
x=col_label,
|
| 1670 |
+
y=row_label,
|
| 1671 |
+
text=str(val),
|
| 1672 |
+
showarrow=False,
|
| 1673 |
+
font=dict(color=font_color, size=9),
|
| 1674 |
+
xref="x", yref="y",
|
| 1675 |
+
)
|
| 1676 |
+
)
|
| 1677 |
+
|
| 1678 |
+
fig = go.Figure(data=go.Heatmap(
|
| 1679 |
+
z=z_scaled,
|
| 1680 |
+
x=pivot.columns.astype(str).tolist(),
|
| 1681 |
+
y=pivot.index.astype(str).tolist(),
|
| 1682 |
+
colorscale=[
|
| 1683 |
+
[0.0, "#F2F5F6"],
|
| 1684 |
+
[0.25, "#C8D9DE"],
|
| 1685 |
+
[0.5, "#7FAAB7"],
|
| 1686 |
+
[0.75, "#3D7285"],
|
| 1687 |
+
[1.0, C_LAZURITE_BLUE],
|
| 1688 |
+
],
|
| 1689 |
+
colorbar=dict(
|
| 1690 |
+
title="log(1+bills)",
|
| 1691 |
+
tickfont=dict(size=10),
|
| 1692 |
+
thickness=12,
|
| 1693 |
+
len=0.8,
|
| 1694 |
+
),
|
| 1695 |
+
customdata=z_actual,
|
| 1696 |
+
hovertemplate=(
|
| 1697 |
+
"Beneficiary: %{y}<br>"
|
| 1698 |
+
"Aspect: %{x}<br>"
|
| 1699 |
+
"Unique Bills: %{customdata:.0f}<extra></extra>"
|
| 1700 |
+
),
|
| 1701 |
+
xgap=1,
|
| 1702 |
+
ygap=1,
|
| 1703 |
+
))
|
| 1704 |
+
fig.update_layout(
|
| 1705 |
+
template=PLOTLY_TEMPLATE,
|
| 1706 |
+
height=max(520, len(pivot.index) * 30 + 120),
|
| 1707 |
+
margin=dict(l=8, r=8, t=8, b=80),
|
| 1708 |
+
xaxis_title="Increasing Aspect",
|
| 1709 |
+
yaxis_title="",
|
| 1710 |
+
plot_bgcolor="white",
|
| 1711 |
+
paper_bgcolor="white",
|
| 1712 |
+
xaxis=dict(tickangle=-40, tickfont=dict(size=10)),
|
| 1713 |
+
yaxis=dict(tickfont=dict(size=10)),
|
| 1714 |
+
annotations=annotations,
|
| 1715 |
+
)
|
| 1716 |
+
st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False})
|
dockerignore
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
**/.git
|
| 2 |
+
**/__pycache__
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pkl
|
| 5 |
+
*.joblib
|
| 6 |
+
*.pt
|
| 7 |
+
*.bin
|
| 8 |
+
*.zip
|
| 9 |
+
*.tar
|
| 10 |
+
*.gz
|
| 11 |
+
notebooks/
|
| 12 |
+
outputs/
|
| 13 |
+
data/
|
features_standardized_11_renamed.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:edc651cba51bc217650a48a8c4d66d9e329f19711779ad69e851816d057852c2
|
| 3 |
+
size 538177112
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
streamlit==1.37.1
|
| 3 |
+
pandas==2.2.2
|
| 4 |
+
numpy==1.26.4
|
| 5 |
+
pyarrow==17.0.0
|
| 6 |
+
plotly==5.23.0
|
| 7 |
+
scikit-learn==1.4.2
|
| 8 |
+
|
utils.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import ast
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
DATE_COL = "status_date"
|
| 7 |
+
|
| 8 |
+
# Keep only what we need for the dashboard (cuts memory a LOT)
|
| 9 |
+
NEEDED_COLS = [
|
| 10 |
+
"bill_id",
|
| 11 |
+
"session",
|
| 12 |
+
"chamber",
|
| 13 |
+
"bill_number",
|
| 14 |
+
"status_date",
|
| 15 |
+
|
| 16 |
+
"policy_domain_standardized",
|
| 17 |
+
"category_main_label",
|
| 18 |
+
"category_sub_label",
|
| 19 |
+
|
| 20 |
+
"intent_standardized",
|
| 21 |
+
"policy_direction_classifications",
|
| 22 |
+
|
| 23 |
+
"category_main_keywords",
|
| 24 |
+
"category_sub_keywords",
|
| 25 |
+
"category_main_llama_summary_keywords",
|
| 26 |
+
"category_sub_llama_summary_keywords",
|
| 27 |
+
|
| 28 |
+
"legislative_goal_standardized",
|
| 29 |
+
"impact_rating_standardized",
|
| 30 |
+
"impact_rating_score",
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
KEYWORD_COLS = [
|
| 34 |
+
"category_main_keywords",
|
| 35 |
+
"category_sub_keywords",
|
| 36 |
+
"category_main_llama_summary_keywords",
|
| 37 |
+
"category_sub_llama_summary_keywords",
|
| 38 |
+
]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _safe_listify(x):
|
| 42 |
+
"""Turn list-like cells or strings into list[str]."""
|
| 43 |
+
if x is None:
|
| 44 |
+
return []
|
| 45 |
+
if isinstance(x, float) and np.isnan(x):
|
| 46 |
+
return []
|
| 47 |
+
if isinstance(x, list):
|
| 48 |
+
return [str(i).strip() for i in x if str(i).strip()]
|
| 49 |
+
|
| 50 |
+
s = str(x).strip()
|
| 51 |
+
if not s or s.lower() in {"nan", "none", "null"}:
|
| 52 |
+
return []
|
| 53 |
+
|
| 54 |
+
if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
|
| 55 |
+
try:
|
| 56 |
+
parsed = ast.literal_eval(s)
|
| 57 |
+
if isinstance(parsed, (list, tuple, set)):
|
| 58 |
+
return [str(i).strip() for i in parsed if str(i).strip()]
|
| 59 |
+
except Exception:
|
| 60 |
+
pass
|
| 61 |
+
|
| 62 |
+
parts = re.split(r"[,\|;]\s*", s)
|
| 63 |
+
return [p.strip() for p in parts if p.strip()]
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def load_dataset(path: str) -> pd.DataFrame:
    """Load a .parquet or .csv export and prepare it for the dashboard.

    Keeps only NEEDED_COLS, drops rows with an unparseable status_date, and
    derives `year`, `month` (month start), and `week` (week start) columns.

    Raises:
        ValueError: if the file extension is unsupported or DATE_COL is absent.
    """
    lower = path.lower()
    if lower.endswith(".parquet"):
        # Column-pruned read first.  The previous implementation called
        # pd.read_parquet(path).columns, which loaded the ENTIRE file into
        # memory just to list its columns and then read it a second time —
        # defeating the memory savings NEEDED_COLS exists for.
        try:
            df = pd.read_parquet(path, columns=NEEDED_COLS)
        except Exception:
            # Some needed column is missing: read everything once, then prune.
            df = pd.read_parquet(path)
            df = df[[c for c in NEEDED_COLS if c in df.columns]]
    elif lower.endswith(".csv"):
        # For csv we can't cheaply list columns; try usecols and fall back.
        try:
            df = pd.read_csv(path, usecols=NEEDED_COLS)
        except Exception:
            df = pd.read_csv(path)
            df = df[[c for c in NEEDED_COLS if c in df.columns]]
    else:
        raise ValueError("Supported formats: .parquet or .csv")

    if DATE_COL not in df.columns:
        raise ValueError(f"Expected a date column named '{DATE_COL}'")

    # Coerce bad dates to NaT, then drop those rows entirely.
    df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
    df = df[df[DATE_COL].notna()].copy()

    # Time-grain columns used throughout the dashboard.
    df["year"] = df[DATE_COL].dt.year
    df["month"] = df[DATE_COL].dt.to_period("M").dt.to_timestamp()
    df["week"] = df[DATE_COL].dt.to_period("W").dt.start_time

    return df
|
| 95 |
+
def apply_filters(
    df: pd.DataFrame,
    date_min=None,
    date_max=None,
    sessions=None,
    chambers=None,
    policy_domains=None,
    category_main=None,
    category_sub=None,
    intents=None,
    policy_dirs=None,
):
    """Return a copy of *df* narrowed by date range and categorical selections.

    Each selection is a list of allowed values for its column.  A selection
    that is None/empty, or that contains the sentinel "All", applies no
    filter for that column.
    """
    result = df.copy()

    if date_min is not None:
        result = result[result["status_date"] >= pd.to_datetime(date_min)]
    if date_max is not None:
        result = result[result["status_date"] <= pd.to_datetime(date_max)]

    # (column, chosen values) pairs; columns are only touched when a real
    # selection is active, so absent columns never raise.
    selections = [
        ("session", sessions),
        ("chamber", chambers),
        ("policy_domain_standardized", policy_domains),
        ("category_main_label", category_main),
        ("category_sub_label", category_sub),
        ("intent_standardized", intents),
        ("policy_direction_classifications", policy_dirs),
    ]
    for col, chosen in selections:
        if chosen and "All" not in chosen:
            result = result[result[col].isin(chosen)]

    return result
|
| 130 |
+
def explode_keywords(df: pd.DataFrame, keyword_col: str) -> pd.DataFrame:
    """Explode *keyword_col* into a long table: one row per (bill, keyword).

    Carries along the dashboard's dimension columns (when present), parses
    each cell with _safe_listify, and adds a cleaned `keyword` column plus a
    normalized `keyword_norm` column (lowercase, collapsed whitespace,
    alphanumerics/-/_// only, length >= 3).
    """
    wanted = [
        "bill_id",
        "status_date",
        "month",
        "week",
        "session",
        "chamber",
        "policy_domain_standardized",
        "category_main_label",
        "category_sub_label",
        "intent_standardized",
        "policy_direction_classifications",
        keyword_col,
    ]
    present = [c for c in wanted if c in df.columns]

    long_df = df[present].copy()
    long_df["keyword_list"] = long_df[keyword_col].apply(_safe_listify)
    long_df = long_df.explode("keyword_list", ignore_index=True).rename(
        columns={"keyword_list": "keyword"}
    )

    # Drop empty / placeholder keywords left over from the explode.
    cleaned = long_df["keyword"].astype(str).str.strip()
    long_df["keyword"] = cleaned
    long_df = long_df[
        cleaned.notna() & (cleaned != "") & (cleaned.str.lower() != "nan")
    ]

    # Normalized form used for grouping/counting.
    normalized = (
        long_df["keyword"]
        .str.lower()
        .str.replace(r"\s+", " ", regex=True)
        .str.replace(r"[^a-z0-9 \-_/]", "", regex=True)
        .str.strip()
    )
    long_df["keyword_norm"] = normalized

    # Very short normalized tokens are noise.
    return long_df[normalized.str.len() >= 3]
+
|
| 167 |
+
def keyword_trends(df_long: pd.DataFrame, time_grain="month", top_n=15):
    """Rank the most frequent keywords and build their per-period time series.

    Args:
        df_long: exploded keyword table with `keyword_norm`, `month`, `week`.
        time_grain: "month" for monthly buckets; anything else means weekly.
        top_n: number of top keywords to keep.

    Returns:
        (top, ts): overall counts for the top keywords, and their mention
        counts per time bucket (sorted by bucket, then mentions descending).
    """
    grain_col = "month" if time_grain == "month" else "week"

    # Overall frequency ranking.
    top = (
        df_long.groupby("keyword_norm")
        .size()
        .reset_index(name="count")
        .sort_values("count", ascending=False)
        .head(top_n)
    )

    # Restrict the time series to the top keywords only.
    keep = df_long["keyword_norm"].isin(set(top["keyword_norm"]))
    ts = (
        df_long[keep]
        .groupby([grain_col, "keyword_norm"])
        .size()
        .reset_index(name="mentions")
        .sort_values([grain_col, "mentions"], ascending=[True, False])
    )

    return top, ts