Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Upload 30 files
Browse files- .gitattributes +1 -0
- data/data_updating_scripts/PROMPTS/__pycache__/bill_summary_prompt.cpython-313.pyc +0 -0
- data/data_updating_scripts/PROMPTS/bill_summary_prompt.py +29 -0
- data/data_updating_scripts/PROMPTS/suggested_questions_prompt.md +25 -0
- data/data_updating_scripts/__pycache__/config.cpython-313.pyc +0 -0
- data/data_updating_scripts/build_bills_vectorstore.py +46 -0
- data/data_updating_scripts/build_bills_vectorstore_pinecone_delta.py +43 -0
- data/data_updating_scripts/config.py +43 -0
- data/data_updating_scripts/eu-ai-act.pdf +3 -0
- data/data_updating_scripts/eu_vectorstore.py +269 -0
- data/data_updating_scripts/fix_pdf_bills.py +282 -0
- data/data_updating_scripts/generate_reports.py +274 -0
- data/data_updating_scripts/generate_suggested_questions.py +269 -0
- data/data_updating_scripts/generate_summaries.py +204 -0
- data/data_updating_scripts/get_data.py +251 -0
- data/data_updating_scripts/get_data_ORIGINAL.py +251 -0
- data/data_updating_scripts/known_bills_status.py +199 -0
- data/data_updating_scripts/logs/eu_vectorstore.log +128 -0
- data/data_updating_scripts/logs/fetch_ai_bills.log +0 -0
- data/data_updating_scripts/logs/fix_pdf_bills.log +0 -0
- data/data_updating_scripts/logs/generate_reports.log +0 -0
- data/data_updating_scripts/logs/generate_suggested_questions.log +0 -0
- data/data_updating_scripts/logs/generate_summaries.log +0 -0
- data/data_updating_scripts/logs/mark_no_text_bills.log +293 -0
- data/data_updating_scripts/logs/migrate_iapp_categories.log +0 -0
- data/data_updating_scripts/mark_no_text_bills.py +120 -0
- data/data_updating_scripts/migrate_iapp_categories.py +358 -0
- data/generate_password_hash.py +135 -0
- data/huggingface_upload.py +251 -0
- data/pages/Admin.py +459 -0
- data/update_data.py +64 -0
.gitattributes
CHANGED
|
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
data/eu_ai_act_vectorstore/index.faiss filter=lfs diff=lfs merge=lfs -text
|
| 37 |
data/known_bills_visualize.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
data/eu_ai_act_vectorstore/index.faiss filter=lfs diff=lfs merge=lfs -text
|
| 37 |
data/known_bills_visualize.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
data/data_updating_scripts/eu-ai-act.pdf filter=lfs diff=lfs merge=lfs -text
|
data/data_updating_scripts/PROMPTS/__pycache__/bill_summary_prompt.cpython-313.pyc
ADDED
|
Binary file (1.37 kB). View file
|
|
|
data/data_updating_scripts/PROMPTS/bill_summary_prompt.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# PROMPTS/bill_summary_prompt.py
#
# Prompt template used to ask an LLM for a structured bill summary.
# The placeholders ({bill_number}, {bill_title}, {state}, {bill_text})
# are filled in via str.format() by the summary-generation script.
# NOTE: this is a runtime string consumed by the model — edit wording
# with care, as changes directly alter generated summaries.
BILL_SUMMARY_PROMPT = """
You are an expert legislative analyst specializing in AI governance and technology policy. Your task is to provide a clear, concise summary of the given bill text.

Please analyze the bill and provide a comprehensive summary that includes:

1. **Main Purpose**: What is the primary objective of this bill?
2. **Key Provisions**: What are the main requirements, prohibitions, or authorizations?
3. **AI-Related Elements**: How does this bill relate to artificial intelligence, if at all?
4. **Scope and Impact**: Who does this bill affect and what are the potential consequences?
5. **Implementation**: What mechanisms or processes does the bill establish?

**Requirements:**
- Keep the summary concise but comprehensive (aim for 200-400 words)
- Use clear, professional language
- Focus on the most important aspects of the bill
- If the bill is not related to AI, clearly state this
- Structure the response with clear sections using markdown formatting

**Bill Information:**
- Bill Number: {bill_number}
- Bill Title: {bill_title}
- State: {state}

**Bill Text:**
{bill_text}

Please provide your analysis:
"""
|
data/data_updating_scripts/PROMPTS/suggested_questions_prompt.md
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You are an AI governance legislation expert. Your task is to analyze the provided bill and generate exactly 5 relevant, specific questions that users might want to ask about this particular bill.
|
| 2 |
+
|
| 3 |
+
The questions should:
|
| 4 |
+
- Be specific to the content and provisions of this bill
|
| 5 |
+
- Cover different aspects of the legislation (definitions, scope, enforcement, compliance, etc.)
|
| 6 |
+
- Be phrased as user-friendly questions that someone analyzing AI governance would ask
|
| 7 |
+
- Be practical and actionable for understanding the bill's impact
|
| 8 |
+
- Avoid generic questions that could apply to any bill
|
| 9 |
+
|
| 10 |
+
Focus on aspects like:
|
| 11 |
+
- Key definitions and terminology
|
| 12 |
+
- Scope and applicability
|
| 13 |
+
- Enforcement mechanisms and penalties
|
| 14 |
+
- Compliance requirements
|
| 15 |
+
- Rights and obligations
|
| 16 |
+
- Implementation timelines
|
| 17 |
+
- Regulatory oversight
|
| 18 |
+
- Specific AI technologies or systems mentioned
|
| 19 |
+
|
| 20 |
+
Format your response as exactly 5 questions, one per line, with no numbering or bullet points. Each question should be complete and ready to use.
|
| 21 |
+
|
| 22 |
+
### Bill Content
|
| 23 |
+
{context}
|
| 24 |
+
|
| 25 |
+
Generate 5 specific questions about this bill:
|
data/data_updating_scripts/__pycache__/config.cpython-313.pyc
ADDED
|
Binary file (2.43 kB). View file
|
|
|
data/data_updating_scripts/build_bills_vectorstore.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import argparse, os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
load_dotenv(dotenv_path=Path.cwd() / ".env")
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
sys.path.append(str(Path(__file__).resolve().parents[1]))
|
| 10 |
+
|
| 11 |
+
def main():
    """CLI entry point: rebuild the bills vectorstore from the bills JSON dump.

    Selects the chroma or pinecone backend (flag or VECTOR_BACKEND env var)
    and reports the upsert statistics on stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--source", default="data/known_bills_visualize.json")
    parser.add_argument("--backend", choices=["chroma", "pinecone"],
                        default=os.getenv("VECTOR_BACKEND", "chroma"))
    parser.add_argument("--persist", default="data/bills_vectorstore")
    parser.add_argument("--collection", default="bills")
    parser.add_argument("--manifest", default="data/bills_vectorstore_manifest.json")
    parser.add_argument("--model", default=None)
    parser.add_argument("--batch", type=int, default=128)
    opts = parser.parse_args()

    # Arguments shared by both backends; chroma additionally needs a
    # persist directory and collection name.
    shared_kwargs = dict(
        source_json_path=opts.source,
        manifest_path=opts.manifest,
        embed_model=opts.model,
        batch_size=opts.batch,
    )

    if opts.backend == "pinecone":
        from vectorstore.pinecone_bills_vectorstore import upsert_from_bills_json
        stats = upsert_from_bills_json(**shared_kwargs)
    else:
        from vectorstore.bills_vectorstore import upsert_from_bills_json
        stats = upsert_from_bills_json(
            persist_dir=opts.persist,
            collection=opts.collection,
            **shared_kwargs,
        )

    print("✅ Vectorstore updated")
    for key, value in stats.items():
        print(f"  {key}: {value}")

if __name__ == "__main__":
    main()
|
data/data_updating_scripts/build_bills_vectorstore_pinecone_delta.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import os, json, time
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import List, Dict, Any
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
import sys
|
| 7 |
+
sys.path.append(str(Path(__file__).resolve().parents[1]))
|
| 8 |
+
|
| 9 |
+
load_dotenv(dotenv_path=Path.cwd() / ".env")
|
| 10 |
+
|
| 11 |
+
from vectorstore.pinecone_delta_upsert import chunk_bill, upsert_changed_vectors
|
| 12 |
+
|
| 13 |
+
SRC = "data/known_bills_visualize.json"
|
| 14 |
+
BATCH = int(os.getenv("PINECONE_BATCH", "128"))
|
| 15 |
+
|
| 16 |
+
def main():
    """Delta-upsert every bill chunk into Pinecone, in batches of BATCH.

    Reads the bills JSON, drops entries with no usable text, chunks each
    bill, and pushes only changed vectors; prints progress and a summary.
    """
    source = Path(SRC)
    if not source.exists():
        raise SystemExit(f"Missing {SRC}")

    bills: List[Dict[str, Any]] = json.loads(source.read_text(encoding="utf-8"))
    # Keep only bills that carry at least some text-like content.
    bills = [b for b in bills if (b.get("text") or b.get("description") or b.get("title"))]

    chunks: List[Dict[str, Any]] = []
    for bill in bills:
        chunks.extend(chunk_bill(bill))

    print(f"Total chunks computed: {len(chunks):,}")

    changed = 0
    started = time.time()
    for offset in range(0, len(chunks), BATCH):
        batch = chunks[offset:offset + BATCH]
        changed += upsert_changed_vectors(batch)
        # Progress line every 10 batches.
        if (offset // BATCH) % 10 == 0:
            print(f"… {offset + len(batch):,}/{len(chunks):,} processed")
    elapsed = time.time() - started

    print("✅ Pinecone delta upsert complete")
    print(f"  changed_upserts: {changed}")
    print(f"  elapsed_sec: {elapsed:.1f}")

if __name__ == "__main__":
    main()
|
data/data_updating_scripts/config.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Configuration settings for LegiScan AI Governance Bills Tracker."""
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import dotenv
|
| 5 |
+
|
| 6 |
+
dotenv.load_dotenv()
|
| 7 |
+
|
| 8 |
+
class ConfigManager:
    """Process-wide configuration read from environment variables.

    Values are read once at construction time; call :meth:`reload` to
    pick up environment changes (e.g. after re-loading a .env file).
    """

    def __init__(self):
        """Initialize configuration from the current environment."""
        self._load_base_config()

    def _load_base_config(self):
        """Load the base configuration shared by every deployment.

        OPENAI_API_KEY may legitimately be absent (offline/test runs);
        consumers are expected to check for None.
        """
        self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        self.OPENAI_LLM_MODEL = os.getenv("OPENAI_LLM_MODEL", "gpt-4o")

    def reload(self):
        """Re-read configuration from the environment.

        Bug fix: the previous implementation also called
        _load_profile_config() and _validate_config(), neither of which
        exists on this class, so reload() always raised AttributeError.
        """
        self._load_base_config()

    def __str__(self) -> str:
        """Return a printable dump of the config with secrets masked.

        Bug fix: the previous header referenced ``self.profile``, an
        attribute that is never set, so str(config) always raised.
        """
        sensitive_keys = ["OPENAI_API_KEY", "LEGISCAN_API_KEY"]
        config_str = "Configuration:\n"
        for key, value in self.__dict__.items():
            if key.startswith("_"):
                continue
            if key in sensitive_keys:
                config_str += f"{key}: {'*' * 8}\n"
            else:
                config_str += f"{key}: {value}\n"
        return config_str
|
| 41 |
+
|
| 42 |
+
# Create default instance
|
| 43 |
+
config = ConfigManager()
|
data/data_updating_scripts/eu-ai-act.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bba630444b3278e881066774002a1d7824308934f49ccfa203e65be43692f55e
|
| 3 |
+
size 2583319
|
data/data_updating_scripts/eu_vectorstore.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# scripts/create_eu_ai_act_vectorstore.py
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Script to create and save a vectorstore from the EU AI Act PDF.
|
| 6 |
+
This creates a FAISS vectorstore that can be loaded quickly in the main app.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import logging
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import pickle
|
| 13 |
+
from typing import Optional
|
| 14 |
+
import dotenv
|
| 15 |
+
|
| 16 |
+
# Import config
|
| 17 |
+
from config import config
|
| 18 |
+
|
| 19 |
+
# PDF processing
|
| 20 |
+
import PyPDF2
|
| 21 |
+
|
| 22 |
+
# LangChain components
|
| 23 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 24 |
+
from langchain_openai import OpenAIEmbeddings
|
| 25 |
+
from langchain_community.vectorstores import FAISS
|
| 26 |
+
from langchain.schema import Document
|
| 27 |
+
|
| 28 |
+
# Load environment variables
|
| 29 |
+
dotenv.load_dotenv()
|
| 30 |
+
|
| 31 |
+
# Create logs directory if it doesn't exist
|
| 32 |
+
os.makedirs("data_updating_scripts/logs", exist_ok=True)
|
| 33 |
+
|
| 34 |
+
# Configure logging
|
| 35 |
+
logging.basicConfig(
|
| 36 |
+
level=logging.INFO,
|
| 37 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 38 |
+
handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/eu_vectorstore.log")],
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
logger = logging.getLogger(__name__)
|
| 42 |
+
|
| 43 |
+
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the full text of a PDF, page by page.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The concatenated text of all pages, each prefixed with a
        "--- Page N ---" marker. Pages that fail to extract are skipped
        with a warning.

    Raises:
        Exception: Re-raises any error from opening or parsing the PDF
            itself (not per-page extraction failures).
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Collect per-page strings and join once at the end:
            # repeated `text +=` is quadratic on large documents.
            parts = []

            logger.info(f"Processing {len(pdf_reader.pages)} pages from {pdf_path}")

            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    parts.append(f"\n\n--- Page {page_num + 1} ---\n\n{page_text}")
                except Exception as e:
                    logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
                    continue

            text = "".join(parts)
            logger.info(f"Extracted {len(text)} characters from PDF")
            return text

    except Exception as e:
        logger.error(f"Error reading PDF {pdf_path}: {e}")
        # Bare raise preserves the original traceback (was `raise e`).
        raise
|
| 66 |
+
|
| 67 |
+
def create_eu_ai_act_documents(text_content: str) -> list:
    """Split the EU AI Act text into chunked Document objects.

    Every chunk carries regulation-level metadata plus its own index
    (`chunk_id`) and the total chunk count (`total_chunks`).
    """
    try:
        # Chunking tuned for legal prose: large chunks, generous overlap.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

        base_metadata = {
            'source': 'EU AI Act',
            'document_type': 'regulation',
            'jurisdiction': 'European Union',
            'title': 'Regulation (EU) 2024/1689 on Artificial Intelligence (AI Act)'
        }
        source_doc = Document(page_content=text_content, metadata=base_metadata)

        chunks = splitter.split_documents([source_doc])

        # Tag each chunk with its position so retrieval hits can be located.
        total = len(chunks)
        for index, chunk in enumerate(chunks):
            chunk.metadata.update({
                'chunk_id': index,
                'total_chunks': total
            })

        logger.info(f"Created {total} document chunks")
        return chunks

    except Exception as e:
        logger.error(f"Error creating documents: {e}")
        raise e
|
| 105 |
+
|
| 106 |
+
def create_and_save_eu_vectorstore(
    pdf_path: str = "data_updating_scripts/eu-ai-act.pdf",
    vectorstore_path: str = "data/eu_ai_act_vectorstore",
    openai_api_key: Optional[str] = None
) -> bool:
    """
    Create FAISS vectorstore from EU AI Act PDF and save it locally.

    Args:
        pdf_path: Path to the EU AI Act PDF file
        vectorstore_path: Directory to save the vectorstore
        openai_api_key: OpenAI API key (if not provided, uses config/env)

    Returns:
        bool: True if successful, False otherwise (all errors are logged
        and swallowed so callers can decide how to react).
    """
    try:
        # Check if PDF exists
        if not Path(pdf_path).exists():
            logger.error(f"PDF file not found: {pdf_path}")
            return False

        # Get API key
        api_key = openai_api_key or config.OPENAI_API_KEY
        if not api_key:
            logger.error("OpenAI API key not found")
            return False

        logger.info("Starting EU AI Act vectorstore creation...")

        # Extract text from PDF
        logger.info("Extracting text from PDF...")
        text_content = extract_text_from_pdf(pdf_path)

        # A real extraction of the Act is far larger than 1 KB; anything
        # shorter indicates the PDF parse silently produced garbage.
        if not text_content or len(text_content) < 1000:
            logger.error("Insufficient text extracted from PDF")
            return False

        # Create documents
        logger.info("Creating document chunks...")
        documents = create_eu_ai_act_documents(text_content)
        if not documents:
            logger.error("No documents created")
            return False

        # Initialize embeddings
        logger.info("Initializing embeddings...")
        embeddings = OpenAIEmbeddings(
            api_key=api_key,
            model="text-embedding-3-small"
        )

        # Create vectorstore
        logger.info("Creating FAISS vectorstore...")
        vectorstore = FAISS.from_documents(documents, embeddings)

        # Bug fix: parents=True so a missing intermediate directory
        # (e.g. "data/") does not abort the save with FileNotFoundError.
        Path(vectorstore_path).mkdir(parents=True, exist_ok=True)

        # Save vectorstore
        logger.info(f"Saving vectorstore to {vectorstore_path}...")
        vectorstore.save_local(vectorstore_path)

        # Persist build metadata next to the index for later inspection
        # (see get_vectorstore_info).
        metadata = {
            'pdf_path': pdf_path,
            'total_chunks': len(documents),
            'text_length': len(text_content),
            'embedding_model': 'text-embedding-3-small',
            'chunk_size': 1500,
            'chunk_overlap': 200
        }
        metadata_path = Path(vectorstore_path) / "metadata.pickle"
        with open(metadata_path, 'wb') as f:
            pickle.dump(metadata, f)

        logger.info("✅ EU AI Act vectorstore created successfully!")
        logger.info(f"  - Total chunks: {len(documents)}")
        logger.info(f"  - Text length: {len(text_content):,} characters")
        logger.info(f"  - Saved to: {vectorstore_path}")

        return True

    except Exception as e:
        logger.error(f"Error creating EU AI Act vectorstore: {e}")
        return False
|
| 194 |
+
|
| 195 |
+
def load_eu_vectorstore(
    vectorstore_path: str = "data/eu_ai_act_vectorstore",
    openai_api_key: Optional[str] = None
) -> Optional[FAISS]:
    """
    Load the EU AI Act vectorstore from disk.

    Args:
        vectorstore_path: Path to the saved vectorstore. Consistency fix:
            the default now matches the location written by
            create_and_save_eu_vectorstore() ("data/eu_ai_act_vectorstore");
            the old default ("eu_ai_act_vectorstore") pointed at a
            directory nothing ever writes, so no-arg calls always failed.
        openai_api_key: OpenAI API key (falls back to config/env).

    Returns:
        FAISS vectorstore or None if loading failed (errors are logged).
    """
    try:
        if not Path(vectorstore_path).exists():
            logger.error(f"Vectorstore not found: {vectorstore_path}")
            return None

        # Get API key
        api_key = openai_api_key or config.OPENAI_API_KEY
        if not api_key:
            logger.error("OpenAI API key not found")
            return None

        # Embedding model must match the one used at build time.
        embeddings = OpenAIEmbeddings(
            api_key=api_key,
            model="text-embedding-3-small"
        )

        # The saved index contains pickled objects, so FAISS requires an
        # explicit opt-in to deserialize. Only load stores this project
        # created itself — never an untrusted index.
        vectorstore = FAISS.load_local(
            vectorstore_path,
            embeddings,
            allow_dangerous_deserialization=True
        )

        logger.info(f"✅ EU AI Act vectorstore loaded from {vectorstore_path}")
        return vectorstore

    except Exception as e:
        logger.error(f"Error loading EU AI Act vectorstore: {e}")
        return None
|
| 239 |
+
|
| 240 |
+
def get_vectorstore_info(vectorstore_path: str = "data/eu_ai_act_vectorstore") -> dict:
    """Return the metadata dict saved alongside the vectorstore.

    Yields an {"error": ...} dict when the metadata file is missing or
    cannot be read, instead of raising.
    """
    try:
        metadata_file = Path(vectorstore_path) / "metadata.pickle"
        if not metadata_file.exists():
            return {"error": "Metadata not found"}
        with open(metadata_file, 'rb') as handle:
            return pickle.load(handle)
    except Exception as e:
        return {"error": str(e)}
|
| 252 |
+
|
| 253 |
+
if __name__ == "__main__":
    # Build the vectorstore, then print a summary of what was created.
    success = create_and_save_eu_vectorstore()

    if success:
        info = get_vectorstore_info()
        print("\n" + "=" * 50)
        print("EU AI Act Vectorstore Information:")
        print("=" * 50)
        for key, value in info.items():
            if key != 'error':
                print(f"{key}: {value}")
        print("=" * 50)
    else:
        print("❌ Failed to create EU AI Act vectorstore")
        # Fix: exit() is a site-module convenience that may be absent
        # (e.g. under `python -S`); raise SystemExit for a reliable
        # non-zero exit code.
        raise SystemExit(1)
|
data/data_updating_scripts/fix_pdf_bills.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import base64
|
| 4 |
+
import logging
|
| 5 |
+
import sys
|
| 6 |
+
from datetime import datetime, timezone
|
| 7 |
+
import requests
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
import PyPDF2
|
| 10 |
+
from io import BytesIO
|
| 11 |
+
import re
|
| 12 |
+
import shutil
|
| 13 |
+
|
| 14 |
+
# Load environment variables
|
| 15 |
+
load_dotenv()
|
| 16 |
+
API_KEY = os.getenv("LEGISCAN_API_KEY")
|
| 17 |
+
|
| 18 |
+
# Files
|
| 19 |
+
INPUT_FILE = "data/known_bills.json"
|
| 20 |
+
OUTPUT_FILE = "data/known_bills_fixed.json"
|
| 21 |
+
BACKUP_FILE = "data/known_bills_backup.json"
|
| 22 |
+
|
| 23 |
+
# Rate limiting
|
| 24 |
+
import time
|
| 25 |
+
RATE_LIMIT = 0.2 # seconds between API requests
|
| 26 |
+
|
| 27 |
+
# Logging configuration
|
| 28 |
+
LOG_FILE = "data_updating_scripts/logs/fix_pdf_bills.log"
|
| 29 |
+
os.makedirs("data_updating_scripts/logs", exist_ok=True)
|
| 30 |
+
|
| 31 |
+
logging.basicConfig(
|
| 32 |
+
level=logging.INFO,
|
| 33 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 34 |
+
handlers=[
|
| 35 |
+
logging.StreamHandler(sys.stdout),
|
| 36 |
+
logging.FileHandler(LOG_FILE)
|
| 37 |
+
]
|
| 38 |
+
)
|
| 39 |
+
logger = logging.getLogger(__name__)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def is_pdf_content(text):
    """Return True if *text* looks like raw (unextracted) PDF bytes.

    Detects a %PDF version header — both the standard "%PDF-1.x" form
    and the dashless "%PDF1.x" variant seen in some payloads — at the
    very start of the string.

    Args:
        text: Bill text to inspect; may be None or empty.

    Returns:
        bool: True when the text starts with a known PDF signature.
    """
    if not text:
        return False
    pdf_signatures = ("%PDF-1.3", "%PDF-1.4", "%PDF-1.5", "%PDF-1.6", "%PDF-1.7",
                      "%PDF1.3", "%PDF1.4", "%PDF1.5", "%PDF1.6", "%PDF1.7")
    # str.startswith accepts a tuple of prefixes, so the defensive
    # text[:20] slice in the original (startswith only inspects the
    # prefix anyway) and the any() loop are both unnecessary.
    return text.startswith(pdf_signatures)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract and lightly normalize text from raw PDF bytes.

    Args:
        pdf_bytes: Raw PDF file content (e.g. base64-decoded API payload).

    Returns:
        str | None: Cleaned text — runs of blank lines collapsed to one
        paragraph break and runs of spaces collapsed to one — or None if
        the PDF could not be parsed.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))

        # Iterate page objects directly rather than indexing via
        # range(len(...)); skip pages that yield no text.
        text_content = []
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:
                text_content.append(page_text)

        full_text = "\n".join(text_content)

        # Clean up the extracted text: normalize whitespace while
        # preserving paragraph breaks.
        full_text = re.sub(r'\n{3,}', '\n\n', full_text)
        full_text = re.sub(r' {2,}', ' ', full_text)
        full_text = full_text.strip()

        return full_text
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return None
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def legi_request(op, params):
    """Call the LegiScan API and return the parsed JSON payload.

    Mutates *params* by adding the API key and operation, then issues a
    GET. Returns None (after logging) on transport errors or when the
    API reports a non-OK status.
    """
    endpoint = "https://api.legiscan.com/"
    params.update({"key": API_KEY, "op": op})
    try:
        response = requests.get(endpoint, params=params, timeout=10)
        response.raise_for_status()
        payload = response.json()
    except requests.RequestException as e:
        logger.error(f"Request failed ({op}): {e}")
        return None
    if payload.get("status") != "OK":
        logger.error(f"API error {op}: {payload.get('message', payload)}")
        return None
    return payload
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def fix_pdf_bill(bill):
    """Re-fetch a bill whose stored text is raw PDF bytes and extract real text.

    Args:
        bill: Bill dict containing at least "bill_id", "state" and
            "bill_number".

    Returns:
        str | None: Extracted plain text (PDF first, HTML fallback), or
        None when the document could not be recovered. All failure modes
        are logged rather than raised.
    """
    bill_id = bill.get("bill_id")
    state = bill.get("state")
    bill_num = bill.get("bill_number")

    logger.info(f"Fixing PDF content for {state} {bill_num} (ID: {bill_id})")

    # Re-fetch the bill so we get a fresh list of text documents.
    details_resp = legi_request("getBill", {"id": bill_id})
    if not details_resp:
        logger.warning(f"Could not fetch bill details for {bill_id}")
        return None

    details = details_resp.get("bill", {})
    texts = details.get("texts", [])
    if not texts:
        logger.warning(f"No text documents available for {bill_id}")
        return None

    # Use the first listed document version.
    doc_id = texts[0].get("doc_id")
    text_resp = legi_request("getBillText", {"id": doc_id})
    if not text_resp or "text" not in text_resp:
        logger.warning(f"Could not fetch text for {bill_id}")
        return None

    raw_b64 = text_resp["text"].get("doc", "")
    if not raw_b64:
        logger.warning(f"No document content for {bill_id}")
        return None

    try:
        # Decode the base64 content
        decoded = base64.b64decode(raw_b64)

        # PDF files start with the %PDF magic bytes.
        if decoded[:4] == b'%PDF':
            extracted_text = extract_text_from_pdf_bytes(decoded)
            # Require a minimum length so junk extractions are rejected.
            if extracted_text and len(extracted_text.strip()) > 100:
                logger.info(f"Successfully extracted {len(extracted_text)} characters from PDF for {bill_id}")
                return extracted_text
            else:
                logger.warning(f"Extracted text too short or empty for {bill_id}")
                return None
        else:
            # Not a PDF: try to decode as HTML (shouldn't happen for
            # these cases, but just in case).
            # Bug fix: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to Exception and
            # the failure is now logged instead of silently dropped.
            try:
                from bs4 import BeautifulSoup
                html = decoded.decode("utf-8", errors="ignore")
                soup = BeautifulSoup(html, "html.parser")
                plain_text = soup.get_text(separator="\n", strip=True)
                if plain_text and len(plain_text.strip()) > 100:
                    logger.info(f"Successfully extracted HTML text for {bill_id}")
                    return plain_text
            except Exception as e:
                logger.debug(f"HTML fallback failed for {bill_id}: {e}")

            logger.warning(f"Could not process document for {bill_id}")
            return None

    except Exception as e:
        logger.error(f"Error processing document for {bill_id}: {e}")
        return None
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def main(overwrite: bool | None = None) -> None:
    """Re-extract readable text for bills whose stored text is unprocessed PDF content.

    Loads bills from INPUT_FILE, backs them up to BACKUP_FILE, runs
    fix_pdf_bill() on every bill flagged by is_pdf_content(), and writes the
    results to OUTPUT_FILE. Optionally copies OUTPUT_FILE back over
    INPUT_FILE at the end.

    Args:
        overwrite: None  -> interactive CLI mode; ask via input() before
                           overwriting INPUT_FILE.
                   True  -> overwrite INPUT_FILE without asking
                           (non-interactive pipeline mode).
                   False -> never overwrite INPUT_FILE.
    """
    # Load the bills
    logger.info(f"Loading bills from {INPUT_FILE}")
    try:
        with open(INPUT_FILE, 'r') as f:
            bills = json.load(f)
    except Exception as e:
        logger.error(f"Could not load bills file: {e}")
        sys.exit(1)

    logger.info(f"Loaded {len(bills)} bills")

    # Create a backup before mutating anything, so a bad run is recoverable.
    logger.info(f"Creating backup at {BACKUP_FILE}")
    with open(BACKUP_FILE, 'w') as f:
        json.dump(bills, f, indent=2)

    # Find bills with unprocessed PDF content (indices, so we can update in place)
    pdf_bills = []
    for i, bill in enumerate(bills):
        if is_pdf_content(bill.get("text")):
            pdf_bills.append(i)

    logger.info(f"Found {len(pdf_bills)} bills with unprocessed PDF content")

    # Process each PDF bill
    fixed_count = 0
    failed_count = 0

    for idx, bill_idx in enumerate(pdf_bills):
        bill = bills[bill_idx]
        logger.info(f"Processing {idx + 1}/{len(pdf_bills)}: {bill.get('state')} {bill.get('bill_number')}")

        # Try to fix the PDF content
        fixed_text = fix_pdf_bill(bill)

        if fixed_text:
            # Update the bill with the fixed text
            bills[bill_idx]["text"] = fixed_text
            bills[bill_idx]["lastUpdatedAt"] = datetime.now(timezone.utc).isoformat()
            bills[bill_idx]["text_fixed"] = True  # Mark that we fixed this
            fixed_count += 1
            logger.info(f"Successfully fixed bill {bill.get('bill_id')}")
        else:
            # Mark that we tried but failed, so later runs can tell the
            # difference between "never attempted" and "attempted and failed".
            bills[bill_idx]["text_extraction_failed"] = True
            bills[bill_idx]["lastUpdatedAt"] = datetime.now(timezone.utc).isoformat()
            failed_count += 1
            logger.warning(f"Failed to fix bill {bill.get('bill_id')}")

        # Rate limiting between upstream API calls
        time.sleep(RATE_LIMIT)

        # Save progress every 50 bills so an interrupted run loses little work
        if (idx + 1) % 50 == 0:
            logger.info(f"Saving progress... ({idx + 1}/{len(pdf_bills)} processed)")
            with open(OUTPUT_FILE, 'w') as f:
                json.dump(bills, f, indent=2)

    # Save final results
    logger.info(f"Saving final results to {OUTPUT_FILE}")
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(bills, f, indent=2)

    logger.info(f"Processing complete!")
    logger.info(f"Successfully fixed: {fixed_count} bills")
    logger.info(f"Failed to fix: {failed_count} bills")
    logger.info(f"Output saved to: {OUTPUT_FILE}")

    if fixed_count > 0:
        # Decide overwrite behavior
        if overwrite is None:
            # CLI mode: ask the user (guardrail preserved)
            try:
                response = input(
                    f"\nDo you want to overwrite {INPUT_FILE} with the fixed data? (y/n): "
                )
            except EOFError:
                # stdin is closed (e.g. run under a scheduler): fail safe and
                # keep the original file; OUTPUT_FILE already holds the fixes.
                logger.error(
                    "No input available (EOF). Leaving original file unchanged."
                )
                return
            overwrite_flag = response.strip().lower().startswith("y")
        else:
            # Non-interactive mode (e.g. Streamlit pipeline)
            overwrite_flag = overwrite

        if overwrite_flag:
            shutil.copy2(OUTPUT_FILE, INPUT_FILE)
            logger.info(f"Original file {INPUT_FILE} has been updated with fixed data.")
        else:
            logger.info("Overwrite declined; original file left unchanged.")
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
if __name__ == "__main__":
    # Entry point. When running under Streamlit / the pipeline, the
    # FIX_PDF_OVERWRITE environment variable decides overwrite behaviour:
    #   "yes", "y", "true", "1"  -> overwrite=True
    #   "no", "n", "false", "0"  -> overwrite=False
    # When it is unset we fall back to CLI mode and main() asks via input().
    raw_setting = os.getenv("FIX_PDF_OVERWRITE")

    if raw_setting is None:
        # Local CLI run -> still interactive
        main(overwrite=None)
    else:
        normalized = raw_setting.strip().lower()
        if normalized in ("yes", "y", "true", "1"):
            decision = True
        elif normalized in ("no", "n", "false", "0"):
            decision = False
        else:
            logger.warning(
                f"Invalid FIX_PDF_OVERWRITE='{raw_setting}', defaulting to no overwrite."
            )
            decision = False
        main(overwrite=decision)
|
| 281 |
+
|
| 282 |
+
|
data/data_updating_scripts/generate_reports.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
generate_reports.py
|
| 3 |
+
--------------------
|
| 4 |
+
|
| 5 |
+
Generates detailed Markdown reports for AI-related bills from `known_bills_visualize.json`
|
| 6 |
+
using the latest LangChain pipeline syntax.
|
| 7 |
+
|
| 8 |
+
Now includes resume functionality - can be safely stopped and restarted.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import logging
|
| 15 |
+
import os
|
| 16 |
+
import time
|
| 17 |
+
from dataclasses import dataclass
|
| 18 |
+
from typing import Any, Dict, List, Optional
|
| 19 |
+
import dotenv
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
dotenv.load_dotenv()
|
| 23 |
+
|
| 24 |
+
# Create logs directory if it doesn't exist
|
| 25 |
+
os.makedirs("data_updating_scripts/logs", exist_ok=True)
|
| 26 |
+
|
| 27 |
+
# Latest LangChain imports
|
| 28 |
+
try:
|
| 29 |
+
from langchain_openai import ChatOpenAI
|
| 30 |
+
from langchain.prompts import ChatPromptTemplate
|
| 31 |
+
except ImportError: # pragma: no cover
|
| 32 |
+
ChatOpenAI = None # type: ignore
|
| 33 |
+
ChatPromptTemplate = None # type: ignore
|
| 34 |
+
|
| 35 |
+
# Configure logging
|
| 36 |
+
logging.basicConfig(
|
| 37 |
+
level=logging.INFO,
|
| 38 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 39 |
+
handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/generate_reports.log")],
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
logger = logging.getLogger(__name__)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@dataclass
class BillReport:
    """Stores a bill ID and its generated detailed report."""
    # Bill identifier, stringified by create_detailed_report() from the
    # bill's "bill_id" field.
    bill_id: str
    # Full Markdown report text produced by the LLM.
    report_markdown: str
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# Prompt template for the detailed-report chain.
#
# BUGFIX: the import guard above sets ChatPromptTemplate = None when
# langchain is unavailable, but the original code unconditionally called
# ChatPromptTemplate.from_template(...) at module level, crashing on import
# and defeating that graceful fallback. The prompt is now only built when
# the import succeeded; _ensure_llm() still raises a clear RuntimeError when
# the packages are missing.
_DETAILED_REPORT_TEMPLATE = """You are a seasoned legislative analyst adept at interpreting and
summarising bills related to artificial intelligence. Using the bill
information provided as JSON, produce a detailed report in Markdown
format for stakeholders.

Include:
- Bill's title, number, and state
- Status and key dates
- URL to the bill on legiscan
- Sponsors and scope
- Goals and intent
- Key provisions, regulatory approaches, implementation & enforcement
- Unique aspects or notable features

Format:
- Use Markdown headings and bullet points
- Paraphrase content
- Do not invent facts
- If bill text is truncated in source JSON, note this at the end

Bill JSON:
```json
{bill_json}
```

Now craft the detailed report.
"""

DETAILED_REPORT_PROMPT = (
    ChatPromptTemplate.from_template(_DETAILED_REPORT_TEMPLATE)
    if ChatPromptTemplate is not None
    else None  # langchain missing; _ensure_llm() reports the actionable error
)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _ensure_llm() -> ChatOpenAI:
    """Build a deterministic ChatOpenAI client after validating prerequisites.

    Raises:
        RuntimeError: if langchain/openai are not installed, or the
            OPENAI_API_KEY environment variable is unset/empty.
    """
    if ChatOpenAI is None:
        raise RuntimeError(
            "The 'langchain' and 'openai' packages are required. Install them via 'pip install langchain openai'."
        )
    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError("The OPENAI_API_KEY environment variable is not set.")
    chosen_model = os.getenv("MODEL_NAME", "gpt-4o")
    logger.debug("Initialising ChatOpenAI with model %s", chosen_model)
    # temperature=0 keeps report generation as repeatable as possible.
    return ChatOpenAI(model=chosen_model, temperature=0)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def create_detailed_report(
    bill: Dict[str, Any], *, llm: Optional[ChatOpenAI] = None
) -> BillReport:
    """Produce a Markdown report for one bill via the prompt | llm pipeline."""
    active_llm = llm if llm is not None else _ensure_llm()

    serialized = json.dumps(bill, ensure_ascii=False, indent=2)

    # LCEL composition: prompt output feeds straight into the model.
    pipeline = DETAILED_REPORT_PROMPT | active_llm
    outcome = pipeline.invoke({"bill_json": serialized})

    # invoke() may return an AIMessage (has .content) or a plain value.
    markdown = getattr(outcome, "content", str(outcome))

    return BillReport(bill_id=str(bill.get("bill_id")), report_markdown=markdown)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def load_existing_reports(output_path: str) -> Dict[str, str]:
    """Return previously generated reports keyed by bill_id ({} if none usable)."""
    if not os.path.exists(output_path):
        return {}
    try:
        with open(output_path, "r", encoding="utf-8") as f:
            stored = json.load(f)
        # File holds a list of {"bill_id": ..., "report_markdown": ...} records;
        # index them by bill_id, ignoring malformed entries.
        by_id = {
            entry["bill_id"]: entry["report_markdown"]
            for entry in stored
            if "bill_id" in entry and "report_markdown" in entry
        }
        logger.info(f"Loaded {len(by_id)} existing reports from {output_path}")
        return by_id
    except Exception as e:
        # A corrupt file just means we regenerate from scratch.
        logger.warning(f"Could not load existing reports: {e}")
        return {}
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def save_reports_to_file(reports_dict: Dict[str, str], output_path: str) -> None:
    """Persist reports as a JSON list of {bill_id, report_markdown} records."""
    serializable = [
        {"bill_id": key, "report_markdown": markdown}
        for key, markdown in reports_dict.items()
    ]
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(serializable, f, ensure_ascii=False, indent=2)
    logger.info("Saved %d reports to %s", len(serializable), output_path)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def create_reports_with_resume(
    bills: List[Dict[str, Any]],
    output_path: str,
    *,
    llm: Optional[ChatOpenAI] = None,
    save_interval: int = 10
) -> Dict[str, str]:
    """
    Generate detailed reports for multiple bills with resume capability.

    Reports already present at ``output_path`` (and not marked "ERROR:") are
    kept and their bills skipped, so the run can be stopped and restarted
    safely; failed bills are retried on the next run.

    Args:
        bills: List of bill dictionaries
        output_path: Path to save reports
        llm: Optional LLM instance (created via _ensure_llm() when omitted)
        save_interval: Save progress every N bills

    Returns:
        Dictionary of bill_id -> report_markdown
    """
    if not bills:
        return {}

    if llm is None:
        llm = _ensure_llm()

    # Load existing reports so completed work is not redone after a restart.
    reports_dict = load_existing_reports(output_path)

    # Track progress
    total_bills = len(bills)
    processed = 0
    skipped = 0
    errors = 0

    logger.info(f"Starting report generation for {total_bills} bills")

    for i, bill in enumerate(bills, 1):
        bill_id = str(bill.get("bill_id"))

        # Skip if already processed successfully (ERROR: entries are retried)
        if bill_id in reports_dict and reports_dict[bill_id] and not reports_dict[bill_id].startswith("ERROR:"):
            logger.info(f"Skipping bill {bill_id} - already processed ({i}/{total_bills})")
            skipped += 1
            continue

        logger.info(f"Processing {i}/{total_bills}: Bill ID {bill_id}")

        try:
            report = create_detailed_report(bill, llm=llm)
            reports_dict[bill_id] = report.report_markdown
            processed += 1

        except Exception as exc:
            logger.exception(
                "Failed to generate report for bill %s: %s", bill_id, exc
            )
            # Record the failure in-line so the next run knows to retry it.
            reports_dict[bill_id] = f"ERROR: Failed to generate report - {str(exc)}"
            errors += 1

        # Save progress periodically
        if i % save_interval == 0:
            save_reports_to_file(reports_dict, output_path)
            logger.info(f"Progress: {i}/{total_bills} - Processed: {processed}, Skipped: {skipped}, Errors: {errors}")

        # Rate limiting to avoid API throttling. Every bill reaching this
        # point made an API call (skipped bills `continue` above).
        # BUGFIX: the previous condition (`bill_id not in reports_dict or
        # reports_dict[bill_id].startswith("ERROR:")`) was always False after
        # a successful call, so only *failed* calls were throttled.
        time.sleep(1)

    # Final save
    save_reports_to_file(reports_dict, output_path)

    logger.info("Report generation complete!")
    logger.info(f"Total bills: {total_bills}")
    logger.info(f"Successfully processed: {processed}")
    logger.info(f"Skipped (already done): {skipped}")
    logger.info(f"Errors: {errors}")

    return reports_dict
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def read_bills_from_file(path: str) -> List[Dict[str, Any]]:
    """Load *path* as JSON and return its contents, which must be a list of bills.

    Raises:
        ValueError: if the file's top-level JSON value is not a list.
    """
    with open(path, "r", encoding="utf-8") as f:
        parsed = json.load(f)
    if not isinstance(parsed, list):
        raise ValueError(f"Expected list of bills in {path}, got {type(parsed)}")
    return parsed
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def generate_reports_from_files(
    input_path: str = "data/known_bills_visualize.json",
    output_path: str = "data/bill_reports.json",
) -> None:
    """Read bills, generate reports with resume capability, and write them to disk.

    Thin convenience wrapper: loads the bill list from ``input_path`` and
    delegates to create_reports_with_resume(), which persists results to
    ``output_path`` incrementally.
    """
    bills = read_bills_from_file(input_path)
    create_reports_with_resume(bills, output_path)
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def main() -> None:
    """CLI entry point: parse arguments and run resumable report generation."""
    # Local imports keep the module importable as a library without argparse.
    import argparse
    import sys

    # NOTE: logging is configured once at module import time; the previous
    # in-function logging.basicConfig() call was redundant (basicConfig is a
    # no-op once handlers are installed) and has been removed.
    parser = argparse.ArgumentParser(
        description="Generate detailed AI legislation reports from bill data with resume capability."
    )
    parser.add_argument("--input", default="data/known_bills_visualize.json", help="Path to input JSON file")
    parser.add_argument("--output", default="data/bill_reports.json", help="Path to output JSON file")
    parser.add_argument("--save-interval", type=int, default=10, help="Save progress every N bills (default: 10)")
    args = parser.parse_args()

    try:
        bills = read_bills_from_file(args.input)
        create_reports_with_resume(bills, args.output, save_interval=args.save_interval)
        print("✅ Report generation completed successfully!")
        print(f" Reports saved to: {args.output}")
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        print(f"❌ Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
data/data_updating_scripts/generate_suggested_questions.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Script to generate suggested questions for all bills in known_bills_visualize.json.
|
| 4 |
+
|
| 5 |
+
This script reads all bills from known_bills_visualize.json, generates 5 suggested questions using OpenAI API,
|
| 6 |
+
and saves them to data/bill_suggested_questions.json to avoid repeated API calls.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import logging
|
| 11 |
+
import time
|
| 12 |
+
import pandas as pd
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Dict, List, Optional
|
| 15 |
+
import sys
|
| 16 |
+
import os
|
| 17 |
+
|
| 18 |
+
# Add the project root to the path
|
| 19 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
| 20 |
+
|
| 21 |
+
from config import ConfigManager
|
| 22 |
+
from langchain_openai import ChatOpenAI
|
| 23 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 24 |
+
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 25 |
+
from langchain_core.documents import Document
|
| 26 |
+
|
| 27 |
+
# Create logs directory if it doesn't exist
|
| 28 |
+
os.makedirs("data_updating_scripts/logs", exist_ok=True)
|
| 29 |
+
|
| 30 |
+
# Configure logging
|
| 31 |
+
logging.basicConfig(
|
| 32 |
+
level=logging.INFO,
|
| 33 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 34 |
+
handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/generate_suggested_questions.log")]
|
| 35 |
+
)
|
| 36 |
+
logger = logging.getLogger(__name__)
|
| 37 |
+
|
| 38 |
+
class SuggestedQuestionsGenerator:
    """Generates suggested questions for all bills in known_bills_visualize.json.

    Workflow: load bills, skip any bill that already has 5 stored questions,
    run the stuff-documents chain on each remaining bill's text, parse the
    LLM output into exactly 5 questions (padding with fallbacks), and persist
    results to data/bill_suggested_questions.json every 10 bills.
    """

    def __init__(self) -> None:
        """Initialize the questions generator with configuration.

        Raises:
            ValueError: if OPENAI_API_KEY is missing from the environment.
            FileNotFoundError: if the system-prompt markdown file is absent.
        """
        self.config = ConfigManager()
        self.known_bills_file = Path("data/known_bills_visualize.json")
        self.questions_file = Path("data/bill_suggested_questions.json")

        # Initialize OpenAI LLM
        if not self.config.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY not found in environment variables")

        self.llm = ChatOpenAI(
            model=self.config.OPENAI_LLM_MODEL,
            temperature=0.3,
            max_tokens=500
        )

        # Load the system prompt from markdown file
        prompt_path = "data_updating_scripts/PROMPTS/suggested_questions_prompt.md"
        if not os.path.exists(prompt_path):
            raise FileNotFoundError(f"The specified file was not found: {prompt_path}")

        with open(prompt_path, "r") as file:
            system_prompt = file.read()

        # Create the prompt and chain
        self.prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", "Generate 5 specific questions about this bill based on its content."),
            ]
        )

        # Stuff-documents chain: the bill Document(s) are injected into the
        # prompt's {context} variable in a single LLM call.
        self.question_generation_chain = create_stuff_documents_chain(
            self.llm, self.prompt
        )

        # Fallback questions used whenever generation fails or yields < 5.
        self.fallback_questions = [
            "What are the key definitions in this bill?",
            "What are the enforcement mechanisms?",
            "Who does this bill apply to?",
            "What are the compliance requirements?",
            "What penalties are specified?"
        ]

        logger.info(f"Initialized SuggestedQuestionsGenerator with model: {self.config.OPENAI_LLM_MODEL}")

    def dataframe_to_documents(self, df: pd.DataFrame) -> List[Document]:
        """Convert DataFrame to list of Document objects.

        Rows with a missing/blank 'text' column are dropped; bill metadata is
        carried on each Document for traceability.
        """
        documents = []
        for _, row in df.iterrows():
            if 'text' in row and pd.notna(row['text']) and row['text'].strip():
                doc = Document(
                    page_content=row['text'],
                    metadata={
                        'bill_key': f"{row.get('state', 'Unknown')}_{row.get('bill_number', 'Unknown')}",
                        'state': row.get('state', 'Unknown'),
                        'bill_number': row.get('bill_number', 'Unknown'),
                        'title': row.get('title', 'No title')
                    }
                )
                documents.append(doc)
        return documents

    def load_known_bills(self) -> List[Dict]:
        """Load bills from known_bills_visualize.json.

        Raises:
            FileNotFoundError: if the bills file is missing.
            json.JSONDecodeError: if the file is not valid JSON.
        """
        try:
            with open(self.known_bills_file, 'r', encoding='utf-8') as f:
                bills = json.load(f)
            logger.info(f"Loaded {len(bills)} bills from {self.known_bills_file}")
            return bills
        except FileNotFoundError:
            logger.error(f"File not found: {self.known_bills_file}")
            raise
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing JSON: {e}")
            raise

    def load_existing_questions(self) -> Dict:
        """Load existing questions if available.

        Returns {} when the file is absent or unreadable, so generation
        simply starts from scratch.
        """
        if self.questions_file.exists():
            try:
                with open(self.questions_file, 'r', encoding='utf-8') as f:
                    questions = json.load(f)
                logger.info(f"Loaded {len(questions)} existing question sets")
                return questions
            except Exception as e:
                logger.warning(f"Could not load existing questions: {e}")
                return {}
        return {}

    def save_questions(self, questions: Dict) -> None:
        """Save questions to JSON file.

        Raises:
            Exception: re-raised after logging if the write fails.
        """
        try:
            with open(self.questions_file, 'w', encoding='utf-8') as f:
                json.dump(questions, f, indent=2, ensure_ascii=False)
            logger.info(f"Saved {len(questions)} question sets to {self.questions_file}")
        except Exception as e:
            logger.error(f"Error saving questions: {e}")
            raise

    def parse_questions_response(self, response: str) -> List[str]:
        """Parse the LLM response into individual questions.

        Keeps only lines ending in '?', strips common list prefixes, and
        pads with fallback questions so exactly 5 are always returned.
        """
        questions = []
        if isinstance(response, str):
            # Split by lines and clean up
            lines = [line.strip() for line in response.split('\n') if line.strip()]
            # Filter out any numbering or bullet points
            for line in lines:
                # Remove common prefixes like "1.", "2.", "3.", "4.", "5.", "•", "-", "*", etc.
                clean_line = line
                if line.startswith(('1.', '2.', '3.', '4.', '5.', '•', '-', '*')):
                    # prefix + following space is 2 chars for all of the above
                    clean_line = line[2:].strip()
                elif line.startswith(('1)', '2)', '3)', '4)', '5)')):
                    clean_line = line[2:].strip()

                if clean_line and clean_line.endswith('?'):
                    questions.append(clean_line)

        # Ensure we have exactly 5 questions
        if len(questions) < 5:
            # Use fallback questions to fill up to 5
            questions.extend(self.fallback_questions[len(questions):])

        return questions[:5]  # Return only the first 5

    def generate_questions(self, bill: Dict) -> Optional[List[str]]:
        """Generate suggested questions for a single bill.

        Returns the fallback questions (never raises) when the bill has no
        text, no Document could be built, or the chain call fails.
        """
        try:
            bill_number = bill.get('bill_number', 'Unknown')
            bill_title = bill.get('title', 'No title')
            bill_text = bill.get('text', '')

            if not bill_text:
                logger.warning(f"No text found for bill {bill_number}")
                return self.fallback_questions

            # Convert bill to document format
            df = pd.DataFrame([bill])
            docs = self.dataframe_to_documents(df)

            if not docs:
                logger.warning(f"No document created for bill {bill_number}")
                return self.fallback_questions

            # Generate questions using the chain
            response = self.question_generation_chain.invoke({"context": docs})

            # Parse the response into questions
            questions = self.parse_questions_response(response)

            logger.info(f"Generated {len(questions)} questions for {bill_number}")
            return questions

        except Exception as e:
            logger.error(f"Error generating questions for bill {bill.get('bill_number', 'Unknown')}: {e}")
            return self.fallback_questions

    def generate_all_questions(self) -> None:
        """Generate suggested questions for all bills.

        Resumable: bills already holding 5 stored questions are skipped, and
        progress is flushed to disk every 10 bills.
        """
        # Load bills and existing questions
        bills = self.load_known_bills()
        existing_questions = self.load_existing_questions()

        # Track progress
        total_bills = len(bills)
        processed = 0
        errors = 0

        logger.info(f"Starting question generation for {total_bills} bills")

        for i, bill in enumerate(bills, 1):
            bill_key = f"{bill.get('state', 'Unknown')}_{bill.get('bill_number', 'Unknown')}"

            # Skip if already processed successfully
            if bill_key in existing_questions and len(existing_questions[bill_key].get('suggested_questions', [])) == 5:
                logger.info(f"Skipping {bill_key} - already processed")
                processed += 1
                continue

            logger.info(f"Processing {i}/{total_bills}: {bill_key}")

            # Generate questions
            questions = self.generate_questions(bill)

            # Store result
            existing_questions[bill_key] = {
                'bill_number': bill.get('bill_number', 'Unknown'),
                'title': bill.get('title', 'No title'),
                'suggested_questions': questions
            }

            # NOTE(review): only a result exactly equal to the full fallback
            # list is counted as an error; partially-padded results count as
            # processed.
            if questions == self.fallback_questions:
                errors += 1
            else:
                processed += 1

            # Save progress every 10 bills
            if i % 10 == 0:
                self.save_questions(existing_questions)
                logger.info(f"Progress: {i}/{total_bills} processed, {errors} errors")

            # Rate limiting
            time.sleep(1)  # 1 second delay between API calls

        # Final save
        self.save_questions(existing_questions)

        logger.info(f"Question generation complete!")
        logger.info(f"Total bills: {total_bills}")
        logger.info(f"Successfully processed: {processed}")
        logger.info(f"Errors: {errors}")
        logger.info(f"Questions saved to: {self.questions_file}")
|
| 254 |
+
|
| 255 |
+
def main():
    """Entry point: build the generator and produce questions for every bill."""
    try:
        runner = SuggestedQuestionsGenerator()
        runner.generate_all_questions()
        print("✅ Suggested questions generation completed successfully!")
        print(f" Questions saved to: {runner.questions_file}")
    except Exception as e:
        # Log and surface a concise message, then exit non-zero for pipelines.
        logger.error(f"Fatal error: {e}")
        print(f"❌ Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
data/data_updating_scripts/generate_summaries.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Script to generate summaries for all bills in known_bills_visualize.json.
|
| 4 |
+
|
| 5 |
+
This script reads all bills from known_bills_visualize.json, generates summaries using OpenAI API,
|
| 6 |
+
and saves them to data/bill_summaries.json to avoid repeated API calls.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import logging
|
| 11 |
+
import time
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Dict, List, Optional
|
| 14 |
+
import sys
|
| 15 |
+
import os
|
| 16 |
+
|
| 17 |
+
# Add the project root to the path
|
| 18 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
| 19 |
+
|
| 20 |
+
from config import ConfigManager
|
| 21 |
+
from langchain_openai import ChatOpenAI
|
| 22 |
+
from langchain_core.prompts import PromptTemplate
|
| 23 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 24 |
+
from PROMPTS.bill_summary_prompt import BILL_SUMMARY_PROMPT
|
| 25 |
+
|
| 26 |
+
# Create logs directory if it doesn't exist — the FileHandler below raises
# at import time if the parent directory of its log file is missing.
os.makedirs("data_updating_scripts/logs", exist_ok=True)

# Configure logging: INFO level, mirrored to the console (StreamHandler
# defaults to stderr) and to a per-script log file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/generate_summaries.log")]
)
# Module-level logger used throughout this script.
logger = logging.getLogger(__name__)
|
| 36 |
+
|
| 37 |
+
class BillSummaryGenerator:
    """Generates and caches LLM summaries for bills in known_bills_visualize.json.

    Summaries are persisted to ``data/bill_summaries.json`` keyed by
    ``"<state>_<bill_number>"``; bills that already have a successful
    (non-"ERROR:") summary are skipped on later runs to avoid repeat API calls.
    """

    def __init__(self):
        """Initialize configuration, the OpenAI chat model, and the prompt chain.

        Raises:
            ValueError: If OPENAI_API_KEY is not set in the environment.
        """
        self.config = ConfigManager()
        self.known_bills_file = Path("data/known_bills_visualize.json")
        self.summaries_file = Path("data/bill_summaries.json")

        # Fail fast here instead of erroring once per bill later.
        if not self.config.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY not found in environment variables")

        # Low temperature for factual, reproducible summaries.
        self.llm = ChatOpenAI(
            model=self.config.OPENAI_LLM_MODEL,
            temperature=0.1,
            max_tokens=1000
        )

        self.prompt_template = PromptTemplate(
            template=BILL_SUMMARY_PROMPT,
            input_variables=["bill_number", "bill_title", "state", "bill_text"]
        )

        # LCEL pipeline: prompt -> LLM -> plain string output.
        self.chain = self.prompt_template | self.llm | StrOutputParser()

        logger.info(f"Initialized BillSummaryGenerator with model: {self.config.OPENAI_LLM_MODEL}")

    def load_known_bills(self) -> List[Dict]:
        """Load and return the list of bills from known_bills_visualize.json.

        Raises:
            FileNotFoundError: If the bills file does not exist.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        try:
            with open(self.known_bills_file, 'r', encoding='utf-8') as f:
                bills = json.load(f)
            logger.info(f"Loaded {len(bills)} bills from {self.known_bills_file}")
            return bills
        except FileNotFoundError:
            logger.error(f"File not found: {self.known_bills_file}")
            raise
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing JSON: {e}")
            raise

    def load_existing_summaries(self) -> Dict:
        """Return previously generated summaries, or {} if none are readable."""
        if self.summaries_file.exists():
            try:
                with open(self.summaries_file, 'r', encoding='utf-8') as f:
                    summaries = json.load(f)
                logger.info(f"Loaded {len(summaries)} existing summaries")
                return summaries
            except Exception as e:
                # A corrupt cache is non-fatal: regenerate from scratch.
                logger.warning(f"Could not load existing summaries: {e}")
                return {}
        return {}

    def save_summaries(self, summaries: Dict) -> None:
        """Persist *summaries* to the summaries JSON file.

        Raises:
            Exception: Re-raised after logging if the file cannot be written.
        """
        try:
            with open(self.summaries_file, 'w', encoding='utf-8') as f:
                json.dump(summaries, f, indent=2, ensure_ascii=False)
            logger.info(f"Saved {len(summaries)} summaries to {self.summaries_file}")
        except Exception as e:
            logger.error(f"Error saving summaries: {e}")
            raise

    def generate_summary(self, bill: Dict) -> str:
        """Generate a summary for one bill.

        Returns the summary text, or a string starting with "ERROR:" on
        failure (missing text or API error) so callers can detect and retry.
        (Annotation tightened from Optional[str]: this never returns None.)
        """
        try:
            bill_number = bill.get('bill_number', 'Unknown')
            bill_title = bill.get('title', 'No title')
            state = bill.get('state', 'Unknown')
            bill_text = bill.get('text', '')

            if not bill_text:
                logger.warning(f"No text found for bill {bill_number}")
                return "ERROR: No bill text available"

            # Prepare the input for the chain.
            chain_input = {
                "bill_number": bill_number,
                "bill_title": bill_title,
                "state": state,
                "bill_text": bill_text[:8000]  # Limit text length to avoid token limits
            }

            # Generate summary using the chain.
            summary = self.chain.invoke(chain_input)

            logger.info(f"Generated summary for {bill_number}")
            return summary

        except Exception as e:
            logger.error(f"Error generating summary for bill {bill.get('bill_number', 'Unknown')}: {e}")
            return f"ERROR: {str(e)}"

    def generate_all_summaries(self) -> None:
        """Generate summaries for every bill, resuming past successful work.

        Progress is flushed to disk every 10 bills so a crash loses at most a
        handful of API calls; failed ("ERROR:") bills are retried next run.
        """
        bills = self.load_known_bills()
        existing_summaries = self.load_existing_summaries()

        total_bills = len(bills)
        processed = 0
        skipped = 0   # previously counted into `processed`, inflating the stat
        errors = 0

        logger.info(f"Starting summary generation for {total_bills} bills")

        for i, bill in enumerate(bills, 1):
            bill_key = f"{bill.get('state', 'Unknown')}_{bill.get('bill_number', 'Unknown')}"

            # Skip if already processed successfully.  `or ''` guards against
            # a stored None summary, which would crash startswith().
            prior = (existing_summaries.get(bill_key) or {}).get('summary') or ''
            if bill_key in existing_summaries and not prior.startswith('ERROR:'):
                logger.info(f"Skipping {bill_key} - already processed")
                skipped += 1
                continue

            logger.info(f"Processing {i}/{total_bills}: {bill_key}")

            summary = self.generate_summary(bill)

            existing_summaries[bill_key] = {
                'bill_number': bill.get('bill_number', 'Unknown'),
                'title': bill.get('title', 'No title'),
                'summary': summary
            }

            if summary.startswith('ERROR:'):
                errors += 1
            else:
                processed += 1

            # Checkpoint every 10 bills so a crash loses little work.
            if i % 10 == 0:
                self.save_summaries(existing_summaries)
                logger.info(f"Progress: {i}/{total_bills} processed, {errors} errors")

            # Rate limiting: 1 second delay between API calls.
            time.sleep(1)

        # Final save.
        self.save_summaries(existing_summaries)

        logger.info(f"Summary generation complete!")
        logger.info(f"Total bills: {total_bills}")
        logger.info(f"Successfully processed: {processed}")
        logger.info(f"Skipped (already summarized): {skipped}")
        logger.info(f"Errors: {errors}")
        logger.info(f"Summaries saved to: {self.summaries_file}")
|
| 189 |
+
|
| 190 |
+
def main():
    """Command-line entry point: run one full summary-generation pass."""
    try:
        summary_generator = BillSummaryGenerator()
        summary_generator.generate_all_summaries()
        print("✅ Summary generation completed successfully!")
        print(f" Summaries saved to: {summary_generator.summaries_file}")
    except Exception as exc:
        # Log the failure and exit non-zero so schedulers/CI notice it.
        logger.error(f"Fatal error: {exc}")
        print(f"❌ Error: {exc}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
data/data_updating_scripts/get_data.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
import logging
|
| 6 |
+
import base64
|
| 7 |
+
from datetime import datetime, timezone
|
| 8 |
+
import requests
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from bs4 import BeautifulSoup
|
| 11 |
+
|
| 12 |
+
# Load environment variables from .env file
load_dotenv()
# Pull API key from environment; abort immediately if absent — every request
# below requires it.
API_KEY = os.getenv("LEGISCAN_API_KEY") # Set your LegiScan API key in .env
if not API_KEY:
    print("Error: Please set LEGISCAN_API_KEY in your .env file.")
    sys.exit(1)

# Modes for testing
# Quick test: pulls only TEST_MAX_BILLS bills
TESTING_MODE = False
# Full test: pulls all bills for TEST_STATE and TEST_YEAR without bill count cap
# NOTE(review): only TESTING_MODE caps the bill count; FULL_TESTING_MODE just
# narrows state/year — confirm that is the intended difference.
FULL_TESTING_MODE = False
TEST_STATE = 'CA'
TEST_YEAR = 2023
TEST_MAX_BILLS = 3

# Output files
CACHE_FILE = "data/bill_cache.json" # Stores bill_id -> change_hash
OUTPUT_FILE = "data/known_bills.json" # Final bills data

# Query settings
QUERY = "artificial intelligence"
START_YEAR = 2023
END_YEAR = datetime.now(timezone.utc).year

# Include all state legislatures plus U.S. Congress (both chambers)
STATES = [
    "AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
    "HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
    "MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ",
    "NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
    "SD","TN","TX","UT","VT","VA","WA","WV","WI","WY",
    "US" # U.S. Congress
]

# Rate limiting (seconds between requests)
RATE_LIMIT = 0.2

# Create logs directory if it doesn't exist — the FileHandler below fails if
# the log file's parent directory is missing.
os.makedirs("data_updating_scripts/logs", exist_ok=True)

# Logging configuration: INFO level to stdout and to the fetch log file.
LOG_FILE = "data_updating_scripts/logs/fetch_ai_bills.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE)
    ]
)
logger = logging.getLogger(__name__)

# Apply testing overrides: both test modes restrict the run to TEST_STATE.
if TESTING_MODE:
    logger.info(f"*** TESTING MODE: fetching only {TEST_MAX_BILLS} bills from {TEST_STATE} ({TEST_YEAR}) ***")
    STATES = [TEST_STATE]
if FULL_TESTING_MODE:
    logger.info(f"*** FULL TESTING MODE: fetching all bills from {TEST_STATE} ({TEST_YEAR}) ***")
    STATES = [TEST_STATE]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def load_json(path, default):
    """Load JSON from *path*, returning *default* if it is missing or invalid.

    Args:
        path: Filesystem path of the JSON file.
        default: Value returned when the file cannot be read or parsed.
    """
    try:
        # Explicit encoding so JSON decodes identically on every platform
        # (Windows would otherwise use a locale-dependent codec).
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return default
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def save_json(path, data):
    """Write *data* as pretty-printed JSON to *path*, creating parent dirs.

    Args:
        path: Destination file path.
        data: JSON-serializable object.
    """
    # Only create the parent directory when the path actually has one:
    # os.path.dirname() is "" for a bare filename, and os.makedirs("")
    # raises FileNotFoundError.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
    logger.info(f"Saved JSON to {path}")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def legi_request(op, params):
    """Call LegiScan API operation *op* with query parameters *params*.

    Returns the decoded JSON payload on success, or None on any HTTP,
    network, or API-level error (all errors are logged).
    """
    base = "https://api.legiscan.com/"
    # Build a fresh dict instead of mutating the caller's params — the
    # original in-place update() leaked the API key back into caller state.
    query = dict(params)
    query.update({"key": API_KEY, "op": op})
    try:
        resp = requests.get(base, params=query, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        if data.get("status") != "OK":
            logger.error(f"API error {op}: {data.get('message', data)}")
            return None
        return data
    except requests.RequestException as e:
        logger.error(f"Request failed ({op}): {e}")
        return None
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def extract_plain_text(html_content: str) -> str:
    """Strip markup from *html_content*, returning newline-joined visible text."""
    parsed = BeautifulSoup(html_content, "html.parser")
    return parsed.get_text(separator="\n", strip=True)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def main():
    """Fetch AI-related bills from LegiScan across all configured states/years.

    Workflow: search page-by-page per (state, year); for each hit fetch full
    bill details; if the bill's change_hash matches the cache, reuse the
    previously saved entry (keeping its downloaded text) and refresh only the
    metadata; otherwise download and decode the bill text.  Results are
    deduplicated by bill_id and written to OUTPUT_FILE, and the hash cache to
    CACHE_FILE.

    NOTE(review): the output file is rebuilt from this run's results only —
    bills present in the existing file but not returned by this search are
    dropped.  Confirm that is intended.
    """
    cache = load_json(CACHE_FILE, {})
    existing = load_json(OUTPUT_FILE, [])
    existing_map = {b.get("bill_id"): b for b in existing}
    logger.info(f"Loaded cache entries: {len(cache)}, existing bills: {len(existing)}")

    collected = []
    total_fetched = 0
    # Test modes query a single year; normal runs cover the full range.
    years = [TEST_YEAR] if (TESTING_MODE or FULL_TESTING_MODE) else list(range(START_YEAR, END_YEAR + 1))

    for state in STATES:
        for year in years:
            page = 1
            while True:
                if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                    logger.info("Reached TEST_MAX_BILLS limit, stopping early.")
                    break
                params = {"state": state, "year": year, "query": QUERY, "page": page}
                logger.info(f"Searching {state} for {year}, page {page}")
                data = legi_request("getSearch", params)
                if not data:
                    break

                results = data.get("searchresult", {})
                summary = results.get("summary", {})
                # Every key in the search result except "summary" is a bill.
                bills = [v for k, v in results.items() if k != "summary"]
                if not bills:
                    logger.info(f"No bills on page {page} for {state} {year}")
                    break

                logger.info(f"Found {len(bills)} bills on {state} {year} page {page}")
                for bill in bills:
                    if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                        break
                    bill_id = str(bill.get("bill_id"))
                    state_code = bill.get("state")
                    bill_num = bill.get("bill_number")
                    logger.info(f"Processing bill {state_code}_{bill_num} (ID: {bill_id})")

                    details_resp = legi_request("getBill", {"id": bill_id})
                    if not details_resp:
                        continue
                    details = details_resp.get("bill", {})
                    # Skip bills whose session started before the query window.
                    sess_year = details.get("session", {}).get("year_start", 0)
                    if sess_year < START_YEAR:
                        continue

                    new_hash = details.get("change_hash")
                    old_hash = cache.get(bill_id)
                    now_iso = datetime.now(timezone.utc).isoformat()

                    # Extract all relevant dates
                    explicit = details.get("last_action_date")
                    status_date = details.get("status_date")
                    last_vote_date = details.get("last_vote_date")
                    last_amendment_date = details.get("last_amendment_date")
                    actions = details.get("actions", [])
                    action_dates = [a.get("action_date") for a in actions if a.get("action_date")]
                    most_recent_action = max(action_dates) if action_dates else None
                    # ISO-format date strings compare correctly as plain strings,
                    # so max() yields the most recent candidate.
                    candidates = [d for d in [explicit, status_date, last_vote_date, last_amendment_date, most_recent_action] if d]
                    last_action_date = max(candidates) if candidates else None

                    bill_url = details.get("url") # Bill detail page URL

                    if new_hash and new_hash == old_hash and bill_id in existing_map:
                        # Unchanged bill: reuse the cached entry (keeps the
                        # previously downloaded text), refresh metadata only.
                        entry = existing_map[bill_id]
                        entry.update({
                            "status": details.get("status"),
                            "session_year": f"{details.get('session', {}).get('year_start', '')}-{details.get('session', {}).get('year_end', '')}",
                            "last_action_date": last_action_date,
                            "status_date": status_date,
                            "last_vote_date": last_vote_date,
                            "last_amendment_date": last_amendment_date,
                            "actions": actions,
                            "bill_url": bill_url,
                            "lastUpdatedAt": now_iso
                        })
                        logger.info(f"Reused cache; updated status={entry['status']}, last_action_date={entry['last_action_date']}")
                    else:
                        # New/changed bill: fetch the first text document and
                        # decode it (LegiScan returns base64-encoded HTML).
                        plain_text = None
                        texts = details.get("texts", [])
                        if texts:
                            doc_id = texts[0].get("doc_id")
                            text_resp = legi_request("getBillText", {"id": doc_id})
                            if text_resp and "text" in text_resp:
                                raw_b64 = text_resp["text"].get("doc", "")
                                try:
                                    decoded = base64.b64decode(raw_b64)
                                    html = decoded.decode("utf-8", errors="ignore")
                                    plain_text = extract_plain_text(html)
                                except Exception as e:
                                    logger.error(f"Failed decoding HTML for {bill_id}: {e}")

                        entry = {
                            "bill_id": bill_id,
                            "state": state_code,
                            "bill_number": bill_num,
                            "session_year": f"{details.get('session', {}).get('year_start', '')}-{details.get('session', {}).get('year_end', '')}",
                            "title": details.get("title"),
                            "description": details.get("description"),
                            "status": details.get("status"),
                            "sponsors": [s.get("name") for s in details.get("sponsors", [])],
                            "text": plain_text,
                            "last_action_date": last_action_date,
                            "status_date": status_date,
                            "last_vote_date": last_vote_date,
                            "last_amendment_date": last_amendment_date,
                            "actions": actions,
                            "bill_url": bill_url,
                            "change_hash": new_hash,
                            "lastUpdatedAt": now_iso
                        }
                        cache[bill_id] = new_hash
                        logger.info(
                            f"Entry data: title='{entry['title']}', sponsors={len(entry['sponsors'])}, "
                            f"status={entry['status']}, last_action_date={entry['last_action_date']}"
                        )

                    collected.append(entry)
                    total_fetched += 1
                    time.sleep(RATE_LIMIT)

                if page >= summary.get("page_total", 1):
                    break
                page += 1
                time.sleep(RATE_LIMIT)
            # Propagate the early-stop out of the year loop ...
            if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                break
        # ... and out of the state loop.
        if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
            break

    # Deduplicate by bill_id (later entries win) and persist results + cache.
    dedup = {e["bill_id"]: e for e in collected}
    all_bills = list(dedup.values())
    save_json(OUTPUT_FILE, all_bills)
    save_json(CACHE_FILE, cache)
    logger.info(f"Completed run, saved {len(all_bills)} bills to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
|
data/data_updating_scripts/get_data_ORIGINAL.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
import logging
|
| 6 |
+
import base64
|
| 7 |
+
from datetime import datetime, timezone
|
| 8 |
+
import requests
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from bs4 import BeautifulSoup
|
| 11 |
+
|
| 12 |
+
# Load environment variables from .env file
load_dotenv()
# Pull API key from environment; abort immediately if absent — every request
# below requires it.
API_KEY = os.getenv("LEGISCAN_API_KEY") # Set your LegiScan API key in .env
if not API_KEY:
    print("Error: Please set LEGISCAN_API_KEY in your .env file.")
    sys.exit(1)

# Modes for testing
# Quick test: pulls only TEST_MAX_BILLS bills
TESTING_MODE = False
# Full test: pulls all bills for TEST_STATE and TEST_YEAR without bill count cap
FULL_TESTING_MODE = False
TEST_STATE = 'CA'
TEST_YEAR = 2023
TEST_MAX_BILLS = 3

# Output files
CACHE_FILE = "data/bill_cache.json" # Stores bill_id -> change_hash
OUTPUT_FILE = "data/known_bills.json" # Final bills data

# Query settings
QUERY = "artificial intelligence"
START_YEAR = 2023
END_YEAR = datetime.now(timezone.utc).year

# Include all state legislatures plus U.S. Congress (both chambers)
STATES = [
    "AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
    "HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
    "MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ",
    "NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
    "SD","TN","TX","UT","VT","VA","WA","WV","WI","WY",
    "US" # U.S. Congress
]

# Rate limiting (seconds between requests)
RATE_LIMIT = 0.2

# Create logs directory if it doesn't exist — the FileHandler below fails if
# the log file's parent directory is missing.
os.makedirs("data_updating_scripts/logs", exist_ok=True)

# Logging configuration: INFO level to stdout and to the fetch log file.
LOG_FILE = "data_updating_scripts/logs/fetch_ai_bills.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE)
    ]
)
logger = logging.getLogger(__name__)

# Apply testing overrides: both test modes restrict the run to TEST_STATE.
if TESTING_MODE:
    logger.info(f"*** TESTING MODE: fetching only {TEST_MAX_BILLS} bills from {TEST_STATE} ({TEST_YEAR}) ***")
    STATES = [TEST_STATE]
if FULL_TESTING_MODE:
    logger.info(f"*** FULL TESTING MODE: fetching all bills from {TEST_STATE} ({TEST_YEAR}) ***")
    STATES = [TEST_STATE]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def load_json(path, default):
    """Load JSON from *path*; return *default* if the file is missing or unparsable."""
    try:
        with open(path, 'r') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return default
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def save_json(path, data):
    """Write *data* as indented JSON to *path*, creating the parent directory.

    NOTE(review): os.path.dirname() is "" for a bare filename, which would
    make os.makedirs raise — call sites here all pass paths under data/.
    """
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        json.dump(data, f, indent=2)
    logger.info(f"Saved JSON to {path}")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def legi_request(op, params):
    """Call LegiScan operation *op*; return the decoded JSON or None on error.

    NOTE(review): *params* is mutated in place — the API key and op name are
    written back into the caller's dict.
    """
    base = "https://api.legiscan.com/"
    params.update({"key": API_KEY, "op": op})
    try:
        resp = requests.get(base, params=params, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        # LegiScan signals application-level failures via a "status" field
        # even on HTTP 200 responses.
        if data.get("status") != "OK":
            logger.error(f"API error {op}: {data.get('message', data)}")
            return None
        return data
    except requests.RequestException as e:
        logger.error(f"Request failed ({op}): {e}")
        return None
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def extract_plain_text(html_content: str) -> str:
    """Return the visible text of *html_content*, nodes joined by newlines."""
    soup = BeautifulSoup(html_content, "html.parser")
    return soup.get_text(separator="\n", strip=True)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def main():
    """Fetch AI-related bills from LegiScan across all configured states/years.

    Preserved ORIGINAL version of get_data.py's main().  Workflow: search
    page-by-page per (state, year); for each hit fetch full bill details; on a
    change_hash cache hit reuse the previously saved entry (keeping its
    downloaded text) and refresh only metadata; otherwise download and decode
    the bill text.  Results are deduplicated by bill_id and written to
    OUTPUT_FILE, with the hash cache to CACHE_FILE.

    NOTE(review): the output file is rebuilt from this run's results only —
    bills present in the existing file but not returned by this search are
    dropped.  Confirm that is intended.
    """
    cache = load_json(CACHE_FILE, {})
    existing = load_json(OUTPUT_FILE, [])
    existing_map = {b.get("bill_id"): b for b in existing}
    logger.info(f"Loaded cache entries: {len(cache)}, existing bills: {len(existing)}")

    collected = []
    total_fetched = 0
    # Test modes query a single year; normal runs cover the full range.
    years = [TEST_YEAR] if (TESTING_MODE or FULL_TESTING_MODE) else list(range(START_YEAR, END_YEAR + 1))

    for state in STATES:
        for year in years:
            page = 1
            while True:
                if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                    logger.info("Reached TEST_MAX_BILLS limit, stopping early.")
                    break
                params = {"state": state, "year": year, "query": QUERY, "page": page}
                logger.info(f"Searching {state} for {year}, page {page}")
                data = legi_request("getSearch", params)
                if not data:
                    break

                results = data.get("searchresult", {})
                summary = results.get("summary", {})
                # Every key in the search result except "summary" is a bill.
                bills = [v for k, v in results.items() if k != "summary"]
                if not bills:
                    logger.info(f"No bills on page {page} for {state} {year}")
                    break

                logger.info(f"Found {len(bills)} bills on {state} {year} page {page}")
                for bill in bills:
                    if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                        break
                    bill_id = str(bill.get("bill_id"))
                    state_code = bill.get("state")
                    bill_num = bill.get("bill_number")
                    logger.info(f"Processing bill {state_code}_{bill_num} (ID: {bill_id})")

                    details_resp = legi_request("getBill", {"id": bill_id})
                    if not details_resp:
                        continue
                    details = details_resp.get("bill", {})
                    # Skip bills whose session started before the query window.
                    sess_year = details.get("session", {}).get("year_start", 0)
                    if sess_year < START_YEAR:
                        continue

                    new_hash = details.get("change_hash")
                    old_hash = cache.get(bill_id)
                    now_iso = datetime.now(timezone.utc).isoformat()

                    # Extract all relevant dates
                    explicit = details.get("last_action_date")
                    status_date = details.get("status_date")
                    last_vote_date = details.get("last_vote_date")
                    last_amendment_date = details.get("last_amendment_date")
                    actions = details.get("actions", [])
                    action_dates = [a.get("action_date") for a in actions if a.get("action_date")]
                    most_recent_action = max(action_dates) if action_dates else None
                    # ISO-format date strings compare correctly as plain strings,
                    # so max() yields the most recent candidate.
                    candidates = [d for d in [explicit, status_date, last_vote_date, last_amendment_date, most_recent_action] if d]
                    last_action_date = max(candidates) if candidates else None

                    bill_url = details.get("url") # Bill detail page URL

                    if new_hash and new_hash == old_hash and bill_id in existing_map:
                        # Unchanged bill: reuse the cached entry (keeps the
                        # previously downloaded text), refresh metadata only.
                        entry = existing_map[bill_id]
                        entry.update({
                            "status": details.get("status"),
                            "session_year": f"{details.get('session', {}).get('year_start', '')}-{details.get('session', {}).get('year_end', '')}",
                            "last_action_date": last_action_date,
                            "status_date": status_date,
                            "last_vote_date": last_vote_date,
                            "last_amendment_date": last_amendment_date,
                            "actions": actions,
                            "bill_url": bill_url,
                            "lastUpdatedAt": now_iso
                        })
                        logger.info(f"Reused cache; updated status={entry['status']}, last_action_date={entry['last_action_date']}")
                    else:
                        # New/changed bill: fetch the first text document and
                        # decode it (LegiScan returns base64-encoded HTML).
                        plain_text = None
                        texts = details.get("texts", [])
                        if texts:
                            doc_id = texts[0].get("doc_id")
                            text_resp = legi_request("getBillText", {"id": doc_id})
                            if text_resp and "text" in text_resp:
                                raw_b64 = text_resp["text"].get("doc", "")
                                try:
                                    decoded = base64.b64decode(raw_b64)
                                    html = decoded.decode("utf-8", errors="ignore")
                                    plain_text = extract_plain_text(html)
                                except Exception as e:
                                    logger.error(f"Failed decoding HTML for {bill_id}: {e}")

                        entry = {
                            "bill_id": bill_id,
                            "state": state_code,
                            "bill_number": bill_num,
                            "session_year": f"{details.get('session', {}).get('year_start', '')}-{details.get('session', {}).get('year_end', '')}",
                            "title": details.get("title"),
                            "description": details.get("description"),
                            "status": details.get("status"),
                            "sponsors": [s.get("name") for s in details.get("sponsors", [])],
                            "text": plain_text,
                            "last_action_date": last_action_date,
                            "status_date": status_date,
                            "last_vote_date": last_vote_date,
                            "last_amendment_date": last_amendment_date,
                            "actions": actions,
                            "bill_url": bill_url,
                            "change_hash": new_hash,
                            "lastUpdatedAt": now_iso
                        }
                        cache[bill_id] = new_hash
                        logger.info(
                            f"Entry data: title='{entry['title']}', sponsors={len(entry['sponsors'])}, "
                            f"status={entry['status']}, last_action_date={entry['last_action_date']}"
                        )

                    collected.append(entry)
                    total_fetched += 1
                    time.sleep(RATE_LIMIT)

                if page >= summary.get("page_total", 1):
                    break
                page += 1
                time.sleep(RATE_LIMIT)
            # Propagate the early-stop out of the year loop ...
            if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                break
        # ... and out of the state loop.
        if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
            break

    # Deduplicate by bill_id (later entries win) and persist results + cache.
    dedup = {e["bill_id"]: e for e in collected}
    all_bills = list(dedup.values())
    save_json(OUTPUT_FILE, all_bills)
    save_json(CACHE_FILE, cache)
    logger.info(f"Completed run, saved {len(all_bills)} bills to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
|
data/data_updating_scripts/known_bills_status.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
known_bills_status.py
|
| 4 |
+
|
| 5 |
+
Reads known_bills_fixed.json and updates existing known_bills_visualize.json.
|
| 6 |
+
Merges new bills and updates existing ones while preserving clean status fields.
|
| 7 |
+
"""
|
| 8 |
+
import json
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from datetime import datetime, timezone
|
| 11 |
+
|
| 12 |
+
def map_status(original_status):
    """Map a LegiScan status code (or free-text status) to clean display text.

    Args:
        original_status: LegiScan numeric status code (``int`` or ``str``)
            or a free-text status description. May be ``None``.

    Returns:
        One of ``"Active"``, ``"Inactive"``, ``"Signed Into Law"``, or
        ``"Vetoed"``. Unknown or empty statuses fall back to ``"Inactive"``.
    """
    # Single str-keyed table; int codes are normalized to str below, so we
    # no longer need to duplicate every entry for both int and str keys.
    status_mapping = {
        "0": "Inactive",           # Pre-filed
        "1": "Active",             # Introduced
        "2": "Active",             # Engrossed
        "3": "Active",             # Enrolled
        "4": "Signed Into Law",    # Passed
        "5": "Vetoed",             # Vetoed
        "6": "Inactive",           # Failed
        "7": "Signed Into Law",    # Override
        "8": "Signed Into Law",    # Chaptered
        "9": "Active",             # Refer
        "10": "Active",            # Report Pass
        "11": "Inactive",          # Report DNP
        "12": "Active",            # Draft
    }

    # Try the direct numeric-code mapping first (handles int and str codes).
    if original_status is not None:
        code = str(original_status)
        if code in status_mapping:
            return status_mapping[code]

    # Fall back to keyword matching on free-text statuses.
    # Order matters: "pass"/"signed"/"enacted" wins over "veto"/"fail".
    if original_status:
        status_str = str(original_status).lower()
        if "pass" in status_str or "signed" in status_str or "enacted" in status_str:
            return "Signed Into Law"
        elif "veto" in status_str:
            return "Vetoed"
        elif "fail" in status_str or "dead" in status_str or "killed" in status_str:
            return "Inactive"
        elif "active" in status_str or "intro" in status_str or "pending" in status_str:
            return "Active"

    # Default fallback for unknown, empty, or None statuses.
    return "Inactive"
|
| 56 |
+
|
| 57 |
+
def create_bill_key(bill):
    """Build the unique lookup key ("<state>_<bill_number>") for a bill dict."""
    state = bill.get('state', 'Unknown')
    number = bill.get('bill_number', 'Unknown')
    return f"{state}_{number}"
|
| 60 |
+
|
| 61 |
+
def merge_bill_data(new_bill, existing_bill=None):
    """Merge new bill data with an existing bill, preserving processed fields.

    For a brand-new bill the raw status is recorded as ``original_status``
    and mapped to a clean display status. For an existing bill, all source
    fields except the processed status fields are overlaid; the clean status
    is only recomputed when the raw status actually changed (compared as
    strings so int vs. str codes are treated as equal).
    """
    processed_fields = ('status', 'original_status', 'status_updated_at')

    if not existing_bill:
        # First time we see this bill: record the raw status and derive
        # the clean display status from it.
        fresh = dict(new_bill)
        raw_status = fresh.get('status')
        fresh['original_status'] = raw_status
        fresh['status'] = map_status(raw_status)
        fresh['status_updated_at'] = datetime.now(timezone.utc).isoformat()
        return fresh

    # Start from the existing record and overlay everything from the
    # source except the processed status fields.
    merged = dict(existing_bill)
    merged.update(
        (key, value)
        for key, value in new_bill.items()
        if key not in processed_fields
    )

    # Compare raw statuses as strings so int and str codes compare equal.
    incoming_raw = new_bill.get('status')
    previous_raw = existing_bill.get('original_status')
    incoming_key = str(incoming_raw) if incoming_raw is not None else None
    previous_key = str(previous_raw) if previous_raw is not None else None

    if incoming_key != previous_key:
        # Real change in the underlying data: refresh all processed fields.
        merged['original_status'] = incoming_raw
        merged['status'] = map_status(incoming_raw)
        merged['status_updated_at'] = datetime.now(timezone.utc).isoformat()
        return merged

    # No change — keep the existing clean status, but remap it when it is
    # missing or still a raw numeric code (needs cleaning).
    numeric_codes = tuple(str(code) for code in range(13))
    if 'status' not in merged or merged['status'] in numeric_codes:
        merged['status'] = map_status(previous_raw)

    return merged
|
| 102 |
+
|
| 103 |
+
def main():
    """Merge data/known_bills_fixed.json into data/known_bills_visualize.json.

    Loads the freshly fetched bills, merges them with any previously
    processed visualization data (preserving clean status fields via
    merge_bill_data), writes the merged list back, and prints a summary of
    new/updated/unchanged/removed bills plus a status distribution.
    """
    # File paths
    input_file = Path("data/known_bills_fixed.json")
    output_file = Path("data/known_bills_visualize.json")

    print(f"Reading source bills from: {input_file}")

    # Load source bills data
    with open(input_file, 'r', encoding='utf-8') as f:
        source_bills = json.load(f)

    print(f"Loaded {len(source_bills)} bills from source")

    # Load existing visualization data if it exists
    existing_bills = []
    if output_file.exists():
        print(f"Reading existing visualization data from: {output_file}")
        with open(output_file, 'r', encoding='utf-8') as f:
            existing_bills = json.load(f)
        print(f"Loaded {len(existing_bills)} existing bills")
    else:
        print("No existing visualization data found - will create new file")

    # Create lookup dictionary for existing bills, keyed by "<state>_<bill_number>"
    existing_bills_dict = {}
    for bill in existing_bills:
        key = create_bill_key(bill)
        existing_bills_dict[key] = bill

    # Process and merge bills
    merged_bills = []
    new_bills_count = 0
    updated_bills_count = 0
    unchanged_bills_count = 0

    print(f"\nProcessing {len(source_bills)} bills...")

    for source_bill in source_bills:
        bill_key = create_bill_key(source_bill)
        existing_bill = existing_bills_dict.get(bill_key)

        if existing_bill:
            # Check if anything actually changed
            # NOTE(review): this compares raw values directly (no str
            # normalization), so an int-vs-str code difference is counted
            # as "updated" here even though merge_bill_data's normalized
            # comparison treats it as unchanged — confirm this is intended.
            old_original_status = existing_bill.get('original_status')
            new_original_status = source_bill.get('status')

            if old_original_status != new_original_status:
                updated_bills_count += 1
            else:
                unchanged_bills_count += 1
        else:
            new_bills_count += 1

        merged_bill = merge_bill_data(source_bill, existing_bill)
        merged_bills.append(merged_bill)

    # Check for bills that exist in visualization but not in source (removed bills)
    # Removed bills are only counted and reported; they are not kept in the
    # output because merged_bills is built solely from source_bills.
    source_keys = {create_bill_key(bill) for bill in source_bills}
    existing_keys = set(existing_bills_dict.keys())
    removed_keys = existing_keys - source_keys

    # Save updated bills
    print(f"\nSaving updated bills to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged_bills, f, indent=2, ensure_ascii=False)

    # Show status distribution (counts per clean display status)
    status_counts = {}
    for bill in merged_bills:
        status = bill['status']
        status_counts[status] = status_counts.get(status, 0) + 1

    # Summary
    print(f"\n✅ Update complete!")
    print(f" 📊 Total bills: {len(merged_bills)}")
    if new_bills_count > 0:
        print(f" 🆕 New bills: {new_bills_count}")
    if updated_bills_count > 0:
        print(f" 🔄 Updated bills: {updated_bills_count}")
    if unchanged_bills_count > 0:
        print(f" ✅ Unchanged bills: {unchanged_bills_count}")
    if removed_keys:
        print(f" 🗑️ Removed bills: {len(removed_keys)}")

    if new_bills_count == 0 and updated_bills_count == 0:
        print(f" 🎉 All bills are up to date - no changes needed!")

    print(f"\n📈 Status distribution:")
    for status, count in sorted(status_counts.items()):
        print(f" {status}: {count}")

    print(f"\n📁 Clean data saved to: {output_file}")
    print("Now run: streamlit run scripts/visualize-MIT.py")

if __name__ == "__main__":
    main()
|
data/data_updating_scripts/logs/eu_vectorstore.log
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-11-03 11:40:25,451 [INFO] Starting EU AI Act vectorstore creation...
|
| 2 |
+
2025-11-03 11:40:25,451 [INFO] Extracting text from PDF...
|
| 3 |
+
2025-11-03 11:40:25,480 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
|
| 4 |
+
2025-11-03 11:40:27,260 [INFO] Extracted 612396 characters from PDF
|
| 5 |
+
2025-11-03 11:40:27,260 [INFO] Creating document chunks...
|
| 6 |
+
2025-11-03 11:40:27,268 [INFO] Created 648 document chunks
|
| 7 |
+
2025-11-03 11:40:27,268 [INFO] Initializing embeddings...
|
| 8 |
+
2025-11-03 11:40:27,397 [INFO] Creating FAISS vectorstore...
|
| 9 |
+
2025-11-03 11:40:31,088 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
|
| 10 |
+
2025-11-03 11:40:31,414 [INFO] Loading faiss.
|
| 11 |
+
2025-11-03 11:40:31,881 [INFO] Successfully loaded faiss.
|
| 12 |
+
2025-11-03 11:40:31,936 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
|
| 13 |
+
2025-11-03 11:40:31,945 [INFO] ✅ EU AI Act vectorstore created successfully!
|
| 14 |
+
2025-11-03 11:40:31,945 [INFO] - Total chunks: 648
|
| 15 |
+
2025-11-03 11:40:31,945 [INFO] - Text length: 612,396 characters
|
| 16 |
+
2025-11-03 11:40:31,945 [INFO] - Saved to: data/eu_ai_act_vectorstore
|
| 17 |
+
2025-11-03 12:24:44,470 [INFO] Starting EU AI Act vectorstore creation...
|
| 18 |
+
2025-11-03 12:24:44,471 [INFO] Extracting text from PDF...
|
| 19 |
+
2025-11-03 12:24:44,492 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
|
| 20 |
+
2025-11-03 12:24:46,209 [INFO] Extracted 612396 characters from PDF
|
| 21 |
+
2025-11-03 12:24:46,209 [INFO] Creating document chunks...
|
| 22 |
+
2025-11-03 12:24:46,217 [INFO] Created 648 document chunks
|
| 23 |
+
2025-11-03 12:24:46,217 [INFO] Initializing embeddings...
|
| 24 |
+
2025-11-03 12:24:46,357 [INFO] Creating FAISS vectorstore...
|
| 25 |
+
2025-11-03 12:24:49,286 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
|
| 26 |
+
2025-11-03 12:24:49,669 [INFO] Loading faiss.
|
| 27 |
+
2025-11-03 12:24:49,700 [INFO] Successfully loaded faiss.
|
| 28 |
+
2025-11-03 12:24:49,749 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
|
| 29 |
+
2025-11-03 12:24:49,754 [INFO] ✅ EU AI Act vectorstore created successfully!
|
| 30 |
+
2025-11-03 12:24:49,754 [INFO] - Total chunks: 648
|
| 31 |
+
2025-11-03 12:24:49,754 [INFO] - Text length: 612,396 characters
|
| 32 |
+
2025-11-03 12:24:49,754 [INFO] - Saved to: data/eu_ai_act_vectorstore
|
| 33 |
+
2025-11-04 15:55:15,879 [INFO] Starting EU AI Act vectorstore creation...
|
| 34 |
+
2025-11-04 15:55:15,879 [INFO] Extracting text from PDF...
|
| 35 |
+
2025-11-04 15:55:15,899 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
|
| 36 |
+
2025-11-04 15:55:17,629 [INFO] Extracted 612396 characters from PDF
|
| 37 |
+
2025-11-04 15:55:17,629 [INFO] Creating document chunks...
|
| 38 |
+
2025-11-04 15:55:17,637 [INFO] Created 648 document chunks
|
| 39 |
+
2025-11-04 15:55:17,637 [INFO] Initializing embeddings...
|
| 40 |
+
2025-11-04 15:55:17,768 [INFO] Creating FAISS vectorstore...
|
| 41 |
+
2025-11-04 15:55:21,406 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
|
| 42 |
+
2025-11-04 15:55:21,846 [INFO] Loading faiss.
|
| 43 |
+
2025-11-04 15:55:21,917 [INFO] Successfully loaded faiss.
|
| 44 |
+
2025-11-04 15:55:21,968 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
|
| 45 |
+
2025-11-04 15:55:21,981 [INFO] ✅ EU AI Act vectorstore created successfully!
|
| 46 |
+
2025-11-04 15:55:21,981 [INFO] - Total chunks: 648
|
| 47 |
+
2025-11-04 15:55:21,981 [INFO] - Text length: 612,396 characters
|
| 48 |
+
2025-11-04 15:55:21,981 [INFO] - Saved to: data/eu_ai_act_vectorstore
|
| 49 |
+
2025-11-14 15:36:40,441 [INFO] Starting EU AI Act vectorstore creation...
|
| 50 |
+
2025-11-14 15:36:40,442 [INFO] Extracting text from PDF...
|
| 51 |
+
2025-11-14 15:36:40,455 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
|
| 52 |
+
2025-11-14 15:36:41,830 [INFO] Extracted 612396 characters from PDF
|
| 53 |
+
2025-11-14 15:36:41,830 [INFO] Creating document chunks...
|
| 54 |
+
2025-11-14 15:36:41,837 [INFO] Created 648 document chunks
|
| 55 |
+
2025-11-14 15:36:41,837 [INFO] Initializing embeddings...
|
| 56 |
+
2025-11-14 15:36:41,983 [INFO] Creating FAISS vectorstore...
|
| 57 |
+
2025-11-14 15:36:46,413 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
|
| 58 |
+
2025-11-14 15:36:46,791 [INFO] Loading faiss.
|
| 59 |
+
2025-11-14 15:36:47,362 [INFO] Successfully loaded faiss.
|
| 60 |
+
2025-11-14 15:36:47,404 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
|
| 61 |
+
2025-11-14 15:36:47,410 [INFO] ✅ EU AI Act vectorstore created successfully!
|
| 62 |
+
2025-11-14 15:36:47,410 [INFO] - Total chunks: 648
|
| 63 |
+
2025-11-14 15:36:47,410 [INFO] - Text length: 612,396 characters
|
| 64 |
+
2025-11-14 15:36:47,410 [INFO] - Saved to: data/eu_ai_act_vectorstore
|
| 65 |
+
2025-11-20 14:15:10,012 [INFO] Starting EU AI Act vectorstore creation...
|
| 66 |
+
2025-11-20 14:15:10,013 [INFO] Extracting text from PDF...
|
| 67 |
+
2025-11-20 14:15:10,029 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
|
| 68 |
+
2025-11-20 14:15:11,997 [INFO] Extracted 612396 characters from PDF
|
| 69 |
+
2025-11-20 14:15:11,998 [INFO] Creating document chunks...
|
| 70 |
+
2025-11-20 14:15:12,006 [INFO] Created 648 document chunks
|
| 71 |
+
2025-11-20 14:15:12,006 [INFO] Initializing embeddings...
|
| 72 |
+
2025-11-20 14:15:12,200 [INFO] Creating FAISS vectorstore...
|
| 73 |
+
2025-11-20 14:15:16,058 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
|
| 74 |
+
2025-11-20 14:15:16,386 [INFO] Loading faiss.
|
| 75 |
+
2025-11-20 14:15:16,477 [INFO] Successfully loaded faiss.
|
| 76 |
+
2025-11-20 14:15:16,521 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
|
| 77 |
+
2025-11-20 14:15:16,529 [INFO] ✅ EU AI Act vectorstore created successfully!
|
| 78 |
+
2025-11-20 14:15:16,529 [INFO] - Total chunks: 648
|
| 79 |
+
2025-11-20 14:15:16,529 [INFO] - Text length: 612,396 characters
|
| 80 |
+
2025-11-20 14:15:16,529 [INFO] - Saved to: data/eu_ai_act_vectorstore
|
| 81 |
+
2025-12-01 12:38:49,653 [INFO] Starting EU AI Act vectorstore creation...
|
| 82 |
+
2025-12-01 12:38:49,653 [INFO] Extracting text from PDF...
|
| 83 |
+
2025-12-01 12:38:49,669 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
|
| 84 |
+
2025-12-01 12:38:51,518 [INFO] Extracted 612396 characters from PDF
|
| 85 |
+
2025-12-01 12:38:51,518 [INFO] Creating document chunks...
|
| 86 |
+
2025-12-01 12:38:51,526 [INFO] Created 648 document chunks
|
| 87 |
+
2025-12-01 12:38:51,526 [INFO] Initializing embeddings...
|
| 88 |
+
2025-12-01 12:38:51,709 [INFO] Creating FAISS vectorstore...
|
| 89 |
+
2025-12-01 12:38:54,252 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
|
| 90 |
+
2025-12-01 12:38:54,675 [INFO] Loading faiss.
|
| 91 |
+
2025-12-01 12:38:54,817 [INFO] Successfully loaded faiss.
|
| 92 |
+
2025-12-01 12:38:54,859 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
|
| 93 |
+
2025-12-01 12:38:54,865 [INFO] ✅ EU AI Act vectorstore created successfully!
|
| 94 |
+
2025-12-01 12:38:54,866 [INFO] - Total chunks: 648
|
| 95 |
+
2025-12-01 12:38:54,866 [INFO] - Text length: 612,396 characters
|
| 96 |
+
2025-12-01 12:38:54,866 [INFO] - Saved to: data/eu_ai_act_vectorstore
|
| 97 |
+
2025-12-01 13:21:15,236 [INFO] Starting EU AI Act vectorstore creation...
|
| 98 |
+
2025-12-01 13:21:15,237 [INFO] Extracting text from PDF...
|
| 99 |
+
2025-12-01 13:21:15,253 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
|
| 100 |
+
2025-12-01 13:21:17,069 [INFO] Extracted 612396 characters from PDF
|
| 101 |
+
2025-12-01 13:21:17,069 [INFO] Creating document chunks...
|
| 102 |
+
2025-12-01 13:21:17,078 [INFO] Created 648 document chunks
|
| 103 |
+
2025-12-01 13:21:17,078 [INFO] Initializing embeddings...
|
| 104 |
+
2025-12-01 13:21:17,343 [INFO] Creating FAISS vectorstore...
|
| 105 |
+
2025-12-01 13:21:20,254 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
|
| 106 |
+
2025-12-01 13:21:20,654 [INFO] Loading faiss.
|
| 107 |
+
2025-12-01 13:21:20,768 [INFO] Successfully loaded faiss.
|
| 108 |
+
2025-12-01 13:21:20,815 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
|
| 109 |
+
2025-12-01 13:21:20,821 [INFO] ✅ EU AI Act vectorstore created successfully!
|
| 110 |
+
2025-12-01 13:21:20,821 [INFO] - Total chunks: 648
|
| 111 |
+
2025-12-01 13:21:20,821 [INFO] - Text length: 612,396 characters
|
| 112 |
+
2025-12-01 13:21:20,822 [INFO] - Saved to: data/eu_ai_act_vectorstore
|
| 113 |
+
2025-12-03 11:09:39,059 [INFO] Starting EU AI Act vectorstore creation...
|
| 114 |
+
2025-12-03 11:09:39,060 [INFO] Extracting text from PDF...
|
| 115 |
+
2025-12-03 11:09:39,075 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
|
| 116 |
+
2025-12-03 11:09:40,933 [INFO] Extracted 612396 characters from PDF
|
| 117 |
+
2025-12-03 11:09:40,934 [INFO] Creating document chunks...
|
| 118 |
+
2025-12-03 11:09:40,942 [INFO] Created 648 document chunks
|
| 119 |
+
2025-12-03 11:09:40,942 [INFO] Initializing embeddings...
|
| 120 |
+
2025-12-03 11:09:41,136 [INFO] Creating FAISS vectorstore...
|
| 121 |
+
2025-12-03 11:09:44,436 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
|
| 122 |
+
2025-12-03 11:09:44,820 [INFO] Loading faiss.
|
| 123 |
+
2025-12-03 11:09:44,925 [INFO] Successfully loaded faiss.
|
| 124 |
+
2025-12-03 11:09:44,968 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
|
| 125 |
+
2025-12-03 11:09:44,974 [INFO] ✅ EU AI Act vectorstore created successfully!
|
| 126 |
+
2025-12-03 11:09:44,974 [INFO] - Total chunks: 648
|
| 127 |
+
2025-12-03 11:09:44,974 [INFO] - Text length: 612,396 characters
|
| 128 |
+
2025-12-03 11:09:44,974 [INFO] - Saved to: data/eu_ai_act_vectorstore
|
data/data_updating_scripts/logs/fetch_ai_bills.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/data_updating_scripts/logs/fix_pdf_bills.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/data_updating_scripts/logs/generate_reports.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/data_updating_scripts/logs/generate_suggested_questions.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/data_updating_scripts/logs/generate_summaries.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/data_updating_scripts/logs/mark_no_text_bills.log
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-11-03 11:36:37 [INFO] Starting no-text bill marking process
|
| 2 |
+
2025-11-03 11:36:37 [INFO] Loaded 1 bills from data/known_bills_visualize.json
|
| 3 |
+
2025-11-03 11:36:37 [INFO] Processing 1 bills to mark no-text bills
|
| 4 |
+
2025-11-03 11:36:37 [INFO] Saved 1 bills to data/known_bills_visualize.json
|
| 5 |
+
2025-11-03 11:36:37 [INFO] Processing complete!
|
| 6 |
+
2025-11-03 11:36:37 [INFO] Total bills processed: 1
|
| 7 |
+
2025-11-03 11:36:37 [INFO] Bills without text: 0
|
| 8 |
+
2025-11-03 11:36:37 [INFO] Already had None categories: 0
|
| 9 |
+
2025-11-03 11:36:37 [INFO] Newly marked as None: 0
|
| 10 |
+
2025-11-03 11:36:37 [INFO] No-text bill marking process completed
|
| 11 |
+
2025-11-03 11:40:21 [INFO] Starting no-text bill marking process
|
| 12 |
+
2025-11-03 11:40:21 [INFO] Loaded 1 bills from data/known_bills_visualize.json
|
| 13 |
+
2025-11-03 11:40:21 [INFO] Processing 1 bills to mark no-text bills
|
| 14 |
+
2025-11-03 11:40:21 [INFO] Saved 1 bills to data/known_bills_visualize.json
|
| 15 |
+
2025-11-03 11:40:21 [INFO] Processing complete!
|
| 16 |
+
2025-11-03 11:40:21 [INFO] Total bills processed: 1
|
| 17 |
+
2025-11-03 11:40:21 [INFO] Bills without text: 0
|
| 18 |
+
2025-11-03 11:40:21 [INFO] Already had None categories: 0
|
| 19 |
+
2025-11-03 11:40:21 [INFO] Newly marked as None: 0
|
| 20 |
+
2025-11-03 11:40:21 [INFO] No-text bill marking process completed
|
| 21 |
+
2025-11-03 12:24:40 [INFO] Starting no-text bill marking process
|
| 22 |
+
2025-11-03 12:24:40 [INFO] Loaded 1 bills from data/known_bills_visualize.json
|
| 23 |
+
2025-11-03 12:24:40 [INFO] Processing 1 bills to mark no-text bills
|
| 24 |
+
2025-11-03 12:24:40 [INFO] Saved 1 bills to data/known_bills_visualize.json
|
| 25 |
+
2025-11-03 12:24:40 [INFO] Processing complete!
|
| 26 |
+
2025-11-03 12:24:40 [INFO] Total bills processed: 1
|
| 27 |
+
2025-11-03 12:24:40 [INFO] Bills without text: 0
|
| 28 |
+
2025-11-03 12:24:40 [INFO] Already had None categories: 0
|
| 29 |
+
2025-11-03 12:24:40 [INFO] Newly marked as None: 0
|
| 30 |
+
2025-11-03 12:24:40 [INFO] No-text bill marking process completed
|
| 31 |
+
2025-11-04 15:55:11 [INFO] Starting no-text bill marking process
|
| 32 |
+
2025-11-04 15:55:11 [INFO] Loaded 10 bills from data/known_bills_visualize.json
|
| 33 |
+
2025-11-04 15:55:11 [INFO] Processing 10 bills to mark no-text bills
|
| 34 |
+
2025-11-04 15:55:11 [INFO] Saved 10 bills to data/known_bills_visualize.json
|
| 35 |
+
2025-11-04 15:55:11 [INFO] Processing complete!
|
| 36 |
+
2025-11-04 15:55:11 [INFO] Total bills processed: 10
|
| 37 |
+
2025-11-04 15:55:11 [INFO] Bills without text: 0
|
| 38 |
+
2025-11-04 15:55:11 [INFO] Already had None categories: 0
|
| 39 |
+
2025-11-04 15:55:11 [INFO] Newly marked as None: 0
|
| 40 |
+
2025-11-04 15:55:11 [INFO] No-text bill marking process completed
|
| 41 |
+
2025-11-14 15:31:16 [INFO] Starting no-text bill marking process
|
| 42 |
+
2025-11-14 15:31:16 [INFO] Loaded 2564 bills from data/known_bills_visualize.json
|
| 43 |
+
2025-11-14 15:31:16 [INFO] Processing 2564 bills to mark no-text bills
|
| 44 |
+
2025-11-14 15:31:16 [INFO] Progress: 100/2564 processed
|
| 45 |
+
2025-11-14 15:31:16 [INFO] Progress: 200/2564 processed
|
| 46 |
+
2025-11-14 15:31:16 [INFO] Progress: 300/2564 processed
|
| 47 |
+
2025-11-14 15:31:16 [INFO] Progress: 400/2564 processed
|
| 48 |
+
2025-11-14 15:31:16 [INFO] Progress: 500/2564 processed
|
| 49 |
+
2025-11-14 15:31:16 [INFO] Progress: 600/2564 processed
|
| 50 |
+
2025-11-14 15:31:16 [INFO] Progress: 700/2564 processed
|
| 51 |
+
2025-11-14 15:31:16 [INFO] Progress: 800/2564 processed
|
| 52 |
+
2025-11-14 15:31:16 [INFO] Progress: 900/2564 processed
|
| 53 |
+
2025-11-14 15:31:16 [INFO] Progress: 1000/2564 processed
|
| 54 |
+
2025-11-14 15:31:16 [INFO] Progress: 1100/2564 processed
|
| 55 |
+
2025-11-14 15:31:16 [INFO] Progress: 1200/2564 processed
|
| 56 |
+
2025-11-14 15:31:16 [INFO] Progress: 1300/2564 processed
|
| 57 |
+
2025-11-14 15:31:16 [INFO] Progress: 1400/2564 processed
|
| 58 |
+
2025-11-14 15:31:16 [INFO] Progress: 1500/2564 processed
|
| 59 |
+
2025-11-14 15:31:16 [INFO] Progress: 1600/2564 processed
|
| 60 |
+
2025-11-14 15:31:16 [INFO] Progress: 1700/2564 processed
|
| 61 |
+
2025-11-14 15:31:16 [INFO] Progress: 1800/2564 processed
|
| 62 |
+
2025-11-14 15:31:16 [INFO] Progress: 1900/2564 processed
|
| 63 |
+
2025-11-14 15:31:16 [INFO] Progress: 2000/2564 processed
|
| 64 |
+
2025-11-14 15:31:16 [INFO] Progress: 2100/2564 processed
|
| 65 |
+
2025-11-14 15:31:16 [INFO] Progress: 2200/2564 processed
|
| 66 |
+
2025-11-14 15:31:16 [INFO] Progress: 2300/2564 processed
|
| 67 |
+
2025-11-14 15:31:16 [INFO] Progress: 2400/2564 processed
|
| 68 |
+
2025-11-14 15:31:16 [INFO] Progress: 2500/2564 processed
|
| 69 |
+
2025-11-14 15:31:17 [INFO] Saved 2564 bills to data/known_bills_visualize.json
|
| 70 |
+
2025-11-14 15:31:17 [INFO] Processing complete!
|
| 71 |
+
2025-11-14 15:31:17 [INFO] Total bills processed: 2564
|
| 72 |
+
2025-11-14 15:31:17 [INFO] Bills without text: 9
|
| 73 |
+
2025-11-14 15:31:17 [INFO] Already had None categories: 9
|
| 74 |
+
2025-11-14 15:31:17 [INFO] Newly marked as None: 0
|
| 75 |
+
2025-11-14 15:31:17 [INFO] No-text bill marking process completed
|
| 76 |
+
2025-11-17 21:13:12 [INFO] Starting no-text bill marking process
|
| 77 |
+
2025-11-17 21:13:13 [INFO] Loaded 2564 bills from data/known_bills_visualize.json
|
| 78 |
+
2025-11-17 21:13:13 [INFO] Processing 2564 bills to mark no-text bills
|
| 79 |
+
2025-11-17 21:13:13 [INFO] Progress: 100/2564 processed
|
| 80 |
+
2025-11-17 21:13:13 [INFO] Progress: 200/2564 processed
|
| 81 |
+
2025-11-17 21:13:13 [INFO] Progress: 300/2564 processed
|
| 82 |
+
2025-11-17 21:13:13 [INFO] Progress: 400/2564 processed
|
| 83 |
+
2025-11-17 21:13:13 [INFO] Progress: 500/2564 processed
|
| 84 |
+
2025-11-17 21:13:13 [INFO] Progress: 600/2564 processed
|
| 85 |
+
2025-11-17 21:13:13 [INFO] Progress: 700/2564 processed
|
| 86 |
+
2025-11-17 21:13:13 [INFO] Progress: 800/2564 processed
|
| 87 |
+
2025-11-17 21:13:13 [INFO] Progress: 900/2564 processed
|
| 88 |
+
2025-11-17 21:13:13 [INFO] Progress: 1000/2564 processed
|
| 89 |
+
2025-11-17 21:13:13 [INFO] Progress: 1100/2564 processed
|
| 90 |
+
2025-11-17 21:13:13 [INFO] Progress: 1200/2564 processed
|
| 91 |
+
2025-11-17 21:13:13 [INFO] Progress: 1300/2564 processed
|
| 92 |
+
2025-11-17 21:13:13 [INFO] Progress: 1400/2564 processed
|
| 93 |
+
2025-11-17 21:13:13 [INFO] Progress: 1500/2564 processed
|
| 94 |
+
2025-11-17 21:13:13 [INFO] Progress: 1600/2564 processed
|
| 95 |
+
2025-11-17 21:13:13 [INFO] Progress: 1700/2564 processed
|
| 96 |
+
2025-11-17 21:13:13 [INFO] Progress: 1800/2564 processed
|
| 97 |
+
2025-11-17 21:13:13 [INFO] Progress: 1900/2564 processed
|
| 98 |
+
2025-11-17 21:13:13 [INFO] Progress: 2000/2564 processed
|
| 99 |
+
2025-11-17 21:13:13 [INFO] Progress: 2100/2564 processed
|
| 100 |
+
2025-11-17 21:13:13 [INFO] Progress: 2200/2564 processed
|
| 101 |
+
2025-11-17 21:13:13 [INFO] Progress: 2300/2564 processed
|
| 102 |
+
2025-11-17 21:13:13 [INFO] Progress: 2400/2564 processed
|
| 103 |
+
2025-11-17 21:13:13 [INFO] Progress: 2500/2564 processed
|
| 104 |
+
2025-11-17 21:13:14 [INFO] Saved 2564 bills to data/known_bills_visualize.json
|
| 105 |
+
2025-11-17 21:13:14 [INFO] Processing complete!
|
| 106 |
+
2025-11-17 21:13:14 [INFO] Total bills processed: 2564
|
| 107 |
+
2025-11-17 21:13:14 [INFO] Bills without text: 9
|
| 108 |
+
2025-11-17 21:13:14 [INFO] Already had None categories: 9
|
| 109 |
+
2025-11-17 21:13:14 [INFO] Newly marked as None: 0
|
| 110 |
+
2025-11-17 21:13:14 [INFO] No-text bill marking process completed
|
| 111 |
+
2025-11-20 13:52:45 [INFO] Starting no-text bill marking process
|
| 112 |
+
2025-11-20 13:52:46 [INFO] Loaded 2596 bills from data/known_bills_visualize.json
|
| 113 |
+
2025-11-20 13:52:46 [INFO] Processing 2596 bills to mark no-text bills
|
| 114 |
+
2025-11-20 13:52:46 [INFO] Progress: 100/2596 processed
|
| 115 |
+
2025-11-20 13:52:46 [INFO] Progress: 200/2596 processed
|
| 116 |
+
2025-11-20 13:52:46 [INFO] Progress: 300/2596 processed
|
| 117 |
+
2025-11-20 13:52:46 [INFO] Progress: 400/2596 processed
|
| 118 |
+
2025-11-20 13:52:46 [INFO] Progress: 500/2596 processed
|
| 119 |
+
2025-11-20 13:52:46 [INFO] Progress: 600/2596 processed
|
| 120 |
+
2025-11-20 13:52:46 [INFO] Progress: 700/2596 processed
|
| 121 |
+
2025-11-20 13:52:46 [INFO] Progress: 800/2596 processed
|
| 122 |
+
2025-11-20 13:52:46 [INFO] Progress: 900/2596 processed
|
| 123 |
+
2025-11-20 13:52:46 [INFO] Progress: 1000/2596 processed
|
| 124 |
+
2025-11-20 13:52:46 [INFO] Progress: 1100/2596 processed
|
| 125 |
+
2025-11-20 13:52:46 [INFO] Progress: 1200/2596 processed
|
| 126 |
+
2025-11-20 13:52:46 [INFO] Progress: 1300/2596 processed
|
| 127 |
+
2025-11-20 13:52:46 [INFO] Progress: 1400/2596 processed
|
| 128 |
+
2025-11-20 13:52:46 [INFO] Progress: 1500/2596 processed
|
| 129 |
+
2025-11-20 13:52:46 [INFO] Progress: 1600/2596 processed
|
| 130 |
+
2025-11-20 13:52:46 [INFO] Progress: 1700/2596 processed
|
| 131 |
+
2025-11-20 13:52:46 [INFO] Progress: 1800/2596 processed
|
| 132 |
+
2025-11-20 13:52:46 [INFO] Progress: 1900/2596 processed
|
| 133 |
+
2025-11-20 13:52:46 [INFO] Progress: 2000/2596 processed
|
| 134 |
+
2025-11-20 13:52:46 [INFO] Progress: 2100/2596 processed
|
| 135 |
+
2025-11-20 13:52:46 [INFO] Progress: 2200/2596 processed
|
| 136 |
+
2025-11-20 13:52:46 [INFO] Progress: 2300/2596 processed
|
| 137 |
+
2025-11-20 13:52:46 [INFO] Progress: 2400/2596 processed
|
| 138 |
+
2025-11-20 13:52:46 [INFO] Progress: 2500/2596 processed
|
| 139 |
+
2025-11-20 13:52:47 [INFO] Saved 2596 bills to data/known_bills_visualize.json
|
| 140 |
+
2025-11-20 13:52:47 [INFO] Processing complete!
|
| 141 |
+
2025-11-20 13:52:47 [INFO] Total bills processed: 2596
|
| 142 |
+
2025-11-20 13:52:47 [INFO] Bills without text: 13
|
| 143 |
+
2025-11-20 13:52:47 [INFO] Already had None categories: 13
|
| 144 |
+
2025-11-20 13:52:47 [INFO] Newly marked as None: 0
|
| 145 |
+
2025-11-20 13:52:47 [INFO] No-text bill marking process completed
|
| 146 |
+
2025-12-01 12:30:17 [INFO] Starting no-text bill marking process
|
| 147 |
+
2025-12-01 12:30:17 [INFO] Loaded 2605 bills from data/known_bills_visualize.json
|
| 148 |
+
2025-12-01 12:30:17 [INFO] Processing 2605 bills to mark no-text bills
|
| 149 |
+
2025-12-01 12:30:17 [INFO] Progress: 100/2605 processed
|
| 150 |
+
2025-12-01 12:30:17 [INFO] Progress: 200/2605 processed
|
| 151 |
+
2025-12-01 12:30:17 [INFO] Progress: 300/2605 processed
|
| 152 |
+
2025-12-01 12:30:17 [INFO] Progress: 400/2605 processed
|
| 153 |
+
2025-12-01 12:30:17 [INFO] Progress: 500/2605 processed
|
| 154 |
+
2025-12-01 12:30:17 [INFO] Progress: 600/2605 processed
|
| 155 |
+
2025-12-01 12:30:17 [INFO] Progress: 700/2605 processed
|
| 156 |
+
2025-12-01 12:30:17 [INFO] Progress: 800/2605 processed
|
| 157 |
+
2025-12-01 12:30:17 [INFO] Progress: 900/2605 processed
|
| 158 |
+
2025-12-01 12:30:17 [INFO] Progress: 1000/2605 processed
|
| 159 |
+
2025-12-01 12:30:17 [INFO] Progress: 1100/2605 processed
|
| 160 |
+
2025-12-01 12:30:17 [INFO] Progress: 1200/2605 processed
|
| 161 |
+
2025-12-01 12:30:17 [INFO] Progress: 1300/2605 processed
|
| 162 |
+
2025-12-01 12:30:17 [INFO] Progress: 1400/2605 processed
|
| 163 |
+
2025-12-01 12:30:17 [INFO] Progress: 1500/2605 processed
|
| 164 |
+
2025-12-01 12:30:17 [INFO] Progress: 1600/2605 processed
|
| 165 |
+
2025-12-01 12:30:17 [INFO] Progress: 1700/2605 processed
|
| 166 |
+
2025-12-01 12:30:17 [INFO] Progress: 1800/2605 processed
|
| 167 |
+
2025-12-01 12:30:17 [INFO] Progress: 1900/2605 processed
|
| 168 |
+
2025-12-01 12:30:17 [INFO] Progress: 2000/2605 processed
|
| 169 |
+
2025-12-01 12:30:17 [INFO] Progress: 2100/2605 processed
|
| 170 |
+
2025-12-01 12:30:17 [INFO] Progress: 2200/2605 processed
|
| 171 |
+
2025-12-01 12:30:17 [INFO] Progress: 2300/2605 processed
|
| 172 |
+
2025-12-01 12:30:17 [INFO] Progress: 2400/2605 processed
|
| 173 |
+
2025-12-01 12:30:17 [INFO] Progress: 2500/2605 processed
|
| 174 |
+
2025-12-01 12:30:17 [INFO] Progress: 2600/2605 processed
|
| 175 |
+
2025-12-01 12:30:19 [INFO] Saved 2605 bills to data/known_bills_visualize.json
|
| 176 |
+
2025-12-01 12:30:19 [INFO] Processing complete!
|
| 177 |
+
2025-12-01 12:30:19 [INFO] Total bills processed: 2605
|
| 178 |
+
2025-12-01 12:30:19 [INFO] Bills without text: 16
|
| 179 |
+
2025-12-01 12:30:19 [INFO] Already had None categories: 16
|
| 180 |
+
2025-12-01 12:30:19 [INFO] Newly marked as None: 0
|
| 181 |
+
2025-12-01 12:30:19 [INFO] No-text bill marking process completed
|
| 182 |
+
2025-12-01 13:11:46 [INFO] Starting no-text bill marking process
|
| 183 |
+
2025-12-01 13:11:47 [INFO] Loaded 2605 bills from data/known_bills_visualize.json
|
| 184 |
+
2025-12-01 13:11:47 [INFO] Processing 2605 bills to mark no-text bills
|
| 185 |
+
2025-12-01 13:11:47 [INFO] Progress: 100/2605 processed
|
| 186 |
+
2025-12-01 13:11:47 [INFO] Progress: 200/2605 processed
|
| 187 |
+
2025-12-01 13:11:47 [INFO] Progress: 300/2605 processed
|
| 188 |
+
2025-12-01 13:11:47 [INFO] Progress: 400/2605 processed
|
| 189 |
+
2025-12-01 13:11:47 [INFO] Progress: 500/2605 processed
|
| 190 |
+
2025-12-01 13:11:47 [INFO] Progress: 600/2605 processed
|
| 191 |
+
2025-12-01 13:11:47 [INFO] Progress: 700/2605 processed
|
| 192 |
+
2025-12-01 13:11:47 [INFO] Progress: 800/2605 processed
|
| 193 |
+
2025-12-01 13:11:47 [INFO] Progress: 900/2605 processed
|
| 194 |
+
2025-12-01 13:11:47 [INFO] Progress: 1000/2605 processed
|
| 195 |
+
2025-12-01 13:11:47 [INFO] Progress: 1100/2605 processed
|
| 196 |
+
2025-12-01 13:11:47 [INFO] Progress: 1200/2605 processed
|
| 197 |
+
2025-12-01 13:11:47 [INFO] Progress: 1300/2605 processed
|
| 198 |
+
2025-12-01 13:11:47 [INFO] Progress: 1400/2605 processed
|
| 199 |
+
2025-12-01 13:11:47 [INFO] Progress: 1500/2605 processed
|
| 200 |
+
2025-12-01 13:11:47 [INFO] Progress: 1600/2605 processed
|
| 201 |
+
2025-12-01 13:11:47 [INFO] Progress: 1700/2605 processed
|
| 202 |
+
2025-12-01 13:11:47 [INFO] Progress: 1800/2605 processed
|
| 203 |
+
2025-12-01 13:11:47 [INFO] Progress: 1900/2605 processed
|
| 204 |
+
2025-12-01 13:11:47 [INFO] Progress: 2000/2605 processed
|
| 205 |
+
2025-12-01 13:11:47 [INFO] Progress: 2100/2605 processed
|
| 206 |
+
2025-12-01 13:11:47 [INFO] Progress: 2200/2605 processed
|
| 207 |
+
2025-12-01 13:11:47 [INFO] Progress: 2300/2605 processed
|
| 208 |
+
2025-12-01 13:11:47 [INFO] Progress: 2400/2605 processed
|
| 209 |
+
2025-12-01 13:11:47 [INFO] Progress: 2500/2605 processed
|
| 210 |
+
2025-12-01 13:11:47 [INFO] Progress: 2600/2605 processed
|
| 211 |
+
2025-12-01 13:11:48 [INFO] Saved 2605 bills to data/known_bills_visualize.json
|
| 212 |
+
2025-12-01 13:11:48 [INFO] Processing complete!
|
| 213 |
+
2025-12-01 13:11:48 [INFO] Total bills processed: 2605
|
| 214 |
+
2025-12-01 13:11:48 [INFO] Bills without text: 16
|
| 215 |
+
2025-12-01 13:11:48 [INFO] Already had None categories: 16
|
| 216 |
+
2025-12-01 13:11:48 [INFO] Newly marked as None: 0
|
| 217 |
+
2025-12-01 13:11:48 [INFO] No-text bill marking process completed
|
| 218 |
+
2025-12-01 13:16:12 [INFO] Starting no-text bill marking process
|
| 219 |
+
2025-12-01 13:16:13 [ERROR] Error loading bills: Expecting ',' delimiter: line 70396 column 683331 (char 189968803)
|
| 220 |
+
2025-12-01 13:16:13 [ERROR] No bills loaded. Exiting.
|
| 221 |
+
2025-12-01 13:16:13 [INFO] No-text bill marking process completed
|
| 222 |
+
2025-12-01 13:16:16 [INFO] Starting no-text bill marking process
|
| 223 |
+
2025-12-01 13:16:17 [INFO] Loaded 2605 bills from data/known_bills_visualize.json
|
| 224 |
+
2025-12-01 13:16:17 [INFO] Processing 2605 bills to mark no-text bills
|
| 225 |
+
2025-12-01 13:16:17 [INFO] Progress: 100/2605 processed
|
| 226 |
+
2025-12-01 13:16:17 [INFO] Progress: 200/2605 processed
|
| 227 |
+
2025-12-01 13:16:17 [INFO] Progress: 300/2605 processed
|
| 228 |
+
2025-12-01 13:16:17 [INFO] Progress: 400/2605 processed
|
| 229 |
+
2025-12-01 13:16:17 [INFO] Progress: 500/2605 processed
|
| 230 |
+
2025-12-01 13:16:17 [INFO] Progress: 600/2605 processed
|
| 231 |
+
2025-12-01 13:16:17 [INFO] Progress: 700/2605 processed
|
| 232 |
+
2025-12-01 13:16:17 [INFO] Progress: 800/2605 processed
|
| 233 |
+
2025-12-01 13:16:17 [INFO] Progress: 900/2605 processed
|
| 234 |
+
2025-12-01 13:16:17 [INFO] Progress: 1000/2605 processed
|
| 235 |
+
2025-12-01 13:16:17 [INFO] Progress: 1100/2605 processed
|
| 236 |
+
2025-12-01 13:16:17 [INFO] Progress: 1200/2605 processed
|
| 237 |
+
2025-12-01 13:16:17 [INFO] Progress: 1300/2605 processed
|
| 238 |
+
2025-12-01 13:16:17 [INFO] Progress: 1400/2605 processed
|
| 239 |
+
2025-12-01 13:16:17 [INFO] Progress: 1500/2605 processed
|
| 240 |
+
2025-12-01 13:16:17 [INFO] Progress: 1600/2605 processed
|
| 241 |
+
2025-12-01 13:16:17 [INFO] Progress: 1700/2605 processed
|
| 242 |
+
2025-12-01 13:16:17 [INFO] Progress: 1800/2605 processed
|
| 243 |
+
2025-12-01 13:16:17 [INFO] Progress: 1900/2605 processed
|
| 244 |
+
2025-12-01 13:16:17 [INFO] Progress: 2000/2605 processed
|
| 245 |
+
2025-12-01 13:16:17 [INFO] Progress: 2100/2605 processed
|
| 246 |
+
2025-12-01 13:16:17 [INFO] Progress: 2200/2605 processed
|
| 247 |
+
2025-12-01 13:16:17 [INFO] Progress: 2300/2605 processed
|
| 248 |
+
2025-12-01 13:16:17 [INFO] Progress: 2400/2605 processed
|
| 249 |
+
2025-12-01 13:16:17 [INFO] Progress: 2500/2605 processed
|
| 250 |
+
2025-12-01 13:16:17 [INFO] Progress: 2600/2605 processed
|
| 251 |
+
2025-12-01 13:16:18 [INFO] Saved 2605 bills to data/known_bills_visualize.json
|
| 252 |
+
2025-12-01 13:16:18 [INFO] Processing complete!
|
| 253 |
+
2025-12-01 13:16:18 [INFO] Total bills processed: 2605
|
| 254 |
+
2025-12-01 13:16:18 [INFO] Bills without text: 16
|
| 255 |
+
2025-12-01 13:16:18 [INFO] Already had None categories: 16
|
| 256 |
+
2025-12-01 13:16:18 [INFO] Newly marked as None: 0
|
| 257 |
+
2025-12-01 13:16:18 [INFO] No-text bill marking process completed
|
| 258 |
+
2025-12-03 11:02:34 [INFO] Starting no-text bill marking process
|
| 259 |
+
2025-12-03 11:02:35 [INFO] Loaded 2608 bills from data/known_bills_visualize.json
|
| 260 |
+
2025-12-03 11:02:35 [INFO] Processing 2608 bills to mark no-text bills
|
| 261 |
+
2025-12-03 11:02:35 [INFO] Progress: 100/2608 processed
|
| 262 |
+
2025-12-03 11:02:35 [INFO] Progress: 200/2608 processed
|
| 263 |
+
2025-12-03 11:02:35 [INFO] Progress: 300/2608 processed
|
| 264 |
+
2025-12-03 11:02:35 [INFO] Progress: 400/2608 processed
|
| 265 |
+
2025-12-03 11:02:35 [INFO] Progress: 500/2608 processed
|
| 266 |
+
2025-12-03 11:02:35 [INFO] Progress: 600/2608 processed
|
| 267 |
+
2025-12-03 11:02:35 [INFO] Progress: 700/2608 processed
|
| 268 |
+
2025-12-03 11:02:35 [INFO] Progress: 800/2608 processed
|
| 269 |
+
2025-12-03 11:02:35 [INFO] Progress: 900/2608 processed
|
| 270 |
+
2025-12-03 11:02:35 [INFO] Progress: 1000/2608 processed
|
| 271 |
+
2025-12-03 11:02:35 [INFO] Progress: 1100/2608 processed
|
| 272 |
+
2025-12-03 11:02:35 [INFO] Progress: 1200/2608 processed
|
| 273 |
+
2025-12-03 11:02:35 [INFO] Progress: 1300/2608 processed
|
| 274 |
+
2025-12-03 11:02:35 [INFO] Progress: 1400/2608 processed
|
| 275 |
+
2025-12-03 11:02:35 [INFO] Progress: 1500/2608 processed
|
| 276 |
+
2025-12-03 11:02:35 [INFO] Progress: 1600/2608 processed
|
| 277 |
+
2025-12-03 11:02:35 [INFO] Progress: 1700/2608 processed
|
| 278 |
+
2025-12-03 11:02:35 [INFO] Progress: 1800/2608 processed
|
| 279 |
+
2025-12-03 11:02:35 [INFO] Progress: 1900/2608 processed
|
| 280 |
+
2025-12-03 11:02:35 [INFO] Progress: 2000/2608 processed
|
| 281 |
+
2025-12-03 11:02:35 [INFO] Progress: 2100/2608 processed
|
| 282 |
+
2025-12-03 11:02:35 [INFO] Progress: 2200/2608 processed
|
| 283 |
+
2025-12-03 11:02:35 [INFO] Progress: 2300/2608 processed
|
| 284 |
+
2025-12-03 11:02:35 [INFO] Progress: 2400/2608 processed
|
| 285 |
+
2025-12-03 11:02:35 [INFO] Progress: 2500/2608 processed
|
| 286 |
+
2025-12-03 11:02:35 [INFO] Progress: 2600/2608 processed
|
| 287 |
+
2025-12-03 11:02:36 [INFO] Saved 2608 bills to data/known_bills_visualize.json
|
| 288 |
+
2025-12-03 11:02:36 [INFO] Processing complete!
|
| 289 |
+
2025-12-03 11:02:36 [INFO] Total bills processed: 2608
|
| 290 |
+
2025-12-03 11:02:36 [INFO] Bills without text: 16
|
| 291 |
+
2025-12-03 11:02:36 [INFO] Already had None categories: 16
|
| 292 |
+
2025-12-03 11:02:36 [INFO] Newly marked as None: 0
|
| 293 |
+
2025-12-03 11:02:36 [INFO] No-text bill marking process completed
|
data/data_updating_scripts/logs/migrate_iapp_categories.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/data_updating_scripts/mark_no_text_bills.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Script to mark bills without text as having None IAPP categories.

This script reads known_bills_visualize.json, identifies bills without text,
and sets their IAPP categories to None. The file is modified in-place.
"""

import json
import os
import logging
from pathlib import Path
from typing import Dict, List
import sys

# Add the project root to the path so sibling modules are importable when
# this file is run directly as a script.
sys.path.append(str(Path(__file__).parent.parent))

# Create logs directory if it doesn't exist -- the FileHandler below fails
# when the directory is missing.  NOTE(review): the path is relative to the
# current working directory, so this presumably runs from the data/ folder;
# confirm against the invoking script.
os.makedirs("data_updating_scripts/logs", exist_ok=True)

# Set up logging: mirror every record to the console and to a per-script log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/mark_no_text_bills.log")]
)
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class NoTextBillMarker:
    """Marks bills that lack usable text by nulling their IAPP categories."""

    def __init__(self):
        # Path of the visualize JSON file that is read and rewritten in place.
        self.visualize_file = "data/known_bills_visualize.json"

    def load_bills(self) -> List[Dict]:
        """Read all bill records from the visualize file; return [] on any failure."""
        try:
            with open(self.visualize_file, 'r', encoding='utf-8') as fh:
                bills = json.load(fh)
            logger.info(f"Loaded {len(bills)} bills from {self.visualize_file}")
            return bills
        except Exception as e:
            logger.error(f"Error loading bills: {e}")
            return []

    def save_bills(self, bills: List[Dict]) -> None:
        """Write the bill records back to the visualize file in place."""
        try:
            with open(self.visualize_file, 'w', encoding='utf-8') as fh:
                json.dump(bills, fh, indent=2, ensure_ascii=False)
        except Exception as e:
            logger.error(f"Error saving bills: {e}")
        else:
            logger.info(f"Saved {len(bills)} bills to {self.visualize_file}")

    def has_text(self, bill: Dict) -> bool:
        """Return True when the bill carries a meaningful amount of text (>50 chars)."""
        text = bill.get('text')
        if not isinstance(text, str):
            return False
        return len(text.strip()) > 50

    def mark_no_text_bills(self) -> None:
        """Set iapp_categories to None for every bill lacking text, then save in place."""
        bills = self.load_bills()
        if not bills:
            logger.error("No bills loaded. Exiting.")
            return

        total_bills = len(bills)
        no_text_count = 0       # bills whose text is missing/too short
        already_none_count = 0  # of those, how many already had None categories

        logger.info(f"Processing {total_bills} bills to mark no-text bills")

        for i, bill in enumerate(bills, 1):
            if not self.has_text(bill):
                no_text_count += 1
                bill_key = f"{bill.get('state', 'Unknown')}_{bill.get('bill_number', 'Unknown')}"
                if bill.get('iapp_categories') is None:
                    already_none_count += 1
                    logger.debug(f"Bill {bill_key} already has None IAPP categories")
                else:
                    bill['iapp_categories'] = None
                    logger.info(f"Marked bill {bill_key} as having None IAPP categories (no text)")

            # Heartbeat so long runs show forward motion in the log.
            if i % 100 == 0:
                logger.info(f"Progress: {i}/{total_bills} processed")

        # Persist the (possibly unchanged) records back to disk.
        self.save_bills(bills)

        logger.info(f"Processing complete!")
        logger.info(f"Total bills processed: {total_bills}")
        logger.info(f"Bills without text: {no_text_count}")
        logger.info(f"Already had None categories: {already_none_count}")
        logger.info(f"Newly marked as None: {no_text_count - already_none_count}")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def main():
    """Main function to run the no-text bill marker."""
    logger.info("Starting no-text bill marking process")
    NoTextBillMarker().mark_no_text_bills()
    logger.info("No-text bill marking process completed")


if __name__ == "__main__":
    main()
|
data/data_updating_scripts/migrate_iapp_categories.py
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Script to migrate IAPP categories for bills with missing or invalid subcategories.

This script reads bills from known_bills_fixed.json, analyzes bills with missing IAPP categories
using OpenAI API, and saves the results to known_bills_visualize.json.
"""

import json
import logging
import time
from pathlib import Path
from typing import Dict, List, Optional
import sys
import os
import re
import hashlib
import argparse

# Add the project root to the path so `config` (and other siblings) import
# when this file is run directly as a script.
sys.path.append(str(Path(__file__).parent.parent))

from config import ConfigManager
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document

# Paths
INPUT_FILE = Path("data/known_bills_fixed.json")    # source bills (read-only input)
VIS_FILE = Path("data/known_bills_visualize.json")  # enriched output written by this script
CACHE_FILE = Path("data/iapp_categories_cache.json")  # per-bill_id category cache

# Create logs directory if it doesn't exist -- the FileHandler below needs it.
# NOTE(review): path is relative to the current working directory; confirm the
# script is always launched from the expected folder.
os.makedirs("data_updating_scripts/logs", exist_ok=True)

# Configure logging to both console and a per-script log file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/migrate_iapp_categories.log")]
)
logger = logging.getLogger(__name__)

# Exact subcategory lists for validation -- LLM output must match these verbatim.
EXACT_SUBCATEGORIES = {
    "Governance": ["Program and documentation", "Assessments", "Training", "Responsible individual"],
    "Transparency": ["General notice", "Labeling/notification", "Explanation/incident reporting", "Developer documentation"],
    "Assurance": ["Registration", "Third-party review"],
    "Individual Rights": ["Opt out/appeal", "Nondiscrimination"]
}

# Fallback categories for failed API calls (one conservative subcategory each).
FALLBACK_CATEGORIES = {
    "Governance": ["Program and documentation"],
    "Transparency": ["General notice"],
    "Assurance": ["Registration"],
    "Individual Rights": ["Opt out/appeal"]
}
|
| 60 |
+
|
| 61 |
+
def bill_key(b: Dict) -> str:
    """Return the unique '<state>_<bill_number>' key for a bill record."""
    state = b.get('state', 'Unknown')
    number = b.get('bill_number', 'Unknown')
    return f"{state}_{number}"
|
| 63 |
+
|
| 64 |
+
def sha256(s: Optional[str]) -> Optional[str]:
    """Hex SHA-256 digest of a non-empty string; None for anything else."""
    if isinstance(s, str) and s.strip():
        return hashlib.sha256(s.encode("utf-8")).hexdigest()
    return None
|
| 68 |
+
|
| 69 |
+
def load_json(path: Path, default):
    """Parse JSON from *path*; return *default* on any failure (missing file, bad JSON)."""
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        return default
|
| 75 |
+
|
| 76 |
+
def save_json(path: Path, data):
    """Serialize *data* as pretty-printed UTF-8 JSON, creating parent dirs as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(payload)
|
| 80 |
+
|
| 81 |
+
class IAPPCategoriesMigrator:
|
| 82 |
+
"""Migrates IAPP categories for bills with missing or invalid subcategories."""
|
| 83 |
+
|
| 84 |
+
def __init__(self, force: bool = False, rebuild_cache: bool = False, sleep_sec: float = 0.0):
|
| 85 |
+
"""Initialize the migrator with configuration."""
|
| 86 |
+
self.config = ConfigManager()
|
| 87 |
+
if not self.config.OPENAI_API_KEY:
|
| 88 |
+
raise ValueError("OPENAI_API_KEY not found in environment variables")
|
| 89 |
+
|
| 90 |
+
self.force = force
|
| 91 |
+
self.sleep_sec = max(0.0, sleep_sec)
|
| 92 |
+
# Cache
|
| 93 |
+
self.cache: Dict[str, Dict] = {} if rebuild_cache else load_json(CACHE_FILE, {})
|
| 94 |
+
|
| 95 |
+
self.llm = ChatOpenAI(
|
| 96 |
+
model=self.config.OPENAI_LLM_MODEL,
|
| 97 |
+
temperature=0.1,
|
| 98 |
+
max_tokens=1000
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
self.iapp_prompt = self._create_prompt()
|
| 102 |
+
self.chain = self.iapp_prompt | self.llm | StrOutputParser()
|
| 103 |
+
logger.info(
|
| 104 |
+
f"Initialized IAPPCategoriesMigrator | model={self.config.OPENAI_LLM_MODEL} | "
|
| 105 |
+
f"force={self.force} | rebuild_cache={rebuild_cache}"
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
def _create_prompt(self):
|
| 109 |
+
"""Create the IAPP analysis prompt with relaxed subcategory matching."""
|
| 110 |
+
prompt_text = """
|
| 111 |
+
Analyze the following AI-related bill content using the IAPP (International Association of Privacy Professionals) framework for AI governance categorization.
|
| 112 |
+
|
| 113 |
+
Your response must be ONLY a JSON object in this exact format with nothing else before or after:
|
| 114 |
+
{{"iapp_categories": {{"Governance": ["subcategory1", "subcategory2"], "Transparency": [], "Assurance": [], "Individual Rights": []}}}}
|
| 115 |
+
|
| 116 |
+
Use these four main categories and their EXACT subcategories (no variations allowed):
|
| 117 |
+
|
| 118 |
+
**Governance:**
|
| 119 |
+
- Program and documentation
|
| 120 |
+
- Assessments
|
| 121 |
+
- Training
|
| 122 |
+
- Responsible individual
|
| 123 |
+
|
| 124 |
+
**Transparency:**
|
| 125 |
+
- General notice
|
| 126 |
+
- Labeling/notification
|
| 127 |
+
- Explanation/incident reporting
|
| 128 |
+
- Developer documentation
|
| 129 |
+
|
| 130 |
+
**Assurance:**
|
| 131 |
+
- Registration
|
| 132 |
+
- Third-party review
|
| 133 |
+
|
| 134 |
+
**Individual Rights:**
|
| 135 |
+
- Opt out/appeal
|
| 136 |
+
- Nondiscrimination
|
| 137 |
+
|
| 138 |
+
Guidelines for categorization:
|
| 139 |
+
- Select ALL applicable subcategories that the bill directly addresses or substantially discusses
|
| 140 |
+
- If a category has no applicable subcategories, try to label it anyway based on surrounding context
|
| 141 |
+
- Be specific – prioritize subcategories that are clearly supported, but use judgment if AI or governance themes are present
|
| 142 |
+
- Focus on what the bill addresses or emphasizes, even if it doesn’t explicitly mandate requirements
|
| 143 |
+
- If the bill discusses AI, automation, decision systems, digital governance, or national technology strategy, categorize it as best as possible
|
| 144 |
+
- Avoid returning no categories when possible assuming that the bill is AI governance related, unless it truly could not be categorized into any of the four categories.
|
| 145 |
+
|
| 146 |
+
Bill content to analyze: {context}
|
| 147 |
+
"""
|
| 148 |
+
return ChatPromptTemplate.from_messages([
|
| 149 |
+
("system", prompt_text),
|
| 150 |
+
("human", "Analyze this bill for IAPP categories:")
|
| 151 |
+
])
|
| 152 |
+
|
| 153 |
+
def docs_from_bill(self, bill: Dict) -> List[Document]:
|
| 154 |
+
txt = bill.get("text", "")
|
| 155 |
+
if not isinstance(txt, str) or not txt.strip():
|
| 156 |
+
return []
|
| 157 |
+
return [
|
| 158 |
+
Document(
|
| 159 |
+
page_content=txt,
|
| 160 |
+
metadata={
|
| 161 |
+
"bill_key": bill_key(bill),
|
| 162 |
+
"state": bill.get("state", "Unknown"),
|
| 163 |
+
"bill_number": bill.get("bill_number", "Unknown"),
|
| 164 |
+
"title": bill.get("title", "No title"),
|
| 165 |
+
},
|
| 166 |
+
)
|
| 167 |
+
]
|
| 168 |
+
|
| 169 |
+
def is_valid_categories(self, iapp: Dict) -> bool:
|
| 170 |
+
if not isinstance(iapp, dict):
|
| 171 |
+
return False
|
| 172 |
+
for cat in ("Governance", "Transparency", "Assurance", "Individual Rights"):
|
| 173 |
+
if cat not in iapp or not isinstance(iapp[cat], list):
|
| 174 |
+
return False
|
| 175 |
+
for sub in iapp[cat]:
|
| 176 |
+
if sub not in EXACT_SUBCATEGORIES[cat]:
|
| 177 |
+
return False
|
| 178 |
+
return True
|
| 179 |
+
|
| 180 |
+
def parse_llm(self, response: str) -> Optional[Dict]:
|
| 181 |
+
m = re.search(r"\{.*\}", response, re.DOTALL)
|
| 182 |
+
if not m:
|
| 183 |
+
return None
|
| 184 |
+
try:
|
| 185 |
+
obj = json.loads(m.group(0))
|
| 186 |
+
return obj.get("iapp_categories")
|
| 187 |
+
except Exception:
|
| 188 |
+
return None
|
| 189 |
+
|
| 190 |
+
def cached_match(self, b: Dict) -> Optional[Dict]:
|
| 191 |
+
bid = str(b.get("bill_id"))
|
| 192 |
+
ch = b.get("change_hash")
|
| 193 |
+
txt_hash = sha256(b.get("text"))
|
| 194 |
+
c = self.cache.get(bid)
|
| 195 |
+
if not c:
|
| 196 |
+
return None
|
| 197 |
+
if (ch and c.get("change_hash") == ch) or (txt_hash and c.get("text_sha256") == txt_hash):
|
| 198 |
+
return c.get("iapp_categories")
|
| 199 |
+
return None
|
| 200 |
+
|
| 201 |
+
def remember(self, b: Dict, iapp: Dict):
|
| 202 |
+
bid = str(b.get("bill_id"))
|
| 203 |
+
self.cache[bid] = {
|
| 204 |
+
"bill_id": bid,
|
| 205 |
+
"change_hash": b.get("change_hash"),
|
| 206 |
+
"text_sha256": sha256(b.get("text")),
|
| 207 |
+
"iapp_categories": iapp,
|
| 208 |
+
"updated_at": b.get("lastUpdatedAt"),
|
| 209 |
+
"state": b.get("state"),
|
| 210 |
+
"bill_number": b.get("bill_number"),
|
| 211 |
+
"title": b.get("title"),
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
def run(self):
|
| 215 |
+
if not INPUT_FILE.exists():
|
| 216 |
+
raise FileNotFoundError(f"Missing {INPUT_FILE}")
|
| 217 |
+
src_bills: List[Dict] = load_json(INPUT_FILE, [])
|
| 218 |
+
vis_bills: List[Dict] = load_json(VIS_FILE, [])
|
| 219 |
+
|
| 220 |
+
vis_map = {bill_key(b): b for b in vis_bills}
|
| 221 |
+
|
| 222 |
+
total = len(src_bills)
|
| 223 |
+
reused_cache = 0
|
| 224 |
+
reused_vis = 0
|
| 225 |
+
computed = 0
|
| 226 |
+
skipped_no_text = 0
|
| 227 |
+
errors = 0
|
| 228 |
+
|
| 229 |
+
out_bills: List[Dict] = []
|
| 230 |
+
|
| 231 |
+
logger.info(f"Loaded {total} source bills; visualize has {len(vis_bills)} existing entries; cache size={len(self.cache)}")
|
| 232 |
+
|
| 233 |
+
for i, b in enumerate(src_bills, 1):
|
| 234 |
+
key = bill_key(b)
|
| 235 |
+
txt = b.get("text", "")
|
| 236 |
+
|
| 237 |
+
out_rec = b.copy()
|
| 238 |
+
|
| 239 |
+
if not isinstance(txt, str) or len(txt.strip()) <= 50:
|
| 240 |
+
prev = vis_map.get(key)
|
| 241 |
+
if prev and "iapp_categories" in prev:
|
| 242 |
+
out_rec["iapp_categories"] = prev["iapp_categories"]
|
| 243 |
+
else:
|
| 244 |
+
out_rec["iapp_categories"] = None
|
| 245 |
+
out_bills.append(out_rec)
|
| 246 |
+
skipped_no_text += 1
|
| 247 |
+
if i % 50 == 0:
|
| 248 |
+
logger.info(f"[{i}/{total}] progress...")
|
| 249 |
+
continue
|
| 250 |
+
|
| 251 |
+
if self.force:
|
| 252 |
+
iapp = self._compute_categories(b)
|
| 253 |
+
if iapp is None:
|
| 254 |
+
iapp = FALLBACK_CATEGORIES
|
| 255 |
+
errors += 1
|
| 256 |
+
else:
|
| 257 |
+
computed += 1
|
| 258 |
+
out_rec["iapp_categories"] = iapp
|
| 259 |
+
self.remember(b, iapp)
|
| 260 |
+
out_bills.append(out_rec)
|
| 261 |
+
if self.sleep_sec:
|
| 262 |
+
time.sleep(self.sleep_sec)
|
| 263 |
+
if i % 10 == 0:
|
| 264 |
+
save_json(VIS_FILE, out_bills)
|
| 265 |
+
save_json(CACHE_FILE, self.cache)
|
| 266 |
+
continue
|
| 267 |
+
|
| 268 |
+
cached = self.cached_match(b)
|
| 269 |
+
if cached and self.is_valid_categories(cached):
|
| 270 |
+
out_rec["iapp_categories"] = cached
|
| 271 |
+
out_bills.append(out_rec)
|
| 272 |
+
reused_cache += 1
|
| 273 |
+
if i % 50 == 0:
|
| 274 |
+
logger.info(f"[{i}/{total}] progress...")
|
| 275 |
+
continue
|
| 276 |
+
|
| 277 |
+
prev = vis_map.get(key)
|
| 278 |
+
if prev and "iapp_categories" in prev:
|
| 279 |
+
prev_bid = str(prev.get("bill_id"))
|
| 280 |
+
prev_cache = self.cache.get(prev_bid)
|
| 281 |
+
if prev_cache:
|
| 282 |
+
same = False
|
| 283 |
+
if b.get("change_hash") and prev_cache.get("change_hash") == b.get("change_hash"):
|
| 284 |
+
same = True
|
| 285 |
+
elif sha256(b.get("text")) == prev_cache.get("text_sha256"):
|
| 286 |
+
same = True
|
| 287 |
+
if same and self.is_valid_categories(prev.get("iapp_categories", {})):
|
| 288 |
+
out_rec["iapp_categories"] = prev["iapp_categories"]
|
| 289 |
+
out_bills.append(out_rec)
|
| 290 |
+
reused_vis += 1
|
| 291 |
+
if i % 50 == 0:
|
| 292 |
+
logger.info(f"[{i}/{total}] progress...")
|
| 293 |
+
continue
|
| 294 |
+
|
| 295 |
+
iapp = self._compute_categories(b)
|
| 296 |
+
if iapp is None:
|
| 297 |
+
iapp = FALLBACK_CATEGORIES
|
| 298 |
+
errors += 1
|
| 299 |
+
else:
|
| 300 |
+
computed += 1
|
| 301 |
+
out_rec["iapp_categories"] = iapp
|
| 302 |
+
self.remember(b, iapp)
|
| 303 |
+
out_bills.append(out_rec)
|
| 304 |
+
|
| 305 |
+
if self.sleep_sec:
|
| 306 |
+
time.sleep(self.sleep_sec)
|
| 307 |
+
if i % 10 == 0:
|
| 308 |
+
save_json(VIS_FILE, out_bills)
|
| 309 |
+
save_json(CACHE_FILE, self.cache)
|
| 310 |
+
|
| 311 |
+
save_json(VIS_FILE, out_bills)
|
| 312 |
+
save_json(CACHE_FILE, self.cache)
|
| 313 |
+
|
| 314 |
+
logger.info("IAPP migration complete.")
|
| 315 |
+
logger.info(f"Total: {total} | reused_cache: {reused_cache} | reused_visualize: {reused_vis} | computed: {computed} | no_text: {skipped_no_text} | errors: {errors}")
|
| 316 |
+
print("✅ IAPP categories migration completed successfully!")
|
| 317 |
+
print(f" Total: {total}")
|
| 318 |
+
print(f" Reused (cache): {reused_cache}")
|
| 319 |
+
print(f" Reused (visualize match): {reused_vis}")
|
| 320 |
+
print(f" Newly computed: {computed}")
|
| 321 |
+
print(f" No text: {skipped_no_text}")
|
| 322 |
+
print(f" Fallback/errors: {errors}")
|
| 323 |
+
print(f" Results: {VIS_FILE}")
|
| 324 |
+
print(f" Cache: {CACHE_FILE}")
|
| 325 |
+
|
| 326 |
+
def _compute_categories(self, bill: Dict) -> Optional[Dict]:
    """Classify a bill into IAPP categories via the LLM chain.

    Invokes the chain up to two times (one retry, since LLM output is
    occasionally malformed). If both attempts fail, falls back to
    FALLBACK_CATEGORIES when the bill text mentions AI, otherwise to a
    dict of empty category lists.

    Args:
        bill: Bill record; must provide the fields used by docs_from_bill()
            and may carry a "text" field (str or None).

    Returns:
        A categories dict, or None when the bill yields no documents or
        the LLM call raised.
    """
    try:
        docs = self.docs_from_bill(bill)
        if not docs:
            return None

        # Up to 2 attempts; a single retry usually recovers from a
        # malformed/unparsable LLM response. (Previously this was two
        # copy-pasted invoke/parse blocks.)
        for _ in range(2):
            resp = self.chain.invoke({"context": docs})
            parsed = self.parse_llm(resp)
            if parsed and self.is_valid_categories(parsed):
                return parsed

        # Both attempts failed: pick a fallback based on the raw text.
        # Guard against "text" being None (some bills have no text), which
        # previously raised AttributeError into the generic handler.
        # NOTE: substring check — "ai" also matches words like "maintain";
        # kept as-is to preserve existing behavior.
        txt = (bill.get("text") or "").lower()
        if "ai" in txt or "artificial intelligence" in txt:
            return FALLBACK_CATEGORIES
        return {"Governance": [], "Transparency": [], "Assurance": [], "Individual Rights": []}
    except Exception as e:
        logger.exception(f"LLM error for {bill_key(bill)}: {e}")
        return None
|
| 346 |
+
|
| 347 |
+
def main():
    """CLI entry point: parse arguments and run the IAPP migration."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--force",
        action="store_true",
        help="Recompute categories for all bills with text",
    )
    arg_parser.add_argument(
        "--rebuild-cache",
        action="store_true",
        help="Ignore existing cache file and rebuild it",
    )
    arg_parser.add_argument(
        "--sleep-sec",
        type=float,
        default=0.0,
        help="Sleep seconds between LLM calls (rate limiting)",
    )
    opts = arg_parser.parse_args()

    IAPPCategoriesMigrator(
        force=opts.force,
        rebuild_cache=opts.rebuild_cache,
        sleep_sec=opts.sleep_sec,
    ).run()
|
| 356 |
+
|
| 357 |
+
# Script entry point: run the IAPP categories migration from the CLI.
if __name__ == "__main__":
    main()
|
data/generate_password_hash.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Password Hash Generator for streamlit-authenticator
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python generate_password_hash.py
|
| 7 |
+
|
| 8 |
+
This will prompt you for a password and generate the bcrypt hash.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import streamlit_authenticator as stauth
|
| 12 |
+
|
| 13 |
+
def generate_hash():
    """Interactively hash a single password and print config snippets.

    Prompts for a password (input is NOT echoed to the terminal), hashes it
    with streamlit-authenticator's bcrypt-based Hasher, and prints
    ready-to-paste snippets for both config.yaml and secrets.toml.
    """
    # Local import: getpass is only needed for this interactive flow.
    import getpass

    print("=" * 50)
    print("Password Hash Generator")
    print("=" * 50)
    print()

    # Read the password without echoing it (previously plain input(),
    # which displayed the password on screen).
    password = getpass.getpass("Enter password to hash: ").strip()

    if not password:
        print("Password cannot be empty!")
        return

    # Generate hash
    print("\nGenerating hash...")
    hashed_passwords = stauth.Hasher([password]).generate()
    hash_value = hashed_passwords[0]

    print("\nHash generated successfully!")
    print("=" * 50)
    print(f"\nYour hashed password:\n{hash_value}")
    print("=" * 50)

    # Show example usage
    print("\nAdd to config.yaml:")
    print("-" * 50)
    print(f"""
credentials:
  usernames:
    username_here:
      email: user@example.com
      name: User Name
      password: {hash_value}
""")

    print("\nOr add to secrets.toml:")
    print("-" * 50)
    print(f"""
[auth.credentials.usernames.username_here]
email = "user@example.com"
name = "User Name"
password = "{hash_value}"
""")

    print("\nDone! Copy the hash above to your config file.")
|
| 58 |
+
|
| 59 |
+
def generate_multiple():
    """Interactively collect several users and print hashed credentials.

    Repeatedly prompts for username/password/email/name until an empty
    username is entered, hashes all passwords in one batch (passwords are
    read without terminal echo), and prints the resulting credentials in
    both config.yaml and secrets.toml formats.
    """
    # Local import: getpass is only needed for this interactive flow.
    import getpass

    print("=" * 50)
    print("Multiple User Password Hash Generator")
    print("=" * 50)
    print()

    users = {}

    while True:
        username = input("\nEnter username (or press Enter to finish): ").strip()
        if not username:
            break

        # Read the password without echoing it (previously plain input()).
        password = getpass.getpass(f"Enter password for {username}: ").strip()
        if not password:
            print("Password cannot be empty! Skipping user.")
            continue

        email = input(f"Enter email for {username}: ").strip()
        name = input(f"Enter full name for {username}: ").strip()

        users[username] = {
            'password': password,
            'email': email or f"{username}@example.com",
            'name': name or username.title()
        }

    if not users:
        print("\n No users to process!")
        return

    # Generate all hashes in one batch (dict order is insertion order,
    # so hashes line up with the users that provided them).
    print("\nGenerating hashes...")
    passwords = [data['password'] for data in users.values()]
    hashed_passwords = stauth.Hasher(passwords).generate()

    # Attach each hash to its user (zip instead of index bookkeeping).
    for data, hashed in zip(users.values(), hashed_passwords):
        data['hashed'] = hashed

    # Display results
    print("\nHashes generated successfully!")
    print("=" * 50)

    print("\nconfig.yaml format:")
    print("-" * 50)
    print("credentials:")
    print("  usernames:")
    for username, data in users.items():
        print(f"    {username}:")
        print(f"      email: {data['email']}")
        print(f"      name: {data['name']}")
        print(f"      password: {data['hashed']}")

    print("\nsecrets.toml format:")
    print("-" * 50)
    for username, data in users.items():
        print(f"[auth.credentials.usernames.{username}]")
        print(f'email = "{data["email"]}"')
        print(f'name = "{data["name"]}"')
        print(f'password = "{data["hashed"]}"')
        print()

    print("Done! Copy the configuration above to your config file.")
|
| 123 |
+
|
| 124 |
+
if __name__ == "__main__":
    # Interactive menu: single hash vs. batch of users.
    print("\nChoose an option:")
    print("1. Generate single password hash")
    print("2. Generate multiple user hashes")
    choice = input("\nEnter choice (1 or 2): ").strip()

    # Dispatch table instead of an if/elif chain.
    actions = {"1": generate_hash, "2": generate_multiple}
    action = actions.get(choice)
    if action is not None:
        action()
    else:
        print("Invalid choice!")
|
data/huggingface_upload.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HuggingFace Dataset Upload Module
|
| 3 |
+
- Tests HF connection
|
| 4 |
+
- Uploads known_bills_visualize.json (legacy function)
|
| 5 |
+
- Uploads ALL core data JSONs (new function) to HuggingFace Datasets Hub
|
| 6 |
+
Works with the Admin panel HuggingFace tab
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from huggingface_hub import HfApi, create_repo
|
| 10 |
+
import streamlit as st
|
| 11 |
+
import os
|
| 12 |
+
import json
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Dict, List, Tuple, Optional
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
FILES_TO_UPLOAD = {
|
| 18 |
+
"data/known_bills_visualize.json": "known_bills_visualize.json",
|
| 19 |
+
"data/bill_summaries.json": "bill_summaries.json",
|
| 20 |
+
"data/bill_suggested_questions.json": "bill_suggested_questions.json",
|
| 21 |
+
"data/bill_reports.json": "bill_reports.json",
|
| 22 |
+
"data/bill_cache.json": "bill_cache.json",
|
| 23 |
+
"data/known_bills.json": "known_bills.json",
|
| 24 |
+
"data/known_bills_fixed.json": "known_bills_fixed.json",
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _get_hf_token_and_repo() -> Tuple[str, str]:
    """Resolve the HuggingFace token and dataset repo id.

    Resolution order:
      1. Streamlit secrets (Admin UI): [huggingface.token] / [huggingface.dataset_repo]
      2. Environment variables (CLI scripts like update_data.py):
         HUGGINGFACE_HUB_TOKEN / HF_REPO_ID

    Raises:
        KeyError: when either value cannot be resolved from any source.
    """
    token, repo_id = None, None

    # Prefer Streamlit secrets; accessing them outside a Streamlit context
    # (or when the section is absent) raises, which we treat as "not set".
    try:
        token = st.secrets["huggingface"]["token"]
        repo_id = st.secrets["huggingface"]["dataset_repo"]
    except Exception:
        pass

    # Fall back to environment variables for anything still unset.
    token = token or os.getenv("HUGGINGFACE_HUB_TOKEN")
    repo_id = repo_id or os.getenv("HF_REPO_ID")

    if not (token and repo_id):
        raise KeyError(
            "HuggingFace configuration missing. "
            "Provide either Streamlit secrets "
            "[huggingface.token] and [huggingface.dataset_repo] "
            "or environment variables HUGGINGFACE_HUB_TOKEN and HF_REPO_ID."
        )

    return token, repo_id
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def test_hf_connection() -> Tuple[bool, str]:
    """Check that the configured HuggingFace token can authenticate.

    Returns:
        tuple: (success: bool, human-readable status message)
    """
    try:
        token, _ = _get_hf_token_and_repo()
        who = HfApi().whoami(token=token)
        # Fall through a few identity fields; HF responses vary by account type.
        display = who.get("name") or who.get("fullname") or who.get("id") or "User"
        return True, f"Connected as: {display}"
    except KeyError:
        # Raised by _get_hf_token_and_repo when configuration is absent.
        return False, "HuggingFace token or dataset_repo not found in secrets"
    except Exception as e:
        return False, f"Connection failed: {str(e)}"
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_dataset_url(filename: str = "known_bills_visualize.json") -> Optional[str]:
    """
    Get the public URL of a file inside the dataset.

    Args:
        filename: Name of the file in the HF dataset repo.

    Returns:
        str | None: URL to the dataset file, or None if config missing
    """
    repo = None
    # Try Streamlit secrets first; any failure (missing section, running
    # outside Streamlit) falls through to the env var, mirroring
    # _get_hf_token_and_repo so CLI callers also get a URL.
    try:
        repo = st.secrets["huggingface"]["dataset_repo"]
    except Exception:
        pass
    if not repo:
        repo = os.getenv("HF_REPO_ID")
    if not repo:
        return None
    # BUG FIX: the URL previously hard-coded the literal "(unknown)"
    # instead of interpolating the requested filename, so every caller
    # received a broken link.
    return f"https://huggingface.co/datasets/{repo}/resolve/main/{filename}"
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def _find_and_validate_json(possible_paths: List[Path]) -> Path:
    """
    Return the first existing path from *possible_paths*, after verifying
    that it parses as JSON whose top-level value is a dict or list.

    Args:
        possible_paths: Candidate file locations, checked in order.

    Returns:
        Path: The first candidate that exists.

    Raises:
        FileNotFoundError: If none of the candidates exist.
        ValueError: If the file is not valid JSON, or its top-level value
            is not a dict/list.
    """
    # First existing candidate, in priority order.
    file_path = next((p for p in possible_paths if p.exists()), None)

    if file_path is None:
        raise FileNotFoundError(
            "File not found.\n"
            "Checked locations:\n" + "\n".join(f" - {p}" for p in possible_paths)
        )

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, (dict, list)):
            raise ValueError("JSON file must contain a dict or list")
    except json.JSONDecodeError as e:
        # Chain the decode error so the original line/column info survives.
        raise ValueError(f"Invalid JSON file: {str(e)}") from e

    return file_path
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _ensure_dataset_exists(api: HfApi, repo_id: str, token: str) -> None:
    """Create the dataset repo if it does not already exist."""
    # Best-effort: exist_ok=True already tolerates a pre-existing repo, and
    # any other failure (network, permissions) will surface more clearly
    # when the subsequent upload_file call fails — so errors here are
    # deliberately swallowed.
    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            exist_ok=True,
            private=False,  # dataset is published publicly
        )
    except Exception:
        pass
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def upload_to_huggingface() -> str:
    """
    Legacy function: Upload ONLY known_bills_visualize.json to HuggingFace Datasets Hub.
    Used by existing Admin panel code. New code should prefer upload_all_to_huggingface().

    Returns:
        str: Public URL to the uploaded file

    Raises:
        FileNotFoundError: If JSON file doesn't exist
        Exception: If upload fails
    """
    try:
        token, repo_id = _get_hf_token_and_repo()
        api = HfApi()

        _ensure_dataset_exists(api, repo_id, token)

        possible_paths = [
            Path("data/known_bills_visualize.json"),
            Path("known_bills_visualize.json"),
        ]
        file_path = _find_and_validate_json(possible_paths)

        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        api.upload_file(
            path_or_fileobj=str(file_path),
            path_in_repo="known_bills_visualize.json",
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message=f"Update AI legislation data ({file_size_mb:.2f}MB)",
        )

        return get_dataset_url("known_bills_visualize.json")

    except FileNotFoundError:
        # Propagate unchanged (bare raise instead of "raise e").
        raise
    except KeyError as e:
        # Chain the cause so the original traceback is preserved.
        raise Exception(f"Missing configuration in secrets.toml: {e}") from e
    except Exception as e:
        raise Exception(f"Upload failed: {str(e)}") from e
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def upload_all_to_huggingface() -> Dict[str, str]:
    """
    NEW: Upload ALL core JSON files to HuggingFace Datasets Hub.

    Returns:
        dict: mapping from dataset filename -> public URL (for successfully uploaded files)
    """
    token, repo_id = _get_hf_token_and_repo()
    hf_api = HfApi()
    _ensure_dataset_exists(hf_api, repo_id, token)

    results: Dict[str, str] = {}

    for local_path, dest_name in FILES_TO_UPLOAD.items():
        # Check both the configured location and the bare filename.
        candidates = [Path(local_path), Path(dest_name)]

        try:
            source_file = _find_and_validate_json(candidates)
        except FileNotFoundError:
            notice = f"Skipping missing file: {local_path}"
            print(notice)
            st.write(notice)
            continue
        except ValueError as e:
            notice = f"Skipping invalid JSON in {local_path}: {e}"
            print(notice)
            st.write(notice)
            continue

        size_mb = os.path.getsize(source_file) / (1024 * 1024)

        print(f"Uploading {source_file} → {repo_id}/{dest_name} ...")
        hf_api.upload_file(
            path_or_fileobj=str(source_file),
            path_in_repo=dest_name,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message=f"Update {dest_name} ({size_mb:.2f}MB)",
        )

        file_url = get_dataset_url(dest_name)
        if file_url:
            results[dest_name] = file_url

    return results
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
if __name__ == "__main__":
    # Smoke test: verify credentials, then push every configured file.
    print("Testing HuggingFace connection...")
    ok, message = test_hf_connection()
    print(message)

    if ok:
        print("\nAttempting upload of ALL files...")
        try:
            uploaded = upload_all_to_huggingface()
            print("\nUpload successful!")
            for fname, link in uploaded.items():
                print(f"- {fname}: {link}")
        except Exception as e:
            print(f"\nUpload failed: {e}")
|
data/pages/Admin.py
ADDED
|
@@ -0,0 +1,459 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import streamlit_authenticator as stauth
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import sys
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import subprocess
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import os
|
| 9 |
+
from huggingface_upload import upload_all_to_huggingface
|
| 10 |
+
|
| 11 |
+
# Allow imports of project modules
|
| 12 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 13 |
+
from user_management import HuggingFaceUserManager, load_user_config
|
| 14 |
+
|
| 15 |
+
st.set_page_config(page_title="Admin Panel", layout="wide", page_icon="🛠️")
|
| 16 |
+
|
| 17 |
+
# CSS
|
| 18 |
+
st.markdown("""
|
| 19 |
+
<style>
|
| 20 |
+
.main .block-container { padding-top: 2rem; max-width: 1200px; }
|
| 21 |
+
h2 { color: #e0e0e0 !important; font-weight: 400 !important; font-size: 1.5rem !important; }
|
| 22 |
+
</style>
|
| 23 |
+
""", unsafe_allow_html=True)
|
| 24 |
+
|
| 25 |
+
# CONFIG
|
| 26 |
+
config, using_hf = load_user_config()
|
| 27 |
+
|
| 28 |
+
if config is None:
|
| 29 |
+
st.error("Authentication configuration not found!")
|
| 30 |
+
st.stop()
|
| 31 |
+
|
| 32 |
+
# AUTH SYSTEM
|
| 33 |
+
authenticator = stauth.Authenticate(
|
| 34 |
+
config['credentials'],
|
| 35 |
+
config['cookie']['name'],
|
| 36 |
+
config['cookie']['key'],
|
| 37 |
+
config['cookie']['expiry_days']
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
try:
|
| 41 |
+
authenticator.login('main')
|
| 42 |
+
except Exception as e:
|
| 43 |
+
st.error(f"Login error: {e}")
|
| 44 |
+
|
| 45 |
+
name = st.session_state.get("name")
|
| 46 |
+
authentication_status = st.session_state.get("authentication_status")
|
| 47 |
+
username = st.session_state.get("username")
|
| 48 |
+
|
| 49 |
+
if authentication_status == False:
|
| 50 |
+
st.error('Username/password is incorrect')
|
| 51 |
+
st.stop()
|
| 52 |
+
|
| 53 |
+
if authentication_status == None:
|
| 54 |
+
st.warning('Please enter your username and password')
|
| 55 |
+
st.stop()
|
| 56 |
+
|
| 57 |
+
# AUTH VIEW
|
| 58 |
+
if authentication_status:
|
| 59 |
+
|
| 60 |
+
with st.sidebar:
|
| 61 |
+
st.markdown("---")
|
| 62 |
+
st.markdown(f"**Logged in as:** {name}")
|
| 63 |
+
st.markdown(f"**Username:** {username}")
|
| 64 |
+
authenticator.logout('Logout', 'sidebar')
|
| 65 |
+
|
| 66 |
+
ALLOWED_USERNAMES = set(config['credentials']['usernames'].keys())
|
| 67 |
+
if username not in ALLOWED_USERNAMES:
|
| 68 |
+
st.error(f"User '{username}' is not authorized.")
|
| 69 |
+
st.stop()
|
| 70 |
+
|
| 71 |
+
# HEADER
|
| 72 |
+
st.success(f"Welcome, {name}!")
|
| 73 |
+
st.markdown("---")
|
| 74 |
+
st.markdown("""
|
| 75 |
+
<div style='text-align: center; padding: 1rem 0 2rem 0;'>
|
| 76 |
+
<h1 style='color: #1f2937;'>Admin Panel</h1>
|
| 77 |
+
<p style='color: #6b7280;'>Cloud data sync controls</p>
|
| 78 |
+
</div>
|
| 79 |
+
""", unsafe_allow_html=True)
|
| 80 |
+
st.markdown("---")
|
| 81 |
+
|
| 82 |
+
# Tabs
|
| 83 |
+
tab1, tab2, tab3 = st.tabs(["Dashboard", "Data Pipeline", "User Management"])
|
| 84 |
+
|
| 85 |
+
# ------------------------------------------------------------------
|
| 86 |
+
# TAB 1 — Dashboard
|
| 87 |
+
# ------------------------------------------------------------------
|
| 88 |
+
with tab1:
|
| 89 |
+
st.subheader("Admin Dashboard")
|
| 90 |
+
|
| 91 |
+
users = config['credentials']['usernames']
|
| 92 |
+
admin_data = [
|
| 93 |
+
{
|
| 94 |
+
"Username": uname,
|
| 95 |
+
"Name": data.get("name"),
|
| 96 |
+
"Email": data.get("email"),
|
| 97 |
+
"Current User": "Admin" if uname == username else ""
|
| 98 |
+
}
|
| 99 |
+
for uname, data in users.items()
|
| 100 |
+
]
|
| 101 |
+
|
| 102 |
+
st.dataframe(pd.DataFrame(admin_data), width="stretch", hide_index=True)
|
| 103 |
+
|
| 104 |
+
# ------------------------------------------------------------------
|
| 105 |
+
# TAB 2 — DATA PIPELINE
|
| 106 |
+
# ------------------------------------------------------------------
|
| 107 |
+
with tab2:
|
| 108 |
+
st.subheader("Data Pipeline")
|
| 109 |
+
|
| 110 |
+
if 'huggingface' not in st.secrets:
|
| 111 |
+
st.warning("Add HuggingFace credentials to `.streamlit/secrets.toml`")
|
| 112 |
+
st.stop()
|
| 113 |
+
|
| 114 |
+
from huggingface_upload import upload_to_huggingface, test_hf_connection
|
| 115 |
+
|
| 116 |
+
# --- Connection Test
|
| 117 |
+
st.markdown("Connection Status")
|
| 118 |
+
col1, col2 = st.columns(2)
|
| 119 |
+
|
| 120 |
+
with col1:
|
| 121 |
+
if st.button("Test HuggingFace Connection", width='stretch'):
|
| 122 |
+
ok, msg = test_hf_connection()
|
| 123 |
+
(st.success if ok else st.error)(msg)
|
| 124 |
+
|
| 125 |
+
with col2:
|
| 126 |
+
repo = st.secrets["huggingface"]["dataset_repo"]
|
| 127 |
+
st.info(f"Dataset: {repo}")
|
| 128 |
+
|
| 129 |
+
st.markdown("---")
|
| 130 |
+
|
| 131 |
+
# --- Full Data Update Section
|
| 132 |
+
st.subheader("Full Data Update")
|
| 133 |
+
st.info("Pull new data, process PDFs, generate embeddings, and upload to HuggingFace.")
|
| 134 |
+
|
| 135 |
+
# ➤ NEW UI CONTROL — Pull new data?
|
| 136 |
+
pull_new_data = st.radio(
|
| 137 |
+
"Pull new data from LegiScan?",
|
| 138 |
+
options=[
|
| 139 |
+
("no", "No - Use existing local data"),
|
| 140 |
+
("yes", "Yes - Pull fresh data (costs API quota)"),
|
| 141 |
+
],
|
| 142 |
+
format_func=lambda x: x[1],
|
| 143 |
+
index=0,
|
| 144 |
+
key="pull_option"
|
| 145 |
+
)
|
| 146 |
+
|
| 147 |
+
# ➤ NEW UI CONTROL — overwrite known_bills.json?
|
| 148 |
+
overwrite_pdf = st.radio(
|
| 149 |
+
"After fixing PDF bills, overwrite data/known_bills.json?",
|
| 150 |
+
options=[
|
| 151 |
+
("no", "No - keep original file"),
|
| 152 |
+
("yes", "Yes - overwrite with cleaned PDF text"),
|
| 153 |
+
],
|
| 154 |
+
format_func=lambda x: x[1],
|
| 155 |
+
index=0,
|
| 156 |
+
key="overwrite_option"
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
# Run full update
|
| 160 |
+
if st.button("Run Full Update & Upload", type="primary", width='stretch'):
|
| 161 |
+
status_container = st.container()
|
| 162 |
+
|
| 163 |
+
with status_container:
|
| 164 |
+
st.markdown("### Step 1: Running Data Pipeline")
|
| 165 |
+
|
| 166 |
+
with st.status("Processing data...", expanded=True) as status:
|
| 167 |
+
|
| 168 |
+
try:
|
| 169 |
+
update_cmd = [sys.executable, "update_data.py"]
|
| 170 |
+
legiscan_answer = "y\n" if pull_new_data[0] == "yes" else "n\n"
|
| 171 |
+
|
| 172 |
+
import os
|
| 173 |
+
from dotenv import load_dotenv
|
| 174 |
+
load_dotenv()
|
| 175 |
+
|
| 176 |
+
env = os.environ.copy()
|
| 177 |
+
|
| 178 |
+
# Pass OpenAI keys (existing logic)
|
| 179 |
+
openai_key = (
|
| 180 |
+
st.secrets.get("openai_api_key")
|
| 181 |
+
or st.secrets.get("OPENAI_API_KEY")
|
| 182 |
+
or env.get("openai_api_key")
|
| 183 |
+
or env.get("OPENAI_API_KEY")
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
if openai_key:
|
| 187 |
+
env["OPENAI_API_KEY"] = openai_key
|
| 188 |
+
env["openai_api_key"] = openai_key
|
| 189 |
+
st.success("OpenAI key found")
|
| 190 |
+
else:
|
| 191 |
+
st.warning("OpenAI API key missing!")
|
| 192 |
+
|
| 193 |
+
# ➤ NEW: Pass PDF overwrite decision into environment
|
| 194 |
+
env["FIX_PDF_OVERWRITE"] = (
|
| 195 |
+
"yes" if overwrite_pdf[0] == "yes" else "no"
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
log_file = Path("pipeline_last_run.log")
|
| 199 |
+
|
| 200 |
+
with log_file.open("w", encoding="utf-8") as lf:
|
| 201 |
+
proc = subprocess.Popen(
|
| 202 |
+
update_cmd,
|
| 203 |
+
stdout=subprocess.PIPE,
|
| 204 |
+
stderr=subprocess.STDOUT,
|
| 205 |
+
stdin=subprocess.PIPE,
|
| 206 |
+
text=True,
|
| 207 |
+
bufsize=1,
|
| 208 |
+
env=env,
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
# Send LegiScan yes/no
|
| 212 |
+
try:
|
| 213 |
+
proc.stdin.write(legiscan_answer)
|
| 214 |
+
proc.stdin.write("n\n") # continue-on-error prompt
|
| 215 |
+
proc.stdin.flush()
|
| 216 |
+
proc.stdin.close()
|
| 217 |
+
except:
|
| 218 |
+
pass
|
| 219 |
+
|
| 220 |
+
# Stream output
|
| 221 |
+
for line in proc.stdout:
|
| 222 |
+
line = line.rstrip("\n")
|
| 223 |
+
st.text(line)
|
| 224 |
+
lf.write(line + "\n")
|
| 225 |
+
|
| 226 |
+
rc = proc.wait()
|
| 227 |
+
|
| 228 |
+
if rc == 0:
|
| 229 |
+
status.update(label="Data pipeline completed", state="complete")
|
| 230 |
+
st.success("Processing successful!")
|
| 231 |
+
|
| 232 |
+
st.markdown("---")
|
| 233 |
+
st.markdown("### Step 2: Uploading to HuggingFace")
|
| 234 |
+
|
| 235 |
+
with st.spinner("Uploading..."):
|
| 236 |
+
url = upload_to_huggingface()
|
| 237 |
+
st.success("Uploaded to HuggingFace!")
|
| 238 |
+
st.code(url)
|
| 239 |
+
st.cache_data.clear()
|
| 240 |
+
|
| 241 |
+
else:
|
| 242 |
+
status.update(label="Pipeline failed", state="error")
|
| 243 |
+
st.error(f"Pipeline exited with code {rc}")
|
| 244 |
+
|
| 245 |
+
except Exception as e:
|
| 246 |
+
st.error(f"Pipeline error: {e}")
|
| 247 |
+
st.exception(e)
|
| 248 |
+
|
| 249 |
+
st.markdown("---")
|
| 250 |
+
|
| 251 |
+
with st.expander("Manual Upload Only"):
|
| 252 |
+
st.info("Use this only when skipping update_data.py")
|
| 253 |
+
|
| 254 |
+
if st.button("Upload Existing Data", width='stretch'):
|
| 255 |
+
with st.spinner("Uploading..."):
|
| 256 |
+
url = upload_to_huggingface()
|
| 257 |
+
st.success("Uploaded!")
|
| 258 |
+
st.code(url)
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
with tab3:
|
| 262 |
+
st.subheader("User Management")
|
| 263 |
+
|
| 264 |
+
if using_hf:
|
| 265 |
+
st.success("Using HuggingFace for persistent user storage")
|
| 266 |
+
|
| 267 |
+
try:
|
| 268 |
+
user_manager = HuggingFaceUserManager()
|
| 269 |
+
|
| 270 |
+
st.markdown("Add New Admin")
|
| 271 |
+
|
| 272 |
+
with st.form("add_user_form"):
|
| 273 |
+
col1, col2 = st.columns(2)
|
| 274 |
+
with col1:
|
| 275 |
+
new_username = st.text_input("Username", key="new_username")
|
| 276 |
+
new_email = st.text_input("Email", key="new_email")
|
| 277 |
+
with col2:
|
| 278 |
+
new_name = st.text_input("Full Name", key="new_name")
|
| 279 |
+
new_password = st.text_input("Password", type="password", key="new_password")
|
| 280 |
+
|
| 281 |
+
submit_add = st.form_submit_button("Add Admin", type="primary", width='stretch')
|
| 282 |
+
|
| 283 |
+
if submit_add:
|
| 284 |
+
if not all([new_username, new_email, new_name, new_password]):
|
| 285 |
+
st.error("Please fill in all fields")
|
| 286 |
+
else:
|
| 287 |
+
with st.spinner("Adding user..."):
|
| 288 |
+
import bcrypt
|
| 289 |
+
hashed_password = bcrypt.hashpw(new_password.encode(), bcrypt.gensalt()).decode()
|
| 290 |
+
|
| 291 |
+
success, message, commit_url = user_manager.add_user(
|
| 292 |
+
new_username, new_email, new_name, hashed_password
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
+
if success:
|
| 296 |
+
st.success(f"{message}")
|
| 297 |
+
st.cache_data.clear()
|
| 298 |
+
if commit_url:
|
| 299 |
+
with st.expander("View commit"):
|
| 300 |
+
st.code(commit_url)
|
| 301 |
+
st.rerun()
|
| 302 |
+
else:
|
| 303 |
+
st.error(f"{message}")
|
| 304 |
+
|
| 305 |
+
st.markdown("---")
|
| 306 |
+
|
| 307 |
+
st.markdown("Edit Admin")
|
| 308 |
+
|
| 309 |
+
users = config['credentials']['usernames']
|
| 310 |
+
usernames_list = list(users.keys())
|
| 311 |
+
|
| 312 |
+
with st.form("edit_user_form"):
|
| 313 |
+
user_to_edit = st.selectbox(
|
| 314 |
+
"Select user to edit",
|
| 315 |
+
options=usernames_list,
|
| 316 |
+
key="edit_username"
|
| 317 |
+
)
|
| 318 |
+
|
| 319 |
+
current_user = users.get(user_to_edit, {})
|
| 320 |
+
|
| 321 |
+
st.markdown("**Current Details:**")
|
| 322 |
+
st.text(f"Email: {current_user.get('email', 'N/A')}")
|
| 323 |
+
st.text(f"Name: {current_user.get('name', 'N/A')}")
|
| 324 |
+
|
| 325 |
+
st.markdown("**New Details** (leave blank to keep current):")
|
| 326 |
+
|
| 327 |
+
col1, col2 = st.columns(2)
|
| 328 |
+
with col1:
|
| 329 |
+
new_email = st.text_input("New Email", key="edit_email", placeholder="Leave blank to keep current")
|
| 330 |
+
new_password = st.text_input("New Password", type="password", key="edit_password", placeholder="Leave blank to keep current")
|
| 331 |
+
with col2:
|
| 332 |
+
new_name = st.text_input("New Name", key="edit_name", placeholder="Leave blank to keep current")
|
| 333 |
+
|
| 334 |
+
submit_edit = st.form_submit_button("Update Admin", type="primary", width='stretch')
|
| 335 |
+
|
| 336 |
+
if submit_edit:
|
| 337 |
+
if not any([new_email, new_name, new_password]):
|
| 338 |
+
st.warning("Please enter at least one field to update")
|
| 339 |
+
else:
|
| 340 |
+
with st.spinner("Updating user..."):
|
| 341 |
+
hashed_password = None
|
| 342 |
+
if new_password:
|
| 343 |
+
import bcrypt
|
| 344 |
+
hashed_password = bcrypt.hashpw(new_password.encode(), bcrypt.gensalt()).decode()
|
| 345 |
+
|
| 346 |
+
success, message, commit_url = user_manager.update_user(
|
| 347 |
+
user_to_edit,
|
| 348 |
+
new_email=new_email if new_email else None,
|
| 349 |
+
new_name=new_name if new_name else None,
|
| 350 |
+
new_password=hashed_password
|
| 351 |
+
)
|
| 352 |
+
|
| 353 |
+
if success:
|
| 354 |
+
st.success(f"{message}")
|
| 355 |
+
st.info("Refreshing user data...")
|
| 356 |
+
st.cache_data.clear()
|
| 357 |
+
if commit_url:
|
| 358 |
+
with st.expander("View commit"):
|
| 359 |
+
st.code(commit_url)
|
| 360 |
+
st.info("Please log out and log back in if you changed your own password")
|
| 361 |
+
st.rerun()
|
| 362 |
+
else:
|
| 363 |
+
st.error(f"{message}")
|
| 364 |
+
|
| 365 |
+
st.markdown("---")
|
| 366 |
+
|
| 367 |
+
# Remove user
|
| 368 |
+
st.markdown("Remove Admin")
|
| 369 |
+
|
| 370 |
+
users = config['credentials']['usernames']
|
| 371 |
+
usernames_list = list(users.keys())
|
| 372 |
+
|
| 373 |
+
if len(usernames_list) > 1:
|
| 374 |
+
with st.form("remove_user_form"):
|
| 375 |
+
user_to_remove = st.selectbox(
|
| 376 |
+
"Select user to remove",
|
| 377 |
+
options=usernames_list,
|
| 378 |
+
key="remove_username"
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
+
st.warning(f"This will permanently delete user: **{user_to_remove}**")
|
| 382 |
+
|
| 383 |
+
confirm_remove = st.checkbox("I confirm I want to remove this user")
|
| 384 |
+
submit_remove = st.form_submit_button("Remove Admin", type="secondary", width='stretch')
|
| 385 |
+
|
| 386 |
+
if submit_remove:
|
| 387 |
+
if not confirm_remove:
|
| 388 |
+
st.error("Please confirm the removal")
|
| 389 |
+
elif user_to_remove == username:
|
| 390 |
+
st.error("You cannot remove yourself!")
|
| 391 |
+
else:
|
| 392 |
+
with st.spinner("Removing user..."):
|
| 393 |
+
success, message, commit_url = user_manager.remove_user(user_to_remove)
|
| 394 |
+
|
| 395 |
+
if success:
|
| 396 |
+
st.success(f"✅ {message}")
|
| 397 |
+
st.cache_data.clear()
|
| 398 |
+
if commit_url:
|
| 399 |
+
with st.expander("View commit"):
|
| 400 |
+
st.code(commit_url)
|
| 401 |
+
st.rerun()
|
| 402 |
+
else:
|
| 403 |
+
st.error(f"{message}")
|
| 404 |
+
else:
|
| 405 |
+
st.info("ℹCannot remove the last admin user")
|
| 406 |
+
|
| 407 |
+
st.markdown("---")
|
| 408 |
+
|
| 409 |
+
# Show current users
|
| 410 |
+
st.markdown("Current Admins")
|
| 411 |
+
for uname, udata in users.items():
|
| 412 |
+
with st.expander(f"{udata.get('name', uname)} (@{uname})"):
|
| 413 |
+
st.write(f"**Email:** {udata.get('email', 'N/A')}")
|
| 414 |
+
st.write(f"**Username:** {uname}")
|
| 415 |
+
st.write(f"**Admin Status:**Admin")
|
| 416 |
+
|
| 417 |
+
if uname == username:
|
| 418 |
+
st.info("This is you!")
|
| 419 |
+
|
| 420 |
+
except Exception as e:
|
| 421 |
+
st.error(f"Error initializing user manager: {e}")
|
| 422 |
+
st.exception(e)
|
| 423 |
+
|
| 424 |
+
else:
|
| 425 |
+
st.warning("Using secrets.toml (read-only)")
|
| 426 |
+
st.info("For persistent user management, add HuggingFace credentials to secrets.toml")
|
| 427 |
+
|
| 428 |
+
with st.expander("How to add users manually"):
|
| 429 |
+
st.markdown("""
|
| 430 |
+
**To add new users when using secrets.toml:**
|
| 431 |
+
|
| 432 |
+
1. **Generate password hash:**
|
| 433 |
+
```bash
|
| 434 |
+
python generate_password_hash.py
|
| 435 |
+
```
|
| 436 |
+
|
| 437 |
+
2. **Add to secrets.toml:**
|
| 438 |
+
```toml
|
| 439 |
+
[auth.credentials.usernames.newuser]
|
| 440 |
+
email = "user@vanderbilt.edu"
|
| 441 |
+
name = "New User"
|
| 442 |
+
password = "$2b$12$HASH_FROM_STEP_1"
|
| 443 |
+
```
|
| 444 |
+
|
| 445 |
+
3. **Update on HuggingFace Spaces** (re-upload secrets.toml)
|
| 446 |
+
|
| 447 |
+
All registered users automatically get admin access.
|
| 448 |
+
""")
|
| 449 |
+
|
| 450 |
+
st.markdown("---")
|
| 451 |
+
|
| 452 |
+
st.markdown("Current Admins")
|
| 453 |
+
if 'credentials' in config and 'usernames' in config['credentials']:
|
| 454 |
+
users = config['credentials']['usernames']
|
| 455 |
+
for uname, udata in users.items():
|
| 456 |
+
with st.expander(f"{udata.get('name', uname)} (@{uname})"):
|
| 457 |
+
st.write(f"**Email:** {udata.get('email', 'N/A')}")
|
| 458 |
+
st.write(f"**Username:** {uname}")
|
| 459 |
+
st.write(f"**Admin Status:Admin")
|
data/update_data.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Pipeline orchestrator for the AI-bills dataset.

Runs the data-updating scripts in order, then uploads the resulting JSON
datasets to HuggingFace.  Intended to be executed directly:

    python update_data.py

The user is prompted whether to pull fresh data from LegiScan; answering
"no" skips the first DATA_PULL_SCRIPT_COUNT scripts and reuses the data
already on disk.
"""

import subprocess
import sys

# Full pipeline, in execution order.  The first DATA_PULL_SCRIPT_COUNT
# entries fetch/repair raw data from LegiScan; everything after them works
# from data already on disk.
ALL_SCRIPTS = [
    "data_updating_scripts/get_data.py",
    "data_updating_scripts/fix_pdf_bills.py",
    "data_updating_scripts/known_bills_status.py",
    "data_updating_scripts/migrate_iapp_categories.py",
    "data_updating_scripts/mark_no_text_bills.py",
    "data_updating_scripts/generate_summaries.py",
    "data_updating_scripts/generate_suggested_questions.py",
    "data_updating_scripts/generate_reports.py",
    "data_updating_scripts/eu_vectorstore.py",
]

# Number of leading scripts that make up the "pull new data" phase
# (previously a bare magic index in `all_scripts[2:]`).
DATA_PULL_SCRIPT_COUNT = 2


def select_scripts(response):
    """Map a yes/no answer to the list of scripts to run.

    Args:
        response: Raw user input; surrounding whitespace and case are ignored.

    Returns:
        The full pipeline for "y"/"yes", the pipeline minus the data-pull
        phase for "n"/"no", or None for any other (invalid) answer.
    """
    answer = response.strip().lower()
    if answer in ("y", "yes"):
        return list(ALL_SCRIPTS)
    if answer in ("n", "no"):
        return ALL_SCRIPTS[DATA_PULL_SCRIPT_COUNT:]
    return None


def _run_pipeline(scripts_to_run):
    """Run each script as a subprocess; on failure, ask whether to continue.

    Exits the process with status 1 if the user declines to continue after
    a failed script.
    """
    for script in scripts_to_run:
        print(f"\n--- Running {script} ---")
        print("=" * 50)

        # Run with the same interpreter as this process; output streams
        # straight to the console (no capture), matching interactive use.
        result = subprocess.run([sys.executable, script])

        if result.returncode != 0:
            print(f"\n✗ Script {script} failed with return code {result.returncode}")
            print("Do you want to continue with the remaining scripts? (y/n):")
            if input().strip().lower() not in ("y", "yes"):
                print("Stopping pipeline execution.")
                sys.exit(1)
        else:
            print(f"✓ {script} completed successfully")


def main():
    """Prompt for the data-pull choice, run the pipeline, then upload."""
    print("Do you want to pull new data from LegiScan?")
    print("Enter 'y' or 'yes' to pull new data, or 'n' or 'no' to skip and use existing data:")
    response = input().strip().lower()

    scripts_to_run = select_scripts(response)
    if scripts_to_run is None:
        print(f"\n✗ Invalid response '{response}'. Please run the script again and enter 'y' or 'n'.")
        sys.exit(1)

    if len(scripts_to_run) == len(ALL_SCRIPTS):
        print("\n✓ Will pull new data from LegiScan")
    else:
        print("\n✓ Skipping data pull, using existing data")

    print(f"\nWill run {len(scripts_to_run)} scripts:")
    for script in scripts_to_run:
        print(f"  - {script}")

    print("\n" + "=" * 50)

    _run_pipeline(scripts_to_run)

    print("\n" + "=" * 50)
    print("✓ Pipeline execution completed!")

    print("\nUploading all JSON datasets to HuggingFace…")

    # Imported lazily so this module can be imported (e.g. to reuse
    # select_scripts) without requiring the project's huggingface_upload
    # module and its dependencies to be importable.
    from huggingface_upload import upload_all_to_huggingface

    try:
        upload_all_to_huggingface()
        print("✓ HuggingFace upload complete!")
    except KeyError as e:
        # Missing credentials/keys in the upload configuration.
        print(f"✗ HuggingFace config error: {e}")
    except Exception as e:
        # Best-effort: the pipeline itself already succeeded, so report
        # the upload failure instead of crashing.
        print(f"✗ Upload failed: {e}")


if __name__ == "__main__":
    main()