ramanna committed on
Commit
0e39328
·
verified ·
1 Parent(s): 1ec6477

Upload 30 files

Browse files
Files changed (31) hide show
  1. .gitattributes +1 -0
  2. data/data_updating_scripts/PROMPTS/__pycache__/bill_summary_prompt.cpython-313.pyc +0 -0
  3. data/data_updating_scripts/PROMPTS/bill_summary_prompt.py +29 -0
  4. data/data_updating_scripts/PROMPTS/suggested_questions_prompt.md +25 -0
  5. data/data_updating_scripts/__pycache__/config.cpython-313.pyc +0 -0
  6. data/data_updating_scripts/build_bills_vectorstore.py +46 -0
  7. data/data_updating_scripts/build_bills_vectorstore_pinecone_delta.py +43 -0
  8. data/data_updating_scripts/config.py +43 -0
  9. data/data_updating_scripts/eu-ai-act.pdf +3 -0
  10. data/data_updating_scripts/eu_vectorstore.py +269 -0
  11. data/data_updating_scripts/fix_pdf_bills.py +282 -0
  12. data/data_updating_scripts/generate_reports.py +274 -0
  13. data/data_updating_scripts/generate_suggested_questions.py +269 -0
  14. data/data_updating_scripts/generate_summaries.py +204 -0
  15. data/data_updating_scripts/get_data.py +251 -0
  16. data/data_updating_scripts/get_data_ORIGINAL.py +251 -0
  17. data/data_updating_scripts/known_bills_status.py +199 -0
  18. data/data_updating_scripts/logs/eu_vectorstore.log +128 -0
  19. data/data_updating_scripts/logs/fetch_ai_bills.log +0 -0
  20. data/data_updating_scripts/logs/fix_pdf_bills.log +0 -0
  21. data/data_updating_scripts/logs/generate_reports.log +0 -0
  22. data/data_updating_scripts/logs/generate_suggested_questions.log +0 -0
  23. data/data_updating_scripts/logs/generate_summaries.log +0 -0
  24. data/data_updating_scripts/logs/mark_no_text_bills.log +293 -0
  25. data/data_updating_scripts/logs/migrate_iapp_categories.log +0 -0
  26. data/data_updating_scripts/mark_no_text_bills.py +120 -0
  27. data/data_updating_scripts/migrate_iapp_categories.py +358 -0
  28. data/generate_password_hash.py +135 -0
  29. data/huggingface_upload.py +251 -0
  30. data/pages/Admin.py +459 -0
  31. data/update_data.py +64 -0
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  data/eu_ai_act_vectorstore/index.faiss filter=lfs diff=lfs merge=lfs -text
37
  data/known_bills_visualize.json filter=lfs diff=lfs merge=lfs -text
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  data/eu_ai_act_vectorstore/index.faiss filter=lfs diff=lfs merge=lfs -text
37
  data/known_bills_visualize.json filter=lfs diff=lfs merge=lfs -text
38
+ data/data_updating_scripts/eu-ai-act.pdf filter=lfs diff=lfs merge=lfs -text
data/data_updating_scripts/PROMPTS/__pycache__/bill_summary_prompt.cpython-313.pyc ADDED
Binary file (1.37 kB). View file
 
data/data_updating_scripts/PROMPTS/bill_summary_prompt.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# PROMPTS/bill_summary_prompt.py
#
# Prompt template for generating bill summaries. The caller fills in the
# {bill_number}, {bill_title}, {state} and {bill_text} placeholders via
# str.format() (or a LangChain prompt template) before sending to the LLM.
BILL_SUMMARY_PROMPT = """
You are an expert legislative analyst specializing in AI governance and technology policy. Your task is to provide a clear, concise summary of the given bill text.

Please analyze the bill and provide a comprehensive summary that includes:

1. **Main Purpose**: What is the primary objective of this bill?
2. **Key Provisions**: What are the main requirements, prohibitions, or authorizations?
3. **AI-Related Elements**: How does this bill relate to artificial intelligence, if at all?
4. **Scope and Impact**: Who does this bill affect and what are the potential consequences?
5. **Implementation**: What mechanisms or processes does the bill establish?

**Requirements:**
- Keep the summary concise but comprehensive (aim for 200-400 words)
- Use clear, professional language
- Focus on the most important aspects of the bill
- If the bill is not related to AI, clearly state this
- Structure the response with clear sections using markdown formatting

**Bill Information:**
- Bill Number: {bill_number}
- Bill Title: {bill_title}
- State: {state}

**Bill Text:**
{bill_text}

Please provide your analysis:
"""
data/data_updating_scripts/PROMPTS/suggested_questions_prompt.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an AI governance legislation expert. Your task is to analyze the provided bill and generate exactly 5 relevant, specific questions that users might want to ask about this particular bill.
2
+
3
+ The questions should:
4
+ - Be specific to the content and provisions of this bill
5
+ - Cover different aspects of the legislation (definitions, scope, enforcement, compliance, etc.)
6
+ - Be phrased as user-friendly questions that someone analyzing AI governance would ask
7
+ - Be practical and actionable for understanding the bill's impact
8
+ - Avoid generic questions that could apply to any bill
9
+
10
+ Focus on aspects like:
11
+ - Key definitions and terminology
12
+ - Scope and applicability
13
+ - Enforcement mechanisms and penalties
14
+ - Compliance requirements
15
+ - Rights and obligations
16
+ - Implementation timelines
17
+ - Regulatory oversight
18
+ - Specific AI technologies or systems mentioned
19
+
20
+ Format your response as exactly 5 questions, one per line, with no numbering or bullet points. Each question should be complete and ready to use.
21
+
22
+ ### Bill Content
23
+ {context}
24
+
25
+ Generate 5 specific questions about this bill:
data/data_updating_scripts/__pycache__/config.cpython-313.pyc ADDED
Binary file (2.43 kB). View file
 
data/data_updating_scripts/build_bills_vectorstore.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Build or update the bills vectorstore from a bills JSON dump.

The backend (local Chroma store or Pinecone index) is selected with
--backend or the VECTOR_BACKEND environment variable.
"""
# Fix: `Path` was imported twice (once for the .env lookup, once for the
# sys.path tweak); imports are now consolidated at the top.
import argparse
import os
import sys
from pathlib import Path

from dotenv import load_dotenv

# Load .env from the current working directory before reading env vars below.
load_dotenv(dotenv_path=Path.cwd() / ".env")

# Make the parent directory (which contains the `vectorstore` package)
# importable when this file is run as a script.
sys.path.append(str(Path(__file__).resolve().parents[1]))

def main():
    """Parse CLI arguments and upsert bill documents into the chosen backend."""
    p = argparse.ArgumentParser()
    p.add_argument("--source", default="data/known_bills_visualize.json")
    p.add_argument("--backend", choices=["chroma", "pinecone"],
                   default=os.getenv("VECTOR_BACKEND", "chroma"))
    p.add_argument("--persist", default="data/bills_vectorstore")
    p.add_argument("--collection", default="bills")
    p.add_argument("--manifest", default="data/bills_vectorstore_manifest.json")
    p.add_argument("--model", default=None)
    p.add_argument("--batch", type=int, default=128)
    args = p.parse_args()

    if args.backend == "pinecone":
        # Pinecone manages its own index, so no persist dir / collection here.
        from vectorstore.pinecone_bills_vectorstore import upsert_from_bills_json
        stats = upsert_from_bills_json(
            source_json_path=args.source,
            manifest_path=args.manifest,
            embed_model=args.model,
            batch_size=args.batch,
        )
    else:
        from vectorstore.bills_vectorstore import upsert_from_bills_json
        stats = upsert_from_bills_json(
            source_json_path=args.source,
            persist_dir=args.persist,
            collection=args.collection,
            manifest_path=args.manifest,
            embed_model=args.model,
            batch_size=args.batch,
        )

    print("✅ Vectorstore updated")
    for k, v in stats.items():
        print(f" {k}: {v}")

if __name__ == "__main__":
    main()
data/data_updating_scripts/build_bills_vectorstore_pinecone_delta.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Delta-upsert bill chunks into Pinecone, writing only changed vectors."""
import os, json, time
from pathlib import Path
from typing import List, Dict, Any
from dotenv import load_dotenv
import sys
sys.path.append(str(Path(__file__).resolve().parents[1]))

load_dotenv(dotenv_path=Path.cwd() / ".env")

from vectorstore.pinecone_delta_upsert import chunk_bill, upsert_changed_vectors

SRC = "data/known_bills_visualize.json"
BATCH = int(os.getenv("PINECONE_BATCH", "128"))

def main():
    """Chunk every bill with usable text and delta-upsert the chunks."""
    src = Path(SRC)
    if not src.exists():
        raise SystemExit(f"Missing {SRC}")

    bills: List[Dict[str, Any]] = json.loads(src.read_text(encoding="utf-8"))
    # Only bills with at least some text-like content are worth embedding.
    bills = [b for b in bills if (b.get("text") or b.get("description") or b.get("title"))]

    chunks: List[Dict[str, Any]] = []
    for bill in bills:
        chunks.extend(chunk_bill(bill))

    print(f"Total chunks computed: {len(chunks):,}")

    changed = 0
    started = time.time()
    # Walk the chunk list in BATCH-sized slices; log progress every 10 batches.
    for batch_no, start in enumerate(range(0, len(chunks), BATCH)):
        batch = chunks[start:start + BATCH]
        changed += upsert_changed_vectors(batch)
        if batch_no % 10 == 0:
            print(f"… {start + len(batch):,}/{len(chunks):,} processed")
    dt = time.time() - started

    print("✅ Pinecone delta upsert complete")
    print(f" changed_upserts: {changed}")
    print(f" elapsed_sec: {dt:.1f}")

if __name__ == "__main__":
    main()
data/data_updating_scripts/config.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration settings for LegiScan AI Governance Bills Tracker."""
2
+ import os
3
+ from pathlib import Path
4
+ import dotenv
5
+
6
+ dotenv.load_dotenv()
7
+
8
class ConfigManager:
    """Runtime configuration loaded from environment variables.

    Values are read once at construction; call reload() to re-read them
    after the environment (or .env file) has changed.
    """

    def __init__(self):
        self._load_base_config()

    def _load_base_config(self):
        """Load base configuration that applies to all profiles."""
        self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        # Falls back to gpt-4o when OPENAI_LLM_MODEL is not set.
        self.OPENAI_LLM_MODEL = os.getenv("OPENAI_LLM_MODEL", "gpt-4o")

    def reload(self):
        """Re-read configuration from the environment.

        Bug fix: the previous implementation also called
        _load_profile_config() and _validate_config(), neither of which is
        defined on this class, so reload() always raised AttributeError.
        """
        self._load_base_config()

    def __str__(self) -> str:
        """Return string representation of non-sensitive config.

        Secret values are masked; keys starting with "_" are skipped.
        """
        sensitive_keys = ["OPENAI_API_KEY", "LEGISCAN_API_KEY"]
        # Bug fix: `self.profile` is never assigned anywhere in this class,
        # so reading it directly raised AttributeError; fall back to
        # "default" when no profile has been set.
        config_str = f"Configuration Profile: {getattr(self, 'profile', 'default')}\n"
        for key, value in self.__dict__.items():
            if key.startswith("_"):
                continue
            if key in sensitive_keys:
                config_str += f"{key}: {'*' * 8}\n"
            else:
                config_str += f"{key}: {value}\n"
        return config_str

# Create default instance
config = ConfigManager()
data/data_updating_scripts/eu-ai-act.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bba630444b3278e881066774002a1d7824308934f49ccfa203e65be43692f55e
3
+ size 2583319
data/data_updating_scripts/eu_vectorstore.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # scripts/create_eu_ai_act_vectorstore.py
3
+
4
+ """
5
+ Script to create and save a vectorstore from the EU AI Act PDF.
6
+ This creates a FAISS vectorstore that can be loaded quickly in the main app.
7
+ """
8
+
9
+ import os
10
+ import logging
11
+ from pathlib import Path
12
+ import pickle
13
+ from typing import Optional
14
+ import dotenv
15
+
16
+ # Import config
17
+ from config import config
18
+
19
+ # PDF processing
20
+ import PyPDF2
21
+
22
+ # LangChain components
23
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
24
+ from langchain_openai import OpenAIEmbeddings
25
+ from langchain_community.vectorstores import FAISS
26
+ from langchain.schema import Document
27
+
28
# Load environment variables
dotenv.load_dotenv()

# Create logs directory if it doesn't exist
os.makedirs("data_updating_scripts/logs", exist_ok=True)

# Configure logging to both the console and a log file.
# NOTE(review): the FileHandler path is relative to the CWD — run this
# script from the repo root so the log lands in data_updating_scripts/logs/.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/eu_vectorstore.log")],
)

# Module-level logger used by every function below.
logger = logging.getLogger(__name__)
42
+
43
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF file, page by page.

    Each page's text is prefixed with a "--- Page N ---" marker; pages that
    fail to extract are skipped with a warning.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The concatenated text of all extractable pages.

    Raises:
        Re-raises whatever open()/PyPDF2 raise if the file cannot be read.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""

            logger.info(f"Processing {len(pdf_reader.pages)} pages from {pdf_path}")

            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    text += f"\n\n--- Page {page_num + 1} ---\n\n{page_text}"
                except Exception as e:
                    # A single corrupt page shouldn't abort the whole extraction.
                    logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
                    continue

            logger.info(f"Extracted {len(text)} characters from PDF")
            return text

    except Exception as e:
        logger.error(f"Error reading PDF {pdf_path}: {e}")
        # Fix: bare `raise` re-raises with the original traceback intact
        # (was `raise e`, which adds a redundant frame).
        raise
66
+
67
def create_eu_ai_act_documents(text_content: str) -> list:
    """Split the EU AI Act text into chunked Document objects with metadata.

    Args:
        text_content: Full extracted text of the regulation.

    Returns:
        List of LangChain Document chunks, each tagged with chunk_id and
        total_chunks in addition to the shared source metadata.

    Raises:
        Re-raises any splitter/Document error after logging it.
    """
    try:
        # Larger chunks with generous overlap preserve context in legal prose.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

        # Wrap the whole act in one Document carrying the source metadata,
        # which split_documents() copies onto every chunk.
        doc = Document(
            page_content=text_content,
            metadata={
                'source': 'EU AI Act',
                'document_type': 'regulation',
                'jurisdiction': 'European Union',
                'title': 'Regulation (EU) 2024/1689 on Artificial Intelligence (AI Act)'
            }
        )

        splits = text_splitter.split_documents([doc])

        # Tag each chunk with its position so retrieval hits can be ordered.
        for i, split in enumerate(splits):
            split.metadata.update({
                'chunk_id': i,
                'total_chunks': len(splits)
            })

        logger.info(f"Created {len(splits)} document chunks")
        return splits

    except Exception as e:
        logger.error(f"Error creating documents: {e}")
        # Fix: bare `raise` preserves the original traceback (was `raise e`).
        raise
105
+
106
def create_and_save_eu_vectorstore(
    pdf_path: str = "data_updating_scripts/eu-ai-act.pdf",
    vectorstore_path: str = "data/eu_ai_act_vectorstore",
    openai_api_key: Optional[str] = None
) -> bool:
    """
    Create FAISS vectorstore from EU AI Act PDF and save it locally.

    Args:
        pdf_path: Path to the EU AI Act PDF file
        vectorstore_path: Directory to save the vectorstore
        openai_api_key: OpenAI API key (if not provided, uses environment variable)

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Check if PDF exists
        if not Path(pdf_path).exists():
            logger.error(f"PDF file not found: {pdf_path}")
            return False

        # Get API key
        api_key = openai_api_key or config.OPENAI_API_KEY
        if not api_key:
            logger.error("OpenAI API key not found")
            return False

        logger.info("Starting EU AI Act vectorstore creation...")

        # Extract text from PDF
        logger.info("Extracting text from PDF...")
        text_content = extract_text_from_pdf(pdf_path)

        # Sanity check — the full regulation should yield far more than 1 kB.
        if not text_content or len(text_content) < 1000:
            logger.error("Insufficient text extracted from PDF")
            return False

        # Create documents
        logger.info("Creating document chunks...")
        documents = create_eu_ai_act_documents(text_content)

        if not documents:
            logger.error("No documents created")
            return False

        # Initialize embeddings
        logger.info("Initializing embeddings...")
        embeddings = OpenAIEmbeddings(
            api_key=api_key,
            model="text-embedding-3-small"
        )

        # Create vectorstore
        logger.info("Creating FAISS vectorstore...")
        vectorstore = FAISS.from_documents(documents, embeddings)

        # Fix: also create missing parent directories — plain
        # mkdir(exist_ok=True) raises FileNotFoundError when e.g. "data/"
        # does not exist yet.
        Path(vectorstore_path).mkdir(parents=True, exist_ok=True)

        # Save vectorstore
        logger.info(f"Saving vectorstore to {vectorstore_path}...")
        vectorstore.save_local(vectorstore_path)

        # Save build metadata next to the index for later auditing.
        metadata = {
            'pdf_path': pdf_path,
            'total_chunks': len(documents),
            'text_length': len(text_content),
            'embedding_model': 'text-embedding-3-small',
            'chunk_size': 1500,
            'chunk_overlap': 200
        }

        metadata_path = Path(vectorstore_path) / "metadata.pickle"
        with open(metadata_path, 'wb') as f:
            pickle.dump(metadata, f)

        logger.info(f"✅ EU AI Act vectorstore created successfully!")
        logger.info(f" - Total chunks: {len(documents)}")
        logger.info(f" - Text length: {len(text_content):,} characters")
        logger.info(f" - Saved to: {vectorstore_path}")

        return True

    except Exception as e:
        logger.error(f"Error creating EU AI Act vectorstore: {e}")
        return False
194
+
195
def load_eu_vectorstore(
    vectorstore_path: str = "data/eu_ai_act_vectorstore",
    openai_api_key: Optional[str] = None
) -> Optional[FAISS]:
    """
    Load the EU AI Act vectorstore from disk.

    Fix: the default path now matches where create_and_save_eu_vectorstore()
    writes and get_vectorstore_info() reads ("data/eu_ai_act_vectorstore");
    it previously defaulted to a bare "eu_ai_act_vectorstore" that is never
    created by this module.

    Args:
        vectorstore_path: Path to the saved vectorstore
        openai_api_key: OpenAI API key

    Returns:
        FAISS vectorstore or None if failed
    """
    try:
        if not Path(vectorstore_path).exists():
            logger.error(f"Vectorstore not found: {vectorstore_path}")
            return None

        # Get API key
        api_key = openai_api_key or config.OPENAI_API_KEY
        if not api_key:
            logger.error("OpenAI API key not found")
            return None

        # Must use the same embedding model the store was built with.
        embeddings = OpenAIEmbeddings(
            api_key=api_key,
            model="text-embedding-3-small"
        )

        # Load vectorstore
        vectorstore = FAISS.load_local(
            vectorstore_path,
            embeddings,
            allow_dangerous_deserialization=True  # Required for loading pickled objects
        )

        logger.info(f"✅ EU AI Act vectorstore loaded from {vectorstore_path}")
        return vectorstore

    except Exception as e:
        logger.error(f"Error loading EU AI Act vectorstore: {e}")
        return None
239
+
240
def get_vectorstore_info(vectorstore_path: str = "data/eu_ai_act_vectorstore") -> dict:
    """Return the build metadata saved alongside the vectorstore.

    Returns a dict with a single "error" key when the metadata file is
    missing or cannot be unpickled.
    """
    metadata_path = Path(vectorstore_path) / "metadata.pickle"
    try:
        if not metadata_path.exists():
            return {"error": "Metadata not found"}
        with open(metadata_path, 'rb') as f:
            return pickle.load(f)
    except Exception as e:
        return {"error": str(e)}
252
+
253
if __name__ == "__main__":
    # Build the vectorstore from the bundled EU AI Act PDF (uses the
    # function's default paths).
    success = create_and_save_eu_vectorstore()

    if success:
        # Print the saved metadata so the operator can sanity-check the run.
        info = get_vectorstore_info()
        print("\n" + "="*50)
        print("EU AI Act Vectorstore Information:")
        print("="*50)
        for key, value in info.items():
            if key != 'error':
                print(f"{key}: {value}")
        print("="*50)
    else:
        print("❌ Failed to create EU AI Act vectorstore")
        # Non-zero exit code signals failure to calling scripts/CI.
        exit(1)
data/data_updating_scripts/fix_pdf_bills.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import base64
4
+ import logging
5
+ import sys
6
+ from datetime import datetime, timezone
7
+ import requests
8
+ from dotenv import load_dotenv
9
+ import PyPDF2
10
+ from io import BytesIO
11
+ import re
12
+ import shutil
13
+
14
+ # Load environment variables
15
+ load_dotenv()
16
+ API_KEY = os.getenv("LEGISCAN_API_KEY")
17
+
18
+ # Files
19
+ INPUT_FILE = "data/known_bills.json"
20
+ OUTPUT_FILE = "data/known_bills_fixed.json"
21
+ BACKUP_FILE = "data/known_bills_backup.json"
22
+
23
+ # Rate limiting
24
+ import time
25
+ RATE_LIMIT = 0.2 # seconds between API requests
26
+
27
+ # Logging configuration
28
+ LOG_FILE = "data_updating_scripts/logs/fix_pdf_bills.log"
29
+ os.makedirs("data_updating_scripts/logs", exist_ok=True)
30
+
31
+ logging.basicConfig(
32
+ level=logging.INFO,
33
+ format="%(asctime)s [%(levelname)s] %(message)s",
34
+ handlers=[
35
+ logging.StreamHandler(sys.stdout),
36
+ logging.FileHandler(LOG_FILE)
37
+ ]
38
+ )
39
+ logger = logging.getLogger(__name__)
40
+
41
+
42
def is_pdf_content(text):
    """Return True if `text` looks like a raw (unprocessed) PDF document.

    LegiScan sometimes returns the undecoded PDF body as the bill "text";
    such content starts with a %PDF version header (occasionally with the
    dash missing).
    """
    if not text:
        return False
    # Both the standard "%PDF-1.x" header and the malformed "%PDF1.x"
    # variant are accepted. str.startswith() takes a tuple, so one call
    # replaces the manual any() loop; the slice never raises, so the old
    # length-guarded conditional was redundant.
    pdf_signatures = ("%PDF-1.3", "%PDF-1.4", "%PDF-1.5", "%PDF-1.6", "%PDF-1.7",
                      "%PDF1.3", "%PDF1.4", "%PDF1.5", "%PDF1.6", "%PDF1.7")
    return text[:20].startswith(pdf_signatures)
50
+
51
+
52
def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract plain text from in-memory PDF bytes using PyPDF2.

    Args:
        pdf_bytes: Raw PDF document content.

    Returns:
        Whitespace-normalized text, or None if extraction fails.
    """
    try:
        pdf_file = BytesIO(pdf_bytes)
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Collect per-page text, skipping pages with nothing extractable.
        # (Fix: iterate pages directly instead of `range(len(pages))`.)
        text_content = []
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:
                text_content.append(page_text)

        full_text = "\n".join(text_content)

        # Clean up the extracted text:
        # collapse 3+ newlines to a paragraph break and runs of spaces to one.
        full_text = re.sub(r'\n{3,}', '\n\n', full_text)
        full_text = re.sub(r' {2,}', ' ', full_text)
        full_text = full_text.strip()

        return full_text
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return None
77
+
78
+
79
def legi_request(op, params):
    """Call LegiScan API operation `op` and return the parsed JSON.

    Returns None on transport errors or when the API reports a non-OK
    status. Note: mutates `params` by adding the key and op fields.
    """
    base = "https://api.legiscan.com/"
    params.update({"key": API_KEY, "op": op})
    try:
        resp = requests.get(base, params=params, timeout=10)
        resp.raise_for_status()
        data = resp.json()
    except requests.RequestException as e:
        logger.error(f"Request failed ({op}): {e}")
        return None
    if data.get("status") != "OK":
        logger.error(f"API error {op}: {data.get('message', data)}")
        return None
    return data
94
+
95
+
96
def fix_pdf_bill(bill):
    """Re-fetch a bill's document from LegiScan and decode it to plain text.

    Args:
        bill: Bill record dict (needs bill_id; state/bill_number for logging).

    Returns:
        The extracted text on success, or None when the document could not
        be fetched or decoded to meaningful text.
    """
    bill_id = bill.get("bill_id")
    state = bill.get("state")
    bill_num = bill.get("bill_number")

    logger.info(f"Fixing PDF content for {state} {bill_num} (ID: {bill_id})")

    # First, try to get the bill details again
    details_resp = legi_request("getBill", {"id": bill_id})
    if not details_resp:
        logger.warning(f"Could not fetch bill details for {bill_id}")
        return None

    details = details_resp.get("bill", {})
    texts = details.get("texts", [])

    if not texts:
        logger.warning(f"No text documents available for {bill_id}")
        return None

    # Try to get the first text document (base64-encoded payload)
    doc_id = texts[0].get("doc_id")
    text_resp = legi_request("getBillText", {"id": doc_id})

    if not text_resp or "text" not in text_resp:
        logger.warning(f"Could not fetch text for {bill_id}")
        return None

    raw_b64 = text_resp["text"].get("doc", "")
    if not raw_b64:
        logger.warning(f"No document content for {bill_id}")
        return None

    try:
        # Decode the base64 content
        decoded = base64.b64decode(raw_b64)

        # Check if it's a PDF by looking at the magic bytes
        if decoded[:4] == b'%PDF':
            # It's a PDF, extract text
            extracted_text = extract_text_from_pdf_bytes(decoded)
            if extracted_text and len(extracted_text.strip()) > 100:  # Ensure we got meaningful text
                logger.info(f"Successfully extracted {len(extracted_text)} characters from PDF for {bill_id}")
                return extracted_text
            else:
                logger.warning(f"Extracted text too short or empty for {bill_id}")
                return None
        else:
            # Try to decode as HTML (shouldn't happen for these cases, but just in case)
            try:
                from bs4 import BeautifulSoup
                html = decoded.decode("utf-8", errors="ignore")
                soup = BeautifulSoup(html, "html.parser")
                plain_text = soup.get_text(separator="\n", strip=True)
                if plain_text and len(plain_text.strip()) > 100:
                    logger.info(f"Successfully extracted HTML text for {bill_id}")
                    return plain_text
            except Exception as e:
                # Fix: was a bare `except: pass`, which also swallowed
                # SystemExit/KeyboardInterrupt and hid the failure entirely.
                logger.debug(f"HTML fallback failed for {bill_id}: {e}")

            logger.warning(f"Could not process document for {bill_id}")
            return None

    except Exception as e:
        logger.error(f"Error processing document for {bill_id}: {e}")
        return None
163
+
164
+
165
def main(overwrite: bool | None = None):
    """Find bills whose text is raw PDF bytes and replace it with extracted text.

    Reads INPUT_FILE, backs it up to BACKUP_FILE, writes results to
    OUTPUT_FILE, and optionally copies OUTPUT_FILE back over INPUT_FILE.

    Args:
        overwrite: True/False to overwrite INPUT_FILE without prompting
            (non-interactive pipelines); None to ask interactively.
    """
    # Load the bills
    logger.info(f"Loading bills from {INPUT_FILE}")
    try:
        with open(INPUT_FILE, 'r') as f:
            bills = json.load(f)
    except Exception as e:
        logger.error(f"Could not load bills file: {e}")
        sys.exit(1)

    logger.info(f"Loaded {len(bills)} bills")

    # Create a backup before mutating anything.
    logger.info(f"Creating backup at {BACKUP_FILE}")
    with open(BACKUP_FILE, 'w') as f:
        json.dump(bills, f, indent=2)

    # Find bills with unprocessed PDF content (indices into `bills`).
    pdf_bills = []
    for i, bill in enumerate(bills):
        if is_pdf_content(bill.get("text")):
            pdf_bills.append(i)

    logger.info(f"Found {len(pdf_bills)} bills with unprocessed PDF content")

    # Process each PDF bill
    fixed_count = 0
    failed_count = 0

    for idx, bill_idx in enumerate(pdf_bills):
        bill = bills[bill_idx]
        logger.info(f"Processing {idx + 1}/{len(pdf_bills)}: {bill.get('state')} {bill.get('bill_number')}")

        # Try to fix the PDF content
        fixed_text = fix_pdf_bill(bill)

        if fixed_text:
            # Update the bill with the fixed text
            bills[bill_idx]["text"] = fixed_text
            bills[bill_idx]["lastUpdatedAt"] = datetime.now(timezone.utc).isoformat()
            bills[bill_idx]["text_fixed"] = True  # Mark that we fixed this
            fixed_count += 1
            logger.info(f"Successfully fixed bill {bill.get('bill_id')}")
        else:
            # Mark that we tried but failed (so reruns can skip/inspect these).
            bills[bill_idx]["text_extraction_failed"] = True
            bills[bill_idx]["lastUpdatedAt"] = datetime.now(timezone.utc).isoformat()
            failed_count += 1
            logger.warning(f"Failed to fix bill {bill.get('bill_id')}")

        # Rate limiting between LegiScan API calls.
        time.sleep(RATE_LIMIT)

        # Save progress every 50 bills so a crash loses at most 50 results.
        if (idx + 1) % 50 == 0:
            logger.info(f"Saving progress... ({idx + 1}/{len(pdf_bills)} processed)")
            with open(OUTPUT_FILE, 'w') as f:
                json.dump(bills, f, indent=2)

    # Save final results
    logger.info(f"Saving final results to {OUTPUT_FILE}")
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(bills, f, indent=2)

    logger.info(f"Processing complete!")
    logger.info(f"Successfully fixed: {fixed_count} bills")
    logger.info(f"Failed to fix: {failed_count} bills")
    logger.info(f"Output saved to: {OUTPUT_FILE}")

    if fixed_count > 0:
        # Decide overwrite behavior
        if overwrite is None:
            # CLI mode: ask the user (guardrail preserved)
            try:
                response = input(
                    f"\nDo you want to overwrite {INPUT_FILE} with the fixed data? (y/n): "
                )
            except EOFError:
                logger.error(
                    "No input available (EOF). Leaving original file unchanged."
                )
                return
            overwrite_flag = response.strip().lower().startswith("y")
        else:
            # Non-interactive mode (e.g. Streamlit pipeline)
            overwrite_flag = overwrite

        if overwrite_flag:
            shutil.copy2(OUTPUT_FILE, INPUT_FILE)
            logger.info(f"Original file {INPUT_FILE} has been updated with fixed data.")
        else:
            logger.info("Overwrite declined; original file left unchanged.")
257
+
258
+
259
+
260
if __name__ == "__main__":
    # If running under Streamlit / pipeline, we expect FIX_PDF_OVERWRITE in env:
    #   "yes", "y", "true", "1"  -> overwrite=True
    #   "no", "n", "false", "0"  -> overwrite=False
    # If it's not set, we fall back to CLI mode and ask via input().
    env_choice = os.getenv("FIX_PDF_OVERWRITE")

    if env_choice is None:
        # Local CLI run → still interactive
        main(overwrite=None)
    else:
        choice = env_choice.strip().lower()
        if choice in ("yes", "y", "true", "1"):
            main(overwrite=True)
        elif choice in ("no", "n", "false", "0"):
            main(overwrite=False)
        else:
            # Unrecognized value: fail safe by NOT overwriting the input file.
            logger.warning(
                f"Invalid FIX_PDF_OVERWRITE='{env_choice}', defaulting to no overwrite."
            )
            main(overwrite=False)
281
+
282
+
data/data_updating_scripts/generate_reports.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ generate_reports.py
3
+ --------------------
4
+
5
+ Generates detailed Markdown reports for AI-related bills from `known_bills_visualize.json`
6
+ using the latest LangChain pipeline syntax.
7
+
8
+ Now includes resume functionality - can be safely stopped and restarted.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import logging
15
+ import os
16
+ import time
17
+ from dataclasses import dataclass
18
+ from typing import Any, Dict, List, Optional
19
+ import dotenv
20
+
21
+
22
+ dotenv.load_dotenv()
23
+
24
+ # Create logs directory if it doesn't exist
25
+ os.makedirs("data_updating_scripts/logs", exist_ok=True)
26
+
27
+ # Latest LangChain imports
28
+ try:
29
+ from langchain_openai import ChatOpenAI
30
+ from langchain.prompts import ChatPromptTemplate
31
+ except ImportError: # pragma: no cover
32
+ ChatOpenAI = None # type: ignore
33
+ ChatPromptTemplate = None # type: ignore
34
+
35
+ # Configure logging
36
+ logging.basicConfig(
37
+ level=logging.INFO,
38
+ format="%(asctime)s [%(levelname)s] %(message)s",
39
+ handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/generate_reports.log")],
40
+ )
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
@dataclass
class BillReport:
    """Stores a bill ID and its generated detailed report."""
    # LegiScan bill identifier, kept as a string for stable JSON keys.
    bill_id: str
    # Full Markdown text of the LLM-generated report for this bill.
    report_markdown: str
50
+
51
+
52
+ # Prompt template
53
+ DETAILED_REPORT_PROMPT = ChatPromptTemplate.from_template(
54
+ """You are a seasoned legislative analyst adept at interpreting and
55
+ summarising bills related to artificial intelligence. Using the bill
56
+ information provided as JSON, produce a detailed report in Markdown
57
+ format for stakeholders.
58
+
59
+ Include:
60
+ - Bill's title, number, and state
61
+ - Status and key dates
62
+ - URL to the bill on legiscan
63
+ - Sponsors and scope
64
+ - Goals and intent
65
+ - Key provisions, regulatory approaches, implementation & enforcement
66
+ - Unique aspects or notable features
67
+
68
+ Format:
69
+ - Use Markdown headings and bullet points
70
+ - Paraphrase content
71
+ - Do not invent facts
72
+ - If bill text is truncated in source JSON, note this at the end
73
+
74
+ Bill JSON:
75
+ ```json
76
+ {bill_json}
77
+ ```
78
+
79
+ Now craft the detailed report.
80
+ """
81
+ )
82
+
83
+
84
def _ensure_llm() -> ChatOpenAI:
    """Build and return a ChatOpenAI client, validating prerequisites first.

    Raises:
        RuntimeError: if langchain/openai are not installed, or if the
            OPENAI_API_KEY environment variable is unset.
    """
    # Guard clause: the optional import at module top may have failed.
    if ChatOpenAI is None:
        raise RuntimeError(
            "The 'langchain' and 'openai' packages are required. Install them via 'pip install langchain openai'."
        )

    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError("The OPENAI_API_KEY environment variable is not set.")

    # Model name is overridable through the environment; gpt-4o is the default.
    chosen_model = os.getenv("MODEL_NAME", "gpt-4o")
    logger.debug("Initialising ChatOpenAI with model %s", chosen_model)
    return ChatOpenAI(model=chosen_model, temperature=0)
96
+
97
+
98
def create_detailed_report(
    bill: Dict[str, Any], *, llm: Optional[ChatOpenAI] = None
) -> BillReport:
    """Produce a Markdown report for a single bill via the prompt | llm chain."""
    # Build the client lazily so callers may inject their own instance.
    active_llm = llm if llm is not None else _ensure_llm()

    serialized_bill = json.dumps(bill, ensure_ascii=False, indent=2)

    # Modern LCEL composition: prompt piped into the model, then invoked.
    response = (DETAILED_REPORT_PROMPT | active_llm).invoke(
        {"bill_json": serialized_bill}
    )

    # AIMessage exposes .content; fall back to str() for other return types.
    markdown = getattr(response, "content", str(response))
    return BillReport(bill_id=str(bill.get("bill_id")), report_markdown=markdown)
115
+
116
+
117
def load_existing_reports(output_path: str) -> Dict[str, str]:
    """Return previously generated reports keyed by bill_id (empty if none)."""
    # Nothing on disk yet: first run, start fresh.
    if not os.path.exists(output_path):
        return {}
    try:
        with open(output_path, "r", encoding="utf-8") as handle:
            stored = json.load(handle)
        # The file stores a list of {bill_id, report_markdown} records;
        # re-index by bill_id and drop malformed entries.
        by_id = {
            entry["bill_id"]: entry["report_markdown"]
            for entry in stored
            if "bill_id" in entry and "report_markdown" in entry
        }
        logger.info(f"Loaded {len(by_id)} existing reports from {output_path}")
        return by_id
    except Exception as e:
        # A corrupt file should not abort the run; regenerate from scratch.
        logger.warning(f"Could not load existing reports: {e}")
        return {}
135
+
136
+
137
def save_reports_to_file(reports_dict: Dict[str, str], output_path: str) -> None:
    """Persist the report mapping as a JSON list of records."""
    # Serialise as a list of objects so the on-disk format stays stable.
    records = [
        {"bill_id": key, "report_markdown": value}
        for key, value in reports_dict.items()
    ]
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)
    logger.info("Saved %d reports to %s", len(records), output_path)
147
+
148
+
149
def create_reports_with_resume(
    bills: List[Dict[str, Any]],
    output_path: str,
    *,
    llm: Optional[ChatOpenAI] = None,
    save_interval: int = 10
) -> Dict[str, str]:
    """
    Generate detailed reports for multiple bills with resume capability.

    Previously generated non-error reports found in ``output_path`` are kept
    and their bills skipped, so the run can be safely stopped and restarted.

    Args:
        bills: List of bill dictionaries
        output_path: Path to save reports
        llm: Optional LLM instance
        save_interval: Save progress every N bills

    Returns:
        Dictionary of bill_id -> report_markdown
    """
    if not bills:
        return {}

    if llm is None:
        llm = _ensure_llm()

    # Resume support: start from whatever is already on disk.
    reports_dict = load_existing_reports(output_path)

    # Progress counters for the summary log at the end.
    total_bills = len(bills)
    processed = 0
    skipped = 0
    errors = 0

    logger.info(f"Starting report generation for {total_bills} bills")

    for i, bill in enumerate(bills, 1):
        bill_id = str(bill.get("bill_id"))

        # Skip bills that already have a non-error report from a prior run.
        if bill_id in reports_dict and reports_dict[bill_id] and not reports_dict[bill_id].startswith("ERROR:"):
            logger.info(f"Skipping bill {bill_id} - already processed ({i}/{total_bills})")
            skipped += 1
            continue

        logger.info(f"Processing {i}/{total_bills}: Bill ID {bill_id}")

        try:
            report = create_detailed_report(bill, llm=llm)
            reports_dict[bill_id] = report.report_markdown
            processed += 1

        except Exception as exc:
            # Record the failure with an ERROR: prefix so a later run retries it.
            logger.exception(
                "Failed to generate report for bill %s: %s", bill_id, exc
            )
            reports_dict[bill_id] = f"ERROR: Failed to generate report - {str(exc)}"
            errors += 1

        # Save progress periodically so an interrupted run loses little work.
        if i % save_interval == 0:
            save_reports_to_file(reports_dict, output_path)
            logger.info(f"Progress: {i}/{total_bills} - Processed: {processed}, Skipped: {skipped}, Errors: {errors}")

        # Rate limiting to avoid API throttling.  Every iteration reaching this
        # point made an API call (skipped bills `continue` above), so always
        # pause.  The previous condition (`bill_id not in reports_dict or
        # ...startswith("ERROR:")`) only slept after failures, because the id
        # is always inserted above — successful calls were never throttled.
        time.sleep(1)  # 1 second delay between API calls

    # Final save
    save_reports_to_file(reports_dict, output_path)

    logger.info(f"Report generation complete!")
    logger.info(f"Total bills: {total_bills}")
    logger.info(f"Successfully processed: {processed}")
    logger.info(f"Skipped (already done): {skipped}")
    logger.info(f"Errors: {errors}")

    return reports_dict
227
+
228
+
229
def read_bills_from_file(path: str) -> List[Dict[str, Any]]:
    """Load and validate the JSON list of bill records stored at *path*."""
    with open(path, "r", encoding="utf-8") as handle:
        data = json.load(handle)
    # Downstream code iterates a list; reject any other top-level shape.
    if not isinstance(data, list):
        raise ValueError(f"Expected list of bills in {path}, got {type(data)}")
    return data
236
+
237
+
238
def generate_reports_from_files(
    input_path: str = "data/known_bills_visualize.json",
    output_path: str = "data/bill_reports.json",
) -> None:
    """Read bills, generate reports with resume capability, and write them to disk.

    Args:
        input_path: JSON file containing the list of bill records.
        output_path: JSON file the generated reports are written to
            (also read at startup to resume a previous run).
    """
    bills = read_bills_from_file(input_path)
    create_reports_with_resume(bills, output_path)
245
+
246
+
247
def main() -> None:
    """CLI entry point: parse arguments and run report generation."""
    import argparse

    # No-op if logging was already configured at import time; kept for
    # standalone invocation.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    )

    parser = argparse.ArgumentParser(
        description="Generate detailed AI legislation reports from bill data with resume capability."
    )
    parser.add_argument("--input", default="data/known_bills_visualize.json", help="Path to input JSON file")
    parser.add_argument("--output", default="data/bill_reports.json", help="Path to output JSON file")
    parser.add_argument("--save-interval", type=int, default=10, help="Save progress every N bills (default: 10)")
    cli_args = parser.parse_args()

    try:
        bill_records = read_bills_from_file(cli_args.input)
        create_reports_with_resume(
            bill_records, cli_args.output, save_interval=cli_args.save_interval
        )
    except Exception as e:
        # Log, report on stdout, and signal failure to the shell.
        logger.error(f"Fatal error: {e}")
        print(f"❌ Error: {e}")
        import sys
        sys.exit(1)
    else:
        print(f"✅ Report generation completed successfully!")
        print(f" Reports saved to: {cli_args.output}")


if __name__ == "__main__":
    main()
data/data_updating_scripts/generate_suggested_questions.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to generate suggested questions for all bills in known_bills_visualize.json.
4
+
5
+ This script reads all bills from known_bills_visualize.json, generates 5 suggested questions using OpenAI API,
6
+ and saves them to data/bill_suggested_questions.json to avoid repeated API calls.
7
+ """
8
+
9
+ import json
10
+ import logging
11
+ import time
12
+ import pandas as pd
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional
15
+ import sys
16
+ import os
17
+
18
+ # Add the project root to the path
19
+ sys.path.append(str(Path(__file__).parent.parent))
20
+
21
+ from config import ConfigManager
22
+ from langchain_openai import ChatOpenAI
23
+ from langchain_core.prompts import ChatPromptTemplate
24
+ from langchain.chains.combine_documents import create_stuff_documents_chain
25
+ from langchain_core.documents import Document
26
+
27
+ # Create logs directory if it doesn't exist
28
+ os.makedirs("data_updating_scripts/logs", exist_ok=True)
29
+
30
+ # Configure logging
31
+ logging.basicConfig(
32
+ level=logging.INFO,
33
+ format="%(asctime)s [%(levelname)s] %(message)s",
34
+ handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/generate_suggested_questions.log")]
35
+ )
36
+ logger = logging.getLogger(__name__)
37
+
38
class SuggestedQuestionsGenerator:
    """Generates suggested questions for all bills in known_bills_visualize.json.

    Questions are produced with an OpenAI chat model through a LangChain
    "stuff documents" chain and cached in a JSON file so repeated runs do
    not re-call the API for bills that already have 5 questions.
    """

    def __init__(self):
        """Initialize the questions generator with configuration.

        Raises:
            ValueError: if OPENAI_API_KEY is not configured.
            FileNotFoundError: if the system-prompt markdown file is missing.
        """
        self.config = ConfigManager()
        # Input bills and the cached-questions output both live under data/.
        self.known_bills_file = Path("data/known_bills_visualize.json")
        self.questions_file = Path("data/bill_suggested_questions.json")

        # Initialize OpenAI LLM
        if not self.config.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY not found in environment variables")

        # Mild creativity (0.3); 500 tokens is ample for five short questions.
        self.llm = ChatOpenAI(
            model=self.config.OPENAI_LLM_MODEL,
            temperature=0.3,
            max_tokens=500
        )

        # Load the system prompt from markdown file
        prompt_path = "data_updating_scripts/PROMPTS/suggested_questions_prompt.md"
        if not os.path.exists(prompt_path):
            raise FileNotFoundError(f"The specified file was not found: {prompt_path}")

        with open(prompt_path, "r") as file:
            system_prompt = file.read()

        # Create the prompt and chain
        self.prompt = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", "Generate 5 specific questions about this bill based on its content."),
            ]
        )

        # "Stuff" chain: the whole bill document is injected into {context}.
        self.question_generation_chain = create_stuff_documents_chain(
            self.llm, self.prompt
        )

        # Fallback questions used when a bill has no text or generation fails.
        self.fallback_questions = [
            "What are the key definitions in this bill?",
            "What are the enforcement mechanisms?",
            "Who does this bill apply to?",
            "What are the compliance requirements?",
            "What penalties are specified?"
        ]

        logger.info(f"Initialized SuggestedQuestionsGenerator with model: {self.config.OPENAI_LLM_MODEL}")

    def dataframe_to_documents(self, df) -> List[Document]:
        """Convert DataFrame to list of Document objects.

        Rows without a non-empty 'text' field are skipped entirely.
        """
        documents = []
        for _, row in df.iterrows():
            if 'text' in row and pd.notna(row['text']) and row['text'].strip():
                doc = Document(
                    page_content=row['text'],
                    metadata={
                        # Composite key, e.g. "CA_SB123", matches the cache keys.
                        'bill_key': f"{row.get('state', 'Unknown')}_{row.get('bill_number', 'Unknown')}",
                        'state': row.get('state', 'Unknown'),
                        'bill_number': row.get('bill_number', 'Unknown'),
                        'title': row.get('title', 'No title')
                    }
                )
                documents.append(doc)
        return documents

    def load_known_bills(self) -> List[Dict]:
        """Load bills from known_bills_visualize.json.

        Raises:
            FileNotFoundError: if the bills file is absent.
            json.JSONDecodeError: if the file is not valid JSON.
        """
        try:
            with open(self.known_bills_file, 'r', encoding='utf-8') as f:
                bills = json.load(f)
            logger.info(f"Loaded {len(bills)} bills from {self.known_bills_file}")
            return bills
        except FileNotFoundError:
            logger.error(f"File not found: {self.known_bills_file}")
            raise
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing JSON: {e}")
            raise

    def load_existing_questions(self) -> Dict:
        """Load existing questions if available.

        Returns an empty dict when the cache file is missing or unreadable,
        so a corrupt cache never aborts the run.
        """
        if self.questions_file.exists():
            try:
                with open(self.questions_file, 'r', encoding='utf-8') as f:
                    questions = json.load(f)
                logger.info(f"Loaded {len(questions)} existing question sets")
                return questions
            except Exception as e:
                logger.warning(f"Could not load existing questions: {e}")
                return {}
        return {}

    def save_questions(self, questions: Dict) -> None:
        """Save questions to JSON file.

        Raises:
            Exception: re-raised after logging if the write fails.
        """
        try:
            with open(self.questions_file, 'w', encoding='utf-8') as f:
                json.dump(questions, f, indent=2, ensure_ascii=False)
            logger.info(f"Saved {len(questions)} question sets to {self.questions_file}")
        except Exception as e:
            logger.error(f"Error saving questions: {e}")
            raise

    def parse_questions_response(self, response: str) -> List[str]:
        """Parse the LLM response into individual questions.

        Keeps only lines ending in '?', strips list prefixes, and pads the
        result with fallback questions so exactly 5 are always returned.
        """
        questions = []
        if isinstance(response, str):
            # Split by lines and clean up
            lines = [line.strip() for line in response.split('\n') if line.strip()]
            # Filter out any numbering or bullet points
            for line in lines:
                # Remove common prefixes like "1.", "2.", "3.", "4.", "5.", "•", "-", "*", etc.
                # NOTE(review): `line[2:]` always drops two characters, so the
                # single-char bullets ('•', '-', '*') assume a following space
                # and numbered prefixes above 9 (e.g. "10.") are not stripped —
                # confirm against actual model output.
                clean_line = line
                if line.startswith(('1.', '2.', '3.', '4.', '5.', '•', '-', '*')):
                    clean_line = line[2:].strip()
                elif line.startswith(('1)', '2)', '3)', '4)', '5)')):
                    clean_line = line[2:].strip()

                if clean_line and clean_line.endswith('?'):
                    questions.append(clean_line)

        # Ensure we have exactly 5 questions
        if len(questions) < 5:
            # Use fallback questions to fill up to 5
            questions.extend(self.fallback_questions[len(questions):])

        return questions[:5]  # Return only the first 5

    def generate_questions(self, bill: Dict) -> Optional[List[str]]:
        """Generate suggested questions for a single bill.

        Never raises: returns the fallback question list when the bill has
        no text, produces no document, or the LLM call fails.
        """
        try:
            bill_number = bill.get('bill_number', 'Unknown')
            bill_title = bill.get('title', 'No title')
            bill_text = bill.get('text', '')

            if not bill_text:
                logger.warning(f"No text found for bill {bill_number}")
                return self.fallback_questions

            # Convert bill to document format (single-row frame reuses the
            # same conversion path as batch processing).
            df = pd.DataFrame([bill])
            docs = self.dataframe_to_documents(df)

            if not docs:
                logger.warning(f"No document created for bill {bill_number}")
                return self.fallback_questions

            # Generate questions using the chain
            response = self.question_generation_chain.invoke({"context": docs})

            # Parse the response into questions
            questions = self.parse_questions_response(response)

            logger.info(f"Generated {len(questions)} questions for {bill_number}")
            return questions

        except Exception as e:
            logger.error(f"Error generating questions for bill {bill.get('bill_number', 'Unknown')}: {e}")
            return self.fallback_questions

    def generate_all_questions(self) -> None:
        """Generate suggested questions for all bills.

        Resumable: bills whose cache entry already holds 5 questions are
        skipped, and progress is flushed to disk every 10 bills.
        """
        # Load bills and existing questions
        bills = self.load_known_bills()
        existing_questions = self.load_existing_questions()

        # Track progress
        total_bills = len(bills)
        processed = 0
        errors = 0

        logger.info(f"Starting question generation for {total_bills} bills")

        for i, bill in enumerate(bills, 1):
            bill_key = f"{bill.get('state', 'Unknown')}_{bill.get('bill_number', 'Unknown')}"

            # Skip if already processed successfully
            if bill_key in existing_questions and len(existing_questions[bill_key].get('suggested_questions', [])) == 5:
                logger.info(f"Skipping {bill_key} - already processed")
                processed += 1
                continue

            logger.info(f"Processing {i}/{total_bills}: {bill_key}")

            # Generate questions
            questions = self.generate_questions(bill)

            # Store result
            existing_questions[bill_key] = {
                'bill_number': bill.get('bill_number', 'Unknown'),
                'title': bill.get('title', 'No title'),
                'suggested_questions': questions
            }

            # An exact fallback match is counted as an error.
            # NOTE(review): this also counts a genuine LLM answer that happens
            # to equal the fallback list — acceptable as a heuristic.
            if questions == self.fallback_questions:
                errors += 1
            else:
                processed += 1

            # Save progress every 10 bills
            if i % 10 == 0:
                self.save_questions(existing_questions)
                logger.info(f"Progress: {i}/{total_bills} processed, {errors} errors")

            # Rate limiting
            time.sleep(1)  # 1 second delay between API calls

        # Final save
        self.save_questions(existing_questions)

        logger.info(f"Question generation complete!")
        logger.info(f"Total bills: {total_bills}")
        logger.info(f"Successfully processed: {processed}")
        logger.info(f"Errors: {errors}")
        logger.info(f"Questions saved to: {self.questions_file}")
254
+
255
def main():
    """Entry point: build the generator and produce questions for every bill."""
    try:
        generator = SuggestedQuestionsGenerator()
        generator.generate_all_questions()
    except Exception as e:
        # Surface the failure in the log and on stdout, then signal a
        # non-zero exit status to the caller.
        logger.error(f"Fatal error: {e}")
        print(f"❌ Error: {e}")
        sys.exit(1)
    else:
        print("✅ Suggested questions generation completed successfully!")
        print(f" Questions saved to: {generator.questions_file}")

if __name__ == "__main__":
    main()
data/data_updating_scripts/generate_summaries.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to generate summaries for all bills in known_bills_visualize.json.
4
+
5
+ This script reads all bills from known_bills_visualize.json, generates summaries using OpenAI API,
6
+ and saves them to data/bill_summaries.json to avoid repeated API calls.
7
+ """
8
+
9
+ import json
10
+ import logging
11
+ import time
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional
14
+ import sys
15
+ import os
16
+
17
+ # Add the project root to the path
18
+ sys.path.append(str(Path(__file__).parent.parent))
19
+
20
+ from config import ConfigManager
21
+ from langchain_openai import ChatOpenAI
22
+ from langchain_core.prompts import PromptTemplate
23
+ from langchain_core.output_parsers import StrOutputParser
24
+ from PROMPTS.bill_summary_prompt import BILL_SUMMARY_PROMPT
25
+
26
+ # Create logs directory if it doesn't exist
27
+ os.makedirs("data_updating_scripts/logs", exist_ok=True)
28
+
29
+ # Configure logging
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format="%(asctime)s [%(levelname)s] %(message)s",
33
+ handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/generate_summaries.log")]
34
+ )
35
+ logger = logging.getLogger(__name__)
36
+
37
class BillSummaryGenerator:
    """Generates summaries for all bills in known_bills_visualize.json.

    Summaries come from an OpenAI chat model via a LangChain LCEL chain
    (prompt | llm | parser) and are cached in a JSON file so repeated runs
    skip bills that already have a non-error summary.
    """

    def __init__(self):
        """Initialize the summary generator with configuration.

        Raises:
            ValueError: if OPENAI_API_KEY is not configured.
        """
        self.config = ConfigManager()
        # Input bills and the cached-summaries output both live under data/.
        self.known_bills_file = Path("data/known_bills_visualize.json")
        self.summaries_file = Path("data/bill_summaries.json")

        # Initialize OpenAI LLM
        if not self.config.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY not found in environment variables")

        # Near-deterministic output (0.1); 1000 tokens caps summary length.
        self.llm = ChatOpenAI(
            model=self.config.OPENAI_LLM_MODEL,
            temperature=0.1,
            max_tokens=1000
        )

        # Create the prompt template
        self.prompt_template = PromptTemplate(
            template=BILL_SUMMARY_PROMPT,
            input_variables=["bill_number", "bill_title", "state", "bill_text"]
        )

        # Create the chain (LCEL: prompt | llm | string output parser)
        self.chain = self.prompt_template | self.llm | StrOutputParser()

        logger.info(f"Initialized BillSummaryGenerator with model: {self.config.OPENAI_LLM_MODEL}")

    def load_known_bills(self) -> List[Dict]:
        """Load bills from known_bills_visualize.json.

        Raises:
            FileNotFoundError: if the bills file is absent.
            json.JSONDecodeError: if the file is not valid JSON.
        """
        try:
            with open(self.known_bills_file, 'r', encoding='utf-8') as f:
                bills = json.load(f)
            logger.info(f"Loaded {len(bills)} bills from {self.known_bills_file}")
            return bills
        except FileNotFoundError:
            logger.error(f"File not found: {self.known_bills_file}")
            raise
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing JSON: {e}")
            raise

    def load_existing_summaries(self) -> Dict:
        """Load existing summaries if available.

        Returns an empty dict when the cache file is missing or unreadable,
        so a corrupt cache never aborts the run.
        """
        if self.summaries_file.exists():
            try:
                with open(self.summaries_file, 'r', encoding='utf-8') as f:
                    summaries = json.load(f)
                logger.info(f"Loaded {len(summaries)} existing summaries")
                return summaries
            except Exception as e:
                logger.warning(f"Could not load existing summaries: {e}")
                return {}
        return {}

    def save_summaries(self, summaries: Dict) -> None:
        """Save summaries to JSON file.

        Raises:
            Exception: re-raised after logging if the write fails.
        """
        try:
            with open(self.summaries_file, 'w', encoding='utf-8') as f:
                json.dump(summaries, f, indent=2, ensure_ascii=False)
            logger.info(f"Saved {len(summaries)} summaries to {self.summaries_file}")
        except Exception as e:
            logger.error(f"Error saving summaries: {e}")
            raise

    def generate_summary(self, bill: Dict) -> Optional[str]:
        """Generate summary for a single bill.

        Never raises: failures are returned as strings with an "ERROR:"
        prefix so the caller can count and later retry them.
        """
        try:
            bill_number = bill.get('bill_number', 'Unknown')
            bill_title = bill.get('title', 'No title')
            state = bill.get('state', 'Unknown')
            bill_text = bill.get('text', '')

            if not bill_text:
                logger.warning(f"No text found for bill {bill_number}")
                return "ERROR: No bill text available"

            # Prepare the input for the chain
            chain_input = {
                "bill_number": bill_number,
                "bill_title": bill_title,
                "state": state,
                "bill_text": bill_text[:8000]  # Limit text length to avoid token limits
            }

            # Generate summary using the chain
            summary = self.chain.invoke(chain_input)

            logger.info(f"Generated summary for {bill_number}")
            return summary

        except Exception as e:
            logger.error(f"Error generating summary for bill {bill.get('bill_number', 'Unknown')}: {e}")
            return f"ERROR: {str(e)}"

    def generate_all_summaries(self) -> None:
        """Generate summaries for all bills.

        Resumable: bills whose cache entry holds a non-error summary are
        skipped, and progress is flushed to disk every 10 bills.
        """
        # Load bills and existing summaries
        bills = self.load_known_bills()
        existing_summaries = self.load_existing_summaries()

        # Track progress
        total_bills = len(bills)
        processed = 0
        errors = 0

        logger.info(f"Starting summary generation for {total_bills} bills")

        for i, bill in enumerate(bills, 1):
            # Composite key, e.g. "CA_SB123", matches the cache file keys.
            bill_key = f"{bill.get('state', 'Unknown')}_{bill.get('bill_number', 'Unknown')}"

            # Skip if already processed successfully
            if bill_key in existing_summaries and not existing_summaries[bill_key].get('summary', '').startswith('ERROR:'):
                logger.info(f"Skipping {bill_key} - already processed")
                processed += 1
                continue

            logger.info(f"Processing {i}/{total_bills}: {bill_key}")

            # Generate summary
            summary = self.generate_summary(bill)

            # Store result
            existing_summaries[bill_key] = {
                'bill_number': bill.get('bill_number', 'Unknown'),
                'title': bill.get('title', 'No title'),
                'summary': summary
            }

            if summary.startswith('ERROR:'):
                errors += 1
            else:
                processed += 1

            # Save progress every 10 bills
            if i % 10 == 0:
                self.save_summaries(existing_summaries)
                logger.info(f"Progress: {i}/{total_bills} processed, {errors} errors")

            # Rate limiting
            time.sleep(1)  # 1 second delay between API calls

        # Final save
        self.save_summaries(existing_summaries)

        logger.info(f"Summary generation complete!")
        logger.info(f"Total bills: {total_bills}")
        logger.info(f"Successfully processed: {processed}")
        logger.info(f"Errors: {errors}")
        logger.info(f"Summaries saved to: {self.summaries_file}")
+
190
def main():
    """Entry point: build the generator and summarise every bill."""
    try:
        generator = BillSummaryGenerator()
        generator.generate_all_summaries()
    except Exception as e:
        # Surface the failure in the log and on stdout, then signal a
        # non-zero exit status to the caller.
        logger.error(f"Fatal error: {e}")
        print(f"❌ Error: {e}")
        sys.exit(1)
    else:
        print("✅ Summary generation completed successfully!")
        print(f" Summaries saved to: {generator.summaries_file}")

if __name__ == "__main__":
    main()
data/data_updating_scripts/get_data.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+ import time
5
+ import logging
6
+ import base64
7
+ from datetime import datetime, timezone
8
+ import requests
9
+ from dotenv import load_dotenv
10
+ from bs4 import BeautifulSoup
11
+
12
+ # Load environment variables from .env file
13
+ load_dotenv()
14
+ # Pull API key from environment
15
+ API_KEY = os.getenv("LEGISCAN_API_KEY") # Set your LegiScan API key in .env
16
+ if not API_KEY:
17
+ print("Error: Please set LEGISCAN_API_KEY in your .env file.")
18
+ sys.exit(1)
19
+
20
+ # Modes for testing
21
+ # Quick test: pulls only TEST_MAX_BILLS bills
22
+ TESTING_MODE = False
23
+ # Full test: pulls all bills for TEST_STATE and TEST_YEAR without bill count cap
24
+ FULL_TESTING_MODE = False
25
+ TEST_STATE = 'CA'
26
+ TEST_YEAR = 2023
27
+ TEST_MAX_BILLS = 3
28
+
29
+ # Output files
30
+ CACHE_FILE = "data/bill_cache.json" # Stores bill_id -> change_hash
31
+ OUTPUT_FILE = "data/known_bills.json" # Final bills data
32
+
33
+ # Query settings
34
+ QUERY = "artificial intelligence"
35
+ START_YEAR = 2023
36
+ END_YEAR = datetime.now(timezone.utc).year
37
+
38
+ # Include all state legislatures plus U.S. Congress (both chambers)
39
+ STATES = [
40
+ "AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
41
+ "HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
42
+ "MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ",
43
+ "NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
44
+ "SD","TN","TX","UT","VT","VA","WA","WV","WI","WY",
45
+ "US" # U.S. Congress
46
+ ]
47
+
48
+ # Rate limiting (seconds between requests)
49
+ RATE_LIMIT = 0.2
50
+
51
+ # Create logs directory if it doesn't exist
52
+ os.makedirs("data_updating_scripts/logs", exist_ok=True)
53
+
54
+ # Logging configuration
55
+ LOG_FILE = "data_updating_scripts/logs/fetch_ai_bills.log"
56
+ logging.basicConfig(
57
+ level=logging.INFO,
58
+ format="%(asctime)s [%(levelname)s] %(message)s",
59
+ handlers=[
60
+ logging.StreamHandler(sys.stdout),
61
+ logging.FileHandler(LOG_FILE)
62
+ ]
63
+ )
64
+ logger = logging.getLogger(__name__)
65
+
66
# Apply testing overrides: both modes restrict the crawl to TEST_STATE only.
# (The year restriction is applied later, when main() builds its year list.)
if TESTING_MODE:
    logger.info(f"*** TESTING MODE: fetching only {TEST_MAX_BILLS} bills from {TEST_STATE} ({TEST_YEAR}) ***")
    STATES = [TEST_STATE]
# FULL_TESTING_MODE keeps the single state but removes the bill-count cap.
if FULL_TESTING_MODE:
    logger.info(f"*** FULL TESTING MODE: fetching all bills from {TEST_STATE} ({TEST_YEAR}) ***")
    STATES = [TEST_STATE]
+
74
+
75
+ def load_json(path, default):
76
+ try:
77
+ with open(path, 'r') as f:
78
+ return json.load(f)
79
+ except (FileNotFoundError, json.JSONDecodeError):
80
+ return default
81
+
82
+
83
+ def save_json(path, data):
84
+ # Create directory if it doesn't exist
85
+ os.makedirs(os.path.dirname(path), exist_ok=True)
86
+ with open(path, 'w') as f:
87
+ json.dump(data, f, indent=2)
88
+ logger.info(f"Saved JSON to {path}")
89
+
90
+
91
+ def legi_request(op, params):
92
+ base = "https://api.legiscan.com/"
93
+ params.update({"key": API_KEY, "op": op})
94
+ try:
95
+ resp = requests.get(base, params=params, timeout=10)
96
+ resp.raise_for_status()
97
+ data = resp.json()
98
+ if data.get("status") != "OK":
99
+ logger.error(f"API error {op}: {data.get('message', data)}")
100
+ return None
101
+ return data
102
+ except requests.RequestException as e:
103
+ logger.error(f"Request failed ({op}): {e}")
104
+ return None
105
+
106
+
107
+ def extract_plain_text(html_content: str) -> str:
108
+ soup = BeautifulSoup(html_content, "html.parser")
109
+ return soup.get_text(separator="\n", strip=True)
110
+
111
+
112
def main():
    """Crawl LegiScan for AI-related bills across all configured states and years.

    Uses a change-hash cache so unchanged bills reuse previously downloaded
    text (only volatile metadata is refreshed), then writes the deduplicated
    bill list and the updated cache back to disk.
    """
    cache = load_json(CACHE_FILE, {})
    existing = load_json(OUTPUT_FILE, [])
    existing_map = {b.get("bill_id"): b for b in existing}
    logger.info(f"Loaded cache entries: {len(cache)}, existing bills: {len(existing)}")

    collected = []
    total_fetched = 0
    # Test modes restrict the crawl to a single year; normal runs cover the full span.
    years = [TEST_YEAR] if (TESTING_MODE or FULL_TESTING_MODE) else list(range(START_YEAR, END_YEAR + 1))

    for state in STATES:
        for year in years:
            page = 1
            while True:
                if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                    logger.info("Reached TEST_MAX_BILLS limit, stopping early.")
                    break
                search_params = {"state": state, "year": year, "query": QUERY, "page": page}
                logger.info(f"Searching {state} for {year}, page {page}")
                search_resp = legi_request("getSearch", search_params)
                if not search_resp:
                    break

                search_result = search_resp.get("searchresult", {})
                summary = search_result.get("summary", {})
                # Every key in the search result except "summary" is a bill record.
                page_bills = [v for k, v in search_result.items() if k != "summary"]
                if not page_bills:
                    logger.info(f"No bills on page {page} for {state} {year}")
                    break

                logger.info(f"Found {len(page_bills)} bills on {state} {year} page {page}")
                for bill in page_bills:
                    if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                        break
                    bill_id = str(bill.get("bill_id"))
                    state_code = bill.get("state")
                    bill_num = bill.get("bill_number")
                    logger.info(f"Processing bill {state_code}_{bill_num} (ID: {bill_id})")

                    details_resp = legi_request("getBill", {"id": bill_id})
                    if not details_resp:
                        continue
                    details = details_resp.get("bill", {})
                    sess_year = details.get("session", {}).get("year_start", 0)
                    if sess_year < START_YEAR:
                        continue

                    new_hash = details.get("change_hash")
                    old_hash = cache.get(bill_id)
                    now_iso = datetime.now(timezone.utc).isoformat()

                    # Derive the most recent activity date across every date field
                    # LegiScan exposes (ISO dates compare correctly as strings).
                    explicit = details.get("last_action_date")
                    status_date = details.get("status_date")
                    last_vote_date = details.get("last_vote_date")
                    last_amendment_date = details.get("last_amendment_date")
                    actions = details.get("actions", [])
                    action_dates = [a.get("action_date") for a in actions if a.get("action_date")]
                    most_recent_action = max(action_dates) if action_dates else None
                    candidates = [d for d in [explicit, status_date, last_vote_date, last_amendment_date, most_recent_action] if d]
                    last_action_date = max(candidates) if candidates else None

                    bill_url = details.get("url")  # bill detail page URL

                    if new_hash and new_hash == old_hash and bill_id in existing_map:
                        # Unchanged text: refresh only the volatile metadata fields.
                        entry = existing_map[bill_id]
                        entry.update({
                            "status": details.get("status"),
                            "session_year": f"{details.get('session', {}).get('year_start', '')}-{details.get('session', {}).get('year_end', '')}",
                            "last_action_date": last_action_date,
                            "status_date": status_date,
                            "last_vote_date": last_vote_date,
                            "last_amendment_date": last_amendment_date,
                            "actions": actions,
                            "bill_url": bill_url,
                            "lastUpdatedAt": now_iso
                        })
                        logger.info(f"Reused cache; updated status={entry['status']}, last_action_date={entry['last_action_date']}")
                    else:
                        # New or changed bill: download and decode the full text
                        # (base64-encoded HTML in the API response).
                        plain_text = None
                        texts = details.get("texts", [])
                        if texts:
                            doc_id = texts[0].get("doc_id")
                            text_resp = legi_request("getBillText", {"id": doc_id})
                            if text_resp and "text" in text_resp:
                                raw_b64 = text_resp["text"].get("doc", "")
                                try:
                                    decoded = base64.b64decode(raw_b64)
                                    html = decoded.decode("utf-8", errors="ignore")
                                    plain_text = extract_plain_text(html)
                                except Exception as e:
                                    logger.error(f"Failed decoding HTML for {bill_id}: {e}")

                        entry = {
                            "bill_id": bill_id,
                            "state": state_code,
                            "bill_number": bill_num,
                            "session_year": f"{details.get('session', {}).get('year_start', '')}-{details.get('session', {}).get('year_end', '')}",
                            "title": details.get("title"),
                            "description": details.get("description"),
                            "status": details.get("status"),
                            "sponsors": [s.get("name") for s in details.get("sponsors", [])],
                            "text": plain_text,
                            "last_action_date": last_action_date,
                            "status_date": status_date,
                            "last_vote_date": last_vote_date,
                            "last_amendment_date": last_amendment_date,
                            "actions": actions,
                            "bill_url": bill_url,
                            "change_hash": new_hash,
                            "lastUpdatedAt": now_iso
                        }
                        cache[bill_id] = new_hash
                        logger.info(
                            f"Entry data: title='{entry['title']}', sponsors={len(entry['sponsors'])}, "
                            f"status={entry['status']}, last_action_date={entry['last_action_date']}"
                        )

                    collected.append(entry)
                    total_fetched += 1
                    time.sleep(RATE_LIMIT)

                if page >= summary.get("page_total", 1):
                    break
                page += 1
                time.sleep(RATE_LIMIT)
            if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                break
        if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
            break

    # De-duplicate by bill_id (later occurrences win) before persisting.
    dedup = {e["bill_id"]: e for e in collected}
    all_bills = list(dedup.values())
    save_json(OUTPUT_FILE, all_bills)
    save_json(CACHE_FILE, cache)
    logger.info(f"Completed run, saved {len(all_bills)} bills to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
data/data_updating_scripts/get_data_ORIGINAL.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
import json
import time
import logging
import base64
from datetime import datetime, timezone
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup

# Load environment variables from .env file, then pull the API key.
load_dotenv()
API_KEY = os.getenv("LEGISCAN_API_KEY")  # Set your LegiScan API key in .env
if not API_KEY:
    print("Error: Please set LEGISCAN_API_KEY in your .env file.")
    sys.exit(1)

# Modes for testing:
#   TESTING_MODE      - quick test, pulls only TEST_MAX_BILLS bills
#   FULL_TESTING_MODE - pulls all bills for TEST_STATE/TEST_YEAR, no cap
TESTING_MODE = False
FULL_TESTING_MODE = False
TEST_STATE = 'CA'
TEST_YEAR = 2023
TEST_MAX_BILLS = 3

# Output files
CACHE_FILE = "data/bill_cache.json"    # Stores bill_id -> change_hash
OUTPUT_FILE = "data/known_bills.json"  # Final bills data

# Query settings
QUERY = "artificial intelligence"
START_YEAR = 2023
END_YEAR = datetime.now(timezone.utc).year

# Include all state legislatures plus U.S. Congress (both chambers)
STATES = [
    "AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
    "HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
    "MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ",
    "NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
    "SD","TN","TX","UT","VT","VA","WA","WV","WI","WY",
    "US"  # U.S. Congress
]

# Rate limiting (seconds between requests)
RATE_LIMIT = 0.2

# Ensure the log directory exists before logging is configured.
os.makedirs("data_updating_scripts/logs", exist_ok=True)

# Logging configuration: stream to stdout and append to a file.
LOG_FILE = "data_updating_scripts/logs/fetch_ai_bills.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE)
    ]
)
logger = logging.getLogger(__name__)

# Apply testing overrides: both modes restrict the crawl to TEST_STATE.
if TESTING_MODE:
    logger.info(f"*** TESTING MODE: fetching only {TEST_MAX_BILLS} bills from {TEST_STATE} ({TEST_YEAR}) ***")
    STATES = [TEST_STATE]
if FULL_TESTING_MODE:
    logger.info(f"*** FULL TESTING MODE: fetching all bills from {TEST_STATE} ({TEST_YEAR}) ***")
    STATES = [TEST_STATE]
73
+
74
+
75
def load_json(path, default):
    """Load JSON from *path*, returning *default* if the file is missing or invalid.

    Args:
        path: Filesystem path of the JSON file.
        default: Value returned when the file is absent or unparseable.
    """
    try:
        # Explicit encoding avoids platform-dependent defaults (e.g. cp1252 on Windows).
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return default
81
+
82
+
83
def save_json(path, data):
    """Write *data* as indented JSON to *path*, creating parent directories as needed.

    Args:
        path: Destination file path.
        data: Any JSON-serializable object.
    """
    parent = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the path actually has a directory component.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)
    logger.info(f"Saved JSON to {path}")
89
+
90
+
91
def legi_request(op, params):
    """Perform one LegiScan API call and return the decoded payload.

    Args:
        op: API operation name, e.g. "getSearch", "getBill", "getBillText".
        params: Operation-specific query parameters.

    Returns:
        The parsed JSON dict when the API reports status "OK", otherwise None
        (errors are logged, never raised to the caller).
    """
    base = "https://api.legiscan.com/"
    # Build the final query locally instead of mutating the caller's dict
    # (the original `params.update(...)` leaked the API key back to callers).
    query = dict(params, key=API_KEY, op=op)
    try:
        resp = requests.get(base, params=query, timeout=10)
        resp.raise_for_status()
        data = resp.json()
        if data.get("status") != "OK":
            logger.error(f"API error {op}: {data.get('message', data)}")
            return None
        return data
    except requests.RequestException as e:
        logger.error(f"Request failed ({op}): {e}")
        return None
105
+
106
+
107
def extract_plain_text(html_content: str) -> str:
    """Strip markup from *html_content*, returning newline-separated visible text."""
    parsed = BeautifulSoup(html_content, "html.parser")
    return parsed.get_text(separator="\n", strip=True)
110
+
111
+
112
def main():
    """Crawl LegiScan for AI-related bills across all configured states and years.

    Uses a change-hash cache so unchanged bills reuse previously downloaded
    text (only volatile metadata is refreshed), then writes the deduplicated
    bill list and the updated cache back to disk.
    """
    cache = load_json(CACHE_FILE, {})
    existing = load_json(OUTPUT_FILE, [])
    existing_map = {b.get("bill_id"): b for b in existing}
    logger.info(f"Loaded cache entries: {len(cache)}, existing bills: {len(existing)}")

    collected = []
    total_fetched = 0
    # Test modes restrict the crawl to a single year; normal runs cover the full span.
    years = [TEST_YEAR] if (TESTING_MODE or FULL_TESTING_MODE) else list(range(START_YEAR, END_YEAR + 1))

    for state in STATES:
        for year in years:
            page = 1
            while True:
                if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                    logger.info("Reached TEST_MAX_BILLS limit, stopping early.")
                    break
                search_params = {"state": state, "year": year, "query": QUERY, "page": page}
                logger.info(f"Searching {state} for {year}, page {page}")
                search_resp = legi_request("getSearch", search_params)
                if not search_resp:
                    break

                search_result = search_resp.get("searchresult", {})
                summary = search_result.get("summary", {})
                # Every key in the search result except "summary" is a bill record.
                page_bills = [v for k, v in search_result.items() if k != "summary"]
                if not page_bills:
                    logger.info(f"No bills on page {page} for {state} {year}")
                    break

                logger.info(f"Found {len(page_bills)} bills on {state} {year} page {page}")
                for bill in page_bills:
                    if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                        break
                    bill_id = str(bill.get("bill_id"))
                    state_code = bill.get("state")
                    bill_num = bill.get("bill_number")
                    logger.info(f"Processing bill {state_code}_{bill_num} (ID: {bill_id})")

                    details_resp = legi_request("getBill", {"id": bill_id})
                    if not details_resp:
                        continue
                    details = details_resp.get("bill", {})
                    sess_year = details.get("session", {}).get("year_start", 0)
                    if sess_year < START_YEAR:
                        continue

                    new_hash = details.get("change_hash")
                    old_hash = cache.get(bill_id)
                    now_iso = datetime.now(timezone.utc).isoformat()

                    # Derive the most recent activity date across every date field
                    # LegiScan exposes (ISO dates compare correctly as strings).
                    explicit = details.get("last_action_date")
                    status_date = details.get("status_date")
                    last_vote_date = details.get("last_vote_date")
                    last_amendment_date = details.get("last_amendment_date")
                    actions = details.get("actions", [])
                    action_dates = [a.get("action_date") for a in actions if a.get("action_date")]
                    most_recent_action = max(action_dates) if action_dates else None
                    candidates = [d for d in [explicit, status_date, last_vote_date, last_amendment_date, most_recent_action] if d]
                    last_action_date = max(candidates) if candidates else None

                    bill_url = details.get("url")  # bill detail page URL

                    if new_hash and new_hash == old_hash and bill_id in existing_map:
                        # Unchanged text: refresh only the volatile metadata fields.
                        entry = existing_map[bill_id]
                        entry.update({
                            "status": details.get("status"),
                            "session_year": f"{details.get('session', {}).get('year_start', '')}-{details.get('session', {}).get('year_end', '')}",
                            "last_action_date": last_action_date,
                            "status_date": status_date,
                            "last_vote_date": last_vote_date,
                            "last_amendment_date": last_amendment_date,
                            "actions": actions,
                            "bill_url": bill_url,
                            "lastUpdatedAt": now_iso
                        })
                        logger.info(f"Reused cache; updated status={entry['status']}, last_action_date={entry['last_action_date']}")
                    else:
                        # New or changed bill: download and decode the full text
                        # (base64-encoded HTML in the API response).
                        plain_text = None
                        texts = details.get("texts", [])
                        if texts:
                            doc_id = texts[0].get("doc_id")
                            text_resp = legi_request("getBillText", {"id": doc_id})
                            if text_resp and "text" in text_resp:
                                raw_b64 = text_resp["text"].get("doc", "")
                                try:
                                    decoded = base64.b64decode(raw_b64)
                                    html = decoded.decode("utf-8", errors="ignore")
                                    plain_text = extract_plain_text(html)
                                except Exception as e:
                                    logger.error(f"Failed decoding HTML for {bill_id}: {e}")

                        entry = {
                            "bill_id": bill_id,
                            "state": state_code,
                            "bill_number": bill_num,
                            "session_year": f"{details.get('session', {}).get('year_start', '')}-{details.get('session', {}).get('year_end', '')}",
                            "title": details.get("title"),
                            "description": details.get("description"),
                            "status": details.get("status"),
                            "sponsors": [s.get("name") for s in details.get("sponsors", [])],
                            "text": plain_text,
                            "last_action_date": last_action_date,
                            "status_date": status_date,
                            "last_vote_date": last_vote_date,
                            "last_amendment_date": last_amendment_date,
                            "actions": actions,
                            "bill_url": bill_url,
                            "change_hash": new_hash,
                            "lastUpdatedAt": now_iso
                        }
                        cache[bill_id] = new_hash
                        logger.info(
                            f"Entry data: title='{entry['title']}', sponsors={len(entry['sponsors'])}, "
                            f"status={entry['status']}, last_action_date={entry['last_action_date']}"
                        )

                    collected.append(entry)
                    total_fetched += 1
                    time.sleep(RATE_LIMIT)

                if page >= summary.get("page_total", 1):
                    break
                page += 1
                time.sleep(RATE_LIMIT)
            if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
                break
        if TESTING_MODE and total_fetched >= TEST_MAX_BILLS:
            break

    # De-duplicate by bill_id (later occurrences win) before persisting.
    dedup = {e["bill_id"]: e for e in collected}
    all_bills = list(dedup.values())
    save_json(OUTPUT_FILE, all_bills)
    save_json(CACHE_FILE, cache)
    logger.info(f"Completed run, saved {len(all_bills)} bills to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
data/data_updating_scripts/known_bills_status.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ known_bills_status.py
4
+
5
+ Reads known_bills_fixed.json and updates existing known_bills_visualize.json.
6
+ Merges new bills and updates existing ones while preserving clean status fields.
7
+ """
8
+ import json
9
+ from pathlib import Path
10
+ from datetime import datetime, timezone
11
+
12
def map_status(original_status):
    """Translate a LegiScan status code (or free-text status) into a display label.

    Accepts the numeric codes as either ints or strings; anything else is
    matched by keyword against the lowercased text. Falls back to "Inactive".
    """
    code_labels = {
        0: "Inactive",         # Pre-filed
        1: "Active",           # Introduced
        2: "Active",           # Engrossed
        3: "Active",           # Enrolled
        4: "Signed Into Law",  # Passed
        5: "Vetoed",           # Vetoed
        6: "Inactive",         # Failed
        7: "Signed Into Law",  # Override
        8: "Signed Into Law",  # Chaptered
        9: "Active",           # Refer
        10: "Active",          # Report Pass
        11: "Inactive",        # Report DNP
        12: "Active",          # Draft
    }
    # Accept both the int codes and their string forms in a single lookup.
    lookup = {str(code): label for code, label in code_labels.items()}
    lookup.update(code_labels)

    if original_status in lookup:
        return lookup[original_status]

    # Keyword matching for free-text statuses.
    if original_status:
        text = str(original_status).lower()
        if any(word in text for word in ("pass", "signed", "enacted")):
            return "Signed Into Law"
        if "veto" in text:
            return "Vetoed"
        if any(word in text for word in ("fail", "dead", "killed")):
            return "Inactive"
        if any(word in text for word in ("active", "intro", "pending")):
            return "Active"

    # Default fallback
    return "Inactive"
56
+
57
def create_bill_key(bill):
    """Build the unique "<state>_<bill_number>" key identifying a bill record."""
    state = bill.get('state', 'Unknown')
    number = bill.get('bill_number', 'Unknown')
    return f"{state}_{number}"
60
+
61
def merge_bill_data(new_bill, existing_bill=None):
    """Combine fresh source data with a previously processed bill record.

    New bills receive a mapped display status plus bookkeeping fields.
    Existing bills keep their processed status fields unless the underlying
    LegiScan status actually changed, in which case the status is re-mapped
    and the update timestamp refreshed.
    """
    if not existing_bill:
        # Brand-new bill: create a clean record from scratch.
        fresh = new_bill.copy()
        raw_status = fresh.get('status')
        fresh['original_status'] = raw_status
        fresh['status'] = map_status(raw_status)
        fresh['status_updated_at'] = datetime.now(timezone.utc).isoformat()
        return fresh

    merged = existing_bill.copy()

    # Copy everything from the source except the processed status fields.
    protected = ('status', 'original_status', 'status_updated_at')
    for key, value in new_bill.items():
        if key not in protected:
            merged[key] = value

    # Compare statuses as strings so int 4 and "4" count as equal.
    new_raw = new_bill.get('status')
    old_raw = existing_bill.get('original_status')
    new_str = str(new_raw) if new_raw is not None else None
    old_str = str(old_raw) if old_raw is not None else None

    if new_str != old_str:
        # Real change in the underlying data: re-map and timestamp it.
        merged['original_status'] = new_raw
        merged['status'] = map_status(new_raw)
        merged['status_updated_at'] = datetime.now(timezone.utc).isoformat()
        return merged

    # No change: keep the clean status, re-mapping only if it is still a raw code.
    numeric_codes = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
    if 'status' not in merged or merged['status'] in numeric_codes:
        merged['status'] = map_status(old_raw)

    return merged
102
+
103
def main():
    """Merge known_bills_fixed.json into known_bills_visualize.json.

    Adds new bills, refreshes changed ones (preserving processed status
    fields via merge_bill_data), reports counts, and prints the resulting
    status distribution.
    """
    # File paths
    input_file = Path("data/known_bills_fixed.json")
    output_file = Path("data/known_bills_visualize.json")

    print(f"Reading source bills from: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        source_bills = json.load(f)
    print(f"Loaded {len(source_bills)} bills from source")

    # Load the existing visualization data when present.
    existing_bills = []
    if output_file.exists():
        print(f"Reading existing visualization data from: {output_file}")
        with open(output_file, 'r', encoding='utf-8') as f:
            existing_bills = json.load(f)
        print(f"Loaded {len(existing_bills)} existing bills")
    else:
        print("No existing visualization data found - will create new file")

    # Index existing bills by their unique key for O(1) lookups.
    existing_bills_dict = {}
    for bill in existing_bills:
        existing_bills_dict[create_bill_key(bill)] = bill

    merged_bills = []
    new_bills_count = 0
    updated_bills_count = 0
    unchanged_bills_count = 0

    print(f"\nProcessing {len(source_bills)} bills...")
    for source_bill in source_bills:
        existing_bill = existing_bills_dict.get(create_bill_key(source_bill))
        if existing_bill:
            # Count as updated only when the raw status actually differs.
            if existing_bill.get('original_status') != source_bill.get('status'):
                updated_bills_count += 1
            else:
                unchanged_bills_count += 1
        else:
            new_bills_count += 1
        merged_bills.append(merge_bill_data(source_bill, existing_bill))

    # Bills present in the visualization file but gone from the source.
    source_keys = {create_bill_key(bill) for bill in source_bills}
    removed_keys = set(existing_bills_dict.keys()) - source_keys

    print(f"\nSaving updated bills to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged_bills, f, indent=2, ensure_ascii=False)

    # Tally the status distribution for the summary report.
    status_counts = {}
    for bill in merged_bills:
        status = bill['status']
        status_counts[status] = status_counts.get(status, 0) + 1

    # Summary
    print(f"\n✅ Update complete!")
    print(f" 📊 Total bills: {len(merged_bills)}")
    if new_bills_count > 0:
        print(f" 🆕 New bills: {new_bills_count}")
    if updated_bills_count > 0:
        print(f" 🔄 Updated bills: {updated_bills_count}")
    if unchanged_bills_count > 0:
        print(f" ✅ Unchanged bills: {unchanged_bills_count}")
    if removed_keys:
        print(f" 🗑️ Removed bills: {len(removed_keys)}")

    if new_bills_count == 0 and updated_bills_count == 0:
        print(f" 🎉 All bills are up to date - no changes needed!")

    print(f"\n📈 Status distribution:")
    for status, count in sorted(status_counts.items()):
        print(f" {status}: {count}")

    print(f"\n📁 Clean data saved to: {output_file}")
    print("Now run: streamlit run scripts/visualize-MIT.py")

if __name__ == "__main__":
    main()
data/data_updating_scripts/logs/eu_vectorstore.log ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-11-03 11:40:25,451 [INFO] Starting EU AI Act vectorstore creation...
2
+ 2025-11-03 11:40:25,451 [INFO] Extracting text from PDF...
3
+ 2025-11-03 11:40:25,480 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
4
+ 2025-11-03 11:40:27,260 [INFO] Extracted 612396 characters from PDF
5
+ 2025-11-03 11:40:27,260 [INFO] Creating document chunks...
6
+ 2025-11-03 11:40:27,268 [INFO] Created 648 document chunks
7
+ 2025-11-03 11:40:27,268 [INFO] Initializing embeddings...
8
+ 2025-11-03 11:40:27,397 [INFO] Creating FAISS vectorstore...
9
+ 2025-11-03 11:40:31,088 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
10
+ 2025-11-03 11:40:31,414 [INFO] Loading faiss.
11
+ 2025-11-03 11:40:31,881 [INFO] Successfully loaded faiss.
12
+ 2025-11-03 11:40:31,936 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
13
+ 2025-11-03 11:40:31,945 [INFO] ✅ EU AI Act vectorstore created successfully!
14
+ 2025-11-03 11:40:31,945 [INFO] - Total chunks: 648
15
+ 2025-11-03 11:40:31,945 [INFO] - Text length: 612,396 characters
16
+ 2025-11-03 11:40:31,945 [INFO] - Saved to: data/eu_ai_act_vectorstore
17
+ 2025-11-03 12:24:44,470 [INFO] Starting EU AI Act vectorstore creation...
18
+ 2025-11-03 12:24:44,471 [INFO] Extracting text from PDF...
19
+ 2025-11-03 12:24:44,492 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
20
+ 2025-11-03 12:24:46,209 [INFO] Extracted 612396 characters from PDF
21
+ 2025-11-03 12:24:46,209 [INFO] Creating document chunks...
22
+ 2025-11-03 12:24:46,217 [INFO] Created 648 document chunks
23
+ 2025-11-03 12:24:46,217 [INFO] Initializing embeddings...
24
+ 2025-11-03 12:24:46,357 [INFO] Creating FAISS vectorstore...
25
+ 2025-11-03 12:24:49,286 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
26
+ 2025-11-03 12:24:49,669 [INFO] Loading faiss.
27
+ 2025-11-03 12:24:49,700 [INFO] Successfully loaded faiss.
28
+ 2025-11-03 12:24:49,749 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
29
+ 2025-11-03 12:24:49,754 [INFO] ✅ EU AI Act vectorstore created successfully!
30
+ 2025-11-03 12:24:49,754 [INFO] - Total chunks: 648
31
+ 2025-11-03 12:24:49,754 [INFO] - Text length: 612,396 characters
32
+ 2025-11-03 12:24:49,754 [INFO] - Saved to: data/eu_ai_act_vectorstore
33
+ 2025-11-04 15:55:15,879 [INFO] Starting EU AI Act vectorstore creation...
34
+ 2025-11-04 15:55:15,879 [INFO] Extracting text from PDF...
35
+ 2025-11-04 15:55:15,899 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
36
+ 2025-11-04 15:55:17,629 [INFO] Extracted 612396 characters from PDF
37
+ 2025-11-04 15:55:17,629 [INFO] Creating document chunks...
38
+ 2025-11-04 15:55:17,637 [INFO] Created 648 document chunks
39
+ 2025-11-04 15:55:17,637 [INFO] Initializing embeddings...
40
+ 2025-11-04 15:55:17,768 [INFO] Creating FAISS vectorstore...
41
+ 2025-11-04 15:55:21,406 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
42
+ 2025-11-04 15:55:21,846 [INFO] Loading faiss.
43
+ 2025-11-04 15:55:21,917 [INFO] Successfully loaded faiss.
44
+ 2025-11-04 15:55:21,968 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
45
+ 2025-11-04 15:55:21,981 [INFO] ✅ EU AI Act vectorstore created successfully!
46
+ 2025-11-04 15:55:21,981 [INFO] - Total chunks: 648
47
+ 2025-11-04 15:55:21,981 [INFO] - Text length: 612,396 characters
48
+ 2025-11-04 15:55:21,981 [INFO] - Saved to: data/eu_ai_act_vectorstore
49
+ 2025-11-14 15:36:40,441 [INFO] Starting EU AI Act vectorstore creation...
50
+ 2025-11-14 15:36:40,442 [INFO] Extracting text from PDF...
51
+ 2025-11-14 15:36:40,455 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
52
+ 2025-11-14 15:36:41,830 [INFO] Extracted 612396 characters from PDF
53
+ 2025-11-14 15:36:41,830 [INFO] Creating document chunks...
54
+ 2025-11-14 15:36:41,837 [INFO] Created 648 document chunks
55
+ 2025-11-14 15:36:41,837 [INFO] Initializing embeddings...
56
+ 2025-11-14 15:36:41,983 [INFO] Creating FAISS vectorstore...
57
+ 2025-11-14 15:36:46,413 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
58
+ 2025-11-14 15:36:46,791 [INFO] Loading faiss.
59
+ 2025-11-14 15:36:47,362 [INFO] Successfully loaded faiss.
60
+ 2025-11-14 15:36:47,404 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
61
+ 2025-11-14 15:36:47,410 [INFO] ✅ EU AI Act vectorstore created successfully!
62
+ 2025-11-14 15:36:47,410 [INFO] - Total chunks: 648
63
+ 2025-11-14 15:36:47,410 [INFO] - Text length: 612,396 characters
64
+ 2025-11-14 15:36:47,410 [INFO] - Saved to: data/eu_ai_act_vectorstore
65
+ 2025-11-20 14:15:10,012 [INFO] Starting EU AI Act vectorstore creation...
66
+ 2025-11-20 14:15:10,013 [INFO] Extracting text from PDF...
67
+ 2025-11-20 14:15:10,029 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
68
+ 2025-11-20 14:15:11,997 [INFO] Extracted 612396 characters from PDF
69
+ 2025-11-20 14:15:11,998 [INFO] Creating document chunks...
70
+ 2025-11-20 14:15:12,006 [INFO] Created 648 document chunks
71
+ 2025-11-20 14:15:12,006 [INFO] Initializing embeddings...
72
+ 2025-11-20 14:15:12,200 [INFO] Creating FAISS vectorstore...
73
+ 2025-11-20 14:15:16,058 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
74
+ 2025-11-20 14:15:16,386 [INFO] Loading faiss.
75
+ 2025-11-20 14:15:16,477 [INFO] Successfully loaded faiss.
76
+ 2025-11-20 14:15:16,521 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
77
+ 2025-11-20 14:15:16,529 [INFO] ✅ EU AI Act vectorstore created successfully!
78
+ 2025-11-20 14:15:16,529 [INFO] - Total chunks: 648
79
+ 2025-11-20 14:15:16,529 [INFO] - Text length: 612,396 characters
80
+ 2025-11-20 14:15:16,529 [INFO] - Saved to: data/eu_ai_act_vectorstore
81
+ 2025-12-01 12:38:49,653 [INFO] Starting EU AI Act vectorstore creation...
82
+ 2025-12-01 12:38:49,653 [INFO] Extracting text from PDF...
83
+ 2025-12-01 12:38:49,669 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
84
+ 2025-12-01 12:38:51,518 [INFO] Extracted 612396 characters from PDF
85
+ 2025-12-01 12:38:51,518 [INFO] Creating document chunks...
86
+ 2025-12-01 12:38:51,526 [INFO] Created 648 document chunks
87
+ 2025-12-01 12:38:51,526 [INFO] Initializing embeddings...
88
+ 2025-12-01 12:38:51,709 [INFO] Creating FAISS vectorstore...
89
+ 2025-12-01 12:38:54,252 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
90
+ 2025-12-01 12:38:54,675 [INFO] Loading faiss.
91
+ 2025-12-01 12:38:54,817 [INFO] Successfully loaded faiss.
92
+ 2025-12-01 12:38:54,859 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
93
+ 2025-12-01 12:38:54,865 [INFO] ✅ EU AI Act vectorstore created successfully!
94
+ 2025-12-01 12:38:54,866 [INFO] - Total chunks: 648
95
+ 2025-12-01 12:38:54,866 [INFO] - Text length: 612,396 characters
96
+ 2025-12-01 12:38:54,866 [INFO] - Saved to: data/eu_ai_act_vectorstore
97
+ 2025-12-01 13:21:15,236 [INFO] Starting EU AI Act vectorstore creation...
98
+ 2025-12-01 13:21:15,237 [INFO] Extracting text from PDF...
99
+ 2025-12-01 13:21:15,253 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
100
+ 2025-12-01 13:21:17,069 [INFO] Extracted 612396 characters from PDF
101
+ 2025-12-01 13:21:17,069 [INFO] Creating document chunks...
102
+ 2025-12-01 13:21:17,078 [INFO] Created 648 document chunks
103
+ 2025-12-01 13:21:17,078 [INFO] Initializing embeddings...
104
+ 2025-12-01 13:21:17,343 [INFO] Creating FAISS vectorstore...
105
+ 2025-12-01 13:21:20,254 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
106
+ 2025-12-01 13:21:20,654 [INFO] Loading faiss.
107
+ 2025-12-01 13:21:20,768 [INFO] Successfully loaded faiss.
108
+ 2025-12-01 13:21:20,815 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
109
+ 2025-12-01 13:21:20,821 [INFO] ✅ EU AI Act vectorstore created successfully!
110
+ 2025-12-01 13:21:20,821 [INFO] - Total chunks: 648
111
+ 2025-12-01 13:21:20,821 [INFO] - Text length: 612,396 characters
112
+ 2025-12-01 13:21:20,822 [INFO] - Saved to: data/eu_ai_act_vectorstore
113
+ 2025-12-03 11:09:39,059 [INFO] Starting EU AI Act vectorstore creation...
114
+ 2025-12-03 11:09:39,060 [INFO] Extracting text from PDF...
115
+ 2025-12-03 11:09:39,075 [INFO] Processing 144 pages from data_updating_scripts/eu-ai-act.pdf
116
+ 2025-12-03 11:09:40,933 [INFO] Extracted 612396 characters from PDF
117
+ 2025-12-03 11:09:40,934 [INFO] Creating document chunks...
118
+ 2025-12-03 11:09:40,942 [INFO] Created 648 document chunks
119
+ 2025-12-03 11:09:40,942 [INFO] Initializing embeddings...
120
+ 2025-12-03 11:09:41,136 [INFO] Creating FAISS vectorstore...
121
+ 2025-12-03 11:09:44,436 [INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
122
+ 2025-12-03 11:09:44,820 [INFO] Loading faiss.
123
+ 2025-12-03 11:09:44,925 [INFO] Successfully loaded faiss.
124
+ 2025-12-03 11:09:44,968 [INFO] Saving vectorstore to data/eu_ai_act_vectorstore...
125
+ 2025-12-03 11:09:44,974 [INFO] ✅ EU AI Act vectorstore created successfully!
126
+ 2025-12-03 11:09:44,974 [INFO] - Total chunks: 648
127
+ 2025-12-03 11:09:44,974 [INFO] - Text length: 612,396 characters
128
+ 2025-12-03 11:09:44,974 [INFO] - Saved to: data/eu_ai_act_vectorstore
data/data_updating_scripts/logs/fetch_ai_bills.log ADDED
The diff for this file is too large to render. See raw diff
 
data/data_updating_scripts/logs/fix_pdf_bills.log ADDED
The diff for this file is too large to render. See raw diff
 
data/data_updating_scripts/logs/generate_reports.log ADDED
The diff for this file is too large to render. See raw diff
 
data/data_updating_scripts/logs/generate_suggested_questions.log ADDED
The diff for this file is too large to render. See raw diff
 
data/data_updating_scripts/logs/generate_summaries.log ADDED
The diff for this file is too large to render. See raw diff
 
data/data_updating_scripts/logs/mark_no_text_bills.log ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-11-03 11:36:37 [INFO] Starting no-text bill marking process
2
+ 2025-11-03 11:36:37 [INFO] Loaded 1 bills from data/known_bills_visualize.json
3
+ 2025-11-03 11:36:37 [INFO] Processing 1 bills to mark no-text bills
4
+ 2025-11-03 11:36:37 [INFO] Saved 1 bills to data/known_bills_visualize.json
5
+ 2025-11-03 11:36:37 [INFO] Processing complete!
6
+ 2025-11-03 11:36:37 [INFO] Total bills processed: 1
7
+ 2025-11-03 11:36:37 [INFO] Bills without text: 0
8
+ 2025-11-03 11:36:37 [INFO] Already had None categories: 0
9
+ 2025-11-03 11:36:37 [INFO] Newly marked as None: 0
10
+ 2025-11-03 11:36:37 [INFO] No-text bill marking process completed
11
+ 2025-11-03 11:40:21 [INFO] Starting no-text bill marking process
12
+ 2025-11-03 11:40:21 [INFO] Loaded 1 bills from data/known_bills_visualize.json
13
+ 2025-11-03 11:40:21 [INFO] Processing 1 bills to mark no-text bills
14
+ 2025-11-03 11:40:21 [INFO] Saved 1 bills to data/known_bills_visualize.json
15
+ 2025-11-03 11:40:21 [INFO] Processing complete!
16
+ 2025-11-03 11:40:21 [INFO] Total bills processed: 1
17
+ 2025-11-03 11:40:21 [INFO] Bills without text: 0
18
+ 2025-11-03 11:40:21 [INFO] Already had None categories: 0
19
+ 2025-11-03 11:40:21 [INFO] Newly marked as None: 0
20
+ 2025-11-03 11:40:21 [INFO] No-text bill marking process completed
21
+ 2025-11-03 12:24:40 [INFO] Starting no-text bill marking process
22
+ 2025-11-03 12:24:40 [INFO] Loaded 1 bills from data/known_bills_visualize.json
23
+ 2025-11-03 12:24:40 [INFO] Processing 1 bills to mark no-text bills
24
+ 2025-11-03 12:24:40 [INFO] Saved 1 bills to data/known_bills_visualize.json
25
+ 2025-11-03 12:24:40 [INFO] Processing complete!
26
+ 2025-11-03 12:24:40 [INFO] Total bills processed: 1
27
+ 2025-11-03 12:24:40 [INFO] Bills without text: 0
28
+ 2025-11-03 12:24:40 [INFO] Already had None categories: 0
29
+ 2025-11-03 12:24:40 [INFO] Newly marked as None: 0
30
+ 2025-11-03 12:24:40 [INFO] No-text bill marking process completed
31
+ 2025-11-04 15:55:11 [INFO] Starting no-text bill marking process
32
+ 2025-11-04 15:55:11 [INFO] Loaded 10 bills from data/known_bills_visualize.json
33
+ 2025-11-04 15:55:11 [INFO] Processing 10 bills to mark no-text bills
34
+ 2025-11-04 15:55:11 [INFO] Saved 10 bills to data/known_bills_visualize.json
35
+ 2025-11-04 15:55:11 [INFO] Processing complete!
36
+ 2025-11-04 15:55:11 [INFO] Total bills processed: 10
37
+ 2025-11-04 15:55:11 [INFO] Bills without text: 0
38
+ 2025-11-04 15:55:11 [INFO] Already had None categories: 0
39
+ 2025-11-04 15:55:11 [INFO] Newly marked as None: 0
40
+ 2025-11-04 15:55:11 [INFO] No-text bill marking process completed
41
+ 2025-11-14 15:31:16 [INFO] Starting no-text bill marking process
42
+ 2025-11-14 15:31:16 [INFO] Loaded 2564 bills from data/known_bills_visualize.json
43
+ 2025-11-14 15:31:16 [INFO] Processing 2564 bills to mark no-text bills
44
+ 2025-11-14 15:31:16 [INFO] Progress: 100/2564 processed
45
+ 2025-11-14 15:31:16 [INFO] Progress: 200/2564 processed
46
+ 2025-11-14 15:31:16 [INFO] Progress: 300/2564 processed
47
+ 2025-11-14 15:31:16 [INFO] Progress: 400/2564 processed
48
+ 2025-11-14 15:31:16 [INFO] Progress: 500/2564 processed
49
+ 2025-11-14 15:31:16 [INFO] Progress: 600/2564 processed
50
+ 2025-11-14 15:31:16 [INFO] Progress: 700/2564 processed
51
+ 2025-11-14 15:31:16 [INFO] Progress: 800/2564 processed
52
+ 2025-11-14 15:31:16 [INFO] Progress: 900/2564 processed
53
+ 2025-11-14 15:31:16 [INFO] Progress: 1000/2564 processed
54
+ 2025-11-14 15:31:16 [INFO] Progress: 1100/2564 processed
55
+ 2025-11-14 15:31:16 [INFO] Progress: 1200/2564 processed
56
+ 2025-11-14 15:31:16 [INFO] Progress: 1300/2564 processed
57
+ 2025-11-14 15:31:16 [INFO] Progress: 1400/2564 processed
58
+ 2025-11-14 15:31:16 [INFO] Progress: 1500/2564 processed
59
+ 2025-11-14 15:31:16 [INFO] Progress: 1600/2564 processed
60
+ 2025-11-14 15:31:16 [INFO] Progress: 1700/2564 processed
61
+ 2025-11-14 15:31:16 [INFO] Progress: 1800/2564 processed
62
+ 2025-11-14 15:31:16 [INFO] Progress: 1900/2564 processed
63
+ 2025-11-14 15:31:16 [INFO] Progress: 2000/2564 processed
64
+ 2025-11-14 15:31:16 [INFO] Progress: 2100/2564 processed
65
+ 2025-11-14 15:31:16 [INFO] Progress: 2200/2564 processed
66
+ 2025-11-14 15:31:16 [INFO] Progress: 2300/2564 processed
67
+ 2025-11-14 15:31:16 [INFO] Progress: 2400/2564 processed
68
+ 2025-11-14 15:31:16 [INFO] Progress: 2500/2564 processed
69
+ 2025-11-14 15:31:17 [INFO] Saved 2564 bills to data/known_bills_visualize.json
70
+ 2025-11-14 15:31:17 [INFO] Processing complete!
71
+ 2025-11-14 15:31:17 [INFO] Total bills processed: 2564
72
+ 2025-11-14 15:31:17 [INFO] Bills without text: 9
73
+ 2025-11-14 15:31:17 [INFO] Already had None categories: 9
74
+ 2025-11-14 15:31:17 [INFO] Newly marked as None: 0
75
+ 2025-11-14 15:31:17 [INFO] No-text bill marking process completed
76
+ 2025-11-17 21:13:12 [INFO] Starting no-text bill marking process
77
+ 2025-11-17 21:13:13 [INFO] Loaded 2564 bills from data/known_bills_visualize.json
78
+ 2025-11-17 21:13:13 [INFO] Processing 2564 bills to mark no-text bills
79
+ 2025-11-17 21:13:13 [INFO] Progress: 100/2564 processed
80
+ 2025-11-17 21:13:13 [INFO] Progress: 200/2564 processed
81
+ 2025-11-17 21:13:13 [INFO] Progress: 300/2564 processed
82
+ 2025-11-17 21:13:13 [INFO] Progress: 400/2564 processed
83
+ 2025-11-17 21:13:13 [INFO] Progress: 500/2564 processed
84
+ 2025-11-17 21:13:13 [INFO] Progress: 600/2564 processed
85
+ 2025-11-17 21:13:13 [INFO] Progress: 700/2564 processed
86
+ 2025-11-17 21:13:13 [INFO] Progress: 800/2564 processed
87
+ 2025-11-17 21:13:13 [INFO] Progress: 900/2564 processed
88
+ 2025-11-17 21:13:13 [INFO] Progress: 1000/2564 processed
89
+ 2025-11-17 21:13:13 [INFO] Progress: 1100/2564 processed
90
+ 2025-11-17 21:13:13 [INFO] Progress: 1200/2564 processed
91
+ 2025-11-17 21:13:13 [INFO] Progress: 1300/2564 processed
92
+ 2025-11-17 21:13:13 [INFO] Progress: 1400/2564 processed
93
+ 2025-11-17 21:13:13 [INFO] Progress: 1500/2564 processed
94
+ 2025-11-17 21:13:13 [INFO] Progress: 1600/2564 processed
95
+ 2025-11-17 21:13:13 [INFO] Progress: 1700/2564 processed
96
+ 2025-11-17 21:13:13 [INFO] Progress: 1800/2564 processed
97
+ 2025-11-17 21:13:13 [INFO] Progress: 1900/2564 processed
98
+ 2025-11-17 21:13:13 [INFO] Progress: 2000/2564 processed
99
+ 2025-11-17 21:13:13 [INFO] Progress: 2100/2564 processed
100
+ 2025-11-17 21:13:13 [INFO] Progress: 2200/2564 processed
101
+ 2025-11-17 21:13:13 [INFO] Progress: 2300/2564 processed
102
+ 2025-11-17 21:13:13 [INFO] Progress: 2400/2564 processed
103
+ 2025-11-17 21:13:13 [INFO] Progress: 2500/2564 processed
104
+ 2025-11-17 21:13:14 [INFO] Saved 2564 bills to data/known_bills_visualize.json
105
+ 2025-11-17 21:13:14 [INFO] Processing complete!
106
+ 2025-11-17 21:13:14 [INFO] Total bills processed: 2564
107
+ 2025-11-17 21:13:14 [INFO] Bills without text: 9
108
+ 2025-11-17 21:13:14 [INFO] Already had None categories: 9
109
+ 2025-11-17 21:13:14 [INFO] Newly marked as None: 0
110
+ 2025-11-17 21:13:14 [INFO] No-text bill marking process completed
111
+ 2025-11-20 13:52:45 [INFO] Starting no-text bill marking process
112
+ 2025-11-20 13:52:46 [INFO] Loaded 2596 bills from data/known_bills_visualize.json
113
+ 2025-11-20 13:52:46 [INFO] Processing 2596 bills to mark no-text bills
114
+ 2025-11-20 13:52:46 [INFO] Progress: 100/2596 processed
115
+ 2025-11-20 13:52:46 [INFO] Progress: 200/2596 processed
116
+ 2025-11-20 13:52:46 [INFO] Progress: 300/2596 processed
117
+ 2025-11-20 13:52:46 [INFO] Progress: 400/2596 processed
118
+ 2025-11-20 13:52:46 [INFO] Progress: 500/2596 processed
119
+ 2025-11-20 13:52:46 [INFO] Progress: 600/2596 processed
120
+ 2025-11-20 13:52:46 [INFO] Progress: 700/2596 processed
121
+ 2025-11-20 13:52:46 [INFO] Progress: 800/2596 processed
122
+ 2025-11-20 13:52:46 [INFO] Progress: 900/2596 processed
123
+ 2025-11-20 13:52:46 [INFO] Progress: 1000/2596 processed
124
+ 2025-11-20 13:52:46 [INFO] Progress: 1100/2596 processed
125
+ 2025-11-20 13:52:46 [INFO] Progress: 1200/2596 processed
126
+ 2025-11-20 13:52:46 [INFO] Progress: 1300/2596 processed
127
+ 2025-11-20 13:52:46 [INFO] Progress: 1400/2596 processed
128
+ 2025-11-20 13:52:46 [INFO] Progress: 1500/2596 processed
129
+ 2025-11-20 13:52:46 [INFO] Progress: 1600/2596 processed
130
+ 2025-11-20 13:52:46 [INFO] Progress: 1700/2596 processed
131
+ 2025-11-20 13:52:46 [INFO] Progress: 1800/2596 processed
132
+ 2025-11-20 13:52:46 [INFO] Progress: 1900/2596 processed
133
+ 2025-11-20 13:52:46 [INFO] Progress: 2000/2596 processed
134
+ 2025-11-20 13:52:46 [INFO] Progress: 2100/2596 processed
135
+ 2025-11-20 13:52:46 [INFO] Progress: 2200/2596 processed
136
+ 2025-11-20 13:52:46 [INFO] Progress: 2300/2596 processed
137
+ 2025-11-20 13:52:46 [INFO] Progress: 2400/2596 processed
138
+ 2025-11-20 13:52:46 [INFO] Progress: 2500/2596 processed
139
+ 2025-11-20 13:52:47 [INFO] Saved 2596 bills to data/known_bills_visualize.json
140
+ 2025-11-20 13:52:47 [INFO] Processing complete!
141
+ 2025-11-20 13:52:47 [INFO] Total bills processed: 2596
142
+ 2025-11-20 13:52:47 [INFO] Bills without text: 13
143
+ 2025-11-20 13:52:47 [INFO] Already had None categories: 13
144
+ 2025-11-20 13:52:47 [INFO] Newly marked as None: 0
145
+ 2025-11-20 13:52:47 [INFO] No-text bill marking process completed
146
+ 2025-12-01 12:30:17 [INFO] Starting no-text bill marking process
147
+ 2025-12-01 12:30:17 [INFO] Loaded 2605 bills from data/known_bills_visualize.json
148
+ 2025-12-01 12:30:17 [INFO] Processing 2605 bills to mark no-text bills
149
+ 2025-12-01 12:30:17 [INFO] Progress: 100/2605 processed
150
+ 2025-12-01 12:30:17 [INFO] Progress: 200/2605 processed
151
+ 2025-12-01 12:30:17 [INFO] Progress: 300/2605 processed
152
+ 2025-12-01 12:30:17 [INFO] Progress: 400/2605 processed
153
+ 2025-12-01 12:30:17 [INFO] Progress: 500/2605 processed
154
+ 2025-12-01 12:30:17 [INFO] Progress: 600/2605 processed
155
+ 2025-12-01 12:30:17 [INFO] Progress: 700/2605 processed
156
+ 2025-12-01 12:30:17 [INFO] Progress: 800/2605 processed
157
+ 2025-12-01 12:30:17 [INFO] Progress: 900/2605 processed
158
+ 2025-12-01 12:30:17 [INFO] Progress: 1000/2605 processed
159
+ 2025-12-01 12:30:17 [INFO] Progress: 1100/2605 processed
160
+ 2025-12-01 12:30:17 [INFO] Progress: 1200/2605 processed
161
+ 2025-12-01 12:30:17 [INFO] Progress: 1300/2605 processed
162
+ 2025-12-01 12:30:17 [INFO] Progress: 1400/2605 processed
163
+ 2025-12-01 12:30:17 [INFO] Progress: 1500/2605 processed
164
+ 2025-12-01 12:30:17 [INFO] Progress: 1600/2605 processed
165
+ 2025-12-01 12:30:17 [INFO] Progress: 1700/2605 processed
166
+ 2025-12-01 12:30:17 [INFO] Progress: 1800/2605 processed
167
+ 2025-12-01 12:30:17 [INFO] Progress: 1900/2605 processed
168
+ 2025-12-01 12:30:17 [INFO] Progress: 2000/2605 processed
169
+ 2025-12-01 12:30:17 [INFO] Progress: 2100/2605 processed
170
+ 2025-12-01 12:30:17 [INFO] Progress: 2200/2605 processed
171
+ 2025-12-01 12:30:17 [INFO] Progress: 2300/2605 processed
172
+ 2025-12-01 12:30:17 [INFO] Progress: 2400/2605 processed
173
+ 2025-12-01 12:30:17 [INFO] Progress: 2500/2605 processed
174
+ 2025-12-01 12:30:17 [INFO] Progress: 2600/2605 processed
175
+ 2025-12-01 12:30:19 [INFO] Saved 2605 bills to data/known_bills_visualize.json
176
+ 2025-12-01 12:30:19 [INFO] Processing complete!
177
+ 2025-12-01 12:30:19 [INFO] Total bills processed: 2605
178
+ 2025-12-01 12:30:19 [INFO] Bills without text: 16
179
+ 2025-12-01 12:30:19 [INFO] Already had None categories: 16
180
+ 2025-12-01 12:30:19 [INFO] Newly marked as None: 0
181
+ 2025-12-01 12:30:19 [INFO] No-text bill marking process completed
182
+ 2025-12-01 13:11:46 [INFO] Starting no-text bill marking process
183
+ 2025-12-01 13:11:47 [INFO] Loaded 2605 bills from data/known_bills_visualize.json
184
+ 2025-12-01 13:11:47 [INFO] Processing 2605 bills to mark no-text bills
185
+ 2025-12-01 13:11:47 [INFO] Progress: 100/2605 processed
186
+ 2025-12-01 13:11:47 [INFO] Progress: 200/2605 processed
187
+ 2025-12-01 13:11:47 [INFO] Progress: 300/2605 processed
188
+ 2025-12-01 13:11:47 [INFO] Progress: 400/2605 processed
189
+ 2025-12-01 13:11:47 [INFO] Progress: 500/2605 processed
190
+ 2025-12-01 13:11:47 [INFO] Progress: 600/2605 processed
191
+ 2025-12-01 13:11:47 [INFO] Progress: 700/2605 processed
192
+ 2025-12-01 13:11:47 [INFO] Progress: 800/2605 processed
193
+ 2025-12-01 13:11:47 [INFO] Progress: 900/2605 processed
194
+ 2025-12-01 13:11:47 [INFO] Progress: 1000/2605 processed
195
+ 2025-12-01 13:11:47 [INFO] Progress: 1100/2605 processed
196
+ 2025-12-01 13:11:47 [INFO] Progress: 1200/2605 processed
197
+ 2025-12-01 13:11:47 [INFO] Progress: 1300/2605 processed
198
+ 2025-12-01 13:11:47 [INFO] Progress: 1400/2605 processed
199
+ 2025-12-01 13:11:47 [INFO] Progress: 1500/2605 processed
200
+ 2025-12-01 13:11:47 [INFO] Progress: 1600/2605 processed
201
+ 2025-12-01 13:11:47 [INFO] Progress: 1700/2605 processed
202
+ 2025-12-01 13:11:47 [INFO] Progress: 1800/2605 processed
203
+ 2025-12-01 13:11:47 [INFO] Progress: 1900/2605 processed
204
+ 2025-12-01 13:11:47 [INFO] Progress: 2000/2605 processed
205
+ 2025-12-01 13:11:47 [INFO] Progress: 2100/2605 processed
206
+ 2025-12-01 13:11:47 [INFO] Progress: 2200/2605 processed
207
+ 2025-12-01 13:11:47 [INFO] Progress: 2300/2605 processed
208
+ 2025-12-01 13:11:47 [INFO] Progress: 2400/2605 processed
209
+ 2025-12-01 13:11:47 [INFO] Progress: 2500/2605 processed
210
+ 2025-12-01 13:11:47 [INFO] Progress: 2600/2605 processed
211
+ 2025-12-01 13:11:48 [INFO] Saved 2605 bills to data/known_bills_visualize.json
212
+ 2025-12-01 13:11:48 [INFO] Processing complete!
213
+ 2025-12-01 13:11:48 [INFO] Total bills processed: 2605
214
+ 2025-12-01 13:11:48 [INFO] Bills without text: 16
215
+ 2025-12-01 13:11:48 [INFO] Already had None categories: 16
216
+ 2025-12-01 13:11:48 [INFO] Newly marked as None: 0
217
+ 2025-12-01 13:11:48 [INFO] No-text bill marking process completed
218
+ 2025-12-01 13:16:12 [INFO] Starting no-text bill marking process
219
+ 2025-12-01 13:16:13 [ERROR] Error loading bills: Expecting ',' delimiter: line 70396 column 683331 (char 189968803)
220
+ 2025-12-01 13:16:13 [ERROR] No bills loaded. Exiting.
221
+ 2025-12-01 13:16:13 [INFO] No-text bill marking process completed
222
+ 2025-12-01 13:16:16 [INFO] Starting no-text bill marking process
223
+ 2025-12-01 13:16:17 [INFO] Loaded 2605 bills from data/known_bills_visualize.json
224
+ 2025-12-01 13:16:17 [INFO] Processing 2605 bills to mark no-text bills
225
+ 2025-12-01 13:16:17 [INFO] Progress: 100/2605 processed
226
+ 2025-12-01 13:16:17 [INFO] Progress: 200/2605 processed
227
+ 2025-12-01 13:16:17 [INFO] Progress: 300/2605 processed
228
+ 2025-12-01 13:16:17 [INFO] Progress: 400/2605 processed
229
+ 2025-12-01 13:16:17 [INFO] Progress: 500/2605 processed
230
+ 2025-12-01 13:16:17 [INFO] Progress: 600/2605 processed
231
+ 2025-12-01 13:16:17 [INFO] Progress: 700/2605 processed
232
+ 2025-12-01 13:16:17 [INFO] Progress: 800/2605 processed
233
+ 2025-12-01 13:16:17 [INFO] Progress: 900/2605 processed
234
+ 2025-12-01 13:16:17 [INFO] Progress: 1000/2605 processed
235
+ 2025-12-01 13:16:17 [INFO] Progress: 1100/2605 processed
236
+ 2025-12-01 13:16:17 [INFO] Progress: 1200/2605 processed
237
+ 2025-12-01 13:16:17 [INFO] Progress: 1300/2605 processed
238
+ 2025-12-01 13:16:17 [INFO] Progress: 1400/2605 processed
239
+ 2025-12-01 13:16:17 [INFO] Progress: 1500/2605 processed
240
+ 2025-12-01 13:16:17 [INFO] Progress: 1600/2605 processed
241
+ 2025-12-01 13:16:17 [INFO] Progress: 1700/2605 processed
242
+ 2025-12-01 13:16:17 [INFO] Progress: 1800/2605 processed
243
+ 2025-12-01 13:16:17 [INFO] Progress: 1900/2605 processed
244
+ 2025-12-01 13:16:17 [INFO] Progress: 2000/2605 processed
245
+ 2025-12-01 13:16:17 [INFO] Progress: 2100/2605 processed
246
+ 2025-12-01 13:16:17 [INFO] Progress: 2200/2605 processed
247
+ 2025-12-01 13:16:17 [INFO] Progress: 2300/2605 processed
248
+ 2025-12-01 13:16:17 [INFO] Progress: 2400/2605 processed
249
+ 2025-12-01 13:16:17 [INFO] Progress: 2500/2605 processed
250
+ 2025-12-01 13:16:17 [INFO] Progress: 2600/2605 processed
251
+ 2025-12-01 13:16:18 [INFO] Saved 2605 bills to data/known_bills_visualize.json
252
+ 2025-12-01 13:16:18 [INFO] Processing complete!
253
+ 2025-12-01 13:16:18 [INFO] Total bills processed: 2605
254
+ 2025-12-01 13:16:18 [INFO] Bills without text: 16
255
+ 2025-12-01 13:16:18 [INFO] Already had None categories: 16
256
+ 2025-12-01 13:16:18 [INFO] Newly marked as None: 0
257
+ 2025-12-01 13:16:18 [INFO] No-text bill marking process completed
258
+ 2025-12-03 11:02:34 [INFO] Starting no-text bill marking process
259
+ 2025-12-03 11:02:35 [INFO] Loaded 2608 bills from data/known_bills_visualize.json
260
+ 2025-12-03 11:02:35 [INFO] Processing 2608 bills to mark no-text bills
261
+ 2025-12-03 11:02:35 [INFO] Progress: 100/2608 processed
262
+ 2025-12-03 11:02:35 [INFO] Progress: 200/2608 processed
263
+ 2025-12-03 11:02:35 [INFO] Progress: 300/2608 processed
264
+ 2025-12-03 11:02:35 [INFO] Progress: 400/2608 processed
265
+ 2025-12-03 11:02:35 [INFO] Progress: 500/2608 processed
266
+ 2025-12-03 11:02:35 [INFO] Progress: 600/2608 processed
267
+ 2025-12-03 11:02:35 [INFO] Progress: 700/2608 processed
268
+ 2025-12-03 11:02:35 [INFO] Progress: 800/2608 processed
269
+ 2025-12-03 11:02:35 [INFO] Progress: 900/2608 processed
270
+ 2025-12-03 11:02:35 [INFO] Progress: 1000/2608 processed
271
+ 2025-12-03 11:02:35 [INFO] Progress: 1100/2608 processed
272
+ 2025-12-03 11:02:35 [INFO] Progress: 1200/2608 processed
273
+ 2025-12-03 11:02:35 [INFO] Progress: 1300/2608 processed
274
+ 2025-12-03 11:02:35 [INFO] Progress: 1400/2608 processed
275
+ 2025-12-03 11:02:35 [INFO] Progress: 1500/2608 processed
276
+ 2025-12-03 11:02:35 [INFO] Progress: 1600/2608 processed
277
+ 2025-12-03 11:02:35 [INFO] Progress: 1700/2608 processed
278
+ 2025-12-03 11:02:35 [INFO] Progress: 1800/2608 processed
279
+ 2025-12-03 11:02:35 [INFO] Progress: 1900/2608 processed
280
+ 2025-12-03 11:02:35 [INFO] Progress: 2000/2608 processed
281
+ 2025-12-03 11:02:35 [INFO] Progress: 2100/2608 processed
282
+ 2025-12-03 11:02:35 [INFO] Progress: 2200/2608 processed
283
+ 2025-12-03 11:02:35 [INFO] Progress: 2300/2608 processed
284
+ 2025-12-03 11:02:35 [INFO] Progress: 2400/2608 processed
285
+ 2025-12-03 11:02:35 [INFO] Progress: 2500/2608 processed
286
+ 2025-12-03 11:02:35 [INFO] Progress: 2600/2608 processed
287
+ 2025-12-03 11:02:36 [INFO] Saved 2608 bills to data/known_bills_visualize.json
288
+ 2025-12-03 11:02:36 [INFO] Processing complete!
289
+ 2025-12-03 11:02:36 [INFO] Total bills processed: 2608
290
+ 2025-12-03 11:02:36 [INFO] Bills without text: 16
291
+ 2025-12-03 11:02:36 [INFO] Already had None categories: 16
292
+ 2025-12-03 11:02:36 [INFO] Newly marked as None: 0
293
+ 2025-12-03 11:02:36 [INFO] No-text bill marking process completed
data/data_updating_scripts/logs/migrate_iapp_categories.log ADDED
The diff for this file is too large to render. See raw diff
 
data/data_updating_scripts/mark_no_text_bills.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to mark bills without text as having None IAPP categories.
4
+
5
+ This script reads known_bills_visualize.json, identifies bills without text,
6
+ and sets their IAPP categories to None. The file is modified in-place.
7
+ """
8
+
9
+ import json
10
+ import os
11
+ import logging
12
+ from pathlib import Path
13
+ from typing import Dict, List
14
+ import sys
15
+
16
+ # Add the project root to the path
17
+ sys.path.append(str(Path(__file__).parent.parent))
18
+
19
+ # Create logs directory if it doesn't exist
20
+ os.makedirs("data_updating_scripts/logs", exist_ok=True)
21
+
22
+ # Set up logging
23
+ logging.basicConfig(
24
+ level=logging.INFO,
25
+ format='%(asctime)s [%(levelname)s] %(message)s',
26
+ datefmt='%Y-%m-%d %H:%M:%S',
27
+ handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/mark_no_text_bills.log")]
28
+ )
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
class NoTextBillMarker:
    """Marks bills without usable text as having None IAPP categories.

    Reads data/known_bills_visualize.json in place: any bill whose 'text'
    field is missing or too short gets its 'iapp_categories' set to None,
    then the whole list is written back to the same file.
    """

    def __init__(self):
        # The same file is both input and output (in-place update).
        self.visualize_file = "data/known_bills_visualize.json"

    def load_bills(self) -> List[Dict]:
        """Load bills from known_bills_visualize.json.

        Returns an empty list (after logging the error) on any failure,
        e.g. a missing or corrupted JSON file, so callers can bail out.
        """
        try:
            with open(self.visualize_file, 'r', encoding='utf-8') as f:
                bills = json.load(f)
            logger.info(f"Loaded {len(bills)} bills from {self.visualize_file}")
            return bills
        except Exception as e:
            logger.error(f"Error loading bills: {e}")
            return []

    def save_bills(self, bills: List[Dict]) -> None:
        """Atomically save bills back to known_bills_visualize.json.

        The previous implementation wrote directly over the target file; an
        interrupted run could leave truncated JSON behind (a corrupted file
        was observed in the logs: "Expecting ',' delimiter" on a later load).
        Writing to a sibling temp file and os.replace()-ing it into place
        makes the save atomic on POSIX and Windows.
        """
        tmp_path = self.visualize_file + ".tmp"
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(bills, f, indent=2, ensure_ascii=False)
            os.replace(tmp_path, self.visualize_file)
            logger.info(f"Saved {len(bills)} bills to {self.visualize_file}")
        except Exception as e:
            logger.error(f"Error saving bills: {e}")
            try:
                os.remove(tmp_path)  # best-effort cleanup of the partial file
            except OSError:
                pass

    def has_text(self, bill: Dict) -> bool:
        """Return True if the bill carries a usable text body (>50 stripped chars)."""
        text = bill.get('text')
        return isinstance(text, str) and len(text.strip()) > 50

    def mark_no_text_bills(self) -> None:
        """Mark bills without text as having None IAPP categories."""
        bills = self.load_bills()
        if not bills:
            logger.error("No bills loaded. Exiting.")
            return

        # Counters for the summary logged at the end.
        total_bills = len(bills)
        no_text_count = 0
        already_none_count = 0

        logger.info(f"Processing {total_bills} bills to mark no-text bills")

        for i, bill in enumerate(bills, 1):
            bill_key = f"{bill.get('state', 'Unknown')}_{bill.get('bill_number', 'Unknown')}"

            if not self.has_text(bill):
                no_text_count += 1

                # Only overwrite categories that are not already None.
                current_iapp = bill.get('iapp_categories')
                if current_iapp is None:
                    already_none_count += 1
                    logger.debug(f"Bill {bill_key} already has None IAPP categories")
                else:
                    bill['iapp_categories'] = None
                    logger.info(f"Marked bill {bill_key} as having None IAPP categories (no text)")

            # Log progress every 100 bills
            if i % 100 == 0:
                logger.info(f"Progress: {i}/{total_bills} processed")

        # Save the modified bills
        self.save_bills(bills)

        # Summary
        logger.info(f"Processing complete!")
        logger.info(f"Total bills processed: {total_bills}")
        logger.info(f"Bills without text: {no_text_count}")
        logger.info(f"Already had None categories: {already_none_count}")
        logger.info(f"Newly marked as None: {no_text_count - already_none_count}")
107
+
108
+
109
def main():
    """Entry point: run the no-text bill marker end to end."""
    logger.info("Starting no-text bill marking process")
    NoTextBillMarker().mark_no_text_bills()
    logger.info("No-text bill marking process completed")
117
+
118
+
119
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
data/data_updating_scripts/migrate_iapp_categories.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to migrate IAPP categories for bills with missing or invalid subcategories.
4
+
5
+ This script reads bills from known_bills_fixed.json, analyzes bills with missing IAPP categories
6
+ using OpenAI API, and saves the results to known_bills_visualize.json.
7
+ """
8
+
9
+ import json
10
+ import logging
11
+ import time
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional
14
+ import sys
15
+ import os
16
+ import re
17
+ import hashlib
18
+ import argparse
19
+
20
+ # Add the project root to the path
21
+ sys.path.append(str(Path(__file__).parent.parent))
22
+
23
+ from config import ConfigManager
24
+ from langchain_openai import ChatOpenAI
25
+ from langchain_core.prompts import ChatPromptTemplate
26
+ from langchain_core.output_parsers import StrOutputParser
27
+ from langchain_core.documents import Document
28
+
29
+ # Paths
30
+ INPUT_FILE = Path("data/known_bills_fixed.json")
31
+ VIS_FILE = Path("data/known_bills_visualize.json")
32
+ CACHE_FILE = Path("data/iapp_categories_cache.json")
33
+
34
+ # Create logs directory if it doesn't exist
35
+ os.makedirs("data_updating_scripts/logs", exist_ok=True)
36
+
37
+ # Configure logging
38
+ logging.basicConfig(
39
+ level=logging.INFO,
40
+ format="%(asctime)s [%(levelname)s] %(message)s",
41
+ handlers=[logging.StreamHandler(), logging.FileHandler("data_updating_scripts/logs/migrate_iapp_categories.log")]
42
+ )
43
+ logger = logging.getLogger(__name__)
44
+
45
# Exact subcategory lists for validation: an LLM answer is only accepted if
# every subcategory it names appears verbatim under its parent category here
# (see is_valid_categories).
EXACT_SUBCATEGORIES = {
    "Governance": ["Program and documentation", "Assessments", "Training", "Responsible individual"],
    "Transparency": ["General notice", "Labeling/notification", "Explanation/incident reporting", "Developer documentation"],
    "Assurance": ["Registration", "Third-party review"],
    "Individual Rights": ["Opt out/appeal", "Nondiscrimination"]
}

# Fallback categories for failed API calls: a conservative one-subcategory-
# per-category default applied when the LLM response cannot be parsed.
FALLBACK_CATEGORIES = {
    "Governance": ["Program and documentation"],
    "Transparency": ["General notice"],
    "Assurance": ["Registration"],
    "Individual Rights": ["Opt out/appeal"]
}
60
+
61
def bill_key(b: Dict) -> str:
    """Build a stable 'STATE_BILLNUMBER' identifier for a bill record."""
    state = b.get('state', 'Unknown')
    number = b.get('bill_number', 'Unknown')
    return f"{state}_{number}"
63
+
64
def sha256(s: Optional[str]) -> Optional[str]:
    """Return the hex SHA-256 digest of a non-empty string, else None."""
    if isinstance(s, str) and s.strip():
        return hashlib.sha256(s.encode("utf-8")).hexdigest()
    return None
68
+
69
def load_json(path: Path, default):
    """Parse JSON from *path*; return *default* on any failure (missing file, bad JSON)."""
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        return default
75
+
76
def save_json(path: Path, data):
    """Atomically serialize *data* as pretty-printed JSON to *path*.

    Creates parent directories as needed, writes to a sibling ``.tmp`` file,
    then os.replace()s it into place. The previous direct overwrite could
    leave a truncated file if interrupted (JSON corruption of this dataset —
    "Expecting ',' delimiter" — was observed in the processing logs).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    os.replace(tmp, path)
80
+
81
+ class IAPPCategoriesMigrator:
82
+ """Migrates IAPP categories for bills with missing or invalid subcategories."""
83
+
84
    def __init__(self, force: bool = False, rebuild_cache: bool = False, sleep_sec: float = 0.0):
        """Initialize the migrator with configuration.

        Args:
            force: recompute categories even when a cached/previous value exists.
            rebuild_cache: start from an empty cache instead of loading CACHE_FILE.
            sleep_sec: per-call delay (clamped to >= 0) between API requests.

        Raises:
            ValueError: if OPENAI_API_KEY is not configured.
        """
        self.config = ConfigManager()
        if not self.config.OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY not found in environment variables")

        self.force = force
        self.sleep_sec = max(0.0, sleep_sec)
        # Cache: bill_id -> previously computed categories plus the hashes used
        # by cached_match() to decide whether the entry is still valid.
        self.cache: Dict[str, Dict] = {} if rebuild_cache else load_json(CACHE_FILE, {})

        # Low temperature for deterministic-ish categorization output.
        self.llm = ChatOpenAI(
            model=self.config.OPENAI_LLM_MODEL,
            temperature=0.1,
            max_tokens=1000
        )

        # prompt -> LLM -> plain-string output, parsed later by parse_llm().
        self.iapp_prompt = self._create_prompt()
        self.chain = self.iapp_prompt | self.llm | StrOutputParser()
        logger.info(
            f"Initialized IAPPCategoriesMigrator | model={self.config.OPENAI_LLM_MODEL} | "
            f"force={self.force} | rebuild_cache={rebuild_cache}"
        )
107
+
108
    def _create_prompt(self):
        """Create the IAPP analysis prompt with relaxed subcategory matching.

        Returns a ChatPromptTemplate with one template variable, {context},
        filled with the bill text at invocation time. Doubled braces ({{ }})
        escape the literal JSON braces in the example so LangChain does not
        treat them as template variables.
        """
        prompt_text = """
        Analyze the following AI-related bill content using the IAPP (International Association of Privacy Professionals) framework for AI governance categorization.

        Your response must be ONLY a JSON object in this exact format with nothing else before or after:
        {{"iapp_categories": {{"Governance": ["subcategory1", "subcategory2"], "Transparency": [], "Assurance": [], "Individual Rights": []}}}}

        Use these four main categories and their EXACT subcategories (no variations allowed):

        **Governance:**
        - Program and documentation
        - Assessments
        - Training
        - Responsible individual

        **Transparency:**
        - General notice
        - Labeling/notification
        - Explanation/incident reporting
        - Developer documentation

        **Assurance:**
        - Registration
        - Third-party review

        **Individual Rights:**
        - Opt out/appeal
        - Nondiscrimination

        Guidelines for categorization:
        - Select ALL applicable subcategories that the bill directly addresses or substantially discusses
        - If a category has no applicable subcategories, try to label it anyway based on surrounding context
        - Be specific – prioritize subcategories that are clearly supported, but use judgment if AI or governance themes are present
        - Focus on what the bill addresses or emphasizes, even if it doesn’t explicitly mandate requirements
        - If the bill discusses AI, automation, decision systems, digital governance, or national technology strategy, categorize it as best as possible
        - Avoid returning no categories when possible assuming that the bill is AI governance related, unless it truly could not be categorized into any of the four categories.

        Bill content to analyze: {context}
        """
        return ChatPromptTemplate.from_messages([
            ("system", prompt_text),
            ("human", "Analyze this bill for IAPP categories:")
        ])
152
+
153
+ def docs_from_bill(self, bill: Dict) -> List[Document]:
154
+ txt = bill.get("text", "")
155
+ if not isinstance(txt, str) or not txt.strip():
156
+ return []
157
+ return [
158
+ Document(
159
+ page_content=txt,
160
+ metadata={
161
+ "bill_key": bill_key(bill),
162
+ "state": bill.get("state", "Unknown"),
163
+ "bill_number": bill.get("bill_number", "Unknown"),
164
+ "title": bill.get("title", "No title"),
165
+ },
166
+ )
167
+ ]
168
+
169
+ def is_valid_categories(self, iapp: Dict) -> bool:
170
+ if not isinstance(iapp, dict):
171
+ return False
172
+ for cat in ("Governance", "Transparency", "Assurance", "Individual Rights"):
173
+ if cat not in iapp or not isinstance(iapp[cat], list):
174
+ return False
175
+ for sub in iapp[cat]:
176
+ if sub not in EXACT_SUBCATEGORIES[cat]:
177
+ return False
178
+ return True
179
+
180
+ def parse_llm(self, response: str) -> Optional[Dict]:
181
+ m = re.search(r"\{.*\}", response, re.DOTALL)
182
+ if not m:
183
+ return None
184
+ try:
185
+ obj = json.loads(m.group(0))
186
+ return obj.get("iapp_categories")
187
+ except Exception:
188
+ return None
189
+
190
+ def cached_match(self, b: Dict) -> Optional[Dict]:
191
+ bid = str(b.get("bill_id"))
192
+ ch = b.get("change_hash")
193
+ txt_hash = sha256(b.get("text"))
194
+ c = self.cache.get(bid)
195
+ if not c:
196
+ return None
197
+ if (ch and c.get("change_hash") == ch) or (txt_hash and c.get("text_sha256") == txt_hash):
198
+ return c.get("iapp_categories")
199
+ return None
200
+
201
+ def remember(self, b: Dict, iapp: Dict):
202
+ bid = str(b.get("bill_id"))
203
+ self.cache[bid] = {
204
+ "bill_id": bid,
205
+ "change_hash": b.get("change_hash"),
206
+ "text_sha256": sha256(b.get("text")),
207
+ "iapp_categories": iapp,
208
+ "updated_at": b.get("lastUpdatedAt"),
209
+ "state": b.get("state"),
210
+ "bill_number": b.get("bill_number"),
211
+ "title": b.get("title"),
212
+ }
213
+
214
    def run(self):
        """Migrate IAPP categories for every source bill into VIS_FILE.

        For each bill, in priority order: keep a previous result for bills
        with no/short text, recompute everything under --force, reuse a
        cache hit, reuse a matching entry already present in the visualize
        file, and only then call the LLM. Progress is checkpointed to disk
        every 10 computed bills so an interrupted run loses little work.
        """
        if not INPUT_FILE.exists():
            raise FileNotFoundError(f"Missing {INPUT_FILE}")
        src_bills: List[Dict] = load_json(INPUT_FILE, [])
        vis_bills: List[Dict] = load_json(VIS_FILE, [])

        # Previous results, keyed the same way source bills are keyed.
        vis_map = {bill_key(b): b for b in vis_bills}

        total = len(src_bills)
        reused_cache = 0
        reused_vis = 0
        computed = 0
        skipped_no_text = 0
        errors = 0

        out_bills: List[Dict] = []

        logger.info(f"Loaded {total} source bills; visualize has {len(vis_bills)} existing entries; cache size={len(self.cache)}")

        for i, b in enumerate(src_bills, 1):
            key = bill_key(b)
            txt = b.get("text", "")

            # Copy so the source record is never mutated in place.
            out_rec = b.copy()

            # Bills with no/near-empty text cannot be classified; carry the
            # previous categorization forward if one exists, else None.
            if not isinstance(txt, str) or len(txt.strip()) <= 50:
                prev = vis_map.get(key)
                if prev and "iapp_categories" in prev:
                    out_rec["iapp_categories"] = prev["iapp_categories"]
                else:
                    out_rec["iapp_categories"] = None
                out_bills.append(out_rec)
                skipped_no_text += 1
                if i % 50 == 0:
                    logger.info(f"[{i}/{total}] progress...")
                continue

            # --force: skip all reuse paths and always call the LLM.
            if self.force:
                iapp = self._compute_categories(b)
                if iapp is None:
                    iapp = FALLBACK_CATEGORIES
                    errors += 1
                else:
                    computed += 1
                out_rec["iapp_categories"] = iapp
                self.remember(b, iapp)
                out_bills.append(out_rec)
                if self.sleep_sec:
                    time.sleep(self.sleep_sec)
                # Periodic checkpoint of both output and cache.
                if i % 10 == 0:
                    save_json(VIS_FILE, out_bills)
                    save_json(CACHE_FILE, self.cache)
                continue

            # Reuse path 1: explicit cache hit (change_hash / text hash match).
            cached = self.cached_match(b)
            if cached and self.is_valid_categories(cached):
                out_rec["iapp_categories"] = cached
                out_bills.append(out_rec)
                reused_cache += 1
                if i % 50 == 0:
                    logger.info(f"[{i}/{total}] progress...")
                continue

            # Reuse path 2: the visualize file already has categories for this
            # bill AND the cache proves the underlying text hasn't changed.
            prev = vis_map.get(key)
            if prev and "iapp_categories" in prev:
                prev_bid = str(prev.get("bill_id"))
                prev_cache = self.cache.get(prev_bid)
                if prev_cache:
                    same = False
                    if b.get("change_hash") and prev_cache.get("change_hash") == b.get("change_hash"):
                        same = True
                    elif sha256(b.get("text")) == prev_cache.get("text_sha256"):
                        same = True
                    if same and self.is_valid_categories(prev.get("iapp_categories", {})):
                        out_rec["iapp_categories"] = prev["iapp_categories"]
                        out_bills.append(out_rec)
                        reused_vis += 1
                        if i % 50 == 0:
                            logger.info(f"[{i}/{total}] progress...")
                        continue

            # Fallback: compute fresh categories via the LLM.
            iapp = self._compute_categories(b)
            if iapp is None:
                iapp = FALLBACK_CATEGORIES
                errors += 1
            else:
                computed += 1
            out_rec["iapp_categories"] = iapp
            self.remember(b, iapp)
            out_bills.append(out_rec)

            if self.sleep_sec:
                time.sleep(self.sleep_sec)
            if i % 10 == 0:
                save_json(VIS_FILE, out_bills)
                save_json(CACHE_FILE, self.cache)

        # Final flush of results and cache.
        save_json(VIS_FILE, out_bills)
        save_json(CACHE_FILE, self.cache)

        logger.info("IAPP migration complete.")
        logger.info(f"Total: {total} | reused_cache: {reused_cache} | reused_visualize: {reused_vis} | computed: {computed} | no_text: {skipped_no_text} | errors: {errors}")
        print("✅ IAPP categories migration completed successfully!")
        print(f"   Total: {total}")
        print(f"   Reused (cache): {reused_cache}")
        print(f"   Reused (visualize match): {reused_vis}")
        print(f"   Newly computed: {computed}")
        print(f"   No text: {skipped_no_text}")
        print(f"   Fallback/errors: {errors}")
        print(f"   Results: {VIS_FILE}")
        print(f"   Cache: {CACHE_FILE}")
326
+ def _compute_categories(self, bill: Dict) -> Optional[Dict]:
327
+ try:
328
+ docs = self.docs_from_bill(bill)
329
+ if not docs:
330
+ return None
331
+ resp = self.chain.invoke({"context": docs})
332
+ parsed = self.parse_llm(resp)
333
+ if parsed and self.is_valid_categories(parsed):
334
+ return parsed
335
+ resp = self.chain.invoke({"context": docs})
336
+ parsed = self.parse_llm(resp)
337
+ if parsed and self.is_valid_categories(parsed):
338
+ return parsed
339
+ txt = bill.get("text", "").lower()
340
+ if "ai" in txt or "artificial intelligence" in txt:
341
+ return FALLBACK_CATEGORIES
342
+ return {"Governance": [], "Transparency": [], "Assurance": [], "Individual Rights": []}
343
+ except Exception as e:
344
+ logger.exception(f"LLM error for {bill_key(bill)}: {e}")
345
+ return None
346
+
347
def main():
    """CLI entry point: parse the migration flags and kick off the run."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--force",
        action="store_true",
        help="Recompute categories for all bills with text",
    )
    parser.add_argument(
        "--rebuild-cache",
        action="store_true",
        help="Ignore existing cache file and rebuild it",
    )
    parser.add_argument(
        "--sleep-sec",
        type=float,
        default=0.0,
        help="Sleep seconds between LLM calls (rate limiting)",
    )
    args = parser.parse_args()

    IAPPCategoriesMigrator(
        force=args.force,
        rebuild_cache=args.rebuild_cache,
        sleep_sec=args.sleep_sec,
    ).run()
356
+
357
# Script entry point — avoids running the migration when imported as a module.
if __name__ == "__main__":
    main()
data/generate_password_hash.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Password Hash Generator for streamlit-authenticator
4
+
5
+ Usage:
6
+ python generate_password_hash.py
7
+
8
+ This will prompt you for a password and generate the bcrypt hash.
9
+ """
10
+
11
+ import streamlit_authenticator as stauth
12
+
13
def generate_hash():
    """Interactively hash a single password and print ready-to-paste config.

    Prompts on stdin, bcrypt-hashes via streamlit-authenticator, then prints
    both a config.yaml snippet and a secrets.toml snippet with the hash.
    """
    print("=" * 50)
    print("Password Hash Generator")
    print("=" * 50)
    print()

    # Get password from user
    password = input("Enter password to hash: ").strip()

    if not password:
        print("Password cannot be empty!")
        return

    # Generate hash
    # NOTE(review): Hasher([...]).generate() is the pre-0.4 streamlit-authenticator
    # API; newer releases replaced it — confirm against the pinned version.
    print("\nGenerating hash...")
    hashed_passwords = stauth.Hasher([password]).generate()
    hash_value = hashed_passwords[0]

    print("\nHash generated successfully!")
    print("=" * 50)
    print(f"\nYour hashed password:\n{hash_value}")
    print("=" * 50)

    # Show example usage
    print("\nAdd to config.yaml:")
    print("-" * 50)
    print(f"""
credentials:
  usernames:
    username_here:
      email: user@example.com
      name: User Name
      password: {hash_value}
""")

    print("\nOr add to secrets.toml:")
    print("-" * 50)
    print(f"""
[auth.credentials.usernames.username_here]
email = "user@example.com"
name = "User Name"
password = "{hash_value}"
""")

    print("\nDone! Copy the hash above to your config file.")
58
+
59
def generate_multiple():
    """Interactively collect several users and print hashed credentials.

    Loops on stdin until an empty username is entered, bcrypt-hashes all
    passwords in one Hasher call, then prints both config.yaml and
    secrets.toml blocks for every collected user.
    """
    print("=" * 50)
    print("Multiple User Password Hash Generator")
    print("=" * 50)
    print()

    # username -> {password, email, name}; filled from interactive prompts.
    users = {}

    while True:
        username = input("\nEnter username (or press Enter to finish): ").strip()
        if not username:
            break

        password = input(f"Enter password for {username}: ").strip()
        if not password:
            print("Password cannot be empty! Skipping user.")
            continue

        email = input(f"Enter email for {username}: ").strip()
        name = input(f"Enter full name for {username}: ").strip()

        # Fall back to sensible defaults when email/name are left blank.
        users[username] = {
            'password': password,
            'email': email or f"{username}@example.com",
            'name': name or username.title()
        }

    if not users:
        print("\n No users to process!")
        return

    # Generate all hashes in one pass; order matches users insertion order.
    # NOTE(review): Hasher(...).generate() is the pre-0.4 streamlit-authenticator
    # API — confirm against the pinned version.
    print("\nGenerating hashes...")
    passwords = [data['password'] for data in users.values()]
    hashed_passwords = stauth.Hasher(passwords).generate()

    # Update users with hashed passwords (dicts preserve insertion order).
    for i, username in enumerate(users.keys()):
        users[username]['hashed'] = hashed_passwords[i]

    # Display results
    print("\nHashes generated successfully!")
    print("=" * 50)

    print("\nconfig.yaml format:")
    print("-" * 50)
    print("credentials:")
    print("  usernames:")
    for username, data in users.items():
        print(f"    {username}:")
        print(f"      email: {data['email']}")
        print(f"      name: {data['name']}")
        print(f"      password: {data['hashed']}")

    print("\nsecrets.toml format:")
    print("-" * 50)
    for username, data in users.items():
        print(f"[auth.credentials.usernames.{username}]")
        print(f'email = "{data["email"]}"')
        print(f'name = "{data["name"]}"')
        print(f'password = "{data["hashed"]}"')
        print()

    print("Done! Copy the configuration above to your config file.")
123
+
124
# Interactive menu: choose between single-hash and multi-user modes.
if __name__ == "__main__":
    print("\nChoose an option:")
    print("1. Generate single password hash")
    print("2. Generate multiple user hashes")
    choice = input("\nEnter choice (1 or 2): ").strip()

    if choice == "1":
        generate_hash()
    elif choice == "2":
        generate_multiple()
    else:
        print("Invalid choice!")
data/huggingface_upload.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Dataset Upload Module
3
+ - Tests HF connection
4
+ - Uploads known_bills_visualize.json (legacy function)
5
+ - Uploads ALL core data JSONs (new function) to HuggingFace Datasets Hub
6
+ Works with the Admin panel HuggingFace tab
7
+ """
8
+
9
+ from huggingface_hub import HfApi, create_repo
10
+ import streamlit as st
11
+ import os
12
+ import json
13
+ from pathlib import Path
14
+ from typing import Dict, List, Tuple, Optional
15
+
16
+
17
# Core data files to sync: local repo path -> filename inside the HF dataset.
# upload_all_to_huggingface() iterates this mapping and skips missing entries.
FILES_TO_UPLOAD = {
    "data/known_bills_visualize.json": "known_bills_visualize.json",
    "data/bill_summaries.json": "bill_summaries.json",
    "data/bill_suggested_questions.json": "bill_suggested_questions.json",
    "data/bill_reports.json": "bill_reports.json",
    "data/bill_cache.json": "bill_cache.json",
    "data/known_bills.json": "known_bills.json",
    "data/known_bills_fixed.json": "known_bills_fixed.json",
}
26
+
27
+
28
+
29
def _get_hf_token_and_repo() -> Tuple[str, str]:
    """Resolve the HuggingFace token and dataset repo id.

    Resolution order:
      1. Streamlit secrets (Admin UI context)
      2. Environment variables (CLI scripts like update_data.py):
         HUGGINGFACE_HUB_TOKEN and HF_REPO_ID

    Raises:
        KeyError: when neither source provides both values.
    """
    token: Optional[str] = None
    repo_id: Optional[str] = None

    # Streamlit secrets take precedence when present; any secrets-related
    # error just means "not configured here" and we fall through.
    try:
        token = st.secrets["huggingface"]["token"]
        repo_id = st.secrets["huggingface"]["dataset_repo"]
    except Exception:
        pass

    # Environment variables back-fill whatever secrets didn't provide.
    token = token or os.getenv("HUGGINGFACE_HUB_TOKEN")
    repo_id = repo_id or os.getenv("HF_REPO_ID")

    if not (token and repo_id):
        raise KeyError(
            "HuggingFace configuration missing. "
            "Provide either Streamlit secrets "
            "[huggingface.token] and [huggingface.dataset_repo] "
            "or environment variables HUGGINGFACE_HUB_TOKEN and HF_REPO_ID."
        )

    return token, repo_id
62
+
63
+
64
+
65
def test_hf_connection() -> Tuple[bool, str]:
    """Check that the configured HF token can authenticate.

    Returns:
        (success, message) — message names the authenticated user on
        success, or describes the configuration/connection failure.
    """
    try:
        token, _ = _get_hf_token_and_repo()
        who = HfApi().whoami(token=token)
        # whoami payloads vary by account type; try the usual fields in order.
        username = who.get("name") or who.get("fullname") or who.get("id") or "User"
    except KeyError:
        return False, "HuggingFace token or dataset_repo not found in secrets"
    except Exception as e:
        return False, f"Connection failed: {str(e)}"
    return True, f"Connected as: {username}"
82
+
83
+
84
def get_dataset_url(filename: str = "known_bills_visualize.json") -> Optional[str]:
    """
    Get the public URL of a file inside the dataset.

    Args:
        filename: Name of the file in the HF dataset repo.

    Returns:
        str | None: URL to the dataset file, or None if config missing
    """
    # Resolve the repo through the shared helper so env-var configuration
    # (CLI runs) works too, and so non-KeyError secrets failures outside a
    # Streamlit context can't escape this function.
    try:
        _, repo = _get_hf_token_and_repo()
    except KeyError:
        return None
    # BUG FIX: the URL previously hardcoded "(unknown)" instead of
    # interpolating the requested filename.
    return f"https://huggingface.co/datasets/{repo}/resolve/main/{filename}"
99
+
100
+
101
+ def _find_and_validate_json(possible_paths: List[Path]) -> Path:
102
+ """
103
+ Given a list of possible paths, return the first that exists,
104
+ and validate that it is valid JSON.
105
+ """
106
+ file_path = None
107
+ for path in possible_paths:
108
+ if path.exists():
109
+ file_path = path
110
+ break
111
+
112
+ if file_path is None:
113
+ raise FileNotFoundError(
114
+ "File not found.\n"
115
+ "Checked locations:\n" + "\n".join(f" - {p}" for p in possible_paths)
116
+ )
117
+
118
+ try:
119
+ with open(file_path, "r", encoding="utf-8") as f:
120
+ data = json.load(f)
121
+ if not isinstance(data, (dict, list)):
122
+ raise ValueError("JSON file must contain a dict or list")
123
+ except json.JSONDecodeError as e:
124
+ raise ValueError(f"Invalid JSON file: {str(e)}")
125
+
126
+ return file_path
127
+
128
+
129
def _ensure_dataset_exists(api: HfApi, repo_id: str, token: str) -> None:
    """Create the dataset repo if it does not already exist.

    Best-effort: failures are deliberately swallowed (exist_ok semantics);
    a real permission/network problem will surface on the actual upload.
    """
    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            exist_ok=True,
            private=False,
        )
    except Exception:
        pass
141
+
142
+
143
def upload_to_huggingface() -> str:
    """
    Legacy function: Upload ONLY known_bills_visualize.json to HuggingFace Datasets Hub.
    Used by existing Admin panel code. New code should prefer upload_all_to_huggingface().

    Returns:
        str: Public URL to the uploaded file

    Raises:
        FileNotFoundError: If JSON file doesn't exist
        Exception: If upload fails
    """
    try:
        token, repo_id = _get_hf_token_and_repo()
        api = HfApi()

        _ensure_dataset_exists(api, repo_id, token)

        # Accept the file either under data/ or next to the script.
        candidates = [
            Path("data/known_bills_visualize.json"),
            Path("known_bills_visualize.json"),
        ]
        file_path = _find_and_validate_json(candidates)

        size_mb = os.path.getsize(file_path) / (1024 * 1024)

        api.upload_file(
            path_or_fileobj=str(file_path),
            path_in_repo="known_bills_visualize.json",
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message=f"Update AI legislation data ({size_mb:.2f}MB)",
        )

        return get_dataset_url("known_bills_visualize.json")

    except FileNotFoundError:
        # Propagate as-is so the Admin panel can show a precise message.
        raise
    except KeyError as e:
        raise Exception(f"Missing configuration in secrets.toml: {e}")
    except Exception as e:
        raise Exception(f"Upload failed: {str(e)}")
187
+
188
+
189
def upload_all_to_huggingface() -> Dict[str, str]:
    """
    NEW: Upload ALL core JSON files to HuggingFace Datasets Hub.

    Returns:
        dict: mapping from dataset filename -> public URL (for successfully uploaded files)
    """
    token, repo_id = _get_hf_token_and_repo()
    api = HfApi()
    _ensure_dataset_exists(api, repo_id, token)

    uploaded_urls: Dict[str, str] = {}

    for local_path, dest_name in FILES_TO_UPLOAD.items():
        # Missing or invalid files are reported and skipped, never fatal.
        try:
            file_path = _find_and_validate_json([Path(local_path), Path(dest_name)])
        except FileNotFoundError:
            msg = f"Skipping missing file: {local_path}"
            print(msg)
            st.write(msg)
            continue
        except ValueError as e:
            msg = f"Skipping invalid JSON in {local_path}: {e}"
            print(msg)
            st.write(msg)
            continue

        size_mb = os.path.getsize(file_path) / (1024 * 1024)

        print(f"Uploading {file_path} → {repo_id}/{dest_name} ...")
        api.upload_file(
            path_or_fileobj=str(file_path),
            path_in_repo=dest_name,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message=f"Update {dest_name} ({size_mb:.2f}MB)",
        )

        url = get_dataset_url(dest_name)
        if url:
            uploaded_urls[dest_name] = url

    return uploaded_urls
236
+
237
+
238
# CLI smoke test: verify credentials, then push every core data file.
if __name__ == "__main__":
    print("Testing HuggingFace connection...")
    success, msg = test_hf_connection()
    print(msg)

    if success:
        print("\nAttempting upload of ALL files...")
        try:
            urls = upload_all_to_huggingface()
            print("\nUpload successful!")
            for name, url in urls.items():
                print(f"- {name}: {url}")
        except Exception as e:
            print(f"\nUpload failed: {e}")
data/pages/Admin.py ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import streamlit_authenticator as stauth
3
+ from pathlib import Path
4
+ import sys
5
+ import pandas as pd
6
+ import subprocess
7
+ from datetime import datetime
8
+ import os
9
+ from huggingface_upload import upload_all_to_huggingface
10
+
11
# Allow imports of project modules (pages/ lives one level below the app root).
sys.path.insert(0, str(Path(__file__).parent.parent))
from user_management import HuggingFaceUserManager, load_user_config

st.set_page_config(page_title="Admin Panel", layout="wide", page_icon="🛠️")

# CSS: widen the content column and soften heading styling.
st.markdown("""
<style>
.main .block-container { padding-top: 2rem; max-width: 1200px; }
h2 { color: #e0e0e0 !important; font-weight: 400 !important; font-size: 1.5rem !important; }
</style>
""", unsafe_allow_html=True)

# CONFIG: credentials may come from HuggingFace (writable) or secrets.toml.
config, using_hf = load_user_config()

if config is None:
    st.error("Authentication configuration not found!")
    st.stop()

# AUTH SYSTEM: cookie-backed session via streamlit-authenticator.
authenticator = stauth.Authenticate(
    config['credentials'],
    config['cookie']['name'],
    config['cookie']['key'],
    config['cookie']['expiry_days']
)

try:
    authenticator.login('main')
except Exception as e:
    st.error(f"Login error: {e}")

# login() stores its outcome in session_state rather than returning it.
name = st.session_state.get("name")
authentication_status = st.session_state.get("authentication_status")
username = st.session_state.get("username")

# Failed login: stop rendering the page entirely.
if authentication_status == False:
    st.error('Username/password is incorrect')
    st.stop()

# No attempt yet (first load): prompt and stop.
if authentication_status == None:
    st.warning('Please enter your username and password')
    st.stop()
56
+
57
+ # AUTH VIEW
58
+ if authentication_status:
59
+
60
+ with st.sidebar:
61
+ st.markdown("---")
62
+ st.markdown(f"**Logged in as:** {name}")
63
+ st.markdown(f"**Username:** {username}")
64
+ authenticator.logout('Logout', 'sidebar')
65
+
66
+ ALLOWED_USERNAMES = set(config['credentials']['usernames'].keys())
67
+ if username not in ALLOWED_USERNAMES:
68
+ st.error(f"User '{username}' is not authorized.")
69
+ st.stop()
70
+
71
+ # HEADER
72
+ st.success(f"Welcome, {name}!")
73
+ st.markdown("---")
74
+ st.markdown("""
75
+ <div style='text-align: center; padding: 1rem 0 2rem 0;'>
76
+ <h1 style='color: #1f2937;'>Admin Panel</h1>
77
+ <p style='color: #6b7280;'>Cloud data sync controls</p>
78
+ </div>
79
+ """, unsafe_allow_html=True)
80
+ st.markdown("---")
81
+
82
+ # Tabs
83
+ tab1, tab2, tab3 = st.tabs(["Dashboard", "Data Pipeline", "User Management"])
84
+
85
+ # ------------------------------------------------------------------
86
+ # TAB 1 — Dashboard
87
+ # ------------------------------------------------------------------
88
+ with tab1:
89
+ st.subheader("Admin Dashboard")
90
+
91
+ users = config['credentials']['usernames']
92
+ admin_data = [
93
+ {
94
+ "Username": uname,
95
+ "Name": data.get("name"),
96
+ "Email": data.get("email"),
97
+ "Current User": "Admin" if uname == username else ""
98
+ }
99
+ for uname, data in users.items()
100
+ ]
101
+
102
+ st.dataframe(pd.DataFrame(admin_data), width="stretch", hide_index=True)
103
+
104
+ # ------------------------------------------------------------------
105
+ # TAB 2 — DATA PIPELINE
106
+ # ------------------------------------------------------------------
107
+ with tab2:
108
+ st.subheader("Data Pipeline")
109
+
110
+ if 'huggingface' not in st.secrets:
111
+ st.warning("Add HuggingFace credentials to `.streamlit/secrets.toml`")
112
+ st.stop()
113
+
114
+ from huggingface_upload import upload_to_huggingface, test_hf_connection
115
+
116
+ # --- Connection Test
117
+ st.markdown("Connection Status")
118
+ col1, col2 = st.columns(2)
119
+
120
+ with col1:
121
+ if st.button("Test HuggingFace Connection", width='stretch'):
122
+ ok, msg = test_hf_connection()
123
+ (st.success if ok else st.error)(msg)
124
+
125
+ with col2:
126
+ repo = st.secrets["huggingface"]["dataset_repo"]
127
+ st.info(f"Dataset: {repo}")
128
+
129
+ st.markdown("---")
130
+
131
+ # --- Full Data Update Section
132
+ st.subheader("Full Data Update")
133
+ st.info("Pull new data, process PDFs, generate embeddings, and upload to HuggingFace.")
134
+
135
+ # ➤ NEW UI CONTROL — Pull new data?
136
+ pull_new_data = st.radio(
137
+ "Pull new data from LegiScan?",
138
+ options=[
139
+ ("no", "No - Use existing local data"),
140
+ ("yes", "Yes - Pull fresh data (costs API quota)"),
141
+ ],
142
+ format_func=lambda x: x[1],
143
+ index=0,
144
+ key="pull_option"
145
+ )
146
+
147
+ # ➤ NEW UI CONTROL — overwrite known_bills.json?
148
+ overwrite_pdf = st.radio(
149
+ "After fixing PDF bills, overwrite data/known_bills.json?",
150
+ options=[
151
+ ("no", "No - keep original file"),
152
+ ("yes", "Yes - overwrite with cleaned PDF text"),
153
+ ],
154
+ format_func=lambda x: x[1],
155
+ index=0,
156
+ key="overwrite_option"
157
+ )
158
+
159
+ # Run full update
160
+ if st.button("Run Full Update & Upload", type="primary", width='stretch'):
161
+ status_container = st.container()
162
+
163
+ with status_container:
164
+ st.markdown("### Step 1: Running Data Pipeline")
165
+
166
+ with st.status("Processing data...", expanded=True) as status:
167
+
168
+ try:
169
+ update_cmd = [sys.executable, "update_data.py"]
170
+ legiscan_answer = "y\n" if pull_new_data[0] == "yes" else "n\n"
171
+
172
+ import os
173
+ from dotenv import load_dotenv
174
+ load_dotenv()
175
+
176
+ env = os.environ.copy()
177
+
178
+ # Pass OpenAI keys (existing logic)
179
+ openai_key = (
180
+ st.secrets.get("openai_api_key")
181
+ or st.secrets.get("OPENAI_API_KEY")
182
+ or env.get("openai_api_key")
183
+ or env.get("OPENAI_API_KEY")
184
+ )
185
+
186
+ if openai_key:
187
+ env["OPENAI_API_KEY"] = openai_key
188
+ env["openai_api_key"] = openai_key
189
+ st.success("OpenAI key found")
190
+ else:
191
+ st.warning("OpenAI API key missing!")
192
+
193
+ # ➤ NEW: Pass PDF overwrite decision into environment
194
+ env["FIX_PDF_OVERWRITE"] = (
195
+ "yes" if overwrite_pdf[0] == "yes" else "no"
196
+ )
197
+
198
+ log_file = Path("pipeline_last_run.log")
199
+
200
+ with log_file.open("w", encoding="utf-8") as lf:
201
+ proc = subprocess.Popen(
202
+ update_cmd,
203
+ stdout=subprocess.PIPE,
204
+ stderr=subprocess.STDOUT,
205
+ stdin=subprocess.PIPE,
206
+ text=True,
207
+ bufsize=1,
208
+ env=env,
209
+ )
210
+
211
+ # Send LegiScan yes/no
212
+ try:
213
+ proc.stdin.write(legiscan_answer)
214
+ proc.stdin.write("n\n") # continue-on-error prompt
215
+ proc.stdin.flush()
216
+ proc.stdin.close()
217
+ except:
218
+ pass
219
+
220
+ # Stream output
221
+ for line in proc.stdout:
222
+ line = line.rstrip("\n")
223
+ st.text(line)
224
+ lf.write(line + "\n")
225
+
226
+ rc = proc.wait()
227
+
228
+ if rc == 0:
229
+ status.update(label="Data pipeline completed", state="complete")
230
+ st.success("Processing successful!")
231
+
232
+ st.markdown("---")
233
+ st.markdown("### Step 2: Uploading to HuggingFace")
234
+
235
+ with st.spinner("Uploading..."):
236
+ url = upload_to_huggingface()
237
+ st.success("Uploaded to HuggingFace!")
238
+ st.code(url)
239
+ st.cache_data.clear()
240
+
241
+ else:
242
+ status.update(label="Pipeline failed", state="error")
243
+ st.error(f"Pipeline exited with code {rc}")
244
+
245
+ except Exception as e:
246
+ st.error(f"Pipeline error: {e}")
247
+ st.exception(e)
248
+
249
+ st.markdown("---")
250
+
251
+ with st.expander("Manual Upload Only"):
252
+ st.info("Use this only when skipping update_data.py")
253
+
254
+ if st.button("Upload Existing Data", width='stretch'):
255
+ with st.spinner("Uploading..."):
256
+ url = upload_to_huggingface()
257
+ st.success("Uploaded!")
258
+ st.code(url)
259
+
260
+
261
+ with tab3:
262
+ st.subheader("User Management")
263
+
264
+ if using_hf:
265
+ st.success("Using HuggingFace for persistent user storage")
266
+
267
+ try:
268
+ user_manager = HuggingFaceUserManager()
269
+
270
+ st.markdown("Add New Admin")
271
+
272
+ with st.form("add_user_form"):
273
+ col1, col2 = st.columns(2)
274
+ with col1:
275
+ new_username = st.text_input("Username", key="new_username")
276
+ new_email = st.text_input("Email", key="new_email")
277
+ with col2:
278
+ new_name = st.text_input("Full Name", key="new_name")
279
+ new_password = st.text_input("Password", type="password", key="new_password")
280
+
281
+ submit_add = st.form_submit_button("Add Admin", type="primary", width='stretch')
282
+
283
+ if submit_add:
284
+ if not all([new_username, new_email, new_name, new_password]):
285
+ st.error("Please fill in all fields")
286
+ else:
287
+ with st.spinner("Adding user..."):
288
+ import bcrypt
289
+ hashed_password = bcrypt.hashpw(new_password.encode(), bcrypt.gensalt()).decode()
290
+
291
+ success, message, commit_url = user_manager.add_user(
292
+ new_username, new_email, new_name, hashed_password
293
+ )
294
+
295
+ if success:
296
+ st.success(f"{message}")
297
+ st.cache_data.clear()
298
+ if commit_url:
299
+ with st.expander("View commit"):
300
+ st.code(commit_url)
301
+ st.rerun()
302
+ else:
303
+ st.error(f"{message}")
304
+
305
+ st.markdown("---")
306
+
307
+ st.markdown("Edit Admin")
308
+
309
+ users = config['credentials']['usernames']
310
+ usernames_list = list(users.keys())
311
+
312
+ with st.form("edit_user_form"):
313
+ user_to_edit = st.selectbox(
314
+ "Select user to edit",
315
+ options=usernames_list,
316
+ key="edit_username"
317
+ )
318
+
319
+ current_user = users.get(user_to_edit, {})
320
+
321
+ st.markdown("**Current Details:**")
322
+ st.text(f"Email: {current_user.get('email', 'N/A')}")
323
+ st.text(f"Name: {current_user.get('name', 'N/A')}")
324
+
325
+ st.markdown("**New Details** (leave blank to keep current):")
326
+
327
+ col1, col2 = st.columns(2)
328
+ with col1:
329
+ new_email = st.text_input("New Email", key="edit_email", placeholder="Leave blank to keep current")
330
+ new_password = st.text_input("New Password", type="password", key="edit_password", placeholder="Leave blank to keep current")
331
+ with col2:
332
+ new_name = st.text_input("New Name", key="edit_name", placeholder="Leave blank to keep current")
333
+
334
+ submit_edit = st.form_submit_button("Update Admin", type="primary", width='stretch')
335
+
336
+ if submit_edit:
337
+ if not any([new_email, new_name, new_password]):
338
+ st.warning("Please enter at least one field to update")
339
+ else:
340
+ with st.spinner("Updating user..."):
341
+ hashed_password = None
342
+ if new_password:
343
+ import bcrypt
344
+ hashed_password = bcrypt.hashpw(new_password.encode(), bcrypt.gensalt()).decode()
345
+
346
+ success, message, commit_url = user_manager.update_user(
347
+ user_to_edit,
348
+ new_email=new_email if new_email else None,
349
+ new_name=new_name if new_name else None,
350
+ new_password=hashed_password
351
+ )
352
+
353
+ if success:
354
+ st.success(f"{message}")
355
+ st.info("Refreshing user data...")
356
+ st.cache_data.clear()
357
+ if commit_url:
358
+ with st.expander("View commit"):
359
+ st.code(commit_url)
360
+ st.info("Please log out and log back in if you changed your own password")
361
+ st.rerun()
362
+ else:
363
+ st.error(f"{message}")
364
+
365
+ st.markdown("---")
366
+
367
+ # Remove user
368
+ st.markdown("Remove Admin")
369
+
370
+ users = config['credentials']['usernames']
371
+ usernames_list = list(users.keys())
372
+
373
+ if len(usernames_list) > 1:
374
+ with st.form("remove_user_form"):
375
+ user_to_remove = st.selectbox(
376
+ "Select user to remove",
377
+ options=usernames_list,
378
+ key="remove_username"
379
+ )
380
+
381
+ st.warning(f"This will permanently delete user: **{user_to_remove}**")
382
+
383
+ confirm_remove = st.checkbox("I confirm I want to remove this user")
384
+ submit_remove = st.form_submit_button("Remove Admin", type="secondary", width='stretch')
385
+
386
+ if submit_remove:
387
+ if not confirm_remove:
388
+ st.error("Please confirm the removal")
389
+ elif user_to_remove == username:
390
+ st.error("You cannot remove yourself!")
391
+ else:
392
+ with st.spinner("Removing user..."):
393
+ success, message, commit_url = user_manager.remove_user(user_to_remove)
394
+
395
+ if success:
396
+ st.success(f"✅ {message}")
397
+ st.cache_data.clear()
398
+ if commit_url:
399
+ with st.expander("View commit"):
400
+ st.code(commit_url)
401
+ st.rerun()
402
+ else:
403
+ st.error(f"{message}")
404
+ else:
405
+ st.info("ℹCannot remove the last admin user")
406
+
407
+ st.markdown("---")
408
+
409
+ # Show current users
410
+ st.markdown("Current Admins")
411
+ for uname, udata in users.items():
412
+ with st.expander(f"{udata.get('name', uname)} (@{uname})"):
413
+ st.write(f"**Email:** {udata.get('email', 'N/A')}")
414
+ st.write(f"**Username:** {uname}")
415
+ st.write(f"**Admin Status:**Admin")
416
+
417
+ if uname == username:
418
+ st.info("This is you!")
419
+
420
+ except Exception as e:
421
+ st.error(f"Error initializing user manager: {e}")
422
+ st.exception(e)
423
+
424
+ else:
425
+ st.warning("Using secrets.toml (read-only)")
426
+ st.info("For persistent user management, add HuggingFace credentials to secrets.toml")
427
+
428
+ with st.expander("How to add users manually"):
429
+ st.markdown("""
430
+ **To add new users when using secrets.toml:**
431
+
432
+ 1. **Generate password hash:**
433
+ ```bash
434
+ python generate_password_hash.py
435
+ ```
436
+
437
+ 2. **Add to secrets.toml:**
438
+ ```toml
439
+ [auth.credentials.usernames.newuser]
440
+ email = "user@vanderbilt.edu"
441
+ name = "New User"
442
+ password = "$2b$12$HASH_FROM_STEP_1"
443
+ ```
444
+
445
+ 3. **Update on HuggingFace Spaces** (re-upload secrets.toml)
446
+
447
+ All registered users automatically get admin access.
448
+ """)
449
+
450
+ st.markdown("---")
451
+
452
+ st.markdown("Current Admins")
453
+ if 'credentials' in config and 'usernames' in config['credentials']:
454
+ users = config['credentials']['usernames']
455
+ for uname, udata in users.items():
456
+ with st.expander(f"{udata.get('name', uname)} (@{uname})"):
457
+ st.write(f"**Email:** {udata.get('email', 'N/A')}")
458
+ st.write(f"**Username:** {uname}")
459
+ st.write(f"**Admin Status:Admin")
data/update_data.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Interactive driver for the data-update pipeline.

Optionally pulls fresh data from LegiScan, then runs each processing
script in order and finally uploads the resulting JSON datasets to
HuggingFace.
"""

import subprocess
import sys

# Ordered pipeline: the first two entries pull/repair raw data from
# LegiScan; everything after index 1 only post-processes existing data.
all_scripts = [
    "data_updating_scripts/get_data.py",
    "data_updating_scripts/fix_pdf_bills.py",
    "data_updating_scripts/known_bills_status.py",
    "data_updating_scripts/migrate_iapp_categories.py",
    "data_updating_scripts/mark_no_text_bills.py",
    "data_updating_scripts/generate_summaries.py",
    "data_updating_scripts/generate_suggested_questions.py",
    "data_updating_scripts/generate_reports.py",
    "data_updating_scripts/eu_vectorstore.py",
]


def select_scripts(response, scripts=None):
    """Map a yes/no answer to the list of pipeline scripts to run.

    Args:
        response: Raw user input; surrounding whitespace and letter case
            are ignored.
        scripts: Pipeline to choose from (defaults to ``all_scripts``).

    Returns:
        A new list: the full pipeline for "y"/"yes", the post-processing
        tail (everything after the two data-pull scripts) for "n"/"no",
        or ``None`` for unrecognized input.
    """
    scripts = all_scripts if scripts is None else scripts
    answer = response.strip().lower()
    if answer in ("y", "yes"):
        return list(scripts)
    if answer in ("n", "no"):
        # Skip get_data.py and fix_pdf_bills.py — the scripts that fetch
        # and repair raw LegiScan data.
        return list(scripts[2:])
    return None


def run_pipeline(scripts):
    """Run each script in sequence with the current interpreter.

    On a non-zero return code, asks whether to continue with the
    remaining scripts; exits the process with status 1 if the user
    declines.
    """
    for script in scripts:
        print(f"\n--- Running {script} ---")
        print("=" * 50)

        # Paths are relative to the current working directory, matching
        # how the pipeline has always been invoked (from data/).
        result = subprocess.run([sys.executable, script])

        if result.returncode != 0:
            print(f"\n✗ Script {script} failed with return code {result.returncode}")
            print("Do you want to continue with the remaining scripts? (y/n):")
            continue_response = input().strip().lower()
            if continue_response not in ["y", "yes"]:
                print("Stopping pipeline execution.")
                sys.exit(1)
        else:
            print(f"✓ {script} completed successfully")


def main():
    """Prompt for the data-pull choice, run the pipeline, then upload."""
    print("Do you want to pull new data from LegiScan?")
    print("Enter 'y' or 'yes' to pull new data, or 'n' or 'no' to skip and use existing data:")
    response = input().strip().lower()

    scripts_to_run = select_scripts(response)
    if scripts_to_run is None:
        print(f"\n✗ Invalid response '{response}'. Please run the script again and enter 'y' or 'n'.")
        sys.exit(1)

    if response in ["y", "yes"]:
        print("\n✓ Will pull new data from LegiScan")
    else:
        print("\n✓ Skipping data pull, using existing data")

    print(f"\nWill run {len(scripts_to_run)} scripts:")
    for script in scripts_to_run:
        print(f"  - {script}")

    print("\n" + "=" * 50)

    run_pipeline(scripts_to_run)

    print("\n" + "=" * 50)
    print("✓ Pipeline execution completed!")

    print("\nUploading all JSON datasets to HuggingFace…")

    try:
        # Imported lazily so the pipeline can run (and this module can be
        # imported) even if the upload helper or its config is unavailable.
        from huggingface_upload import upload_all_to_huggingface

        upload_all_to_huggingface()
        print("✓ HuggingFace upload complete!")
    except KeyError as e:
        print(f"✗ HuggingFace config error: {e}")
    except Exception as e:
        print(f"✗ Upload failed: {e}")


if __name__ == "__main__":
    main()