# ConstitutionAgent / data_tools / enrich_data.py
# Uploaded by: Meshyboi
# Commit message: Upload 53 files
# Commit: 0cd3dc5 (verified)
import re
import os
import json
import glob
import time
import random
import sys
# Ensure project root is in sys.path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from groq import Groq
from utils.config import settings
# Build the shared Groq client; without an API key nothing below can work,
# so importing/running this module fails immediately with ValueError.
if settings.GROQ_API_KEY:
    client = Groq(api_key=settings.GROQ_API_KEY)
else:
    raise ValueError("GROQ_API_KEY is not set.")

# Use the configured model, falling back to a default when it is unset/empty.
model = settings.GROQ_MODEL or "llama-3.3-70b-versatile"
def _build_enrichment_prompt(title, original_desc):
    """Return the user prompt asking the LLM to expand one amendment summary."""
    return f"""
You are a Constitutional Legal Expert. Your task is to EXPAND the summary of a Constitutional Amendment to include specific details about KEY ARTICLES that were added, deleted, or modified.
Amendment: {title}
Original Summary: {original_desc}
Your Goal:
1. Identify the major articles mentioned (e.g., Article 19, 31, 368, 42nd Amendment changes).
2. Explicitly state WHAT changed for these articles. Did it delete a Right? Did it add a Duty? Did it change 'internal disturbance' to 'armed rebellion'?
3. Be precise with Article numbers and Clauses (e.g. 19(1)(f)).
4. **Identify Cross-Article Effects**: Does the amendment modify an Article that impacts *another* Article? (e.g. "Article 358 restricts Article 19"). Explicitly state if one article suspends or overrides another.
5. Do NOT hallucinate. Use your knowledge of the Indian Constitution to fill in the semantic details implied by the original summary.
Output ONLY the Enhanced Summary text. Do not add conversational filler.
"""


def _strip_reasoning_blocks(text):
    """Remove <think>...</think> and <thought>...</thought> blocks from *text*."""
    text = text.strip()
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
    return re.sub(r'<thought>.*?</thought>', '', text, flags=re.DOTALL).strip()


def _request_completion(llm_client, llm_model, prompt, file_path, max_attempts=3):
    """Request a chat completion, retrying with linear back-off on rate limits.

    Returns the completion object, or None when the call failed; the reason
    has already been printed in that case.
    """
    for attempt in range(max_attempts):
        try:
            return llm_client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that outputs detailed legal summaries."},
                    {"role": "user", "content": prompt},
                ],
                model=llm_model,
            )
        except Exception as e:
            # Rate limits are recognised from the error text; anything else
            # is treated as non-retryable and the file is skipped.
            if "429" not in str(e) and "rate_limit" not in str(e):
                print(f"Error calling LLM: {e}")
                return None
            # Sleeping after the final attempt would only delay the failure.
            if attempt + 1 < max_attempts:
                wait = (attempt + 1) * 5
                print(f"Rate Limit hit. Waiting {wait}s...")
                time.sleep(wait)
    print(f"Failed to enrich {file_path} after {max_attempts} retries.")
    return None


def enrich_amendment_summary(file_path, *, llm_client=None, llm_model=None):
    """Enrich one amendment summary JSON file in place using the LLM.

    Reads ``metadata.amendment_title`` and ``metadata.amendment_description``
    from the JSON file, asks the LLM for an expanded summary, stores it under
    ``content`` and sets ``metadata.enriched`` to True.

    Args:
        file_path: Path of the summary JSON file to read and rewrite.
        llm_client: Optional client exposing ``chat.completions.create``;
            defaults to the module-level ``client``.
        llm_model: Optional model name; defaults to the module-level ``model``.

    Returns:
        None. Errors are printed rather than raised, and the file is left
        untouched when the LLM call fails or yields an empty summary.
    """
    print(f"Processing {file_path}...")
    try:
        # Explicit UTF-8 so the result does not depend on the platform locale.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        metadata = data["metadata"]
        original_desc = metadata.get("amendment_description", "")
        title = metadata.get("amendment_title", "")
        prompt = _build_enrichment_prompt(title, original_desc)

        chat_completion = _request_completion(
            client if llm_client is None else llm_client,
            model if llm_model is None else llm_model,
            prompt,
            file_path,
        )
        if chat_completion is None:
            return

        # The message content may be None; treat that like an empty reply.
        raw_summary = chat_completion.choices[0].message.content or ""
        enhanced_summary = _strip_reasoning_blocks(raw_summary)
        if not enhanced_summary:
            # Never overwrite existing content with nothing, and do not mark
            # the file as enriched when no summary was produced.
            print(f"LLM returned an empty summary for {file_path}; leaving file unchanged.")
            return

        # Update the JSON
        data["content"] = enhanced_summary
        data["metadata"]["enriched"] = True

        # Serialise before opening for write: open(..., "w") truncates the
        # file, so a serialisation error afterwards would destroy its content.
        serialized = json.dumps(data, indent=2)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(serialized)
        print(f"Enriched {title}")
    except Exception as e:
        # Best-effort batch step: report and let the caller move on.
        print(f"Error processing {file_path}: {e}")
def main():
    """Enrich every amendment summary found under ``extracted_data/``.

    Files matching ``extracted_data/amendment_*/summary.json`` (relative to
    the current working directory) are processed one at a time in sorted
    path order, with a fixed pause after each file to space out requests.
    """
    targets = sorted(glob.glob("extracted_data/amendment_*/summary.json"))
    total = len(targets)
    print(f"Found {total} amendment summaries to enrich.")

    for position, summary_path in enumerate(targets, start=1):
        print(f"[{position}/{total}] Processing {summary_path}")
        enrich_amendment_summary(summary_path)
        # Sequential spacing between requests to stay under the rate limit.
        time.sleep(2)


if __name__ == "__main__":
    main()