legislation-tracker / scoping_agent.py
ramanna's picture
Deploy: newsletter display polish
3cc39aa
#!/usr/bin/env python3
"""
Scoping agent for AI legislation research using LangChain Open Deep Research patterns.
Focus: Phase 1 (Scoping) – User Clarification + Brief Generation.
Inputs: freeform research question.
Outputs: structured JSON object for downstream planning/tooling.
This module avoids web research and is tailored for internal dataset integration later
(`data/known_bills_visualize.json`).
"""
from __future__ import annotations
import os
from typing import List, Optional, Literal, Dict, Any
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
# One concrete sub-question inside a research brief (nested in ResearchScope.sub_questions).
# NOTE(review): documented with `#` comments rather than a class docstring on purpose —
# pydantic is expected to surface a model docstring as the schema "description", which
# would change the format instructions derived from this model; confirm before adding one.
class SubQuestion(BaseModel):
    # No default supplied (`...`), so the field must be present in the model output.
    id: str = Field(..., description="Stable identifier for the sub-question")
    # No default supplied (`...`), so the field must be present in the model output.
    text: str = Field(..., description="Concrete, answerable sub-question")
    # Optional label limited to the eight literals below; None when omitted.
    # Keep this list in sync with the strategy labels enumerated in SYSTEM_PROMPT.
    strategy: Optional[Literal[
        "fact_finding",
        "timeline_analysis",
        "comparative_review",
        "trend_analysis",
        "dataset_query",
        "definition_mapping",
        "stakeholder_mapping",
        "gap_identification",
    ]] = Field(
        default=None,
        description="Suggested research strategy type for this sub-question",
    )
    # Optional free-text guidance; None when omitted.
    notes: Optional[str] = Field(
        default=None,
        description="Brief guidance, constraints, or hints for this sub-question",
    )
# Structured research brief: the model passed to JsonOutputParser in build_scoping_chain
# and the return type of scope_research_question.
# NOTE(review): kept docstring-free on purpose — a class docstring would presumably be
# emitted into the JSON schema used for the prompt's format instructions; confirm first.
class ResearchScope(BaseModel):
    # Presumably the user's question restated after clarification — confirm with prompt owners.
    clarified_question: str
    # Presumably a short statement of what the research covers — confirm with prompt owners.
    scope_statement: str
    # SYSTEM_PROMPT asks for assumptions and explicit exclusions "to bound the work".
    assumptions: List[str]
    exclusions: List[str]
    # SYSTEM_PROMPT asks for key terms "to anchor retrieval".
    key_terms: List[str]
    # Presumably US federal/state jurisdictions in scope; exact format is not fixed here.
    jurisdictions: List[str]
    # Optional free-form string; defaults to None. Format is not constrained by this model.
    time_horizon: Optional[str] = None
    # Nested SubQuestion objects (SYSTEM_PROMPT requests 5–10; the count is not enforced here).
    sub_questions: List[SubQuestion]
    # SYSTEM_PROMPT prefers directions executable as structured dataset queries.
    suggested_directions: List[str]
    # Free-form mapping; defaults to a fresh empty dict per instance via default_factory.
    dataset_alignment: Dict[str, Any] = Field(
        default_factory=dict,
        description="Hints for how to query the internal dataset (fields, filters)",
    )
# System message for the scoping chain (used as the "system" entry in build_scoping_chain).
# It contains no template placeholders; the strategy labels listed here mirror the
# Literal values accepted by SubQuestion.strategy, so keep the two in sync.
SYSTEM_PROMPT = """
You are a senior research methodologist specializing in AI legislation. Your job is to perform
the Scoping phase of a deep research workflow: clarify the user's question and produce a concise,
well-structured research brief tailored for legislative analysis using an internal dataset of
~2,000+ US federal and state AI-related bills.
Guidelines:
- Keep scope precise and operational. Avoid generic platitudes.
- Break the problem into 5–10 concrete sub-questions, each with an optional strategy label
(fact_finding, timeline_analysis, comparative_review, trend_analysis, dataset_query,
definition_mapping, stakeholder_mapping, gap_identification).
- Prefer directions that can be executed via structured queries over the internal dataset later.
- Include key terms to anchor retrieval (bill fields, concepts, definitions, agencies, topics).
- Include assumptions and explicit exclusions to bound the work.
- Include a dataset_alignment section that anticipates filters/fields such as: state, session_year,
status, iapp_categories, last_action_date, sponsors, title, summary.
Output MUST be valid JSON only, matching the provided JSON schema.
"""
# User message template (the "user" entry in build_scoping_chain). Placeholders:
#   {user_question}       - supplied at invoke time by scope_research_question.
#   {format_instructions} - pre-filled via .partial() from the JSON parser's instructions.
USER_TEMPLATE = """
User research question:
{user_question}
Produce a JSON object that matches this pydantic schema:
{format_instructions}
"""
def get_llm(api_key: Optional[str] = None, model: str = "gpt-5-mini", temperature: float = 0.2) -> ChatOpenAI:
    """Create the ChatOpenAI client used by the scoping chain.

    Args:
        api_key: Explicit OpenAI key. When falsy (None or empty), the
            ``OPENAI_API_KEY`` environment variable is consulted instead.
        model: Model name forwarded to ``ChatOpenAI``.
        temperature: Sampling temperature forwarded to ``ChatOpenAI``.

    Returns:
        A ``ChatOpenAI`` instance configured with the resolved key.

    Raises:
        RuntimeError: If neither the argument nor the environment yields a key.
    """
    resolved_key = api_key if api_key else os.environ.get("OPENAI_API_KEY")
    if resolved_key:
        return ChatOpenAI(api_key=resolved_key, model=model, temperature=temperature)
    # Fail before constructing the client so the missing-credential cause is explicit.
    raise RuntimeError("OPENAI_API_KEY not set. Provide via env or argument.")
def build_scoping_chain(llm: Optional[ChatOpenAI] = None):
    """Compose the scoping pipeline: prompt, then chat model, then JSON parser.

    Args:
        llm: Chat model to use. When not provided, one is created with
            ``get_llm()`` (which requires an API key to be available).

    Returns:
        The composed runnable; invoke it with ``{"user_question": ...}``.
    """
    json_parser = JsonOutputParser(pydantic_object=ResearchScope)
    messages = [
        ("system", SYSTEM_PROMPT),
        ("user", USER_TEMPLATE),
    ]
    template = ChatPromptTemplate.from_messages(messages)
    # Pre-fill the schema instructions so callers only supply the user question.
    template = template.partial(format_instructions=json_parser.get_format_instructions())
    chat_model = llm if llm else get_llm()
    return template | chat_model | json_parser
def scope_research_question(user_question: str, *, llm: Optional[ChatOpenAI] = None) -> ResearchScope:
    """Run the scoping chain on a freeform question and return a validated brief.

    Args:
        user_question: The freeform research question to scope.
        llm: Optional chat model forwarded to ``build_scoping_chain``.

    Returns:
        The chain output as a ``ResearchScope`` instance.
    """
    outcome = build_scoping_chain(llm).invoke({"user_question": user_question})
    # The JSON parser may hand back a plain dict, so validate it into the model
    # unless it is already a ResearchScope.
    if not isinstance(outcome, ResearchScope):
        outcome = ResearchScope.model_validate(outcome)
    return outcome
def to_json(scope: ResearchScope) -> str:
    """Serialize a research scope to pretty-printed JSON.

    Args:
        scope: The brief to serialize; its ``model_dump()`` output is encoded.

    Returns:
        A JSON string indented by two spaces with non-ASCII characters kept as-is.
    """
    import json as _json

    payload = scope.model_dump()
    return _json.dumps(payload, ensure_ascii=False, indent=2)
if __name__ == "__main__":
    # Command-line entry point: scope one question and print the brief as JSON.
    import argparse

    parser = argparse.ArgumentParser(description="Run the scoping agent.")
    parser.add_argument("question", type=str, help="Freeform research question")
    parser.add_argument(
        "--api-key",
        dest="api_key",
        type=str,
        default=None,
        help="OpenAI API key (overrides environment variable)",
    )
    parser.add_argument(
        "--model",
        dest="model",
        type=str,
        default="gpt-5-mini",
        help="Model name (default: gpt-5-mini)",
    )
    args = parser.parse_args()

    # Always build the client from the CLI arguments so that --model is honored
    # even when the key comes from the environment rather than --api-key
    # (get_llm falls back to OPENAI_API_KEY when api_key is None).
    try:
        llm = get_llm(api_key=args.api_key, model=args.model)
    except RuntimeError as exc:
        # Missing credentials: report as a usage error instead of a traceback.
        parser.error(str(exc))

    scope = scope_research_question(args.question, llm=llm)
    # Reuse the module's serializer rather than duplicating the json.dumps call.
    print(to_json(scope))