#!/usr/bin/env python3 """ Scoping agent for AI legislation research using LangChain Open Deep Research patterns. Focus: Phase 1 (Scoping) – User Clarification + Brief Generation. Inputs: freeform research question. Outputs: structured JSON object for downstream planning/tooling. This module avoids web research and is tailored for internal dataset integration later (`data/known_bills_visualize.json`). """ from __future__ import annotations import os from typing import List, Optional, Literal, Dict, Any from pydantic import BaseModel, Field from langchain_openai import ChatOpenAI from langchain_core.prompts import ChatPromptTemplate from langchain_core.output_parsers import JsonOutputParser class SubQuestion(BaseModel): id: str = Field(..., description="Stable identifier for the sub-question") text: str = Field(..., description="Concrete, answerable sub-question") strategy: Optional[Literal[ "fact_finding", "timeline_analysis", "comparative_review", "trend_analysis", "dataset_query", "definition_mapping", "stakeholder_mapping", "gap_identification", ]] = Field( default=None, description="Suggested research strategy type for this sub-question", ) notes: Optional[str] = Field( default=None, description="Brief guidance, constraints, or hints for this sub-question", ) class ResearchScope(BaseModel): clarified_question: str scope_statement: str assumptions: List[str] exclusions: List[str] key_terms: List[str] jurisdictions: List[str] time_horizon: Optional[str] = None sub_questions: List[SubQuestion] suggested_directions: List[str] dataset_alignment: Dict[str, Any] = Field( default_factory=dict, description="Hints for how to query the internal dataset (fields, filters)", ) SYSTEM_PROMPT = """ You are a senior research methodologist specializing in AI legislation. Your job is to perform the Scoping phase of a deep research workflow: clarify the user's question and produce a concise, well-structured research brief tailored for legislative analysis using an internal dataset of ~2,000+ US federal and state AI-related bills. Guidelines: - Keep scope precise and operational. Avoid generic platitudes. - Break the problem into 5–10 concrete sub-questions, each with an optional strategy label (fact_finding, timeline_analysis, comparative_review, trend_analysis, dataset_query, definition_mapping, stakeholder_mapping, gap_identification). - Prefer directions that can be executed via structured queries over the internal dataset later. - Include key terms to anchor retrieval (bill fields, concepts, definitions, agencies, topics). - Include assumptions and explicit exclusions to bound the work. - Include a dataset_alignment section that anticipates filters/fields such as: state, session_year, status, iapp_categories, last_action_date, sponsors, title, summary. Output MUST be valid JSON only, matching the provided JSON schema. """ USER_TEMPLATE = """ User research question: {user_question} Produce a JSON object that matches this pydantic schema: {format_instructions} """ def get_llm(api_key: Optional[str] = None, model: str = "gpt-5-mini", temperature: float = 0.2) -> ChatOpenAI: key = api_key or os.environ.get("OPENAI_API_KEY") if not key: raise RuntimeError("OPENAI_API_KEY not set. Provide via env or argument.") return ChatOpenAI(api_key=key, model=model, temperature=temperature) def build_scoping_chain(llm: Optional[ChatOpenAI] = None): parser = JsonOutputParser(pydantic_object=ResearchScope) prompt = ChatPromptTemplate.from_messages([ ("system", SYSTEM_PROMPT), ("user", USER_TEMPLATE), ]).partial(format_instructions=parser.get_format_instructions()) model = llm or get_llm() chain = prompt | model | parser return chain def scope_research_question(user_question: str, *, llm: Optional[ChatOpenAI] = None) -> ResearchScope: chain = build_scoping_chain(llm) result = chain.invoke({"user_question": user_question}) # JsonOutputParser may return a plain dict; coerce to ResearchScope if isinstance(result, ResearchScope): return result return ResearchScope.model_validate(result) def to_json(scope: ResearchScope) -> str: import json as _json return _json.dumps(scope.model_dump(), indent=2, ensure_ascii=False) if __name__ == "__main__": import argparse import json as _json parser = argparse.ArgumentParser(description="Run the scoping agent.") parser.add_argument("question", type=str, help="Freeform research question") parser.add_argument( "--api-key", dest="api_key", type=str, default=None, help="OpenAI API key (overrides environment variable)", ) parser.add_argument( "--model", dest="model", type=str, default="gpt-5-mini", help="Model name (default: gpt-5-mini)", ) args = parser.parse_args() llm = None if args.api_key is not None: llm = get_llm(api_key=args.api_key, model=args.model) scope = scope_research_question(args.question, llm=llm) import json as _json print(_json.dumps(scope.model_dump(), indent=2, ensure_ascii=False))