Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| #!/usr/bin/env python3 | |
| """ | |
| Scoping agent for AI legislation research using LangChain Open Deep Research patterns. | |
| Focus: Phase 1 (Scoping) – User Clarification + Brief Generation. | |
| Inputs: freeform research question. | |
| Outputs: structured JSON object for downstream planning/tooling. | |
| This module performs no web research; it is designed for later integration with the | |
| internal dataset (`data/known_bills_visualize.json`). | |
| """ | |
| from __future__ import annotations | |
| import os | |
| from typing import List, Optional, Literal, Dict, Any | |
| from pydantic import BaseModel, Field | |
| from langchain_openai import ChatOpenAI | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_core.output_parsers import JsonOutputParser | |
# Closed set of research-strategy labels a sub-question may carry. The same
# labels are enumerated for the model in SYSTEM_PROMPT.
_StrategyLabel = Literal[
    "fact_finding",
    "timeline_analysis",
    "comparative_review",
    "trend_analysis",
    "dataset_query",
    "definition_mapping",
    "stakeholder_mapping",
    "gap_identification",
]


class SubQuestion(BaseModel):
    # One concrete sub-question of the research brief.
    #
    # Documented with comments rather than a class docstring on purpose:
    # pydantic copies a model docstring into the generated JSON schema, which
    # would change the format instructions handed to the LLM.

    # Required fields.
    id: str = Field(description="Stable identifier for the sub-question")
    text: str = Field(description="Concrete, answerable sub-question")

    # Optional fields; both default to None when the model omits them.
    strategy: Optional[_StrategyLabel] = Field(
        None,
        description="Suggested research strategy type for this sub-question",
    )
    notes: Optional[str] = Field(
        None,
        description="Brief guidance, constraints, or hints for this sub-question",
    )
class ResearchScope(BaseModel):
    # Structured output of the scoping phase: the clarified question plus the
    # brief that bounds and decomposes it. Field order is significant because
    # it is reflected in the schema shown to the model.
    #
    # Comments are used instead of a class docstring so that the generated
    # JSON schema (and therefore the prompt) stays unchanged.

    # Framing of the question.
    clarified_question: str
    scope_statement: str

    # Boundaries of the work.
    assumptions: List[str]
    exclusions: List[str]

    # Retrieval anchors and coverage.
    key_terms: List[str]
    jurisdictions: List[str]
    time_horizon: Optional[str] = Field(default=None)

    # Decomposition and next steps.
    sub_questions: List[SubQuestion]
    suggested_directions: List[str]

    # Free-form mapping; a fresh dict is created per instance when omitted.
    dataset_alignment: Dict[str, Any] = Field(
        description="Hints for how to query the internal dataset (fields, filters)",
        default_factory=dict,
    )
# Prompt text for the scoping chain. Both constants are runtime strings that
# are sent to the model verbatim, so their wording must not be edited casually.
#
# SYSTEM_PROMPT is used as the "system" message in build_scoping_chain. It
# defines the methodologist persona, lists the brief's guidelines (including
# the eight strategy labels that SubQuestion.strategy accepts) and requires
# JSON-only output.
| SYSTEM_PROMPT = """ | |
| You are a senior research methodologist specializing in AI legislation. Your job is to perform | |
| the Scoping phase of a deep research workflow: clarify the user's question and produce a concise, | |
| well-structured research brief tailored for legislative analysis using an internal dataset of | |
| ~2,000+ US federal and state AI-related bills. | |
| Guidelines: | |
| - Keep scope precise and operational. Avoid generic platitudes. | |
| - Break the problem into 5–10 concrete sub-questions, each with an optional strategy label | |
| (fact_finding, timeline_analysis, comparative_review, trend_analysis, dataset_query, | |
| definition_mapping, stakeholder_mapping, gap_identification). | |
| - Prefer directions that can be executed via structured queries over the internal dataset later. | |
| - Include key terms to anchor retrieval (bill fields, concepts, definitions, agencies, topics). | |
| - Include assumptions and explicit exclusions to bound the work. | |
| - Include a dataset_alignment section that anticipates filters/fields such as: state, session_year, | |
| status, iapp_categories, last_action_date, sponsors, title, summary. | |
| Output MUST be valid JSON only, matching the provided JSON schema. | |
| """ | |
# USER_TEMPLATE is used as the "user" message in build_scoping_chain. It has
# two placeholders: {user_question}, supplied when the chain is invoked, and
# {format_instructions}, pre-filled from the output parser via .partial().
| USER_TEMPLATE = """ | |
| User research question: | |
| {user_question} | |
| Produce a JSON object that matches this pydantic schema: | |
| {format_instructions} | |
| """ | |
def get_llm(api_key: Optional[str] = None, model: str = "gpt-5-mini", temperature: float = 0.2) -> ChatOpenAI:
    """Create the chat model client used by the scoping chain.

    Args:
        api_key: OpenAI API key. When ``None`` or empty, the
            ``OPENAI_API_KEY`` environment variable is consulted instead.
        model: Model name passed through to ``ChatOpenAI``.
        temperature: Sampling temperature passed through to ``ChatOpenAI``.

    Returns:
        A ``ChatOpenAI`` instance configured with the resolved key.

    Raises:
        RuntimeError: If no key is given and the environment variable is
            unset or empty.
    """
    # An explicit argument wins; the environment is only a fallback.
    resolved_key = api_key if api_key else os.environ.get("OPENAI_API_KEY")
    if resolved_key:
        return ChatOpenAI(api_key=resolved_key, model=model, temperature=temperature)
    raise RuntimeError("OPENAI_API_KEY not set. Provide via env or argument.")
def build_scoping_chain(llm: Optional[ChatOpenAI] = None):
    """Assemble the prompt -> model -> JSON parser pipeline for scoping.

    Args:
        llm: Chat model to use. When omitted (or falsy), one is created with
            ``get_llm()`` and its defaults.

    Returns:
        A runnable chain that expects a ``user_question`` input.
    """
    output_parser = JsonOutputParser(pydantic_object=ResearchScope)

    messages = [
        ("system", SYSTEM_PROMPT),
        ("user", USER_TEMPLATE),
    ]
    template = ChatPromptTemplate.from_messages(messages)
    # Bind the schema instructions now so callers only supply the question.
    template = template.partial(
        format_instructions=output_parser.get_format_instructions()
    )

    chat_model = llm if llm else get_llm()
    return template | chat_model | output_parser
def scope_research_question(user_question: str, *, llm: Optional[ChatOpenAI] = None) -> ResearchScope:
    """Run the scoping chain on a freeform question and return a validated brief.

    Args:
        user_question: The freeform research question to scope.
        llm: Optional chat model forwarded to ``build_scoping_chain``.

    Returns:
        The chain output as a ``ResearchScope`` instance.
    """
    raw = build_scoping_chain(llm).invoke({"user_question": user_question})
    # The JSON parser may hand back a plain dict, so validate it into the
    # model unless it is one already.
    return raw if isinstance(raw, ResearchScope) else ResearchScope.model_validate(raw)
def to_json(scope: ResearchScope) -> str:
    """Serialise a research scope to pretty-printed JSON text.

    Args:
        scope: The scope whose ``model_dump()`` output is serialised.

    Returns:
        JSON indented by two spaces, with non-ASCII characters left unescaped.
    """
    from json import dumps

    payload = scope.model_dump()
    return dumps(payload, indent=2, ensure_ascii=False)
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: scope one research question and print JSON.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        RuntimeError: Propagated from ``get_llm`` when no API key is available
            from ``--api-key`` or the ``OPENAI_API_KEY`` environment variable.
        SystemExit: Raised by ``argparse`` on invalid or missing arguments.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Run the scoping agent.")
    parser.add_argument("question", type=str, help="Freeform research question")
    parser.add_argument(
        "--api-key",
        dest="api_key",
        type=str,
        default=None,
        help="OpenAI API key (overrides environment variable)",
    )
    parser.add_argument(
        "--model",
        dest="model",
        type=str,
        default="gpt-5-mini",
        help="Model name (default: gpt-5-mini)",
    )
    args = parser.parse_args(argv)

    # Always build the client from the parsed options. Previously the client
    # was only built when --api-key was passed, so --model was silently
    # ignored whenever the key came from the environment. get_llm() itself
    # falls back to OPENAI_API_KEY when api_key is None.
    llm = get_llm(api_key=args.api_key, model=args.model)

    scope = scope_research_question(args.question, llm=llm)
    # Reuse the module's serialiser instead of duplicating json.dumps here.
    print(to_json(scope))


if __name__ == "__main__":
    main()