#!/usr/bin/env python3
"""
Scoping agent for AI legislation research using LangChain Open Deep Research patterns.

Focus: Phase 1 (Scoping) – User Clarification + Brief Generation.

Inputs: freeform research question.
Outputs: structured JSON object for downstream planning/tooling.

This module avoids web research and is tailored for internal dataset integration later
(`data/known_bills_visualize.json`).
"""

from __future__ import annotations

import os
from typing import List, Optional, Literal, Dict, Any

from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser


# Closed set of research-strategy labels a sub-question may carry. The same
# labels are enumerated for the model in SYSTEM_PROMPT.
_StrategyLabel = Literal[
    "fact_finding",
    "timeline_analysis",
    "comparative_review",
    "trend_analysis",
    "dataset_query",
    "definition_mapping",
    "stakeholder_mapping",
    "gap_identification",
]


# One concrete sub-question inside a research brief.
# NOTE: documented with `#` comments rather than a class docstring on purpose;
# pydantic copies a model's docstring into its JSON schema, and that schema is
# what the output parser turns into prompt format instructions.
class SubQuestion(BaseModel):
    # Required fields.
    id: str = Field(..., description="Stable identifier for the sub-question")
    text: str = Field(..., description="Concrete, answerable sub-question")
    # Optional fields; both fall back to None when the model omits them.
    strategy: Optional[_StrategyLabel] = Field(
        None,
        description="Suggested research strategy type for this sub-question",
    )
    notes: Optional[str] = Field(
        None,
        description="Brief guidance, constraints, or hints for this sub-question",
    )


# Structured research brief produced by the scoping phase.
# NOTE: documented with `#` comments rather than a class docstring on purpose;
# pydantic copies a model's docstring into its JSON schema, and that schema is
# passed to JsonOutputParser to build the prompt's format instructions.
class ResearchScope(BaseModel):
    # All fields without a default are required during validation.
    clarified_question: str
    scope_statement: str
    # Assumptions and explicit exclusions bound the work (see SYSTEM_PROMPT).
    assumptions: List[str]
    exclusions: List[str]
    # Terms meant to anchor retrieval (see SYSTEM_PROMPT).
    key_terms: List[str]
    jurisdictions: List[str]
    # Optional free-form string; no format is enforced here.
    time_horizon: Optional[str] = None
    # SYSTEM_PROMPT asks for 5-10 entries, but no count is enforced here.
    sub_questions: List[SubQuestion]
    suggested_directions: List[str]
    # Free-form mapping; defaults to a fresh empty dict per instance.
    dataset_alignment: Dict[str, Any] = Field(
        default_factory=dict,
        description="Hints for how to query the internal dataset (fields, filters)",
    )


# System message for the scoping chain (used as the "system" entry in
# build_scoping_chain). It contains no template placeholders; the strategy
# labels listed here match the Literal values accepted by SubQuestion.strategy.
SYSTEM_PROMPT = """
You are a senior research methodologist specializing in AI legislation. Your job is to perform
the Scoping phase of a deep research workflow: clarify the user's question and produce a concise,
well-structured research brief tailored for legislative analysis using an internal dataset of
~2,000+ US federal and state AI-related bills.

Guidelines:
- Keep scope precise and operational. Avoid generic platitudes.
- Break the problem into 5–10 concrete sub-questions, each with an optional strategy label
  (fact_finding, timeline_analysis, comparative_review, trend_analysis, dataset_query,
   definition_mapping, stakeholder_mapping, gap_identification).
- Prefer directions that can be executed via structured queries over the internal dataset later.
- Include key terms to anchor retrieval (bill fields, concepts, definitions, agencies, topics).
- Include assumptions and explicit exclusions to bound the work.
- Include a dataset_alignment section that anticipates filters/fields such as: state, session_year,
  status, iapp_categories, last_action_date, sponsors, title, summary.

Output MUST be valid JSON only, matching the provided JSON schema.
"""


# User message template for the scoping chain (used as the "user" entry in
# build_scoping_chain). Placeholders:
#   {user_question}       - supplied by the caller when the chain is invoked.
#   {format_instructions} - pre-filled via .partial() from the output parser.
USER_TEMPLATE = """
User research question:
{user_question}

Produce a JSON object that matches this pydantic schema:
{format_instructions}
"""


def get_llm(api_key: Optional[str] = None, model: str = "gpt-5-mini", temperature: float = 0.2) -> ChatOpenAI:
    """Construct the ChatOpenAI client used by the scoping chain.

    Args:
        api_key: Explicit OpenAI key. When falsy (``None`` or empty), the
            ``OPENAI_API_KEY`` environment variable is used instead.
        model: Model name forwarded to ``ChatOpenAI``.
        temperature: Sampling temperature forwarded to ``ChatOpenAI``.

    Returns:
        A ``ChatOpenAI`` instance configured with the resolved key.

    Raises:
        RuntimeError: If no key is available from the argument or environment.
    """
    resolved_key = api_key if api_key else os.environ.get("OPENAI_API_KEY")
    if resolved_key:
        return ChatOpenAI(api_key=resolved_key, model=model, temperature=temperature)
    raise RuntimeError("OPENAI_API_KEY not set. Provide via env or argument.")


def build_scoping_chain(llm: Optional[ChatOpenAI] = None):
    """Assemble the prompt -> model -> JSON-parser pipeline for scoping.

    Args:
        llm: Chat model to use. When falsy, one is created via ``get_llm()``,
            which raises ``RuntimeError`` if no API key is available.

    Returns:
        A runnable chain that expects a ``user_question`` input.
    """
    output_parser = JsonOutputParser(pydantic_object=ResearchScope)

    messages = [
        ("system", SYSTEM_PROMPT),
        ("user", USER_TEMPLATE),
    ]
    template = ChatPromptTemplate.from_messages(messages)
    # Pre-fill the schema instructions so callers only supply the question.
    template = template.partial(
        format_instructions=output_parser.get_format_instructions()
    )

    chat_model = llm or get_llm()
    return template | chat_model | output_parser


def scope_research_question(user_question: str, *, llm: Optional[ChatOpenAI] = None) -> ResearchScope:
    """Run the scoping chain on a question and return a validated brief.

    Args:
        user_question: Freeform research question inserted into the prompt.
        llm: Optional chat model, passed through to ``build_scoping_chain``.

    Returns:
        The chain output as a ``ResearchScope``. Output that is not already a
        ``ResearchScope`` is passed through ``ResearchScope.model_validate``.
    """
    payload = build_scoping_chain(llm).invoke({"user_question": user_question})
    # The JSON parser may hand back a plain dict, so coerce when needed.
    return (
        payload
        if isinstance(payload, ResearchScope)
        else ResearchScope.model_validate(payload)
    )


def to_json(scope: ResearchScope) -> str:
    """Serialize a research scope to a JSON string.

    The output uses a two-space indent and keeps non-ASCII characters
    unescaped.
    """
    from json import dumps

    payload = scope.model_dump()
    return dumps(payload, ensure_ascii=False, indent=2)


# Command-line entry point: scope one research question and print the brief.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Run the scoping agent.")
    parser.add_argument("question", type=str, help="Freeform research question")
    parser.add_argument(
        "--api-key",
        dest="api_key",
        type=str,
        default=None,
        help="OpenAI API key (overrides environment variable)",
    )
    parser.add_argument(
        "--model",
        dest="model",
        type=str,
        default="gpt-5-mini",
        help="Model name (default: gpt-5-mini)",
    )
    args = parser.parse_args()

    # Always build the LLM from the CLI arguments. Previously the model was
    # only constructed when --api-key was given, so --model was silently
    # ignored whenever the key came from the environment. get_llm() itself
    # falls back to OPENAI_API_KEY when api_key is None.
    llm = get_llm(api_key=args.api_key, model=args.model)

    scope = scope_research_question(args.question, llm=llm)
    # to_json produces the same indent=2, ensure_ascii=False output as before.
    print(to_json(scope))