umangchaudhry commited on
Commit
6241bf7
·
verified ·
1 Parent(s): ff476de

Upload 6 files

Browse files
app.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio interface for Survey Analysis Agent
3
+ Host on Hugging Face Spaces
4
+ """
5
+
6
+ import os
7
+ import gradio as gr
8
+ from survey_agent import SurveyAnalysisAgent
9
+ import uuid
10
+ from datetime import datetime
11
+
12
+ # Initialize agent (will be done once at startup)
13
+ agent = None
14
+ initialization_error = None
15
+
16
def initialize_agent():
    """Set up the global SurveyAnalysisAgent from environment secrets.

    Reads OPENAI_API_KEY and PINECONE_API_KEY from the environment and
    checks that the local vector-store directory exists. On any failure
    the module-level ``initialization_error`` is set to a user-facing
    message and False is returned; True is returned on success.
    """
    global agent, initialization_error

    try:
        openai_api_key = os.getenv("OPENAI_API_KEY")
        pinecone_api_key = os.getenv("PINECONE_API_KEY")

        # Fail fast, one specific message per missing prerequisite.
        if not openai_api_key:
            initialization_error = "❌ OPENAI_API_KEY not found. Please set it in Space Settings → Repository Secrets."
            return False

        if not pinecone_api_key:
            initialization_error = "❌ PINECONE_API_KEY not found. Please set it in Space Settings → Repository Secrets."
            return False

        # The questionnaire vector store must have been uploaded with the Space.
        if not os.path.exists("./questionnaire_vectorstores"):
            initialization_error = "❌ Vector store directory not found. Please upload the questionnaire_vectorstores folder."
            return False

        agent = SurveyAnalysisAgent(
            openai_api_key=openai_api_key,
            pinecone_api_key=pinecone_api_key,
            verbose=False,  # Set to False for cleaner UI
        )
        return True

    except Exception as e:
        initialization_error = f"❌ Initialization error: {str(e)}"
        return False
48
+
49
+
50
def chat(message, history, session_id):
    """Produce the assistant's reply for one chat turn.

    Args:
        message: User's message text.
        history: Chat history (unused here; kept for the Gradio signature).
        session_id: Unique session identifier, used as the agent's
            conversation-memory thread id.

    Returns:
        The answer string, or a user-facing error/status message.
    """
    # Surface startup problems before anything else.
    if initialization_error:
        return initialization_error

    if not agent:
        return "⚠️ Agent not initialized. Please refresh the page."

    if not message.strip():
        return "Please enter a question."

    try:
        # session_id doubles as the agent's thread_id for conversation memory.
        return agent.query(message, thread_id=session_id)
    except Exception as e:
        print(f"Error details: {e}")  # Log to console
        return f"❌ Error processing query: {str(e)}"
77
+
78
+
79
def create_new_session():
    """Return a fresh random session identifier (UUID4 as a string)."""
    return f"{uuid.uuid4()}"
82
+
83
+
84
def get_available_surveys():
    """Render the agent's survey catalog as a Markdown summary string."""
    if initialization_error or not agent:
        return "Agent not initialized"

    try:
        surveys = agent.questionnaire_rag.get_available_survey_names()
        polls = agent.questionnaire_rag.get_available_polls()

        # Assemble the Markdown in pieces and join once at the end.
        parts = [
            "## Available Surveys\n\n",
            f"**Survey Names:** {', '.join(surveys)}\n\n",
            "## Available Polls\n\n",
        ]
        parts.extend(
            f"- **{poll['poll_date']}** ({poll['month']} {poll['year']}): {poll['survey_name']} - {poll['num_questions']} questions\n"
            for poll in polls
        )
        return "".join(parts)
    except Exception as e:
        return f"Error retrieving survey info: {str(e)}"
103
+
104
+
105
# Initialize agent at startup (runs once at module import, before the UI builds,
# so the interface can reflect success/failure immediately).
print("🚀 Initializing Survey Analysis Agent...")
init_success = initialize_agent()

if init_success:
    print("✅ Agent initialized successfully!")
else:
    print(f"⚠️ Agent initialization failed: {initialization_error}")


# Create Gradio interface
with gr.Blocks(title="Survey Analysis Agent", theme=gr.themes.Soft()) as demo:

    # Header
    gr.Markdown("""
# 📊 Survey Analysis Agent

Ask questions about survey data using natural language. The agent can:
- Find questions from specific surveys and time periods
- Compare questions across different time periods
- Analyze question topics and themes
- Show sampling logic and question flow

**Note:** Currently only questionnaire data is available (questions, topics, response options, skip logic).
""")

    # Show initialization status (only rendered when startup failed)
    if initialization_error:
        gr.Markdown(f"## ⚠️ Setup Required\n\n{initialization_error}")

    # Session state: one UUID per browser session, used as the agent's memory thread id.
    session_id_state = gr.State(value=create_new_session())

    # Main chat interface
    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="Conversation",
                height=500,
                show_label=True,
                type="messages"  # openai-style {"role", "content"} dicts
            )

            with gr.Row():
                msg = gr.Textbox(
                    label="Your question",
                    placeholder="e.g., What questions were asked in the June 2025 Unity Poll?",
                    show_label=False,
                    scale=4
                )
                submit = gr.Button("Send", scale=1, variant="primary")

            with gr.Row():
                clear = gr.Button("🔄 New Conversation", scale=1)

            # Example questions (clicking one fills the textbox)
            gr.Examples(
                examples=[
                    "What questions were asked in June 2025?",
                    "Show me all healthcare-related questions",
                    "What questions were asked in the Unity Poll?",
                    "Compare immigration questions from different surveys",
                ],
                inputs=msg,
                label="Example Questions"
            )

        # Sidebar with info
        with gr.Column(scale=1):
            gr.Markdown("## 📋 Available Data")
            survey_info = gr.Markdown(
                value=get_available_surveys() if init_success else "Agent not initialized",
                label="Surveys"
            )

            refresh_info = gr.Button("🔄 Refresh Survey List", size="sm")

            gr.Markdown("""
## 💡 Tips

- Be specific about time periods (e.g., "June 2025")
- Mention survey names when relevant
- Follow up with clarifications if needed
- The agent maintains conversation context

## 🔧 Current Capabilities

✅ **Available:**
- Question text and response options
- Topics and themes
- Skip logic and sampling
- Question sequencing

⏳ **Coming Soon:**
- Response frequencies (toplines)
- Cross-tabulations
- Statistical analysis
""")

    # Event handlers
    def respond(message, chat_history, session_id):
        """Append the user turn and the bot turn, then clear the textbox.

        Returns (updated history, "") — the empty string resets `msg`.
        """
        if not message.strip():
            return chat_history, ""

        # Add user message
        chat_history.append({"role": "user", "content": message})

        # Get bot response
        bot_message = chat(message, chat_history, session_id)

        # Add bot message
        chat_history.append({"role": "assistant", "content": bot_message})

        return chat_history, ""

    def clear_chat():
        """Empty the chat window and rotate to a fresh session/memory thread."""
        new_session = create_new_session()
        return [], new_session

    # Wire up events: Enter key and Send button share the same handler.
    msg.submit(respond, [msg, chatbot, session_id_state], [chatbot, msg])
    submit.click(respond, [msg, chatbot, session_id_state], [chatbot, msg])
    clear.click(clear_chat, None, [chatbot, session_id_state])
    refresh_info.click(get_available_surveys, None, survey_info)

    # Footer
    gr.Markdown("""
---
**Note:** This system uses conversation memory. You can ask follow-up questions like:
1. "What questions were asked?"
2. "June 2025, Unity Poll" (it will understand the context)
""")


# Launch the app (0.0.0.0:7860 is the standard Hugging Face Spaces binding)
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
questionnaire_rag.py ADDED
@@ -0,0 +1,592 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Questionnaire RAG with better filtering and anti-hallucination measures.
3
+
4
+ Key improvements:
5
+ 1. Correct Pinecone filter syntax
6
+ 2. Post-retrieval validation of filters
7
+ 3. Stronger anti-hallucination prompts
8
+ 4. Explicit checks for data existence
9
+ 5. Fuzzy survey name matching
10
+ """
11
+
12
+ import os
13
+ import json
14
+ from typing import List, Dict, Any, Optional
15
+ from pathlib import Path
16
+
17
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
18
+ from langchain_pinecone import PineconeVectorStore
19
+ from pinecone import Pinecone
20
+ from langchain.prompts import ChatPromptTemplate
21
+ from langchain.schema.output_parser import StrOutputParser
22
+
23
+ try:
24
+ from dotenv import load_dotenv
25
+ load_dotenv()
26
+ except ImportError:
27
+ pass
28
+
29
+
30
+ class QuestionnaireRAG:
31
+ """
32
+ Improved questionnaire RAG with:
33
+ - Better Pinecone filtering
34
+ - Post-retrieval validation
35
+ - Anti-hallucination measures
36
+ - Fuzzy survey name matching
37
+ """
38
+
39
    def __init__(
        self,
        openai_api_key: str,
        pinecone_api_key: str,
        persist_directory: str = "./questionnaire_vectorstores",
        verbose: bool = False
    ):
        """Connect to OpenAI and Pinecone and load the local question catalog.

        Args:
            openai_api_key: OpenAI key (embeddings + chat model).
            pinecone_api_key: Pinecone key for the questionnaire index.
            persist_directory: Folder containing poll_catalog.json and
                questions_index.json, produced by
                create_questionnaire_vectorstores.py.
            verbose: When True, print retrieval/filter diagnostics.

        Raises:
            ValueError: If ``persist_directory`` does not exist.
        """
        self.openai_api_key = openai_api_key
        self.pinecone_api_key = pinecone_api_key
        self.persist_directory = persist_directory
        self.verbose = verbose

        # Initialize embeddings (model overridable via OPENAI_EMBED_MODEL)
        self.embeddings = OpenAIEmbeddings(
            model=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
        )

        # Initialize LLM; temperature 0 for deterministic, extractive answers
        chat_model = os.getenv("OPENAI_MODEL", "gpt-4o")
        self.llm = ChatOpenAI(model=chat_model, temperature=0)

        # Load vector store
        if not os.path.exists(persist_directory):
            raise ValueError(
                f"Vector store not found at {persist_directory}\n"
                "Run create_questionnaire_vectorstores.py first"
            )

        # Connect to Pinecone (index/namespace overridable via env vars;
        # an empty PINECONE_NAMESPACE string is normalized to None)
        index_name = os.getenv("PINECONE_INDEX_NAME", "poll-questionnaire-index")
        namespace = os.getenv("PINECONE_NAMESPACE") or None

        pc = Pinecone(api_key=self.pinecone_api_key)
        self.index = pc.Index(index_name)
        self.vectorstore = PineconeVectorStore(
            index=self.index,
            embedding=self.embeddings,
            namespace=namespace
        )

        # Load catalog and questions from local JSON sidecar files
        self.poll_catalog = self._load_catalog()
        self.questions_by_id = self._load_questions_index()

        if self.verbose:
            print(f"✓ Loaded {len(self.questions_by_id)} questions from {len(self.poll_catalog)} polls")
85
+
86
+ def _load_catalog(self) -> Dict[str, Dict]:
87
+ """Load poll catalog"""
88
+ catalog_path = Path(self.persist_directory) / "poll_catalog.json"
89
+ if catalog_path.exists():
90
+ with open(catalog_path, 'r') as f:
91
+ return json.load(f)
92
+ return {}
93
+
94
+ def _load_questions_index(self) -> Dict[str, Dict]:
95
+ """Load questions index"""
96
+ questions_path = Path(self.persist_directory) / "questions_index.json"
97
+ if questions_path.exists():
98
+ with open(questions_path, 'r') as f:
99
+ return json.load(f)
100
+ return {}
101
+
102
+ def get_available_survey_names(self) -> List[str]:
103
+ """Get list of unique survey names from the catalog"""
104
+ survey_names = set()
105
+ for info in self.poll_catalog.values():
106
+ survey_names.add(info["survey_name"])
107
+ return sorted(survey_names)
108
+
109
+ def _fuzzy_match_survey_name(self, requested_name: str) -> Optional[str]:
110
+ """
111
+ Fuzzy match a requested survey name to an actual stored name.
112
+
113
+ Examples:
114
+ - "Unity Poll" → "Vanderbilt_Unity_Poll"
115
+ - "unity poll" → "Vanderbilt_Unity_Poll"
116
+ - "Vanderbilt Unity" → "Vanderbilt_Unity_Poll"
117
+ """
118
+ # Get all unique survey names
119
+ available_names = self.get_available_survey_names()
120
+
121
+ # Normalize the requested name
122
+ normalized_requested = requested_name.lower().replace("_", " ").replace("-", " ")
123
+
124
+ # Try exact match first (case-insensitive)
125
+ for stored_name in available_names:
126
+ normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
127
+ if normalized_requested == normalized_stored:
128
+ return stored_name
129
+
130
+ # Try substring matching - check if requested is in stored
131
+ for stored_name in available_names:
132
+ normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
133
+ if normalized_requested in normalized_stored:
134
+ return stored_name
135
+
136
+ # Try reverse - check if stored is in requested
137
+ for stored_name in available_names:
138
+ normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
139
+ if normalized_stored in normalized_requested:
140
+ return stored_name
141
+
142
+ # Try word-level matching - if all words from requested are in stored
143
+ requested_words = set(normalized_requested.split())
144
+ for stored_name in available_names:
145
+ normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
146
+ stored_words = set(normalized_stored.split())
147
+
148
+ # Check if requested words are a subset of stored words
149
+ if requested_words.issubset(stored_words):
150
+ return stored_name
151
+
152
+ return None
153
+
154
+ def _build_pinecone_filter(self, filters: Dict[str, Any]) -> Optional[Dict[str, Any]]:
155
+ """
156
+ Build proper Pinecone metadata filter with fuzzy survey name matching.
157
+
158
+ Pinecone filter syntax:
159
+ - Simple: {"year": 2025}
160
+ - Multiple: {"$and": [{"year": 2025}, {"month": "February"}]}
161
+ """
162
+ if not filters:
163
+ return None
164
+
165
+ filter_conditions = []
166
+
167
+ # Handle year filter
168
+ if "year" in filters:
169
+ year = filters["year"]
170
+ if isinstance(year, str):
171
+ year = int(year)
172
+ filter_conditions.append({"year": {"$eq": year}})
173
+
174
+ # Handle month filter
175
+ if "month" in filters:
176
+ month = filters["month"]
177
+ # Ensure proper capitalization
178
+ if isinstance(month, str):
179
+ month = month.capitalize()
180
+ filter_conditions.append({"month": {"$eq": month}})
181
+
182
+ # Handle poll_date filter (exact match)
183
+ if "poll_date" in filters:
184
+ filter_conditions.append({"poll_date": {"$eq": filters["poll_date"]}})
185
+
186
+ # Handle survey_name filter with fuzzy matching
187
+ if "survey_name" in filters:
188
+ requested_name = filters["survey_name"]
189
+
190
+ # Try to fuzzy match the survey name
191
+ matched_name = self._fuzzy_match_survey_name(requested_name)
192
+
193
+ if matched_name:
194
+ if self.verbose and matched_name != requested_name:
195
+ print(f"🔄 Mapped survey name '{requested_name}' → '{matched_name}'")
196
+ filter_conditions.append({"survey_name": {"$eq": matched_name}})
197
+ else:
198
+ if self.verbose:
199
+ print(f"⚠️ Survey name '{requested_name}' not found in catalog")
200
+ print(f" Available: {self.get_available_survey_names()}")
201
+ # Don't add the filter if we can't match it - let other filters work
202
+
203
+ # Handle topics (if a topic is in the comma-separated list)
204
+ if "topic" in filters:
205
+ # This is trickier with comma-separated strings in metadata
206
+ # For now, we'll do post-filtering
207
+ pass
208
+
209
+ # Combine filters
210
+ if len(filter_conditions) == 0:
211
+ return None
212
+ elif len(filter_conditions) == 1:
213
+ return filter_conditions[0]
214
+ else:
215
+ return {"$and": filter_conditions}
216
+
217
+ def _validate_results(
218
+ self,
219
+ docs: List[Any],
220
+ filters: Dict[str, Any]
221
+ ) -> List[Any]:
222
+ """
223
+ Validate that retrieved documents actually match the filters.
224
+
225
+ This catches cases where:
226
+ 1. Pinecone filtering didn't work correctly
227
+ 2. We need to do additional filtering (like topic matching)
228
+ """
229
+ if not filters:
230
+ return docs
231
+
232
+ validated_docs = []
233
+
234
+ for doc in docs:
235
+ metadata = doc.metadata
236
+ valid = True
237
+
238
+ # Check year
239
+ if "year" in filters:
240
+ expected_year = int(filters["year"]) if isinstance(filters["year"], str) else filters["year"]
241
+ if metadata.get("year") != expected_year:
242
+ if self.verbose:
243
+ print(f"⚠️ Filtered out: wrong year {metadata.get('year')} != {expected_year}")
244
+ valid = False
245
+
246
+ # Check month
247
+ if "month" in filters and valid:
248
+ expected_month = filters["month"].capitalize() if isinstance(filters["month"], str) else filters["month"]
249
+ if metadata.get("month") != expected_month:
250
+ if self.verbose:
251
+ print(f"⚠️ Filtered out: wrong month {metadata.get('month')} != {expected_month}")
252
+ valid = False
253
+
254
+ # Check poll_date
255
+ if "poll_date" in filters and valid:
256
+ if metadata.get("poll_date") != filters["poll_date"]:
257
+ if self.verbose:
258
+ print(f"⚠️ Filtered out: wrong poll_date {metadata.get('poll_date')} != {filters['poll_date']}")
259
+ valid = False
260
+
261
+ # Check survey_name (with fuzzy matching)
262
+ if "survey_name" in filters and valid:
263
+ requested_name = filters["survey_name"]
264
+ matched_name = self._fuzzy_match_survey_name(requested_name)
265
+ if matched_name and metadata.get("survey_name") != matched_name:
266
+ if self.verbose:
267
+ print(f"⚠️ Filtered out: wrong survey {metadata.get('survey_name')} != {matched_name}")
268
+ valid = False
269
+
270
+ if valid:
271
+ validated_docs.append(doc)
272
+
273
+ return validated_docs
274
+
275
    def _get_prompt(self) -> ChatPromptTemplate:
        """Build the anti-hallucination chat prompt.

        Template variables: {catalog} (poll summary), {context} (retrieved
        questions), {question} (user query). Note the user's question is
        embedded in the *system* message; the human turn is only the
        literal cue "Answer:".
        """
        return ChatPromptTemplate.from_messages([
            ("system", """You are an expert assistant for analyzing poll questionnaires.

🚨 CRITICAL RULES - NEVER VIOLATE THESE:

1. **ONLY use information from the provided context**
- Do NOT make up questions, polls, or dates
- Do NOT assume a poll exists if it's not in the context
- If information is missing, say "I don't have data for [X]" rather than making it up

2. **Verify data exists before listing it**
- Before mentioning any poll, check it's actually in the context
- Before listing questions, confirm they exist in the retrieved data
- If asked about multiple time periods, explicitly state which ones have data and which don't

3. **Be explicit about what's NOT in the data**
- If asked about "2024 and 2025" but only 2025 data exists, say: "I have data for 2025, but there is no 2024 data in the retrieved results"
- Never silently skip missing data - always acknowledge it

4. **When listing questions:**
- List ALL questions from the context in order
- Include full question text and response options
- Note sampling inline in clear language:
* "Asked to all respondents" (not "ASK ALL")
* "Asked to half the sample" (not "HALFSAMP1=1")
* "Asked only if [condition]" (not technical codes)
- If sibling variants exist, note "One of two versions shown to different groups"
- Always cite which poll(s) you're using

5. **Format for scannability:**
- Use numbered lists for questions
- Bold question text
- Include response options as bullet points
- Put sampling info in parentheses after question

Available polls in the system (for reference):
{catalog}

Context (ONLY source of truth):
{context}

Question: {question}
"""),
            ("human", "Answer:")
        ])
322
+
323
+ def query(self, question: str, filters: Optional[Dict[str, Any]] = None, k: int = 20) -> str:
324
+ """
325
+ Query the questionnaire system.
326
+
327
+ Args:
328
+ question: Natural language question
329
+ filters: Optional filters (year, month, poll_date, survey_name)
330
+ k: Number of results to retrieve
331
+
332
+ Returns:
333
+ Answer string
334
+ """
335
+ result = self._query_internal(question, filters, k)
336
+ return result['answer']
337
+
338
+ def query_with_metadata(
339
+ self,
340
+ question: str,
341
+ filters: Optional[Dict[str, Any]] = None,
342
+ k: int = 20
343
+ ) -> Dict[str, Any]:
344
+ """
345
+ Query with full metadata about retrieval.
346
+
347
+ Returns:
348
+ Dict with 'answer', 'source_questions', 'num_sources', 'filters_applied'
349
+ """
350
+ return self._query_internal(question, filters, k)
351
+
352
    def _query_internal(
        self,
        question: str,
        filters: Optional[Dict[str, Any]] = None,
        k: int = 20
    ) -> Dict[str, Any]:
        """Retrieve, validate, and synthesize an answer (shared by query()
        and query_with_metadata()).

        Pipeline: build Pinecone filter → vector retrieval → client-side
        validation → rebuild full question records → format context →
        one LLM call. Returns a dict with 'answer', 'source_questions',
        'num_sources', 'filters_applied'.
        """

        if self.verbose:
            print(f"\n📊 Query: {question}")
            if filters:
                print(f"🔍 Filters: {filters}")

        # Build Pinecone filter (None when nothing usable)
        pinecone_filter = self._build_pinecone_filter(filters or {})

        # Retrieve documents
        if pinecone_filter:
            if self.verbose:
                print(f"🔧 Pinecone filter: {pinecone_filter}")
            retriever = self.vectorstore.as_retriever(
                search_kwargs={"k": k, "filter": pinecone_filter}
            )
        else:
            retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})

        docs = retriever.invoke(question)

        if self.verbose:
            print(f"📥 Retrieved {len(docs)} documents from Pinecone")

        # Validate results match filters (client-side safety net)
        if filters:
            docs = self._validate_results(docs, filters)
            if self.verbose:
                print(f"✅ After validation: {len(docs)} documents")

        # Check if we have any results — answer without an LLM call if not
        if not docs:
            no_data_msg = f"No questionnaire data found"
            if filters:
                # NOTE(review): the comprehension's `k` shadows the `k`
                # parameter; harmless (k is no longer used) but worth renaming.
                filter_desc = ", ".join([f"{k}={v}" for k, v in filters.items()])
                no_data_msg += f" matching filters: {filter_desc}"

            return {
                "answer": no_data_msg,
                "source_questions": [],
                "num_sources": 0,
                "filters_applied": filters or {}
            }

        # Reconstruct full questions from the local index, deduplicating
        # chunks that share a question_id
        full_questions = []
        seen_ids = set()

        for doc in docs:
            q_id = doc.metadata.get('question_id')
            if q_id and q_id not in seen_ids:
                if q_id in self.questions_by_id:
                    full_questions.append(self.questions_by_id[q_id])
                    seen_ids.add(q_id)

        # Sort by position to maintain survey order
        full_questions.sort(key=lambda q: (q.get('poll_date', ''), q.get('position', 0)))

        # Format context with explicit data availability info
        context = self._format_context(full_questions, filters)

        # Get prompt
        prompt = self._get_prompt()

        # Create chain — the lambdas ignore the chain input and close over
        # the precomputed context/question/catalog values
        chain = (
            {
                "context": lambda x: context,
                "question": lambda x: question,
                "catalog": lambda x: self._get_catalog_summary()
            }
            | prompt
            | self.llm
            | StrOutputParser()
        )

        # Get answer
        answer = chain.invoke(question)

        return {
            'answer': answer,
            'source_questions': full_questions,
            'num_sources': len(full_questions),
            'filters_applied': filters or {}
        }
444
+
445
    def _format_context(
        self,
        questions: List[Dict],
        filters: Optional[Dict[str, Any]] = None
    ) -> str:
        """Format questions as LLM context with explicit data-availability
        banners (so the model can say what is and is not present).

        Args:
            questions: Full question records (from questions_index.json).
            filters: The filters that produced them, echoed in the header.

        Returns:
            A single string: availability header, separator, then one
            formatted section per question.
        """

        # No data: return an instruction, not an empty string, so the LLM
        # explicitly tells the user nothing matched.
        if not questions:
            filter_desc = ""
            if filters:
                filter_desc = f" matching {filters}"
            return f"⚠️ NO DATA RETRIEVED{filter_desc}\n\nYou must inform the user that no data exists for their query."

        context_parts = []

        # Add explicit note about what data we have
        polls_found = sorted(set(q['poll_date'] for q in questions))
        context_parts.append(f"✅ DATA AVAILABLE FOR: {', '.join(polls_found)}")

        # Add note about what was requested vs what was found
        if filters:
            if 'year' in filters and 'month' in filters:
                requested = f"{filters['month']} {filters['year']}"
                context_parts.append(f"🔍 REQUESTED: {requested}")

        context_parts.append("")  # Blank line
        context_parts.append("=" * 80)
        context_parts.append("")

        # Format each question
        for i, q in enumerate(questions, 1):
            part = f"""
--- Question {i} from {q['survey_name']} ({q['poll_date']}) ---
Variable: {q['variable_name']}
Question: {q['question_text']}
Response Options: {' | '.join(q['response_options'])}
Topics: {', '.join(q['topics'])}
Question Type: {q['question_type']}
Administration: {q['ask_condition']}
"""

            # Add skip logic/sampling
            if q.get('skip_logic'):
                part += f"Skip Logic: {q['skip_logic']}\n"

            if q.get('half_sample_group'):
                part += f"Half Sample Group: {q['half_sample_group']}\n"

            # Add sibling variants (alternate wordings shown to other groups)
            if q.get('sibling_variants'):
                part += f"\nAlternate Versions (shown to different groups):\n"
                for sib in q['sibling_variants']:
                    sib_group = sib.get('half_sample_group', 'other group')
                    part += f" - [{sib_group}] {sib['question_text']}\n"

            # Add sequence context (neighboring questions in the survey)
            if q.get('previous_question'):
                prev_vars = q.get('previous_question_variants', [])
                if len(prev_vars) > 1:
                    part += "\nPrevious Question (respondents saw one of these):\n"
                    for pv in prev_vars:
                        part += f" - {pv['question_text']}\n"
                else:
                    part += f"\nPrevious Question: {q['previous_question']['question_text']}\n"

            if q.get('next_question'):
                next_vars = q.get('next_question_variants', [])
                if len(next_vars) > 1:
                    part += "\nNext Question (respondents saw one of these):\n"
                    for nv in next_vars:
                        part += f" - {nv['question_text']}\n"
                else:
                    part += f"\nNext Question: {q['next_question']['question_text']}\n"

            context_parts.append(part.strip())

        return "\n\n".join(context_parts)
522
+
523
+ def _get_catalog_summary(self) -> str:
524
+ """Get summary of available polls"""
525
+ lines = ["Available polls:"]
526
+ for poll_date in sorted(self.poll_catalog.keys()):
527
+ info = self.poll_catalog[poll_date]
528
+ month_str = f" ({info['month']})" if info.get('month') else ""
529
+ lines.append(f"- {poll_date}{month_str}: {info['num_questions']} questions")
530
+ return "\n".join(lines)
531
+
532
+ def get_available_polls(self) -> List[Dict[str, Any]]:
533
+ """Get list of all available polls"""
534
+ return [
535
+ {
536
+ "poll_date": poll_date,
537
+ "survey_name": info["survey_name"],
538
+ "year": info["year"],
539
+ "month": info.get("month", ""),
540
+ "num_questions": info["num_questions"]
541
+ }
542
+ for poll_date, info in sorted(self.poll_catalog.items())
543
+ ]
544
+
545
+
546
def main():
    """Smoke-test CLI: exercises fuzzy matching and one end-to-end query.

    Requires OPENAI_API_KEY and PINECONE_API_KEY in the environment and a
    populated ./questionnaire_vectorstores directory; exits with status 1
    when keys are missing.
    """
    import sys

    openai_api_key = os.getenv("OPENAI_API_KEY")
    pinecone_api_key = os.getenv("PINECONE_API_KEY")

    if not openai_api_key or not pinecone_api_key:
        print("Error: Missing API keys")
        sys.exit(1)

    rag = QuestionnaireRAG(
        openai_api_key=openai_api_key,
        pinecone_api_key=pinecone_api_key,
        verbose=True
    )

    print("\n" + "="*80)
    print("QUESTIONNAIRE RAG - TEST MODE")
    print("="*80)

    # Test fuzzy matching against several spellings of the same survey
    print("\n🧪 TEST: Fuzzy survey name matching")
    test_names = ["Unity Poll", "unity poll", "Vanderbilt Unity", "UNITY"]
    for name in test_names:
        matched = rag._fuzzy_match_survey_name(name)
        print(f" '{name}' → '{matched}'")

    # Test with the problematic query (regression check for filter mapping)
    print("\n🧪 TEST: Query that previously failed")
    print("Query: What questions were asked in the June 2025 Unity Poll?")

    filters = {"year": 2025, "month": "June", "survey_name": "Unity Poll"}
    result = rag.query_with_metadata(
        "What questions were asked in the June 2025 Unity Poll?",
        filters=filters
    )

    print(f"\n📊 Results:")
    print(f"Found: {result['num_sources']} questions")
    print(f"\n{result['answer'][:500]}...")

    print("\n" + "="*80)


if __name__ == "__main__":
    main()
questionnaire_vectorstores/poll_catalog.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "2023-06": {
3
+ "survey_name": "Vanderbilt_Unity_Poll",
4
+ "year": 2023,
5
+ "month": "June",
6
+ "poll_date": "2023-06",
7
+ "num_questions": 15,
8
+ "file": "questionnaire_data/Vanderbilt_Unity_Poll_2023_June_questions.json"
9
+ },
10
+ "2023-03": {
11
+ "survey_name": "Vanderbilt_Unity_Poll",
12
+ "year": 2023,
13
+ "month": "March",
14
+ "poll_date": "2023-03",
15
+ "num_questions": 8,
16
+ "file": "questionnaire_data/Vanderbilt_Unity_Poll_2023_March_questions.json"
17
+ },
18
+ "2023-09": {
19
+ "survey_name": "Vanderbilt_Unity_Poll",
20
+ "year": 2023,
21
+ "month": "September",
22
+ "poll_date": "2023-09",
23
+ "num_questions": 15,
24
+ "file": "questionnaire_data/Vanderbilt_Unity_Poll_2023_September_questions.json"
25
+ },
26
+ "2024-06": {
27
+ "survey_name": "Vanderbilt_Unity_Poll",
28
+ "year": 2024,
29
+ "month": "June",
30
+ "poll_date": "2024-06",
31
+ "num_questions": 5,
32
+ "file": "questionnaire_data/Vanderbilt_Unity_Poll_2024_June_questions.json"
33
+ },
34
+ "2024-03": {
35
+ "survey_name": "Vanderbilt_Unity_Poll",
36
+ "year": 2024,
37
+ "month": "March",
38
+ "poll_date": "2024-03",
39
+ "num_questions": 13,
40
+ "file": "questionnaire_data/Vanderbilt_Unity_Poll_2024_March_questions.json"
41
+ },
42
+ "2024-10": {
43
+ "survey_name": "Vanderbilt_Unity_Poll",
44
+ "year": 2024,
45
+ "month": "October",
46
+ "poll_date": "2024-10",
47
+ "num_questions": 14,
48
+ "file": "questionnaire_data/Vanderbilt_Unity_Poll_2024_October_questions.json"
49
+ },
50
+ "2024-09": {
51
+ "survey_name": "Vanderbilt_Unity_Poll",
52
+ "year": 2024,
53
+ "month": "September",
54
+ "poll_date": "2024-09",
55
+ "num_questions": 15,
56
+ "file": "questionnaire_data/Vanderbilt_Unity_Poll_2024_September_questions.json"
57
+ },
58
+ "2025-02": {
59
+ "survey_name": "Vanderbilt_Unity_Poll",
60
+ "year": 2025,
61
+ "month": "February",
62
+ "poll_date": "2025-02",
63
+ "num_questions": 17,
64
+ "file": "questionnaire_data/Vanderbilt_Unity_Poll_2025_February_questions.json"
65
+ },
66
+ "2025-06": {
67
+ "survey_name": "Vanderbilt_Unity_Poll",
68
+ "year": 2025,
69
+ "month": "June",
70
+ "poll_date": "2025-06",
71
+ "num_questions": 23,
72
+ "file": "questionnaire_data/Vanderbilt_Unity_Poll_2025_June_questions.json"
73
+ }
74
+ }
questionnaire_vectorstores/questions_index.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ langchain>=0.1.0
3
+ langchain-openai>=0.0.5
4
+ langchain-pinecone>=0.0.3
5
+ langgraph>=0.0.20
6
+ openai>=1.0.0
7
+ pinecone
8
+ python-dotenv>=1.0.0
9
+ pydantic>=2.0.0
survey_agent.py ADDED
@@ -0,0 +1,1175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multi-agent survey analysis system using LangGraph with Staged Research Briefs.
3
+
4
+ This orchestrates multiple data sources (questionnaires, toplines, crosstabs, SQL)
5
+ to answer complex survey research questions using sequential, adaptive research stages.
6
+
7
+ # TODO: REMOVE WHEN PIPELINES READY
8
+ When new pipelines (toplines, crosstabs, SQL) become available:
9
+ 1. Add pipeline name to SurveyAnalysisAgent.AVAILABLE_PIPELINES (line ~105)
10
+ 2. Add execution logic in _execute_stage() method (around line ~450)
11
+ 3. Search for "TODO: REMOVE WHEN PIPELINES READY" and remove those sections
12
+ 4. Update examples to include the new pipeline capabilities
13
+
14
+ Current Status:
15
+ - ✅ Questionnaire pipeline: ACTIVE
16
+ - ⏳ Toplines pipeline: Not yet implemented
17
+ - ⏳ Crosstabs pipeline: Not yet implemented
18
+ - ⏳ SQL pipeline: Not yet implemented
19
+ """
20
+
21
+ import os
22
+ import json
23
+ from typing import TypedDict, Literal, Annotated, List, Dict, Any, Optional, Union
24
+ from pathlib import Path
25
+ import operator
26
+
27
+ from langgraph.graph import StateGraph, START, END
28
+ from langgraph.checkpoint.memory import MemorySaver
29
+ from langchain_openai import ChatOpenAI
30
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
31
+ from pydantic import BaseModel, Field, ConfigDict
32
+
33
+ # Import the questionnaire RAG
34
+ from questionnaire_rag import QuestionnaireRAG
35
+
36
+ try:
37
+ from dotenv import load_dotenv
38
+ load_dotenv()
39
+ except ImportError:
40
+ pass
41
+
42
+
43
+ # ============================================================================
44
+ # STATE DEFINITIONS (PYDANTIC V2) - WITH STAGED RESEARCH
45
+ # ============================================================================
46
+
47
class QueryFilters(BaseModel):
    """Filters for data source queries - Pydantic v2 with strict schema.

    All fields default to None, meaning "do not filter on this dimension".
    The Field descriptions are surfaced to the planner LLM through structured
    output, so they double as instructions/examples for the model.
    """
    # extra="forbid" yields a strict JSON schema (required by OpenAI structured
    # outputs) and rejects hallucinated filter keys from the LLM.
    model_config = ConfigDict(extra="forbid")

    year: Optional[int] = Field(default=None, description="Year filter (e.g., 2025)")
    month: Optional[str] = Field(default=None, description="Month filter (e.g., 'February')")
    poll_date: Optional[str] = Field(default=None, description="Specific poll date (e.g., '2025-02-15')")
    survey_name: Optional[str] = Field(default=None, description="Survey name filter (e.g., 'Unity Poll')")
    topic: Optional[str] = Field(default=None, description="Topic filter")
    # Populated by later research stages with IDs extracted from earlier stages
    # (see _enrich_data_sources_with_context).
    question_ids: Optional[List[str]] = Field(default=None, description="Specific question IDs from previous stage")
57
+
58
+
59
class DataSource(BaseModel):
    """Represents a data source to query.

    One DataSource describes a single retrieval request against one pipeline;
    a research stage may contain several of these.
    """
    # Strict schema for OpenAI structured output.
    model_config = ConfigDict(extra="forbid")

    # Which pipeline services this request. Only "questionnaire" is currently
    # implemented (see SurveyAnalysisAgent.AVAILABLE_PIPELINES).
    source_type: Literal["questionnaire", "toplines", "crosstabs", "sql"]
    query_description: str = Field(description="What to retrieve from this source")
    filters: QueryFilters = Field(default_factory=QueryFilters, description="Filters to apply")
    # Lets staged research keep results from different stages separable
    # (e.g. "2024_questions" vs "2025_questions") at synthesis time.
    result_label: Optional[str] = Field(default=None, description="Label for these results (e.g., '2024_questions')")
67
+
68
+
69
class ResearchStage(BaseModel):
    """A single stage in a multi-stage research plan.

    Stages execute sequentially. A stage may consume context (e.g. question
    IDs) extracted from earlier stages via use_previous_results_for.
    """
    # Strict schema for OpenAI structured output.
    model_config = ConfigDict(extra="forbid")

    # 1-indexed for the LLM/humans; the executor tracks a 0-indexed cursor.
    stage_number: int = Field(description="Stage number (1-indexed)")
    description: str = Field(description="What this stage accomplishes")
    data_sources: List[DataSource] = Field(description="Data sources to query in this stage")
    depends_on_stages: List[int] = Field(default_factory=list, description="Which prior stages this depends on")
    # Free-text instruction interpreted heuristically by
    # _enrich_data_sources_with_context (currently only question-ID extraction).
    use_previous_results_for: Optional[str] = Field(
        default=None,
        description="How to use previous stage results (e.g., 'Extract question IDs from stage 1')"
    )
81
+
82
+
83
class ResearchBrief(BaseModel):
    """Research brief - can be either single-stage or multi-stage.

    Produced by the planner LLM via structured output. The action value
    drives graph routing:
      - "answer"/"followup": no data retrieval
      - "route_to_sources": simple one-shot retrieval using data_sources
      - "execute_stages": sequential research using stages
    """
    # Strict schema for OpenAI structured output.
    model_config = ConfigDict(extra="forbid")

    action: Literal["answer", "followup", "route_to_sources", "execute_stages"]
    followup_question: Optional[str] = Field(default=None, description="Follow-up question to ask user")
    reasoning: str = Field(description="Why this approach was chosen")

    # For simple queries (single-stage); empty unless action == "route_to_sources"
    data_sources: List[DataSource] = Field(default_factory=list, description="Data sources for simple queries")

    # For complex queries (multi-stage); empty unless action == "execute_stages"
    stages: List[ResearchStage] = Field(default_factory=list, description="Ordered stages of research")
96
+
97
+
98
class StageResult(BaseModel):
    """Results from executing one stage.

    One *_results field per pipeline; a field stays None when that pipeline
    was not queried in this stage (or is not yet implemented).
    """
    # Strict schema (kept consistent with the other models).
    model_config = ConfigDict(extra="forbid")

    # 1-indexed, matches ResearchStage.stage_number.
    stage_number: int
    # "partial" is set when some requested pipelines were unavailable.
    status: Literal["success", "partial", "failed"]
    questionnaire_results: Optional[Dict[str, Any]] = None
    toplines_results: Optional[Dict[str, Any]] = None
    crosstabs_results: Optional[Dict[str, Any]] = None
    sql_results: Optional[Dict[str, Any]] = None
    # Filled in by _extract_stage_context after the stage runs (e.g. question
    # IDs, and "unavailable_pipelines" noted by _execute_stage).
    extracted_context: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Key information extracted for next stages (e.g., question IDs)"
    )
112
+
113
+
114
class VerificationResult(BaseModel):
    """Result of verifying if data answers the question.

    When answers_question is False, missing_info/improvement_suggestion are
    fed back into the next research-brief attempt (retry loop).
    """
    # Strict schema for OpenAI structured output.
    model_config = ConfigDict(extra="forbid")

    answers_question: bool = Field(description="Whether the data fully answers the question")
    missing_info: Optional[str] = Field(default=None, description="What information is missing")
    improvement_suggestion: Optional[str] = Field(default=None, description="How to improve the research brief")
121
+
122
+
123
class SurveyAnalysisState(TypedDict):
    """State for the survey analysis agent - WITH STAGED RESEARCH"""
    # User interaction.
    # operator.add makes LangGraph append each node's returned messages
    # instead of replacing the list.
    messages: Annotated[List, operator.add]
    user_question: str

    # Planning: the brief produced by _generate_research_brief.
    research_brief: Optional[ResearchBrief]

    # Stage execution
    current_stage: int  # Which stage we're executing (0-indexed internally, but 1-indexed in models)
    stage_results: List[StageResult]  # Results from each completed stage

    # Legacy single-stage results (for backward compatibility with callers
    # that read these top-level keys instead of stage_results).
    questionnaire_results: Optional[Dict[str, Any]]
    toplines_results: Optional[Dict[str, Any]]
    crosstabs_results: Optional[Dict[str, Any]]
    sql_results: Optional[Dict[str, Any]]

    # Verification & synthesis
    verification: Optional[VerificationResult]
    final_answer: Optional[str]

    # Control flow: retry_count vs max_retries bounds the re-planning loop.
    retry_count: int
    max_retries: int
149
+
150
+
151
+ # ============================================================================
152
+ # SURVEY ANALYSIS ORCHESTRATOR - WITH STAGED RESEARCH
153
+ # ============================================================================
154
+
155
+ class SurveyAnalysisAgent:
156
+ """
157
+ Multi-agent system for analyzing survey data with staged research briefs.
158
+
159
+ Flow:
160
+ 1. User asks question
161
+ 2. Research brief agent decides: simple (one-shot) or complex (staged)
162
+ 3. For simple: run pipelines in parallel → verify → synthesize
163
+ 4. For complex: execute stages sequentially, each using previous results
164
+ 5. Final synthesis combines all stage results
165
+ """
166
+
167
    # TODO: REMOVE WHEN PIPELINES READY - START
    # Track which pipelines are currently available. Values must match
    # DataSource.source_type literals; _execute_stage checks membership here
    # before dispatching a request.
    AVAILABLE_PIPELINES = {"questionnaire"}  # Add "toplines", "crosstabs", "sql" as they become ready
    # TODO: REMOVE WHEN PIPELINES READY - END
171
+
172
+ def __init__(
173
+ self,
174
+ openai_api_key: str,
175
+ pinecone_api_key: str,
176
+ questionnaire_persist_dir: str = "./questionnaire_vectorstores",
177
+ max_retries: int = 2,
178
+ verbose: bool = True
179
+ ):
180
+ self.openai_api_key = openai_api_key
181
+ self.pinecone_api_key = pinecone_api_key
182
+ self.verbose = verbose
183
+ self.max_retries = max_retries
184
+
185
+ # Initialize LLM
186
+ self.llm = ChatOpenAI(
187
+ model=os.getenv("OPENAI_MODEL", "gpt-4o"),
188
+ temperature=0
189
+ )
190
+
191
+ # Initialize questionnaire RAG
192
+ if self.verbose:
193
+ print("Initializing questionnaire RAG system...")
194
+ self.questionnaire_rag = QuestionnaireRAG(
195
+ openai_api_key=openai_api_key,
196
+ pinecone_api_key=pinecone_api_key,
197
+ persist_directory=questionnaire_persist_dir,
198
+ verbose=verbose
199
+ )
200
+
201
+ # Build the graph
202
+ self.graph = self._build_graph()
203
+
204
+ if self.verbose:
205
+ print("✓ Survey analysis agent initialized with staged research capability")
206
+
207
    def _build_graph(self) -> StateGraph:
        """Build the LangGraph workflow with staged research support.

        Topology:
            START -> generate_research_brief
                  -> (followup: END | answer: synthesize | else: execute_stage)
            execute_stage -> extract_stage_context -> (loop back or verify)
            verify_results -> (synthesize | retry planning | give up)
            synthesize_response -> END

        NOTE(review): the annotation says StateGraph but workflow.compile()
        returns a compiled graph object — confirm the intended return type.
        """

        workflow = StateGraph(SurveyAnalysisState)

        # Add nodes (verify/synthesize/route-after-verification are defined
        # further down in this class).
        workflow.add_node("generate_research_brief", self._generate_research_brief)
        workflow.add_node("execute_stage", self._execute_stage)
        workflow.add_node("extract_stage_context", self._extract_stage_context)
        workflow.add_node("verify_results", self._verify_results)
        workflow.add_node("synthesize_response", self._synthesize_response)

        # Define edges
        workflow.add_edge(START, "generate_research_brief")

        # After research brief, route based on action
        workflow.add_conditional_edges(
            "generate_research_brief",
            self._route_after_brief,
            {
                "followup": END,
                "answer": "synthesize_response",
                "execute_stage": "execute_stage"
            }
        )

        # After stage execution, extract context for next stage
        workflow.add_edge("execute_stage", "extract_stage_context")

        # After context extraction, decide next step
        workflow.add_conditional_edges(
            "extract_stage_context",
            self._route_after_stage,
            {
                "next_stage": "execute_stage",  # More stages to go
                "verify": "verify_results"  # All stages done, verify
            }
        )

        # After verification, decide next step
        workflow.add_conditional_edges(
            "verify_results",
            self._route_after_verification,
            {
                "synthesize": "synthesize_response",
                "retry": "generate_research_brief",
                "give_up": "synthesize_response"
            }
        )

        # End after synthesis
        workflow.add_edge("synthesize_response", END)

        # Compile with in-memory checkpointing so conversations are resumable
        # by thread id.
        memory = MemorySaver()
        return workflow.compile(checkpointer=memory)
263
+
264
+ def _get_available_surveys_description(self) -> str:
265
+ """Get formatted description of available surveys for LLM prompt"""
266
+ survey_names = self.questionnaire_rag.get_available_survey_names()
267
+
268
+ if not survey_names:
269
+ return "No surveys currently loaded."
270
+
271
+ lines = ["Available survey names in the system:"]
272
+ for name in survey_names:
273
+ # Show both the stored name and common variations
274
+ lines.append(f" - Stored as: '{name}'")
275
+ # Parse variations
276
+ variations = []
277
+ # Remove underscores for common term
278
+ clean = name.replace("_", " ")
279
+ if clean != name:
280
+ variations.append(f"'{clean}'")
281
+ # Extract key words
282
+ words = clean.split()
283
+ if len(words) > 1:
284
+ # Last few words might be the short name
285
+ short_name = " ".join(words[-2:]) if len(words) >= 2 else words[-1]
286
+ if short_name != clean:
287
+ variations.append(f"'{short_name}'")
288
+
289
+ if variations:
290
+ lines.append(f" (users might say: {', '.join(variations)})")
291
+
292
+ lines.append("\nIMPORTANT: Use the exact stored name in your filters!")
293
+ return "\n".join(lines)
294
+
295
+ # TODO: REMOVE WHEN PIPELINES READY - START
296
+ def _get_pipeline_status_description(self) -> str:
297
+ """Get description of available vs unavailable pipelines"""
298
+ all_pipelines = {
299
+ "questionnaire": "Survey questions, response options, topics, skip logic, sampling",
300
+ "toplines": "Pre-computed response frequencies for each question",
301
+ "crosstabs": "Pre-computed cross-tabulations by demographics",
302
+ "sql": "Raw survey responses for custom analysis"
303
+ }
304
+
305
+ lines = []
306
+ for pipeline, description in all_pipelines.items():
307
+ status = "✅ AVAILABLE" if pipeline in self.AVAILABLE_PIPELINES else "❌ NOT YET AVAILABLE"
308
+ lines.append(f"{pipeline.capitalize()}: {description} {status}")
309
+
310
+ return "\n".join(lines)
311
+ # TODO: REMOVE WHEN PIPELINES READY - END
312
+
313
+ def _get_full_question_context(self, state: SurveyAnalysisState) -> str:
314
+ """
315
+ Build full question context from conversation history.
316
+
317
+ This handles cases where the user's question is split across multiple turns:
318
+ - Turn 1: "what questions were asked?"
319
+ - Turn 2: "June 2025, unity poll"
320
+
321
+ We need to combine these to understand the full intent.
322
+ """
323
+ messages = state.get("messages", [])
324
+
325
+ # Extract all human messages (excluding system/AI messages)
326
+ human_messages = []
327
+ for msg in messages:
328
+ if isinstance(msg, HumanMessage):
329
+ human_messages.append(msg.content)
330
+
331
+ if not human_messages:
332
+ return state["user_question"]
333
+
334
+ if self.verbose:
335
+ print(f"📝 Conversation history: {len(human_messages)} user message(s)")
336
+ for i, msg in enumerate(human_messages, 1):
337
+ print(f" {i}. {msg[:100]}..." if len(msg) > 100 else f" {i}. {msg}")
338
+
339
+ # If there's only one message, just use it
340
+ if len(human_messages) == 1:
341
+ return human_messages[0]
342
+
343
+ # Multiple messages - combine them intelligently
344
+ # The last message is usually the most specific (e.g., "June 2025, unity poll")
345
+ # Earlier messages provide the intent (e.g., "what questions were asked?")
346
+
347
+ # Check if the first message is a question and the second is a clarification
348
+ first_msg = human_messages[0].lower()
349
+ is_followup_scenario = any(word in first_msg for word in ["what", "which", "how", "show", "list", "tell"])
350
+
351
+ if is_followup_scenario and len(human_messages) == 2:
352
+ # Combine: "what questions were asked? [from] June 2025, unity poll"
353
+ combined = f"{human_messages[0]} (specifically: {human_messages[1]})"
354
+ if self.verbose:
355
+ print(f"🔗 Combined context: {combined}")
356
+ return combined
357
+
358
+ # For other cases, join all messages
359
+ combined = " | ".join(human_messages)
360
+ if self.verbose:
361
+ print(f"🔗 Combined context: {combined}")
362
+ return combined
363
+
364
+
365
+ # ========================================================================
366
+ # NODE FUNCTIONS
367
+ # ========================================================================
368
+
369
    def _generate_research_brief(self, state: SurveyAnalysisState) -> Dict[str, Any]:
        """Generate research brief - decides single-stage vs multi-stage approach.

        Calls the planner LLM with structured output (ResearchBrief). On a
        retry, the previous verification feedback is folded into the prompt.
        Resets current_stage/stage_results so a retry starts from scratch.
        """

        if self.verbose:
            print("\n=== GENERATING RESEARCH BRIEF ===")

        # Get full question context from conversation history
        question = self._get_full_question_context(state)
        original_question = state["user_question"]  # Keep original for reference

        if self.verbose and question != original_question:
            print(f"💬 Using full context from conversation history")

        retry_count = state.get("retry_count", 0)

        # Add context from verification if this is a retry
        verification_context = ""
        if state.get("verification") and retry_count > 0:
            verification_context = f"""
Previous attempt was insufficient:
- Missing: {state['verification'].missing_info}
- Suggestion: {state['verification'].improvement_suggestion}

Please improve the research plan based on this feedback.
"""

        # NOTE: the "# TODO" markers inside this f-string are prompt content
        # for humans editing the file, not Python comments.
        system_prompt = f"""You are a research planning expert for survey data analysis.

# TODO: REMOVE WHEN PIPELINES READY - Use dynamic status
Available data sources:
{self._get_pipeline_status_description()}

# TODO: REMOVE WHEN PIPELINES READY - START
⚠️ IMPORTANT: Currently ONLY the questionnaire pipeline is available.
- Do NOT create research plans that require toplines, crosstabs, or SQL
- If the user asks for results/data/analysis that requires those sources, use action="followup" to inform them
- Focus on what CAN be answered with questionnaires alone (question text, response options, topics, skip logic)
# TODO: REMOVE WHEN PIPELINES READY - END

{self._get_available_surveys_description()}

You have FOUR possible actions:

**1. followup** - Ask clarifying question if ambiguous OR if user asks for unavailable data

**2. answer** - Answer directly without data (system questions, general knowledge)

**3. route_to_sources** - Simple query that can be answered with parallel data retrieval
Use this for:
- "What questions were asked in June 2025?"
- "Show me all healthcare questions"
- Questions that don't require sequential reasoning

**4. execute_stages** - Complex query requiring STAGED research
Use this for:
- Queries with "most/least/best/worst" (need stage 1: retrieve, stage 2: analyze)
- Comparative queries "compare 2024 vs 2025" (need separate stages to maintain context)
- Queries depending on intermediate results
- "What demographics differ most?" (stage 1: get questions, stage 2: get crosstabs for those questions)

# TODO: REMOVE WHEN PIPELINES READY - START
NOTE: Since toplines/crosstabs/SQL aren't available, only use execute_stages for comparing questionnaires
# TODO: REMOVE WHEN PIPELINES READY - END

When using stages:
- Each stage can use results from previous stages via `use_previous_results_for`
- Later stages can filter by question_ids extracted from earlier stages
- Each stage can have a `result_label` to maintain separate contexts

CRITICAL FILTERING RULES:
- **Survey Names**: User queries like "Unity Poll" or "Vanderbilt Unity Poll" should map to the exact stored name shown above
- When you see "Unity Poll" in a query, use the exact stored name in your filter
- Only specify filters if explicitly mentioned or clearly implied
- For staged queries, be explicit about how each stage uses previous results
- Use `question_ids` filter when later stages need specific questions from earlier stages
- Year and month are usually sufficient - survey_name is optional unless needed for disambiguation

{verification_context}

Examples:

# TODO: REMOVE WHEN PIPELINES READY - START
User asks for results/analysis → Inform them:
Q: "What were the topline results for June 2025?"
Brief:
  action: followup
  followup_question: "I can show you the questions asked in June 2025, but topline results aren't available yet. Would you like to see the questions?"
# TODO: REMOVE WHEN PIPELINES READY - END

User says "Unity Poll" → Use stored name in filter:
Q: "What questions were asked in June 2025 Unity Poll?"
Brief:
  action: route_to_sources
  data_sources: [questionnaire with year=2025, month=June, survey_name='Vanderbilt_Unity_Poll']

Simple Query → route_to_sources:
Q: "What questions were asked in June 2025?"
Brief:
  action: route_to_sources
  data_sources: [questionnaire with June 2025 filters]

Complex Query → execute_stages:
Q: "Compare immigration questions from 2024 vs 2025"
Brief:
  action: execute_stages
  stages:
    - stage 1: Get 2024 immigration questions (label: "2024_questions")
    - stage 2: Get 2025 immigration questions (label: "2025_questions")
    - stage 3: Compare the two sets in synthesis
"""

        # Structured output guarantees a parseable ResearchBrief instance.
        brief_generator = self.llm.with_structured_output(ResearchBrief)

        brief = brief_generator.invoke([
            SystemMessage(content=system_prompt),
            HumanMessage(content=f"User question: {question}\n\nGenerate a research brief.")
        ])

        # Debug dump of the chosen plan.
        if self.verbose:
            print(f"Action: {brief.action}")
            print(f"Reasoning: {brief.reasoning}")

            if brief.followup_question:
                print(f"Follow-up: {brief.followup_question}")

            if brief.action == "route_to_sources" and brief.data_sources:
                print(f"Simple query - {len(brief.data_sources)} data sources")
                for ds in brief.data_sources:
                    # Only show filters the LLM actually set.
                    filters_dict = {k: v for k, v in ds.filters.model_dump().items() if v is not None}
                    print(f"  - {ds.source_type}: {ds.query_description}")
                    if filters_dict:
                        print(f"    Filters: {filters_dict}")

            if brief.action == "execute_stages" and brief.stages:
                print(f"Staged query - {len(brief.stages)} stages")
                for stage in brief.stages:
                    print(f"\nStage {stage.stage_number}: {stage.description}")
                    if stage.depends_on_stages:
                        print(f"  Depends on: stages {stage.depends_on_stages}")
                    if stage.use_previous_results_for:
                        print(f"  Uses previous: {stage.use_previous_results_for}")
                    for ds in stage.data_sources:
                        print(f"  - {ds.source_type}: {ds.query_description}")
                        if ds.result_label:
                            print(f"    Label: {ds.result_label}")

        # Reset the stage cursor and results so retries start fresh.
        return {
            "research_brief": brief,
            "current_stage": 0,  # Start at stage 0 (will execute stage 1 first)
            "stage_results": [],
            "messages": [AIMessage(content=f"[Research plan: {brief.action}]")]
        }
521
+
522
+ def _route_after_brief(self, state: SurveyAnalysisState) -> str:
523
+ """Route based on research brief action"""
524
+ brief = state["research_brief"]
525
+
526
+ if brief.action == "followup":
527
+ return "followup"
528
+ elif brief.action == "answer":
529
+ return "answer"
530
+ elif brief.action == "execute_stages":
531
+ return "execute_stage"
532
+ else: # route_to_sources
533
+ return "execute_stage" # We'll handle both single and staged in execute_stage
534
+
535
+ def _execute_stage(self, state: SurveyAnalysisState) -> Dict[str, Any]:
536
+ """Execute one stage of research (handles both single-stage and multi-stage)"""
537
+
538
+ brief = state["research_brief"]
539
+ current_stage_idx = state.get("current_stage", 0)
540
+ previous_stage_results = state.get("stage_results", [])
541
+
542
+ # Determine if this is single-stage or multi-stage
543
+ if brief.action == "route_to_sources":
544
+ # Single-stage: use data_sources directly
545
+ if self.verbose:
546
+ print(f"\n=== EXECUTING SINGLE-STAGE RESEARCH ===")
547
+
548
+ stage_data_sources = brief.data_sources
549
+ stage_desc = "Single-stage retrieval"
550
+
551
+ elif brief.action == "execute_stages":
552
+ # Multi-stage: get current stage
553
+ stage = brief.stages[current_stage_idx]
554
+
555
+ if self.verbose:
556
+ print(f"\n=== EXECUTING STAGE {stage.stage_number}/{len(brief.stages)} ===")
557
+ print(f"Description: {stage.description}")
558
+
559
+ stage_data_sources = stage.data_sources
560
+ stage_desc = stage.description
561
+
562
+ # If this stage depends on previous stages, enrich filters with context
563
+ if stage.use_previous_results_for and previous_stage_results:
564
+ stage_data_sources = self._enrich_data_sources_with_context(
565
+ stage_data_sources,
566
+ previous_stage_results,
567
+ stage.use_previous_results_for
568
+ )
569
+ else:
570
+ return {}
571
+
572
+ # Execute pipelines for this stage
573
+ stage_result = StageResult(
574
+ stage_number=current_stage_idx + 1,
575
+ status="success"
576
+ )
577
+
578
+ # TODO: REMOVE WHEN PIPELINES READY - Track what was attempted vs available
579
+ attempted_pipelines = []
580
+ unavailable_pipelines = []
581
+
582
+ # Run each pipeline
583
+ for ds in stage_data_sources:
584
+ filters_dict = {k: v for k, v in ds.filters.model_dump().items() if v is not None}
585
+
586
+ # TODO: REMOVE WHEN PIPELINES READY - START
587
+ attempted_pipelines.append(ds.source_type)
588
+ # TODO: REMOVE WHEN PIPELINES READY - END
589
+
590
+ if ds.source_type == "questionnaire":
591
+ if self.verbose:
592
+ print(f"\nQuerying questionnaire: {ds.query_description}")
593
+ if filters_dict:
594
+ print(f"Filters: {filters_dict}")
595
+
596
+ result = self.questionnaire_rag.query_with_metadata(
597
+ question=ds.query_description,
598
+ filters=filters_dict if filters_dict else None
599
+ )
600
+
601
+ # Store with label if provided
602
+ if ds.result_label:
603
+ result["label"] = ds.result_label
604
+
605
+ stage_result.questionnaire_results = result if stage_result.questionnaire_results is None else {
606
+ "multiple": True,
607
+ "results": [stage_result.questionnaire_results, result]
608
+ }
609
+
610
+ if self.verbose:
611
+ print(f"Retrieved {result['num_sources']} questions")
612
+
613
+ # TODO: REMOVE WHEN PIPELINES READY - START
614
+ elif ds.source_type not in self.AVAILABLE_PIPELINES:
615
+ unavailable_pipelines.append(ds.source_type)
616
+ if self.verbose:
617
+ print(f"\n⚠️ {ds.source_type.upper()} pipeline not yet available - skipping")
618
+ print(f" Requested: {ds.query_description}")
619
+ # TODO: REMOVE WHEN PIPELINES READY - END
620
+
621
+ # TODO: REMOVE WHEN PIPELINES READY - START
622
+ # Add a note about unavailable pipelines to the stage result
623
+ if unavailable_pipelines:
624
+ if self.verbose:
625
+ print(f"\n⚠️ Stage {current_stage_idx + 1} incomplete: {len(unavailable_pipelines)} pipeline(s) unavailable")
626
+ stage_result.status = "partial"
627
+ # Store info about what was unavailable for the synthesizer
628
+ if not stage_result.extracted_context:
629
+ stage_result.extracted_context = {}
630
+ stage_result.extracted_context["unavailable_pipelines"] = unavailable_pipelines
631
+ # TODO: REMOVE WHEN PIPELINES READY - END
632
+
633
+ # Add stage result to list
634
+ updated_stage_results = previous_stage_results + [stage_result]
635
+
636
+ # For single-stage, also populate legacy fields
637
+ if brief.action == "route_to_sources":
638
+ return {
639
+ "stage_results": updated_stage_results,
640
+ "questionnaire_results": stage_result.questionnaire_results,
641
+ "toplines_results": stage_result.toplines_results,
642
+ "crosstabs_results": stage_result.crosstabs_results,
643
+ "sql_results": stage_result.sql_results
644
+ }
645
+
646
+ return {
647
+ "stage_results": updated_stage_results
648
+ }
649
+
650
    def _enrich_data_sources_with_context(
        self,
        data_sources: List[DataSource],
        previous_results: List[StageResult],
        use_instruction: str
    ) -> List[DataSource]:
        """Enrich data sources with context from previous stages.

        Interprets use_instruction heuristically: only the question-ID case is
        implemented. Returns copies (model_copy) so the original brief's data
        sources are never mutated; if the instruction is not recognized the
        input list is returned unchanged.
        """

        if self.verbose:
            print(f"  Enriching with context: {use_instruction}")

        # For now, handle the most common case: extracting question IDs
        if "question" in use_instruction.lower() and "id" in use_instruction.lower():
            # Extract question IDs from previous questionnaire results
            question_ids = []
            for prev_result in previous_results:
                if prev_result.questionnaire_results:
                    q_results = prev_result.questionnaire_results
                    if "source_questions" in q_results:
                        # NOTE(review): entries without a "question_id" key
                        # contribute None here — confirm upstream always sets it.
                        question_ids.extend([q.get("question_id") for q in q_results["source_questions"]])

            if question_ids and self.verbose:
                print(f"  Found {len(question_ids)} question IDs from previous stages")

            # Add question_ids to filters (on copies, leaving originals intact)
            enriched_sources = []
            for ds in data_sources:
                new_filters = ds.filters.model_copy()
                new_filters.question_ids = question_ids if question_ids else None

                enriched_ds = ds.model_copy()
                enriched_ds.filters = new_filters
                enriched_sources.append(enriched_ds)

            return enriched_sources

        # Unrecognized instruction: pass the sources through untouched.
        return data_sources
687
+
688
    def _extract_stage_context(self, state: SurveyAnalysisState) -> Dict[str, Any]:
        """Extract key context from completed stage for use in next stages.

        Runs immediately after _execute_stage. Pulls question IDs out of the
        stage's questionnaire results and stores them on the StageResult so
        later stages (via _enrich_data_sources_with_context) can filter by them.
        """

        stage_results = state.get("stage_results", [])
        if not stage_results:
            # Defensive: nothing has executed yet.
            return {}

        # The stage that just finished is always the last entry.
        current_result = stage_results[-1]

        # Extract question IDs if questionnaire results exist
        extracted_context = {}

        if current_result.questionnaire_results:
            q_results = current_result.questionnaire_results
            if "source_questions" in q_results:
                question_ids = [q.get("question_id") for q in q_results["source_questions"]]
                extracted_context["question_ids"] = question_ids

                if self.verbose:
                    print(f"\n=== EXTRACTED CONTEXT FROM STAGE {current_result.stage_number} ===")
                    print(f"Question IDs: {len(question_ids)} extracted")

        # Update the stage result with extracted context. This mutates the
        # StageResult in place; the list in state holds the same object, so
        # no state update is returned for it.
        current_result.extracted_context = extracted_context

        return {}
715
+ def _route_after_stage(self, state: SurveyAnalysisState) -> str:
716
+ """Decide if we need to execute another stage or move to verification"""
717
+
718
+ brief = state["research_brief"]
719
+ current_stage_idx = state.get("current_stage", 0)
720
+
721
+ # Single-stage query
722
+ if brief.action == "route_to_sources":
723
+ if self.verbose:
724
+ print("\n=== SINGLE-STAGE COMPLETE → VERIFICATION ===")
725
+ return "verify"
726
+
727
+ # Multi-stage query
728
+ total_stages = len(brief.stages)
729
+ next_stage_idx = current_stage_idx + 1
730
+
731
+ if next_stage_idx < total_stages:
732
+ if self.verbose:
733
+ print(f"\n=== MORE STAGES REMAINING ({next_stage_idx + 1}/{total_stages}) → NEXT STAGE ===")
734
+ return "next_stage"
735
+ else:
736
+ if self.verbose:
737
+ print(f"\n=== ALL {total_stages} STAGES COMPLETE → VERIFICATION ===")
738
+ return "verify"
739
+
740
    def _verify_results(self, state: SurveyAnalysisState) -> Dict[str, Any]:
        """Check whether the retrieved data can answer the user's question.

        Applies cheap heuristics first (auto-pass simple "what questions"
        queries that returned results, auto-pass when only not-yet-available
        pipelines were missing, auto-fail when nothing was retrieved) and only
        falls back to an LLM verification call for everything else.

        Returns:
            Dict with a "verification" VerificationResult; when verification
            fails it also carries an incremented "retry_count".
        """

        if self.verbose:
            print("\n=== VERIFYING RESULTS ===")

        # Build full question context from conversation history
        question = self._get_full_question_context(state)

        if self.verbose and question != state["user_question"]:
            print(f"💬 Using full context: {question[:150]}...")

        stage_results = state.get("stage_results", [])
        brief = state["research_brief"]

        # Build summary of what we retrieved
        retrieval_summary = []
        total_questions = 0

        # TODO: REMOVE WHEN PIPELINES READY - START
        unavailable_pipelines_found = []
        # TODO: REMOVE WHEN PIPELINES READY - END

        for stage_result in stage_results:
            if stage_result.questionnaire_results:
                q_res = stage_result.questionnaire_results
                num = q_res.get("num_sources", 0)
                total_questions += num
                retrieval_summary.append(f"Stage {stage_result.stage_number}: Retrieved {num} questions")

            # TODO: REMOVE WHEN PIPELINES READY - START
            # Check if any pipelines were unavailable
            if stage_result.extracted_context and "unavailable_pipelines" in stage_result.extracted_context:
                unavailable = stage_result.extracted_context["unavailable_pipelines"]
                unavailable_pipelines_found.extend(unavailable)
                retrieval_summary.append(f"Stage {stage_result.stage_number}: ⚠️ {', '.join(unavailable)} not yet available")
            # TODO: REMOVE WHEN PIPELINES READY - END

        if not retrieval_summary:
            retrieval_summary.append("No data was retrieved")
        # Simple heuristic: if this is a single-stage simple query and we got results, auto-pass
        if brief.action == "route_to_sources" and len(stage_results) == 1 and total_questions > 0:
            # Check if question is a simple "what questions" type query
            question_lower = question.lower()
            simple_patterns = ["what question", "which question", "list question", "show question", "questions asked"]

            if any(pattern in question_lower for pattern in simple_patterns):
                if self.verbose:
                    print(f"✓ Auto-pass: Simple question retrieval with {total_questions} results")

                return {
                    "verification": VerificationResult(
                        answers_question=True,
                        missing_info=None,
                        improvement_suggestion=None
                    )
                }

        # TODO: REMOVE WHEN PIPELINES READY - START
        # If we have unavailable pipelines but got questionnaire data, auto-pass with note
        if unavailable_pipelines_found and total_questions > 0:
            if self.verbose:
                print(f"✓ Auto-pass: Got questionnaire data, {len(unavailable_pipelines_found)} pipeline(s) not yet available")

            return {
                "verification": VerificationResult(
                    answers_question=True,
                    missing_info=None,
                    improvement_suggestion=None
                )
            }
        # TODO: REMOVE WHEN PIPELINES READY - END

        # If we got 0 results, auto-fail without calling LLM
        if total_questions == 0:
            if self.verbose:
                print("✗ Auto-fail: No results retrieved")

            return {
                "verification": VerificationResult(
                    answers_question=False,
                    missing_info="No data was retrieved",
                    improvement_suggestion="Adjust filters or search criteria"
                ),
                "retry_count": state.get("retry_count", 0) + 1
            }

        # For other cases, use LLM verification
        system_prompt = """You are a verification expert. Your ONLY job is to check if the retrieved data matches what the user asked for.

CRITICAL RULES:
1. **Match the question literally** - Don't add requirements the user didn't ask for
   - If they asked "what questions were asked?" and we retrieved questions → SUCCESS
   - If they asked "what are the results?" and we only have questions → FAILURE

2. **Don't overthink it** - Keep it simple:
   - Did we retrieve the type of data they asked for? (questions, results, etc.)
   - Is it from the right time period/survey they specified?
   - Is there enough data (at least 1 result)?

3. **Only fail if there's an actual problem**:
   - We retrieved the wrong type of data (e.g., questions when they asked for results)
   - We retrieved from the wrong time period/survey

4. **Do NOT fail if**:
   - User asked for questions and we got questions (even if we don't have "analysis")
   - User asked for data from June 2025 and that's what we got
   - The data seems sufficient to answer their actual question

Be practical, not pedantic. If the retrieved data can answer what they asked, approve it.
"""

        # Structured output forces the LLM reply into a VerificationResult.
        verifier = self.llm.with_structured_output(VerificationResult)

        verification = verifier.invoke([
            SystemMessage(content=system_prompt),
            HumanMessage(content=f"""
User question: "{question}"

What we retrieved:
{chr(10).join(retrieval_summary)}

Simple question: Can we answer their question with this data? YES or NO.
""")
        ])

        if self.verbose:
            print(f"Answers question: {verification.answers_question}")
            if not verification.answers_question:
                print(f"Missing: {verification.missing_info}")
                print(f"Suggestion: {verification.improvement_suggestion}")

        # ⭐ INCREMENT RETRY COUNT IF VERIFICATION FAILS
        updates = {"verification": verification}
        if not verification.answers_question:
            current_retry = state.get("retry_count", 0)
            updates["retry_count"] = current_retry + 1

        return updates
+ def _route_after_verification(self, state: SurveyAnalysisState) -> str:
881
+ """Route based on verification result"""
882
+
883
+ verification = state["verification"]
884
+ retry_count = state.get("retry_count", 0)
885
+ max_retries = state.get("max_retries", self.max_retries)
886
+
887
+ if verification.answers_question:
888
+ return "synthesize"
889
+ elif retry_count < max_retries:
890
+ if self.verbose:
891
+ print(f"\n⚠️ Retry {retry_count + 1}/{max_retries}")
892
+ return "retry"
893
+ else:
894
+ if self.verbose:
895
+ print(f"\n⚠️ Max retries reached, proceeding with partial results")
896
+ return "give_up"
897
+
898
    def _synthesize_response(self, state: SurveyAnalysisState) -> Dict[str, Any]:
        """Synthesize the final user-facing answer from all stage results.

        Four paths: (1) ``followup`` — return the brief's clarifying question;
        (2) ``answer`` — ask the LLM directly with no retrieved data;
        (3) single stage with a single pipeline — pass that pipeline's answer
        through untouched; (4) otherwise — build a combined context from every
        stage and ask the LLM to synthesize.

        Returns:
            Dict with "final_answer" (str) and "messages" (list with one
            AIMessage) to merge into the graph state.
        """

        if self.verbose:
            print("\n=== SYNTHESIZING RESPONSE ===")

        brief = state["research_brief"]

        # Get full question context from conversation history
        full_question = self._get_full_question_context(state)

        if self.verbose and full_question != state["user_question"]:
            print(f"💬 Using full context: {full_question[:150]}...")

        # Handle followup action
        if brief.action == "followup":
            if self.verbose:
                print("Returning followup question")
            return {
                "final_answer": brief.followup_question,
                "messages": [AIMessage(content=brief.followup_question)]
            }

        # Handle direct answer (no data retrieval)
        if brief.action == "answer":
            if self.verbose:
                print("Generating direct answer without data")
            answer = self.llm.invoke([
                SystemMessage(content="Answer the user's question directly."),
                HumanMessage(content=full_question)
            ]).content

            return {
                "final_answer": answer,
                "messages": [AIMessage(content=answer)]
            }

        # Get stage results
        stage_results = state.get("stage_results", [])

        if not stage_results:
            if self.verbose:
                print("No stage results available")
            return {
                "final_answer": "I was unable to retrieve any data to answer your question.",
                "messages": [AIMessage(content="I was unable to retrieve any data to answer your question.")]
            }

        # CASE 1: Single stage with single pipeline → return direct answer
        if len(stage_results) == 1:
            stage_result = stage_results[0]

            # Check if only one pipeline returned data
            # NOTE(review): only the questionnaire pipeline is counted here —
            # presumably toplines/crosstabs/SQL checks get added when those
            # pipelines land; confirm before relying on this count.
            pipelines_with_data = 0
            direct_answer = None

            if stage_result.questionnaire_results:
                pipelines_with_data += 1
                direct_answer = stage_result.questionnaire_results.get("answer")

            if pipelines_with_data == 1 and direct_answer:
                if self.verbose:
                    print("Single stage, single pipeline - returning direct answer (no synthesis)")
                return {
                    "final_answer": direct_answer,
                    "messages": [AIMessage(content=direct_answer)]
                }

        # CASE 2: Multiple stages or multiple pipelines → synthesize
        if self.verbose:
            print(f"Synthesizing from {len(stage_results)} stage(s)")

        # Build context from all stages
        context_parts = []

        # TODO: REMOVE WHEN PIPELINES READY - START
        unavailable_pipelines_overall = []
        # TODO: REMOVE WHEN PIPELINES READY - END

        for i, stage_result in enumerate(stage_results, 1):
            if stage_result.questionnaire_results:
                q_res = stage_result.questionnaire_results

                # Check if this is a labeled result
                label = q_res.get("label", f"Stage {i}")

                context_parts.append(f"\n=== {label.upper()} ===")
                context_parts.append(f"Stage {i} results:")
                context_parts.append(q_res.get("answer", "No answer available"))

            # TODO: REMOVE WHEN PIPELINES READY - START
            # Track unavailable pipelines for note in synthesis
            if stage_result.extracted_context and "unavailable_pipelines" in stage_result.extracted_context:
                unavailable = stage_result.extracted_context["unavailable_pipelines"]
                unavailable_pipelines_overall.extend(unavailable)
                context_parts.append(f"\n⚠️ Note: {', '.join(unavailable)} data was requested but not yet available")
            # TODO: REMOVE WHEN PIPELINES READY - END

        # TODO: REMOVE WHEN PIPELINES READY - START
        unavailable_note = ""
        if unavailable_pipelines_overall:
            unique_unavailable = list(set(unavailable_pipelines_overall))
            unavailable_note = f"""

⚠️ IMPORTANT: The following data sources were requested but are not yet available:
{', '.join(unique_unavailable).upper()}

Please answer based on the questionnaire data that IS available, and note any limitations.
"""
        # TODO: REMOVE WHEN PIPELINES READY - END

        synthesis_prompt = f"""Synthesize results from {'multiple stages' if len(stage_results) > 1 else 'the research'} to answer the user's question.

User question: {full_question}

Research plan: {brief.reasoning}

Retrieved data:
{chr(10).join(context_parts)}

{unavailable_note}

Instructions:
- If this is a comparative query, clearly organize by the comparison dimensions
- If this is an analytical query (most/least/best/worst), perform the analysis
- Preserve important details from the research
- Use natural language, be clear and organized
- Cite which poll(s) or stage(s) information comes from
- Do NOT make up information not in the retrieved data
- TODO: REMOVE WHEN PIPELINES READY - If some data sources weren't available, clearly state this and explain what you CAN provide
"""

        final_answer = self.llm.invoke([
            SystemMessage(content="You are a survey data analyst synthesizing research results."),
            HumanMessage(content=synthesis_prompt)
        ]).content

        if self.verbose:
            print("Synthesis complete")

        return {
            "final_answer": final_answer,
            "messages": [AIMessage(content=final_answer)]
        }
+ # ========================================================================
1044
+ # PUBLIC API
1045
+ # ========================================================================
1046
+
1047
+ def query(self, question: str, thread_id: str = "default") -> str:
1048
+ """
1049
+ Query the survey analysis system.
1050
+
1051
+ Args:
1052
+ question: User's question
1053
+ thread_id: Conversation thread ID for memory
1054
+
1055
+ Returns:
1056
+ Answer string
1057
+
1058
+ Note: When using the same thread_id across multiple calls, the conversation
1059
+ context is preserved. For example:
1060
+ - Call 1: query("what questions were asked?", thread_id="user_123")
1061
+ - Call 2: query("June 2025, unity poll", thread_id="user_123")
1062
+
1063
+ The second call will understand the full context.
1064
+ """
1065
+
1066
+ # Create initial state for this turn
1067
+ # Note: LangGraph's operator.add annotation will append to existing messages
1068
+ # from the checkpointer, not replace them
1069
+ initial_state = {
1070
+ "messages": [HumanMessage(content=question)],
1071
+ "user_question": question,
1072
+ "research_brief": None,
1073
+ "current_stage": 0,
1074
+ "stage_results": [],
1075
+ "questionnaire_results": None,
1076
+ "toplines_results": None,
1077
+ "crosstabs_results": None,
1078
+ "sql_results": None,
1079
+ "verification": None,
1080
+ "final_answer": None,
1081
+ "retry_count": 0,
1082
+ "max_retries": self.max_retries
1083
+ }
1084
+
1085
+ config = {"configurable": {"thread_id": thread_id}}
1086
+
1087
+ if self.verbose:
1088
+ print(f"\n🧵 Thread ID: {thread_id}")
1089
+
1090
+ final_state = self.graph.invoke(initial_state, config)
1091
+
1092
+ return final_state["final_answer"]
1093
+
1094
+ def stream_query(self, question: str, thread_id: str = "default"):
1095
+ """Stream the query execution for real-time updates"""
1096
+
1097
+ initial_state = {
1098
+ "messages": [HumanMessage(content=question)],
1099
+ "user_question": question,
1100
+ "research_brief": None,
1101
+ "current_stage": 0,
1102
+ "stage_results": [],
1103
+ "questionnaire_results": None,
1104
+ "toplines_results": None,
1105
+ "crosstabs_results": None,
1106
+ "sql_results": None,
1107
+ "verification": None,
1108
+ "final_answer": None,
1109
+ "retry_count": 0,
1110
+ "max_retries": self.max_retries
1111
+ }
1112
+
1113
+ config = {"configurable": {"thread_id": thread_id}}
1114
+
1115
+ for event in self.graph.stream(initial_state, config):
1116
+ yield event
1117
+
1118
+
1119
+ # ============================================================================
1120
+ # CLI INTERFACE
1121
+ # ============================================================================
1122
+
1123
def main():
    """Interactive CLI: prompt for questions in a loop and print agent answers."""
    import sys

    # Both API keys are required before the agent can be constructed.
    keys = {name: os.getenv(name) for name in ("OPENAI_API_KEY", "PINECONE_API_KEY")}
    if not all(keys.values()):
        print("Error: Missing API keys")
        print("Set OPENAI_API_KEY and PINECONE_API_KEY environment variables")
        sys.exit(1)

    print("Initializing survey analysis agent...")
    agent = SurveyAnalysisAgent(
        openai_api_key=keys["OPENAI_API_KEY"],
        pinecone_api_key=keys["PINECONE_API_KEY"],
        verbose=True
    )

    bar = "=" * 80
    print("\n" + bar)
    print("SURVEY ANALYSIS AGENT (WITH STAGED RESEARCH)")
    print(bar)
    print("\nType 'quit' to exit\n")

    # One shared thread keeps conversational context for the whole session.
    thread_id = "cli_session"

    while True:
        try:
            user_text = input("\nYour question: ").strip()

            if not user_text or user_text.lower() in ('quit', 'exit', 'q'):
                print("\nGoodbye!")
                return

            print("\n" + "-" * 80)
            response = agent.query(user_text, thread_id=thread_id)
            print("\n" + bar)
            print("ANSWER:")
            print(bar)
            print(response)
            print(bar)

        except KeyboardInterrupt:
            print("\n\nGoodbye!")
            return
        except Exception as e:
            print(f"\nError: {e}")
            # Re-raise for a full traceback only when DEBUG is set.
            if os.getenv("DEBUG"):
                raise
# Run the interactive CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()