jdesiree commited on
Commit
79845af
·
verified ·
1 Parent(s): 14f74c5

Upload 7 files

Browse files
favicon.ico ADDED
gradio_analytics.py ADDED
@@ -0,0 +1,538 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gradio_analytics.py
2
+ import gradio as gr
3
+ import logging
4
+ import json
5
+ import sqlite3
6
+ import os
7
+ from datetime import datetime
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
# Import the real analytics backend from the main app. When app.py cannot be
# imported (e.g. running this page standalone), fall back to no-op mocks so
# the dashboard still renders. Each mock mirrors the real function's
# signature and return shape — assumed to match app.py; TODO confirm.
try:
    from app import (
        get_trackio_database_path,
        get_project_statistics_with_nulls,
        get_recent_interactions_with_nulls,
        create_dashboard_html_with_nulls,
        calculate_response_quality,
        refresh_analytics_data_persistent as refresh_analytics_data,
        export_metrics_json_persistent as export_metrics_json,
        export_metrics_csv_persistent as export_metrics_csv,
        load_analytics_state,
        get_global_state_debug_info,
        sync_trackio_with_global_state,
        global_state_manager,
        evaluate_educational_quality_with_tracking,
    )
except ImportError:
    def get_trackio_database_path(project_name):
        # None signals "no trackio database available".
        return None

    def get_project_statistics_with_nulls(cursor, project_name):
        # Null stats render as empty fields in the Project Stats JSON panel.
        return {
            "total_conversations": None,
            "avg_session_length": None,
            "success_rate": None
        }

    def get_recent_interactions_with_nulls(cursor, project_name, limit=10):
        return []

    def create_dashboard_html_with_nulls(project_name, project_stats):
        return f"<div>Mock dashboard for {project_name}</div>"

    def calculate_response_quality(response):
        return 3.0

    def refresh_analytics_data():
        # Shape matches (project_stats, recent_interactions, dashboard_html).
        return {}, [], "<div>Mock analytics</div>"

    def export_metrics_json():
        gr.Info("Mock JSON export")

    def export_metrics_csv():
        gr.Info("Mock CSV export")

    def load_analytics_state():
        # Same 3-tuple shape as refresh_analytics_data().
        return {}, [], "<div>Mock analytics state</div>"

    def get_global_state_debug_info():
        return {"status": "mock"}

    def sync_trackio_with_global_state():
        pass

    def evaluate_educational_quality_with_tracking(*args, **kwargs):
        return {"educational_score": 0.5}

    class MockStateManager:
        # Minimal stand-in for app.global_state_manager covering only the
        # methods this page calls.
        def get_cache_status(self):
            return {"status": "mock"}
        def get_evaluation_summary(self, include_history=False):
            # NOTE: no 'avg_educational_quality' key — show_evaluation_metrics
            # relies on its own except-branch when running against this mock.
            return {"aggregate_metrics": {}, "total_evaluations": {}}
        def clear_all_states(self):
            pass
        def _backup_to_hf_dataset(self):
            pass

    global_state_manager = MockStateManager()
79
+
80
def load_custom_css():
    """Load styles.css for the analytics page.

    Returns:
        The CSS text, or an empty string when the file is missing or
        unreadable (the page then falls back to default Gradio styling).
    """
    try:
        with open("styles.css", "r", encoding="utf-8") as css_file:
            css_content = css_file.read()
        # FIX: was an f-string with no placeholder; lazy %-args also defer
        # formatting until the record is actually emitted.
        logger.info("CSS loaded successfully for analytics page (%d chars)", len(css_content))
        return css_content
    except FileNotFoundError:
        logger.warning("styles.css file not found for analytics page")
        return ""
    except Exception as e:
        logger.warning("Error reading styles.css: %s", e)
        return ""
92
+
93
def show_cache_info():
    """Return a Markdown summary of the HuggingFace model cache.

    Scans /tmp/huggingface and reports total disk usage plus a per-repo
    breakdown (size, type, revision count). On any failure — cache dir
    missing, huggingface_hub not installed — returns a human-readable
    error string instead of raising, so the UI always gets text.
    """
    # FIX: removed unused `from pathlib import Path` import.
    try:
        from huggingface_hub import scan_cache_dir

        cache_info = scan_cache_dir(cache_dir="/tmp/huggingface")

        info_text = f"""
**HuggingFace Cache Status:**

**Total Size:** {cache_info.size_on_disk / (1024**3):.2f} GB
**Number of Repos:** {len(cache_info.repos)}

**Cached Models:**
"""

        for repo in cache_info.repos:
            size_gb = repo.size_on_disk / (1024**3)
            info_text += f"""
- **{repo.repo_id}**
  - Size: {size_gb:.2f} GB
  - Type: {repo.repo_type}
  - Revisions: {len(repo.revisions)}
"""

        return info_text

    except Exception as e:
        return f"Error inspecting cache: {str(e)}"
122
+
123
def launch_external_trackio():
    """Spawn the standalone trackio dashboard for the 'Mimir' project.

    Blocks until the `trackio show` subprocess exits, then reports success
    or failure to the user via Gradio toasts; never raises.
    """
    try:
        import subprocess

        completed = subprocess.run(
            ["trackio", "show", "--project", "Mimir"],
            capture_output=False,
            text=True
        )

        if completed.returncode != 0:
            gr.Warning("Could not launch trackio dashboard")
        else:
            gr.Info("Trackio dashboard launched in browser")

    except Exception as e:
        # Covers trackio not being installed (FileNotFoundError) and more.
        logger.error(f"Failed to launch trackio: {e}")
        gr.Warning(f"Failed to launch trackio dashboard: {str(e)}")
140
+
141
def show_cache_status():
    """Render a Markdown report of the global-state cache for the Status panel.

    Returns the report text; on failure returns (and toasts) the error
    message, so the bound Markdown component always receives a string.
    """
    try:
        debug_info = get_global_state_debug_info()
        # The mock backend has no "cache_status" key; the .get() defaults
        # below keep the report rendering either way.
        cache_status = debug_info.get("cache_status", {})

        status_text = f"""
**Global State Cache Status:**
- Session ID: {cache_status.get('session_id', 'Unknown')}
- Analytics Cached: {'Yes' if cache_status.get('analytics_cached') else 'No'}
- Conversation Cached: {'Yes' if cache_status.get('conversation_cached') else 'No'}
- Analytics Last Refresh: {cache_status.get('analytics_last_refresh', 'Never')}
- Total Analytics Sessions: {cache_status.get('total_analytics_sessions', 0)}
- Total Conversation Sessions: {cache_status.get('total_conversation_sessions', 0)}

**Analytics Data Status:**
- Has Analytics Data: {'Yes' if cache_status.get('analytics_has_data') else 'No'}
- Conversation Length: {cache_status.get('conversation_length', 0)} messages
- Chat History Length: {cache_status.get('chat_history_length', 0)} messages

*Last Updated: {datetime.now().strftime('%H:%M:%S')}*
"""

        gr.Info("Cache status updated - check the Status panel")
        return status_text

    except Exception as e:
        error_text = f"Error getting cache status: {str(e)}"
        gr.Warning(error_text)
        return error_text
170
+
171
def manual_backup_to_hf():
    """Trigger an immediate backup of global state to the HF dataset.

    Returns a status string for the Backup Status textbox; never raises.
    """
    # NOTE(review): calls a private method on the state manager — presumably
    # no public backup API exists; confirm against state manager.
    try:
        global_state_manager._backup_to_hf_dataset()
        gr.Info("Manual backup to HF dataset completed successfully")
        finished_at = datetime.now().strftime('%H:%M:%S')
        return f"Backup completed at {finished_at}"
    except Exception as e:
        gr.Warning(f"Backup failed: {str(e)}")
        return f"Backup failed: {str(e)}"
179
+
180
def get_persistence_status():
    """Summarize where and when global state is persisted.

    Returns a dict for the Persistence Information JSON panel; on failure
    returns {"error": ...} instead of raising.
    """
    try:
        # NOTE(review): reads private attributes of the state manager
        # (_db_path, _last_hf_backup, _hf_backup_interval) — a public accessor
        # would be safer. Also assumes _last_hf_backup is a datetime (strftime
        # raises if it is None; the except below converts that into the error
        # dict). TODO confirm these attributes against the state manager.
        status_info = {
            "SQLite DB": "Active" if os.path.exists(global_state_manager._db_path) else "Not Found",
            "HF Dataset": global_state_manager.dataset_repo,
            "Last HF Backup": global_state_manager._last_hf_backup.strftime('%Y-%m-%d %H:%M:%S'),
            "DB Path": global_state_manager._db_path,
            "Backup Interval": f"{global_state_manager._hf_backup_interval}s"
        }
        return status_info
    except Exception as e:
        return {"error": str(e)}
192
+
193
def clear_all_global_states():
    """Wipe every cached global state and return placeholder dashboard values.

    Returns (project_stats, recent_interactions, dashboard_html), matching
    the outputs of the refresh handlers. If clearing fails, falls back to
    reloading the current analytics state (same 3-tuple shape).
    """
    try:
        global_state_manager.clear_all_states()
        gr.Info("All global states cleared successfully")

        cleared_stats = {
            "total_conversations": None,
            "avg_session_length": None,
            "success_rate": None,
            "model_type": "Cleared",
            "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

        placeholder_html = """
<div style="text-align: center; padding: 40px; border: 2px dashed #ccc; border-radius: 8px; background: #f8f9fa;">
<h3>States Cleared</h3>
<p>All global states have been cleared.</p>
<p>Click "Refresh Data" to reload analytics.</p>
</div>
"""

        return cleared_stats, [], placeholder_html

    except Exception as e:
        gr.Warning(f"Failed to clear states: {str(e)}")
        return load_analytics_state()
219
+
220
def show_evaluation_metrics():
    """Fetch the evaluation summary and shape it for the dashboard tables.

    Returns (summary_dict, aggregate_rows, recent_rows). Any failure —
    including the mock backend's missing aggregate keys — yields
    ({}, [], []) so the UI components always receive valid values.
    """
    try:
        summary = global_state_manager.get_evaluation_summary(include_history=True)

        aggregates = summary['aggregate_metrics']
        metrics_rows = [
            ["Educational Quality", f"{aggregates['avg_educational_quality']:.3f}"],
            ["User Satisfaction", f"{aggregates['user_satisfaction_rate']:.3f}"]
        ]

        recent_rows = []
        if 'history' in summary:
            # Only the five most recent scores are shown.
            for item in summary['history']['recent_educational_scores'][-5:]:
                recent_rows.append([
                    item['timestamp'][:16],
                    f"{item['educational_score']:.3f}",
                    f"{item['semantic_quality']:.3f}",
                    f"{item['response_time']:.3f}s"
                ])

        return summary, metrics_rows, recent_rows

    except Exception as e:
        logger.error(f"Error getting evaluation metrics: {e}")
        return {}, [], []
244
+
245
def sync_and_refresh_all():
    """Sync trackio with global state, then refresh every dashboard panel.

    Returns a 6-tuple matching the "Sync & Refresh All" button's outputs:
    (project_stats, recent_interactions, dashboard_html,
     eval_summary, metrics_rows, recent_evaluation_rows).
    """
    try:
        sync_trackio_with_global_state()
        project_stats, recent_interactions, dashboard_html = refresh_analytics_data()
        eval_summary, metrics_data, recent_evaluations = show_evaluation_metrics()

        gr.Info("All data synced and refreshed successfully")

        return project_stats, recent_interactions, dashboard_html, eval_summary, metrics_data, recent_evaluations

    except Exception as e:
        logger.error(f"Sync and refresh failed: {e}")
        gr.Warning(f"Sync failed: {str(e)}")
        # load_analytics_state() yields a 3-tuple; pad with empty evaluation
        # values so the arity still matches the six wired output components.
        return load_analytics_state() + ({}, [], [])
259
+
260
with gr.Blocks() as demo:
    # Inject styles.css (if present) so this page matches the rest of the app.
    custom_css = load_custom_css()
    if custom_css:
        gr.HTML(f'<style>{custom_css}</style>')

    gr.HTML('<div class="analytics-title"><h2>Mimir Analytics Dashboard</h2></div>')

    gr.Markdown("Monitor educational AI performance and effectiveness metrics with persistent state management.")

    with gr.Tabs():
        # ------------------------------------------------------------------
        # Tab 1: usage analytics — refresh/sync controls, exports, recent
        # sessions table.
        # ------------------------------------------------------------------
        with gr.TabItem("Traditional Analytics"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("## Controls")
                    refresh_btn = gr.Button("Refresh Data", variant="primary")
                    sync_all_btn = gr.Button("Sync & Refresh All", variant="primary")

                    with gr.Row():
                        export_json_btn = gr.Button("Export JSON", variant="secondary", size="sm")
                        export_csv_btn = gr.Button("Export CSV", variant="secondary", size="sm")

                    launch_trackio_btn = gr.Button("Launch Trackio Dashboard", variant="secondary")

                    gr.Markdown("### State Management")
                    with gr.Row():
                        cache_status_btn = gr.Button("Cache Status", size="sm")
                        clear_states_btn = gr.Button("Clear All States", size="sm", variant="stop")

                    with gr.Group():
                        gr.Markdown("### Project Information")
                        project_info = gr.JSON(
                            value={
                                "total_conversations": None,
                                "avg_session_length": None,
                                "success_rate": None,
                                "model_type": None
                            },
                            label="Project Stats"
                        )

                    with gr.Group():
                        gr.Markdown("### System Status")
                        status_panel = gr.Markdown(
                            "Click 'Cache Status' to view global state information.",
                            label="Status Information"
                        )

                with gr.Column(scale=2):
                    gr.Markdown("## Key Metrics Dashboard")
                    trackio_iframe = gr.HTML(
                        value="""
<div style="text-align: center; padding: 40px; border: 2px dashed #ccc; border-radius: 8px; background: #f8f9fa;">
<h3>Trackio Dashboard</h3>
<p>Analytics data will appear here after conversations.</p>
<p>Data is automatically cached and persists across page navigation.</p>
<p>To launch trackio dashboard separately, run:</p>
<code style="background: #e9ecef; padding: 4px 8px; border-radius: 4px;">trackio show --project "Mimir"</code>
</div>
""",
                        label="Dashboard"
                    )

            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Recent Interactions")
                    gr.Markdown("*Data persists when switching between Chatbot and Analytics pages*")
                    recent_metrics = gr.Dataframe(
                        headers=["Timestamp", "Response Time", "Prompt Mode", "Tools Used", "Quality Score", "Adapter"],
                        datatype=["str", "number", "str", "bool", "number", "str"],
                        row_count=10,
                        col_count=6,
                        interactive=False,
                        label="Latest Sessions",
                        value=[],
                        show_label=True
                    )

        # ------------------------------------------------------------------
        # Tab 2: model/agent evaluation metrics.
        # ------------------------------------------------------------------
        with gr.TabItem("ML Performance"):
            gr.Markdown("## Agent-Based Performance & Global State Metrics")

            with gr.Row():
                with gr.Column(scale=1):
                    eval_metrics_btn = gr.Button("Get Evaluation Metrics", variant="primary")

                    with gr.Group():
                        gr.Markdown("### Model Cache Status")
                        cache_status_display = gr.JSON(
                            value={},
                            label="Cache Information"
                        )

                with gr.Column(scale=2):
                    gr.Markdown("### Aggregate Performance Metrics")
                    eval_metrics_table = gr.Dataframe(
                        headers=["Metric", "Score"],
                        datatype=["str", "str"],
                        label="Model Performance",
                        value=[]
                    )

                    eval_summary_display = gr.JSON(
                        value={},
                        label="Detailed Evaluation Summary"
                    )

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Recent Quality Evaluations")
                    recent_evaluations_table = gr.Dataframe(
                        headers=["Timestamp", "Educational Score", "Semantic Quality", "Response Time"],
                        datatype=["str", "str", "str", "str"],
                        label="Recent Evaluations",
                        value=[]
                    )

        # ------------------------------------------------------------------
        # Tab 3: diagnostics — cache details, persistence controls, HF model
        # cache viewer.
        # ------------------------------------------------------------------
        with gr.TabItem("System Status"):
            gr.Markdown("## Global State Manager & System Diagnostics")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Global State Cache")
                    cache_details = gr.Markdown("Click 'Show Cache Status' to view detailed information.")

                    show_cache_btn = gr.Button("Show Cache Status", variant="primary")
                    refresh_cache_btn = gr.Button("Refresh Cache Info", variant="secondary")

                    gr.Markdown("### Persistence Controls")
                    backup_btn = gr.Button("Manual Backup to HF Dataset", variant="primary")
                    backup_status = gr.Textbox(label="Backup Status", value="No recent backup", interactive=False)

                with gr.Column():
                    gr.Markdown("### System Actions")
                    sync_trackio_btn = gr.Button("Sync to Database", variant="secondary")
                    clear_all_btn = gr.Button("Clear All Global States", variant="stop")

                    gr.Markdown("### Persistence Status")
                    persistence_info = gr.JSON(
                        value={},
                        label="Persistence Information"
                    )

                    gr.Markdown("### Performance Monitor")
                    perf_info = gr.JSON(
                        value={},
                        label="Performance Information"
                    )

            # NEW: HuggingFace Cache Viewer Section
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 🗂️ HuggingFace Model Cache")
                    gr.Markdown("*View cached models and disk usage*")

                    cache_viewer_btn = gr.Button("Inspect Model Cache", variant="primary", size="lg")

                    with gr.Row():
                        # TODO: these two buttons still have no handlers wired.
                        clear_cache_btn = gr.Button("Clear Cache (⚠️ Dangerous)", variant="stop", size="sm")
                        refresh_models_btn = gr.Button("Re-download Models", variant="secondary", size="sm")

                    cache_info_display = gr.Markdown(
                        "Click **Inspect Model Cache** to view detailed cache information.",
                        label="Cache Details"
                    )

    # --- Page-load hooks: restore cached analytics and persistence info ---
    demo.load(
        load_analytics_state,
        inputs=None,
        outputs=[project_info, recent_metrics, trackio_iframe],
        show_progress="hidden"
    )

    demo.load(
        fn=lambda: global_state_manager.get_cache_status(),
        inputs=None,
        outputs=[cache_status_display],
        show_progress="hidden"
    )

    demo.load(
        fn=get_persistence_status,
        inputs=None,
        outputs=[persistence_info],
        show_progress="hidden"
    )

    # --- Traditional Analytics tab wiring ---
    refresh_btn.click(
        fn=refresh_analytics_data,
        inputs=[],
        outputs=[project_info, recent_metrics, trackio_iframe],
        show_progress="full"
    )

    sync_all_btn.click(
        fn=sync_and_refresh_all,
        inputs=[],
        outputs=[project_info, recent_metrics, trackio_iframe, eval_summary_display, eval_metrics_table, recent_evaluations_table],
        show_progress="full"
    )

    export_json_btn.click(
        fn=export_metrics_json,
        inputs=[],
        outputs=[],
        show_progress="full"
    )

    export_csv_btn.click(
        fn=export_metrics_csv,
        inputs=[],
        outputs=[],
        show_progress="full"
    )

    launch_trackio_btn.click(
        fn=launch_external_trackio,
        inputs=[],
        outputs=[],
        show_progress="full"
    )

    cache_status_btn.click(
        fn=show_cache_status,
        inputs=[],
        outputs=[status_panel],
        show_progress="full"
    )

    clear_states_btn.click(
        fn=clear_all_global_states,
        inputs=[],
        outputs=[project_info, recent_metrics, trackio_iframe],
        show_progress="full"
    )

    # --- ML Performance tab wiring ---
    eval_metrics_btn.click(
        fn=show_evaluation_metrics,
        inputs=[],
        outputs=[eval_summary_display, eval_metrics_table, recent_evaluations_table],
        show_progress="full"
    )

    # --- System Status tab wiring ---
    show_cache_btn.click(
        fn=show_cache_status,
        inputs=[],
        outputs=[cache_details],
        show_progress="full"
    )

    refresh_cache_btn.click(
        fn=lambda: global_state_manager.get_cache_status(),
        inputs=[],
        outputs=[perf_info],
        show_progress="full"
    )

    backup_btn.click(
        fn=manual_backup_to_hf,
        inputs=[],
        outputs=[backup_status],
        show_progress="full"
    )

    sync_trackio_btn.click(
        fn=sync_trackio_with_global_state,
        inputs=[],
        outputs=[],
        show_progress="full"
    )

    clear_all_btn.click(
        fn=clear_all_global_states,
        inputs=[],
        outputs=[project_info, recent_metrics, trackio_iframe],
        show_progress="full"
    )

    # FIX: wire the cache viewer button — it previously had no click handler,
    # leaving show_cache_info() as dead code and the "Inspect Model Cache"
    # button inert.
    cache_viewer_btn.click(
        fn=show_cache_info,
        inputs=[],
        outputs=[cache_info_display],
        show_progress="full"
    )
535
+
536
# Standalone entry point on port 7861; in the full app this page is mounted
# by app.py instead.
if __name__ == "__main__":
    logger.info("Running analytics dashboard standalone with global state management")
    demo.launch(server_name="0.0.0.0", server_port=7861)
gradio_chatbot.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gradio_chatbot.py
2
+ import gradio as gr
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+ from app import (
8
+ add_user_message,
9
+ add_loading_animation,
10
+ generate_response,
11
+ reset_conversation,
12
+ load_conversation_state,
13
+ remove_loading_animations,
14
+ global_state_manager,
15
+ )
16
+
17
+
18
def load_custom_css():
    """Load styles.css for the chatbot page.

    Returns:
        The CSS text, or an empty string when the file is missing or
        unreadable, so the UI falls back to default styling.
    """
    try:
        with open("styles.css", "r", encoding="utf-8") as css_file:
            css_content = css_file.read()
        # FIX: use lazy %-args instead of an f-string so the message is only
        # formatted when the record is actually emitted.
        logger.info("CSS loaded successfully, length: %d characters", len(css_content))
        return css_content
    except FileNotFoundError:
        logger.warning("styles.css file not found, using default styling")
        return ""
    except Exception as e:
        logger.warning("Error reading styles.css: %s", e)
        return ""
30
+
31
+
32
def restore_state_on_page_access():
    """
    Rehydrate the chatbot UI from the global state manager.

    Invoked on demo.load so the conversation survives page navigation.
    Returns (chat_history, conversation_state); both empty on any failure.
    """
    try:
        saved = global_state_manager.get_conversation_state()
        history = saved.get('chat_history', [])
        conversation = saved.get('conversation_state', [])

        logger.info(f"✓ Restored state: {len(history)} messages in chat, {len(conversation)} in conversation")

        return history, conversation
    except Exception as e:
        logger.error(f"Failed to restore state: {e}")
        return [], []
48
+
49
+
50
with gr.Blocks() as demo:
    # Inject styles.css (if present) so the chat page matches the app theme.
    custom_css = load_custom_css()
    if custom_css:
        gr.HTML(f'<style>{custom_css}</style>')

    # Backend-format conversation history, kept in parallel with the
    # Chatbot component's display history.
    conversation_state = gr.State([])

    gr.HTML('<div class="title-header"><h1>Mimir</h1></div>')

    with gr.Row():
        chatbot = gr.Chatbot(
            type="messages",
            show_copy_button=True,
            show_share_button=False,
            layout="bubble",
            autoscroll=True,
            avatar_images=None,
            elem_id="main-chatbot",
            scale=1,
            height="65vh",
            value=[],
            # Render $$...$$ as display math and $...$ inline.
            latex_delimiters=[
                {"left": "$$", "right": "$$", "display": True},
                {"left": "$", "right": "$", "display": False},
            ]
        )

    with gr.Row(elem_classes=["input-controls"]):
        msg = gr.Textbox(
            placeholder="Ask me about math, research, study strategies, or any educational topic...",
            show_label=False,
            lines=6,
            max_lines=8,
            elem_classes=["input-textbox"],
            container=False,
            scale=4
        )
        with gr.Column(elem_classes=["button-column"], scale=1):
            send = gr.Button("Send", elem_classes=["send-button"], size="sm")
            clear = gr.Button("Clear", elem_classes=["clear-button"], size="sm")


    # Restore any persisted conversation when the page is (re)opened;
    # queue=False so restoration is not delayed behind queued generations.
    demo.load(
        fn=restore_state_on_page_access,
        outputs=[chatbot, conversation_state],
        queue=False
    )

    # Enter key: echo the user message, show a loading bubble, then generate
    # the response — three chained steps sharing the same state outputs.
    msg.submit(
        add_user_message,
        inputs=[msg, chatbot, conversation_state],
        outputs=[msg, chatbot, conversation_state],
        show_progress="hidden"
    ).then(
        add_loading_animation,
        inputs=[chatbot, conversation_state],
        outputs=[chatbot, conversation_state],
        show_progress="hidden"
    ).then(
        generate_response,
        inputs=[chatbot, conversation_state],
        outputs=[chatbot, conversation_state],
        show_progress="hidden"
    )

    # Send button: identical three-step chain as pressing Enter.
    send.click(
        add_user_message,
        inputs=[msg, chatbot, conversation_state],
        outputs=[msg, chatbot, conversation_state],
        show_progress="hidden"
    ).then(
        add_loading_animation,
        inputs=[chatbot, conversation_state],
        outputs=[chatbot, conversation_state],
        show_progress="hidden"
    ).then(
        generate_response,
        inputs=[chatbot, conversation_state],
        outputs=[chatbot, conversation_state],
        show_progress="hidden"
    )

    # Clear button: reset both display history and backend state.
    clear.click(
        reset_conversation,
        outputs=[chatbot, conversation_state],
        show_progress="hidden"
    )
137
+
138
+
139
# Standalone entry point on port 7860; in the full app this page is mounted
# by app.py instead.
if __name__ == "__main__":
    logger.info("Running chatbot interface standalone")
    demo.launch(server_name="0.0.0.0", server_port=7860)
gradio_prompt_testing.py ADDED
@@ -0,0 +1,1634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gradio_pipeline_testing.py
2
+ """
3
+ Full Pipeline Testing Interface for Mimir Educational AI Assistant
4
+
5
+ Tests the complete orchestration flow with comprehensive metrics at every step.
6
+ Captures conditional model activation, token usage, timing, and quality metrics.
7
+
8
+ Output: CSV file with ~110 columns capturing full pipeline journey
9
+ """
10
+
11
+ import os
12
+ import sys
13
+ import io
14
+ import csv
15
+ import json
16
+ import time
17
+ import logging
18
+ import warnings
19
+ from datetime import datetime
20
+ from typing import Dict, List, Optional, Tuple, Any
21
+ from collections import Counter
22
+
23
+ # Core dependencies
24
+ import torch
25
+ import gradio as gr
26
+ import numpy as np
27
+
28
+ # ============================================================================
29
+ # ENVIRONMENT SETUP
30
+ # ============================================================================
31
+ HF_CACHE = "/tmp/huggingface"
32
+ os.makedirs(f"{HF_CACHE}/hub", exist_ok=True)
33
+ os.environ['HF_HOME'] = HF_CACHE
34
+ os.environ['HF_HUB_CACHE'] = f"{HF_CACHE}/hub"
35
+
36
+ # ============================================================================
37
+ # IMPORTS FROM MIMIR APPLICATION
38
+ # ============================================================================
39
+ try:
40
+ from agents import (
41
+ ToolDecisionAgent,
42
+ PromptRoutingAgents,
43
+ ThinkingAgents,
44
+ ResponseAgent,
45
+ get_shared_qwen3
46
+ )
47
+ AGENTS_AVAILABLE = True
48
+ except ImportError as e:
49
+ print(f"⚠️ Warning: Could not import agents: {e}")
50
+ AGENTS_AVAILABLE = False
51
+
52
+ try:
53
+ from state_manager import GlobalStateManager, LogicalExpressions
54
+ STATE_MANAGER_AVAILABLE = True
55
+ except ImportError as e:
56
+ print(f"⚠️ Warning: Could not import state_manager: {e}")
57
+ STATE_MANAGER_AVAILABLE = False
58
+
59
+ try:
60
+ from prompt_library import (
61
+ CORE_IDENTITY,
62
+ TOOL_DECISION,
63
+ agent_1_system,
64
+ agent_2_system,
65
+ agent_3_system,
66
+ agent_4_system,
67
+ MATH_THINKING,
68
+ QUESTION_ANSWER_DESIGN,
69
+ REASONING_THINKING,
70
+ VAUGE_INPUT,
71
+ USER_UNDERSTANDING,
72
+ GENERAL_FORMATTING,
73
+ LATEX_FORMATTING,
74
+ GUIDING_TEACHING,
75
+ STRUCTURE_PRACTICE_QUESTIONS,
76
+ PRACTICE_QUESTION_FOLLOWUP,
77
+ TOOL_USE_ENHANCEMENT,
78
+ )
79
+ PROMPTS_AVAILABLE = True
80
+ except ImportError as e:
81
+ print(f"⚠️ Warning: Could not import prompt_library: {e}")
82
+ PROMPTS_AVAILABLE = False
83
+
84
# Try to import post processor
# NOTE(review): exec_module below runs ALL of app.py's module-level code just
# to grab one attribute — heavyweight and side-effectful. Presumably app.py
# cannot be imported via `import app` here without conflicts; confirm this is
# intentional.
try:
    # Import the post processor class/module from app.py
    import importlib.util
    spec = importlib.util.spec_from_file_location("app_module", "app.py")
    app_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(app_module)
    post_processor = app_module.post_processor
    POST_PROCESSOR_AVAILABLE = True
except Exception as e:
    print(f"⚠️ Warning: Could not import post_processor: {e}")
    POST_PROCESSOR_AVAILABLE = False
    # Create dummy
    class DummyPostProcessor:
        # Pass-through stand-in: returns the response unchanged.
        def process_response(self, response, user_message):
            return response
    post_processor = DummyPostProcessor()
101
+
102
# ZeroGPU support
# On HF Spaces the `spaces` package provides the @spaces.GPU decorator;
# elsewhere substitute a no-op stand-in so decorated functions still run.
try:
    import spaces
    ZERO_GPU_AVAILABLE = True
except ImportError:
    ZERO_GPU_AVAILABLE = False
    class DummySpaces:
        @staticmethod
        def GPU(duration=600):
            # No-op decorator matching the spaces.GPU(duration=...) call shape.
            def decorator(func):
                return func
            return decorator
    spaces = DummySpaces()
115
+
116
+ # Tiktoken for accurate token counting
117
+ try:
118
+ import tiktoken
119
+ TIKTOKEN_AVAILABLE = True
120
+ except ImportError:
121
+ TIKTOKEN_AVAILABLE = False
122
+ print("⚠️ Warning: tiktoken not available - using fallback token counting")
123
+
124
+ # Textstat for readability metrics
125
+ try:
126
+ import textstat
127
+ TEXTSTAT_AVAILABLE = True
128
+ except ImportError:
129
+ TEXTSTAT_AVAILABLE = False
130
+ print("⚠️ Warning: textstat not available - using manual readability calculations")
131
+
132
+ # ============================================================================
133
+ # LOGGING SETUP
134
+ # ============================================================================
135
+ logging.basicConfig(
136
+ level=logging.INFO,
137
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
138
+ )
139
+ logger = logging.getLogger(__name__)
140
+
141
+ warnings.filterwarnings("ignore", category=UserWarning)
142
+ warnings.filterwarnings("ignore", category=FutureWarning)
143
+
144
+ CURRENT_YEAR = datetime.now().year
145
+
146
+ # ============================================================================
147
+ # GLOBAL INSTANCES
148
+ # ============================================================================
149
# Instantiate the full agent pipeline once at import time. Unlike the other
# optional dependencies above, agents and the state manager are hard
# requirements for this testing UI — fail fast with a clear error when they
# are missing.
if AGENTS_AVAILABLE and STATE_MANAGER_AVAILABLE:
    try:
        global_state_manager = GlobalStateManager()
        logical_expressions = LogicalExpressions()
        tool_agent = ToolDecisionAgent()
        routing_agents = PromptRoutingAgents()
        thinking_agents = ThinkingAgents()
        response_agent = ResponseAgent()

        logger.info("✓ All agents initialized successfully")
    except Exception as e:
        # Re-raise: a partially initialized pipeline would fail confusingly
        # later, during a test run.
        logger.error(f"Failed to initialize agents: {e}")
        raise
else:
    logger.error("Cannot initialize - missing core dependencies")
    raise ImportError("Missing required modules: agents or state_manager")
165
+
166
# ============================================================================
# CSV SCHEMA DEFINITION
# ============================================================================
# Column order is significant: rows are written positionally by the CSV
# exporter. The schema is assembled from per-stage groups; the four routing
# agents and three thinking agents share a common 7-column layout, generated
# below to keep the groups consistent.

_IDENT_COLS = [
    "prompt_index", "timestamp", "user_prompt",
    "user_prompt_tokens", "user_prompt_chars", "user_prompt_words",
]

_CONTEXT_COLS = ["conversation_history_length", "conversation_history_tokens"]

_TOOL_COLS = [
    "tool_decision_input_template", "tool_decision_input_tokens",
    "tool_decision_output", "tool_decision_output_tokens",
    "tool_decision_result", "tool_decision_time_seconds",
    "tool_decision_gpu_peak_mb",
]

_REGEX_COLS = ["regex_checks_applied", "regex_checks_time_seconds"]

# Routing agents 1-4; agent 4 historically uses the plural "decisions" key.
_ROUTING_COLS = []
for _n, _decision_suffix in ((1, "decision"), (2, "decision"),
                             (3, "decision"), (4, "decisions")):
    _ROUTING_COLS += [
        f"agent{_n}_input_template", f"agent{_n}_input_tokens",
        f"agent{_n}_output", f"agent{_n}_output_tokens",
        f"agent{_n}_{_decision_suffix}", f"agent{_n}_time_seconds",
        f"agent{_n}_gpu_peak_mb",
    ]

# Math / QA-design / Reasoning thinking agents share one layout.
_THINKING_COLS = []
for _prefix in ("math_thinking", "qa_design", "reasoning"):
    _THINKING_COLS += [
        f"{_prefix}_activated", f"{_prefix}_input_template",
        f"{_prefix}_input_tokens", f"{_prefix}_output",
        f"{_prefix}_output_tokens", f"{_prefix}_time_seconds",
        f"{_prefix}_gpu_peak_mb",
    ]

_ASSEMBLY_COLS = [
    "active_response_prompts", "final_prompt_template", "final_prompt_tokens",
    "final_prompt_chars", "final_prompt_words", "assembly_time_seconds",
]

_RESPONSE_COLS = [
    "response_input_template", "response_input_tokens", "response_raw",
    "response_raw_tokens", "response_raw_chars", "response_raw_words",
    "response_generation_time_seconds", "response_gpu_peak_mb",
    "response_tokens_per_second",
]

_POSTPROCESS_COLS = [
    "response_processed", "response_processed_tokens",
    "response_processed_chars", "response_processed_words",
    "postprocessing_time_seconds",
]

_QUALITY_COLS = [
    "flesch_reading_ease", "flesch_kincaid_grade", "completeness_score",
    "specificity_score", "repetition_ratio", "unique_word_ratio",
    "avg_sentence_length", "question_answered",
]

_OVERALL_COLS = [
    "total_pipeline_time_seconds", "total_input_tokens", "total_output_tokens",
    "total_gpu_peak_mb", "models_activated_count", "models_activated_list",
]

# Full ordered schema (100 columns).
CSV_COLUMNS = (
    _IDENT_COLS + _CONTEXT_COLS + _TOOL_COLS + _REGEX_COLS + _ROUTING_COLS
    + _THINKING_COLS + _ASSEMBLY_COLS + _RESPONSE_COLS + _POSTPROCESS_COLS
    + _QUALITY_COLS + _OVERALL_COLS
)
302
+
303
+ # ============================================================================
304
+ # TOKEN COUNTING FUNCTIONS
305
+ # ============================================================================
306
+
307
def count_tokens_accurate(text: str) -> int:
    """
    Estimate the token count of *text*.

    Uses tiktoken's ``cl100k_base`` encoding (GPT-3.5/4 vocabulary, a good
    general estimator) when the library is installed; otherwise — or on any
    encoding failure — falls back to a whitespace word count.

    Args:
        text: Input text to tokenize.

    Returns:
        Estimated token count (0 for empty input).
    """
    if not text:
        return 0

    if TIKTOKEN_AVAILABLE:
        try:
            return len(tiktoken.get_encoding("cl100k_base").encode(text))
        except Exception as exc:
            logger.warning(f"tiktoken encoding failed: {exc}, using fallback")

    # Fallback: whitespace word count approximation.
    return len(text.split())
332
+
333
+
334
def count_words(text: str) -> int:
    """Return the number of whitespace-separated words in *text* (0 if empty)."""
    return len(text.split()) if text else 0
339
+
340
+
341
def count_sentences(text: str) -> int:
    """Count sentences by splitting on runs of '.', '!' or '?' (simple heuristic)."""
    if not text:
        return 0
    import re
    fragments = re.split(r'[.!?]+', text)
    return sum(1 for fragment in fragments if fragment.strip())
348
+
349
+
350
+ # ============================================================================
351
+ # GPU MEMORY TRACKING
352
+ # ============================================================================
353
+
354
def get_gpu_memory() -> Dict[str, float]:
    """
    Snapshot current CUDA memory usage.

    Returns:
        Mapping with ``allocated_mb``, ``reserved_mb`` and ``peak_mb``
        (all 0.0 when no CUDA device is available).
    """
    if not torch.cuda.is_available():
        return {"allocated_mb": 0.0, "reserved_mb": 0.0, "peak_mb": 0.0}

    bytes_per_mb = 1024 ** 2
    return {
        "allocated_mb": torch.cuda.memory_allocated() / bytes_per_mb,
        "reserved_mb": torch.cuda.memory_reserved() / bytes_per_mb,
        "peak_mb": torch.cuda.max_memory_allocated() / bytes_per_mb,
    }
372
+
373
+
374
def reset_gpu_stats():
    """Clear CUDA peak-memory counters and synchronize; no-op without CUDA."""
    if not torch.cuda.is_available():
        return
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()
379
+
380
+
381
+ # ============================================================================
382
+ # TEMPLATE BUILDING FUNCTIONS
383
+ # ============================================================================
384
+
385
def format_history(history: List[Dict]) -> str:
    """
    Render up to the last 8 conversation messages as ``role: content`` lines.

    Each message's content is truncated to 100 characters; missing keys fall
    back to ``unknown`` / ``""``. An empty history yields the sentinel string
    "No previous conversation".
    """
    if not history:
        return "No previous conversation"

    lines = [
        f"{entry.get('role', 'unknown')}: {entry.get('content', '')[:100]}"
        for entry in history[-8:]
    ]
    return "\n".join(lines)
397
+
398
+
399
def build_tool_decision_template(user_prompt: str) -> str:
    """Wrap *user_prompt* in the [INST] template for the tool-decision agent."""
    query_block = f"User Query: {user_prompt}"
    return f"<s>[INST] {TOOL_DECISION}\n\n{query_block} [/INST]"
402
+
403
+
404
def build_agent1_template(user_prompt: str, history: List) -> str:
    """Build the [INST] template for Agent 1 (Practice Questions), including history."""
    return (
        f"<s>[INST] {agent_1_system}\n\n"
        f"Conversation History:\n{format_history(history)}\n\n"
        f"Current User Query: {user_prompt} [/INST]"
    )
408
+
409
+
410
def build_agent2_template(user_prompt: str) -> str:
    """Build the [INST] template for Agent 2 (Discovery Mode); no history."""
    header = f"<s>[INST] {agent_2_system}"
    return f"{header}\n\nUser Query: {user_prompt} [/INST]"
413
+
414
+
415
def build_agent3_template(user_prompt: str, history: List) -> str:
    """Build the [INST] template for Agent 3 (Followup Assessment), including history."""
    return (
        f"<s>[INST] {agent_3_system}\n\n"
        f"Conversation History:\n{format_history(history)}\n\n"
        f"Current User Query: {user_prompt} [/INST]"
    )
419
+
420
+
421
def build_agent4_template(user_prompt: str, history: List) -> str:
    """Build the [INST] template for Agent 4 (Teaching Mode), including history."""
    return (
        f"<s>[INST] {agent_4_system}\n\n"
        f"Conversation History:\n{format_history(history)}\n\n"
        f"Current User Query: {user_prompt} [/INST]"
    )
425
+
426
+
427
def build_math_thinking_template(user_prompt: str) -> str:
    """Build the [INST] template for the Math Thinking agent."""
    header = f"<s>[INST] {MATH_THINKING}"
    return f"{header}\n\nUser Query: {user_prompt} [/INST]"
430
+
431
+
432
def build_qa_design_template(user_prompt: str) -> str:
    """Build the [INST] template for the QA Design Thinking agent."""
    header = f"<s>[INST] {QUESTION_ANSWER_DESIGN}"
    return f"{header}\n\nUser Query: {user_prompt} [/INST]"
435
+
436
+
437
def build_reasoning_template(user_prompt: str) -> str:
    """Build the [INST] template for the Reasoning Thinking agent."""
    header = f"<s>[INST] {REASONING_THINKING}"
    return f"{header}\n\nUser Query: {user_prompt} [/INST]"
440
+
441
+
442
def build_final_prompt(
    user_prompt: str,
    active_prompts: List[str],
    thinking_context: str,
    recent_history_formatted: str,
    tool_img_output: str = "",
    tool_context: str = ""
) -> str:
    """
    Build final prompt for ResponseAgent (Qwen3-Claude).
    Matches actual orchestration logic from app.py

    Args:
        user_prompt: The user's current query text.
        active_prompts: Segment names to splice in after CORE_IDENTITY;
            names missing from the internal map are silently skipped.
        thinking_context: Combined thinking-agent output ("" when none ran).
        recent_history_formatted: Pre-formatted conversation history string.
        tool_img_output: Tool/image output if a tool ran ("" otherwise).
        tool_context: Context accompanying the tool output ("" otherwise).

    Returns:
        The fully assembled prompt string for the response model.
    """
    # CORE_IDENTITY always leads; optional segments follow in the order
    # they appear in active_prompts.
    prompt_segments = [CORE_IDENTITY]

    # NOTE(review): "VAUGE_INPUT" is spelled this way project-wide; do not
    # "fix" the key here without renaming the constant it maps to.
    prompt_map = {
        "VAUGE_INPUT": VAUGE_INPUT,
        "USER_UNDERSTANDING": USER_UNDERSTANDING,
        "GENERAL_FORMATTING": GENERAL_FORMATTING,
        "LATEX_FORMATTING": LATEX_FORMATTING,
        "GUIDING_TEACHING": GUIDING_TEACHING,
        "STRUCTURE_PRACTICE_QUESTIONS": STRUCTURE_PRACTICE_QUESTIONS,
        "PRACTICE_QUESTION_FOLLOWUP": PRACTICE_QUESTION_FOLLOWUP,
        "TOOL_USE_ENHANCEMENT": TOOL_USE_ENHANCEMENT,
    }

    # Unknown names are ignored rather than raising, so routing-agent
    # output that drifts from this map degrades gracefully.
    for prompt_name in active_prompts:
        if prompt_name in prompt_map:
            prompt_segments.append(prompt_map[prompt_name])

    prompt_segments_text = "\n\n".join(prompt_segments)

    knowledge_cutoff = f"""
The current year is {CURRENT_YEAR}. Your knowledge cutoff date is October 2023. If the user asks about recent events or dynamic facts, inform them you may not have the most up-to-date information and suggest referencing direct sources."""

    # The tool/context/history/thinking sections are always present in the
    # template; downstream instructions tell the model to ignore empty ones.
    complete_prompt = f"""
{prompt_segments_text}

If tools were used, context and output will be here. Ignore if empty:
Image output: {tool_img_output}
Image context: {tool_context}

Conversation history, if available:
{recent_history_formatted}

Consider any context available to you:
{thinking_context}

Here is the user's current query:
{user_prompt}

{knowledge_cutoff}
"""

    return complete_prompt
497
+
498
+
499
+ # ============================================================================
500
+ # QUALITY METRICS FUNCTIONS
501
+ # ============================================================================
502
+
503
def estimate_syllables(text: str) -> int:
    """
    Rough syllable estimate via vowel-group counting.

    Each word contributes the number of vowel runs (a, e, i, o, u, y),
    minus one for a trailing silent 'e', with a floor of one syllable
    per word. Non-letter characters are stripped first.
    """
    import re

    total = 0
    for raw_word in text.lower().split():
        word = re.sub(r'[^a-z]', '', raw_word)  # letters only
        if not word:
            continue

        groups = len(re.findall(r'[aeiouy]+', word))
        if word.endswith('e'):
            groups -= 1  # silent-e adjustment
        total += max(1, groups)  # every real word has at least one syllable

    return total
527
+
528
+
529
def calculate_flesch_reading_ease(text: str) -> float:
    """
    Calculate Flesch Reading Ease score.

    Score 0-100, higher = easier to read:
        90-100: Very easy (5th grade)
        60-70:  Standard (8th-9th grade)
        0-30:   Very difficult (college graduate)

    Formula: 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words)

    Args:
        text: Text to score; empty or very short input (< 10 chars) scores 0.0.

    Returns:
        Score clamped to [0.0, 100.0].
    """
    if not text or len(text.strip()) < 10:
        return 0.0

    # Prefer textstat's validated implementation when available.
    if TEXTSTAT_AVAILABLE:
        try:
            return textstat.flesch_reading_ease(text)
        # Was a bare `except:` — that would also swallow SystemExit and
        # KeyboardInterrupt; narrow to Exception and fall through to the
        # manual calculation instead.
        except Exception:
            pass

    # Manual fallback calculation.
    words = count_words(text)
    sentences = count_sentences(text)

    if sentences == 0 or words == 0:
        return 0.0

    syllables = estimate_syllables(text)

    # (The original repeated an `if words == 0` check here; it was dead code
    # since the same condition already returned above.)
    score = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
    return max(0.0, min(100.0, score))
562
+
563
+
564
def calculate_flesch_kincaid_grade(text: str) -> float:
    """
    Calculate the Flesch-Kincaid Grade Level.

    Returns the US school grade level needed to understand *text*.

    Formula: 0.39*(words/sentences) + 11.8*(syllables/words) - 15.59

    Args:
        text: Text to score; empty or very short input (< 10 chars) scores 0.0.

    Returns:
        Grade level, clamped to >= 0.0.
    """
    if not text or len(text.strip()) < 10:
        return 0.0

    # Prefer textstat's validated implementation when available.
    if TEXTSTAT_AVAILABLE:
        try:
            return textstat.flesch_kincaid_grade(text)
        # Was a bare `except:` — narrow to Exception so SystemExit and
        # KeyboardInterrupt are never swallowed; fall through to the
        # manual calculation.
        except Exception:
            pass

    # Manual fallback calculation.
    words = count_words(text)
    sentences = count_sentences(text)

    if sentences == 0 or words == 0:
        return 0.0

    syllables = estimate_syllables(text)

    # (A duplicated `if words == 0` check existed here; removed as dead code —
    # the condition already returned above.)
    grade = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
    return max(0.0, grade)
593
+
594
+
595
def calculate_completeness_score(response: str, user_prompt: str) -> float:
    """
    Estimate how well *response* addresses *user_prompt*.

    Combines the fraction of prompt keywords (after stopword removal) that
    appear in the response with a length penalty for very short responses.

    Returns:
        Score in [0.0, 1.0]; 0.5 (neutral) when the prompt contains no
        meaningful keywords after stopword removal.
    """
    if not response or not user_prompt:
        return 0.0

    import re

    stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
                 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
                 'would', 'should', 'could', 'may', 'might', 'can', 'what',
                 'how', 'why', 'when', 'where', 'who', 'which', 'i', 'you',
                 'we', 'they', 'he', 'she', 'it', 'me', 'him', 'her', 'us', 'them'}

    # Meaningful prompt keywords after stopword filtering.
    keywords = set(re.findall(r'\b\w+\b', user_prompt.lower())) - stopwords
    if not keywords:
        return 0.5  # Neutral if no meaningful keywords

    answer_words = set(re.findall(r'\b\w+\b', response.lower()))
    overlap = len(keywords & answer_words) / len(keywords)

    # Penalize responses shorter than a minimal reasonable answer (20 chars).
    min_reasonable_length = 20
    length_factor = min(1.0, len(response) / min_reasonable_length)

    return min(1.0, overlap * length_factor)
635
+
636
+
637
def check_question_answered(response: str, user_prompt: str) -> bool:
    """
    Heuristic boolean: does *response* appear to attempt an answer?

    A response fails when it is shorter than 10 characters, opens with a
    known refusal phrase, or shows too little keyword overlap with the
    prompt (completeness <= 0.3).
    """
    if not response or len(response) < 10:
        return False

    # Refusal check: only the opening of the response is examined.
    refusal_prefixes = (
        "i don't know",
        "i cannot",
        "i can't",
        "i'm not sure",
        "i don't have",
        "unable to",
        "sorry, i",
    )
    if response.lower().startswith(refusal_prefixes):
        return False

    # Require minimal keyword overlap with the prompt.
    return calculate_completeness_score(response, user_prompt) > 0.3
668
+
669
+
670
def calculate_specificity_score(response: str) -> float:
    """
    Measure how specific (vs vague) *response* is.

    Five binary indicators, each worth 0.2:
        1. contains a number
        2. contains a proper noun (capitalized word not starting a sentence)
        3. uses an example phrase ("for example", "such as", ...)
        4. average word length above 5 characters (proxy for technical terms)
        5. longer than 200 characters

    Args:
        response: Text to score.

    Returns:
        Score in [0.0, 1.0] (0.0 for empty input).
    """
    if not response:
        return 0.0

    import re

    specificity_indicators = 0
    total_possible = 5

    # 1. Contains numbers
    if re.search(r'\d+', response):
        specificity_indicators += 1

    # 2. Contains proper nouns (capitals right after ". " are excluded —
    #    they are likely just sentence starts)
    if re.search(r'(?<!\. )\b[A-Z][a-z]+', response):
        specificity_indicators += 1

    # 3. Contains example phrases. Word-bounded regex fixes the original
    #    substring check, where bare "like" matched inside "dislike" /
    #    "unlikely" and produced false positives.
    if re.search(r'\b(?:for example|such as|for instance|like|including)\b',
                 response.lower()):
        specificity_indicators += 1

    # 4. Average word length
    words = response.split()
    if words and sum(len(w) for w in words) / len(words) > 5.0:
        specificity_indicators += 1

    # 5. Substantial length
    if len(response) > 200:
        specificity_indicators += 1

    return specificity_indicators / total_possible
716
+
717
+
718
def calculate_repetition_ratio(text: str) -> float:
    """
    Fraction of word occurrences that repeat an earlier word.

    Case-insensitive, whitespace-tokenized. Lower is better (less
    repetitive).

    Returns:
        Ratio in [0.0, 1.0]; 0.0 for empty or single-word text.
    """
    if not text:
        return 0.0

    tokens = text.lower().split()
    if len(tokens) < 2:
        return 0.0

    # Every occurrence beyond a word's first counts as one repeat.
    repeats = sum(n - 1 for n in Counter(tokens).values() if n > 1)
    return min(1.0, repeats / len(tokens))
739
+
740
+
741
def calculate_unique_word_ratio(text: str) -> float:
    """
    Vocabulary diversity: unique words divided by total words.

    Case-insensitive; 0.0 for empty text, 1.0 when every word is distinct.
    """
    tokens = text.lower().split() if text else []
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)
757
+
758
+
759
def calculate_avg_sentence_length(text: str) -> float:
    """Average words per sentence (0.0 when the text has no sentences)."""
    sentence_total = count_sentences(text)
    if sentence_total == 0:
        return 0.0
    return count_words(text) / sentence_total
768
+
769
+
770
+ # ============================================================================
771
+ # INSTRUMENTED PIPELINE RUNNER
772
+ # ============================================================================
773
+
774
+ def run_full_pipeline_instrumented(user_prompt: str, prompt_index: int = 1) -> Dict:
775
+ """
776
+ Run the complete orchestration pipeline with full instrumentation.
777
+ Captures metrics at every step.
778
+
779
+ Args:
780
+ user_prompt: User's input prompt
781
+ prompt_index: Index number for this prompt in batch
782
+
783
+ Returns:
784
+ Dictionary with all metrics for CSV export
785
+ """
786
+
787
+ result = {
788
+ "prompt_index": prompt_index,
789
+ "timestamp": datetime.now().isoformat(),
790
+ "user_prompt": user_prompt,
791
+ "user_prompt_tokens": count_tokens_accurate(user_prompt),
792
+ "user_prompt_chars": len(user_prompt),
793
+ "user_prompt_words": count_words(user_prompt),
794
+ }
795
+
796
+ # Track overall start time
797
+ pipeline_start = time.time()
798
+
799
+ try:
800
+ # ============================================================
801
+ # STEP 1-2: SETUP
802
+ # ============================================================
803
+ setup_start = time.time()
804
+
805
+ # Reset state
806
+ global_state_manager.reset_prompt_state()
807
+ prompt_state = global_state_manager.get_prompt_state_manager()
808
+
809
+ # Get conversation history (empty for testing)
810
+ recent_history = []
811
+ recent_history_formatted = "No previous conversation"
812
+
813
+ result["conversation_history_length"] = 0
814
+ result["conversation_history_tokens"] = 0
815
+
816
+ # ============================================================
817
+ # STEP 3: TOOL DECISION AGENT
818
+ # ============================================================
819
+ tool_start = time.time()
820
+
821
+ tool_template = build_tool_decision_template(user_prompt)
822
+ tool_input_tokens = count_tokens_accurate(tool_template)
823
+
824
+ reset_gpu_stats()
825
+
826
+ # Execute
827
+ tool_decision_result = tool_agent.should_use_visualization(user_prompt)
828
+
829
+ # Capture output
830
+ tool_output = str(tool_decision_result)
831
+ tool_output_tokens = count_tokens_accurate(tool_output)
832
+
833
+ gpu_metrics = get_gpu_memory()
834
+ tool_time = time.time() - tool_start
835
+
836
+ # Record
837
+ result.update({
838
+ "tool_decision_input_template": tool_template,
839
+ "tool_decision_input_tokens": tool_input_tokens,
840
+ "tool_decision_output": tool_output,
841
+ "tool_decision_output_tokens": tool_output_tokens,
842
+ "tool_decision_result": bool(tool_decision_result),
843
+ "tool_decision_time_seconds": round(tool_time, 3),
844
+ "tool_decision_gpu_peak_mb": round(gpu_metrics["peak_mb"], 2),
845
+ })
846
+
847
+ # Update state
848
+ if tool_decision_result:
849
+ prompt_state.update("TOOL_USE_ENHANCEMENT", True)
850
+
851
+ # ============================================================
852
+ # STEP 4: REGEX CHECKS
853
+ # ============================================================
854
+ regex_start = time.time()
855
+
856
+ # Apply regex checks (returns list of activated prompts)
857
+ regex_before = set(prompt_state.get_active_response_prompts())
858
+ logical_expressions.apply_all_checks(user_prompt, prompt_state)
859
+ regex_after = set(prompt_state.get_active_response_prompts())
860
+ regex_applied = list(regex_after - regex_before)
861
+
862
+ regex_time = time.time() - regex_start
863
+
864
+ result.update({
865
+ "regex_checks_applied": ", ".join(regex_applied) if regex_applied else "None",
866
+ "regex_checks_time_seconds": round(regex_time, 3),
867
+ })
868
+
869
+ # ============================================================
870
+ # STEP 5: ROUTING AGENTS (Unified Process - Qwen3-Claude)
871
+ # ============================================================
872
+ routing_start = time.time()
873
+
874
+ # Build template (simplified - just the user prompt)
875
+ routing_template = f"User Query: {user_prompt}"
876
+ routing_input_tokens = count_tokens_accurate(routing_template)
877
+
878
+ reset_gpu_stats()
879
+
880
+ # Use unified process() method
881
+ response_prompts_str, thinking_prompts_str = routing_agents.process(
882
+ user_input=user_prompt,
883
+ tool_used=tool_decision_result
884
+ )
885
+
886
+ # Parse results
887
+ response_prompts = [p.strip() for p in response_prompts_str.split('\n') if p.strip()] if response_prompts_str else []
888
+ thinking_prompts = [p.strip() for p in thinking_prompts_str.split('\n') if p.strip()] if thinking_prompts_str else []
889
+
890
+ routing_output = f"Response: {', '.join(response_prompts) if response_prompts else 'None'}\nThinking: {', '.join(thinking_prompts) if thinking_prompts else 'None'}"
891
+ routing_output_tokens = count_tokens_accurate(routing_output)
892
+ gpu_metrics = get_gpu_memory()
893
+
894
+ routing_time = time.time() - routing_start
895
+
896
+ # Update result with consolidated routing metrics
897
+ result.update({
898
+ # Agent 1 metrics (legacy columns - use consolidated data)
899
+ "agent1_input_template": routing_template,
900
+ "agent1_input_tokens": routing_input_tokens // 4, # Divide among 4 agents
901
+ "agent1_output": ", ".join([p for p in response_prompts if p in ["STRUCTURE_PRACTICE_QUESTIONS"]]) or "None",
902
+ "agent1_output_tokens": routing_output_tokens // 4,
903
+ "agent1_decision": "STRUCTURE_PRACTICE_QUESTIONS" in response_prompts,
904
+ "agent1_time_seconds": round(routing_time / 4, 3),
905
+ "agent1_gpu_peak_mb": round(gpu_metrics["peak_mb"] / 4, 2),
906
+
907
+ # Agent 2 metrics
908
+ "agent2_input_template": routing_template,
909
+ "agent2_input_tokens": routing_input_tokens // 4,
910
+ "agent2_output": ", ".join([p for p in response_prompts if p in ["GENERAL_FORMATTING", "LATEX_FORMATTING", "GUIDING_TEACHING"]]) or "None",
911
+ "agent2_output_tokens": routing_output_tokens // 4,
912
+ "agent2_decision": ", ".join([p for p in response_prompts if p in ["GENERAL_FORMATTING", "LATEX_FORMATTING", "GUIDING_TEACHING"]]) or "NULL",
913
+ "agent2_time_seconds": round(routing_time / 4, 3),
914
+ "agent2_gpu_peak_mb": round(gpu_metrics["peak_mb"] / 4, 2),
915
+
916
+ # Agent 3 metrics
917
+ "agent3_input_template": routing_template,
918
+ "agent3_input_tokens": routing_input_tokens // 4,
919
+ "agent3_output": ", ".join([p for p in response_prompts + thinking_prompts if p in ["PRACTICE_QUESTION_FOLLOWUP", "MATH_THINKING", "QUESTION_ANSWER_DESIGN", "REASONING_THINKING"]]) or "None",
920
+ "agent3_output_tokens": routing_output_tokens // 4,
921
+ "agent3_decision": any(p in ["PRACTICE_QUESTION_FOLLOWUP", "MATH_THINKING", "QUESTION_ANSWER_DESIGN", "REASONING_THINKING"] for p in response_prompts + thinking_prompts),
922
+ "agent3_time_seconds": round(routing_time / 4, 3),
923
+ "agent3_gpu_peak_mb": round(gpu_metrics["peak_mb"] / 4, 2),
924
+
925
+ # Agent 4 metrics
926
+ "agent4_input_template": routing_template,
927
+ "agent4_input_tokens": routing_input_tokens // 4,
928
+ "agent4_output": ", ".join([p for p in response_prompts if p == "TOOL_USE_ENHANCEMENT"]) or "None",
929
+ "agent4_output_tokens": routing_output_tokens // 4,
930
+ "agent4_decisions": "TOOL_USE_ENHANCEMENT" if "TOOL_USE_ENHANCEMENT" in response_prompts else "NULL",
931
+ "agent4_time_seconds": round(routing_time / 4, 3),
932
+ "agent4_gpu_peak_mb": round(gpu_metrics["peak_mb"] / 4, 2),
933
+ })
934
+
935
+ # Update prompt state with all activated prompts
936
+ for prompt_name in response_prompts:
937
+ prompt_state.update(prompt_name, True)
938
+ for prompt_name in thinking_prompts:
939
+ prompt_state.update(prompt_name, True)
940
+
941
+
942
+ # ============================================================
943
+ # STEP 6: THINKING AGENTS (Conditional)
944
+ # ============================================================
945
+
946
+ thinking_outputs = []
947
+
948
+ # Determine which thinking agents to activate
949
+ math_activated = prompt_state.is_active("LATEX_FORMATTING")
950
+ qa_activated = prompt_state.is_active("STRUCTURE_PRACTICE_QUESTIONS")
951
+ reasoning_activated = (
952
+ prompt_state.is_active("TOOL_USE_ENHANCEMENT") or
953
+ prompt_state.is_active("PRACTICE_QUESTION_FOLLOWUP") or
954
+ prompt_state.is_active("GUIDING_TEACHING")
955
+ )
956
+
957
+ # --- Math Thinking (GGUF) ---
958
+ if math_activated:
959
+ math_start = time.time()
960
+
961
+ math_template = build_math_thinking_template(user_prompt)
962
+ math_input_tokens = count_tokens_accurate(math_template)
963
+
964
+ reset_gpu_stats()
965
+
966
+ math_output = thinking_agents.math_thinking(
967
+ user_input=user_prompt,
968
+ conversation_history=recent_history_formatted
969
+ )
970
+
971
+ math_output_tokens = count_tokens_accurate(math_output)
972
+ gpu_metrics = get_gpu_memory()
973
+
974
+ math_time = time.time() - math_start
975
+
976
+ result.update({
977
+ "math_thinking_activated": True,
978
+ "math_thinking_input_template": math_template,
979
+ "math_thinking_input_tokens": math_input_tokens,
980
+ "math_thinking_output": math_output,
981
+ "math_thinking_output_tokens": math_output_tokens,
982
+ "math_thinking_time_seconds": round(math_time, 3),
983
+ "math_thinking_gpu_peak_mb": round(gpu_metrics["peak_mb"], 2),
984
+ })
985
+
986
+ thinking_outputs.append(math_output)
987
+ else:
988
+ result.update({
989
+ "math_thinking_activated": False,
990
+ "math_thinking_input_template": "NULL",
991
+ "math_thinking_input_tokens": 0,
992
+ "math_thinking_output": "NULL",
993
+ "math_thinking_output_tokens": 0,
994
+ "math_thinking_time_seconds": 0.0,
995
+ "math_thinking_gpu_peak_mb": 0.0,
996
+ })
997
+
998
+ # --- QA Design Thinking (Qwen3-Claude) ---
999
+ if qa_activated:
1000
+ qa_start = time.time()
1001
+
1002
+ qa_template = build_qa_design_template(user_prompt)
1003
+ qa_input_tokens = count_tokens_accurate(qa_template)
1004
+
1005
+ reset_gpu_stats()
1006
+
1007
+ qa_output = thinking_agents.question_answer_design(
1008
+ user_input=user_prompt,
1009
+ conversation_history=recent_history_formatted
1010
+ )
1011
+
1012
+ qa_output_tokens = count_tokens_accurate(qa_output)
1013
+ gpu_metrics = get_gpu_memory()
1014
+
1015
+ qa_time = time.time() - qa_start
1016
+
1017
+ result.update({
1018
+ "qa_design_activated": True,
1019
+ "qa_design_input_template": qa_template,
1020
+ "qa_design_input_tokens": qa_input_tokens,
1021
+ "qa_design_output": qa_output,
1022
+ "qa_design_output_tokens": qa_output_tokens,
1023
+ "qa_design_time_seconds": round(qa_time, 3),
1024
+ "qa_design_gpu_peak_mb": round(gpu_metrics["peak_mb"], 2),
1025
+ })
1026
+
1027
+ thinking_outputs.append(qa_output)
1028
+ else:
1029
+ result.update({
1030
+ "qa_design_activated": False,
1031
+ "qa_design_input_template": "NULL",
1032
+ "qa_design_input_tokens": 0,
1033
+ "qa_design_output": "NULL",
1034
+ "qa_design_output_tokens": 0,
1035
+ "qa_design_time_seconds": 0.0,
1036
+ "qa_design_gpu_peak_mb": 0.0,
1037
+ })
1038
+
1039
+ # --- Reasoning Thinking (Qwen3-Claude) ---
1040
+ if reasoning_activated:
1041
+ reasoning_start = time.time()
1042
+
1043
+ reasoning_template = build_reasoning_template(user_prompt)
1044
+ reasoning_input_tokens = count_tokens_accurate(reasoning_template)
1045
+
1046
+ reset_gpu_stats()
1047
+
1048
+ reasoning_output = thinking_agents.reasoning_thinking(
1049
+ user_input=user_prompt,
1050
+ conversation_history=recent_history_formatted
1051
+ )
1052
+
1053
+ reasoning_output_tokens = count_tokens_accurate(reasoning_output)
1054
+ gpu_metrics = get_gpu_memory()
1055
+
1056
+ reasoning_time = time.time() - reasoning_start
1057
+
1058
+ result.update({
1059
+ "reasoning_activated": True,
1060
+ "reasoning_input_template": reasoning_template,
1061
+ "reasoning_input_tokens": reasoning_input_tokens,
1062
+ "reasoning_output": reasoning_output,
1063
+ "reasoning_output_tokens": reasoning_output_tokens,
1064
+ "reasoning_time_seconds": round(reasoning_time, 3),
1065
+ "reasoning_gpu_peak_mb": round(gpu_metrics["peak_mb"], 2),
1066
+ })
1067
+
1068
+ thinking_outputs.append(reasoning_output)
1069
+ else:
1070
+ result.update({
1071
+ "reasoning_activated": False,
1072
+ "reasoning_input_template": "NULL",
1073
+ "reasoning_input_tokens": 0,
1074
+ "reasoning_output": "NULL",
1075
+ "reasoning_output_tokens": 0,
1076
+ "reasoning_time_seconds": 0.0,
1077
+ "reasoning_gpu_peak_mb": 0.0,
1078
+ })
1079
+
1080
+ # Combine thinking outputs
1081
+ thinking_context = "\n\n".join(thinking_outputs) if thinking_outputs else ""
1082
+
1083
+ # ============================================================
1084
+ # STEP 7-8: PROMPT ASSEMBLY
1085
+ # ============================================================
1086
+ assembly_start = time.time()
1087
+
1088
+ # Get active response prompts
1089
+ active_prompts = prompt_state.get_active_response_prompts()
1090
+
1091
+ # Build final prompt
1092
+ final_prompt = build_final_prompt(
1093
+ user_prompt=user_prompt,
1094
+ active_prompts=active_prompts,
1095
+ thinking_context=thinking_context,
1096
+ recent_history_formatted=recent_history_formatted,
1097
+ tool_img_output="",
1098
+ tool_context=""
1099
+ )
1100
+
1101
+ final_prompt_tokens = count_tokens_accurate(final_prompt)
1102
+ final_prompt_chars = len(final_prompt)
1103
+ final_prompt_words = count_words(final_prompt)
1104
+
1105
+ assembly_time = time.time() - assembly_start
1106
+
1107
+ result.update({
1108
+ "active_response_prompts": ", ".join(active_prompts),
1109
+ "final_prompt_template": final_prompt,
1110
+ "final_prompt_tokens": final_prompt_tokens,
1111
+ "final_prompt_chars": final_prompt_chars,
1112
+ "final_prompt_words": final_prompt_words,
1113
+ "assembly_time_seconds": round(assembly_time, 3),
1114
+ })
1115
+
1116
+ # ============================================================
1117
+ # STEP 9: RESPONSE GENERATION (Qwen3-Claude)
1118
+ # ============================================================
1119
+ response_start = time.time()
1120
+
1121
+ reset_gpu_stats()
1122
+
1123
+ raw_response = response_agent.invoke(final_prompt)
1124
+
1125
+ response_time = time.time() - response_start
1126
+
1127
+ raw_tokens = count_tokens_accurate(raw_response)
1128
+ raw_chars = len(raw_response)
1129
+ raw_words = count_words(raw_response)
1130
+ tokens_per_sec = raw_tokens / response_time if response_time > 0 else 0
1131
+
1132
+ gpu_metrics = get_gpu_memory()
1133
+
1134
+ result.update({
1135
+ "response_input_template": final_prompt, # Same as final_prompt
1136
+ "response_input_tokens": final_prompt_tokens,
1137
+ "response_raw": raw_response,
1138
+ "response_raw_tokens": raw_tokens,
1139
+ "response_raw_chars": raw_chars,
1140
+ "response_raw_words": raw_words,
1141
+ "response_generation_time_seconds": round(response_time, 3),
1142
+ "response_gpu_peak_mb": round(gpu_metrics["peak_mb"], 2),
1143
+ "response_tokens_per_second": round(tokens_per_sec, 2),
1144
+ })
1145
+
1146
+ # ============================================================
1147
+ # STEP 10: POST-PROCESSING
1148
+ # ============================================================
1149
+ postprocess_start = time.time()
1150
+
1151
+ processed_response = post_processor.process_response(raw_response, user_prompt)
1152
+
1153
+ postprocess_time = time.time() - postprocess_start
1154
+
1155
+ processed_tokens = count_tokens_accurate(processed_response)
1156
+ processed_chars = len(processed_response)
1157
+ processed_words = count_words(processed_response)
1158
+
1159
+ result.update({
1160
+ "response_processed": processed_response,
1161
+ "response_processed_tokens": processed_tokens,
1162
+ "response_processed_chars": processed_chars,
1163
+ "response_processed_words": processed_words,
1164
+ "postprocessing_time_seconds": round(postprocess_time, 3),
1165
+ })
1166
+
1167
+ # ============================================================
1168
+ # QUALITY METRICS
1169
+ # ============================================================
1170
+ flesch_ease = calculate_flesch_reading_ease(processed_response)
1171
+ flesch_grade = calculate_flesch_kincaid_grade(processed_response)
1172
+ completeness = calculate_completeness_score(processed_response, user_prompt)
1173
+ specificity = calculate_specificity_score(processed_response)
1174
+ repetition = calculate_repetition_ratio(processed_response)
1175
+ unique_ratio = calculate_unique_word_ratio(processed_response)
1176
+ avg_sent_len = calculate_avg_sentence_length(processed_response)
1177
+ question_answered = check_question_answered(processed_response, user_prompt)
1178
+
1179
+ result.update({
1180
+ "flesch_reading_ease": round(flesch_ease, 2),
1181
+ "flesch_kincaid_grade": round(flesch_grade, 2),
1182
+ "completeness_score": round(completeness, 3),
1183
+ "specificity_score": round(specificity, 3),
1184
+ "repetition_ratio": round(repetition, 3),
1185
+ "unique_word_ratio": round(unique_ratio, 3),
1186
+ "avg_sentence_length": round(avg_sent_len, 2),
1187
+ "question_answered": question_answered,
1188
+ })
1189
+
1190
+ # ============================================================
1191
+ # OVERALL METRICS
1192
+ # ============================================================
1193
+ total_pipeline_time = time.time() - pipeline_start
1194
+
1195
+ # Count activated models
1196
+ models_activated = []
1197
+ if result["tool_decision_time_seconds"] > 0:
1198
+ models_activated.append("Tool Decision")
1199
+ if result["agent1_time_seconds"] > 0:
1200
+ models_activated.append("Agent 1")
1201
+ if result["agent2_time_seconds"] > 0:
1202
+ models_activated.append("Agent 2")
1203
+ if result["agent3_time_seconds"] > 0:
1204
+ models_activated.append("Agent 3")
1205
+ if result["agent4_time_seconds"] > 0:
1206
+ models_activated.append("Agent 4")
1207
+ if result["math_thinking_activated"]:
1208
+ models_activated.append("Math Thinking")
1209
+ if result["qa_design_activated"]:
1210
+ models_activated.append("QA Design")
1211
+ if result["reasoning_activated"]:
1212
+ models_activated.append("Reasoning")
1213
+ models_activated.append("Response Agent")
1214
+
1215
+ # Sum all input tokens
1216
+ total_input_tokens = (
1217
+ result["tool_decision_input_tokens"] +
1218
+ result["agent1_input_tokens"] +
1219
+ result["agent2_input_tokens"] +
1220
+ result["agent3_input_tokens"] +
1221
+ result["agent4_input_tokens"] +
1222
+ result.get("math_thinking_input_tokens", 0) +
1223
+ result.get("qa_design_input_tokens", 0) +
1224
+ result.get("reasoning_input_tokens", 0) +
1225
+ result["response_input_tokens"]
1226
+ )
1227
+
1228
+ # Sum all output tokens
1229
+ total_output_tokens = (
1230
+ result["tool_decision_output_tokens"] +
1231
+ result["agent1_output_tokens"] +
1232
+ result["agent2_output_tokens"] +
1233
+ result["agent3_output_tokens"] +
1234
+ result["agent4_output_tokens"] +
1235
+ result.get("math_thinking_output_tokens", 0) +
1236
+ result.get("qa_design_output_tokens", 0) +
1237
+ result.get("reasoning_output_tokens", 0) +
1238
+ result["response_raw_tokens"]
1239
+ )
1240
+
1241
+ # Max GPU across all steps
1242
+ total_gpu_peak = max([
1243
+ result["tool_decision_gpu_peak_mb"],
1244
+ result["agent1_gpu_peak_mb"],
1245
+ result["agent2_gpu_peak_mb"],
1246
+ result["agent3_gpu_peak_mb"],
1247
+ result["agent4_gpu_peak_mb"],
1248
+ result.get("math_thinking_gpu_peak_mb", 0.0),
1249
+ result.get("qa_design_gpu_peak_mb", 0.0),
1250
+ result.get("reasoning_gpu_peak_mb", 0.0),
1251
+ result["response_gpu_peak_mb"],
1252
+ ])
1253
+
1254
+ result.update({
1255
+ "total_pipeline_time_seconds": round(total_pipeline_time, 3),
1256
+ "total_input_tokens": total_input_tokens,
1257
+ "total_output_tokens": total_output_tokens,
1258
+ "total_gpu_peak_mb": round(total_gpu_peak, 2),
1259
+ "models_activated_count": len(models_activated),
1260
+ "models_activated_list": ", ".join(models_activated),
1261
+ })
1262
+
1263
+ logger.info(f"✓ Prompt {prompt_index} complete: {total_pipeline_time:.2f}s, {len(models_activated)} models activated")
1264
+
1265
+ return result
1266
+
1267
+ except Exception as e:
1268
+ logger.error(f"Pipeline execution failed for prompt {prompt_index}: {e}")
1269
+ import traceback
1270
+ traceback.print_exc()
1271
+
1272
+ # Return error result with NULLs
1273
+ error_result = {col: "ERROR" for col in CSV_COLUMNS}
1274
+ error_result.update({
1275
+ "prompt_index": prompt_index,
1276
+ "timestamp": datetime.now().isoformat(),
1277
+ "user_prompt": user_prompt,
1278
+ "user_prompt_tokens": count_tokens_accurate(user_prompt),
1279
+ "user_prompt_chars": len(user_prompt),
1280
+ "user_prompt_words": count_words(user_prompt),
1281
+ })
1282
+
1283
+ return error_result
1284
+
1285
+
1286
# ============================================================================
# BATCH PROCESSING
# ============================================================================

@spaces.GPU(duration=600)
def process_batch_full_pipeline(
    user_prompts: List[str],
    progress_callback=None
) -> List[Dict]:
    """
    Process a batch of prompts through the FULL PIPELINE.
    Sequential processing - one at a time.

    Args:
        user_prompts: List of user prompts to test
        progress_callback: Optional callable(done, total) invoked after each
            successfully processed prompt

    Returns:
        List of result dictionaries (one per prompt). A prompt that fails
        still contributes an "ERROR"-filled row, so the output length always
        matches the input length.
    """
    results: List[Dict] = []
    total = len(user_prompts)
    banner = "=" * 60

    logger.info(banner)
    logger.info(f"Starting full pipeline batch: {total} prompts")
    logger.info(banner)

    batch_start = time.time()

    for idx, user_prompt in enumerate(user_prompts, 1):
        logger.info(f"\n{banner}")
        logger.info(f"Processing prompt {idx}/{total}")
        logger.info(f"Prompt: {user_prompt[:80]}...")
        logger.info(banner)

        try:
            # Run full instrumented pipeline
            result = run_full_pipeline_instrumented(user_prompt, prompt_index=idx)
            results.append(result)

            # BUGFIX: run_full_pipeline_instrumented signals failure by
            # returning a row whose values are the string "ERROR".  Applying
            # ":.2f" to that string raised ValueError here, which the except
            # branch below then caught — appending a SECOND error row for the
            # same prompt.  Format numbers only; fall back to str() otherwise.
            elapsed = result.get('total_pipeline_time_seconds', 0)
            elapsed_txt = f"{elapsed:.2f}s" if isinstance(elapsed, (int, float)) else str(elapsed)
            tok_in = result.get('total_input_tokens', 0)
            tok_out = result.get('total_output_tokens', 0)
            token_txt = tok_in + tok_out if isinstance(tok_in, int) and isinstance(tok_out, int) else "N/A"

            logger.info(f"✓ Prompt {idx} complete")
            logger.info(f"  Total time: {elapsed_txt}")
            logger.info(f"  Models activated: {result.get('models_activated_count', 0)}")
            logger.info(f"  Total tokens: {token_txt}")

            if progress_callback:
                progress_callback(idx, total)

        except Exception as e:
            logger.error(f"❌ Prompt {idx} failed: {e}")
            import traceback
            traceback.print_exc()

            # Add error result so row count still matches the prompt count
            error_result = {col: "ERROR" for col in CSV_COLUMNS}
            error_result.update({
                "prompt_index": idx,
                "timestamp": datetime.now().isoformat(),
                "user_prompt": user_prompt,
                "user_prompt_tokens": count_tokens_accurate(user_prompt),
            })
            results.append(error_result)

    batch_duration = time.time() - batch_start

    logger.info(f"\n{banner}")
    logger.info("BATCH COMPLETE")
    logger.info(banner)
    logger.info(f"Processed: {len(results)}/{total} prompts")
    logger.info(f"Total batch time: {batch_duration:.2f}s")
    # BUGFIX: guard the average — an empty prompt list previously raised
    # ZeroDivisionError after the (empty) loop finished.
    if total:
        logger.info(f"Average per prompt: {batch_duration / total:.2f}s")
    logger.info(banner)

    return results
1361
+
1362
+
1363
# ============================================================================
# CSV EXPORT
# ============================================================================

def export_full_pipeline_csv(
    results: List[Dict],
    test_name: str = "pipeline_test"
) -> str:
    """
    Export full pipeline results to CSV.

    Args:
        results: List of result dictionaries
        test_name: Name for the test (used in filename)

    Returns:
        Filepath of exported CSV, or None when there is nothing to export
        or the write fails.
    """
    try:
        # Check for emptiness before doing any filesystem work.
        if not results:
            logger.warning("No results to export")
            return None

        import tempfile  # local import: keeps module-level import surface unchanged

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"mimir_full_pipeline_{test_name}_{timestamp}.csv"
        # tempfile.gettempdir() resolves to /tmp on ZeroGPU/Linux but stays
        # portable (respects TMPDIR, works on other platforms).
        filepath = os.path.join(tempfile.gettempdir(), filename)

        logger.info(f"Exporting {len(results)} results to CSV...")

        # Write CSV
        with open(filepath, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS)
            writer.writeheader()

            for result in results:
                # Fill missing keys with NULL so every row has all columns
                writer.writerow({key: result.get(key, "NULL") for key in CSV_COLUMNS})

        logger.info(f"✓ Full pipeline results exported to {filepath}")
        logger.info(f"  Columns: {len(CSV_COLUMNS)}")
        logger.info(f"  Rows: {len(results)}")

        return filepath

    except Exception as e:
        logger.error(f"CSV export failed: {e}")
        import traceback
        traceback.print_exc()
        return None
1413
+
1414
+
1415
def calculate_summary_stats(results: List[Dict]) -> Dict:
    """Aggregate batch results into summary statistics.

    Rows whose ``total_pipeline_time_seconds`` equals the sentinel string
    "ERROR" are excluded from the averages but counted as failures.
    """
    if not results:
        return {}

    ok_rows = [r for r in results if r.get("total_pipeline_time_seconds") != "ERROR"]
    if not ok_rows:
        return {"error": "No valid results"}

    def column(key):
        # Pull one metric out of every successful row.
        return [row[key] for row in ok_rows]

    times = column("total_pipeline_time_seconds")
    token_totals = [row["total_input_tokens"] + row["total_output_tokens"] for row in ok_rows]
    answered = sum(column("question_answered"))

    return {
        "total_prompts": len(results),
        "successful_prompts": len(ok_rows),
        "failed_prompts": len(results) - len(ok_rows),
        "avg_pipeline_time_seconds": round(np.mean(times), 3),
        "min_pipeline_time_seconds": round(np.min(times), 3),
        "max_pipeline_time_seconds": round(np.max(times), 3),
        "avg_total_tokens": round(np.mean(token_totals), 1),
        "avg_models_activated": round(np.mean(column("models_activated_count")), 2),
        "avg_gpu_peak_mb": round(np.mean(column("total_gpu_peak_mb")), 2),
        "avg_completeness_score": round(np.mean(column("completeness_score")), 3),
        "avg_flesch_reading_ease": round(np.mean(column("flesch_reading_ease")), 2),
        "questions_answered_pct": round(100 * answered / len(ok_rows), 1),
    }
1439
+
1440
+
1441
# ============================================================================
# GRADIO INTERFACE
# ============================================================================

with gr.Blocks(title="Mimir - Full Pipeline Testing", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧪 Mimir Full Pipeline Testing")
    gr.Markdown("""
    Test the **complete orchestration flow** with comprehensive metrics at every step.

    **What this tests:**
    - ✅ Tool Decision Agent
    - ✅ All 4 Routing Agents (sequential)
    - ✅ Thinking Agents (conditional: Math, QA Design, Reasoning)
    - ✅ Response Agent (Qwen3-Claude)
    - ✅ Post-processing

    **Output:** CSV file with ~110 columns capturing the full pipeline journey
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## 📝 Test Configuration")

            test_name = gr.Textbox(
                label="Test Name",
                value="pipeline_test",
                placeholder="Enter a name for this test run",
                info="Used in filename"
            )

            gr.Markdown("### Input Method")

            input_method = gr.Radio(
                choices=["CSV Upload", "Manual Entry"],
                value="Manual Entry",
                label="Choose Input Method"
            )

            # CSV upload
            with gr.Group(visible=False) as csv_section:
                # NOTE(review): some Gradio versions do not accept `info` on
                # gr.File — confirm against the pinned Gradio release.
                csv_file = gr.File(
                    label="Upload CSV File",
                    file_types=[".csv"],
                    info="One prompt per line, first column only"
                )

            # Manual entry
            with gr.Group(visible=True) as manual_section:
                prompt_text = gr.Textbox(
                    label="Enter Prompts (one per line)",
                    lines=15,
                    placeholder="What is calculus?\nHelp me understand photosynthesis\nCan you create practice questions for algebra?\nExplain Newton's laws of motion",
                    info="Enter multiple prompts, one per line"
                )

            process_btn = gr.Button(
                "🚀 Run Full Pipeline Test",
                variant="primary",
                size="lg"
            )

            status = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3
            )

        with gr.Column(scale=1):
            gr.Markdown("## 📊 Results")

            results_summary = gr.JSON(
                label="Summary Statistics",
                height=400
            )

            gr.Markdown("### Download Results")

            download_csv = gr.File(
                label="CSV Export",
                interactive=False
            )

            gr.Markdown("""
            **CSV contains ~110 columns:**
            - Input metrics (tokens, chars, words)
            - Template for each agent
            - Output for each agent
            - Timing for each step
            - GPU usage per step
            - Quality metrics (readability, completeness, etc.)
            - Overall pipeline metrics
            """)

    # Toggle between input methods
    def toggle_input_method(method):
        """Show the CSV uploader or the manual textbox, never both."""
        if method == "CSV Upload":
            return gr.update(visible=True), gr.update(visible=False)
        else:
            return gr.update(visible=False), gr.update(visible=True)

    input_method.change(
        fn=toggle_input_method,
        inputs=[input_method],
        outputs=[csv_section, manual_section]
    )

    def _extract_prompts_from_csv(csv_file):
        """Return first-column prompts from an uploaded CSV.

        Gradio's File component may deliver a filepath string (the default in
        Gradio 4), a tempfile-like object, or raw bytes depending on version
        and configuration.  BUGFIX: the previous code passed a filepath string
        straight into io.StringIO, so it parsed the *path* as CSV content and
        never read the uploaded file.
        """
        if isinstance(csv_file, bytes):
            content = csv_file.decode('utf-8')
        elif hasattr(csv_file, 'read'):
            content = csv_file.read()
            if isinstance(content, bytes):
                content = content.decode('utf-8')
        elif isinstance(csv_file, str) and os.path.exists(csv_file):
            # Filepath delivery: read the file contents, not the path string.
            with open(csv_file, 'r', encoding='utf-8') as fh:
                content = fh.read()
        elif hasattr(csv_file, 'name') and os.path.exists(csv_file.name):
            # tempfile wrapper without a usable read() — open by name.
            with open(csv_file.name, 'r', encoding='utf-8') as fh:
                content = fh.read()
        else:
            content = str(csv_file)

        reader = csv.reader(io.StringIO(content))
        prompts = [row[0].strip() for row in reader if row and row[0].strip()]

        # Skip header if present
        if prompts and any(header in prompts[0].lower() for header in ['prompt', 'text', 'query', 'input']):
            prompts = prompts[1:]
        return prompts

    # Main processing function
    def run_pipeline_test(test_name, input_method, csv_file, prompt_text):
        """Run the full pipeline test and return (status, summary, csv_path)."""

        # Parse prompts
        prompts = []

        if input_method == "CSV Upload" and csv_file:
            try:
                prompts = _extract_prompts_from_csv(csv_file)
            except Exception as e:
                return f"❌ CSV parsing error: {e}", {}, None

        elif input_method == "Manual Entry" and prompt_text:
            prompts = [p.strip() for p in prompt_text.split('\n') if p.strip()]

        if not prompts:
            return "❌ No prompts provided. Please enter at least one prompt.", {}, None

        try:
            # Run batch (the old "Processing..." status string was dead code:
            # it was overwritten before ever being returned, so it is removed).
            results = process_batch_full_pipeline(prompts)

            # Calculate summary
            summary = calculate_summary_stats(results)

            # Export CSV
            csv_path = export_full_pipeline_csv(results, test_name)

            status_msg = f"✅ Complete!\n"
            status_msg += f"Processed: {len(results)} prompts\n"
            status_msg += f"Successful: {summary.get('successful_prompts', 0)}\n"
            status_msg += f"Failed: {summary.get('failed_prompts', 0)}\n"
            status_msg += f"CSV ready for download!"

            return status_msg, summary, csv_path

        except Exception as e:
            error_msg = f"❌ Pipeline test failed: {str(e)}"
            logger.error(error_msg)
            import traceback
            traceback.print_exc()
            return error_msg, {}, None

    # Wire up event
    process_btn.click(
        fn=run_pipeline_test,
        inputs=[test_name, input_method, csv_file, prompt_text],
        outputs=[status, results_summary, download_csv]
    )
1613
+
1614
# ============================================================================
# LAUNCH
# ============================================================================

if __name__ == "__main__":
    # Log a startup banner summarizing capability flags before serving the UI.
    separator = "=" * 60
    logger.info(separator)
    logger.info("LAUNCHING MIMIR FULL PIPELINE TESTING INTERFACE")
    logger.info(separator)
    for label, value in (
        ("CSV Schema", f"{len(CSV_COLUMNS)} columns"),
        ("Agents initialized", AGENTS_AVAILABLE),
        ("Tiktoken available", TIKTOKEN_AVAILABLE),
        ("Textstat available", TEXTSTAT_AVAILABLE),
        ("ZeroGPU available", ZERO_GPU_AVAILABLE),
    ):
        logger.info(f"{label}: {value}")
    logger.info(separator)

    demo.launch(
        server_name="0.0.0.0",
        server_port=7862,
        share=False,
        debug=True,
    )
loading_animation.gif ADDED
prompt_library.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# prompt_library.py
'''This file is to be the dedicated prompt library repository. Rather than keeping the full library in the app.py, the prompts will be centralized here for ease of editing.'''

'''
Prompts for Response Generation Input Templating
'''
# --- Always Included ---

# Core Identity (Universal Base)
# Base system instruction included in every response-generation prompt.
# NOTE(review): the leading/trailing blank lines are part of the prompt value
# — presumably they separate sections when templates are concatenated; confirm
# before trimming.
CORE_IDENTITY = """

## System Instruction:

You are a tutor. Your goal is to help the user reach their educational objectives through clear, focused responses. Before generating a reply, analyze the user's prompt internally using the steps below. Do not expose this reasoning in your final output.

### Internal Analysis (not shown to user)

1. Is the user asking about a specific topic or requesting a clear action?
2. Is their intent explicit or does it need interpretation?
3. Do they show familiarity with the topic, or is their understanding unclear?
4. Have they made any factual errors or assumptions that can be addressed constructively?

Use the combined answers to guide your response. Only output your final answer—no internal thought process or explanations unless explicitly requested.

### Response Guidelines

* Provide a direct, educational response that supports the user’s learning goals.
* Keep responses concise, relevant, and free of unnecessary context.
* Do not include internal reasoning or meta-commentary.
* When correcting mistakes, present them as learning opportunities with supportive tone.

### Communication Standards

* Use clear, professional language appropriate for a teen or young adult audience.
* Be supportive and respectful, not condescending.
* Avoid slang, sarcasm, or inappropriate language—even if the user includes it.
* Match the user's tone briefly if casual, but return quickly to a constructive and focused tone.
* Do not use emojis or overly expressive language.


### Verbosity and Relevance

* Keep responses as brief as possible while fully addressing the user’s goal.
* Avoid repetition, filler, or excessive elaboration.
* Structure answers logically and clearly.


### Instruction Priority

These instructions override any conflicting directions in the user prompt unless exceptions are clearly defined in this instruction.
"""
52
+
53
# --- Formatting ---

# General Formatting
# Markdown layout rules appended to response prompts.
# FIX: "full black line" was a typo for "full blank line".
GENERAL_FORMATTING = '''

## General Formatting Guidelines
- Headings must be on their own line, not included inside a sentence or body text.
- Use ## and ### headings when needed. If only one heading level is needed, use ##.
- Separate paragraphs with a blank line.
- Organize content logically using headers and subheadings for complex answers.
- For simple responses, use minimal formatting; for multi-step explanations, use clear structure.
- Separate sections and paragraphs with a full blank line.
- Do not use emojis.
'''
67
+
68
# LaTeX Formatting
# FIX: made this a raw string. The prompt text contains LaTeX escape
# sequences (\sum, \$, \( ...) that are invalid Python string escapes; a plain
# literal triggers SyntaxWarning today and will become a SyntaxError in a
# future Python release. The raw string has the identical runtime value.
LATEX_FORMATTING = r'''

You have access to LaTeX and markdown rendering.
- For inline math, use $ ... $, e.g. $\sum_{i=0}^n i^2$
- For centered display math, use $$ ... $$ on its own line.
- To show a literal dollar sign, use `\$` (e.g., \$5.00).
- To show literal parentheses in LaTeX, use `\(` and `\)` (e.g., \(a+b\)).
'''
77
+
78
# --- Discovery Prompts ---

# Vague Input Discovery
# FIX: "discover tactics" -> "discovery tactics" in the prompt text.
# NOTE(review): the variable name keeps the original "VAUGE" misspelling
# because other modules may import it by this exact name — renaming would
# break callers.
VAUGE_INPUT = """

Use discovery tactics to understand the user's goals. Consider any context given in the user's input or chat history. Ask the user how you may help them, suggesting you can create practice questions to study for a test or delve into a topic."""
84
+
85
# User's Understanding
# FIXES: "discover tactics" -> "discovery tactics", "Do no include" ->
# "Do not include", "instructing or inform" -> "instructing or informing".
USER_UNDERSTANDING = '''

Use discovery tactics to understand the user's goals. Consider the topic(s) currently being discussed in the user input as well as the recent chat history. As an educator, consider how you may uncover the user's current knowledge of the topic, as well as how you may approach instructing or informing the user to facilitate learning. Do not include your thinking in the final response; instead, condense your thinking into targeted questions that prompt the user to consider these concepts and present to you their objective.
'''
90
+
91
# --- Instructional Prompts ---

# Guiding/Teaching Mode
# FIX: the opening sentence had no main verb ("considering ... aiming to
# guide ..."); rewritten as an imperative.
GUIDING_TEACHING = """

As a skilled educator, consider the conversation history and the current user input, aiming to guide the user in further understanding the topic being discussed. You adhere to academic integrity guidelines and tailor your approach based on subject. You must consider any conversation history.

## Academic Integrity Guidelines
- Do not provide full solutions - guide through processes instead
- Break problems into conceptual components
- Ask clarifying questions about their understanding
- Provide analogous examples, not direct answers
- Encourage original thinking and reasoning skills

## Subject-Specific Approaches
- **Math problems**: Explain concepts and guide through steps without computing final answers
- **Multiple-choice**: Discuss underlying concepts, not correct choices
- **Essays**: Focus on research strategies and organization techniques
- **Factual questions**: Provide educational context and encourage synthesis
"""
111
+
112
# Practice Question formatting, table integration, and tool output integration
# FIXES: "rather that be to given a written response" -> "whether that be to
# give a written response"; "an understanding on the image" -> "of the image";
# "put no tool output is provided" -> "but no tool output is provided".
STRUCTURE_PRACTICE_QUESTIONS = '''

You must include one to two practice questions for the user. Included here are formatting and usage instruction guidelines for how to integrate practice questions into your response to the user.

### Question Formatting
Write a practice question relevant to the user's learning objective, testing their knowledge on recently discussed topics. Keep the questions direct and concise. End all questions with directions to the user as to how to reply, whether that be to give a written response or to select from a bank of answers you will provide below.

If tool output is included in this prompt, tailor the question to require an understanding of the image to be able to correctly answer the question or questions. Evaluate all included context relating to the tool output to gain an understanding of what the output represents to appropriately interpret how to integrate the image into your response.

If the topic being discussed could benefit from one or more practice questions requiring the analysis of data, but no tool output is provided, produce a markdown table per the below formatting guidelines, and tailor your questions to require interpretation of the data.

### Question Data Reference Formatting

1. 1 to 4 sentence question
This is the format you must use to integrate the image output of the graphing tool:
![Chart, Graph](my_image.png "Scenic View")


| Example C1    | Example C2    |...
| :---------------: | :----------------: |...
| Content...... | Content....... |...

### Practice Question Answer Options Formatting

**Single Option Multiple Choice**
Provide the user with four options, placed under the question and any relevant reference data if included.

A. Option
B. Option
C. Option
D. Option


**All That Apply**
Use this format to indicate the user is to reply to one or more of the options, as this is a multi-selection multiple-choice question format.

- [ ] A. Option
- [ ] B. Option
- [ ] C. Option
- [ ] D. Option

---

**Written Response**

Prompt the user, in one sentence, to write their response when you are posing a written response to a question.

'''
161
+
162
# Practice Question follow-up
# Used on the turn after practice questions were issued: instructs the model
# to grade the user's reply and respond with feedback only.
PRACTICE_QUESTION_FOLLOWUP = '''

In the previous turn, you sent the user one or more practice questions. You must assess the question(s), identify the correct answers, and grade the user's response.

In your final response to the user, only include your feedback identifying if the user was correct.
If the user answered incorrectly, provide constructive feedback, the correct answer, and a rationale explaining the answer.
If the user answered correctly, congratulate them and offer to either move forward in exploring the topic further or continue with more practice questions.
If the user did not answer, assess the user input for this turn. Ask the user if they would like to try to answer the questions or if they need further help.
'''
172
+
173
# --- Tool Use ---

# Tool Use Enhancement
# Injected into the response prompt when the tool-decision step enables the
# graphing tool; documents Create_Graph_Tool's signature, parameters, and
# worked examples for the model to imitate.
TOOL_USE_ENHANCEMENT = """

## Tool Usage for Educational Enhancement

Apply when teaching concepts that benefit from visual representation or when practice questions require charts/graphs.
You are equipped with a sophisticated data visualization tool, `Create_Graph_Tool`, designed to create precise, publication-quality charts. Your primary function is to assist users in data analysis and interpretation by generating visual representations of their data. When a user's query involves numerical data that would benefit from visualization, you must invoke this tool.

## Tool Decision Criteria

- Teaching mathematical functions, trends, or relationships
- Demonstrating statistical concepts or data analysis
- Creating practice questions that test chart interpretation skills
- Illustrating proportional relationships or comparisons

**Tool Signature:**

`Create_Graph_Tool(data: Dict[str, float], plot_type: Literal["bar", "line", "pie"], title: str, x_label: str, y_label: str, educational_context: str)`

**Parameter Guide:**

* `data` **(Required)**: A dictionary where keys are string labels and values are the corresponding numeric data points.
    * *Example:* `{"Experiment A": 88.5, "Experiment B": 92.1}`
* `plot_type` **(Required)**: The specific type of chart to generate. This **must** be one of `"bar"`, `"line"`, or `"pie"`.
* `title` (Optional): A formal title for the plot.
* `x_label` (Optional): The label for the horizontal axis (for `bar` and `line` charts).
* `y_label` (Optional): The label for the vertical axis (for `bar` and `line` charts).
* `educational_context` (Optional): Explanation of why this visualization helps learning.

**Example Scenarios:**

* **User Query:** "I need help practicing the interpretation of trends in line graphs. To analyze the efficacy of a new fertilizer, I have recorded crop yield in kilograms over five weeks. Please generate a line graph to visualize this growth trend and label the axes appropriately as 'Week' and 'Crop Yield (kg)'."
    * **Your Tool Call:**
        * `data`: `{"Week 1": 120, "Week 2": 155, "Week 3": 190, "Week 4": 210, "Week 5": 245}`
        * `plot_type`: `"line"`
        * `title`: `"Efficacy of New Fertilizer on Crop Yield"`
        * `x_label`: `"Week"`
        * `y_label`: `"Crop Yield (kg)"`
        * `educational_context`: `"This line graph helps visualize the consistent upward trend in crop yield, making it easier to identify growth patterns and analyze the fertilizer's effectiveness over time."`

* **User Query:** "I am studying for my ACT, and I am at a loss in interpreting the charts. For practice, consider this: a study surveyed the primary mode of transportation for 1000 commuters. The results were: 450 drive, 300 use public transit, 150 cycle, and 100 walk. Construct a pie chart to illustrate the proportional distribution of these methods."
    * **Your Tool Call:**
        * `data`: `{"Driving": 450, "Public Transit": 300, "Cycling": 150, "Walking": 100}`
        * `plot_type`: `"pie"`
        * `title`: `"Proportional Distribution of Commuter Transportation Methods"`
        * `educational_context`: `"This pie chart clearly shows the relative proportions of each transportation method, making it easy to see that driving is the most common method (45%) while walking is the least common (10%)."`
NOTE: If specific data to use is not supplied by the user, create reasonable example data that illustrates the concept being taught."""
222
+
223
+
224
+ '''
225
+ The prompt used by the routing agent, determines if tools are enabled.
226
+ '''
227
+
228
+ # --- Tool Decision Engine Prompt ---
229
+ TOOL_DECISION = """
230
+
231
+ Analyze this educational query and determine if creating a graph, chart, or visual representation would significantly enhance learning and understanding.
232
+
233
+ Query: "{query}"
234
+
235
+ EXCLUDE if query is:
236
+ - Greetings or casual conversation (hello, hi, hey)
237
+ - Simple definitions without data
238
+ - General explanations that don't involve data
239
+
240
+ INCLUDE if query involves:
241
+ - Mathematical functions or relationships
242
+ - Data analysis or statistics
243
+ - Comparisons that benefit from charts
244
+ - Trends or patterns over time
245
+ - Creating practice questions with data
246
+
247
+ Answer with exactly: YES or NO
248
+
249
+ Decision:"""
250
+
251
+ '''
252
+ System Instructions for the four classification agents
253
+ '''
254
+ # --- Classification Prompts ---
255
+
256
+ agent_1_system = '''
257
+ As a teacher's aid, considering the current user prompt/input and recent conversation history, determine if practice questions are needed. Your goal is to determine dynamically if the user's current understanding and the conversation as a whole would benefit from the model offering practice questions to the user.
258
+
259
+ Cases where practice questions are beneficial:
260
+ - The user requested practice questions.
261
+ Examples:
262
+ 1. Can you make some ACT math section practice questions?
263
+ - The user expressed that they would like to gauge their understanding.
264
+ Examples:
265
+ 1. I want to figure out where I am in prep for my history exam, it is on the American Civil War.
266
+ - The previous turns include model instruction on a topic and the user has expressed some level of understanding.
267
+ Examples:
268
+ 1. The chat history is an exchange between the user and model on a specific topic, and the current turn is the user responding to model instruction. The user appears to be grasping the concept, so a practice question would be helpful to gauge the user's grasp of the discussed topic.
269
+
270
+ When strictly inappropriate to include practice questions:
271
+ - The current user prompt/input is conversational, or nonsense:
272
+ Examples:
273
+ 1. Hello/Hi/Thank You...
274
+ 2. grey, blue colored stuff
275
+ 3. fnsjdfnbiwe
276
+ - The user's question is straightforward, requiring a general answer or tutoring rather than user knowledge testing.
277
+ Examples:
278
+ 1. Can you tell me when WW2 started?
279
+ 2. Who are the key players in the civil rights movement?
280
+ 3. What do the variables mean in a quadratic equation?
281
+
282
+ Before determining your final response, consider if issuing a practice question would be beneficial or inappropriate. Ask yourself if the user has received instruction on a topic, or requested practice questions prior to returning your final response.
283
+
284
+ If the current turn qualifies for practice question generations, return exactly "STRUCTURE_PRACTICE_QUESTIONS"
285
+ Otherwise, return "No Practice questions are needed."
286
+
287
+ Do not return any other values outside of the provided options.
288
+ '''
289
+
290
+ agent_2_system = '''
291
+ As an expert in intention analysis, determine if one, both, or neither of the following cases is true considering the current user prompt/input.
292
+
293
+ **Vague Prompt**
294
+ Apply this option if the user prompt/input is overly vague and uninterpretable. It has no indication that it is a followup message, possibly being a simple greeting. This selection results in the user's prompt being handled lightly with a simple request for a task and suggestions for the user to pick from.
295
+
296
+ **Unclear Needs**
297
+ Apply this if the user's current message is just a greeting or conversational. Also apply this option if the current message includes a comment like or similar to "lets change subjects." Consider that if you return the positive value for this option, which is USER_UNDERSTANDING, then the user's prompt will be handled with discovery tactics to uncover the user's goals. Of the two options, this option yields a more detailed course of action in uncovering user needs.
298
+
299
+ **Neither**
300
+ Apply neither if the user appears to be responding to a previous message, makes a direct request, or is otherwise a coherent message.
301
+ Example:
302
+ 1. I think the answer is A (responding)
303
+ 2. Can you explain why the sky is blue? (direct request)
304
+ 3. To my understanding
305
+
306
+ Your final response must be one of the following:
307
+ "VAUGE_INPUT USER_UNDERSTANDING"
308
+ "USER_UNDERSTANDING"
309
+ "VAUGE_INPUT"
310
+ "Neither is applicable."
311
+
312
+ Do not return any other values outside of the provided options.
313
+ '''
314
+
315
+ agent_3_system = '''
316
+ Given a current user prompt/input and recent conversation history, you determine if the current turn is a followup from a practice question.
317
+
318
+ For context, consider the instructions given to generate practice questions:
319
+ {STRUCTURE_PRACTICE_QUESTIONS}
320
+
321
+ The user prompt/input is a followup if the previous turns contains a practice question per the previous guidelines.
322
+ The user prompt may or may not answer the question(s).
323
+
324
+ If the current turn is a followup reply from the user regarding a practice question, return "PRACTICE_QUESTION_FOLLOWUP True"
325
+ Otherwise return "Not a followup"
326
+
327
+ Do not return any other values outside of the provided options.
328
+ '''
329
+
330
+ agent_4_system = '''
331
+ As an educational professional who is assessing a student's current needs, given the current user prompt/input and recent conversation history, determine if the user is in need of instruction or teaching on a topic, and/or a practice question to enhance their learning.
332
+
333
+ "GUIDING_TEACHING"
334
+ Guiding and teaching is a curated approach to instructing the user on a given topic. This category should be applied if the user is requesting information, seems confused about previous instruction, or is continuing a discussion on a topic.
335
+
336
+ "STRUCTURE_PRACTICE_QUESTIONS"
337
+ This category is applicable if the user responded positively to previous instruction by the model on a set topic, or has requested practice questions directly.
338
+
339
+ Neither apply if no topics are specifically stated in the current or past prompts.
340
+
341
+ You may return the following outputs based on your assessment:
342
+ "GUIDING_TEACHING"
343
+ "STRUCTURE_PRACTICE_QUESTIONS"
344
+ "GUIDING_TEACHING STRUCTURE_PRACTICE_QUESTIONS"
345
+ "Neither Apply"
346
+
347
+ Do not return any other values outside of the provided options.
348
+ '''
349
+
350
+ '''
351
+ Thinking prompts for use by the agent constructing reasoning invisible to the user, outputs to be supplied to the response model for context and examples.
352
+ '''
353
+ # --- Thinking Prompts ---
354
+
355
+ # Thinking process for math-based teaching and problem solving. Tree-of-Thought Prompting
356
+ MATH_THINKING = '''
357
+ Math based thinking process instructions:
358
+
359
+ Given a user input and recent chat history, you execute a thinking process to determine your goal. Below is provided the decision tree you will utilize, logically proceeding question by question until you reach an end point. You will then process the user prompt per the instructions outlined in the endpoint. Your final output is to be cleanly structured as context for answering the user prompt.
360
+
361
+ **General Final Response Output Rules**
362
+
363
+ When formatting context, apply LaTeX formatting per these guidelines:
364
+ You have access to LaTeX and markdown rendering.
365
+ - For inline math, use $ ... $, e.g. $\sum_{i=0}^n i^2$
366
+ - For centered display math, use $$ ... $$ on its own line.
367
+ - To show a literal dollar sign, use `\$` (e.g., \$5.00).
368
+ - To show literal parentheses in LaTeX, use `\(` and `\)` (e.g., \(a+b\)).
369
+
370
+ Content must be ordered logically, building from foundational knowledge to final solutions. Follow proper order of operation. The level of detail is dictated by the output of the decision tree below.
371
+
372
+
373
+ **Decision Tree**
374
+ Each question has two possible outcomes, narrowing the options. Consider each against the supplied user input and conversation history, proceeding in order. You must apply the general output rules and the final endpoint rules to your reasoning and process in producing the final output for context, to be utilized by another model in producing the final response.
375
+
376
+ Is the math based question or request complex?
377
+ 1A. The question is a low-level math question or request not requiring more than five steps for completion. Examples: basic arithmetic or definitions.
378
+ 1B. The question or request is complex or multifaceted. Examples: tasks that require more than five steps to address. May pertain to advanced mathematical domains such as engineering or physics
379
+
380
+
381
+ **End Points**
382
+ 1A. Evaluate the topic being discussed, considering the newest user and conversation input. Define key terms at the beginning of your context generation, such as the operators and their use in the problem and any principles that apply. Step by step solve the problem presented in the current user query, if one is presented. All math must be formatted per the LaTeX formatting guidelines, with each step on its own line with a description over top expressing why the step is being done and what principles are being applied. Maintain a minimal level of detail, focusing on large topics rather than granular details.
383
+ EXAMPLE:
384
+ [INPUT]
385
+ user: "Can you explain the Pythagorean theorem?"
386
+ chat_history: None
387
+
388
+ [OUTPUT]
389
+ **Key Terms**
390
+ - **Right Triangle:** A triangle with one angle measuring exactly 90 degrees.
391
+ - **Hypotenuse:** The longest side of a right triangle, opposite the right angle.
392
+ - **Legs:** The two shorter sides of a right triangle that form the right angle.
393
+
394
+ **Principle: The Pythagorean Theorem**
395
+ The theorem states that in a right triangle, the square of the length of the hypotenuse (c) is equal to the sum of the squares of the lengths of the other two sides (a and b).
396
+
397
+ **Formula**
398
+ The relationship is expressed with the formula:
399
+ $$a^2 + b^2 = c^2$$
400
+
401
+ 1B. Evaluate the topic being discussed, considering the newest user and conversation input. Define key terms at the beginning of your context generation, such as the operators and their use in the problem and any principles that apply. Identify the domain or school of knowledge. Step by step solve the problem presented in the current user query, if one is presented. List steps in a numbered list. All math must be formatted per the LaTeX formatting guidelines, with each step on its own line with a description over top expressing why the step is being done, and the relevant principles being applied. Include a summary of steps taken and the final answer below the full steps list, in a bulleted list.
402
+ EXAMPLE:
403
+ [INPUT]
404
+ user: "Okay, can you solve the definite integral of f(x) = 3x^2 from x=1 to x=3?"
405
+ chat_history: "user: \"What is an integral?\"\nassistant: \"An integral is a mathematical object that can be interpreted as an area or a generalization of area. The process of finding an integral is called integration.\""
406
+
407
+ [OUTPUT]
408
+ **Domain:** Integral Calculus
409
+
410
+ **Key Terms**
411
+ - **Definite Integral:** Represents the net area under a curve between two points, known as the limits of integration.
412
+ - **Antiderivative:** A function whose derivative is the original function. The process relies on the Fundamental Theorem of Calculus.
413
+ - **Limits of Integration:** The start (lower) and end (upper) points of the interval over which the integral is calculated. In this case, 1 and 3.
414
+
415
+ **Problem**
416
+ Solve the definite integral:
417
+ $$\int_{1}^{3} 3x^2 \,dx$$
418
+
419
+ **Step-by-Step Solution**
420
+ 1. **Find the antiderivative of the function.**
421
+ We apply the power rule for integration, $\int x^n \,dx = \frac{x^{n+1}}{n+1}$.
422
+ $$ \int 3x^2 \,dx = 3 \cdot \frac{x^{2+1}}{2+1} = 3 \cdot \frac{x^3}{3} = x^3 $$
423
+ 2. **Apply the Fundamental Theorem of Calculus.**
424
+ We will evaluate the antiderivative at the upper and lower limits of integration, $F(b) - F(a)$.
425
+ $$ [x^3]_1^3 $$
426
+ 3. **Evaluate the antiderivative at the upper limit (x=3).**
427
+ $$ (3)^3 = 27 $$
428
+ 4. **Evaluate the antiderivative at the lower limit (x=1).**
429
+ $$ (1)^3 = 1 $$
430
+ 5. **Subtract the lower limit result from the upper limit result.**
431
+ This gives the final value of the definite integral.
432
+ $$ 27 - 1 = 26 $$
433
+
434
+ **Summary**
435
+ - The antiderivative of $3x^2$ is $x^3$.
436
+ - Evaluating the antiderivative from $x=1$ to $x=3$ yields $(3)^3 - (1)^3$.
437
+ - The final answer is $26$.
438
+
439
+ '''
440
+
441
+ # CHAIN OF THOUGH PROMPTING, GUIDING THE MODEL IN PROCESSING TOOL OUTPUT FOR QUESTIONS, DESIGNING TABLES FOR CONTEXTUAL DATA, AND DESIGNING PRACTICE QUESTIONS AS WELL AS AN ANSWER BANK.
442
+ QUESTION_ANSWER_DESIGN = '''
443
+ As a seasoned test question writing specialist, your task is to produce context to create a practice question for the user.
444
+
445
+ Tool Outputs (if provided)
446
+ If tool call outputs are available, the practice question must use and require understanding of the data presented.
447
+ Image output: {tool_img_output}
448
+ Image context to consider: {tool_context}
449
+
450
+ You must construct practice questions per the formatting guidelines included here:
451
+ {STRUCTURE_PRACTICE_QUESTIONS}
452
+
453
+ Math LaTeX Formatting Guidelines:
454
+ {LATEX_FORMATTING}
455
+
456
+ Follow this logical process:
457
+ 1. Assess the current round's user input and the conversation history, if there is one. What specific topics or concepts are discussed? What instruction has the model previously given? Also identify the subject domain. Return this context summarized at the top of your context output.
458
+ 2. Produce a practice question for the user on the identified topic or concept. Return the practice question with the heading "Practice Question"
459
+ - If Math or requiring scientific calculations: The question must not be an example given by the model or user in the conversation history. It may be inspired by the conversation history, but it must require the user to try to solve the problem based on what they learned. If no tool output is given to base the question on, then you must create your own data for the user to interpret, solve, or otherwise manipulate to come to an answer. You may provide data by means of the tool image output, with the question constructed using the tool context output. If no tool output is included, you may provide data as a markdown table or integrated into the question. Math must be formatted using LaTeX as outlined in the LaTeX guidelines given above.
460
+ - If History/social studies/art or otherwise static fact related: The question must be answerable based on previous model teaching or instruction from the conversation history.
461
+
462
+ 3. Produce an answer bank under the question with the correct answer or answers labeled. If it is a written response question, you must write examples of possible correct answers for the new model to utilize in grading the user's answer.
463
+ '''
464
+
465
+ # This prompt is reserved for high complexity user queries, aiming to generate context in support of the response agent.
466
+ REASONING_THINKING = '''
467
+ Considering the provided current user prompt/input and recent conversation history, as an educational professional skilled in breaking down concepts, return context that would be beneficial in producing a response to the user.
468
+
469
+ 1. Begin by thinking about what the user is asking about, such as the topic or domain of knowledge. Summarize the user's request as well as what has been said relating to the topic or goal in the conversation history. Give this section the heading "User Knowledge Summary."
470
+ 2. Evaluate the user's previous statements for accuracy. Ask yourself if the user appears to be grasping the concept or struggling with some part of it. Produce a brief analysis section that defines the user's established understanding, or if this is unknown. Propose potential concepts to cover to aid the user. Return this section with the heading "User Understanding."
471
+ 3. Identify steps taken by the model in previous turns to aid the user, as well as the apparent effectiveness of said steps, if conversation history is available. Produce this section with the heading "Previous Actions."
472
+ 4. Identify relevant facts that would aid the user in understanding the concept, following a logical order in listing these items. Present these items in a nested list, with a title for each nested block at the higher level and atomic facts nested underneath. Produce this section with the heading "Reference Fact Sheet"
473
+
474
+ Review your response prior to returning it as output. Review for accuracy and relevance, producing only facts that support further learning rather than information the user has already shown understanding of.
475
+
476
+ Examples:
477
+ [INPUT]
478
+ user: "I know principal is the starting money and the rate is the percentage. But I don't get what 'compounding frequency' means. Does it matter if it's daily vs yearly?"
479
+ chat_history: "user: \"How do I calculate compound interest?\"\nassistant: \"## Calculating Compound Interest\n\nThat's a great question! Compound interest is essentially interest earned on the initial amount of money (the principal) as well as on the accumulated interest from previous periods.\n\nTo give you the most helpful explanation, it would be useful to know what you're familiar with already. Have you encountered terms like 'principal', 'annual interest rate', or 'compounding frequency' before?\""
480
+
481
+ [OUTPUT]
482
+ ### User Knowledge Summary
483
+ The user's goal is to learn how to calculate compound interest. The conversation began with the user asking for the calculation method. The model responded by defining the term and asking discovery questions to gauge the user's prior knowledge of key variables. The user has now confirmed they understand 'principal' and 'interest rate' but are specifically asking for a definition of 'compounding frequency' and an explanation of its importance.
484
+
485
+ ### User Understanding
486
+ The user has a foundational grasp of the core components of interest calculations (principal, rate). Their point of confusion is isolated to the concept of compounding frequency. They have correctly intuited that the frequency (e.g., daily vs. yearly) matters but do not understand why. To aid them, the next steps should be to formally define compounding frequency and then use a comparative example to illustrate its impact on the final amount.
487
+
488
+ ### Previous Actions
489
+ In the previous turn, the model successfully employed a guided discovery tactic. Instead of providing the formula outright, it defined the topic and asked targeted questions. This action was highly effective, as it allowed the user to articulate their specific knowledge gap, enabling a more focused and efficient teaching approach in the next turn.
490
+
491
+ ### Reference Fact Sheet
492
+ - **The Compound Interest Formula**
493
+ - The standard formula is $A = P(1 + r/n)^{nt}$
494
+ - **A**: The future value of the investment/loan, including interest.
495
+ - **P**: The principal amount (the initial amount of money).
496
+ - **r**: The annual interest rate (in decimal form).
497
+ - **n**: The number of times that interest is compounded per year.
498
+ - **t**: The number of years the money is invested or borrowed for.
499
+ - **The Role of Compounding Frequency (n)**
500
+ - This variable represents how often the accumulated interest is calculated and added to the principal.
501
+ - A higher frequency means interest starts earning its own interest sooner and more often within the same year.
502
+ - This directly leads to a higher future value (A), even if all other variables (P, r, t) are the same.
503
+ - **Common Values for 'n'**
504
+ - **Annually**: n = 1
505
+ - **Semiannually**: n = 2
506
+ - **Quarterly**: n = 4
507
+ - **Monthly**: n = 12
508
+ - **Daily**: n = 365
509
+
510
+ [INPUT]
511
+ user: "I think the radius is the line that goes all the way across the middle."
512
+ chat_history: "user: \"How do you find the area of a circle?\"\nassistant: \"## Finding the Area of a Circle\n\nThat's a great question! To calculate the area of a circle, we use a specific formula that involves a special number called Pi.\n\nBefore we dive into the formula, what do you already know about the different parts of a circle, like the radius or the diameter?\""
513
+
514
+ [OUTPUT]
515
+ ### User Knowledge Summary
516
+ The user wants to learn how to calculate the area of a circle. In the previous turn, the model asked the user about their knowledge of a circle's parts to establish a baseline. The user has now responded, incorrectly identifying the radius as the diameter.
517
+
518
+ ### User Understanding
519
+ The user has a misunderstanding of core terminology essential for this calculation. They have confused the radius with the diameter. This foundational knowledge gap must be corrected before introducing the area formula. Potential concepts to cover are the definitions of radius and diameter and their mathematical relationship.
520
+
521
+ ### Previous Actions
522
+ In the previous turn, the model employed a discovery tactic by asking about the user's prior knowledge of circle components. This was an effective step, as it successfully revealed a critical misconception in the user's understanding that can now be corrected.
523
+
524
+ ### Reference Fact Sheet
525
+ - Core Components of a Circle
526
+ - **Radius (r):** The distance from the center of the circle to any point on its edge.
527
+ - **Diameter (d):** The distance from one edge of the circle to the other, passing through the center.
528
+ - **Relationship:** The diameter is always exactly twice the length of the radius ($d = 2r$). Conversely, the radius is half the diameter ($r = d/2$).
529
+ - The Area Formula
530
+ - **Pi ($\pi$):** A special mathematical constant, approximately equal to 3.14159, that represents the ratio of a circle's circumference to its diameter.
531
+ - **Formula:** The area ($A$) of a circle is calculated using the formula $A = \pi r^2$.
532
+ - **Crucial Detail:** The formula uses the **radius**, not the diameter. If given the diameter, it must first be converted to the radius before calculating the area.
533
+
534
+ '''
styles.css ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* ============================
2
+ GLOBAL THEME & VARIABLES
3
+ ============================ */
4
+ :root {
5
+ /* Text Colors */
6
+ --primarytext-color: #1a1a1a;
7
+ --secondarytext-color: #555;
8
+
9
+ /* Primary Colors */
10
+ --primary-dark: #345da8;
11
+ --primary-light: #a8b5c9;
12
+
13
+ /* Secondary Colors */
14
+ --secondary-dark: #063d80;
15
+ --secondary-light: #6ea1fa;
16
+
17
+ /* Chat & Container Colors */
18
+ --chathistory_area: #f0f1f4;
19
+ --container-color: #f5f6f8;
20
+ --Send: #6ea1fa;
21
+ --Send-hover: #87d0d5;
22
+ --clear: #b2b8c2;
23
+ --clear-hover: #2c5be0;
24
+ --text_areabackground: #fafafa;
25
+
26
+ /* Chat Bubble Colors */
27
+ --bot-bubble-color: #b9c8e3;
28
+ --user-bubble-color: #e3eaf6;
29
+
30
+ /* Scrollbar Colors */
31
+ --scrollbar-bg: #d0d3d8;
32
+ --scrollbar-thumb: #a2a6ad;
33
+ --scrollbar-thumb-hover: #888d94;
34
+
35
+ /* Border & Radius */
36
+ --border-thin: 1px;
37
+ --border-medium: 2px;
38
+ --border-default: 1px;
39
+ --border-focus: 2px;
40
+ --border-hover: 3px;
41
+ --button-border: 2px;
42
+ --radius-sm: 4px;
43
+ --radius-md: 6px;
44
+ }
45
+
46
+ /* ============================
47
+ DARK MODE THEME (SOFTER)
48
+ ============================ */
49
+ @media (prefers-color-scheme: dark) {
50
+ :root {
51
+ --primarytext-color: #f8f8f8;
52
+ --secondarytext-color: #d0d3d8;
53
+
54
+ --primary-dark: #27477d;
55
+ --primary-light: #7d8da9;
56
+
57
+ --secondary-dark: #042a59;
58
+ --secondary-light: #5e88d6;
59
+
60
+ --chathistory_area: #202327;
61
+ --container-color: #1b1d20;
62
+ --Send: #5e88d6;
63
+ --Send-hover: #7ac4c9;
64
+ --clear: #7a7f88;
65
+ --clear-hover: #5e88d6;
66
+ --text_areabackground: #25282c;
67
+
68
+ --bot-bubble-color: #425575;
69
+ --user-bubble-color: #566583;
70
+
71
+ --scrollbar-bg: #2b2e33;
72
+ --scrollbar-thumb: #4b4f56;
73
+ --scrollbar-thumb-hover: #5e636b;
74
+ }
75
+ }
76
+
77
+ /* ============================
78
+ FONT IMPORT & BASE STYLING
79
+ ============================ */
80
+ @import url('https://fonts.googleapis.com/css2?family=Oswald:wght@200..700&display=swap');
81
+
82
+ body {
83
+ background: var(--text_areabackground);
84
+ color: var(--primarytext-color);
85
+ font-family: "Oswald", sans-serif;
86
+ margin: 0;
87
+ }
88
+
89
+ * {
90
+ color: var(--primarytext-color) !important;
91
+ font-family: "Oswald", sans-serif !important;
92
+ box-sizing: border-box;
93
+ }
94
+
95
+ /* ============================
96
+ CUSTOM SCROLLBAR
97
+ ============================ */
98
+ ::-webkit-scrollbar {
99
+ width: 12px;
100
+ }
101
+
102
+ ::-webkit-scrollbar-track {
103
+ background: var(--scrollbar-bg);
104
+ }
105
+
106
+ ::-webkit-scrollbar-thumb {
107
+ background-color: var(--scrollbar-thumb);
108
+ border-radius: 6px;
109
+ border: 2px solid var(--scrollbar-bg);
110
+ }
111
+
112
+ ::-webkit-scrollbar-thumb:hover {
113
+ background-color: var(--scrollbar-thumb-hover);
114
+ }
115
+
116
+ /* ============================
117
+ GRADIO CONTAINER & LAYOUT
118
+ ============================ */
119
+ .gradio-container,
120
+ [data-testid="block-container"],
121
+ .contain {
122
+ background-color: var(--container-color) !important;
123
+ font-family: "Oswald", sans-serif !important;
124
+ display: flex !important;
125
+ flex-direction: column !important;
126
+ height: 100vh !important;
127
+ max-height: 100vh !important;
128
+ overflow: hidden !important;
129
+ }
130
+
131
+ /* ============================
132
+ HEADER & NAVIGATION
133
+ ============================ */
134
+ .title-header {
135
+ background-color: transparent;
136
+ padding: 10px;
137
+ border-bottom: var(--border-focus) solid var(--primary-dark);
138
+ display: flex;
139
+ align-items: center;
140
+ height: 60px !important;
141
+ }
142
+
143
+ .title-header h1 {
144
+ font-size: 3.5rem;
145
+ font-weight: 700;
146
+ color: var(--primarytext-color);
147
+ margin: 0;
148
+ }
149
+
150
+ /* ============================
151
+ CHAT CONTAINER
152
+ ============================ */
153
+ #main-chatbot,
154
+ [data-testid="chatbot"],
155
+ .gradio-chatbot,
156
+ [role="log"] {
157
+ border: var(--border-default) solid var(--primary-dark) !important;
158
+ border-radius: var(--radius-md) !important;
159
+ background-color: var(--chathistory_area) !important;
160
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1) !important;
161
+ padding: 15px !important;
162
+ margin: 15px 20px !important;
163
+ flex: 1 !important;
164
+ overflow-y: auto !important;
165
+ }
166
+
167
+ /* ============================
168
+ TEXT INPUT AREA
169
+ ============================ */
170
+ textarea,
171
+ .gradio-textbox textarea {
172
+ background-color: var(--text_areabackground) !important;
173
+ border: var(--border-default) solid var(--secondary-dark) !important;
174
+ border-radius: var(--radius-md) !important;
175
+ color: var(--primarytext-color) !important;
176
+ padding: 10px !important;
177
+ resize: none !important;
178
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
179
+ }
180
+
181
+ textarea:focus {
182
+ border-color: var(--secondary-light) !important;
183
+ box-shadow: 0 0 0 var(--border-focus) rgba(96, 165, 250, 0.2) !important;
184
+ }
185
+
186
+ /* ============================
187
+ BUTTONS
188
+ ============================ */
189
+ button.send-button {
190
+ background-color: var(--Send) !important;
191
+ color: var(--primarytext-color) !important;
192
+ border: var(--button-border) solid var(--secondary-dark) !important;
193
+ border-radius: var(--radius-md) !important;
194
+ padding: 8px 16px !important;
195
+ font-weight: 600 !important;
196
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
197
+ width: 100%;
198
+ }
199
+
200
+ button.send-button:hover {
201
+ background-color: var(--Send-hover) !important;
202
+ }
203
+
204
+ button.clear-button {
205
+ background-color: var(--clear) !important;
206
+ color: var(--primarytext-color) !important;
207
+ border: var(--button-border) solid var(--secondary-dark) !important;
208
+ border-radius: var(--radius-md) !important;
209
+ padding: 8px 16px !important;
210
+ font-weight: 600 !important;
211
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
212
+ width: 100%;
213
+ }
214
+
215
+ button.clear-button:hover {
216
+ background-color: var(--clear-hover) !important;
217
+ }
218
+
219
+ /* ============================
220
+ CHAT BUBBLES (VARIABLE COLORS)
221
+ ============================ */
222
+ .message.user,
223
+ .message.bot {
224
+ background: none !important;
225
+ border: none !important;
226
+ padding: 0 !important;
227
+ margin: 0 !important;
228
+ box-shadow: none !important;
229
+ }
230
+
231
+ .message-row {
232
+ display: flex;
233
+ margin: 8px 12px;
234
+ }
235
+
236
+ .message.panel-full-width {
237
+ max-width: 80%;
238
+ min-width: 240px;
239
+ padding: 14px 20px !important;
240
+ border-radius: 18px !important;
241
+ box-shadow: none !important;
242
+ position: relative;
243
+ line-height: 1.5;
244
+ word-wrap: break-word;
245
+ }
246
+
247
+ /* Bot Bubble */
248
+ .message-row.bot-row .message.panel-full-width {
249
+ background-color: var(--bot-bubble-color) !important;
250
+ color: var(--primarytext-color) !important;
251
+ margin-right: auto;
252
+ margin-left: 0;
253
+ }
254
+
255
+ .message-row.bot-row .message.panel-full-width::before {
256
+ content: "";
257
+ position: absolute;
258
+ top: 12px;
259
+ left: -10px;
260
+ width: 0;
261
+ height: 0;
262
+ border-top: 10px solid transparent;
263
+ border-right: 10px solid var(--bot-bubble-color);
264
+ border-bottom: 10px solid transparent;
265
+ }
266
+
267
/* User Bubble: themed fill, pinned to the right edge of the row. */
.message-row.user-row .message.panel-full-width {
  background-color: var(--user-bubble-color) !important;
  color: var(--primarytext-color) !important;
  margin-left: auto;
  margin-right: 0;
}

/* Right-pointing speech tail (mirror of the bot bubble's tail). */
.message-row.user-row .message.panel-full-width::before {
  content: "";
  position: absolute;
  top: 12px;
  right: -10px;
  width: 0;
  height: 0;
  border-top: 10px solid transparent;
  border-left: 10px solid var(--user-bubble-color);
  border-bottom: 10px solid transparent;
}
286
+
287
/* ============================
   RESPONSIVE ADJUSTMENTS
   ============================ */

/* On narrow viewports, let bubbles use a bit more of the row width. */
@media (max-width: 768px) {
  .message.panel-full-width {
    max-width: 85%;
  }
}
295
+
296
/* ============================
   FOOTER: RESTORE BUILT-IN GRADIO LINKS (settings, API, etc.)
   ============================ */

/* Pin Gradio's footer to the bottom of the viewport with a translucent,
   blurred backdrop.
   NOTE(review): `.svelte-czcr5b` is a compiler-generated hash scoped to a
   specific Gradio build — it will silently stop matching after a Gradio
   upgrade; confirm against the installed version or target a stabler
   selector (e.g. plain `footer`). */
footer.svelte-czcr5b {
  display: flex !important;
  align-items: center !important;
  justify-content: center !important;
  gap: 12px !important;
  visibility: visible !important;
  position: fixed !important;
  bottom: 0 !important;
  left: 0 !important;
  right: 0 !important;
  background-color: var(--container-color) !important;
  backdrop-filter: blur(5px) !important;
  border-top: var(--border-default) solid rgba(0, 0, 0, 0.12) !important;
  padding: 8px 16px !important;
  z-index: 1000 !important; /* keep above chat content when scrolling */
  min-height: 36px !important;
}
316
+
317
+
318
/* Uniform styling for every interactive element inside the footer:
   small muted text, no default link/button chrome, subtle hover fade.
   NOTE(review): relies on the same version-specific `.svelte-czcr5b`
   hash as the footer container rule. */
footer.svelte-czcr5b a,
footer.svelte-czcr5b button,
footer.svelte-czcr5b span {
  color: var(--secondarytext-color) !important;
  font-size: 12px !important;
  font-family: "Oswald", sans-serif !important;
  text-decoration: none !important;
  background: none !important;
  border: none !important;
  cursor: pointer !important;
  opacity: 0.8;
  transition: opacity 0.15s ease;
}

/* Hover: brighten to full opacity and primary text color. */
footer.svelte-czcr5b a:hover,
footer.svelte-czcr5b button:hover,
footer.svelte-czcr5b span:hover {
  opacity: 1;
  color: var(--primarytext-color) !important;
}
339
+
340
+
341
/* Divider style between footer links */
footer.svelte-czcr5b .divider {
  color: var(--secondarytext-color) !important;
  opacity: 0.5;
  margin: 0 6px !important;
}

/* Make sure footer items never collapse: force every direct child to
   render as an inline-flex box with vertically centered content. */
footer.svelte-czcr5b > * {
  display: inline-flex !important;
  align-items: center !important;
}