.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore DELETED
@@ -1,40 +0,0 @@
1
- # .gitignore
2
- # Database files
3
- *.db
4
- *.db-journal
5
- *.db-shm
6
- *.db-wal
7
- mimir_analytics.db
8
-
9
- # Python
10
- __pycache__/
11
- *.py[cod]
12
- *$py.class
13
- *.so
14
- .Python
15
- *.egg-info/
16
- dist/
17
- build/
18
-
19
- # Environment
20
- .env
21
- .venv/
22
- venv/
23
- ENV/
24
-
25
- # IDE
26
- .vscode/
27
- .idea/
28
- *.swp
29
- *.swo
30
- *~
31
-
32
- # OS
33
- .DS_Store
34
- Thumbs.db
35
-
36
- # Logs
37
- *.log
38
-
39
- # Git
40
- .git/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile DELETED
@@ -1,13 +0,0 @@
1
- FROM python:3.10
2
-
3
- WORKDIR /app
4
- COPY requirements.txt .
5
- RUN pip install -r requirements.txt
6
-
7
- COPY . .
8
-
9
- # Force unbuffered Python output
10
- ENV PYTHONUNBUFFERED=1
11
-
12
- # Run with explicit python -u flag
13
- CMD ["python", "-u", "app.py"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
LightEval_Mimir.py DELETED
@@ -1,109 +0,0 @@
1
- # LightEval_Mimir.py
2
- '''This document outlines hte LightEval setu for tracking performance metrics of Mimir, to be sent to the trackio page for viszulization.'''
3
-
4
- # Imports
5
- from lighteval.metrics.metrics_sample import BertScore, ROUGE
6
- from lighteval.tasks.requests import Doc
7
-
8
- async def evaluate_educational_quality(user_query, response, thread_id):
9
- """Dynamic evaluation using LightEval metrics"""
10
- # Create ephemeral task for this turn
11
- doc = Doc(
12
- task_name=f"turn_{thread_id}",
13
- query=user_query,
14
- choices=[response],
15
- gold_index=-1, # No ground truth initially
16
- specific_output=response
17
- )
18
-
19
- # Use BertScore for semantic quality
20
- bert_score = BertScore().compute(doc)
21
-
22
- # Custom educational coherence metric
23
- educational_indicators = {
24
- 'has_examples': 'example' in response.lower(),
25
- 'structured_explanation': '##' in response or '1.' in response,
26
- 'appropriate_length': 100 < len(response) < 1500,
27
- 'encourages_learning': any(phrase in response.lower()
28
- for phrase in ['practice', 'try', 'consider', 'think about'])
29
- }
30
-
31
- return {
32
- 'semantic_quality': bert_score,
33
- 'educational_score': sum(educational_indicators.values()) / len(educational_indicators),
34
- 'response_time': time.time() - start_time
35
- }
36
-
37
- def track_rag_performance(query, retrieved_docs, used_in_response):
38
- """Evaluate RAG retrieval quality"""
39
- from lighteval.metrics.utils.metric_utils import SampleLevelMetric
40
-
41
- # Track retrieval-to-response alignment
42
- retrieval_relevance = calculate_relevance(query, retrieved_docs)
43
- retrieval_usage = len(used_in_response) / len(retrieved_docs) if retrieved_docs else 0
44
-
45
- # Log to trackio with LightEval structure
46
- metric_payload = {
47
- "evaluation_id": str(uuid.uuid4()),
48
- "task": "rag_retrieval",
49
- "metrics": {
50
- "retrieval_relevance": retrieval_relevance,
51
- "retrieval_usage_rate": retrieval_usage,
52
- "num_docs_retrieved": len(retrieved_docs)
53
- },
54
- "metadata": {
55
- "query": query[:100],
56
- "sources": [doc.metadata.get('source') for doc in retrieved_docs]
57
- }
58
- }
59
-
60
- send_evaluation_to_trackio(metric_payload)
61
-
62
- def evaluate_prompt_classification(predicted_mode, actual_conversation_outcome, thread_id):
63
- """Track prompt classifier accuracy in production"""
64
-
65
- # Did the predicted mode lead to successful interaction?
66
- success_indicators = {
67
- 'discovery_mode': lambda outcome: 'clarified_topic' in outcome,
68
- 'teaching_mode': lambda outcome: outcome.get('quality_score', 0) > 3.5,
69
- 'conversational': lambda outcome: outcome.get('user_satisfied', False)
70
- }
71
-
72
- mode_was_correct = success_indicators.get(
73
- predicted_mode,
74
- lambda x: True
75
- )(actual_conversation_outcome)
76
-
77
- # Create LightEval-style evaluation
78
- from lighteval.metrics import Metrics
79
- accuracy_metric = Metrics.ACCURACY if mode_was_correct else 0
80
-
81
- return {
82
- "prompt_classifier_accuracy": accuracy_metric,
83
- "predicted_mode": predicted_mode,
84
- "conversation_length": len(conversation_state)
85
- }
86
-
87
- def process_user_feedback(response_id, feedback_type, conversation_state):
88
- """Convert user feedback to LightEval ground truth"""
89
-
90
- last_exchange = {
91
- "query": conversation_state[-2]["content"], # User's question
92
- "response": conversation_state[-1]["content"], # Agent's response
93
- "gold_index": 0 if feedback_type == "thumbs_up" else -1
94
- }
95
-
96
- # Create retrospective evaluation with ground truth
97
- from lighteval.tasks.requests import Doc
98
- doc = Doc(
99
- task_name="user_feedback_eval",
100
- query=last_exchange["query"],
101
- choices=[last_exchange["response"]],
102
- gold_index=last_exchange["gold_index"]
103
- )
104
-
105
- # Now you have ground truth for accuracy metrics!
106
- accuracy = 1.0 if feedback_type == "thumbs_up" else 0.0
107
-
108
- return {"user_feedback_accuracy": accuracy, "response_id": response_id}
109
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
  title: Mimir
3
  emoji: 📚
4
- colorFrom: indigo
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: true
10
  python_version: '3.10'
@@ -12,461 +12,137 @@ short_description: Advanced prompt engineering for educational AI systems.
12
  thumbnail: >-
13
  https://cdn-uploads.huggingface.co/production/uploads/68700e7552b74a1dcbb2a87e/Z7P8DJ57rc5P1ozA5gwp3.png
14
  hardware: zero-gpu-dynamic
15
- startup_duration_timeout: 30m
 
16
  ---
17
 
18
  # Mimir: Educational AI Assistant
19
- ## Advanced Multi-Agent Architecture & Prompt Engineering Portfolio Project
20
 
21
  ### Project Overview
22
- Mimir demonstrates enterprise-grade AI system design through a sophisticated multi-agent architecture applied to educational technology. The system showcases advanced prompt engineering, intelligent decision-making pipelines, and state-persistent conversation management. Unlike simple single-model implementations, Mimir employs **four specialized agent types** working in concert: a tool decision engine, four parallel routing agents for prompt selection, three preprocessing thinking agents for complex reasoning, and a unified response generator. This architecture prioritizes pedagogical effectiveness through dynamic context assembly, ensuring responses are tailored to each unique educational interaction.
23
 
24
  ***
25
 
26
  ### Technical Architecture
27
-
28
- **Multi-Agent System:**
29
- ```
30
- User Input → Tool Decision Agent → Routing Agents (4x) → Thinking Agents (3x) → Response Agent → Output
31
- ↓ ↓ ↓ ↓
32
- Llama-3.2-3B Llama-3.2-3B (shared) Llama-3.2-3B Llama-3.2-3B
33
- ```
34
-
35
  **Core Technologies:**
36
 
37
- * **Unified Model Architecture**: Llama-3.2-3B-Instruct (3.21B parameters) for all tasks - decision-making, reasoning, and response generation
38
- * **Lazy Loading Strategy**: Model loads on first request and caches for subsequent calls (optimal for ZeroGPU)
39
- * **Custom Orchestration**: Hand-built agent coordination replacing traditional frameworks for precise control and optimization
40
- * **State Management**: Thread-safe global state with dual persistence (SQLite + HuggingFace Datasets)
41
- * **ZeroGPU Integration**: Dynamic GPU allocation with `@spaces.GPU` decorators for efficient resource usage
42
- * **Gradio**: Multi-page interface (Chatbot + Analytics Dashboard)
43
- * **Python**: Advanced backend with 4-bit quantization and streaming
44
 
45
- **Key Frameworks & Libraries:**
46
 
47
- * `transformers` & `accelerate` for model loading and inference optimization
48
- * `bitsandbytes` for 4-bit NF4 quantization (75% memory reduction)
49
- * `peft` for Parameter-Efficient Fine-Tuning support
50
- * `spaces` for HuggingFace ZeroGPU integration
51
- * `matplotlib` for dynamic visualization generation
52
- * Custom state management system with SQLite and dataset backup
53
 
54
  ***
55
 
56
- ### Advanced Agent Architecture
57
-
58
- #### Agent Pipeline Overview
59
- The system processes each user interaction through a sophisticated four-stage pipeline, with each stage making intelligent decisions that shape the final response.
60
-
61
- #### Stage 1: Tool Decision Agent
62
- **Purpose**: Determines if visualization tools enhance learning
63
-
64
- **Model**: Llama-3.2-3B-Instruct (4-bit NF4 quantized)
65
-
66
- **Prompt Engineering**:
67
- * Highly constrained binary decision prompt (YES/NO only)
68
- * Explicit INCLUDE/EXCLUDE criteria for educational contexts
69
- * Zero-shot classification with educational domain knowledge
70
-
71
- **Decision Criteria**:
72
- ```
73
- INCLUDE: Mathematical functions, data analysis, chart interpretation,
74
- trend visualization, proportional relationships
75
-
76
- EXCLUDE: Greetings, definitions, explanations without data
77
- ```
78
-
79
- **Output**: Boolean flag activating `TOOL_USE_ENHANCEMENT` prompt segment
80
-
81
- ---
82
-
83
- #### Stage 2: Prompt Routing Agents (4 Specialized Agents)
84
- **Purpose**: Intelligent prompt segment selection through parallel analysis
85
-
86
- **Model**: Shared Llama-3.2-3B-Instruct instance (memory efficient)
87
 
88
- **Agent Specializations**:
 
 
89
 
90
- 1. **Agent 1 - Practice Question Detector**
91
- - Analyzes conversation context for practice question opportunities
92
- - Considers user's expressed understanding and learning progression
93
- - Activates: `STRUCTURE_PRACTICE_QUESTIONS`
94
 
95
- 2. **Agent 2 - Discovery Mode Classifier**
96
- - Dual-classification: vague input detection + understanding assessment
97
- - Returns: `VAUGE_INPUT`, `USER_UNDERSTANDING`, or neither
98
- - Enables guided discovery and clarification strategies
99
 
100
- 3. **Agent 3 - Follow-up Assessment Agent**
101
- - Detects if user is responding to previous practice questions
102
- - Analyzes conversation history for grading opportunities
103
- - Activates: `PRACTICE_QUESTION_FOLLOWUP` (triggers grading mode)
104
 
105
- 4. **Agent 4 - Teaching Mode Assessor**
106
- - Evaluates need for direct instruction vs. structured practice
107
- - Multi-output agent (can activate multiple prompts)
108
- - Activates: `GUIDING_TEACHING`, `STRUCTURE_PRACTICE_QUESTIONS`
109
 
110
- **Prompt Engineering Innovation**:
111
- * Each agent uses a specialized system prompt with clear decision criteria
112
- * Structured output formats for reliable parsing
113
- * Context-aware analysis incorporating full conversation history
114
- * Sequential execution prevents decision conflicts
115
 
116
- ---
 
117
 
118
- #### Stage 3: Thinking Agents (Preprocessing Layer)
119
- **Purpose**: Generate reasoning context before final response (CoT/ToT)
120
-
121
- **Model**: Llama-3.2-3B-Instruct (shared instance)
122
-
123
- **Agent Specializations**:
124
-
125
- 1. **Math Thinking Agent**
126
- - **Method**: Tree-of-Thought reasoning for mathematical problems
127
- - **Activation**: When `LATEX_FORMATTING` is active
128
- - **Output Structure**:
129
- ```
130
- Key Terms → Principles → Formulas → Step-by-Step Solution → Summary
131
- ```
132
- - **Complexity Routing**: Decision tree determines detail level (1A: basic, 1B: complex)
133
-
134
- 2. **Question/Answer Design Agent**
135
- - **Method**: Chain-of-Thought for practice question formulation
136
- - **Activation**: When `STRUCTURE_PRACTICE_QUESTIONS` is active
137
- - **Formatted Inputs**: Tool context, LaTeX guidelines, practice question templates
138
- - **Output**: Question design, data formatting, answer bank generation
139
-
140
- 3. **Reasoning Thinking Agent**
141
- - **Method**: General Chain-of-Thought preprocessing
142
- - **Activation**: When tools, follow-ups, or teaching mode active
143
- - **Output Structure**:
144
- ```
145
- User Knowledge Summary → Understanding Analysis →
146
- Previous Actions → Reference Fact Sheet
147
- ```
148
-
149
- **Prompt Engineering Innovation**:
150
- * Thinking agents produce **context for ResponseAgent**, not final output
151
- * Outputs are invisible to user but inform response quality
152
- * Tree-of-Thought (ToT) for math: explores multiple solution paths
153
- * Chain-of-Thought (CoT) for others: step-by-step reasoning traces
154
 
155
- ---
 
 
156
 
157
- #### Stage 4: Response Agent (Educational Response Generation)
158
- **Purpose**: Generate pedagogically sound final response
159
-
160
- **Model**: Llama-3.2-3B-Instruct (same shared instance)
161
-
162
- **Configuration**:
163
- * 4-bit NF4 quantization (BitsAndBytes)
164
- * Mixed precision BF16 inference
165
- * Accelerate integration for distributed computation
166
- * 128K context window
167
- * Multilingual support (8 languages)
168
-
169
- **Prompt Assembly Process**:
170
- 1. **Core Identity**: Always included (defines Mimir persona)
171
- 2. **Logical Expressions**: Regex-triggered prompts (e.g., math keywords → `LATEX_FORMATTING`)
172
- 3. **Agent-Selected Prompts**: Dynamic assembly based on routing agent decisions
173
- 4. **Context Integration**: Tool outputs, thinking agent outputs, conversation history
174
- 5. **Complete Prompt**: All segments joined with proper formatting
175
-
176
- **Dynamic Prompt Library** (11 segments):
177
- ```
178
- Core: CORE_IDENTITY (always)
179
- Formatting: GENERAL_FORMATTING (always), LATEX_FORMATTING (math)
180
- Discovery: VAUGE_INPUT, USER_UNDERSTANDING
181
- Teaching: GUIDING_TEACHING
182
- Practice: STRUCTURE_PRACTICE_QUESTIONS, PRACTICE_QUESTION_FOLLOWUP
183
- Tool: TOOL_USE_ENHANCEMENT
184
- ```
185
-
186
- **Response Post-Processing**:
187
- * Artifact cleanup (remove special tokens)
188
- * Intelligent truncation at logical breakpoints
189
- * Sentence integrity preservation
190
- * Quality validation gates
191
- * Word-by-word streaming for UX
192
 
193
- ---
 
194
 
195
- ### Model Specifications
196
-
197
- **Llama-3.2-3B-Instruct Details:**
198
- * **Parameters**: 3.21 billion
199
- * **Architecture**: Optimized transformer with Grouped-Query Attention (GQA)
200
- * **Training Data**: 9 trillion tokens (December 2023 cutoff)
201
- * **Context Length**: 128,000 tokens
202
- * **Languages**: English, German, French, Italian, Portuguese, Hindi, Spanish, Thai
203
- * **Quantization**: 4-bit NF4 (~1GB VRAM)
204
- * **Training Method**: Knowledge distillation from Llama 3.1 8B/70B + SFT + RLHF
205
-
206
- **Why Single Model Architecture:**
207
- * ✅ **Consistency**: Same reasoning style across all agents
208
- * ✅ **Memory Efficient**: One model, shared instance (~1GB total)
209
- * ✅ **Instruction-Tuned**: Optimized for educational dialogue
210
- * ✅ **Fast Inference**: 3B parameters = quick responses
211
- * ✅ **ZeroGPU Friendly**: Small enough for dynamic allocation
212
- * ✅ **128K Context**: Can handle long educational conversations
213
 
214
- ---
 
215
 
216
- ### Prompt Engineering Techniques Demonstrated
217
 
218
- #### 1. Hierarchical Prompt Architecture
219
- **Three-Layer System**:
220
- - **Agent System Prompts**: Specialized instructions for each agent type
221
- - **Response Prompt Segments**: Modular components dynamically assembled
222
- - **Thinking Prompts**: Preprocessing templates for reasoning generation
223
-
224
- **Innovation**: Separates decision-making logic from response generation, enabling precise control over AI behavior at each pipeline stage.
225
-
226
- #### 2. Per-Turn Prompt State Management
227
- **PromptStateManager**:
228
- ```python
229
- # Reset at turn start - clean slate
230
- prompt_state.reset() # All 11 prompts → False
231
-
232
- # Agents activate relevant prompts
233
- prompt_state.update("LATEX_FORMATTING", True)
234
- prompt_state.update("GUIDING_TEACHING", True)
235
-
236
- # Assemble only active prompts
237
- active_prompts = prompt_state.get_active_response_prompts()
238
- # Returns: ["CORE_IDENTITY", "GENERAL_FORMATTING",
239
- # "LATEX_FORMATTING", "GUIDING_TEACHING"]
240
- ```
241
-
242
- **Benefits**:
243
- - No prompt pollution between turns
244
- - Context-appropriate responses every time
245
- - Traceable decision-making for debugging
246
-
247
- #### 3. Logical Expression System
248
- **Regex-Based Automatic Activation**:
249
- ```python
250
- # Math keyword detection
251
- math_regex = r'\b(calculus|algebra|equation|solve|derivative)\b'
252
- if re.search(math_regex, user_input, re.IGNORECASE):
253
- prompt_state.update("LATEX_FORMATTING", True)
254
- ```
255
-
256
- **Hybrid Approach**: Combines rule-based triggers with LLM decision-making for optimal reliability.
257
-
258
- #### 4. Constraint-Based Agent Prompting
259
- **Tool Decision Example**:
260
- ```
261
- System Prompt: Analyze query and determine if visualization needed.
262
-
263
- Output Format: YES or NO (nothing else)
264
-
265
- INCLUDE if: mathematical functions, data analysis, trends
266
- EXCLUDE if: greetings, simple definitions, no data
267
- ```
268
-
269
- **Result**: Reliable, parseable outputs from agents without complex post-processing.
270
-
271
- #### 5. Chain-of-Thought & Tree-of-Thought Preprocessing
272
- **CoT for Sequential Reasoning**:
273
- ```
274
- Step 1: Assess topic →
275
- Step 2: Identify user understanding →
276
- Step 3: Previous actions →
277
- Step 4: Reference facts
278
- ```
279
-
280
- **ToT for Mathematical Reasoning**:
281
- ```
282
- Question Type Assessment →
283
- Branch 1A (Simple): Minimal steps
284
- Branch 1B (Complex): Full derivation with principles
285
- ```
286
-
287
- **Innovation**: Thinking agents generate rich context that guides ResponseAgent to higher-quality outputs.
288
-
289
- #### 6. Academic Integrity by Design
290
- **Embedded in Core Prompts**:
291
- * "Do not provide full solutions - guide through processes instead"
292
- * "Break problems into conceptual components"
293
- * "Ask clarifying questions about their understanding"
294
- * Subject-specific guidelines (Math: explain concepts, not compute)
295
-
296
- **Follow-up Grading**:
297
- * Agent 3 detects practice question responses
298
- * `PRACTICE_QUESTION_FOLLOWUP` prompt activates
299
- * Automated assessment with constructive feedback
300
-
301
- #### 7. Multi-Modal Response Generation
302
- **Tool Integration**:
303
- ```python
304
- # Tool decision → JSON generation → matplotlib rendering → base64 encoding
305
- Create_Graph_Tool(
306
- data={"Week 1": 120, "Week 2": 155, ...},
307
- plot_type="line",
308
- title="Crop Yield Analysis",
309
- educational_context="Visualizes growth trend over time"
310
- )
311
- ```
312
-
313
- **Result**: In-memory graph generation with educational context, embedded directly in response.
314
 
315
- ---
316
 
317
- ### State Management & Persistence
 
 
 
 
 
318
 
319
- #### GlobalStateManager Architecture
320
- **Dual-Layer Persistence**:
321
- 1. **SQLite Database**: Fast local access, immediate writes
322
- 2. **HuggingFace Dataset**: Cloud backup, hourly sync
323
 
324
- **State Categories**:
325
- ```python
326
- - Conversation State: Full chat history + agent context
327
- - Prompt State: Per-turn activation (resets each interaction)
328
- - Analytics State: Metrics, dashboard data, export history
329
- - Evaluation State: Quality scores, classifier accuracy, user feedback
330
- - ML Model Cache: Loaded model for reuse across sessions
331
- ```
332
 
333
- **Thread Safety**: All state operations protected by `threading.Lock()`
 
 
 
 
334
 
335
- **Cleanup Strategy**:
336
- - Automatic cleanup every 60 minutes
337
- - Remove sessions older than 24 hours
338
- - Prevents memory leaks in long-running deployments
339
 
340
- ---
 
341
 
342
- ### Model Loading & Optimization Strategy
343
-
344
- #### Two-Stage Lazy Loading Pipeline
345
-
346
- **Stage 1: Build Time (Docker) - Optional Pre-caching**
347
- ```yaml
348
- # preload_from_hub in README.md
349
- preload_from_hub:
350
- - meta-llama/Llama-3.2-3B-Instruct
351
- ```
352
- * Downloads model weights during Docker build
353
- * Cached in HuggingFace hub cache directory
354
- * Reduces first-request latency (no download needed)
355
- * **Optional but recommended** for production deployments
356
-
357
- **Stage 2: Runtime (Lazy Loading with Automatic Caching)**
358
- ```python
359
- # model_manager.py - LazyLlamaModel class
360
- def _load_model(self):
361
- """Load on first generate() call"""
362
- if self.model is not None:
363
- return # Already loaded - reuse cached instance
364
-
365
- # First call: Load with 4-bit quantization
366
- self.model = AutoModelForCausalLM.from_pretrained(
367
- "meta-llama/Llama-3.2-3B-Instruct",
368
- quantization_config=quantization_config,
369
- device_map="auto",
370
- )
371
- # Model stays in memory for all future calls
372
-
373
- # All agents share this single instance
374
- @spaces.GPU(duration=120)
375
- def _load_model(self):
376
- # GPU allocated for 120 seconds during first load
377
- # Then reused without re-allocation
378
- ```
379
-
380
- **Loading Flow**:
381
- ```
382
- App starts → Instant startup (no model loading)
383
-
384
- First user request → Triggers model load (~30-60s)
385
- ├─ Download from cache (if preloaded: instant)
386
- ├─ Load with 4-bit quantization
387
- ├─ Create pipeline
388
- └─ Cache in memory
389
-
390
- All subsequent requests → Use cached model (~1s)
391
- ```
392
-
393
- **Memory Optimization**:
394
- - **4-bit NF4 Quantization**: 75% memory reduction
395
- - Llama-3.2-3B: ~6GB → ~1GB VRAM
396
- - **Shared Model Strategy**: ALL agents share one model instance
397
- - **Singleton Pattern**: Thread-safe model caching
398
- - **Device Mapping**: Automatic distribution with ZeroGPU
399
- - **128K Context**: Long conversations without truncation
400
-
401
- **ZeroGPU Integration**:
402
- ```python
403
- @spaces.GPU(duration=120) # Dynamic allocation for first load
404
- def _load_model(self):
405
- # GPU available for 120 seconds
406
- # Loads model once on first request
407
- # Cached instance reused across all agents
408
- # Automatic GPU management by ZeroGPU
409
- ```
410
-
411
- **Performance Characteristics**:
412
- * **First Request**: 30-60 seconds (one-time model load)
413
- - With `preload_from_hub`: 30-40s (just quantization)
414
- - Without preload: 50-60s (download + quantization)
415
- * **Subsequent Requests**: <1 second per agent
416
- * **Memory Footprint**: ~1GB VRAM (persistent)
417
- * **Cold Start**: Instant app startup (model loads on demand)
418
-
419
- **Why Lazy Loading?**
420
- * ✅ **Instant Startup**: App launches immediately
421
- * ✅ **ZeroGPU Optimal**: Perfect for dynamic GPU allocation
422
- * ✅ **Memory Efficient**: Only loads when needed
423
- * ✅ **Cache Persistent**: Stays loaded between requests
424
- * ✅ **Serverless Friendly**: Ideal for HuggingFace Spaces
425
 
426
- ---
427
 
428
- ### Analytics & Evaluation System
429
-
430
- #### Built-In Dashboard
431
- **Real-Time Metrics**:
432
- * Total conversations
433
- * Average response time
434
- * Success rate (quality score >3.5)
435
- * Educational quality scores (ML-evaluated)
436
- * Classifier accuracy rates
437
- * Active sessions count
438
-
439
- **LightEval Integration**:
440
- * BertScore for semantic quality
441
- * ROUGE for response completeness
442
- * Custom educational quality indicators:
443
- - Has examples
444
- - Structured explanation
445
- - Appropriate length
446
- - Encourages learning
447
- - Uses LaTeX (for math)
448
- - Clear sections
449
-
450
- **Exportable Data**:
451
- * JSON export with full metrics
452
- * CSV export of interaction history
453
- * Programmatic access via API
454
 
455
- ---
 
456
 
457
- ### Performance Benchmarks
458
-
459
- **Runtime Performance:**
460
- * **Inference Speed**: 25-40 tokens/second (with ZeroGPU)
461
- * **Memory Usage**: ~1GB VRAM (4-bit quantization)
462
- * **Context Window**: 128K tokens
463
- * **First Request**: ~30-60 seconds (one-time load)
464
- * **Warm Inference**: <1 second per agent
465
- * **Startup Time**: Instant (lazy loading)
466
-
467
- **Llama 3.2 Quality Scores:**
468
- * MMLU: 63.4 (competitive with larger models)
469
- * GSM8K (Math): 73.9
470
- * HumanEval (Coding): 59.3
471
- * Multilingual: 8 languages supported
472
- * Safety: RLHF-aligned for educational use
 
1
  ---
2
  title: Mimir
3
  emoji: 📚
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 5.44.1
8
  app_file: app.py
9
  pinned: true
10
  python_version: '3.10'
 
12
  thumbnail: >-
13
  https://cdn-uploads.huggingface.co/production/uploads/68700e7552b74a1dcbb2a87e/Z7P8DJ57rc5P1ozA5gwp3.png
14
  hardware: zero-gpu-dynamic
15
+ hf_oauth: true
16
+ hf_oauth_expiration_minutes: 120
17
  ---
18
 
19
  # Mimir: Educational AI Assistant
20
+ ## Advanced Prompt Engineering Portfolio Project
21
 
22
  ### Project Overview
23
+ Mimir demonstrates sophisticated prompt engineering techniques applied to educational technology, showcasing the implementation of context-aware AI systems that prioritize pedagogical effectiveness over simple answer generation. A key feature is its ability to **dynamically generate custom data visualizations**, a capability governed by an intelligent decision engine that assesses whether a visual aid will enhance the pedagogical explanation. This project exemplifies professional-grade prompt design for educational applications, embodying the role of an educational partner that guides students to discover answers for themselves.
24
 
25
  ***
26
 
27
  ### Technical Architecture
 
 
 
 
 
 
 
 
28
  **Core Technologies:**
29
 
30
+ * **LangChain**: Prompt template management and conversation chain orchestration.
31
+ * **LangGraph**: Orchestrates the application's flow as a state machine (**StateGraph**). It manages the conditional logic for the tool-use decision engine, routing user queries between the LLM, a pre-built **ToolNode** for graph generation, and the final response node.
32
+ * **Gradio**: Full-stack web interface with custom CSS styling.
33
+ * **Hugging Face Inference API**: Model deployment and response generation.
34
+ * **Python**: Backend logic and integration layer.
35
+ * **Matplotlib**: Powers the dynamic, in-memory generation of educational graphs and charts.
 
36
 
37
+ **Key Frameworks:**
38
 
39
+ * `langchain.prompts.ChatPromptTemplate` for dynamic prompt construction.
40
+ * `langchain_huggingface.HuggingFaceEndpoint` for model interface.
41
+ * `langchain.schema` message objects (HumanMessage, AIMessage, SystemMessage).
42
+ * `langgraph.graph.StateGraph` & `langgraph.prebuilt.ToolNode` for building and executing the conditional logic graph.
43
+ * `langgraph.checkpoint.memory.MemorySaver` for persistent conversation state.
 
44
 
45
  ***
46
 
47
+ ### Prompt Engineering Techniques Demonstrated
48
+ #### 1. Unified System Prompt Architecture
49
+ Employs a single, comprehensive system prompt that establishes the AI's core persona as **Mimir, an expert multi-concept tutor**. This foundational prompt meticulously defines the AI's behavior, tone, and pedagogical mission. It integrates:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
+ * **Core Educational Principles**: A directive to prioritize teaching methodology, foster critical thinking, and provide comprehensive explanations over direct answers.
52
+ * **Defined Persona & Tone**: Specific instructions to maintain an engaging, supportive, and intellectually appropriate tone for high school students, while avoiding fluff and emojis.
53
+ * **Specific Response Guidelines**: Contextual rules for handling different academic tasks, such as explaining concepts in math problems instead of solving them, or discussing research strategies for essays rather than writing them.
54
 
55
+ #### 2. Instructional Design Integration
56
+ The core prompt incorporates evidence-based instructional design principles:
 
 
57
 
58
+ * **Scaffolding**: Breaking complex concepts into manageable components.
59
+ * **Socratic Method**: Guiding discovery rather than providing direct answers.
60
+ * **Metacognitive Strategies**: Teaching learning-how-to-learn approaches.
 
61
 
62
+ #### 3. Academic Integrity Constraints
63
+ Implemented ethical AI guidelines directly into the system prompt:
 
 
64
 
65
+ * Explicit instructions to avoid homework completion.
66
+ * Focus on **process over product delivery**.
67
+ * Critical thinking skill development emphasis.
 
68
 
69
+ #### 4. Two-Stage Tool-Use Prompting
70
+ A sophisticated two-stage prompting strategy governs the use of the `Create_Graph_Tool`:
 
 
 
71
 
72
+ * **Tool-Use Decision Prompt**: A highly constrained template is used by the `Tool_Decision_Engine` to determine whether a tool should be used. This prompt forces a **YES** or **NO** response based on whether a visual aid would significantly enhance learning, using explicit **INCLUDE** and **EXCLUDE** criteria.
73
+ * **Tool-Execution Guidance**: The main system prompt contains separate, explicit instructions on how to use the tool once the decision has been made. It provides the exact **JSON structure** the model must output, including fields like `data`, `plot_type`, and `educational_context`, ensuring the generated graphs are pedagogically sound.
74
 
75
+ ***
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
+ ### Advanced Implementation Features
78
+ #### Intelligent Graphing Tool Integration
79
+ A custom, dynamic visualization system was developed to provide multi-modal educational responses.
80
 
81
+ * **LLM-Powered Analysis**: For relevant queries, a targeted LLM call is made using the specialized YES/NO decision prompt.
82
+ * **Dynamic Visualization Tool (`Create_Graph_Tool`)**: Designed and implemented a custom visualization tool using **matplotlib**. The tool receives a JSON configuration from the LLM and generates high-quality bar, line, or pie charts. The entire process occurs in-memory:
83
+ * The plot is rendered into a `BytesIO` buffer.
84
+ * The image is encoded into a **base64 string**.
85
+ * The final output is an HTML `<img>` tag with the embedded base64 data, which is displayed directly in the chat interface, eliminating the need for file I/O.
86
+ * The tool's docstring provides a clear schema and usage instructions for the LLM, ensuring reliable and pedagogically sound visualizations.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
+ #### Stateful Conversation Management with LangGraph
89
+ Implements persistent, multi-turn conversations using LangGraph's **MemorySaver**. This allows the application's state, including the full message history (`add_messages`), to be saved and resumed, ensuring robust context management even when tool use is involved.
90
 
91
+ #### Response Streaming & Truncation
92
+ * Smart text truncation preserving sentence integrity.
93
+ * Real-time response streaming for improved UX.
94
+ * Error handling and fallback mechanisms.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
+ #### Template Chaining Architecture
97
+ The core logic utilizes **LangChain Expression Language (LCEL)** to pipe inputs through templates, models, and tools.
98
 
99
+ ***
100
 
101
+ ### User Interface Engineering
102
+ * **Gradio Layout & Custom Styling**: The interface is built with `gr.Blocks`, using `gr.Column` and `gr.Row` to structure the main components. A custom `styles.css` file is loaded to apply specific theming, responsive design, and layout rules, moving beyond default Gradio styling for a tailored user experience.
103
+ * **Component Architecture**: Modular Gradio component structure with custom CSS class integration and accessibility-compliant patterns.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
+ ***
106
 
107
+ ### Prompt Engineering Methodologies Applied
108
+ * **Template Parameterization**: Dynamic variable injection for contextual responses.
109
+ * **Persona-Driven Response Generation**: Crafting a detailed persona within the system prompt to guide the AI's tone, style, and pedagogical approach consistently.
110
+ * **Domain-Specific Language Modeling**: Educational vocabulary and pedagogical terminology integration.
111
+ * **Multi-Modal Response Formatting**: Structured output generation with educational formatting.
112
+ * **Agentic Tool Routing**: Designing prompts and logic that enable an AI system to intelligently decide which tool is appropriate for a given task, simulating agent-like behavior.
113
 
114
+ ***
 
 
 
115
 
116
+ ### Professional Applications
117
+ This project demonstrates competency in:
 
 
 
 
 
 
118
 
119
+ * **Enterprise-Grade Prompt Design**: Scalable template and tool-use architecture.
120
+ * **Educational Technology Integration**: Designing AI tutors with robust pedagogical frameworks and dynamic, multi-modal response capabilities.
121
+ * **Ethical AI Implementation**: Academic integrity safeguards and responsible AI practices.
122
+ * **Full-Stack AI Application Development**: End-to-end system implementation.
123
+ * **Intelligent Agent & Tool Development**: Building AI agents that can utilize custom tools to solve complex problems.
124
 
125
+ ***
 
 
 
126
 
127
+ ### Technical Specifications
128
+ **Dependencies:**
129
 
130
+ * **Core ML/AI**: `transformers`, `torch`, `accelerate`
131
+ * **LangChain & LangGraph**: `langgraph`, `langchain-core`, `langchain-community`, `langchain-huggingface`
132
+ * **UI Framework**: `gradio`
133
+ * **Visualization**: `matplotlib`, `plotly`, `pandas`, `numpy`, `scipy`
134
+ * **Utilities**: `python-dotenv`
135
+ * **Monitoring**: `langsmith` (Optional)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
+ **Deployment:**
138
 
139
+ * Hugging Face Spaces compatible.
140
+ * Environment variable configuration for API keys.
141
+ * Production-ready error handling and logging.
142
+
143
+ ***
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
+ ### Results & Impact
146
+ Mimir represents a synthesis of prompt engineering best practices with educational technology requirements. The integration of an intelligent, conditional graphing tool demonstrates the ability to create AI systems that augment and enhance human learning processes, embodying the role of an educational partner who empowers students to succeed through genuine understanding.
147
 
148
+ > **Portfolio Demonstration**: This project evidences advanced prompt engineering capabilities, full-stack AI application development, and domain-specific AI system design suitable for enterprise educational technology environments.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
agents.py DELETED
@@ -1,940 +0,0 @@
1
- # agents.py
2
- """
3
- Unified agent architecture for Mimir Educational AI Assistant.
4
-
5
- LAZY-LOADING LLAMA-3.2-3B-INSTRUCT
6
-
7
- Components:
8
- - LazyLlamaModel: Singleton lazy-loading model (loads on first use, cached thereafter)
9
- - ToolDecisionAgent: Uses lazy-loaded Llama for visualization decisions
10
- - PromptRoutingAgents: Uses lazy-loaded Llama for all 4 routing agents
11
- - ThinkingAgents: Uses lazy-loaded Llama for all reasoning (including math)
12
- - ResponseAgent: Uses lazy-loaded Llama for final responses
13
-
14
- Key optimization: Model loads on first generate() call and is cached for all
15
- subsequent requests. Single model architecture with ~1GB memory footprint.
16
- No compile or warmup scripts needed - fully automatic.
17
- """
18
-
19
- import os
20
- import re
21
- import torch
22
- import logging
23
- import time
24
- import subprocess
25
- import threading
26
- from datetime import datetime
27
- from typing import Dict, List, Optional, Tuple, Type
28
- import warnings
29
-
30
- # Setup main logger first
31
- logging.basicConfig(level=logging.INFO)
32
- logger = logging.getLogger(__name__)
33
-
34
- # ============================================================================
35
- # MEMORY PROFILING UTILITIES
36
- # ============================================================================
37
-
38
def log_memory(tag=""):
    """Log current GPU memory usage (allocated/reserved/peak, MB) at INFO level.

    Never raises: any failure while reading CUDA stats is downgraded to a
    warning so profiling can't break the request path.
    """
    try:
        if not torch.cuda.is_available():
            logger.info(f"[{tag}] No CUDA available")
            return
        mb = 1024 ** 2
        allocated = torch.cuda.memory_allocated() / mb
        reserved = torch.cuda.memory_reserved() / mb
        max_allocated = torch.cuda.max_memory_allocated() / mb
        logger.info(f"[{tag}] GPU Memory - Allocated: {allocated:.2f} MB, Reserved: {reserved:.2f} MB, Peak: {max_allocated:.2f} MB")
    except Exception as e:
        logger.warning(f"[{tag}] Error logging GPU memory: {e}")
50
-
51
-
52
def log_nvidia_smi(tag=""):
    """Log system-wide GPU memory (used/total, MiB) as reported by nvidia-smi.

    Failures (binary missing, non-zero exit) are logged as warnings only —
    expected on CPU-only hosts.
    """
    query = [
        'nvidia-smi',
        '--query-gpu=memory.used,memory.total',
        '--format=csv,noheader,nounits',
    ]
    try:
        output = subprocess.check_output(query, encoding='utf-8')
        logger.info(f"[{tag}] NVIDIA-SMI: {output.strip()}")
    except Exception as e:
        logger.warning(f"[{tag}] Error running nvidia-smi: {e}")
59
-
60
-
61
def log_step(step_name, start_time=None):
    """Log a pipeline step and return the current time.

    Without `start_time`, logs "<step> starting...". With a (truthy)
    `start_time`, logs completion with elapsed seconds. The returned
    timestamp can be fed back in as `start_time` for the completion call.
    """
    now = time.time()
    timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]

    if not start_time:
        logger.info(f"[{timestamp}] → {step_name} starting...")
    else:
        duration = now - start_time
        logger.info(f"[{timestamp}] ✓ {step_name} completed in {duration:.2f}s")

    return now
73
-
74
-
75
def profile_generation(model, tokenizer, inputs, **gen_kwargs):
    """Profile wall time (and GPU memory, when available) of model.generate().

    Args:
        model: Model exposing a HF-style .generate().
        tokenizer: Unused here; kept for interface compatibility with callers.
        inputs: Keyword tensors forwarded to generate().
        **gen_kwargs: Extra generation kwargs.

    Returns:
        (outputs, duration_seconds)
    """
    # Guard CUDA-only calls so profiling also works on CPU-only hosts
    # (the original called torch.cuda.* unconditionally and crashed there,
    # unlike log_memory which already guards).
    cuda = torch.cuda.is_available()
    if cuda:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    log_memory("Before generate()")
    start_time = time.time()

    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    duration = time.time() - start_time
    peak_memory = (torch.cuda.max_memory_allocated() / 1024**2) if cuda else 0.0

    log_memory("After generate()")
    logger.info(f"Generation completed in {duration:.2f}s. Peak GPU: {peak_memory:.2f} MB")

    return outputs, duration
94
-
95
-
96
- # ============================================================================
97
- # IMPORTS
98
- # ============================================================================
99
-
100
- # Transformers for standard models
101
- from transformers import (
102
- AutoTokenizer,
103
- AutoModelForCausalLM,
104
- BitsAndBytesConfig,
105
- )
106
-
107
# ZeroGPU support: use the real `spaces` decorator on HF Spaces, otherwise
# fall back to a no-op decorator factory so @spaces.GPU(...) stays valid.
try:
    import spaces
    HF_SPACES_AVAILABLE = True
except ImportError:
    HF_SPACES_AVAILABLE = False

    class DummySpaces:
        """No-op stand-in for the `spaces` module outside HF Spaces."""

        @staticmethod
        def GPU(duration=90):
            """Return a pass-through decorator mirroring spaces.GPU(duration=...)."""
            return lambda func: func

    spaces = DummySpaces()
120
-
121
- # Accelerate
122
- from accelerate import Accelerator
123
- from accelerate.utils import set_seed
124
-
125
- # LangChain Core for proper message handling
126
- from langchain_core.runnables import Runnable
127
- from langchain_core.runnables.utils import Input, Output
128
- from langchain_core.messages import SystemMessage, HumanMessage
129
-
130
- # Import ALL prompts from prompt library
131
- from prompt_library import (
132
- # System prompts
133
- CORE_IDENTITY,
134
- TOOL_DECISION,
135
- agent_1_system,
136
- agent_2_system,
137
- agent_3_system,
138
- agent_4_system,
139
-
140
- # Thinking agent system prompts
141
- MATH_THINKING,
142
- QUESTION_ANSWER_DESIGN,
143
- REASONING_THINKING,
144
-
145
- # Response agent prompts (dynamically applied)
146
- VAUGE_INPUT,
147
- USER_UNDERSTANDING,
148
- GENERAL_FORMATTING,
149
- LATEX_FORMATTING,
150
- GUIDING_TEACHING,
151
- STRUCTURE_PRACTICE_QUESTIONS,
152
- PRACTICE_QUESTION_FOLLOWUP,
153
- TOOL_USE_ENHANCEMENT,
154
- )
155
-
156
- # ============================================================================
157
- # MODEL MANAGER - LAZY LOADING
158
- # ============================================================================
159
- # Import the lazy-loading Llama-3.2-3B model manager
160
- from model_manager import get_model as get_shared_llama, LazyLlamaModel as LlamaSharedAgent
161
-
162
- # Backwards compatibility aliases
163
- get_shared_mistral = get_shared_llama
164
- MistralSharedAgent = LlamaSharedAgent
165
-
166
- # ============================================================================
167
- # CONFIGURATION
168
- # ============================================================================
169
-
170
- CACHE_DIR = "/tmp/compiled_models"
171
- HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
172
-
173
- # Suppress warnings
174
- warnings.filterwarnings("ignore", category=UserWarning)
175
- warnings.filterwarnings("ignore", category=FutureWarning)
176
-
177
- # Model info (for logging/diagnostics)
178
- LLAMA_MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
179
-
180
-
181
def check_model_cache() -> Dict[str, bool]:
    """Report model readiness (legacy compatibility shim).

    The lazy-loading Llama manager needs no precompilation, so every
    entry is unconditionally True.
    """
    logger.info("✓ Llama-3.2-3B uses lazy loading (loads on first generate() call)")
    return {
        "llama": True,  # Lazy-loaded on first use
        "all_compiled": True,
    }


# Call at module load
_cache_status = check_model_cache()
log_memory("Module load complete")
196
-
197
-
198
- # ============================================================================
199
- # TOOL DECISION AGENT
200
- # ============================================================================
201
-
202
class ToolDecisionAgent:
    """
    Decides whether visualization/graphing tools should be used for a query.

    Backed by the lazy-loaded shared Llama-3.2-3B: the model loads on the
    first generate() call and is cached for subsequent requests.

    decide() returns a boolean (True = use tools, False = skip tools).
    """

    def __init__(self):
        """Bind the shared lazy-loaded Llama model."""
        self.model = get_shared_llama()
        logger.info("ToolDecisionAgent initialized (using lazy-loaded Llama)")

    def decide(self, user_query: str, conversation_history: List[Dict]) -> bool:
        """
        Decide if graphing tools should be used.

        Args:
            user_query: Current user message.
            conversation_history: Full conversation context (role/content dicts).

        Returns:
            bool: True if tools should be used; False on any model error.
        """
        logger.info("→ ToolDecisionAgent: Analyzing query for tool usage")

        # Only the most recent turns matter for the YES/NO decision.
        recent = conversation_history[-3:]
        context = "\n".join(f"{msg['role']}: {msg['content']}" for msg in recent)

        analysis_prompt = (
            f"Previous conversation:\n{context}\n\n"
            f"Current query: {user_query}\n\n"
            "Should visualization tools (graphs, charts) be used?"
        )

        try:
            started = time.time()
            response = self.model.generate(
                system_prompt=TOOL_DECISION,
                user_message=analysis_prompt,
                max_tokens=10,
                temperature=0.1,
            )
            decision_time = time.time() - started

            # The decision prompt constrains the model to YES/NO output.
            decision = "YES" in response.upper()
            logger.info(f"✓ ToolDecision: {'USE TOOLS' if decision else 'NO TOOLS'} ({decision_time:.2f}s)")
            return decision
        except Exception as e:
            logger.error(f"ToolDecisionAgent error: {e}")
            return False  # Default: no tools
267
-
268
-
269
- # ============================================================================
270
- # PROMPT ROUTING AGENTS (4 Specialized Agents)
271
- # ============================================================================
272
-
273
class PromptRoutingAgents:
    """
    Four specialized agents for prompt segment selection.
    All share the same lazy-loaded Llama-3.2-3B instance for efficiency.

    Agents:
        1. Practice Question Detector
        2. Discovery Mode Classifier
        3. Follow-up Assessment
        4. Teaching Mode Assessor
    """

    def __init__(self):
        """Bind the shared Llama model used by all four agents."""
        self.model = get_shared_llama()
        logger.info("PromptRoutingAgents initialized (4 agents, shared Llama)")

    @staticmethod
    def _format_context(conversation_history: List[Dict], turns: int) -> str:
        """Render the last `turns` messages as 'role: content' lines (shared by agents 1/2/4)."""
        return "\n".join(
            f"{msg['role']}: {msg['content']}"
            for msg in conversation_history[-turns:]
        )

    def agent_1_practice_question(self, user_query: str, conversation_history: List[Dict]) -> bool:
        """Agent 1: Detect if practice questions should be generated. False on error."""
        logger.info("→ Agent 1: Analyzing for practice question opportunity")

        context = self._format_context(conversation_history, 4)
        analysis_prompt = f"""Conversation:
{context}

New query: {user_query}

Should I create practice questions?"""

        try:
            response = self.model.generate(
                system_prompt=agent_1_system,
                user_message=analysis_prompt,
                max_tokens=10,
                temperature=0.1,
            )
            decision = "YES" in response.upper()
            logger.info(f"✓ Agent 1: {'PRACTICE QUESTIONS' if decision else 'NO PRACTICE'}")
            return decision
        except Exception as e:
            logger.error(f"Agent 1 error: {e}")
            return False

    def agent_2_discovery_mode(self, user_query: str, conversation_history: List[Dict]) -> Tuple[bool, bool]:
        """Agent 2: Classify vague input and understanding level.

        Returns:
            (is_vague, low_understanding); (False, False) on error.
        """
        logger.info("→ Agent 2: Classifying discovery mode")

        context = self._format_context(conversation_history, 3)
        analysis_prompt = f"""Conversation:
{context}

Query: {user_query}

Classification:
1. Is input vague? (VAGUE/CLEAR)
2. Understanding level? (LOW/MEDIUM/HIGH)"""

        try:
            response = self.model.generate(
                system_prompt=agent_2_system,
                user_message=analysis_prompt,
                max_tokens=20,
                temperature=0.1,
            )
            # Keyword parse of the constrained classification output.
            vague = "VAGUE" in response.upper()
            low_understanding = "LOW" in response.upper()
            logger.info(f"✓ Agent 2: Vague={vague}, LowUnderstanding={low_understanding}")
            return vague, low_understanding
        except Exception as e:
            logger.error(f"Agent 2 error: {e}")
            return False, False

    def agent_3_followup_assessment(self, user_query: str, conversation_history: List[Dict]) -> bool:
        """Agent 3: Detect if the user is responding to practice questions. False on error."""
        logger.info("→ Agent 3: Checking for practice question follow-up")

        # Need at least one prior exchange before a follow-up is possible.
        if len(conversation_history) < 2:
            return False

        last_bot_msg = None
        for msg in reversed(conversation_history):
            if msg['role'] == 'assistant':
                last_bot_msg = msg['content']
                break
        if not last_bot_msg:
            return False

        # Cheap lexical screen before spending an LLM call.
        has_practice = any(marker in last_bot_msg.lower() for marker in [
            "practice", "try this", "solve", "calculate", "what is", "question"
        ])
        if not has_practice:
            return False

        analysis_prompt = f"""Previous message (from me):
{last_bot_msg[:500]}

User response:
{user_query}

Is user answering a practice question?"""

        try:
            response = self.model.generate(
                system_prompt=agent_3_system,
                user_message=analysis_prompt,
                max_tokens=10,
                temperature=0.1,
            )
            is_followup = "YES" in response.upper()
            logger.info(f"✓ Agent 3: {'GRADING MODE' if is_followup else 'NOT FOLLOWUP'}")
            return is_followup
        except Exception as e:
            logger.error(f"Agent 3 error: {e}")
            return False

    def agent_4_teaching_mode(self, user_query: str, conversation_history: List[Dict]) -> Tuple[bool, bool]:
        """Agent 4: Assess teaching vs practice mode.

        Returns:
            (needs_teaching, needs_practice); (False, False) on error.
        """
        logger.info("→ Agent 4: Assessing teaching mode")

        context = self._format_context(conversation_history, 3)
        analysis_prompt = f"""Conversation:
{context}

Query: {user_query}

Assessment:
1. Need direct teaching? (TEACH/PRACTICE)
2. Create practice questions? (YES/NO)"""

        try:
            response = self.model.generate(
                system_prompt=agent_4_system,
                user_message=analysis_prompt,
                max_tokens=15,
                temperature=0.1,
            )
            teaching = "TEACH" in response.upper()
            # NOTE(review): "PRACTICE" here also matches the answer to Q1,
            # so practice is True whenever the model answers PRACTICE there —
            # presumably intentional; confirm against prompt_library.
            practice = "YES" in response.upper() or "PRACTICE" in response.upper()
            logger.info(f"✓ Agent 4: Teaching={teaching}, Practice={practice}")
            return teaching, practice
        except Exception as e:
            logger.error(f"Agent 4 error: {e}")
            return False, False

    def process(
        self,
        user_input: str,
        tool_used: bool = False,
        conversation_history: Optional[List[Dict]] = None
    ) -> Tuple[str, str]:
        """
        Unified process method — runs all 4 routing agents sequentially.

        Returns:
            Tuple[str, str]: (response_prompts, thinking_prompts). The second
            element is always "" — thinking prompts are decided elsewhere.
        """
        if conversation_history is None:
            conversation_history = []

        response_prompts: List[str] = []

        # Agent 1: Practice Questions
        if self.agent_1_practice_question(user_input, conversation_history):
            response_prompts.append("STRUCTURE_PRACTICE_QUESTIONS")

        # Agent 2: Discovery Mode
        is_vague, low_understanding = self.agent_2_discovery_mode(user_input, conversation_history)
        if is_vague:
            response_prompts.append("VAUGE_INPUT")
        if low_understanding:
            response_prompts.append("USER_UNDERSTANDING")

        # Agent 3: Follow-up Assessment
        if self.agent_3_followup_assessment(user_input, conversation_history):
            response_prompts.append("PRACTICE_QUESTION_FOLLOWUP")

        # Agent 4: Teaching Mode (needs_practice is currently unused here)
        needs_teaching, needs_practice = self.agent_4_teaching_mode(user_input, conversation_history)
        if needs_teaching:
            response_prompts.append("GUIDING_TEACHING")

        # Always add base formatting
        response_prompts.extend(["GENERAL_FORMATTING", "LATEX_FORMATTING"])

        # Tool enhancement if used
        if tool_used:
            response_prompts.append("TOOL_USE_ENHANCEMENT")

        return "\n".join(response_prompts), ""
514
-
515
- # ============================================================================
516
- # THINKING AGENTS (Preprocessing Layer)
517
- # ============================================================================
518
-
519
class ThinkingAgents:
    """
    Generates reasoning context before the final response.
    Uses the shared lazy-loaded Llama-3.2-3B for all thinking (including math).

    Agents:
        1. Math Thinking (Tree-of-Thought)
        2. Q&A Design (Chain-of-Thought)
        3. General Reasoning (Chain-of-Thought)
    """

    def __init__(self):
        """Bind the shared Llama model used by every thinking agent."""
        self.model = get_shared_llama()
        logger.info("ThinkingAgents initialized (using shared Llama for all thinking)")

    @staticmethod
    def _format_context(conversation_history: List[Dict], turns: int) -> str:
        """Render the last `turns` messages as 'role: content' lines (shared helper)."""
        return "\n".join(
            f"{msg['role']}: {msg['content']}"
            for msg in conversation_history[-turns:]
        )

    def math_thinking(
        self,
        user_query: str,
        conversation_history: List[Dict],
        tool_context: str = ""
    ) -> str:
        """Generate mathematical reasoning (Tree-of-Thought). Returns '' on error."""
        logger.info("→ Math Thinking Agent: Generating reasoning")

        context = self._format_context(conversation_history, 3)
        thinking_prompt = f"""Conversation context:
{context}

Current query: {user_query}

{f"Tool output: {tool_context}" if tool_context else ""}

Generate mathematical reasoning:"""

        try:
            thinking_start = time.time()
            reasoning = self.model.generate(
                system_prompt=MATH_THINKING,
                user_message=thinking_prompt,
                max_tokens=300,
                temperature=0.7,
            )
            thinking_time = time.time() - thinking_start
            logger.info(f"✓ Math Thinking: Generated {len(reasoning)} chars ({thinking_time:.2f}s)")
            return reasoning
        except Exception as e:
            logger.error(f"Math Thinking error: {e}")
            return ""

    def qa_design_thinking(
        self,
        user_query: str,
        conversation_history: List[Dict],
        tool_context: str = ""
    ) -> str:
        """Generate practice-question design reasoning. Returns '' on error."""
        logger.info("→ Q&A Design Agent: Generating question strategy")

        context = self._format_context(conversation_history, 3)
        thinking_prompt = f"""Context:
{context}

Query: {user_query}

{f"Tool data: {tool_context}" if tool_context else ""}

Design practice questions:"""

        try:
            reasoning = self.model.generate(
                system_prompt=QUESTION_ANSWER_DESIGN,
                user_message=thinking_prompt,
                max_tokens=250,
                temperature=0.7,
            )
            logger.info(f"✓ Q&A Design: Generated {len(reasoning)} chars")
            return reasoning
        except Exception as e:
            logger.error(f"Q&A Design error: {e}")
            return ""

    def process(
        self,
        user_input: str,
        conversation_history: str = "",
        thinking_prompts: str = "",
        tool_img_output: str = "",
        tool_context: str = ""
    ) -> str:
        """
        Unified process method — runs thinking agents based on active prompts.

        Args:
            user_input: User's query.
            conversation_history: Formatted "role: content" lines, one per turn.
            thinking_prompts: Newline-separated thinking prompt names to activate.
            tool_img_output: HTML output from the visualization tool (unused here).
            tool_context: Context from tool usage, forwarded to each agent.

        Returns:
            str: Combined thinking context from all activated agents ('' if none).
        """
        thinking_outputs: List[str] = []

        # Rehydrate the formatted history string into role/content dicts.
        history_list: List[Dict] = []
        if conversation_history and conversation_history != "No previous conversation":
            for line in conversation_history.split('\n'):
                if ':' in line:
                    role, content = line.split(':', 1)
                    history_list.append({'role': role.strip(), 'content': content.strip()})

        # Which thinking agents to run, keyed off the prompt names.
        prompt_list = [p.strip() for p in thinking_prompts.split('\n') if p.strip()]

        # Math Thinking
        if any('MATH' in p.upper() for p in prompt_list):
            math_output = self.math_thinking(
                user_query=user_input,
                conversation_history=history_list,
                tool_context=tool_context
            )
            if math_output:
                thinking_outputs.append(f"[Mathematical Reasoning]\n{math_output}")

        # Q&A Design Thinking
        if any('PRACTICE' in p.upper() or 'QUESTION' in p.upper() for p in prompt_list):
            qa_output = self.qa_design_thinking(
                user_query=user_input,
                conversation_history=history_list,
                tool_context=tool_context
            )
            if qa_output:
                thinking_outputs.append(f"[Practice Question Design]\n{qa_output}")

        # General Reasoning (fallback, or when explicitly requested)
        if not thinking_outputs or any('REASONING' in p.upper() for p in prompt_list):
            general_output = self.general_reasoning(
                user_query=user_input,
                conversation_history=history_list,
                tool_context=tool_context
            )
            if general_output:
                thinking_outputs.append(f"[General Reasoning]\n{general_output}")

        combined_thinking = "\n\n".join(thinking_outputs) if thinking_outputs else ""
        if combined_thinking:
            logger.info(f"✓ Thinking complete: {len(combined_thinking)} chars from {len(thinking_outputs)} agents")
        return combined_thinking

    def general_reasoning(
        self,
        user_query: str,
        conversation_history: List[Dict],
        tool_context: str = ""
    ) -> str:
        """Generate general reasoning context. Returns '' on error."""
        logger.info("→ General Reasoning Agent: Generating context")

        context = self._format_context(conversation_history, 4)
        thinking_prompt = f"""Conversation:
{context}

Query: {user_query}

{f"Context: {tool_context}" if tool_context else ""}

Analyze and provide reasoning:"""

        try:
            reasoning = self.model.generate(
                system_prompt=REASONING_THINKING,
                user_message=thinking_prompt,
                max_tokens=200,
                temperature=0.7,
            )
            logger.info(f"✓ General Reasoning: Generated {len(reasoning)} chars")
            return reasoning
        except Exception as e:
            logger.error(f"General Reasoning error: {e}")
            return ""
729
-
730
-
731
- # ============================================================================
732
- # RESPONSE AGENT (Final Response Generation)
733
- # ============================================================================
734
-
735
class ResponseAgent(Runnable):
    """
    Generates final educational responses using the lazy-loaded Llama-3.2-3B.
    The model loads automatically on first use.

    Features:
    - Dynamic system-prompt assembly based on agent decisions
    - Streaming output via stream()
    - Educational tone enforcement and LaTeX support
    - Context integration (thinking outputs, tool outputs)
    """

    def __init__(self):
        """Bind the lazy-loaded shared Llama model."""
        super().__init__()
        self.model = get_shared_llama()
        logger.info("ResponseAgent initialized (using lazy-loaded Llama)")

    def invoke(self, input_data: Dict) -> Dict:
        """
        Generate the final response (non-streaming).

        Args:
            input_data: {
                'user_query': str,
                'conversation_history': List[Dict],
                'active_prompts': List[str],
                'thinking_context': str,
                'tool_context': str,
            }

        Returns:
            {'response': str, 'metadata': Dict} — on error, a canned apology
            plus {'error': str} metadata.
        """
        logger.info("→ ResponseAgent: Generating final response")

        user_query = input_data.get('user_query', '')
        conversation_history = input_data.get('conversation_history', [])
        active_prompts = input_data.get('active_prompts', [])
        thinking_context = input_data.get('thinking_context', '')
        tool_context = input_data.get('tool_context', '')

        system_prompt = self._build_system_prompt(active_prompts)
        user_message = self._build_user_message(
            user_query,
            conversation_history,
            thinking_context,
            tool_context
        )

        try:
            response_start = time.time()

            # Generation (streaming is handled separately via stream()).
            response = self.model.generate(
                system_prompt=system_prompt,
                user_message=user_message,
                max_tokens=600,
                temperature=0.7
            )

            response_time = time.time() - response_start
            response = self._clean_response(response)

            logger.info(f"✓ ResponseAgent: Generated {len(response)} chars ({response_time:.2f}s)")

            return {
                'response': response,
                'metadata': {
                    'generation_time': response_time,
                    'model': LLAMA_MODEL_ID,
                    'active_prompts': active_prompts
                }
            }
        except Exception as e:
            logger.error(f"ResponseAgent error: {e}")
            return {
                'response': "I apologize, but I encountered an error generating a response. Please try again.",
                'metadata': {'error': str(e)}
            }

    def _build_system_prompt(self, active_prompts: List[str]) -> str:
        """Assemble the system prompt from active segment names.

        CORE_IDENTITY and GENERAL_FORMATTING are always included; active
        segments are appended in order, skipping duplicates and unknown names.
        """
        prompt_map = {
            'CORE_IDENTITY': CORE_IDENTITY,
            'GENERAL_FORMATTING': GENERAL_FORMATTING,
            'LATEX_FORMATTING': LATEX_FORMATTING,
            'VAUGE_INPUT': VAUGE_INPUT,
            'USER_UNDERSTANDING': USER_UNDERSTANDING,
            'GUIDING_TEACHING': GUIDING_TEACHING,
            'STRUCTURE_PRACTICE_QUESTIONS': STRUCTURE_PRACTICE_QUESTIONS,
            'PRACTICE_QUESTION_FOLLOWUP': PRACTICE_QUESTION_FOLLOWUP,
            'TOOL_USE_ENHANCEMENT': TOOL_USE_ENHANCEMENT,
        }

        segments = [CORE_IDENTITY, GENERAL_FORMATTING]
        for prompt_name in active_prompts:
            if prompt_name in prompt_map and prompt_map[prompt_name] not in segments:
                segments.append(prompt_map[prompt_name])

        return "\n\n".join(segments)

    def _build_user_message(
        self,
        user_query: str,
        conversation_history: List[Dict],
        thinking_context: str,
        tool_context: str
    ) -> str:
        """Build the user message: recent history, hidden reasoning/tool context, query."""
        parts = []

        # Conversation history (last 3 turns, each message capped at 200 chars)
        if conversation_history:
            history_text = "\n".join([
                f"{msg['role']}: {msg['content'][:200]}"
                for msg in conversation_history[-3:]
            ])
            parts.append(f"Recent conversation:\n{history_text}")

        # Thinking context (invisible to user, guides response)
        if thinking_context:
            parts.append(f"[Internal reasoning context]: {thinking_context}")

        # Tool context
        if tool_context:
            parts.append(f"[Tool output]: {tool_context}")

        parts.append(f"Student query: {user_query}")

        return "\n\n".join(parts)

    def _clean_response(self, response: str) -> str:
        """Strip model artifacts and drop a trailing incomplete sentence."""
        artifacts = ['<|im_end|>', '<|endoftext|>', '###', '<|end|>']
        for artifact in artifacts:
            response = response.replace(artifact, '')

        # If the response doesn't end on sentence punctuation, cut at the
        # RIGHTMOST sentence terminator. (The original checked '. ' before
        # '! '/'? ' and truncated at the last '. ' even when a later '!'/'?'
        # ended a more recent complete sentence, dropping whole sentences.)
        if response and response[-1] not in '.!?':
            cut = max(response.rfind(d) for d in ('. ', '! ', '? '))
            if cut != -1:
                response = response[:cut + 1]  # keep the punctuation char

        return response.strip()

    def stream(self, input_data: Dict):
        """
        Stream the response chunk-by-chunk.

        Yields:
            str: Response chunks; a canned apology on error.
        """
        logger.info("→ ResponseAgent: Streaming response")

        system_prompt = self._build_system_prompt(input_data.get('active_prompts', []))
        user_message = self._build_user_message(
            input_data.get('user_query', ''),
            input_data.get('conversation_history', []),
            input_data.get('thinking_context', ''),
            input_data.get('tool_context', '')
        )

        try:
            for chunk in self.model.generate_streaming(
                system_prompt=system_prompt,
                user_message=user_message,
                max_tokens=600,
                temperature=0.7
            ):
                yield chunk
        except Exception as e:
            logger.error(f"Streaming error: {e}")
            yield "I apologize, but I encountered an error. Please try again."
927
-
928
-
929
- # ============================================================================
930
- # MODULE INITIALIZATION
931
- # ============================================================================
932
-
933
- logger.info("="*60)
934
- logger.info("MIMIR AGENTS MODULE INITIALIZED")
935
- logger.info("="*60)
936
- logger.info(f" Model: Llama-3.2-3B-Instruct (lazy-loaded)")
937
- logger.info(f" Agents: Tool, Routing (4x), Thinking (3x), Response")
938
- logger.info(f" Memory: ~1GB (loads on first use)")
939
- logger.info(f" Architecture: Single unified model with caching")
940
- logger.info("="*60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
gradio_analytics.py DELETED
@@ -1,538 +0,0 @@
1
- # gradio_analytics.py
2
- import gradio as gr
3
- import logging
4
- import json
5
- import sqlite3
6
- import os
7
- from datetime import datetime
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
- try:
12
- from app import (
13
- get_trackio_database_path,
14
- get_project_statistics_with_nulls,
15
- get_recent_interactions_with_nulls,
16
- create_dashboard_html_with_nulls,
17
- calculate_response_quality,
18
- refresh_analytics_data_persistent as refresh_analytics_data,
19
- export_metrics_json_persistent as export_metrics_json,
20
- export_metrics_csv_persistent as export_metrics_csv,
21
- load_analytics_state,
22
- get_global_state_debug_info,
23
- sync_trackio_with_global_state,
24
- global_state_manager,
25
- evaluate_educational_quality_with_tracking,
26
- )
27
- except ImportError:
28
- def get_trackio_database_path(project_name):
29
- return None
30
-
31
- def get_project_statistics_with_nulls(cursor, project_name):
32
- return {
33
- "total_conversations": None,
34
- "avg_session_length": None,
35
- "success_rate": None
36
- }
37
-
38
- def get_recent_interactions_with_nulls(cursor, project_name, limit=10):
39
- return []
40
-
41
- def create_dashboard_html_with_nulls(project_name, project_stats):
42
- return f"<div>Mock dashboard for {project_name}</div>"
43
-
44
- def calculate_response_quality(response):
45
- return 3.0
46
-
47
- def refresh_analytics_data():
48
- return {}, [], "<div>Mock analytics</div>"
49
-
50
- def export_metrics_json():
51
- gr.Info("Mock JSON export")
52
-
53
- def export_metrics_csv():
54
- gr.Info("Mock CSV export")
55
-
56
- def load_analytics_state():
57
- return {}, [], "<div>Mock analytics state</div>"
58
-
59
- def get_global_state_debug_info():
60
- return {"status": "mock"}
61
-
62
- def sync_trackio_with_global_state():
63
- pass
64
-
65
- def evaluate_educational_quality_with_tracking(*args, **kwargs):
66
- return {"educational_score": 0.5}
67
-
68
- class MockStateManager:
69
- def get_cache_status(self):
70
- return {"status": "mock"}
71
- def get_evaluation_summary(self, include_history=False):
72
- return {"aggregate_metrics": {}, "total_evaluations": {}}
73
- def clear_all_states(self):
74
- pass
75
- def _backup_to_hf_dataset(self):
76
- pass
77
-
78
- global_state_manager = MockStateManager()
79
-
80
- def load_custom_css():
81
- try:
82
- with open("styles.css", "r", encoding="utf-8") as css_file:
83
- css_content = css_file.read()
84
- logger.info(f"CSS loaded successfully for analytics page")
85
- return css_content
86
- except FileNotFoundError:
87
- logger.warning("styles.css file not found for analytics page")
88
- return ""
89
- except Exception as e:
90
- logger.warning(f"Error reading styles.css: {e}")
91
- return ""
92
-
93
- def show_cache_info():
94
- try:
95
- from pathlib import Path
96
- from huggingface_hub import scan_cache_dir
97
-
98
- cache_info = scan_cache_dir(cache_dir="/tmp/huggingface")
99
-
100
- info_text = f"""
101
- **HuggingFace Cache Status:**
102
-
103
- **Total Size:** {cache_info.size_on_disk / (1024**3):.2f} GB
104
- **Number of Repos:** {len(cache_info.repos)}
105
-
106
- **Cached Models:**
107
- """
108
-
109
- for repo in cache_info.repos:
110
- size_gb = repo.size_on_disk / (1024**3)
111
- info_text += f"""
112
- - **{repo.repo_id}**
113
- - Size: {size_gb:.2f} GB
114
- - Type: {repo.repo_type}
115
- - Revisions: {len(repo.revisions)}
116
- """
117
-
118
- return info_text
119
-
120
- except Exception as e:
121
- return f"Error inspecting cache: {str(e)}"
122
-
123
- def launch_external_trackio():
124
- try:
125
- import subprocess
126
- result = subprocess.run(
127
- ["trackio", "show", "--project", "Mimir"],
128
- capture_output=False,
129
- text=True
130
- )
131
-
132
- if result.returncode == 0:
133
- gr.Info("Trackio dashboard launched in browser")
134
- else:
135
- gr.Warning("Could not launch trackio dashboard")
136
-
137
- except Exception as e:
138
- logger.error(f"Failed to launch trackio: {e}")
139
- gr.Warning(f"Failed to launch trackio dashboard: {str(e)}")
140
-
141
- def show_cache_status():
142
- try:
143
- debug_info = get_global_state_debug_info()
144
- cache_status = debug_info.get("cache_status", {})
145
-
146
- status_text = f"""
147
- **Global State Cache Status:**
148
- - Session ID: {cache_status.get('session_id', 'Unknown')}
149
- - Analytics Cached: {'Yes' if cache_status.get('analytics_cached') else 'No'}
150
- - Conversation Cached: {'Yes' if cache_status.get('conversation_cached') else 'No'}
151
- - Analytics Last Refresh: {cache_status.get('analytics_last_refresh', 'Never')}
152
- - Total Analytics Sessions: {cache_status.get('total_analytics_sessions', 0)}
153
- - Total Conversation Sessions: {cache_status.get('total_conversation_sessions', 0)}
154
-
155
- **Analytics Data Status:**
156
- - Has Analytics Data: {'Yes' if cache_status.get('analytics_has_data') else 'No'}
157
- - Conversation Length: {cache_status.get('conversation_length', 0)} messages
158
- - Chat History Length: {cache_status.get('chat_history_length', 0)} messages
159
-
160
- *Last Updated: {datetime.now().strftime('%H:%M:%S')}*
161
- """
162
-
163
- gr.Info("Cache status updated - check the Status panel")
164
- return status_text
165
-
166
- except Exception as e:
167
- error_text = f"Error getting cache status: {str(e)}"
168
- gr.Warning(error_text)
169
- return error_text
170
-
171
- def manual_backup_to_hf():
172
- try:
173
- global_state_manager._backup_to_hf_dataset()
174
- gr.Info("Manual backup to HF dataset completed successfully")
175
- return f"Backup completed at {datetime.now().strftime('%H:%M:%S')}"
176
- except Exception as e:
177
- gr.Warning(f"Backup failed: {str(e)}")
178
- return f"Backup failed: {str(e)}"
179
-
180
- def get_persistence_status():
181
- try:
182
- status_info = {
183
- "SQLite DB": "Active" if os.path.exists(global_state_manager._db_path) else "Not Found",
184
- "HF Dataset": global_state_manager.dataset_repo,
185
- "Last HF Backup": global_state_manager._last_hf_backup.strftime('%Y-%m-%d %H:%M:%S'),
186
- "DB Path": global_state_manager._db_path,
187
- "Backup Interval": f"{global_state_manager._hf_backup_interval}s"
188
- }
189
- return status_info
190
- except Exception as e:
191
- return {"error": str(e)}
192
-
193
- def clear_all_global_states():
194
- try:
195
- global_state_manager.clear_all_states()
196
- gr.Info("All global states cleared successfully")
197
-
198
- empty_stats = {
199
- "total_conversations": None,
200
- "avg_session_length": None,
201
- "success_rate": None,
202
- "model_type": "Cleared",
203
- "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
204
- }
205
-
206
- empty_html = """
207
- <div style="text-align: center; padding: 40px; border: 2px dashed #ccc; border-radius: 8px; background: #f8f9fa;">
208
- <h3>States Cleared</h3>
209
- <p>All global states have been cleared.</p>
210
- <p>Click "Refresh Data" to reload analytics.</p>
211
- </div>
212
- """
213
-
214
- return empty_stats, [], empty_html
215
-
216
- except Exception as e:
217
- gr.Warning(f"Failed to clear states: {str(e)}")
218
- return load_analytics_state()
219
-
220
- def show_evaluation_metrics():
221
- try:
222
- eval_summary = global_state_manager.get_evaluation_summary(include_history=True)
223
-
224
- metrics_data = [
225
- ["Educational Quality", f"{eval_summary['aggregate_metrics']['avg_educational_quality']:.3f}"],
226
- ["User Satisfaction", f"{eval_summary['aggregate_metrics']['user_satisfaction_rate']:.3f}"]
227
- ]
228
-
229
- recent_evaluations = []
230
- if 'history' in eval_summary:
231
- for eval_item in eval_summary['history']['recent_educational_scores'][-5:]:
232
- recent_evaluations.append([
233
- eval_item['timestamp'][:16],
234
- f"{eval_item['educational_score']:.3f}",
235
- f"{eval_item['semantic_quality']:.3f}",
236
- f"{eval_item['response_time']:.3f}s"
237
- ])
238
-
239
- return eval_summary, metrics_data, recent_evaluations
240
-
241
- except Exception as e:
242
- logger.error(f"Error getting evaluation metrics: {e}")
243
- return {}, [], []
244
-
245
- def sync_and_refresh_all():
246
- try:
247
- sync_trackio_with_global_state()
248
- project_stats, recent_interactions, dashboard_html = refresh_analytics_data()
249
- eval_summary, metrics_data, recent_evaluations = show_evaluation_metrics()
250
-
251
- gr.Info("All data synced and refreshed successfully")
252
-
253
- return project_stats, recent_interactions, dashboard_html, eval_summary, metrics_data, recent_evaluations
254
-
255
- except Exception as e:
256
- logger.error(f"Sync and refresh failed: {e}")
257
- gr.Warning(f"Sync failed: {str(e)}")
258
- return load_analytics_state() + ({}, [], [])
259
-
260
- with gr.Blocks() as demo:
261
- custom_css = load_custom_css()
262
- if custom_css:
263
- gr.HTML(f'<style>{custom_css}</style>')
264
-
265
- gr.HTML('<div class="analytics-title"><h2>Mimir Analytics Dashboard</h2></div>')
266
-
267
- gr.Markdown("Monitor educational AI performance and effectiveness metrics with persistent state management.")
268
-
269
- with gr.Tabs():
270
- with gr.TabItem("Traditional Analytics"):
271
- with gr.Row():
272
- with gr.Column(scale=1):
273
- gr.Markdown("## Controls")
274
- refresh_btn = gr.Button("Refresh Data", variant="primary")
275
- sync_all_btn = gr.Button("Sync & Refresh All", variant="primary")
276
-
277
- with gr.Row():
278
- export_json_btn = gr.Button("Export JSON", variant="secondary", size="sm")
279
- export_csv_btn = gr.Button("Export CSV", variant="secondary", size="sm")
280
-
281
- launch_trackio_btn = gr.Button("Launch Trackio Dashboard", variant="secondary")
282
-
283
- gr.Markdown("### State Management")
284
- with gr.Row():
285
- cache_status_btn = gr.Button("Cache Status", size="sm")
286
- clear_states_btn = gr.Button("Clear All States", size="sm", variant="stop")
287
-
288
- with gr.Group():
289
- gr.Markdown("### Project Information")
290
- project_info = gr.JSON(
291
- value={
292
- "total_conversations": None,
293
- "avg_session_length": None,
294
- "success_rate": None,
295
- "model_type": None
296
- },
297
- label="Project Stats"
298
- )
299
-
300
- with gr.Group():
301
- gr.Markdown("### System Status")
302
- status_panel = gr.Markdown(
303
- "Click 'Cache Status' to view global state information.",
304
- label="Status Information"
305
- )
306
-
307
- with gr.Column(scale=2):
308
- gr.Markdown("## Key Metrics Dashboard")
309
- trackio_iframe = gr.HTML(
310
- value="""
311
- <div style="text-align: center; padding: 40px; border: 2px dashed #ccc; border-radius: 8px; background: #f8f9fa;">
312
- <h3>Trackio Dashboard</h3>
313
- <p>Analytics data will appear here after conversations.</p>
314
- <p>Data is automatically cached and persists across page navigation.</p>
315
- <p>To launch trackio dashboard separately, run:</p>
316
- <code style="background: #e9ecef; padding: 4px 8px; border-radius: 4px;">trackio show --project "Mimir"</code>
317
- </div>
318
- """,
319
- label="Dashboard"
320
- )
321
-
322
- with gr.Row():
323
- with gr.Column():
324
- gr.Markdown("## Recent Interactions")
325
- gr.Markdown("*Data persists when switching between Chatbot and Analytics pages*")
326
- recent_metrics = gr.Dataframe(
327
- headers=["Timestamp", "Response Time", "Prompt Mode", "Tools Used", "Quality Score", "Adapter"],
328
- datatype=["str", "number", "str", "bool", "number", "str"],
329
- row_count=10,
330
- col_count=6,
331
- interactive=False,
332
- label="Latest Sessions",
333
- value=[],
334
- show_label=True
335
- )
336
-
337
- with gr.TabItem("ML Performance"):
338
- gr.Markdown("## Agent-Based Performance & Global State Metrics")
339
-
340
- with gr.Row():
341
- with gr.Column(scale=1):
342
- eval_metrics_btn = gr.Button("Get Evaluation Metrics", variant="primary")
343
-
344
- with gr.Group():
345
- gr.Markdown("### Model Cache Status")
346
- cache_status_display = gr.JSON(
347
- value={},
348
- label="Cache Information"
349
- )
350
-
351
- with gr.Column(scale=2):
352
- gr.Markdown("### Aggregate Performance Metrics")
353
- eval_metrics_table = gr.Dataframe(
354
- headers=["Metric", "Score"],
355
- datatype=["str", "str"],
356
- label="Model Performance",
357
- value=[]
358
- )
359
-
360
- eval_summary_display = gr.JSON(
361
- value={},
362
- label="Detailed Evaluation Summary"
363
- )
364
-
365
- with gr.Row():
366
- with gr.Column():
367
- gr.Markdown("### Recent Quality Evaluations")
368
- recent_evaluations_table = gr.Dataframe(
369
- headers=["Timestamp", "Educational Score", "Semantic Quality", "Response Time"],
370
- datatype=["str", "str", "str", "str"],
371
- label="Recent Evaluations",
372
- value=[]
373
- )
374
-
375
- with gr.TabItem("System Status"):
376
- gr.Markdown("## Global State Manager & System Diagnostics")
377
-
378
- with gr.Row():
379
- with gr.Column():
380
- gr.Markdown("### Global State Cache")
381
- cache_details = gr.Markdown("Click 'Show Cache Status' to view detailed information.")
382
-
383
- show_cache_btn = gr.Button("Show Cache Status", variant="primary")
384
- refresh_cache_btn = gr.Button("Refresh Cache Info", variant="secondary")
385
-
386
- gr.Markdown("### Persistence Controls")
387
- backup_btn = gr.Button("Manual Backup to HF Dataset", variant="primary")
388
- backup_status = gr.Textbox(label="Backup Status", value="No recent backup", interactive=False)
389
-
390
- with gr.Column():
391
- gr.Markdown("### System Actions")
392
- sync_trackio_btn = gr.Button("Sync to Database", variant="secondary")
393
- clear_all_btn = gr.Button("Clear All Global States", variant="stop")
394
-
395
- gr.Markdown("### Persistence Status")
396
- persistence_info = gr.JSON(
397
- value={},
398
- label="Persistence Information"
399
- )
400
-
401
- gr.Markdown("### Performance Monitor")
402
- perf_info = gr.JSON(
403
- value={},
404
- label="Performance Information"
405
- )
406
-
407
- # NEW: HuggingFace Cache Viewer Section
408
- with gr.Row():
409
- with gr.Column():
410
- gr.Markdown("### 🗂️ HuggingFace Model Cache")
411
- gr.Markdown("*View cached models and disk usage*")
412
-
413
- cache_viewer_btn = gr.Button("Inspect Model Cache", variant="primary", size="lg")
414
-
415
- with gr.Row():
416
- clear_cache_btn = gr.Button("Clear Cache (⚠️ Dangerous)", variant="stop", size="sm")
417
- refresh_models_btn = gr.Button("Re-download Models", variant="secondary", size="sm")
418
-
419
- cache_info_display = gr.Markdown(
420
- "Click **Inspect Model Cache** to view detailed cache information.",
421
- label="Cache Details"
422
- )
423
-
424
- demo.load(
425
- load_analytics_state,
426
- inputs=None,
427
- outputs=[project_info, recent_metrics, trackio_iframe],
428
- show_progress="hidden"
429
- )
430
-
431
- demo.load(
432
- fn=lambda: global_state_manager.get_cache_status(),
433
- inputs=None,
434
- outputs=[cache_status_display],
435
- show_progress="hidden"
436
- )
437
-
438
- demo.load(
439
- fn=get_persistence_status,
440
- inputs=None,
441
- outputs=[persistence_info],
442
- show_progress="hidden"
443
- )
444
-
445
- refresh_btn.click(
446
- fn=refresh_analytics_data,
447
- inputs=[],
448
- outputs=[project_info, recent_metrics, trackio_iframe],
449
- show_progress="full"
450
- )
451
-
452
- sync_all_btn.click(
453
- fn=sync_and_refresh_all,
454
- inputs=[],
455
- outputs=[project_info, recent_metrics, trackio_iframe, eval_summary_display, eval_metrics_table, recent_evaluations_table],
456
- show_progress="full"
457
- )
458
-
459
- export_json_btn.click(
460
- fn=export_metrics_json,
461
- inputs=[],
462
- outputs=[],
463
- show_progress="full"
464
- )
465
-
466
- export_csv_btn.click(
467
- fn=export_metrics_csv,
468
- inputs=[],
469
- outputs=[],
470
- show_progress="full"
471
- )
472
-
473
- launch_trackio_btn.click(
474
- fn=launch_external_trackio,
475
- inputs=[],
476
- outputs=[],
477
- show_progress="full"
478
- )
479
-
480
- cache_status_btn.click(
481
- fn=show_cache_status,
482
- inputs=[],
483
- outputs=[status_panel],
484
- show_progress="full"
485
- )
486
-
487
- clear_states_btn.click(
488
- fn=clear_all_global_states,
489
- inputs=[],
490
- outputs=[project_info, recent_metrics, trackio_iframe],
491
- show_progress="full"
492
- )
493
-
494
- eval_metrics_btn.click(
495
- fn=show_evaluation_metrics,
496
- inputs=[],
497
- outputs=[eval_summary_display, eval_metrics_table, recent_evaluations_table],
498
- show_progress="full"
499
- )
500
-
501
- show_cache_btn.click(
502
- fn=show_cache_status,
503
- inputs=[],
504
- outputs=[cache_details],
505
- show_progress="full"
506
- )
507
-
508
- refresh_cache_btn.click(
509
- fn=lambda: global_state_manager.get_cache_status(),
510
- inputs=[],
511
- outputs=[perf_info],
512
- show_progress="full"
513
- )
514
-
515
- backup_btn.click(
516
- fn=manual_backup_to_hf,
517
- inputs=[],
518
- outputs=[backup_status],
519
- show_progress="full"
520
- )
521
-
522
- sync_trackio_btn.click(
523
- fn=sync_trackio_with_global_state,
524
- inputs=[],
525
- outputs=[],
526
- show_progress="full"
527
- )
528
-
529
- clear_all_btn.click(
530
- fn=clear_all_global_states,
531
- inputs=[],
532
- outputs=[project_info, recent_metrics, trackio_iframe],
533
- show_progress="full"
534
- )
535
-
536
- if __name__ == "__main__":
537
- logger.info("Running analytics dashboard standalone with global state management")
538
- demo.launch(server_name="0.0.0.0", server_port=7861)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gradio_chatbot.py DELETED
@@ -1,148 +0,0 @@
1
- # gradio_chatbot.py
2
- import gradio as gr
3
- import logging
4
-
5
- logger = logging.getLogger(__name__)
6
-
7
- from app import (
8
- add_user_message,
9
- add_loading_animation,
10
- generate_response,
11
- reset_conversation,
12
- load_conversation_state,
13
- remove_loading_animations,
14
- global_state_manager,
15
- )
16
-
17
-
18
- def load_custom_css():
19
- try:
20
- with open("styles.css", "r", encoding="utf-8") as css_file:
21
- css_content = css_file.read()
22
- logger.info(f"CSS loaded successfully, length: {len(css_content)} characters")
23
- return css_content
24
- except FileNotFoundError:
25
- logger.warning("styles.css file not found, using default styling")
26
- return ""
27
- except Exception as e:
28
- logger.warning(f"Error reading styles.css: {e}")
29
- return ""
30
-
31
-
32
- def restore_state_on_page_access():
33
- """
34
- Restore conversation state when page loads or user navigates back.
35
- This ensures persistence across page navigation.
36
- """
37
- try:
38
- current_state = global_state_manager.get_conversation_state()
39
- chat_history = current_state.get('chat_history', [])
40
- conversation_state_data = current_state.get('conversation_state', [])
41
-
42
- logger.info(f"✓ Restored state: {len(chat_history)} messages in chat, {len(conversation_state_data)} in conversation")
43
-
44
- return chat_history, conversation_state_data
45
- except Exception as e:
46
- logger.error(f"Failed to restore state: {e}")
47
- return [], []
48
-
49
-
50
- with gr.Blocks() as demo:
51
- custom_css = load_custom_css()
52
- if custom_css:
53
- gr.HTML(f'<style>{custom_css}</style>')
54
-
55
- conversation_state = gr.State([])
56
-
57
- gr.HTML('<div class="title-header"><h1>Mimir</h1></div>')
58
-
59
- with gr.Row():
60
- chatbot = gr.Chatbot(
61
- type="messages",
62
- show_copy_button=True,
63
- show_share_button=False,
64
- layout="bubble",
65
- autoscroll=True,
66
- avatar_images=None,
67
- elem_id="main-chatbot",
68
- scale=1,
69
- height="65vh",
70
- value=[],
71
- latex_delimiters=[
72
- {"left": "$$", "right": "$$", "display": True},
73
- {"left": "$", "right": "$", "display": False},
74
- ]
75
- )
76
-
77
- with gr.Row(elem_classes=["input-controls"]):
78
- msg = gr.Textbox(
79
- placeholder="Ask me about math, research, study strategies, or any educational topic...",
80
- show_label=False,
81
- lines=6,
82
- max_lines=8,
83
- elem_classes=["input-textbox"],
84
- container=False,
85
- scale=4
86
- )
87
- with gr.Column(elem_classes=["button-column"], scale=1):
88
- send = gr.Button("Send", elem_classes=["send-button"], size="sm")
89
- clear = gr.Button("Clear", elem_classes=["clear-button"], size="sm")
90
-
91
-
92
- demo.load(
93
- fn=restore_state_on_page_access,
94
- outputs=[chatbot, conversation_state],
95
- queue=False
96
- )
97
-
98
- msg.submit(
99
- add_user_message,
100
- inputs=[msg, chatbot, conversation_state],
101
- outputs=[msg, chatbot, conversation_state],
102
- show_progress="hidden",
103
- queue=True,
104
- ).then(
105
- add_loading_animation,
106
- inputs=[chatbot, conversation_state],
107
- outputs=[chatbot, conversation_state],
108
- show_progress="hidden",
109
- queue=True,
110
- ).then(
111
- generate_response,
112
- inputs=[chatbot, conversation_state],
113
- outputs=[chatbot, conversation_state],
114
- show_progress="hidden",
115
- queue=True,
116
- )
117
-
118
- send.click(
119
- add_user_message,
120
- inputs=[msg, chatbot, conversation_state],
121
- outputs=[msg, chatbot, conversation_state],
122
- show_progress="hidden",
123
- queue=True,
124
- ).then(
125
- add_loading_animation,
126
- inputs=[chatbot, conversation_state],
127
- outputs=[chatbot, conversation_state],
128
- show_progress="hidden",
129
- queue=True,
130
- ).then(
131
- generate_response,
132
- inputs=[chatbot, conversation_state],
133
- outputs=[chatbot, conversation_state],
134
- show_progress="hidden",
135
- queue=True,
136
- )
137
-
138
- clear.click(
139
- reset_conversation,
140
- outputs=[chatbot, conversation_state],
141
- show_progress="hidden"
142
- )
143
-
144
-
145
- if __name__ == "__main__":
146
- logger.info("Running chatbot interface standalone")
147
- demo.queue(default_concurrency_limit=1)
148
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gradio_prompt_testing.py DELETED
@@ -1,1564 +0,0 @@
1
- # gradio_pipeline_testing.py
2
- """
3
- Full Pipeline Testing Interface for Mimir Educational AI Assistant
4
-
5
- Tests the complete orchestration flow with comprehensive metrics at every step.
6
- Captures conditional model activation, token usage, timing, and quality metrics.
7
-
8
- UPDATED: Now correctly mirrors app.py orchestrate_turn() process
9
- - Tool decision uses decide() method with conversation history
10
- - Response agent invoked with input_data dict (not raw string)
11
- - Thinking agents process() method matches app.py
12
- - Graph generation included when tools are used
13
-
14
- Output: CSV file with ~110 columns capturing full pipeline journey
15
- """
16
-
17
- import os
18
- import sys
19
- import io
20
- import csv
21
- import json
22
- import time
23
- import logging
24
- import warnings
25
- from datetime import datetime
26
- from typing import Dict, List, Optional, Tuple, Any
27
- from collections import Counter
28
-
29
- # Core dependencies
30
- import torch
31
- import gradio as gr
32
- import numpy as np
33
-
34
- # ============================================================================
35
- # ENVIRONMENT SETUP
36
- # ============================================================================
37
- HF_CACHE = "/tmp/huggingface"
38
- os.makedirs(f"{HF_CACHE}/hub", exist_ok=True)
39
- os.environ['HF_HOME'] = HF_CACHE
40
- os.environ['HF_HUB_CACHE'] = f"{HF_CACHE}/hub"
41
-
42
- # ============================================================================
43
- # IMPORTS FROM MIMIR APPLICATION
44
- # ============================================================================
45
- try:
46
- from agents import (
47
- ToolDecisionAgent,
48
- PromptRoutingAgents,
49
- ThinkingAgents,
50
- ResponseAgent,
51
- )
52
- AGENTS_AVAILABLE = True
53
- except ImportError as e:
54
- print(f"⚠️ Warning: Could not import agents: {e}")
55
- AGENTS_AVAILABLE = False
56
-
57
- from model_manager import get_model as get_shared_llama
58
-
59
- try:
60
- from state_manager import GlobalStateManager, LogicalExpressions
61
- STATE_MANAGER_AVAILABLE = True
62
- except ImportError as e:
63
- print(f"⚠️ Warning: Could not import state_manager: {e}")
64
- STATE_MANAGER_AVAILABLE = False
65
-
66
- try:
67
- from prompt_library import (
68
- CORE_IDENTITY,
69
- TOOL_DECISION,
70
- agent_1_system,
71
- agent_2_system,
72
- agent_3_system,
73
- agent_4_system,
74
- MATH_THINKING,
75
- QUESTION_ANSWER_DESIGN,
76
- REASONING_THINKING,
77
- VAUGE_INPUT,
78
- USER_UNDERSTANDING,
79
- GENERAL_FORMATTING,
80
- LATEX_FORMATTING,
81
- GUIDING_TEACHING,
82
- STRUCTURE_PRACTICE_QUESTIONS,
83
- PRACTICE_QUESTION_FOLLOWUP,
84
- TOOL_USE_ENHANCEMENT,
85
- )
86
- PROMPTS_AVAILABLE = True
87
- except ImportError as e:
88
- print(f"⚠️ Warning: Could not import prompt_library: {e}")
89
- PROMPTS_AVAILABLE = False
90
-
91
- # Try to import post processor
92
- try:
93
- # Import the post processor class/module from app.py
94
- import importlib.util
95
- spec = importlib.util.spec_from_file_location("app_module", "app.py")
96
- app_module = importlib.util.module_from_spec(spec)
97
- spec.loader.exec_module(app_module)
98
- post_processor = app_module.post_processor
99
- POST_PROCESSOR_AVAILABLE = True
100
- except Exception as e:
101
- print(f"⚠️ Warning: Could not import post_processor: {e}")
102
- POST_PROCESSOR_AVAILABLE = False
103
- # Create dummy
104
- class DummyPostProcessor:
105
- def process_response(self, response, user_message):
106
- return response
107
- post_processor = DummyPostProcessor()
108
-
109
- # ZeroGPU support
110
- try:
111
- import spaces
112
- ZERO_GPU_AVAILABLE = True
113
- except ImportError:
114
- ZERO_GPU_AVAILABLE = False
115
- class DummySpaces:
116
- @staticmethod
117
- def GPU(duration=600):
118
- def decorator(func):
119
- return func
120
- return decorator
121
- spaces = DummySpaces()
122
-
123
- # Tiktoken for accurate token counting
124
- try:
125
- import tiktoken
126
- TIKTOKEN_AVAILABLE = True
127
- except ImportError:
128
- TIKTOKEN_AVAILABLE = False
129
- print("⚠️ Warning: tiktoken not available - using fallback token counting")
130
-
131
- # Textstat for readability metrics
132
- try:
133
- import textstat
134
- TEXTSTAT_AVAILABLE = True
135
- except ImportError:
136
- TEXTSTAT_AVAILABLE = False
137
- print("⚠️ Warning: textstat not available - using manual readability calculations")
138
-
139
- # ============================================================================
140
- # LOGGING SETUP
141
- # ============================================================================
142
- logging.basicConfig(
143
- level=logging.INFO,
144
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
145
- )
146
- logger = logging.getLogger(__name__)
147
-
148
- warnings.filterwarnings("ignore", category=UserWarning)
149
- warnings.filterwarnings("ignore", category=FutureWarning)
150
-
151
- CURRENT_YEAR = datetime.now().year
152
-
153
- # ============================================================================
154
- # GLOBAL INSTANCES
155
- # ============================================================================
156
- if AGENTS_AVAILABLE and STATE_MANAGER_AVAILABLE:
157
- try:
158
- global_state_manager = GlobalStateManager()
159
- logical_expressions = LogicalExpressions()
160
- tool_agent = ToolDecisionAgent()
161
- routing_agents = PromptRoutingAgents()
162
- thinking_agents = ThinkingAgents()
163
- response_agent = ResponseAgent()
164
-
165
- logger.info("✓ All agents initialized successfully")
166
- except Exception as e:
167
- logger.error(f"Failed to initialize agents: {e}")
168
- raise
169
- else:
170
- logger.error("Cannot initialize - missing core dependencies")
171
- raise ImportError("Missing required modules: agents or state_manager")
172
-
173
- # ============================================================================
174
- # CSV SCHEMA DEFINITION
175
- # ============================================================================
176
- CSV_COLUMNS = [
177
- # Identification & Input
178
- "prompt_index",
179
- "timestamp",
180
- "user_prompt",
181
- "user_prompt_tokens",
182
- "user_prompt_chars",
183
- "user_prompt_words",
184
-
185
- # Conversation Context
186
- "conversation_history_length",
187
- "conversation_history_tokens",
188
-
189
- # Tool Decision Agent
190
- "tool_decision_input_template",
191
- "tool_decision_input_tokens",
192
- "tool_decision_output",
193
- "tool_decision_output_tokens",
194
- "tool_decision_result",
195
- "tool_decision_time_seconds",
196
- "tool_decision_gpu_peak_mb",
197
-
198
- # Regex Checks
199
- "regex_checks_applied",
200
- "regex_checks_time_seconds",
201
-
202
- # Routing Agent 1
203
- "agent1_input_template",
204
- "agent1_input_tokens",
205
- "agent1_output",
206
- "agent1_output_tokens",
207
- "agent1_decision",
208
- "agent1_time_seconds",
209
- "agent1_gpu_peak_mb",
210
-
211
- # Routing Agent 2
212
- "agent2_input_template",
213
- "agent2_input_tokens",
214
- "agent2_output",
215
- "agent2_output_tokens",
216
- "agent2_decision",
217
- "agent2_time_seconds",
218
- "agent2_gpu_peak_mb",
219
-
220
- # Routing Agent 3
221
- "agent3_input_template",
222
- "agent3_input_tokens",
223
- "agent3_output",
224
- "agent3_output_tokens",
225
- "agent3_decision",
226
- "agent3_time_seconds",
227
- "agent3_gpu_peak_mb",
228
-
229
- # Routing Agent 4
230
- "agent4_input_template",
231
- "agent4_input_tokens",
232
- "agent4_output",
233
- "agent4_output_tokens",
234
- "agent4_decisions",
235
- "agent4_time_seconds",
236
- "agent4_gpu_peak_mb",
237
-
238
- # Math Thinking
239
- "math_thinking_activated",
240
- "math_thinking_input_template",
241
- "math_thinking_input_tokens",
242
- "math_thinking_output",
243
- "math_thinking_output_tokens",
244
- "math_thinking_time_seconds",
245
- "math_thinking_gpu_peak_mb",
246
-
247
- # QA Design Thinking
248
- "qa_design_activated",
249
- "qa_design_input_template",
250
- "qa_design_input_tokens",
251
- "qa_design_output",
252
- "qa_design_output_tokens",
253
- "qa_design_time_seconds",
254
- "qa_design_gpu_peak_mb",
255
-
256
- # Reasoning Thinking
257
- "reasoning_activated",
258
- "reasoning_input_template",
259
- "reasoning_input_tokens",
260
- "reasoning_output",
261
- "reasoning_output_tokens",
262
- "reasoning_time_seconds",
263
- "reasoning_gpu_peak_mb",
264
-
265
- # Prompt Assembly
266
- "active_response_prompts",
267
- "final_prompt_template",
268
- "final_prompt_tokens",
269
- "final_prompt_chars",
270
- "final_prompt_words",
271
- "assembly_time_seconds",
272
-
273
- # Response Generation
274
- "response_input_template",
275
- "response_input_tokens",
276
- "response_raw",
277
- "response_raw_tokens",
278
- "response_raw_chars",
279
- "response_raw_words",
280
- "response_generation_time_seconds",
281
- "response_gpu_peak_mb",
282
- "response_tokens_per_second",
283
-
284
- # Post-processing
285
- "response_processed",
286
- "response_processed_tokens",
287
- "response_processed_chars",
288
- "response_processed_words",
289
- "postprocessing_time_seconds",
290
-
291
- # Quality Metrics
292
- "flesch_reading_ease",
293
- "flesch_kincaid_grade",
294
- "completeness_score",
295
- "specificity_score",
296
- "repetition_ratio",
297
- "unique_word_ratio",
298
- "avg_sentence_length",
299
- "question_answered",
300
-
301
- # Overall Metrics
302
- "total_pipeline_time_seconds",
303
- "total_input_tokens",
304
- "total_output_tokens",
305
- "total_gpu_peak_mb",
306
- "models_activated_count",
307
- "models_activated_list",
308
- ]
309
-
310
- # ============================================================================
311
- # TOKEN COUNTING FUNCTIONS
312
- # ============================================================================
313
-
314
- def count_tokens_accurate(text: str) -> int:
315
- """
316
- Count tokens using tiktoken library for accurate estimation.
317
-
318
- Args:
319
- text: Input text to tokenize
320
-
321
- Returns:
322
- Accurate token count
323
- """
324
- if not text:
325
- return 0
326
-
327
- if not TIKTOKEN_AVAILABLE:
328
- # Fallback: word count approximation
329
- return len(text.split())
330
-
331
- try:
332
- # Use cl100k_base encoding (used by GPT-3.5/4, good general estimator)
333
- encoding = tiktoken.get_encoding("cl100k_base")
334
- tokens = encoding.encode(text)
335
- return len(tokens)
336
- except Exception as e:
337
- logger.warning(f"tiktoken encoding failed: {e}, using fallback")
338
- return len(text.split())
339
-
340
-
341
- def count_words(text: str) -> int:
342
- """Count words in text"""
343
- if not text:
344
- return 0
345
- return len(text.split())
346
-
347
-
348
- def count_sentences(text: str) -> int:
349
- """Count sentences in text (simple heuristic)"""
350
- if not text:
351
- return 0
352
- import re
353
- sentences = re.split(r'[.!?]+', text)
354
- return len([s for s in sentences if s.strip()])
355
-
356
-
357
- # ============================================================================
358
- # GPU MEMORY TRACKING
359
- # ============================================================================
360
-
361
- def get_gpu_memory() -> Dict[str, float]:
362
- """
363
- Get current GPU memory statistics.
364
-
365
- Returns:
366
- Dictionary with allocated, reserved, and peak memory in MB
367
- """
368
- if torch.cuda.is_available():
369
- return {
370
- "allocated_mb": torch.cuda.memory_allocated() / 1024**2,
371
- "reserved_mb": torch.cuda.memory_reserved() / 1024**2,
372
- "peak_mb": torch.cuda.max_memory_allocated() / 1024**2
373
- }
374
- return {
375
- "allocated_mb": 0.0,
376
- "reserved_mb": 0.0,
377
- "peak_mb": 0.0
378
- }
379
-
380
-
381
- def reset_gpu_stats():
382
- """Reset GPU memory statistics"""
383
- if torch.cuda.is_available():
384
- torch.cuda.reset_peak_memory_stats()
385
- torch.cuda.synchronize()
386
-
387
-
388
- # ============================================================================
389
- # TEMPLATE BUILDING FUNCTIONS
390
- # ============================================================================
391
-
392
- def format_history(history: List[Dict]) -> str:
393
- """Format conversation history for templates"""
394
- if not history:
395
- return "No previous conversation"
396
-
397
- formatted = []
398
- for msg in history[-8:]: # Last 8 messages
399
- role = msg.get('role', 'unknown')
400
- content = msg.get('content', '')[:100] # Truncate
401
- formatted.append(f"{role}: {content}")
402
-
403
- return "\n".join(formatted)
404
-
405
-
406
- def build_tool_decision_template(user_prompt: str, history: List) -> str:
407
- """Build template for tool decision agent - matches app.py"""
408
- history_str = format_history(history)
409
- return f"{history_str}\n\nUser Query: {user_prompt}"
410
-
411
-
412
- def build_agent1_template(user_prompt: str, history: List) -> str:
413
- """Build template for Agent 1: Practice Questions"""
414
- history_str = format_history(history)
415
- return f"<s>[INST] {agent_1_system}\n\nConversation History:\n{history_str}\n\nCurrent User Query: {user_prompt} [/INST]"
416
-
417
-
418
- def build_agent2_template(user_prompt: str) -> str:
419
- """Build template for Agent 2: Discovery Mode"""
420
- return f"<s>[INST] {agent_2_system}\n\nUser Query: {user_prompt} [/INST]"
421
-
422
-
423
- def build_agent3_template(user_prompt: str, history: List) -> str:
424
- """Build template for Agent 3: Followup Assessment"""
425
- history_str = format_history(history)
426
- return f"<s>[INST] {agent_3_system}\n\nConversation History:\n{history_str}\n\nCurrent User Query: {user_prompt} [/INST]"
427
-
428
-
429
- def build_agent4_template(user_prompt: str, history: List) -> str:
430
- """Build template for Agent 4: Teaching Mode"""
431
- history_str = format_history(history)
432
- return f"<s>[INST] {agent_4_system}\n\nConversation History:\n{history_str}\n\nCurrent User Query: {user_prompt} [/INST]"
433
-
434
-
435
- def build_math_thinking_template(user_prompt: str) -> str:
436
- """Build template for Math Thinking"""
437
- return f"<s>[INST] {MATH_THINKING}\n\nUser Query: {user_prompt} [/INST]"
438
-
439
-
440
- def build_qa_design_template(user_prompt: str) -> str:
441
- """Build template for QA Design Thinking"""
442
- return f"<s>[INST] {QUESTION_ANSWER_DESIGN}\n\nUser Query: {user_prompt} [/INST]"
443
-
444
-
445
- def build_reasoning_template(user_prompt: str) -> str:
446
- """Build template for Reasoning Thinking"""
447
- return f"<s>[INST] {REASONING_THINKING}\n\nUser Query: {user_prompt} [/INST]"
448
-
449
-
450
- # ============================================================================
451
- # QUALITY METRICS FUNCTIONS
452
- # ============================================================================
453
-
454
- def estimate_syllables(text: str) -> int:
455
- """
456
- Estimate syllable count (rough heuristic).
457
- Counts vowel groups.
458
- """
459
- import re
460
- words = text.lower().split()
461
- syllable_count = 0
462
-
463
- for word in words:
464
- # Remove non-letters
465
- word = re.sub(r'[^a-z]', '', word)
466
- if not word:
467
- continue
468
-
469
- # Count vowel groups
470
- vowel_groups = len(re.findall(r'[aeiouy]+', word))
471
-
472
- # Ensure at least 1 syllable per word
473
- syllable_count += max(1, vowel_groups)
474
-
475
- return syllable_count
476
-
477
-
478
- def calculate_flesch_reading_ease(text: str) -> float:
479
- """
480
- Calculate Flesch Reading Ease score.
481
- Score 0-100: Higher = easier to read
482
- 90-100: Very easy (5th grade)
483
- 60-70: Standard (8th-9th grade)
484
- 0-30: Very difficult (college graduate)
485
-
486
- Formula: 206.835 - 1.015(words/sentences) - 84.6(syllables/words)
487
- """
488
- if not text or len(text.strip()) < 10:
489
- return 0.0
490
-
491
- if TEXTSTAT_AVAILABLE:
492
- try:
493
- return textstat.flesch_reading_ease(text)
494
- except:
495
- pass
496
-
497
- # Manual calculation
498
- words = count_words(text)
499
- sentences = count_sentences(text)
500
-
501
- if sentences == 0 or words == 0:
502
- return 0.0
503
-
504
- syllables = estimate_syllables(text)
505
-
506
- if words == 0:
507
- return 0.0
508
-
509
- score = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
510
- return max(0.0, min(100.0, score))
511
-
512
-
513
- def calculate_flesch_kincaid_grade(text: str) -> float:
514
- """
515
- Calculate Flesch-Kincaid Grade Level.
516
- Returns US grade level needed to understand text.
517
-
518
- Formula: 0.39(words/sentences) + 11.8(syllables/words) - 15.59
519
- """
520
- if not text or len(text.strip()) < 10:
521
- return 0.0
522
-
523
- if TEXTSTAT_AVAILABLE:
524
- try:
525
- return textstat.flesch_kincaid_grade(text)
526
- except:
527
- pass
528
-
529
- words = count_words(text)
530
- sentences = count_sentences(text)
531
-
532
- if sentences == 0 or words == 0:
533
- return 0.0
534
-
535
- syllables = estimate_syllables(text)
536
-
537
- if words == 0:
538
- return 0.0
539
-
540
- grade = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
541
- return max(0.0, grade)
542
-
543
-
544
- def calculate_completeness_score(response: str, user_prompt: str) -> float:
545
- """
546
- Estimate if response addresses the prompt.
547
- Uses keyword overlap and length heuristics.
548
-
549
- Returns: Score 0-1 (1 = complete answer)
550
- """
551
- if not response or not user_prompt:
552
- return 0.0
553
-
554
- import re
555
-
556
- # Extract keywords from prompt
557
- prompt_words = set(re.findall(r'\b\w+\b', user_prompt.lower()))
558
-
559
- # Remove common stopwords
560
- stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
561
- 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
562
- 'would', 'should', 'could', 'may', 'might', 'can', 'what',
563
- 'how', 'why', 'when', 'where', 'who', 'which', 'i', 'you',
564
- 'we', 'they', 'he', 'she', 'it', 'me', 'him', 'her', 'us', 'them'}
565
- prompt_words -= stopwords
566
-
567
- response_words = set(re.findall(r'\b\w+\b', response.lower()))
568
-
569
- if not prompt_words:
570
- return 0.5 # Neutral if no meaningful keywords
571
-
572
- # Calculate keyword overlap
573
- overlap = len(prompt_words & response_words) / len(prompt_words)
574
-
575
- # Length factor
576
- min_reasonable_length = 20
577
- if len(response) < min_reasonable_length:
578
- length_factor = len(response) / min_reasonable_length
579
- else:
580
- length_factor = 1.0
581
-
582
- score = overlap * length_factor
583
- return min(1.0, score)
584
-
585
-
586
- def check_question_answered(response: str, user_prompt: str) -> bool:
587
- """
588
- Boolean check: does response attempt to answer the question?
589
-
590
- Heuristics:
591
- - Response has minimum length
592
- - Response doesn't start with refusal
593
- - Response contains relevant keywords
594
- """
595
- if not response or len(response) < 10:
596
- return False
597
-
598
- # Check for refusal patterns
599
- refusal_patterns = [
600
- "i don't know",
601
- "i cannot",
602
- "i can't",
603
- "i'm not sure",
604
- "i don't have",
605
- "unable to",
606
- "sorry, i"
607
- ]
608
-
609
- response_lower = response.lower()
610
- for pattern in refusal_patterns:
611
- if response_lower.startswith(pattern):
612
- return False
613
-
614
- # Check for minimum completeness
615
- completeness = calculate_completeness_score(response, user_prompt)
616
- return completeness > 0.3
617
-
618
-
619
- def calculate_specificity_score(response: str) -> float:
620
- """
621
- Measure how specific vs vague the response is.
622
-
623
- Indicators of specificity:
624
- - Numbers, dates, names
625
- - Technical terms
626
- - Examples
627
- - Concrete nouns
628
-
629
- Returns: Score 0-1 (1 = very specific)
630
- """
631
- if not response:
632
- return 0.0
633
-
634
- import re
635
-
636
- specificity_indicators = 0
637
- total_possible = 5
638
-
639
- # 1. Contains numbers
640
- if re.search(r'\d+', response):
641
- specificity_indicators += 1
642
-
643
- # 2. Contains proper nouns
644
- proper_nouns = len(re.findall(r'(?<!\. )\b[A-Z][a-z]+', response))
645
- if proper_nouns > 0:
646
- specificity_indicators += 1
647
-
648
- # 3. Contains example phrases
649
- example_phrases = ['for example', 'such as', 'for instance', 'like', 'including']
650
- if any(phrase in response.lower() for phrase in example_phrases):
651
- specificity_indicators += 1
652
-
653
- # 4. Average word length
654
- words = response.split()
655
- if words:
656
- avg_word_length = sum(len(w) for w in words) / len(words)
657
- if avg_word_length > 5.0:
658
- specificity_indicators += 1
659
-
660
- # 5. Response length
661
- if len(response) > 200:
662
- specificity_indicators += 1
663
-
664
- return specificity_indicators / total_possible
665
-
666
-
667
- def calculate_repetition_ratio(text: str) -> float:
668
- """
669
- Measure token/word repetition.
670
- Lower = better (less repetitive)
671
-
672
- Returns: Ratio of repeated tokens to total tokens (0-1)
673
- """
674
- if not text:
675
- return 0.0
676
-
677
- words = text.lower().split()
678
- if len(words) < 2:
679
- return 0.0
680
-
681
- word_counts = Counter(words)
682
-
683
- # Count words that appear more than once
684
- repeated_words = sum(count - 1 for count in word_counts.values() if count > 1)
685
-
686
- ratio = repeated_words / len(words)
687
- return min(1.0, ratio)
688
-
689
-
690
- def calculate_unique_word_ratio(text: str) -> float:
691
- """
692
- Measure vocabulary diversity.
693
- Higher = more diverse vocabulary
694
-
695
- Returns: Ratio of unique words to total words (0-1)
696
- """
697
- if not text:
698
- return 0.0
699
-
700
- words = text.lower().split()
701
- if not words:
702
- return 0.0
703
-
704
- unique_words = len(set(words))
705
- return unique_words / len(words)
706
-
707
-
708
- def calculate_avg_sentence_length(text: str) -> float:
709
- """Calculate average sentence length in words"""
710
- sentences = count_sentences(text)
711
- words = count_words(text)
712
-
713
- if sentences == 0:
714
- return 0.0
715
-
716
- return words / sentences
717
-
718
-
719
- # ============================================================================
720
- # INSTRUMENTED PIPELINE RUNNER
721
- # ============================================================================
722
-
723
- def run_full_pipeline_instrumented(user_prompt: str, prompt_index: int = 1) -> Dict:
724
- """
725
- Run the complete orchestration pipeline with full instrumentation.
726
- Captures metrics at every step.
727
-
728
- ✅ UPDATED: Now correctly mirrors app.py orchestrate_turn() process
729
-
730
- Args:
731
- user_prompt: User's input prompt
732
- prompt_index: Index number for this prompt in batch
733
-
734
- Returns:
735
- Dictionary with all metrics for CSV export
736
- """
737
-
738
- result = {
739
- "prompt_index": prompt_index,
740
- "timestamp": datetime.now().isoformat(),
741
- "user_prompt": user_prompt,
742
- "user_prompt_tokens": count_tokens_accurate(user_prompt),
743
- "user_prompt_chars": len(user_prompt),
744
- "user_prompt_words": count_words(user_prompt),
745
- }
746
-
747
- # Track overall start time
748
- pipeline_start = time.time()
749
-
750
- try:
751
- # ============================================================
752
- # STEP 1-2: SETUP
753
- # ============================================================
754
- setup_start = time.time()
755
-
756
- # Reset state
757
- global_state_manager.reset_prompt_state()
758
- prompt_state = global_state_manager.get_prompt_state_manager()
759
-
760
- # Get conversation history (empty for testing)
761
- recent_history = []
762
- recent_history_formatted = "No previous conversation"
763
-
764
- result["conversation_history_length"] = 0
765
- result["conversation_history_tokens"] = 0
766
-
767
- # ============================================================
768
- # STEP 3: TOOL DECISION AGENT (✅ FIXED: Use decide() with history)
769
- # ============================================================
770
- tool_start = time.time()
771
-
772
- tool_template = build_tool_decision_template(user_prompt, recent_history)
773
- tool_input_tokens = count_tokens_accurate(tool_template)
774
-
775
- reset_gpu_stats()
776
-
777
- # ✅ FIXED: Use decide() method with conversation history (matches app.py)
778
- tool_decision_result = tool_agent.decide(user_prompt, recent_history)
779
-
780
- # Capture output
781
- tool_output = str(tool_decision_result)
782
- tool_output_tokens = count_tokens_accurate(tool_output)
783
-
784
- gpu_metrics = get_gpu_memory()
785
- tool_time = time.time() - tool_start
786
-
787
- # Record
788
- result.update({
789
- "tool_decision_input_template": tool_template,
790
- "tool_decision_input_tokens": tool_input_tokens,
791
- "tool_decision_output": tool_output,
792
- "tool_decision_output_tokens": tool_output_tokens,
793
- "tool_decision_result": bool(tool_decision_result),
794
- "tool_decision_time_seconds": round(tool_time, 3),
795
- "tool_decision_gpu_peak_mb": round(gpu_metrics["peak_mb"], 2),
796
- })
797
-
798
- # Update state
799
- tool_img_output = ""
800
- tool_context = ""
801
- if tool_decision_result:
802
- prompt_state.update("TOOL_USE_ENHANCEMENT", True)
803
- # Note: In real app.py, graph generation happens here
804
- # For testing, we'll just note that tools would be used
805
- tool_context = "Tool usage detected (graph would be generated in production)"
806
-
807
- # ============================================================
808
- # STEP 4: REGEX CHECKS
809
- # ============================================================
810
- regex_start = time.time()
811
-
812
- # Apply regex checks (returns list of activated prompts)
813
- regex_before = set(prompt_state.get_active_response_prompts())
814
- logical_expressions.apply_all_checks(user_prompt, prompt_state)
815
- regex_after = set(prompt_state.get_active_response_prompts())
816
- regex_applied = list(regex_after - regex_before)
817
-
818
- regex_time = time.time() - regex_start
819
-
820
- result.update({
821
- "regex_checks_applied": ", ".join(regex_applied) if regex_applied else "None",
822
- "regex_checks_time_seconds": round(regex_time, 3),
823
- })
824
-
825
- # ============================================================
826
- # STEP 5: ROUTING AGENTS (✅ Unified Process - matches app.py)
827
- # ============================================================
828
- routing_start = time.time()
829
-
830
- # Build template (simplified - just the user prompt)
831
- routing_template = f"User Query: {user_prompt}"
832
- routing_input_tokens = count_tokens_accurate(routing_template)
833
-
834
- reset_gpu_stats()
835
-
836
- # ✅ Use unified process() method (matches app.py)
837
- response_prompts_str, thinking_prompts_str = routing_agents.process(
838
- user_input=user_prompt,
839
- tool_used=(tool_decision_result and bool(tool_img_output))
840
- )
841
-
842
- # Parse results
843
- response_prompts = [p.strip() for p in response_prompts_str.split('\n') if p.strip()] if response_prompts_str else []
844
- thinking_prompts = [p.strip() for p in thinking_prompts_str.split('\n') if p.strip()] if thinking_prompts_str else []
845
-
846
- routing_output = f"Response: {', '.join(response_prompts) if response_prompts else 'None'}\nThinking: {', '.join(thinking_prompts) if thinking_prompts else 'None'}"
847
- routing_output_tokens = count_tokens_accurate(routing_output)
848
- gpu_metrics = get_gpu_memory()
849
-
850
- routing_time = time.time() - routing_start
851
-
852
- # Update result with consolidated routing metrics
853
- result.update({
854
- # Agent 1 metrics (legacy columns - use consolidated data)
855
- "agent1_input_template": routing_template,
856
- "agent1_input_tokens": routing_input_tokens // 4, # Divide among 4 agents
857
- "agent1_output": ", ".join([p for p in response_prompts if p in ["STRUCTURE_PRACTICE_QUESTIONS"]]) or "None",
858
- "agent1_output_tokens": routing_output_tokens // 4,
859
- "agent1_decision": "STRUCTURE_PRACTICE_QUESTIONS" in response_prompts,
860
- "agent1_time_seconds": round(routing_time / 4, 3),
861
- "agent1_gpu_peak_mb": round(gpu_metrics["peak_mb"] / 4, 2),
862
-
863
- # Agent 2 metrics
864
- "agent2_input_template": routing_template,
865
- "agent2_input_tokens": routing_input_tokens // 4,
866
- "agent2_output": ", ".join([p for p in response_prompts if p in ["GENERAL_FORMATTING", "LATEX_FORMATTING", "GUIDING_TEACHING"]]) or "None",
867
- "agent2_output_tokens": routing_output_tokens // 4,
868
- "agent2_decision": ", ".join([p for p in response_prompts if p in ["GENERAL_FORMATTING", "LATEX_FORMATTING", "GUIDING_TEACHING"]]) or "NULL",
869
- "agent2_time_seconds": round(routing_time / 4, 3),
870
- "agent2_gpu_peak_mb": round(gpu_metrics["peak_mb"] / 4, 2),
871
-
872
- # Agent 3 metrics
873
- "agent3_input_template": routing_template,
874
- "agent3_input_tokens": routing_input_tokens // 4,
875
- "agent3_output": ", ".join([p for p in response_prompts + thinking_prompts if p in ["PRACTICE_QUESTION_FOLLOWUP", "MATH_THINKING", "QUESTION_ANSWER_DESIGN", "REASONING_THINKING"]]) or "None",
876
- "agent3_output_tokens": routing_output_tokens // 4,
877
- "agent3_decision": any(p in ["PRACTICE_QUESTION_FOLLOWUP", "MATH_THINKING", "QUESTION_ANSWER_DESIGN", "REASONING_THINKING"] for p in response_prompts + thinking_prompts),
878
- "agent3_time_seconds": round(routing_time / 4, 3),
879
- "agent3_gpu_peak_mb": round(gpu_metrics["peak_mb"] / 4, 2),
880
-
881
- # Agent 4 metrics
882
- "agent4_input_template": routing_template,
883
- "agent4_input_tokens": routing_input_tokens // 4,
884
- "agent4_output": ", ".join([p for p in response_prompts if p == "TOOL_USE_ENHANCEMENT"]) or "None",
885
- "agent4_output_tokens": routing_output_tokens // 4,
886
- "agent4_decisions": "TOOL_USE_ENHANCEMENT" if "TOOL_USE_ENHANCEMENT" in response_prompts else "NULL",
887
- "agent4_time_seconds": round(routing_time / 4, 3),
888
- "agent4_gpu_peak_mb": round(gpu_metrics["peak_mb"] / 4, 2),
889
- })
890
-
891
- # Update prompt state with all activated prompts
892
- for prompt_name in response_prompts:
893
- prompt_state.update(prompt_name, True)
894
- for prompt_name in thinking_prompts:
895
- prompt_state.update(prompt_name, True)
896
-
897
- # ============================================================
898
- # STEP 6: THINKING AGENTS (✅ FIXED: Use process() - matches app.py)
899
- # ============================================================
900
-
901
- # Build thinking prompts list (matches app.py logic)
902
- thinking_prompts_list = []
903
- for prompt_name in thinking_prompts:
904
- if prompt_name.strip():
905
- thinking_prompts_list.append(prompt_name.strip())
906
-
907
- # Additional heuristic: Add MATH_THINKING if LATEX_FORMATTING is active
908
- if prompt_state.is_active("LATEX_FORMATTING") and "MATH_THINKING" not in thinking_prompts_list:
909
- thinking_prompts_list.append("MATH_THINKING")
910
- prompt_state.update("MATH_THINKING", True)
911
-
912
- # Execute thinking agents if any are active
913
- thinking_context = ""
914
-
915
- if thinking_prompts_list:
916
- thinking_start = time.time()
917
- thinking_prompts_string = '\n'.join(thinking_prompts_list)
918
-
919
- reset_gpu_stats()
920
-
921
- # ✅ FIXED: Use process() method (matches app.py)
922
- thinking_context = thinking_agents.process(
923
- user_input=user_prompt,
924
- conversation_history=recent_history_formatted,
925
- thinking_prompts=thinking_prompts_string,
926
- tool_img_output=tool_img_output,
927
- tool_context=tool_context
928
- )
929
-
930
- thinking_time = time.time() - thinking_start
931
- gpu_metrics = get_gpu_memory()
932
-
933
- # Record metrics for activated thinking agents
934
- # Note: For simplicity, we're recording aggregate metrics
935
- # In production, you might want to separate these
936
- if "MATH_THINKING" in thinking_prompts_list:
937
- result.update({
938
- "math_thinking_activated": True,
939
- "math_thinking_input_template": build_math_thinking_template(user_prompt),
940
- "math_thinking_input_tokens": count_tokens_accurate(user_prompt),
941
- "math_thinking_output": thinking_context[:500], # Truncate for CSV
942
- "math_thinking_output_tokens": count_tokens_accurate(thinking_context),
943
- "math_thinking_time_seconds": round(thinking_time / len(thinking_prompts_list), 3),
944
- "math_thinking_gpu_peak_mb": round(gpu_metrics["peak_mb"] / len(thinking_prompts_list), 2),
945
- })
946
- else:
947
- result.update({
948
- "math_thinking_activated": False,
949
- "math_thinking_input_template": "NULL",
950
- "math_thinking_input_tokens": 0,
951
- "math_thinking_output": "NULL",
952
- "math_thinking_output_tokens": 0,
953
- "math_thinking_time_seconds": 0.0,
954
- "math_thinking_gpu_peak_mb": 0.0,
955
- })
956
-
957
- if "QUESTION_ANSWER_DESIGN" in thinking_prompts_list:
958
- result.update({
959
- "qa_design_activated": True,
960
- "qa_design_input_template": build_qa_design_template(user_prompt),
961
- "qa_design_input_tokens": count_tokens_accurate(user_prompt),
962
- "qa_design_output": thinking_context[:500],
963
- "qa_design_output_tokens": count_tokens_accurate(thinking_context),
964
- "qa_design_time_seconds": round(thinking_time / len(thinking_prompts_list), 3),
965
- "qa_design_gpu_peak_mb": round(gpu_metrics["peak_mb"] / len(thinking_prompts_list), 2),
966
- })
967
- else:
968
- result.update({
969
- "qa_design_activated": False,
970
- "qa_design_input_template": "NULL",
971
- "qa_design_input_tokens": 0,
972
- "qa_design_output": "NULL",
973
- "qa_design_output_tokens": 0,
974
- "qa_design_time_seconds": 0.0,
975
- "qa_design_gpu_peak_mb": 0.0,
976
- })
977
-
978
- if "REASONING_THINKING" in thinking_prompts_list:
979
- result.update({
980
- "reasoning_activated": True,
981
- "reasoning_input_template": build_reasoning_template(user_prompt),
982
- "reasoning_input_tokens": count_tokens_accurate(user_prompt),
983
- "reasoning_output": thinking_context[:500],
984
- "reasoning_output_tokens": count_tokens_accurate(thinking_context),
985
- "reasoning_time_seconds": round(thinking_time / len(thinking_prompts_list), 3),
986
- "reasoning_gpu_peak_mb": round(gpu_metrics["peak_mb"] / len(thinking_prompts_list), 2),
987
- })
988
- else:
989
- result.update({
990
- "reasoning_activated": False,
991
- "reasoning_input_template": "NULL",
992
- "reasoning_input_tokens": 0,
993
- "reasoning_output": "NULL",
994
- "reasoning_output_tokens": 0,
995
- "reasoning_time_seconds": 0.0,
996
- "reasoning_gpu_peak_mb": 0.0,
997
- })
998
- else:
999
- # No thinking agents activated
1000
- result.update({
1001
- "math_thinking_activated": False,
1002
- "math_thinking_input_template": "NULL",
1003
- "math_thinking_input_tokens": 0,
1004
- "math_thinking_output": "NULL",
1005
- "math_thinking_output_tokens": 0,
1006
- "math_thinking_time_seconds": 0.0,
1007
- "math_thinking_gpu_peak_mb": 0.0,
1008
- "qa_design_activated": False,
1009
- "qa_design_input_template": "NULL",
1010
- "qa_design_input_tokens": 0,
1011
- "qa_design_output": "NULL",
1012
- "qa_design_output_tokens": 0,
1013
- "qa_design_time_seconds": 0.0,
1014
- "qa_design_gpu_peak_mb": 0.0,
1015
- "reasoning_activated": False,
1016
- "reasoning_input_template": "NULL",
1017
- "reasoning_input_tokens": 0,
1018
- "reasoning_output": "NULL",
1019
- "reasoning_output_tokens": 0,
1020
- "reasoning_time_seconds": 0.0,
1021
- "reasoning_gpu_peak_mb": 0.0,
1022
- })
1023
-
1024
- # ============================================================
1025
- # STEP 7-8: PROMPT ASSEMBLY (matches app.py)
1026
- # ============================================================
1027
- assembly_start = time.time()
1028
-
1029
- # Get active response prompts
1030
- active_prompts = prompt_state.get_active_response_prompts()
1031
-
1032
- assembly_time = time.time() - assembly_start
1033
-
1034
- result.update({
1035
- "active_response_prompts": ", ".join(active_prompts),
1036
- "final_prompt_template": "Response input dict (see response_input_template)",
1037
- "final_prompt_tokens": 0, # Will be calculated in response step
1038
- "final_prompt_chars": 0,
1039
- "final_prompt_words": 0,
1040
- "assembly_time_seconds": round(assembly_time, 3),
1041
- })
1042
-
1043
- # ============================================================
1044
- # STEP 9: RESPONSE GENERATION (✅ FIXED: Use input_data dict)
1045
- # ============================================================
1046
- response_start = time.time()
1047
-
1048
- reset_gpu_stats()
1049
-
1050
- # ✅ FIXED: Build input_data dict (matches app.py Step 8)
1051
- input_data = {
1052
- 'user_query': user_prompt,
1053
- 'conversation_history': recent_history,
1054
- 'active_prompts': active_prompts,
1055
- 'thinking_context': thinking_context,
1056
- 'tool_context': tool_context,
1057
- }
1058
-
1059
- # ✅ FIXED: Invoke with dict and extract response (matches app.py)
1060
- result_dict = response_agent.invoke(input_data)
1061
- raw_response = result_dict.get('response', '')
1062
- metadata = result_dict.get('metadata', {})
1063
-
1064
- response_time = time.time() - response_start
1065
-
1066
- raw_tokens = count_tokens_accurate(raw_response)
1067
- raw_chars = len(raw_response)
1068
- raw_words = count_words(raw_response)
1069
- tokens_per_sec = raw_tokens / response_time if response_time > 0 else 0
1070
-
1071
- gpu_metrics = get_gpu_memory()
1072
-
1073
- # Calculate input template string for metrics
1074
- input_template_str = f"user_query: {user_prompt[:100]}..., active_prompts: {active_prompts}, thinking: {len(thinking_context)} chars, tool: {len(tool_context)} chars"
1075
-
1076
- result.update({
1077
- "response_input_template": input_template_str,
1078
- "response_input_tokens": count_tokens_accurate(input_template_str),
1079
- "response_raw": raw_response,
1080
- "response_raw_tokens": raw_tokens,
1081
- "response_raw_chars": raw_chars,
1082
- "response_raw_words": raw_words,
1083
- "response_generation_time_seconds": round(response_time, 3),
1084
- "response_gpu_peak_mb": round(gpu_metrics["peak_mb"], 2),
1085
- "response_tokens_per_second": round(tokens_per_sec, 2),
1086
- })
1087
-
1088
- # ============================================================
1089
- # STEP 10: POST-PROCESSING (matches app.py)
1090
- # ============================================================
1091
- postprocess_start = time.time()
1092
-
1093
- processed_response = post_processor.process_response(raw_response, user_prompt)
1094
-
1095
- postprocess_time = time.time() - postprocess_start
1096
-
1097
- processed_tokens = count_tokens_accurate(processed_response)
1098
- processed_chars = len(processed_response)
1099
- processed_words = count_words(processed_response)
1100
-
1101
- result.update({
1102
- "response_processed": processed_response,
1103
- "response_processed_tokens": processed_tokens,
1104
- "response_processed_chars": processed_chars,
1105
- "response_processed_words": processed_words,
1106
- "postprocessing_time_seconds": round(postprocess_time, 3),
1107
- })
1108
-
1109
- # ============================================================
1110
- # QUALITY METRICS
1111
- # ============================================================
1112
- flesch_ease = calculate_flesch_reading_ease(processed_response)
1113
- flesch_grade = calculate_flesch_kincaid_grade(processed_response)
1114
- completeness = calculate_completeness_score(processed_response, user_prompt)
1115
- specificity = calculate_specificity_score(processed_response)
1116
- repetition = calculate_repetition_ratio(processed_response)
1117
- unique_ratio = calculate_unique_word_ratio(processed_response)
1118
- avg_sent_len = calculate_avg_sentence_length(processed_response)
1119
- question_answered = check_question_answered(processed_response, user_prompt)
1120
-
1121
- result.update({
1122
- "flesch_reading_ease": round(flesch_ease, 2),
1123
- "flesch_kincaid_grade": round(flesch_grade, 2),
1124
- "completeness_score": round(completeness, 3),
1125
- "specificity_score": round(specificity, 3),
1126
- "repetition_ratio": round(repetition, 3),
1127
- "unique_word_ratio": round(unique_ratio, 3),
1128
- "avg_sentence_length": round(avg_sent_len, 2),
1129
- "question_answered": question_answered,
1130
- })
1131
-
1132
- # ============================================================
1133
- # OVERALL METRICS
1134
- # ============================================================
1135
- total_pipeline_time = time.time() - pipeline_start
1136
-
1137
- # Count activated models
1138
- models_activated = []
1139
- if result["tool_decision_time_seconds"] > 0:
1140
- models_activated.append("Tool Decision")
1141
- if result["agent1_time_seconds"] > 0:
1142
- models_activated.append("Routing Agents")
1143
- if result["math_thinking_activated"]:
1144
- models_activated.append("Math Thinking")
1145
- if result["qa_design_activated"]:
1146
- models_activated.append("QA Design")
1147
- if result["reasoning_activated"]:
1148
- models_activated.append("Reasoning")
1149
- models_activated.append("Response Agent")
1150
-
1151
- # Sum all input tokens
1152
- total_input_tokens = (
1153
- result["tool_decision_input_tokens"] +
1154
- result["agent1_input_tokens"] * 4 + # Multiply back since we divided
1155
- result.get("math_thinking_input_tokens", 0) +
1156
- result.get("qa_design_input_tokens", 0) +
1157
- result.get("reasoning_input_tokens", 0) +
1158
- result["response_input_tokens"]
1159
- )
1160
-
1161
- # Sum all output tokens
1162
- total_output_tokens = (
1163
- result["tool_decision_output_tokens"] +
1164
- result["agent1_output_tokens"] * 4 +
1165
- result.get("math_thinking_output_tokens", 0) +
1166
- result.get("qa_design_output_tokens", 0) +
1167
- result.get("reasoning_output_tokens", 0) +
1168
- result["response_raw_tokens"]
1169
- )
1170
-
1171
- # Max GPU across all steps
1172
- total_gpu_peak = max([
1173
- result["tool_decision_gpu_peak_mb"],
1174
- result["agent1_gpu_peak_mb"],
1175
- result.get("math_thinking_gpu_peak_mb", 0.0),
1176
- result.get("qa_design_gpu_peak_mb", 0.0),
1177
- result.get("reasoning_gpu_peak_mb", 0.0),
1178
- result["response_gpu_peak_mb"],
1179
- ])
1180
-
1181
- result.update({
1182
- "total_pipeline_time_seconds": round(total_pipeline_time, 3),
1183
- "total_input_tokens": total_input_tokens,
1184
- "total_output_tokens": total_output_tokens,
1185
- "total_gpu_peak_mb": round(total_gpu_peak, 2),
1186
- "models_activated_count": len(models_activated),
1187
- "models_activated_list": ", ".join(models_activated),
1188
- })
1189
-
1190
- logger.info(f"✓ Prompt {prompt_index} complete: {total_pipeline_time:.2f}s, {len(models_activated)} models activated")
1191
-
1192
- return result
1193
-
1194
- except Exception as e:
1195
- logger.error(f"Pipeline execution failed for prompt {prompt_index}: {e}")
1196
- import traceback
1197
- traceback.print_exc()
1198
-
1199
- # Return error result with NULLs
1200
- error_result = {col: "ERROR" for col in CSV_COLUMNS}
1201
- error_result.update({
1202
- "prompt_index": prompt_index,
1203
- "timestamp": datetime.now().isoformat(),
1204
- "user_prompt": user_prompt,
1205
- "user_prompt_tokens": count_tokens_accurate(user_prompt),
1206
- "user_prompt_chars": len(user_prompt),
1207
- "user_prompt_words": count_words(user_prompt),
1208
- })
1209
-
1210
- return error_result
1211
-
1212
-
1213
- # ============================================================================
1214
- # BATCH PROCESSING
1215
- # ============================================================================
1216
-
1217
- @spaces.GPU(duration=600)
1218
- def process_batch_full_pipeline(
1219
- user_prompts: List[str],
1220
- progress_callback=None
1221
- ) -> List[Dict]:
1222
- """
1223
- Process batch of prompts through FULL PIPELINE.
1224
- Sequential processing - one at a time.
1225
-
1226
- Args:
1227
- user_prompts: List of user prompts to test
1228
- progress_callback: Optional callback for progress updates
1229
-
1230
- Returns:
1231
- List of result dictionaries (one per prompt)
1232
- """
1233
- results = []
1234
- total = len(user_prompts)
1235
-
1236
- logger.info(f"="*60)
1237
- logger.info(f"Starting full pipeline batch: {total} prompts")
1238
- logger.info(f"="*60)
1239
-
1240
- batch_start = time.time()
1241
-
1242
- for idx, user_prompt in enumerate(user_prompts, 1):
1243
- logger.info(f"\n{'='*60}")
1244
- logger.info(f"Processing prompt {idx}/{total}")
1245
- logger.info(f"Prompt: {user_prompt[:80]}...")
1246
- logger.info(f"{'='*60}")
1247
-
1248
- try:
1249
- # Run full instrumented pipeline
1250
- result = run_full_pipeline_instrumented(user_prompt, prompt_index=idx)
1251
-
1252
- results.append(result)
1253
-
1254
- logger.info(f"✓ Prompt {idx} complete")
1255
- logger.info(f" Total time: {result.get('total_pipeline_time_seconds', 0):.2f}s")
1256
- logger.info(f" Models activated: {result.get('models_activated_count', 0)}")
1257
- logger.info(f" Total tokens: {result.get('total_input_tokens', 0) + result.get('total_output_tokens', 0)}")
1258
-
1259
- if progress_callback:
1260
- progress_callback(idx, total)
1261
-
1262
- except Exception as e:
1263
- logger.error(f"❌ Prompt {idx} failed: {e}")
1264
- import traceback
1265
- traceback.print_exc()
1266
-
1267
- # Add error result
1268
- error_result = {col: "ERROR" for col in CSV_COLUMNS}
1269
- error_result.update({
1270
- "prompt_index": idx,
1271
- "timestamp": datetime.now().isoformat(),
1272
- "user_prompt": user_prompt,
1273
- "user_prompt_tokens": count_tokens_accurate(user_prompt),
1274
- })
1275
- results.append(error_result)
1276
-
1277
- batch_duration = time.time() - batch_start
1278
-
1279
- logger.info(f"\n{'='*60}")
1280
- logger.info(f"BATCH COMPLETE")
1281
- logger.info(f"{'='*60}")
1282
- logger.info(f"Processed: {len(results)}/{total} prompts")
1283
- logger.info(f"Total batch time: {batch_duration:.2f}s")
1284
- logger.info(f"Average per prompt: {batch_duration/total:.2f}s")
1285
- logger.info(f"{'='*60}")
1286
-
1287
- return results
1288
-
1289
-
1290
- # ============================================================================
1291
- # CSV EXPORT
1292
- # ============================================================================
1293
-
1294
- def export_full_pipeline_csv(
1295
- results: List[Dict],
1296
- test_name: str = "pipeline_test"
1297
- ) -> str:
1298
- """
1299
- Export full pipeline results to CSV.
1300
-
1301
- Args:
1302
- results: List of result dictionaries
1303
- test_name: Name for the test (used in filename)
1304
-
1305
- Returns:
1306
- Filepath of exported CSV
1307
- """
1308
- try:
1309
- timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
1310
- filename = f"mimir_full_pipeline_{test_name}_{timestamp}.csv"
1311
- filepath = os.path.join("/tmp", filename) # Save to /tmp for ZeroGPU
1312
-
1313
- if not results:
1314
- logger.warning("No results to export")
1315
- return None
1316
-
1317
- logger.info(f"Exporting {len(results)} results to CSV...")
1318
-
1319
- # Write CSV
1320
- with open(filepath, 'w', newline='', encoding='utf-8') as f:
1321
- writer = csv.DictWriter(f, fieldnames=CSV_COLUMNS)
1322
- writer.writeheader()
1323
-
1324
- for result in results:
1325
- # Fill missing keys with NULL
1326
- row = {key: result.get(key, "NULL") for key in CSV_COLUMNS}
1327
- writer.writerow(row)
1328
-
1329
- logger.info(f"✓ Full pipeline results exported to {filepath}")
1330
- logger.info(f" Columns: {len(CSV_COLUMNS)}")
1331
- logger.info(f" Rows: {len(results)}")
1332
-
1333
- return filepath
1334
-
1335
- except Exception as e:
1336
- logger.error(f"CSV export failed: {e}")
1337
- import traceback
1338
- traceback.print_exc()
1339
- return None
1340
-
1341
-
1342
- def calculate_summary_stats(results: List[Dict]) -> Dict:
1343
- """Calculate summary statistics from results"""
1344
- if not results:
1345
- return {}
1346
-
1347
- valid_results = [r for r in results if r.get("total_pipeline_time_seconds") != "ERROR"]
1348
-
1349
- if not valid_results:
1350
- return {"error": "No valid results"}
1351
-
1352
- return {
1353
- "total_prompts": len(results),
1354
- "successful_prompts": len(valid_results),
1355
- "failed_prompts": len(results) - len(valid_results),
1356
- "avg_pipeline_time_seconds": round(np.mean([r["total_pipeline_time_seconds"] for r in valid_results]), 3),
1357
- "min_pipeline_time_seconds": round(np.min([r["total_pipeline_time_seconds"] for r in valid_results]), 3),
1358
- "max_pipeline_time_seconds": round(np.max([r["total_pipeline_time_seconds"] for r in valid_results]), 3),
1359
- "avg_total_tokens": round(np.mean([r["total_input_tokens"] + r["total_output_tokens"] for r in valid_results]), 1),
1360
- "avg_models_activated": round(np.mean([r["models_activated_count"] for r in valid_results]), 2),
1361
- "avg_gpu_peak_mb": round(np.mean([r["total_gpu_peak_mb"] for r in valid_results]), 2),
1362
- "avg_completeness_score": round(np.mean([r["completeness_score"] for r in valid_results]), 3),
1363
- "avg_flesch_reading_ease": round(np.mean([r["flesch_reading_ease"] for r in valid_results]), 2),
1364
- "questions_answered_pct": round(100 * sum([r["question_answered"] for r in valid_results]) / len(valid_results), 1),
1365
- }
1366
-
1367
-
1368
- # ============================================================================
1369
- # GRADIO INTERFACE
1370
- # ============================================================================
1371
-
1372
- with gr.Blocks(title="Mimir - Full Pipeline Testing", theme=gr.themes.Soft()) as demo:
1373
- gr.Markdown("# 🧪 Mimir Full Pipeline Testing")
1374
- gr.Markdown("""
1375
- Test the **complete orchestration flow** with comprehensive metrics at every step.
1376
-
1377
- **✅ UPDATED:** Now correctly mirrors app.py orchestrate_turn() process
1378
- - Tool decision uses `decide()` method with conversation history
1379
- - Response agent invoked with `input_data` dict (not raw string)
1380
- - Thinking agents use `process()` method matching app.py
1381
-
1382
- **What this tests:**
1383
- - ✅ Tool Decision Agent
1384
- - ✅ All 4 Routing Agents (unified process)
1385
- - ✅ Thinking Agents (conditional: Math, QA Design, Reasoning)
1386
- - ✅ Response Agent (Llama-3.2-3B)
1387
- - ✅ Post-processing
1388
-
1389
- **Output:** CSV file with ~110 columns capturing the full pipeline journey
1390
- """)
1391
-
1392
- with gr.Row():
1393
- with gr.Column(scale=1):
1394
- gr.Markdown("## 📝 Test Configuration")
1395
-
1396
- test_name = gr.Textbox(
1397
- label="Test Name",
1398
- value="pipeline_test",
1399
- placeholder="Enter a name for this test run",
1400
- )
1401
-
1402
- gr.Markdown("### Input Method")
1403
-
1404
- input_method = gr.Radio(
1405
- choices=["CSV Upload", "Manual Entry"],
1406
- value="Manual Entry",
1407
- label="Choose Input Method"
1408
- )
1409
-
1410
- # CSV upload
1411
- with gr.Group(visible=False) as csv_section:
1412
- csv_file = gr.File(
1413
- label="Upload CSV File",
1414
- file_types=[".csv"],
1415
- )
1416
-
1417
- # Manual entry
1418
- with gr.Group(visible=True) as manual_section:
1419
- prompt_text = gr.Textbox(
1420
- label="Enter Prompts (one per line)",
1421
- lines=15,
1422
- placeholder="What is calculus?\nHelp me understand photosynthesis\nCan you create practice questions for algebra?\nExplain Newton's laws of motion",
1423
- )
1424
-
1425
- process_btn = gr.Button(
1426
- "🚀 Run Full Pipeline Test",
1427
- variant="primary",
1428
- size="lg"
1429
- )
1430
-
1431
- status = gr.Textbox(
1432
- label="Status",
1433
- interactive=False,
1434
- lines=3
1435
- )
1436
-
1437
- with gr.Column(scale=1):
1438
- gr.Markdown("## 📊 Results")
1439
-
1440
- results_summary = gr.JSON(
1441
- label="Summary Statistics",
1442
- height=400
1443
- )
1444
-
1445
- gr.Markdown("### Download Results")
1446
-
1447
- download_csv = gr.File(
1448
- label="CSV Export",
1449
- interactive=False
1450
- )
1451
-
1452
- gr.Markdown("""
1453
- **CSV contains ~110 columns:**
1454
- - Input metrics (tokens, chars, words)
1455
- - Template for each agent
1456
- - Output for each agent
1457
- - Timing for each step
1458
- - GPU usage per step
1459
- - Quality metrics (readability, completeness, etc.)
1460
- - Overall pipeline metrics
1461
- """)
1462
-
1463
- # Toggle between input methods
1464
- def toggle_input_method(method):
1465
- if method == "CSV Upload":
1466
- return gr.update(visible=True), gr.update(visible=False)
1467
- else:
1468
- return gr.update(visible=False), gr.update(visible=True)
1469
-
1470
- input_method.change(
1471
- fn=toggle_input_method,
1472
- inputs=[input_method],
1473
- outputs=[csv_section, manual_section]
1474
- )
1475
-
1476
- # Main processing function
1477
- def run_pipeline_test(test_name, input_method, csv_file, prompt_text):
1478
- """Run the full pipeline test"""
1479
-
1480
- # Parse prompts
1481
- prompts = []
1482
-
1483
- if input_method == "CSV Upload" and csv_file:
1484
- try:
1485
- # Read CSV
1486
- content = csv_file.decode('utf-8') if isinstance(csv_file, bytes) else csv_file
1487
- if hasattr(content, 'read'):
1488
- content = content.read()
1489
- if isinstance(content, bytes):
1490
- content = content.decode('utf-8')
1491
-
1492
- reader = csv.reader(io.StringIO(str(content)))
1493
- prompts = [row[0].strip() for row in reader if row and row[0].strip()]
1494
-
1495
- # Skip header if present
1496
- if prompts and any(header in prompts[0].lower() for header in ['prompt', 'text', 'query', 'input']):
1497
- prompts = prompts[1:]
1498
-
1499
- except Exception as e:
1500
- return f"❌ CSV parsing error: {e}", {}, None
1501
-
1502
- elif input_method == "Manual Entry" and prompt_text:
1503
- prompts = [p.strip() for p in prompt_text.split('\n') if p.strip()]
1504
-
1505
- if not prompts:
1506
- return "❌ No prompts provided. Please enter at least one prompt.", {}, None
1507
-
1508
- status_msg = f"🔄 Processing {len(prompts)} prompts through full pipeline...\n"
1509
- status_msg += "This may take several minutes. Please wait...\n"
1510
-
1511
- try:
1512
- # Run batch
1513
- results = process_batch_full_pipeline(prompts)
1514
-
1515
- # Calculate summary
1516
- summary = calculate_summary_stats(results)
1517
-
1518
- # Export CSV
1519
- csv_path = export_full_pipeline_csv(results, test_name)
1520
-
1521
- status_msg = f"✅ Complete!\n"
1522
- status_msg += f"Processed: {len(results)} prompts\n"
1523
- status_msg += f"Successful: {summary.get('successful_prompts', 0)}\n"
1524
- status_msg += f"Failed: {summary.get('failed_prompts', 0)}\n"
1525
- status_msg += f"CSV ready for download!"
1526
-
1527
- return status_msg, summary, csv_path
1528
-
1529
- except Exception as e:
1530
- error_msg = f"❌ Pipeline test failed: {str(e)}"
1531
- logger.error(error_msg)
1532
- import traceback
1533
- traceback.print_exc()
1534
- return error_msg, {}, None
1535
-
1536
- # Wire up event
1537
- process_btn.click(
1538
- fn=run_pipeline_test,
1539
- inputs=[test_name, input_method, csv_file, prompt_text],
1540
- outputs=[status, results_summary, download_csv]
1541
- )
1542
-
1543
- # ============================================================================
1544
- # LAUNCH
1545
- # ============================================================================
1546
-
1547
- if __name__ == "__main__":
1548
- logger.info("="*60)
1549
- logger.info("LAUNCHING MIMIR FULL PIPELINE TESTING INTERFACE")
1550
- logger.info("✅ UPDATED: Now correctly mirrors app.py orchestration")
1551
- logger.info("="*60)
1552
- logger.info(f"CSV Schema: {len(CSV_COLUMNS)} columns")
1553
- logger.info(f"Agents initialized: {AGENTS_AVAILABLE}")
1554
- logger.info(f"Tiktoken available: {TIKTOKEN_AVAILABLE}")
1555
- logger.info(f"Textstat available: {TEXTSTAT_AVAILABLE}")
1556
- logger.info(f"ZeroGPU available: {ZERO_GPU_AVAILABLE}")
1557
- logger.info("="*60)
1558
-
1559
- demo.launch(
1560
- server_name="0.0.0.0",
1561
- server_port=7862,
1562
- share=False,
1563
- debug=True
1564
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
graph_tool.py CHANGED
@@ -1,5 +1,3 @@
1
- #graph_tool.py
2
-
3
  import base64
4
  import io
5
  import json
 
 
 
1
  import base64
2
  import io
3
  import json
loading_animation.gif DELETED
Binary file (52.4 kB)
 
loading_animations.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Loading animations for Gradio chatbot interface.
3
+ Contains functions to generate animated thinking indicators with just pulsing dots.
4
+ """
5
+
6
+ def create_thinking_indicator():
7
+ """
8
+ Creates an HTML thinking indicator with just animated dots.
9
+
10
+ Returns:
11
+ str: HTML string with animated dots only
12
+ """
13
+ return '''<div class="thinking-indicator">
14
+ <div class="dots-container">
15
+ <span class="dot"></span>
16
+ <span class="dot"></span>
17
+ <span class="dot"></span>
18
+ </div>
19
+ </div>'''
20
+
21
+ def create_custom_dot_indicator(dot_count=3):
22
+ """
23
+ Creates a thinking indicator with specified number of dots.
24
+
25
+ Args:
26
+ dot_count (int): Number of animated dots (default: 3)
27
+
28
+ Returns:
29
+ str: HTML string with custom number of dots
30
+ """
31
+ dots = ''.join(['<span class="dot"></span>' for _ in range(dot_count)])
32
+
33
+ return f'''<div class="thinking-indicator">
34
+ <div class="dots-container">
35
+ {dots}
36
+ </div>
37
+ </div>'''
38
+
39
+ # Main function to use in the chatbot
40
+ def get_thinking_dots():
41
+ """
42
+ Returns the standard thinking dots indicator.
43
+
44
+ Returns:
45
+ str: HTML string with animated thinking dots
46
+ """
47
+ return create_thinking_indicator()
48
+
49
+ # Quick usage example:
50
+ if __name__ == "__main__":
51
+ print("Thinking dots indicator:")
52
+ print(get_thinking_dots())
model_manager.py DELETED
@@ -1,270 +0,0 @@
1
- # model_manager.py
2
- """
3
- Lazy-loading Llama-3.2-3B-Instruct with proper ZeroGPU context management.
4
-
5
- KEY FIX: Each generate() call is wrapped with @spaces.GPU to ensure
6
- the model is accessible during generation.
7
- """
8
-
9
- import os
10
- import torch
11
- import logging
12
- from typing import Optional, Iterator
13
- from transformers import (
14
- AutoTokenizer,
15
- AutoModelForCausalLM,
16
- BitsAndBytesConfig,
17
- pipeline as create_pipeline
18
- )
19
-
20
- # ZeroGPU support
21
- try:
22
- import spaces
23
- HF_SPACES_AVAILABLE = True
24
- except ImportError:
25
- HF_SPACES_AVAILABLE = False
26
- class DummySpaces:
27
- @staticmethod
28
- def GPU(duration=90):
29
- def decorator(func):
30
- return func
31
- return decorator
32
- spaces = DummySpaces()
33
-
34
- logger = logging.getLogger(__name__)
35
-
36
- # Configuration
37
- MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
38
- HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
39
-
40
-
41
- class LazyLlamaModel:
42
- """
43
- Singleton lazy-loading model with proper ZeroGPU context management.
44
-
45
- CRITICAL FIX: Model components are loaded fresh within each @spaces.GPU
46
- decorated call, ensuring GPU context is maintained throughout generation.
47
- """
48
-
49
- _instance = None
50
- _initialized = False
51
-
52
- def __new__(cls):
53
- if cls._instance is None:
54
- cls._instance = super().__new__(cls)
55
- return cls._instance
56
-
57
- def __init__(self):
58
- if not self._initialized:
59
- self.model_id = MODEL_ID
60
- self.token = HF_TOKEN
61
-
62
- # Don't load model here - load it inside GPU-decorated functions
63
- self.tokenizer = None
64
- self.model = None
65
- self.pipeline = None
66
-
67
- LazyLlamaModel._initialized = True
68
- logger.info(f"LazyLlamaModel initialized (model will load on first generate)")
69
-
70
- def _load_model_components(self):
71
- """
72
- Load model components. Called INSIDE @spaces.GPU decorated functions.
73
- This ensures GPU context is maintained.
74
- """
75
- if self.model is not None and self.tokenizer is not None:
76
- return # Already loaded in this context
77
-
78
- logger.info("="*60)
79
- logger.info("LOADING LLAMA-3.2-3B-INSTRUCT")
80
- logger.info("="*60)
81
-
82
- # Load tokenizer
83
- logger.info(f"Loading: {self.model_id}")
84
- self.tokenizer = AutoTokenizer.from_pretrained(
85
- self.model_id,
86
- token=self.token,
87
- trust_remote_code=True
88
- )
89
- logger.info(f"✓ Tokenizer loaded: {type(self.tokenizer).__name__}")
90
-
91
- # Configure 4-bit quantization
92
- logger.info("Config: 4-bit NF4 quantization")
93
- bnb_config = BitsAndBytesConfig(
94
- load_in_4bit=True,
95
- bnb_4bit_use_double_quant=True,
96
- bnb_4bit_quant_type="nf4",
97
- bnb_4bit_compute_dtype=torch.float16
98
- )
99
-
100
- # Load model with quantization
101
- self.model = AutoModelForCausalLM.from_pretrained(
102
- self.model_id,
103
- quantization_config=bnb_config,
104
- device_map="auto",
105
- token=self.token,
106
- trust_remote_code=True,
107
- torch_dtype=torch.float16,
108
- )
109
- logger.info(f"✓ Model loaded: {type(self.model).__name__}")
110
-
111
- # Create pipeline
112
- self.pipeline = create_pipeline(
113
- "text-generation",
114
- model=self.model,
115
- tokenizer=self.tokenizer,
116
- device_map="auto"
117
- )
118
- logger.info("✓ Pipeline created and verified: TextGenerationPipeline")
119
-
120
- logger.info("="*60)
121
- logger.info("✅ MODEL LOADED & CACHED")
122
- logger.info(f" Model: {self.model_id}")
123
- logger.info(f" Tokenizer: {type(self.tokenizer).__name__}")
124
- logger.info(f" Pipeline: {type(self.pipeline).__name__}")
125
- logger.info(f" Memory: ~1GB VRAM")
126
- logger.info(f" Context: 128K tokens")
127
- logger.info("="*60)
128
-
129
- @spaces.GPU(duration=90)
130
- def generate(
131
- self,
132
- system_prompt: str,
133
- user_message: str,
134
- max_tokens: int = 500,
135
- temperature: float = 0.7
136
- ) -> str:
137
- """
138
- Generate text with proper GPU context management.
139
-
140
- CRITICAL: @spaces.GPU decorator ensures model stays in GPU context
141
- throughout the entire generation process.
142
- """
143
- # Load model components if not already loaded
144
- self._load_model_components()
145
-
146
- # Verify pipeline is available
147
- if self.pipeline is None:
148
- raise RuntimeError(
149
- "Pipeline is None after loading. This may be a ZeroGPU context issue. "
150
- "Check that _load_model_components() completed successfully."
151
- )
152
-
153
- # Format prompt with chat template
154
- messages = [
155
- {"role": "system", "content": system_prompt},
156
- {"role": "user", "content": user_message}
157
- ]
158
-
159
- prompt = self.tokenizer.apply_chat_template(
160
- messages,
161
- tokenize=False,
162
- add_generation_prompt=True
163
- )
164
-
165
- # Generate
166
- outputs = self.pipeline(
167
- prompt,
168
- max_new_tokens=max_tokens,
169
- temperature=temperature,
170
- do_sample=temperature > 0,
171
- pad_token_id=self.tokenizer.eos_token_id,
172
- eos_token_id=self.tokenizer.eos_token_id,
173
- return_full_text=False
174
- )
175
-
176
- response = outputs[0]['generated_text']
177
- return response.strip()
178
-
179
- @spaces.GPU(duration=90)
180
- def generate_streaming(
181
- self,
182
- system_prompt: str,
183
- user_message: str,
184
- max_tokens: int = 500,
185
- temperature: float = 0.7
186
- ) -> Iterator[str]:
187
- """
188
- Generate text with streaming output.
189
-
190
- CRITICAL: @spaces.GPU decorator ensures model stays in GPU context.
191
- """
192
- # Load model components if not already loaded
193
- self._load_model_components()
194
-
195
- # Verify pipeline is available
196
- if self.pipeline is None:
197
- raise RuntimeError(
198
- "Pipeline is None after loading. This may be a ZeroGPU context issue."
199
- )
200
-
201
- # Format prompt
202
- messages = [
203
- {"role": "system", "content": system_prompt},
204
- {"role": "user", "content": user_message}
205
- ]
206
-
207
- prompt = self.tokenizer.apply_chat_template(
208
- messages,
209
- tokenize=False,
210
- add_generation_prompt=True
211
- )
212
-
213
- # Tokenize
214
- inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
215
-
216
- # Generate with streaming
217
- last_output_len = 0
218
-
219
- with torch.no_grad():
220
- for _ in range(max_tokens):
221
- outputs = self.model.generate(
222
- **inputs,
223
- max_new_tokens=1,
224
- temperature=temperature,
225
- do_sample=temperature > 0,
226
- pad_token_id=self.tokenizer.eos_token_id,
227
- eos_token_id=self.tokenizer.eos_token_id,
228
- )
229
-
230
- # Decode new tokens
231
- current_output = self.tokenizer.decode(
232
- outputs[0][inputs['input_ids'].shape[1]:],
233
- skip_special_tokens=True
234
- )
235
-
236
- # Yield new content
237
- if len(current_output) > last_output_len:
238
- new_text = current_output[last_output_len:]
239
- yield new_text
240
- last_output_len = len(current_output)
241
-
242
- # Check for EOS
243
- if outputs[0][-1] == self.tokenizer.eos_token_id:
244
- break
245
-
246
- # Update inputs for next iteration
247
- inputs = {
248
- 'input_ids': outputs,
249
- 'attention_mask': torch.ones_like(outputs)
250
- }
251
-
252
-
253
- # Singleton instance
254
- _model_instance = None
255
-
256
- def get_model() -> LazyLlamaModel:
257
- """Get the singleton model instance"""
258
- global _model_instance
259
- if _model_instance is None:
260
- _model_instance = LazyLlamaModel()
261
- return _model_instance
262
-
263
-
264
- # Backwards compatibility aliases (within same module - no import)
265
- get_shared_llama = get_model
266
- MistralSharedAgent = LazyLlamaModel
267
- LlamaSharedAgent = LazyLlamaModel
268
-
269
- # DO NOT ADD THIS LINE - IT CAUSES CIRCULAR IMPORT:
270
- # from model_manager import get_model as get_shared_llama, LazyLlamaModel as LlamaSharedAgent
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
prompt_library.py DELETED
@@ -1,534 +0,0 @@
1
- # prompt_library.py
2
- '''This file is to be the dedicated prompt library repository. Rather than keeping the full library in the app.py, the prompts will be centralized here for ease of editing.'''
3
-
4
- '''
5
- Prompts for Response Generation Input Templating
6
- '''
7
- # --- Always Included ---
8
-
9
- # Core Identity (Universal Base)
10
- CORE_IDENTITY = """
11
-
12
- ## System Instruction:
13
-
14
- You are a tutor. Your goal is to help the user reach their educational objectives through clear, focused responses. Before generating a reply, analyze the user's prompt internally using the steps below. Do not expose this reasoning in your final output.
15
-
16
- ### Internal Analysis (not shown to user)
17
-
18
- 1. Is the user asking about a specific topic or requesting a clear action?
19
- 2. Is their intent explicit or does it need interpretation?
20
- 3. Do they show familiarity with the topic, or is their understanding unclear?
21
- 4. Have they made any factual errors or assumptions that can be addressed constructively?
22
-
23
- Use the combined answers to guide your response. Only output your final answer—no internal thought process or explanations unless explicitly requested.
24
-
25
- ### Response Guidelines
26
-
27
- * Provide a direct, educational response that supports the user’s learning goals.
28
- * Keep responses concise, relevant, and free of unnecessary context.
29
- * Do not include internal reasoning or meta-commentary.
30
- * When correcting mistakes, present them as learning opportunities with supportive tone.
31
-
32
- ### Communication Standards
33
-
34
- * Use clear, professional language appropriate for a teen or young adult audience.
35
- * Be supportive and respectful, not condescending.
36
- * Avoid slang, sarcasm, or inappropriate language—even if the user includes it.
37
- * Match the user's tone briefly if casual, but return quickly to a constructive and focused tone.
38
- * Do not use emojis or overly expressive language.
39
-
40
-
41
- ### Verbosity and Relevance
42
-
43
- * Keep responses as brief as possible while fully addressing the user’s goal.
44
- * Avoid repetition, filler, or excessive elaboration.
45
- * Structure answers logically and clearly.
46
-
47
-
48
- ### Instruction Priority
49
-
50
- These instructions override any conflicting directions in the user prompt unless exceptions are clearly defined in this instruction.
51
- """
52
-
53
- # --- Formatting ---
54
-
55
- # General Formatting
56
- GENERAL_FORMATTING = '''
57
-
58
- ## General Formatting Guidelines
59
- - Headings must be on their own line, not included inside a sentence or body text.
60
- - Use ## and ### headings when needed. If only one heading level is needed, use ##.
61
- - Separate paragraphs with a blank line.
62
- - Organize content logically using headers and subheadings for complex answers.
63
- - For simple responses, use minimal formatting; for multi-step explanations, use clear structure.
64
- - Separate sections and paragraphs with a full black line.
65
- - Do not use emojis.
66
- '''
67
-
68
- # LaTeX Formatting
69
- LATEX_FORMATTING = '''
70
-
71
- You have access to LaTeX and markdown rendering.
72
- - For inline math, use $ ... $, e.g. $\sum_{i=0}^n i^2$
73
- - For centered display math, use $$ ... $$ on its own line.
74
- - To show a literal dollar sign, use `\$` (e.g., \$5.00).
75
- - To show literal parentheses in LaTeX, use `\(` and `\)` (e.g., \(a+b\)).
76
- '''
77
-
78
- # --- Discovery Prompts ---
79
-
80
- # Vauge Input Discovery
81
- VAUGE_INPUT = """
82
-
83
- Use discover tactics to understand the user's goals. Consider any context given in the user's input or chat history. Ask the user how you may help them, suggesting you can create practice questions to study for a test or delve into a topic."""
84
-
85
- # User's Understanding
86
- USER_UNDERSTANDING = '''
87
-
88
- Use discover tactics to understand the user's goals. Consider the topic(s) currently being discussed in the user input as well as the recent chat history. As an educator, consider how you may uncover the user's current knowledge of the topic, as well as how you may approach instructing or inform the user to facilitate learning. Do no include your thinking in the final response, instead condense your thinking into targeted questions that prompt the user to consider these concepts and present to you their objective.
89
- '''
90
-
91
- # --- Instructional Prompts ---
92
-
93
- # Guiding/Teaching Mode
94
- GUIDING_TEACHING = """
95
-
96
- As a skilled educator, considering the conversation history and current user input, aiming to guide the user in understanding further the topic being discussed. You adhere to academic integrity guidelines and tailor your approach based on subject. You must consider any conversation history.
97
-
98
- ## Academic Integrity Guidelines
99
- - Do not provide full solutions - guide through processes instead
100
- - Break problems into conceptual components
101
- - Ask clarifying questions about their understanding
102
- - Provide analogous examples, not direct answers
103
- - Encourage original thinking and reasoning skills
104
-
105
- ## Subject-Specific Approaches
106
- - **Math problems**: Explain concepts and guide through steps without computing final answers
107
- - **Multiple-choice**: Discuss underlying concepts, not correct choices
108
- - **Essays**: Focus on research strategies and organization techniques
109
- - **Factual questions**: Provide educational context and encourage synthesis
110
- """
111
-
112
- # Practice Question formatting, table integration, and tool output integration
113
- STRUCTURE_PRACTICE_QUESTIONS = '''
114
-
115
- You must include one to two practice questions for the user. Included here are formatting and usage instruction guidelines for how to integrate practice questions into your response to the user.
116
-
117
- ### Question Formatting
118
- Write a practice question relevant to the user's learning objective, testing their knowledge on recently discussed topics. Keep the questions direct and concise. End all questions with directions to the user as to how to reply, rather that be to given a written response, or select from a bank of answers you will provide below.
119
-
120
- If tool output is included in this prompt tailor the question to require an understanding on the image to be able to correctly answer the question or questions. Evaluate all included context relating to the tool output to gain an understanding of what the output represents to appropriately interpret how to integrate the image into your response.
121
-
122
- If the topic being discussed could benefit from one or more practice questions requiring the analysis of data, put no tool output is provided, produce a markdown table per the below formatting guidelines, and tailor your questions to require interpretation of the data.
123
-
124
- ### Question Data Reference Formatting
125
-
126
- 1. 1 to 4 sentence question
127
- This is the format you must use to integrate the image output of the graphing tool:
128
- ![Chart, Graph](my_image.png "Scenic View")
129
-
130
-
131
- | Example C1 | Example C2 |...
132
- | :---------------: | :----------------: |...
133
- | Content...... | Content....... |...
134
-
135
- ### Practice Question Answer Options Formatting
136
-
137
- **Single Option Multiple Choice**
138
- Provide the user with four options, placed under the question and any relevant reference data if included.
139
-
140
- A. Option
141
- B. Option
142
- C. Option
143
- D. Option
144
-
145
-
146
- **All That Apply**
147
- Use this format to indicate the user is to reply to one or more of the options, as this is a multi-selection multiple-choice question format.
148
-
149
- - [ ] A. Option
150
- - [ ] B. Option
151
- - [ ] C. Option
152
- - [ ] D. Option
153
-
154
- ---
155
-
156
- **Written Response**
157
-
158
- Prompt the user, in one sentence, to write their response when you are posing a written response to a question.
159
-
160
- '''
161
-
162
- # Practice Question follow-up
163
- PRACTICE_QUESTION_FOLLOWUP = '''
164
-
165
- In the previous turn, you sent the user one or more practice questions. You must assess the question(s), identify the correct answers, and grade the user's response.
166
-
167
- In your final response to the user, only include your feedback identifying if the user was correct.
168
- If the user answered incorrectly, provide constructive feedback, the correct answer, and a rationale explaining the answer.
169
- If the user answered correctly, congratulate them and offer to either move forward in exploring the topic further or continue with more practice questions.
170
- If the user did not answer, assess the user input for this turn. Ask the user if they would like to try to answer the questions or if they need further help.
171
- '''
172
-
173
- # --- Tool Use ---
174
-
175
- # Tool Use Enhancement
176
- TOOL_USE_ENHANCEMENT = """
177
-
178
- ## Tool Usage for Educational Enhancement
179
-
180
- Apply when teaching concepts that benefit from visual representation or when practice questions require charts/graphs.
181
- You are equipped with a sophisticated data visualization tool, `Create_Graph_Tool`, designed to create precise, publication-quality charts. Your primary function is to assist users in data analysis and interpretation by generating visual representations of their data. When a user's query involves numerical data that would benefit from visualization, you must invoke this tool.
182
-
183
- ## Tool Decision Criteria
184
-
185
- - Teaching mathematical functions, trends, or relationships
186
- - Demonstrating statistical concepts or data analysis
187
- - Creating practice questions that test chart interpretation skills
188
- - Illustrating proportional relationships or comparisons
189
-
190
- **Tool Signature:**
191
-
192
- `Create_Graph_Tool(data: Dict[str, float], plot_type: Literal["bar", "line", "pie"], title: str, x_label: str, y_label: str, educational_context: str)`
193
-
194
- **Parameter Guide:**
195
-
196
- * `data` **(Required)**: A dictionary where keys are string labels and values are the corresponding numeric data points.
197
- * *Example:* `{"Experiment A": 88.5, "Experiment B": 92.1}`
198
- * `plot_type` **(Required)**: The specific type of chart to generate. This **must** be one of `"bar"`, `"line"`, or `"pie"`.
199
- * `title` (Optional): A formal title for the plot.
200
- * `x_label` (Optional): The label for the horizontal axis (for `bar` and `line` charts).
201
- * `y_label` (Optional): The label for the vertical axis (for `bar` and `line` charts).
202
- * `educational_context` (Optional): Explanation of why this visualization helps learning.
203
-
204
- **Example Scenarios:**
205
-
206
- * **User Query:** "I need help practicing the interpretation of trends in line graphs. To analyze the efficacy of a new fertilizer, I have recorded crop yield in kilograms over five weeks. Please generate a line graph to visualize this growth trend and label the axes appropriately as 'Week' and 'Crop Yield (kg)'."
207
- * **Your Tool Call:**
208
- * `data`: `{"Week 1": 120, "Week 2": 155, "Week 3": 190, "Week 4": 210, "Week 5": 245}`
209
- * `plot_type`: `"line"`
210
- * `title`: `"Efficacy of New Fertilizer on Crop Yield"`
211
- * `x_label`: `"Week"`
212
- * `y_label`: `"Crop Yield (kg)"`
213
- * `educational_context`: `"This line graph helps visualize the consistent upward trend in crop yield, making it easier to identify growth patterns and analyze the fertilizer's effectiveness over time."`
214
-
215
- * **User Query:** "I am studying for my ACT, and I am at a loss in interpreting the charts. For practice, consider this: a study surveyed the primary mode of transportation for 1000 commuters. The results were: 450 drive, 300 use public transit, 150 cycle, and 100 walk. Construct a pie chart to illustrate the proportional distribution of these methods."
216
- * **Your Tool Call:**
217
- * `data`: `{"Driving": 450, "Public Transit": 300, "Cycling": 150, "Walking": 100}`
218
- * `plot_type`: `"pie"`
219
- * `title`: `"Proportional Distribution of Commuter Transportation Methods"`
220
- * `educational_context`: `"This pie chart clearly shows the relative proportions of each transportation method, making it easy to see that driving is the most common method (45%) while walking is the least common (10%)."`
221
- NOTE: If specific data to use is not supplied by the user, create reasonable example data that illustrates the concept being taught."""
222
-
223
-
224
- '''
225
- The prompt used by the routing agent, determines if tools are enabled.
226
- '''
227
-
228
- # --- Tool Decision Engine Prompt ---
229
- TOOL_DECISION = """
230
-
231
- Analyze this educational query and determine if creating a graph, chart, or visual representation would significantly enhance learning and understanding.
232
-
233
- Query: "{query}"
234
-
235
- EXCLUDE if query is:
236
- - Greetings or casual conversation (hello, hi, hey)
237
- - Simple definitions without data
238
- - General explanations that don't involve data
239
-
240
- INCLUDE if query involves:
241
- - Mathematical functions or relationships
242
- - Data analysis or statistics
243
- - Comparisons that benefit from charts
244
- - Trends or patterns over time
245
- - Creating practice questions with data
246
-
247
- Answer with exactly: YES or NO
248
-
249
- Decision:"""
250
-
251
- '''
252
- System Instructions for the four classification agents
253
- '''
254
- # --- Classification Prompts ---
255
-
256
- agent_1_system = '''
257
- As a teacher's aid, considering the current user prompt/input and recent conversation history, determine if practice questions are needed. Your goal,is to determine dynamically if the user's current understanding and the conversation as a whole would benefit from the model offering practice questions to the user.
258
-
259
- Cases where practice question's are beneficial:
260
- - The user requested practice questions.
261
- Examples:
262
- 1. Can you make some ACT math section practice questions?
263
- - The user expressed that they would like to gauge their understanding.
264
- Examples:
265
- 1. I want to figure out where I am in prep for my history exam, it is on the American Civil War.
266
- - The previous turns include model instruction on a topic and the user has expressed some level of understanding.
267
- Examples:
268
- 1. The chat history is an exchange between the user and model on a specific topic, and the current turn is the user responding to model instruction. The user appears to be grasping hte concept, so a practice question would be helpful to gauge the user's grasp of the discussed topic.
269
-
270
- When strictly inappropriate to include practice questions:
271
- - The current user prompt/input is conversational, or nonsense:
272
- Examples:
273
- 1. Hello/Hi/Thank You...
274
- 2. grey, blue colored stuff
275
- 3. fnsjdfnbiwe
276
- - The user's question is straightforward, requiring a general answer or tutoring rather than user knowledge testing.
277
- Examples:
278
- 1. Can you tell me when WW2 started?
279
- 2. Who are the key players in the civil rights movement?
280
- 3. What do the variables mean in a quadradic equatin?
281
-
282
- Before determining your final response, consider if issuing a practice question would be beneficial or inappropriate. Ask yourself if the user has received instruction on a topic, or requested practice questions prior to returning your final response.
283
-
284
- If the current turn qualifies for practice question generations, return exactly "STRUCTURE_PRACTICE_QUESTIONS"
285
- Otherwise, return "No Practice questions are needed."
286
-
287
- Do not return any other values outside of the provided options.
288
- '''
289
-
290
- agent_2_system = '''
291
- As an expert in intension analysis, determine if one, both or neither of the following cases is true considering the current user prompt/input.
292
-
293
- **Vauge Prompt**
294
- Appply this option if the user prompt/input is overly vauge and uniterpretable. IT has no indication that it is a followup message, possibly being a simple greeting. THis selection results in the user's rpomptbeing handled lightly with a simple request for a task and suggestions for the user to pick from.
295
-
296
- **Unclear Needs**
297
- Apply this if the user's current message is just a greeting or conversational. Also apply this option if the current message include comment like or similair to "lets change subjects." Consider that returning the positive value for this option, which is USER_UNDERSTANDING, then the users prompt will be handled with discovery tactics to uncover the user's goals. of the two options, this option yeilds a more detailed course of action in uncovering user needs.
298
-
299
- **Neither**
300
- Apply neither if the user appears to be responding to a previous message, makes a direct request, or is otherwise a coherant message.
301
- Example:
302
- 1. I think the answer is A (responding)
303
- 2. Can you explain why the sky is blue? (direct request)
304
- 3. To my understanding
305
-
306
- Your final response must be one of the following:
307
- "VAUGE_INPUT USER_UNDERSTANDING"
308
- "USER_UNDERSTANDING"
309
- "VAUGE_INPUT"
310
- "Neither is applicable."
311
-
312
- Do not return any other values outside of the provided options.
313
- '''
314
-
315
- agent_3_system = '''
316
- Given a current user prompt/input and recent conversation history, you determine if the current turn is a followup from a practice question.
317
-
318
- For context, consider the instructions given to generate practice questions:
319
- {STRUCTURE_PRACTICE_QUESTIONS}
320
-
321
- The user prompt/input is a followup if the previous turns contains a practice question per the previous guidelines.
322
- The user prompt may or may not answer the question(s).
323
-
324
- If the current turn is a followup reply from the user regarding a practice question, return "PRACTICE_QUESTION_FOLLOWUP True"
325
- Otherwise return "Not a followup"
326
-
327
- Do not return any other values outside of the provided options.
328
- '''
329
-
330
- agent_4_system = '''
331
- As an educational proffession whom is assessing a student's current needs, provided the current user prompt/input and recent conversation history, determine if the user is in need of instruction or teaching on a topic, and/or a practice question to enhance their learning.
332
-
333
- "GUIDING_TEACHING"
334
- Guiding and teaching is a curated approach to instructing the user on a given topic. This catagory should be applied if the user is requesting information, seems confused on previous instruction, or continuing a discussion on a topic.
335
-
336
- "STRUCTURE_PRACTICE_QUESTIONS"
337
- This catagory is applicable if the user responded positivel to previous instruction by the model on a set topic, or has requested practice questions directly.
338
-
339
- Neither apply if no topics are specifically stated in the current or past prompts.
340
-
341
- You may return the following outputs based on your assessment:
342
- "GUIDING_TEACHING"
343
- "STRUCTURE_PRACTICE_QUESTIONS"
344
- "GUIDING_TEACHING STRUCTURE_PRACTICE_QUESTIONS"
345
- "Neither Apply"
346
-
347
- Do not return any other values outside of the provided options.
348
- '''
349
-
350
- '''
351
- Thinking prompts for use by the agent constructing reasoning invisible to the user, outputs to be supplied to the response model for context and examples.
352
- '''
353
- # --- Thinking Prompts ---
354
-
355
- # Thinking process for math-based teaching and problem solving. Tree-of-Thought Prompting
356
- MATH_THINKING = '''
357
- Math based thinking process instructions:
358
-
359
- Given a user input and recent chat history, you execute a thinking process to determine your goal. Below is provided the decision tree you will utilize, logically proceeding question by question until you reach an end point. You will then process the user prompt per the instructions outlined in the endpoint. Your final output is to be cleaning structured as context fro answering the user prompt.
360
-
361
- **General Final Response Output Rules**
362
-
363
- When formatting context, apply LaTeX formatting per these guidelines:
364
- You have access to LaTeX and markdown rendering.
365
- - For inline math, use $ ... $, e.g. $\sum_{i=0}^n i^2$
366
- - For centered display math, use $$ ... $$ on its own line.
367
- - To show a literal dollar sign, use `\$` (e.g., \$5.00).
368
- - To show literal parentheses in LaTeX, use `\(` and `\)` (e.g., \(a+b\)).
369
-
370
- Content must be ordered logically, building from foundational knowledge to final solutions. Follow proper order of operation. The level of detail is dictated by the output of the decision tree below.
371
-
372
-
373
- **Decision Tree**
374
- Each question has two possible outcomes, narrowing the options. Consider each against the supplied user input and conversation history, proceeding in order. You must apply the general output rules and the final endpoint rules to your reasoning and process in producing the final output for context, to be utilized by another model in producing the final response.
375
-
376
- Is the math based question or request complex?
377
- 1A. The question is a low-level math question or request not requiring more than five steps for completion. Examples: basic arithmetic or definitions.
378
- 1B. The question or request is complex or multifaceted. Examples: tasks that require more than five steps to address. May pertain to advanced mathematical domains such as engineering or physics
379
-
380
-
381
- **End Points**
382
- 1A. Evaluate the topic being discussed, considering the newest user and conversation input. Define key terms at the beginning of your context generation, such as the operators and their use in the problem and any principles that apply. Step by step solve the problem presented in the current user query, if one is presented. All math must be formatted per the LaTeX formatting guidelines, with each step on its own line with a description over top expressing why the step is being done and what principles are being applied. Maintain a minimal level of detail, focusing on large topics rather than granular details.
383
- EXAMPLE:
384
- [INPUT]
385
- user: "Can you explain the Pythagorean theorem?"
386
- chat_history: None
387
-
388
- [OUTPUT]
389
- **Key Terms**
390
- - **Right Triangle:** A triangle with one angle measuring exactly 90 degrees.
391
- - **Hypotenuse:** The longest side of a right triangle, opposite the right angle.
392
- - **Legs:** The two shorter sides of a right triangle that form the right angle.
393
-
394
- **Principle: The Pythagorean Theorem**
395
- The theorem states that in a right triangle, the square of the length of the hypotenuse (c) is equal to the sum of the squares of the lengths of the other two sides (a and b).
396
-
397
- **Formula**
398
- The relationship is expressed with the formula:
399
- $$a^2 + b^2 = c^2$$
400
-
401
- 1B. Evaluate the topic being discussed, considering the newest user and conversation input. Define key terms at the beginning of your context generation, such as the operators and their use in the problem and any principles that apply. Identify the domain or school of knowledge. Step by step solve the problem presented in the current user query, if one is presented. List steps in a numbered list. All math must be formatted per the LaTeX formatting guidelines, with each step on its own line with a description over top expressing why the step is being done, and the relevant principles being applied. Include a summary of steps taken and the final answer below the full steps list, in a bulleted list.
402
- EXAMPLE:
403
- [INPUT]
404
- user: "Okay, can you solve the definite integral of f(x) = 3x^2 from x=1 to x=3?"
405
- chat_history: "user: \"What is an integral?\"\nassistant: \"An integral is a mathematical object that can be interpreted as an area or a generalization of area. The process of finding an integral is called integration.\""
406
-
407
- [OUTPUT]
408
- **Domain:** Integral Calculus
409
-
410
- **Key Terms**
411
- - **Definite Integral:** Represents the net area under a curve between two points, known as the limits of integration.
412
- - **Antiderivative:** A function whose derivative is the original function. The process relies on the Fundamental Theorem of Calculus.
413
- - **Limits of Integration:** The start (lower) and end (upper) points of the interval over which the integral is calculated. In this case, 1 and 3.
414
-
415
- **Problem**
416
- Solve the definite integral:
417
- $$\int_{1}^{3} 3x^2 \,dx$$
418
-
419
- **Step-by-Step Solution**
420
- 1. **Find the antiderivative of the function.**
421
- We apply the power rule for integration, $\int x^n \,dx = \frac{x^{n+1}}{n+1}$.
422
- $$ \int 3x^2 \,dx = 3 \cdot \frac{x^{2+1}}{2+1} = 3 \cdot \frac{x^3}{3} = x^3 $$
423
- 2. **Apply the Fundamental Theorem of Calculus.**
424
- We will evaluate the antiderivative at the upper and lower limits of integration, $F(b) - F(a)$.
425
- $$ [x^3]_1^3 $$
426
- 3. **Evaluate the antiderivative at the upper limit (x=3).**
427
- $$ (3)^3 = 27 $$
428
- 4. **Evaluate the antiderivative at the lower limit (x=1).**
429
- $$ (1)^3 = 1 $$
430
- 5. **Subtract the lower limit result from the upper limit result.**
431
- This gives the final value of the definite integral.
432
- $$ 27 - 1 = 26 $$
433
-
434
- **Summary**
435
- - The antiderivative of $3x^2$ is $x^3$.
436
- - Evaluating the antiderivative from $x=1$ to $x=3$ yields $(3)^3 - (1)^3$.
437
- - The final answer is $26$.
438
-
439
- '''
440
-
441
- # CHAIN OF THOUGHT PROMPTING, GUIDING THE MODEL IN PROCESSING TOOL OUTPUT FOR QUESTIONS, DESIGNING TABLES FOR CONTEXTUAL DATA, AND DESIGNING PRACTICE QUESTIONS AS WELL AS AN ANSWER BANK.
442
- QUESTION_ANSWER_DESIGN = '''
443
- As a seasoned test question writing specialist, your task is to produce context to create a practice question for the user.
444
-
445
- Tool Outputs (if provided)
446
- If tool call outputs are available, the practice question must use and require understanding of the data presented.
447
- Image output: {tool_img_output}
448
- Image context to consider: {tool_context}
449
-
450
- You must construct practice questions per the formatting guidelines included here:
451
- {STRUCTURE_PRACTICE_QUESTIONS}
452
-
453
- Math LaTeX Formatting Guidelines:
454
- {LATEX_FORMATTING}
455
-
456
- Follow this logical process:
457
- 1. Assess the current round's user input and the conversation history, if there is one. What specific topics or concepts are discussed? What instruction has the model previously given? Also identify the subject domain. Return this context summarized at the top of your context output.
458
- 2. Produce a practice question for the user on the identified topic or concept. Return the practice question with the heading "Practice Question"
459
- If Math or requiring scientific calculations: The question must not be an example given by the model or user in the conversation history. It may be inspired by the conversation history, but it must require the user to try to solve the problem based on what they learned. If no tool output is given to base the question on, then you must create your own data for the user to interpret, solve, or otherwise manipulate to come to an answer. You may provide data by means of the tool image output, with the question constructed using the tool context output. If no tool output is included, you may provide data as a markdown table or integrated into the question. Math must be formatted using LaTeX as outlined in the LaTeX guidelines given above.
460
- If History/social studies/art or otherwise static fact related: The question must be answerable based on previous model teaching or instruction from the conversation history.
461
-
462
- 3. Produce an answer bank under the question with the correct answer or answers labeled. If it is a written response question, you must write examples of possible correct answers for the new model to utilize in grading the user's answer.
463
- '''
464
-
465
- # This prompt is reserved for high complexity user queries, aiming to generate context in support of the response agent.
466
- REASONING_THINKING = '''
467
- Considering the provided current user prompt/input and recent conversation history, as an educational professional skilled in breaking down concepts, return context that would be beneficial in producing a response to the user.
468
-
469
- 1. Begin by thinking about what the user is asking about, such as the topic or domain of knowledge. Summarize the user's request as well as what has been said relating to the topic or goal in the conversation history. Give this section the heading "User Knowledge Summary."
470
- 2. Evaluate the user's previous statements for accuracy. Ask yourself if the user appears to be grasping the concept or struggling with some part of it. Produce a brief analysis section that defines the user's established understanding, or notes if this is unknown. Propose potential concepts to cover to aid the user. Return this section with the heading "User Understanding."
471
- 3. Identify steps taken by the model in previous turns to aid the user, as well as the apparent effectiveness of said steps, if conversation history is available. Produce this section with the heading "Previous Actions."
472
- 4. Identify relevant facts that would aid the user in understanding the concept, following a logical order in listing these items. Present these items in a nested list, with a title for each nested block at the higher level and atomic facts nested underneath. Produce this section with the heading "Reference Fact Sheet"
473
-
474
- Review your response prior to returning it as output. Review for accuracy and relevance, producing only facts that support further learning rather than information the user has already shown understanding of.
475
-
476
- Examples:
477
- [INPUT]
478
- user: "I know principal is the starting money and the rate is the percentage. But I don't get what 'compounding frequency' means. Does it matter if it's daily vs yearly?"
479
- chat_history: "user: \"How do I calculate compound interest?\"\nassistant: \"## Calculating Compound Interest\n\nThat's a great question! Compound interest is essentially interest earned on the initial amount of money (the principal) as well as on the accumulated interest from previous periods.\n\nTo give you the most helpful explanation, it would be useful to know what you're familiar with already. Have you encountered terms like 'principal', 'annual interest rate', or 'compounding frequency' before?\""
480
-
481
- [OUTPUT]
482
- ### User Knowledge Summary
483
- The user's goal is to learn how to calculate compound interest. The conversation began with the user asking for the calculation method. The model responded by defining the term and asking discovery questions to gauge the user's prior knowledge of key variables. The user has now confirmed they understand 'principal' and 'interest rate' but are specifically asking for a definition of 'compounding frequency' and an explanation of its importance.
484
-
485
- ### User Understanding
486
- The user has a foundational grasp of the core components of interest calculations (principal, rate). Their point of confusion is isolated to the concept of compounding frequency. They have correctly intuited that the frequency (e.g., daily vs. yearly) matters but do not understand why. To aid them, the next steps should be to formally define compounding frequency and then use a comparative example to illustrate its impact on the final amount.
487
-
488
- ### Previous Actions
489
- In the previous turn, the model successfully employed a guided discovery tactic. Instead of providing the formula outright, it defined the topic and asked targeted questions. This action was highly effective, as it allowed the user to articulate their specific knowledge gap, enabling a more focused and efficient teaching approach in the next turn.
490
-
491
- ### Reference Fact Sheet
492
- - **The Compound Interest Formula**
493
- - The standard formula is $A = P(1 + r/n)^{nt}$
494
- - **A**: The future value of the investment/loan, including interest.
495
- - **P**: The principal amount (the initial amount of money).
496
- - **r**: The annual interest rate (in decimal form).
497
- - **n**: The number of times that interest is compounded per year.
498
- - **t**: The number of years the money is invested or borrowed for.
499
- - **The Role of Compounding Frequency (n)**
500
- - This variable represents how often the accumulated interest is calculated and added to the principal.
501
- - A higher frequency means interest starts earning its own interest sooner and more often within the same year.
502
- - This directly leads to a higher future value (A), even if all other variables (P, r, t) are the same.
503
- - **Common Values for 'n'**
504
- - **Annually**: n = 1
505
- - **Semiannually**: n = 2
506
- - **Quarterly**: n = 4
507
- - **Monthly**: n = 12
508
- - **Daily**: n = 365
509
-
510
- [INPUT]
511
- user: "I think the radius is the line that goes all the way across the middle."
512
- chat_history: "user: \"How do you find the area of a circle?\"\nassistant: \"## Finding the Area of a Circle\n\nThat's a great question! To calculate the area of a circle, we use a specific formula that involves a special number called Pi.\n\nBefore we dive into the formula, what do you already know about the different parts of a circle, like the radius or the diameter?\""
513
-
514
- [OUTPUT]
515
- ### User Knowledge Summary
516
- The user wants to learn how to calculate the area of a circle. In the previous turn, the model asked the user about their knowledge of a circle's parts to establish a baseline. The user has now responded, incorrectly identifying the radius as the diameter.
517
-
518
- ### User Understanding
519
- The user has a misunderstanding of core terminology essential for this calculation. They have confused the radius with the diameter. This foundational knowledge gap must be corrected before introducing the area formula. Potential concepts to cover are the definitions of radius and diameter and their mathematical relationship.
520
-
521
- ### Previous Actions
522
- In the previous turn, the model employed a discovery tactic by asking about the user's prior knowledge of circle components. This was an effective step, as it successfully revealed a critical misconception in the user's understanding that can now be corrected.
523
-
524
- ### Reference Fact Sheet
525
- - Core Components of a Circle
526
- - **Radius (r):** The distance from the center of the circle to any point on its edge.
527
- - **Diameter (d):** The distance from one edge of the circle to the other, passing through the center.
528
- - **Relationship:** The diameter is always exactly twice the length of the radius ($d = 2r$). Conversely, the radius is half the diameter ($r = d/2$).
529
- - The Area Formula
530
- - **Pi ($\pi$):** A special mathematical constant, approximately equal to 3.14159, that represents the ratio of a circle's circumference to its diameter.
531
- - **Formula:** The area ($A$) of a circle is calculated using the formula $A = \pi r^2$.
532
- - **Crucial Detail:** The formula uses the **radius**, not the diameter. If given the diameter, it must first be converted to the radius before calculating the area.
533
-
534
- '''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,57 +1,34 @@
1
- # Mimir Educational AI Assistant Dependencies
2
-
3
- # =============================================================================
4
- # ZeroGPU COMPATIBILITY
5
- # =============================================================================
6
  spaces
7
 
8
- # =============================================================================
9
- # CORE ML/AI PACKAGES
10
- # =============================================================================
11
- transformers>=4.43.0
12
- huggingface_hub
13
- safetensors
14
- accelerate
15
  bitsandbytes
16
- sentencepiece
17
- peft>=0.10.0
18
 
19
- # =============================================================================
20
- # LANGCHAIN ECOSYSTEM
21
- # =============================================================================
22
  langgraph>=0.2.0
23
  langchain-core>=0.3.0
24
  langchain-community>=0.3.0
25
  langchain-huggingface>=0.1.0
26
 
27
- # =============================================================================
28
- # UI FRAMEWORK
29
- # =============================================================================
30
- gradio>=5.49.1
31
 
32
- # =============================================================================
33
- # DATA & STATE MANAGEMENT
34
- # =============================================================================
35
- datasets>=2.14.0
36
  python-dotenv>=1.0.0
37
 
38
- # =============================================================================
39
- # VISUALIZATION & TOOLS
40
- # =============================================================================
41
  matplotlib>=3.7.0
42
  plotly>=5.15.0
43
  pandas>=2.0.0
44
  numpy>=1.24.0
 
45
 
46
- # =============================================================================
47
- # METRICS & EVALUATION
48
- # =============================================================================
49
- lighteval
50
- trackio
51
 
52
- # =============================================================================
53
- # UTILITIES
54
- # =============================================================================
55
- tqdm>=4.65.0
56
- tiktoken>=0.5.0
57
- textstat>=0.7.3
 
1
+ # ZeroGPU compatibility - DO NOT specify torch versions
 
 
 
 
2
  spaces
3
 
4
+ # Core ML/AI packages
5
+ transformers>=4.41.0
6
+ accelerate>=0.31.0
 
 
 
 
7
  bitsandbytes
8
+ SentencePiece
9
+ # torch will be provided by ZeroGPU environment
10
 
11
+ # Core LangChain and LangGraph packages
 
 
12
  langgraph>=0.2.0
13
  langchain-core>=0.3.0
14
  langchain-community>=0.3.0
15
  langchain-huggingface>=0.1.0
16
 
17
+ # UI Framework
18
+ gradio==5.44.1
 
 
19
 
20
+ # Utilities
 
 
 
21
  python-dotenv>=1.0.0
22
 
23
+ # Data Science and Visualization
 
 
24
  matplotlib>=3.7.0
25
  plotly>=5.15.0
26
  pandas>=2.0.0
27
  numpy>=1.24.0
28
+ scipy>=1.10.0
29
 
30
+ # Monitoring and Debugging (Optional)
31
+ langsmith
 
 
 
32
 
33
+ # Optional: OpenAI integration if needed
34
+ # langchain-openai
 
 
 
 
state_manager.py DELETED
@@ -1,807 +0,0 @@
1
- # state_manager.py
2
- """
3
- Global state management and logical expression system for Mimir.
4
-
5
- Components:
6
- - GlobalStateManager: Thread-safe state persistence with SQLite + HF dataset backup
7
- - PromptStateManager: Per-turn prompt segment activation tracking
8
- - LogicalExpressions: Regex-based prompt triggers
9
- """
10
-
11
- import os
12
- import re
13
- import sqlite3
14
- import json
15
- import logging
16
- import threading
17
- from datetime import datetime, timedelta
18
- from typing import Dict, List, Optional, Any
19
- from datasets import load_dataset, Dataset
20
- from huggingface_hub import HfApi
21
-
22
- logger = logging.getLogger(__name__)
23
-
24
-
25
- # ============================================================================
26
- # PROMPT STATE MANAGER
27
- # ============================================================================
28
-
29
- class PromptStateManager:
30
- """
31
- Manages prompt segment activation state for a single turn.
32
- Resets to default (all False) at the start of each turn.
33
- """
34
-
35
- def __init__(self):
36
- self._default_state = {
37
- "MATH_THINKING": False,
38
- "QUESTION_ANSWER_DESIGN": False,
39
- "REASONING_THINKING": False,
40
- "VAUGE_INPUT": False,
41
- "USER_UNDERSTANDING": False,
42
- "GENERAL_FORMATTING": False,
43
- "LATEX_FORMATTING": False,
44
- "GUIDING_TEACHING": False,
45
- "STRUCTURE_PRACTICE_QUESTIONS": False,
46
- "PRACTICE_QUESTION_FOLLOWUP": False,
47
- "TOOL_USE_ENHANCEMENT": False,
48
- }
49
- self._current_state = self._default_state.copy()
50
- logger.info("PromptStateManager initialized")
51
-
52
- def reset(self):
53
- """Reset all prompt states to False for new turn"""
54
- self._current_state = self._default_state.copy()
55
- logger.debug("Prompt state reset for new turn")
56
-
57
- def get_state(self) -> Dict[str, bool]:
58
- """Get current prompt state dictionary"""
59
- return self._current_state.copy()
60
-
61
- def update(self, prompt_name: str, value: bool):
62
- """
63
- Update a specific prompt state.
64
-
65
- Args:
66
- prompt_name: Name of prompt segment (must be in default_state)
67
- value: True to activate, False to deactivate
68
- """
69
- if prompt_name not in self._default_state:
70
- logger.warning(f"Unknown prompt name: {prompt_name}")
71
- return
72
-
73
- self._current_state[prompt_name] = value
74
- logger.debug(f"Prompt state updated: {prompt_name} = {value}")
75
-
76
- def update_multiple(self, updates: Dict[str, bool]):
77
- """
78
- Update multiple prompt states at once.
79
-
80
- Args:
81
- updates: Dictionary of {prompt_name: bool} updates
82
- """
83
- for prompt_name, value in updates.items():
84
- self.update(prompt_name, value)
85
-
86
- def is_active(self, prompt_name: str) -> bool:
87
- """Check if a prompt segment is active"""
88
- return self._current_state.get(prompt_name, False)
89
-
90
- def get_active_prompts(self) -> List[str]:
91
- """Get list of all currently active prompt names"""
92
- return [name for name, active in self._current_state.items() if active]
93
-
94
- def get_active_response_prompts(self) -> List[str]:
95
- """
96
- Get list of active response agent prompts only.
97
- Excludes thinking agent prompts.
98
- """
99
- response_prompts = [
100
- "VAUGE_INPUT", "USER_UNDERSTANDING", "GENERAL_FORMATTING",
101
- "LATEX_FORMATTING", "GUIDING_TEACHING", "STRUCTURE_PRACTICE_QUESTIONS",
102
- "PRACTICE_QUESTION_FOLLOWUP", "TOOL_USE_ENHANCEMENT"
103
- ]
104
- return [name for name in response_prompts if self._current_state.get(name, False)]
105
-
106
- def get_active_thinking_prompts(self) -> List[str]:
107
- """
108
- Get list of active thinking agent prompts only.
109
- """
110
- thinking_prompts = ["MATH_THINKING", "QUESTION_ANSWER_DESIGN", "REASONING_THINKING"]
111
- return [name for name in thinking_prompts if self._current_state.get(name, False)]
112
-
113
-
114
- # ============================================================================
115
- # LOGICAL EXPRESSIONS
116
- # ============================================================================
117
-
118
class LogicalExpressions:
    """
    Regex-driven trigger detection for prompt segments.

    Inspects raw user input and activates the prompt segments whose
    triggers match.
    """

    def __init__(self):
        # Keyword pattern signalling mathematical content.
        self.math_regex = r'\b(math|calculus|algebra|geometry|equation|formula|solve|calculate|derivative|integral|trigonometry|statistics|probability)\b'

        # Additional regex patterns can be added here
        logger.info("LogicalExpressions initialized")

    def check_math_keywords(self, user_input: str) -> bool:
        """
        Report whether the input mentions mathematical keywords.
        A hit is what triggers LATEX_FORMATTING.

        Args:
            user_input: User's message

        Returns:
            True if math keywords detected
        """
        matched = re.search(self.math_regex, user_input, re.IGNORECASE) is not None
        if matched:
            logger.debug(f"Math keywords detected in: '{user_input[:50]}...'")
        return matched

    def apply_all_checks(self, user_input: str, prompt_state: 'PromptStateManager'):
        """
        Run every trigger check and record the results on prompt_state.

        Args:
            user_input: User's message
            prompt_state: PromptStateManager instance to update
        """
        # Formatting guidance applies on every turn, unconditionally.
        prompt_state.update("GENERAL_FORMATTING", True)

        # Math keywords switch on LaTeX formatting guidance.
        if self.check_math_keywords(user_input):
            prompt_state.update("LATEX_FORMATTING", True)

        # Additional checks can be added here as needed
        logger.debug(f"Logical expressions applied. Active prompts: {prompt_state.get_active_prompts()}")
164
-
165
-
166
- # ============================================================================
167
- # GLOBAL STATE MANAGER
168
- # ============================================================================
169
-
170
- class GlobalStateManager:
171
- """
172
- Thread-safe global state manager with SQLite persistence and HF dataset backup.
173
- Now includes PromptStateManager for per-turn prompt segment tracking.
174
- """
175
-
176
- def __init__(self, db_path="mimir_analytics.db", dataset_repo="jdesiree/mimir_analytics"):
177
- self._db_path = db_path
178
- self.dataset_repo = dataset_repo
179
- self.hf_token = os.getenv("HF_TOKEN")
180
-
181
- # Existing state caches
182
- self._states = {}
183
- self._analytics_cache = {}
184
- self._ml_models_cache = {}
185
- self._evaluation_cache = {}
186
-
187
- # Thread safety
188
- self._lock = threading.Lock()
189
-
190
- # Cleanup settings
191
- self._cleanup_interval = 3600
192
- self._max_age = 24 * 3600
193
- self._last_cleanup = datetime.now()
194
- self._last_hf_backup = datetime.now()
195
- self._hf_backup_interval = 3600
196
-
197
- # NEW: Prompt state management
198
- self._prompt_state_manager = PromptStateManager()
199
-
200
- # Initialize existing systems
201
- self._init_database()
202
- self._load_from_database()
203
- self._load_from_hf_dataset()
204
-
205
- logger.info("GlobalStateManager initialized with PromptStateManager")
206
-
207
- # ========================================================================
208
- # PROMPT STATE MANAGEMENT
209
- # ========================================================================
210
-
211
- def get_prompt_state_manager(self) -> PromptStateManager:
212
- """Get the prompt state manager for current turn"""
213
- return self._prompt_state_manager
214
-
215
- def reset_prompt_state(self):
216
- """Reset prompt state for new turn"""
217
- self._prompt_state_manager.reset()
218
- logger.debug("Prompt state reset for new turn")
219
-
220
- def get_prompt_state(self) -> Dict[str, bool]:
221
- """Get current prompt state dictionary"""
222
- return self._prompt_state_manager.get_state()
223
-
224
- def update_prompt_state(self, prompt_name: str, value: bool):
225
- """Update specific prompt state"""
226
- self._prompt_state_manager.update(prompt_name, value)
227
-
228
- def update_prompt_states(self, updates: Dict[str, bool]):
229
- """Update multiple prompt states"""
230
- self._prompt_state_manager.update_multiple(updates)
231
-
232
- # ========================================================================
233
- # EXISTING DATABASE METHODS (unchanged)
234
- # ========================================================================
235
-
236
- def _init_database(self):
237
- """Initialize SQLite database for persistent storage"""
238
- conn = sqlite3.connect(self._db_path)
239
- cursor = conn.cursor()
240
-
241
- cursor.execute("""
242
- CREATE TABLE IF NOT EXISTS conversations (
243
- session_id TEXT PRIMARY KEY,
244
- chat_history TEXT,
245
- conversation_state TEXT,
246
- last_accessed TEXT,
247
- created TEXT
248
- )
249
- """)
250
-
251
- cursor.execute("""
252
- CREATE TABLE IF NOT EXISTS analytics (
253
- session_id TEXT PRIMARY KEY,
254
- project_stats TEXT,
255
- recent_interactions TEXT,
256
- dashboard_html TEXT,
257
- last_refresh TEXT,
258
- export_history TEXT
259
- )
260
- """)
261
-
262
- cursor.execute("""
263
- CREATE TABLE IF NOT EXISTS evaluations (
264
- id INTEGER PRIMARY KEY AUTOINCREMENT,
265
- session_id TEXT,
266
- timestamp TEXT,
267
- metric_type TEXT,
268
- metric_data TEXT
269
- )
270
- """)
271
-
272
- cursor.execute("""
273
- CREATE TABLE IF NOT EXISTS classifications (
274
- id INTEGER PRIMARY KEY AUTOINCREMENT,
275
- session_id TEXT,
276
- timestamp TEXT,
277
- user_input TEXT,
278
- prediction_data TEXT,
279
- features TEXT
280
- )
281
- """)
282
-
283
- conn.commit()
284
- conn.close()
285
-
286
- def _load_from_database(self):
287
- """Load all data from SQLite on startup"""
288
- try:
289
- conn = sqlite3.connect(self._db_path)
290
- cursor = conn.cursor()
291
-
292
- cursor.execute("SELECT * FROM conversations")
293
- for row in cursor.fetchall():
294
- session_id = row[0]
295
- self._states[session_id] = {
296
- 'chat_history': json.loads(row[1]),
297
- 'conversation_state': json.loads(row[2]),
298
- 'last_accessed': datetime.fromisoformat(row[3]),
299
- 'created': datetime.fromisoformat(row[4])
300
- }
301
-
302
- cursor.execute("SELECT * FROM analytics")
303
- for row in cursor.fetchall():
304
- session_id = row[0]
305
- self._analytics_cache[session_id] = {
306
- 'project_stats': json.loads(row[1]),
307
- 'recent_interactions': json.loads(row[2]),
308
- 'dashboard_html': row[3],
309
- 'last_refresh': datetime.fromisoformat(row[4]) if row[4] else None,
310
- 'export_history': json.loads(row[5]),
311
- 'last_accessed': datetime.now()
312
- }
313
-
314
- conn.close()
315
- logger.info(f"Loaded {len(self._states)} conversations and {len(self._analytics_cache)} analytics from database")
316
- except Exception as e:
317
- logger.error(f"Error loading from database: {e}")
318
-
319
- def _load_from_hf_dataset(self):
320
- """Load data from HF dataset on startup"""
321
- try:
322
- ds = load_dataset(self.dataset_repo, split="train", token=self.hf_token)
323
-
324
- for item in ds:
325
- if item['data_type'] == 'conversation':
326
- session_id = item['session_id']
327
- data = json.loads(item['data'])
328
- self._states[session_id] = data
329
- elif item['data_type'] == 'analytics':
330
- session_id = item['session_id']
331
- data = json.loads(item['data'])
332
- self._analytics_cache[session_id] = data
333
-
334
- logger.info(f"Loaded data from HF dataset {self.dataset_repo}")
335
- except Exception as e:
336
- logger.warning(f"Could not load from HF dataset: {e}")
337
-
338
- def _save_to_database_conversations(self, session_id):
339
- """Save conversation to SQLite"""
340
- if session_id not in self._states:
341
- return
342
-
343
- state = self._states[session_id]
344
- conn = sqlite3.connect(self._db_path)
345
- cursor = conn.cursor()
346
-
347
- cursor.execute("""
348
- INSERT OR REPLACE INTO conversations
349
- (session_id, chat_history, conversation_state, last_accessed, created)
350
- VALUES (?, ?, ?, ?, ?)
351
- """, (
352
- session_id,
353
- json.dumps(state['chat_history']),
354
- json.dumps(state['conversation_state']),
355
- state['last_accessed'].isoformat(),
356
- state.get('created', datetime.now()).isoformat()
357
- ))
358
-
359
- conn.commit()
360
- conn.close()
361
-
362
- def _save_to_database_analytics(self, session_id):
363
- """Save analytics to SQLite"""
364
- if session_id not in self._analytics_cache:
365
- return
366
-
367
- analytics = self._analytics_cache[session_id]
368
- conn = sqlite3.connect(self._db_path)
369
- cursor = conn.cursor()
370
-
371
- cursor.execute("""
372
- INSERT OR REPLACE INTO analytics
373
- (session_id, project_stats, recent_interactions, dashboard_html, last_refresh, export_history)
374
- VALUES (?, ?, ?, ?, ?, ?)
375
- """, (
376
- session_id,
377
- json.dumps(analytics.get('project_stats', {})),
378
- json.dumps(analytics.get('recent_interactions', [])),
379
- analytics.get('dashboard_html', ''),
380
- analytics.get('last_refresh').isoformat() if analytics.get('last_refresh') else None,
381
- json.dumps(analytics.get('export_history', []))
382
- ))
383
-
384
- conn.commit()
385
- conn.close()
386
-
387
- def _backup_to_hf_dataset(self):
388
- """Backup all data to HF dataset"""
389
- if (datetime.now() - self._last_hf_backup).seconds < self._hf_backup_interval:
390
- return
391
-
392
- try:
393
- data_items = []
394
-
395
- for session_id, state in self._states.items():
396
- data_items.append({
397
- 'session_id': session_id,
398
- 'data_type': 'conversation',
399
- 'data': json.dumps(state, default=str),
400
- 'timestamp': datetime.now().isoformat()
401
- })
402
-
403
- for session_id, analytics in self._analytics_cache.items():
404
- data_items.append({
405
- 'session_id': session_id,
406
- 'data_type': 'analytics',
407
- 'data': json.dumps(analytics, default=str),
408
- 'timestamp': datetime.now().isoformat()
409
- })
410
-
411
- if data_items:
412
- ds = Dataset.from_list(data_items)
413
- ds.push_to_hub(self.dataset_repo, token=self.hf_token)
414
- self._last_hf_backup = datetime.now()
415
- logger.info(f"Backed up {len(data_items)} items to HF dataset")
416
- except Exception as e:
417
- logger.error(f"Error backing up to HF dataset: {e}")
418
-
419
- def _cleanup_old_states(self):
420
- """Remove old unused states to prevent memory leaks"""
421
- now = datetime.now()
422
- if (now - self._last_cleanup).seconds < self._cleanup_interval:
423
- return
424
-
425
- with self._lock:
426
- expired_keys = []
427
- for session_id, state_data in self._states.items():
428
- if (now - state_data.get('last_accessed', now)).seconds > self._max_age:
429
- expired_keys.append(session_id)
430
-
431
- for key in expired_keys:
432
- del self._states[key]
433
- logger.info(f"Cleaned up expired state: {key}")
434
-
435
- self._last_cleanup = now
436
-
437
- # ========================================================================
438
- # CONVERSATION STATE METHODS (unchanged)
439
- # ========================================================================
440
-
441
- def get_session_id(self, request=None):
442
- """Generate or retrieve session ID"""
443
- return "default_session"
444
-
445
- def get_conversation_state(self, session_id=None):
446
- """Get conversation state for a session"""
447
- if session_id is None:
448
- session_id = self.get_session_id()
449
-
450
- self._cleanup_old_states()
451
-
452
- with self._lock:
453
- if session_id not in self._states:
454
- self._states[session_id] = {
455
- 'chat_history': [],
456
- 'conversation_state': [],
457
- 'last_accessed': datetime.now(),
458
- 'created': datetime.now()
459
- }
460
- else:
461
- self._states[session_id]['last_accessed'] = datetime.now()
462
-
463
- return self._states[session_id].copy()
464
-
465
- def update_conversation_state(self, chat_history, conversation_state, session_id=None):
466
- """Update conversation state for a session"""
467
- if session_id is None:
468
- session_id = self.get_session_id()
469
-
470
- with self._lock:
471
- if session_id not in self._states:
472
- self._states[session_id] = {}
473
-
474
- self._states[session_id].update({
475
- 'chat_history': chat_history.copy() if chat_history else [],
476
- 'conversation_state': conversation_state.copy() if conversation_state else [],
477
- 'last_accessed': datetime.now()
478
- })
479
-
480
- # self._save_to_database_conversations(session_id)
481
- # self._backup_to_hf_dataset()
482
- threading.Thread(target=self._save_to_database_conversations,
483
- args=(session_id,), daemon=True).start()
484
-
485
- if (datetime.now() - self._last_hf_backup).seconds >= self._hf_backup_interval:
486
- threading.Thread(target=self._backup_to_hf_dataset,
487
- daemon=True).start()
488
-
489
- def reset_conversation_state(self, session_id=None):
490
- """Reset conversation state for a session"""
491
- if session_id is None:
492
- session_id = self.get_session_id()
493
-
494
- with self._lock:
495
- if session_id in self._states:
496
- self._states[session_id].update({
497
- 'chat_history': [],
498
- 'conversation_state': [],
499
- 'last_accessed': datetime.now()
500
- })
501
- self._save_to_database_conversations(session_id)
502
-
503
- def get_all_sessions(self):
504
- """Get all active sessions (for analytics)"""
505
- self._cleanup_old_states()
506
- with self._lock:
507
- return list(self._states.keys())
508
-
509
- # ========================================================================
510
- # ANALYTICS STATE METHODS (unchanged)
511
- # ========================================================================
512
-
513
- def get_analytics_state(self, session_id=None):
514
- """Get analytics state for a session"""
515
- if session_id is None:
516
- session_id = self.get_session_id()
517
-
518
- self._cleanup_old_states()
519
-
520
- with self._lock:
521
- if session_id not in self._analytics_cache:
522
- self._analytics_cache[session_id] = {
523
- 'project_stats': {
524
- "total_conversations": None,
525
- "avg_session_length": None,
526
- "success_rate": None,
527
- "model_type": "Phi-3-mini (Fine-tuned)",
528
- "last_updated": None
529
- },
530
- 'recent_interactions': [],
531
- 'dashboard_html': None,
532
- 'last_refresh': None,
533
- 'export_history': [],
534
- 'database_status': 'unknown',
535
- 'error_state': None,
536
- 'last_accessed': datetime.now()
537
- }
538
- else:
539
- self._analytics_cache[session_id]['last_accessed'] = datetime.now()
540
-
541
- return self._analytics_cache[session_id].copy()
542
-
543
- def update_analytics_state(self, project_stats=None, recent_interactions=None,
544
- dashboard_html=None, error_state=None, session_id=None):
545
- """Update analytics state for a session"""
546
- if session_id is None:
547
- session_id = self.get_session_id()
548
-
549
- with self._lock:
550
- if session_id not in self._analytics_cache:
551
- self._analytics_cache[session_id] = {}
552
-
553
- current_time = datetime.now()
554
-
555
- if project_stats is not None:
556
- self._analytics_cache[session_id]['project_stats'] = project_stats.copy()
557
- self._analytics_cache[session_id]['last_refresh'] = current_time
558
-
559
- if recent_interactions is not None:
560
- self._analytics_cache[session_id]['recent_interactions'] = recent_interactions.copy()
561
-
562
- if dashboard_html is not None:
563
- self._analytics_cache[session_id]['dashboard_html'] = dashboard_html
564
-
565
- if error_state is not None:
566
- self._analytics_cache[session_id]['error_state'] = error_state
567
-
568
- self._analytics_cache[session_id]['last_accessed'] = current_time
569
-
570
- self._save_to_database_analytics(session_id)
571
- self._backup_to_hf_dataset()
572
-
573
- def add_export_record(self, export_type, filename, success=True, session_id=None):
574
- """Add export record to analytics state"""
575
- if session_id is None:
576
- session_id = self.get_session_id()
577
-
578
- with self._lock:
579
- if session_id not in self._analytics_cache:
580
- self.get_analytics_state(session_id)
581
-
582
- export_record = {
583
- 'timestamp': datetime.now().isoformat(),
584
- 'type': export_type,
585
- 'filename': filename,
586
- 'success': success
587
- }
588
-
589
- if 'export_history' not in self._analytics_cache[session_id]:
590
- self._analytics_cache[session_id]['export_history'] = []
591
-
592
- self._analytics_cache[session_id]['export_history'].append(export_record)
593
-
594
- if len(self._analytics_cache[session_id]['export_history']) > 20:
595
- self._analytics_cache[session_id]['export_history'] = \
596
- self._analytics_cache[session_id]['export_history'][-20:]
597
-
598
- self._save_to_database_analytics(session_id)
599
-
600
- # ========================================================================
601
- # ML MODEL CACHE METHODS (unchanged)
602
- # ========================================================================
603
-
604
- def get_ml_model_cache(self, model_type: str = "prompt_classifier"):
605
- """Get cached ML model"""
606
- with self._lock:
607
- return self._ml_models_cache.get(model_type, None)
608
-
609
- def cache_ml_model(self, model, model_type: str = "prompt_classifier", metadata: dict = None):
610
- """Cache a trained ML model"""
611
- with self._lock:
612
- self._ml_models_cache[model_type] = {
613
- 'model': model,
614
- 'cached_at': datetime.now(),
615
- 'metadata': metadata or {},
616
- 'access_count': 0
617
- }
618
- logger.info(f"ML model '{model_type}' cached successfully")
619
-
620
- # ========================================================================
621
- # EVALUATION STATE METHODS (unchanged)
622
- # ========================================================================
623
-
624
- def get_evaluation_state(self, session_id=None):
625
- """Get evaluation state for a session"""
626
- if session_id is None:
627
- session_id = self.get_session_id()
628
-
629
- with self._lock:
630
- if session_id not in self._evaluation_cache:
631
- self._evaluation_cache[session_id] = {
632
- 'educational_quality_scores': [],
633
- 'rag_performance_metrics': [],
634
- 'prompt_classification_accuracy': [],
635
- 'user_feedback_history': [],
636
- 'aggregate_metrics': {
637
- 'avg_educational_quality': 0.0,
638
- 'avg_rag_relevance': 0.0,
639
- 'classifier_accuracy_rate': 0.0,
640
- 'user_satisfaction_rate': 0.0
641
- },
642
- 'evaluation_session_count': 0,
643
- 'last_updated': datetime.now()
644
- }
645
-
646
- return self._evaluation_cache[session_id].copy()
647
-
648
- def add_educational_quality_score(self, user_query: str, response: str, metrics: dict, session_id=None):
649
- """Add educational quality evaluation result"""
650
- if session_id is None:
651
- session_id = self.get_session_id()
652
-
653
- with self._lock:
654
- if session_id not in self._evaluation_cache:
655
- self.get_evaluation_state(session_id)
656
-
657
- quality_record = {
658
- 'timestamp': datetime.now().isoformat(),
659
- 'user_query': user_query[:100],
660
- 'response_length': len(response),
661
- 'semantic_quality': metrics.get('semantic_quality', 0.0),
662
- 'educational_score': metrics.get('educational_score', 0.0),
663
- 'response_time': metrics.get('response_time', 0.0),
664
- 'overall_score': (metrics.get('semantic_quality', 0.0) + metrics.get('educational_score', 0.0)) / 2
665
- }
666
-
667
- self._evaluation_cache[session_id]['educational_quality_scores'].append(quality_record)
668
- self._update_aggregate_metrics(session_id)
669
-
670
- def add_prompt_classification_result(self, predicted_mode: str, was_successful: bool, metadata: dict = None, session_id=None):
671
- """Add prompt classification accuracy result"""
672
- if session_id is None:
673
- session_id = self.get_session_id()
674
-
675
- with self._lock:
676
- if session_id not in self._evaluation_cache:
677
- self.get_evaluation_state(session_id)
678
-
679
- classification_record = {
680
- 'timestamp': datetime.now().isoformat(),
681
- 'predicted_mode': predicted_mode,
682
- 'was_successful': was_successful,
683
- 'accuracy_score': 1.0 if was_successful else 0.0,
684
- 'metadata': metadata or {}
685
- }
686
-
687
- self._evaluation_cache[session_id]['prompt_classification_accuracy'].append(classification_record)
688
- self._update_aggregate_metrics(session_id)
689
-
690
- def add_user_feedback(self, response_id: str, feedback_type: str, conversation_context: dict = None, session_id=None):
691
- """Add user feedback result"""
692
- if session_id is None:
693
- session_id = self.get_session_id()
694
-
695
- with self._lock:
696
- if session_id not in self._evaluation_cache:
697
- self.get_evaluation_state(session_id)
698
-
699
- feedback_record = {
700
- 'timestamp': datetime.now().isoformat(),
701
- 'response_id': response_id,
702
- 'feedback_type': feedback_type,
703
- 'satisfaction_score': 1.0 if feedback_type == 'thumbs_up' else 0.0,
704
- 'conversation_context': conversation_context or {}
705
- }
706
-
707
- self._evaluation_cache[session_id]['user_feedback_history'].append(feedback_record)
708
- self._update_aggregate_metrics(session_id)
709
-
710
- def _update_aggregate_metrics(self, session_id: str):
711
- """Update aggregate metrics for a session"""
712
- eval_state = self._evaluation_cache[session_id]
713
-
714
- if eval_state['educational_quality_scores']:
715
- avg_educational = sum(score['overall_score'] for score in eval_state['educational_quality_scores']) / len(eval_state['educational_quality_scores'])
716
- eval_state['aggregate_metrics']['avg_educational_quality'] = avg_educational
717
-
718
- if eval_state['prompt_classification_accuracy']:
719
- accuracy_rate = sum(result['accuracy_score'] for result in eval_state['prompt_classification_accuracy']) / len(eval_state['prompt_classification_accuracy'])
720
- eval_state['aggregate_metrics']['classifier_accuracy_rate'] = accuracy_rate
721
-
722
- if eval_state['user_feedback_history']:
723
- satisfaction_rate = sum(feedback['satisfaction_score'] for feedback in eval_state['user_feedback_history']) / len(eval_state['user_feedback_history'])
724
- eval_state['aggregate_metrics']['user_satisfaction_rate'] = satisfaction_rate
725
-
726
- eval_state['last_updated'] = datetime.now()
727
- eval_state['evaluation_session_count'] += 1
728
-
729
- def get_evaluation_summary(self, session_id=None, include_history: bool = False):
730
- """Get evaluation summary for analytics"""
731
- if session_id is None:
732
- session_id = self.get_session_id()
733
-
734
- eval_state = self.get_evaluation_state(session_id)
735
-
736
- summary = {
737
- 'aggregate_metrics': eval_state['aggregate_metrics'],
738
- 'total_evaluations': {
739
- 'educational_quality': len(eval_state['educational_quality_scores']),
740
- 'classification_accuracy': len(eval_state['prompt_classification_accuracy']),
741
- 'user_feedback': len(eval_state['user_feedback_history'])
742
- },
743
- 'last_updated': eval_state['last_updated'],
744
- 'session_evaluation_count': eval_state['evaluation_session_count']
745
- }
746
-
747
- if include_history:
748
- summary['history'] = {
749
- 'recent_educational_scores': eval_state['educational_quality_scores'][-10:],
750
- 'recent_classification_results': eval_state['prompt_classification_accuracy'][-10:],
751
- 'recent_user_feedback': eval_state['user_feedback_history'][-10:]
752
- }
753
-
754
- return summary
755
-
756
- # ========================================================================
757
- # UTILITY METHODS
758
- # ========================================================================
759
-
760
- def get_cache_status(self, session_id=None):
761
- """Get cache status for debugging"""
762
- if session_id is None:
763
- session_id = self.get_session_id()
764
-
765
- with self._lock:
766
- analytics_cached = session_id in self._analytics_cache
767
- conversation_cached = session_id in self._states
768
-
769
- cache_info = {
770
- 'session_id': session_id,
771
- 'analytics_cached': analytics_cached,
772
- 'conversation_cached': conversation_cached,
773
- 'total_analytics_sessions': len(self._analytics_cache),
774
- 'total_conversation_sessions': len(self._states),
775
- 'prompt_state_active_count': len(self._prompt_state_manager.get_active_prompts())
776
- }
777
-
778
- if analytics_cached:
779
- analytics_state = self._analytics_cache[session_id]
780
- cache_info['analytics_last_refresh'] = analytics_state.get('last_refresh')
781
- cache_info['analytics_has_data'] = bool(analytics_state.get('project_stats', {}).get('total_conversations'))
782
-
783
- if conversation_cached:
784
- conversation_state = self._states[session_id]
785
- cache_info['conversation_length'] = len(conversation_state.get('conversation_state', []))
786
- cache_info['chat_history_length'] = len(conversation_state.get('chat_history', []))
787
-
788
- return cache_info
789
-
790
- def reset_analytics_state(self, session_id=None):
791
- """Reset analytics state for a session"""
792
- if session_id is None:
793
- session_id = self.get_session_id()
794
-
795
- with self._lock:
796
- if session_id in self._analytics_cache:
797
- del self._analytics_cache[session_id]
798
-
799
- def clear_all_states(self):
800
- """Clear all states - use with caution"""
801
- with self._lock:
802
- self._states.clear()
803
- self._analytics_cache.clear()
804
- self._ml_models_cache.clear()
805
- self._evaluation_cache.clear()
806
- self._prompt_state_manager.reset()
807
- logger.info("All global states cleared")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
styles.css CHANGED
@@ -1,353 +1,760 @@
1
- /* ============================
2
- GLOBAL THEME & VARIABLES
3
- ============================ */
4
  :root {
5
- /* Text Colors */
6
- --primarytext-color: #1a1a1a;
7
- --secondarytext-color: #555;
8
-
9
- /* Primary Colors */
10
- --primary-dark: #345da8;
11
- --primary-light: #a8b5c9;
12
-
13
- /* Secondary Colors */
14
- --secondary-dark: #063d80;
15
- --secondary-light: #6ea1fa;
16
-
17
- /* Chat & Container Colors */
18
- --chathistory_area: #f0f1f4;
19
- --container-color: #f5f6f8;
20
- --Send: #6ea1fa;
21
- --Send-hover: #87d0d5;
22
- --clear: #b2b8c2;
23
- --clear-hover: #2c5be0;
24
- --text_areabackground: #fafafa;
25
-
26
- /* Chat Bubble Colors */
27
- --bot-bubble-color: #b9c8e3;
28
- --user-bubble-color: #e3eaf6;
29
-
30
- /* Scrollbar Colors */
31
- --scrollbar-bg: #d0d3d8;
32
- --scrollbar-thumb: #a2a6ad;
33
- --scrollbar-thumb-hover: #888d94;
34
-
35
- /* Border & Radius */
36
  --border-thin: 1px;
37
  --border-medium: 2px;
 
 
38
  --border-default: 1px;
39
  --border-focus: 2px;
40
  --border-hover: 3px;
 
 
41
  --button-border: 2px;
42
- --radius-sm: 4px;
43
- --radius-md: 6px;
44
  }
45
 
46
- /* ============================
47
- DARK MODE THEME (SOFTER)
48
- ============================ */
49
- @media (prefers-color-scheme: dark) {
50
- :root {
51
- --primarytext-color: #f8f8f8;
52
- --secondarytext-color: #d0d3d8;
53
 
54
- --primary-dark: #27477d;
55
- --primary-light: #7d8da9;
 
 
 
 
 
 
 
56
 
57
- --secondary-dark: #042a59;
58
- --secondary-light: #5e88d6;
 
 
59
 
60
- --chathistory_area: #202327;
61
- --container-color: #1b1d20;
62
- --Send: #5e88d6;
63
- --Send-hover: #7ac4c9;
64
- --clear: #7a7f88;
65
- --clear-hover: #5e88d6;
66
- --text_areabackground: #25282c;
67
 
68
- --bot-bubble-color: #425575;
69
- --user-bubble-color: #566583;
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- --scrollbar-bg: #2b2e33;
72
- --scrollbar-thumb: #4b4f56;
73
- --scrollbar-thumb-hover: #5e636b;
74
- }
 
 
75
  }
76
 
77
- /* ============================
78
- FONT IMPORT & BASE STYLING
79
- ============================ */
80
- @import url('https://fonts.googleapis.com/css2?family=Oswald:wght@200..700&display=swap');
 
 
 
 
81
 
82
- body {
83
- background: var(--text_areabackground);
84
- color: var(--primarytext-color);
85
- font-family: "Oswald", sans-serif;
86
- margin: 0;
 
 
 
 
 
87
  }
88
 
89
- * {
90
- color: var(--primarytext-color) !important;
91
- font-family: "Oswald", sans-serif !important;
92
- box-sizing: border-box;
 
 
 
 
 
 
93
  }
94
 
95
- /* ============================
96
- CUSTOM SCROLLBAR
97
- ============================ */
98
- ::-webkit-scrollbar {
99
- width: 12px;
 
 
 
 
 
100
  }
101
 
102
- ::-webkit-scrollbar-track {
103
- background: var(--scrollbar-bg);
 
 
 
 
104
  }
105
 
106
- ::-webkit-scrollbar-thumb {
107
- background-color: var(--scrollbar-thumb);
108
- border-radius: 6px;
109
- border: 2px solid var(--scrollbar-bg);
 
 
110
  }
111
 
112
- ::-webkit-scrollbar-thumb:hover {
113
- background-color: var(--scrollbar-thumb-hover);
 
 
 
 
 
 
 
 
 
 
114
  }
115
 
116
- /* ============================
117
- GRADIO CONTAINER & LAYOUT
118
- ============================ */
119
- .gradio-container,
120
- [data-testid="block-container"],
121
- .contain {
122
- background-color: var(--container-color) !important;
123
- font-family: "Oswald", sans-serif !important;
124
- display: flex !important;
125
- flex-direction: column !important;
126
- height: 100vh !important;
127
- max-height: 100vh !important;
128
- overflow: hidden !important;
129
- }
130
-
131
- /* ============================
132
- HEADER & NAVIGATION
133
- ============================ */
134
- .title-header {
135
- background-color: transparent;
136
- padding: 10px;
137
- border-bottom: var(--border-focus) solid var(--primary-dark);
138
- display: flex;
139
- align-items: center;
140
- height: 60px !important;
141
  }
142
 
143
- .title-header h1 {
144
- font-size: 3.5rem;
145
- font-weight: 700;
146
- color: var(--primarytext-color);
147
- margin: 0;
148
  }
149
 
150
- /* ============================
151
- CHAT CONTAINER
152
- ============================ */
153
- #main-chatbot,
154
- [data-testid="chatbot"],
155
- .gradio-chatbot,
156
- [role="log"] {
157
- border: var(--border-default) solid var(--primary-dark) !important;
158
- border-radius: var(--radius-md) !important;
159
- background-color: var(--chathistory_area) !important;
160
- box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1) !important;
161
- padding: 15px !important;
162
- margin: 15px 20px !important;
163
- flex: 1 !important;
164
- overflow-y: auto !important;
165
- }
166
-
167
- /* ============================
168
- TEXT INPUT AREA
169
- ============================ */
170
- textarea,
171
- .gradio-textbox textarea {
172
- background-color: var(--text_areabackground) !important;
173
- border: var(--border-default) solid var(--secondary-dark) !important;
174
- border-radius: var(--radius-md) !important;
175
- color: var(--primarytext-color) !important;
176
- padding: 10px !important;
177
- resize: none !important;
178
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
179
  }
180
 
181
- textarea:focus {
182
- border-color: var(--secondary-light) !important;
183
- box-shadow: 0 0 0 var(--border-focus) rgba(96, 165, 250, 0.2) !important;
 
 
 
184
  }
185
 
186
- /* ============================
187
- BUTTONS
188
- ============================ */
189
- button.send-button {
190
- background-color: var(--Send) !important;
191
- color: var(--primarytext-color) !important;
192
- border: var(--button-border) solid var(--secondary-dark) !important;
193
- border-radius: var(--radius-md) !important;
194
- padding: 8px 16px !important;
195
- font-weight: 600 !important;
196
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
197
- width: 100%;
198
  }
199
 
200
- button.send-button:hover {
201
- background-color: var(--Send-hover) !important;
202
  }
203
 
204
- button.clear-button {
205
- background-color: var(--clear) !important;
206
- color: var(--primarytext-color) !important;
207
- border: var(--button-border) solid var(--secondary-dark) !important;
208
- border-radius: var(--radius-md) !important;
209
- padding: 8px 16px !important;
210
- font-weight: 600 !important;
211
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
212
- width: 100%;
213
  }
214
 
215
- button.clear-button:hover {
216
- background-color: var(--clear-hover) !important;
 
 
 
 
 
 
 
 
 
 
 
 
217
  }
218
 
219
- /* ============================
220
- CHAT BUBBLES (VARIABLE COLORS)
221
- ============================ */
222
- .message.user,
223
  .message.bot {
224
- background: none !important;
225
- border: none !important;
226
- padding: 0 !important;
227
- margin: 0 !important;
228
- box-shadow: none !important;
229
- }
230
-
231
- .message-row {
232
- display: flex;
233
- margin: 8px 12px;
234
- }
235
-
236
- .message.panel-full-width {
237
- max-width: 80%;
238
- min-width: 240px;
239
- padding: 14px 20px !important;
240
- border-radius: 18px !important;
241
- box-shadow: none !important;
242
- position: relative;
243
- line-height: 1.5;
244
- word-wrap: break-word;
245
- }
246
-
247
- /* Bot Bubble */
248
- .message-row.bot-row .message.panel-full-width {
249
- background-color: var(--bot-bubble-color) !important;
250
- color: var(--primarytext-color) !important;
251
- margin-right: auto;
252
- margin-left: 0;
253
- }
254
-
255
- .message-row.bot-row .message.panel-full-width::before {
256
- content: "";
257
- position: absolute;
258
- top: 12px;
259
- left: -10px;
260
- width: 0;
261
- height: 0;
262
- border-top: 10px solid transparent;
263
- border-right: 10px solid var(--bot-bubble-color);
264
- border-bottom: 10px solid transparent;
265
- }
266
-
267
- /* User Bubble */
268
- .message-row.user-row .message.panel-full-width {
269
- background-color: var(--user-bubble-color) !important;
270
- color: var(--primarytext-color) !important;
271
- margin-left: auto;
272
- margin-right: 0;
273
- }
274
-
275
- .message-row.user-row .message.panel-full-width::before {
276
- content: "";
277
- position: absolute;
278
- top: 12px;
279
- right: -10px;
280
- width: 0;
281
- height: 0;
282
- border-top: 10px solid transparent;
283
- border-left: 10px solid var(--user-bubble-color);
284
- border-bottom: 10px solid transparent;
285
- }
286
-
287
- /* ============================
288
- RESPONSIVE ADJUSTMENTS
289
- ============================ */
290
- @media (max-width: 768px) {
291
- .message.panel-full-width {
292
- max-width: 85%;
293
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  }
295
 
296
- /* ============================
297
- FOOTER: RESTORE BUILT-IN GRADIO LINKS (settings, API, etc.)
298
- ============================ */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  footer.svelte-czcr5b {
300
- display: flex !important;
301
- align-items: center !important;
302
- justify-content: center !important;
303
- gap: 12px !important;
304
- visibility: visible !important;
305
- position: fixed !important;
306
- bottom: 0 !important;
307
- left: 0 !important;
308
- right: 0 !important;
309
- background-color: var(--container-color) !important;
310
- backdrop-filter: blur(5px) !important;
311
- border-top: var(--border-default) solid rgba(0, 0, 0, 0.12) !important;
312
- padding: 8px 16px !important;
313
- z-index: 1000 !important;
314
- min-height: 36px !important;
315
- }
316
-
317
-
318
- footer.svelte-czcr5b a,
319
- footer.svelte-czcr5b button,
320
- footer.svelte-czcr5b span {
321
- color: var(--secondarytext-color) !important;
322
- font-size: 12px !important;
323
- font-family: "Oswald", sans-serif !important;
324
- text-decoration: none !important;
325
- background: none !important;
326
- border: none !important;
327
- cursor: pointer !important;
328
- opacity: 0.8;
329
- transition: opacity 0.15s ease;
330
- }
331
-
332
-
333
- footer.svelte-czcr5b a:hover,
334
- footer.svelte-czcr5b button:hover,
335
- footer.svelte-czcr5b span:hover {
336
- opacity: 1;
337
- color: var(--primarytext-color) !important;
338
- }
339
-
340
-
341
- /* Divider style between footer links */
342
  footer.svelte-czcr5b .divider {
343
- color: var(--secondarytext-color) !important;
344
- opacity: 0.5;
345
- margin: 0 6px !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  }
347
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
 
349
- /* Make sure footer items never collapse */
350
- footer.svelte-czcr5b > * {
351
- display: inline-flex !important;
352
- align-items: center !important;
 
 
 
 
 
 
 
 
 
353
  }
 
 
 
 
1
  :root {
2
+
3
+ /* Text colors */
4
+ --primarytext-color: #0f0e09;
5
+ --secondarytext-color: #696966;
6
+
7
+ /* Primary colors - Blue theme */
8
+ --primary-dark: #1e3a8a; /* Deep blue */
9
+ --primary-light: #3b82f6; /* Medium blue */
10
+
11
+ /* Secondary colors - Blue shades */
12
+ --secondary-dark: #1d4ed8; /* Darker blue */
13
+ --secondary-light: #60a5fa; /* Light blue */
14
+
15
+ /* Chat colors */
16
+ --user_message: #bfdbfe; /* Light blue bubble for user */
17
+ --ai_message: #14b8a6; /* Medium teal for AI */
18
+ --chathistory_area: #f3f4f6; /* Very light grey for chat history */
19
+
20
+ /* Text, Chat, UI */
21
+ --Send: #3b82f6; /* Send button - medium blue (lighter than clear) */
22
+ --clear: #1e40af; /* Clear button - darker blue */
23
+ --Send-hover: #2563eb; /* Send button hover */
24
+ --clear-hover: #1d4ed8; /* Clear button hover */
25
+ --text_areabackground: #f3f4f6; /* Very light grey for text areas */
26
+
27
+
28
+ /* Border thickness variables */
 
 
 
 
29
  --border-thin: 1px;
30
  --border-medium: 2px;
31
+
32
+ /* Semantic border variables */
33
  --border-default: 1px;
34
  --border-focus: 2px;
35
  --border-hover: 3px;
36
+
37
+ /* Component-specific borders */
38
  --button-border: 2px;
39
+ --input-border: 1px;
40
+ --card-border: 1px;
41
  }
42
 
43
+ /* Import Oswald font - Google Fonts */
44
+ @import url('https://fonts.googleapis.com/css2?family=Oswald:wght@200..700&display=swap');
 
 
 
 
 
45
 
46
+ /* HIDE the HTML components that create scrollbars */
47
+ #component-1, #component-2 {
48
+ display: none !important;
49
+ height: 0 !important;
50
+ min-height: 0 !important;
51
+ padding: 0 !important;
52
+ margin: 0 !important;
53
+ visibility: hidden !important;
54
+ }
55
 
56
+ /* Specific text elements - ENSURE VISIBILITY */
57
+ body, p, span, div, h1, h2, h3, h4, h5, h6, label, a {
58
+ color: var(--primarytext-color) !important;
59
+ }
60
 
61
+ /* All Gradio text elements - OVERRIDE GRADIO DEFAULTS */
62
+ .gradio-container,
63
+ .gradio-container *:not(textarea):not(input):not(button) {
64
+ color: var(--primarytext-color) !important;
65
+ }
 
 
66
 
67
+ /* Title header with transparent background */
68
+ .title-header {
69
+ background-color: transparent;
70
+ padding: 10px 20px;
71
+ margin: 0 !important;
72
+ border-bottom: var(--border-focus) solid var(--primary-dark);
73
+ text-align: left;
74
+ flex-shrink: 0 !important;
75
+ height: 60px !important;
76
+ display: flex !important;
77
+ align-items: center !important;
78
+ width: 100% !important;
79
+ }
80
 
81
+ .title-header h1 {
82
+ font-size: 1.5rem;
83
+ font-weight: 600 !important;
84
+ color: var(--primarytext-color) !important;
85
+ margin: 0;
86
+ padding: 0;
87
  }
88
 
89
+ /* More aggressive Gradio overrides - keep current background */
90
+ .gradio-container,
91
+ .gradio-container *,
92
+ [data-testid="block-container"],
93
+ .contain {
94
+ background-color: rgb(240, 236, 230) !important;
95
+ font-family: "Oswald", sans-serif !important;
96
+ }
97
 
98
+ /* Chat container - target all possible selectors */
99
+ [data-testid="chatbot"],
100
+ .chatbot,
101
+ .gradio-chatbot,
102
+ #main-chatbot,
103
+ [role="log"] {
104
+ background-color: var(--chathistory_area) !important;
105
+ border: var(--border-default) solid var(--primary-dark) !important;
106
+ border-radius: 6px !important;
107
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1) !important;
108
  }
109
 
110
+ /* Text input - target all possible selectors */
111
+ [data-testid="textbox"] textarea,
112
+ .gradio-textbox textarea,
113
+ textarea {
114
+ background-color: var(--text_areabackground) !important;
115
+ border: var(--input-border) solid var(--secondary-dark) !important;
116
+ border-radius: 6px !important;
117
+ color: var(--primarytext-color) !important;
118
+ font-family: "Oswald", sans-serif !important;
119
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
120
  }
121
 
122
+ /* Buttons - target all possible selectors */
123
+ [data-testid="button"],
124
+ .gradio-button,
125
+ button.send-button,
126
+ button.clear-button {
127
+ border-radius: 6px !important;
128
+ font-family: "Oswald", sans-serif !important;
129
+ font-weight: 500 !important;
130
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
131
+ color: var(--primarytext-color) !important;
132
  }
133
 
134
+ /* Send button specific */
135
+ button.send-button,
136
+ [data-testid="button"]:nth-of-type(1) {
137
+ background-color: var(--Send) !important;
138
+ color: var(--primarytext-color) !important;
139
+ border: var(--button-border) solid var(--secondary-dark) !important;
140
  }
141
 
142
+ /* Clear button specific */
143
+ button.clear-button,
144
+ [data-testid="button"]:nth-of-type(2) {
145
+ background-color: var(--clear) !important;
146
+ color: var(--primarytext-color) !important;
147
+ border: var(--button-border) solid var(--secondary-dark) !important;
148
  }
149
 
150
+ /* Background area behind everything */
151
+ .gradio-container {
152
+ background-color: rgb(240, 236, 230) !important;
153
+ font-family: "Oswald", sans-serif !important;
154
+ color: var(--primarytext-color) !important;
155
+ padding: 0 !important;
156
+ margin: 0 !important;
157
+ height: 100vh !important;
158
+ max-height: 100vh !important;
159
+ overflow: hidden !important;
160
+ display: flex !important;
161
+ flex-direction: column !important;
162
  }
163
 
164
+ /* Target Gradio's internal structure */
165
+ .gradio-container > div {
166
+ height: 95% !important;
167
+ display: flex !important;
168
+ flex-direction: column !important;
169
+ padding-top: 0 !important;
170
+ margin-top: 0 !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  }
172
 
173
+ /* Main container wrapper */
174
+ .main-container {
175
+ padding-bottom: 50px !important;
 
 
176
  }
177
 
178
/* Chat history panel: the scrollable flex child between header and input row. */
#main-chatbot {
  flex: 1 !important;
  min-height: 0 !important;
  overflow-y: auto !important;
  margin: 15px 20px !important;
  padding: 15px !important;
  border: var(--border-default) solid var(--primary-dark);
  border-radius: 6px !important;
  background-color: var(--chathistory_area);
  box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
  color: var(--primarytext-color) !important;
}

/* Gradio's own chatbot wrapper gets the same frame. */
.gradio-container .gradio-chatbot {
  background-color: var(--chathistory_area) !important;
  border: var(--border-default) solid var(--primary-dark) !important;
  border-radius: 6px !important;
  box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1) !important;
}

.gradio-container .gradio-chatbot > div {
  background-color: var(--chathistory_area) !important;
}

/* Let the per-message bubbles paint their own backgrounds. */
.gradio-container .gradio-chatbot .message {
  background-color: transparent !important;
}
207
 
208
/* User message bubble. */
.message.user {
  border: 1pt solid var(--secondary-dark);
  background-color: var(--user_message);
}

/* User message content, pushed to the right edge.
   NOTE(review): the `.markdown *` arm applies border/padding/max-width to
   EVERY descendant element, not just the bubble — likely only color and
   background were meant to cascade; confirm before narrowing the selector. */
.message.user .markdown,
.message.user .markdown * {
  background-color: var(--user_message) !important;
  color: var(--primarytext-color) !important;
  border: var(--border-default) solid var(--primary-dark) !important;
  border-radius: 8px !important;
  padding: 12px 16px !important;
  max-width: 70%;
  margin-left: auto;
  margin-right: 0;
  margin-bottom: 10px !important;
  word-wrap: break-word;
  font-weight: 400 !important;
}

/* AI message bubble. */
.message.bot {
  border: 1pt solid var(--secondary-dark);
  background-color: var(--ai_message);
}

/* AI message content, pushed to the left edge (mirror of the user rule,
   same `.markdown *` caveat as above). */
.message.bot .markdown,
.message.bot .markdown * {
  background-color: var(--ai_message) !important;
  color: var(--primarytext-color) !important;
  border: var(--border-default) solid var(--secondary-dark) !important;
  border-radius: 8px !important;
  padding: 12px 16px !important;
  max-width: 70%;
  margin-left: 0;
  margin-right: auto;
  margin-bottom: 10px !important;
  word-wrap: break-word;
  font-weight: 400 !important;
}

/* Force readable text inside both bubble types (fixes invisible text). */
.message.user .markdown p,
.message.user .markdown span,
.message.user .markdown div,
.message.bot .markdown p,
.message.bot .markdown span,
.message.bot .markdown div {
  color: var(--primarytext-color) !important;
}
261
+
262
/* Input textbox wrapper. */
.input-textbox {
  border: var(--border-default) solid var(--primary-dark);
  background-color: var(--text_areabackground);
}

/* The textarea inside the chat input. */
.input-textbox textarea {
  background-color: var(--text_areabackground) !important;
  border: var(--input-border) solid var(--secondary-dark) !important;
  border-radius: 6px !important;
  color: var(--primarytext-color) !important;
  font-family: "Oswald", sans-serif !important;
  padding: 10px !important;
  resize: none !important;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

/* Broader Gradio textarea selectors, kept so the skin wins at every
   specificity level Gradio uses. */
.gradio-container textarea {
  background-color: var(--text_areabackground) !important;
  border: var(--input-border) solid var(--secondary-dark) !important;
  border-radius: 6px !important;
  color: var(--primarytext-color) !important;
  font-family: "Oswald", sans-serif !important;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
}

.gradio-container .gradio-textbox textarea {
  background-color: var(--text_areabackground) !important;
  border: var(--input-border) solid var(--secondary-dark) !important;
  border-radius: 6px !important;
  color: var(--primarytext-color) !important;
  font-family: "Oswald", sans-serif !important;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
}

/* The textbox block itself stays chromeless; the textarea carries the frame. */
.gradio-container .gradio-textbox {
  background-color: transparent !important;
  border: none !important;
}

/* Focus ring for the chat input. */
.input-textbox textarea:focus {
  border-color: var(--secondary-light) !important;
  box-shadow: 0 0 0 var(--border-focus) rgba(96, 165, 250, 0.2) !important;
}
309
+
310
/* Any other textarea: same skin as the chat input.
   FIX: the original declared `background-color` and `border` twice in this
   rule — a non-!important pair immediately shadowed by the !important pair.
   The dead declarations are removed; the winning values are unchanged. */
textarea {
  background-color: var(--text_areabackground) !important;
  border: var(--input-border) solid var(--secondary-dark) !important;
  border-radius: 6px !important;
  color: var(--primarytext-color) !important;
  font-family: "Oswald", sans-serif !important;
  padding: 10px !important;
  resize: none !important;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

/* Generic focus ring. */
textarea:focus {
  border-color: var(--secondary-light) !important;
  box-shadow: 0 0 0 var(--border-focus) rgba(96, 165, 250, 0.2) !important;
}

/* Keep typed text visible everywhere. */
textarea, input {
  color: var(--primarytext-color) !important;
}

/* Muted placeholder text. */
textarea::placeholder,
input::placeholder {
  color: var(--secondarytext-color) !important;
}
340
+
341
/* Send button.
   FIX: duplicate `border`/`background-color` declarations removed — the
   original declared each twice, with the non-!important copy shadowed by
   the !important one; the winning values are unchanged. */
.send-button {
  background-color: var(--Send) !important;
  color: var(--primarytext-color) !important;
  border: var(--button-border) solid var(--secondary-dark) !important;
  border-radius: 6px !important;
  font-weight: 500 !important;
  padding: 8px 16px !important;
  margin-bottom: 5px !important;
  width: 100% !important;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

/* Higher-specificity copy so Gradio's own button styles can't win. */
.gradio-container button.send-button {
  background-color: var(--Send) !important;
  color: var(--primarytext-color) !important;
  border: var(--button-border) solid var(--secondary-dark) !important;
  border-radius: 6px !important;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
  font-family: "Oswald", sans-serif !important;
}

/* Send hover (shadowed duplicate background-color removed). */
.send-button:hover {
  background-color: var(--Send-hover) !important;
  border-color: var(--secondary-dark) !important;
}

/* Clear button (same dedupe as .send-button). */
.clear-button {
  background-color: var(--clear) !important;
  color: var(--primarytext-color) !important;
  border: var(--button-border) solid var(--secondary-dark) !important;
  border-radius: 6px !important;
  font-weight: 500 !important;
  padding: 8px 16px !important;
  margin-bottom: 5px !important;
  width: 100% !important;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

/* Higher-specificity copy for the clear button. */
.gradio-container button.clear-button {
  background-color: var(--clear) !important;
  color: var(--primarytext-color) !important;
  border: var(--button-border) solid var(--secondary-dark) !important;
  border-radius: 6px !important;
  box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
  font-family: "Oswald", sans-serif !important;
}

/* Clear hover (shadowed duplicate background-color removed). */
.clear-button:hover {
  background-color: var(--clear-hover) !important;
  border-color: var(--secondary-dark) !important;
}

/* Last-resort button text visibility. */
button, .gradio-button {
  color: var(--primarytext-color) !important;
}
409
+
410
+
411
/* Row holding the textbox plus the button column; never shrinks away. */
.input-controls {
  width: 100% !important;
  padding: 15px 20px !important;
  background-color: transparent !important;
  flex-shrink: 0 !important;
}

/* Narrow column for the stacked send/clear buttons. */
.button-column {
  min-width: 80px !important;
  margin-left: 10px !important;
}
424
+
425
/* Pin Gradio's footer to the viewport bottom with a translucent blur.
   NOTE(review): `svelte-czcr5b` is a build-generated class hash — it will
   change when Gradio is upgraded; confirm on each dependency bump. */
footer.svelte-czcr5b {
  display: flex !important;
  visibility: visible !important;
  position: fixed !important;
  bottom: 0 !important;
  left: 0 !important;
  right: 0 !important;
  background-color: transparent !important;
  backdrop-filter: blur(5px) !important;
  border-top: var(--border-default) solid #59524f !important;
  padding: 8px 16px !important;
  z-index: 1000 !important;
  height: auto !important;
  min-height: 40px !important;
}

/* De-emphasize the footer's own buttons. */
footer.svelte-czcr5b button {
  background-color: transparent !important;
  color: var(--secondarytext-color) !important;
  border: none !important;
  font-family: "Oswald", sans-serif !important;
  font-size: 12px !important;
}

/* Footer divider matches the muted footer text. */
footer.svelte-czcr5b .divider {
  color: var(--secondarytext-color) !important;
}

/* Small screens: wider chat bubbles, tighter input row. */
@media (max-width: 768px) {
  .message.bot .markdown,
  .message.user .markdown {
    max-width: 85%;
  }

  .input-controls {
    padding: 10px !important;
  }
}
467
+
468
/* Gradio theme variables captured from the live app's dark mode, remapped
   onto this project's palette so rendering is consistent regardless of the
   user's system color-scheme setting.
   NOTE(review): `:root .dark` only matches elements carrying class `dark`
   that are DESCENDANTS of the root — it does not fire when no `.dark`
   class is present. If these variables are meant to apply unconditionally,
   the intended selector may be `:root, .dark`; confirm against the DOM. */
:root .dark {
  /* Body / global */
  --body-background-fill: var(--text_areabackground);
  --body-text-color: var(--primarytext-color);
  --color-accent-soft: var(--primary-light);
  --background-fill-primary: var(--chathistory_area);
  --background-fill-secondary: var(--text_areabackground);
  --border-color-accent: var(--primary-dark);
  --border-color-primary: var(--primary-dark);
  --link-text-color-active: var(--secondary-light);
  --link-text-color: var(--secondary-light);
  --link-text-color-hover: var(--secondary-dark);
  --link-text-color-visited: var(--secondary-dark);
  --body-text-color-subdued: var(--secondarytext-color);
  --accordion-text-color: var(--primarytext-color);
  --table-text-color: var(--primarytext-color);
  --shadow-spread: 1px;

  /* Blocks and panels.
     NOTE(review): some names below use underscores (--block_border_width)
     where Gradio's canonical names use hyphens — copied verbatim from the
     captured HTML; verify which spelling Gradio actually reads. */
  --block-background-fill: var(--chathistory_area);
  --block-border-color: var(--primary-dark);
  --block_border_width: 1.5px;
  --block-info-text-color: var(--secondarytext-color);
  --block-label-background-fill: var(--text_areabackground);
  --block-label-border-color: var(--primary-dark);
  --block_label_border_width: 1.5px;
  --block-label-text-color: var(--primarytext-color);
  --block_shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
  --block_title_background_fill: transparent;
  --block_title_border_color: var(--primary-dark);
  --block_title_border_width: 1.5px;
  --block-title-text-color: var(--primarytext-color);
  --panel-background-fill: var(--text_areabackground);
  --panel-border-color: var(--primary-dark);
  --panel_border_width: 1.5px;
  --border-color-accent-subdued: var(--secondary-dark);
  --code-background-fill: var(--primary-light);

  /* Checkboxes */
  --checkbox-background-color: var(--text_areabackground);
  --checkbox-background-color-focus: var(--text_areabackground);
  --checkbox-background-color-hover: var(--text_areabackground);
  --checkbox-background-color-selected: var(--primary-light);
  --checkbox-border-color: var(--primary-dark);
  --checkbox-border-color-focus: var(--secondary-light);
  --checkbox-border-color-hover: var(--secondary-dark);
  --checkbox-border-color-selected: var(--secondary-light);
  --checkbox-border-width: 1.5px;
  --checkbox-label-background-fill: var(--text_areabackground);
  --checkbox-label-background-fill-hover: var(--text_areabackground);
  --checkbox-label-background-fill-selected: var(--primary-light);
  --checkbox-label-border-color: var(--primary-dark);
  --checkbox-label-border-color-hover: var(--secondary-dark);
  --checkbox-label-border-color-selected: var(--secondary-light);
  --checkbox-label-border-width: 1.5px;
  --checkbox-label-text-color: var(--primarytext-color);
  --checkbox-label-text-color-selected: var(--primarytext-color);

  /* Errors */
  --error-background-fill: var(--text_areabackground);
  --error-border-color: #ef4444;
  --error_border_width: 1.5px;
  --error-text-color: #ef4444;
  --error-icon-color: #ef4444;

  /* Inputs */
  --input-background-fill: var(--text_areabackground);
  --input_background_fill_focus: var(--text_areabackground);
  --input-background-fill-hover: var(--text_areabackground);
  --input-border-color: var(--secondary-dark);
  --input-border-color-focus: var(--secondary-light);
  --input-border-color-hover: var(--secondary-dark);
  --input_border_width: 1.5px;
  --input-placeholder-color: var(--secondarytext-color);
  --input_shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
  --input-shadow-focus: 0 0 0 var(--shadow-spread) rgba(96, 165, 250, 0.2), var(--shadow-inset);

  /* Misc widgets */
  --loader_color: var(--secondary-light);
  --slider_color: var(--secondary-light);
  --stat-background-fill: linear-gradient(to right, var(--primary-light), var(--primary-dark));
  --table-border-color: var(--primary-dark);
  --table-even-background-fill: var(--text_areabackground);
  --table-odd-background-fill: var(--chathistory_area);
  --table-row-focus: var(--secondary-light);

  /* Buttons */
  --button-border-width: 1.5px;
  --button-cancel-background-fill: var(--clear);
  --button-cancel-background-fill-hover: var(--clear-hover);
  --button-cancel-border-color: var(--secondary-dark);
  --button-cancel-border-color-hover: var(--secondary-dark);
  --button-cancel-text-color: var(--primarytext-color);
  --button-cancel-text-color-hover: var(--primarytext-color);
  --button-primary-background-fill: var(--Send);
  --button-primary-background-fill-hover: var(--Send-hover);
  --button-primary-border-color: var(--secondary-dark);
  --button-primary-border-color-hover: var(--secondary-dark);
  --button-primary-text-color: var(--primarytext-color);
  --button-primary-text-color-hover: var(--primarytext-color);
  --button_primary_shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
  --button-primary-shadow-hover: 0 2px 4px rgba(0, 0, 0, 0.1);
  --button-primary-shadow-active: 0 2px 4px rgba(0, 0, 0, 0.1);
  --button-secondary-background-fill: var(--clear);
  --button-secondary-background-fill-hover: var(--clear-hover);
  --button-secondary-border-color: var(--secondary-dark);
  --button-secondary-border-color-hover: var(--secondary-dark);
  --button-secondary-text-color: var(--primarytext-color);
  --button-secondary-text-color-hover: var(--primarytext-color);
  --button_secondary_shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
  --button-secondary-shadow-hover: 0 2px 4px rgba(0, 0, 0, 0.1);
  --button-secondary-shadow-active: 0 2px 4px rgba(0, 0, 0, 0.1);

  /* Base palette scales (Gradio defaults, kept verbatim) */
  --name: origin;
  --primary-50: #eff6ff;
  --primary-100: #dbeafe;
  --primary-200: #bfdbfe;
  --primary-300: #93c5fd;
  --primary-400: #60a5fa;
  --primary-500: #3b82f6;
  --primary-600: #2563eb;
  --primary-700: #1d4ed8;
  --primary-800: #1e40af;
  --primary-900: #1e3a8a;
  --primary-950: #172554;
  --secondary-50: #f0f9ff;
  --secondary-100: #e0f2fe;
  --secondary-200: #bae6fd;
  --secondary-300: #7dd3fc;
  --secondary-400: #38bdf8;
  --secondary-500: #0ea5e9;
  --secondary-600: #0284c7;
  --secondary-700: #0369a1;
  --secondary-800: #075985;
  --secondary-900: #0c4a6e;
  --secondary-950: #082f49;
  --neutral-50: #f9fafb;
  --neutral-100: #f3f4f6;
  --neutral-200: #e5e7eb;
  --neutral-300: #d1d5db;
  --neutral-400: #9ca3af;
  --neutral-500: #6b7280;
  --neutral-600: #4b5563;
  --neutral-700: #374151;
  --neutral-800: #1f2937;
  --neutral-900: #111827;
  --neutral-950: #0b0f19;

  /* Spacing / radius / type scales */
  --spacing-xxs: 1px;
  --spacing-xs: 2px;
  --spacing-sm: 4px;
  --spacing-md: 6px;
  --spacing-lg: 8px;
  --spacing-xl: 10px;
  --spacing-xxl: 16px;
  --radius-xxs: 1px;
  --radius-xs: 2px;
  --radius-sm: 4px;
  --radius-md: 6px;
  --radius-lg: 8px;
  --radius-xl: 12px;
  --radius-xxl: 22px;
  --text-xxs: 9px;
  --text-xs: 10px;
  --text-sm: 12px;
  --text-md: 14px;
  --text-lg: 16px;
  --text-xl: 22px;
  --text-xxl: 26px;
  --font: 'Oswald', ui-sans-serif, system-ui, sans-serif;
  --font-mono: 'IBM Plex Mono', ui-monospace, Consolas, monospace;
  --body-text-size: var(--text-md);
  --body-text-weight: 400;
  --embed-radius: var(--radius-sm);
  --color-accent: var(--secondary-light);
  --shadow-drop: 0 2px 4px rgba(0, 0, 0, 0.1);
  --shadow-drop-lg: 0 4px 8px rgba(0, 0, 0, 0.1);
  --shadow-inset: 0 2px 4px rgba(0, 0, 0, 0.1) inset;

  /* Block details */
  --block-border-width: 1.5px;
  --block-info-text-size: var(--text-sm);
  --block-info-text-weight: 400;
  --block-label-border-width: 1.5px;
  --block-label-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
  --block-label-margin: 0;
  --block-label-padding: var(--spacing-sm) var(--spacing-lg);
  --block-label-radius: calc(var(--radius-sm) - 1px) 0 calc(var(--radius-sm) - 1px) 0;
  --block-label-right-radius: 0 calc(var(--radius-sm) - 1px) 0 calc(var(--radius-sm) - 1px);
  --block-label-text-size: var(--text-sm);
  --block-label-text-weight: 400;
  --block-padding: var(--spacing-xl) calc(var(--spacing-xl) + 2px);
  --block-radius: var(--radius-sm);
  --block-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
  --block-title-background-fill: transparent;
  --block-title-border-color: var(--primary-dark);
  --block-title-border-width: 1.5px;
  --block-title-padding: 0;
  --block-title-radius: var(--radius-sm);
  --block-title-text-size: var(--text-md);
  --block-title-text-weight: 400;
  --container-radius: var(--radius-sm);
  --form-gap-width: 1.5px;
  --layout-gap: var(--spacing-xxl);
  --panel-border-width: 1.5px;
  --section-header-text-size: var(--text-md);
  --section-header-text-weight: 400;
  --chatbot-text-size: var(--text-lg);

  /* Checkbox details */
  --checkbox-border-radius: var(--radius-sm);
  --checkbox-label-gap: var(--spacing-lg);
  --checkbox-label-padding: var(--spacing-md) calc(2 * var(--spacing-md));
  --checkbox-label-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
  --checkbox-label-text-size: var(--text-md);
  --checkbox-label-text-weight: 400;
  --checkbox-check: url(data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='%230f0e09' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M12.207 4.793a1 1 0 010 1.414l-5 5a1 1 0 01-1.414 0l-2-2a1 1 0 011.414-1.414L6.5 9.086l4.293-4.293a1 1 0 011.414 0z'/%3e%3c/svg%3e);
  --radio-circle: url(data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='%230f0e09' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle cx='8' cy='8' r='3'/%3e%3c/svg%3e);
  --checkbox-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);

  /* Input details */
  --error-border-width: 1.5px;
  --input-background-fill-focus: var(--text_areabackground);
  --input-border-width: 1.5px;
  --input-padding: var(--spacing-xl);
  --input-radius: var(--radius-sm);
  --input-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
  --input-text-size: var(--text-md);
  --input-text-weight: 400;
  --loader-color: var(--secondary-light);
  --prose-text-size: var(--text-md);
  --prose-text-weight: 400;
  --prose-header-text-weight: 600;
  --slider-color: var(--secondary-light);
  --table-radius: var(--radius-sm);

  /* Button sizing */
  --button-transform-hover: none;
  --button-transform-active: none;
  --button-transition: none;
  --button-large-padding: var(--spacing-lg) calc(2 * var(--spacing-lg));
  --button-large-radius: var(--radius-md);
  --button-large-text-size: var(--text-lg);
  --button-large-text-weight: 600;
  --button-primary-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
  --button-secondary-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
  --button-small-padding: var(--spacing-sm) calc(1.5 * var(--spacing-sm));
  --button-small-radius: var(--radius-md);
  --button-small-text-size: var(--text-sm);
  --button-small-text-weight: 400;
  --button-medium-padding: var(--spacing-md) calc(2 * var(--spacing-md));
  --button-medium-radius: var(--radius-md);
  --button-medium-text-size: var(--text-md);
  --button-medium-text-weight: 600;
}
703
+
704
/* Global text override — the sledgehammer that fixes white-on-white text.
   NOTE(review): `* { … !important }` also recolors icons, links, and code
   highlighting; a narrower selector list would be safer long-term. */
* {
  color: var(--primarytext-color) !important;
  font-family: "Oswald", sans-serif !important;
}

/* Keep the page background consistent even when the OS prefers dark. */
@media (prefers-color-scheme: dark) {
  body {
    background: var(--text_areabackground);
    color: var(--primarytext-color);
  }
}

/* Default body skin (light mode and no-preference case). */
body {
  background: var(--text_areabackground);
  color: var(--primarytext-color);
  font-family: "Oswald", sans-serif;
}
722
+
723
/* "Thinking…" indicator shown while the bot composes a reply. */
.thinking-indicator {
  display: inline-flex;
  align-items: center;
  margin: 5px 0;
  padding: 8px 12px;
  background-color: transparent;
}

/* Horizontal row holding the three animated dots. */
.dots-container {
  display: inline-flex;
  align-items: center;
  gap: 3px;
}
737
/* A single pulsing dot; the three siblings are phase-shifted below so the
   pulse appears to travel left-to-right. */
.dot {
  display: inline-block;
  width: 4px;
  height: 4px;
  border-radius: 50%;
  background-color: var(--primary-light);
  animation: pulse 1.5s infinite ease-in-out;
}

.dot:nth-child(1) { animation-delay: 0s; }
.dot:nth-child(2) { animation-delay: 0.3s; }
.dot:nth-child(3) { animation-delay: 0.6s; }

/* Grow-and-brighten pulse: dim/small at rest, bright/large at the peak. */
@keyframes pulse {
  0%, 70%, 100% {
    transform: scale(0.8);
    opacity: 0.4;
  }
  35% {
    transform: scale(1.2);
    opacity: 1;
  }
}