Spaces:

RedRepter
/

TutorX-MCP

Running

App Files Community

Meet Patel committed on Jun 7, 2025

Commit

14940e1

1 Parent(s): 5e80e3b

Step 3: Added multi-modal interaction capabilities with text, voice, and handwriting processing

Browse files

Files changed (3) hide show

main.py +109 -0
utils/__init__.py +3 -0
utils/multimodal.py +140 -0

main.py CHANGED Viewed

@@ -4,6 +4,14 @@ import json
 from typing import List, Dict, Any, Optional
 from datetime import datetime
 # Create the TutorX MCP server
 mcp = FastMCP("TutorX")
@@ -346,5 +354,106 @@ def update_accessibility_settings(student_id: str, settings: Dict[str, Any]) ->
         "updated_at": datetime.now().isoformat()
     }
 if __name__ == "__main__":
     mcp.run()

 from typing import List, Dict, Any, Optional
 from datetime import datetime
+# Import utility functions for multi-modal interactions
+from utils.multimodal import (
+    process_text_query,
+    process_voice_input,
+    process_handwriting,
+    generate_speech_response
+)
 # Create the TutorX MCP server
 mcp = FastMCP("TutorX")
         "updated_at": datetime.now().isoformat()
     }
+# ------------------ Multi-Modal Interaction ------------------
+@mcp.tool()
+def text_interaction(query: str, student_id: str, session_context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """
+    Process a text query from the student
+    The caller's session_context is never modified: the context passed on to
+    process_text_query is a fresh dict built from it with "student_id" added
+    (overriding any "student_id" key already present).
+    Args:
+        query: The text query from the student
+        student_id: The student's unique identifier
+        session_context: Optional context about the current session
+    Returns:
+        The dict returned by process_text_query for this query
+    """
+    # Build a new dict instead of writing into session_context, so a context
+    # object reused by the caller does not silently pick up the student id
+    context = {**(session_context or {}), "student_id": student_id}
+    return process_text_query(query, context)
+@mcp.tool()
+def voice_interaction(audio_data_base64: str, student_id: str) -> Dict[str, Any]:
+    """
+    Handle a spoken query: transcribe it, answer it as text, and voice the answer
+    Args:
+        audio_data_base64: Base64 encoded audio data
+        student_id: The student's unique identifier
+    Returns:
+        Dict with the input transcription and its confidence, the detected
+        emotions (empty dict when none are reported), the text answer, the
+        generated speech response and an ISO-format timestamp
+    """
+    # Step 1: speech -> text
+    voice_result = process_voice_input(audio_data_base64)
+    # Step 2: answer the transcription exactly like a typed question
+    answer = process_text_query(voice_result["transcription"], {"student_id": student_id})
+    # Step 3: text -> speech, always using the tutor voice
+    tutor_voice = {"voice_id": "educational_tutor"}
+    spoken_answer = generate_speech_response(answer["response"], tutor_voice)
+    # Step 4: merge everything into a single payload
+    combined: Dict[str, Any] = {}
+    combined["input_transcription"] = voice_result["transcription"]
+    combined["input_confidence"] = voice_result["confidence"]
+    combined["detected_emotions"] = voice_result.get("detected_emotions", {})
+    combined["text_response"] = answer["response"]
+    combined["speech_response"] = spoken_answer
+    combined["timestamp"] = datetime.now().isoformat()
+    return combined
+@mcp.tool()
+def handwriting_recognition(image_data_base64: str, student_id: str) -> Dict[str, Any]:
+    """
+    Recognize handwritten input and, for math equations, attach a solution
+    Args:
+        image_data_base64: Base64 encoded image data of handwriting
+        student_id: The student's unique identifier (not used by this implementation)
+    Returns:
+        Dict with the transcription, its confidence, the detected content type,
+        a "solution" entry (None unless the content is a math equation) and an
+        ISO-format timestamp
+    """
+    recognized = process_handwriting(image_data_base64)
+    content_type = recognized["detected_content_type"]
+    # Only math equations get a solution; everything else reports None
+    solution = None
+    if content_type == "math_equation":
+        # NOTE: simulated solver - the quadratic steps below are fixed demo
+        # output and are not derived from the recognized equation
+        is_quadratic = recognized["equation_type"] == "quadratic"
+        solution = {"equation": recognized["transcription"]}
+        if is_quadratic:
+            solution["solution_steps"] = [
+                "x^2 + 5x + 6 = 0",
+                "Factor: (x + 2)(x + 3) = 0",
+                "x + 2 = 0 or x + 3 = 0",
+                "x = -2 or x = -3",
+            ]
+            solution["solutions"] = [-2, -3]
+        else:
+            solution["note"] = "Solution not implemented for this equation type"
+    payload = {
+        "transcription": recognized["transcription"],
+        "confidence": recognized["confidence"],
+        "detected_content_type": content_type,
+        "solution": solution,
+        "timestamp": datetime.now().isoformat(),
+    }
+    return payload
 if __name__ == "__main__":
     mcp.run()

utils/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+"""
+TutorX MCP Server utilities.
+"""

utils/multimodal.py ADDED Viewed

	@@ -0,0 +1,140 @@

+"""
+Utility functions for multi-modal interactions including text processing,
+voice recognition and handwriting recognition for the TutorX MCP server.
+"""
+from typing import Dict, Any, List, Optional
+import base64
+import json
+from datetime import datetime
+def process_text_query(query: str, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """
+    Produce a canned tutor reply for a student's text query
+    The query is matched case-insensitively against a fixed list of trigger
+    phrases; the first phrase found anywhere in the query decides the reply.
+    Args:
+        query: The text query from the student
+        context: Optional context about the student and current session
+            (accepted but not used by this demonstration implementation)
+    Returns:
+        Dict with the original query, a response type, the response text,
+        a confidence value and an ISO-format timestamp
+    """
+    # Placeholder for real NLP: (trigger phrase, response type, response text),
+    # checked top to bottom
+    canned_replies = (
+        ("solve", "math_solution", "To solve this equation, first isolate the variable by..."),
+        ("what is", "definition", "This concept refers to..."),
+        ("how do i", "procedure", "Follow these steps: 1)..."),
+        ("help", "assistance", "I'm here to help! You can ask me questions about..."),
+    )
+    lowered = query.lower()
+    hit = next((entry for entry in canned_replies if entry[0] in lowered), None)
+    if hit is None:
+        # No trigger phrase present: generic reply with lower confidence
+        response_type = "general"
+        response_text = "That's an interesting question. Let me think about how to help you with that."
+        confidence = 0.6
+    else:
+        _, response_type, response_text = hit
+        confidence = 0.85
+    return {
+        "query": query,
+        "response_type": response_type,
+        "response": response_text,
+        "confidence": confidence,
+        "timestamp": datetime.now().isoformat(),
+    }
+def process_voice_input(audio_data_base64: str) -> Dict[str, Any]:
+    """
+    Return a simulated speech-recognition result for a voice clip
+    Args:
+        audio_data_base64: Base64 encoded audio data (not inspected by this
+            demonstration implementation)
+    Returns:
+        Dict with a fixed transcription, its confidence, detected emotion
+        scores, audio quality / background noise labels and an ISO-format
+        timestamp
+    """
+    # Placeholder for a real ASR pipeline: every call yields the same sample
+    # transcription and analysis, only the timestamp varies
+    simulated_emotions = {"confusion": 0.7, "interest": 0.9, "frustration": 0.2}
+    simulated = dict(
+        transcription="What is the quadratic formula?",
+        confidence=0.92,
+        detected_emotions=simulated_emotions,
+        audio_quality="good",
+        background_noise="low",
+    )
+    simulated["timestamp"] = datetime.now().isoformat()
+    return simulated
+def process_handwriting(image_data_base64: str) -> Dict[str, Any]:
+    """
+    Return a simulated handwriting-recognition result for an image
+    Args:
+        image_data_base64: Base64 encoded image data of handwriting (not
+            inspected by this demonstration implementation)
+    Returns:
+        Dict with a fixed transcription, its confidence, the detected content
+        type, the equation type, a LaTeX rendering and an ISO-format timestamp
+    """
+    # Placeholder for real OCR / handwriting recognition: every call reports
+    # the same sample quadratic, only the timestamp varies
+    sample_equation = "x^2 + 5x + 6 = 0"
+    simulated: Dict[str, Any] = {"transcription": sample_equation}
+    simulated["confidence"] = 0.85
+    simulated["detected_content_type"] = "math_equation"
+    simulated["equation_type"] = "quadratic"
+    simulated["parsed_latex"] = sample_equation
+    simulated["timestamp"] = datetime.now().isoformat()
+    return simulated
+def generate_speech_response(text: str, voice_params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """
+    Generate speech response from text
+    Args:
+        text: The text to convert to speech
+        voice_params: Optional parameters for the voice. Only "voice_id" is
+            read here; when voice_params is omitted, None, or lacks that key,
+            the voice id "default" is used.
+    Returns:
+        Dict with the input text, simulated audio data and format, the voice
+        id, an estimated duration in seconds, the sample rate and an
+        ISO-format timestamp
+    """
+    # voice_params defaults to None, so normalise it before reading from it;
+    # calling .get() on None would raise AttributeError
+    params = voice_params or {}
+    # In a real implementation, this would use TTS to generate audio
+    # For demonstration, we'll simulate audio generation metadata
+    return {
+        "text": text,
+        "audio_format": "mp3",
+        "audio_data_base64": "SIMULATED_BASE64_AUDIO_DATA",
+        "voice_id": params.get("voice_id", "default"),
+        "duration_seconds": len(text) / 15,  # Rough estimate of speech duration
+        "sample_rate": 24000,
+        "timestamp": datetime.now().isoformat()
+    }