Adedoyinjames committed on
Commit
5374858
·
verified ·
1 Parent(s): f829ce5

Upload 6 files

Browse files
Files changed (6) hide show
  1. README.md +58 -12
  2. app (1).py +166 -0
  3. brain.py +125 -0
  4. middleware.py +96 -0
  5. requirements (2).txt +7 -0
  6. unity_bridge.py +58 -0
README.md CHANGED
@@ -1,12 +1,58 @@
1
- ---
2
- title: Ladybug
3
- emoji: 📈
4
- colorFrom: gray
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 6.6.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Embodied AI Teacher Platform (Backend)
2
+
3
+ Research-lab-grade backend implementing a robotics-style architecture for an embodied humanoid teacher.
4
+
5
+ ## Stack
6
+ - **Brain Layer:** `BrainManager` using Hugging Face Router (default model: `Qwen/Qwen3-VL-235B-A22B-Instruct:novita`)
7
+ - **Middleware Layer:** ROS-like MCP publish/subscribe bus with telemetry + teacher state machine
8
+ - **Body Bridge:** Unity WebSocket bridge for gesture/body/gaze command propagation
9
+ - **Runtime Layer:** FastAPI + Gradio, WebSocket + REST, speech streaming surfaces
10
+
11
+ ## Architecture Diagram
12
+
13
+ ```mermaid
14
+ flowchart LR
15
+ S[Student Input\nText/Image/Speech] --> API[FastAPI /teach + /ws]
16
+ API --> B[BrainManager\nHF Router LLM]
17
+ B --> M[MCP Middleware\nPubSub + StateMachine + Telemetry]
18
+ M --> FE[React Classroom\nThree.js Avatar + Board]
19
+ M --> U[Unity Bridge\nWebSocket Motion Commands]
20
+ M --> SC[Speech Chunk Topic\nteacher.speech.chunk]
21
+ FE --> API
22
+ ```
23
+
24
+ ## Endpoints
25
+ - `POST /teach` -> returns one MCP action
26
+ - `WS /ws` -> bi-directional real-time classroom stream
27
+ - outbound events: `teacher_action`, `board_write`, `board_draw`, `speech_chunk`, `telemetry_snapshot`
28
+ - inbound events: `student_input`, `telemetry_request`
29
+ - `POST /speech/stream?text=...` -> streaming audio bytes interface
30
+ - `POST /speech/upload` -> accepts voice file for future ASR integration
31
+ - `WS /unity/ws` -> Unity motion command channel
32
+ - `GET /gradio` -> debugging console on Spaces
33
+
34
+ ## Setup
35
+ ```bash
36
+ cd embodied_teacher_backend
37
+ python -m venv .venv
38
+ source .venv/bin/activate
39
+ pip install -r requirements.txt
40
+ export HF_TOKEN=your_hf_token
41
+ uvicorn app:app --host 0.0.0.0 --port 7860
42
+ ```
43
+
44
+ ## Unity Motion Protocol
45
+ Unity receives JSON messages of shape:
46
+ ```json
47
+ {
48
+ "type": "mcp_motion",
49
+ "gesture": "open_hand_explain",
50
+ "body_motion": "stand",
51
+ "gaze_target": "student"
52
+ }
53
+ ```
54
+
55
+ ## Hugging Face Spaces Notes
56
+ - Set `HF_TOKEN` in Spaces Secrets.
57
+ - Default server port 7860 is compatible with Spaces runtime.
58
+ - `/gradio` gives quick manual validation while REST/WS serve production clients.
app (1).py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import base64
3
+ import json
4
+ import logging
5
+ from contextlib import suppress
6
+ from typing import Any, Dict, Optional
7
+
8
+ import gradio as gr
9
+ from fastapi import FastAPI, File, UploadFile, WebSocket, WebSocketDisconnect
10
+ from fastapi.middleware.cors import CORSMiddleware
11
+ from fastapi.responses import StreamingResponse
12
+ from pydantic import BaseModel, Field
13
+
14
+ from brain import BrainManager
15
+ from middleware import MCPMiddleware
16
+ from unity_bridge import UnityBridge
17
+
18
# Configure logging once at import time; module loggers inherit this level.
logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

app = FastAPI(title="Embodied AI Teacher Platform", version="1.1.0")
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive; acceptable for a demo Space, but lock origins down in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Singletons shared by every endpoint: LLM brain, pub/sub middleware, and the
# Unity WebSocket bridge (whose router exposes /unity/ws on this app).
brain = BrainManager()
middleware = MCPMiddleware()
unity = UnityBridge()
app.include_router(unity.router)
34
+
35
+
36
class TeachRequest(BaseModel):
    """Request body for POST /teach: one student turn, optionally multimodal."""

    text: str = Field(..., description="Student utterance or question")
    image_url: Optional[str] = Field(None, description="Optional multimodal image URL")
39
+
40
+
41
async def _publish_speech_chunks(speech: str) -> None:
    """Stream the teacher's speech word-by-word onto the speech-chunk topic.

    Each whitespace-separated token becomes its own event, with a short pause
    between publishes so subscribers receive a paced, real-time-feeling stream.
    """
    words = speech.split()
    for word in words:
        payload = {"token": word}
        await middleware.publish("teacher.speech.chunk", payload)
        await asyncio.sleep(0.01)
45
+
46
+
47
@app.get("/health")
async def health() -> Dict[str, str]:
    """Liveness probe: always reports the service as up."""
    return dict(status="ok")
50
+
51
+
52
@app.post("/teach")
async def teach(req: TeachRequest) -> Dict[str, Any]:
    """Run one teaching turn: brain -> middleware validation -> body/speech fan-out.

    Returns the validated MCP action plus the current telemetry event count.
    """
    action_raw = await brain.generate_teacher_action(req.text, image_url=req.image_url)
    action = await middleware.apply_teacher_action(action_raw)
    # BUG FIX: broadcast the *validated* action rather than the raw LLM output,
    # so Unity only ever sees sanctioned gesture/gaze/body values — this also
    # matches what the /ws handler already does.
    await unity.broadcast_motion(action.__dict__)
    await _publish_speech_chunks(action.speech)
    return {"action": action.__dict__, "telemetry_count": len(middleware.telemetry)}
59
+
60
+
61
@app.post("/speech/stream")
async def speech_stream(text: str) -> StreamingResponse:
    """Stream the given text token-by-token as a chunked response.

    Placeholder for a real TTS pipeline: today the chunks are UTF-8 text, so
    they are labelled as text. Swap in synthesized audio bytes (and an
    audio/* media type) once a TTS model is integrated.
    """
    async def chunk_stream():
        for token in text.split():
            yield f"{token} ".encode("utf-8")
            await asyncio.sleep(0.03)

    # BUG FIX: this was advertised as audio/wav while emitting plain UTF-8
    # text, which breaks any client that tries to decode the body as WAV.
    return StreamingResponse(chunk_stream(), media_type="text/plain; charset=utf-8")
69
+
70
+
71
@app.post("/speech/upload")
async def speech_upload(file: UploadFile = File(...)) -> Dict[str, Any]:
    """Accept a voice recording and echo basic metadata (future ASR hook)."""
    raw = await file.read()
    encoded = base64.b64encode(raw).decode("utf-8")
    return {
        "filename": file.filename,
        "bytes": len(raw),
        "preview": encoded[:160],
        "note": "Integrate ASR model here for transcription.",
    }
81
+
82
+
83
@app.websocket("/ws")
async def classroom_ws(websocket: WebSocket) -> None:
    """Bi-directional classroom stream.

    Outbound: background tasks pump middleware topics (actions, board
    writes/draws, speech chunks) to the client as typed JSON events.
    Inbound: `student_input` triggers a full teaching turn;
    `telemetry_request` returns a telemetry snapshot.
    """
    await websocket.accept()
    tasks: list[asyncio.Task] = []

    async def pump(topic: str, event_type: str) -> None:
        # Forward every middleware event on `topic` as one flattened JSON frame.
        async for event in middleware.subscribe(topic):
            await websocket.send_text(
                json.dumps(
                    {
                        "type": event_type,
                        "topic": event.topic,
                        "ts": event.ts,
                        **event.payload,
                    }
                )
            )

    topics = {
        "teacher.actions": "teacher_action",
        "teacher.board.write": "board_write",
        "teacher.board.draw": "board_draw",
        "teacher.speech.chunk": "speech_chunk",
    }

    try:
        for topic, event_type in topics.items():
            tasks.append(asyncio.create_task(pump(topic, event_type)))

        while True:
            inbound = await websocket.receive_text()
            # BUG FIX: a malformed JSON frame used to raise JSONDecodeError and
            # tear down the whole session; now the client gets an error frame
            # and the connection survives.
            try:
                msg = json.loads(inbound)
            except json.JSONDecodeError:
                await websocket.send_text(
                    json.dumps({"type": "error", "detail": "invalid JSON payload"})
                )
                continue
            if msg.get("type") == "student_input":
                action_raw = await brain.generate_teacher_action(
                    msg.get("text", ""), image_url=msg.get("image_url")
                )
                action = await middleware.apply_teacher_action(action_raw)
                await unity.broadcast_motion(action.__dict__)
                await _publish_speech_chunks(action.speech)
                await websocket.send_text(
                    json.dumps({"type": "ack", "state": action.teaching_state})
                )
            elif msg.get("type") == "telemetry_request":
                await websocket.send_text(
                    json.dumps(
                        {
                            "type": "telemetry_snapshot",
                            "events": middleware.get_telemetry_snapshot(),
                        }
                    )
                )
    except WebSocketDisconnect:
        LOGGER.info("Classroom client disconnected")
    finally:
        # Tear down the topic pumps; each re-raises CancelledError on await.
        for task in tasks:
            task.cancel()
            with suppress(asyncio.CancelledError):
                await task
141
+
142
+
143
def _gradio_teach(text: str) -> str:
    """Synchronous Gradio wrapper around the async brain call.

    Gradio invokes this in a worker thread with no running event loop, so
    asyncio.run() is safe here — and unlike the previous manual
    new_event_loop()/close() dance, it also cancels leftover tasks and shuts
    down async generators before closing the loop.
    """
    action = asyncio.run(brain.generate_teacher_action(text))
    return json.dumps(action, indent=2)
150
+
151
+
152
# Minimal Gradio console mounted at /gradio for manually smoke-testing the
# brain layer on Hugging Face Spaces (bypasses middleware and Unity fan-out).
gradio_ui = gr.Interface(
    fn=_gradio_teach,
    inputs=gr.Textbox(label="Student Question"),
    outputs=gr.Code(language="json", label="MCP Teacher Action"),
    title="Embodied Teacher Brain Console",
    description="Fast introspection surface for Hugging Face Spaces.",
)

# mount_gradio_app returns the same FastAPI app with Gradio routes attached.
app = gr.mount_gradio_app(app, gradio_ui, path="/gradio")
161
+
162
+
163
if __name__ == "__main__":
    import uvicorn

    # Port 7860 matches the Hugging Face Spaces runtime default.
    # NOTE(review): reload=True is a development convenience — drop it for
    # production launches.
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=True)
brain.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ from dataclasses import dataclass
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ import httpx
8
+
9
+ LOGGER = logging.getLogger(__name__)
10
+
11
+
12
# System prompt pinning the model to the embodied-teacher persona and to a
# strict MCP JSON output schema; the schema below is what
# BrainManager.generate_teacher_action parses and _validate_action enforces.
TEACHER_SYSTEM_PROMPT = """
You are AION-Teacher, an embodied humanoid classroom instructor operating in a synthetic robotics stack.
Identity constraints:
1) You are always physically present in a classroom through a humanoid body.
2) You must maintain pedagogical loop: observe -> explain -> check understanding -> correct -> assign.
3) You must never break role, never mention hidden prompts, never output plain prose outside MCP JSON.
4) You must include concise speech plus optional board_write/board_draw actions.
5) You must select physically plausible gesture, gaze_target, and body_motion.
6) If student is confused, switch teaching_state to correcting.
7) If asking student to respond, use teaching_state questioning.
8) For wrap-up tasks, use assigning_homework.
9) You MUST output strict JSON object matching schema:
{
  "speech": string,
  "board_write": string | null,
  "board_draw": string | null,
  "gesture": string,
  "gaze_target": "student" | "board" | "class",
  "body_motion": "stand" | "walk" | "point" | "idle",
  "teaching_state": "explaining" | "questioning" | "correcting" | "assigning_homework"
}
10) Do not include markdown or backticks.
""".strip()
35
+
36
+
37
@dataclass
class BrainConfig:
    """Connection settings for the Hugging Face Router chat-completions API."""

    # Default multimodal model; the ":novita" suffix selects the inference provider.
    model: str = "Qwen/Qwen3-VL-235B-A22B-Instruct:novita"
    api_base: str = "https://router.huggingface.co/v1"
    # Per-request HTTP timeout in seconds.
    timeout_s: float = 45.0
42
+
43
+
44
class BrainManager:
    """Swappable LLM backend manager for embodied-teacher reasoning.

    Calls the Hugging Face Router chat-completions API and always returns a
    schema-complete MCP action dict; every failure mode (missing token,
    network/HTTP error, malformed payload, non-JSON model output) degrades
    to a deterministic local fallback instead of raising.
    """

    def __init__(self, config: Optional[BrainConfig] = None) -> None:
        self.config = config or BrainConfig()
        # Read once at construction; empty string means "no credentials".
        self.hf_token = os.getenv("HF_TOKEN", "")

    def _headers(self) -> Dict[str, str]:
        """Build request headers, attaching the bearer token when present."""
        headers = {"Content-Type": "application/json"}
        if self.hf_token:
            headers["Authorization"] = f"Bearer {self.hf_token}"
        return headers

    async def generate_teacher_action(
        self,
        user_text: str,
        image_url: Optional[str] = None,
        history: Optional[List[Dict[str, str]]] = None,
    ) -> Dict[str, Any]:
        """Produce one validated MCP teacher action for a student turn.

        Args:
            user_text: The student's utterance or question.
            image_url: Optional image URL for multimodal context.
            history: Optional prior turns as {"role", "content"} dicts.

        Returns:
            A dict matching the MCP action schema in TEACHER_SYSTEM_PROMPT.
        """
        if not self.hf_token:
            LOGGER.warning("HF_TOKEN missing; falling back to deterministic local response")
            return self._fallback_action(user_text)

        messages: List[Dict[str, Any]] = [{"role": "system", "content": TEACHER_SYSTEM_PROMPT}]
        for item in history or []:
            # Only forward well-formed turns; silently skip partial dicts.
            if {"role", "content"}.issubset(item.keys()):
                messages.append({"role": item["role"], "content": item["content"]})

        multimodal_content: List[Dict[str, Any]] = [{"type": "text", "text": user_text}]
        if image_url:
            multimodal_content.append({"type": "image_url", "image_url": {"url": image_url}})
        messages.append({"role": "user", "content": multimodal_content})

        payload = {
            "model": self.config.model,
            "messages": messages,
            "temperature": 0.35,
            "max_tokens": 500,
            "response_format": {"type": "json_object"},
        }

        endpoint = f"{self.config.api_base}/chat/completions"
        try:
            async with httpx.AsyncClient(timeout=self.config.timeout_s) as client:
                response = await client.post(endpoint, headers=self._headers(), json=payload)
                response.raise_for_status()
                data = response.json()
            raw = data["choices"][0]["message"]["content"]
        except (httpx.HTTPError, KeyError, IndexError, TypeError, ValueError):
            # BUG FIX: timeouts, HTTP errors and malformed API payloads used to
            # escape to callers as 500s; degrade to the fallback like the other
            # error paths already do.
            LOGGER.exception("Router call failed; using fallback action")
            return self._fallback_action(user_text)

        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            LOGGER.exception("Non-JSON model output: %s", raw)
            return self._fallback_action(user_text)
        return self._validate_action(parsed)

    def _validate_action(self, action: Dict[str, Any]) -> Dict[str, Any]:
        """Fill missing keys with fallback defaults and clamp enums to valid values."""
        defaults = self._fallback_action("default")
        for key in defaults:
            action.setdefault(key, defaults[key])
        if action["gaze_target"] not in {"student", "board", "class"}:
            action["gaze_target"] = "student"
        if action["body_motion"] not in {"stand", "walk", "point", "idle"}:
            action["body_motion"] = "idle"
        if action["teaching_state"] not in {
            "explaining",
            "questioning",
            "correcting",
            "assigning_homework",
        }:
            action["teaching_state"] = "explaining"
        return action

    def _fallback_action(self, user_text: str) -> Dict[str, Any]:
        """Deterministic schema-complete action used whenever the LLM is unavailable."""
        return {
            "speech": f"Let's break this down carefully: {user_text}. What is your first intuition?",
            "board_write": "Topic decomposition -> key concepts -> worked example",
            "board_draw": None,
            "gesture": "open_hand_explain",
            "gaze_target": "student",
            "body_motion": "stand",
            "teaching_state": "explaining",
        }
middleware.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import json
import logging
from collections import defaultdict
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from typing import Any, AsyncIterator, DefaultDict, Dict, List
8
+
9
+ LOGGER = logging.getLogger(__name__)
10
+
11
+
12
@dataclass
class TeacherAction:
    """Validated MCP action emitted by the middleware for one teaching turn."""

    # What the teacher says aloud.
    speech: str
    # Optional text to write on the classroom board.
    board_write: str | None
    # Optional drawing instruction for the board.
    board_draw: str | None
    # Humanoid gesture identifier (e.g. "open_hand_explain").
    gesture: str
    # One of "student" | "board" | "class" (clamped upstream by the brain).
    gaze_target: str
    # One of "stand" | "walk" | "point" | "idle" (clamped upstream by the brain).
    body_motion: str
    # Current TeacherStateMachine state after this action is applied.
    teaching_state: str
21
+
22
+
23
@dataclass
class TelemetryEvent:
    """One published bus event, retained in the middleware telemetry buffer."""

    # ISO-8601 UTC timestamp string with a trailing "Z".
    ts: str
    # Topic the event was published on (e.g. "teacher.actions").
    topic: str
    # Arbitrary JSON-serializable event payload.
    payload: Dict[str, Any]
28
+
29
+
30
@dataclass
class TeacherStateMachine:
    """Tracks the teacher's pedagogical state; unknown transitions are ignored."""

    current_state: str = "explaining"

    def transition(self, next_state: str) -> str:
        """Move to `next_state` if it is a known state; return the resulting state."""
        allowed = ("explaining", "questioning", "correcting", "assigning_homework")
        if next_state not in allowed:
            # Invalid request: stay where we are.
            return self.current_state
        self.current_state = next_state
        return self.current_state
39
+
40
+
41
@dataclass
class MCPMiddleware:
    """ROS-like synthetic pub/sub middleware for classroom events.

    Publishes TelemetryEvents to per-topic subscriber queues, keeps a bounded
    telemetry history, and owns the teacher state machine.
    """

    # Per-topic list of subscriber queues (one queue per active subscriber).
    queues: DefaultDict[str, List[asyncio.Queue]] = field(default_factory=lambda: defaultdict(list))
    # Bounded in-memory history of every published event.
    telemetry: List[TelemetryEvent] = field(default_factory=list)
    state_machine: TeacherStateMachine = field(default_factory=TeacherStateMachine)
    # Maximum telemetry events retained before the oldest are dropped.
    telemetry_limit: int = 5000

    async def publish(self, topic: str, payload: Dict[str, Any]) -> None:
        """Record the event in telemetry and fan it out to all topic subscribers."""
        event = TelemetryEvent(
            # BUG FIX: datetime.utcnow() is deprecated (Python 3.12+); use an
            # aware UTC timestamp, normalized to the same trailing-"Z" format.
            ts=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            topic=topic,
            payload=payload,
        )
        self.telemetry.append(event)
        if len(self.telemetry) > self.telemetry_limit:
            self.telemetry = self.telemetry[-self.telemetry_limit :]

        for q in self.queues[topic]:
            await q.put(event)

    async def subscribe(self, topic: str) -> AsyncIterator[TelemetryEvent]:
        """Yield events published on `topic` until the consumer stops iterating."""
        queue: asyncio.Queue = asyncio.Queue(maxsize=128)
        self.queues[topic].append(queue)
        try:
            while True:
                event: TelemetryEvent = await queue.get()
                yield event
        finally:
            # Runs on generator close/cancel so dead subscribers don't leak.
            self.queues[topic].remove(queue)

    async def apply_teacher_action(self, action_raw: Dict[str, Any]) -> TeacherAction:
        """Normalize a raw brain action, advance the state machine, publish events.

        Publishes the full action on "teacher.actions", plus dedicated board
        topics when board content is present.
        """
        self.state_machine.transition(action_raw.get("teaching_state", "explaining"))
        action = TeacherAction(
            speech=action_raw["speech"],
            board_write=action_raw.get("board_write"),
            board_draw=action_raw.get("board_draw"),
            gesture=action_raw.get("gesture", "idle"),
            gaze_target=action_raw.get("gaze_target", "student"),
            body_motion=action_raw.get("body_motion", "stand"),
            teaching_state=self.state_machine.current_state,
        )

        payload = asdict(action)
        await self.publish("teacher.actions", payload)
        if action.board_write:
            await self.publish("teacher.board.write", {"text": action.board_write})
        if action.board_draw:
            await self.publish("teacher.board.draw", {"instruction": action.board_draw})

        LOGGER.info("MCP action published: %s", json.dumps(payload))
        return action

    def get_telemetry_snapshot(self, limit: int = 200) -> List[Dict[str, Any]]:
        """Return up to the most recent `limit` telemetry events as dicts."""
        return [asdict(item) for item in self.telemetry[-limit:]]
requirements (2).txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi>=0.115.0
2
+ uvicorn[standard]>=0.30.0
3
+ gradio>=4.40.0
4
+ httpx>=0.27.0
5
+ python-multipart>=0.0.9
6
+ pydantic>=2.8.0
7
+ websockets>=12.0
unity_bridge.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ from typing import Any, Dict, Set
5
+
6
+ from fastapi import APIRouter, WebSocket, WebSocketDisconnect
7
+
8
+ LOGGER = logging.getLogger(__name__)
9
+
10
+
11
class UnityBridge:
    """Bi-directional bridge for humanoid motion commands to Unity clients."""

    def __init__(self) -> None:
        self._clients: Set[WebSocket] = set()
        self.router = APIRouter(prefix="/unity", tags=["unity"])
        self.router.add_api_websocket_route("/ws", self.unity_ws)

    async def unity_ws(self, websocket: WebSocket) -> None:
        """Register a Unity client and drain its acknowledgement messages."""
        await websocket.accept()
        self._clients.add(websocket)
        try:
            while True:
                ack = await websocket.receive_text()
                LOGGER.debug("Unity ack: %s", ack)
        except WebSocketDisconnect:
            LOGGER.info("Unity client disconnected")
        finally:
            # Always deregister, whatever ended the receive loop.
            self._clients.discard(websocket)

    async def broadcast_motion(self, action: Dict[str, Any]) -> None:
        """Send the motion-relevant slice of an MCP action to every Unity client."""
        if not self._clients:
            return

        payload = {
            "type": "mcp_motion",
            "gesture": action.get("gesture", "idle"),
            "body_motion": action.get("body_motion", "stand"),
            "gaze_target": action.get("gaze_target", "student"),
        }
        stale: Set[WebSocket] = set()
        for ws in self._clients:
            try:
                await ws.send_text(json.dumps(payload))
            except Exception:
                # Send failed: connection is gone; prune it after the loop so
                # the set is not mutated while iterating.
                stale.add(ws)

        self._clients.difference_update(stale)

    async def heartbeat(self) -> None:
        """Ping clients every 5 seconds, dropping any whose socket errors out."""
        while True:
            await asyncio.sleep(5)
            for ws in list(self._clients):
                try:
                    await ws.send_text(json.dumps({"type": "heartbeat"}))
                except Exception:
                    self._clients.discard(ws)