Spaces:
Runtime error
Runtime error
| """ | |
| Encrypted local progress store (SQLite + AES-256-GCM via cryptography package). | |
| Differential-privacy (ε-DP) upstream sync of aggregated cooperative stats. | |
| Privacy model: | |
| - All data stays on-device in an encrypted DB. | |
| - Upstream sync exports only per-skill averages with Gaussian noise (ε = 1.0 / week). | |
| - No individual response records leave the device. | |
| """ | |
| from __future__ import annotations | |
| import hashlib | |
| import json | |
| import math | |
| import os | |
| import sqlite3 | |
| import tempfile | |
| import time | |
| from pathlib import Path | |
| from typing import Dict, List, Optional | |
| import numpy as np | |
| # ------------------------------------------------------------------ | |
| # Encryption helpers | |
| # ------------------------------------------------------------------ | |
| def _derive_key(password: str) -> bytes: | |
| """Derive a 32-byte AES key from a password (PBKDF2-HMAC-SHA256).""" | |
| return hashlib.pbkdf2_hmac( | |
| "sha256", password.encode(), b"tutor-salt-v1", iterations=100_000 | |
| ) | |
| def _encrypt(plaintext: bytes, key: bytes) -> bytes: | |
| try: | |
| from cryptography.hazmat.primitives.ciphers.aead import AESGCM | |
| import secrets | |
| nonce = secrets.token_bytes(12) | |
| ct = AESGCM(key).encrypt(nonce, plaintext, None) | |
| return nonce + ct | |
| except ImportError: | |
| # Graceful degradation: store unencrypted with a warning prefix | |
| return b"UNENC:" + plaintext | |
| def _decrypt(ciphertext: bytes, key: bytes) -> bytes: | |
| if ciphertext.startswith(b"UNENC:"): | |
| return ciphertext[6:] | |
| from cryptography.hazmat.primitives.ciphers.aead import AESGCM | |
| nonce, ct = ciphertext[:12], ciphertext[12:] | |
| return AESGCM(key).decrypt(nonce, ct, None) | |
| # ------------------------------------------------------------------ | |
| # Database | |
| # ------------------------------------------------------------------ | |
| SKILLS = ["counting", "number_sense", "addition", "subtraction", "word_problem"] | |
| _SCHEMA = """ | |
| CREATE TABLE IF NOT EXISTS sessions ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| learner_id TEXT NOT NULL, | |
| started_at INTEGER NOT NULL, | |
| ended_at INTEGER, | |
| lang TEXT, | |
| state_json BLOB NOT NULL | |
| ); | |
| CREATE TABLE IF NOT EXISTS responses ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| learner_id TEXT NOT NULL, | |
| session_id INTEGER NOT NULL, | |
| ts INTEGER NOT NULL, | |
| item_id TEXT NOT NULL, | |
| skill TEXT NOT NULL, | |
| difficulty INTEGER NOT NULL, | |
| correct INTEGER NOT NULL, | |
| latency_ms INTEGER | |
| ); | |
| CREATE TABLE IF NOT EXISTS learners ( | |
| learner_id TEXT PRIMARY KEY, | |
| pin_hash TEXT, | |
| display_name TEXT, | |
| created_at INTEGER NOT NULL | |
| ); | |
| CREATE TABLE IF NOT EXISTS interactions ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| learner_id TEXT NOT NULL, | |
| session_id INTEGER NOT NULL, | |
| ts INTEGER NOT NULL, | |
| item_id TEXT NOT NULL, | |
| skill TEXT NOT NULL, | |
| difficulty INTEGER NOT NULL, | |
| lang TEXT NOT NULL, | |
| prompt_enc BLOB NOT NULL, | |
| child_response_enc BLOB NOT NULL, | |
| feedback_enc BLOB NOT NULL, | |
| correct INTEGER NOT NULL, | |
| latency_ms INTEGER | |
| ); | |
| """ | |
| class ProgressStore: | |
| """Thread-safe progress store backed by encrypted SQLite.""" | |
| def __init__(self, db_path: str | Path, password: str = "tutor-default-key"): | |
| self._path = Path(db_path) | |
| self._key = _derive_key(password) | |
| self._conn = sqlite3.connect(str(self._path), check_same_thread=False) | |
| self._conn.executescript(_SCHEMA) | |
| self._conn.commit() | |
| # ------------------------------------------------------------------ | |
| # Learner management | |
| # ------------------------------------------------------------------ | |
| def add_learner(self, learner_id: str, display_name: str, pin: Optional[str] = None) -> None: | |
| pin_hash = hashlib.sha256(pin.encode()).hexdigest() if pin else None | |
| self._conn.execute( | |
| "INSERT OR IGNORE INTO learners (learner_id, pin_hash, display_name, created_at) VALUES (?,?,?,?)", | |
| (learner_id, pin_hash, display_name, int(time.time())), | |
| ) | |
| self._conn.commit() | |
| def verify_pin(self, learner_id: str, pin: str) -> bool: | |
| row = self._conn.execute( | |
| "SELECT pin_hash FROM learners WHERE learner_id=?", (learner_id,) | |
| ).fetchone() | |
| if not row or row[0] is None: | |
| return True # no PIN set | |
| return row[0] == hashlib.sha256(pin.encode()).hexdigest() | |
| def list_learners(self) -> List[Dict]: | |
| rows = self._conn.execute( | |
| "SELECT learner_id, display_name, created_at FROM learners" | |
| ).fetchall() | |
| return [{"learner_id": r[0], "display_name": r[1], "created_at": r[2]} for r in rows] | |
| # ------------------------------------------------------------------ | |
| # Session management | |
| # ------------------------------------------------------------------ | |
| def start_session(self, learner_id: str, state_dict: dict, lang: str = "en") -> int: | |
| blob = _encrypt(json.dumps(state_dict).encode(), self._key) | |
| cur = self._conn.execute( | |
| "INSERT INTO sessions (learner_id, started_at, lang, state_json) VALUES (?,?,?,?)", | |
| (learner_id, int(time.time()), lang, blob), | |
| ) | |
| self._conn.commit() | |
| return cur.lastrowid | |
| def end_session(self, session_id: int, state_dict: dict) -> None: | |
| blob = _encrypt(json.dumps(state_dict).encode(), self._key) | |
| self._conn.execute( | |
| "UPDATE sessions SET ended_at=?, state_json=? WHERE id=?", | |
| (int(time.time()), blob, session_id), | |
| ) | |
| self._conn.commit() | |
| def load_latest_state(self, learner_id: str) -> Optional[dict]: | |
| row = self._conn.execute( | |
| "SELECT state_json FROM sessions WHERE learner_id=? ORDER BY started_at DESC LIMIT 1", | |
| (learner_id,), | |
| ).fetchone() | |
| if not row: | |
| return None | |
| return json.loads(_decrypt(row[0], self._key)) | |
| # ------------------------------------------------------------------ | |
| # Response logging | |
| # ------------------------------------------------------------------ | |
| def log_response( | |
| self, | |
| learner_id: str, | |
| session_id: int, | |
| item_id: str, | |
| skill: str, | |
| difficulty: int, | |
| correct: bool, | |
| latency_ms: Optional[int] = None, | |
| ) -> None: | |
| self._conn.execute( | |
| "INSERT INTO responses (learner_id, session_id, ts, item_id, skill, difficulty, correct, latency_ms) " | |
| "VALUES (?,?,?,?,?,?,?,?)", | |
| (learner_id, session_id, int(time.time()), item_id, skill, difficulty, int(correct), latency_ms), | |
| ) | |
| self._conn.commit() | |
| def log_interaction( | |
| self, | |
| learner_id: str, | |
| session_id: int, | |
| item_id: str, | |
| skill: str, | |
| difficulty: int, | |
| lang: str, | |
| prompt_text: str, | |
| child_response: str, | |
| feedback_given: str, | |
| correct: bool, | |
| latency_ms: Optional[int] = None, | |
| ) -> None: | |
| """Log a full prompt→response→feedback triple, encrypted on-device. | |
| These records are the raw material for future real-data fine-tuning. | |
| Nothing leaves the device; use export_interactions_for_finetuning() | |
| to produce a pseudonymised JSONL file for training. | |
| """ | |
| self._conn.execute( | |
| "INSERT INTO interactions " | |
| "(learner_id, session_id, ts, item_id, skill, difficulty, lang, " | |
| " prompt_enc, child_response_enc, feedback_enc, correct, latency_ms) " | |
| "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)", | |
| ( | |
| learner_id, | |
| session_id, | |
| int(time.time()), | |
| item_id, | |
| skill, | |
| difficulty, | |
| lang, | |
| _encrypt(prompt_text.encode(), self._key), | |
| _encrypt(child_response.encode(), self._key), | |
| _encrypt(feedback_given.encode(), self._key), | |
| int(correct), | |
| latency_ms, | |
| ), | |
| ) | |
| self._conn.commit() | |
| def export_interactions_for_finetuning(self, out_path: str | Path) -> int: | |
| """Export pseudonymised interactions as JSONL for LoRA fine-tuning. | |
| Learner IDs are replaced with a one-way hash so no real identity leaks. | |
| Returns the number of records written. | |
| """ | |
| rows = self._conn.execute( | |
| "SELECT learner_id, skill, difficulty, lang, " | |
| "prompt_enc, child_response_enc, feedback_enc, correct " | |
| "FROM interactions ORDER BY ts" | |
| ).fetchall() | |
| out_path = Path(out_path) | |
| out_path.parent.mkdir(parents=True, exist_ok=True) | |
| count = 0 | |
| with out_path.open("w", encoding="utf-8") as f: | |
| for learner_id, skill, difficulty, lang, p_enc, r_enc, fb_enc, correct in rows: | |
| try: | |
| prompt = _decrypt(p_enc, self._key).decode() | |
| response = _decrypt(r_enc, self._key).decode() | |
| feedback = _decrypt(fb_enc, self._key).decode() | |
| except Exception: | |
| continue # skip records that fail to decrypt | |
| pseudo_id = hashlib.sha256(learner_id.encode()).hexdigest()[:12] | |
| record = { | |
| "instruction": ( | |
| f"You are a friendly math tutor for children aged 5–9. " | |
| f"Skill: {skill}, difficulty: {difficulty}/10, language: {lang}." | |
| ), | |
| "input": f"Question shown: {prompt}\nChild answered: {response}\nCorrect: {bool(correct)}", | |
| "output": feedback, | |
| "meta": {"pseudo_learner": pseudo_id, "skill": skill, "lang": lang}, | |
| } | |
| f.write(json.dumps(record, ensure_ascii=False) + "\n") | |
| count += 1 | |
| return count | |
| # ------------------------------------------------------------------ | |
| # Weekly report | |
| # ------------------------------------------------------------------ | |
| def weekly_report(self, learner_id: str, week_start_ts: Optional[int] = None) -> dict: | |
| if week_start_ts is None: | |
| week_start_ts = int(time.time()) - 7 * 86400 | |
| rows = self._conn.execute( | |
| "SELECT skill, correct FROM responses WHERE learner_id=? AND ts>=?", | |
| (learner_id, week_start_ts), | |
| ).fetchall() | |
| skill_stats: Dict[str, Dict] = {s: {"correct": 0, "total": 0} for s in SKILLS} | |
| for skill, correct in rows: | |
| if skill in skill_stats: | |
| skill_stats[skill]["total"] += 1 | |
| skill_stats[skill]["correct"] += correct | |
| state = self.load_latest_state(learner_id) | |
| skills_out = {} | |
| for s in SKILLS: | |
| total = skill_stats[s]["total"] | |
| acc = skill_stats[s]["correct"] / total if total else 0.0 | |
| bkt_current = 0.0 | |
| if state and "bkt" in state and s in state["bkt"]: | |
| bkt_current = state["bkt"][s].get("p_known", 0.0) | |
| skills_out[s] = { | |
| "current": round(max(acc, bkt_current), 3), | |
| "delta": round(acc - 0.5, 3), | |
| "weekly_attempts": total, | |
| "weekly_accuracy": round(acc, 3), | |
| } | |
| sessions_this_week = self._conn.execute( | |
| "SELECT COUNT(*) FROM sessions WHERE learner_id=? AND started_at>=?", | |
| (learner_id, week_start_ts), | |
| ).fetchone()[0] | |
| best = max(skills_out, key=lambda s: skills_out[s]["current"]) | |
| worst = min(skills_out, key=lambda s: skills_out[s]["current"]) | |
| return { | |
| "learner_id": learner_id, | |
| "week_starting": _ts_to_date(week_start_ts), | |
| "sessions": sessions_this_week, | |
| "skills": skills_out, | |
| "icons_for_parent": { | |
| "overall_arrow": "up" if sum(v["delta"] for v in skills_out.values()) > 0 else "flat", | |
| "best_skill": best, | |
| "needs_help": worst, | |
| }, | |
| "voiced_summary_audio": f"tts/reports/{learner_id}_week_{_ts_to_date(week_start_ts)}.wav", | |
| } | |
| # ------------------------------------------------------------------ | |
| # Differential-privacy upstream sync payload | |
| # ------------------------------------------------------------------ | |
| def dp_sync_payload( | |
| self, epsilon: float = 1.0, delta: float = 1e-5, week_start_ts: Optional[int] = None | |
| ) -> dict: | |
| """ | |
| Build a DP-sanitised aggregation over ALL learners for cooperative stats. | |
| Uses Gaussian mechanism: noise σ = sqrt(2 * ln(1.25/δ)) * sensitivity / ε | |
| Sensitivity = 1 (per-skill accuracy in [0,1]). | |
| The returned dict has NO individual-level data — only noisy cohort averages. | |
| """ | |
| if week_start_ts is None: | |
| week_start_ts = int(time.time()) - 7 * 86400 | |
| rows = self._conn.execute( | |
| "SELECT skill, correct FROM responses WHERE ts>=?", (week_start_ts,) | |
| ).fetchall() | |
| skill_acc: Dict[str, list] = {s: [] for s in SKILLS} | |
| for skill, correct in rows: | |
| if skill in skill_acc: | |
| skill_acc[skill].append(float(correct)) | |
| sensitivity = 1.0 | |
| sigma = math.sqrt(2 * math.log(1.25 / delta)) * sensitivity / epsilon | |
| noisy: Dict[str, float] = {} | |
| for s in SKILLS: | |
| vals = skill_acc[s] | |
| if vals: | |
| mean = sum(vals) / len(vals) | |
| noise = float(np.random.normal(0, sigma)) | |
| noisy[s] = float(np.clip(mean + noise, 0.0, 1.0)) | |
| else: | |
| noisy[s] = 0.5 # no data → report prior | |
| return { | |
| "epsilon_used": epsilon, | |
| "delta_used": delta, | |
| "week_starting": _ts_to_date(week_start_ts), | |
| "cohort_size": self._conn.execute("SELECT COUNT(DISTINCT learner_id) FROM learners").fetchone()[0], | |
| "noisy_skill_accuracy": noisy, | |
| } | |
| def close(self): | |
| self._conn.close() | |
| def _ts_to_date(ts: int) -> str: | |
| import datetime | |
| return datetime.datetime.utcfromtimestamp(ts).strftime("%Y-%m-%d") | |