import { useEffect, useRef, useState, useCallback } from 'react';
import {
  HandLandmarker,
  FilesetResolver,
  type HandLandmarkerResult,
  type NormalizedLandmark,
} from '@mediapipe/tasks-vision';
import { normaliseLandmarks } from '../lib/landmarkUtils';

export interface MediaPipeState {
  landmarks: number[] | null;          // 63-float normalised vector
  rawLandmarks: NormalizedLandmark[] | null; // 21-point raw result (for canvas drawing)
  handedness: 'Left' | 'Right' | null;
  isDetecting: boolean;
  isLoading: boolean;
  error: string | null;
  startDetection: (video: HTMLVideoElement) => void;
  stopDetection: () => void;
}

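// NOTE: loading '@latest' WASM can drift from the installed
// @mediapipe/tasks-vision npm version; pinning the same version in both
// places avoids runtime/API mismatches.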
const WASM_URL =
  'https://cdn.jsdelivr.net/npm/@mediapipe/tasks-vision@latest/wasm';
const MODEL_URL =
  'https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task';

/**
 * Hook that drives MediaPipe HandLandmarker inference on a video element.
 * Runs at ~30 fps using requestAnimationFrame.
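 *
 * @example
 * // Illustrative wiring inside a component (`videoRef` is hypothetical):
 * const { isLoading, startDetection, stopDetection } = useMediaPipe();
 * useEffect(() => {
 *   const video = videoRef.current;
 *   if (video && !isLoading) startDetection(video);
 *   return stopDetection;
 * }, [isLoading, startDetection, stopDetection]);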
 */
export function useMediaPipe(): MediaPipeState {
  const landmarkerRef = useRef<HandLandmarker | null>(null);
  const rafRef        = useRef<number>(0);
  const lastTsRef     = useRef<number>(0);

  const [landmarks,    setLandmarks]    = useState<number[] | null>(null);
  const [rawLandmarks, setRawLandmarks] = useState<NormalizedLandmark[] | null>(null);
  const [handedness,   setHandedness]   = useState<'Left' | 'Right' | null>(null);
  const [isDetecting,  setIsDetecting]  = useState(false);
  const [isLoading,    setIsLoading]    = useState(false);
  const [error,        setError]        = useState<string | null>(null);

  // Initialise the landmarker once on mount.
  useEffect(() => {
    let cancelled = false;
    setIsLoading(true);

    (async () => {
      try {
        const vision = await FilesetResolver.forVisionTasks(WASM_URL);

        // Shared options; only the delegate differs between the two attempts.
        const optionsFor = (delegate: 'GPU' | 'CPU') => ({
          baseOptions: { modelAssetPath: MODEL_URL, delegate },
          runningMode: 'VIDEO' as const,
          numHands: 1,
          minHandDetectionConfidence: 0.4,
          minHandPresenceConfidence: 0.4,
          minTrackingConfidence: 0.4,
        });

        // Try GPU first for best performance; fall back to CPU if unavailable.
        let hl: HandLandmarker;
        try {
          hl = await HandLandmarker.createFromOptions(vision, optionsFor('GPU'));
        } catch {
          console.warn('GPU delegate unavailable, falling back to CPU.');
          hl = await HandLandmarker.createFromOptions(vision, optionsFor('CPU'));
        }

        if (!cancelled) {
          landmarkerRef.current = hl;
          setIsLoading(false);
        }
      } catch (err) {
        if (!cancelled) {
          console.error('MediaPipe init error', err);
          setError('Failed to load the hand detection model. Check your network connection.');
          setIsLoading(false);
        }
      }
    })();

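    // Cleanup: cancel any pending rAF callback and release the WASM-backed model.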
    return () => {
      cancelled = true;
      cancelAnimationFrame(rafRef.current);
      landmarkerRef.current?.close();
    };
  }, []);

  const startDetection = useCallback((video: HTMLVideoElement) => {
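    // Bail out if the model has not finished loading (or failed to initialise).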
    if (!landmarkerRef.current) return;
    setIsDetecting(true);

    const detect = (now: number) => {
      if (!landmarkerRef.current || !video || video.paused || video.ended) {
        rafRef.current = requestAnimationFrame(detect);
        return;
      }

      // Throttle to ~30 fps
      if (now - lastTsRef.current < 33) {
        rafRef.current = requestAnimationFrame(detect);
        return;
      }
      lastTsRef.current = now;

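      // detectForVideo expects a monotonically increasing timestamp; the rAF
      // timestamp satisfies that. Inference can still throw (e.g. while the
      // landmarker is closing), so a failed frame is skipped rather than fatal.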
      let result: HandLandmarkerResult;
      try {
        result = landmarkerRef.current.detectForVideo(video, now);
      } catch {
        rafRef.current = requestAnimationFrame(detect);
        return;
      }

      if (result.handednesses.length > 0 && result.landmarks.length > 0) {
        const raw = result.landmarks[0];          // NormalizedLandmark[]
        const hand = result.handednesses[0][0].categoryName as 'Left' | 'Right';
        try {
          const flat = normaliseLandmarks(raw);
          setLandmarks(flat);
          setRawLandmarks(raw);
          setHandedness(hand);
        } catch {
          // Reset all three so consumers never see a stale handedness value.
          setLandmarks(null);
          setRawLandmarks(null);
          setHandedness(null);
        }
      } else {
        setLandmarks(null);
        setRawLandmarks(null);
        setHandedness(null);
      }

      rafRef.current = requestAnimationFrame(detect);
    };

    rafRef.current = requestAnimationFrame(detect);
  }, []);

  const stopDetection = useCallback(() => {
    cancelAnimationFrame(rafRef.current);
    setIsDetecting(false);
    setLandmarks(null);
    setRawLandmarks(null);
    setHandedness(null);
  }, []);

  return {
    landmarks,
    rawLandmarks,
    handedness,
    isDetecting,
    isLoading,
    error,
    startDetection,
    stopDetection,
  };
}