diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..e4ca99992c7e4c6def4720465534dcb623897738 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,29 @@ +# ── Python ──────────────────────────────────────────────────────────────────── +__pycache__/ +*.py[cod] +*.pyo +.venv/ +*.egg-info/ +.pytest_cache/ +.mypy_cache/ +dist/ + +# ── Node / Frontend ─────────────────────────────────────────────────────────── +frontend/node_modules/ +frontend/dist/ +frontend/.env.local + +# ── Git / Editor ────────────────────────────────────────────────────────────── +.git/ +.gitignore +.vscode/ +*.md +TASKS.md + +# ── OS ──────────────────────────────────────────────────────────────────────── +.DS_Store +Thumbs.db + +# ── Model binary variants (keep only .pkl, not duplicate .pth) ─────────────── +# Both extensions are identical — Docker only needs .pkl +**/*.pth diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..bca5bb0a4e7b23ba6b0c8ca778633c20e429a6cb --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.pth filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/deploy-backend.yml b/.github/workflows/deploy-backend.yml new file mode 100644 index 0000000000000000000000000000000000000000..2845792e5fa003c26c740417ba2b687d313932d3 --- /dev/null +++ b/.github/workflows/deploy-backend.yml @@ -0,0 +1,65 @@ +name: Deploy Backend → Fly.io + +on: + push: + branches: [main] + paths: + - 'backend/**' + - 'Dockerfile' + - '.dockerignore' + - 'fly.toml' + - 'Mediapipe_XGBoost/**' + - 'CNN_Autoencoder_LightGBM/**' + - 'CNN_PreTrained/**' + +jobs: + test: + name: Run backend tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + lfs: true + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: pip + cache-dependency-path: backend/requirements.txt + + - name: Install deps + run: | + pip install -r 
backend/requirements.txt + pip install -r backend/requirements-dev.txt + + - name: Run tests + working-directory: backend + env: + KERAS_BACKEND: tensorflow + TF_CPP_MIN_LOG_LEVEL: "3" + CUDA_VISIBLE_DEVICES: "" + TF_ENABLE_ONEDNN_OPTS: "0" + run: pytest tests/ -v --tb=short -q + # Note: model artefacts are tracked with Git LFS (see .gitattributes), so the + # checkout above uses `lfs: true`; otherwise only LFS pointer files are present + # and the models cannot be loaded (tests are skipped if .pkl files are absent). + + deploy: + name: Deploy to Fly.io + needs: test + runs-on: ubuntu-latest + environment: production + concurrency: + group: fly-deploy + cancel-in-progress: true + steps: + - uses: actions/checkout@v4 + with: + lfs: true + + - uses: superfly/flyctl-actions/setup-flyctl@master + + - name: Deploy + run: flyctl deploy --remote-only + env: + FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} diff --git a/.github/workflows/deploy-frontend.yml b/.github/workflows/deploy-frontend.yml new file mode 100644 index 0000000000000000000000000000000000000000..9028fddef4abded7700393976e118561d5d0f046 --- /dev/null +++ b/.github/workflows/deploy-frontend.yml @@ -0,0 +1,45 @@ +name: Deploy Frontend → Vercel + +on: + push: + branches: [main] + paths: + - 'frontend/**' + +jobs: + build-and-deploy: + name: Build & Deploy + runs-on: ubuntu-latest + environment: production + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: '22' + cache: npm + cache-dependency-path: frontend/package-lock.json + + - name: Install dependencies + working-directory: frontend + run: npm ci + + - name: Type-check + working-directory: frontend + run: npx tsc --project tsconfig.app.json --noEmit + + - name: Build + working-directory: frontend + env: + VITE_WS_URL: ${{ vars.VITE_WS_URL }} + VITE_API_URL: ${{ vars.VITE_API_URL }} + run: npm run build + + - name: Deploy to Vercel + uses: amondnet/vercel-action@v25 + with: + vercel-token: ${{ secrets.VERCEL_TOKEN }} + vercel-org-id: ${{ 
secrets.VERCEL_ORG_ID }} + vercel-project-id: ${{ secrets.VERCEL_PROJECT_ID }} + working-directory: frontend + vercel-args: '--prod' diff --git a/CNN_Autoencoder_LightGBM/autoencoder_model.pkl b/CNN_Autoencoder_LightGBM/autoencoder_model.pkl new file mode 100644 index 0000000000000000000000000000000000000000..187665d56106a08c70e7530ff9cf6ea0a18f9c5c --- /dev/null +++ b/CNN_Autoencoder_LightGBM/autoencoder_model.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5843688f059d26851774e553c4afddbc7c0f2f7fc048401b8447f290a63d2cbe +size 92934 diff --git a/CNN_Autoencoder_LightGBM/autoencoder_model.pth b/CNN_Autoencoder_LightGBM/autoencoder_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..9d2087d04c172dc153c50533b4c101b020d98b51 --- /dev/null +++ b/CNN_Autoencoder_LightGBM/autoencoder_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3593536edda0328121d5f92fd186a8e40c341799bd9bb703e0e2ad155b6e7aeb +size 121321 diff --git a/CNN_Autoencoder_LightGBM/lgbm_model.pkl b/CNN_Autoencoder_LightGBM/lgbm_model.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cb2d2867b68e9620a66689514a19a6f958d56124 --- /dev/null +++ b/CNN_Autoencoder_LightGBM/lgbm_model.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e83d2bb3a18da0b3ccdd7afc5d044fa52c6e70c4e6090b312a622a866ee0008 +size 3623126 diff --git a/CNN_Autoencoder_LightGBM/lgbm_model.pth b/CNN_Autoencoder_LightGBM/lgbm_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..0534d3026a1e90107943adfeaa834eff72035ba2 --- /dev/null +++ b/CNN_Autoencoder_LightGBM/lgbm_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a09f7b712da5f0e6b63e222e4ea938029567bd8cf496da7ad93752d54219b57 +size 3626367 diff --git a/CNN_PreTrained/cnn_model.pkl b/CNN_PreTrained/cnn_model.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..6adb94be239ba68afcc89f5d9da68f836b8dd2d8 --- /dev/null +++ b/CNN_PreTrained/cnn_model.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:464df17407edea99db1b69c20e7ff718f6ceafb05f1bbeaacc889499e4cd920a +size 97136794 diff --git a/CNN_PreTrained/cnn_model.pth b/CNN_PreTrained/cnn_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..0bc2e3ed93a88f248a2aa58e0d4558bc1e40609b --- /dev/null +++ b/CNN_PreTrained/cnn_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eb8e3419763c47b5ba2480ccaf9907e8d748602b26fe59c009b6112fa840ae5 +size 146278905 diff --git a/CNN_PreTrained/svm_model.pkl b/CNN_PreTrained/svm_model.pkl new file mode 100644 index 0000000000000000000000000000000000000000..bce377664a6b217e15764a9e77150678d635609e --- /dev/null +++ b/CNN_PreTrained/svm_model.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf21a17c1340b84359c3431fc4ae8eb05239e4e1ef58dd34ab775f53b9bc7f53 +size 929927 diff --git a/CNN_PreTrained/svm_model.pth b/CNN_PreTrained/svm_model.pth new file mode 100644 index 0000000000000000000000000000000000000000..259ea84b1227e68cb399d4209d6cd7845b624483 --- /dev/null +++ b/CNN_PreTrained/svm_model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e706ebf4588c580d0e6ac6f1554f9fd2eaef5564ee02f8022e3ca5f13bb8985b +size 1079865 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..7168a5adbb77fc546be24fdbdcd88aecd76658a5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,48 @@ +# ───────────────────────────────────────────────────────────────────────────── +# SanketSetu Backend — Dockerfile +# Build context: repo root (SanketSetu/) +# +# docker build -t sanketsetu-backend . 
+# docker run -p 8000:8000 sanketsetu-backend +# ───────────────────────────────────────────────────────────────────────────── + +FROM python:3.12-slim AS base + +# System libraries needed by OpenCV headless + Pillow +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgl1 libglib2.0-0 libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +# ── Python dependencies (cached layer) ─────────────────────────────────────── +WORKDIR /app +COPY backend/requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +# ── Application source ──────────────────────────────────────────────────────── +COPY backend/app/ ./app/ + +# ── Model artefacts ─────────────────────────────────────────────────────────── +# Copied to /models so the container is fully self-contained. +# Override at runtime with -e WEIGHTS_DIR=/mnt/models + bind-mount if preferred. +COPY Mediapipe_XGBoost/ /models/Mediapipe_XGBoost/ +COPY CNN_Autoencoder_LightGBM/ /models/CNN_Autoencoder_LightGBM/ +COPY CNN_PreTrained/ /models/CNN_PreTrained/ + +# ── Runtime environment ─────────────────────────────────────────────────────── +ENV WEIGHTS_DIR=/models \ + KERAS_BACKEND=tensorflow \ + TF_CPP_MIN_LOG_LEVEL=3 \ + CUDA_VISIBLE_DEVICES="" \ + TF_ENABLE_ONEDNN_OPTS=0 \ + OMP_NUM_THREADS=4 \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +EXPOSE 8000 + +# ── Health-check ────────────────────────────────────────────────────────────── +# Wait up to 3 minutes for models to load before marking the container healthy. 
+HEALTHCHECK --interval=30s --timeout=10s --start-period=180s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=5)" + +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Mediapipe_XGBoost/model.pkl b/Mediapipe_XGBoost/model.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5b963393836905c736c041f9e3a43471dc2c070c --- /dev/null +++ b/Mediapipe_XGBoost/model.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a736b83df3e56b69b0f1c11f018257760746969d6598d90ea2a60c78f8305883 +size 1711525 diff --git a/Mediapipe_XGBoost/model.pth b/Mediapipe_XGBoost/model.pth new file mode 100644 index 0000000000000000000000000000000000000000..f2547200fcf8a63e8f02717877c16e5926ed8e0b --- /dev/null +++ b/Mediapipe_XGBoost/model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ff5f1cbc121be57f2a7fe04b38925ea740fe79602a6205ca09a748cb0f20b81 +size 1895969 diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c27590d088ed7bf0275991d407a54da55c1eea3e --- /dev/null +++ b/README.md @@ -0,0 +1,108 @@ +# SanketSetu + +A real-time sign language recognition system using machine learning and computer vision. + +## Overview + +SanketSetu is an intelligent sign language interpretation system that provides real-time recognition and translation of sign language gestures using advanced machine learning models and MediaPipe hand tracking. 
+ +## Project Structure + +``` +├── backend/ # FastAPI backend server +│ ├── app/ # Main application code +│ │ ├── inference/ # ML inference pipelines +│ │ └── models/ # Model loading and management +│ └── tests/ # Backend tests +├── frontend/ # React + TypeScript frontend +│ └── src/ +│ ├── components/ # React components +│ ├── hooks/ # Custom React hooks +│ └── lib/ # Utility libraries +├── CNN_Autoencoder_LightGBM/ # CNN Autoencoder + LightGBM model +├── CNN_PreTrained/ # CNN + SVM model +└── Mediapipe_XGBoost/ # MediaPipe + XGBoost model +``` + +## Features + +- Real-time sign language gesture recognition +- Multiple ML model ensemble approach +- WebSocket-based real-time communication +- MediaPipe hand landmark tracking +- Interactive webcam feed with visual feedback +- Prediction confidence display + +## Tech Stack + +### Backend +- FastAPI +- Python 3.x +- Keras (TensorFlow backend) +- LightGBM +- XGBoost +- MediaPipe + +### Frontend +- React +- TypeScript +- Vite +- TailwindCSS + +## Getting Started + +### Prerequisites +- Python 3.12 (the version used by CI and the Docker image) +- Node.js 22 (the version used by CI) +- npm or yarn + +### Backend Setup + +```bash +cd backend +pip install -r requirements.txt +python -m app.main +``` + +### Frontend Setup + +```bash +cd frontend +npm install +npm run dev +``` + +## Development + +Run the development servers: + +```bash +# Start both frontend and backend +.\start.ps1 +``` + +## Docker + +Build and run using Docker: + +```bash +docker build -t sanketsetu . +docker run -p 8000:8000 sanketsetu +``` + +## Testing + +Run backend tests: + +```bash +cd backend +pytest +``` + +## License + +All rights reserved. 
+ +## Author + +Devrajsinh Gohil (devrajsinh2012) diff --git a/SanketSetu_ Production-Grade Implementation Plan.md b/SanketSetu_ Production-Grade Implementation Plan.md new file mode 100644 index 0000000000000000000000000000000000000000..e877d0d19dac08ead1126465f3819bedf184bef4 --- /dev/null +++ b/SanketSetu_ Production-Grade Implementation Plan.md @@ -0,0 +1,99 @@ +# SanketSetu: Production-Grade Implementation Plan + +## 1. Executive Summary +**SanketSetu** (Bridge of Signs) is a high-performance, real-time Gujarati Sign Language (GSL) recognition system. This document outlines a production-ready architecture designed to run entirely on **free-tier cloud services**. The system leverages a decoupled architecture with a React-based interactive frontend and a FastAPI backend, ensuring low-latency inference and a seamless user experience. + +--- + +## 2. High-Level System Architecture +The system follows a modern microservices-inspired pattern to ensure scalability and ease of updates. + +| Component | Technology | Role | Hosting (Free Tier) | +| :--- | :--- | :--- | :--- | +| **Frontend** | React + Vite + TS | User interface, webcam capture, real-time feedback | **Vercel** | +| **Backend API** | FastAPI (Python) | WebSocket management, API gateway, logic | **Fly.io** | +| **Inference Engine** | ONNX Runtime / XGBoost | High-speed model execution | **Fly.io** (Internal) | +| **Storage** | Cloudflare R2 | S3-compatible storage for model weights | **Cloudflare** | +| **Real-time** | WebSockets (WSS) | Low-latency frame-by-frame data transfer | N/A | + +--- + +## 3. Backend Implementation Details + +### 3.1 API Design (FastAPI) +The backend is built for speed. It handles binary data from WebSockets to minimize overhead. + +* **WebSocket Protocol**: The client sends a stream of normalized hand landmark coordinates (63 values per frame: 21 landmarks × x, y, z) extracted locally via MediaPipe. This reduces bandwidth significantly compared to sending raw video frames. 
+* **Concurrency**: Uses `asyncio` to handle multiple simultaneous user connections without blocking the event loop. +* **Model Loading**: Models are loaded into memory at startup using a Singleton pattern to ensure zero-latency on the first request. + +### 3.2 Model Serving Strategy +1. **Primary Model**: The **XGBoost** model is used as the default due to its sub-millisecond inference time. +2. **Backup/Ensemble**: The system can optionally query the **CNN+SVM** or **LGBM** models for high-confidence verification if the XGBoost score is below a certain threshold. +3. **Optimization**: Models are converted to **ONNX** format to leverage the ONNX Runtime's hardware-specific optimizations, even on free-tier CPU instances. + +--- + +## 4. Frontend & Interactive UI/UX + +The frontend is designed to be "cool," responsive, and highly interactive, providing users with a "futuristic" feel. + +### 4.1 Tech Stack +* **Styling**: Tailwind CSS for rapid, modern UI development. +* **Animations**: Framer Motion for smooth transitions, layout changes, and interactive elements. +* **Icons**: Lucide React for a clean, consistent icon set. + +### 4.2 Key UI Features +* **Glassmorphism Design**: Use of semi-transparent backgrounds with blur effects for a modern look. +* **Interactive Landmark Overlay**: A canvas overlay on the webcam feed that draws the 21 hand landmarks in real-time. Landmarks will "glow" when a sign is successfully recognized. +* **Dynamic Prediction HUD**: A Head-Up Display (HUD) style interface that shows the current prediction, confidence level, and a history of recently detected signs. +* **Responsive Layout**: Fully functional on mobile and desktop, with optimized camera controls for both. + +### 4.3 User Experience Flow +1. **Onboarding**: A quick, animated guide on how to position the hand for best results. +2. **Calibration**: A brief "Ready?" state that ensures the lighting and hand distance are optimal. +3. 
**Real-time Translation**: Instant feedback as the user signs, with the translated Gujarati text appearing in a stylized "speech bubble" or text box. + +--- + +## 5. Deployment & DevOps + +### 5.1 Continuous Integration/Deployment (CI/CD) +Using **GitHub Actions**, the project will follow a strict deployment pipeline: +1. **Lint & Test**: Ensure code quality and run unit tests for ML logic. +2. **Build**: Create optimized production builds for the React app and Dockerize the FastAPI backend. +3. **Deploy**: + * Frontend automatically pushes to **Vercel**. + * Backend pushes to **Fly.io** using `flyctl`. + +### 5.2 Scalability & Cost Management +* **Scale-to-Zero**: The backend on Fly.io can be configured to sleep when not in use to preserve free-tier resources. +* **CDN Caching**: Vercel's Edge Network will cache all static assets, ensuring fast load times globally. + +--- + +## 6. Implementation Roadmap + +### Phase 1: Core Backend & ML Integration +- [ ] Set up FastAPI project structure. +- [ ] Implement WebSocket handler for landmark data. +- [ ] Integrate the trained XGBoost model for real-time inference. + +### Phase 2: Advanced Frontend Development +- [ ] Initialize Vite + React project with Tailwind. +- [ ] Implement webcam capture and MediaPipe landmark extraction (client-side). +- [ ] Create the interactive HUD and glassmorphism UI. + +### Phase 3: Production Hardening +- [ ] Set up GitHub Actions for automated deployment. +- [ ] Implement error handling for low-bandwidth scenarios. +- [ ] Finalize documentation and user guide. + +--- + +## 7. References +[1] [FastAPI Documentation](https://fastapi.tiangolo.com/) - High-performance web framework for building APIs. +[2] [MediaPipe Hands](https://developers.google.com/mediapipe/solutions/vision/hand_landmarker) - Real-time hand landmark detection. +[3] [Framer Motion](https://www.framer.com/motion/) - A production-ready motion library for React. 
+[4] [Fly.io Free Tier](https://fly.io/docs/about/pricing/) - Details on free-tier resource allocation. +[5] [Vercel Deployment](https://vercel.com/docs/deployments/overview) - Global CDN and hosting for frontend applications. diff --git a/TASKS.md b/TASKS.md new file mode 100644 index 0000000000000000000000000000000000000000..2a397bc7610da3482b311851bcfa1d850754f355 --- /dev/null +++ b/TASKS.md @@ -0,0 +1,284 @@ +# SanketSetu — Execution TODO & Implementation Tracker + +## Model Analysis (Reviewed 2026-03-02) + +All 5 model files inspected. Three distinct inference pipelines exist: + +| Pipeline | Files | Input | Process | Output | +|---|---|---|---|---| +| **A — Primary (Fastest)** | `Mediapipe_XGBoost/model.pkl` | 63 MediaPipe coords (21 landmarks × x,y,z) | XGBClassifier (50 trees) | 34-class probability | +| **B — Autoencoder + LGBM** | `CNN_Autoencoder_LightGBM/autoencoder_model.pkl` + `lgbm_model.pkl` | 63 MediaPipe coords | Encoder (63→32→**16** bottleneck) + LGBMClassifier | 34-class probability | +| **C — Vision CNN + SVM** | `CNN_PreTrained/cnn_model.pkl` + `svm_model.pkl` | 128×128×3 RGB image | ResNet50-based CNN (179 layers) → 256 features + SVC(C=10) | 34-class probability w/ probability=True | + +### Key Architecture Facts +- **34 classes** (Gujarati Sign Language alphabet + digits, labels 0–33) +- **Pipeline A** input: 63 floats — directly from MediaPipe `hand_landmarks` (x, y, z per landmark, flattened) +- **Pipeline B** input: same 63 floats → takes only the encoder half (first 3 Dense layers, output of `dense_1` layer = 16 features) +- **Pipeline C** input: 128×128 BGR/RGB cropped hand image, normalized to [0,1] +- All `.pth` files are identical copies of the `.pkl` files (same objects, different extension) +- Model quality strategy: A is primary (sub-ms); if confidence < threshold, query B or C for ensemble + +--- + +## Project Folder Structure to Create + +``` +SanketSetu/ +├── backend/ ← FastAPI server +│ ├── app/ +│ │ ├── main.py ← FastAPI 
entry, WebSocket + REST +│ │ ├── models/ +│ │ │ ├── loader.py ← Singleton model loader +│ │ │ └── label_map.py ← 0–33 → Gujarati sign name mapping +│ │ ├── inference/ +│ │ │ ├── pipeline_a.py ← XGBoost inference (63 landmarks) +│ │ │ ├── pipeline_b.py ← Autoencoder encoder + LightGBM +│ │ │ ├── pipeline_c.py ← ResNet CNN + SVM (image-based) +│ │ │ └── ensemble.py ← Confidence-weighted ensemble logic +│ │ ├── schemas.py ← Pydantic request/response models +│ │ └── config.py ← Settings (confidence threshold, etc.) +│ ├── weights/ ← Symlink or copy of model pkl files +│ ├── requirements.txt +│ ├── Dockerfile +│ └── fly.toml +│ +├── frontend/ ← Vite + React + TS +│ ├── src/ +│ │ ├── components/ +│ │ │ ├── WebcamFeed.tsx ← Webcam + canvas landmark overlay +│ │ │ ├── LandmarkCanvas.tsx ← Draws 21 hand points + connections +│ │ │ ├── PredictionHUD.tsx ← Live sign, confidence bar, history +│ │ │ ├── OnboardingGuide.tsx ← Animated intro wizard +│ │ │ └── Calibration.tsx ← Lighting/distance check UI +│ │ ├── hooks/ +│ │ │ ├── useWebSocket.ts ← WS connection, send/receive +│ │ │ ├── useMediaPipe.ts ← MediaPipe Hands JS integration +│ │ │ └── useWebcam.ts ← Camera permissions + stream +│ │ ├── lib/ +│ │ │ └── landmarkUtils.ts ← Landmark normalization (mirror XGBoost preprocessing) +│ │ ├── App.tsx +│ │ └── main.tsx +│ ├── public/ +│ ├── index.html +│ ├── tailwind.config.ts +│ ├── vite.config.ts +│ └── package.json +│ +├── CNN_Autoencoder_LightGBM/ ← (existing) +├── CNN_PreTrained/ ← (existing) +├── Mediapipe_XGBoost/ ← (existing) +└── .github/ + └── workflows/ + ├── deploy-backend.yml + └── deploy-frontend.yml +``` + +--- + +## Phase 1 — Backend Core (FastAPI + Model Integration) + +### 1.1 Project Bootstrap +- [x] Create `backend/` folder and `app/` package structure +- [x] Create `backend/requirements.txt` with: `fastapi`, `uvicorn[standard]`, `websockets`, `xgboost`, `lightgbm`, `scikit-learn`, `keras==3.13.2`, `tensorflow-cpu`, `numpy`, `opencv-python-headless`, `pillow`, 
`python-dotenv` +- [x] Create `backend/app/config.py` — confidence threshold (default 0.7), WebSocket max connections, pipeline mode (A/B/C/ensemble) +- [x] Create `backend/app/models/label_map.py` — map class indices 0–33 to Gujarati sign names + +### 1.2 Model Loader (Singleton) +- [x] Create `backend/app/models/loader.py` + - Load `model.pkl` (XGBoost) at startup + - Load `autoencoder_model.pkl` (extract encoder layers only: input → dense → dense_1) and `lgbm_model.pkl` + - Load `cnn_model.pkl` (full ResNet50 feature extractor, strip any classification head) and `svm_model.pkl` + - Expose `ModelStore` singleton accessed via `get_model_store()` dependency + - Log load times for each model + +### 1.3 Pipeline A — XGBoost (Primary, Landmarks) +- [x] Create `backend/app/inference/pipeline_a.py` + - Input: `List[float]` of length 63 (x,y,z per landmark, already normalized by MediaPipe) + - Output: `{"sign": str, "confidence": float, "probabilities": List[float]}` + - Use `model.predict_proba(np.array(landmarks).reshape(1,-1))[0]` + - Return `classes_[argmax]` and `max(probabilities)` as confidence + +### 1.4 Pipeline B — Autoencoder Encoder + LightGBM +- [x] Create `backend/app/inference/pipeline_b.py` + - Build encoder-only submodel: `encoder = keras.Model(inputs=model.input, outputs=model.layers[2].output)` (output of `dense_1`, the 16-D bottleneck) + - Input: 63 MediaPipe coords + - Encode: `features = encoder.predict(np.array(landmarks).reshape(1,-1))[0]` → shape (16,) + - Classify: `lgbm.predict_proba(features.reshape(1,-1))[0]` + +### 1.5 Pipeline C — CNN + SVM (Image-based) +- [x] Create `backend/app/inference/pipeline_c.py` + - Input: base64-encoded JPEG or raw bytes of the cropped hand region (128×128 px) + - Decode → numpy array (128,128,3) uint8 → normalize to float32 [0,1] + - `features = cnn_model.predict(img[np.newaxis])[0]` → shape (256,) + - `proba = svm.predict_proba(features.reshape(1,-1))[0]` + - Note: CNN inference is slower (~50–200ms on CPU); 
only call when Pipeline A confidence < threshold + +### 1.6 Ensemble Logic +- [x] Create `backend/app/inference/ensemble.py` + - Call Pipeline A first + - If `confidence < config.THRESHOLD` (default 0.7), call Pipeline B + - If still below threshold and image data available, call Pipeline C + - Final result: weighted average of probabilities from each pipeline that was called + - Return the top predicted class and ensemble confidence score + +### 1.7 WebSocket Handler +- [x] Create `backend/app/main.py` with FastAPI app +- [x] Implement `GET /health` — returns `{"status": "ok", "models_loaded": true}` +- [x] Implement `WS /ws/landmarks` — primary endpoint + - Client sends JSON: `{"landmarks": [63 floats], "session_id": "..."}` + - Server responds: `{"sign": "...", "confidence": 0.95, "pipeline": "A", "label_index": 12}` + - Handle disconnect gracefully +- [x] Implement `WS /ws/image` — optional image-based endpoint for Pipeline C + - Client sends JSON: `{"image_b64": "...", "session_id": "..."}` +- [x] Implement `POST /api/predict` — REST fallback for non-WS clients + - Body: `{"landmarks": [63 floats]}` + - Returns same response schema as WS + +### 1.8 Schemas & Validation +- [x] Create `backend/app/schemas.py` + - `LandmarkMessage(BaseModel)`: `landmarks: List[float]` (must be length 63), `session_id: str` + - `ImageMessage(BaseModel)`: `image_b64: str`, `session_id: str` + - `PredictionResponse(BaseModel)`: `sign: str`, `confidence: float`, `pipeline: str`, `label_index: int`, `probabilities: Optional[List[float]]` + +### 1.9 CORS & Middleware +- [x] Configure CORS for Vercel frontend domain + localhost:5173 +- [x] Add request logging middleware (log session_id, pipeline used, latency ms) +- [x] Add global exception handler returning proper JSON errors + +--- + +## Phase 2 — Frontend (React + Vite + Tailwind + Framer Motion) + +### 2.1 Project Bootstrap +- [x] Run `npm create vite@latest frontend -- --template react-ts` inside `SanketSetu/` +- [x] Install deps: 
`tailwindcss`, `framer-motion`, `lucide-react`, `@mediapipe/tasks-vision` +- [x] Configure Tailwind with custom palette (dark neon-cyan glassmorphism theme) +- [x] Set up `vite.config.ts` proxy: `/api` → backend URL, `/ws` → backend WS URL + +### 2.2 Webcam Hook (`useWebcam.ts`) +- [x] Request `getUserMedia({ video: { width: 1280, height: 720 } })` +- [x] Expose `videoRef`, `isReady`, `error`, `switchCamera()` (for mobile front/back toggle) +- [x] Handle permission denied state with instructional UI + +### 2.3 MediaPipe Hook (`useMediaPipe.ts`) +- [x] Initialize `HandLandmarker` from `@mediapipe/tasks-vision` (WASM backend) +- [x] Process video frames at target 30fps using `requestAnimationFrame` +- [x] Extract `landmarks[0]` (first hand) → flatten to 63 floats `[x0,y0,z0, x1,y1,z1, ...]` +- [x] Normalize: subtract wrist (landmark 0) position to make translation-invariant — **must match training preprocessing** +- [x] Expose `landmarks: number[] | null`, `handedness: string`, `isDetecting: boolean` + +### 2.4 WebSocket Hook (`useWebSocket.ts`) +- [x] Connect to `wss://backend-url/ws/landmarks` on mount +- [x] Auto-reconnect with exponential backoff on disconnect +- [x] `sendLandmarks(landmarks: number[])` — throttled to max 15 sends/sec +- [x] Expose `lastPrediction: PredictionResponse | null`, `isConnected: boolean`, `latency: number` + +### 2.5 Landmark Canvas (`LandmarkCanvas.tsx`) +- [x] Overlay `` on top of `