diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..324dc254fe946ee25c00b27064a79982ba992129 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,57 @@ +# Use Python 3.12 as base image +FROM python:3.12-slim + +# Install system dependencies including OpenCV requirements +RUN apt-get update && apt-get install -y \ + curl \ + libgl1 \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender1 \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +# Set up a new user named "user" with user ID 1000 (HF Spaces requirement) +RUN useradd -m -u 1000 user + +# Switch to the "user" user +USER user + +# Set home to the user's home directory +ENV HOME=/home/user \ + PATH=/home/user/.local/bin:$PATH + +# Set the working directory to the user's home directory +WORKDIR $HOME/app + +# Upgrade pip and install dependencies +RUN pip install --no-cache-dir --upgrade pip + +# Copy requirements first for better Docker layer caching +COPY --chown=user docker/requirements.txt $HOME/app/ + +# Install Python dependencies +RUN pip install --no-cache-dir --user -r requirements.txt + +# Copy the source code from parent directory +COPY --chown=user ../src/ $HOME/app/src/ +COPY --chown=user ../models/ $HOME/app/models/ + +# Copy the main entry point from parent directory +COPY --chown=user ../main.py $HOME/app/ +COPY --chown=user ../README.md $HOME/app/ + +# Expose the port that the app runs on (HF Spaces default is 7860) +EXPOSE 7860 + +# Set environment variables +ENV PYTHONPATH=$HOME/app +ENV PORT=7860 + +# Health check to ensure the API is running +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:7860/health || exit 1 + +# Start the application directly +CMD ["python", "main.py"] diff --git a/README.md b/README.md index 224b0354f8ff0a115156cc26fd212118d4ad81b5..10a8ad1d488507be76be3bad6dc81dfc60e87183 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,126 @@ --- -title: Validation -emoji: 🐢 -colorFrom: purple -colorTo: yellow +title: Gesture Detection & Identity Validation API +emoji: 👋 +colorFrom: blue +colorTo: purple sdk: docker pinned: false -license: other +license: mit +app_port: 7860 --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# 👋 Gesture Detection & Identity Validation API + +A unified API for gesture detection in videos and identity validation using facial recognition and gesture verification. + +## 🚀 Features + +- **Gesture Detection**: Detect and track hand gestures in video files +- **Identity Validation**: Validate user identity using facial recognition and required gestures +- **Real-time Processing**: Efficient video processing with configurable frame skip +- **RESTful API**: Clean, documented API endpoints + +## 📋 API Endpoints + +### `GET /` +Get API information and available endpoints + +### `GET /health` +Health check endpoint showing service status + +### `POST /gestures` +Detect gestures in an uploaded video file + +**Parameters:** +- `video` (file): Video file to process +- `frame_skip` (int, optional): Number of frames to skip (default: 1) + +**Response:** +```json +{ + "gestures": [ + { + "gesture": "thumbs_up", + "duration": 45, + "confidence": 0.92 + } + ] +} +``` + +### `POST /validate` +Validate user identity using facial recognition and gesture verification + +**Parameters:** +- `photo` (file): ID document photo +- `video` (file): User video containing face and gestures +- `gestures` (JSON array): Required gestures (e.g., `["thumbs_up","peace"]`) +- `error_margin` (float, optional): Error margin for validation (default: 0.33) +- `require_all_gestures` (bool, optional): Whether all gestures must be present +- `similarity_threshold` (float, optional): Facial similarity threshold +- `include_details` (bool, optional): Include detailed validation results + +**Response:** +```json +{ + "face": true, + "gestures": true, + "overall": true, + "status": "success", + "processing_time_ms": 6925, + "timestamp": "2025-09-30T08:30:22Z" +} +``` + +## 🎯 Supported Gestures + +- `thumbs_up` (👍) +- `peace` (✌️) +- `ok_sign` (👌) +- `open_palm` (👋) +- `call_me` (🤙) +- `grabbing` (✊) + +## 📖 Documentation + +Interactive API documentation is available at: +- **Swagger UI**: `/docs` +- **ReDoc**: `/redoc` + +## 🔧 Usage Example + +```bash +# Detect gestures in a video +curl -X POST http://localhost:7860/gestures \ + -F "video=@my_video.mp4" \ + -F "frame_skip=3" + +# Validate identity +curl -X POST http://localhost:7860/validate \ + -F "photo=@id_photo.jpg" \ + -F "video=@user_video.mp4" \ + -F 'gestures=["thumbs_up","peace"]' \ + -F "include_details=true" +``` + +## 🏗️ Technology Stack + +- **Framework**: FastAPI +- **ML Models**: ONNX Runtime +- **Computer Vision**: OpenCV +- **Tracking**: OCSort with Kalman filters +- **Facial Recognition**: Custom embeddings module + +## 📝 Note + +Facial validation is currently in placeholder mode and always returns success. Gesture validation is fully functional. + +## 📄 License + +MIT License - See LICENSE file for details + +## 🔗 Links + +- [GitHub Repository](https://github.com/kybtech/gesture-detection) +- [API Documentation](/docs) +- [Hugging Face Space](https://huggingface.co/spaces/algoryn/validation) \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..c6c92c8dcc26470d3c980277fa167d3dc3c48735 --- /dev/null +++ b/main.py @@ -0,0 +1,337 @@ +#!/usr/bin/env python3 +""" +Main entry point for the unified gesture detection and identity validation API. +Provides a flat API structure with all endpoints at the root level. +""" +import uvicorn +import os +import sys +import tempfile +import time +import json +import logging +from typing import Optional +from datetime import datetime, timezone + +# Add the project root to Python path +project_root = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, project_root) + +from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Depends +from fastapi.responses import ORJSONResponse + +# Import gesture detection functionality +from src.gesturedetection.api import process_video_for_gestures +from src.gesturedetection.models import GestureResponse + +# Import validation functionality +from src.validate.models import ValidationRequest, ValidationResponse, ValidationStatus +from src.validate.facial_validator import FacialValidator +from src.validate.gesture_validator import GestureValidator +from src.validate.api import get_validation_request +from src.validate.config import config + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Create main FastAPI application +app = FastAPI( + title="Gesture Detection & Identity Validation API", + description="Unified API for gesture detection and identity validation services", + version="1.0.0", + docs_url="/docs", + redoc_url="/redoc", + default_response_class=ORJSONResponse +) + +# Initialize validators for validation endpoint +facial_validator = FacialValidator() +gesture_validator = GestureValidator() + + +@app.get("/") +async def root(): + """ + Root endpoint providing API information. + + Returns + ------- + dict + API information and available endpoints + """ + return { + "name": "Gesture Detection & Identity Validation API", + "version": "1.0.0", + "description": "Unified API providing gesture detection and identity validation services", + "endpoints": { + "GET /": "API information", + "GET /health": "Health check", + "POST /validate": "Validate identity using facial recognition and gestures", + "POST /gestures": "Detect gestures in video", + "GET /docs": "Interactive API documentation" + } + } + + +@app.get("/health") +async def health(): + """ + Health check endpoint for the unified API. + + Returns + ------- + dict + Health status of all service components + """ + return { + "status": "healthy", + "service": "unified-api", + "version": "1.0.0", + "timestamp": datetime.now(timezone.utc).isoformat(), + "components": { + "gesture_detection": "available", + "identity_validation": "available", + "facial_validator": "initialized", + "gesture_validator": "initialized" + } + } + + +@app.post("/gestures", response_model=GestureResponse) +async def detect_gestures(video: UploadFile = File(...), frame_skip: int = Form(1)): + """ + Detect gestures in an uploaded video file. + + Parameters + ---------- + video : UploadFile + The video file to process + frame_skip : int + Number of frames to skip between processing (1 = process every frame, 3 = process every 3rd frame) + + Returns + ------- + GestureResponse + Response containing detected gestures with duration and confidence + """ + logger.info(f"Gesture detection request received: {video.filename}") + + # Validate file type + if not video.content_type or not video.content_type.startswith('video/'): + raise HTTPException(status_code=400, detail="File must be a video") + + # Create temporary file to save uploaded video + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_file: + try: + # Write uploaded content to temporary file + content = await video.read() + temp_file.write(content) + temp_file.flush() + + logger.info(f"Processing video: {temp_file.name} ({len(content)} bytes)") + + # Process the video with frame skip parameter + gestures = process_video_for_gestures(temp_file.name, frame_skip=frame_skip) + + logger.info(f"Gesture detection completed: {len(gestures)} gestures detected") + + return GestureResponse(gestures=gestures) + + except Exception as e: + logger.error(f"Error processing video: {str(e)}", exc_info=True) + raise HTTPException(status_code=500, detail=f"Error processing video: {str(e)}") + + finally: + # Clean up temporary file + if os.path.exists(temp_file.name): + os.unlink(temp_file.name) + logger.debug(f"Cleaned up temporary file: {temp_file.name}") + + +@app.post("/validate", response_model=ValidationResponse) +async def validate_identity( + photo: UploadFile = File(...), + video: UploadFile = File(...), + request: ValidationRequest = Depends(get_validation_request) +): + """ + Validate user identity using facial recognition and gesture validation. + + This endpoint accepts an ID document photo, a user video containing + the person's face and required gestures, and a list of gestures that + must be performed. It returns validation results for both facial + recognition and gesture compliance. + + Parameters + ---------- + photo : UploadFile + ID document photo file (image format) + video : UploadFile + User video file containing face and gestures (video format) + request : ValidationRequest + Validation configuration and gesture requirements + + Returns + ------- + ValidationResponse + Validation results with success indicators and optional details + + Raises + ------ + HTTPException + If validation fails or processing errors occur + """ + start_time = time.time() + logger.info(f"Identity validation request received for {request.asked_gestures}") + + # Validate file types + if not photo.content_type or not photo.content_type.startswith(('image/', 'application/')): + raise HTTPException( + status_code=400, + detail="Photo file must be an image" + ) + + if not video.content_type or not video.content_type.startswith('video/'): + raise HTTPException( + status_code=400, + detail="Video file must be a video" + ) + + # Validate file sizes (basic check) + MAX_FILE_SIZE = 100 * 1024 * 1024 # 100MB + if photo.size and photo.size > MAX_FILE_SIZE: + raise HTTPException( + status_code=413, + detail="Photo file too large (max 100MB)" + ) + + if video.size and video.size > MAX_FILE_SIZE: + raise HTTPException( + status_code=413, + detail="Video file too large (max 100MB)" + ) + + # Create temporary files for processing + temp_photo = None + temp_video = None + + try: + # Save uploaded files to temporary location + with tempfile.NamedTemporaryFile(delete=False, suffix=f"_photo.{photo.filename.split('.')[-1] if '.' in photo.filename else 'jpg'}") as temp_photo_file: + temp_photo = temp_photo_file.name + photo_content = await photo.read() + temp_photo_file.write(photo_content) + + with tempfile.NamedTemporaryFile(delete=False, suffix=f"_video.{video.filename.split('.')[-1] if '.' in video.filename else 'mp4'}") as temp_video_file: + temp_video = temp_video_file.name + video_content = await video.read() + temp_video_file.write(video_content) + + logger.info(f"Files saved: photo={temp_photo}, video={temp_video}") + + # Perform facial validation + logger.info("Starting facial validation") + + # Update facial validator with request-specific parameters if provided + if request.similarity_threshold is not None: + facial_validator.similarity_threshold = request.similarity_threshold + if request.frame_sample_rate is not None: + facial_validator.frame_sample_rate = request.frame_sample_rate + + face_result = facial_validator.validate_facial_match(temp_photo, temp_video) + + # Perform gesture validation + logger.info("Starting gesture validation") + + # Update gesture validator with request-specific parameters if provided + if request.confidence_threshold is not None: + gesture_validator.confidence_threshold = request.confidence_threshold + if request.min_gesture_duration is not None: + gesture_validator.min_gesture_duration = request.min_gesture_duration + + gesture_result = gesture_validator.validate_gestures( + temp_video, + request.asked_gestures, + error_margin=request.error_margin, + require_all=request.require_all_gestures + ) + + # Determine overall result + overall_success = face_result.success and gesture_result.success + overall_status = ValidationStatus.SUCCESS if overall_success else ValidationStatus.PARTIAL + + # Calculate processing time + processing_time_ms = int((time.time() - start_time) * 1000) + + # Build response + response = ValidationResponse( + face=face_result.success, + gestures=gesture_result.success, + overall=overall_success, + status=overall_status, + face_result=face_result if request.include_details else None, + gesture_result=gesture_result if request.include_details else None, + processing_time_ms=processing_time_ms, + timestamp=datetime.now(timezone.utc).isoformat() + ) + + # Log results + logger.info( + "Identity validation completed", + extra={ + "face_success": face_result.success, + "gesture_success": gesture_result.success, + "overall_success": overall_success, + "processing_time_ms": processing_time_ms, + "requested_gestures": request.asked_gestures + } + ) + + return response + + except Exception as e: + logger.error(f"Error during identity validation: {str(e)}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"Internal server error during validation: {str(e)}" + ) + + finally: + # Clean up temporary files + for temp_file in [temp_photo, temp_video]: + if temp_file and os.path.exists(temp_file): + try: + os.unlink(temp_file) + logger.debug(f"Cleaned up temporary file: {temp_file}") + except Exception as e: + logger.warning(f"Failed to clean up temporary file {temp_file}: {e}") + + +def main(): + """Start the unified API server.""" + # Get port from environment variable, default to 7860 for HF Spaces compatibility + port = int(os.getenv("PORT", 7860)) + + print("🚀 Starting Unified Gesture Detection & Identity Validation API") + print(f"📍 API will be available at: http://localhost:{port}") + print(f"📚 API documentation at: http://localhost:{port}/docs") + print(f"❤️ Health check at: http://localhost:{port}/health") + print(f"🔐 Identity validation at: POST http://localhost:{port}/validate") + print(f"👋 Gesture detection at: POST http://localhost:{port}/gestures") + print("\nPress Ctrl+C to stop the server") + + uvicorn.run( + app, + host="0.0.0.0", + port=port, + reload=False, # Disable reload in production/Docker + log_level="info" + ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/crops_classifier.onnx b/models/crops_classifier.onnx new file mode 100644 index 0000000000000000000000000000000000000000..a019db7f2c994afb00c4bc717b1ea2185bedd267 --- /dev/null +++ b/models/crops_classifier.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12a02344f63a7c4f2a2ca90f8740ca10a08c17b683b5585d73c3e88323056762 +size 411683 diff --git a/models/hand_detector.onnx b/models/hand_detector.onnx new file mode 100644 index 0000000000000000000000000000000000000000..80926609a0c78313f403a546636b84ffc259081b --- /dev/null +++ b/models/hand_detector.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8ef73d466b61a8e8677be9c47008b217a11d1b265d95e36bf2521ff93329af6 +size 1219959 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd597ea31a2a35260201264f753ed6a457e9104a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +# Direct dependencies from pyproject.toml +filterpy>=1.4.5 +onnx>=1.19.0 +onnxruntime>=1.22.1 +opencv-contrib-python>=4.12.0.88 +fastapi>=0.104.0 +pydantic>=2.0.0 +uvicorn>=0.24.0 +python-multipart>=0.0.6 +orjson>=3.9.0 +numpy>=1.24.0 +scipy>=1.11.0 +logfire[fastapi,sqlite3,httpx]>=0.0.0 diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a0107a557e18d82a8fe4a4ddff0468fcabc9293d Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/facialembeddingsmatch/__init__.py b/src/facialembeddingsmatch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5f8d9fce51f8868db7d0c61001302c21657714cc --- /dev/null +++ b/src/facialembeddingsmatch/__init__.py @@ -0,0 +1,15 @@ +""" +Facial embeddings matching module for identity verification. + +This module provides facial recognition functionality using embedding-based +matching algorithms. It handles face detection, feature extraction, and +similarity comparison for identity verification purposes. +""" + +__version__ = "1.0.0" +__all__ = [ + "FacialEmbeddingMatcher", + "FaceDetector", + "EmbeddingExtractor", + "SimilarityCalculator" +] diff --git a/src/facialembeddingsmatch/__pycache__/__init__.cpython-312.pyc b/src/facialembeddingsmatch/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aea955201feb3474514f730571659fcaecaa4193 Binary files /dev/null and b/src/facialembeddingsmatch/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/facialembeddingsmatch/__pycache__/facial_matcher.cpython-312.pyc b/src/facialembeddingsmatch/__pycache__/facial_matcher.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da053ed9d4c3262ab0a49888c880f8433dbcab4c Binary files /dev/null and b/src/facialembeddingsmatch/__pycache__/facial_matcher.cpython-312.pyc differ diff --git a/src/facialembeddingsmatch/facial_matcher.py b/src/facialembeddingsmatch/facial_matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..5cba1978a59999d611c5f445939377755468f1e5 --- /dev/null +++ b/src/facialembeddingsmatch/facial_matcher.py @@ -0,0 +1,433 @@ +""" +Facial embedding matcher for identity verification. + +This module provides comprehensive facial recognition functionality including +face detection, embedding extraction, and similarity comparison. It serves +as the core facial matching component for the identity validation system. +""" + +import os +import logging +import tempfile +from typing import List, Dict, Any, Optional, Tuple +from datetime import datetime, timezone +import numpy as np + +logger = logging.getLogger(__name__) + + +class FaceDetector: + """ + Face detection component for identifying faces in images. + + This class handles face detection in both ID photos and video frames. + Currently implemented as a stub, designed to be replaced with actual + face detection algorithms (e.g., MTCNN, DLib, or OpenCV cascades). + """ + + def __init__(self, confidence_threshold: float = 0.8): + """ + Initialize the face detector. + + Parameters + ---------- + confidence_threshold : float, optional + Minimum confidence threshold for face detection, by default 0.8 + """ + self.confidence_threshold = confidence_threshold + logger.info(f"FaceDetector initialized with confidence_threshold={confidence_threshold}") + + def detect_faces(self, image_path: str) -> List[Dict[str, Any]]: + """ + Detect faces in an image. + + This is currently a stub implementation that simulates face detection. + In the future, this will be replaced with actual face detection algorithms. + + Parameters + ---------- + image_path : str + Path to the image file + + Returns + ------- + List[Dict[str, Any]] + List of detected faces with bounding boxes and confidence scores + """ + logger.debug(f"Detecting faces in {image_path} (stub implementation)") + + # Validate input file + if not os.path.exists(image_path): + logger.error(f"Image file not found: {image_path}") + raise FileNotFoundError(f"Image file not found: {image_path}") + + # Stub implementation: simulate detecting one face + # In a real implementation, this would use actual face detection + detected_faces = [ + { + "bbox": [100, 100, 200, 200], # x1, y1, x2, y2 + "confidence": 0.95, + "landmarks": None, # Facial landmarks if available + "image_path": image_path + } + ] + + logger.debug(f"Detected {len(detected_faces)} faces") + return detected_faces + + +class EmbeddingExtractor: + """ + Facial embedding extraction component. + + This class extracts facial feature embeddings from detected faces. + Currently implemented as a stub, designed to be replaced with actual + embedding extraction models (e.g., FaceNet, ArcFace, or VGGFace). + """ + + def __init__(self, model_path: Optional[str] = None): + """ + Initialize the embedding extractor. + + Parameters + ---------- + model_path : Optional[str], optional + Path to the embedding extraction model, by default None + """ + self.model_path = model_path + logger.info(f"EmbeddingExtractor initialized with model_path={model_path}") + + def extract_embedding(self, image_path: str, face_bbox: List[int]) -> Optional[np.ndarray]: + """ + Extract facial embedding from a face region. + + This is currently a stub implementation that returns a random embedding. + In the future, this will extract actual facial embeddings using deep learning models. + + Parameters + ---------- + image_path : str + Path to the image file + face_bbox : List[int] + Bounding box coordinates [x1, y1, x2, y2] + + Returns + ------- + Optional[np.ndarray] + Facial embedding vector, or None if extraction fails + """ + logger.debug(f"Extracting embedding from {image_path} with bbox {face_bbox}") + + # Validate input file + if not os.path.exists(image_path): + logger.error(f"Image file not found: {image_path}") + return None + + # Stub implementation: return deterministic 128-dimensional embedding for testing + # In a real implementation, this would use a trained model + # Use a seed based on the image path to make it deterministic for testing + import hashlib + seed = int(hashlib.md5(image_path.encode()).hexdigest()[:8], 16) % 2**32 + np.random.seed(seed) + embedding = np.random.randn(128).astype(np.float32) + # Normalize the embedding + embedding = embedding / np.linalg.norm(embedding) + + logger.debug(f"Extracted embedding with shape {embedding.shape}") + return embedding + + +class SimilarityCalculator: + """ + Similarity calculation component for comparing facial embeddings. + + This class computes similarity scores between facial embeddings using + various distance metrics. Currently supports cosine similarity. + """ + + def __init__(self): + """Initialize the similarity calculator.""" + logger.info("SimilarityCalculator initialized") + + def calculate_similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float: + """ + Calculate similarity between two facial embeddings. + + Parameters + ---------- + embedding1 : np.ndarray + First facial embedding + embedding2 : np.ndarray + Second facial embedding + + Returns + ------- + float + Similarity score between 0.0 (dissimilar) and 1.0 (identical) + """ + # Calculate cosine similarity + dot_product = np.dot(embedding1, embedding2) + norm1 = np.linalg.norm(embedding1) + norm2 = np.linalg.norm(embedding2) + + if norm1 == 0 or norm2 == 0: + return 0.0 + + cosine_similarity = dot_product / (norm1 * norm2) + + # Convert to similarity score (0.0 to 1.0) + similarity = (cosine_similarity + 1.0) / 2.0 + + logger.debug(f"Calculated similarity: {similarity}") + return similarity + + +class FacialEmbeddingMatcher: + """ + Main facial embedding matcher for identity verification. + + This class orchestrates the complete facial recognition pipeline: + face detection, embedding extraction, and similarity comparison. + It serves as the primary interface for facial matching functionality. + """ + + def __init__( + self, + detector_confidence: float = 0.8, + similarity_threshold: float = 0.7, + embedding_model_path: Optional[str] = None + ): + """ + Initialize the facial embedding matcher. + + Parameters + ---------- + detector_confidence : float, optional + Confidence threshold for face detection, by default 0.8 + similarity_threshold : float, optional + Similarity threshold for facial matching, by default 0.7 + embedding_model_path : Optional[str], optional + Path to embedding extraction model, by default None + """ + self.detector_confidence = detector_confidence + self.similarity_threshold = similarity_threshold + self.embedding_model_path = embedding_model_path + + # Initialize components + self.face_detector = FaceDetector(confidence_threshold=detector_confidence) + self.embedding_extractor = EmbeddingExtractor(model_path=embedding_model_path) + self.similarity_calculator = SimilarityCalculator() + + logger.info( + "FacialEmbeddingMatcher initialized", + extra={ + "detector_confidence": detector_confidence, + "similarity_threshold": similarity_threshold, + "embedding_model_path": embedding_model_path + } + ) + + def match_faces( + self, + id_image_path: str, + video_path: str, + frame_sample_rate: int = 10 + ) -> Dict[str, Any]: + """ + Match faces between ID image and video frames. + + This method performs comprehensive facial matching by: + 1. Detecting faces in the ID image + 2. Sampling frames from the video and detecting faces + 3. Extracting embeddings from detected faces + 4. Computing similarity scores + 5. Determining overall match result + + Parameters + ---------- + id_image_path : str + Path to the ID document image + video_path : str + Path to the user video + frame_sample_rate : int, optional + Rate at which to sample video frames, by default 10 + + Returns + ------- + Dict[str, Any] + Matching results with similarity scores and metadata + """ + logger.info(f"Starting facial matching between {id_image_path} and {video_path}") + + try: + # Step 1: Extract reference embedding from ID image + id_faces = self.face_detector.detect_faces(id_image_path) + + if not id_faces: + return { + "success": False, + "error": "No faces detected in ID image", + "similarity_score": 0.0, + "matches": False, + "details": { + "id_faces_detected": 0, + "video_faces_detected": 0, + "processing_timestamp": datetime.now(timezone.utc).isoformat() + } + } + + except FileNotFoundError as e: + return { + "success": False, + "error": f"File not found: {str(e)}", + "similarity_score": 0.0, + "matches": False, + "details": { + "id_faces_detected": 0, + "video_faces_detected": 0, + "processing_timestamp": datetime.now(timezone.utc).isoformat() + } + } + + # Extract embedding from the first (best) face in ID image + id_face = id_faces[0] + id_embedding = self.embedding_extractor.extract_embedding( + id_image_path, id_face["bbox"] + ) + + if id_embedding is None: + return { + "success": False, + "error": "Failed to extract embedding from ID image", + "similarity_score": 0.0, + "matches": False, + "details": { + "id_faces_detected": len(id_faces), + "video_faces_detected": 0, + "processing_timestamp": datetime.now(timezone.utc).isoformat() + } + } + + # Step 2: Extract faces from video frames + video_faces = self._extract_faces_from_video(video_path, frame_sample_rate) + + if not video_faces: + return { + "success": False, + "error": "No faces detected in video", + "similarity_score": 0.0, + "matches": False, + "details": { + "id_faces_detected": len(id_faces), + "video_faces_detected": 0, + "processing_timestamp": datetime.now(timezone.utc).isoformat() + } + } + + # Step 3: Compare embeddings and find best match + best_similarity = 0.0 + best_video_face = None + + for video_face in video_faces: + video_embedding = self.embedding_extractor.extract_embedding( + video_path, video_face["bbox"] + ) + + if video_embedding is not None: + similarity = self.similarity_calculator.calculate_similarity( + id_embedding, video_embedding + ) + + if similarity > best_similarity: + best_similarity = similarity + best_video_face = video_face + + # Step 4: Determine if faces match + matches = best_similarity >= self.similarity_threshold + + result = { + "success": True, + "matches": matches, + "similarity_score": best_similarity, + "similarity_threshold": self.similarity_threshold, + "details": { + "id_faces_detected": len(id_faces), + "video_faces_detected": len(video_faces), + "best_video_face": best_video_face, + "processing_timestamp": datetime.now(timezone.utc).isoformat(), + "frame_sample_rate": frame_sample_rate, + "note": "This is a stub implementation. Real facial recognition will be implemented in the future." + } + } + + logger.info( + "Facial matching completed", + extra={ + "matches": matches, + "similarity_score": best_similarity, + "faces_detected_id": len(id_faces), + "faces_detected_video": len(video_faces) + } + ) + + return result + + except Exception as e: + logger.error(f"Error during facial matching: {str(e)}", exc_info=True) + return { + "success": False, + "error": f"Processing error: {str(e)}", + "similarity_score": 0.0, + "matches": False, + "details": { + "processing_timestamp": datetime.now(timezone.utc).isoformat() + } + } + + def _extract_faces_from_video(self, video_path: str, frame_sample_rate: int) -> List[Dict[str, Any]]: + """ + Extract faces from video frames. + + This method samples frames from the video and detects faces in each frame. + Currently implemented as a stub that simulates face detection. + + Parameters + ---------- + video_path : str + Path to the video file + frame_sample_rate : int + Rate at which to sample frames + + Returns + ------- + List[Dict[str, Any]] + List of detected faces with frame information + """ + logger.debug(f"Extracting faces from video: {video_path}") + + # Stub implementation: simulate detecting faces in video + # In a real implementation, this would: + # 1. Open the video file + # 2. Sample frames at the specified rate + # 3. Detect faces in each sampled frame + # 4. Return face information with frame metadata + + detected_faces = [ + { + "bbox": [120, 120, 220, 220], # x1, y1, x2, y2 + "confidence": 0.92, + "frame_number": 15, + "timestamp": 0.5, # seconds + "image_path": video_path + }, + { + "bbox": [110, 110, 210, 210], + "confidence": 0.88, + "frame_number": 30, + "timestamp": 1.0, + "image_path": video_path + } + ] + + logger.debug(f"Extracted {len(detected_faces)} faces from video") + return detected_faces diff --git a/src/gesturedetection/.DS_Store b/src/gesturedetection/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..13fd9fe26d43b51d5e3ae0768ce71a884f25c126 Binary files /dev/null and b/src/gesturedetection/.DS_Store differ diff --git a/src/gesturedetection/__init__.py b/src/gesturedetection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dda804136f2c0a76603c89f6261dd3e693bf8cea --- /dev/null +++ b/src/gesturedetection/__init__.py @@ -0,0 +1,23 @@ +# Gesture detection package +from .api import app +from .models import Gesture, GestureResponse, GESTURE_MAPPING, FULL_GESTURE_MAPPING +from .main_controller import MainController +from .onnx_models import HandDetection, HandClassification +from .utils import Deque, Drawer, Hand, Event, HandPosition, targets + +__all__ = [ + "app", + "Gesture", + "GestureResponse", + "GESTURE_MAPPING", + "FULL_GESTURE_MAPPING", + "MainController", + "HandDetection", + "HandClassification", + "Deque", + "Drawer", + "Hand", + "Event", + "HandPosition", + "targets" +] diff --git a/src/gesturedetection/__pycache__/__init__.cpython-312.pyc b/src/gesturedetection/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8a84ac2410feb9105efc43ddf9fde81aa1a0fc8 Binary files /dev/null and b/src/gesturedetection/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/gesturedetection/__pycache__/api.cpython-312.pyc b/src/gesturedetection/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..655ada37bfe97d91680102ebcaf70abea23a1e72 Binary files /dev/null and b/src/gesturedetection/__pycache__/api.cpython-312.pyc differ diff --git a/src/gesturedetection/__pycache__/config.cpython-312.pyc b/src/gesturedetection/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd4abbfbe101fbdf6e9c09cb2ddb4e5b6a54792a Binary files /dev/null and b/src/gesturedetection/__pycache__/config.cpython-312.pyc differ diff --git a/src/gesturedetection/__pycache__/main_controller.cpython-312.pyc b/src/gesturedetection/__pycache__/main_controller.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da594f38241189587d188be346f2908d8785a794 Binary files /dev/null and b/src/gesturedetection/__pycache__/main_controller.cpython-312.pyc differ diff --git a/src/gesturedetection/__pycache__/models.cpython-312.pyc b/src/gesturedetection/__pycache__/models.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e2e74f953f6126a3bd8e281f7f2db1320e472a0 Binary files /dev/null and b/src/gesturedetection/__pycache__/models.cpython-312.pyc differ diff --git a/src/gesturedetection/__pycache__/onnx_models.cpython-312.pyc b/src/gesturedetection/__pycache__/onnx_models.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc8c36c55c1ef2f04ecc33d08c7b749b10967a97 Binary files /dev/null and b/src/gesturedetection/__pycache__/onnx_models.cpython-312.pyc differ diff --git a/src/gesturedetection/api.py b/src/gesturedetection/api.py new file mode 100644 index 0000000000000000000000000000000000000000..00e8ea47ef494d21120eb4d0f244c73a9d7cd149 --- /dev/null +++ b/src/gesturedetection/api.py @@ -0,0 +1,318 @@ +import cv2 +import numpy as np +import tempfile +import os +from collections import defaultdict +from typing import Dict, List, Tuple, Optional +from fastapi import FastAPI, UploadFile, File, HTTPException, Form +from fastapi.responses import ORJSONResponse +from fastapi.encoders import jsonable_encoder + +from .models import Gesture, GestureResponse, GESTURE_MAPPING, FULL_GESTURE_MAPPING +from .config import get_logfire_token, is_monitoring_enabled + +# Import the gesture detection components +from .main_controller import MainController + +# Configure logfire monitoring if token is available +logfire = None +if is_monitoring_enabled(): + try: + import logfire + logfire.configure(token=get_logfire_token()) + logfire.instrument_fastapi = logfire.instrument_fastapi + except ImportError: + logfire = None + +app = FastAPI(default_response_class=ORJSONResponse) + +# Instrument FastAPI with logfire if monitoring is enabled +if logfire is not None: + logfire.instrument_fastapi(app, capture_headers=True) + + +def process_video_for_gestures(video_path: str, detector_path: str = "models/hand_detector.onnx", + classifier_path: str = "models/crops_classifier.onnx", + frame_skip: int = 1) -> List[Gesture]: + """ + Process a video file to detect gestures using the MainController. + + Parameters + ---------- + video_path : str + Path to the video file to process + detector_path : str + Path to the hand detection ONNX model + classifier_path : str + Path to the gesture classification ONNX model + frame_skip : int + Number of frames to skip between processing (1 = process every frame, 3 = process every 3rd frame) + + Returns + ------- + List[Gesture] + List of detected gestures with duration and confidence + """ + # Create monitoring span for video processing + span_context = None + if logfire is not None: + span_context = logfire.span('process_video_for_gestures', + video_path=video_path, + detector_path=detector_path, + classifier_path=classifier_path) + span_context.__enter__() + + try: + # Initialize the main controller + if logfire is not None: + with logfire.span('initialize_controller'): + controller = MainController(detector_path, classifier_path) + else: + controller = MainController(detector_path, classifier_path) + + # Open video file + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file: {video_path}") + + # Get video properties for monitoring + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + fps = cap.get(cv2.CAP_PROP_FPS) + + if logfire is not None: + logfire.info('Video properties', + total_frames=total_frames, + fps=fps, + duration_seconds=total_frames/fps if fps > 0 else 0) + + # Track gestures per hand ID + gesture_tracks: Dict[int, List[Tuple[int, float]]] = defaultdict(list) # {hand_id: [(gesture_id, confidence), ...]} + frame_count = 0 + processed_frames = 0 + detection_stats = { + 'frames_with_detections': 0, + 'total_detections': 0, + 'gesture_counts': defaultdict(int) + } + + try: + while True: + ret, frame = cap.read() + if not ret: + break + + # Skip frames based on frame_skip parameter + if frame_count % frame_skip == 0: + # Process frame through the controller + bboxes, ids, labels = controller(frame) + processed_frames += 1 + + if bboxes is not None and ids is not None and labels is not None: + detection_stats['frames_with_detections'] += 1 + detection_stats['total_detections'] += len(bboxes) + + # Track gestures for each detected hand + for i in range(len(bboxes)): + hand_id = int(ids[i]) + gesture_id = labels[i] + + if gesture_id is not None: + # Get confidence from bbox (assuming it's the last element) + confidence = 0.8 # Default confidence, could be extracted from bbox if available + gesture_tracks[hand_id].append((gesture_id, confidence)) + detection_stats['gesture_counts'][gesture_id] += 1 + + # Log individual detections for debugging + if logfire is not None: + gesture_name = FULL_GESTURE_MAPPING.get(gesture_id, f"unknown_{gesture_id}") + logfire.debug('Hand detection', + frame=frame_count, + hand_id=hand_id, + gesture_id=gesture_id, + gesture_name=gesture_name, + confidence=confidence, + bbox=bboxes[i].tolist() if len(bboxes[i]) >= 4 else None) + else: + # Advance tracker on skipped frames to keep state consistent + controller.update(np.empty((0, 5)), None) + + frame_count += 1 + + # Log progress every 100 frames + if frame_count % 100 == 0 and logfire is not None: + progress = (frame_count / total_frames) * 100 if total_frames > 0 else 0 + logfire.info('Processing progress', + frame=frame_count, + total_frames=total_frames, + progress_percent=round(progress, 2)) + + finally: + cap.release() + + # Log final detection statistics + if logfire is not None: + logfire.info('Detection statistics', + total_frames=frame_count, + processed_frames=processed_frames, + frame_skip=frame_skip, + frames_with_detections=detection_stats['frames_with_detections'], + total_detections=detection_stats['total_detections'], + detection_rate=detection_stats['frames_with_detections']/processed_frames if processed_frames > 0 else 0, + gesture_counts=dict(detection_stats['gesture_counts'])) + + # Process gesture tracks to find continuous gestures + detected_gestures = [] + + for hand_id, gesture_sequence in gesture_tracks.items(): + if not gesture_sequence: + continue + + # Group consecutive identical gestures + current_gesture = None + current_duration = 0 + current_confidence = 0.0 + + for gesture_id, confidence in gesture_sequence: + if current_gesture is None or current_gesture != gesture_id: + # Save previous gesture if it was significant + # Adjust minimum duration based on frame skip + min_duration = max(5, frame_skip * 2) # At least 2 processed frames + if current_gesture is not None and current_duration >= min_duration: + gesture_name = FULL_GESTURE_MAPPING.get(current_gesture, f"unknown_{current_gesture}") + avg_confidence = current_confidence / current_duration if current_duration > 0 else 0.0 + # Scale duration back to original frame count + scaled_duration = current_duration * frame_skip + detected_gestures.append(Gesture( + gesture=gesture_name, + duration=scaled_duration, + confidence=avg_confidence + )) + + # Log significant gesture detection + if logfire is not None: + logfire.info('Significant gesture detected', + hand_id=hand_id, + gesture=gesture_name, + duration_frames=current_duration, + confidence=avg_confidence) + + # Start new gesture + current_gesture = gesture_id + current_duration = 1 + current_confidence = confidence + else: + # Continue current gesture + current_duration += 1 + current_confidence += confidence + + # Don't forget the last gesture + min_duration = max(5, frame_skip * 2) # At least 2 processed frames + if current_gesture is not None and current_duration >= min_duration: + gesture_name = FULL_GESTURE_MAPPING.get(current_gesture, f"unknown_{current_gesture}") + avg_confidence = current_confidence / current_duration if current_duration > 0 else 0.0 + # Scale duration back to original frame count + scaled_duration = current_duration * frame_skip + detected_gestures.append(Gesture( + gesture=gesture_name, + duration=scaled_duration, + confidence=avg_confidence + )) + + # Log final gesture detection + if logfire is not None: + logfire.info('Final gesture detected', + hand_id=hand_id, + gesture=gesture_name, + duration_frames=current_duration, + confidence=avg_confidence) + + # Log final results + if logfire is not None: + logfire.info('Video processing completed', + total_gestures_detected=len(detected_gestures), + unique_hands=len(gesture_tracks), + gestures=[{'gesture': g.gesture, 'duration': g.duration, 'confidence': g.confidence} for g in detected_gestures]) + + return detected_gestures + + finally: + if span_context is not None: + span_context.__exit__(None, None, None) + + +@app.get("/health") +async def health(): + """Health check endpoint.""" + if logfire is not None: + logfire.info('Health check requested') + return {"message": "OK"} + + +@app.post("/gestures", response_model=GestureResponse) +async def detect_gestures(video: UploadFile = File(...), frame_skip: int = Form(1)): + """ + Detect gestures in an uploaded video file. + + Parameters + ---------- + video : UploadFile + The video file to process + frame_skip : int + Number of frames to skip between processing (1 = process every frame, 3 = process every 3rd frame) + + Returns + ------- + GestureResponse + Response containing detected gestures with duration and confidence + """ + # Log request details + if logfire is not None: + logfire.info('Gesture detection request received', + filename=video.filename, + content_type=video.content_type, + content_length=video.size if hasattr(video, 'size') else 'unknown') + + # Validate file type + if not video.content_type.startswith('video/'): + if logfire is not None: + logfire.warning('Invalid file type received', content_type=video.content_type) + raise HTTPException(status_code=400, detail="File must be a video") + + # Create temporary file to save uploaded video + with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_file: + try: + # Write uploaded content to temporary file + content = await video.read() + temp_file.write(content) + temp_file.flush() + + if logfire is not None: + logfire.info('Video file saved for processing', + temp_file=temp_file.name, + file_size_bytes=len(content)) + + # Process the video with frame skip parameter + gestures = process_video_for_gestures(temp_file.name, frame_skip=frame_skip) + + if logfire is not None: + logfire.info('Gesture detection completed successfully', + total_gestures=len(gestures), + gestures=[g.gesture for g in gestures]) + + return GestureResponse(gestures=gestures) + + except Exception as e: + if logfire is not None: + logfire.error('Error processing video', + error=str(e), + error_type=type(e).__name__, + temp_file=temp_file.name) + raise HTTPException(status_code=500, detail=f"Error processing video: {str(e)}") + + finally: + # Clean up temporary file + if os.path.exists(temp_file.name): + os.unlink(temp_file.name) + if logfire is not None: + logfire.debug('Temporary file cleaned up', temp_file=temp_file.name) + diff --git a/src/gesturedetection/config.py b/src/gesturedetection/config.py new file mode 100644 index 0000000000000000000000000000000000000000..861e56937717960c05a94d0f6bfbafad2a3f322a --- /dev/null +++ b/src/gesturedetection/config.py @@ -0,0 +1,55 @@ +""" +Configuration module for gesture detection system. +Handles environment variables and logfire token configuration. +""" + +import os +from pathlib import Path +from typing import Optional + + +def get_logfire_token() -> Optional[str]: + """ + Get the logfire token from environment variables or local configuration. + + Priority order: + 1. LOGFIRE_TOKEN environment variable (for production/deployment) + 2. .env file in project root (for local development) + 3. None (monitoring disabled) + + Returns + ------- + Optional[str] + The logfire token if found, None otherwise + """ + # First check environment variable (for production) + token = os.getenv("LOGFIRE_TOKEN") + if token: + return token + + # Check for .env file in project root (for local development) + env_file = Path(__file__).parent.parent.parent / ".env" + if env_file.exists(): + try: + with open(env_file, "r") as f: + for line in f: + line = line.strip() + if line.startswith("LOGFIRE_TOKEN="): + return line.split("=", 1)[1].strip('"\'') + except Exception: + # If we can't read the .env file, continue without token + pass + + return None + + +def is_monitoring_enabled() -> bool: + """ + Check if monitoring is enabled by checking if we have a logfire token. + + Returns + ------- + bool + True if monitoring is enabled, False otherwise + """ + return get_logfire_token() is not None diff --git a/src/gesturedetection/main_controller.py b/src/gesturedetection/main_controller.py new file mode 100644 index 0000000000000000000000000000000000000000..8c99f5536643438c7d013fd6399d682e605307e4 --- /dev/null +++ b/src/gesturedetection/main_controller.py @@ -0,0 +1,271 @@ +import numpy as np + +from .ocsort import ( + KalmanBoxTracker, + associate, + ciou_batch, + ct_dist, + diou_batch, + giou_batch, + iou_batch, + linear_assignment, +) +from .onnx_models import HandClassification, HandDetection +from .utils import Deque, Drawer, Hand +from .config import is_monitoring_enabled + +# Configure logfire monitoring if available +logfire = None +if is_monitoring_enabled(): + try: + import logfire + except ImportError: + logfire = None + +ASSO_FUNCS = {"iou": iou_batch, "giou": giou_batch, "ciou": ciou_batch, "diou": diou_batch, "ct_dist": ct_dist} + + +def k_previous_obs(observations, cur_age, k): + if len(observations) == 0: + return [-1, -1, -1, -1, -1] + for i in range(k): + dt = k - i + if cur_age - dt in observations: + return observations[cur_age - dt] + max_age = max(observations.keys()) + return observations[max_age] + + +class MainController: + """ + Main tracking function. + Class contains a list of tracks, each track contains a KalmanBoxTracker object and a Deque object with Hand objects. + """ + + def __init__( + self, detection_model, classification_model, max_age=30, min_hits=3, iou_threshold=0.3, maxlen=30, min_frames=20 + ): + """ + Parameters + ---------- + detection_model : str + Path to detection model. + classification_model : str + Path to classification model. + max_age : int + Maximum age of track. + min_hits : int + Minimum number of hits to confirm track. + iou_threshold : float + IOU threshold for track association. + maxlen : int + Maximum length of deque in track. + min_frames : int + Minimum number of frames to confirm track. + """ + self.maxlen = maxlen + self.min_frames = min_frames + self.max_age = max_age + self.min_hits = min_hits + self.delta_t = 3 + self.iou_threshold = iou_threshold + self.inertia = 0.2 + self.asso_func = ASSO_FUNCS["giou"] + self.tracks = [] + self.frame_count = 0 + self.detection_model = HandDetection(detection_model) + self.classification_model = HandClassification(classification_model) + self.drawer = Drawer() + + def update(self, dets=np.empty((0, 5)), labels=None): + """ + Parameters + ---------- + dets : np.array + Bounding boxes with shape [[x1,y1,x2,y2,score],[x1,y1,x2,y2,score],...] . + Requires: this method must be called once for each frame even with empty detections (use np.empty((0, 5)) for frames without detections). + labels : np.array + Labels with shape (N, 1) where N is number of bounding boxes. + + Returns + ------- + np.array + Returns the similar array, where the last column is the object ID. + + Notes + ----- + The number of objects returned may differ from the number of detections provided. + + """ + # Advance frame count on every call to keep tracker state in sync with real time. + # This method is required to be called once per frame (even if there are no detections), + # so we must advance the internal Kalman filters and aging logic on empty frames as well. + self.frame_count += 1 + + # Get predicted locations from existing trackers for this frame. + # This advances age/time_since_update and is required also when there are no detections, + # ensuring tracks can age out (max_age) and do not persist indefinitely across gaps. + trks = np.zeros((len(self.tracks), 5)) + to_del = [] + ret = [] + lbs = [] + for t, trk in enumerate(trks): + pos = self.tracks[t]["tracker"].predict()[0] + trk[:] = [pos[0], pos[1], pos[2], pos[3], 0] + if np.any(np.isnan(pos)): + to_del.append(t) + trks = np.ma.compress_rows(np.ma.masked_invalid(trks)) + for t in reversed(to_del): + self.tracks.pop(t) + + velocities = np.array( + [ + trk["tracker"].velocity if trk["tracker"].velocity is not None else np.array((0, 0)) + for trk in self.tracks + ] + ) + last_boxes = np.array([trk["tracker"].last_observation for trk in self.tracks]) + k_observations = np.array( + [k_previous_obs(trk["tracker"].observations, trk["tracker"].age, self.delta_t) for trk in self.tracks] + ) + + """ + First round of association + """ + matched, unmatched_dets, unmatched_trks = associate( + dets, trks, self.iou_threshold, velocities, k_observations, self.inertia + ) + + for m in matched: + self.tracks[m[1]]["tracker"].update(dets[m[0], :]) + self.tracks[m[1]]["hands"].append(Hand(bbox=dets[m[0], :4], gesture=labels[m[0]])) + + """ + Second round of associaton by OCR + """ + if unmatched_dets.shape[0] > 0 and unmatched_trks.shape[0] > 0: + left_dets = dets[unmatched_dets] + left_trks = last_boxes[unmatched_trks] + iou_left = self.asso_func(left_dets, left_trks) + iou_left = np.array(iou_left) + if iou_left.max() > self.iou_threshold: + """ + NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may + get a higher performance especially on MOT17/MOT20 datasets. But we keep it + uniform here for simplicity + """ + rematched_indices = linear_assignment(-iou_left) + to_remove_det_indices = [] + to_remove_trk_indices = [] + for m in rematched_indices: + det_ind, trk_ind = unmatched_dets[m[0]], unmatched_trks[m[1]] + if iou_left[m[0], m[1]] < self.iou_threshold: + continue + self.tracks[trk_ind]["tracker"].update(dets[det_ind, :]) + self.tracks[trk_ind]["hands"].append(Hand(bbox=dets[det_ind, :4], gesture=labels[det_ind])) + to_remove_det_indices.append(det_ind) + to_remove_trk_indices.append(trk_ind) + unmatched_dets = np.setdiff1d(unmatched_dets, np.array(to_remove_det_indices)) + unmatched_trks = np.setdiff1d(unmatched_trks, np.array(to_remove_trk_indices)) + + # For unmatched trackers (including the case with no detections), + # update with None to keep the filter consistent and append a dummy Hand. + for m in unmatched_trks: + self.tracks[m]["tracker"].update(None) + self.tracks[m]["hands"].append(Hand(bbox=None, gesture=None)) + + # create and initialise new trackers for unmatched detections + for i in unmatched_dets: + self.tracks.append( + { + "hands": Deque(self.maxlen, self.min_frames), + "tracker": KalmanBoxTracker(dets[i, :], delta_t=self.delta_t), + } + ) + i = len(self.tracks) + for trk in reversed(self.tracks): + if trk["tracker"].last_observation.sum() < 0: + d = trk["tracker"].get_state()[0] + else: + """ + this is optional to use the recent observation or the kalman filter prediction, + we didn't notice significant difference here + """ + d = trk["tracker"].last_observation[:4] + if (trk["tracker"].time_since_update < 1) and ( + trk["tracker"].hit_streak >= self.min_hits or self.frame_count <= self.min_hits + ): + # +1 as MOT benchmark requires positive + ret.append(np.concatenate((d, [trk["tracker"].id + 1])).reshape(1, -1)) + if len(trk["hands"]) > 0: + lbs.append(trk["hands"][-1].gesture) + else: + lbs.append(None) + + i -= 1 + # remove dead tracklet + if trk["tracker"].time_since_update > self.max_age: + self.tracks.pop(i) + if len(ret) > 0: + return np.concatenate(ret), lbs + return np.empty((0, 5)), np.empty((0, 1)) + + def __call__(self, frame): + """ + Parameters + ---------- + frame : np.array + Image frame with shape (H, W, 3). + + Returns + ------- + list of np.array + + + """ + # Log frame processing if monitoring is enabled + if logfire is not None: + with logfire.span('frame_processing', frame_shape=frame.shape): + bboxes, probs = self.detection_model(frame) + + if len(bboxes): + detection_scores = np.asarray(probs).tolist() + logfire.debug( + 'Hand detections found', + num_detections=len(bboxes), + detection_scores=detection_scores, + ) + + labels = self.classification_model(frame, bboxes) + bboxes = np.concatenate((bboxes, np.expand_dims(probs, axis=1)), axis=1) + new_bboxes, labels = self.update(dets=bboxes, labels=labels) + + # Log classification results + if labels is not None and len(labels) > 0: + labels_list = np.asarray(labels).tolist() + gesture_names = [ + f"gesture_{label}" if label is not None else "none" + for label in labels_list + ] + logfire.debug( + 'Gesture classifications', + labels=labels_list, + gesture_names=gesture_names, + ) + + return new_bboxes[:, :-1], new_bboxes[:, -1], labels + else: + logfire.debug('No hand detections in frame') + self.update(np.empty((0, 5)), None) + return None, None, None + else: + # Original logic without monitoring + bboxes, probs = self.detection_model(frame) + if len(bboxes): + labels = self.classification_model(frame, bboxes) + bboxes = np.concatenate((bboxes, np.expand_dims(probs, axis=1)), axis=1) + new_bboxes, labels = self.update(dets=bboxes, labels=labels) + return new_bboxes[:, :-1], new_bboxes[:, -1], labels + else: + self.update(np.empty((0, 5)), None) + return None, None, None diff --git a/src/gesturedetection/models.py b/src/gesturedetection/models.py new file mode 100644 index 0000000000000000000000000000000000000000..0c424e1af00bf97ff67efcfa3bb52d0581d79c41 --- /dev/null +++ b/src/gesturedetection/models.py @@ -0,0 +1,89 @@ +from pydantic import BaseModel +from typing import List, Optional + + +class Gesture(BaseModel): + """Represents a detected gesture with metadata.""" + gesture: str + duration: int # Duration in frames + confidence: float + + +class GestureResponse(BaseModel): + """Response model containing a list of detected gestures.""" + gestures: List[Gesture] + + +# Primary gesture mappings for the main gestures + additional ones +GESTURE_MAPPING = { + # Original 5 main gestures + 27: "thumbs_up", # like + 31: "palm", # open palm wave (5 fingers) + 32: "peace", # peace sign (2 fingers) + 29: "ok", # OK sign + 20: "call", # call me (little finger) + + # Finger counting (1-5) + 30: "one", # 1 finger + 39: "two_up", # 2 fingers (peace sign) + 37: "three", # 3 fingers + 26: "four", # 4 fingers + # Note: 5 fingers is same as palm (31) + + # Surprise gesture + 23: "middle_finger", # middle finger (surprise!) + + # Additional useful gestures + 25: "fist", # closed fist + 19: "point", # pointing with index finger + 35: "stop", # stop gesture +} + +# Additional gesture mappings for completeness +FULL_GESTURE_MAPPING = { + 0: "hand_down", + 1: "hand_right", + 2: "hand_left", + 3: "thumb_index", + 4: "thumb_left", + 5: "thumb_right", + 6: "thumb_down", + 7: "half_up", + 8: "half_left", + 9: "half_right", + 10: "half_down", + 11: "part_hand_heart", + 12: "part_hand_heart2", + 13: "fist_inverted", + 14: "two_left", + 15: "two_right", + 16: "two_down", + 17: "grabbing", + 18: "grip", + 19: "point", + 20: "call", + 21: "three3", + 22: "little_finger", + 23: "middle_finger", + 24: "dislike", + 25: "fist", + 26: "four", + 27: "like", + 28: "mute", + 29: "ok", + 30: "one", + 31: "palm", + 32: "peace", + 33: "peace_inverted", + 34: "rock", + 35: "stop", + 36: "stop_inverted", + 37: "three", + 38: "three2", + 39: "two_up", + 40: "two_up_inverted", + 41: "three_gun", + 42: "one_left", + 43: "one_right", + 44: "one_down" +} diff --git a/src/gesturedetection/ocsort/__init__.py b/src/gesturedetection/ocsort/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d6818bda55063bff7ad660e1c79c7db138ebee9a --- /dev/null +++ b/src/gesturedetection/ocsort/__init__.py @@ -0,0 +1,2 @@ +from .association import associate, ciou_batch, ct_dist, diou_batch, giou_batch, iou_batch, linear_assignment +from .kalmanboxtracker import KalmanBoxTracker diff --git a/src/gesturedetection/ocsort/__pycache__/__init__.cpython-312.pyc b/src/gesturedetection/ocsort/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82acc6965715abda5077f4dca146f01ea60882d6 Binary files /dev/null and b/src/gesturedetection/ocsort/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/gesturedetection/ocsort/__pycache__/__init__.cpython-39.pyc b/src/gesturedetection/ocsort/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d4a3b99ba3201788215d3af5362ecc937345c8d Binary files /dev/null and b/src/gesturedetection/ocsort/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/gesturedetection/ocsort/__pycache__/association.cpython-312.pyc b/src/gesturedetection/ocsort/__pycache__/association.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cfc5b7b8a5c4ca9ff2e1569320839df7f46c31e Binary files /dev/null and b/src/gesturedetection/ocsort/__pycache__/association.cpython-312.pyc differ diff --git a/src/gesturedetection/ocsort/__pycache__/association.cpython-39.pyc b/src/gesturedetection/ocsort/__pycache__/association.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81737371d573ddfc20ac555dd796e82f95ffdb63 Binary files /dev/null and b/src/gesturedetection/ocsort/__pycache__/association.cpython-39.pyc differ diff --git a/src/gesturedetection/ocsort/__pycache__/kalmanboxtracker.cpython-312.pyc b/src/gesturedetection/ocsort/__pycache__/kalmanboxtracker.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bdeec2ab74609430f4d67ce4440bca890602b735 Binary files /dev/null and b/src/gesturedetection/ocsort/__pycache__/kalmanboxtracker.cpython-312.pyc differ diff --git a/src/gesturedetection/ocsort/__pycache__/kalmanboxtracker.cpython-39.pyc b/src/gesturedetection/ocsort/__pycache__/kalmanboxtracker.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19d8a6f4cf2ee454ce1531dcf6955ad24117a432 Binary files /dev/null and b/src/gesturedetection/ocsort/__pycache__/kalmanboxtracker.cpython-39.pyc differ diff --git a/src/gesturedetection/ocsort/__pycache__/kalmanfilter.cpython-312.pyc b/src/gesturedetection/ocsort/__pycache__/kalmanfilter.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3edf2924c861bbc72781d4c9eb90bb2302667bda Binary files /dev/null and b/src/gesturedetection/ocsort/__pycache__/kalmanfilter.cpython-312.pyc differ diff --git a/src/gesturedetection/ocsort/__pycache__/kalmanfilter.cpython-39.pyc b/src/gesturedetection/ocsort/__pycache__/kalmanfilter.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9f7f17a9a81e56ef86f107a316cfd6356320a39 Binary files /dev/null and b/src/gesturedetection/ocsort/__pycache__/kalmanfilter.cpython-39.pyc differ diff --git a/src/gesturedetection/ocsort/association.py b/src/gesturedetection/ocsort/association.py new file mode 100644 index 0000000000000000000000000000000000000000..62e378adde8b4061fb5a46553d92681b336a2005 --- /dev/null +++ b/src/gesturedetection/ocsort/association.py @@ -0,0 +1,511 @@ +import numpy as np + + +def iou_batch(bboxes1, bboxes2): + """ + Calculate the Intersection of Unions (IoUs) between bounding boxes. + Parameters + ---------- + bboxes1: numpy.ndarray + shape is [N, 4] + bboxes2: numpy.ndarray + shape is [M, 4] + + Returns + ------- + ious: numpy.ndarray + shape is [N, M] + """ + bboxes2 = np.expand_dims(bboxes2, 0) + bboxes1 = np.expand_dims(bboxes1, 1) + + xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0]) + yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1]) + xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2]) + yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3]) + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + wh = w * h + o = wh / ( + (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1]) + + (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1]) + - wh + ) + return o + + +def giou_batch(bboxes1, bboxes2): + """ + Calculate the Generalized Intersection over Union (GIoUs) between bounding boxes. + Parameters + ---------- + bboxes1: numpy.ndarray + shape is [N, 4] + bboxes2: numpy.ndarray + shape is [M, 4] + + Returns + ------- + gious: numpy.ndarray + shape is [N, M] + """ + # for details should go to https://arxiv.org/pdf/1902.09630.pdf + # ensure predict's bbox form + bboxes2 = np.expand_dims(bboxes2, 0) + bboxes1 = np.expand_dims(bboxes1, 1) + + xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0]) + yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1]) + xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2]) + yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3]) + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + wh = w * h + union = ( + (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1]) + + (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1]) + - wh + ) + iou = wh / union + + xxc1 = np.minimum(bboxes1[..., 0], bboxes2[..., 0]) + yyc1 = np.minimum(bboxes1[..., 1], bboxes2[..., 1]) + xxc2 = np.maximum(bboxes1[..., 2], bboxes2[..., 2]) + yyc2 = np.maximum(bboxes1[..., 3], bboxes2[..., 3]) + wc = xxc2 - xxc1 + hc = yyc2 - yyc1 + assert (wc > 0).all() and (hc > 0).all() + area_enclose = wc * hc + giou = iou - (area_enclose - union) / area_enclose + giou = (giou + 1.0) / 2.0 # resize from (-1,1) to (0,1) + return giou + + +def diou_batch(bboxes1, bboxes2): + """ + Calculate the Distance Intersection over Union (DIoUs) between bounding boxes. + Parameters + ---------- + bboxes1: numpy.ndarray + shape is [N, 4] + + bboxes2: numpy.ndarray + shape is [M, 4] + + Returns + ------- + dious: numpy.ndarray + """ + # for details should go to https://arxiv.org/pdf/1902.09630.pdf + # ensure predict's bbox form + bboxes2 = np.expand_dims(bboxes2, 0) + bboxes1 = np.expand_dims(bboxes1, 1) + + # calculate the intersection box + xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0]) + yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1]) + xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2]) + yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3]) + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + wh = w * h + union = ( + (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1]) + + (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1]) + - wh + ) + iou = wh / union + centerx1 = (bboxes1[..., 0] + bboxes1[..., 2]) / 2.0 + centery1 = (bboxes1[..., 1] + bboxes1[..., 3]) / 2.0 + centerx2 = (bboxes2[..., 0] + bboxes2[..., 2]) / 2.0 + centery2 = (bboxes2[..., 1] + bboxes2[..., 3]) / 2.0 + + inner_diag = (centerx1 - centerx2) ** 2 + (centery1 - centery2) ** 2 + + xxc1 = np.minimum(bboxes1[..., 0], bboxes2[..., 0]) + yyc1 = np.minimum(bboxes1[..., 1], bboxes2[..., 1]) + xxc2 = np.maximum(bboxes1[..., 2], bboxes2[..., 2]) + yyc2 = np.maximum(bboxes1[..., 3], bboxes2[..., 3]) + + outer_diag = (xxc2 - xxc1) ** 2 + (yyc2 - yyc1) ** 2 + diou = iou - inner_diag / outer_diag + + return (diou + 1) / 2.0 # resize from (-1,1) to (0,1) + + +def ciou_batch(bboxes1, bboxes2): + """ + Calculate the Complete Intersection over Union (CIoUs) between bounding boxes. + Parameters + ---------- + bboxes1: numpy.ndarray + shape is [N, 4] + + bboxes2: numpy.ndarray + shape is [M, 4] + + Returns + ------- + ciou: numpy.ndarray + """ + # for details should go to https://arxiv.org/pdf/1902.09630.pdf + # ensure predict's bbox form + bboxes2 = np.expand_dims(bboxes2, 0) + bboxes1 = np.expand_dims(bboxes1, 1) + + # calculate the intersection box + xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0]) + yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1]) + xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2]) + yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3]) + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + wh = w * h + union = ( + (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1]) + + (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1]) + - wh + ) + iou = wh / union + + centerx1 = (bboxes1[..., 0] + bboxes1[..., 2]) / 2.0 + centery1 = (bboxes1[..., 1] + bboxes1[..., 3]) / 2.0 + centerx2 = (bboxes2[..., 0] + bboxes2[..., 2]) / 2.0 + centery2 = (bboxes2[..., 1] + bboxes2[..., 3]) / 2.0 + + inner_diag = (centerx1 - centerx2) ** 2 + (centery1 - centery2) ** 2 + + xxc1 = np.minimum(bboxes1[..., 0], bboxes2[..., 0]) + yyc1 = np.minimum(bboxes1[..., 1], bboxes2[..., 1]) + xxc2 = np.maximum(bboxes1[..., 2], bboxes2[..., 2]) + yyc2 = np.maximum(bboxes1[..., 3], bboxes2[..., 3]) + + outer_diag = (xxc2 - xxc1) ** 2 + (yyc2 - yyc1) ** 2 + + w1 = bboxes1[..., 2] - bboxes1[..., 0] + h1 = bboxes1[..., 3] - bboxes1[..., 1] + w2 = bboxes2[..., 2] - bboxes2[..., 0] + h2 = bboxes2[..., 3] - bboxes2[..., 1] + + # prevent dividing over zero. add one pixel shift + h2 = h2 + 1.0 + h1 = h1 + 1.0 + arctan = np.arctan(w2 / h2) - np.arctan(w1 / h1) + v = (4 / (np.pi**2)) * (arctan**2) + S = 1 - iou + alpha = v / (S + v) + ciou = iou - inner_diag / outer_diag - alpha * v + + return (ciou + 1) / 2.0 # resize from (-1,1) to (0,1) + + +def ct_dist(bboxes1, bboxes2): + """ + Measure the center distance between two sets of bounding boxes, + this is a coarse implementation, we don't recommend using it only + for association, which can be unstable and sensitive to frame rate + and object speed. + Parameters + ---------- + bboxes1: numpy.ndarray + shape is [N, 4] + + bboxes2: numpy.ndarray + shape is [M, 4] + + Returns + ------- + ct_dist: numpy.ndarray + """ + bboxes2 = np.expand_dims(bboxes2, 0) + bboxes1 = np.expand_dims(bboxes1, 1) + + centerx1 = (bboxes1[..., 0] + bboxes1[..., 2]) / 2.0 + centery1 = (bboxes1[..., 1] + bboxes1[..., 3]) / 2.0 + centerx2 = (bboxes2[..., 0] + bboxes2[..., 2]) / 2.0 + centery2 = (bboxes2[..., 1] + bboxes2[..., 3]) / 2.0 + + ct_dist2 = (centerx1 - centerx2) ** 2 + (centery1 - centery2) ** 2 + + ct_dist = np.sqrt(ct_dist2) + + # The linear rescaling is a naive version and needs more study + ct_dist = ct_dist / ct_dist.max() + return ct_dist.max() - ct_dist # resize to (0,1) + + +def speed_direction_batch(dets, tracks): + """ + Calculate the speed and direction between detections and tracks. + Parameters + ---------- + dets: numpy.ndarray + shape is [N, 4] + + tracks: numpy.ndarray + shape is [M, 4] + + Returns + ------- + dy: numpy.ndarray + dx: numpy.ndarray + + """ + tracks = tracks[..., np.newaxis] + CX1, CY1 = (dets[:, 0] + dets[:, 2]) / 2.0, (dets[:, 1] + dets[:, 3]) / 2.0 + CX2, CY2 = (tracks[:, 0] + tracks[:, 2]) / 2.0, (tracks[:, 1] + tracks[:, 3]) / 2.0 + dx = CX1 - CX2 + dy = CY1 - CY2 + norm = np.sqrt(dx**2 + dy**2) + 1e-6 + dx = dx / norm + dy = dy / norm + return dy, dx # size: num_track x num_det + + +def linear_assignment(cost_matrix): + """ + Solve the linear assignment problem using scipy.optimize.linear_sum_assignment. + Parameters + ---------- + cost_matrix: numpy.ndarray + shape is [N, M] + + Returns + ------- + indices: numpy.ndarray + shape is [N, 2] + """ + try: + import lap + + _, x, y = lap.lapjv(cost_matrix, extend_cost=True) + return np.array([[y[i], i] for i in x if i >= 0]) # + except ImportError: + from scipy.optimize import linear_sum_assignment + + x, y = linear_sum_assignment(cost_matrix) + return np.array(list(zip(x, y))) + + +def associate_detections_to_trackers(detections, trackers, iou_threshold=0.3): + """ + Assigns detections to tracked object (both represented as bounding boxes) + Returns 3 lists of matches, unmatched_detections and unmatched_trackers + Parameters + ---------- + + detections: numpy.ndarray + shape is [N, 4] + + trackers: numpy.ndarray + shape is [M, 4] + + iou_threshold: float + in [0, 1]. Default is 0.3 + """ + if len(trackers) == 0: + return np.empty((0, 2), dtype=int), np.arange(len(detections)), np.empty((0, 5), dtype=int) + + iou_matrix = iou_batch(detections, trackers) + + if min(iou_matrix.shape) > 0: + a = (iou_matrix > iou_threshold).astype(np.int32) + if a.sum(1).max() == 1 and a.sum(0).max() == 1: + matched_indices = np.stack(np.where(a), axis=1) + else: + matched_indices = linear_assignment(-iou_matrix) + else: + matched_indices = np.empty(shape=(0, 2)) + + unmatched_detections = [] + for d, det in enumerate(detections): + if d not in matched_indices[:, 0]: + unmatched_detections.append(d) + unmatched_trackers = [] + for t, trk in enumerate(trackers): + if t not in matched_indices[:, 1]: + unmatched_trackers.append(t) + + # filter out matched with low IOU + matches = [] + for m in matched_indices: + if iou_matrix[m[0], m[1]] < iou_threshold: + unmatched_detections.append(m[0]) + unmatched_trackers.append(m[1]) + else: + matches.append(m.reshape(1, 2)) + if len(matches) == 0: + matches = np.empty((0, 2), dtype=int) + else: + matches = np.concatenate(matches, axis=0) + + return matches, np.array(unmatched_detections), np.array(unmatched_trackers) + + +def associate(detections, trackers, iou_threshold, velocities, previous_obs, vdc_weight): + """ + Assigns detections to tracked object (both represented as bounding boxes) + Returns 3 lists of matches, unmatched_detections and unmatched_trackers + Parameters + ---------- + detections: numpy.ndarray + shape is [N, 4] + trackers: numpy.ndarray + shape is [M, 4] + iou_threshold: float + in [0, 1]. Default is 0.3 + velocities: numpy.ndarray + shape is [M, 2] + previous_obs: numpy.ndarray + shape is [M, 4] + vdc_weight: float + """ + if len(trackers) == 0: + return np.empty((0, 2), dtype=int), np.arange(len(detections)), np.empty((0, 5), dtype=int) + + Y, X = speed_direction_batch(detections, previous_obs) + inertia_Y, inertia_X = velocities[:, 0], velocities[:, 1] + inertia_Y = np.repeat(inertia_Y[:, np.newaxis], Y.shape[1], axis=1) + inertia_X = np.repeat(inertia_X[:, np.newaxis], X.shape[1], axis=1) + diff_angle_cos = inertia_X * X + inertia_Y * Y + diff_angle_cos = np.clip(diff_angle_cos, a_min=-1, a_max=1) + diff_angle = np.arccos(diff_angle_cos) + diff_angle = (np.pi / 2.0 - np.abs(diff_angle)) / np.pi + + valid_mask = np.ones(previous_obs.shape[0]) + valid_mask[np.where(previous_obs[:, 4] < 0)] = 0 + + iou_matrix = iou_batch(detections, trackers) + scores = np.repeat(detections[:, -1][:, np.newaxis], trackers.shape[0], axis=1) + # iou_matrix = iou_matrix * scores # a trick sometiems works, we don't encourage this + valid_mask = np.repeat(valid_mask[:, np.newaxis], X.shape[1], axis=1) + + angle_diff_cost = (valid_mask * diff_angle) * vdc_weight + angle_diff_cost = angle_diff_cost.T + angle_diff_cost = angle_diff_cost * scores + + if min(iou_matrix.shape) > 0: + a = (iou_matrix > iou_threshold).astype(np.int32) + if a.sum(1).max() == 1 and a.sum(0).max() == 1: + matched_indices = np.stack(np.where(a), axis=1) + else: + matched_indices = linear_assignment(-(iou_matrix + angle_diff_cost)) + else: + matched_indices = np.empty(shape=(0, 2)) + + unmatched_detections = [] + for d, det in enumerate(detections): + if d not in matched_indices[:, 0]: + unmatched_detections.append(d) + unmatched_trackers = [] + for t, trk in enumerate(trackers): + if t not in matched_indices[:, 1]: + unmatched_trackers.append(t) + + # filter out matched with low IOU + matches = [] + for m in matched_indices: + if iou_matrix[m[0], m[1]] < iou_threshold: + unmatched_detections.append(m[0]) + unmatched_trackers.append(m[1]) + else: + matches.append(m.reshape(1, 2)) + if len(matches) == 0: + matches = np.empty((0, 2), dtype=int) + else: + matches = np.concatenate(matches, axis=0) + + return matches, np.array(unmatched_detections), np.array(unmatched_trackers) + + +def associate_kitti(detections, trackers, det_cates, iou_threshold, velocities, previous_obs, vdc_weight): + if len(trackers) == 0: + return np.empty((0, 2), dtype=int), np.arange(len(detections)), np.empty((0, 5), dtype=int) + + """ + Cost from the velocity direction consistency + Parameters + ---------- + detections: numpy.ndarray + shape is [N, 4] + trackers: numpy.ndarray + shape is [M, 4] + det_cates: numpy.ndarray + shape is [N, 1] + iou_threshold: float + in [0, 1]. Default is 0.3 + velocities: numpy.ndarray + shape is [M, 2] + previous_obs: numpy.ndarray + shape is [M, 4] + vdc_weight: float + + """ + Y, X = speed_direction_batch(detections, previous_obs) + inertia_Y, inertia_X = velocities[:, 0], velocities[:, 1] + inertia_Y = np.repeat(inertia_Y[:, np.newaxis], Y.shape[1], axis=1) + inertia_X = np.repeat(inertia_X[:, np.newaxis], X.shape[1], axis=1) + diff_angle_cos = inertia_X * X + inertia_Y * Y + diff_angle_cos = np.clip(diff_angle_cos, a_min=-1, a_max=1) + diff_angle = np.arccos(diff_angle_cos) + diff_angle = (np.pi / 2.0 - np.abs(diff_angle)) / np.pi + + valid_mask = np.ones(previous_obs.shape[0]) + valid_mask[np.where(previous_obs[:, 4] < 0)] = 0 + valid_mask = np.repeat(valid_mask[:, np.newaxis], X.shape[1], axis=1) + + scores = np.repeat(detections[:, -1][:, np.newaxis], trackers.shape[0], axis=1) + angle_diff_cost = (valid_mask * diff_angle) * vdc_weight + angle_diff_cost = angle_diff_cost.T + angle_diff_cost = angle_diff_cost * scores + + """ + Cost from IoU + """ + iou_matrix = iou_batch(detections, trackers) + + """ + With multiple categories, generate the cost for catgory mismatch + """ + num_dets = detections.shape[0] + num_trk = trackers.shape[0] + cate_matrix = np.zeros((num_dets, num_trk)) + for i in range(num_dets): + for j in range(num_trk): + if det_cates[i] != trackers[j, 4]: + cate_matrix[i][j] = -1e6 + + cost_matrix = -iou_matrix - angle_diff_cost - cate_matrix + + if min(iou_matrix.shape) > 0: + a = (iou_matrix > iou_threshold).astype(np.int32) + if a.sum(1).max() == 1 and a.sum(0).max() == 1: + matched_indices = np.stack(np.where(a), axis=1) + else: + matched_indices = linear_assignment(cost_matrix) + else: + matched_indices = np.empty(shape=(0, 2)) + + unmatched_detections = [] + for d, det in enumerate(detections): + if d not in matched_indices[:, 0]: + unmatched_detections.append(d) + unmatched_trackers = [] + for t, trk in enumerate(trackers): + if t not in matched_indices[:, 1]: + unmatched_trackers.append(t) + + # filter out matched with low IOU + matches = [] + for m in matched_indices: + if iou_matrix[m[0], m[1]] < iou_threshold: + unmatched_detections.append(m[0]) + unmatched_trackers.append(m[1]) + else: + matches.append(m.reshape(1, 2)) + if len(matches) == 0: + matches = np.empty((0, 2), dtype=int) + else: + matches = np.concatenate(matches, axis=0) + + return matches, np.array(unmatched_detections), np.array(unmatched_trackers) diff --git a/src/gesturedetection/ocsort/kalmanboxtracker.py b/src/gesturedetection/ocsort/kalmanboxtracker.py new file mode 100644 index 0000000000000000000000000000000000000000..eb0eae0a489cb635fa146eb110ef14f657607f4e --- /dev/null +++ b/src/gesturedetection/ocsort/kalmanboxtracker.py @@ -0,0 +1,157 @@ +from __future__ import print_function + +import numpy as np + + +def convert_bbox_to_z(bbox): + """ + Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form + [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is + the aspect ratio + """ + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + x = bbox[0] + w / 2.0 + y = bbox[1] + h / 2.0 + s = w * h # scale is just area + r = w / float(h + 1e-6) + return np.array([x, y, s, r]).reshape((4, 1)) + + +def speed_direction(bbox1, bbox2): + cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0 + cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0 + speed = np.array([cy2 - cy1, cx2 - cx1]) + norm = np.sqrt((cy2 - cy1) ** 2 + (cx2 - cx1) ** 2) + 1e-6 + return speed / norm + + +def convert_x_to_bbox(x, score=None): + """ + Takes a bounding box in the centre form [x,y,s,r] and returns it in the form + [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right + """ + w = np.sqrt(x[2] * x[3]) + h = x[2] / w + if score is None: + return np.array([x[0] - w / 2.0, x[1] - h / 2.0, x[0] + w / 2.0, x[1] + h / 2.0]).reshape((1, 4)) + else: + return np.array([x[0] - w / 2.0, x[1] - h / 2.0, x[0] + w / 2.0, x[1] + h / 2.0, score]).reshape((1, 5)) + + +class KalmanBoxTracker(object): + """ + This class represents the internal state of individual tracked objects observed as bbox. + """ + + count = 0 + + def __init__(self, bbox, delta_t=3, orig=False): + """ + Initialises a tracker using initial bounding box. + + """ + # define constant velocity model + if not orig: + from .kalmanfilter import KalmanFilterNew as KalmanFilter + + self.kf = KalmanFilter(dim_x=7, dim_z=4) + else: + from filterpy.kalman import KalmanFilter + + self.kf = KalmanFilter(dim_x=7, dim_z=4) + self.kf.F = np.array( + [ + [1, 0, 0, 0, 1, 0, 0], + [0, 1, 0, 0, 0, 1, 0], + [0, 0, 1, 0, 0, 0, 1], + [0, 0, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0], + [0, 0, 0, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 0, 1], + ] + ) + self.kf.H = np.array( + [[1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0]] + ) + + self.kf.R[2:, 2:] *= 10.0 + self.kf.P[4:, 4:] *= 1000.0 # give high uncertainty to the unobservable initial velocities + self.kf.P *= 10.0 + self.kf.Q[-1, -1] *= 0.01 + self.kf.Q[4:, 4:] *= 0.01 + + self.kf.x[:4] = convert_bbox_to_z(bbox) + self.time_since_update = 0 + self.id = KalmanBoxTracker.count + KalmanBoxTracker.count += 1 + self.history = [] + self.hits = 0 + self.hit_streak = 0 + self.age = 0 + """ + NOTE: [-1,-1,-1,-1,-1] is a compromising placeholder for non-observation status, the same for the return of + function k_previous_obs. It is ugly and I do not like it. But to support generate observation array in a + fast and unified way, which you would see below k_observations = np.array([k_previous_obs(...]]), let's bear it for now. + """ + self.last_observation = np.array([-1, -1, -1, -1, -1]) # placeholder + self.observations = dict() + self.history_observations = [] + self.velocity = None + self.delta_t = delta_t + + def update(self, bbox): + """ + Updates the state vector with observed bbox. + """ + if bbox is not None: + if self.last_observation.sum() >= 0: # no previous observation + previous_box = None + for i in range(self.delta_t): + dt = self.delta_t - i + if self.age - dt in self.observations: + previous_box = self.observations[self.age - dt] + break + if previous_box is None: + previous_box = self.last_observation + """ + Estimate the track speed direction with observations Delta t steps away + """ + self.velocity = speed_direction(previous_box, bbox) + + """ + Insert new observations. This is a ugly way to maintain both self.observations + and self.history_observations. Bear it for the moment. + """ + self.last_observation = bbox + self.observations[self.age] = bbox + self.history_observations.append(bbox) + + self.time_since_update = 0 + self.history = [] + self.hits += 1 + self.hit_streak += 1 + self.kf.update(convert_bbox_to_z(bbox)) + else: + self.kf.update(bbox) + + def predict(self): + """ + Advances the state vector and returns the predicted bounding box estimate. + """ + if (self.kf.x[6] + self.kf.x[2]) <= 0: + self.kf.x[6] *= 0.0 + + self.kf.predict() + self.age += 1 + if self.time_since_update > 0: + self.hit_streak = 0 + self.time_since_update += 1 + self.history.append(convert_x_to_bbox(self.kf.x)) + return self.history[-1] + + def get_state(self): + """ + Returns the current bounding box estimate. + """ + return convert_x_to_bbox(self.kf.x) diff --git a/src/gesturedetection/ocsort/kalmanfilter.py b/src/gesturedetection/ocsort/kalmanfilter.py new file mode 100644 index 0000000000000000000000000000000000000000..f7175d08a8e46aef2b0f98e8cde25273d2eac1dc --- /dev/null +++ b/src/gesturedetection/ocsort/kalmanfilter.py @@ -0,0 +1,1557 @@ +# -*- coding: utf-8 -*- +# pylint: disable=invalid-name, too-many-arguments, too-many-branches, +# pylint: disable=too-many-locals, too-many-instance-attributes, too-many-lines + +""" +This module implements the linear Kalman filter in both an object +oriented and procedural form. The KalmanFilter class implements +the filter by storing the various matrices in instance variables, +minimizing the amount of bookkeeping you have to do. +All Kalman filters operate with a predict->update cycle. The +predict step, implemented with the method or function predict(), +uses the state transition matrix F to predict the state in the next +time period (epoch). The state is stored as a gaussian (x, P), where +x is the state (column) vector, and P is its covariance. Covariance +matrix Q specifies the process covariance. In Bayesian terms, this +prediction is called the *prior*, which you can think of colloquially +as the estimate prior to incorporating the measurement. +The update step, implemented with the method or function `update()`, +incorporates the measurement z with covariance R, into the state +estimate (x, P). The class stores the system uncertainty in S, +the innovation (residual between prediction and measurement in +measurement space) in y, and the Kalman gain in k. The procedural +form returns these variables to you. In Bayesian terms this computes +the *posterior* - the estimate after the information from the +measurement is incorporated. +Whether you use the OO form or procedural form is up to you. If +matrices such as H, R, and F are changing each epoch, you'll probably +opt to use the procedural form. If they are unchanging, the OO +form is perhaps easier to use since you won't need to keep track +of these matrices. This is especially useful if you are implementing +banks of filters or comparing various KF designs for performance; +a trivial coding bug could lead to using the wrong sets of matrices. +This module also offers an implementation of the RTS smoother, and +other helper functions, such as log likelihood computations. +The Saver class allows you to easily save the state of the +KalmanFilter class after every update +This module expects NumPy arrays for all values that expect +arrays, although in a few cases, particularly method parameters, +it will accept types that convert to NumPy arrays, such as lists +of lists. These exceptions are documented in the method or function. +Examples +-------- +The following example constructs a constant velocity kinematic +filter, filters noisy data, and plots the results. It also demonstrates +using the Saver class to save the state of the filter at each epoch. +.. code-block:: Python + import matplotlib.pyplot as plt + import numpy as np + from filterpy.kalman import KalmanFilter + from filterpy.common import Q_discrete_white_noise, Saver + r_std, q_std = 2., 0.003 + cv = KalmanFilter(dim_x=2, dim_z=1) + cv.x = np.array([[0., 1.]]) # position, velocity + cv.F = np.array([[1, dt],[ [0, 1]]) + cv.R = np.array([[r_std^^2]]) + f.H = np.array([[1., 0.]]) + f.P = np.diag([.1^^2, .03^^2) + f.Q = Q_discrete_white_noise(2, dt, q_std**2) + saver = Saver(cv) + for z in range(100): + cv.predict() + cv.update([z + randn() * r_std]) + saver.save() # save the filter's state + saver.to_array() + plt.plot(saver.x[:, 0]) + # plot all of the priors + plt.plot(saver.x_prior[:, 0]) + # plot mahalanobis distance + plt.figure() + plt.plot(saver.mahalanobis) +This code implements the same filter using the procedural form + x = np.array([[0., 1.]]) # position, velocity + F = np.array([[1, dt],[ [0, 1]]) + R = np.array([[r_std^^2]]) + H = np.array([[1., 0.]]) + P = np.diag([.1^^2, .03^^2) + Q = Q_discrete_white_noise(2, dt, q_std**2) + for z in range(100): + x, P = predict(x, P, F=F, Q=Q) + x, P = update(x, P, z=[z + randn() * r_std], R=R, H=H) + xs.append(x[0, 0]) + plt.plot(xs) +For more examples see the test subdirectory, or refer to the +book cited below. In it I both teach Kalman filtering from basic +principles, and teach the use of this library in great detail. +FilterPy library. +http://github.com/rlabbe/filterpy +Documentation at: +https://filterpy.readthedocs.org +Supporting book at: +https://github.com/rlabbe/Kalman-and-Bayesian-Filters-in-Python +This is licensed under an MIT license. See the readme.MD file +for more information. +Copyright 2014-2018 Roger R Labbe Jr. +""" + +from __future__ import absolute_import, division + +import sys +from copy import deepcopy +from math import exp, log, sqrt + +import numpy as np +import numpy.linalg as linalg +from filterpy.common import pretty_str, reshape_z +from filterpy.stats import logpdf +from numpy import dot, eye, isscalar, shape, zeros + + +class KalmanFilterNew(object): + """Implements a Kalman filter. You are responsible for setting the + various state variables to reasonable values; the defaults will + not give you a functional filter. + For now the best documentation is my free book Kalman and Bayesian + Filters in Python [2]_. The test files in this directory also give you a + basic idea of use, albeit without much description. + In brief, you will first construct this object, specifying the size of + the state vector with dim_x and the size of the measurement vector that + you will be using with dim_z. These are mostly used to perform size checks + when you assign values to the various matrices. For example, if you + specified dim_z=2 and then try to assign a 3x3 matrix to R (the + measurement noise matrix you will get an assert exception because R + should be 2x2. (If for whatever reason you need to alter the size of + things midstream just use the underscore version of the matrices to + assign directly: your_filter._R = a_3x3_matrix.) + After construction the filter will have default matrices created for you, + but you must specify the values for each. It’s usually easiest to just + overwrite them rather than assign to each element yourself. This will be + clearer in the example below. All are of type numpy.array. + Examples + -------- + Here is a filter that tracks position and velocity using a sensor that only + reads position. + First construct the object with the required dimensionality. Here the state + (`dim_x`) has 2 coefficients (position and velocity), and the measurement + (`dim_z`) has one. In FilterPy `x` is the state, `z` is the measurement. + .. code:: + from filterpy.kalman import KalmanFilter + f = KalmanFilter (dim_x=2, dim_z=1) + Assign the initial value for the state (position and velocity). You can do this + with a two dimensional array like so: + .. code:: + f.x = np.array([[2.], # position + [0.]]) # velocity + or just use a one dimensional array, which I prefer doing. + .. code:: + f.x = np.array([2., 0.]) + Define the state transition matrix: + .. code:: + f.F = np.array([[1.,1.], + [0.,1.]]) + Define the measurement function. Here we need to convert a position-velocity + vector into just a position vector, so we use: + .. code:: + f.H = np.array([[1., 0.]]) + Define the state's covariance matrix P. + .. code:: + f.P = np.array([[1000., 0.], + [ 0., 1000.] ]) + Now assign the measurement noise. Here the dimension is 1x1, so I can + use a scalar + .. code:: + f.R = 5 + I could have done this instead: + .. code:: + f.R = np.array([[5.]]) + Note that this must be a 2 dimensional array. + Finally, I will assign the process noise. Here I will take advantage of + another FilterPy library function: + .. code:: + from filterpy.common import Q_discrete_white_noise + f.Q = Q_discrete_white_noise(dim=2, dt=0.1, var=0.13) + Now just perform the standard predict/update loop: + .. code:: + while some_condition_is_true: + z = get_sensor_reading() + f.predict() + f.update(z) + do_something_with_estimate (f.x) + **Procedural Form** + This module also contains stand alone functions to perform Kalman filtering. + Use these if you are not a fan of objects. + **Example** + .. code:: + while True: + z, R = read_sensor() + x, P = predict(x, P, F, Q) + x, P = update(x, P, z, R, H) + See my book Kalman and Bayesian Filters in Python [2]_. + You will have to set the following attributes after constructing this + object for the filter to perform properly. Please note that there are + various checks in place to ensure that you have made everything the + 'correct' size. However, it is possible to provide incorrectly sized + arrays such that the linear algebra can not perform an operation. + It can also fail silently - you can end up with matrices of a size that + allows the linear algebra to work, but are the wrong shape for the problem + you are trying to solve. + Parameters + ---------- + dim_x : int + Number of state variables for the Kalman filter. For example, if + you are tracking the position and velocity of an object in two + dimensions, dim_x would be 4. + This is used to set the default size of P, Q, and u + dim_z : int + Number of of measurement inputs. For example, if the sensor + provides you with position in (x,y), dim_z would be 2. + dim_u : int (optional) + size of the control input, if it is being used. + Default value of 0 indicates it is not used. + compute_log_likelihood : bool (default = True) + Computes log likelihood by default, but this can be a slow + computation, so if you never use it you can turn this computation + off. + Attributes + ---------- + x : numpy.array(dim_x, 1) + Current state estimate. Any call to update() or predict() updates + this variable. + P : numpy.array(dim_x, dim_x) + Current state covariance matrix. Any call to update() or predict() + updates this variable. + x_prior : numpy.array(dim_x, 1) + Prior (predicted) state estimate. The *_prior and *_post attributes + are for convenience; they store the prior and posterior of the + current epoch. Read Only. + P_prior : numpy.array(dim_x, dim_x) + Prior (predicted) state covariance matrix. Read Only. + x_post : numpy.array(dim_x, 1) + Posterior (updated) state estimate. Read Only. + P_post : numpy.array(dim_x, dim_x) + Posterior (updated) state covariance matrix. Read Only. + z : numpy.array + Last measurement used in update(). Read only. + R : numpy.array(dim_z, dim_z) + Measurement noise covariance matrix. Also known as the + observation covariance. + Q : numpy.array(dim_x, dim_x) + Process noise covariance matrix. Also known as the transition + covariance. + F : numpy.array() + State Transition matrix. Also known as `A` in some formulation. + H : numpy.array(dim_z, dim_x) + Measurement function. Also known as the observation matrix, or as `C`. + y : numpy.array + Residual of the update step. Read only. + K : numpy.array(dim_x, dim_z) + Kalman gain of the update step. Read only. + S : numpy.array + System uncertainty (P projected to measurement space). Read only. + SI : numpy.array + Inverse system uncertainty. Read only. + log_likelihood : float + log-likelihood of the last measurement. Read only. + likelihood : float + likelihood of last measurement. Read only. + Computed from the log-likelihood. The log-likelihood can be very + small, meaning a large negative value such as -28000. Taking the + exp() of that results in 0.0, which can break typical algorithms + which multiply by this value, so by default we always return a + number >= sys.float_info.min. + mahalanobis : float + mahalanobis distance of the innovation. Read only. + inv : function, default numpy.linalg.inv + If you prefer another inverse function, such as the Moore-Penrose + pseudo inverse, set it to that instead: kf.inv = np.linalg.pinv + This is only used to invert self.S. If you know it is diagonal, you + might choose to set it to filterpy.common.inv_diagonal, which is + several times faster than numpy.linalg.inv for diagonal matrices. + alpha : float + Fading memory setting. 1.0 gives the normal Kalman filter, and + values slightly larger than 1.0 (such as 1.02) give a fading + memory effect - previous measurements have less influence on the + filter's estimates. This formulation of the Fading memory filter + (there are many) is due to Dan Simon [1]_. + References + ---------- + .. [1] Dan Simon. "Optimal State Estimation." John Wiley & Sons. + p. 208-212. (2006) + .. [2] Roger Labbe. "Kalman and Bayesian Filters in Python" + https://github.com/rlabbe/Kalman-and-Bayesian-Filters-in-Python + """ + + def __init__(self, dim_x, dim_z, dim_u=0): + if dim_x < 1: + raise ValueError("dim_x must be 1 or greater") + if dim_z < 1: + raise ValueError("dim_z must be 1 or greater") + if dim_u < 0: + raise ValueError("dim_u must be 0 or greater") + + self.dim_x = dim_x + self.dim_z = dim_z + self.dim_u = dim_u + + self.x = zeros((dim_x, 1)) # state + self.P = eye(dim_x) # uncertainty covariance + self.Q = eye(dim_x) # process uncertainty + self.B = None # control transition matrix + self.F = eye(dim_x) # state transition matrix + self.H = zeros((dim_z, dim_x)) # measurement function + self.R = eye(dim_z) # measurement uncertainty + self._alpha_sq = 1.0 # fading memory control + self.M = np.zeros((dim_x, dim_z)) # process-measurement cross correlation + self.z = np.array([[None] * self.dim_z]).T + + # gain and residual are computed during the innovation step. We + # save them so that in case you want to inspect them for various + # purposes + self.K = np.zeros((dim_x, dim_z)) # kalman gain + self.y = zeros((dim_z, 1)) + self.S = np.zeros((dim_z, dim_z)) # system uncertainty + self.SI = np.zeros((dim_z, dim_z)) # inverse system uncertainty + + # identity matrix. Do not alter this. + self._I = np.eye(dim_x) + + # these will always be a copy of x,P after predict() is called + self.x_prior = self.x.copy() + self.P_prior = self.P.copy() + + # these will always be a copy of x,P after update() is called + self.x_post = self.x.copy() + self.P_post = self.P.copy() + + # Only computed only if requested via property + self._log_likelihood = log(sys.float_info.min) + self._likelihood = sys.float_info.min + self._mahalanobis = None + + # keep all observations + self.history_obs = [] + + self.inv = np.linalg.inv + + self.attr_saved = None + self.observed = False + + def predict(self, u=None, B=None, F=None, Q=None): + """ + Predict next state (prior) using the Kalman filter state propagation + equations. + Parameters + ---------- + u : np.array, default 0 + Optional control vector. + B : np.array(dim_x, dim_u), or None + Optional control transition matrix; a value of None + will cause the filter to use `self.B`. + F : np.array(dim_x, dim_x), or None + Optional state transition matrix; a value of None + will cause the filter to use `self.F`. + Q : np.array(dim_x, dim_x), scalar, or None + Optional process noise matrix; a value of None will cause the + filter to use `self.Q`. + """ + + if B is None: + B = self.B + if F is None: + F = self.F + if Q is None: + Q = self.Q + elif isscalar(Q): + Q = eye(self.dim_x) * Q + + # x = Fx + Bu + if B is not None and u is not None: + self.x = dot(F, self.x) + dot(B, u) + else: + self.x = dot(F, self.x) + + # P = FPF' + Q + self.P = self._alpha_sq * dot(dot(F, self.P), F.T) + Q + + # save prior + self.x_prior = self.x.copy() + self.P_prior = self.P.copy() + + def freeze(self): + """ + Save the parameters before non-observation forward + """ + self.attr_saved = deepcopy(self.__dict__) + + def unfreeze(self): + if self.attr_saved is not None: + new_history = deepcopy(self.history_obs) + self.__dict__ = self.attr_saved + # self.history_obs = new_history + self.history_obs = self.history_obs[:-1] + occur = [int(d is None) for d in new_history] + indices = np.where(np.array(occur) == 0)[0] + index1 = indices[-2] + index2 = indices[-1] + box1 = new_history[index1] + x1, y1, s1, r1 = box1 + w1 = np.sqrt(s1 * r1) + h1 = np.sqrt(s1 / r1) + box2 = new_history[index2] + x2, y2, s2, r2 = box2 + w2 = np.sqrt(s2 * r2) + h2 = np.sqrt(s2 / r2) + time_gap = index2 - index1 + dx = (x2 - x1) / time_gap + dy = (y2 - y1) / time_gap + dw = (w2 - w1) / time_gap + dh = (h2 - h1) / time_gap + for i in range(index2 - index1): + """ + The default virtual trajectory generation is by linear + motion (constant speed hypothesis), you could modify this + part to implement your own. + """ + x = x1 + (i + 1) * dx + y = y1 + (i + 1) * dy + w = w1 + (i + 1) * dw + h = h1 + (i + 1) * dh + s = w * h + r = w / float(h) + new_box = np.array([x, y, s, r]).reshape((4, 1)) + """ + I still use predict-update loop here to refresh the parameters, + but this can be faster by directly modifying the internal parameters + as suggested in the paper. I keep this naive but slow way for + easy read and understanding + """ + self.update(new_box) + if not i == (index2 - index1 - 1): + self.predict() + + def update(self, z, R=None, H=None): + """ + Add a new measurement (z) to the Kalman filter. + If z is None, nothing is computed. However, x_post and P_post are + updated with the prior (x_prior, P_prior), and self.z is set to None. + Parameters + ---------- + z : (dim_z, 1): array_like + measurement for this update. z can be a scalar if dim_z is 1, + otherwise it must be convertible to a column vector. + If you pass in a value of H, z must be a column vector the + of the correct size. + R : np.array, scalar, or None + Optionally provide R to override the measurement noise for this + one call, otherwise self.R will be used. + H : np.array, or None + Optionally provide H to override the measurement function for this + one call, otherwise self.H will be used. + """ + + # set to None to force recompute + self._log_likelihood = None + self._likelihood = None + self._mahalanobis = None + + # append the observation + self.history_obs.append(z) + + if z is None: + if self.observed: + """ + Got no observation so freeze the current parameters for future + potential online smoothing. + """ + self.freeze() + self.observed = False + self.z = np.array([[None] * self.dim_z]).T + self.x_post = self.x.copy() + self.P_post = self.P.copy() + self.y = zeros((self.dim_z, 1)) + return + + # self.observed = True + if not self.observed: + """ + Get observation, use online smoothing to re-update parameters + """ + self.unfreeze() + self.observed = True + + if R is None: + R = self.R + elif isscalar(R): + R = eye(self.dim_z) * R + + if H is None: + z = reshape_z(z, self.dim_z, self.x.ndim) + H = self.H + + # y = z - Hx + # error (residual) between measurement and prediction + self.y = z - dot(H, self.x) + + # common subexpression for speed + PHT = dot(self.P, H.T) + + # S = HPH' + R + # project system uncertainty into measurement space + self.S = dot(H, PHT) + R + self.SI = self.inv(self.S) + # K = PH'inv(S) + # map system uncertainty into kalman gain + self.K = dot(PHT, self.SI) + + # x = x + Ky + # predict new x with residual scaled by the kalman gain + self.x = self.x + dot(self.K, self.y) + + # P = (I-KH)P(I-KH)' + KRK' + # This is more numerically stable + # and works for non-optimal K vs the equation + # P = (I-KH)P usually seen in the literature. + + I_KH = self._I - dot(self.K, H) + self.P = dot(dot(I_KH, self.P), I_KH.T) + dot(dot(self.K, R), self.K.T) + + # save measurement and posterior state + self.z = deepcopy(z) + self.x_post = self.x.copy() + self.P_post = self.P.copy() + + def predict_steadystate(self, u=0, B=None): + """ + Predict state (prior) using the Kalman filter state propagation + equations. Only x is updated, P is left unchanged. See + update_steadstate() for a longer explanation of when to use this + method. + Parameters + ---------- + u : np.array + Optional control vector. If non-zero, it is multiplied by B + to create the control input into the system. + B : np.array(dim_x, dim_u), or None + Optional control transition matrix; a value of None + will cause the filter to use `self.B`. + """ + + if B is None: + B = self.B + + # x = Fx + Bu + if B is not None: + self.x = dot(self.F, self.x) + dot(B, u) + else: + self.x = dot(self.F, self.x) + + # save prior + self.x_prior = self.x.copy() + self.P_prior = self.P.copy() + + def update_steadystate(self, z): + """ + Add a new measurement (z) to the Kalman filter without recomputing + the Kalman gain K, the state covariance P, or the system + uncertainty S. + You can use this for LTI systems since the Kalman gain and covariance + converge to a fixed value. Precompute these and assign them explicitly, + or run the Kalman filter using the normal predict()/update(0 cycle + until they converge. + The main advantage of this call is speed. We do significantly less + computation, notably avoiding a costly matrix inversion. + Use in conjunction with predict_steadystate(), otherwise P will grow + without bound. + Parameters + ---------- + z : (dim_z, 1): array_like + measurement for this update. z can be a scalar if dim_z is 1, + otherwise it must be convertible to a column vector. + Examples + -------- + >>> cv = kinematic_kf(dim=3, order=2) # 3D const velocity filter + >>> # let filter converge on representative data, then save k and P + >>> for i in range(100): + >>> cv.predict() + >>> cv.update([i, i, i]) + >>> saved_k = np.copy(cv.K) + >>> saved_P = np.copy(cv.P) + later on: + >>> cv = kinematic_kf(dim=3, order=2) # 3D const velocity filter + >>> cv.K = np.copy(saved_K) + >>> cv.P = np.copy(saved_P) + >>> for i in range(100): + >>> cv.predict_steadystate() + >>> cv.update_steadystate([i, i, i]) + """ + + # set to None to force recompute + self._log_likelihood = None + self._likelihood = None + self._mahalanobis = None + + if z is None: + self.z = np.array([[None] * self.dim_z]).T + self.x_post = self.x.copy() + self.P_post = self.P.copy() + self.y = zeros((self.dim_z, 1)) + return + + z = reshape_z(z, self.dim_z, self.x.ndim) + + # y = z - Hx + # error (residual) between measurement and prediction + self.y = z - dot(self.H, self.x) + + # x = x + Ky + # predict new x with residual scaled by the kalman gain + self.x = self.x + dot(self.K, self.y) + + self.z = deepcopy(z) + self.x_post = self.x.copy() + self.P_post = self.P.copy() + + # set to None to force recompute + self._log_likelihood = None + self._likelihood = None + self._mahalanobis = None + + def update_correlated(self, z, R=None, H=None): + """Add a new measurement (z) to the Kalman filter assuming that + process noise and measurement noise are correlated as defined in + the `self.M` matrix. + A partial derivation can be found in [1] + If z is None, nothing is changed. + Parameters + ---------- + z : (dim_z, 1): array_like + measurement for this update. z can be a scalar if dim_z is 1, + otherwise it must be convertible to a column vector. + R : np.array, scalar, or None + Optionally provide R to override the measurement noise for this + one call, otherwise self.R will be used. + H : np.array, or None + Optionally provide H to override the measurement function for this + one call, otherwise self.H will be used. + References + ---------- + .. [1] Bulut, Y. (2011). Applied Kalman filter theory (Doctoral dissertation, Northeastern University). + http://people.duke.edu/~hpgavin/SystemID/References/Balut-KalmanFilter-PhD-NEU-2011.pdf + """ + + # set to None to force recompute + self._log_likelihood = None + self._likelihood = None + self._mahalanobis = None + + if z is None: + self.z = np.array([[None] * self.dim_z]).T + self.x_post = self.x.copy() + self.P_post = self.P.copy() + self.y = zeros((self.dim_z, 1)) + return + + if R is None: + R = self.R + elif isscalar(R): + R = eye(self.dim_z) * R + + # rename for readability and a tiny extra bit of speed + if H is None: + z = reshape_z(z, self.dim_z, self.x.ndim) + H = self.H + + # handle special case: if z is in form [[z]] but x is not a column + # vector dimensions will not match + if self.x.ndim == 1 and shape(z) == (1, 1): + z = z[0] + + if shape(z) == (): # is it scalar, e.g. z=3 or z=np.array(3) + z = np.asarray([z]) + + # y = z - Hx + # error (residual) between measurement and prediction + self.y = z - dot(H, self.x) + + # common subexpression for speed + PHT = dot(self.P, H.T) + + # project system uncertainty into measurement space + self.S = dot(H, PHT) + dot(H, self.M) + dot(self.M.T, H.T) + R + self.SI = self.inv(self.S) + + # K = PH'inv(S) + # map system uncertainty into kalman gain + self.K = dot(PHT + self.M, self.SI) + + # x = x + Ky + # predict new x with residual scaled by the kalman gain + self.x = self.x + dot(self.K, self.y) + self.P = self.P - dot(self.K, dot(H, self.P) + self.M.T) + + self.z = deepcopy(z) + self.x_post = self.x.copy() + self.P_post = self.P.copy() + + def batch_filter(self, zs, Fs=None, Qs=None, Hs=None, Rs=None, Bs=None, us=None, update_first=False, saver=None): + """Batch processes a sequences of measurements. + Parameters + ---------- + zs : list-like + list of measurements at each time step `self.dt`. Missing + measurements must be represented by `None`. + Fs : None, list-like, default=None + optional value or list of values to use for the state transition + matrix F. + If Fs is None then self.F is used for all epochs. + Otherwise it must contain a list-like list of F's, one for + each epoch. This allows you to have varying F per epoch. + Qs : None, np.array or list-like, default=None + optional value or list of values to use for the process error + covariance Q. + If Qs is None then self.Q is used for all epochs. + Otherwise it must contain a list-like list of Q's, one for + each epoch. This allows you to have varying Q per epoch. + Hs : None, np.array or list-like, default=None + optional list of values to use for the measurement matrix H. + If Hs is None then self.H is used for all epochs. + If Hs contains a single matrix, then it is used as H for all + epochs. + Otherwise it must contain a list-like list of H's, one for + each epoch. This allows you to have varying H per epoch. + Rs : None, np.array or list-like, default=None + optional list of values to use for the measurement error + covariance R. + If Rs is None then self.R is used for all epochs. + Otherwise it must contain a list-like list of R's, one for + each epoch. This allows you to have varying R per epoch. + Bs : None, np.array or list-like, default=None + optional list of values to use for the control transition matrix B. + If Bs is None then self.B is used for all epochs. + Otherwise it must contain a list-like list of B's, one for + each epoch. This allows you to have varying B per epoch. + us : None, np.array or list-like, default=None + optional list of values to use for the control input vector; + If us is None then None is used for all epochs (equivalent to 0, + or no control input). + Otherwise it must contain a list-like list of u's, one for + each epoch. + update_first : bool, optional, default=False + controls whether the order of operations is update followed by + predict, or predict followed by update. Default is predict->update. + saver : filterpy.common.Saver, optional + filterpy.common.Saver object. If provided, saver.save() will be + called after every epoch + Returns + ------- + means : np.array((n,dim_x,1)) + array of the state for each time step after the update. Each entry + is an np.array. In other words `means[k,:]` is the state at step + `k`. + covariance : np.array((n,dim_x,dim_x)) + array of the covariances for each time step after the update. + In other words `covariance[k,:,:]` is the covariance at step `k`. + means_predictions : np.array((n,dim_x,1)) + array of the state for each time step after the predictions. Each + entry is an np.array. In other words `means[k,:]` is the state at + step `k`. + covariance_predictions : np.array((n,dim_x,dim_x)) + array of the covariances for each time step after the prediction. + In other words `covariance[k,:,:]` is the covariance at step `k`. + Examples + -------- + .. code-block:: Python + # this example demonstrates tracking a measurement where the time + # between measurement varies, as stored in dts. This requires + # that F be recomputed for each epoch. The output is then smoothed + # with an RTS smoother. + zs = [t + random.randn()*4 for t in range (40)] + Fs = [np.array([[1., dt], [0, 1]] for dt in dts] + (mu, cov, _, _) = kf.batch_filter(zs, Fs=Fs) + (xs, Ps, Ks, Pps) = kf.rts_smoother(mu, cov, Fs=Fs) + """ + + # pylint: disable=too-many-statements + n = np.size(zs, 0) + if Fs is None: + Fs = [self.F] * n + if Qs is None: + Qs = [self.Q] * n + if Hs is None: + Hs = [self.H] * n + if Rs is None: + Rs = [self.R] * n + if Bs is None: + Bs = [self.B] * n + if us is None: + us = [0] * n + + # mean estimates from Kalman Filter + if self.x.ndim == 1: + means = zeros((n, self.dim_x)) + means_p = zeros((n, self.dim_x)) + else: + means = zeros((n, self.dim_x, 1)) + means_p = zeros((n, self.dim_x, 1)) + + # state covariances from Kalman Filter + covariances = zeros((n, self.dim_x, self.dim_x)) + covariances_p = zeros((n, self.dim_x, self.dim_x)) + + if update_first: + for i, (z, F, Q, H, R, B, u) in enumerate(zip(zs, Fs, Qs, Hs, Rs, Bs, us)): + + self.update(z, R=R, H=H) + means[i, :] = self.x + covariances[i, :, :] = self.P + + self.predict(u=u, B=B, F=F, Q=Q) + means_p[i, :] = self.x + covariances_p[i, :, :] = self.P + + if saver is not None: + saver.save() + else: + for i, (z, F, Q, H, R, B, u) in enumerate(zip(zs, Fs, Qs, Hs, Rs, Bs, us)): + + self.predict(u=u, B=B, F=F, Q=Q) + means_p[i, :] = self.x + covariances_p[i, :, :] = self.P + + self.update(z, R=R, H=H) + means[i, :] = self.x + covariances[i, :, :] = self.P + + if saver is not None: + saver.save() + + return (means, covariances, means_p, covariances_p) + + def rts_smoother(self, Xs, Ps, Fs=None, Qs=None, inv=np.linalg.inv): + """ + Runs the Rauch-Tung-Striebel Kalman smoother on a set of + means and covariances computed by a Kalman filter. The usual input + would come from the output of `KalmanFilter.batch_filter()`. + Parameters + ---------- + Xs : numpy.array + array of the means (state variable x) of the output of a Kalman + filter. + Ps : numpy.array + array of the covariances of the output of a kalman filter. + Fs : list-like collection of numpy.array, optional + State transition matrix of the Kalman filter at each time step. + Optional, if not provided the filter's self.F will be used + Qs : list-like collection of numpy.array, optional + Process noise of the Kalman filter at each time step. Optional, + if not provided the filter's self.Q will be used + inv : function, default numpy.linalg.inv + If you prefer another inverse function, such as the Moore-Penrose + pseudo inverse, set it to that instead: kf.inv = np.linalg.pinv + Returns + ------- + x : numpy.ndarray + smoothed means + P : numpy.ndarray + smoothed state covariances + K : numpy.ndarray + smoother gain at each step + Pp : numpy.ndarray + Predicted state covariances + Examples + -------- + .. code-block:: Python + zs = [t + random.randn()*4 for t in range (40)] + (mu, cov, _, _) = kalman.batch_filter(zs) + (x, P, K, Pp) = rts_smoother(mu, cov, kf.F, kf.Q) + """ + + if len(Xs) != len(Ps): + raise ValueError("length of Xs and Ps must be the same") + + n = Xs.shape[0] + dim_x = Xs.shape[1] + + if Fs is None: + Fs = [self.F] * n + if Qs is None: + Qs = [self.Q] * n + + # smoother gain + K = zeros((n, dim_x, dim_x)) + + x, P, Pp = Xs.copy(), Ps.copy(), Ps.copy() + for k in range(n - 2, -1, -1): + Pp[k] = dot(dot(Fs[k + 1], P[k]), Fs[k + 1].T) + Qs[k + 1] + + # pylint: disable=bad-whitespace + K[k] = dot(dot(P[k], Fs[k + 1].T), inv(Pp[k])) + x[k] += dot(K[k], x[k + 1] - dot(Fs[k + 1], x[k])) + P[k] += dot(dot(K[k], P[k + 1] - Pp[k]), K[k].T) + + return (x, P, K, Pp) + + def get_prediction(self, u=None, B=None, F=None, Q=None): + """ + Predict next state (prior) using the Kalman filter state propagation + equations and returns it without modifying the object. + Parameters + ---------- + u : np.array, default 0 + Optional control vector. + B : np.array(dim_x, dim_u), or None + Optional control transition matrix; a value of None + will cause the filter to use `self.B`. + F : np.array(dim_x, dim_x), or None + Optional state transition matrix; a value of None + will cause the filter to use `self.F`. + Q : np.array(dim_x, dim_x), scalar, or None + Optional process noise matrix; a value of None will cause the + filter to use `self.Q`. + Returns + ------- + (x, P) : tuple + State vector and covariance array of the prediction. + """ + + if B is None: + B = self.B + if F is None: + F = self.F + if Q is None: + Q = self.Q + elif isscalar(Q): + Q = eye(self.dim_x) * Q + + # x = Fx + Bu + if B is not None and u is not None: + x = dot(F, self.x) + dot(B, u) + else: + x = dot(F, self.x) + + # P = FPF' + Q + P = self._alpha_sq * dot(dot(F, self.P), F.T) + Q + + return x, P + + def get_update(self, z=None): + """ + Computes the new estimate based on measurement `z` and returns it + without altering the state of the filter. + Parameters + ---------- + z : (dim_z, 1): array_like + measurement for this update. z can be a scalar if dim_z is 1, + otherwise it must be convertible to a column vector. + Returns + ------- + (x, P) : tuple + State vector and covariance array of the update. + """ + + if z is None: + return self.x, self.P + z = reshape_z(z, self.dim_z, self.x.ndim) + + R = self.R + H = self.H + P = self.P + x = self.x + + # error (residual) between measurement and prediction + y = z - dot(H, x) + + # common subexpression for speed + PHT = dot(P, H.T) + + # project system uncertainty into measurement space + S = dot(H, PHT) + R + + # map system uncertainty into kalman gain + K = dot(PHT, self.inv(S)) + + # predict new x with residual scaled by the kalman gain + x = x + dot(K, y) + + # P = (I-KH)P(I-KH)' + KRK' + I_KH = self._I - dot(K, H) + P = dot(dot(I_KH, P), I_KH.T) + dot(dot(K, R), K.T) + + return x, P + + def residual_of(self, z): + """ + Returns the residual for the given measurement (z). Does not alter + the state of the filter. + """ + z = reshape_z(z, self.dim_z, self.x.ndim) + return z - dot(self.H, self.x_prior) + + def measurement_of_state(self, x): + """ + Helper function that converts a state into a measurement. + Parameters + ---------- + x : np.array + kalman state vector + Returns + ------- + z : (dim_z, 1): array_like + measurement for this update. z can be a scalar if dim_z is 1, + otherwise it must be convertible to a column vector. + """ + + return dot(self.H, x) + + @property + def log_likelihood(self): + """ + log-likelihood of the last measurement. + """ + if self._log_likelihood is None: + self._log_likelihood = logpdf(x=self.y, cov=self.S) + return self._log_likelihood + + @property + def likelihood(self): + """ + Computed from the log-likelihood. The log-likelihood can be very + small, meaning a large negative value such as -28000. Taking the + exp() of that results in 0.0, which can break typical algorithms + which multiply by this value, so by default we always return a + number >= sys.float_info.min. + """ + if self._likelihood is None: + self._likelihood = exp(self.log_likelihood) + if self._likelihood == 0: + self._likelihood = sys.float_info.min + return self._likelihood + + @property + def mahalanobis(self): + """ " + Mahalanobis distance of measurement. E.g. 3 means measurement + was 3 standard deviations away from the predicted value. + Returns + ------- + mahalanobis : float + """ + if self._mahalanobis is None: + self._mahalanobis = sqrt(float(dot(dot(self.y.T, self.SI), self.y))) + return self._mahalanobis + + @property + def alpha(self): + """ + Fading memory setting. 1.0 gives the normal Kalman filter, and + values slightly larger than 1.0 (such as 1.02) give a fading + memory effect - previous measurements have less influence on the + filter's estimates. This formulation of the Fading memory filter + (there are many) is due to Dan Simon [1]_. + """ + return self._alpha_sq**0.5 + + def log_likelihood_of(self, z): + """ + log likelihood of the measurement `z`. This should only be called + after a call to update(). Calling after predict() will yield an + incorrect result.""" + + if z is None: + return log(sys.float_info.min) + return logpdf(z, dot(self.H, self.x), self.S) + + @alpha.setter + def alpha(self, value): + if not np.isscalar(value) or value < 1: + raise ValueError("alpha must be a float greater than 1") + + self._alpha_sq = value**2 + + def __repr__(self): + return "\n".join( + [ + "KalmanFilter object", + pretty_str("dim_x", self.dim_x), + pretty_str("dim_z", self.dim_z), + pretty_str("dim_u", self.dim_u), + pretty_str("x", self.x), + pretty_str("P", self.P), + pretty_str("x_prior", self.x_prior), + pretty_str("P_prior", self.P_prior), + pretty_str("x_post", self.x_post), + pretty_str("P_post", self.P_post), + pretty_str("F", self.F), + pretty_str("Q", self.Q), + pretty_str("R", self.R), + pretty_str("H", self.H), + pretty_str("K", self.K), + pretty_str("y", self.y), + pretty_str("S", self.S), + pretty_str("SI", self.SI), + pretty_str("M", self.M), + pretty_str("B", self.B), + pretty_str("z", self.z), + pretty_str("log-likelihood", self.log_likelihood), + pretty_str("likelihood", self.likelihood), + pretty_str("mahalanobis", self.mahalanobis), + pretty_str("alpha", self.alpha), + pretty_str("inv", self.inv), + ] + ) + + def test_matrix_dimensions(self, z=None, H=None, R=None, F=None, Q=None): + """ + Performs a series of asserts to check that the size of everything + is what it should be. This can help you debug problems in your design. + If you pass in H, R, F, Q those will be used instead of this object's + value for those matrices. + Testing `z` (the measurement) is problamatic. x is a vector, and can be + implemented as either a 1D array or as a nx1 column vector. Thus Hx + can be of different shapes. Then, if Hx is a single value, it can + be either a 1D array or 2D vector. If either is true, z can reasonably + be a scalar (either '3' or np.array('3') are scalars under this + definition), a 1D, 1 element array, or a 2D, 1 element array. You are + allowed to pass in any combination that works. + """ + + if H is None: + H = self.H + if R is None: + R = self.R + if F is None: + F = self.F + if Q is None: + Q = self.Q + x = self.x + P = self.P + + assert x.ndim == 1 or x.ndim == 2, "x must have one or two dimensions, but has {}".format(x.ndim) + + if x.ndim == 1: + assert x.shape[0] == self.dim_x, "Shape of x must be ({},{}), but is {}".format(self.dim_x, 1, x.shape) + else: + assert x.shape == (self.dim_x, 1), "Shape of x must be ({},{}), but is {}".format(self.dim_x, 1, x.shape) + + assert P.shape == (self.dim_x, self.dim_x), "Shape of P must be ({},{}), but is {}".format( + self.dim_x, self.dim_x, P.shape + ) + + assert Q.shape == (self.dim_x, self.dim_x), "Shape of Q must be ({},{}), but is {}".format( + self.dim_x, self.dim_x, P.shape + ) + + assert F.shape == (self.dim_x, self.dim_x), "Shape of F must be ({},{}), but is {}".format( + self.dim_x, self.dim_x, F.shape + ) + + assert np.ndim(H) == 2, "Shape of H must be (dim_z, {}), but is {}".format(P.shape[0], shape(H)) + + assert H.shape[1] == P.shape[0], "Shape of H must be (dim_z, {}), but is {}".format(P.shape[0], H.shape) + + # shape of R must be the same as HPH' + hph_shape = (H.shape[0], H.shape[0]) + r_shape = shape(R) + + if H.shape[0] == 1: + # r can be scalar, 1D, or 2D in this case + assert r_shape in [(), (1,), (1, 1)], "R must be scalar or one element array, but is shaped {}".format( + r_shape + ) + else: + assert r_shape == hph_shape, "shape of R should be {} but it is {}".format(hph_shape, r_shape) + + if z is not None: + z_shape = shape(z) + else: + z_shape = (self.dim_z, 1) + + # H@x must have shape of z + Hx = dot(H, x) + + if z_shape == (): # scalar or np.array(scalar) + assert Hx.ndim == 1 or shape(Hx) == (1, 1), "shape of z should be {}, not {} for the given H".format( + shape(Hx), z_shape + ) + + elif shape(Hx) == (1,): + assert z_shape[0] == 1, "Shape of z must be {} for the given H".format(shape(Hx)) + + else: + assert z_shape == shape(Hx) or ( + len(z_shape) == 1 and shape(Hx) == (z_shape[0], 1) + ), "shape of z should be {}, not {} for the given H".format(shape(Hx), z_shape) + + if np.ndim(Hx) > 1 and shape(Hx) != (1, 1): + assert shape(Hx) == z_shape, "shape of z should be {} for the given H, but it is {}".format( + shape(Hx), z_shape + ) + + +def update(x, P, z, R, H=None, return_all=False): + """ + Add a new measurement (z) to the Kalman filter. If z is None, nothing + is changed. + This can handle either the multidimensional or unidimensional case. If + all parameters are floats instead of arrays the filter will still work, + and return floats for x, P as the result. + update(1, 2, 1, 1, 1) # univariate + update(x, P, 1 + Parameters + ---------- + x : numpy.array(dim_x, 1), or float + State estimate vector + P : numpy.array(dim_x, dim_x), or float + Covariance matrix + z : (dim_z, 1): array_like + measurement for this update. z can be a scalar if dim_z is 1, + otherwise it must be convertible to a column vector. + R : numpy.array(dim_z, dim_z), or float + Measurement noise matrix + H : numpy.array(dim_x, dim_x), or float, optional + Measurement function. If not provided, a value of 1 is assumed. + return_all : bool, default False + If true, y, K, S, and log_likelihood are returned, otherwise + only x and P are returned. + Returns + ------- + x : numpy.array + Posterior state estimate vector + P : numpy.array + Posterior covariance matrix + y : numpy.array or scalar + Residua. Difference between measurement and state in measurement space + K : numpy.array + Kalman gain + S : numpy.array + System uncertainty in measurement space + log_likelihood : float + log likelihood of the measurement + """ + + # pylint: disable=bare-except + + if z is None: + if return_all: + return x, P, None, None, None, None + return x, P + + if H is None: + H = np.array([1]) + + if np.isscalar(H): + H = np.array([H]) + + Hx = np.atleast_1d(dot(H, x)) + z = reshape_z(z, Hx.shape[0], x.ndim) + + # error (residual) between measurement and prediction + y = z - Hx + + # project system uncertainty into measurement space + S = dot(dot(H, P), H.T) + R + + # map system uncertainty into kalman gain + try: + K = dot(dot(P, H.T), linalg.inv(S)) + except linalg.LinAlgError: + # can't invert a 1D array, annoyingly + K = dot(dot(P, H.T), 1.0 / S) + + # predict new x with residual scaled by the kalman gain + x = x + dot(K, y) + + # P = (I-KH)P(I-KH)' + KRK' + KH = dot(K, H) + + try: + I_KH = np.eye(KH.shape[0]) - KH + except linalg.LinAlgError: + I_KH = np.array([1 - KH]) + P = dot(dot(I_KH, P), I_KH.T) + dot(dot(K, R), K.T) + + if return_all: + # compute log likelihood + log_likelihood = logpdf(z, dot(H, x), S) + return x, P, y, K, S, log_likelihood + return x, P + + +def update_steadystate(x, z, K, H=None): + """ + Add a new measurement (z) to the Kalman filter. If z is None, nothing + is changed. + Parameters + ---------- + x : numpy.array(dim_x, 1), or float + State estimate vector + z : (dim_z, 1): array_like + measurement for this update. z can be a scalar if dim_z is 1, + otherwise it must be convertible to a column vector. + K : numpy.array, or float + Kalman gain matrix + H : numpy.array(dim_x, dim_x), or float, optional + Measurement function. If not provided, a value of 1 is assumed. + Returns + ------- + x : numpy.array + Posterior state estimate vector + Examples + -------- + This can handle either the multidimensional or unidimensional case. If + all parameters are floats instead of arrays the filter will still work, + and return floats for x, P as the result. + >>> update_steadystate(1, 2, 1) # univariate + >>> update_steadystate(x, P, z, H) + """ + + if z is None: + return x + + if H is None: + H = np.array([1]) + + if np.isscalar(H): + H = np.array([H]) + + Hx = np.atleast_1d(dot(H, x)) + z = reshape_z(z, Hx.shape[0], x.ndim) + + # error (residual) between measurement and prediction + y = z - Hx + + # estimate new x with residual scaled by the kalman gain + return x + dot(K, y) + + +def predict(x, P, F=1, Q=0, u=0, B=1, alpha=1.0): + """ + Predict next state (prior) using the Kalman filter state propagation + equations. + Parameters + ---------- + x : numpy.array + State estimate vector + P : numpy.array + Covariance matrix + F : numpy.array() + State Transition matrix + Q : numpy.array, Optional + Process noise matrix + u : numpy.array, Optional, default 0. + Control vector. If non-zero, it is multiplied by B + to create the control input into the system. + B : numpy.array, optional, default 0. + Control transition matrix. + alpha : float, Optional, default=1.0 + Fading memory setting. 1.0 gives the normal Kalman filter, and + values slightly larger than 1.0 (such as 1.02) give a fading + memory effect - previous measurements have less influence on the + filter's estimates. This formulation of the Fading memory filter + (there are many) is due to Dan Simon + Returns + ------- + x : numpy.array + Prior state estimate vector + P : numpy.array + Prior covariance matrix + """ + + if np.isscalar(F): + F = np.array(F) + x = dot(F, x) + dot(B, u) + P = (alpha * alpha) * dot(dot(F, P), F.T) + Q + + return x, P + + +def predict_steadystate(x, F=1, u=0, B=1): + """ + Predict next state (prior) using the Kalman filter state propagation + equations. This steady state form only computes x, assuming that the + covariance is constant. + Parameters + ---------- + x : numpy.array + State estimate vector + P : numpy.array + Covariance matrix + F : numpy.array() + State Transition matrix + u : numpy.array, Optional, default 0. + Control vector. If non-zero, it is multiplied by B + to create the control input into the system. + B : numpy.array, optional, default 0. + Control transition matrix. + Returns + ------- + x : numpy.array + Prior state estimate vector + """ + + if np.isscalar(F): + F = np.array(F) + x = dot(F, x) + dot(B, u) + + return x + + +def batch_filter(x, P, zs, Fs, Qs, Hs, Rs, Bs=None, us=None, update_first=False, saver=None): + """ + Batch processes a sequences of measurements. + Parameters + ---------- + zs : list-like + list of measurements at each time step. Missing measurements must be + represented by None. + Fs : list-like + list of values to use for the state transition matrix matrix. + Qs : list-like + list of values to use for the process error + covariance. + Hs : list-like + list of values to use for the measurement matrix. + Rs : list-like + list of values to use for the measurement error + covariance. + Bs : list-like, optional + list of values to use for the control transition matrix; + a value of None in any position will cause the filter + to use `self.B` for that time step. + us : list-like, optional + list of values to use for the control input vector; + a value of None in any position will cause the filter to use + 0 for that time step. + update_first : bool, optional + controls whether the order of operations is update followed by + predict, or predict followed by update. Default is predict->update. + saver : filterpy.common.Saver, optional + filterpy.common.Saver object. If provided, saver.save() will be + called after every epoch + Returns + ------- + means : np.array((n,dim_x,1)) + array of the state for each time step after the update. Each entry + is an np.array. In other words `means[k,:]` is the state at step + `k`. + covariance : np.array((n,dim_x,dim_x)) + array of the covariances for each time step after the update. + In other words `covariance[k,:,:]` is the covariance at step `k`. + means_predictions : np.array((n,dim_x,1)) + array of the state for each time step after the predictions. Each + entry is an np.array. In other words `means[k,:]` is the state at + step `k`. + covariance_predictions : np.array((n,dim_x,dim_x)) + array of the covariances for each time step after the prediction. + In other words `covariance[k,:,:]` is the covariance at step `k`. + Examples + -------- + .. code-block:: Python + zs = [t + random.randn()*4 for t in range (40)] + Fs = [kf.F for t in range (40)] + Hs = [kf.H for t in range (40)] + (mu, cov, _, _) = kf.batch_filter(zs, Rs=R_list, Fs=Fs, Hs=Hs, Qs=None, + Bs=None, us=None, update_first=False) + (xs, Ps, Ks, Pps) = kf.rts_smoother(mu, cov, Fs=Fs, Qs=None) + """ + + n = np.size(zs, 0) + dim_x = x.shape[0] + + # mean estimates from Kalman Filter + if x.ndim == 1: + means = zeros((n, dim_x)) + means_p = zeros((n, dim_x)) + else: + means = zeros((n, dim_x, 1)) + means_p = zeros((n, dim_x, 1)) + + # state covariances from Kalman Filter + covariances = zeros((n, dim_x, dim_x)) + covariances_p = zeros((n, dim_x, dim_x)) + + if us is None: + us = [0.0] * n + Bs = [0.0] * n + + if update_first: + for i, (z, F, Q, H, R, B, u) in enumerate(zip(zs, Fs, Qs, Hs, Rs, Bs, us)): + + x, P = update(x, P, z, R=R, H=H) + means[i, :] = x + covariances[i, :, :] = P + + x, P = predict(x, P, u=u, B=B, F=F, Q=Q) + means_p[i, :] = x + covariances_p[i, :, :] = P + if saver is not None: + saver.save() + else: + for i, (z, F, Q, H, R, B, u) in enumerate(zip(zs, Fs, Qs, Hs, Rs, Bs, us)): + + x, P = predict(x, P, u=u, B=B, F=F, Q=Q) + means_p[i, :] = x + covariances_p[i, :, :] = P + + x, P = update(x, P, z, R=R, H=H) + means[i, :] = x + covariances[i, :, :] = P + if saver is not None: + saver.save() + + return (means, covariances, means_p, covariances_p) + + +def rts_smoother(Xs, Ps, Fs, Qs): + """ + Runs the Rauch-Tung-Striebel Kalman smoother on a set of + means and covariances computed by a Kalman filter. The usual input + would come from the output of `KalmanFilter.batch_filter()`. + Parameters + ---------- + Xs : numpy.array + array of the means (state variable x) of the output of a Kalman + filter. + Ps : numpy.array + array of the covariances of the output of a kalman filter. + Fs : list-like collection of numpy.array + State transition matrix of the Kalman filter at each time step. + Qs : list-like collection of numpy.array, optional + Process noise of the Kalman filter at each time step. + Returns + ------- + x : numpy.ndarray + smoothed means + P : numpy.ndarray + smoothed state covariances + K : numpy.ndarray + smoother gain at each step + pP : numpy.ndarray + predicted state covariances + Examples + -------- + .. code-block:: Python + zs = [t + random.randn()*4 for t in range (40)] + (mu, cov, _, _) = kalman.batch_filter(zs) + (x, P, K, pP) = rts_smoother(mu, cov, kf.F, kf.Q) + """ + + if len(Xs) != len(Ps): + raise ValueError("length of Xs and Ps must be the same") + + n = Xs.shape[0] + dim_x = Xs.shape[1] + + # smoother gain + K = zeros((n, dim_x, dim_x)) + x, P, pP = Xs.copy(), Ps.copy(), Ps.copy() + + for k in range(n - 2, -1, -1): + pP[k] = dot(dot(Fs[k], P[k]), Fs[k].T) + Qs[k] + + # pylint: disable=bad-whitespace + K[k] = dot(dot(P[k], Fs[k].T), linalg.inv(pP[k])) + x[k] += dot(K[k], x[k + 1] - dot(Fs[k], x[k])) + P[k] += dot(dot(K[k], P[k + 1] - pP[k]), K[k].T) + + return (x, P, K, pP) diff --git a/src/gesturedetection/onnx_models.py b/src/gesturedetection/onnx_models.py new file mode 100644 index 0000000000000000000000000000000000000000..b632e782380494fe85c7c7a9e80e896951eb8184 --- /dev/null +++ b/src/gesturedetection/onnx_models.py @@ -0,0 +1,194 @@ +from abc import ABC + +import cv2 +import numpy as np +import onnxruntime as ort + + +class OnnxModel(ABC): + def __init__(self, model_path, image_size): + self.model_path = model_path + self.image_size = image_size + self.mean = np.array([127, 127, 127], dtype=np.float32) + self.std = np.array([128, 128, 128], dtype=np.float32) + options, prov_opts, providers = self.get_onnx_provider() + self.sess = ort.InferenceSession( + model_path, sess_options=options, providers=providers, provider_options=prov_opts + ) + self._get_input_output() + + def preprocess(self, frame): + """ + Preprocess frame + Parameters + ---------- + frame : np.ndarray + Frame to preprocess + Returns + ------- + np.ndarray + Preprocessed frame + """ + image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + image = cv2.resize(image, self.image_size) + image = (image - self.mean) / self.std + image = np.transpose(image, [2, 0, 1]) + image = np.expand_dims(image, axis=0) + return image + + def _get_input_output(self): + inputs = self.sess.get_inputs() + self.inputs = "".join( + [ + f"\n {i}: {input.name}" f" Shape: ({','.join(map(str, input.shape))})" f" Dtype: {input.type}" + for i, input in enumerate(inputs) + ] + ) + + outputs = self.sess.get_outputs() + self.outputs = "".join( + [ + f"\n {i}: {output.name}" f" Shape: ({','.join(map(str, output.shape))})" f" Dtype: {output.type}" + for i, output in enumerate(outputs) + ] + ) + + @staticmethod + def get_onnx_provider(): + """ + Get onnx provider + Returns + ------- + options : onnxruntime.SessionOptions + Session options + prov_opts : dict + Provider options + providers : list + List of providers + """ + providers = ["CPUExecutionProvider"] + options = ort.SessionOptions() + options.enable_mem_pattern = False + options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + prov_opts = [] + print("Using ONNX Runtime", ort.get_device()) + + if "DML" in ort.get_device(): + prov_opts = [{"device_id": 0}] + providers.append("DmlExecutionProvider") + + elif "GPU" in ort.get_device(): + prov_opts = [ + { + "device_id": 0, + "arena_extend_strategy": "kNextPowerOfTwo", + "gpu_mem_limit": 2 * 1024 * 1024 * 1024, + "cudnn_conv_algo_search": "EXHAUSTIVE", + "do_copy_in_default_stream": True, + } + ] + providers.append("CUDAExecutionProvider") + + return options, prov_opts, providers + + def __repr__(self): + return ( + f"Providers: {self.sess.get_providers()}\n" + f"Model: {self.sess.get_modelmeta().description}\n" + f"Version: {self.sess.get_modelmeta().version}\n" + f"Inputs: {self.inputs}\n" + f"Outputs: {self.outputs}" + ) + +class HandDetection(OnnxModel): + def __init__(self, model_path, image_size=(320, 240)): + super().__init__(model_path, image_size) + self.image_size = image_size + self.sess = ort.InferenceSession(model_path) + self.input_name = self.sess.get_inputs()[0].name + self.output_names = [output.name for output in self.sess.get_outputs()] + + def __call__(self, frame): + input_tensor = self.preprocess(frame) + boxes, _, probs = self.sess.run(self.output_names, {self.input_name: input_tensor}) + width, height = frame.shape[1], frame.shape[0] + boxes[:, 0] *= width + boxes[:, 1] *= height + boxes[:, 2] *= width + boxes[:, 3] *= height + return boxes.astype(np.int32), probs + + +class HandClassification(OnnxModel): + def __init__(self, model_path, image_size=(128, 128)): + super().__init__(model_path, image_size) + + @staticmethod + def get_square(box, image): + """ + Get square box + Parameters + ---------- + box : np.ndarray + Box coordinates (x1, y1, x2, y2) + image : np.ndarray + Image for shape + """ + height, width, _ = image.shape + x0, y0, x1, y1 = box + w, h = x1 - x0, y1 - y0 + if h < w: + y0 = y0 - int((w - h) / 2) + y1 = y0 + w + if h > w: + x0 = x0 - int((h - w) / 2) + x1 = x0 + h + x0 = max(0, x0) + y0 = max(0, y0) + x1 = min(width - 1, x1) + y1 = min(height - 1, y1) + return x0, y0, x1, y1 + + def get_crops(self, frame, bboxes): + """ + Get crops from frame + Parameters + ---------- + frame : np.ndarray + Frame to crop from bboxes + bboxes : np.ndarray + Bounding boxes + + Returns + ------- + crops : np.ndarray + Crops from frame + """ + crops = [] + for bbox in bboxes: + bbox = self.get_square(bbox, frame) + crop = frame[bbox[1] : bbox[3], bbox[0] : bbox[2]] + crops.append(crop) + return crops + + def __call__(self, image, bboxes): + """ + Get predictions from model + Parameters + ---------- + image : np.ndarray + Image to predict + bboxes : np.ndarray + Bounding boxes + + Returns + ------- + predictions : np.ndarray + Predictions from model + """ + crops = self.get_crops(image, bboxes) + crops = [self.preprocess(crop) for crop in crops] + input_name = self.sess.get_inputs()[0].name + outputs = self.sess.run(None, {input_name: np.concatenate(crops, axis=0)})[0] + labels = np.argmax(outputs, axis=1) + return labels diff --git a/src/gesturedetection/utils/__init__.py b/src/gesturedetection/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4d7a7e4662f4c2a23360dce194bfd3f4428d6586 --- /dev/null +++ b/src/gesturedetection/utils/__init__.py @@ -0,0 +1,16 @@ +from .action_controller import Deque +from .box_utils_numpy import hard_nms +from .drawer import Drawer +from .enums import Event, HandPosition, targets +from .hand import Hand + + +__all__ = [ + "Deque", + "hard_nms", + "Drawer", + "Event", + "HandPosition", + "targets", + "Hand" +] \ No newline at end of file diff --git a/src/gesturedetection/utils/__pycache__/__init__.cpython-312.pyc b/src/gesturedetection/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..491f2b991271e7d86fc6097524921106315fd77d Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/gesturedetection/utils/__pycache__/__init__.cpython-39.pyc b/src/gesturedetection/utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c221b08454f99a5ad324e14726d990ac7dccda7f Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/gesturedetection/utils/__pycache__/action_controller.cpython-312.pyc b/src/gesturedetection/utils/__pycache__/action_controller.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5efa6419fc9cef06ec453a7d1236de2be96d472 Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/action_controller.cpython-312.pyc differ diff --git a/src/gesturedetection/utils/__pycache__/action_controller.cpython-39.pyc b/src/gesturedetection/utils/__pycache__/action_controller.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae3b4aaae0e97e552246272d8eed848e210f3a7c Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/action_controller.cpython-39.pyc differ diff --git a/src/gesturedetection/utils/__pycache__/box_utils_numpy.cpython-312.pyc b/src/gesturedetection/utils/__pycache__/box_utils_numpy.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ddf7a5dc5092308868a6aa44e875d53eb3108dd3 Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/box_utils_numpy.cpython-312.pyc differ diff --git a/src/gesturedetection/utils/__pycache__/box_utils_numpy.cpython-39.pyc b/src/gesturedetection/utils/__pycache__/box_utils_numpy.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2e965d846a67c82a6403d8812da2814bbd71e93 Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/box_utils_numpy.cpython-39.pyc differ diff --git a/src/gesturedetection/utils/__pycache__/drawer.cpython-312.pyc b/src/gesturedetection/utils/__pycache__/drawer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49958363930819a873d2003443d7f3f770d6f4ab Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/drawer.cpython-312.pyc differ diff --git a/src/gesturedetection/utils/__pycache__/drawer.cpython-39.pyc b/src/gesturedetection/utils/__pycache__/drawer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed973461a87180c549c71ddd442ee5d074b576ef Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/drawer.cpython-39.pyc differ diff --git a/src/gesturedetection/utils/__pycache__/enums.cpython-312.pyc b/src/gesturedetection/utils/__pycache__/enums.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e850584078bed59e064932228cbcc32adc113537 Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/enums.cpython-312.pyc differ diff --git a/src/gesturedetection/utils/__pycache__/enums.cpython-39.pyc b/src/gesturedetection/utils/__pycache__/enums.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4adf8d8be86ef021fa5921389aff16ace0407078 Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/enums.cpython-39.pyc differ diff --git a/src/gesturedetection/utils/__pycache__/hand.cpython-312.pyc b/src/gesturedetection/utils/__pycache__/hand.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43d11eecc473909380b2a583f2a11b4b244e2c9a Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/hand.cpython-312.pyc differ diff --git a/src/gesturedetection/utils/__pycache__/hand.cpython-39.pyc b/src/gesturedetection/utils/__pycache__/hand.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f46604678f3dc45b561a7c8e6e0999f3af27a41b Binary files /dev/null and b/src/gesturedetection/utils/__pycache__/hand.cpython-39.pyc differ diff --git a/src/gesturedetection/utils/action_controller.py b/src/gesturedetection/utils/action_controller.py new file mode 100644 index 0000000000000000000000000000000000000000..2392fdc9248d446916d63dea4833d42673fde6ff --- /dev/null +++ b/src/gesturedetection/utils/action_controller.py @@ -0,0 +1,598 @@ +from scipy.spatial import distance +from collections import deque + +from .enums import Event, HandPosition, targets +from .hand import Hand + + +class Deque: + def __init__(self, maxlen=30, min_frames=20): + self.maxlen = maxlen + self._deque = [] + self.action = None + self.min_absolute_distance = 1.5 + self.min_frames = min_frames + self.action_deque = deque(maxlen=5) + + def __len__(self): + return len(self._deque) + + def index_position(self, x): + for i in range(len(self._deque)): + if self._deque[i].position == x: + return i + + def index_gesture(self, x): + for i in range(len(self._deque)): + if self._deque[i].gesture == x: + return i + + def __getitem__(self, index): + return self._deque[index] + + def __setitem__(self, index, value): + self._deque[index] = value + + def __delitem__(self, index): + del self._deque[index] + + def __iter__(self): + return iter(self._deque) + + def __reversed__(self): + return reversed(self._deque) + + def append(self, x): + if self.maxlen is not None and len(self) >= self.maxlen: + self._deque.pop(0) + self.set_hand_position(x) + self._deque.append(x) + self.check_is_action(x) + + def check_duration(self, start_index, min_frames=None): + """ + Check duration of swipe. + + Parameters + ---------- + start_index : int + Index of start position of swipe. + + Returns + ------- + bool + True if duration of swipe is more than min_frames. + """ + if min_frames == None: + min_frames = self.min_frames + if len(self) - start_index >= min_frames: + return True + else: + return False + + def check_duration_max(self, start_index, max_frames=10): + """ + Check duration of swipe. + + Parameters + ---------- + start_index : int + Index of start position of swipe. + + Returns + ------- + bool + True if duration of swipe is more than min_frames. + """ + if len(self) - start_index <= max_frames: + return True + else: + return False + + def check_is_action(self, x): + """ + Check if gesture is action. + + Parameters + ---------- + x : Hand + Hand object. + + Returns + ------- + bool + True if gesture is action. + """ + if x.position == HandPosition.LEFT_END and HandPosition.RIGHT_START in self: + start_index = self.index_position(HandPosition.RIGHT_START) + if ( + self.swipe_distance(self._deque[start_index], x) + and self.check_duration(start_index) + and self.check_horizontal_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_LEFT + self.clear() + return True + + elif x.position == HandPosition.RIGHT_END and HandPosition.LEFT_START in self: + start_index = self.index_position(HandPosition.LEFT_START) + if ( + self.swipe_distance(self._deque[start_index], x) + and self.check_duration(start_index) + and self.check_horizontal_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_RIGHT + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.UP_END and HandPosition.DOWN_START in self: + start_index = self.index_position(HandPosition.DOWN_START) + if ( + self.swipe_distance(self._deque[start_index], x) + and self.check_duration(start_index) + and self.check_vertical_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_UP + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.DOWN_END and HandPosition.UP_START in self: + start_index = self.index_position(HandPosition.UP_START) + if ( + self.swipe_distance(self._deque[start_index], x) + and self.check_duration(start_index) + and self.check_vertical_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_DOWN + self.clear() + return True + else: + self.clear() + + elif x.gesture == 18: # grip + if self.action is None: + start_index = self.index_gesture(18) + if self.check_duration(start_index): + self.action = Event.DRAG2 + return True + + elif self.action == Event.DRAG2 and x.gesture in [11, 12]: # hand heart + self.action = Event.DROP2 + self.clear() + return True + + elif x.gesture == 29: # ok + if self.action is None: + start_index = self.index_gesture(29) + if self.check_duration(start_index): + self.action = Event.DRAG3 + return True + + elif self.action == Event.DRAG3 and x.gesture in [11, 12]: # hand heart + self.action = Event.DROP3 + self.clear() + return True + + elif x.position == HandPosition.FAST_SWIPE_UP_END and HandPosition.FAST_SWIPE_UP_START in self: + start_index = self.index_position(HandPosition.FAST_SWIPE_UP_START) + if ( + self.check_duration(start_index, min_frames=20) + and self.check_vertical_swipe(self._deque[start_index], x) + ): + self.action = Event.FAST_SWIPE_UP + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.FAST_SWIPE_DOWN_END and HandPosition.FAST_SWIPE_DOWN_START in self: + start_index = self.index_position(HandPosition.FAST_SWIPE_DOWN_START) + if ( + self.check_duration(start_index, min_frames=20) + and self.check_vertical_swipe(self._deque[start_index], x) + ): + self.action = Event.FAST_SWIPE_DOWN + self.clear() + return True + + elif x.position == HandPosition.ZOOM_IN_END and HandPosition.ZOOM_IN_START in self: + start_index = self.index_position(HandPosition.ZOOM_IN_START) + if ( + self.check_duration(start_index, min_frames=20) + and self.check_vertical_swipe(self._deque[start_index], x) + and self.check_horizontal_swipe(self._deque[start_index], x) + ): + self.action = Event.ZOOM_IN + self.clear() + return True + + elif x.position == HandPosition.ZOOM_OUT_END and HandPosition.ZOOM_OUT_START in self: + start_index = self.index_position(HandPosition.ZOOM_OUT_START) + if ( + self.check_duration(start_index, min_frames=20) + and self.check_vertical_swipe(self._deque[start_index], x) + and self.check_horizontal_swipe(self._deque[start_index], x) + ): + self.action = Event.ZOOM_OUT + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.LEFT_END2 and HandPosition.RIGHT_START2 in self: + + start_index = self.index_position(HandPosition.RIGHT_START2) + if ( + self.swipe_distance(self._deque[start_index], x) + and self.check_duration(start_index) + and self.check_horizontal_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_LEFT2 + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.RIGHT_END2 and HandPosition.LEFT_START2 in self: + start_index = self.index_position(HandPosition.LEFT_START2) + if ( + self.swipe_distance(self._deque[start_index], x) + and self.check_duration(start_index) + and self.check_horizontal_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_RIGHT2 + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.UP_END2 and HandPosition.DOWN_START2 in self: + start_index = self.index_position(HandPosition.DOWN_START2) + if ( + self.swipe_distance(self._deque[start_index], x) + and self.check_duration(start_index) + and self.check_vertical_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_UP2 + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.LEFT_END3 and HandPosition.RIGHT_START3 in self: + start_index = self.index_position(HandPosition.RIGHT_START3) + if ( + self.swipe_distance(self._deque[start_index], x) + and self.check_duration(start_index) + and self.check_horizontal_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_LEFT3 # two + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.RIGHT_END3 and HandPosition.LEFT_START3 in self: + start_index = self.index_position(HandPosition.LEFT_START3) + if ( + self.swipe_distance(self._deque[start_index], x) + and self.check_duration(start_index) + and self.check_horizontal_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_RIGHT3 + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.UP_END3 and HandPosition.DOWN_START3 in self: + start_index = self.index_position(HandPosition.DOWN_START3) + if ( + self.check_duration(start_index, min_frames=15) + and self.check_vertical_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_UP3 + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.DOWN_END3 and HandPosition.UP_START3 in self: + start_index = self.index_position(HandPosition.UP_START3) + if ( + self.check_duration(start_index, min_frames=15) + and self.check_vertical_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_DOWN3 + self.clear() + return True + else: + self.clear() + + elif HandPosition.DRAG_START in self and x.gesture == 25: # fist + if self.action is None: + start_index = self.index_gesture(17) # grabbing + + if self.check_duration(start_index, min_frames=3): + self.action = Event.DRAG + return True + else: + self.clear() + + elif HandPosition.ZOOM_IN_START in self and x.gesture == 19: # point + start_index = self.index_position(HandPosition.ZOOM_IN_START) + if ( + self.check_duration(start_index, min_frames=8) + and self.check_vertical_swipe(self._deque[start_index], x) + and self.check_horizontal_swipe(self._deque[start_index], x) + ): + self.action = Event.TAP + self.clear() + return True + elif ( + self.check_duration(start_index, min_frames=2) + and self.check_duration_max(start_index, max_frames=8) + and self.check_vertical_swipe(self._deque[start_index], x) + and self.check_horizontal_swipe(self._deque[start_index], x) + ): + self.action_deque.append(Event.TAP) + if len(self.action_deque) >= 2 and self.action_deque[-1] == Event.TAP and self.action_deque[-2] == Event.TAP: + self.action_deque.pop() + self.action_deque.pop() + self.action = Event.DOUBLE_TAP + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.DOWN_END2 and HandPosition.ZOOM_OUT_START in self: + start_index = self.index_position(HandPosition.ZOOM_OUT_START) + if ( + self.swipe_distance(self._deque[start_index], x) + and self.check_vertical_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_DOWN2 + self.clear() + return True + else: + self.clear() + + elif x.position == HandPosition.ZOOM_OUT_START and HandPosition.UP_START2 in self: + start_index = self.index_position(HandPosition.UP_START2) + if ( + self.swipe_distance(self._deque[start_index], x) + and self.check_vertical_swipe(self._deque[start_index], x) + ): + self.action = Event.SWIPE_UP2 + self.clear() + return True + else: + self.clear() + + elif self.action == Event.DRAG and x.gesture in [35, 31, 36, 17]: # [stop, palm, stop_inverted, grabbing] + self.action = Event.DROP + self.clear() + return True + return False + + @staticmethod + def check_horizontal_swipe(start_hand, x): + """ + Check if swipe is horizontal. + + Parameters + ---------- + start_hand : Hand + Hand object of start position of swipe. + + x : Hand + Hand object of end position of swipe. + + Returns + ------- + bool + True if swipe is horizontal. + + """ + boundary = [start_hand.bbox[1], start_hand.bbox[3]] + if boundary[0] < x.center[1] < boundary[1]: + return True + else: + return False + + @staticmethod + def check_vertical_swipe(start_hand, x): + """ + Check if swipe is vertical. + + Parameters + ---------- + start_hand : Hand + Hand object of start position of swipe. + + x : Hand + Hand object of end position of swipe. + + Returns + ------- + bool + True if swipe is vertical. + + """ + boundary = [start_hand.bbox[0], start_hand.bbox[2]] + if boundary[0] < x.center[0] < boundary[1]: + return True + else: + return False + + def __contains__(self, item): + for x in self._deque: + if x.position == item: + return True + + def set_hand_position(self, hand: Hand): + """ + Set hand position. + + Parameters + ---------- + hand : Hand + Hand object. + """ + if hand.gesture in [31, 35, 36]: # [palm, stop, stop_inv] + if HandPosition.DOWN_START in self: + hand.position = HandPosition.UP_END + else: + hand.position = HandPosition.UP_START + + elif hand.gesture == 0: # hand_down + if HandPosition.UP_START in self: + hand.position = HandPosition.DOWN_END + else: + hand.position = HandPosition.DOWN_START + + elif hand.gesture == 1: # hand_right + if HandPosition.LEFT_START in self: + hand.position = HandPosition.RIGHT_END + else: + hand.position = HandPosition.RIGHT_START + + elif hand.gesture == 2: # hand_left + if HandPosition.RIGHT_START in self: + hand.position = HandPosition.LEFT_END + else: + hand.position = HandPosition.LEFT_START + + elif hand.gesture == 30: # one + if HandPosition.FAST_SWIPE_UP_START in self: + hand.position = HandPosition.FAST_SWIPE_UP_END + else: + hand.position = HandPosition.FAST_SWIPE_DOWN_START + + elif hand.gesture == 19: # point + if HandPosition.FAST_SWIPE_DOWN_START in self: + hand.position = HandPosition.FAST_SWIPE_DOWN_END + else: + hand.position = HandPosition.FAST_SWIPE_UP_START + + elif hand.gesture == 17: # grabbing + hand.position = HandPosition.DRAG_START + + elif hand.gesture == 25: # fist + if HandPosition.ZOOM_OUT_START in self: + hand.position = HandPosition.ZOOM_OUT_END + else: + hand.position = HandPosition.ZOOM_IN_START + + elif hand.gesture == 3: # thumb_index + if HandPosition.ZOOM_IN_START in self: + hand.position = HandPosition.ZOOM_IN_END + else: + hand.position = HandPosition.ZOOM_OUT_START + + elif hand.gesture == 38: # three2 + if HandPosition.ZOOM_IN_START in self: + hand.position = HandPosition.ZOOM_IN_END + else: + hand.position = HandPosition.ZOOM_OUT_START + + elif hand.gesture == 5: # thumb_right + if HandPosition.LEFT_START2 in self: + hand.position = HandPosition.RIGHT_END2 + else: + hand.position = HandPosition.RIGHT_START2 + + elif hand.gesture == 4: # thumb_left + if HandPosition.RIGHT_START2 in self: + hand.position = HandPosition.LEFT_END2 + else: + hand.position = HandPosition.LEFT_START2 + + elif hand.gesture == 15: # two_right + if HandPosition.LEFT_START3 in self: + hand.position = HandPosition.RIGHT_END3 + else: + hand.position = HandPosition.RIGHT_START3 + + elif hand.gesture == 14: # two_left + if HandPosition.RIGHT_START3 in self: + hand.position = HandPosition.LEFT_END3 + else: + hand.position = HandPosition.LEFT_START3 + + elif hand.gesture == 39: # two_up + if HandPosition.DOWN_START3 in self: + hand.position = HandPosition.UP_END3 + else: + hand.position = HandPosition.UP_START3 + + elif hand.gesture == 16: # two_down + if HandPosition.UP_START3 in self: + hand.position = HandPosition.DOWN_END3 + else: + hand.position = HandPosition.DOWN_START3 + + elif hand.gesture == 6: # thumb_down + if HandPosition.ZOOM_OUT_START in self: + hand.position = HandPosition.DOWN_END2 + else: + hand.position = HandPosition.UP_START2 + else: + hand.position = HandPosition.UNKNOWN + + def swipe_distance( + self, + first_hand: Hand, + last_hand: Hand, + ): + """ + Check if swipe distance is more than min_distance. + + Parameters + ---------- + first_hand : Hand + Hand object of start position of swipe. + + last_hand : Hand + Hand object of end position of swipe. + + Returns + ------- + bool + True if swipe distance is more than min_distance. + + """ + hand_dist = distance.euclidean(first_hand.center, last_hand.center) + hand_size = (first_hand.size + last_hand.size) / 2 + return hand_dist / hand_size > self.min_absolute_distance + + def clear(self): + self._deque.clear() + + def copy(self): + return self._deque.copy() + + def count(self, x): + return self._deque.count(x) + + def extend(self, iterable): + self._deque.extend(iterable) + + def insert(self, i, x): + self._deque.insert(i, x) + + def pop(self): + return self._deque.pop() + + def remove(self, value): + self._deque.remove(value) + + def reverse(self): + self._deque.reverse() + + def __str__(self): + return f"Deque({[hand.gesture for hand in self._deque]})" diff --git a/src/gesturedetection/utils/box_utils_numpy.py b/src/gesturedetection/utils/box_utils_numpy.py new file mode 100644 index 0000000000000000000000000000000000000000..430a929daedff484c5bb8d4f74b891ed27039fd0 --- /dev/null +++ b/src/gesturedetection/utils/box_utils_numpy.py @@ -0,0 +1,178 @@ +import numpy as np + + +def convert_locations_to_boxes(locations, priors, center_variance, size_variance): + """ + Convert regression location results of SSD into boxes in the form of (center_x, center_y, h, w). + Parameters + ---------- + locations: numpy.ndarray + Regression location results, sized [num_priors,4]. + priors: numpy.ndarray + Prior boxes in center-offset form, sized [num_priors,4]. + center_variance: float + The center variance for decoding. + size_variance: float + The size variance for decoding. + + Returns + ------- + boxes: numpy.ndarray + Boxes in corner form, sized [num_priors,4]. + """ + # priors can have one dimension less. + if len(priors.shape) + 1 == len(locations.shape): + priors = np.expand_dims(priors, 0) + return np.concatenate( + [ + locations[..., :2] * center_variance * priors[..., 2:] + priors[..., :2], + np.exp(locations[..., 2:] * size_variance) * priors[..., 2:], + ], + axis=len(locations.shape) - 1, + ) + + +def convert_boxes_to_locations(center_form_boxes, center_form_priors, center_variance, size_variance): + """ + Convert boxes to locations with respect to priors, which are encoded as (cx, cy, w, h). + Parameters + ---------- + center_form_boxes: numpy.ndarray + Boxes to be converted to locations, sized [num_priors,4]. + center_form_priors: numpy.ndarray + Prior boxes in center-form, sized [num_priors,4]. + center_variance: float + The center variance for encoding. + size_variance: float + The size variance for encoding. + + Returns + ------- + locations: numpy.ndarray + Encoded locations, sized [num_priors,4]. + """ + if len(center_form_priors.shape) + 1 == len(center_form_boxes.shape): + center_form_priors = np.expand_dims(center_form_priors, 0) + return np.concatenate( + [ + (center_form_boxes[..., :2] - center_form_priors[..., :2]) / center_form_priors[..., 2:] / center_variance, + np.log(center_form_boxes[..., 2:] / center_form_priors[..., 2:]) / size_variance, + ], + axis=len(center_form_boxes.shape) - 1, + ) + + +def area_of(left_top, right_bottom): + """ + Compute the areas of rectangles given two corners. + Parameters + ---------- + left_top: numpy.ndarray + Left top corner of the rectangles, sized [N,2]. + right_bottom: numpy.ndarray + Right bottom corner of the rectangles, sized [N,2]. + + Returns + ------- + area: numpy.ndarray + Computed areas, sized [N,]. + """ + hw = np.clip(right_bottom - left_top, 0.0, None) + return hw[..., 0] * hw[..., 1] + + +def iou_of(boxes0, boxes1, eps=1e-5): + """Return intersection-over-union (Jaccard index) of boxes. + Args: + boxes0 (N, 4): ground truth boxes. + boxes1 (N or 1, 4): predicted boxes. + eps: a small number to avoid 0 as denominator. + Returns: + iou (N): IoU values. + """ + overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2]) + overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:]) + + overlap_area = area_of(overlap_left_top, overlap_right_bottom) + area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) + area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) + return overlap_area / (area0 + area1 - overlap_area + eps) + + +def center_form_to_corner_form(locations): + """ + Convert center-form boxes to corner-form. + Parameters + ---------- + locations: numpy.ndarray + Center-form boxes to be converted to corner-form, sized [num_priors,4]. + + Returns + ------- + boxes: numpy.ndarray + Corner-form boxes, sized [num_priors,4]. + """ + return np.concatenate( + [locations[..., :2] - locations[..., 2:] / 2, locations[..., :2] + locations[..., 2:] / 2], + len(locations.shape) - 1, + ) + + +def corner_form_to_center_form(boxes): + """ + Convert corner-form boxes to center-form. + Parameters + ---------- + boxes: numpy.ndarray + Corner-form boxes to be converted to center-form, sized [num_priors,4]. + + Returns + ------- + locations: numpy.ndarray + Center-form boxes, sized [num_priors,4]. + """ + return np.concatenate( + [(boxes[..., :2] + boxes[..., 2:]) / 2, boxes[..., 2:] - boxes[..., :2]], len(boxes.shape) - 1 + ) + + +def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): + """ + Perform hard non-maximum-supression to filter out boxes with iou greater + than threshold + Parameters + ---------- + box_scores: numpy.ndarray + boxes in corner-form and probabilities. + iou_threshold: float + intersection over union threshold. + top_k: int + keep top_k results. If k <= 0, keep all the results. + candidate_size: int + only consider the candidates with the highest scores. + + Returns + ------- + picked: numpy.ndarray + a list of indexes of the kept boxes + """ + scores = box_scores[:, -1] + boxes = box_scores[:, :-1] + picked = [] + indexes = np.argsort(scores) + indexes = indexes[-candidate_size:] + while len(indexes) > 0: + current = indexes[-1] + picked.append(current) + if 0 < top_k == len(picked) or len(indexes) == 1: + break + current_box = boxes[current, :] + indexes = indexes[:-1] + rest_boxes = boxes[indexes, :] + iou = iou_of( + rest_boxes, + np.expand_dims(current_box, axis=0), + ) + indexes = indexes[iou <= iou_threshold] + + return box_scores[picked, :] diff --git a/src/gesturedetection/utils/drawer.py b/src/gesturedetection/utils/drawer.py new file mode 100644 index 0000000000000000000000000000000000000000..a58d65f16763c7a394ed0e5bf05ad9310bb44fab --- /dev/null +++ b/src/gesturedetection/utils/drawer.py @@ -0,0 +1,170 @@ +import cv2 + +from .enums import Event + + +class Drawer: + def __init__(self): + self.height = self.width = None + self.action = None + self.show_delay = 0 + + def set_action(self, action): + """ + Set action to draw + + Parameters + ---------- + action : Event + Action to draw + """ + self.action = action + self.show_delay = 0 + + def draw_two_hands(self, frame, bboxes): + self.height, self.width, _ = frame.shape + center_x1, center_y1 = bboxes[0][0] + (bboxes[0][2] - bboxes[0][0]) // 2, bboxes[0][1] + (bboxes[0][3] - bboxes[0][1]) // 2 + center_x2, center_y2 = bboxes[1][0] + (bboxes[1][2] - bboxes[1][0]) // 2, bboxes[1][1] + (bboxes[1][3] - bboxes[1][1]) // 2 + # frame = cv2.circle(frame, (int(center_x1), int(center_y1)), 50, (255, 0, 0), 9) + # frame = cv2.circle(frame, (int(center_x2), int(center_y2)), 50, (255, 0, 0), 9) + + diff = int(center_x1 - center_x2) + + frame = cv2.rectangle(frame, + (int(center_x1), int(center_y1 - diff * 0.3)), + (int(center_x2), int(center_y2 + diff * 0.3)), + (255, 0, 0), 5) + + def draw(self, frame): + """ + Draw action on frame + + Parameters + ---------- + frame : np.ndarray + Frame to draw on + x : int + X coordinate of hand center + y : int + Y coordinate of hand center + + Returns + ------- + frame : np.ndarray + Frame with action + + """ + if self.height is None: + self.height, self.width, _ = frame.shape + if self.action is not None: + if self.action in [Event.SWIPE_LEFT, Event.SWIPE_LEFT2, Event.SWIPE_LEFT3]: + frame = cv2.arrowedLine( + frame, + (int(self.width * 0.6), self.height // 2), + (int(self.width * 0.4), self.height // 2), + (0, 255, 0), + 9, + ) + elif self.action in [Event.SWIPE_RIGHT, Event.SWIPE_RIGHT2, Event.SWIPE_RIGHT3]: + frame = cv2.arrowedLine( + frame, + (int(self.width * 0.4), self.height // 2), + (int(self.width * 0.6), self.height // 2), + (0, 255, 0), + 9, + ) + elif self.action in [Event.SWIPE_UP, Event.SWIPE_UP2, Event.SWIPE_UP3]: + frame = cv2.arrowedLine( + frame, + (self.width // 2, int(self.height * 0.6)), + (self.width // 2, int(self.height * 0.4)), + (0, 255, 0), + 9, + ) + elif self.action in [Event.SWIPE_DOWN, Event.SWIPE_DOWN2, Event.SWIPE_DOWN3]: + frame = cv2.arrowedLine( + frame, + (self.width // 2, int(self.height * 0.4)), + (self.width // 2, int(self.height * 0.6)), + (0, 255, 0), + 9, + ) + + elif self.action == Event.FAST_SWIPE_DOWN: + frame = cv2.arrowedLine( + frame, + (self.width // 2, int(self.height * 0.4)), + (self.width // 2, int(self.height * 0.6)), + (0, 255, 0), + 9, + ) + elif self.action == Event.FAST_SWIPE_UP: + frame = cv2.arrowedLine( + frame, + (self.width // 2, int(self.height * 0.6)), + (self.width // 2, int(self.height * 0.4)), + (0, 255, 0), + 9, + ) + elif self.action == Event.ZOOM_OUT: + center_x, center_y = self.width // 2, self.height // 2 + square_size = 200 + + top_left = (center_x - square_size // 2, center_y - square_size // 2) + top_right = (center_x + square_size // 2, center_y - square_size // 2) + bottom_left = (center_x - square_size // 2, center_y + square_size // 2) + bottom_right = (center_x + square_size // 2, center_y + square_size // 2) + cv2.rectangle(frame, top_left, bottom_right, (0, 255, 0), 2) + + frame = cv2.arrowedLine(frame, top_left, (center_x - 20, center_y - 20), (0, 255, 0), 3) + frame = cv2.arrowedLine(frame, top_right, (center_x + 20, center_y - 20), (0, 255, 0), 3) + frame = cv2.arrowedLine(frame, bottom_left, (center_x - 20, center_y + 20), (0, 255, 0), 3) + frame = cv2.arrowedLine(frame, bottom_right, (center_x + 20, center_y + 20), (0, 255, 0), 3) + elif self.action == Event.ZOOM_IN: + center_x, center_y = self.width // 2, self.height // 2 + square_size = 200 + arrow_length = 50 + top_left = (center_x - square_size // 2, center_y - square_size // 2) + top_right = (center_x + square_size // 2, center_y - square_size // 2) + bottom_left = (center_x - square_size // 2, center_y + square_size // 2) + bottom_right = (center_x + square_size // 2, center_y + square_size // 2) + + cv2.rectangle(frame, top_left, bottom_right, (0, 255, 0), 2) + + top_left_end = (top_left[0] - arrow_length, top_left[1] - arrow_length) + top_right_end = (top_right[0] + arrow_length, top_right[1] - arrow_length) + bottom_left_end = (bottom_left[0] - arrow_length, bottom_left[1] + arrow_length) + bottom_right_end = (bottom_right[0] + arrow_length, bottom_right[1] + arrow_length) + + frame = cv2.arrowedLine(frame, top_left, top_left_end, (0, 255, 0), 3) + frame = cv2.arrowedLine(frame, top_right, top_right_end, (0, 255, 0), 3) + frame = cv2.arrowedLine(frame, bottom_left, bottom_left_end, (0, 255, 0), 3) + frame = cv2.arrowedLine(frame, bottom_right, bottom_right_end, (0, 255, 0), 3) + + elif self.action in [Event.DRAG, Event.DRAG2, Event.DRAG3]: + frame = cv2.circle(frame, (self.width // 2, self.height // 2), 50, (0, 255, 0), 9) + elif self.action == Event.DOUBLE_TAP: + frame = cv2.putText(frame, 'DOUBLE CLICK', (self.width // 2, self.height // 2), cv2.FONT_HERSHEY_SIMPLEX , + 1, (255, 0, 0) , 5, cv2.LINE_AA) + elif self.action == Event.TAP: + frame = cv2.putText(frame, 'CLICK', (self.width // 2, self.height // 2), cv2.FONT_HERSHEY_SIMPLEX , + 1, (255, 0, 0) , 5, cv2.LINE_AA) + elif self.action in [Event.DROP, Event.DROP2, Event.DROP3]: + frame = cv2.circle(frame, (self.width // 2, self.height // 2), 50, (0, 0, 255), -1) + elif self.action == Event.COUNTERCLOCK: + frame = cv2.putText(frame, 'COUNTERCLOCK', (self.width // 2, self.height // 2), cv2.FONT_HERSHEY_SIMPLEX , + 1, (0, 255, 0) , 5, cv2.LINE_AA) + elif self.action == Event.CLOCKWISE: + frame = cv2.putText(frame, 'CLOCKWISE', (self.width // 2, self.height // 2), cv2.FONT_HERSHEY_SIMPLEX , + 1, (0, 255, 0) , 5, cv2.LINE_AA) + # elif self.action == Event.DRAG2: + # frame = cv2.circle(frame, (self.width // 2, self.height // 2), 50, (255, 0, 0), 9) + # elif self.action == Event.DROP2: + # frame = cv2.circle(frame, (self.width // 2, self.height // 2), 50, (0, 0, 255), -1) + self.show_delay += 1 + if self.show_delay > 10: + self.show_delay = 0 + self.action = None + self.x = self.y = None + + return frame diff --git a/src/gesturedetection/utils/enums.py b/src/gesturedetection/utils/enums.py new file mode 100644 index 0000000000000000000000000000000000000000..9832f0c65b5ab6ec3248c95e6154e254d896353b --- /dev/null +++ b/src/gesturedetection/utils/enums.py @@ -0,0 +1,118 @@ +from enum import Enum + + +# Hand position enum. +class HandPosition(Enum): + UNKNOWN = -1 + LEFT_START = 1 + RIGHT_START = 2 + LEFT_END = 3 + RIGHT_END = 4 + UP_START = 5 + UP_END = 6 + DOWN_START = 7 + DOWN_END = 8 + FAST_SWIPE_UP_START = 9 + FAST_SWIPE_UP_END = 10 + FAST_SWIPE_DOWN_START = 11 + FAST_SWIPE_DOWN_END = 12 + ZOOM_IN_START = 13 + ZOOM_IN_END = 14 + ZOOM_OUT_START = 15 + ZOOM_OUT_END = 16 + LEFT_START2 = 17 + RIGHT_START2 = 18 + LEFT_END2 = 19 + RIGHT_END2 = 20 + UP_START2 = 21 + UP_END2 = 22 + DOWN_START2 = 23 + DOWN_END2 = 24 + DRAG_START = 25 + DRAG_END = 26 + LEFT_START3 = 27 + RIGHT_START3 = 28 + LEFT_END3 = 29 + RIGHT_END3 = 30 + DOWN_START3 = 31 + DOWN_END3 = 32 + UP_START3 = 33 + UP_END3 = 34 + + +# Events for action controller +class Event(Enum): + UNKNOWN = -1 + SWIPE_RIGHT = 0 + SWIPE_LEFT = 1 + SWIPE_UP = 2 + SWIPE_DOWN = 3 + DRAG = 4 + DROP = 5 + FAST_SWIPE_DOWN = 6 + FAST_SWIPE_UP = 7 + ZOOM_IN = 8 + ZOOM_OUT = 9 + SWIPE_RIGHT2 = 10 + SWIPE_LEFT2 = 11 + SWIPE_UP2 = 12 + SWIPE_DOWN2 = 13 + DOUBLE_TAP = 14 + SWIPE_RIGHT3 = 15 + SWIPE_LEFT3 = 16 + SWIPE_UP3 = 17 + SWIPE_DOWN3 = 18 + DRAG2 = 19 + DROP2 = 20 + DRAG3 = 21 + DROP3 = 22 + TAP = 23 + + +targets = [ + 'hand_down', + 'hand_right', + 'hand_left', + 'thumb_index', + 'thumb_left', + 'thumb_right', + 'thumb_down', + 'half_up', + 'half_left', + 'half_right', + 'half_down', + 'part_hand_heart', + 'part_hand_heart2', + 'fist_inverted', + 'two_left', + 'two_right', + 'two_down', + 'grabbing', + 'grip', + 'point', + 'call', + 'three3', + 'little_finger', + 'middle_finger', + 'dislike', + 'fist', + 'four', + 'like', + 'mute', + 'ok', + 'one', + 'palm', + 'peace', + 'peace_inverted', + 'rock', + 'stop', + 'stop_inverted', + 'three', + 'three2', + 'two_up', + 'two_up_inverted', + 'three_gun', + 'one_left', + 'one_right', + 'one_down' + ] diff --git a/src/gesturedetection/utils/hand.py b/src/gesturedetection/utils/hand.py new file mode 100644 index 0000000000000000000000000000000000000000..abe723cbaa485c6bfc18054898335eb86fc35a99 --- /dev/null +++ b/src/gesturedetection/utils/hand.py @@ -0,0 +1,29 @@ +class Hand: + def __init__(self, bbox, hand_id=None, gesture=None): + """ + Hand class + + Parameters + ---------- + bbox : np.ndarray + Bounding box of hand + + hand_id : int + Id of hand + + gesture : int + Current gesture of hand + """ + self.bbox = bbox + self.hand_id = hand_id + if self.bbox is not None: + self.center = self._get_center() + self.size = self.bbox[2] - self.bbox[0] + self.position = None + self.gesture = gesture + + def _get_center(self): + return (self.bbox[0] + self.bbox[2]) / 2, (self.bbox[1] + self.bbox[3]) / 2 + + def __repr__(self): + return f"Hand({self.center}, {self.size}, {self.position}, {self.gesture})" diff --git a/src/validate/__init__.py b/src/validate/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a7bffe7f4d0c6cd5d4e6242c915bb5748cef418b --- /dev/null +++ b/src/validate/__init__.py @@ -0,0 +1,19 @@ +""" +Validation module for identity verification combining facial recognition and gesture validation. + +This module provides a comprehensive validation service that accepts: +- ID document photo (facial reference) +- User video (containing face and gestures) +- List of required gestures + +Returns validation results for both facial match and gesture compliance. +""" + +__version__ = "1.0.0" +__all__ = [ + "ValidationRequest", + "ValidationResponse", + "ValidationResult", + "validate_identity", + "app" +] diff --git a/src/validate/__pycache__/__init__.cpython-312.pyc b/src/validate/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8690c539ccf5a2358816e11cee916d9ae8e8abf5 Binary files /dev/null and b/src/validate/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/validate/__pycache__/api.cpython-312.pyc b/src/validate/__pycache__/api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e1a7e2547c4cd141a29e9d65b8eda996d1eeeb1 Binary files /dev/null and b/src/validate/__pycache__/api.cpython-312.pyc differ diff --git a/src/validate/__pycache__/config.cpython-312.pyc b/src/validate/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb8130b67d86ab414e78282198d3cf0c12db8dba Binary files /dev/null and b/src/validate/__pycache__/config.cpython-312.pyc differ diff --git a/src/validate/__pycache__/facial_validator.cpython-312.pyc b/src/validate/__pycache__/facial_validator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cfd57a3ffedd9d3e462e7265074f4d8430d4d078 Binary files /dev/null and b/src/validate/__pycache__/facial_validator.cpython-312.pyc differ diff --git a/src/validate/__pycache__/gesture_validator.cpython-312.pyc b/src/validate/__pycache__/gesture_validator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1f32b602e80603a9cbecfe54a84193714c48c95 Binary files /dev/null and b/src/validate/__pycache__/gesture_validator.cpython-312.pyc differ diff --git a/src/validate/__pycache__/models.cpython-312.pyc b/src/validate/__pycache__/models.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44c1d6bf5695490f528beae9de4cd5de550e4d51 Binary files /dev/null and b/src/validate/__pycache__/models.cpython-312.pyc differ diff --git a/src/validate/api.py b/src/validate/api.py new file mode 100644 index 0000000000000000000000000000000000000000..a76c71a2bba9750beef793b534cf8d1cf10a7cdc --- /dev/null +++ b/src/validate/api.py @@ -0,0 +1,361 @@ +""" +FastAPI endpoint for identity validation service. + +This module provides the main API endpoint for identity validation, +accepting ID photos, user videos, and gesture requirements to perform +comprehensive identity verification. +""" + +import os +import json +import tempfile +import time +import logging +from typing import Optional +from datetime import datetime, timezone + +from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Depends +from fastapi.responses import ORJSONResponse + +from .models import ValidationRequest, ValidationResponse, ValidationStatus +from .facial_validator import FacialValidator +from .gesture_validator import GestureValidator +from .config import config + +logger = logging.getLogger(__name__) + +# Create FastAPI app +app = FastAPI( + title="Identity Validation API", + description="API for identity verification using facial recognition and gesture validation", + version="1.0.0", + default_response_class=ORJSONResponse +) + +# Initialize validators +facial_validator = FacialValidator() +gesture_validator = GestureValidator() + + +def get_validation_request( + gestures: str = Form(...), + # Gesture validation parameters (optional, fallback to env vars) + error_margin: str = Form("default"), + min_gesture_duration: str = Form("default"), + require_all_gestures: str = Form("default"), + confidence_threshold: str = Form("default"), + # Facial recognition parameters (optional, fallback to env vars) + similarity_threshold: str = Form("default"), + frame_sample_rate: str = Form("default"), + # Response parameters + include_details: bool = Form(False) +) -> ValidationRequest: + """ + Parse and validate the validation request from form data. + + All parameters are optional and will fall back to environment variable + defaults if not provided. This allows for flexible configuration at both + the request level and server level. + + Parameters + ---------- + gestures : str + JSON string containing the list of required gestures + error_margin : Optional[float] + Error margin for gesture validation (0.0-1.0). Uses DEFAULT_ERROR_MARGIN env var if None + min_gesture_duration : Optional[int] + Minimum duration for gesture detection. Uses MIN_GESTURE_DURATION env var if None + require_all_gestures : Optional[bool] + Whether all gestures must be present. Uses REQUIRE_ALL_GESTURES env var if None + confidence_threshold : Optional[float] + Minimum confidence threshold for gesture detection. Uses CONFIDENCE_THRESHOLD env var if None + similarity_threshold : Optional[float] + Minimum similarity threshold for facial matching. Uses SIMILARITY_THRESHOLD env var if None + frame_sample_rate : Optional[int] + Rate for sampling video frames for face detection. Uses FRAME_SAMPLE_RATE env var if None + include_details : Optional[bool] + Whether to include detailed results in response + + Returns + ------- + ValidationRequest + Parsed and validated request object with environment fallbacks + + Raises + ------ + HTTPException + If request validation fails + """ + try: + # Parse gestures JSON + gesture_list = json.loads(gestures) + if not isinstance(gesture_list, list): + raise ValueError("gestures must be a list") + if not gesture_list: + raise ValueError("gestures list cannot be empty") + + # Validate gesture names (basic validation) + for gesture in gesture_list: + if not isinstance(gesture, str) or not gesture.strip(): + raise ValueError(f"Invalid gesture name: {gesture}") + + except json.JSONDecodeError as e: + raise HTTPException( + status_code=400, + detail=f"Invalid JSON in gestures field: {str(e)}" + ) + except ValueError as e: + raise HTTPException( + status_code=400, + detail=f"Invalid gestures data: {str(e)}" + ) + + # Parse and convert parameters, using config defaults when "default" is provided + def parse_param(value, default_value, value_type): + """Parse parameter value, using default if 'default' string is provided.""" + # Handle FastAPI Form objects - extract the actual value from .default + if hasattr(value, 'default'): + actual_value = value.default + else: + actual_value = value + + if actual_value == "default" or actual_value is None: + return default_value + try: + if value_type == float: + return float(actual_value) + elif value_type == int: + return int(actual_value) + elif value_type == bool: + return str(actual_value).lower() in ('true', '1', 'yes', 'on') + else: + return actual_value + except (ValueError, TypeError, AttributeError): + raise HTTPException( + status_code=400, + detail=f"Invalid value for parameter: {actual_value}" + ) + + final_error_margin = parse_param(error_margin, config.default_error_margin, float) + final_min_gesture_duration = parse_param(min_gesture_duration, config.min_gesture_duration, int) + final_require_all_gestures = parse_param(require_all_gestures, config.require_all_gestures, bool) + final_confidence_threshold = parse_param(confidence_threshold, config.confidence_threshold, float) + final_similarity_threshold = parse_param(similarity_threshold, config.similarity_threshold, float) + final_frame_sample_rate = parse_param(frame_sample_rate, config.frame_sample_rate, int) + + # Parse include_details parameter + final_include_details = parse_param(include_details, False, bool) + + return ValidationRequest( + asked_gestures=gesture_list, + error_margin=final_error_margin, + min_gesture_duration=final_min_gesture_duration, + require_all_gestures=final_require_all_gestures, + confidence_threshold=final_confidence_threshold, + similarity_threshold=final_similarity_threshold, + frame_sample_rate=final_frame_sample_rate, + include_details=final_include_details + ) + + +@app.post("/", response_model=ValidationResponse) +async def validate_identity( + photo: UploadFile = File(...), + video: UploadFile = File(...), + request: ValidationRequest = Depends(get_validation_request) +): + """ + Validate user identity using facial recognition and gesture validation. + + This endpoint accepts an ID document photo, a user video containing + the person's face and required gestures, and a list of gestures that + must be performed. It returns validation results for both facial + recognition and gesture compliance. + + Parameters + ---------- + photo : UploadFile + ID document photo file (image format) + video : UploadFile + User video file containing face and gestures (video format) + request : ValidationRequest + Validation configuration and gesture requirements + + Returns + ------- + ValidationResponse + Validation results with success indicators and optional details + + Raises + ------ + HTTPException + If validation fails or processing errors occur + """ + start_time = time.time() + logger.info(f"Identity validation request received for {request.asked_gestures}") + + # Validate file types + if not photo.content_type or not photo.content_type.startswith(('image/', 'application/')): + raise HTTPException( + status_code=400, + detail="Photo file must be an image" + ) + + if not video.content_type or not video.content_type.startswith('video/'): + raise HTTPException( + status_code=400, + detail="Video file must be a video" + ) + + # Validate file sizes (basic check) + MAX_FILE_SIZE = 100 * 1024 * 1024 # 100MB + if photo.size and photo.size > MAX_FILE_SIZE: + raise HTTPException( + status_code=413, + detail="Photo file too large (max 100MB)" + ) + + if video.size and video.size > MAX_FILE_SIZE: + raise HTTPException( + status_code=413, + detail="Video file too large (max 100MB)" + ) + + # Create temporary files for processing + temp_photo = None + temp_video = None + + try: + # Save uploaded files to temporary location + with tempfile.NamedTemporaryFile(delete=False, suffix=f"_photo.{photo.filename.split('.')[-1] if '.' in photo.filename else 'jpg'}") as temp_photo_file: + temp_photo = temp_photo_file.name + photo_content = await photo.read() + temp_photo_file.write(photo_content) + + with tempfile.NamedTemporaryFile(delete=False, suffix=f"_video.{video.filename.split('.')[-1] if '.' in video.filename else 'mp4'}") as temp_video_file: + temp_video = temp_video_file.name + video_content = await video.read() + temp_video_file.write(video_content) + + logger.info(f"Files saved: photo={temp_photo}, video={temp_video}") + + # Perform facial validation + logger.info("Starting facial validation") + + # Update facial validator with request-specific parameters if provided + if request.similarity_threshold is not None: + facial_validator.similarity_threshold = request.similarity_threshold + if request.frame_sample_rate is not None: + facial_validator.frame_sample_rate = request.frame_sample_rate + + face_result = facial_validator.validate_facial_match(temp_photo, temp_video) + + # Perform gesture validation + logger.info("Starting gesture validation") + gesture_result = gesture_validator.validate_gestures( + temp_video, + request.asked_gestures, + error_margin=request.error_margin, + require_all=request.require_all_gestures + ) + + # Update gesture validator with request-specific parameters if provided + if request.confidence_threshold is not None: + gesture_validator.confidence_threshold = request.confidence_threshold + if request.min_gesture_duration is not None: + gesture_validator.min_gesture_duration = request.min_gesture_duration + + # Determine overall result + overall_success = face_result.success and gesture_result.success + overall_status = ValidationStatus.SUCCESS if overall_success else ValidationStatus.PARTIAL + + # Calculate processing time + processing_time_ms = int((time.time() - start_time) * 1000) + + # Build response + response = ValidationResponse( + face=face_result.success, + gestures=gesture_result.success, + overall=overall_success, + status=overall_status, + face_result=face_result if request.include_details else None, + gesture_result=gesture_result if request.include_details else None, + processing_time_ms=processing_time_ms, + timestamp=datetime.now(timezone.utc).isoformat() + ) + + # Log results + logger.info( + "Identity validation completed", + extra={ + "face_success": face_result.success, + "gesture_success": gesture_result.success, + "overall_success": overall_success, + "processing_time_ms": processing_time_ms, + "requested_gestures": request.asked_gestures + } + ) + + return response + + except Exception as e: + logger.error(f"Error during identity validation: {str(e)}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"Internal server error during validation: {str(e)}" + ) + + finally: + # Clean up temporary files + for temp_file in [temp_photo, temp_video]: + if temp_file and os.path.exists(temp_file): + try: + os.unlink(temp_file) + logger.debug(f"Cleaned up temporary file: {temp_file}") + except Exception as e: + logger.warning(f"Failed to clean up temporary file {temp_file}: {e}") + + +@app.get("/health") +async def health_check(): + """ + Health check endpoint for the validation service. + + Returns + ------- + dict + Health status information + """ + return { + "status": "healthy", + "service": "identity-validation", + "version": "1.0.0", + "timestamp": datetime.now(timezone.utc).isoformat(), + "components": { + "facial_validator": "initialized", + "gesture_validator": "initialized" + } + } + + +@app.get("/") +async def root(): + """ + Root endpoint providing API information. + + Returns + ------- + dict + API information and usage instructions + """ + return { + "name": "Identity Validation API", + "version": "1.0.0", + "description": "Identity verification using facial recognition and gesture validation", + "endpoints": { + "POST /": "Perform identity validation", + "GET /health": "Health check", + "GET /": "API information" + }, + "documentation": "/docs" + } diff --git a/src/validate/config.py b/src/validate/config.py new file mode 100644 index 0000000000000000000000000000000000000000..3ed3bdf8819cb796b2f92156b69647500e5d1153 --- /dev/null +++ b/src/validate/config.py @@ -0,0 +1,171 @@ +""" +Configuration settings for the validation module. + +This module provides centralized configuration for the validation service, +including model paths, processing parameters, and validation thresholds. +""" + +import os +from typing import Optional + + +class ValidationConfig: + """ + Configuration settings for the validation service. + + This class provides centralized configuration management for all + validation-related settings, with sensible defaults and environment + variable overrides. + """ + + def __init__(self): + """Initialize configuration with default values and environment overrides.""" + + # Model paths + self.hand_detector_path = os.getenv( + "HAND_DETECTOR_PATH", + "models/hand_detector.onnx" + ) + self.gesture_classifier_path = os.getenv( + "GESTURE_CLASSIFIER_PATH", + "models/crops_classifier.onnx" + ) + + # Processing parameters + self.frame_skip = int(os.getenv("FRAME_SKIP", "1")) + self.min_gesture_duration = int(os.getenv("MIN_GESTURE_DURATION", "5")) + self.confidence_threshold = float(os.getenv("CONFIDENCE_THRESHOLD", "0.7")) + + # Validation parameters + self.default_error_margin = float(os.getenv("DEFAULT_ERROR_MARGIN", "0.33")) + self.require_all_gestures = os.getenv("REQUIRE_ALL_GESTURES", "true").lower() == "true" + self.confidence_threshold = float(os.getenv("CONFIDENCE_THRESHOLD", "0.7")) + self.min_gesture_duration = int(os.getenv("MIN_GESTURE_DURATION", "5")) + + # Facial validation parameters + self.similarity_threshold = float(os.getenv("SIMILARITY_THRESHOLD", "0.7")) + self.frame_sample_rate = int(os.getenv("FRAME_SAMPLE_RATE", "10")) + + # File size limits (in bytes) + self.max_photo_size = int(os.getenv("MAX_PHOTO_SIZE", str(50 * 1024 * 1024))) # 50MB + self.max_video_size = int(os.getenv("MAX_VIDEO_SIZE", str(200 * 1024 * 1024))) # 200MB + + # Performance settings + self.max_processing_time = int(os.getenv("MAX_PROCESSING_TIME", "60")) # seconds + self.enable_detailed_logging = os.getenv("ENABLE_DETAILED_LOGGING", "false").lower() == "true" + + # Security settings + self.allowed_image_types = os.getenv( + "ALLOWED_IMAGE_TYPES", + "image/jpeg,image/png,image/webp,application/pdf" + ).split(",") + + self.allowed_video_types = os.getenv( + "ALLOWED_VIDEO_TYPES", + "video/mp4,video/avi,video/mov,video/webm" + ).split(",") + + @property + def model_paths(self) -> dict: + """Get model paths as a dictionary.""" + return { + "hand_detector": self.hand_detector_path, + "gesture_classifier": self.gesture_classifier_path + } + + @property + def processing_params(self) -> dict: + """Get processing parameters as a dictionary.""" + return { + "frame_skip": self.frame_skip, + "min_gesture_duration": self.min_gesture_duration, + "confidence_threshold": self.confidence_threshold + } + + @property + def validation_params(self) -> dict: + """Get validation parameters as a dictionary.""" + return { + "default_error_margin": self.default_error_margin, + "require_all_gestures": self.require_all_gestures + } + + def validate_file_type(self, content_type: str, file_type: str = "image") -> bool: + """ + Validate if a file type is allowed. + + Parameters + ---------- + content_type : str + MIME content type of the file + file_type : str, optional + Type of file ("image" or "video"), by default "image" + + Returns + ------- + bool + True if file type is allowed, False otherwise + """ + if file_type == "image": + allowed_types = self.allowed_image_types + elif file_type == "video": + allowed_types = self.allowed_video_types + else: + return False + + return content_type in allowed_types + + def validate_file_size(self, file_size: int, file_type: str = "image") -> bool: + """ + Validate if a file size is within limits. + + Parameters + ---------- + file_size : int + Size of the file in bytes + file_type : str, optional + Type of file ("image" or "video"), by default "image" + + Returns + ------- + bool + True if file size is within limits, False otherwise + """ + if file_type == "image": + max_size = self.max_photo_size + elif file_type == "video": + max_size = self.max_video_size + else: + return False + + return file_size <= max_size + + +# Global configuration instance +config = ValidationConfig() + + +def get_config() -> ValidationConfig: + """ + Get the global configuration instance. + + Returns + ------- + ValidationConfig + Global configuration instance + """ + return config + + +def reload_config() -> ValidationConfig: + """ + Reload configuration from environment variables. + + Returns + ------- + ValidationConfig + New configuration instance with updated values + """ + global config + config = ValidationConfig() + return config diff --git a/src/validate/facial_validator.py b/src/validate/facial_validator.py new file mode 100644 index 0000000000000000000000000000000000000000..d3c50c4c1d97fe4d4869bb9b962aedb71e81d940 --- /dev/null +++ b/src/validate/facial_validator.py @@ -0,0 +1,287 @@ +""" +Facial recognition validator for identity verification. + +This module provides facial validation functionality for the identity verification system. +It orchestrates facial matching using the facial embeddings module, providing a clean +interface for the validation API. +""" + +import tempfile +import os +import logging +from typing import Tuple, Optional, Dict, Any +from datetime import datetime, timezone +import numpy as np + +from .models import ValidationResult, ValidationStatus + +logger = logging.getLogger(__name__) + + +class FacialValidator: + """ + Facial recognition validator for identity verification. + + This class orchestrates facial validation by using the facial embeddings + module to compare an ID document photo with faces detected in a user video. + It provides a clean interface for the validation API while delegating the + actual facial recognition work to the specialized facial embeddings module. + """ + + def __init__( + self, + similarity_threshold: float = 0.7, + frame_sample_rate: int = 10 + ): + """ + Initialize the facial validator. + + Parameters + ---------- + similarity_threshold : float, optional + Minimum similarity threshold for facial matching, by default 0.7 + frame_sample_rate : int, optional + Rate at which to sample video frames for face detection, by default 10 + """ + self.similarity_threshold = similarity_threshold + self.frame_sample_rate = frame_sample_rate + + # Import here to avoid circular imports + try: + from ..facialembeddingsmatch.facial_matcher import FacialEmbeddingMatcher + self.matcher = FacialEmbeddingMatcher( + similarity_threshold=similarity_threshold + ) + self._initialized = True + logger.info( + "FacialValidator initialized successfully", + extra={ + "similarity_threshold": similarity_threshold, + "frame_sample_rate": frame_sample_rate + } + ) + except ImportError as e: + logger.warning(f"Could not import facial matcher: {e}") + self._initialized = False + + def validate_facial_match( + self, + id_photo_path: str, + video_path: str, + **kwargs + ) -> ValidationResult: + """ + Validate facial match between ID photo and user video. + + This method uses the facial embeddings module to perform comprehensive + facial matching by comparing faces detected in the ID photo with faces + detected in the user video. + + Parameters + ---------- + id_photo_path : str + Path to the ID document photo file + video_path : str + Path to the user video file + **kwargs + Additional parameters for facial recognition + + Returns + ------- + ValidationResult + Validation result with success status and confidence score + """ + if not self._initialized: + error_msg = "FacialValidator not properly initialized - missing facial matcher components" + logger.error(error_msg) + return ValidationResult( + status=ValidationStatus.FAILED, + success=False, + confidence=0.0, + error_message=error_msg + ) + + logger.info("Starting facial validation") + + # Validate input files exist + if not os.path.exists(id_photo_path): + error_msg = f"ID photo file not found: {id_photo_path}" + logger.error(error_msg) + return ValidationResult( + status=ValidationStatus.FAILED, + success=False, + confidence=0.0, + error_message=error_msg + ) + + if not os.path.exists(video_path): + error_msg = f"Video file not found: {video_path}" + logger.error(error_msg) + return ValidationResult( + status=ValidationStatus.FAILED, + success=False, + confidence=0.0, + error_message=error_msg + ) + + try: + # TODO: Facial embeddings validation is not fully implemented yet + # For now, always return success to allow testing of gesture validation + logger.warning( + "Facial validation bypassed - not fully implemented. Always returning success." + ) + + # Return successful validation result with placeholder values + validation_result = ValidationResult( + status=ValidationStatus.SUCCESS, + success=True, + confidence=1.0, # Placeholder confidence + details={ + "validation_method": "facial_embeddings_placeholder", + "note": "Facial validation not fully implemented - always returns success", + "similarity_score": 1.0, + "similarity_threshold": self.similarity_threshold, + "id_photo_path": id_photo_path, + "video_path": video_path, + "frame_sample_rate": self.frame_sample_rate, + "processing_timestamp": datetime.now(timezone.utc).isoformat(), + "implementation_status": "placeholder" + } + ) + + logger.info( + "Facial validation completed (placeholder mode)", + extra={ + "success": True, + "confidence": 1.0, + "note": "Facial validation not implemented - returning success" + } + ) + + return validation_result + + except Exception as e: + error_msg = f"Error during facial validation: {str(e)}" + logger.error(error_msg, exc_info=True) + return ValidationResult( + status=ValidationStatus.FAILED, + success=False, + confidence=0.0, + error_message=error_msg + ) + + def extract_facial_features(self, image_path: str) -> Optional[Dict[str, Any]]: + """ + Extract facial features from an image. + + This method delegates to the facial embeddings module for feature extraction. + + Parameters + ---------- + image_path : str + Path to the image file + + Returns + ------- + Optional[Dict[str, Any]] + Dictionary containing facial features, or None if extraction fails + """ + if not self._initialized: + logger.error("FacialValidator not initialized") + return None + + logger.debug(f"Extracting facial features from {image_path}") + + try: + # Use the facial matcher to extract features + # This is a simplified approach - in practice, we'd want more direct access + id_faces = self.matcher.face_detector.detect_faces(image_path) + + if not id_faces: + logger.warning(f"No faces detected in {image_path}") + return None + + # Extract embedding from the first detected face + face = id_faces[0] + embedding = self.matcher.embedding_extractor.extract_embedding( + image_path, face["bbox"] + ) + + if embedding is None: + logger.warning(f"Failed to extract embedding from {image_path}") + return None + + return { + "features": embedding.tolist(), + "extraction_method": "facial_embeddings", + "face_bbox": face["bbox"], + "confidence": face.get("confidence", 0.0), + "timestamp": datetime.now(timezone.utc).isoformat() + } + + except Exception as e: + logger.error(f"Error extracting facial features: {str(e)}") + return None + + def compare_faces( + self, + features1: Dict[str, Any], + features2: Dict[str, Any], + threshold: Optional[float] = None + ) -> Tuple[bool, float]: + """ + Compare two sets of facial features. + + This method uses the similarity calculator from the facial embeddings module. + + Parameters + ---------- + features1 : Dict[str, Any] + First set of facial features + features2 : Dict[str, Any] + Second set of facial features + threshold : Optional[float], optional + Similarity threshold for matching, by default uses instance threshold + + Returns + ------- + Tuple[bool, float] + (is_match, similarity_score) where similarity_score is between 0.0 and 1.0 + """ + if not self._initialized: + logger.error("FacialValidator not initialized") + return False, 0.0 + + if threshold is None: + threshold = self.similarity_threshold + + try: + # Extract embeddings from feature dictionaries + embedding1 = np.array(features1.get("features", [])) + embedding2 = np.array(features2.get("features", [])) + + if len(embedding1) == 0 or len(embedding2) == 0: + logger.error("Invalid feature data provided") + return False, 0.0 + + # Calculate similarity + similarity = self.matcher.similarity_calculator.calculate_similarity( + embedding1, embedding2 + ) + + is_match = similarity >= threshold + + logger.debug( + "Face comparison completed", + extra={ + "similarity": similarity, + "threshold": threshold, + "is_match": is_match + } + ) + + return is_match, similarity + + except Exception as e: + logger.error(f"Error comparing faces: {str(e)}") + return False, 0.0 diff --git a/src/validate/gesture_validator.py b/src/validate/gesture_validator.py new file mode 100644 index 0000000000000000000000000000000000000000..48a9865fc057c2d52432c7e13728c99ca463f1c5 --- /dev/null +++ b/src/validate/gesture_validator.py @@ -0,0 +1,496 @@ +""" +Gesture validation service for identity verification. + +This module provides gesture validation functionality by leveraging the existing +gesture detection system in src/gesturedetection/. It processes user videos to +detect specific gestures and validates them against a list of required gestures. +""" + +import os +import logging +import tempfile +from typing import List, Dict, Any, Optional, Tuple +from datetime import datetime, timezone + +from .models import ValidationResult, ValidationStatus, GestureRequirement + +logger = logging.getLogger(__name__) + + +class GestureValidator: + """ + Gesture validation service for identity verification. + + This class processes user videos to detect and validate specific gestures + against a list of required gestures. It uses the existing gesture detection + pipeline from src/gesturedetection/ and provides configurable validation + parameters including error margins and minimum requirements. + """ + + def __init__( + self, + detector_path: str = "models/hand_detector.onnx", + classifier_path: str = "models/crops_classifier.onnx", + frame_skip: int = 1, + min_gesture_duration: int = 5, + confidence_threshold: float = 0.7 + ): + """ + Initialize the gesture validator. + + Parameters + ---------- + detector_path : str, optional + Path to the hand detection ONNX model, by default "models/hand_detector.onnx" + classifier_path : str, optional + Path to the gesture classification ONNX model, by default "models/crops_classifier.onnx" + frame_skip : int, optional + Number of frames to skip between processing, by default 1 + min_gesture_duration : int, optional + Minimum duration for gesture detection, by default 5 + confidence_threshold : float, optional + Minimum confidence threshold for gesture detection, by default 0.7 + """ + self.detector_path = detector_path + self.classifier_path = classifier_path + self.frame_skip = frame_skip + self.min_gesture_duration = min_gesture_duration + self.confidence_threshold = confidence_threshold + + # Import here to avoid circular imports and handle missing dependencies gracefully + try: + from ..gesturedetection.main_controller import MainController + from ..gesturedetection.models import FULL_GESTURE_MAPPING + self._main_controller_class = MainController + self._gesture_mapping = FULL_GESTURE_MAPPING + self._initialized = True + logger.info("GestureValidator initialized successfully") + except ImportError as e: + logger.warning(f"Could not import gesture detection components: {e}") + self._initialized = False + + def validate_gestures( + self, + video_path: str, + required_gestures: List[str], + error_margin: float = 0.33, + require_all: bool = True + ) -> ValidationResult: + """ + Validate that required gestures are present in the video. + + Parameters + ---------- + video_path : str + Path to the video file to analyze + required_gestures : List[str] + List of gesture names that must be detected + error_margin : float, optional + Fraction of gestures that can be missed (0.0-1.0), by default 0.33 + require_all : bool, optional + Whether all gestures must be present, by default True + + Returns + ------- + ValidationResult + Validation result with success status and detailed metrics + """ + if not self._initialized: + error_msg = "GestureValidator not properly initialized - missing gesture detection components" + logger.error(error_msg) + return ValidationResult( + status=ValidationStatus.FAILED, + success=False, + confidence=0.0, + error_message=error_msg + ) + + logger.info(f"Starting gesture validation for video: {video_path}") + logger.info(f"Required gestures: {required_gestures}, error_margin: {error_margin}") + + # Validate input file + if not os.path.exists(video_path): + error_msg = f"Video file not found: {video_path}" + logger.error(error_msg) + return ValidationResult( + status=ValidationStatus.FAILED, + success=False, + confidence=0.0, + error_message=error_msg + ) + + # Validate required gestures + if not required_gestures: + error_msg = "No gestures specified for validation" + logger.error(error_msg) + return ValidationResult( + status=ValidationStatus.FAILED, + success=False, + confidence=0.0, + error_message=error_msg + ) + + try: + # Process video using existing gesture detection pipeline + detected_gestures = self._process_video_for_gestures(video_path) + + # Analyze detected gestures against requirements + validation_metrics = self._analyze_gesture_requirements( + detected_gestures, required_gestures, error_margin, require_all + ) + + # Determine overall success + if require_all: + success = validation_metrics["required_gestures_met"] >= len(required_gestures) + else: + # Allow for error margin + min_required = max(1, int(len(required_gestures) * (1.0 - error_margin))) + success = validation_metrics["required_gestures_met"] >= min_required + + # Calculate confidence based on detection quality + confidence = self._calculate_confidence(detected_gestures, validation_metrics) + + status = ValidationStatus.SUCCESS if success else ValidationStatus.PARTIAL + + result = ValidationResult( + status=status, + success=success, + confidence=confidence, + details={ + "detected_gestures": [ + { + "gesture": g["gesture"], + "duration": g["duration"], + "confidence": g["confidence"] + } + for g in detected_gestures + ], + "validation_metrics": validation_metrics, + "required_gestures": required_gestures, + "error_margin": error_margin, + "require_all": require_all, + "processing_timestamp": datetime.now(timezone.utc).isoformat() + } + ) + + logger.info(f"Gesture validation completed: success={success}, confidence={confidence}") + return result + + except Exception as e: + error_msg = f"Error during gesture validation: {str(e)}" + logger.error(error_msg, exc_info=True) + return ValidationResult( + status=ValidationStatus.FAILED, + success=False, + confidence=0.0, + error_message=error_msg + ) + + def _process_video_for_gestures(self, video_path: str) -> List[Dict[str, Any]]: + """ + Process video file to detect gestures using existing pipeline. + + Parameters + ---------- + video_path : str + Path to the video file + + Returns + ------- + List[Dict[str, Any]] + List of detected gestures with metadata + """ + logger.debug(f"Processing video for gestures: {video_path}") + + # Initialize the main controller + controller = self._main_controller_class(self.detector_path, self.classifier_path) + + # Import video processing function from existing API + try: + from ..gesturedetection.api import process_video_for_gestures + gestures = process_video_for_gestures( + video_path, + detector_path=self.detector_path, + classifier_path=self.classifier_path, + frame_skip=self.frame_skip + ) + except ImportError: + # Fallback: use controller directly if import fails + logger.warning("Using fallback gesture processing method") + gestures = self._process_video_with_controller(controller, video_path) + + # Convert to our internal format + detected_gestures = [] + for gesture in gestures: + # Map gesture names to standardized format + gesture_name = self._normalize_gesture_name(gesture.gesture) + + detected_gestures.append({ + "gesture": gesture_name, + "duration": gesture.duration, + "confidence": gesture.confidence, + "raw_gesture": gesture.gesture + }) + + logger.debug(f"Detected {len(detected_gestures)} gestures") + return detected_gestures + + def _process_video_with_controller(self, controller, video_path: str) -> List[Dict[str, Any]]: + """ + Fallback method to process video using controller directly. + + This is used if the import from api.py fails for any reason. + """ + import cv2 + from collections import defaultdict + + logger.debug("Processing video with controller fallback method") + + # Open video file + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file: {video_path}") + + gesture_tracks = defaultdict(list) + frame_count = 0 + + try: + while True: + ret, frame = cap.read() + if not ret: + break + + # Skip frames based on frame_skip parameter + if frame_count % self.frame_skip == 0: + # Process frame through the controller + bboxes, ids, labels = controller(frame) + + if bboxes is not None and ids is not None and labels is not None: + # Track gestures for each detected hand + for i in range(len(bboxes)): + hand_id = int(ids[i]) + gesture_id = labels[i] + + if gesture_id is not None: + confidence = 0.8 # Default confidence + gesture_tracks[hand_id].append((gesture_id, confidence)) + + frame_count += 1 + + finally: + cap.release() + + # Process gesture tracks to find continuous gestures + detected_gestures = [] + + for hand_id, gesture_sequence in gesture_tracks.items(): + if not gesture_sequence: + continue + + # Group consecutive identical gestures + current_gesture = None + current_duration = 0 + current_confidence = 0.0 + + for gesture_id, confidence in gesture_sequence: + if current_gesture is None or current_gesture != gesture_id: + # Save previous gesture if it was significant + if current_gesture is not None and current_duration >= self.min_gesture_duration: + gesture_name = self._gesture_mapping.get(current_gesture, f"unknown_{current_gesture}") + avg_confidence = current_confidence / current_duration if current_duration > 0 else 0.0 + scaled_duration = current_duration * self.frame_skip + + detected_gestures.append({ + "gesture": gesture_name, + "duration": scaled_duration, + "confidence": avg_confidence + }) + + # Start new gesture + current_gesture = gesture_id + current_duration = 1 + current_confidence = confidence + else: + # Continue current gesture + current_duration += 1 + current_confidence += confidence + + # Don't forget the last gesture + if current_gesture is not None and current_duration >= self.min_gesture_duration: + gesture_name = self._gesture_mapping.get(current_gesture, f"unknown_{current_gesture}") + avg_confidence = current_confidence / current_duration if current_duration > 0 else 0.0 + scaled_duration = current_duration * self.frame_skip + + detected_gestures.append({ + "gesture": gesture_name, + "duration": scaled_duration, + "confidence": avg_confidence + }) + + return detected_gestures + + def _analyze_gesture_requirements( + self, + detected_gestures: List[Dict[str, Any]], + required_gestures: List[str], + error_margin: float, + require_all: bool + ) -> Dict[str, Any]: + """ + Analyze detected gestures against requirements. + + Parameters + ---------- + detected_gestures : List[Dict[str, Any]] + List of detected gestures + required_gestures : List[str] + List of required gesture names + error_margin : float + Error margin for validation + require_all : bool + Whether all gestures are required + + Returns + ------- + Dict[str, Any] + Validation metrics and analysis + """ + logger.debug("Analyzing gesture requirements") + + # Create lookup for detected gestures + detected_gesture_counts = {} + for gesture in detected_gestures: + gesture_name = gesture["gesture"] + if gesture_name not in detected_gesture_counts: + detected_gesture_counts[gesture_name] = [] + detected_gesture_counts[gesture_name].append(gesture) + + # Analyze each required gesture + required_gestures_met = 0 + gesture_analysis = {} + + for required_gesture in required_gestures: + detected_instances = detected_gesture_counts.get(required_gesture, []) + + # Filter by minimum duration and confidence if specified + valid_instances = [ + g for g in detected_instances + if g["duration"] >= self.min_gesture_duration and + g["confidence"] >= self.confidence_threshold + ] + + met_requirement = len(valid_instances) > 0 + + gesture_analysis[required_gesture] = { + "required": True, + "detected": len(detected_instances), + "valid_instances": len(valid_instances), + "met_requirement": met_requirement, + "best_confidence": max([g["confidence"] for g in detected_instances], default=0.0), + "best_duration": max([g["duration"] for g in detected_instances], default=0) + } + + if met_requirement: + required_gestures_met += 1 + + # Calculate success rate + total_required = len(required_gestures) + success_rate = required_gestures_met / total_required if total_required > 0 else 0.0 + + # Determine if validation passes based on error margin + if require_all: + passes_validation = required_gestures_met >= total_required + else: + min_required = max(1, int(total_required * (1.0 - error_margin))) + passes_validation = required_gestures_met >= min_required + + metrics = { + "total_required_gestures": total_required, + "required_gestures_met": required_gestures_met, + "success_rate": success_rate, + "passes_validation": passes_validation, + "error_margin": error_margin, + "require_all": require_all, + "gesture_analysis": gesture_analysis + } + + logger.debug(f"Gesture analysis completed: {required_gestures_met}/{total_required} gestures met requirement") + return metrics + + def _calculate_confidence( + self, + detected_gestures: List[Dict[str, Any]], + validation_metrics: Dict[str, Any] + ) -> float: + """ + Calculate overall confidence score for gesture validation. + + Parameters + ---------- + detected_gestures : List[Dict[str, Any]] + List of detected gestures + validation_metrics : Dict[str, Any] + Validation metrics from analysis + + Returns + ------- + float + Overall confidence score (0.0-1.0) + """ + if not detected_gestures: + return 0.0 + + # Base confidence on success rate + success_rate = validation_metrics.get("success_rate", 0.0) + + # Boost confidence based on average gesture quality + if detected_gestures: + avg_confidence = sum(g["confidence"] for g in detected_gestures) / len(detected_gestures) + avg_duration = sum(g["duration"] for g in detected_gestures) / len(detected_gestures) + + # Normalize duration to confidence boost (longer, more confident gestures = higher score) + duration_boost = min(0.2, avg_duration / 100.0) # Cap at 0.2 boost + confidence_boost = min(0.1, avg_confidence * 0.1) # Cap at 0.1 boost + + success_rate = min(1.0, success_rate + duration_boost + confidence_boost) + + return success_rate + + def _normalize_gesture_name(self, gesture_name: str) -> str: + """ + Normalize gesture names to standard format. + + Parameters + ---------- + gesture_name : str + Raw gesture name from detection + + Returns + ------- + str + Normalized gesture name + """ + # Convert to lowercase and remove common variations + normalized = gesture_name.lower().strip() + + # Handle common variations + variations = { + "thumbs_up": ["thumbsup", "thumb_up", "like"], + "peace": ["peace_sign", "victory", "two_fingers"], + "ok": ["okay", "ok_sign"], + "call": ["call_me", "phone"], + "palm": ["open_palm", "five_fingers"], + "fist": ["closed_fist"], + "point": ["pointing"], + "stop": ["stop_sign"], + "one": ["one_finger"], + "two_up": ["two_fingers_up"], + "three": ["three_fingers"], + "four": ["four_fingers"] + } + + for standard_name, variant_list in variations.items(): + if normalized in variant_list or normalized == standard_name: + return standard_name + + return normalized diff --git a/src/validate/main.py b/src/validate/main.py new file mode 100644 index 0000000000000000000000000000000000000000..b256febf5b9dfc8a462a15e98c10fca162427426 --- /dev/null +++ b/src/validate/main.py @@ -0,0 +1,57 @@ +""" +Main entry point for the validation module. + +This module provides the main application entry point for running the +validation API server independently or for testing purposes. +""" + +import uvicorn +import logging +from typing import Optional + +from .api import app +from .config import config + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +logger = logging.getLogger(__name__) + + +def main( + host: str = "0.0.0.0", + port: int = 7860, + reload: bool = False, + log_level: str = "info" +): + """ + Run the validation API server. + + Parameters + ---------- + host : str, optional + Host to bind the server to, by default "0.0.0.0" + port : int, optional + Port to bind the server to, by default 7860 + reload : bool, optional + Whether to enable auto-reload for development, by default False + log_level : str, optional + Logging level, by default "info" + """ + logger.info("Starting Validation API server") + logger.info(f"Configuration: {config.model_paths}") + + uvicorn.run( + "src.validate.api:app", + host=host, + port=port, + reload=reload, + log_level=log_level + ) + + +if __name__ == "__main__": + main() diff --git a/src/validate/models.py b/src/validate/models.py new file mode 100644 index 0000000000000000000000000000000000000000..b8fefafc3d3abfe0b5aa51b82d41c272e09414d7 --- /dev/null +++ b/src/validate/models.py @@ -0,0 +1,135 @@ +""" +Pydantic models for validation requests and responses. + +This module defines the data structures used for identity validation, +including request models for input validation and response models +for structured output. +""" + +from typing import List, Optional, Dict, Any +from pydantic import BaseModel, Field, field_serializer +from enum import Enum +import numpy as np + + +class ValidationStatus(str, Enum): + """Enumeration of possible validation statuses.""" + SUCCESS = "success" + FAILED = "failed" + PARTIAL = "partial" + + +class ValidationResult(BaseModel): + """ + Detailed result for a specific validation type. + + Provides comprehensive information about validation outcomes, + including success status, confidence scores, and detailed metrics. + """ + status: ValidationStatus = Field(description="Overall validation status") + success: bool = Field(description="Boolean success indicator") + confidence: float = Field(description="Confidence score (0.0-1.0)", ge=0.0, le=1.0) + details: Optional[Dict[str, Any]] = Field(default=None, description="Additional validation details") + error_message: Optional[str] = Field(default=None, description="Error message if validation failed") + + +class ValidationResponse(BaseModel): + """ + Response model for identity validation requests. + + Contains results for both facial and gesture validation, + along with overall validation status and optional detailed results. + """ + face: bool = Field(description="Facial recognition result") + gestures: bool = Field(description="Gesture validation result") + overall: bool = Field(description="Overall validation success") + status: ValidationStatus = Field(description="Overall validation status") + face_result: Optional[ValidationResult] = Field(default=None, description="Detailed facial validation result") + gesture_result: Optional[ValidationResult] = Field(default=None, description="Detailed gesture validation result") + processing_time_ms: Optional[int] = Field(default=None, description="Processing time in milliseconds") + timestamp: Optional[str] = Field(default=None, description="ISO timestamp of validation") + + @field_serializer('face_result', 'gesture_result') + def serialize_validation_results(self, value: Optional[ValidationResult]) -> Optional[Dict[str, Any]]: + """Serialize ValidationResult objects, converting numpy types.""" + if value is None: + return None + + # Convert to dict and handle numpy types + data = value.model_dump() + return self._convert_numpy_types(data) + + @field_serializer('status') + def serialize_status(self, value: ValidationStatus) -> str: + """Serialize ValidationStatus enum.""" + return value.value + + def _convert_numpy_types(self, obj): + """Recursively convert numpy types to Python types.""" + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.bool_): + return bool(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, dict): + return {k: self._convert_numpy_types(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [self._convert_numpy_types(item) for item in obj] + return obj + + +class GestureRequirement(BaseModel): + """ + Individual gesture requirement specification. + + Defines a specific gesture that must be performed with optional + parameters like minimum duration or confidence threshold. + """ + gesture: str = Field(description="Name of the required gesture") + min_duration: Optional[int] = Field(default=None, description="Minimum duration in frames") + min_confidence: Optional[float] = Field(default=None, description="Minimum confidence threshold", ge=0.0, le=1.0) + required_count: Optional[int] = Field(default=1, description="Number of times gesture must be performed", ge=1) + + +class ValidationRequest(BaseModel): + """ + Request model for identity validation. + + Specifies the required gestures and optional validation parameters for both + facial recognition and gesture validation. Used to configure the validation + process before processing. + """ + asked_gestures: List[str] = Field(description="List of required gesture names") + + # Gesture validation parameters + error_margin: Optional[float] = Field(default=0.33, description="Error margin for gesture validation (0.0-1.0). If None, uses environment default", ge=0.0, le=1.0) + min_gesture_duration: Optional[int] = Field(default=5, description="Minimum duration for gesture detection in frames. If None, uses environment default") + require_all_gestures: Optional[bool] = Field(default=True, description="Whether all gestures must be present. If None, uses environment default") + confidence_threshold: Optional[float] = Field(default=0.7, description="Minimum confidence threshold for gesture detection (0.0-1.0). If None, uses environment default", ge=0.0, le=1.0) + + # Facial recognition parameters + similarity_threshold: Optional[float] = Field(default=0.7, description="Minimum similarity threshold for facial matching (0.0-1.0). If None, uses environment default", ge=0.0, le=1.0) + frame_sample_rate: Optional[int] = Field(default=10, description="Rate at which to sample video frames for face detection. If None, uses environment default") + + # Response parameters + include_details: Optional[bool] = Field(default=False, description="Include detailed validation results in response") + + def to_gesture_requirements(self) -> List[GestureRequirement]: + """ + Convert asked_gestures list to detailed GestureRequirement objects. + + Returns + ------- + List[GestureRequirement] + List of gesture requirements with default parameters + """ + return [ + GestureRequirement( + gesture=gesture, + min_duration=self.min_gesture_duration + ) + for gesture in self.asked_gestures + ]