import cv2
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
import mediapipe as mp

# --- Configuration ---
modelPath = 'ai_model/words/saved_models/best_sign_classifier_model_40_words_seq90.keras'
csvPath = 'ai_model/words/wlasl_40_words_personal_final_processed_data_augmented_seq90.csv'
sequenceLength = 30  # frames per model input (note: the filenames above mention seq90; verify this matches training)
expectedCoordsPerFrame = 1662  # flattened holistic landmarks per frame (see layout breakdown below)
confidenceThreshold = 0.7  # predictions below this confidence return an empty word

model = load_model(modelPath)
df = pd.read_csv(csvPath)
uniqueGlosses = df['gloss'].unique()
idToGloss = {i: g for i, g in enumerate(uniqueGlosses)}
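
# Caution: pandas' unique() preserves first-appearance order, so this
# id -> gloss map is only correct if the CSV rows appear in the same order the
# labels were encoded during training. If the training pipeline saved an
# explicit label map, loading that instead would be safer.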

mpHolistic = mp.solutions.holistic.Holistic(
    static_image_mode=True,
    model_complexity=1,
    min_detection_confidence=0.2,
    min_tracking_confidence=0.5
)
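
# With static_image_mode=True, MediaPipe Holistic runs full detection on every
# frame and min_tracking_confidence is ignored; for video input, setting it to
# False would enable landmark tracking between frames and is usually faster.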

numPoseCoordsSingle = 33*4
numHandCoordsSingle = 21*3
numFaceCoordsSingle = 468*3
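
# Flattened frame layout: pose 33*4 (x, y, z, visibility) = 132, each hand
# 21*3 = 63, face 468*3 = 1404; total 132 + 63 + 63 + 1404 = 1662, matching
# expectedCoordsPerFrame.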

def normalizeLandmarks(landmarksSequence):
    """Normalize each body part of each frame independently: translate to the
    part's centroid, then scale so the farthest landmark lies at distance 1.
    All-zero frames or parts (nothing detected) pass through as zeros."""
    if landmarksSequence.ndim == 1:
        landmarksSequence = np.expand_dims(landmarksSequence, axis=0)
    normalizedSequences = []
    for frameLandmarks in landmarksSequence:
        if np.all(frameLandmarks == 0):
            normalizedSequences.append(np.zeros(expectedCoordsPerFrame, dtype=np.float32))
            continue
        # Slice the flat frame vector into its four parts.
        poseCoordsFlat = frameLandmarks[0 : numPoseCoordsSingle]
        leftHandCoordsFlat = frameLandmarks[numPoseCoordsSingle : numPoseCoordsSingle + numHandCoordsSingle]
        rightHandCoordsFlat = frameLandmarks[numPoseCoordsSingle + numHandCoordsSingle : numPoseCoordsSingle + numHandCoordsSingle*2]
        faceCoordsFlat = frameLandmarks[numPoseCoordsSingle + numHandCoordsSingle*2 : ]
        allPartsData = [
            (poseCoordsFlat, 4, [0.0]*numPoseCoordsSingle),
            (leftHandCoordsFlat, 3, [0.0]*numHandCoordsSingle),
            (rightHandCoordsFlat, 3, [0.0]*numHandCoordsSingle),
            (faceCoordsFlat, 3, [0.0]*numFaceCoordsSingle)
        ]
        normalizedFrameParts = []
        for flatLms, coordsPerLm, template in allPartsData:
            if np.all(flatLms == 0):
                normalizedFrameParts.append(np.array(template, dtype=np.float32))
                continue
            lmsArray = flatLms.reshape(-1, coordsPerLm)
            # Center on the mean of x, y, z only; pose rows carry a 4th
            # visibility value that must not be shifted or scaled.
            coordsForMean = lmsArray[:, :3] if coordsPerLm == 4 else lmsArray
            meanCoords = np.mean(coordsForMean, axis=0)
            translatedLms = lmsArray.copy()
            translatedLms[:, :3] -= meanCoords
            scaleFactor = np.max(np.linalg.norm(translatedLms[:, :3], axis=1))
            if scaleFactor > 1e-6:
                translatedLms[:, :3] /= scaleFactor
            normalizedFrameParts.append(translatedLms.flatten())
        combinedFrame = np.concatenate(normalizedFrameParts).astype(np.float32)
        # Guard against any size drift from the slicing above.
        if len(combinedFrame) < expectedCoordsPerFrame:
            combinedFrame = np.pad(combinedFrame, (0, expectedCoordsPerFrame - len(combinedFrame)), 'constant')
        elif len(combinedFrame) > expectedCoordsPerFrame:
            combinedFrame = combinedFrame[:expectedCoordsPerFrame]
        normalizedSequences.append(combinedFrame)
    return np.array(normalizedSequences, dtype=np.float32)
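
# Illustrative check (assumed usage, not from the original file): a nonzero
# frame keeps its shape, with each part centered and scaled to max radius 1.
#   dummy = np.random.rand(expectedCoordsPerFrame).astype(np.float32)
#   out = normalizeLandmarks(dummy)
#   assert out.shape == (1, expectedCoordsPerFrame)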

def padOrTruncateSequence(sequence, targetLength, featureDimension):
    """Pad with zero frames at the end, or truncate, to exactly targetLength frames."""
    if sequence.shape[0] < targetLength:
        padding = np.zeros((targetLength - sequence.shape[0], featureDimension), dtype=np.float32)
        return np.vstack((sequence, padding))
    return sequence[:targetLength, :]
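
# Post-padding with zero frames assumes the classifier was trained with the
# same convention (e.g. a Masking layer or zero-padded training sequences);
# pre-padding would change what the model sees at the end of the sequence.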

def detectFromImageBytes(sequenceBytesList):
    """Decode a list of encoded frames, extract holistic landmarks per frame,
    and classify the sequence. Returns {"word": str, "confidence": float};
    "word" is empty when confidence falls below confidenceThreshold."""
    sequence = []
    for idx, imageBytes in enumerate(sequenceBytesList):
        nparr = np.frombuffer(imageBytes, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        if img is None:
            print(f"Warning: Could not decode image bytes at index {idx}")
            sequence.append(np.zeros(expectedCoordsPerFrame, dtype=np.float32))
            continue
        # MediaPipe expects RGB; OpenCV decodes to BGR.
        imgRgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        mpResults = mpHolistic.process(imgRgb)
        # Fill the flat frame vector part by part; undetected parts stay zero.
        frameLms = np.zeros(expectedCoordsPerFrame, dtype=np.float32)
        currentIdx = 0
        if mpResults.pose_landmarks:
            poseFlat = [coord for lm in mpResults.pose_landmarks.landmark
                        for coord in (lm.x, lm.y, lm.z, lm.visibility)]
            frameLms[currentIdx:currentIdx + len(poseFlat)] = poseFlat
        else:
            print(f"Warning: No pose landmarks detected in frame {idx}")
        currentIdx += numPoseCoordsSingle
        if mpResults.left_hand_landmarks:
            lhFlat = [coord for lm in mpResults.left_hand_landmarks.landmark
                      for coord in (lm.x, lm.y, lm.z)]
            frameLms[currentIdx:currentIdx + len(lhFlat)] = lhFlat
        else:
            print(f"Warning: No left hand landmarks detected in frame {idx}")
        currentIdx += numHandCoordsSingle
        if mpResults.right_hand_landmarks:
            rhFlat = [coord for lm in mpResults.right_hand_landmarks.landmark
                      for coord in (lm.x, lm.y, lm.z)]
            frameLms[currentIdx:currentIdx + len(rhFlat)] = rhFlat
        else:
            print(f"Warning: No right hand landmarks detected in frame {idx}")
        currentIdx += numHandCoordsSingle
        if mpResults.face_landmarks:
            faceFlat = [coord for lm in mpResults.face_landmarks.landmark
                        for coord in (lm.x, lm.y, lm.z)]
            frameLms[currentIdx:currentIdx + len(faceFlat)] = faceFlat
        else:
            print(f"Warning: No face landmarks detected in frame {idx}")
        sequence.append(frameLms)
    if not sequence:
        return {"word": "", "confidence": 0.0}
    # Normalize per frame, fit to the model's fixed length, add a batch axis.
    sequence = normalizeLandmarks(np.array(sequence, dtype=np.float32))
    sequence = padOrTruncateSequence(sequence, sequenceLength, expectedCoordsPerFrame)
    sequence = np.expand_dims(sequence, axis=0)
    preds = model.predict(sequence, verbose=0)
    predictedId = int(np.argmax(preds))
    confidence = float(np.max(preds))
    predictedWord = idToGloss.get(predictedId, "Unknown")
    result = {"word": predictedWord if confidence >= confidenceThreshold else "",
              "confidence": confidence}
    print(f"Prediction result: {result}")
    return result
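
# Minimal usage sketch (not part of the original file): read frames from a
# video with OpenCV, JPEG-encode them as this API expects, and classify. The
# path 'sample_sign.mp4' is a hypothetical placeholder.
if __name__ == "__main__":
    cap = cv2.VideoCapture('sample_sign.mp4')  # hypothetical input video
    frameBytes = []
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        ok, buf = cv2.imencode('.jpg', frame)  # encode to bytes, like a client upload
        if ok:
            frameBytes.append(buf.tobytes())
    cap.release()
    if frameBytes:
        print(detectFromImageBytes(frameBytes[:sequenceLength]))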