Spaces:
Sleeping
Sleeping
File size: 6,043 Bytes
ae78832 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import cv2
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
import mediapipe as mp
# --- Artifacts -------------------------------------------------------------
# Trained Keras classifier and the dataset CSV whose `gloss` column supplies
# the label names.
modelPath = 'ai_model/words/saved_models/best_sign_classifier_model_40_words_seq90.keras'
csvPath = 'ai_model/words/wlasl_40_words_personal_final_processed_data_augmented_seq90.csv'

# --- Feature layout --------------------------------------------------------
# One frame is a flat vector: pose, left hand, right hand, face.
numPoseCoordsSingle = 33 * 4    # 33 pose landmarks  x (x, y, z, visibility)
numHandCoordsSingle = 21 * 3    # 21 hand landmarks  x (x, y, z), per hand
numFaceCoordsSingle = 468 * 3   # 468 face landmarks x (x, y, z)
expectedCoordsPerFrame = 1662   # 132 + 2 * 63 + 1404

# --- Inference settings ----------------------------------------------------
# NOTE(review): the artifact names above mention "seq90" while only 30 frames
# are fed to the model here - confirm this matches the model's input length.
sequenceLength = 30
# Predictions scoring below this value are reported with an empty word.
confidenceThreshold = 0.7

# --- Heavy objects, created once at import time ----------------------------
model = load_model(modelPath)
df = pd.read_csv(csvPath)
uniqueGlosses = df['gloss'].unique()
# Class index -> gloss, in order of first appearance in the CSV.
# NOTE(review): assumes the classifier was trained with this label order - confirm.
idToGloss = dict(enumerate(uniqueGlosses))

# static_image_mode=True: every image is processed as an independent still.
mpHolistic = mp.solutions.holistic.Holistic(
    static_image_mode=True,
    model_complexity=1,
    min_detection_confidence=0.2,
    min_tracking_confidence=0.5,
)
def normalizeLandmarks(landmarksSequence):
    """Center and scale each body part of each frame independently.

    A frame is a flat vector laid out as pose (x, y, z, visibility per
    landmark), left hand, right hand and face (x, y, z per landmark).  For
    every part that contains data, the x/y/z coordinates are shifted so that
    their mean sits at the origin and are then divided by the largest
    landmark distance from that origin.  Pose visibility values are passed
    through untouched.  Frames or parts that are entirely zero stay zero.

    Args:
        landmarksSequence: A single frame (1-D) or a sequence of frames
            (2-D, one row per frame).  Any numeric array-like is accepted.

    Returns:
        A float32 array of shape (numFrames, expectedCoordsPerFrame).  An
        input with zero frames yields shape (0, expectedCoordsPerFrame).
    """
    landmarksSequence = np.asarray(landmarksSequence)
    if not np.issubdtype(landmarksSequence.dtype, np.floating):
        # The in-place centering below cannot store float results in an
        # integer array, so promote non-float input up front.
        landmarksSequence = landmarksSequence.astype(np.float32)
    if landmarksSequence.ndim == 1:
        landmarksSequence = np.expand_dims(landmarksSequence, axis=0)
    if landmarksSequence.shape[0] == 0:
        # Keep the feature axis so callers can still stack/pad the result.
        return np.zeros((0, expectedCoordsPerFrame), dtype=np.float32)

    leftHandStart = numPoseCoordsSingle
    rightHandStart = leftHandStart + numHandCoordsSingle
    faceStart = rightHandStart + numHandCoordsSingle
    # Bounded on purpose: trailing extra values must not leak into the face.
    faceEnd = faceStart + numFaceCoordsSingle
    # (slice start, slice end, values stored per landmark)
    partLayout = (
        (0, leftHandStart, 4),
        (leftHandStart, rightHandStart, 3),
        (rightHandStart, faceStart, 3),
        (faceStart, faceEnd, 3),
    )

    normalizedFrames = []
    for frameLandmarks in landmarksSequence:
        if np.all(frameLandmarks == 0):
            normalizedFrames.append(np.zeros(expectedCoordsPerFrame, dtype=np.float32))
            continue

        normalizedParts = []
        for start, end, coordsPerLm in partLayout:
            flatLms = frameLandmarks[start:end]
            if np.all(flatLms == 0):
                # Part not detected: emit a zero block of its nominal size.
                normalizedParts.append(np.zeros(end - start, dtype=np.float32))
                continue
            partLms = flatLms.reshape(-1, coordsPerLm).copy()
            # Only x/y/z take part in centering and scaling; a fourth column
            # (pose visibility) is left as it is.
            partLms[:, :3] -= np.mean(partLms[:, :3], axis=0)
            scaleFactor = np.max(np.linalg.norm(partLms[:, :3], axis=1))
            if scaleFactor > 1e-6:
                partLms[:, :3] /= scaleFactor
            normalizedParts.append(partLms.flatten())

        combinedFrame = np.concatenate(normalizedParts).astype(np.float32)
        shortfall = expectedCoordsPerFrame - combinedFrame.size
        if shortfall > 0:
            combinedFrame = np.pad(combinedFrame, (0, shortfall), 'constant')
        normalizedFrames.append(combinedFrame[:expectedCoordsPerFrame])

    return np.array(normalizedFrames, dtype=np.float32)
def padOrTruncateSequence(sequence, targetLength, featureDimension):
    """Force a frame sequence to exactly ``targetLength`` rows.

    Args:
        sequence: 2-D array-like with one row per frame.
        targetLength: Number of rows the result must have.
        featureDimension: Number of columns used for the zero padding rows.

    Returns:
        The first ``targetLength`` rows when the sequence is long enough,
        otherwise the sequence followed by float32 zero rows.  An input with
        no elements at all (whatever its shape) yields an all-zero
        ``(targetLength, featureDimension)`` float32 array.
    """
    sequence = np.asarray(sequence)
    if sequence.size == 0:
        # An empty array may not be 2-D (e.g. np.array([])), which np.vstack
        # cannot combine with the padding rows; the result is pure padding.
        return np.zeros((targetLength, featureDimension), dtype=np.float32)

    frameCount = sequence.shape[0]
    if frameCount >= targetLength:
        return sequence[:targetLength, :]

    padding = np.zeros((targetLength - frameCount, featureDimension), dtype=np.float32)
    return np.vstack((sequence, padding))
def _extractFrameLandmarks(mpResults, idx):
    """Flatten one Holistic result into a fixed-length float32 frame vector.

    Parts are written in the order pose, left hand, right hand, face, each at
    its own fixed offset.  A part that was not detected leaves its slot at
    zero and prints a warning that mentions frame ``idx``.
    """
    frameLms = np.zeros(expectedCoordsPerFrame, dtype=np.float32)
    # (label used in the warning, landmark list or None, slot size, has visibility)
    parts = (
        ("pose", mpResults.pose_landmarks, numPoseCoordsSingle, True),
        ("left hand", mpResults.left_hand_landmarks, numHandCoordsSingle, False),
        ("right hand", mpResults.right_hand_landmarks, numHandCoordsSingle, False),
        ("face", mpResults.face_landmarks, numFaceCoordsSingle, False),
    )
    offset = 0
    for label, landmarks, slotSize, hasVisibility in parts:
        if landmarks:
            if hasVisibility:
                flat = [coord for lm in landmarks.landmark
                        for coord in (lm.x, lm.y, lm.z, lm.visibility)]
            else:
                flat = [coord for lm in landmarks.landmark
                        for coord in (lm.x, lm.y, lm.z)]
            frameLms[offset:offset + len(flat)] = flat
        else:
            print(f"Warning: No {label} landmarks detected in frame {idx}")
        # Advance by the nominal slot size so later parts keep their offsets
        # even when this part is missing.
        offset += slotSize
    return frameLms


def detectFromImageBytes(sequenceBytesList):
    """Classify a signed word from a sequence of encoded images.

    Each entry is decoded with OpenCV, converted to RGB and run through the
    module-level Holistic instance.  The resulting landmark vectors are
    normalized, padded or truncated to ``sequenceLength`` frames and passed
    to the module-level model.

    Args:
        sequenceBytesList: Iterable of encoded image payloads (bytes-like),
            one per frame.  Missing, empty or undecodable entries contribute
            an all-zero frame and print a warning.

    Returns:
        A dict with ``"word"`` (the predicted gloss, or ``""`` when the
        confidence is below ``confidenceThreshold``) and ``"confidence"``
        (the highest model output as a float).  An empty input returns
        ``{"word": "", "confidence": 0.0}`` without calling the model.
    """
    sequence = []
    for idx, imageBytes in enumerate(sequenceBytesList):
        img = None
        # cv2.imdecode raises on an empty buffer instead of returning None,
        # so missing/empty payloads are handled like undecodable images.
        if imageBytes is not None and len(imageBytes) > 0:
            nparr = np.frombuffer(imageBytes, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        if img is None:
            print(f"Warning: Could not decode image bytes at index {idx}")
            sequence.append(np.zeros(expectedCoordsPerFrame, dtype=np.float32))
            continue

        # OpenCV decodes to BGR; the Holistic call is given RGB.
        imgRgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        mpResults = mpHolistic.process(imgRgb)
        sequence.append(_extractFrameLandmarks(mpResults, idx))

    if not sequence:
        return {"word": "", "confidence": 0.0}

    sequence = normalizeLandmarks(np.array(sequence, dtype=np.float32))
    sequence = padOrTruncateSequence(sequence, sequenceLength, expectedCoordsPerFrame)
    # Add the batch axis: (1, sequenceLength, expectedCoordsPerFrame).
    sequence = np.expand_dims(sequence, axis=0)

    preds = model.predict(sequence, verbose=0)
    predictedId = int(np.argmax(preds))
    confidence = float(np.max(preds))
    predictedWord = idToGloss.get(predictedId, "Unknown")

    result = {"word": predictedWord if confidence >= confidenceThreshold else "",
              "confidence": confidence}
    print(f"Prediction result: {result}")
    return result