# newHandsupModel / wordsControllerS.py
# mutarisi
# socket things
# ae78832
import cv2
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
import mediapipe as mp
# Path to the trained Keras sign classifier and to the CSV whose 'gloss'
# column defines the label ordering used at training time.
modelPath = 'ai_model/words/saved_models/best_sign_classifier_model_40_words_seq90.keras'
csvPath = 'ai_model/words/wlasl_40_words_personal_final_processed_data_augmented_seq90.csv'
# Number of frames fed to the model per prediction.
# NOTE(review): model/CSV filenames say "seq90" but this is 30 — confirm intended.
sequenceLength = 30
# Flattened landmark features per frame: 33 pose * 4 + 2 * (21 hand * 3) + 468 face * 3 = 1662.
expectedCoordsPerFrame = 1662
# Minimum softmax probability required to report a word (otherwise "" is returned).
confidenceThreshold = 0.7
# Loaded once at import time; module import will fail if these paths are missing.
model = load_model(modelPath)
df = pd.read_csv(csvPath)
# Class-index -> gloss mapping, in first-appearance order of the CSV's 'gloss' column
# (must match the ordering used when the model was trained — TODO confirm).
uniqueGlosses = df['gloss'].unique()
idToGloss = {i: g for i, g in enumerate(uniqueGlosses)}
# Single shared MediaPipe Holistic instance, reused for every incoming frame.
mpHolistic = mp.solutions.holistic.Holistic(
    static_image_mode=True,  # treat each frame independently (no inter-frame tracking)
    model_complexity=1,
    min_detection_confidence=0.2,
    min_tracking_confidence=0.5  # presumably unused with static_image_mode=True — verify
)
# Flattened coordinate counts per body part (pose has x,y,z,visibility; hands/face x,y,z).
numPoseCoordsSingle = 33*4
numHandCoordsSingle = 21*3
numFaceCoordsSingle = 468*3
def normalizeLandmarks(landmarksSequence):
    """Normalize each frame's landmarks part-by-part (pose, both hands, face).

    Each detected part is translated so its xyz centroid sits at the origin,
    then scaled so its farthest landmark lies on the unit sphere; pose
    visibility values are carried through untouched. All-zero frames or parts
    (nothing detected) stay all-zero. Every output frame is padded/truncated
    to exactly ``expectedCoordsPerFrame`` features.

    Args:
        landmarksSequence: np.ndarray of shape (frames, coords) or (coords,)
            with flattened landmark coordinates.

    Returns:
        float32 np.ndarray of shape (frames, expectedCoordsPerFrame).
    """
    if landmarksSequence.ndim == 1:
        landmarksSequence = np.expand_dims(landmarksSequence, axis=0)

    poseEnd = numPoseCoordsSingle
    leftEnd = poseEnd + numHandCoordsSingle
    rightEnd = leftEnd + numHandCoordsSingle

    outputFrames = []
    for frame in landmarksSequence:
        # A completely empty frame (no detections at all) passes through as zeros.
        if not np.any(frame):
            outputFrames.append(np.zeros(expectedCoordsPerFrame, dtype=np.float32))
            continue

        # (segment, coords per landmark, zero-fill length when part is missing)
        segments = (
            (frame[:poseEnd], 4, numPoseCoordsSingle),
            (frame[poseEnd:leftEnd], 3, numHandCoordsSingle),
            (frame[leftEnd:rightEnd], 3, numHandCoordsSingle),
            (frame[rightEnd:], 3, numFaceCoordsSingle),
        )

        pieces = []
        for segment, coordsPerLm, zeroLength in segments:
            if not np.any(segment):
                pieces.append(np.zeros(zeroLength, dtype=np.float32))
                continue
            pts = segment.reshape(-1, coordsPerLm).copy()
            xyz = pts[:, :3]  # view; in-place edits write back into pts
            xyz -= np.mean(xyz, axis=0)
            radius = np.max(np.linalg.norm(xyz, axis=1))
            if radius > 1e-6:  # guard against a degenerate single-point part
                xyz /= radius
            pieces.append(pts.flatten())

        combined = np.concatenate(pieces).astype(np.float32)
        # Keep the feature width fixed even if a segment had an unexpected length.
        if combined.size < expectedCoordsPerFrame:
            combined = np.pad(combined, (0, expectedCoordsPerFrame - combined.size), 'constant')
        elif combined.size > expectedCoordsPerFrame:
            combined = combined[:expectedCoordsPerFrame]
        outputFrames.append(combined)

    return np.array(outputFrames, dtype=np.float32)
def padOrTruncateSequence(sequence, targetLength, featureDimension):
    """Force a (frames, featureDimension) array to exactly targetLength frames.

    Short sequences are zero-padded at the end; long ones are cut after
    targetLength frames. The input array is never modified in place.

    Args:
        sequence: np.ndarray of shape (frames, featureDimension).
        targetLength: desired number of frames.
        featureDimension: width of each frame's feature vector.

    Returns:
        np.ndarray of shape (targetLength, featureDimension).
    """
    frameCount = sequence.shape[0]
    if frameCount >= targetLength:
        return sequence[:targetLength, :]
    filler = np.zeros((targetLength - frameCount, featureDimension), dtype=np.float32)
    return np.vstack((sequence, filler))
def _flattenLandmarkList(landmarkList, includeVisibility):
    """Flatten a MediaPipe landmark list to [x, y, z(, visibility), ...]."""
    if includeVisibility:
        return [c for lm in landmarkList.landmark for c in (lm.x, lm.y, lm.z, lm.visibility)]
    return [c for lm in landmarkList.landmark for c in (lm.x, lm.y, lm.z)]


def detectFromImageBytes(sequenceBytesList):
    """Classify a sign from a sequence of encoded image frames.

    Runs MediaPipe Holistic on each decoded frame, flattens the detected
    landmarks into a fixed-width feature vector, then normalizes, pads to
    ``sequenceLength`` frames, and feeds the batch to the loaded model.

    Args:
        sequenceBytesList: list of encoded image byte strings (e.g. JPEG/PNG),
            one per video frame, in temporal order.

    Returns:
        dict with "word" (predicted gloss, or "" when the input is empty or
        confidence is below confidenceThreshold) and "confidence" (float).
    """
    sequence = []
    for idx, imageBytes in enumerate(sequenceBytesList):
        nparr = np.frombuffer(imageBytes, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        if img is None:
            # Undecodable frame: keep sequence length consistent with a zero frame.
            print(f"Warning: Could not decode image bytes at index {idx}")
            sequence.append(np.zeros(expectedCoordsPerFrame, dtype=np.float32))
            continue
        imgRgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        mpResults = mpHolistic.process(imgRgb)

        frameLms = np.zeros(expectedCoordsPerFrame, dtype=np.float32)
        # (result attribute, include visibility?, flat width, name used in warnings)
        parts = (
            (mpResults.pose_landmarks, True, numPoseCoordsSingle, "pose"),
            (mpResults.left_hand_landmarks, False, numHandCoordsSingle, "left hand"),
            (mpResults.right_hand_landmarks, False, numHandCoordsSingle, "right hand"),
            (mpResults.face_landmarks, False, numFaceCoordsSingle, "face"),
        )
        currentIdx = 0
        for landmarkList, includeVisibility, partWidth, partName in parts:
            if landmarkList:
                flat = _flattenLandmarkList(landmarkList, includeVisibility)
                frameLms[currentIdx:currentIdx + len(flat)] = flat
            else:
                print(f"Warning: No {partName} landmarks detected in frame {idx}")
            # Missing parts stay zero; advance to the next part's slot regardless.
            currentIdx += partWidth
        sequence.append(frameLms)

    if not sequence:
        # No frames at all: nothing to predict.
        return {"word": "", "confidence": 0.0}

    sequence = normalizeLandmarks(np.array(sequence, dtype=np.float32))
    sequence = padOrTruncateSequence(sequence, sequenceLength, expectedCoordsPerFrame)
    sequence = np.expand_dims(sequence, axis=0)  # add batch dimension

    preds = model.predict(sequence, verbose=0)
    predictedId = int(np.argmax(preds))
    confidence = float(np.max(preds))
    predictedWord = idToGloss.get(predictedId, "Unknown")
    # Suppress low-confidence predictions rather than returning a guess.
    result = {"word": predictedWord if confidence >= confidenceThreshold else "",
              "confidence": confidence}
    print(f"Prediction result: {result}")
    return result