Spaces: Sleeping
mutarisi committed
Commit de8ea8e · 1 Parent(s): 0070d5a

add app file

Files changed:
- glossController.py +19 -0
- lettersController.py +141 -0
- requirements.txt +17 -0
- runtime.txt +1 -0
- upload.py +97 -0
- wordsController.py +141 -0
glossController.py
ADDED
@@ -0,0 +1,19 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

def translateGloss(gloss: str, model_id: str = "rrrr66254/Glossa-BART") -> str:
    # Load the gloss-to-text seq2seq model; move it to GPU when available.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id, trust_remote_code=True)
    model.eval()
    if torch.cuda.is_available():
        model = model.to("cuda")

    inputs = tokenizer(gloss, return_tensors="pt", padding=True, truncation=True)
    if torch.cuda.is_available():
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Greedy decoding; do_sample=False keeps the translation deterministic.
    outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result
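For reference, a minimal sketch of calling translateGloss; the gloss string is an illustrative placeholder, and the first call downloads the rrrr66254/Glossa-BART checkpoint from the Hub:

    if __name__ == "__main__":
        # Hypothetical ASL-style gloss input; any gloss string works here.
        print(translateGloss("YESTERDAY ME STORE GO"))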
lettersController.py
ADDED
@@ -0,0 +1,141 @@
import cv2
import numpy as np
import pickle
import tensorflow as tf
import mediapipe as mp

# Static-letter model (single-frame hand landmarks) and its label encoder.
lettersModel = tf.keras.models.load_model('ai_model/models/detectLettersModel.keras')
with open('ai_model/models/labelEncoder.pickle', 'rb') as f:
    labelEncoder = pickle.load(f)

# Sequence model (jz_model) covering motion-based letters such as J and Z.
lettersModel2 = tf.keras.models.load_model('ai_model/jz_model/JZModel.keras')
with open('ai_model/jz_model/labelEncoder.pickle', 'rb') as f:
    labelEncoder2 = pickle.load(f)

# Digit model, fed the same single-frame features as the static-letter model.
numbersModel = tf.keras.models.load_model('ai_model/models/detectNumbersModel.keras')
with open('ai_model/models/numLabelEncoder.pickle', 'rb') as f:
    numLabelEncoder = pickle.load(f)

sequenceNum = 20
hands = mp.solutions.hands.Hands(static_image_mode=True)

def detectFromImage(sequenceList):
    if len(sequenceList) != sequenceNum:
        return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}

    processedSequence = []

    for imagePath in sequenceList:
        image = cv2.imread(imagePath)
        if image is None:
            continue

        imgRGB = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = hands.process(imgRGB)

        if not results.multi_hand_landmarks:
            continue

        handLandmarks = results.multi_hand_landmarks[0]

        xList, yList = [], []
        dataAux2 = []

        for lm in handLandmarks.landmark:
            xList.append(lm.x)
            yList.append(lm.y)

        # Translate landmarks to the hand's bounding-box origin; the trailing 0
        # pads each landmark to the 3 values per point the sequence model expects.
        for lm in handLandmarks.landmark:
            dataAux2.append(lm.x - min(xList))
            dataAux2.append(lm.y - min(yList))
            dataAux2.append(0)

        processedSequence.append(dataAux2)

    confidence2 = 0.0
    label2 = ""
    fallback_frame = cv2.imread(sequenceList[-1])

    # Disabled: linear interpolation to fill frames where no hand was detected.
    # for i in range(len(processedSequence)):
    #     if processedSequence[i] is None:
    #         prevIdx, nextIdx = -1, -1
    #         for j in range(i - 1, -1, -1):
    #             if processedSequence[j] is not None:
    #                 prevIdx = j
    #                 break
    #         for j in range(i + 1, len(processedSequence)):
    #             if processedSequence[j] is not None:
    #                 nextIdx = j
    #                 break
    #         if prevIdx != -1 and nextIdx != -1:
    #             prevData = np.array(processedSequence[prevIdx])
    #             nextData = np.array(processedSequence[nextIdx])
    #             t = (i - prevIdx) / (nextIdx - prevIdx)
    #             interpolatedData = prevData + (nextData - prevData) * t
    #             processedSequence[i] = interpolatedData.tolist()
    #         elif prevIdx != -1:
    #             processedSequence[i] = processedSequence[prevIdx]
    #         elif nextIdx != -1:
    #             processedSequence[i] = processedSequence[nextIdx]

    if len(processedSequence) != sequenceNum:
        print("incomplete sequence: ", len(processedSequence))
        return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}

    # Run the sequence model on the full 20-frame window (63 = 21 landmarks x 3).
    inputData2 = np.array(processedSequence, dtype=np.float32).reshape(1, sequenceNum, 63)
    prediction2 = lettersModel2.predict(inputData2, verbose=0)

    index2 = np.argmax(prediction2, axis=1)[0]
    confidence2 = float(np.max(prediction2))
    label2 = labelEncoder2.inverse_transform([index2])[0]
    print(f'Letters Model 2: {label2} at {confidence2}')

    if fallback_frame is not None:
        imgRGB = cv2.cvtColor(fallback_frame, cv2.COLOR_BGR2RGB)
        results = hands.process(imgRGB)
        if results.multi_hand_landmarks:
            handLandmarks = results.multi_hand_landmarks[0]
            xList, yList = [], []
            dataAux = []

            for lm in handLandmarks.landmark:
                xList.append(lm.x)
                yList.append(lm.y)

            for lm in handLandmarks.landmark:
                dataAux.append(lm.x - min(xList))
                dataAux.append(lm.y - min(yList))

            # Check the last frame in letters model 1 (42 = 21 landmarks x 2).
            inputData1 = np.array(dataAux, dtype=np.float32).reshape(1, 42, 1)
            prediction1 = lettersModel.predict(inputData1, verbose=0)
            index1 = np.argmax(prediction1, axis=1)[0]
            confidence1 = float(np.max(prediction1))
            label1 = labelEncoder.inverse_transform([index1])[0]

            print(f'Letters Model 1: {label1} at {confidence1}')

            prediction3 = numbersModel.predict(inputData1, verbose=0)
            index3 = np.argmax(prediction3, axis=1)[0]
            confidence3 = float(np.max(prediction3))
            label3 = numLabelEncoder.inverse_transform([index3])[0]

            print(f'Numbers Model: {label3} at {confidence3}')

            if label1 == label2:
                return {'letter': label2, 'confidenceLetter': confidence2,
                        'number': label3, 'confidenceNumber': confidence3}
            # elif label2 == "Z" and label1 == "L":
            #     return {'letter': label2, 'confidence': confidence2}
            # elif label2 == "J" and label1 == "I":
            #     return {'letter': label2, 'confidence': confidence2}
            else:
                return {'letter': label1, 'confidenceLetter': confidence1,
                        'number': label3, 'confidenceNumber': confidence3}
        else:
            return {'letter': label2, 'confidenceLetter': confidence2,
                    'number': '', 'confidenceNumber': 0.0}

    # Fall back to the sequence-model result if the last frame could not be read.
    return {'letter': label2, 'confidenceLetter': confidence2,
            'number': '', 'confidenceNumber': 0.0}
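A minimal driver sketch for detectFromImage; the frame paths are hypothetical placeholders, and exactly sequenceNum (20) paths must be supplied:

    # Hypothetical frame paths captured from a webcam burst.
    frames = [f"frames/frame_{i:02d}.jpg" for i in range(20)]
    result = detectFromImage(frames)
    print(result['letter'], result['confidenceLetter'])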
requirements.txt
ADDED
@@ -0,0 +1,17 @@
fastapi==0.116.1
matplotlib==3.10.6
mediapipe==0.10.21
numpy>=1.24.3,<2
openai==1.106.1
opencv_contrib_python==4.11.0.86
opencv_python==4.11.0.86
opencv_python_headless==4.11.0.86
pandas==2.3.2
python-dotenv==1.1.1
scikit_learn==1.7.1
starlette==0.47.3
tensorflow==2.19.0
tqdm==4.67.1
uvicorn==0.35.0
transformers
torch
runtime.txt
ADDED
@@ -0,0 +1 @@
python-3.11.0
upload.py
ADDED
@@ -0,0 +1,97 @@
import os
import sys
import uvicorn
from fastapi import FastAPI, UploadFile, File, Form
from typing import Optional
from starlette.responses import JSONResponse

# --- Import AI Functions ---
# Detection.py lives in the 'models' subfolder of this repository.
try:
    from models.Detection import load_model_and_assets, process_image_and_predict, process_video_and_predict_realtime, MIN_CONFIDENCE_THRESHOLD
    print("Successfully imported functions from Detection.py")
except ImportError as e:
    print(f"ERROR: Could not import from Detection.py. Please ensure the file is in the same repository as app.py: {e}")
    # Exit early: nothing below works without the detection functions.
    sys.exit(1)

# --- FastAPI App Initialization ---
app = FastAPI()

# --- Load AI Model and Assets on Startup ---
# This function, defined in Detection.py, should reference the files within
# the 'models' subfolder of the Hugging Face Space.
print("\n--- Hugging Face Space starting: Loading AI model and assets... ---")
load_model_and_assets()
print("--- AI model and assets loaded. Ready to serve predictions. ---\n")

# --- FastAPI Routes ---
# The routes below call the functions imported from Detection.py.

@app.post("/process-image")
async def process_image_api(
    image: UploadFile = File(...),
    min_confidence: Optional[float] = Form(MIN_CONFIDENCE_THRESHOLD)
):
    try:
        # Persist the upload to a temp file so OpenCV can read it from disk.
        contents = await image.read()
        temp_filepath = f"/tmp/{image.filename}"
        with open(temp_filepath, "wb") as f:
            f.write(contents)

        action, confidence = process_image_and_predict(temp_filepath, min_confidence)
        os.remove(temp_filepath)

        response = {
            "sign": action if action else "UNKNOWN",
            "confidence": round(float(confidence), 2),
            "success": True,
            "filename": image.filename
        }
        return JSONResponse(content=response)

    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing image with AI model",
                "details": str(e),
                "success": False
            }
        )

@app.post("/process-video")
async def process_video_api(
    video: UploadFile = File(...),
    min_confidence: Optional[float] = Form(MIN_CONFIDENCE_THRESHOLD)
):
    try:
        contents = await video.read()
        temp_filepath = f"/tmp/{video.filename}"
        with open(temp_filepath, "wb") as f:
            f.write(contents)

        action, confidence = process_video_and_predict_realtime(temp_filepath, min_confidence)
        os.remove(temp_filepath)

        response = {
            "phrase": action if action else "UNKNOWN",
            "confidence": round(float(confidence), 2),
            "success": True,
            "filename": video.filename
        }
        return JSONResponse(content=response)
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={
                "error": "Error processing video with AI model",
                "details": str(e),
                "success": False
            }
        )

# Entry point for local runs; Hugging Face Spaces expects port 7860.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
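A sketch of a client call against the /process-image route, assuming the app is reachable on port 7860; the requests package is an extra dependency not pinned in requirements.txt, and the image path is a placeholder:

    import requests  # illustration only; not in requirements.txt

    with open("sample.jpg", "rb") as f:  # placeholder image file
        resp = requests.post(
            "http://localhost:7860/process-image",
            files={"image": ("sample.jpg", f, "image/jpeg")},
            data={"min_confidence": "0.5"},
        )
    print(resp.json())  # e.g. {"sign": ..., "confidence": ..., "success": true, ...}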
wordsController.py
ADDED
@@ -0,0 +1,141 @@
import cv2
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
import mediapipe as mp

MODEL_PATH = 'ai_model/words/saved_models/best_sign_classifier_model_125_words_seq90.keras'
CSV_PATH = 'ai_model/words/wlasl_125_words_personal_final_processed_data_augmented_seq90.csv'
SEQUENCE_LENGTH = 90
EXPECTED_COORDS_PER_FRAME = 1662
CONFIDENCE_THRESHOLD = 0.1

# Recover the class-id -> gloss mapping from the training CSV; the model's
# output indices follow the order of first appearance in the data.
model = load_model(MODEL_PATH)
df = pd.read_csv(CSV_PATH)
unique_glosses = df['gloss'].unique()
id_to_gloss = {i: g for i, g in enumerate(unique_glosses)}

mp_holistic = mp.solutions.holistic.Holistic(
    static_image_mode=True,
    model_complexity=1,
    min_detection_confidence=0.2,
    min_tracking_confidence=0.5
)

# Per-frame feature layout: pose (33 landmarks x 4), two hands (21 x 3 each),
# face (468 x 3) -> 1662 values in total.
NUM_POSE_COORDS_SINGLE = 33 * 4
NUM_HAND_COORDS_SINGLE = 21 * 3
NUM_FACE_COORDS_SINGLE = 468 * 3

def normalize_landmarks(landmarks_sequence):
    if landmarks_sequence.ndim == 1:
        landmarks_sequence = np.expand_dims(landmarks_sequence, axis=0)

    normalized_sequences = []
    for frame_landmarks in landmarks_sequence:
        if np.all(frame_landmarks == 0):
            normalized_sequences.append(np.zeros(EXPECTED_COORDS_PER_FRAME, dtype=np.float32))
            continue

        pose_coords_flat = frame_landmarks[0 : NUM_POSE_COORDS_SINGLE]
        left_hand_coords_flat = frame_landmarks[NUM_POSE_COORDS_SINGLE : NUM_POSE_COORDS_SINGLE + NUM_HAND_COORDS_SINGLE]
        right_hand_coords_flat = frame_landmarks[NUM_POSE_COORDS_SINGLE + NUM_HAND_COORDS_SINGLE : NUM_POSE_COORDS_SINGLE + NUM_HAND_COORDS_SINGLE * 2]
        face_coords_flat = frame_landmarks[NUM_POSE_COORDS_SINGLE + NUM_HAND_COORDS_SINGLE * 2 :]

        all_parts_data = [
            (pose_coords_flat, 4, [0.0] * NUM_POSE_COORDS_SINGLE),
            (left_hand_coords_flat, 3, [0.0] * NUM_HAND_COORDS_SINGLE),
            (right_hand_coords_flat, 3, [0.0] * NUM_HAND_COORDS_SINGLE),
            (face_coords_flat, 3, [0.0] * NUM_FACE_COORDS_SINGLE)
        ]

        normalized_frame_parts = []
        for flat_lms, coords_per_lm, template in all_parts_data:
            if np.all(flat_lms == 0):
                normalized_frame_parts.append(np.array(template, dtype=np.float32))
                continue

            # Center each body part on its mean, then scale to unit radius.
            lms_array = flat_lms.reshape(-1, coords_per_lm)
            coords_for_mean = lms_array[:, :3] if coords_per_lm == 4 else lms_array
            mean_coords = np.mean(coords_for_mean, axis=0)
            translated_lms = lms_array.copy()
            translated_lms[:, :3] -= mean_coords
            scale_factor = np.max(np.linalg.norm(translated_lms[:, :3], axis=1))
            if scale_factor > 1e-6:
                translated_lms[:, :3] /= scale_factor
            normalized_frame_parts.append(translated_lms.flatten())

        combined_frame = np.concatenate(normalized_frame_parts).astype(np.float32)
        if len(combined_frame) < EXPECTED_COORDS_PER_FRAME:
            combined_frame = np.pad(combined_frame, (0, EXPECTED_COORDS_PER_FRAME - len(combined_frame)), 'constant')
        elif len(combined_frame) > EXPECTED_COORDS_PER_FRAME:
            combined_frame = combined_frame[:EXPECTED_COORDS_PER_FRAME]

        normalized_sequences.append(combined_frame)

    return np.array(normalized_sequences, dtype=np.float32)

def pad_or_truncate_sequence(sequence, target_length, feature_dimension):
    if sequence.shape[0] < target_length:
        padding = np.zeros((target_length - sequence.shape[0], feature_dimension), dtype=np.float32)
        return np.vstack((sequence, padding))
    return sequence[:target_length, :]

def detectWords(image_paths):
    sequence = []

    for idx, path in enumerate(image_paths):
        img = cv2.imread(path)
        if img is None:
            print(f"Warning: Could not read image {path}")
            continue

        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        mp_results = mp_holistic.process(img_rgb)

        # Missing body parts stay zero-filled at their fixed offsets.
        frame_lms = np.zeros(EXPECTED_COORDS_PER_FRAME, dtype=np.float32)
        current_idx = 0

        if mp_results.pose_landmarks:
            pose_flat = [coord for lm in mp_results.pose_landmarks.landmark for coord in [lm.x, lm.y, lm.z, lm.visibility]]
            frame_lms[current_idx:current_idx + len(pose_flat)] = pose_flat
        else:
            print(f"Warning: No pose landmarks detected in frame {idx}")
        current_idx += NUM_POSE_COORDS_SINGLE

        if mp_results.left_hand_landmarks:
            lh_flat = [coord for lm in mp_results.left_hand_landmarks.landmark for coord in [lm.x, lm.y, lm.z]]
            frame_lms[current_idx:current_idx + len(lh_flat)] = lh_flat
        else:
            print(f"Warning: No left hand landmarks detected in frame {idx}")
        current_idx += NUM_HAND_COORDS_SINGLE

        if mp_results.right_hand_landmarks:
            rh_flat = [coord for lm in mp_results.right_hand_landmarks.landmark for coord in [lm.x, lm.y, lm.z]]
            frame_lms[current_idx:current_idx + len(rh_flat)] = rh_flat
        else:
            print(f"Warning: No right hand landmarks detected in frame {idx}")
        current_idx += NUM_HAND_COORDS_SINGLE

        if mp_results.face_landmarks:
            face_flat = [coord for lm in mp_results.face_landmarks.landmark for coord in [lm.x, lm.y, lm.z]]
            frame_lms[current_idx:current_idx + len(face_flat)] = face_flat
        else:
            print(f"Warning: No face landmarks detected in frame {idx}")

        sequence.append(frame_lms)

    sequence = normalize_landmarks(np.array(sequence, dtype=np.float32))
    sequence = pad_or_truncate_sequence(sequence, SEQUENCE_LENGTH, EXPECTED_COORDS_PER_FRAME)
    sequence = np.expand_dims(sequence, axis=0)

    preds = model.predict(sequence, verbose=0)
    predicted_id = int(np.argmax(preds))
    confidence = float(np.max(preds))
    predicted_word = id_to_gloss.get(predicted_id, "Unknown")

    result = {"word": predicted_word if confidence >= CONFIDENCE_THRESHOLD else "",
              "confidence": confidence}

    print(f"Prediction result: {result}")
    return result
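A minimal sketch of invoking detectWords on a clip's extracted frames; the paths are placeholders, and sequences shorter than SEQUENCE_LENGTH (90) are simply zero-padded:

    # Hypothetical frame paths extracted from a sign video.
    paths = [f"clip/frame_{i:03d}.jpg" for i in range(90)]
    out = detectWords(paths)
    print(out["word"], out["confidence"])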