mutarisi committed
Commit de8ea8e · 1 Parent(s): 0070d5a

add app file

Files changed (6)
  1. glossController.py +19 -0
  2. lettersController.py +141 -0
  3. requirements.txt +17 -0
  4. runtime.txt +1 -0
  5. upload.py +97 -0
  6. wordsController.py +141 -0
glossController.py ADDED
@@ -0,0 +1,19 @@
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ import torch
+
+ def translateGloss(gloss: str, model_id: str = "rrrr66254/Glossa-BART") -> str:
+
+     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+     model = AutoModelForSeq2SeqLM.from_pretrained(model_id, trust_remote_code=True)
+     model.eval()
+     if torch.cuda.is_available():
+         model = model.to("cuda")
+
+     inputs = tokenizer(gloss, return_tensors="pt", padding=True, truncation=True)
+     if torch.cuda.is_available():
+         inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+     outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
+     result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return result
+
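Since glossController.py exposes only translateGloss, a minimal caller might look like the sketch below. The gloss string is invented and the import assumes the module is on the Python path; note that the tokenizer and model are reloaded on every call, so the first call downloads the Hugging Face checkpoint.

    # illustrative usage sketch, not part of the commit
    from glossController import translateGloss

    gloss = "YESTERDAY STORE I GO"   # hypothetical ASL-style gloss
    print(translateGloss(gloss))     # intended to print a fluent English sentence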
lettersController.py ADDED
@@ -0,0 +1,141 @@
+ import cv2
+ import numpy as np
+ import pickle
+ import tensorflow as tf
+ import mediapipe as mp
+
+ lettersModel = tf.keras.models.load_model('ai_model/models/detectLettersModel.keras')
+ with open('ai_model/models/labelEncoder.pickle', 'rb') as f:
+     labelEncoder = pickle.load(f)
+
+ lettersModel2 = tf.keras.models.load_model('ai_model/jz_model/JZModel.keras')
+ with open('ai_model/jz_model/labelEncoder.pickle', 'rb') as f:
+     labelEncoder2 = pickle.load(f)
+
+ numbersModel = tf.keras.models.load_model('ai_model/models/detectNumbersModel.keras')
+ with open('ai_model/models/numLabelEncoder.pickle', 'rb') as f:
+     numLabelEncoder = pickle.load(f)
+
+ sequenceNum = 20
+ hands = mp.solutions.hands.Hands(static_image_mode=True)
+
+ def detectFromImage(sequenceList):
+
+     if len(sequenceList) != sequenceNum:
+         return {'letter': '', 'confidence': 0.0}
+
+     processedSequence = []
+
+     for imagePath in sequenceList:
+         image = cv2.imread(imagePath)
+         if image is None:
+             continue
+
+         imgRGB = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+         results = hands.process(imgRGB)
+
+         if not results.multi_hand_landmarks:
+             continue
+
+         handLandmarks = results.multi_hand_landmarks[0]
+
+         xList, yList = [], []
+         dataAux2 = []
+
+         for lm in handLandmarks.landmark:
+             xList.append(lm.x)
+             yList.append(lm.y)
+
+         for lm in handLandmarks.landmark:
+             dataAux2.append(lm.x - min(xList))
+             dataAux2.append(lm.y - min(yList))
+             dataAux2.append(0)
+
+         processedSequence.append(dataAux2)
+
+     confidence2 = 0.0
+     label2 = ""
+     fallback_frame = cv2.imread(sequenceList[-1])
+
+     # for i in range(len(processedSequence)):
+     #     if processedSequence[i] is None:
+     #         prevIdx, nextIdx = -1, -1
+
+     #         for j in range(i - 1, -1, -1):
+     #             if processedSequence[j] is not None:
+     #                 prevIdx = j
+     #                 break
+
+     #         for j in range(i + 1, len(processedSequence)):
+     #             if processedSequence[j] is not None:
+     #                 nextIdx = j
+     #                 break
+
+     #         if prevIdx != -1 and nextIdx != -1:
+     #             prevData = np.array(processedSequence[prevIdx])
+     #             nextData = np.array(processedSequence[nextIdx])
+     #             t = (i - prevIdx) / (nextIdx - prevIdx)
+     #             interpolatedData = prevData + (nextData - prevData) * t
+     #             processedSequence[i] = interpolatedData.tolist()
+     #         elif prevIdx != -1:
+     #             processedSequence[i] = processedSequence[prevIdx]
+     #         elif nextIdx != -1:
+     #             processedSequence[i] = processedSequence[nextIdx]
+
+     if len(processedSequence) != sequenceNum:
+         print("incomplete sequence: ", len(processedSequence))
+         return {'letter': '', 'confidenceLetter': 0.0, 'number': '', 'confidenceNumber': 0.0}
+
+     inputData2 = np.array(processedSequence, dtype=np.float32).reshape(1, sequenceNum, 63)
+     prediction2 = lettersModel2.predict(inputData2, verbose=0)
+
+     index2 = np.argmax(prediction2, axis=1)[0]
+     confidence2 = float(np.max(prediction2))
+     label2 = labelEncoder2.inverse_transform([index2])[0]
+     print(f'Letters Model 2: {label2} at {confidence2}')
+
+     if fallback_frame is not None:
+         imgRGB = cv2.cvtColor(fallback_frame, cv2.COLOR_BGR2RGB)
+         results = hands.process(imgRGB)
+         if results.multi_hand_landmarks:
+             handLandmarks = results.multi_hand_landmarks[0]
+             xList, yList = [], []
+             dataAux = []
+
+             for lm in handLandmarks.landmark:
+                 xList.append(lm.x)
+                 yList.append(lm.y)
+
+             for lm in handLandmarks.landmark:
+                 dataAux.append(lm.x - min(xList))
+                 dataAux.append(lm.y - min(yList))
+
+             # check in letters model 1
+             inputData1 = np.array(dataAux, dtype=np.float32).reshape(1, 42, 1)
+             prediction1 = lettersModel.predict(inputData1, verbose=0)
+             index1 = np.argmax(prediction1, axis=1)[0]
+             confidence1 = float(np.max(prediction1))
+             label1 = labelEncoder.inverse_transform([index1])[0]
+
+             print(f'Letters Model 1: {label1} at {confidence1}')
+
+             prediction3 = numbersModel.predict(inputData1, verbose=0)
+             index3 = np.argmax(prediction3, axis=1)[0]
+             confidence3 = float(np.max(prediction3))
+             label3 = numLabelEncoder.inverse_transform([index3])[0]
+
+             print(f'Numbers Model: {label3} at {confidence3}')
+
+             if label1 == label2:
+                 return {'letter': label2, 'confidenceLetter': confidence2,
+                         'number': label3, 'confidenceNumber': confidence3}
+             # elif label2 == "Z" and label1 == "L":
+             #     return {'letter': label2, 'confidence': confidence2}
+             # elif label2 == "J" and label1 == "I":
+             #     return {'letter': label2, 'confidence': confidence2}
+             else:
+                 return {'letter': label1, 'confidenceLetter': confidence1,
+                         'number': label3, 'confidenceNumber': confidence3}
+         else:
+             return {'letter': label2, 'confidenceLetter': confidence2,
+                     'number': '', 'confidenceNumber': 0.0}
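A hedged sketch of calling detectFromImage (the frame paths are invented). Per the guard at the top of the function, exactly sequenceNum = 20 frame image paths are expected; each frame contributes 63 values (21 hand landmarks × x, y, and a zero-padded z) to the sequence model.

    # illustrative usage sketch, not part of the commit
    from lettersController import detectFromImage

    frames = [f"frames/frame_{i:02d}.jpg" for i in range(20)]  # hypothetical paths
    result = detectFromImage(frames)
    print(result)  # {'letter': ..., 'confidenceLetter': ..., 'number': ..., 'confidenceNumber': ...}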
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ fastapi==0.116.1
+ matplotlib==3.10.6
+ mediapipe==0.10.21
+ numpy>=1.24.3,<2
+ openai==1.106.1
+ opencv_contrib_python==4.11.0.86
+ opencv_python==4.11.0.86
+ opencv_python_headless==4.11.0.86
+ pandas==2.3.2
+ python-dotenv==1.1.1
+ scikit_learn==1.7.1
+ starlette==0.47.3
+ tensorflow==2.19.0
+ tqdm==4.67.1
+ uvicorn==0.35.0
+ transformers
+ torch
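Assuming an environment matching the python-3.11.0 runtime pinned below, these dependencies would normally be installed with:

    pip install -r requirements.txt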
runtime.txt ADDED
@@ -0,0 +1 @@
+ python-3.11.0
upload.py ADDED
@@ -0,0 +1,97 @@
+ import os
+ import sys
+ import uvicorn
+ from fastapi import FastAPI, UploadFile, File, Form
+ from typing import Optional
+ from starlette.responses import JSONResponse
+
+ # --- Import AI Functions ---
+ # Now that Detection.py is in the same directory, the import is straightforward.
+ try:
+     from models.Detection import load_model_and_assets, process_image_and_predict, process_video_and_predict_realtime, MIN_CONFIDENCE_THRESHOLD
+     print("Successfully imported functions from Detection.py")
+ except ImportError as e:
+     print(f"ERROR: Could not import from Detection.py. Please ensure the file is in the same repository as app.py: {e}")
+     # It's good practice to exit if a critical import fails
+     sys.exit(1)
+
+ # --- FastAPI App Initialization ---
+ app = FastAPI()
+
+ # --- Load AI Model and Assets on Startup ---
+ # This function, located in your Detection.py, should be updated to correctly
+ # reference the files within the 'models' subfolder of your Hugging Face Space.
+ print("\n--- Hugging Face Space starting: Loading AI model and assets... ---")
+ load_model_and_assets()
+ print("--- AI model and assets loaded. Ready to serve predictions. ---\n")
+
+ # --- FastAPI Routes ---
+ # The rest of the routes remain the same, as they now correctly call the functions
+ # from your Detection.py script.
+
+ @app.post("/process-image")
+ async def process_image_api(
+     image: UploadFile = File(...),
+     min_confidence: Optional[float] = Form(MIN_CONFIDENCE_THRESHOLD)
+ ):
+     try:
+         contents = await image.read()
+         temp_filepath = f"/tmp/{image.filename}"
+         with open(temp_filepath, "wb") as f:
+             f.write(contents)
+
+         action, confidence = process_image_and_predict(temp_filepath, min_confidence)
+         os.remove(temp_filepath)
+
+         response = {
+             "sign": action if action else "UNKNOWN",
+             "confidence": round(float(confidence), 2),
+             "success": True,
+             "filename": image.filename
+         }
+         return JSONResponse(content=response)
+
+     except Exception as e:
+         return JSONResponse(
+             status_code=500,
+             content={
+                 "error": "Error processing image with AI model",
+                 "details": str(e),
+                 "success": False
+             }
+         )
+
+ @app.post("/process-video")
+ async def process_video_api(
+     video: UploadFile = File(...),
+     min_confidence: Optional[float] = Form(MIN_CONFIDENCE_THRESHOLD)
+ ):
+     try:
+         contents = await video.read()
+         temp_filepath = f"/tmp/{video.filename}"
+         with open(temp_filepath, "wb") as f:
+             f.write(contents)
+
+         action, confidence = process_video_and_predict_realtime(temp_filepath, min_confidence)
+         os.remove(temp_filepath)
+
+         response = {
+             "phrase": action if action else "UNKNOWN",
+             "confidence": round(float(confidence), 2),
+             "success": True,
+             "filename": video.filename
+         }
+         return JSONResponse(content=response)
+     except Exception as e:
+         return JSONResponse(
+             status_code=500,
+             content={
+                 "error": "Error processing video with AI model",
+                 "details": str(e),
+                 "success": False
+             }
+         )
+
+ # just added
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)
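A hedged sketch of exercising the /process-image route once the app is started with `python upload.py` (models/Detection.py and its assets must be importable for startup to succeed). The file name, host, and confidence value are placeholders; the route name, multipart field names, and port 7860 come from the file above, and the `requests` package is assumed to be available even though it is not in requirements.txt.

    # illustrative client sketch, not part of the commit
    import requests

    with open("letter.jpg", "rb") as f:  # hypothetical image of a single sign
        resp = requests.post(
            "http://localhost:7860/process-image",
            files={"image": f},
            data={"min_confidence": 0.5},
        )
    print(resp.json())  # {"sign": ..., "confidence": ..., "success": True, "filename": "letter.jpg"}

The /process-video route works the same way, with the upload sent in a "video" field and the result returned under "phrase".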
wordsController.py ADDED
@@ -0,0 +1,141 @@
+ import cv2
+ import numpy as np
+ import pandas as pd
+ from tensorflow.keras.models import load_model
+ import mediapipe as mp
+
+ MODEL_PATH = 'ai_model/words/saved_models/best_sign_classifier_model_125_words_seq90.keras'
+ CSV_PATH = 'ai_model/words/wlasl_125_words_personal_final_processed_data_augmented_seq90.csv'
+ SEQUENCE_LENGTH = 90
+ EXPECTED_COORDS_PER_FRAME = 1662
+ CONFIDENCE_THRESHOLD = 0.1
+
+ model = load_model(MODEL_PATH)
+ df = pd.read_csv(CSV_PATH)
+ unique_glosses = df['gloss'].unique()
+ id_to_gloss = {i: g for i, g in enumerate(unique_glosses)}
+
+ mp_holistic = mp.solutions.holistic.Holistic(
+     static_image_mode=True,
+     model_complexity=1,
+     min_detection_confidence=0.2,
+     min_tracking_confidence=0.5
+ )
+
+ NUM_POSE_COORDS_SINGLE = 33*4
+ NUM_HAND_COORDS_SINGLE = 21*3
+ NUM_FACE_COORDS_SINGLE = 468*3
+
+ def normalize_landmarks(landmarks_sequence):
+     if landmarks_sequence.ndim == 1:
+         landmarks_sequence = np.expand_dims(landmarks_sequence, axis=0)
+
+     normalized_sequences = []
+     for frame_landmarks in landmarks_sequence:
+         if np.all(frame_landmarks == 0):
+             normalized_sequences.append(np.zeros(EXPECTED_COORDS_PER_FRAME, dtype=np.float32))
+             continue
+
+         pose_coords_flat = frame_landmarks[0 : NUM_POSE_COORDS_SINGLE]
+         left_hand_coords_flat = frame_landmarks[NUM_POSE_COORDS_SINGLE : NUM_POSE_COORDS_SINGLE + NUM_HAND_COORDS_SINGLE]
+         right_hand_coords_flat = frame_landmarks[NUM_POSE_COORDS_SINGLE + NUM_HAND_COORDS_SINGLE : NUM_POSE_COORDS_SINGLE + NUM_HAND_COORDS_SINGLE*2]
+         face_coords_flat = frame_landmarks[NUM_POSE_COORDS_SINGLE + NUM_HAND_COORDS_SINGLE*2 : ]
+
+         all_parts_data = [
+             (pose_coords_flat, 4, [0.0]*NUM_POSE_COORDS_SINGLE),
+             (left_hand_coords_flat, 3, [0.0]*NUM_HAND_COORDS_SINGLE),
+             (right_hand_coords_flat, 3, [0.0]*NUM_HAND_COORDS_SINGLE),
+             (face_coords_flat, 3, [0.0]*NUM_FACE_COORDS_SINGLE)
+         ]
+
+         normalized_frame_parts = []
+         for flat_lms, coords_per_lm, template in all_parts_data:
+             if np.all(flat_lms == 0):
+                 normalized_frame_parts.append(np.array(template, dtype=np.float32))
+                 continue
+
+             lms_array = flat_lms.reshape(-1, coords_per_lm)
+             coords_for_mean = lms_array[:, :3] if coords_per_lm == 4 else lms_array
+             mean_coords = np.mean(coords_for_mean, axis=0)
+             translated_lms = lms_array.copy()
+             translated_lms[:, :3] -= mean_coords
+             scale_factor = np.max(np.linalg.norm(translated_lms[:, :3], axis=1))
+             if scale_factor > 1e-6:
+                 translated_lms[:, :3] /= scale_factor
+             normalized_frame_parts.append(translated_lms.flatten())
+
+         combined_frame = np.concatenate(normalized_frame_parts).astype(np.float32)
+         if len(combined_frame) < EXPECTED_COORDS_PER_FRAME:
+             combined_frame = np.pad(combined_frame, (0, EXPECTED_COORDS_PER_FRAME - len(combined_frame)), 'constant')
+         elif len(combined_frame) > EXPECTED_COORDS_PER_FRAME:
+             combined_frame = combined_frame[:EXPECTED_COORDS_PER_FRAME]
+
+         normalized_sequences.append(combined_frame)
+
+     return np.array(normalized_sequences, dtype=np.float32)
+
+ def pad_or_truncate_sequence(sequence, target_length, feature_dimension):
+     if sequence.shape[0] < target_length:
+         padding = np.zeros((target_length - sequence.shape[0], feature_dimension), dtype=np.float32)
+         return np.vstack((sequence, padding))
+     return sequence[:target_length, :]
+
+ def detectWords(image_paths):
+     results_dict = {}
+     sequence = []
+
+     for idx, path in enumerate(image_paths):
+         img = cv2.imread(path)
+         if img is None:
+             print(f"Warning: Could not read image {path}")
+             continue
+
+         img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+         mp_results = mp_holistic.process(img_rgb)
+
+         frame_lms = np.zeros(EXPECTED_COORDS_PER_FRAME, dtype=np.float32)
+         current_idx = 0
+
+         if mp_results.pose_landmarks:
+             pose_flat = [coord for lm in mp_results.pose_landmarks.landmark for coord in [lm.x, lm.y, lm.z, lm.visibility]]
+             frame_lms[current_idx:current_idx+len(pose_flat)] = pose_flat
+         else:
+             print(f"Warning: No pose landmarks detected in frame {idx}")
+         current_idx += NUM_POSE_COORDS_SINGLE
+
+         if mp_results.left_hand_landmarks:
+             lh_flat = [coord for lm in mp_results.left_hand_landmarks.landmark for coord in [lm.x, lm.y, lm.z]]
+             frame_lms[current_idx:current_idx+len(lh_flat)] = lh_flat
+         else:
+             print(f"Warning: No left hand landmarks detected in frame {idx}")
+         current_idx += NUM_HAND_COORDS_SINGLE
+
+         if mp_results.right_hand_landmarks:
+             rh_flat = [coord for lm in mp_results.right_hand_landmarks.landmark for coord in [lm.x, lm.y, lm.z]]
+             frame_lms[current_idx:current_idx+len(rh_flat)] = rh_flat
+         else:
+             print(f"Warning: No right hand landmarks detected in frame {idx}")
+         current_idx += NUM_HAND_COORDS_SINGLE
+
+         if mp_results.face_landmarks:
+             face_flat = [coord for lm in mp_results.face_landmarks.landmark for coord in [lm.x, lm.y, lm.z]]
+             frame_lms[current_idx:current_idx+len(face_flat)] = face_flat
+         else:
+             print(f"Warning: No face landmarks detected in frame {idx}")
+
+         sequence.append(frame_lms)
+
+     sequence = normalize_landmarks(np.array(sequence, dtype=np.float32))
+     sequence = pad_or_truncate_sequence(sequence, SEQUENCE_LENGTH, EXPECTED_COORDS_PER_FRAME)
+     sequence = np.expand_dims(sequence, axis=0)
+
+     preds = model.predict(sequence, verbose=0)
+     predicted_id = int(np.argmax(preds))
+     confidence = float(np.max(preds))
+     predicted_word = id_to_gloss.get(predicted_id, "Unknown")
+
+     result = {"word": predicted_word if confidence >= CONFIDENCE_THRESHOLD else "",
+               "confidence": confidence}
+
+     print(f"Prediction result: {result}")
+     return result
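A hedged sketch of calling detectWords (the frame paths are invented). Per the constants above, each frame is encoded as 1662 values (33×4 pose + 2×21×3 hand + 468×3 face coordinates) and the sequence is zero-padded or truncated to SEQUENCE_LENGTH = 90 frames before prediction:

    # illustrative usage sketch, not part of the commit
    from wordsController import detectWords

    frames = [f"frames/frame_{i:03d}.jpg" for i in range(60)]  # hypothetical paths; shorter clips are zero-padded
    result = detectWords(frames)
    print(result)  # {"word": ..., "confidence": ...}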