Commit: 1b1014b
Parent(s): 0aa47d4
working example

Files changed:
- .gitignore +2 -1
- classificator_training/helpers/args.py +1 -1
- classificator_training/model/feature_extractor.py +31 -0
- classificator_training/run_script.sh +0 -4
- myapp/.gitignore +2 -1
- myapp/AR/model_template.py +1 -1
- myapp/AR/pose_estimate_utils.py +3 -255
- myapp/AR/pose_network.py +74 -243
- myapp/main.py +3 -2
- {classificator_training/helpers → myapp/utils}/load_best_model.py +16 -6
.gitignore
CHANGED

@@ -188,4 +188,5 @@ myapp/debug_template.jpg
 
 debug_*.jpg
 *_clip_*.pth
-batch_run_*.txt
+batch_run_*.txt
+.claude/
classificator_training/helpers/args.py
CHANGED

@@ -56,7 +56,7 @@ def _override_args_from_model_name(args: Namespace, load_model_name: str, verbos
     )
 
     match = pattern.search(load_model_name)
-
+    print(match)
    if match:
        extracted_values = match.groupdict()
 
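Note: for context, _override_args_from_model_name parses hyperparameters embedded in a model filename with a named-group regex and copies match.groupdict() onto args; the print(match) added above surfaces the match object for debugging. A minimal sketch of the mechanism, with an illustrative pattern (not the project's actual regex):

import re
from argparse import Namespace

# Illustrative pattern for names like "...clip1_segformer0_..._lr2e-07_margin1.2_alpha64.0...".
pattern = re.compile(
    r"clip(?P<clip>\d)_segformer(?P<segformer>\d).*_lr(?P<lr>[\d.e+-]+)_margin(?P<margin>[\d.]+)"
)

args = Namespace(clip=1, segformer=0, lr=2e-07, margin=1.2)
match = pattern.search("model_clip1_segformer0_gate0_lr2e-07_margin1.2_alpha64.0.model")
print(match)
if match:
    for key, value in match.groupdict().items():
        setattr(args, key, value)  # the real helper would also cast to int/float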
classificator_training/model/feature_extractor.py
CHANGED

@@ -162,4 +162,35 @@ class FeatureExtractor(nn.Module):
         extracted_feature_dict['vit'] = self.vit(self._ensure_batch_dim(vit_input))
 
         return extracted_feature_dict
+
+    def extract_feature_from_model(self, model_name: str, input_tensor: torch.Tensor):
+        """
+        Extracts features from a specific backbone model.
+        """
+        if model_name not in self.use_models or not self.use_models[model_name]:
+            raise ValueError(f"Model '{model_name}' is not enabled in use_models.")
+
+        input_tensor = self._ensure_batch_dim(input_tensor)
+
+        if model_name == 'clip':
+            return self.clip_model(input_tensor).pooler_output
+        elif model_name == 'segformer':
+            output = self.segformer_model(input_tensor, output_hidden_states=True)
+            return output.last_hidden_state.mean(dim=[2, 3])
+        elif model_name == 'dpt':
+            output = self.dpt_model(input_tensor)
+            return output.pooler_output
+        elif model_name == 'midas':
+            output = self.midas_model(input_tensor, output_hidden_states=True)
+            return output.hidden_states[-1][:, 1:, :].mean(dim=1)
+        elif model_name == 'resnet':
+            return self.resnet(input_tensor)
+        elif model_name == 'mobilenet':
+            return self.mobilenet(input_tensor)
+        elif model_name == 'efficientnet':
+            return self.efficientnet(input_tensor)
+        elif model_name == 'vit':
+            return self.vit(input_tensor)
+        else:
+            raise ValueError(f"Unknown model name: {model_name}")
 
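Note: a minimal usage sketch of the new accessor. The FeatureExtractor constructor call below is an assumption (only use_models and extract_feature_from_model are taken from the diff), and the input is a stand-in for an already-preprocessed CLIP pixel tensor:

import torch
from classificator_training.model.feature_extractor import FeatureExtractor

# Assumed constructor: a dict of enabled backbones, matching self.use_models above.
extractor = FeatureExtractor(use_models={'clip': 1, 'segformer': 0, 'midas': 0, 'dpt': 0,
                                         'resnet': 0, 'mobilenet': 0, 'efficientnet': 0, 'vit': 0})

pixel_values = torch.randn(1, 3, 224, 224)  # stand-in for a preprocessed CLIP input
with torch.no_grad():
    features = extractor.extract_feature_from_model('clip', pixel_values)
# features is the CLIP vision pooler output, shape [1, hidden_size];
# extractor.extract_feature_from_model('vit', pixel_values) would raise ValueError ('vit' is disabled).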
classificator_training/run_script.sh
CHANGED

@@ -185,8 +185,6 @@ RESULTS_FILE="batch_run_results2.txt"
 
 
 for SET in "${PARAMETER_SETS[@]}"; do
-    # Read 15 variables (plus P16/Alpha if needed, currently Alpha is treated as part of the set logic usually)
-    # Mapping based on new order:
     read P_CLIP P_SEG P_MIDAS P_DPT P_RES P_MOB P_EFF P_VIT P_GATE P_BATCH P_TYPE P_HEAD P_LR P_MARGIN P_ALPHA <<< "$SET"
 
     echo "--- Starting run with Models: Clip=$P_CLIP Seg=$P_SEG Midas=$P_MIDAS Dpt=$P_DPT Res=$P_RES Mob=$P_MOB Eff=$P_EFF Vit=$P_VIT ---"
@@ -210,7 +208,6 @@ for SET in "${PARAMETER_SETS[@]}"; do
 
     echo "$COMMAND_TO_RUN"
 
-    # Check if this is a "silent" run (P11 logic from your old script was unclear, assuming standard logging here)
    OUTPUT=$($COMMAND_TO_RUN 2>&1 | tee /dev/tty | tail -n 1)
    EXIT_CODE=${PIPESTATUS[0]}
 
@@ -222,7 +219,6 @@ for SET in "${PARAMETER_SETS[@]}"; do
        echo "Run finished **successfully**."
    else
        echo "Run **failed**. Stopping batch."
-        # break # Uncomment to stop on failure
    fi
    echo "---"
 done
myapp/.gitignore
CHANGED

@@ -171,4 +171,5 @@ cython_debug/
 .ruff_cache/
 
 # PyPI configuration file
-.pypirc
+.pypirc
+.claude/
myapp/AR/model_template.py
CHANGED

@@ -5,6 +5,6 @@ TRAINED_CLASSIFICATION_MODEL = {
     "feature_extractor": None,
     "id_to_tag": None,
     "tag_to_id": None,
-    "
+    "pose_model": None,
     "keypoints": None
 }
myapp/AR/pose_estimate_utils.py
CHANGED

@@ -1,8 +1,8 @@
-import json
 from typing import Dict, List, Tuple
-
 from transformers import CLIPConfig, CLIPProcessor, CLIPVisionModel
+from classificator_training.data.dataset import preprocess_single_image
 from classificator_training.model.feature_extractor import CLIP_MODEL_ID
+from classificator_training.utils import move_to_device
 from myapp.AR.model_template import TRAINED_CLASSIFICATION_MODEL
 import os
 import io
@@ -14,16 +14,9 @@ import numpy as np
 import torch
 import cv2
 import numpy as np
-import json
-import argparse
-from pathlib import Path
-from tqdm import tqdm
 import trimesh
 import pyrender
-from torchvision import transforms
-from datetime import datetime
 
-from myapp.AR.pose_network import BuildingPoseNet, quaternion_to_rotation_matrix
 
 def get_camera_matrix(img_w, img_h):
     """
@@ -41,183 +34,6 @@ def get_camera_matrix(img_w, img_h):
         [0, fy, cy],
         [0, 0, 1]
     ], dtype=np.float32)
-
-
-class BatchBuildingPoseEstimator:
-    """
-    Process folders of building images and generate pose predictions
-    Works with models trained on cached CLIP features
-    """
-
-    def __init__(self, model_path, config_path, device='cuda'):
-        self.device = device
-
-        # Load config
-        with open(config_path, 'r') as f:
-            self.config = json.load(f)
-
-        print("Loading CLIP model for feature extraction...")
-        # Load CLIP for feature extraction
-        clip_config = CLIPConfig.from_pretrained(CLIP_MODEL_ID)
-        self.clip_model = CLIPVisionModel.from_pretrained(
-            CLIP_MODEL_ID,
-            config=clip_config.vision_config,
-        )
-        self.clip_model.to(device)
-        self.clip_model.eval()
-
-        self.clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_ID)
-
-        feature_dim = self.clip_model.config.hidden_size
-        print(f"✓ CLIP loaded (feature dim: {feature_dim})")
-
-        # Load pose estimation model
-        print("Loading pose estimation model...")
-        self.model = BuildingPoseNet(
-            num_buildings=self.config['num_buildings'],
-            feature_dim=feature_dim
-        )
-
-        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
-        self.model.load_state_dict(checkpoint['model_state_dict'])
-        self.model.to(device)
-        self.model.eval()
-
-        print(f"✓ Pose estimation model loaded successfully:")
-        print(f"  Model path: {model_path}")
-        print(f"  Device: {device}")
-        print(f"  Number of buildings: {self.config['num_buildings']}")
-
-    def extract_clip_features(self, image):
-        """
-        Extract CLIP features from a single image
-
-        Args:
-            image: numpy array (H, W, 3) in BGR or RGB format
-
-        Returns:
-            features: torch.Tensor [1, feature_dim]
-        """
-        # Convert BGR to RGB if needed
-        if len(image.shape) == 2:
-            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
-        elif image.shape[2] == 3:
-            # Assume BGR from OpenCV, convert to RGB
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-
-        # Process image with CLIP processor
-        inputs = self.clip_processor(images=image, return_tensors="pt")
-        pixel_values = inputs['pixel_values'].to(self.device)
-
-        # Extract features
-        with torch.no_grad():
-            outputs = self.clip_model(pixel_values=pixel_values)
-            features = outputs.pooler_output  # [1, feature_dim]
-
-        return features
-
-    def predict_pose(self, image):
-        """
-        Predict pose from single image
-
-        Args:
-            image: numpy array (H, W, 3) - raw image
-
-        Returns:
-            dict with pose predictions
-        """
-        # Step 1: Extract CLIP features
-        features = self.extract_clip_features(image)
-
-        # Step 2: Predict pose from features
-        with torch.no_grad():
-            output = self.model(features)
-
-        # Parse outputs
-        building_id = torch.argmax(output['building_logits'], dim=1).item()
-        building_probs = torch.softmax(output['building_logits'], dim=1)[0].cpu().numpy()
-        rotation_quat = output['rotation'][0].cpu().numpy()
-        translation = output['translation'][0].cpu().numpy()
-        confidence = output['confidence'][0].item()
-
-        # Convert quaternion to rotation matrix
-        rotation_matrix = quaternion_to_rotation_matrix(
-            torch.from_numpy(rotation_quat)
-        ).numpy()
-
-        return {
-            'building_id': building_id,
-            'building_name': self.config['buildings'][building_id],
-            'building_probabilities': {
-                self.config['buildings'][i]: float(building_probs[i])
-                for i in range(len(building_probs))
-            },
-            'rotation_quaternion': rotation_quat.tolist(),
-            'rotation_matrix': rotation_matrix.tolist(),
-            'translation': translation.tolist(),
-            'confidence': float(confidence)
-        }
-
-    def predict_pose_batch(self, images):
-        """
-        Predict poses for a batch of images (faster)
-
-        Args:
-            images: list of numpy arrays
-
-        Returns:
-            list of prediction dicts
-        """
-        # Convert all to RGB
-        rgb_images = []
-        for img in images:
-            if len(img.shape) == 2:
-                img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
-            elif img.shape[2] == 3:
-                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-            rgb_images.append(img)
-
-        # Extract features for all images
-        inputs = self.clip_processor(images=rgb_images, return_tensors="pt")
-        pixel_values = inputs['pixel_values'].to(self.device)
-
-        with torch.no_grad():
-            # Extract CLIP features
-            outputs = self.clip_model(pixel_values=pixel_values)
-            features = outputs.pooler_output  # [batch_size, feature_dim]
-
-            # Predict poses
-            predictions = self.model(features)
-
-        # Parse results
-        results = []
-        batch_size = len(images)
-
-        for i in range(batch_size):
-            building_id = torch.argmax(predictions['building_logits'][i]).item()
-            building_probs = torch.softmax(predictions['building_logits'][i], dim=0).cpu().numpy()
-            rotation_quat = predictions['rotation'][i].cpu().numpy()
-            translation = predictions['translation'][i].cpu().numpy()
-            confidence = predictions['confidence'][i].item()
-
-            rotation_matrix = quaternion_to_rotation_matrix(
-                torch.from_numpy(rotation_quat)
-            ).numpy()
-
-            results.append({
-                'building_id': building_id,
-                'building_name': self.config['buildings'][building_id],
-                'building_probabilities': {
-                    self.config['buildings'][j]: float(building_probs[j])
-                    for j in range(len(building_probs))
-                },
-                'rotation_quaternion': rotation_quat.tolist(),
-                'rotation_matrix': rotation_matrix.tolist(),
-                'translation': translation.tolist(),
-                'confidence': float(confidence)
-            })
-
-        return results
 
 
 def get_common_camera_intrinsics(img_width: int, img_height: int) -> List[Dict]:
@@ -439,13 +255,7 @@ def process_image_heuristic(img_bytes: bytes, model_bytes: bytes, model_name: st
 
     print("Running DL Pose Prediction...")
 
-
-    pose_estimator = BatchBuildingPoseEstimator(
-        model_path='AR/best_model.pth',
-        config_path='AR/dataset.json',
-        device='cuda' if torch.cuda.is_available() else 'cpu',
-    )
-
+    pose_estimator = TRAINED_CLASSIFICATION_MODEL['pose_model']
     pose_result = pose_estimator.predict_pose(img)
 
     R_blender = pose_result['rotation_matrix']
@@ -485,68 +295,6 @@ def process_image_heuristic(img_bytes: bytes, model_bytes: bytes, model_name: st
     rot_matrix[0:3, 0:3] = blender_to_opencv
     mesh_final.apply_transform(rot_matrix)
 
-    # camera_intrinsics = np.array([
-    #     [
-    #         796.4444444444445,
-    #         0,
-    #         512.0
-    #     ],
-    #     [
-    #         0,
-    #         796.4444444444445,
-    #         512.0
-    #     ],
-    #     [
-    #         0,
-    #         0,
-    #         1
-    #     ]
-    # ])
-
-    # fx = camera_intrinsics[0, 0]
-    # fy = camera_intrinsics[1, 1]
-    # cx = camera_intrinsics[0, 2]
-    # cy = camera_intrinsics[1, 2]
-
-    # training_width = 1024
-    # training_height = 1024
-    # scale_x = w_img / training_width
-    # scale_y = h_img / training_height
-
-    # fx_scaled = fx * scale_x
-    # fy_scaled = fy * scale_y
-    # cx_scaled = cx * scale_x
-    # cy_scaled = cy * scale_y
-
-    # view_matrix = np.eye(4)
-    # view_matrix[:3, :3] = R_opencv
-    # view_matrix[:3, 3] = t_opencv.squeeze()
-
-    # cam_pose_cv = np.linalg.inv(view_matrix)
-
-    # cv_to_gl = np.array([[1,0,0,0], [0,-1,0,0], [0,0,-1,0], [0,0,0,1]])
-    # final_pose = cam_pose_cv @ cv_to_gl
-
-    # scene = pyrender.Scene(bg_color=[0.0, 0.0, 0.0, 0.0], ambient_light=[0.8, 0.8, 0.8])
-    # scene.add(pyrender.Mesh.from_trimesh(mesh_final))
-
-    # cam = pyrender.IntrinsicsCamera(
-    #     fx=fx_scaled,
-    #     fy=fy_scaled,
-    #     cx=cx_scaled,
-    #     cy=cy_scaled,
-    #     znear=0.05,
-    #     zfar=1000.0
-    # )
-
-    # scene.add(cam, pose=final_pose)
-
-    # light = pyrender.PointLight(color=[1.0, 1.0, 1.0], intensity=1000.0)
-    # scene.add(light, pose=final_pose)
-
-    # r = pyrender.OffscreenRenderer(w_img, h_img)
-    # color, depth = r.render(scene, flags=pyrender.RenderFlags.RGBA)
-    # r.delete()
     camera_configs = get_common_camera_intrinsics(w_img, h_img)[:3]
 
    best_score = -1
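Note: the net effect in process_image_heuristic is that the pose estimator is no longer constructed (with CLIP reloaded) on every request; it is read from the shared registry populated at startup. A minimal sketch of the new call path, assuming main.py's lifespan hook has already run and img is an OpenCV-style numpy array:

from myapp.AR.model_template import TRAINED_CLASSIFICATION_MODEL

# The registry slot is filled once, in main.py's lifespan hook.
pose_estimator = TRAINED_CLASSIFICATION_MODEL['pose_model']
pose_result = pose_estimator.predict_pose(img)  # img: numpy array (H, W, 3)
print(pose_result['building_name'], pose_result['confidence'])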
myapp/AR/pose_network.py
CHANGED

@@ -1,222 +1,12 @@
-
-
-# Combines classification with pose regression for accurate 6DoF estimation
-# """
-
-# import torch
-# import torch.nn as nn
-# import torchvision.models as models
-# from torchvision.models import efficientnet_b3, EfficientNet_B3_Weights
-
-# class BuildingPoseNet(nn.Module):
-#     """
-#     Neural network for estimating 6DoF pose of buildings from images.
-
-#     Architecture:
-#     - EfficientNet-B3 backbone (pre-trained on ImageNet)
-#     - Separate heads for rotation and translation
-#     - Rotation: quaternion representation (4 values)
-#     - Translation: 3D position (3 values)
-#     """
-
-#     def __init__(self, num_buildings=10, pretrained=True):
-#         super(BuildingPoseNet, self).__init__()
-
-#         if pretrained:
-#             weights = EfficientNet_B3_Weights.IMAGENET1K_V1
-#             self.backbone = efficientnet_b3(weights=weights)
-#         else:
-#             self.backbone = efficientnet_b3(weights=None)
-
-#         feature_dim = self.backbone.classifier[1].in_features
-
-#         self.backbone.classifier = nn.Identity()
-
-#         self.building_classifier = nn.Sequential(
-#             nn.Linear(feature_dim, 512),
-#             nn.ReLU(),
-#             nn.Dropout(0.3),
-#             nn.Linear(512, num_buildings)
-#         )
-
-#         self.rotation_head = nn.Sequential(
-#             nn.Linear(feature_dim, 512),
-#             nn.ReLU(),
-#             nn.Dropout(0.2),
-#             nn.Linear(512, 256),
-#             nn.ReLU(),
-#             nn.Linear(256, 4)
-#         )
-
-#         self.translation_head = nn.Sequential(
-#             nn.Linear(feature_dim, 512),
-#             nn.ReLU(),
-#             nn.Dropout(0.2),
-#             nn.Linear(512, 256),
-#             nn.ReLU(),
-#             nn.Linear(256, 3)
-#         )
-
-#         self.confidence_head = nn.Sequential(
-#             nn.Linear(feature_dim, 256),
-#             nn.ReLU(),
-#             nn.Linear(256, 1),
-#             nn.Sigmoid()
-#         )
-
-#     def forward(self, x):
-#         features = self.backbone(x)
-
-#         building_logits = self.building_classifier(features)
-#         rotation_quat = self.rotation_head(features)
-#         translation = self.translation_head(features)
-#         confidence = self.confidence_head(features)
-
-#         rotation_quat = rotation_quat / (torch.norm(rotation_quat, dim=1, keepdim=True) + 1e-8)
-
-#         return {
-#             'building_logits': building_logits,
-#             'rotation': rotation_quat,
-#             'translation': translation,
-#             'confidence': confidence
-#         }
-
-
-# class PoseLoss(nn.Module):
-#     """
-#     Combined loss for pose estimation training
-#     """
-
-#     def __init__(self, rotation_weight=1.0, translation_weight=1.0,
-#                  classification_weight=0.5):
-#         super(PoseLoss, self).__init__()
-#         self.rotation_weight = rotation_weight
-#         self.translation_weight = translation_weight
-#         self.classification_weight = classification_weight
-#         self.ce_loss = nn.CrossEntropyLoss()
-
-#     def quaternion_distance(self, q1, q2):
-#         """
-#         Compute geodesic distance between quaternions
-#         Returns angle in radians
-#         """
-#         q1 = q1 / (torch.norm(q1, dim=1, keepdim=True) + 1e-8)
-#         q2 = q2 / (torch.norm(q2, dim=1, keepdim=True) + 1e-8)
-
-#         dot_product = torch.abs(torch.sum(q1 * q2, dim=1))
-#         dot_product = torch.clamp(dot_product, -1.0, 1.0)
-
-#         return 2 * torch.acos(dot_product)
-
-#     def forward(self, predictions, targets):
-#         """
-#         Args:
-#             predictions: dict with 'rotation', 'translation', 'building_logits'
-#             targets: dict with 'rotation', 'translation', 'building_id'
-#         """
-#         pred_rot = predictions['rotation']
-#         pred_rot = pred_rot / (torch.norm(pred_rot, dim=1, keepdim=True) + 1e-8)
-
-#         target_rot = targets['rotation']
-#         target_rot = target_rot / (torch.norm(target_rot, dim=1, keepdim=True) + 1e-8)
-
-#         rot_loss = self.quaternion_distance(pred_rot, target_rot).mean()
-
-#         rot_loss = torch.clamp(rot_loss, 0, 10.0)
-
-#         if torch.isnan(rot_loss):
-#             rot_loss = torch.tensor(1.0, device=rot_loss.device)
-
-#         trans_loss = torch.nn.functional.mse_loss(
-#             predictions['translation'],
-#             targets['translation']
-#         )
-
-#         trans_loss = torch.clamp(trans_loss, 0, 100.0)
-
-#         if torch.isnan(trans_loss):
-#             trans_loss = torch.tensor(1.0, device=trans_loss.device)
-
-#         cls_loss = 0
-#         if 'building_id' in targets and 'building_logits' in predictions:
-#             cls_loss = self.ce_loss(
-#                 predictions['building_logits'],
-#                 targets['building_id']
-#             )
-#             if torch.isnan(cls_loss):
-#                 cls_loss = torch.tensor(0.1, device=predictions['building_logits'].device)
-
-#         total_loss = (
-#             self.rotation_weight * rot_loss +
-#             self.translation_weight * trans_loss +
-#             self.classification_weight * cls_loss
-#         )
-
-#         if torch.isnan(total_loss) or torch.isinf(total_loss):
-#             total_loss = torch.tensor(1.0, device=total_loss.device, requires_grad=True)
-
-#         return {
-#             'total_loss': total_loss,
-#             'rotation_loss': rot_loss,
-#             'translation_loss': trans_loss,
-#             'classification_loss': cls_loss
-#         }
-
-# def quaternion_to_rotation_matrix(quaternion):
-#     """
-#     Convert quaternion to 3x3 rotation matrix
-
-#     Args:
-#         quaternion: torch.Tensor of shape (4,) or (N, 4) [w, x, y, z]
-
-#     Returns:
-#         Rotation matrix of shape (3, 3) or (N, 3, 3)
-#     """
-#     if quaternion.dim() == 1:
-#         quaternion = quaternion.unsqueeze(0)
-#         squeeze = True
-#     else:
-#         squeeze = False
-
-#     quaternion = quaternion / torch.norm(quaternion, dim=1, keepdim=True)
-
-#     w, x, y, z = quaternion[:, 0], quaternion[:, 1], quaternion[:, 2], quaternion[:, 3]
-
-#     R = torch.zeros((quaternion.shape[0], 3, 3), device=quaternion.device)
-
-#     R[:, 0, 0] = 1 - 2*y**2 - 2*z**2
-#     R[:, 0, 1] = 2*x*y - 2*w*z
-#     R[:, 0, 2] = 2*x*z + 2*w*y
-
-#     R[:, 1, 0] = 2*x*y + 2*w*z
-#     R[:, 1, 1] = 1 - 2*x**2 - 2*z**2
-#     R[:, 1, 2] = 2*y*z - 2*w*x
-
-#     R[:, 2, 0] = 2*x*z - 2*w*y
-#     R[:, 2, 1] = 2*y*z + 2*w*x
-#     R[:, 2, 2] = 1 - 2*x**2 - 2*y**2
-
-#     if squeeze:
-#         R = R.squeeze(0)
-
-#     return R
-
-# if __name__ == "__main__":
-#     model = BuildingPoseNet(num_buildings=5)
-
-#     batch_size = 4
-#     dummy_input = torch.randn(batch_size, 3, 224, 224)
-
-#     output = model(dummy_input)
-
-#     quat = output['rotation'][0]
-#     R = quaternion_to_rotation_matrix(quat)
-
-
-
+import json
+import cv2
 import torch
 import torch.nn as nn
 
+from classificator_training.data.dataset import preprocess_single_image
+from classificator_training.model.feature_extractor import FeatureExtractor
+from classificator_training.utils import move_to_device
+
 
 class BuildingPoseNet(nn.Module):
     """
@@ -443,32 +233,73 @@ def quaternion_to_rotation_matrix(quaternion):
     return R
 
 
-
-
-
-
-
-
-    # Create dummy features (simulating pre-extracted CLIP features)
-    batch_size = 4
-    dummy_features = torch.randn(batch_size, 1280)
-
-    # Forward pass
-    output = model(dummy_features)
-
-    print("\nNetwork Output Shapes:")
-    print(f"Building logits: {output['building_logits'].shape}")
-    print(f"Rotation (quaternion): {output['rotation'].shape}")
-    print(f"Translation: {output['translation'].shape}")
-    print(f"Confidence: {output['confidence'].shape}")
+
+class BuildingPoseEstimator:
+    """
+    Process folders of building images and generate pose predictions
+    Works with models trained on cached CLIP features
+    """
 
-
-
-
-
-
+    def __init__(self, model_path, config_path, feature_extractor: FeatureExtractor, device='cuda'):
+        self.device = device
+
+        with open(config_path, 'r') as f:
+            self.config = json.load(f)
+
+        self.feature_extractor = feature_extractor
+        feature_dim = feature_extractor.feature_dims['clip']
+
+        self.model = BuildingPoseNet(
+            num_buildings=self.config['num_buildings'],
+            feature_dim=feature_dim
+        )
+
+        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
+        self.model.load_state_dict(checkpoint['model_state_dict'])
+        self.model.to(device)
+        self.model.eval()
 
-
-
-
-
+    def predict_pose(self, image):
+        """
+        Predict pose from single image
+
+        Args:
+            image: numpy array (H, W, 3) - raw image
+
+        Returns:
+            dict with pose predictions
+        """
+        # Step 1: Extract CLIP features
+        inputs = preprocess_single_image(image, self.feature_extractor.use_models)
+        inputs = move_to_device(inputs, "cuda" if torch.cuda.is_available() else "cpu")
+        features = self.feature_extractor.extract_feature_from_model("clip", inputs["clip_input"])
+
+        # Step 2: Predict pose from features
+        with torch.no_grad():
+            output = self.model(features)
+
+        # Parse outputs
+        building_id = torch.argmax(output['building_logits'], dim=1).item()
+        building_probs = torch.softmax(output['building_logits'], dim=1)[0].cpu().numpy()
+        rotation_quat = output['rotation'][0].cpu().numpy()
+        translation = output['translation'][0].cpu().numpy()
+        confidence = output['confidence'][0].item()
+
+        # Convert quaternion to rotation matrix
+        rotation_matrix = quaternion_to_rotation_matrix(
+            torch.from_numpy(rotation_quat)
+        ).numpy()
+
+        return {
+            'building_id': building_id,
+            'building_name': self.config['buildings'][building_id],
+            'building_probabilities': {
+                self.config['buildings'][i]: float(building_probs[i])
+                for i in range(len(building_probs))
+            },
+            'rotation_quaternion': rotation_quat.tolist(),
+            'rotation_matrix': rotation_matrix.tolist(),
+            'translation': translation.tolist(),
+            'confidence': float(confidence)
+        }
+
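Note: a quick sanity check of quaternion_to_rotation_matrix, which this module keeps. Using the [w, x, y, z] convention documented in the function, the identity quaternion should map to the identity matrix, and a 90-degree rotation about z should give the standard rotation matrix (a minimal sketch, assuming the module imports as shown in the diff):

import math
import torch
from myapp.AR.pose_network import quaternion_to_rotation_matrix

# Identity rotation: w=1, x=y=z=0 maps to the 3x3 identity matrix.
R_id = quaternion_to_rotation_matrix(torch.tensor([1.0, 0.0, 0.0, 0.0]))
assert torch.allclose(R_id, torch.eye(3), atol=1e-6)

# 90-degree rotation about the z axis: w=cos(pi/4), z=sin(pi/4).
q = torch.tensor([math.cos(math.pi / 4), 0.0, 0.0, math.sin(math.pi / 4)])
expected = torch.tensor([[0.0, -1.0, 0.0],
                         [1.0,  0.0, 0.0],
                         [0.0,  0.0, 1.0]])
assert torch.allclose(quaternion_to_rotation_matrix(q), expected, atol=1e-6)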
myapp/main.py
CHANGED

@@ -3,20 +3,21 @@ from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from myapp.routers import buildings_router
 from myapp.routers import buildings_search_router
-from
+from myapp.utils.load_best_model import load_best_model
 from myapp.AR.model_template import TRAINED_CLASSIFICATION_MODEL
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     print("Starting up: Loading Neural Network Model...")
     try:
-        model, prototype_tensor, class_ids, feature_extractor, id_to_tag, tag_to_id = load_best_model()
+        model, prototype_tensor, class_ids, feature_extractor, id_to_tag, tag_to_id, pose_model = load_best_model()
         TRAINED_CLASSIFICATION_MODEL["model"] = model
         TRAINED_CLASSIFICATION_MODEL["prototype_tensor"] = prototype_tensor
         TRAINED_CLASSIFICATION_MODEL["class_ids"] = class_ids
         TRAINED_CLASSIFICATION_MODEL["feature_extractor"] = feature_extractor
         TRAINED_CLASSIFICATION_MODEL["id_to_tag"] = id_to_tag
         TRAINED_CLASSIFICATION_MODEL['tag_to_id'] = tag_to_id
+        TRAINED_CLASSIFICATION_MODEL["pose_model"] = pose_model
         TRAINED_CLASSIFICATION_MODEL["model"].eval()
 
         print("Model loaded.")
{classificator_training/helpers → myapp/utils}/load_best_model.py
RENAMED

@@ -1,11 +1,11 @@
 from argparse import Namespace
-import json
 import os
+from pathlib import Path
 import torch
 from classificator_training.model.model import FusedFeatureModel
 from classificator_training.model.feature_extractor import FeatureExtractor
 from classificator_training.helpers.args import _override_args_from_model_name
-from
+from myapp.AR.pose_network import BuildingPoseEstimator
 
 ENVIRONEMENT = os.getenv("ENVIRONMENT", "production")
 MODEL_PATH = "1_fused_feature_model.pth_full_clip1_segformer0_midas0_dpt0_gate0_batch64_traintypehardmining_bigfusionhead2_lr2e-07_margin1.2_alpha64.0_datasetsize114272_rendersBlenderRenders7_testdatatest_data3.model"
@@ -18,9 +18,9 @@ FULL_YOLO_MODEL_PATH = FULL_MODEL_PATH + YOLO_MODEL_PATH
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 def load_best_model():
-    args = Namespace(clip=1, segformer=0, midas=0, dpt=0, gate=0, big_fusion_head=2, train_type='hardmining', lr=2e-07, margin=
+    args = Namespace(clip=1, segformer=0, midas=0, dpt=0, gate=0, big_fusion_head=2, train_type='hardmining', lr=2e-07, margin=1.2, alpha=64.0)
     args = _override_args_from_model_name(args, MODEL_PATH)
-
+    print("Loading model with args:", args)
     MODELS_USED = {
         'clip': args.clip,
         'segformer': args.segformer,
@@ -41,7 +41,17 @@ def load_best_model():
     class_ids = checkpoint.get('class_ids', None)
     id_to_tag = checkpoint.get('id_to_tag', None)
     tag_to_id = {tag: id for id, tag in id_to_tag.items()}
-
-    return model, prototype_tensor, class_ids, feature_extractor, id_to_tag, tag_to_id
+
+    current_dir = Path(__file__).parent
+    file_path = current_dir.parent / 'AR'
+    print("Loaded")
+    pose_model = BuildingPoseEstimator(
+        model_path=file_path / 'best_model.pth',
+        config_path=file_path / 'dataset.json',
+        feature_extractor=feature_extractor,
+        device='cuda' if torch.cuda.is_available() else 'cpu',
+    )
+
+    return model, prototype_tensor, class_ids, feature_extractor, id_to_tag, tag_to_id, pose_model
 
 
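Note: with the rename and the new return value, the startup flow reads end to end as below, a minimal sketch as a hypothetical standalone script (it assumes best_model.pth and dataset.json are present in myapp/AR/, as the loader above expects; the removed old return line was reconstructed from main.py's previous 6-value unpacking):

from myapp.utils.load_best_model import load_best_model
from myapp.AR.model_template import TRAINED_CLASSIFICATION_MODEL

# load_best_model() now returns a 7-tuple; the new last element is the pose estimator.
(model, prototype_tensor, class_ids,
 feature_extractor, id_to_tag, tag_to_id, pose_model) = load_best_model()

# Mirror what main.py's lifespan hook does with the shared registry.
TRAINED_CLASSIFICATION_MODEL["model"] = model
TRAINED_CLASSIFICATION_MODEL["pose_model"] = pose_model
TRAINED_CLASSIFICATION_MODEL["model"].eval()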