ZZZdream95 committed
Commit 1b1014b
Parent: 0aa47d4

working example
.gitignore CHANGED
@@ -188,4 +188,5 @@ myapp/debug_template.jpg
 
 debug_*.jpg
 *_clip_*.pth
-batch_run_*.txt
+batch_run_*.txt
+.claude/
classificator_training/helpers/args.py CHANGED
@@ -56,7 +56,7 @@ def _override_args_from_model_name(args: Namespace, load_model_name: str, verbos
     )
 
     match = pattern.search(load_model_name)
-
+    print(match)
     if match:
        extracted_values = match.groupdict()
 
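Reviewer note: the added print exposes the regex match that drives the arg override. For context, the model filename (see MODEL_PATH in load_best_model.py below) encodes hyperparameters; a hypothetical sketch of the mechanism, with an illustrative pattern only — the real one lives in args.py:

    import re

    # Hypothetical pattern for illustration; extracts two of the encoded fields.
    name = "..._clip1_segformer0_..._lr2e-07_margin1.2_alpha64.0_..."
    m = re.search(r"clip(?P<clip>\d).*?margin(?P<margin>[\d.]+)", name)
    print(m.groupdict() if m else None)  # e.g. {'clip': '1', 'margin': '1.2'}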
classificator_training/model/feature_extractor.py CHANGED
@@ -162,4 +162,35 @@ class FeatureExtractor(nn.Module):
             extracted_feature_dict['vit'] = self.vit(self._ensure_batch_dim(vit_input))
 
         return extracted_feature_dict
+
+    def extract_feature_from_model(self, model_name: str, input_tensor: torch.Tensor):
+        """
+        Extracts features from a specific backbone model.
+        """
+        if model_name not in self.use_models or not self.use_models[model_name]:
+            raise ValueError(f"Model '{model_name}' is not enabled in use_models.")
+
+        input_tensor = self._ensure_batch_dim(input_tensor)
+
+        if model_name == 'clip':
+            return self.clip_model(input_tensor).pooler_output
+        elif model_name == 'segformer':
+            output = self.segformer_model(input_tensor, output_hidden_states=True)
+            return output.last_hidden_state.mean(dim=[2, 3])
+        elif model_name == 'dpt':
+            output = self.dpt_model(input_tensor)
+            return output.pooler_output
+        elif model_name == 'midas':
+            output = self.midas_model(input_tensor, output_hidden_states=True)
+            return output.hidden_states[-1][:, 1:, :].mean(dim=1)
+        elif model_name == 'resnet':
+            return self.resnet(input_tensor)
+        elif model_name == 'mobilenet':
+            return self.mobilenet(input_tensor)
+        elif model_name == 'efficientnet':
+            return self.efficientnet(input_tensor)
+        elif model_name == 'vit':
+            return self.vit(input_tensor)
+        else:
+            raise ValueError(f"Unknown model name: {model_name}")
 
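Reviewer note: a minimal usage sketch of the new dispatch method. The FeatureExtractor constructor call and input shape below are assumptions for illustration; the production wiring goes through preprocess_single_image and load_best_model later in this commit.

    # Hypothetical sketch — the use_models dict mirrors MODELS_USED in load_best_model.
    import torch
    from classificator_training.model.feature_extractor import FeatureExtractor

    extractor = FeatureExtractor({'clip': 1, 'segformer': 0, 'midas': 0, 'dpt': 0})  # assumed signature
    pixels = torch.randn(3, 224, 224)  # single image; _ensure_batch_dim adds the batch axis
    with torch.no_grad():
        clip_features = extractor.extract_feature_from_model('clip', pixels)  # -> [1, feature_dim]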
classificator_training/run_script.sh CHANGED
@@ -185,8 +185,6 @@ RESULTS_FILE="batch_run_results2.txt"
 
 
 for SET in "${PARAMETER_SETS[@]}"; do
-    # Read 15 variables (plus P16/Alpha if needed, currently Alpha is treated as part of the set logic usually)
-    # Mapping based on new order:
     read P_CLIP P_SEG P_MIDAS P_DPT P_RES P_MOB P_EFF P_VIT P_GATE P_BATCH P_TYPE P_HEAD P_LR P_MARGIN P_ALPHA <<< "$SET"
 
     echo "--- Starting run with Models: Clip=$P_CLIP Seg=$P_SEG Midas=$P_MIDAS Dpt=$P_DPT Res=$P_RES Mob=$P_MOB Eff=$P_EFF Vit=$P_VIT ---"
@@ -210,7 +208,6 @@ for SET in "${PARAMETER_SETS[@]}"; do
 
     echo "$COMMAND_TO_RUN"
 
-    # Check if this is a "silent" run (P11 logic from your old script was unclear, assuming standard logging here)
     OUTPUT=$($COMMAND_TO_RUN 2>&1 | tee /dev/tty | tail -n 1)
     EXIT_CODE=${PIPESTATUS[0]}
 
@@ -222,7 +219,6 @@ for SET in "${PARAMETER_SETS[@]}"; do
         echo "Run finished **successfully**."
     else
         echo "Run **failed**. Stopping batch."
-        # break # Uncomment to stop on failure
     fi
     echo "---"
 done
myapp/.gitignore CHANGED
@@ -171,4 +171,5 @@ cython_debug/
 .ruff_cache/
 
 # PyPI configuration file
-.pypirc
+.pypirc
+.claude/
myapp/AR/model_template.py CHANGED
@@ -5,6 +5,6 @@ TRAINED_CLASSIFICATION_MODEL = {
     "feature_extractor": None,
     "id_to_tag": None,
     "tag_to_id": None,
-    "yolo_model": None,
+    "pose_model": None,
     "keypoints": None
 }
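Reviewer note: the rename matters downstream, since consumers look this slot up by key — a stale "yolo_model" read would now raise KeyError. A sketch of the intended read pattern at a hypothetical call site:

    from myapp.AR.model_template import TRAINED_CLASSIFICATION_MODEL

    pose_model = TRAINED_CLASSIFICATION_MODEL["pose_model"]  # populated by the lifespan hook in myapp/main.py
    if pose_model is None:
        raise RuntimeError("pose_model not loaded; app startup has not completed")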
myapp/AR/pose_estimate_utils.py CHANGED
@@ -1,8 +1,8 @@
-import json
 from typing import Dict, List, Tuple
-
 from transformers import CLIPConfig, CLIPProcessor, CLIPVisionModel
+from classificator_training.data.dataset import preprocess_single_image
 from classificator_training.model.feature_extractor import CLIP_MODEL_ID
+from classificator_training.utils import move_to_device
 from myapp.AR.model_template import TRAINED_CLASSIFICATION_MODEL
 import os
 import io
@@ -14,16 +14,9 @@ import numpy as np
 import torch
 import cv2
 import numpy as np
-import json
-import argparse
-from pathlib import Path
-from tqdm import tqdm
 import trimesh
 import pyrender
-from torchvision import transforms
-from datetime import datetime
 
-from myapp.AR.pose_network import BuildingPoseNet, quaternion_to_rotation_matrix
 
 def get_camera_matrix(img_w, img_h):
     """
@@ -41,183 +34,6 @@ def get_camera_matrix(img_w, img_h):
         [0, fy, cy],
         [0, 0, 1]
     ], dtype=np.float32)
-
-
-class BatchBuildingPoseEstimator:
-    """
-    Process folders of building images and generate pose predictions
-    Works with models trained on cached CLIP features
-    """
-
-    def __init__(self, model_path, config_path, device='cuda'):
-        self.device = device
-
-        # Load config
-        with open(config_path, 'r') as f:
-            self.config = json.load(f)
-
-        print("Loading CLIP model for feature extraction...")
-        # Load CLIP for feature extraction
-        clip_config = CLIPConfig.from_pretrained(CLIP_MODEL_ID)
-        self.clip_model = CLIPVisionModel.from_pretrained(
-            CLIP_MODEL_ID,
-            config=clip_config.vision_config,
-        )
-        self.clip_model.to(device)
-        self.clip_model.eval()
-
-        self.clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_ID)
-
-        feature_dim = self.clip_model.config.hidden_size
-        print(f"✓ CLIP loaded (feature dim: {feature_dim})")
-
-        # Load pose estimation model
-        print("Loading pose estimation model...")
-        self.model = BuildingPoseNet(
-            num_buildings=self.config['num_buildings'],
-            feature_dim=feature_dim
-        )
-
-        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
-        self.model.load_state_dict(checkpoint['model_state_dict'])
-        self.model.to(device)
-        self.model.eval()
-
-        print(f"✓ Pose estimation model loaded successfully:")
-        print(f"  Model path: {model_path}")
-        print(f"  Device: {device}")
-        print(f"  Number of buildings: {self.config['num_buildings']}")
-
-    def extract_clip_features(self, image):
-        """
-        Extract CLIP features from a single image
-
-        Args:
-            image: numpy array (H, W, 3) in BGR or RGB format
-
-        Returns:
-            features: torch.Tensor [1, feature_dim]
-        """
-        # Convert BGR to RGB if needed
-        if len(image.shape) == 2:
-            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
-        elif image.shape[2] == 3:
-            # Assume BGR from OpenCV, convert to RGB
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-
-        # Process image with CLIP processor
-        inputs = self.clip_processor(images=image, return_tensors="pt")
-        pixel_values = inputs['pixel_values'].to(self.device)
-
-        # Extract features
-        with torch.no_grad():
-            outputs = self.clip_model(pixel_values=pixel_values)
-            features = outputs.pooler_output  # [1, feature_dim]
-
-        return features
-
-    def predict_pose(self, image):
-        """
-        Predict pose from single image
-
-        Args:
-            image: numpy array (H, W, 3) - raw image
-
-        Returns:
-            dict with pose predictions
-        """
-        # Step 1: Extract CLIP features
-        features = self.extract_clip_features(image)
-
-        # Step 2: Predict pose from features
-        with torch.no_grad():
-            output = self.model(features)
-
-        # Parse outputs
-        building_id = torch.argmax(output['building_logits'], dim=1).item()
-        building_probs = torch.softmax(output['building_logits'], dim=1)[0].cpu().numpy()
-        rotation_quat = output['rotation'][0].cpu().numpy()
-        translation = output['translation'][0].cpu().numpy()
-        confidence = output['confidence'][0].item()
-
-        # Convert quaternion to rotation matrix
-        rotation_matrix = quaternion_to_rotation_matrix(
-            torch.from_numpy(rotation_quat)
-        ).numpy()
-
-        return {
-            'building_id': building_id,
-            'building_name': self.config['buildings'][building_id],
-            'building_probabilities': {
-                self.config['buildings'][i]: float(building_probs[i])
-                for i in range(len(building_probs))
-            },
-            'rotation_quaternion': rotation_quat.tolist(),
-            'rotation_matrix': rotation_matrix.tolist(),
-            'translation': translation.tolist(),
-            'confidence': float(confidence)
-        }
-
-    def predict_pose_batch(self, images):
-        """
-        Predict poses for a batch of images (faster)
-
-        Args:
-            images: list of numpy arrays
-
-        Returns:
-            list of prediction dicts
-        """
-        # Convert all to RGB
-        rgb_images = []
-        for img in images:
-            if len(img.shape) == 2:
-                img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
-            elif img.shape[2] == 3:
-                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-            rgb_images.append(img)
-
-        # Extract features for all images
-        inputs = self.clip_processor(images=rgb_images, return_tensors="pt")
-        pixel_values = inputs['pixel_values'].to(self.device)
-
-        with torch.no_grad():
-            # Extract CLIP features
-            outputs = self.clip_model(pixel_values=pixel_values)
-            features = outputs.pooler_output  # [batch_size, feature_dim]
-
-            # Predict poses
-            predictions = self.model(features)
-
-        # Parse results
-        results = []
-        batch_size = len(images)
-
-        for i in range(batch_size):
-            building_id = torch.argmax(predictions['building_logits'][i]).item()
-            building_probs = torch.softmax(predictions['building_logits'][i], dim=0).cpu().numpy()
-            rotation_quat = predictions['rotation'][i].cpu().numpy()
-            translation = predictions['translation'][i].cpu().numpy()
-            confidence = predictions['confidence'][i].item()
-
-            rotation_matrix = quaternion_to_rotation_matrix(
-                torch.from_numpy(rotation_quat)
-            ).numpy()
-
-            results.append({
-                'building_id': building_id,
-                'building_name': self.config['buildings'][building_id],
-                'building_probabilities': {
-                    self.config['buildings'][j]: float(building_probs[j])
-                    for j in range(len(building_probs))
-                },
-                'rotation_quaternion': rotation_quat.tolist(),
-                'rotation_matrix': rotation_matrix.tolist(),
-                'translation': translation.tolist(),
-                'confidence': float(confidence)
-            })
-
-        return results
 
 
 def get_common_camera_intrinsics(img_width: int, img_height: int) -> List[Dict]:
@@ -439,13 +255,7 @@ def process_image_heuristic(img_bytes: bytes, model_bytes: bytes, model_name: st
 
     print("Running DL Pose Prediction...")
 
-
-    pose_estimator = BatchBuildingPoseEstimator(
-        model_path='AR/best_model.pth',
-        config_path='AR/dataset.json',
-        device='cuda' if torch.cuda.is_available() else 'cpu',
-    )
-
+    pose_estimator = TRAINED_CLASSIFICATION_MODEL['pose_model']
     pose_result = pose_estimator.predict_pose(img)
 
     R_blender = pose_result['rotation_matrix']
@@ -485,68 +295,6 @@ def process_image_heuristic(img_bytes: bytes, model_bytes: bytes, model_name: st
     rot_matrix[0:3, 0:3] = blender_to_opencv
     mesh_final.apply_transform(rot_matrix)
 
-    # camera_intrinsics = np.array([
-    #     [
-    #         796.4444444444445,
-    #         0,
-    #         512.0
-    #     ],
-    #     [
-    #         0,
-    #         796.4444444444445,
-    #         512.0
-    #     ],
-    #     [
-    #         0,
-    #         0,
-    #         1
-    #     ]
-    # ])
-
-    # fx = camera_intrinsics[0, 0]
-    # fy = camera_intrinsics[1, 1]
-    # cx = camera_intrinsics[0, 2]
-    # cy = camera_intrinsics[1, 2]
-
-    # training_width = 1024
-    # training_height = 1024
-    # scale_x = w_img / training_width
-    # scale_y = h_img / training_height
-
-    # fx_scaled = fx * scale_x
-    # fy_scaled = fy * scale_y
-    # cx_scaled = cx * scale_x
-    # cy_scaled = cy * scale_y
-
-    # view_matrix = np.eye(4)
-    # view_matrix[:3, :3] = R_opencv
-    # view_matrix[:3, 3] = t_opencv.squeeze()
-
-    # cam_pose_cv = np.linalg.inv(view_matrix)
-
-    # cv_to_gl = np.array([[1,0,0,0], [0,-1,0,0], [0,0,-1,0], [0,0,0,1]])
-    # final_pose = cam_pose_cv @ cv_to_gl
-
-    # scene = pyrender.Scene(bg_color=[0.0, 0.0, 0.0, 0.0], ambient_light=[0.8, 0.8, 0.8])
-    # scene.add(pyrender.Mesh.from_trimesh(mesh_final))
-
-    # cam = pyrender.IntrinsicsCamera(
-    #     fx=fx_scaled,
-    #     fy=fy_scaled,
-    #     cx=cx_scaled,
-    #     cy=cy_scaled,
-    #     znear=0.05,
-    #     zfar=1000.0
-    # )
-
-    # scene.add(cam, pose=final_pose)
-
-    # light = pyrender.PointLight(color=[1.0, 1.0, 1.0], intensity=1000.0)
-    # scene.add(light, pose=final_pose)
-
-    # r = pyrender.OffscreenRenderer(w_img, h_img)
-    # color, depth = r.render(scene, flags=pyrender.RenderFlags.RGBA)
-    # r.delete()
     camera_configs = get_common_camera_intrinsics(w_img, h_img)[:3]
 
     best_score = -1
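Reviewer note: process_image_heuristic now borrows the estimator loaded once at startup instead of constructing a fresh CLIP + pose model per request. A sketch of the contract it relies on, with keys taken from predict_pose (moved to pose_network.py below):

    from myapp.AR.model_template import TRAINED_CLASSIFICATION_MODEL

    pose_estimator = TRAINED_CLASSIFICATION_MODEL['pose_model']
    pose_result = pose_estimator.predict_pose(img)  # img: (H, W, 3) numpy array decoded earlier
    R_blender = pose_result['rotation_matrix']      # 3x3 as nested lists
    t_vec = pose_result['translation']              # [x, y, z]
    label = pose_result['building_name']            # plus 'confidence', 'building_probabilities', ...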
myapp/AR/pose_network.py CHANGED
@@ -1,222 +1,12 @@
-# """
-# Pose Estimation Network for Building Alignment
-# Combines classification with pose regression for accurate 6DoF estimation
-# """
-
-# import torch
-# import torch.nn as nn
-# import torchvision.models as models
-# from torchvision.models import efficientnet_b3, EfficientNet_B3_Weights
-
-# class BuildingPoseNet(nn.Module):
-#     """
-#     Neural network for estimating 6DoF pose of buildings from images.
-
-#     Architecture:
-#     - EfficientNet-B3 backbone (pre-trained on ImageNet)
-#     - Separate heads for rotation and translation
-#     - Rotation: quaternion representation (4 values)
-#     - Translation: 3D position (3 values)
-#     """
-
-#     def __init__(self, num_buildings=10, pretrained=True):
-#         super(BuildingPoseNet, self).__init__()
-
-#         if pretrained:
-#             weights = EfficientNet_B3_Weights.IMAGENET1K_V1
-#             self.backbone = efficientnet_b3(weights=weights)
-#         else:
-#             self.backbone = efficientnet_b3(weights=None)
-
-#         feature_dim = self.backbone.classifier[1].in_features
-
-#         self.backbone.classifier = nn.Identity()
-
-#         self.building_classifier = nn.Sequential(
-#             nn.Linear(feature_dim, 512),
-#             nn.ReLU(),
-#             nn.Dropout(0.3),
-#             nn.Linear(512, num_buildings)
-#         )
-
-#         self.rotation_head = nn.Sequential(
-#             nn.Linear(feature_dim, 512),
-#             nn.ReLU(),
-#             nn.Dropout(0.2),
-#             nn.Linear(512, 256),
-#             nn.ReLU(),
-#             nn.Linear(256, 4)
-#         )
-
-#         self.translation_head = nn.Sequential(
-#             nn.Linear(feature_dim, 512),
-#             nn.ReLU(),
-#             nn.Dropout(0.2),
-#             nn.Linear(512, 256),
-#             nn.ReLU(),
-#             nn.Linear(256, 3)
-#         )
-
-#         self.confidence_head = nn.Sequential(
-#             nn.Linear(feature_dim, 256),
-#             nn.ReLU(),
-#             nn.Linear(256, 1),
-#             nn.Sigmoid()
-#         )
-
-#     def forward(self, x):
-#         features = self.backbone(x)
-
-#         building_logits = self.building_classifier(features)
-#         rotation_quat = self.rotation_head(features)
-#         translation = self.translation_head(features)
-#         confidence = self.confidence_head(features)
-
-#         rotation_quat = rotation_quat / (torch.norm(rotation_quat, dim=1, keepdim=True) + 1e-8)
-
-#         return {
-#             'building_logits': building_logits,
-#             'rotation': rotation_quat,
-#             'translation': translation,
-#             'confidence': confidence
-#         }
-
-
-# class PoseLoss(nn.Module):
-#     """
-#     Combined loss for pose estimation training
-#     """
-
-#     def __init__(self, rotation_weight=1.0, translation_weight=1.0,
-#                  classification_weight=0.5):
-#         super(PoseLoss, self).__init__()
-#         self.rotation_weight = rotation_weight
-#         self.translation_weight = translation_weight
-#         self.classification_weight = classification_weight
-#         self.ce_loss = nn.CrossEntropyLoss()
-
-#     def quaternion_distance(self, q1, q2):
-#         """
-#         Compute geodesic distance between quaternions
-#         Returns angle in radians
-#         """
-#         q1 = q1 / (torch.norm(q1, dim=1, keepdim=True) + 1e-8)
-#         q2 = q2 / (torch.norm(q2, dim=1, keepdim=True) + 1e-8)
-
-#         dot_product = torch.abs(torch.sum(q1 * q2, dim=1))
-#         dot_product = torch.clamp(dot_product, -1.0, 1.0)
-
-#         return 2 * torch.acos(dot_product)
-
-#     def forward(self, predictions, targets):
-#         """
-#         Args:
-#             predictions: dict with 'rotation', 'translation', 'building_logits'
-#             targets: dict with 'rotation', 'translation', 'building_id'
-#         """
-#         pred_rot = predictions['rotation']
-#         pred_rot = pred_rot / (torch.norm(pred_rot, dim=1, keepdim=True) + 1e-8)
-
-#         target_rot = targets['rotation']
-#         target_rot = target_rot / (torch.norm(target_rot, dim=1, keepdim=True) + 1e-8)
-
-#         rot_loss = self.quaternion_distance(pred_rot, target_rot).mean()
-
-#         rot_loss = torch.clamp(rot_loss, 0, 10.0)
-
-#         if torch.isnan(rot_loss):
-#             rot_loss = torch.tensor(1.0, device=rot_loss.device)
-
-#         trans_loss = torch.nn.functional.mse_loss(
-#             predictions['translation'],
-#             targets['translation']
-#         )
-
-#         trans_loss = torch.clamp(trans_loss, 0, 100.0)
-
-#         if torch.isnan(trans_loss):
-#             trans_loss = torch.tensor(1.0, device=trans_loss.device)
-
-#         cls_loss = 0
-#         if 'building_id' in targets and 'building_logits' in predictions:
-#             cls_loss = self.ce_loss(
-#                 predictions['building_logits'],
-#                 targets['building_id']
-#             )
-#             if torch.isnan(cls_loss):
-#                 cls_loss = torch.tensor(0.1, device=predictions['building_logits'].device)
-
-#         total_loss = (
-#             self.rotation_weight * rot_loss +
-#             self.translation_weight * trans_loss +
-#             self.classification_weight * cls_loss
-#         )
-
-#         if torch.isnan(total_loss) or torch.isinf(total_loss):
-#             total_loss = torch.tensor(1.0, device=total_loss.device, requires_grad=True)
-
-#         return {
-#             'total_loss': total_loss,
-#             'rotation_loss': rot_loss,
-#             'translation_loss': trans_loss,
-#             'classification_loss': cls_loss
-#         }
-
-# def quaternion_to_rotation_matrix(quaternion):
-#     """
-#     Convert quaternion to 3x3 rotation matrix
-
-#     Args:
-#         quaternion: torch.Tensor of shape (4,) or (N, 4) [w, x, y, z]
-
-#     Returns:
-#         Rotation matrix of shape (3, 3) or (N, 3, 3)
-#     """
-#     if quaternion.dim() == 1:
-#         quaternion = quaternion.unsqueeze(0)
-#         squeeze = True
-#     else:
-#         squeeze = False
-
-#     quaternion = quaternion / torch.norm(quaternion, dim=1, keepdim=True)
-
-#     w, x, y, z = quaternion[:, 0], quaternion[:, 1], quaternion[:, 2], quaternion[:, 3]
-
-#     R = torch.zeros((quaternion.shape[0], 3, 3), device=quaternion.device)
-
-#     R[:, 0, 0] = 1 - 2*y**2 - 2*z**2
-#     R[:, 0, 1] = 2*x*y - 2*w*z
-#     R[:, 0, 2] = 2*x*z + 2*w*y
-
-#     R[:, 1, 0] = 2*x*y + 2*w*z
-#     R[:, 1, 1] = 1 - 2*x**2 - 2*z**2
-#     R[:, 1, 2] = 2*y*z - 2*w*x
-
-#     R[:, 2, 0] = 2*x*z - 2*w*y
-#     R[:, 2, 1] = 2*y*z + 2*w*x
-#     R[:, 2, 2] = 1 - 2*x**2 - 2*y**2
-
-#     if squeeze:
-#         R = R.squeeze(0)
-
-#     return R
-
-# if __name__ == "__main__":
-#     model = BuildingPoseNet(num_buildings=5)
-
-#     batch_size = 4
-#     dummy_input = torch.randn(batch_size, 3, 224, 224)
-
-#     output = model(dummy_input)
-
-#     quat = output['rotation'][0]
-#     R = quaternion_to_rotation_matrix(quat)
-
-
-
+import json
+import cv2
 import torch
 import torch.nn as nn
 
+from classificator_training.data.dataset import preprocess_single_image
+from classificator_training.model.feature_extractor import FeatureExtractor
+from classificator_training.utils import move_to_device
+
 
 class BuildingPoseNet(nn.Module):
     """
@@ -443,32 +233,73 @@ def quaternion_to_rotation_matrix(quaternion):
     return R
 
 
-if __name__ == "__main__":
-    # Test the network
-    print("Testing BuildingPoseNetCached...")
-
-    model = BuildingPoseNet(num_buildings=5, feature_dim=1280)
-
-    # Create dummy features (simulating pre-extracted CLIP features)
-    batch_size = 4
-    dummy_features = torch.randn(batch_size, 1280)
-
-    # Forward pass
-    output = model(dummy_features)
-
-    print("\nNetwork Output Shapes:")
-    print(f"Building logits: {output['building_logits'].shape}")
-    print(f"Rotation (quaternion): {output['rotation'].shape}")
-    print(f"Translation: {output['translation'].shape}")
-    print(f"Confidence: {output['confidence'].shape}")
+
+class BuildingPoseEstimator:
+    """
+    Process folders of building images and generate pose predictions
+    Works with models trained on cached CLIP features
+    """
 
-    # Test quaternion to rotation matrix conversion
-    quat = output['rotation'][0]
-    R = quaternion_to_rotation_matrix(quat)
-    print(f"\nRotation matrix shape: {R.shape}")
-    print(f"Is orthogonal: {torch.allclose(R @ R.T, torch.eye(3), atol=1e-5)}")
+    def __init__(self, model_path, config_path, feature_extractor: FeatureExtractor, device='cuda'):
+        self.device = device
+
+        with open(config_path, 'r') as f:
+            self.config = json.load(f)
+
+        self.feature_extractor = feature_extractor
+        feature_dim = feature_extractor.feature_dims['clip']
+
+        self.model = BuildingPoseNet(
+            num_buildings=self.config['num_buildings'],
+            feature_dim=feature_dim
+        )
+
+        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
+        self.model.load_state_dict(checkpoint['model_state_dict'])
+        self.model.to(device)
+        self.model.eval()
 
-    # Count parameters
-    total_params = sum(p.numel() for p in model.parameters())
-    print(f"\nTotal parameters: {total_params:,}")
-    print("(Much smaller than full CLIP model!)")
+    def predict_pose(self, image):
+        """
+        Predict pose from single image
+
+        Args:
+            image: numpy array (H, W, 3) - raw image
+
+        Returns:
+            dict with pose predictions
+        """
+        # Step 1: Extract CLIP features
+        inputs = preprocess_single_image(image, self.feature_extractor.use_models)
+        inputs = move_to_device(inputs, "cuda" if torch.cuda.is_available() else "cpu")
+        features = self.feature_extractor.extract_feature_from_model("clip", inputs["clip_input"])
+
+        # Step 2: Predict pose from features
+        with torch.no_grad():
+            output = self.model(features)
+
+        # Parse outputs
+        building_id = torch.argmax(output['building_logits'], dim=1).item()
+        building_probs = torch.softmax(output['building_logits'], dim=1)[0].cpu().numpy()
+        rotation_quat = output['rotation'][0].cpu().numpy()
+        translation = output['translation'][0].cpu().numpy()
+        confidence = output['confidence'][0].item()
+
+        # Convert quaternion to rotation matrix
+        rotation_matrix = quaternion_to_rotation_matrix(
+            torch.from_numpy(rotation_quat)
+        ).numpy()
+
+        return {
+            'building_id': building_id,
+            'building_name': self.config['buildings'][building_id],
+            'building_probabilities': {
+                self.config['buildings'][i]: float(building_probs[i])
+                for i in range(len(building_probs))
+            },
+            'rotation_quaternion': rotation_quat.tolist(),
+            'rotation_matrix': rotation_matrix.tolist(),
+            'translation': translation.tolist(),
+            'confidence': float(confidence)
+        }
+
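Reviewer note: a standalone usage sketch of the relocated BuildingPoseEstimator. The FeatureExtractor construction and the file paths are assumptions for illustration; load_best_model below shows the production wiring.

    import cv2
    import torch
    from classificator_training.model.feature_extractor import FeatureExtractor
    from myapp.AR.pose_network import BuildingPoseEstimator

    extractor = FeatureExtractor({'clip': 1, 'segformer': 0, 'midas': 0, 'dpt': 0})  # assumed signature
    estimator = BuildingPoseEstimator(
        model_path='myapp/AR/best_model.pth',   # assumed checkout-relative paths
        config_path='myapp/AR/dataset.json',
        feature_extractor=extractor,
        device='cuda' if torch.cuda.is_available() else 'cpu',
    )
    img = cv2.imread('example.jpg')             # (H, W, 3) BGR numpy array
    print(estimator.predict_pose(img)['building_name'])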
myapp/main.py CHANGED
@@ -3,20 +3,21 @@ from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from myapp.routers import buildings_router
 from myapp.routers import buildings_search_router
-from classificator_training.helpers.load_best_model import load_best_model
+from myapp.utils.load_best_model import load_best_model
 from myapp.AR.model_template import TRAINED_CLASSIFICATION_MODEL
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     print("Starting up: Loading Neural Network Model...")
     try:
-        model, prototype_tensor, class_ids, feature_extractor, id_to_tag, tag_to_id = load_best_model()
+        model, prototype_tensor, class_ids, feature_extractor, id_to_tag, tag_to_id, pose_model = load_best_model()
         TRAINED_CLASSIFICATION_MODEL["model"] = model
         TRAINED_CLASSIFICATION_MODEL["prototype_tensor"] = prototype_tensor
         TRAINED_CLASSIFICATION_MODEL["class_ids"] = class_ids
         TRAINED_CLASSIFICATION_MODEL["feature_extractor"] = feature_extractor
         TRAINED_CLASSIFICATION_MODEL["id_to_tag"] = id_to_tag
         TRAINED_CLASSIFICATION_MODEL['tag_to_id'] = tag_to_id
+        TRAINED_CLASSIFICATION_MODEL["pose_model"] = pose_model
         TRAINED_CLASSIFICATION_MODEL["model"].eval()
 
         print("Model loaded.")
{classificator_training/helpers → myapp/utils}/load_best_model.py RENAMED
@@ -1,11 +1,11 @@
 from argparse import Namespace
-import json
 import os
+from pathlib import Path
 import torch
 from classificator_training.model.model import FusedFeatureModel
 from classificator_training.model.feature_extractor import FeatureExtractor
 from classificator_training.helpers.args import _override_args_from_model_name
-from ultralytics import YOLO
+from myapp.AR.pose_network import BuildingPoseEstimator
 
 ENVIRONEMENT = os.getenv("ENVIRONMENT", "production")
 MODEL_PATH = "1_fused_feature_model.pth_full_clip1_segformer0_midas0_dpt0_gate0_batch64_traintypehardmining_bigfusionhead2_lr2e-07_margin1.2_alpha64.0_datasetsize114272_rendersBlenderRenders7_testdatatest_data3.model"
@@ -18,9 +18,9 @@ FULL_YOLO_MODEL_PATH = FULL_MODEL_PATH + YOLO_MODEL_PATH
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 def load_best_model():
-    args = Namespace(clip=1, segformer=0, midas=0, dpt=0, gate=0, big_fusion_head=2, train_type='hardmining', lr=2e-07, margin=0.8, alpha=64.0)
+    args = Namespace(clip=1, segformer=0, midas=0, dpt=0, gate=0, big_fusion_head=2, train_type='hardmining', lr=2e-07, margin=1.2, alpha=64.0)
     args = _override_args_from_model_name(args, MODEL_PATH)
-
+    print("Loading model with args:", args)
     MODELS_USED = {
         'clip': args.clip,
         'segformer': args.segformer,
@@ -41,7 +41,17 @@ def load_best_model():
     class_ids = checkpoint.get('class_ids', None)
     id_to_tag = checkpoint.get('id_to_tag', None)
     tag_to_id = {tag: id for id, tag in id_to_tag.items()}
-
-    return model, prototype_tensor, class_ids, feature_extractor, id_to_tag, tag_to_id
+
+    current_dir = Path(__file__).parent
+    file_path = current_dir.parent / 'AR'
+    print("Loaded")
+    pose_model = BuildingPoseEstimator(
+        model_path=file_path / 'best_model.pth',
+        config_path=file_path / 'dataset.json',
+        feature_extractor=feature_extractor,
+        device='cuda' if torch.cuda.is_available() else 'cpu',
+    )
+
+    return model, prototype_tensor, class_ids, feature_extractor, id_to_tag, tag_to_id, pose_model
 
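Reviewer note: a quick smoke test for the new seven-value return signature, assuming the .model checkpoint plus AR/best_model.pth and AR/dataset.json are present on disk:

    from myapp.utils.load_best_model import load_best_model

    (model, prototype_tensor, class_ids, feature_extractor,
     id_to_tag, tag_to_id, pose_model) = load_best_model()
    assert pose_model is not None  # new seventh value, wired into TRAINED_CLASSIFICATION_MODEL in main.py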