| import torch |
| from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor |
| from PIL import Image |
|
|
| def analyze_construction_site(image_path: str, device: str = "cuda") -> str: |
| """ |
| Uses Qwen2-VL (Track 3) to process a construction site image (e.g., from a drone) |
| and output a structured technical description. This description acts as the 'Context' |
| for the fine-tuned Track 2 Compliance Auditor model. |
| """ |
| |
| |
| model_id = "Qwen/Qwen2-VL-7B-Instruct" |
| |
| print(f"Loading {model_id} on {device}...") |
| try: |
| model = Qwen2VLForConditionalGeneration.from_pretrained( |
| model_id, |
| torch_dtype=torch.bfloat16, |
| device_map=device |
| ) |
| processor = AutoProcessor.from_pretrained(model_id) |
| except Exception as e: |
| print(f"Model loading failed (this is expected if weights aren't downloaded): {e}") |
| |
| return _mocked_vision_output() |
|
|
| |
| try: |
| image = Image.open(image_path).convert("RGB") |
| except Exception as e: |
| raise ValueError(f"Could not load image at {image_path}: {e}") |
|
|
| |
| prompt = ( |
| "You are an expert construction site inspector. Describe the structural elements, " |
| "materials, and construction practices visible in this image. Focus on technical " |
| "details like concrete pouring, rebar placement, structural steel connections, " |
| "and any visible environmental exposure factors. Be highly descriptive and objective." |
| ) |
| |
| |
| messages = [ |
| { |
| "role": "user", |
| "content": [ |
| {"type": "image", "image": image}, |
| {"type": "text", "text": prompt}, |
| ], |
| } |
| ] |
| |
| |
| text = processor.apply_chat_template( |
| messages, tokenize=False, add_generation_prompt=True |
| ) |
| image_inputs, video_inputs = processor.image_processor(image), None |
| inputs = processor( |
| text=[text], |
| images=image_inputs, |
| videos=video_inputs, |
| padding=True, |
| return_tensors="pt", |
| ) |
| inputs = inputs.to(device) |
|
|
| |
| print("Analyzing image...") |
| with torch.no_grad(): |
| generated_ids = model.generate(**inputs, max_new_tokens=256) |
| |
| generated_ids_trimmed = [ |
| out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) |
| ] |
| |
| output_text = processor.batch_decode( |
| generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False |
| )[0] |
| |
| return _format_for_track2(output_text) |
|
|
| def _mocked_vision_output() -> str: |
| """Provides a mocked output when running without the heavy VLM weights.""" |
| mocked_description = ( |
| "A bridge pier is constructed using concrete. Reinforcement bars are visible with approximately " |
| "50mm of concrete cover. The pier is located directly in a tidal splash zone (marine environment). " |
| "Concrete surface appears to have minor honeycombing at the base." |
| ) |
| return _format_for_track2(mocked_description) |
|
|
| def _format_for_track2(vision_text: str) -> str: |
| """ |
| Structures the vision output so it can be seamlessly passed |
| as input 'Context' to the fine-tuned 35B model. |
| """ |
| structured_context = ( |
| "### VISUAL INSPECTION REPORT (TRACK 3)\n" |
| f"{vision_text}\n\n" |
| "### TASK\n" |
| "Based on the visual inspection report above, identify any violations of structural codes " |
| "(e.g., Eurocodes, ASTM, ISO 9001). Provide a label of 'Compliant' or 'Non-Compliant' " |
| "followed by a detailed reasoning trace." |
| ) |
| return structured_context |
|
|
| if __name__ == "__main__": |
| |
| test_image = "dummy_construction_site.jpg" |
| print(f"Testing Multimodal Pipeline with {test_image}") |
| try: |
| context_for_track2 = analyze_construction_site(test_image) |
| print("\n--- Structured Output for Track 2 Model ---\n") |
| print(context_for_track2) |
| print("\n-------------------------------------------\n") |
| except Exception as e: |
| print(f"Error: {e}") |
|
|