ForgeSight / backend /amd_hackathon /track3_vision.py
rasAli02's picture
git add, commit, push
307f1c5
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from PIL import Image
def analyze_construction_site(image_path: str, device: str = "cuda") -> str:
"""
Uses Qwen2-VL (Track 3) to process a construction site image (e.g., from a drone)
and output a structured technical description. This description acts as the 'Context'
for the fine-tuned Track 2 Compliance Auditor model.
"""
# Initialize the model and processor
# We use a placeholder path for the Qwen2-VL model here.
model_id = "Qwen/Qwen2-VL-7B-Instruct"
print(f"Loading {model_id} on {device}...")
try:
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
device_map=device
)
processor = AutoProcessor.from_pretrained(model_id)
except Exception as e:
print(f"Model loading failed (this is expected if weights aren't downloaded): {e}")
# Return a mocked structured output for demonstration purposes in the hackathon
return _mocked_vision_output()
# Load the image
try:
image = Image.open(image_path).convert("RGB")
except Exception as e:
raise ValueError(f"Could not load image at {image_path}: {e}")
# Prepare the prompt tailored for technical extraction
prompt = (
"You are an expert construction site inspector. Describe the structural elements, "
"materials, and construction practices visible in this image. Focus on technical "
"details like concrete pouring, rebar placement, structural steel connections, "
"and any visible environmental exposure factors. Be highly descriptive and objective."
)
# Qwen2-VL format
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": prompt},
],
}
]
# Preprocess inputs
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = processor.image_processor(image), None
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to(device)
# Generate output
print("Analyzing image...")
with torch.no_grad():
generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
return _format_for_track2(output_text)
def _mocked_vision_output() -> str:
"""Provides a mocked output when running without the heavy VLM weights."""
mocked_description = (
"A bridge pier is constructed using concrete. Reinforcement bars are visible with approximately "
"50mm of concrete cover. The pier is located directly in a tidal splash zone (marine environment). "
"Concrete surface appears to have minor honeycombing at the base."
)
return _format_for_track2(mocked_description)
def _format_for_track2(vision_text: str) -> str:
"""
Structures the vision output so it can be seamlessly passed
as input 'Context' to the fine-tuned 35B model.
"""
structured_context = (
"### VISUAL INSPECTION REPORT (TRACK 3)\n"
f"{vision_text}\n\n"
"### TASK\n"
"Based on the visual inspection report above, identify any violations of structural codes "
"(e.g., Eurocodes, ASTM, ISO 9001). Provide a label of 'Compliant' or 'Non-Compliant' "
"followed by a detailed reasoning trace."
)
return structured_context
if __name__ == "__main__":
# Test the pipeline
test_image = "dummy_construction_site.jpg"
print(f"Testing Multimodal Pipeline with {test_image}")
try:
context_for_track2 = analyze_construction_site(test_image)
print("\n--- Structured Output for Track 2 Model ---\n")
print(context_for_track2)
print("\n-------------------------------------------\n")
except Exception as e:
print(f"Error: {e}")