import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from PIL import Image

def analyze_construction_site(image_path: str, device: str = "cuda") -> str:
    """
    Uses Qwen2-VL (Track 3) to process a construction site image (e.g., from a drone) 
    and output a structured technical description. This description acts as the 'Context' 
    for the fine-tuned Track 2 Compliance Auditor model.
    """
    # Initialize the model and processor
    # We use a placeholder path for the Qwen2-VL model here.
    model_id = "Qwen/Qwen2-VL-7B-Instruct"
    
    print(f"Loading {model_id} on {device}...")
    try:
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id, 
            torch_dtype=torch.bfloat16, 
            device_map=device
        )
        processor = AutoProcessor.from_pretrained(model_id)
    except Exception as e:
        print(f"Model loading failed (this is expected if weights aren't downloaded): {e}")
        # Return a mocked structured output for demonstration purposes in the hackathon
        return _mocked_vision_output()

    # Load the image
    try:
        image = Image.open(image_path).convert("RGB")
    except Exception as e:
        raise ValueError(f"Could not load image at {image_path}: {e}")

    # Prepare the prompt tailored for technical extraction
    prompt = (
        "You are an expert construction site inspector. Describe the structural elements, "
        "materials, and construction practices visible in this image. Focus on technical "
        "details like concrete pouring, rebar placement, structural steel connections, "
        "and any visible environmental exposure factors. Be highly descriptive and objective."
    )
    
    # Qwen2-VL format
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    
    # Preprocess inputs
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = processor.image_processor(image), None
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(device)

    # Generate output
    print("Analyzing image...")
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=256)
        
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    
    return _format_for_track2(output_text)

def _mocked_vision_output() -> str:
    """Provides a mocked output when running without the heavy VLM weights."""
    mocked_description = (
        "A bridge pier is constructed using concrete. Reinforcement bars are visible with approximately "
        "50mm of concrete cover. The pier is located directly in a tidal splash zone (marine environment). "
        "Concrete surface appears to have minor honeycombing at the base."
    )
    return _format_for_track2(mocked_description)

def _format_for_track2(vision_text: str) -> str:
    """
    Structures the vision output so it can be seamlessly passed 
    as input 'Context' to the fine-tuned 35B model.
    """
    structured_context = (
        "### VISUAL INSPECTION REPORT (TRACK 3)\n"
        f"{vision_text}\n\n"
        "### TASK\n"
        "Based on the visual inspection report above, identify any violations of structural codes "
        "(e.g., Eurocodes, ASTM, ISO 9001). Provide a label of 'Compliant' or 'Non-Compliant' "
        "followed by a detailed reasoning trace."
    )
    return structured_context

if __name__ == "__main__":
    # Test the pipeline
    test_image = "dummy_construction_site.jpg"
    print(f"Testing Multimodal Pipeline with {test_image}")
    try:
        context_for_track2 = analyze_construction_site(test_image)
        print("\n--- Structured Output for Track 2 Model ---\n")
        print(context_for_track2)
        print("\n-------------------------------------------\n")
    except Exception as e:
        print(f"Error: {e}")