File size: 5,000 Bytes
b7bca53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# predict.py
import argparse
import json
import os
import re

import torch
from PIL import Image
from transformers import (
    AutoModelForVision2Seq,
    AutoProcessor,
    BitsAndBytesConfig,
    Qwen2VLForConditionalGeneration,
)

from qwen_vl_utils import process_vision_info

# --- CONFIGURATION ---
# HuggingFace hub id of the base Qwen2.5-VL checkpoint. It is downloaded
# (and cached) automatically on the first run; no local weights needed.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

def load_model():
    """Load the VLM with 4-bit NF4 quantization and return (model, processor).

    Downloads the checkpoint on first run. Requires a CUDA-capable GPU
    (bitsandbytes 4-bit weights); `device_map="auto"` places the shards.
    """
    print(f"⏳ Loading Model: {MODEL_ID}...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # BUG FIX: MODEL_ID points at a Qwen2.5-VL checkpoint, but the original
    # code loaded it through Qwen2VLForConditionalGeneration — the Qwen2-VL
    # class, which mismatches the checkpoint's `qwen2_5_vl` architecture.
    # The Auto class resolves the correct model class from the hub config.
    # 'sdpa' attention keeps broad compatibility (Colab T4 / RTX GPUs).
    model = AutoModelForVision2Seq.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="sdpa",
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    print("✅ Model Loaded.")
    return model, processor

def analyze_image(model, processor, image_path):
    """Run the VLM forensic analysis on a single image.

    Args:
        model: loaded (quantized) Qwen VLM, already placed via device_map.
        processor: the matching AutoProcessor.
        image_path: filesystem path to the image to analyze.

    Returns:
        dict with keys 'authenticity_score', 'manipulation_type',
        'vlm_reasoning' (fallback defaults if the model output fails to parse).
    """
    prompt_text = (
        "You are a Forensic Image Analyst. Analyze this image for GenAI manipulation.\n"
        "Focus on: Lighting inconsistencies, structural logic, and unnatural textures.\n"
        "Provide your output STRICTLY as a JSON object with these keys:\n"
        "- 'authenticity_score': float (0.0=Real, 1.0=Fake)\n"
        "- 'manipulation_type': string (e.g., 'Inpainting', 'None')\n"
        "- 'vlm_reasoning': string (max 2 sentences)\n"
    )

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt_text},
            ],
        }
    ]

    # Preprocess: render the chat template to text, extract the vision inputs.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    # BUG FIX: follow the model's actual placement instead of hard-coding
    # "cuda" — device_map="auto" decides where the weights live.
    ).to(model.device)

    # Generate. Greedy decoding (do_sample=False) is fully deterministic,
    # which is what the original intended: it passed temperature=0.1 without
    # do_sample=True, and transformers ignores temperature (with a warning)
    # under the default greedy decoding.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,
        )

    # Decode only the newly generated tokens (strip the echoed prompt).
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return clean_json(output_text)

def clean_json(text):
    """Extract the first {...} JSON object from a model response.

    Always returns a dict with the expected submission keys:
    - no braces found  -> 'Unknown' / 'Parse Error' fallback
    - braces found but invalid JSON -> 'Error' / 'JSON Error' fallback
    """
    # Greedy DOTALL match grabs the outermost { ... } span, even multiline.
    json_match = re.search(r"\{.*\}", text, re.DOTALL)
    if not json_match:
        return {"authenticity_score": 0.5, "manipulation_type": "Unknown", "vlm_reasoning": "Parse Error"}
    try:
        return json.loads(json_match.group(0))
    # BUG FIX: was a bare `except:` that swallowed everything, including
    # KeyboardInterrupt/SystemExit. Only a parse failure is expected here.
    except json.JSONDecodeError:
        return {"authenticity_score": 0.5, "manipulation_type": "Error", "vlm_reasoning": "JSON Error"}

def main(input_dir, output_file):
    """Run VLM inference on every image in input_dir and write a JSON report.

    Args:
        input_dir: directory containing the images to analyze.
        output_file: path where the predictions JSON array is written.
    """
    # Load model once and reuse it for every image.
    model, processor = load_model()

    predictions = []

    valid_extensions = ('.png', '.jpg', '.jpeg', '.webp')
    # BUG FIX: os.listdir order is filesystem-dependent; sort so the
    # processing (and output) order is deterministic across runs/machines.
    files = sorted(f for f in os.listdir(input_dir) if f.lower().endswith(valid_extensions))

    print(f"🚀 Starting inference on {len(files)} images...")

    for img_name in files:
        img_path = os.path.join(input_dir, img_name)
        try:
            result = analyze_image(model, processor, img_path)

            entry = {
                "image_name": img_name,
                "authenticity_score": result.get("authenticity_score", 0.5),
                "manipulation_type": result.get("manipulation_type", "Unknown"),
                "vlm_reasoning": result.get("vlm_reasoning", "No reasoning provided.")
            }
            predictions.append(entry)
            print(f"Processed: {img_name}")
        except Exception as e:
            # Best-effort batch run: report the failure and keep going.
            print(f"Failed to process {img_name}: {e}")

    # BUG FIX: explicit UTF-8 encoding (open() otherwise uses the locale
    # default) and ensure_ascii=False so non-ASCII reasoning text survives.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(predictions, f, indent=4, ensure_ascii=False)
    print(f"✅ Submission file saved to: {output_file}")

if __name__ == "__main__":
    # Command-line entry point: validate the input directory, then run.
    cli = argparse.ArgumentParser()
    cli.add_argument("--input_dir", type=str, required=True, help="Path to input images")
    cli.add_argument("--output_file", type=str, required=True, help="Path to output JSON")
    opts = cli.parse_args()

    # Fail fast with a clear error before loading the (large) model.
    if not os.path.exists(opts.input_dir):
        raise FileNotFoundError(f"Input directory {opts.input_dir} not found.")

    main(opts.input_dir, opts.output_file)