bmrayan
/

ForensicFusion

Model card Files Files and versions

xet

Community

bmrayan committed on Jan 28

Commit

9ea88f7

verified ·

1 Parent(s): 7c73207

Upload predict.py with huggingface_hub

Browse files

Files changed (1) hide show

predict.py +217 -0

predict.py ADDED Viewed

	@@ -0,0 +1,217 @@

+"""
+Prediction script combining DINOv2 classifier and Qwen3-VL reasoner
+Outputs predictions.json in required format
+"""
+import torch
+import torch.nn as nn
+from torchvision import transforms
+from transformers import (
+    AutoImageProcessor,
+    Dinov2Model,
+    Qwen3VLForConditionalGeneration,
+    AutoProcessor
+)
+from peft import PeftModel
+from PIL import Image
+import json
+import os
+from pathlib import Path
+from tqdm import tqdm
+from qwen_vl_utils import process_vision_info
+class DINOv2Classifier(nn.Module):
+    def __init__(self, num_classes=3):
+        super().__init__()
+        self.dinov2 = Dinov2Model.from_pretrained("facebook/dinov2-base")
+        # Classification head
+        self.classifier = nn.Sequential(
+            nn.Linear(768, 512),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(512, 256),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(256, num_classes)
+        )
+    def forward(self, pixel_values):
+        outputs = self.dinov2(pixel_values)
+        cls_token = outputs.last_hidden_state[:, 0]
+        logits = self.classifier(cls_token)
+        return logits
+class GenAIDetector:
+    def __init__(self, classifier_path):
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        print(f"Using device: {self.device}")
+        # Load DINOv2 classifier
+        print("Loading classifier...")
+        self.classifier = DINOv2Classifier(num_classes=3).to(self.device)
+        checkpoint = torch.load(classifier_path, map_location=self.device)
+        self.classifier.load_state_dict(checkpoint['model_state_dict'])
+        self.classifier.eval()
+        self.image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
+        # Load VLM
+        print("Loading VLM reasoner...")
+        base_model = Qwen3VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen3-VL-8B-Instruct",
+            torch_dtype="auto",
+            device_map="auto"
+        )
+        self.vlm = base_model
+        self.vlm_processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-8B-Instruct")
+        self.vlm.eval()
+        self.class_names = ['real', 'manipulated', 'fake']
+        self.manipulation_types = {
+            'real': 'none',
+            'manipulated': 'inpainting',
+            'fake': 'full_synthesis'
+        }
+    def classify_image(self, image_path):
+        """Classify image and get confidence scores"""
+        image = Image.open(image_path).convert('RGB')
+        inputs = self.image_processor(images=image, return_tensors="pt")
+        pixel_values = inputs['pixel_values'].to(self.device)
+        with torch.no_grad():
+            logits = self.classifier(pixel_values)
+            probs = torch.softmax(logits, dim=1)
+            pred_class = torch.argmax(probs, dim=1).item()
+            confidence = probs[0].cpu().numpy()
+        return pred_class, confidence
+    def generate_reasoning(self, image_path, predicted_class):
+        """Generate reasoning using VLM"""
+        class_name = self.class_names[predicted_class]
+        # Prepare prompt
+        prompt = f"The given image has been flagged as {class_name}. Explain in 2-3 sentences why that might be. Focus on specific features which indicated this."
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image_path},
+                    {"type": "text", "text": prompt}
+                ]
+            }
+        ]
+        # Apply chat template
+        text = self.vlm_processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        # Process inputs
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = self.vlm_processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt"
+        )
+        inputs = inputs.to(self.device)
+        # Generate
+        with torch.no_grad():
+            output_ids = self.vlm.generate(
+                **inputs,
+                max_new_tokens=150,
+                temperature=0.7,
+                do_sample=True
+            )
+        # Decode
+        generated_text = self.vlm_processor.batch_decode(
+            output_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
+        # Extract assistant response
+        if "assistant" in generated_text.lower():
+            reasoning = generated_text.split("assistant")[-1].strip()
+        else:
+            reasoning = generated_text.strip()
+        return reasoning
+    def predict(self, image_path):
+        """Full prediction pipeline"""
+        # Classify
+        pred_class, confidence = self.classify_image(image_path)
+        # Get authenticity score (confidence that it's real, i.e., confidence[0])
+        authenticity_score = float(1.0 - confidence[0])  # Higher score = more manipulated
+        # Get manipulation type
+        class_name = self.class_names[pred_class]
+        manipulation_type = self.manipulation_types[class_name]
+        # Generate reasoning
+        reasoning = self.generate_reasoning(image_path, pred_class)
+        return {
+            'authenticity_score': round(authenticity_score, 2),
+            'manipulation_type': manipulation_type,
+            'vlm_reasoning': reasoning
+        }
+def main(image_dir, classifier_path, output_file):
+    """Main prediction function"""
+    # Initialize detector
+    detector = GenAIDetector(classifier_path)
+    # Get all images
+    image_extensions = ['.jpg', '.jpeg', '.png']
+    image_files = []
+    for ext in image_extensions:
+        image_files.extend(Path(image_dir).glob(f'*{ext}'))
+        image_files.extend(Path(image_dir).glob(f'*{ext.upper()}'))
+    print(f"Found {len(image_files)} images")
+    # Process images
+    predictions = []
+    for image_path in tqdm(image_files, desc="Processing images"):
+        try:
+            result = detector.predict(str(image_path))
+            result['image_name'] = image_path.name
+            predictions.append(result)
+        except Exception as e:
+            print(f"Error processing {image_path.name}: {str(e)}")
+            continue
+    # Save predictions
+    with open(output_file, 'w') as f:
+        json.dump(predictions, f, indent=2)
+    print(f"\n✓ Processed {len(predictions)} images")
+    print(f"✓ Saved predictions to {output_file}")
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--image_dir', type=str, default='./test_images',
+                        help='Directory containing images to predict')
+    parser.add_argument('--classifier_path', type=str, default='best_model.pth',
+                        help='Path to trained DINOv2 checkpoint (.pth file)')
+    parser.add_argument('--output_file', type=str, default='predictions.json',
+                        help='Output JSON file')
+    args = parser.parse_args()
+    main(args.image_dir, args.classifier_path, args.output_file)