import os

# TF_ENABLE_ONEDNN_OPTS must be set as an environment variable (not a Python
# variable) before transformers pulls in TensorFlow, to silence the oneDNN notice.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import torch
import numpy as np
import matplotlib.pyplot as plt
import cv2
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name).to(device)
processor = CLIPProcessor.from_pretrained(model_name)


# This function extracts patches from an image and returns them along with their coordinates.
def image_patch(img, patch_size=(100, 100), stride=2):
    img_w, img_h = img.size
    print(f"Image dimensions: width={img_w}, height={img_h}")
    patches = []
    # Note: a small stride produces a very dense grid of patches (and a large
    # batch for CLIP); increase it if the pipeline is too slow.
    for i in range(0, img_h - patch_size[1] + 1, stride):
        for j in range(0, img_w - patch_size[0] + 1, stride):
            patch = img.crop((j, i, j + patch_size[0], i + patch_size[1]))
            patches.append((patch, (j, i)))
    return patches


# Draws a bounding box around the strongest region of the heatmap and reports
# whether any region passed the threshold.
def bounding_box(img, heatmap):
    img_copy = np.array(img).copy()
    found = False
    normalized = cv2.normalize(heatmap, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
    _, binary = cv2.threshold(normalized, 200, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours:
        largest = max(contours, key=cv2.contourArea)
        x, y, w, h = cv2.boundingRect(largest)
        cv2.rectangle(img_copy, (x, y), (x + w, y + h), (255, 0, 0), 2)
        found = True
    return img_copy, found


# def main():
#     print("Starting the object detection process...")
#     img_path = r"C:\Users\sahas\OneDrive\Desktop\GenMatch\Photo of a dog.jpg"
#     score_patches = []
#     prompt = ["a photo of a human", "a close up of a dog's face"]
#     try:
#         # Open the image
#         img = Image.open(img_path)
#         print(f"Image opened successfully: {img_path}")
#         # Extract patches from the image
#         patches = image_patch(img)
#         print(f"Extracted {len(patches)} patches from the image.")
#         # Process all patches with the CLIP model to get the probabilities
#         patch_batch = [p for p, (x, y) in patches]
#         input_data = processor(text=prompt, images=patch_batch, return_tensors="pt", padding=True)
#         input_data = {k: v.to(device) for k, v in input_data.items()}
#         with torch.no_grad():
#             output = model(**input_data)
#         logits = output.logits_per_image
#         prob = logits.softmax(dim=1)
#         for i, (patch, (x, y)) in enumerate(patches):
#             score = prob[i][0].item()
#             score_patches.append((patch, (x, y), score))
#         # Create heatmap based on scores
#         img_w, img_h = img.size
#         pat_w, pat_h = patches[0][0].size
#         heatmap = np.zeros((img_h, img_w))
#         for _, (x, y), score in score_patches:
#             heatmap[y:y + pat_h, x:x + pat_w] += score
#         fig, ax = plt.subplots()
#         ax.imshow(img)
#         ax.imshow(heatmap, cmap='viridis', alpha=0.6)
#         ax.axis('off')
#         plt.show()
#         print("Generating image with bounding box...")
#         box_img, found = bounding_box(img, heatmap)
#         plt.imshow(box_img)
#         plt.axis('off')
#         plt.show()
#     except FileNotFoundError:
#         print(f"Error opening image: {img_path}")
#         return
#
# if __name__ == "__main__":
#     main()


def run_detection_pipeline(input_image, text_prompt):
    print("Starting the object detection process...")
    img = input_image
    # The second prompt acts as a negative class so the softmax has something
    # to compare the target prompt against.
    prompt = [text_prompt, "a photo of a blank background"]
    score_patches = []
    all_scores = []

    patches = image_patch(img)
    print(f"Extracted {len(patches)} patches from the image.")
    if not patches:
        return "Warning: No patches were extracted from the image.", input_image

    # Score every patch against the two prompts with CLIP.
    patch_batch = [p for p, (x, y) in patches]
    input_data = processor(text=prompt, images=patch_batch, return_tensors="pt", padding=True)
    input_data = {k: v.to(device) for k, v in input_data.items()}
    with torch.no_grad():
        output = model(**input_data)
    logits = output.logits_per_image
    prob = logits.softmax(dim=1)
    for i, (patch, (x, y)) in enumerate(patches):
        score = prob[i][0].item()
        score_patches.append((patch, (x, y), score))
        all_scores.append(score)

    confidence_threshold = 0.20
    max_score = max(all_scores) if all_scores else 0
    print(f"Max confidence score: {max_score:.4f}")
    if max_score < confidence_threshold:
        msg = f"Could not find '{text_prompt}' with enough confidence."
        return msg, input_image

    # Accumulate patch scores into a heatmap the size of the image.
    # PIL's .size is (width, height), while the heatmap is indexed (row, col).
    img_w, img_h = img.size
    pat_w, pat_h = patches[0][0].size
    heatmap = np.zeros((img_h, img_w))
    for _, (x, y), score in score_patches:
        heatmap[y:y + pat_h, x:x + pat_w] += score

    print("Generating image with bounding box...")
    box_img, found = bounding_box(img, heatmap)
    if not found:
        msg = "No object detected matching the prompt."
    else:
        msg = "Object detected and highlighted."
    return msg, Image.fromarray(box_img)
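

# A minimal local-usage sketch (not part of the original script): the file name
# "example.jpg" is a placeholder, and showing the result with matplotlib is just
# one way to inspect the output of run_detection_pipeline.
if __name__ == "__main__":
    test_image = Image.open("example.jpg").convert("RGB")  # placeholder path
    message, result_image = run_detection_pipeline(test_image, "a close up of a dog's face")
    print(message)
    plt.imshow(result_image)
    plt.axis("off")
    plt.show()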