import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# -- Model (loaded once at startup) --
# The checkpoint id is shared by the model weights and the matching processor.
MODEL_ID = "openai/clip-vit-base-patch32"

# nn.Module.eval() returns the module itself, so the model is bound to its
# name already switched to evaluation mode.
model = CLIPModel.from_pretrained(MODEL_ID).eval()
processor = CLIPProcessor.from_pretrained(MODEL_ID)

# -- Curated car labels --
# Labels are grouped by vehicle category. The flattened CAR_LABELS list follows
# the insertion order of this mapping, and label indices are used to map model
# scores back to names, so keep groups and entries in a stable order.
_LABELS_BY_CATEGORY = {
    "Sedans": (
        "2020 Toyota Camry",
        "2021 Honda Accord",
        "2019 Hyundai Sonata",
        "2022 Nissan Altima",
        "2021 Kia K5",
        "2020 Mazda 6",
        "2019 Volkswagen Passat",
        "2021 Subaru Legacy",
    ),
    "SUVs & Crossovers": (
        "2021 Toyota RAV4",
        "2022 Honda CR-V",
        "2020 Ford Escape",
        "2021 Chevrolet Equinox",
        "2022 Jeep Cherokee",
        "2021 Hyundai Tucson",
        "2020 Kia Sportage",
        "2022 Mazda CX-5",
        "2021 Subaru Forester",
        "2021 Volkswagen Tiguan",
        "2022 Nissan Rogue",
    ),
    "Trucks": (
        "2022 Ford F-150",
        "2021 Chevrolet Silverado 1500",
        "2020 RAM 1500",
        "2021 GMC Sierra",
        "2022 Toyota Tacoma",
        "2021 Nissan Frontier",
    ),
    "Muscle & Sports Cars": (
        "2021 Ford Mustang GT",
        "2022 Chevrolet Camaro SS",
        "2020 Dodge Challenger",
        "2021 Dodge Charger Hellcat",
        "2020 Subaru WRX STI",
    ),
    "Luxury Sedans": (
        "2021 BMW 3 Series",
        "2022 Mercedes-Benz C-Class",
        "2021 Audi A4",
        "2020 Lexus ES",
        "2022 Genesis G70",
        "2021 Cadillac CT5",
        "2020 Volvo S60",
        "2022 Infiniti Q50",
    ),
    "Luxury SUVs": (
        "2021 BMW X5",
        "2022 Mercedes-Benz GLC",
        "2021 Audi Q5",
        "2020 Lexus RX 350",
        "2022 Volvo XC90",
        "2021 Cadillac Escalade",
        "2022 Lincoln Navigator",
    ),
    "EVs & Hybrids": (
        "2022 Tesla Model 3",
        "2021 Tesla Model Y",
        "2022 Tesla Model S",
        "2021 Chevrolet Bolt EV",
        "2022 Ford Mustang Mach-E",
        "2021 Toyota Prius",
        "2022 Hyundai Ioniq 5",
        "2021 Kia EV6",
    ),
    "Supercars": (
        "2020 Ferrari 488",
        "2021 Lamborghini Huracan",
        "2021 McLaren 720S",
        "2022 Ferrari F8",
    ),
    "Porsche (expanded)": (
        "2020 Porsche 911 Carrera",
        "2021 Porsche 911 Carrera S",
        "2022 Porsche 911 GT3",
        "2019 Porsche 911 Turbo S",
        "2023 Porsche 911 Targa 4S",
        "2021 Porsche 718 Cayman",
        "2022 Porsche 718 Boxster",
        "2021 Porsche Cayenne",
        "2022 Porsche Cayenne GTS",
        "2021 Porsche Macan",
        "2022 Porsche Panamera",
        "2021 Porsche Taycan",
        "2022 Porsche Taycan 4S",
        "2023 Porsche Taycan Turbo S Cross Turismo",
    ),
}

# Flat list of every label, category by category.
CAR_LABELS = [label for group in _LABELS_BY_CATEGORY.values() for label in group]

# -- Prompt templates (ensemble improves CLIP accuracy significantly) --
# Each template carries exactly one "{}" placeholder that is filled with a car
# label; the text embeddings of all templates are averaged per label.
_PLAIN_CAPTION_TEMPLATES = (
    "a photo of a {}",
    "a photograph of a {}",
)
_CONTEXT_CAPTION_TEMPLATES = (
    "a {} driving on the road",
    "a {} parked in a driveway",
    "a side view of a {}",
)
PROMPT_TEMPLATES = [*_PLAIN_CAPTION_TEMPLATES, *_CONTEXT_CAPTION_TEMPLATES]


def build_text_features():
    """Build the ensembled CLIP text embedding for every car label.

    Each template in ``PROMPT_TEMPLATES`` is filled with every entry of
    ``CAR_LABELS`` and encoded with the CLIP text encoder. The per-template
    embeddings are L2-normalised, averaged across templates, and the average
    is L2-normalised once more.

    Returns:
        A tensor of unit-length text embeddings with one row per entry of
        ``CAR_LABELS``, in list order.
    """

    def _embed_template(template):
        # One prompt per label, in CAR_LABELS order.
        prompts = [template.format(label) for label in CAR_LABELS]
        batch = processor(text=prompts, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            features = model.get_text_features(**batch)
            return features / features.norm(dim=-1, keepdim=True)

    per_template = torch.stack([_embed_template(tpl) for tpl in PROMPT_TEMPLATES])
    ensemble = per_template.mean(dim=0)
    # The mean of unit vectors is generally shorter than unit length, so the
    # averaged embedding is normalised again before use.
    return ensemble / ensemble.norm(dim=-1, keepdim=True)


# Pre-compute once at startup
# Built at import time so that classify_car() only has to embed the image on
# each request. The value is derived from CAR_LABELS and PROMPT_TEMPLATES and
# is not refreshed if either of them is changed afterwards.
TEXT_FEATURES = build_text_features()


def classify_car(image: "Image.Image"):
    """Classify a car photo against the curated labels with CLIP.

    The image is embedded with the CLIP image encoder, L2-normalised, and
    compared with the pre-computed ``TEXT_FEATURES``. The similarities are
    scaled by the model's logit scale and turned into probabilities with a
    softmax over all labels.

    Args:
        image: PIL image supplied by the UI, or ``None`` when no image is
            present.

    Returns:
        A dict mapping at most five entries of ``CAR_LABELS`` to their
        probability as plain Python floats, ordered from most to least
        likely. An empty dict is returned when ``image`` is ``None``.
    """
    if image is None:
        return {}

    inputs = processor(images=image, return_tensors="pt")

    # Everything here is inference only. ``model.logit_scale`` is a learnable
    # parameter of the CLIP model, so the scaling and softmax are kept inside
    # ``no_grad`` too; otherwise autograd would record a graph on every call.
    with torch.no_grad():
        img_features = model.get_image_features(**inputs)
        img_features = img_features / img_features.norm(dim=-1, keepdim=True)
        logits = (img_features @ TEXT_FEATURES.T) * model.logit_scale.exp()
        # A single image is processed, so take the only row of the batch.
        probs = logits.softmax(dim=-1)[0]

    top_k = min(5, len(CAR_LABELS))
    # topk returns values and indices sorted by descending probability.
    top_probs, top_indices = probs.topk(top_k)
    return {
        CAR_LABELS[index]: score
        for index, score in zip(top_indices.tolist(), top_probs.tolist())
    }


# -- Gradio UI --
# Builds the single-page Blocks app. `demo` is the object launched by the
# __main__ guard at the bottom of this section.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
    ),
    title="Car Classifier",
    # Custom CSS: styles the #header markup and the #result-label component
    # created below, and hides any `footer` element on the page.
    css="""
        #header { text-align: center; margin-bottom: 10px; }
        #header h1 { font-size: 2.4rem; font-weight: 800; }
        #header p  { color: #6b7280; font-size: 1rem; }
        #result-label .label-container { font-size: 1.05rem; }
        footer { display: none !important; }
    """,
) as demo:

    # Page header; the id="header" wrapper is what the #header CSS rules target.
    gr.HTML("""
        <div id="header">
            <h1>Car Classifier</h1>
            <p>Upload a photo of any car -- get the brand, model &amp; year instantly.</p>
        </div>
    """)

    # Two columns of equal scale: input on the left, predictions on the right.
    with gr.Row():
        with gr.Column(scale=1):
            # type="pil" matches classify_car, which takes a PIL image.
            image_input = gr.Image(
                type="pil",
                label="Upload Car Image",
                sources=["upload", "clipboard"],
                height=320,
            )
            classify_btn = gr.Button("Classify", variant="primary", size="lg")

        with gr.Column(scale=1):
            # num_top_classes=5 mirrors the top-5 cut applied in classify_car.
            output_label = gr.Label(
                num_top_classes=5,
                label="Top-5 Predictions",
                elem_id="result-label",
            )

    # NOTE(review): the examples list is empty, so this section has no entries
    # to offer -- populate it with sample images or drop the component.
    gr.Examples(
        examples=[],
        inputs=image_input,
        label="Example Images",
    )

    # NOTE: "5 templates" and "~70 curated car labels" are hard-coded prose;
    # keep them in sync with PROMPT_TEMPLATES and CAR_LABELS when those change.
    # The elem_id "footer-note" has no matching rule in the css string above.
    gr.Markdown(
        """
        ---
        **How it works** - Uses [CLIP](https://huggingface.co/openai/clip-vit-base-patch32)
        (zero-shot vision-language model) with **prompt ensembling** (5 templates averaged)
        to match your image against ~70 curated car labels.
        No GPU required - runs entirely on CPU.
        """,
        elem_id="footer-note",
    )

    # Both events run the same handler with the same input and output.
    # NOTE(review): presumably .change already fires when an image is uploaded
    # or cleared (classify_car returns {} for None), in which case the button
    # only re-runs the same classification -- confirm both triggers are wanted.
    classify_btn.click(fn=classify_car, inputs=image_input, outputs=output_label)
    image_input.change(fn=classify_car, inputs=image_input, outputs=output_label)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()