"""
auto.py — Smart project creation from samples + description.

The core v2 experience: a user says "label these as X", gives a few sample
images, and the factory figures out the best project config and backends.

Usage:
    # CLI
    data_label_factory auto \
        --samples ~/my-samples/ \
        --description "fire hydrants in urban settings" \
        --output projects/fire-hydrants.yaml

    # Python
    from data_label_factory.auto import auto_project
    project = auto_project(
        samples=["img1.jpg", "img2.jpg"],
        description="fiber optic drones with cable spools",
    )

How it works:
    1. Analyze sample images via available VLMs to understand the domain
    2. Pick the best backends per stage based on content type
    3. Generate search queries for the gather stage
    4. Create a complete project YAML ready to run
"""

from __future__ import annotations

import json
import os
import time
import yaml
from datetime import datetime
from pathlib import Path
from typing import Any


# ── Content type detection ──────────────────────────────────────

# Registry of content profiles, keyed by content-type name.
#
# Fields read by the functions in this module:
#   keywords            - terms looked for in the lowercased description +
#                         VLM analysis when scoring this profile.
#   filter_backend,
#   label_backend,
#   verify_backend      - backend names copied into the generated project's
#                         "backends" section.
#   gather_queries_hint - comma-separated query templates; "{target}" is
#                         replaced with the target object.
#   synthetic           - optional flag; when truthy the generated config
#                         gains a "synthetic" section.
#
# "generic" has no keywords: it is never scored and serves as the fallback
# when no other profile matches.
CONTENT_PROFILES = {
    "document": {
        "keywords": ["text", "document", "form", "receipt", "invoice", "table",
                     "pdf", "paper", "letter", "report", "handwriting", "ocr"],
        "filter_backend": "chandra",
        "label_backend": "chandra",
        "verify_backend": "qwen",
        "gather_queries_hint": "scanned {target}, {target} photograph, {target} closeup",
    },
    "object": {
        "keywords": ["car", "dog", "cat", "drone", "bird", "person", "sign",
                     "hydrant", "bottle", "phone", "building", "animal",
                     "vehicle", "tool", "equipment", "furniture"],
        "filter_backend": "qwen",
        "label_backend": "falcon",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} photo, {target} outdoor, {target} closeup",
    },
    "card": {
        "keywords": ["card", "playing card", "trading card", "poker", "blackjack",
                     "yu-gi-oh", "yugioh", "pokemon", "magic the gathering", "deck"],
        "filter_backend": "qwen",
        "label_backend": "falcon",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} card photo, {target} cards spread",
        # Only this profile opts in to synthetic data generation.
        "synthetic": True,
    },
    "3d_scene": {
        "keywords": ["3d", "depth", "scene", "room", "outdoor", "street",
                     "driving", "autonomous", "lidar", "stereo"],
        "filter_backend": "qwen",
        "label_backend": "wilddet3d",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} photo, {target} outdoor scene",
    },
    # Fallback profile used when nothing else scores above zero.
    "generic": {
        "keywords": [],
        "filter_backend": "qwen",
        "label_backend": "falcon",
        "verify_backend": "qwen",
        "gather_queries_hint": "{target} photo, {target} closeup, {target} example",
    },
}


def detect_content_type(description: str, sample_analysis: str = "",
                        profiles: dict | None = None) -> str:
    """Classify the labeling task into a content profile.

    Each non-generic profile scores one point per keyword found in the
    lowercased ``description + sample_analysis`` text. Keywords are matched
    as whole words (an optional plural ``s``/``es`` suffix is tolerated), so
    "room" no longer fires on "mushrooms" and "car" no longer fires on
    "cards". The highest-scoring profile wins; ties go to the profile that
    appears first in the mapping.

    Args:
        description: Free-text description of what the user wants to label.
        sample_analysis: Optional VLM-produced text about the samples.
        profiles: Optional profile mapping to score against; defaults to the
            module-level ``CONTENT_PROFILES``.

    Returns:
        The name of the best-matching profile, or ``"generic"`` when no
        keyword matches.
    """
    import re

    if profiles is None:
        profiles = CONTENT_PROFILES

    text = f"{description} {sample_analysis}".lower()

    def mentions(keyword: str) -> bool:
        # Lookarounds instead of \b so the boundary check is about the text
        # surrounding the keyword, whatever characters the keyword contains.
        pattern = rf"(?<!\w){re.escape(keyword.lower())}(?:e?s)?(?!\w)"
        return re.search(pattern, text) is not None

    best_type = "generic"
    best_score = 0
    for ctype, profile in profiles.items():
        if ctype == "generic":
            continue  # fallback only; never competes on score
        score = sum(1 for kw in profile.get("keywords", ()) if mentions(kw))
        if score > best_score:  # strict '>' keeps the earliest profile on ties
            best_score = score
            best_type = ctype

    return best_type


def analyze_samples_with_vlm(sample_paths: list[str], description: str) -> dict:
    """Ask the first responsive VLM backend to describe the sample images.

    Backends are probed in order ("qwen", then "gemma"). A backend that
    cannot be created, reports itself as not alive, or yields no answers at
    all is skipped in favour of the next one. At most the first five sample
    paths are sent; images whose analysis raises are ignored.

    Returns:
        A dict with ``backend_used``, the per-image ``analyses`` list and a
        space-joined ``combined`` string. When no backend produced anything,
        ``backend_used`` is ``"none"`` and the other fields are empty.
    """
    from .providers import create_provider

    instructions = (
        f"I want to build a dataset of images showing: {description}\n"
        "Look at this sample image. In 2-3 sentences, describe:\n"
        "1. What objects/elements are visible\n"
        "2. What search queries would find similar images\n"
        "3. What negative examples (things to exclude) would help\n"
        "Be specific and concise."
    )

    # Preference order: qwen first, gemma as the fallback.
    for backend_name in ("qwen", "gemma"):
        try:
            vlm = create_provider(backend_name)
            is_alive = vlm.status().get("alive")
        except Exception:
            continue
        if not is_alive:
            continue

        answers = []
        for image_path in sample_paths[:5]:
            try:
                answers.append(vlm.filter_image(image_path, instructions).raw_answer)
            except Exception:
                # Best effort: one unreadable image must not sink the rest.
                pass

        if answers:
            return {
                "backend_used": backend_name,
                "analyses": answers,
                "combined": " ".join(answers),
            }

    return {"backend_used": "none", "analyses": [], "combined": ""}


def generate_queries(description: str, target_object: str,
                     content_type: str, vlm_analysis: str = "",
                     profiles: dict | None = None) -> dict:
    """Generate bucket queries for the gather stage.

    Positive queries are collected, in priority order, from the description,
    the target object, a few fixed "<target> ..." variants, the profile's
    query-hint templates and any double-quoted phrases found on VLM-analysis
    lines that talk about searches/queries. Queries are de-duplicated
    case-insensitively (first occurrence wins) so repeats do not eat into
    the cap of 8.

    Args:
        description: Free-text description of what the user wants to label.
        target_object: The object name substituted into query templates.
        content_type: Profile name; unknown names fall back to "generic".
        vlm_analysis: Optional VLM-produced text to mine for quoted queries.
        profiles: Optional profile mapping; defaults to the module-level
            ``CONTENT_PROFILES``.

    Returns:
        A dict with the buckets ``positive/clear_view``,
        ``negative/other_objects`` and ``background/empty``, each mapping to
        ``{"queries": [...]}``.
    """
    if profiles is None:
        profiles = CONTENT_PROFILES
    profile = profiles.get(content_type)
    if profile is None:
        profile = profiles["generic"]

    positive_queries = []
    seen = set()

    def add_query(query: str) -> None:
        cleaned = query.strip()
        key = cleaned.casefold()
        if cleaned and key not in seen:
            seen.add(key)
            positive_queries.append(cleaned)

    # Base positive queries from the description and target.
    for base in (
        description,
        target_object,
        f"{target_object} photo",
        f"{target_object} closeup",
        f"{target_object} high quality",
    ):
        add_query(base)

    # Split the hint into templates BEFORE substituting the target, so a
    # target that itself contains a comma is not torn into pieces.
    for template in profile["gather_queries_hint"].split(","):
        template = template.strip()
        if template:
            add_query(template.format(target=target_object))

    # Mine the VLM analysis for quoted phrases on lines about searching.
    if vlm_analysis:
        for line in vlm_analysis.split("\n"):
            lowered = line.lower()
            # "quer" covers both "query" and "queries".
            if "search" not in lowered and "quer" not in lowered:
                continue
            # Odd-indexed pieces of a split on '"' are the quoted phrases.
            for phrase in line.split('"')[1::2]:
                if len(phrase.strip()) > 3:
                    add_query(phrase)

    # Negative / background queries
    negative_queries = [
        f"not {target_object}",
        "empty background",
        "plain surface",
    ]

    return {
        "positive/clear_view": {"queries": positive_queries[:8]},
        "negative/other_objects": {"queries": negative_queries[:4]},
        "background/empty": {"queries": ["blue sky clouds", "empty room", "plain wall"]},
    }


def generate_falcon_queries(target_object: str, description: str) -> list[str]:
    """Generate Falcon Perception queries for the label stage.

    The target object is always the first query. Additional queries are the
    distinct words of the description that are longer than three characters,
    are not filler words, and do not already occur inside the target name.
    Surrounding punctuation is stripped before a word is examined, so
    "wings." and "wings" are treated as the same word.

    Args:
        target_object: Primary object to detect.
        description: Free-text description that may name extra components.

    Returns:
        At most five queries, target first, in order of first appearance.
    """
    filler_words = {
        "with", "that", "this", "from", "into", "have", "been",
        "show", "shows", "showing", "image", "images", "photo",
    }
    target_lower = target_object.lower()

    queries = [target_object]
    seen = set()
    for raw_word in description.lower().split():
        word = raw_word.strip(".,;:!?\"'()[]{}")
        if len(word) <= 3 or word in filler_words:
            continue
        # Substring test on purpose: skips words already part of the target.
        if word in target_lower or word in seen:
            continue
        seen.add(word)
        queries.append(word)

    return queries[:5]  # cap at 5 queries


def auto_project(
    samples: list[str] | str,
    description: str,
    project_name: str = "",
    output: str = "",
    analyze: bool = True,
) -> dict:
    """Create a complete project config from samples + description.

    Side effects: prints progress to stdout, creates
    ``~/data-label-factory/<project_name>/samples`` and copies the sample
    files into it, and writes the config as YAML when ``output`` is given.

    Args:
        samples: Path to directory of sample images, a single image path, or
            a list of image paths.
        description: What the user wants to label (e.g. "fire hydrants")
        project_name: Optional name (auto-generated from description if empty)
        output: Optional path to write YAML (returns dict if empty)
        analyze: Whether to use VLM to analyze samples (slower but better)

    Returns:
        Complete project config dict (also written to output if specified)
    """
    sample_paths = _resolve_sample_paths(samples)

    # Derive target object from description
    target_object = description.strip().rstrip(".")
    if not project_name:
        project_name = _derive_project_name(target_object)

    print(f"[auto] Creating project: {project_name}")
    print(f"[auto] Target: {target_object}")
    print(f"[auto] Samples: {len(sample_paths)} images")

    # Analyze samples with VLM if available
    vlm_analysis = ""
    if analyze and sample_paths:
        print("[auto] Analyzing samples with VLM...")
        analysis = analyze_samples_with_vlm(sample_paths, description)
        vlm_analysis = analysis.get("combined", "")
        if vlm_analysis:
            print(f"[auto] Analysis ({analysis['backend_used']}): {vlm_analysis[:200]}...")

    # Detect content type
    content_type = detect_content_type(description, vlm_analysis)
    profile = CONTENT_PROFILES[content_type]
    print(f"[auto] Content type: {content_type}")
    print(f"[auto] Backends: filter={profile['filter_backend']}, "
          f"label={profile['label_backend']}, verify={profile['verify_backend']}")

    # Generate queries
    buckets = generate_queries(description, target_object, content_type, vlm_analysis)
    falcon_queries = generate_falcon_queries(target_object, description)

    # Build project config
    config = {
        "project_name": project_name,
        "target_object": target_object,
        "description": (
            f"Auto-generated project for labeling {target_object}. "
            f"Content type: {content_type}. "
            f"Created {datetime.now().strftime('%Y-%m-%d %H:%M')}."
        ),
        "data_root": f"~/data-label-factory/{project_name}",
        "buckets": buckets,
        "falcon_queries": falcon_queries,
        "backends": {
            "filter": profile["filter_backend"],
            "label": profile["label_backend"],
            "verify": profile["verify_backend"],
        },
    }

    # Add synthetic config if applicable
    if profile.get("synthetic"):
        config["synthetic"] = {
            "enabled": True,
            "type": "flywheel",
            "note": "This domain supports synthetic data generation via flywheel",
        }

    # Copy samples to data_root
    data_root = os.path.expanduser(config["data_root"])
    samples_dest = os.path.join(data_root, "samples")
    copied = _copy_samples(sample_paths, samples_dest)
    print(f"[auto] Copied {copied} samples to {samples_dest}")

    # Write YAML if output specified
    if output:
        output = os.path.expanduser(output)
        os.makedirs(os.path.dirname(output) or ".", exist_ok=True)
        with open(output, "w", encoding="utf-8") as f:
            yaml.dump(config, f, default_flow_style=False, sort_keys=False)
        print(f"[auto] Saved project: {output}")

    print(f"[auto] Ready! Run: data_label_factory pipeline --project {output or '<path>'}")
    return config


def _resolve_sample_paths(samples) -> list[str]:
    """Expand ``samples`` (directory, single file, or list) into file paths."""
    if isinstance(samples, (str, os.PathLike)):
        root = os.path.expanduser(os.fspath(samples))
        if os.path.isdir(root):
            # Sorted: os.listdir order is arbitrary, and only the first few
            # samples get analyzed, so the selection must be reproducible.
            return sorted(
                os.path.join(root, entry) for entry in os.listdir(root)
                if entry.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))
            )
        return [root]
    return [os.path.expanduser(s) for s in samples]


def _derive_project_name(target_object: str) -> str:
    """Build a short, filesystem-safe project slug from the target text."""
    import re

    # The slug becomes a directory name under data_root, so anything that is
    # not a word character or hyphen (spaces, "/", "..") collapses to "-".
    slug = re.sub(r"[^\w-]+", "-", target_object.lower()).strip("-")
    return slug[:30] or "project"


def _copy_samples(sample_paths: list[str], dest_dir: str) -> int:
    """Copy existing sample files into ``dest_dir``; return how many are there."""
    import shutil

    os.makedirs(dest_dir, exist_ok=True)
    copied = 0
    for src in sample_paths:
        if not os.path.isfile(src):
            print(f"[auto] Skipping sample (not a file): {src}")
            continue
        try:
            shutil.copy2(src, dest_dir)
        except shutil.SameFileError:
            # Re-running on samples that already live in dest_dir is fine.
            pass
        copied += 1
    return copied


def main(argv: list[str] | None = None):
    """CLI entry point for ``data_label_factory auto``.

    Parses ``--samples``, ``--description``, ``--name``, ``--output`` and
    ``--no-analyze`` and forwards them to ``auto_project``. When ``--output``
    is omitted the YAML goes to ``projects/<name>.yaml``, where ``<name>``
    is ``--name`` or, failing that, a slug of the description.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    import argparse
    import re

    parser = argparse.ArgumentParser(
        prog="data_label_factory auto",
        description=(
            "Create a labeling project automatically from sample images + description. "
            "The factory analyzes your samples, picks the best backends, generates "
            "search queries, and creates a ready-to-run project YAML."
        ),
    )
    parser.add_argument("--samples", required=True,
                        help="Directory of sample images or a single image path")
    parser.add_argument("--description", required=True,
                        help='What to label (e.g. "fire hydrants in urban settings")')
    parser.add_argument("--name", default="",
                        help="Project name (auto-generated from description if empty)")
    parser.add_argument("--output", default="",
                        help="Output YAML path (default: projects/<name>.yaml)")
    parser.add_argument("--no-analyze", action="store_true",
                        help="Skip VLM analysis of samples (faster, less accurate)")

    args = parser.parse_args(argv)

    output = args.output
    if not output:
        name = args.name
        if not name:
            # Normalise the description the way auto_project derives its
            # target (trimmed, trailing "." dropped) and keep the result
            # path-safe, so "fire hydrants." does not become
            # "projects/fire-hydrants..yaml" and "a/b" does not nest dirs.
            target = args.description.strip().rstrip(".")
            name = re.sub(r"[^\w-]+", "-", target.lower()).strip("-")[:30] or "project"
        output = f"projects/{name}.yaml"

    auto_project(
        samples=args.samples,
        description=args.description,
        project_name=args.name,
        output=output,
        analyze=not args.no_analyze,
    )