"""f-theta 相机模型 + 射线计算。

依据 Cosmos-Drive-Dreams 数据集 README：
    ftheta_intrinsic 存储为 ``[cx, cy, w, h, *poly(6), is_bw_poly, *linear_cde(3)]``。

f-theta 相机模型用含 6 个系数的 5 次多项式将像素半径 ``r_pix = ||(u-cx, v-cy)||`` 映射到
入射角 ``theta``（或反向）。``is_bw_poly == True`` 表示多项式是从 ``r_pix`` 反
求 ``theta`` 的 backward polynomial（pixel -> theta）；否则是 forward polynomial
（theta -> r_pix）。``linear_cde`` 是仿射修正系数 ``[c, d, e]``，用于补偿轻微
的非旋转对称形变。

为了简单与可微，本模块默认假设 backward polynomial（``is_bw_poly=True``，
即 ``theta = poly(r_pix)``）；实际数据通常是这种格式。如需 forward 多项式，
这里使用牛顿迭代反求。
"""

from __future__ import annotations

from dataclasses import dataclass

import torch
import torch.nn.functional as F


@dataclass
class FThetaIntrinsic:
    """f-theta intrinsics held as PyTorch tensors plus plain Python scalars.

    Every tensor field is a scalar or a 1-D vector describing a single camera.
    ``FThetaCamera.from_vector`` fills these fields from the flat
    ``[cx, cy, w, h, poly x 6, is_bw_poly, linear_cde x 3]`` layout.
    """

    cx: torch.Tensor          # (): principal point, u coordinate
    cy: torch.Tensor          # (): principal point, v coordinate
    w: int                    # image width as stored in the intrinsic vector
    h: int                    # image height as stored in the intrinsic vector
    poly: torch.Tensor        # (6,): polynomial coefficients, lowest order first
    is_bw_poly: bool          # True: theta = poly(r_pix); False: r_pix = poly(theta)
    linear_cde: torch.Tensor  # (3,): linear correction coefficients [c, d, e]


class FThetaCamera:
    """f-theta camera: maps pixel coordinates to unit ray directions (camera frame).

    The intrinsics are kept in ``self.intr``. Only the attributes ``cx``,
    ``cy``, ``poly``, ``is_bw_poly`` and ``linear_cde`` are read by the
    methods of this class.
    """

    def __init__(self, intr: FThetaIntrinsic) -> None:
        self.intr = intr

    @staticmethod
    def from_vector(vec: torch.Tensor) -> "FThetaCamera":
        """Build a camera from a flat NVIDIA f-theta intrinsic vector.

        Expected layout:
        ``[cx, cy, w, h, poly x 6, is_bw_poly?, linear_cde x 3?]``.
        The trailing entries are optional:

        * fewer than 11 values: ``is_bw_poly`` defaults to ``True``;
        * fewer than 14 values: ``linear_cde`` defaults to ``(1, 0, 0)``,
          which is the identity for the ``[[c, d], [e, 1]]`` correction that
          ``unproject`` applies, so pixel offsets are left untouched.

        The vector is flattened and converted to ``float32`` before parsing.

        Raises:
            ValueError: if the vector holds fewer than 10 values.
        """
        v = vec.flatten().float()
        n = int(v.numel())
        if n < 10:
            raise ValueError(f"ftheta intrinsic 维度 {n} < 10（至少需要 cx,cy,w,h + 6 poly）")

        # Entry 10 is a 0/1 flag stored as a float; threshold it at 0.5.
        is_bw = bool(v[10].item() > 0.5) if n >= 11 else True

        if n >= 14:
            linear_cde = v[11:14].clone()
        else:
            # Identity correction. With the formulas used in ``unproject``
            # (du = c*du0 + d*dv0, dv = e*du0 + dv0) any non-zero ``e`` shears
            # the image, so the neutral element is (1, 0, 0).
            linear_cde = torch.tensor([1.0, 0.0, 0.0], dtype=v.dtype, device=v.device)

        return FThetaCamera(
            FThetaIntrinsic(
                cx=v[0],
                cy=v[1],
                w=int(v[2].item()),
                h=int(v[3].item()),
                poly=v[4:10].clone(),
                is_bw_poly=is_bw,
                linear_cde=linear_cde,
            )
        )

    def _eval_poly(self, r: torch.Tensor) -> torch.Tensor:
        """Evaluate ``poly(r) = sum_i c_i * r**i`` element-wise with Horner's rule."""
        coeffs = self.intr.poly
        acc = torch.zeros_like(r)
        # Horner: start from the highest-order coefficient and fold downwards.
        for i in reversed(range(coeffs.numel())):
            acc = acc * r + coeffs[i]
        return acc

    def _eval_poly_grad(self, r: torch.Tensor) -> torch.Tensor:
        """Evaluate the derivative ``sum_{i>=1} i * c_i * r**(i-1)`` element-wise."""
        coeffs = self.intr.poly
        acc = torch.zeros_like(r)
        # Same Horner scheme applied to the derivative coefficients i * c_i.
        for i in range(coeffs.numel() - 1, 0, -1):
            acc = acc * r + coeffs[i] * float(i)
        return acc

    def pixel_to_theta(self, r_pix: torch.Tensor) -> torch.Tensor:
        """Convert pixel radius to incidence angle ``theta``.

        With a backward polynomial the angle is ``poly(r_pix)`` directly.
        With a forward polynomial (``r_pix = poly(theta)``) the angle is
        obtained with 8 Newton iterations.

        Args:
            r_pix: tensor of pixel radii, any shape.

        Returns:
            Tensor of the same shape as ``r_pix`` holding ``theta``.
        """
        if self.intr.is_bw_poly:
            return self._eval_poly(r_pix)

        # Forward polynomial: solve poly(theta) - r_pix = 0.
        coeffs = self.intr.poly
        if coeffs.numel() >= 2:
            # Start from the solution of the linear part c0 + c1 * theta = r_pix.
            # Starting from r_pix itself (pixels, not radians) puts the iterate
            # orders of magnitude away from the root, and 8 Newton steps are
            # then not enough to converge for higher-order polynomials.
            slope = coeffs[1]
            safe_slope = torch.where(slope > 1e-6, slope, torch.ones_like(slope))
            theta = (r_pix - coeffs[0]) / safe_slope
        else:
            theta = r_pix.clone()

        for _ in range(8):
            residual = self._eval_poly(theta) - r_pix
            # Keep the derivative strictly positive so the step stays finite.
            slope_at_theta = self._eval_poly_grad(theta).clamp_min(1e-6)
            theta = theta - residual / slope_at_theta
        return theta

    def unproject(self, uv: torch.Tensor) -> torch.Tensor:
        """Map pixel coordinates ``[..., 2]`` to unit directions ``[..., 3]``.

        Steps:
            (du0, dv0) = (u - cx, v - cy)
            (du, dv)   = (c*du0 + d*dv0, e*du0 + dv0)   # linear_cde correction
            r_pix      = ||(du, dv)||
            theta      = pixel_to_theta(r_pix)
            dir_cam    = (sin(theta)*du/r_pix, sin(theta)*dv/r_pix, cos(theta))

        Args:
            uv: pixel coordinates with ``u`` in ``[..., 0]`` and ``v`` in ``[..., 1]``.

        Returns:
            L2-normalised directions in the camera frame, shape ``[..., 3]``.
        """
        intr = self.intr
        c, d, e = intr.linear_cde[0], intr.linear_cde[1], intr.linear_cde[2]

        du0 = uv[..., 0] - intr.cx
        dv0 = uv[..., 1] - intr.cy
        # Linear correction with the matrix [[c, d], [e, 1]].
        # NOTE(review): this applies the matrix itself to the pixel offsets;
        # if linear_cde is defined as the sensor->image map, unprojection
        # should arguably use its inverse - confirm against the reference
        # implementation. For c ~= 1, d ~= 0, e ~= 0 the difference is tiny.
        du = c * du0 + d * dv0
        dv = e * du0 + dv0

        eps = 1e-6
        # Clamp the squared radius before the sqrt so the gradient at the
        # principal point is 0 instead of NaN (sqrt has an infinite slope at
        # 0). The outer clamp keeps the radius >= eps even when eps**2
        # underflows in low-precision dtypes.
        r_pix = torch.sqrt((du * du + dv * dv).clamp_min(eps * eps)).clamp_min(eps)
        theta = self.pixel_to_theta(r_pix)

        sin_t = torch.sin(theta)
        # du / r_pix and dv / r_pix are cos(phi) and sin(phi) of the azimuth.
        dir_cam = torch.stack(
            [sin_t * (du / r_pix), sin_t * (dv / r_pix), torch.cos(theta)],
            dim=-1,
        )
        return F.normalize(dir_cam, dim=-1)


def compute_ego_rays(
    intr_vec: torch.Tensor,
    cam2vehicle: torch.Tensor,
    height: int,
    width: int,
    grid_h: int,
    grid_w: int,
    device: torch.device,
    dtype: torch.dtype = torch.float32,
) -> torch.Tensor:
    """Compute unit ray directions in the ego-vehicle frame on a uniform pixel grid.

    The image is divided into ``grid_h x grid_w`` equal cells and one ray is
    cast through the centre of each cell. Each ray is unprojected with the
    f-theta model and rotated by the 3x3 rotation part of ``cam2vehicle``
    (the translation does not affect directions).

    Args:
        intr_vec: ``[B, 14]`` or ``[14]`` f-theta intrinsic vector(s); shorter
            vectors accepted by ``FThetaCamera.from_vector`` also work.
        cam2vehicle: ``[B, 4, 4]`` or ``[4, 4]`` camera-to-vehicle transform(s).
            If only one of ``intr_vec`` / ``cam2vehicle`` is batched, the
            other one is broadcast over the batch.
        height, width: image resolution in pixels; the grid spans
            ``[0, width] x [0, height]``.
        grid_h, grid_w: output grid resolution (e.g. the patch grid, 24x64).
        device: device on which the rays are computed and returned.
        dtype: floating dtype of the computation and of the result.

    Returns:
        ``[B, grid_h, grid_w, 3]`` unit directions in the vehicle frame.

    Raises:
        ValueError: if both inputs are batched with different batch sizes.
    """
    if intr_vec.dim() == 1:
        intr_vec = intr_vec.unsqueeze(0)
    if cam2vehicle.dim() == 2:
        cam2vehicle = cam2vehicle.unsqueeze(0)

    # Broadcast a single intrinsic / pose over the other argument's batch.
    n_intr, n_pose = intr_vec.shape[0], cam2vehicle.shape[0]
    if n_intr != n_pose:
        if n_intr == 1:
            intr_vec = intr_vec.expand(n_pose, -1)
        elif n_pose == 1:
            cam2vehicle = cam2vehicle.expand(n_intr, -1, -1)
        else:
            raise ValueError(
                f"batch size mismatch: intr_vec has {n_intr}, cam2vehicle has {n_pose}"
            )

    # Move everything onto the device of the pixel grid; the inputs may live
    # elsewhere (e.g. CPU metadata) and the matmul below needs one device.
    intr_vec = intr_vec.to(device=device, dtype=dtype)
    rotations = cam2vehicle[:, :3, :3].to(device=device, dtype=dtype)

    # Sample at the centre of every grid cell.
    u = (torch.arange(grid_w, device=device, dtype=dtype) + 0.5) * (width / grid_w)
    v = (torch.arange(grid_h, device=device, dtype=dtype) + 0.5) * (height / grid_h)
    vv, uu = torch.meshgrid(v, u, indexing="ij")  # [gh, gw]
    uv = torch.stack([uu, vv], dim=-1)  # [gh, gw, 2]

    rays = []
    for vec, rot in zip(intr_vec, rotations):
        dir_cam = FThetaCamera.from_vector(vec).unproject(uv)  # [gh, gw, 3]
        # Row-vector convention: v_vehicle = v_cam @ R^T.
        rays.append(F.normalize(dir_cam @ rot.T, dim=-1))

    if not rays:
        # Empty batch: torch.stack cannot handle an empty list.
        return torch.empty((0, grid_h, grid_w, 3), device=device, dtype=dtype)
    return torch.stack(rays, dim=0)