#!/usr/bin/env bash
set -euo pipefail

echo "🚀 Builder completo — FlashAttention LayerNorm, Apex, Q8, FlashAttention (GitHub) + upload"

# ===== Configurações e diretórios =====
APP_WHEELS="/app/wheels"
APP_CUDA_CACHE="/app/cuda_cache"
SRC_DIR="$APP_WHEELS/src"
mkdir -p "$APP_WHEELS" "$APP_CUDA_CACHE" "$SRC_DIR"
chmod -R 777 "$APP_WHEELS" || true
export CUDA_CACHE_PATH="$APP_CUDA_CACHE"

# Preserva licença NGC (se existir)
if [ -f "/NGC-DL-CONTAINER-LICENSE" ]; then
  cp -f /NGC-DL-CONTAINER-LICENSE "$APP_WHEELS/" || true
fi

# ===== Dependências mínimas =====
python -m pip install -v -U \
  pip build setuptools wheel hatchling hatch-vcs \
  scikit-build-core cmake ninja packaging \
  "huggingface_hub[hf_transfer]" || true

# ===== Tags de ambiente (Python/CUDA/Torch) =====
PY_TAG="$(python -c 'import sys; print(f"cp{sys.version_info[0]}{sys.version_info[1]}")' 2>/dev/null || echo cp310)"
TORCH_VER="$(python - <<'PY'  
try; import torch, re; v=torch.__version__; print(re.sub(r'\+.*$', '', v))
except; print("unknown")
PY
)"
CU_TAG="$(python - <<'PY'
try; import torch; cu=getattr(torch.version,"cuda",None); echo="cu"+cu.replace(".","") if cu else ""; print(echo)
except; print("")
PY
)"
echo "[env] PY_TAG=$PY_TAG TORCH_VER=$TORCH_VER CU_TAG=$CU_TAG"

# ===== Funções de checagem =====
check_flash_layer_norm_bin() {
  python - <<'PY'
import importlib, sys
modules = [
  "dropout_layer_norm",
  "flash_attn.ops.layer_norm",
  "flash_attn.ops.rms_norm",
]
for m in modules:
    try: importlib.import_module(m); sys.exit(0)
    except: pass
sys.exit(1)
PY
}

check_apex() {
  python - <<'PY'
import sys
try:
    from apex.normalization import FusedLayerNorm
    import importlib; importlib.import_module("fused_layer_norm_cuda")
    sys.exit(0)
except:
    sys.exit(1)
PY
}

check_q8() {
  python - <<'PY'
import importlib.util, sys
spec = importlib.util.find_spec("ltx_q8_kernels") or importlib.util.find_spec("q8_kernels")
sys.exit(0 if spec else 1)
PY
}

# ===== Helpers Hugging Face =====
install_from_hf_by_prefix() {
  local PREFIX="$1"
  python - <<'PY' || return 1
import os, sys, re
from huggingface_hub import HfApi, hf_hub_download, HfFolder
repo = os.getenv("SELF_HF_REPO_ID","eeuuia/Tmp")
token = os.getenv("HF_TOKEN") or HfFolder.get_token()
api = HfApi(token=token)
files = api.list_repo_files(repo_id=repo, repo_type="model")
cands = [f for f in files if f.endswith(".whl") and "/${PREFIX}-" in f and "${PY_TAG}" in f]
pref = [f for f in cands if "${CU_TAG}" in f] or cands
if not pref: sys.exit(1)
target = sorted(pref, reverse=True)[0]
hf_hub_download(repo_id=repo, filename=target, repo_type="model", local_dir="$APP_WHEELS")
print(target)
PY
}

# ===== Build functions =====
build_flash_layer_norm() {
  echo "=== FlashAttn LayerNorm ==="
  if install_from_hf_by_prefix "flash-attn"; then
    python -m pip install -v --no-deps "$APP_WHEELS"/flash_attn-*.whl || true
    check_flash_layer_norm_bin && return 0
    echo "Wheel HF falhou, build local"
  fi

  SRC="$SRC_DIR/flash-attention"
  if [ -d "$SRC/.git" ]; then
    git -C "$SRC" fetch --all -p || true
    git -C "$SRC" reset --hard origin/main || true
    git -C "$SRC" clean -fdx || true
  else
    rm -rf "$SRC"
    git clone --depth 1 https://github.com/Dao-AILab/flash-attention "$SRC"
  fi

  export TORCH_CUDA_ARCH_LIST="$(python - <<'PY'
import torch,sys
try: cc="%d.%d"%torch.cuda.get_device_capability(0); print(cc)
except: print("8.9")
PY
  )"
  echo "[build] TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"

  pushd "$SRC/csrc/layer_norm" >/dev/null
    python -m pip wheel -v --no-build-isolation --no-deps . -w "$APP_WHEELS" || true
  popd >/dev/null

  WHEEL=$(ls -t "$APP_WHEELS"/*flash*layer*norm*-*.whl 2>/dev/null | head -n1)
  python -m pip install -v --no-deps "${WHEEL:-$SRC/csrc/layer_norm}" || true
  check_flash_layer_norm_bin || echo "⚠️ LayerNorm import falhou"
}

build_apex() {
  echo "=== Apex ==="
  SRC="$SRC_DIR/apex"
  rm -rf "$SRC"
  git clone --depth 1 https://github.com/NVIDIA/apex "$SRC"
  export APEX_CPP_EXT=1 APEX_CUDA_EXT=1 APEX_ALL_CONTRIB_EXT=0
  python -m pip wheel -v --no-build-isolation --no-deps "$SRC" -w "$APP_WHEELS" || true
  python -m pip install -v --no-deps "$APP_WHEELS"/apex-*.whl || true
}

build_q8() {
  echo "=== Q8 Kernels ==="
  SRC="$SRC_DIR/q8_kernels"
  rm -rf "$SRC"
  git clone --filter=blob:none "$Q8_REPO" "$SRC"
  git -C "$SRC" checkout "$Q8_COMMIT"
  git -C "$SRC" submodule update --init --recursive
  python -m pip wheel -v --no-build-isolation --no-deps "$SRC" -w "$APP_WHEELS" || true
  python -m pip install -v --no-deps "$APP_WHEELS"/q8_kernels-*.whl || true
}

build_flash_attention_full() {
  echo "=== FlashAttention (full GitHub) ==="
  SRC="$SRC_DIR/flash-attention-full"
  rm -rf "$SRC"
  git clone --depth 1 https://github.com/Dao-AILab/flash-attention "$SRC"
  pushd "$SRC" >/dev/null
    export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-cuda}"
    python -m pip wheel -v --no-build-isolation --no-deps . -w "$APP_WHEELS" || true
  popd >/dev/null
  W=$(ls -t "$APP_WHEELS"/flash_attn-*.whl 2>/dev/null | head -n1)
  if [ -n "$W" ]; then
    python -m pip install -v --no-deps "$W"
  else
    python -m pip install -v --no-deps git+https://github.com/Dao-AILab/flash-attention
  fi
}

# ===== Execução principal =====

build_apex
build_q8
build_flash_attention_full
build_flash_layer_norm

# ===== Upload de wheels =====
python - <<'PY'
import os
from huggingface_hub import HfApi, HfFolder
repo = os.getenv("SELF_HF_REPO_ID","eeuuia/Tmp")
token = os.getenv("HF_TOKEN") or HfFolder.get_token()
if not token: exit(0)
api = HfApi(token=token)
api.upload_folder(
    folder_path="$APP_WHEELS",
    repo_id=repo,
    repo_type="model",
    allow_patterns=["*.whl","NGC-DL-CONTAINER-LICENSE"],
    ignore_patterns=["**/src/**",".git/**"],
)
print("✅ Upload concluído.")
PY

chmod -R 777 "$APP_WHEELS" || true
echo "✅ Builder finalizado."