"""Regenerate the confirmed 6-model LightGCN ensemble submissions.

This script reproduces the family of `sub_ens6_t*.csv` files. The public
leaderboard score we have confirmed is:

    sub_ens6_t0.36.csv -> F1 0.93044

It expects the transfer package layout by default:

    data_and_docs/
    checkpoints/final_ens6/
    submissions/

Run from the package root:

    python code/generate_ens6_submission.py
"""

from __future__ import annotations

import argparse
import os
import pickle as pkl
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from numpy.linalg import norm
from torch_geometric.data import HeteroData


# Relations of the heterogeneous graph, each written as a
# (source node type, relation name, destination node type) triple.
# LightGCNLayer.forward walks this list in order, unpacks every triple into
# its source and destination node types, and skips any relation that is
# missing from the edge-index dict it is given.
EDGE_TYPES = [
    ("author", "ref", "paper"),
    ("paper", "beref", "author"),
    ("paper", "cite", "paper"),
    ("author", "coauthor", "author"),
]


def read_txt(path: Path) -> list[list[int]]:
    """Parse a text file of whitespace-separated integers, one row per line.

    Blank or whitespace-only lines (for example a stray empty line at the end
    of the file) are skipped. Previously they produced empty rows, which made
    the row lists ragged and broke callers that unpack every row into two
    values.

    Args:
        path: File to read.

    Returns:
        One list of ints per non-blank line, in file order.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    with path.open("r") as f:
        # str.split() with no argument drops surrounding whitespace and yields
        # [] for a blank line, so the truthiness test filters those lines out.
        return [list(map(int, fields)) for fields in map(str.split, f) if fields]


def log_norm(x: np.ndarray) -> np.ndarray:
    """Apply ``log1p`` to *x*, then standardise the result.

    The logged values are shifted by their mean and divided by their standard
    deviation; a small constant in the denominator avoids division by zero
    when all values are equal.
    """
    logged = np.log1p(x)
    centred = logged - logged.mean()
    spread = logged.std() + 1e-8
    return centred / spread


class LightGCNLayer(nn.Module):
    """Parameter-free propagation step over the relations in ``EDGE_TYPES``.

    For every relation present in ``edge_index_dict`` the source embeddings
    are mean-aggregated onto their destination nodes. A node type that
    receives messages from several relations gets the plain average of the
    per-relation results; a node type that receives none keeps its input.
    """

    def forward(self, x_dict, edge_index_dict):
        incoming = {ntype: [] for ntype in x_dict}

        for relation in EDGE_TYPES:
            if relation not in edge_index_dict:
                continue

            src_type, _, dst_type = relation
            src_idx, dst_idx = edge_index_dict[relation]
            messages = x_dict[src_type]
            num_dst = x_dict[dst_type].size(0)

            # Sum the incoming source embeddings per destination node ...
            summed = messages.new_zeros((num_dst, messages.size(-1)))
            summed.index_add_(0, dst_idx, messages[src_idx])

            # ... and count how many edges landed on each destination node.
            counts = messages.new_zeros((num_dst, 1))
            counts.index_add_(0, dst_idx, messages.new_ones((dst_idx.numel(), 1)))

            # Clamping keeps isolated nodes (count 0) from dividing by zero.
            incoming[dst_type].append(summed / counts.clamp(min=1.0))

        out = {}
        for ntype, per_relation in incoming.items():
            if per_relation:
                out[ntype] = sum(per_relation) / len(per_relation)
            else:
                out[ntype] = x_dict[ntype]
        return out


class LightGCN(nn.Module):
    """LightGCN-style encoder over an author/paper graph.

    Authors start from a learned embedding table and papers from a linear
    projection of their features. The final embedding of each node is the
    uniform average of the initial embedding and the output of every
    propagation layer.
    """

    def __init__(self, num_authors: int, paper_feat_dim: int, embed_dim: int, num_layers: int = 4):
        super().__init__()
        self.author_emb = nn.Embedding(num_authors, embed_dim)
        self.paper_proj = nn.Linear(paper_feat_dim, embed_dim)
        self.layers = nn.ModuleList([LightGCNLayer() for _ in range(num_layers)])
        self.num_layers = num_layers

    def encode(self, data):
        """Return ``{node_type: embedding}`` averaged over all propagation depths."""
        current = {
            "author": self.author_emb.weight,
            "paper": self.paper_proj(data["paper"].x),
        }
        snapshots = [current]
        for propagate in self.layers:
            current = propagate(current, data.edge_index_dict)
            snapshots.append(current)

        # Layer 0 (the inputs) plus num_layers propagated outputs, equally weighted.
        scale = 1.0 / (self.num_layers + 1)
        return {
            ntype: sum(scale * snapshot[ntype] for snapshot in snapshots)
            for ntype in current
        }


def cos_sim(a: np.ndarray, b: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Return the row-wise cosine similarity between *a* and *b*.

    *eps* is added to the product of the row norms so that an all-zero row
    produces 0 instead of a division by zero.
    """
    dots = (a * b).sum(axis=1)
    scale = norm(a, axis=1) * norm(b, axis=1) + eps
    return dots / scale


def build_features(data_dir: Path, num_authors: int, num_papers: int):
    """Load the raw edge lists and paper features and derive the model inputs.

    Reads four edge-list text files and ``feature.pkl`` from *data_dir*,
    appends three log-scaled, standardised degree columns to the paper
    features, and finally standardises every column of the augmented matrix.

    Args:
        data_dir: Directory holding the ``*_ann.txt`` files and ``feature.pkl``.
        num_authors: Not used by this function.
        num_papers: Length of the per-paper degree counters.

    Returns:
        A dict with the raw row lists (``citation``, ``existing_refs``,
        ``refs_to_pred``, ``coauthor``), the augmented paper feature matrix
        (``paper_feat_aug``) and ``source``/``target`` DataFrames
        (``cite_edges``, ``ref_edges``, ``coauthor_edges``).
    """
    edge_columns = ["source", "target"]

    citation = read_txt(data_dir / "paper_file_ann.txt")
    existing_refs = read_txt(data_dir / "bipartite_train_ann.txt")
    refs_to_pred = read_txt(data_dir / "bipartite_test_ann.txt")
    coauthor = read_txt(data_dir / "author_file_ann.txt")

    # NOTE: unpickling executes arbitrary code; only load feature.pkl from a
    # trusted package. The loaded object must expose ``.numpy()``
    # (presumably a torch tensor -- confirm against the data package).
    with (data_dir / "feature.pkl").open("rb") as handle:
        raw_features = pkl.load(handle)

    # Per-paper degree counters, indexed by paper id.
    ref_degree = np.zeros(num_papers, dtype=np.float32)
    cites_made = np.zeros(num_papers, dtype=np.float32)
    cites_received = np.zeros(num_papers, dtype=np.float32)

    for _, paper_id in existing_refs:
        ref_degree[paper_id] += 1
    for citing, cited in citation:
        cites_made[citing] += 1
        cites_received[cited] += 1

    degree_columns = np.stack(
        [log_norm(ref_degree), log_norm(cites_made), log_norm(cites_received)],
        axis=-1,
    )
    base_features = raw_features.numpy().astype(np.float32)
    augmented = np.concatenate([base_features, degree_columns], axis=-1)

    # Column-wise standardisation of the full augmented matrix.
    column_mean = augmented.mean(axis=0)
    column_std = augmented.std(axis=0)
    augmented = (augmented - column_mean) / (column_std + 1e-8)

    features = {
        "citation": citation,
        "existing_refs": existing_refs,
        "refs_to_pred": refs_to_pred,
        "coauthor": coauthor,
        "paper_feat_aug": augmented,
    }
    features["cite_edges"] = pd.DataFrame(citation, columns=edge_columns)
    features["ref_edges"] = pd.DataFrame(existing_refs, columns=edge_columns)
    features["coauthor_edges"] = pd.DataFrame(coauthor, columns=edge_columns)
    return features


def build_data(parts, num_authors: int, num_papers: int, device: torch.device):
    """Assemble the heterogeneous author/paper graph and move it to *device*.

    Args:
        parts: Dict providing ``ref_edges``, ``cite_edges`` and
            ``coauthor_edges`` DataFrames (with ``source``/``target``
            columns) plus the ``paper_feat_aug`` feature matrix.
        num_authors: Number of author nodes.
        num_papers: Number of paper nodes.
        device: Target device for the returned graph.

    Returns:
        A ``HeteroData`` graph with author->paper edges, their reverse, and
        citation and co-author edges stored in both directions.
    """

    def as_edge_tensor(frame):
        # (num_edges, 2) long tensor of [source, target] rows.
        return torch.as_tensor(frame[["source", "target"]].to_numpy(), dtype=torch.long)

    def to_edge_index(edges):
        # Stored layout is (2, num_edges).
        return edges.t().contiguous()

    def with_reverse(edges):
        # Append the flipped copy of every edge so the relation is symmetric.
        return torch.cat([edges, edges[:, [1, 0]]], dim=0)

    ref = as_edge_tensor(parts["ref_edges"])
    cite = as_edge_tensor(parts["cite_edges"])
    coauthor = as_edge_tensor(parts["coauthor_edges"])

    graph = HeteroData()
    graph["author"].num_nodes = num_authors
    graph["paper"].num_nodes = num_papers
    graph["paper"].x = torch.as_tensor(parts["paper_feat_aug"], dtype=torch.float)

    graph["author", "ref", "paper"].edge_index = to_edge_index(ref)
    graph["paper", "beref", "author"].edge_index = to_edge_index(ref[:, [1, 0]])
    graph["paper", "cite", "paper"].edge_index = to_edge_index(with_reverse(cite))
    graph["author", "coauthor", "author"].edge_index = to_edge_index(with_reverse(coauthor))
    return graph.to(device)


@torch.no_grad()
def predict(model: LightGCN, data, pairs: np.ndarray, batch_size: int) -> np.ndarray:
    """Score (author, paper) pairs by cosine similarity of their embeddings.

    The model is switched to eval mode and the graph is encoded once; the
    embeddings are moved to the CPU and the pairs are then scored in chunks
    of *batch_size* rows.

    Args:
        model: Encoder whose ``encode(data)`` returns a dict with ``"author"``
            and ``"paper"`` embedding tensors.
        data: Graph object handed to ``model.encode``.
        pairs: Integer array whose column 0 indexes authors and column 1
            indexes papers.
        batch_size: Number of pairs scored per chunk.

    Returns:
        A 1-D array with one score per row of *pairs*. An empty *pairs* input
        yields an empty array instead of raising.
    """
    model.eval()
    z_dict = model.encode(data)
    author_z = z_dict["author"].cpu().numpy()
    paper_z = z_dict["paper"].cpu().numpy()

    num_pairs = len(pairs)
    if num_pairs == 0:
        # np.concatenate raises ValueError on an empty list of chunks, so
        # return the empty result directly.
        return np.empty(0, dtype=author_z.dtype)

    scores = []
    for start in range(0, num_pairs, batch_size):
        # Slicing clamps at the end of the array, so the last chunk may be short.
        batch = pairs[start:start + batch_size]
        scores.append(cos_sim(author_z[batch[:, 0]], paper_z[batch[:, 1]]))
    return np.concatenate(scores)


def main() -> None:
    """Score the test pairs with every checkpoint and write thresholded CSVs.

    Steps: parse the command line, build the features and the graph, load
    each of the six checkpoints and score the test pairs, average the scores,
    force pairs already present in the training edges to 1.0, and write one
    ``sub_ens6_t<threshold>.csv`` file per requested threshold.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--package-root", type=Path, default=Path(__file__).resolve().parents[1])
    cli.add_argument("--data-dir", type=Path, default=None)
    cli.add_argument("--checkpoint-dir", type=Path, default=None)
    cli.add_argument("--output-dir", type=Path, default=None)
    cli.add_argument("--device", default="cuda:0" if torch.cuda.is_available() else "cpu")
    cli.add_argument("--batch-size", type=int, default=65536)
    cli.add_argument(
        "--thresholds",
        nargs="*",
        type=float,
        default=[0.30, 0.32, 0.34, 0.35, 0.36, 0.37, 0.38, 0.40, 0.42, 0.45],
    )
    args = cli.parse_args()

    # Explicit directories win; otherwise fall back to the package layout.
    root = args.package_root
    data_dir = args.data_dir if args.data_dir else root / "data_and_docs"
    checkpoint_dir = (
        args.checkpoint_dir if args.checkpoint_dir else root / "checkpoints" / "final_ens6"
    )
    output_dir = args.output_dir if args.output_dir else root / "submissions" / "regenerated_ens6"
    output_dir.mkdir(parents=True, exist_ok=True)

    device = torch.device(args.device)
    num_authors = 6611
    num_papers = 79937

    parts = build_features(data_dir, num_authors, num_papers)
    data = build_data(parts, num_authors, num_papers, device)
    test_arr = np.array(parts["refs_to_pred"], dtype=np.int64)
    feat_dim = parts["paper_feat_aug"].shape[1]

    # (checkpoint file name, embedding dimension) for each ensemble member.
    members = (
        ("model_lgcn_s0.pt", 256),
        ("model_lgcn_s42.pt", 256),
        ("model_lgcn_s2024.pt", 256),
        ("model_lgcn_s10.pt", 256),
        ("model_lgcn_s100.pt", 256),
        ("model_lgcn_dim384_s99.pt", 384),
    )

    per_model_scores = []
    for filename, embed_dim in members:
        model = LightGCN(num_authors, feat_dim, embed_dim).to(device)
        weights = torch.load(checkpoint_dir / filename, map_location=device)
        model.load_state_dict(weights)
        scores = predict(model, data, test_arr, args.batch_size)
        per_model_scores.append(scores)
        print(f"{filename}: mean={scores.mean():.6f} std={scores.std():.6f}")
        del model

    ensemble = np.mean(per_model_scores, axis=0)

    # A test pair that already appears among the training edges is treated as
    # a certain positive, whatever the models scored it.
    train_set = {tuple(edge) for edge in parts["existing_refs"]}
    known_mask = np.array([tuple(pair) in train_set for pair in parts["refs_to_pred"]])
    ensemble[known_mask] = 1.0
    print(f"known positives forced to 1: {known_mask.sum()} / {len(known_mask)}")

    for threshold in args.thresholds:
        preds = (ensemble >= threshold).astype(int)
        rows = [[row_id, str(int(label))] for row_id, label in enumerate(preds)]
        out = pd.DataFrame(rows, columns=["Index", "Predicted"], dtype=object)
        path = output_dir / f"sub_ens6_t{threshold:.2f}.csv"
        out.to_csv(path, index=False)
        print(f"{path}: positives={preds.sum()} ratio={preds.mean():.6f}")


if __name__ == "__main__":
    # Sets PYTHONHASHSEED to "0" only when the variable is not already present
    # in the environment.
    # NOTE(review): the interpreter reads PYTHONHASHSEED at startup, so setting
    # it here presumably does not change str/bytes hashing in this
    # already-running process; only child processes would inherit it. Confirm
    # whether this line is still needed.
    os.environ.setdefault("PYTHONHASHSEED", "0")
    main()