# cs3319-project2/code/generate_conservative_rw_blends.py
# CS3319 Project 2 final deliverable (public F1 = 0.96626); revision f28d994, 3.93 kB.
"""Conservative blends anchored to the public-validated random-walk score."""
from __future__ import annotations
from pathlib import Path
import numpy as np
import pandas as pd
# Project root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Directory that receives every blend submission CSV and the summary CSV.
OUT = ROOT / "validation_runs" / "dynamic_seed202" / "randomwalk_conservative_blends"
def rank01(x: np.ndarray) -> np.ndarray:
    """Replace each value by its rank, rescaled to evenly spaced points in [0, 1].

    The smallest value maps to 0 and the largest to 1. Ties are broken by
    original position because the sort is stable, so equal values receive
    distinct, consecutive ranks.

    Args:
        x: One-dimensional array of scores.

    Returns:
        A ``float32`` array of the same length holding the rescaled ranks.
    """
    count = len(x)
    grid = np.linspace(0, 1, count, dtype=np.float32)
    sorted_positions = np.argsort(x, kind="mergesort")
    # Invert the sort permutation: rank_of[i] is where element i lands once sorted.
    rank_of = np.empty(count, dtype=np.intp)
    rank_of[sorted_positions] = np.arange(count)
    return grid[rank_of]
def write_ratio_submission(name: str, score: np.ndarray, ratio: float, reference: np.ndarray | None) -> dict:
    """Write a submission CSV that labels the top-scoring share of rows positive.

    The ``round(len(score) * ratio)`` highest-scoring rows are set to 1, then
    every row flagged in ``cached_scores/test_known_mask.npy`` is forced to 1
    as well (presumably rows already known to be positive -- confirm against
    the code that produces the mask). Because of that override, the realised
    positive share can exceed ``ratio``.

    Args:
        name: Blend name, used as the CSV file-name prefix.
        score: One score per test row; higher means more likely positive.
        ratio: Target fraction of rows to label positive. Values that yield a
            count outside ``[0, len(score)]`` are clamped to that range.
        reference: Optional prediction vector to diff against, or ``None``.

    Returns:
        A dict with the written ``path``, the requested ``ratio``, the realised
        ``positive_ratio``, and ``changed_vs_public_anchor`` (number of rows
        that differ from ``reference``, or -1 when no reference is given).
    """
    total = len(score)
    # Clamp so out-of-range ratios mean "none" or "all" instead of producing
    # a slice with an unintended start index.
    n_positive = min(max(int(round(total * ratio)), 0), total)

    known = np.load(ROOT / "cached_scores" / "test_known_mask.npy").astype(bool)
    pred = np.zeros(total, dtype=np.int8)
    if n_positive > 0:
        # Guarded on purpose: ``order[-0:]`` is the whole array, which would
        # mark every row positive when the requested count rounds to zero.
        order = np.argsort(score, kind="mergesort")
        pred[order[-n_positive:]] = 1
    pred[known] = 1

    path = OUT / f"{name}_r{ratio:.3f}.csv"
    submission = pd.DataFrame({"Index": np.arange(total, dtype=np.int64), "Predicted": pred})
    submission.to_csv(path, index=False)

    changed = int((pred != reference).sum()) if reference is not None else -1
    return {
        "path": str(path),
        "ratio": ratio,
        "positive_ratio": float(pred.mean()),
        "changed_vs_public_anchor": changed,
    }
def main() -> None:
    """Generate rank-blended submissions anchored on the public-validated model.

    Loads the cached test scores of the anchor model and four random-walk
    variants, converts each to [0, 1] ranks, mixes them with small weights on
    the non-anchor models, and writes one submission per blend and positive
    ratio into ``OUT``. A summary table of all runs is saved as
    ``conservative_blend_summary.csv`` and printed.
    """
    OUT.mkdir(parents=True, exist_ok=True)

    score_paths = {
        "anchor": ROOT / "validation_runs/dynamic_seed202/node2vec_deepwalk_submission/test_content_mf_deepwalk_node2vec_lgb_pred.npy",
        "highdim": ROOT / "validation_runs/dynamic_seed202/randomwalk_ensemble_submission/test_dw_highdim_d256_l40_w10_win10_pred.npy",
        "d256l80": ROOT / "validation_runs/dynamic_seed202/randomwalk_ensemble_submission/test_dw_d256_l80_w10_win10_pred.npy",
        "ens5": ROOT / "validation_runs/dynamic_seed202/randomwalk_ensemble_submission/test_rwens_dw_basel40_dw_longl80_dw_highdim_d256_l40_dw_graph_ap_pp_n2v_p2_q1l40_pred.npy",
        "ens7": ROOT / "validation_runs/dynamic_seed202/randomwalk_ensemble_submission/test_rwens_dw_basel40_dw_longl80_dw_highdim_d256_l40_dw_d256_l80_dw_seed3407l40_dw_graph_ap_pp_n2v_p2_q1l40_pred.npy",
    }
    scores = {}
    for model, npy_path in score_paths.items():
        scores[model] = np.load(npy_path).astype(np.float32)

    # Predictions of the anchor submission, used only to count changed rows;
    # the comparison is skipped (None) when that CSV is not present.
    anchor_sub = ROOT / "validation_runs/dynamic_seed202/node2vec_deepwalk_submission/submission_content_mf_deepwalk_node2vec_lgb_th0.480000.csv"
    if anchor_sub.exists():
        anchor_pred = pd.read_csv(anchor_sub)["Predicted"].to_numpy(np.int8)
    else:
        anchor_pred = None

    rank = {}
    for model, values in scores.items():
        rank[model] = rank01(values)

    # Each recipe lists (model, weight) terms; they are summed left to right.
    recipes = {
        "anchor_rank_only": [("anchor", 1.00)],
        "blend_anchor90_highdim10": [("anchor", 0.90), ("highdim", 0.10)],
        "blend_anchor85_highdim15": [("anchor", 0.85), ("highdim", 0.15)],
        "blend_anchor80_highdim20": [("anchor", 0.80), ("highdim", 0.20)],
        "blend_anchor90_d256l80_10": [("anchor", 0.90), ("d256l80", 0.10)],
        "blend_anchor85_d256l80_15": [("anchor", 0.85), ("d256l80", 0.15)],
        "blend_anchor90_ens5_10": [("anchor", 0.90), ("ens5", 0.10)],
        "blend_anchor85_ens5_15": [("anchor", 0.85), ("ens5", 0.15)],
        "blend_anchor90_ens7_10": [("anchor", 0.90), ("ens7", 0.10)],
        "blend_anchor85_ens7_15": [("anchor", 0.85), ("ens7", 0.15)],
        "blend_anchor80_highdim10_d256l80_10": [("anchor", 0.80), ("highdim", 0.10), ("d256l80", 0.10)],
        "blend_anchor75_highdim10_d256l80_10_ens5_05": [
            ("anchor", 0.75),
            ("highdim", 0.10),
            ("d256l80", 0.10),
            ("ens5", 0.05),
        ],
    }
    blends = {}
    for blend_name, terms in recipes.items():
        (first_model, first_weight), *remaining = terms
        mixed = first_weight * rank[first_model]
        for model, weight in remaining:
            mixed = mixed + weight * rank[model]
        blends[blend_name] = mixed

    rows = []
    for blend_name, blended in blends.items():
        for ratio in (0.499, 0.500, 0.501):
            result = write_ratio_submission(blend_name, blended.astype(np.float32), ratio, anchor_pred)
            rows.append(result)

    summary = pd.DataFrame(rows)
    summary.to_csv(OUT / "conservative_blend_summary.csv", index=False)
    print(summary.to_string(index=False))
# Run the blend generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()