Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| import json | |
| import pickle | |
| import numpy as np | |
| import pandas as pd | |
| import torch | |
| import torch.nn as nn | |
| from torch.utils.data import Dataset, DataLoader | |
# Number of test rows scored per forward pass.
BATCH_SIZE = 64
# Default layer sizes and dropout rate used when the classifier is rebuilt.
HIDDEN_DIM = 768
PARENT_EMBED_DIM = 64
DROPOUT = 0.1

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Restrict PyTorch to a single intra-op CPU thread.
torch.set_num_threads(1)
class HierEmbedDataset(Dataset):
    """Feature rows paired with their label indices at levels y2 through y6.

    Args:
        X: Array-like of features; converted to a float32 tensor whose first
            axis indexes the samples.
        y_dict: Mapping with the keys ``"y2"`` ... ``"y6"``; each value is
            converted to an int64 tensor and indexed in step with ``X``.
    """

    def __init__(self, X, y_dict):
        self.X = torch.tensor(X, dtype=torch.float32)
        # One attribute per hierarchy level: self.y2, self.y3, ..., self.y6.
        for key in ("y2", "y3", "y4", "y5", "y6"):
            setattr(self, key, torch.tensor(y_dict[key], dtype=torch.long))

    def __len__(self):
        """Return the number of samples (size of the first axis of ``X``)."""
        return self.X.size(0)

    def __getitem__(self, idx):
        """Return ``(x, y2, y3, y4, y5, y6)`` for position ``idx``."""
        label_tensors = (self.y2, self.y3, self.y4, self.y5, self.y6)
        return (self.X[idx], *(labels[idx] for labels in label_tensors))
class HierarchicalEmbedMLP(nn.Module):
    """MLP with one classification head per hierarchy level (2 through 6).

    A shared two-layer trunk (``input_block``) encodes the input. ``head2``
    classifies directly from that encoding. Each deeper level ``k`` owns a
    ``head{k}_hidden`` layer that takes the encoding concatenated with an
    embedding of the parent class (``y{k-1}_embed``), followed by a
    ``head{k}`` output layer.

    Only ``encode`` is defined here; no ``forward`` is visible in this file,
    so callers drive the heads themselves (as ``strict_decode`` does).

    Args:
        input_dim: Width of the input feature vector.
        n2, n3, n4, n5, n6: Number of classes at each hierarchy level.
        hidden_dim: Width of the trunk and of every per-level hidden layer.
        parent_embed_dim: Size of the parent-class embeddings.
        dropout: Dropout probability used in the trunk and in ``self.dropout``.
    """

    def __init__(
        self,
        input_dim,
        n2,
        n3,
        n4,
        n5,
        n6,
        hidden_dim=HIDDEN_DIM,
        parent_embed_dim=PARENT_EMBED_DIM,
        dropout=DROPOUT,
    ):
        super().__init__()

        # Trunk: two Linear -> ReLU -> Dropout stages, the first of which maps
        # input_dim to hidden_dim.
        trunk_layers = []
        for in_features in (input_dim, hidden_dim):
            trunk_layers += [
                nn.Linear(in_features, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        self.input_block = nn.Sequential(*trunk_layers)

        # Embeddings of the parent class fed to the next level's head.
        # Level 6 has no children, hence no y6_embed.
        for level, n_classes in ((2, n2), (3, n3), (4, n4), (5, n5)):
            setattr(self, f"y{level}_embed", nn.Embedding(n_classes, parent_embed_dim))

        # Level 2 is predicted from the trunk output alone.
        self.head2 = nn.Linear(hidden_dim, n2)

        # Levels 3-6 see the trunk output concatenated with a parent embedding.
        conditioned_dim = hidden_dim + parent_embed_dim
        for level, n_classes in ((3, n3), (4, n4), (5, n5), (6, n6)):
            setattr(self, f"head{level}_hidden", nn.Linear(conditioned_dim, hidden_dim))
            setattr(self, f"head{level}", nn.Linear(hidden_dim, n_classes))

        # Shared activation / dropout applied after each head{k}_hidden layer.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def encode(self, x):
        """Return the shared trunk representation of ``x``."""
        return self.input_block(x)
def load_npz_dict(path):
    """Load every array stored in an ``.npz`` archive into a plain dict.

    Args:
        path: Location of the ``.npz`` file (anything ``np.load`` accepts).

    Returns:
        A dict mapping each array name in the archive to its loaded array.
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the archive open.
    # Read every array inside the with-block so the handle is released
    # before returning.
    with np.load(path) as archive:
        return {name: archive[name] for name in archive.files}
def apply_mask(logits, allowed_mask):
    """Replace the logits of disallowed classes with ``-1e9``.

    Args:
        logits: Tensor of class scores.
        allowed_mask: Boolean tensor; ``True`` keeps the logit at that
            position, ``False`` overwrites it with ``-1e9``.

    Returns:
        A new tensor; ``logits`` itself is not modified.
    """
    blocked = torch.full_like(logits, -1e9)
    return logits.where(allowed_mask, blocked)
def strict_decode(model, x, mask23, mask34, mask45, mask56):
    """Greedily decode levels 2 through 6, restricting each to its parent's children.

    Level 2 is predicted from the shared encoding. For every deeper level the
    predicted parent class is embedded, concatenated with the encoding and
    passed through that level's hidden layer and head. The resulting logits
    are masked with the mask row selected by the predicted parent before
    softmax / argmax, so a prediction can only be a class the mask allows.

    ``model.dropout`` is applied after each hidden layer, so the caller is
    expected to have put the model in eval mode for deterministic output.

    Args:
        model: Object exposing ``encode``, ``head2``, ``y{k}_embed`` for
            k = 2..5, ``head{k}_hidden`` and ``head{k}`` for k = 3..6, plus
            ``relu`` and ``dropout``.
        x: Batch of input features.
        mask23, mask34, mask45, mask56: Boolean tensors indexed by parent
            class along dim 0; each row marks the allowed child classes.

    Returns:
        Dict with ``pred2``..``pred6`` (argmax class indices),
        ``probs2``..``probs6`` (softmax over the masked logits; level 2 is
        unmasked) and ``top5_idx`` (indices of the up-to-five highest masked
        level-6 logits).
    """
    shared = model.encode(x)

    # Level 2 has no parent, so it is scored straight from the encoding.
    level_logits = model.head2(shared)
    probs = {2: torch.softmax(level_logits, dim=1)}
    preds = {2: torch.argmax(level_logits, dim=1)}

    for level, parent_to_child in ((3, mask23), (4, mask34), (5, mask45), (6, mask56)):
        parent_pred = preds[level - 1]
        parent_vec = getattr(model, f"y{level - 1}_embed")(parent_pred)

        hidden = getattr(model, f"head{level}_hidden")(torch.cat([shared, parent_vec], dim=1))
        hidden = model.dropout(model.relu(hidden))
        level_logits = getattr(model, f"head{level}")(hidden)

        # One mask row per sample, chosen by that sample's predicted parent.
        allowed = torch.index_select(parent_to_child, 0, parent_pred)
        level_logits = apply_mask(level_logits, allowed)

        probs[level] = torch.softmax(level_logits, dim=1)
        preds[level] = torch.argmax(level_logits, dim=1)

    # After the loop level_logits holds the masked level-6 logits.
    k = min(5, level_logits.shape[1])
    top5_idx = torch.topk(level_logits, k=k, dim=1)[1]

    return {
        "pred2": preds[2],
        "pred3": preds[3],
        "pred4": preds[4],
        "pred5": preds[5],
        "pred6": preds[6],
        "probs2": probs[2],
        "probs3": probs[3],
        "probs4": probs[4],
        "probs5": probs[5],
        "probs6": probs[6],
        "top5_idx": top5_idx,
    }
def idx_to_code(arr, label_map):
    """Translate class indices into their label values.

    Args:
        arr: Iterable of class indices (anything ``int()`` accepts).
        label_map: Mapping whose ``"to_value"`` entry is indexed by the
            integer class index and yields the label value.

    Returns:
        List of label values, in the order of ``arr``.
    """
    def decode(raw_idx):
        return label_map["to_value"][int(raw_idx)]

    return list(map(decode, arr))
def main():
    """Evaluate the saved hierarchical embedding classifier on the test split.

    Reads the test features, labels and reference table from
    ``data/processed``, and the hierarchy masks, label maps, embedder
    metadata and ``hierarchical_embed_best.pt`` checkpoint from
    ``training/artifacts``. Every test row is decoded with ``strict_decode``.
    The reference table must contain the true codes in columns ``y2``..``y6``.

    Written to ``outputs/predictions``:

    * ``hierarchical_embed_test_predictions.csv`` - reference rows plus the
      predicted codes, top-class probabilities, per-level match flags and the
      top-5 level-6 candidates;
    * ``hierarchical_embed_test_summary.json`` - overall accuracies;
    * ``grouped_summaries_embed/summary_by_<level>.csv`` and
      ``top5_summary_by_<level>.csv`` - accuracies grouped by the true code at
      each level.

    The summary metrics and up to ten level-6 misclassifications are printed.
    """
    print("starting evaluation", flush=True)

    levels = ["y2", "y3", "y4", "y5", "y6"]

    project_dir = Path(__file__).resolve().parents[2]
    processed_dir = project_dir / "data" / "processed"
    artifacts_dir = project_dir / "training" / "artifacts"
    hierarchy_dir = artifacts_dir / "hierarchy"
    label_maps_dir = artifacts_dir / "label_maps"
    embedder_dir = artifacts_dir / "embedder"
    models_dir = artifacts_dir / "models"

    predictions_dir = project_dir / "outputs" / "predictions"
    predictions_dir.mkdir(parents=True, exist_ok=True)
    grouped_dir = predictions_dir / "grouped_summaries_embed"
    grouped_dir.mkdir(parents=True, exist_ok=True)

    X_test = np.load(processed_dir / "X_test_embed.npy")
    y_test = load_npz_dict(processed_dir / "y_test_embed.npz")
    test_ref = pd.read_csv(processed_dir / "test_embed_reference.csv")

    # NOTE(review): pickle.load executes arbitrary code from the file. These
    # artifacts must only ever come from this project's own training run.
    with open(hierarchy_dir / "hierarchy_embed.pkl", "rb") as f:
        hierarchy = pickle.load(f)
    with open(label_maps_dir / "label_maps_embed.pkl", "rb") as f:
        label_maps = pickle.load(f)
    with open(embedder_dir / "embed_metadata.pkl", "rb") as f:
        embed_metadata = pickle.load(f)

    input_dim = int(X_test.shape[1])
    n2, n3, n4, n5, n6 = (len(label_maps[level]["classes"]) for level in levels)

    test_ds = HierEmbedDataset(X_test, y_test)
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

    mask23, mask34, mask45, mask56 = (
        torch.tensor(hierarchy[name], dtype=torch.bool).to(DEVICE)
        for name in ("mask23", "mask34", "mask45", "mask56")
    )

    model = HierarchicalEmbedMLP(
        input_dim=input_dim,
        n2=n2,
        n3=n3,
        n4=n4,
        n5=n5,
        n6=n6,
    ).to(DEVICE)
    model_path = models_dir / "hierarchical_embed_best.pt"
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()

    # Per-level accumulators: predicted class index and its softmax probability.
    pred_idx = {level: [] for level in levels}
    pred_prob = {level: [] for level in levels}
    all_top5 = []

    with torch.no_grad():
        for batch in test_loader:
            # Only the features are needed here; the true labels used for
            # scoring come from the reference table.
            x = batch[0].to(DEVICE)
            out = strict_decode(model, x, mask23, mask34, mask45, mask56)
            for level in levels:
                suffix = level[1:]  # "y2" -> "2", matching the "pred2"/"probs2" keys
                pred_idx[level].extend(out[f"pred{suffix}"].cpu().numpy().tolist())
                pred_prob[level].extend(
                    torch.max(out[f"probs{suffix}"], dim=1)[0].cpu().numpy().tolist()
                )
            all_top5.extend(out["top5_idx"].cpu().numpy().tolist())

    out_df = test_ref.copy()
    for level in levels:
        out_df[f"pred_{level}"] = idx_to_code(pred_idx[level], label_maps[level])
    for level in levels:
        out_df[f"pred_prob_{level}"] = pred_prob[level]
    for level in levels:
        # Both sides are compared as strings because the CSV reader may have
        # parsed the true codes as numbers.
        out_df[f"match_{level}"] = (
            out_df[level].astype(str) == out_df[f"pred_{level}"].astype(str)
        ).astype(int)

    # Stringify the top-5 codes so that joining them and testing membership
    # behave like the match_* columns above even when the label map stores
    # non-string codes.
    y6_values = label_maps["y6"]["to_value"]
    pred_top5_y6_codes = [[str(y6_values[int(i)]) for i in row] for row in all_top5]
    out_df["pred_top5_y6"] = [" | ".join(codes) for codes in pred_top5_y6_codes]
    out_df["top5_contains_true_y6"] = [
        int(true_y6 in codes)
        for true_y6, codes in zip(out_df["y6"].astype(str), pred_top5_y6_codes)
    ]

    row_path = predictions_dir / "hierarchical_embed_test_predictions.csv"
    out_df.to_csv(row_path, index=False)

    summary = {
        "embedder_model_name": embed_metadata["model_name"],
        "embedding_dim": int(embed_metadata["embedding_dim"]),
        **{f"acc_{level}": float(out_df[f"match_{level}"].mean()) for level in levels},
        "top5_y6": float(out_df["top5_contains_true_y6"].mean()),
        "n_test_rows": int(len(out_df)),
    }
    summary_path = predictions_dir / "hierarchical_embed_test_summary.json"
    with open(summary_path, "w") as f:
        json.dump(summary, f, indent=2)

    # Named aggregations shared by every grouped table (column order matters
    # for the CSV layout).
    rate_aggs = {f"match_{level}_rate": (f"match_{level}", "mean") for level in levels}
    rate_aggs["top5_contains_true_y6_rate"] = ("top5_contains_true_y6", "mean")
    rate_cols = list(rate_aggs)

    for level in levels:
        grouped = (
            out_df
            .groupby(level, dropna=False)
            .agg(n_obs=(level, "size"), **rate_aggs)
            .reset_index()
            .sort_values(["n_obs", level], ascending=[False, True])
        )
        grouped[rate_cols] = grouped[rate_cols].round(4)
        grouped.to_csv(grouped_dir / f"summary_by_{level}.csv", index=False)

        # The top-5 table has the same groups, order and rounded values as
        # the full table, so it is taken as a column subset instead of
        # repeating the groupby.
        grouped_top5 = grouped[[level, "n_obs", "top5_contains_true_y6_rate"]]
        grouped_top5.to_csv(grouped_dir / f"top5_summary_by_{level}.csv", index=False)

    print("saved row-level predictions to:", row_path, flush=True)
    print("saved summary to:", summary_path, flush=True)
    print("saved grouped summaries to:", grouped_dir, flush=True)
    print("", flush=True)
    print("summary metrics:", flush=True)
    for k, v in summary.items():
        # These three entries are not rates, so they are printed verbatim.
        if k in ("embedder_model_name", "n_test_rows", "embedding_dim"):
            print(f"{k}: {v}", flush=True)
        else:
            print(f"{k}: {v:.4f}", flush=True)

    print("", flush=True)
    print("sample errors:", flush=True)
    err_df = out_df[out_df["match_y6"] == 0].head(10)
    show_cols = [
        "company_name",
        "naics_2022",
        "pred_y6",
        "y2",
        "pred_y2",
        "y3",
        "pred_y3",
        "company_description",
    ]
    # Show only the columns that the reference table actually provides.
    show_cols = [c for c in show_cols if c in err_df.columns]
    print(err_df[show_cols].to_string(index=False), flush=True)
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()