Spaces:

jwrth
/

naics_embeddings

Sleeping

naics_embeddings / training /scripts /evaluate_flat_embed.py

Joseph Warth

updated to flat embedding

a6067aa about 1 month ago

7.84 kB

	from pathlib import Path
	import json
	import pickle
	import numpy as np
	import pandas as pd
	import torch
	import torch.nn as nn
	from torch.utils.data import Dataset, DataLoader


	# Number of test rows scored per forward pass (passed to the DataLoader).
	BATCH_SIZE = 64
	# Default hidden-layer width of FlatEmbedMLP.
	HIDDEN_DIM = 768
	# Default dropout probability of FlatEmbedMLP.
	DROPOUT = 0.1

	# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    DEVICE = "cuda"
	else:
	    DEVICE = "cpu"

	# Restrict torch's CPU intra-op parallelism to a single thread.
	torch.set_num_threads(1)


	class FlatEmbedDataset(Dataset):
	    """In-memory dataset pairing feature rows with integer class labels.

	    ``X`` is converted to a ``float32`` tensor and ``y`` to a ``long``
	    tensor once, at construction time. Indexing returns the
	    ``(features, label)`` pair stored at that position.
	    """

	    def __init__(self, X, y):
	        features = torch.tensor(X, dtype=torch.float32)
	        labels = torch.tensor(y, dtype=torch.long)
	        self.X = features
	        self.y = labels

	    def __len__(self):
	        # The dataset length is the size of the first dimension of X.
	        return self.X.size(0)

	    def __getitem__(self, idx):
	        sample = self.X[idx]
	        target = self.y[idx]
	        return sample, target


	class FlatEmbedMLP(nn.Module):
	    """Feed-forward classifier producing one logit per class.

	    Architecture: two ``Linear -> ReLU -> Dropout`` stages of width
	    ``hidden_dim`` followed by a final ``Linear`` projection to
	    ``n_classes``. ``forward`` returns raw logits (no softmax applied).
	    """

	    def __init__(self, input_dim, n_classes, hidden_dim=HIDDEN_DIM, dropout=DROPOUT):
	        super().__init__()

	        # The Linear layers land at Sequential positions 0, 3 and 6;
	        # state_dict keys ("net.0.weight", ...) are derived from those
	        # positions, so the layout must stay fixed for saved weights to load.
	        widths = [input_dim, hidden_dim, hidden_dim]
	        stack = []
	        for in_features, out_features in zip(widths, widths[1:]):
	            stack.append(nn.Linear(in_features, out_features))
	            stack.append(nn.ReLU())
	            stack.append(nn.Dropout(dropout))
	        stack.append(nn.Linear(hidden_dim, n_classes))

	        self.net = nn.Sequential(*stack)

	    def forward(self, x):
	        logits = self.net(x)
	        return logits


	# Label levels scored by this script. Level "yN" is compared against the
	# first N characters of the predicted y6 code.
	_LEVELS = ("y2", "y3", "y4", "y5", "y6")


	def _load_pickle(path):
	    """Unpickle and return the object stored at ``path``."""
	    # NOTE(security): pickle.load can execute arbitrary code. Only use it on
	    # artifacts written by a trusted training run.
	    with open(path, "rb") as f:
	        return pickle.load(f)


	def _predict(model, loader, top_k=5):
	    """Run ``model`` over ``loader`` and collect per-row predictions.

	    Returns three parallel lists in loader order: the arg-max class index,
	    the softmax probability of that class, and the list of the ``top_k``
	    highest-logit class indices (fewer when the model has fewer classes).
	    """
	    pred_idx_all = []
	    pred_prob_all = []
	    topk_idx_all = []

	    with torch.no_grad():
	        # The labels yielded by the loader are not needed for inference.
	        for features, _ in loader:
	            features = features.to(DEVICE)

	            logits = model(features)
	            probs = torch.softmax(logits, dim=1)

	            pred_idx = torch.argmax(logits, dim=1)
	            k = min(top_k, logits.shape[1])
	            topk_idx = torch.topk(logits, k=k, dim=1).indices
	            pred_prob = torch.max(probs, dim=1).values

	            pred_idx_all.extend(pred_idx.cpu().tolist())
	            pred_prob_all.extend(pred_prob.cpu().tolist())
	            topk_idx_all.extend(topk_idx.cpu().tolist())

	    return pred_idx_all, pred_prob_all, topk_idx_all


	def _build_prediction_frame(test_ref, pred_codes, pred_probs, top_codes):
	    """Return a copy of ``test_ref`` with prediction and match columns added.

	    ``pred_codes`` / ``pred_probs`` hold the top-1 code and probability per
	    row; ``top_codes`` holds the list of top-k codes per row. All codes are
	    expected to be ``str``.
	    """
	    out_df = test_ref.copy()

	    out_df["pred_y6"] = pred_codes
	    out_df["pred_prob_y6"] = pred_probs
	    # Plain " | " separator: "\|" is not a valid Python escape sequence.
	    out_df["pred_top5_y6"] = [" | ".join(codes) for codes in top_codes]

	    # Derive the coarser-level predictions as prefixes of the flat y6 code.
	    pred_str = out_df["pred_y6"].astype(str)
	    for n_chars in (2, 3, 4, 5):
	        out_df[f"pred_y{n_chars}"] = pred_str.str[:n_chars]

	    # 0/1 match flags, compared as strings at every level.
	    for level in _LEVELS:
	        truth = out_df[level].astype(str)
	        predicted = out_df[f"pred_{level}"].astype(str)
	        out_df[f"match_{level}"] = (truth == predicted).astype(int)

	    out_df["top5_contains_true_y6"] = [
	        int(true_code in codes)
	        for true_code, codes in zip(out_df["y6"].astype(str), top_codes)
	    ]
	    return out_df


	def _summarize(out_df, embed_metadata):
	    """Build the overall metrics dict (accuracy per level and top-5 hit rate)."""
	    return {
	        "embedder_model_name": embed_metadata["model_name"],
	        "embedding_dim": int(embed_metadata["embedding_dim"]),
	        **{
	            f"acc_{level}": float(out_df[f"match_{level}"].mean())
	            for level in _LEVELS
	        },
	        "top5_y6": float(out_df["top5_contains_true_y6"].mean()),
	        "n_test_rows": int(len(out_df)),
	    }


	def _write_grouped_summaries(out_df, grouped_dir):
	    """Write per-level grouped match-rate CSVs into ``grouped_dir``.

	    For each level two files are written: ``summary_by_<level>.csv`` with
	    every rate column, and ``top5_summary_by_<level>.csv`` with only the
	    group key, ``n_obs`` and the top-5 hit rate.
	    """
	    rate_sources = [f"match_{level}" for level in _LEVELS]
	    rate_sources.append("top5_contains_true_y6")
	    agg_spec = {f"{src}_rate": (src, "mean") for src in rate_sources}
	    rate_cols = list(agg_spec)

	    for level in _LEVELS:
	        grouped = (
	            out_df
	            .groupby(level, dropna=False)
	            .agg(n_obs=(level, "size"), **agg_spec)
	            .reset_index()
	            .sort_values(["n_obs", level], ascending=[False, True])
	        )
	        grouped[rate_cols] = grouped[rate_cols].round(4)
	        grouped.to_csv(grouped_dir / f"summary_by_{level}.csv", index=False)

	        # The top-5 summary is a column subset of the full summary (same
	        # groups, same ordering, same rounding), so no second groupby.
	        top5_cols = [level, "n_obs", "top5_contains_true_y6_rate"]
	        grouped[top5_cols].to_csv(
	            grouped_dir / f"top5_summary_by_{level}.csv", index=False
	        )


	def main():
	    """Evaluate the saved flat-embedding classifier on the test split.

	    Loads the test embeddings, reference rows, label maps and embedder
	    metadata, restores ``flat_embed_best.pt``, predicts every test row and
	    writes under ``outputs/predictions``:

	    * ``flat_embed_test_predictions.csv`` - row-level predictions and flags
	    * ``flat_embed_test_summary.json`` - overall metrics
	    * ``grouped_summaries_flat/`` - per-level grouped match rates

	    It then prints the metrics and up to ten misclassified rows.

	    Raises:
	        ValueError: if the reference CSV and the embedding matrix have a
	            different number of rows.
	        KeyError: if the reference CSV lacks one of the y2..y6 columns.
	    """
	    print("starting evaluation", flush=True)

	    project_dir = Path(__file__).resolve().parents[2]

	    processed_dir = project_dir / "data" / "processed"
	    artifacts_dir = project_dir / "training" / "artifacts"
	    label_maps_dir = artifacts_dir / "label_maps"
	    embedder_dir = artifacts_dir / "embedder"
	    models_dir = artifacts_dir / "models"

	    predictions_dir = project_dir / "outputs" / "predictions"
	    predictions_dir.mkdir(parents=True, exist_ok=True)

	    grouped_dir = predictions_dir / "grouped_summaries_flat"
	    grouped_dir.mkdir(parents=True, exist_ok=True)

	    X_test = np.load(processed_dir / "X_test_embed.npy")
	    # Close the .npz archive as soon as the label array has been read.
	    with np.load(processed_dir / "y_test_embed.npz") as y_test_obj:
	        y_test = y_test_obj["y6"]

	    test_ref = pd.read_csv(processed_dir / "test_embed_reference.csv")

	    # Fail before running inference rather than at column assignment time.
	    if len(test_ref) != X_test.shape[0]:
	        raise ValueError(
	            f"test_embed_reference.csv has {len(test_ref)} rows but "
	            f"X_test_embed.npy has {X_test.shape[0]}"
	        )
	    missing_cols = [c for c in _LEVELS if c not in test_ref.columns]
	    if missing_cols:
	        raise KeyError(
	            f"test_embed_reference.csv is missing required columns: {missing_cols}"
	        )

	    label_maps = _load_pickle(label_maps_dir / "label_maps_embed.pkl")
	    embed_metadata = _load_pickle(embedder_dir / "embed_metadata.pkl")

	    input_dim = int(X_test.shape[1])
	    n_classes = len(label_maps["y6"]["classes"])

	    test_ds = FlatEmbedDataset(X_test, y_test)
	    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

	    model = FlatEmbedMLP(
	        input_dim=input_dim,
	        n_classes=n_classes,
	    ).to(DEVICE)

	    model_path = models_dir / "flat_embed_best.pt"
	    # weights_only=True restricts unpickling to tensors and plain containers,
	    # which is all a state_dict needs.
	    state_dict = torch.load(model_path, map_location=DEVICE, weights_only=True)
	    model.load_state_dict(state_dict)
	    model.eval()

	    all_pred_idx, all_pred_prob, all_top5_idx = _predict(model, test_loader)

	    # Map class indices back to codes. Codes are normalised to str so the
	    # top-5 membership test agrees with the string-based match flags.
	    to_value = label_maps["y6"]["to_value"]
	    pred_y6 = [str(to_value[int(i)]) for i in all_pred_idx]
	    pred_top5_y6_codes = [
	        [str(to_value[int(i)]) for i in row]
	        for row in all_top5_idx
	    ]

	    out_df = _build_prediction_frame(
	        test_ref, pred_y6, all_pred_prob, pred_top5_y6_codes
	    )

	    row_path = predictions_dir / "flat_embed_test_predictions.csv"
	    out_df.to_csv(row_path, index=False)

	    summary = _summarize(out_df, embed_metadata)

	    summary_path = predictions_dir / "flat_embed_test_summary.json"
	    with open(summary_path, "w", encoding="utf-8") as f:
	        json.dump(summary, f, indent=2)

	    _write_grouped_summaries(out_df, grouped_dir)

	    print("saved row-level predictions to:", row_path, flush=True)
	    print("saved summary to:", summary_path, flush=True)
	    print("saved grouped summaries to:", grouped_dir, flush=True)
	    print("", flush=True)
	    print("summary metrics:", flush=True)
	    # These entries are not rates, so they are printed without float formatting.
	    unformatted_keys = {"embedder_model_name", "n_test_rows", "embedding_dim"}
	    for k, v in summary.items():
	        if k in unformatted_keys:
	            print(f"{k}: {v}", flush=True)
	        else:
	            print(f"{k}: {v:.4f}", flush=True)

	    print("", flush=True)
	    print("sample errors:", flush=True)
	    err_df = out_df[out_df["match_y6"] == 0].head(10)
	    show_cols = [
	        "company_name",
	        "naics_2022",
	        "pred_y6",
	        "y2",
	        "pred_y2",
	        "y3",
	        "pred_y3",
	        "company_description",
	    ]
	    # Only display the columns that actually exist in the reference data.
	    show_cols = [c for c in show_cols if c in err_df.columns]
	    print(err_df[show_cols].to_string(index=False), flush=True)


	# Run the evaluation only when this file is executed as a script.
	if __name__ == "__main__":
	    main()