"""Score test queries against their candidate FAQs and write a submission.

For each row of the test CSV, the ``Query`` text is transformed with
``model["tokenizer"]`` and compared by cosine similarity against the rows of
``model["X_faq"]`` whose id (in ``model["faq_ids"]``) equals one of the row's
``FAQ1``..``FAQ3`` values. The highest similarity is the row's score; a score
strictly greater than ``model["thr"]`` yields prediction 1, otherwise 0.
The predictions are written to ``submission.csv`` with columns ``id``/``pred``.
"""
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Columns of the test CSV that hold the candidate FAQ ids for each query.
FAQ_COLUMNS = ("FAQ1", "FAQ2", "FAQ3")

test_df = pd.read_csv("/tmp/data/test.csv")

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only run this script against a model.pkl that comes from a trusted source.
with open("model.pkl", "rb") as f:
    model = pickle.load(f)

scores = []
for _, row in test_df.iterrows():
    # transform() is given a one-element list, so X_query has a single row.
    X_query = model["tokenizer"].transform([row["Query"]])

    # Boolean mask over the stored FAQs: True where the stored id equals any
    # of this row's candidate ids.
    # Assumes model["faq_ids"] is array-like and aligned row-for-row with
    # model["X_faq"] -- TODO confirm against the training script.
    is_cand = np.any(
        [model["faq_ids"] == row[col] for col in FAQ_COLUMNS], axis=0
    )

    if not is_cand.any():
        # None of the candidate ids is known to the model. Selecting zero
        # rows would make cosine_similarity / max() raise, so score the row
        # as -inf, which can never exceed the threshold (prediction 0).
        scores.append(-np.inf)
        continue

    # [0] selects the similarities of the single query row to each candidate.
    sim = cosine_similarity(X_query, model["X_faq"][is_cand])[0]
    scores.append(sim.max())

# Strictly-greater comparison: a score equal to the threshold predicts 0.
predict = (np.array(scores) > model["thr"]).astype(int)

# Ids are the zero-padded position of the row in the test file (testid0000...).
df = pd.DataFrame(
    [(f"testid{i:04}", v) for i, v in enumerate(predict)],
    columns=["id", "pred"],
)
df.to_csv("submission.csv", index=False)