| |
| import os |
| import json |
| import pickle |
| import numpy as np |
| import pandas as pd |
| from sklearn.preprocessing import LabelEncoder |
|
|
|
|
class EssayCatBoostModel:
    """Bundle of per-fold CatBoost models plus the metadata needed to reuse them.

    The on-disk layout is a directory containing one ``catboost_fold_{i}.cbm``
    file per fold and a ``metadata.json`` file. The ``save_pretrained`` /
    ``from_pretrained`` method names follow the HuggingFace convention; this
    class itself only reads from and writes to a local directory.
    """

    def __init__(self):
        # One fitted CatBoost model per cross-validation fold.
        self.cat_models = []
        # Label vocabularies stored alongside the models (persisted as-is).
        self.task_classes = []
        self.prompt_classes = []
        # Column order expected by the models; None means "use X unchanged".
        self.feature_names = None

    @staticmethod
    def _json_ready(values):
        """Return *values* as a plain list of JSON-serializable Python scalars.

        NumPy arrays / pandas indexes (anything exposing ``tolist``) and NumPy
        scalar elements are converted, because ``json.dump`` rejects them.
        ``None`` is passed through unchanged.
        """
        if values is None:
            return None
        if hasattr(values, "tolist"):
            values = values.tolist()
        return [v.item() if isinstance(v, np.generic) else v for v in values]

    def save_pretrained(self, save_directory: str) -> None:
        """Write all fold models and ``metadata.json`` into *save_directory*.

        The directory is created if it does not exist. Class lists and feature
        names are normalised to plain lists first, so NumPy arrays (for example
        ``LabelEncoder.classes_``) can be stored without a ``TypeError``.
        """
        os.makedirs(save_directory, exist_ok=True)

        # One .cbm file per fold, numbered in list order.
        for i, model in enumerate(self.cat_models):
            model_path = os.path.join(save_directory, f"catboost_fold_{i}.cbm")
            model.save_model(model_path)

        # n_folds tells from_pretrained how many fold files to read back.
        metadata = {
            "n_folds": len(self.cat_models),
            "task_classes": self._json_ready(self.task_classes),
            "prompt_classes": self._json_ready(self.prompt_classes),
            "feature_names": self._json_ready(self.feature_names),
        }
        metadata_path = os.path.join(save_directory, "metadata.json")
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

        print(f"CatBoost модель сохранена в {save_directory}")

    @classmethod
    def from_pretrained(cls, model_directory: str):
        """Build an instance from a directory written by ``save_pretrained``.

        Reads ``metadata.json``, then loads ``n_folds`` files named
        ``catboost_fold_{i}.cbm`` as ``CatBoostRegressor`` models.
        *model_directory* must be a local path; nothing is downloaded here.

        Raises:
            KeyError: if ``metadata.json`` has no ``n_folds`` entry.
        """
        # Imported lazily so the module can be imported without catboost.
        import catboost as cb

        instance = cls()

        metadata_path = os.path.join(model_directory, "metadata.json")
        with open(metadata_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        instance.task_classes = metadata.get("task_classes", [])
        instance.prompt_classes = metadata.get("prompt_classes", [])
        instance.feature_names = metadata.get("feature_names", None)

        n_folds = metadata["n_folds"]
        for i in range(n_folds):
            model_path = os.path.join(model_directory, f"catboost_fold_{i}.cbm")
            model = cb.CatBoostRegressor()
            model.load_model(model_path)
            instance.cat_models.append(model)

        print(f"CatBoost модель загружена из {model_directory} ({n_folds} фолдов)")
        return instance

    def predict(self, X: pd.DataFrame):
        """Return the mean of all fold models' predictions for *X*.

        When ``feature_names`` is set, *X* is first restricted and reordered
        to exactly those columns.

        Raises:
            ValueError: if no fold models are present.
        """
        if not self.cat_models:
            raise ValueError("Модель не загружена")

        if self.feature_names is not None:
            X = X[self.feature_names]

        n_models = len(self.cat_models)
        preds = np.zeros(len(X))
        for model in self.cat_models:
            # Each fold contributes an equal share of the final prediction.
            preds += model.predict(X) / n_models
        return preds