Upload src/predict.py with huggingface_hub
Browse files- src/predict.py +46 -11
src/predict.py
CHANGED
|
@@ -1,40 +1,75 @@
|
|
| 1 |
-
|
| 2 |
-
from src.preprocess import preprocess_input
|
| 3 |
import joblib
|
| 4 |
import pandas as pd
|
| 5 |
import yaml
|
| 6 |
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
def load_config(config_path: str = "config/config.yaml") -> dict:
|
| 9 |
with open(config_path, "r", encoding="utf-8") as f:
|
| 10 |
return yaml.safe_load(f)
|
| 11 |
|
| 12 |
|
| 13 |
-
def
|
| 14 |
config = load_config()
|
|
|
|
| 15 |
repo_id = config["model"]["repo_id"]
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
model_path = hf_hub_download(
|
| 19 |
repo_id=repo_id,
|
| 20 |
-
filename=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
)
|
|
|
|
| 22 |
model = joblib.load(model_path)
|
| 23 |
-
return model
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
processed_df = preprocess_input(input_df)
|
| 31 |
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
result = {
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
if hasattr(model, "predict_proba"):
|
| 37 |
-
proba = model.predict_proba(
|
| 38 |
result["probabilities"] = proba[0].tolist()
|
| 39 |
|
| 40 |
return result
|
|
|
|
| 1 |
+
import json
|
|
|
|
| 2 |
import joblib
|
| 3 |
import pandas as pd
|
| 4 |
import yaml
|
| 5 |
|
| 6 |
+
from huggingface_hub import hf_hub_download
|
| 7 |
+
from src.preprocess import preprocess_input
|
| 8 |
+
|
| 9 |
|
| 10 |
def load_config(config_path: str = "config/config.yaml") -> dict:
    """Parse the project YAML configuration file.

    Args:
        config_path: Path to the YAML config file; defaults to
            ``config/config.yaml`` relative to the working directory.

    Returns:
        The parsed configuration as a dict.
    """
    with open(config_path, "r", encoding="utf-8") as handle:
        parsed = yaml.safe_load(handle)
    return parsed
|
| 13 |
|
| 14 |
|
| 15 |
+
def load_model_and_info():
    """Download and load the trained model plus its metadata from the Hub.

    The loaded artifacts are memoized on the function object, so repeated
    calls (e.g. one per prediction request from ``predict_input``) do not
    re-download or re-deserialize the model.

    Returns:
        tuple: ``(model, model_info)`` where ``model`` is the joblib-loaded
        estimator and ``model_info`` is the JSON metadata dict (expected to
        contain at least ``"feature_columns"``; confirmed by usage in
        ``predict_input``).
    """
    # Serve from cache after the first successful load; a failed load leaves
    # no cache entry, so the next call retries from scratch.
    cached = getattr(load_model_and_info, "_cache", None)
    if cached is not None:
        return cached

    config = load_config()

    repo_id = config["model"]["repo_id"]
    model_filename = config["model"]["filename"]
    info_filename = config["model"]["info_filename"]

    # hf_hub_download resolves to a locally cached file path, downloading
    # from the Hub only when the file is not already cached.
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=model_filename,
        repo_type="model",
    )
    info_path = hf_hub_download(
        repo_id=repo_id,
        filename=info_filename,
        repo_type="model",
    )

    model = joblib.load(model_path)
    with open(info_path, "r", encoding="utf-8") as f:
        model_info = json.load(f)

    # Populate the cache only after both artifacts loaded successfully.
    load_model_and_info._cache = (model, model_info)
    return model, model_info
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def align_features_for_inference(input_df: pd.DataFrame, feature_columns: list[str]) -> pd.DataFrame:
    """Normalize column names and align a frame to the training feature set.

    Args:
        input_df: Inference-time DataFrame (left unmodified; a copy is used).
        feature_columns: Exact column list the model was trained on.

    Returns:
        A DataFrame whose columns match ``feature_columns`` exactly and in
        order; columns absent from the input are filled with 0.
    """
    frame = input_df.copy()

    # Canonicalize headers: trimmed, lower-cased, spaces -> underscores.
    frame.columns = [name.strip().lower().replace(" ", "_") for name in frame.columns]

    # One-hot encode, in case categorical columns are introduced later.
    frame = pd.get_dummies(frame, drop_first=False)

    # Force the exact training schema; unseen training features become 0.
    return frame.reindex(columns=feature_columns, fill_value=0)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def predict_input(input_df: pd.DataFrame) -> dict:
    """Run one prediction through the full inference pipeline.

    Args:
        input_df: Raw input features; only the first row of the aligned
            frame is reported back (indexing with ``[0]``).

    Returns:
        dict with ``"prediction"`` and ``"processed_input"``, plus
        ``"probabilities"`` when the model supports probability estimates.
    """
    model, model_info = load_model_and_info()

    # Project-level preprocessing, then alignment to the training schema.
    processed_df = preprocess_input(input_df)
    features = align_features_for_inference(
        processed_df,
        model_info["feature_columns"],
    )

    result = {
        "prediction": model.predict(features)[0],
        "processed_input": features.to_dict(orient="records")[0],
    }

    # Not every estimator exposes predict_proba; add it only when present.
    if hasattr(model, "predict_proba"):
        result["probabilities"] = model.predict_proba(features)[0].tolist()

    return result
|