File size: 5,991 Bytes
885de0d
92c3eaa
 
 
 
 
 
 
 
 
 
 
a3c60c5
 
 
92c3eaa
c323a8d
 
 
 
 
 
 
a3c60c5
 
c323a8d
a3c60c5
92c3eaa
c323a8d
 
 
92c3eaa
a3c60c5
 
 
 
 
c323a8d
 
a3c60c5
 
 
c323a8d
a3c60c5
 
 
 
 
 
 
 
 
c323a8d
a3c60c5
 
 
c323a8d
 
a3c60c5
 
 
 
 
 
92c3eaa
 
 
 
 
 
 
 
 
 
c323a8d
92c3eaa
 
 
c323a8d
92c3eaa
 
 
 
 
c323a8d
92c3eaa
a3c60c5
92c3eaa
 
 
 
 
 
c323a8d
 
 
 
 
 
 
92c3eaa
 
c323a8d
92c3eaa
 
 
 
 
c323a8d
92c3eaa
 
a3c60c5
c323a8d
a3c60c5
885de0d
a3c60c5
885de0d
 
 
a3c60c5
 
 
 
92c3eaa
a4a4ffd
a3c60c5
 
 
 
92c3eaa
 
 
 
 
 
a4a4ffd
a3c60c5
 
 
 
7312c7f
92c3eaa
 
 
 
 
 
a4a4ffd
a3c60c5
 
 
 
a4a4ffd
7312c7f
92c3eaa
7312c7f
a3c60c5
7312c7f
 
92c3eaa
 
 
 
 
 
 
7312c7f
92c3eaa
 
 
 
 
 
a3c60c5
 
 
 
 
7312c7f
a3c60c5
c323a8d
a3c60c5
7312c7f
92c3eaa
7312c7f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import sys
import joblib
import pandas as pd
import numpy as np
from fastapi import FastAPI
from pydantic import BaseModel
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

# ================================
# 1️⃣ Custom Preprocessing Functions
# ================================
def temp_cat(X):
    """Add an ``avg_temp_cat`` column that buckets ``avg_temp`` into named bands.

    The input is copied before the new column is attached. ``pd.DataFrame(X)``
    can share its underlying data with a DataFrame passed in, so without the
    copy the added column could show up in the caller's frame as well.

    Args:
        X: Tabular data accepted by ``pd.DataFrame`` that exposes an
            ``avg_temp`` column.

    Returns:
        A new DataFrame holding every input column plus the categorical
        ``avg_temp_cat`` column. Intervals are right-closed, i.e. ``(0, 5]``,
        ``(5, 10]``, ``(10, 20]``, ``(20, 30]``, ``(30, inf]``; values that are
        ``<= 0`` or missing fall outside every bin and are labelled NaN.

    Raises:
        KeyError: If ``avg_temp`` is not a column of ``X``.
    """
    # Work on a private copy so the caller's object is never modified.
    X = pd.DataFrame(X).copy()
    X['avg_temp_cat'] = pd.cut(
        X['avg_temp'],
        bins=[0, 5, 10, 20, 30, np.inf],
        labels=['very_cold', 'cold', 'warm', 'hot', 'very_hot']
    )
    return X

def clean(X):
    """Return ``X`` as a DataFrame without any row that has a missing value.

    Args:
        X: Tabular data accepted by ``pd.DataFrame``.

    Returns:
        A DataFrame containing only the fully populated rows; the surviving
        rows keep their original index labels.
    """
    frame = pd.DataFrame(X)
    complete_rows_only = frame.dropna(axis="index", how="any")
    return complete_rows_only

def proxy_humidity(X):
    """Add a ``proxy_humidity`` column: rainfall divided by ``avg_temp + 1``.

    The input is copied before the new column is attached. ``pd.DataFrame(X)``
    can share its underlying data with a DataFrame passed in, so without the
    copy the added column could show up in the caller's frame as well.

    Args:
        X: Tabular data accepted by ``pd.DataFrame`` that exposes the columns
            ``average_rain_fall_mm_per_year`` and ``avg_temp``.

    Returns:
        A new DataFrame holding every input column plus ``proxy_humidity``.
        Rows where ``avg_temp`` equals -1 divide by zero and therefore yield
        a non-finite value rather than raising.

    Raises:
        KeyError: If either required column is missing.
    """
    # Work on a private copy so the caller's object is never modified.
    X = pd.DataFrame(X).copy()
    X["proxy_humidity"] = X["average_rain_fall_mm_per_year"] / (X["avg_temp"] + 1)
    return X


# ================================
# 2️⃣ Transformers and Pipelines
# ================================
# Temperature banding: add the avg_temp_cat column, then ordinal-encode the
# result; categories not seen during fit are encoded as -1.
temp_cat_pipeline = make_pipeline(
    FunctionTransformer(temp_cat),
    OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value'),
)

# Drop rows with missing values, then standardise.
clean_pipeline = make_pipeline(FunctionTransformer(clean), StandardScaler())

# Categorical columns: fill gaps with the most frequent value, then
# ordinal-encode; categories not seen during fit are encoded as -1.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value'),
)

# Rainfall-to-temperature ratio feature, then standardise.
proxy_humidity_pipeline = make_pipeline(FunctionTransformer(proxy_humidity), StandardScaler())

# Element-wise numeric transforms, each followed by standardisation.
square_pipeline = make_pipeline(
    FunctionTransformer(np.square),
    StandardScaler(),
)
log_pipeline = make_pipeline(
    FunctionTransformer(np.log1p),
    StandardScaler(),
)

# Fallback for numeric columns: standardise only.
default_num_pipeline = make_pipeline(StandardScaler())


# ================================
# 3️⃣ Custom Feature Selector
# ================================
class CorrelationThresholdSelector(BaseEstimator, TransformerMixin):
    """Select features by variance and by pairwise / target correlation.

    During ``fit`` the selector:

    1. marks every feature whose sample variance is ``<= min_variance`` for
       removal;
    2. walks the remaining features in column order and, for each one not yet
       handled, gathers the features whose absolute pairwise correlation with
       it exceeds ``threshold``;
    3. keeps, from each such group, the member with the highest absolute
       correlation with the target (ties broken by larger variance) and drops
       the rest. When ``target_threshold > 0`` and even that best member
       correlates with the target below ``target_threshold``, the whole group
       is dropped.

    Parameters
    ----------
    threshold : float, default=0.9
        Absolute pairwise correlation above which two features are treated as
        redundant.
    target_threshold : float, default=0.0
        Minimum absolute target correlation a group's best member needs in
        order to be kept. ``0`` disables the check.
    method : str, default="pearson"
        Correlation method forwarded to ``DataFrame.corr`` / ``corrwith``.
    min_variance : float, default=0.0
        Features with variance at or below this value are always dropped.

    Attributes
    ----------
    n_features_in_ : int
        Number of columns seen during ``fit``.
    feature_names_in_ : ndarray
        ``X.columns`` when available, otherwise generated ``f0, f1, ...``.
    selected_features_ : ndarray of int
        Sorted column indices retained by ``transform``.
    """

    def __init__(self, threshold=0.9, target_threshold=0.0, method="pearson", min_variance=0.0):
        self.threshold = threshold
        self.target_threshold = target_threshold
        self.method = method
        self.min_variance = min_variance

    def fit(self, X, y):
        """Determine which columns of ``X`` to keep.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        X_arr, y_arr = check_X_y(X, y, accept_sparse=False, dtype=np.float64)
        n_features = X_arr.shape[1]
        self.n_features_in_ = n_features
        self.feature_names_in_ = np.array(getattr(X, "columns", [f"f{i}" for i in range(n_features)]))

        X_df = pd.DataFrame(X_arr, columns=self.feature_names_in_)
        variances = X_df.var(numeric_only=True).to_numpy()
        low_var = set(np.flatnonzero(variances <= self.min_variance).tolist())

        # Request an owned copy: under pandas Copy-on-Write the array handed
        # back can be read-only, which would make fill_diagonal fail.
        corr_mat = X_df.corr(method=self.method).abs().to_numpy(copy=True)
        np.fill_diagonal(corr_mat, 0.0)

        y_series = pd.Series(y_arr)
        target_corr = X_df.corrwith(y_series, method=self.method).abs().fillna(0.0).to_numpy()

        visited, drops = set(), set()
        for i in range(n_features):
            if i in visited or i in low_var:
                continue
            correlated_idx = set(np.flatnonzero(corr_mat[i] > self.threshold).tolist())
            # Features already slated for removal must not compete to represent
            # the group: if one of them won, every surviving member would be
            # dropped in favour of a feature that is itself absent from the output.
            cluster = ({i} | correlated_idx) - low_var - drops
            visited |= cluster
            if len(cluster) > 1:
                best = max(cluster, key=lambda idx: (target_corr[idx], variances[idx]))
                if self.target_threshold > 0 and target_corr[best] < self.target_threshold:
                    drops |= cluster
                else:
                    drops |= cluster - {best}

        drops |= low_var
        self.selected_features_ = np.array(sorted(set(range(n_features)) - drops), dtype=int)
        return self

    def transform(self, X):
        """Return only the columns selected during ``fit``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_selected_features)

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        ValueError
            If ``X`` does not have the number of columns seen during ``fit``.
        """
        check_is_fitted(self, "selected_features_")
        X_arr = check_array(X, accept_sparse=False, dtype=np.float64)

        # Instances fitted before n_features_in_ was recorded still carry
        # feature_names_in_, whose length is the fitted column count.
        expected = getattr(self, "n_features_in_", None)
        if expected is None:
            names = getattr(self, "feature_names_in_", None)
            expected = None if names is None else len(names)
        if expected is not None and X_arr.shape[1] != expected:
            raise ValueError(
                f"X has {X_arr.shape[1]} features, but CorrelationThresholdSelector "
                f"was fitted with {expected} features."
            )
        return X_arr[:, self.selected_features_]


# ================================
# 4️⃣ Register Custom Functions for joblib
# ================================
# Expose the custom callables as attributes of the running ``__main__``
# module so that joblib can look them up there when loading the model
# (presumably the model was pickled from a script executed as __main__).
vars(sys.modules['__main__']).update(
    temp_cat=temp_cat,
    clean=clean,
    proxy_humidity=proxy_humidity,
    CorrelationThresholdSelector=CorrelationThresholdSelector,
)


# ================================
# 5️⃣ Initialize FastAPI
# ================================
# ASGI application object; the route decorators further down attach to it.
app = FastAPI(
    title="🌾 Crop Yield Predictor API",
    version="1.0",
)


# ================================
# 6️⃣ Load Model
# ================================
# NOTE: joblib.load unpickles arbitrary Python objects, so the model file
# must come from a trusted source.
try:
    model = joblib.load("CropYieldPredictor.pkl")
except Exception as e:
    # Deliberate best effort: keep the API importable and let /predict
    # report that no model is available.
    print(f"❌ Error loading model: {e}")
    model = None
else:
    # Kept outside the try body so that a failure while reporting success
    # is not mistaken for a load failure and does not discard the model.
    print("✅ Model loaded successfully!")


# ================================
# 7️⃣ Define Input Schema
# ================================
# Request body for POST /predict: one observation to score.
# predict_yield turns an instance into a single-row DataFrame whose columns
# are these field names, so the names (and presumably the order) must match
# what the pickled model was trained on — verify against the training code
# before renaming or reordering.
class CropInput(BaseModel):
    Area: str
    Item: str
    Year: int
    # Units below are taken from the field names only — TODO confirm against
    # the training data.
    average_rain_fall_mm_per_year: float
    pesticides_tonnes: float
    avg_temp: float


# ================================
# 8️⃣ Routes
# ================================
@app.get("/")
def home():
    # Root endpoint: returns a fixed status message.
    status_payload = {"message": "🌾 Crop Yield Predictor API is live and running!"}
    return status_payload


@app.post("/predict")
def predict_yield(data: CropInput):
    """Predict crop yield for a single observation.

    The response contains the model output in hg/ha together with the same
    value converted to kg/ha. If the model is unavailable or prediction
    fails, the response instead carries an ``error`` entry.
    """
    if model is None:
        return {"error": "Model not loaded properly!"}

    try:
        # Pydantic v2 deprecates ``.dict()`` in favour of ``.model_dump()``;
        # fall back to ``.dict()`` so the endpoint keeps working on v1.
        payload = data.model_dump() if hasattr(data, "model_dump") else data.dict()
        input_df = pd.DataFrame([payload])
        prediction = model.predict(input_df)[0]
        # 1 hg = 0.1 kg, so hg/ha -> kg/ha is a factor of 0.1.
        predicted_yield_kg_ha = prediction * 0.1

        return {
            "predicted_yield_hg_per_ha": float(prediction),
            "predicted_yield_kg_per_ha": float(predicted_yield_kg_ha),
            "message": "✅ Prediction successful!"
        }
    except Exception as e:
        # Endpoint boundary: report the failure in the response body instead
        # of letting it propagate.
        return {
            "error": str(e),
            "message": "❌ Prediction failed due to preprocessing or feature mismatch."
        }


# ================================
# 9️⃣ Local or Hugging Face Run
# ================================
if __name__ == "__main__":
    # Only when executed as a script: serve the app with uvicorn on every
    # network interface, port 7860.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)