Spaces: Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,7 +15,7 @@ LABEL_COLUMNS = [
|
|
| 15 |
"Risk_Category", "Risk_Drivers", "Investigation_Outcome"
|
| 16 |
]
|
| 17 |
TEXT_COLUMN = "Sanction_Context"
|
| 18 |
-
MODEL_DIR = "/tmp"
|
| 19 |
MODEL_PATH = os.path.join(MODEL_DIR, "logreg_model.pkl")
|
| 20 |
TFIDF_PATH = os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl")
|
| 21 |
ENCODERS_PATH = os.path.join(MODEL_DIR, "label_encoders.pkl")
|
|
@@ -23,10 +23,9 @@ ENCODERS_PATH = os.path.join(MODEL_DIR, "label_encoders.pkl")
|
|
| 23 |
# --- FastAPI App ---
|
| 24 |
app = FastAPI()
|
| 25 |
|
| 26 |
-
# ---
|
| 27 |
class TransactionData(BaseModel):
|
| 28 |
Sanction_Context: str
|
| 29 |
-
# Add all required metadata fields here if needed
|
| 30 |
|
| 31 |
class PredictionRequest(BaseModel):
|
| 32 |
transaction_data: TransactionData
|
|
@@ -44,19 +43,23 @@ def train_model(input: DataPathInput):
|
|
| 44 |
df = pd.read_csv(input.data_path)
|
| 45 |
df.dropna(subset=[TEXT_COLUMN] + LABEL_COLUMNS, inplace=True)
|
| 46 |
|
|
|
|
| 47 |
label_encoders = {}
|
| 48 |
for col in LABEL_COLUMNS:
|
| 49 |
le = LabelEncoder()
|
| 50 |
df[col] = le.fit_transform(df[col])
|
| 51 |
label_encoders[col] = le
|
| 52 |
|
|
|
|
| 53 |
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2), stop_words="english")
|
| 54 |
X_vec = tfidf.fit_transform(df[TEXT_COLUMN])
|
| 55 |
y = df[LABEL_COLUMNS]
|
| 56 |
|
|
|
|
| 57 |
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
|
| 58 |
model.fit(X_vec, y)
|
| 59 |
|
|
|
|
| 60 |
joblib.dump(model, MODEL_PATH)
|
| 61 |
joblib.dump(tfidf, TFIDF_PATH)
|
| 62 |
joblib.dump(label_encoders, ENCODERS_PATH)
|
|
@@ -100,7 +103,7 @@ def test_model(input: DataPathInput):
|
|
| 100 |
}
|
| 101 |
decoded_preds.append(decoded)
|
| 102 |
|
| 103 |
-
return {"predictions": decoded_preds[:5]}
|
| 104 |
except Exception as e:
|
| 105 |
raise HTTPException(status_code=500, detail=str(e))
|
| 106 |
|
|
|
|
| 15 |
"Risk_Category", "Risk_Drivers", "Investigation_Outcome"
|
| 16 |
]
|
| 17 |
TEXT_COLUMN = "Sanction_Context"
|
| 18 |
+
MODEL_DIR = "/tmp"
|
| 19 |
MODEL_PATH = os.path.join(MODEL_DIR, "logreg_model.pkl")
|
| 20 |
TFIDF_PATH = os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl")
|
| 21 |
ENCODERS_PATH = os.path.join(MODEL_DIR, "label_encoders.pkl")
|
|
|
|
| 23 |
# --- FastAPI App ---
|
| 24 |
app = FastAPI()
|
| 25 |
|
| 26 |
+
# --- Schemas ---
|
| 27 |
class TransactionData(BaseModel):
|
| 28 |
Sanction_Context: str
|
|
|
|
| 29 |
|
| 30 |
class PredictionRequest(BaseModel):
|
| 31 |
transaction_data: TransactionData
|
|
|
|
| 43 |
df = pd.read_csv(input.data_path)
|
| 44 |
df.dropna(subset=[TEXT_COLUMN] + LABEL_COLUMNS, inplace=True)
|
| 45 |
|
| 46 |
+
# Label Encoding
|
| 47 |
label_encoders = {}
|
| 48 |
for col in LABEL_COLUMNS:
|
| 49 |
le = LabelEncoder()
|
| 50 |
df[col] = le.fit_transform(df[col])
|
| 51 |
label_encoders[col] = le
|
| 52 |
|
| 53 |
+
# TF-IDF
|
| 54 |
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2), stop_words="english")
|
| 55 |
X_vec = tfidf.fit_transform(df[TEXT_COLUMN])
|
| 56 |
y = df[LABEL_COLUMNS]
|
| 57 |
|
| 58 |
+
# Train Model
|
| 59 |
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
|
| 60 |
model.fit(X_vec, y)
|
| 61 |
|
| 62 |
+
# Save
|
| 63 |
joblib.dump(model, MODEL_PATH)
|
| 64 |
joblib.dump(tfidf, TFIDF_PATH)
|
| 65 |
joblib.dump(label_encoders, ENCODERS_PATH)
|
|
|
|
| 103 |
}
|
| 104 |
decoded_preds.append(decoded)
|
| 105 |
|
| 106 |
+
return {"predictions": decoded_preds[:5]} # limit output
|
| 107 |
except Exception as e:
|
| 108 |
raise HTTPException(status_code=500, detail=str(e))
|
| 109 |
|