Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,8 @@ import os
|
|
| 7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 8 |
from sklearn.preprocessing import LabelEncoder
|
| 9 |
from sklearn.multioutput import MultiOutputClassifier
|
| 10 |
-
|
|
|
|
| 11 |
# --- Configuration ---
|
| 12 |
LABEL_COLUMNS = [
|
| 13 |
"Red_Flag_Reason", "Maker_Action", "Escalation_Level",
|
|
@@ -103,40 +104,46 @@ def health_check():
|
|
| 103 |
@app.post("/train")
|
| 104 |
def train():
|
| 105 |
try:
|
| 106 |
-
|
| 107 |
df = pd.read_csv(config.DATA_PATH)
|
| 108 |
|
| 109 |
-
#
|
| 110 |
X = df[config.TEXT_COLUMN]
|
| 111 |
y = df[config.LABEL_COLUMNS]
|
| 112 |
|
| 113 |
# Split the data
|
| 114 |
X_train, X_test, y_train, y_test = train_test_split(
|
| 115 |
-
X, y, test_size=
|
| 116 |
)
|
| 117 |
|
| 118 |
-
# TF-IDF
|
| 119 |
-
vectorizer = TfidfVectorizer(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
X_train_vec = vectorizer.fit_transform(X_train)
|
| 121 |
X_test_vec = vectorizer.transform(X_test)
|
| 122 |
|
| 123 |
-
# Train
|
| 124 |
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
|
| 125 |
model.fit(X_train_vec, y_train)
|
| 126 |
|
| 127 |
-
# Predict
|
| 128 |
y_pred = model.predict(X_test_vec)
|
|
|
|
|
|
|
| 129 |
accuracy = {
|
| 130 |
-
|
| 131 |
-
for i,
|
| 132 |
}
|
| 133 |
|
| 134 |
# Save model and vectorizer
|
| 135 |
joblib.dump(model, config.MODEL_PATH)
|
| 136 |
-
joblib.dump(vectorizer, config.
|
| 137 |
|
| 138 |
return {
|
| 139 |
-
"message": "
|
| 140 |
"accuracy": accuracy
|
| 141 |
}
|
| 142 |
|
|
|
|
| 7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 8 |
from sklearn.preprocessing import LabelEncoder
|
| 9 |
from sklearn.multioutput import MultiOutputClassifier
|
| 10 |
+
import config
|
| 11 |
+
from sklearn.metrics import accuracy_score
|
| 12 |
# --- Configuration ---
|
| 13 |
LABEL_COLUMNS = [
|
| 14 |
"Red_Flag_Reason", "Maker_Action", "Escalation_Level",
|
|
|
|
| 104 |
@app.post("/train")
|
| 105 |
def train():
|
| 106 |
try:
|
| 107 |
+
# Load data
|
| 108 |
df = pd.read_csv(config.DATA_PATH)
|
| 109 |
|
| 110 |
+
# Prepare features and labels
|
| 111 |
X = df[config.TEXT_COLUMN]
|
| 112 |
y = df[config.LABEL_COLUMNS]
|
| 113 |
|
| 114 |
# Split the data
|
| 115 |
X_train, X_test, y_train, y_test = train_test_split(
|
| 116 |
+
X, y, test_size=config.TEST_SIZE, random_state=config.RANDOM_STATE
|
| 117 |
)
|
| 118 |
|
| 119 |
+
# TF-IDF vectorizer
|
| 120 |
+
vectorizer = TfidfVectorizer(
|
| 121 |
+
max_features=config.TFIDF_MAX_FEATURES,
|
| 122 |
+
ngram_range=config.NGRAM_RANGE,
|
| 123 |
+
stop_words='english' if config.USE_STOPWORDS else None
|
| 124 |
+
)
|
| 125 |
X_train_vec = vectorizer.fit_transform(X_train)
|
| 126 |
X_test_vec = vectorizer.transform(X_test)
|
| 127 |
|
| 128 |
+
# Train model
|
| 129 |
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
|
| 130 |
model.fit(X_train_vec, y_train)
|
| 131 |
|
| 132 |
+
# Predict on test data
|
| 133 |
y_pred = model.predict(X_test_vec)
|
| 134 |
+
|
| 135 |
+
# Calculate accuracy for each label
|
| 136 |
accuracy = {
|
| 137 |
+
label: accuracy_score(y_test[label], [pred[i] for pred in y_pred])
|
| 138 |
+
for i, label in enumerate(config.LABEL_COLUMNS)
|
| 139 |
}
|
| 140 |
|
| 141 |
# Save model and vectorizer
|
| 142 |
joblib.dump(model, config.MODEL_PATH)
|
| 143 |
+
joblib.dump(vectorizer, config.TFIDF_VECTORIZER_PATH)
|
| 144 |
|
| 145 |
return {
|
| 146 |
+
"message": "Training completed successfully.",
|
| 147 |
"accuracy": accuracy
|
| 148 |
}
|
| 149 |
|