Spaces:

malware-USTH
/

mlp_csv

Sleeping

App Files Community

hieu3636 committed on Jan 31

Commit

96f75f0

verified ·

1 Parent(s): 616eb33

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -1

app.py CHANGED Viewed

@@ -47,12 +47,69 @@ SELECTED_FEATURES = [
 ]
 N_FEATURES = len(SELECTED_FEATURES)
 # =========================
 # PREDICTION FUNCTION
 # =========================
 def predict_csv(file):
-    df = pd.read_csv(file)
     # Drop label columns if exist
     df = df.drop(columns=["Label", "label", "class", "Class"], errors="ignore")

 ]
 N_FEATURES = len(SELECTED_FEATURES)
+# CLEAN NUMERIC (same as training)
+# =========================
# Compiled once at import time: clean_numeric is applied to every cell of the
# uploaded CSV, so the patterns are hoisted out of the per-cell call.
_WHITESPACE_RE = re.compile(r"\s+")
_SCIENTIFIC_RE = re.compile(r"^-?\d+(\.\d+)?[eE][+-]?\d+$")


def clean_numeric(val: object) -> "float | None":
    """Convert one raw CSV cell to a float, or return None if it is not numeric.

    The rules below are meant to mirror the cleaning applied to the training
    data (per the original author's note), so they are kept exactly as they
    were; only the implementation was tidied.

    Rules, applied in order:
      1. Missing values (``pd.isna``) give ``None``.
      2. All whitespace is removed, including inside the value
         (``"1 234"`` becomes ``"1234"``).
      3. Plain scientific notation (``"-1.5e-3"``) is parsed as-is.
      4. If the text contains more than one ``"."``, the dots are treated as
         thousands separators and removed (``"1.234.567"`` -> ``1234567.0``).
      5. If the text contains ``","`` but no ``"."``, the comma is treated as
         the decimal separator (``"3,14"`` -> ``3.14``).
      6. Whatever ``float()`` still rejects gives ``None``.

    NOTE(review): values that mix a comma with a single dot, such as
    ``"1,234.56"`` or ``"1.234,56"``, match none of the rules above and
    therefore come back as ``None``. That is existing behaviour; confirm
    against the training pipeline before changing it.

    Args:
        val: The raw cell value (a string when the CSV is read with
            ``dtype=str``, but any scalar is accepted via ``str(val)``).

    Returns:
        The parsed float, or ``None`` when the value is missing or cannot be
        parsed.
    """
    if pd.isna(val):
        return None

    # Removing every whitespace run also covers leading/trailing whitespace,
    # so a separate strip() is unnecessary.
    text = _WHITESPACE_RE.sub("", str(val))

    # Scientific notation: parse before any separator rewriting.
    if _SCIENTIFIC_RE.match(text):
        return float(text)

    # Several dots can only be thousands separators.
    if text.count(".") > 1:
        text = text.replace(".", "")

    # A comma without any dot is a decimal comma.
    if "," in text and "." not in text:
        text = text.replace(",", ".")

    try:
        return float(text)
    except ValueError:
        return None
+# =========================
+# LOAD & PREPROCESS CSV
+# =========================
def load_and_clean_csv(file):
    """Read a CSV and return a DataFrame of cleaned numeric feature columns.

    Args:
        file: Where to read the CSV from. Accepted forms:
            * a filesystem path (``str`` or ``os.PathLike``);
            * an object exposing the path as ``.name`` (presumably the
              upload wrapper handed over by the UI framework -- confirm
              against the caller);
            * anything else is passed to ``pandas.read_csv`` unchanged.

    Returns:
        A DataFrame in which
          * column names have had all whitespace removed,
          * the columns ``Label``, ``label``, ``class``, ``Class`` and
            ``file_name`` are dropped when present,
          * every remaining cell has been passed through ``clean_numeric``
            (a float, or a missing value when the cell is not parseable).
    """
    import os  # local import: only needed for the path-type check below

    # Work out what to hand to pandas. Real paths are checked first because
    # pathlib.Path also has a ``.name`` attribute, but it holds only the final
    # path component and would point at the wrong file.
    if isinstance(file, (str, os.PathLike)):
        source = file
    else:
        source = getattr(file, "name", file)

    # 1. Read CSV. sep=None lets the python engine sniff the delimiter;
    #    dtype=str keeps every cell as text so clean_numeric sees the raw
    #    formatting.
    df = pd.read_csv(
        source,
        sep=None,
        engine="python",
        dtype=str,
    )

    # 2. Clean header: trim the names and remove whitespace inside them.
    df.columns = (
        df.columns
        .astype(str)
        .str.strip()
        .str.replace(r"\s+", "", regex=True)
    )

    # 3. Drop label / identifier columns if they exist.
    df = df.drop(
        columns=["Label", "label", "class", "Class", "file_name"],
        errors="ignore",
    )

    # 4. Convert every remaining cell to a float (or a missing value).
    for col in df.columns:
        df[col] = df[col].apply(clean_numeric)

    return df
 # =========================
 # PREDICTION FUNCTION
 # =========================
 def predict_csv(file):
+    df = load_and_clean_csv(file)
     # Drop label columns if exist
     df = df.drop(columns=["Label", "label", "class", "Class"], errors="ignore")