clementBE committed on
Commit
3fb95a5
·
verified ·
1 Parent(s): d3a453e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -31
app.py CHANGED
@@ -1,55 +1,49 @@
1
  import gradio as gr
2
  import pandas as pd
3
- import matplotlib.pyplot as plt
4
  from sklearn.model_selection import train_test_split
5
  from sklearn.ensemble import RandomForestClassifier
6
  from sklearn.metrics import classification_report
7
- import io
8
 
9
  def load_data(file):
10
  if file is None:
11
- return None, []
12
  try:
13
  if file.name.endswith(".csv"):
14
  df = pd.read_csv(file.name)
15
  else:
16
  df = pd.read_excel(file.name)
17
- return df, list(df.columns)
 
18
  except Exception as e:
19
- return None, []
20
 
21
  def train_model(df, target_col, feature_cols):
22
- if df is None:
23
  return "Please upload a valid dataset first."
24
  if target_col not in df.columns:
25
  return "Target column not found in dataset."
26
  if not feature_cols:
27
  return "Please select at least one feature column."
28
 
29
- # Drop rows with NA in selected columns
30
  df_clean = df[[target_col] + feature_cols].dropna()
31
  if df_clean.empty:
32
- return "After removing rows with missing values, no data left to train."
33
 
34
  X = df_clean[feature_cols]
35
  y = df_clean[target_col]
36
 
37
- # Simple check for classification: target should be categorical or integer
38
  if y.nunique() < 2:
39
- return "Target column must have at least 2 unique classes for classification."
40
 
41
- # Encode categorical features if any
42
  X_enc = pd.get_dummies(X)
43
 
44
  try:
45
- X_train, X_test, y_train, y_test = train_test_split(
46
- X_enc, y, test_size=0.2, random_state=42
47
- )
48
  except ValueError as e:
49
  return f"Error splitting data: {e}"
50
 
51
  if X_train.shape[0] == 0 or X_test.shape[0] == 0:
52
- return "Train or test split resulted in empty dataset. Try reducing test size or adding more data."
53
 
54
  model = RandomForestClassifier(random_state=42)
55
  model.fit(X_train, y_train)
@@ -59,29 +53,36 @@ def train_model(df, target_col, feature_cols):
59
  return report
60
 
61
  with gr.Blocks() as demo:
62
- gr.Markdown("# XLSX/CSV Classifier with Sklearn")
63
 
64
  df_state = gr.State(None)
65
- cols_state = gr.State([])
66
 
67
  with gr.Row():
68
- file_input = gr.File(label="Upload CSV or Excel")
69
- column_selector = gr.Dropdown(label="Target Column", interactive=True)
70
  with gr.Row():
71
- features_selector = gr.CheckboxGroup(label="Feature Columns", interactive=True)
72
- train_btn = gr.Button("Train Classifier")
73
- output_text = gr.Textbox(label="Classification Report", lines=10)
74
-
75
- def on_file_upload(file):
76
- df, columns = load_data(file)
77
- return df, columns, columns, []
78
-
79
- file_input.change(on_file_upload, inputs=file_input, outputs=[df_state, column_selector, features_selector])
 
 
 
 
 
 
 
 
 
80
 
81
  train_btn.click(
82
- train_model,
83
- inputs=[df_state, column_selector, features_selector],
84
- outputs=output_text,
85
  )
86
 
87
  demo.launch()
 
1
  import gradio as gr
2
  import pandas as pd
 
3
  from sklearn.model_selection import train_test_split
4
  from sklearn.ensemble import RandomForestClassifier
5
  from sklearn.metrics import classification_report
 
6
 
7
  def load_data(file):
8
  if file is None:
9
+ return None, [], pd.DataFrame()
10
  try:
11
  if file.name.endswith(".csv"):
12
  df = pd.read_csv(file.name)
13
  else:
14
  df = pd.read_excel(file.name)
15
+ columns = list(df.columns)
16
+ return df, columns, df.head(100) # Show first 100 rows as preview
17
  except Exception as e:
18
+ return None, [], pd.DataFrame()
19
 
20
  def train_model(df, target_col, feature_cols):
21
+ if df is None or df.empty:
22
  return "Please upload a valid dataset first."
23
  if target_col not in df.columns:
24
  return "Target column not found in dataset."
25
  if not feature_cols:
26
  return "Please select at least one feature column."
27
 
 
28
  df_clean = df[[target_col] + feature_cols].dropna()
29
  if df_clean.empty:
30
+ return "No data left after removing missing values."
31
 
32
  X = df_clean[feature_cols]
33
  y = df_clean[target_col]
34
 
 
35
  if y.nunique() < 2:
36
+ return "Target must have at least 2 classes."
37
 
 
38
  X_enc = pd.get_dummies(X)
39
 
40
  try:
41
+ X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.2, random_state=42)
 
 
42
  except ValueError as e:
43
  return f"Error splitting data: {e}"
44
 
45
  if X_train.shape[0] == 0 or X_test.shape[0] == 0:
46
+ return "Empty train or test set after splitting."
47
 
48
  model = RandomForestClassifier(random_state=42)
49
  model.fit(X_train, y_train)
 
53
  return report
54
 
55
  with gr.Blocks() as demo:
56
+ gr.Markdown("# XLSX/CSV Classification App with Table Preview")
57
 
58
  df_state = gr.State(None)
 
59
 
60
  with gr.Row():
61
+ file_input = gr.File(label="Upload CSV or Excel file")
 
62
  with gr.Row():
63
+ table_preview = gr.DataFrame(headers=None, datatype=["str"], interactive=False, label="Data Preview")
64
+ with gr.Row():
65
+ target_col = gr.Dropdown(label="Select Target Column", choices=[])
66
+ with gr.Row():
67
+ feature_cols = gr.CheckboxGroup(label="Select Feature Columns", choices=[])
68
+ train_btn = gr.Button("Train Model")
69
+ output = gr.Textbox(label="Classification Report", lines=10)
70
+
71
+ def on_file_change(file):
72
+ df, columns, preview = load_data(file)
73
+ # Store df in state
74
+ return df, columns, columns, preview
75
+
76
+ file_input.change(
77
+ fn=on_file_change,
78
+ inputs=file_input,
79
+ outputs=[df_state, target_col, feature_cols, table_preview]
80
+ )
81
 
82
  train_btn.click(
83
+ fn=train_model,
84
+ inputs=[df_state, target_col, feature_cols],
85
+ outputs=output
86
  )
87
 
88
  demo.launch()