Spaces:

salihfurkaan
/

auto-data-analyst

Running

App Files Community

salihfurkaan committed on Feb 7

Commit

6b37a61

1 Parent(s): 841e95d

Refine visualizations: horizontal bars, smart filtering, label truncation

Browse files

Files changed (3) hide show

cleaned_data.csv +6 -6
src/__pycache__/visualization.cpython-313.pyc +0 -0
src/visualization.py +25 -6

cleaned_data.csv CHANGED Viewed

@@ -1,6 +1,6 @@
-name,age,tags,details
-Alice,25,['HR' 'admin'],{'role': 'staff'}
-Bob,30,['eng' 'dev'],{'role': 'dev'}
-Charlie,35,['eng' 'lead'],{'role': 'lead'}
-David,40,['mgmt'],{'role': 'manager'}
-Eve,22,['HR'],{'role': 'staff'}

+name,age,tags,details,longtext,shortcategory
+Alice,25,['HR' 'admin'],{'role': 'staff'},This is a very long text string that should be filtered out from categorical plots because it is too long.,Cat A
+Bob,30,['eng' 'dev'],{'role': 'dev'},Another very long text string that serves the same purpose of testing the filtration logic.,Cat B
+Charlie,35,['eng' 'lead'],{'role': 'lead'},Yet another long string to ensure the average length is high enough.,Cat A
+David,40,['mgmt'],{'role': 'manager'},Repeating the long string to have some frequency but still be long.,Cat C
+Eve,22,['HR'],{'role': 'staff'},This is a very long text string that should be filtered out from categorical plots because it is too long.,Cat B

src/__pycache__/visualization.cpython-313.pyc CHANGED Viewed

Binary files a/src/__pycache__/visualization.cpython-313.pyc and b/src/__pycache__/visualization.cpython-313.pyc differ

src/visualization.py CHANGED Viewed

@@ -27,17 +27,36 @@ def generate_charts(df, profile):
         figures.append(fig_hist)
     # 3. Categorical Counts - Top 3
     cat_cols = profile.get('categorical_columns', [])
-    for col in cat_cols[:3]:
         unique_count = df[col].nunique()
-        if unique_count < 50: # Don't plot high cardinality
-            counts = df[col].value_counts().head(15) # Top 15
-            fig_bar = px.bar(x=counts.index, y=counts.values, labels={'x': col, 'y': 'Count'}, title=f"Top Categories in {col}")
             figures.append(fig_bar)
         else:
-             # For high cardinality, maybe just plot top 10?
              counts = df[col].value_counts().head(10)
-             fig_bar = px.bar(x=counts.index, y=counts.values, labels={'x': col, 'y': 'Count'}, title=f"Top 10 Most Frequent in {col}")
              figures.append(fig_bar)
     # 4. Text Length Distribution (Fallback for Text Data)

         figures.append(fig_hist)
     # 3. Categorical Counts - Top 3
+    # Filter out columns that are likely free text (avg string length > 30), unless the unique count is low (<= 10)
     cat_cols = profile.get('categorical_columns', [])
+    valid_cat_cols = []
+    for col in cat_cols:
+        # Check average length if object
+        if df[col].dtype == 'object':
+             avg_len = df[col].astype(str).str.len().mean()
+             if avg_len > 30 and df[col].nunique() > 10:
+                 continue # Skip likely text/path columns
+        valid_cat_cols.append(col)
+    for col in valid_cat_cols[:3]:
         unique_count = df[col].nunique()
+        # For layout, horizontal bars are better for text labels
+        if unique_count < 50:
+            counts = df[col].value_counts().head(15)
+            # Truncate labels for display
+            short_labels = [str(x)[:30] + "..." if len(str(x)) > 30 else str(x) for x in counts.index]
+            fig_bar = px.bar(x=counts.values, y=short_labels, orientation='h',
+                             labels={'x': 'Count', 'y': col}, title=f"Top Categories in {col}")
+            fig_bar.update_layout(yaxis=dict(autorange="reversed")) # Top to bottom
             figures.append(fig_bar)
         else:
              counts = df[col].value_counts().head(10)
+             short_labels = [str(x)[:30] + "..." if len(str(x)) > 30 else str(x) for x in counts.index]
+             fig_bar = px.bar(x=counts.values, y=short_labels, orientation='h',
+                              labels={'x': 'Count', 'y': col}, title=f"Top 10 Most Frequent in {col}")
+             fig_bar.update_layout(yaxis=dict(autorange="reversed"))
              figures.append(fig_bar)
     # 4. Text Length Distribution (Fallback for Text Data)