Spaces:
Running
Running
Commit ·
6b37a61
1
Parent(s): 841e95d
Refine visualizations: horizontal bars, smart filtering, label truncation
Browse files- cleaned_data.csv +6 -6
- src/__pycache__/visualization.cpython-313.pyc +0 -0
- src/visualization.py +25 -6
cleaned_data.csv
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
name,age,tags,details
|
| 2 |
-
Alice,25,['HR' 'admin'],{'role': 'staff'}
|
| 3 |
-
Bob,30,['eng' 'dev'],{'role': 'dev'}
|
| 4 |
-
Charlie,35,['eng' 'lead'],{'role': 'lead'}
|
| 5 |
-
David,40,['mgmt'],{'role': 'manager'}
|
| 6 |
-
Eve,22,['HR'],{'role': 'staff'}
|
|
|
|
| 1 |
+
name,age,tags,details,longtext,shortcategory
|
| 2 |
+
Alice,25,['HR' 'admin'],{'role': 'staff'},This is a very long text string that should be filtered out from categorical plots because it is too long.,Cat A
|
| 3 |
+
Bob,30,['eng' 'dev'],{'role': 'dev'},Another very long text string that serves the same purpose of testing the filtration logic.,Cat B
|
| 4 |
+
Charlie,35,['eng' 'lead'],{'role': 'lead'},Yet another long string to ensure the average length is high enough.,Cat A
|
| 5 |
+
David,40,['mgmt'],{'role': 'manager'},Repeating the long string to have some frequency but still be long.,Cat C
|
| 6 |
+
Eve,22,['HR'],{'role': 'staff'},This is a very long text string that should be filtered out from categorical plots because it is too long.,Cat B
|
src/__pycache__/visualization.cpython-313.pyc
CHANGED
|
Binary files a/src/__pycache__/visualization.cpython-313.pyc and b/src/__pycache__/visualization.cpython-313.pyc differ
|
|
|
src/visualization.py
CHANGED
|
@@ -27,17 +27,36 @@ def generate_charts(df, profile):
|
|
| 27 |
figures.append(fig_hist)
|
| 28 |
|
| 29 |
# 3. Categorical Counts - Top 3
|
|
|
|
| 30 |
cat_cols = profile.get('categorical_columns', [])
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
unique_count = df[col].nunique()
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
figures.append(fig_bar)
|
| 37 |
else:
|
| 38 |
-
# For high cardinality, maybe just plot top 10?
|
| 39 |
counts = df[col].value_counts().head(10)
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
| 41 |
figures.append(fig_bar)
|
| 42 |
|
| 43 |
# 4. Text Length Distribution (Fallback for Text Data)
|
|
|
|
| 27 |
figures.append(fig_hist)
|
| 28 |
|
| 29 |
# 3. Categorical Counts - Top 3
|
| 30 |
+
# Filter out columns that are likely free text: skip object columns whose average string length is > 30 and that have more than 10 unique values
|
| 31 |
cat_cols = profile.get('categorical_columns', [])
|
| 32 |
+
valid_cat_cols = []
|
| 33 |
+
|
| 34 |
+
for col in cat_cols:
|
| 35 |
+
# Check average length if object
|
| 36 |
+
if df[col].dtype == 'object':
|
| 37 |
+
avg_len = df[col].astype(str).str.len().mean()
|
| 38 |
+
if avg_len > 30 and df[col].nunique() > 10:
|
| 39 |
+
continue # Skip likely text/path columns
|
| 40 |
+
valid_cat_cols.append(col)
|
| 41 |
+
|
| 42 |
+
for col in valid_cat_cols[:3]:
|
| 43 |
unique_count = df[col].nunique()
|
| 44 |
+
# For layout, horizontal bars are better for text labels
|
| 45 |
+
if unique_count < 50:
|
| 46 |
+
counts = df[col].value_counts().head(15)
|
| 47 |
+
# Truncate labels for display
|
| 48 |
+
short_labels = [str(x)[:30] + "..." if len(str(x)) > 30 else str(x) for x in counts.index]
|
| 49 |
+
|
| 50 |
+
fig_bar = px.bar(x=counts.values, y=short_labels, orientation='h',
|
| 51 |
+
labels={'x': 'Count', 'y': col}, title=f"Top Categories in {col}")
|
| 52 |
+
fig_bar.update_layout(yaxis=dict(autorange="reversed")) # Top to bottom
|
| 53 |
figures.append(fig_bar)
|
| 54 |
else:
|
|
|
|
| 55 |
counts = df[col].value_counts().head(10)
|
| 56 |
+
short_labels = [str(x)[:30] + "..." if len(str(x)) > 30 else str(x) for x in counts.index]
|
| 57 |
+
fig_bar = px.bar(x=counts.values, y=short_labels, orientation='h',
|
| 58 |
+
labels={'x': 'Count', 'y': col}, title=f"Top 10 Most Frequent in {col}")
|
| 59 |
+
fig_bar.update_layout(yaxis=dict(autorange="reversed"))
|
| 60 |
figures.append(fig_bar)
|
| 61 |
|
| 62 |
# 4. Text Length Distribution (Fallback for Text Data)
|