Spaces:

Nucha
/

WordCloud_BarChart

Sleeping

App Files Files Community

Nucha committed on Aug 29, 2025

Commit

67d52df

verified ·

1 Parent(s): f8c1ed5

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -73

app.py CHANGED Viewed

@@ -1,105 +1,80 @@
 import os
-# --- Gradio UI ---
-with gr.Blocks(title="Soft Skills Word Cloud") as demo:
-gr.Markdown("""
-# 🧩 Soft Skills Word Cloud
-Upload your CSV (or place `Soft_Skills__Top_5000_.csv` in the repo) to generate a word cloud.
-- **Skill column**: text names of skills (e.g., `communication`, `teamwork`).
-- **Count column** (optional): frequency/weight of each skill.
-""")
-df_state = gr.State()
-with gr.Row():
-csv_file = gr.File(label="CSV file", file_types=[".csv"], interactive=True)
-font_file = gr.File(label="Font (optional .ttf)")
-with gr.Row():
-skill_col = gr.Dropdown(choices=[], label="Skill column", interactive=True)
-count_col = gr.Dropdown(choices=["(none)"], value="(none)", label="Count column (optional)", interactive=True)
-with gr.Accordion("Advanced options", open=False):
-with gr.Row():
-max_words = gr.Slider(50, 1000, value=DEFAULT_MAX_WORDS, step=10, label="Max words")
-width = gr.Slider(400, 3000, value=1400, step=50, label="Width")
-height = gr.Slider(300, 2000, value=800, step=50, label="Height")
-with gr.Row():
-bg = gr.Textbox(value=DEFAULT_BG, label="Background color (e.g., white or #111827)")
-seed = gr.Number(value=42, precision=0, label="Random seed")
-stopwords = gr.Textbox(value="", label="Stopwords to exclude (comma-separated)")
-with gr.Row():
-btn_load = gr.Button("Load / Preview CSV", variant="secondary")
-btn_generate = gr.Button("Generate Word Cloud", variant="primary")
-preview = gr.Dataframe(label="CSV preview", interactive=False, wrap=True, max_rows=10)
-with gr.Row():
-img = gr.Image(type="filepath", label="Word Cloud", show_download_button=True)
-download = gr.File(label="Download image")
-top_table = gr.Dataframe(label="Top skills (weighted)", interactive=False)
-# --- events ---
-def on_load(file):
-df = _load_csv(file)
-s_col, c_col = _guess_columns(df)
-df_state_value = df
-# choices
-choices = list(df.columns)
-skill_update = gr.update(choices=choices, value=s_col)
-count_choices = ["(none)"] + choices
-count_value = c_col if c_col else "(none)"
-count_update = gr.update(choices=count_choices, value=count_value)
-return df_state_value, skill_update, count_update, df.head(10)
-btn_load.click(on_load, inputs=[csv_file], outputs=[df_state, skill_col, count_col, preview])
-def on_generate(df, file, s_col, c_col, max_w, w, h, bg_color, font, stop, seed_val):
-# df from State may be None if user didn't click Load; try to load now
-if df is None:
-df = _load_csv(file)
-# still need to guess columns
-s_guess, c_guess = _guess_columns(df)
-s_col = s_col or s_guess
-if (not c_col) or c_col == "(none)":
-c_col = c_guess
-if c_col == "(none)":
-c_col = None
-freq_df = build_frequencies(df, s_col, c_col, stop)
-out_path = generate_wordcloud_image(freq_df, font, int(w), int(h), int(max_w), bg_color, int(seed_val))
-# Show top 100 rows for quick inspection
-top_show = freq_df.head(100)
-return out_path, out_path, top_show
-btn_generate.click(
-on_generate,
-inputs=[df_state, csv_file, skill_col, count_col, max_words, width, height, bg, font_file, stopwords, seed],
-outputs=[img, download, top_table],
-)
-if __name__ == "__main__":
 demo.launch()

 import os
+import io
+import re
+from typing import Dict, Optional, Tuple
+import gradio as gr
+import pandas as pd
+from wordcloud import WordCloud
+DEFAULT_CSV_PATH = "Soft_Skills__Top_5000.csv" # auto-load if present
+DEFAULT_MAX_WORDS = 400
+DEFAULT_BG = "white"
+# --- helpers ---
+LIKELY_SKILL_COLS = {"skill", "skills", "soft skill", "soft skills", "name", "keyword", "term"}
+LIKELY_COUNT_COLS = {"count", "counts", "frequency", "freq", "weight", "n"}
def _find_column(cols, candidates):
    """Return the column in *cols* that best matches one of *candidates*.

    Matching is case-insensitive and ignores whitespace around the column
    label. An exact match on *any* candidate always wins over a substring
    match, so a column literally named ``count`` is never shadowed by a
    column that merely contains a shorter candidate.

    Args:
        cols: Iterable of column labels (e.g. ``df.columns``). Non-string
            labels are compared through their ``str()`` form.
        candidates: Iterable of lower-case names to look for. A set is
            scanned in a fixed order (longest first, then alphabetical) so
            the result does not depend on hash ordering; any other iterable
            keeps the caller's order.

    Returns:
        The original label of the first matching column, or ``None`` when
        nothing matches.
    """
    normalized = [(str(label).strip().lower(), label) for label in cols]

    if isinstance(candidates, (set, frozenset)):
        # Sets have no stable iteration order across interpreter runs.
        ordered = sorted(candidates, key=lambda name: (-len(name), name))
    else:
        ordered = list(candidates)

    # Pass 1: exact matches for every candidate.
    for cand in ordered:
        for key, label in normalized:
            if key == cand:
                return label

    # Pass 2: substring matches. Single-character candidates (e.g. "n")
    # would match nearly any label, so they only count as exact matches.
    for cand in ordered:
        if len(cand) < 2:
            continue
        for key, label in normalized:
            if cand in key:
                return label

    return None
def _load_csv(file_obj: Optional[object]) -> pd.DataFrame:
    """Load the skills CSV from an upload, falling back to the bundled file.

    Args:
        file_obj: The uploaded file. May be a filesystem path (``str`` or
            ``os.PathLike``), an object exposing the path as ``.name``, or
            ``None`` when nothing was uploaded.

    Returns:
        The parsed CSV as a DataFrame.

    Raises:
        FileNotFoundError: If no upload was given and ``DEFAULT_CSV_PATH``
            does not exist.
    """
    if isinstance(file_obj, (str, os.PathLike)):
        path = file_obj
    else:
        # Wrapper objects (e.g. tempfile-like uploads) carry the path in .name.
        path = getattr(file_obj, "name", None)

    if path:
        return pd.read_csv(path)
    if os.path.exists(DEFAULT_CSV_PATH):
        return pd.read_csv(DEFAULT_CSV_PATH)
    # Build the message from the constant so it cannot drift from the path
    # that is actually probed.
    raise FileNotFoundError(
        f"CSV not provided. Upload a file or add {DEFAULT_CSV_PATH} to the repo."
    )
def _guess_columns(df: pd.DataFrame) -> Tuple[Optional[str], Optional[str]]:
    """Guess which columns hold the skill names and their weights.

    The skill column is the first header matching ``LIKELY_SKILL_COLS``;
    failing that, the first object-dtype column; failing that, the first
    column of the frame. The count column is the first header matching
    ``LIKELY_COUNT_COLS``.

    Args:
        df: The loaded CSV.

    Returns:
        ``(skill_col, count_col)``. ``skill_col`` is ``None`` only when the
        frame has no columns at all; ``count_col`` is ``None`` when no
        header looks like a count, or when the only match is the column
        already chosen as the skill column.
    """
    skill_col = _find_column(df.columns, LIKELY_SKILL_COLS)
    if not skill_col:
        text_cols = df.select_dtypes(include=["object"]).columns.tolist()
        if text_cols:
            skill_col = text_cols[0]
        elif len(df.columns) > 0:
            # No text-like column: fall back to the first column instead of
            # raising IndexError on an empty selection.
            skill_col = df.columns[0]
        else:
            skill_col = None

    count_col = _find_column(df.columns, LIKELY_COUNT_COLS)
    if count_col is not None and count_col == skill_col:
        # One column cannot serve as both the label and its weight.
        count_col = None
    return skill_col, count_col
def _clean_text(s: str) -> Optional[str]:
    """Normalize one skill label for counting.

    Punctuation other than hyphens is replaced by a space, runs of
    whitespace collapse to a single space, and the result is trimmed.
    Word characters (``\\w``, which includes underscores) and the Thai
    block U+0E00-U+0E7F are kept.

    Args:
        s: The raw cell value. Non-string values are converted with
            ``str()``.

    Returns:
        The cleaned text, or ``None`` when the input is a missing value
        (``None``, ``NaN`` or ``pd.NA``) so that a later ``dropna`` can
        still remove it instead of counting the literal word "nan".
    """
    if not isinstance(s, str):
        if s is None or s is pd.NA or (isinstance(s, float) and s != s):
            return None
        s = str(s)
    # Keep Thai/letters/numbers/hyphens; other punctuation becomes a space.
    s = re.sub(r"[^\w\u0E00-\u0E7F\s\-]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    # Trim last: the substitution above can leave padding at either end.
    return s.strip()
+def build_frequencies(df: pd.DataFrame, skill_col: str, count_col: Optional[str], stopwords_text: str) -> pd.DataFrame:
+if skill_col not in df.columns:
+raise ValueError(f"Skill column '{skill_col}' not found.")
+tmp = df.copy()
+tmp[skill_col] = tmp[skill_col].map(_clean_text)
+tmp = tmp.dropna(subset=[skill_col])
 demo.launch()