Nucha committed on
Commit
67d52df
·
verified ·
1 Parent(s): f8c1ed5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -73
app.py CHANGED
@@ -1,105 +1,80 @@
1
  import os
2
- # --- Gradio UI ---
3
- with gr.Blocks(title="Soft Skills Word Cloud") as demo:
4
- gr.Markdown("""
5
- # 🧩 Soft Skills Word Cloud
6
- Upload your CSV (or place `Soft_Skills__Top_5000_.csv` in the repo) to generate a word cloud.
7
- - **Skill column**: text names of skills (e.g., `communication`, `teamwork`).
8
- - **Count column** (optional): frequency/weight of each skill.
9
- """)
10
 
11
 
12
- df_state = gr.State()
 
 
13
 
14
 
15
- with gr.Row():
16
- csv_file = gr.File(label="CSV file", file_types=[".csv"], interactive=True)
17
- font_file = gr.File(label="Font (optional .ttf)")
18
 
19
 
20
- with gr.Row():
21
- skill_col = gr.Dropdown(choices=[], label="Skill column", interactive=True)
22
- count_col = gr.Dropdown(choices=["(none)"], value="(none)", label="Count column (optional)", interactive=True)
23
 
24
 
25
- with gr.Accordion("Advanced options", open=False):
26
- with gr.Row():
27
- max_words = gr.Slider(50, 1000, value=DEFAULT_MAX_WORDS, step=10, label="Max words")
28
- width = gr.Slider(400, 3000, value=1400, step=50, label="Width")
29
- height = gr.Slider(300, 2000, value=800, step=50, label="Height")
30
- with gr.Row():
31
- bg = gr.Textbox(value=DEFAULT_BG, label="Background color (e.g., white or #111827)")
32
- seed = gr.Number(value=42, precision=0, label="Random seed")
33
- stopwords = gr.Textbox(value="", label="Stopwords to exclude (comma-separated)")
34
 
35
 
36
- with gr.Row():
37
- btn_load = gr.Button("Load / Preview CSV", variant="secondary")
38
- btn_generate = gr.Button("Generate Word Cloud", variant="primary")
 
 
 
 
 
 
 
 
39
 
40
 
41
- preview = gr.Dataframe(label="CSV preview", interactive=False, wrap=True, max_rows=10)
42
 
43
 
44
- with gr.Row():
45
- img = gr.Image(type="filepath", label="Word Cloud", show_download_button=True)
46
- download = gr.File(label="Download image")
 
 
 
 
47
 
48
 
49
- top_table = gr.Dataframe(label="Top skills (weighted)", interactive=False)
50
 
51
 
52
- # --- events ---
53
- def on_load(file):
54
- df = _load_csv(file)
55
- s_col, c_col = _guess_columns(df)
56
- df_state_value = df
57
 
58
 
59
- # choices
60
- choices = list(df.columns)
61
- skill_update = gr.update(choices=choices, value=s_col)
62
- count_choices = ["(none)"] + choices
63
- count_value = c_col if c_col else "(none)"
64
- count_update = gr.update(choices=count_choices, value=count_value)
65
 
66
 
67
- return df_state_value, skill_update, count_update, df.head(10)
 
 
 
 
 
 
 
68
 
69
 
70
- btn_load.click(on_load, inputs=[csv_file], outputs=[df_state, skill_col, count_col, preview])
71
 
72
 
73
- def on_generate(df, file, s_col, c_col, max_w, w, h, bg_color, font, stop, seed_val):
74
- # df from State may be None if user didn't click Load; try to load now
75
- if df is None:
76
- df = _load_csv(file)
77
- # still need to guess columns
78
- s_guess, c_guess = _guess_columns(df)
79
- s_col = s_col or s_guess
80
- if (not c_col) or c_col == "(none)":
81
- c_col = c_guess
82
 
83
 
84
- if c_col == "(none)":
85
- c_col = None
 
86
 
87
 
88
- freq_df = build_frequencies(df, s_col, c_col, stop)
89
- out_path = generate_wordcloud_image(freq_df, font, int(w), int(h), int(max_w), bg_color, int(seed_val))
90
-
91
-
92
- # Show top 100 rows for quick inspection
93
- top_show = freq_df.head(100)
94
- return out_path, out_path, top_show
95
-
96
-
97
- btn_generate.click(
98
- on_generate,
99
- inputs=[df_state, csv_file, skill_col, count_col, max_words, width, height, bg, font_file, stopwords, seed],
100
- outputs=[img, download, top_table],
101
- )
102
-
103
-
104
- if __name__ == "__main__":
105
  demo.launch()
 
1
  import os
2
+ import io
3
+ import re
4
+ from typing import Dict, Optional, Tuple
 
 
 
 
 
5
 
6
 
7
+ import gradio as gr
8
+ import pandas as pd
9
+ from wordcloud import WordCloud
10
 
11
 
12
# Default CSV that is loaded automatically when no file is uploaded.
# NOTE(review): a variant of this name with a trailing underscore
# (`Soft_Skills__Top_5000_.csv`) is also referenced in user-facing text --
# confirm which file name is actually committed to the repo.
DEFAULT_CSV_PATH = "Soft_Skills__Top_5000.csv"  # auto-load if present
DEFAULT_MAX_WORDS = 400
DEFAULT_BG = "white"


# --- helpers ---
# Candidate header names, most specific/likely first. These are tuples rather
# than sets on purpose: sets of strings have no stable iteration order, so any
# code that walks the candidates in sequence would otherwise pick a different
# column from one run to the next.
LIKELY_SKILL_COLS = ("skill", "skills", "soft skill", "soft skills", "name", "keyword", "term")
LIKELY_COUNT_COLS = ("count", "counts", "frequency", "freq", "weight", "n")
20
 
21
 
 
 
 
 
 
 
 
 
 
22
 
23
 
24
def _find_column(cols, candidates):
    """Return the column in *cols* that best matches one of *candidates*.

    Matching is case-insensitive and happens in two passes:

    1. Exact match: the first column (in column order) whose lowercased
       name equals any candidate.
    2. Substring match: candidates are tried longest-first, and the first
       column whose lowercased name contains the candidate wins.

    Args:
        cols: Iterable of column labels (e.g. ``df.columns``). Labels that
            are not strings are compared via ``str(label)``.
        candidates: Iterable of candidate names.

    Returns:
        The original column label that matched, or ``None`` if nothing did.
    """
    # Column labels are not guaranteed to be strings (e.g. integer headers),
    # so normalise through str() instead of calling .lower() directly.
    lowered = [(str(col).lower(), col) for col in cols]
    wanted = {str(cand).lower() for cand in candidates}

    # Pass 1: an exact match for ANY candidate beats a fuzzy match for another.
    for key, original in lowered:
        if key in wanted:
            return original

    # Pass 2: fuzzy "contains". Candidates may arrive as a set, whose order is
    # arbitrary, so sort them (longest first, then alphabetically) to keep the
    # result deterministic. Candidates of one character or less (such as "n")
    # are exact-only: as substrings they would match almost every header.
    fuzzy = sorted(
        (cand for cand in wanted if len(cand) > 1),
        key=lambda cand: (-len(cand), cand),
    )
    for cand in fuzzy:
        for key, original in lowered:
            if cand in key:
                return original
    return None
35
 
36
 
 
37
 
38
 
39
def _load_csv(file_obj: Optional[gr.File]) -> pd.DataFrame:
    """Load the CSV from an uploaded file, or fall back to the default path.

    Args:
        file_obj: Value received from the file-upload component. It may be
            ``None``, a filesystem path (``str`` / ``os.PathLike``), or an
            object that exposes the path through a ``name`` attribute.

    Returns:
        The parsed CSV as a DataFrame.

    Raises:
        FileNotFoundError: If nothing was uploaded and ``DEFAULT_CSV_PATH``
            does not exist.
    """
    # Uploads can arrive either as a bare path or as a tempfile-like wrapper;
    # a bare string has no `.name`, so handle it explicitly.
    if isinstance(file_obj, (str, os.PathLike)):
        upload_path = os.fspath(file_obj)
    else:
        upload_path = getattr(file_obj, "name", None)

    if upload_path:
        return pd.read_csv(upload_path)
    if os.path.exists(DEFAULT_CSV_PATH):
        return pd.read_csv(DEFAULT_CSV_PATH)
    # Build the message from the constant so it always names the file that
    # was actually looked for.
    raise FileNotFoundError(
        f"CSV not provided. Upload a file or add {DEFAULT_CSV_PATH} to the repo."
    )
46
 
47
 
 
48
 
49
 
50
def _guess_columns(df: pd.DataFrame) -> Tuple[Optional[str], Optional[str]]:
    """Guess which columns hold the skill names and their counts.

    The skill column is looked up by name via ``_find_column`` using
    ``LIKELY_SKILL_COLS``; if no name matches, the first object-dtype column
    is used. The count column is looked up by name only, using
    ``LIKELY_COUNT_COLS``.

    Args:
        df: The loaded CSV.

    Returns:
        ``(skill_col, count_col)``. Either element is ``None`` when no
        suitable column could be identified.
    """
    skill_col = _find_column(df.columns, LIKELY_SKILL_COLS)
    if skill_col is None:
        # Fall back to the first text-like column; an all-numeric (or empty)
        # frame has none, in which case report "unknown" instead of raising
        # IndexError.
        text_cols = df.select_dtypes(include=["object"]).columns
        skill_col = text_cols[0] if len(text_cols) > 0 else None

    count_col = _find_column(df.columns, LIKELY_COUNT_COLS)
    # A column cannot be both the skill text and its own weight.
    if count_col is not None and count_col == skill_col:
        count_col = None
    return skill_col, count_col
 
54
 
55
 
 
 
 
 
 
 
56
 
57
 
58
def _clean_text(s: str) -> str:
    """Normalise one skill cell into clean display text.

    Runs of characters other than word characters, Thai characters
    (U+0E00-U+0E7F), whitespace and hyphens are replaced by a single space;
    whitespace runs are then collapsed and the result is trimmed.

    Args:
        s: The raw cell value. Non-string values are converted with
            ``str()``; missing values (``None``, NaN, ``pd.NA``) yield ``""``.

    Returns:
        The cleaned text, possibly empty.
    """
    if not isinstance(s, str):
        # Missing cells must not turn into the literal words "nan" / "None".
        if pd.api.types.is_scalar(s) and pd.isna(s):
            return ""
        s = str(s)
    # preserve Thai/letters/numbers, replace other punctuation with spaces
    s = re.sub(r"[^\w\u0E00-\u0E7F\s\-]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    # Trim last: replacing leading/trailing punctuation introduces spaces at
    # the edges, which an up-front strip would leave behind.
    return s.strip()
66
 
67
 
 
68
 
69
 
70
+ def build_frequencies(df: pd.DataFrame, skill_col: str, count_col: Optional[str], stopwords_text: str) -> pd.DataFrame:
71
+ if skill_col not in df.columns:
72
+ raise ValueError(f"Skill column '{skill_col}' not found.")
 
 
 
 
 
 
73
 
74
 
75
+ tmp = df.copy()
76
+ tmp[skill_col] = tmp[skill_col].map(_clean_text)
77
+ tmp = tmp.dropna(subset=[skill_col])
78
 
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  demo.launch()