Nucha committed on
Commit
a79f725
·
verified ·
1 Parent(s): 67d52df

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +154 -75
  2. requirements.txt +4 -4
app.py CHANGED
@@ -1,80 +1,159 @@
 
1
  import os
2
  import io
3
- import re
4
- from typing import Dict, Optional, Tuple
5
-
6
-
7
- import gradio as gr
8
  import pandas as pd
 
 
9
  from wordcloud import WordCloud
10
 
11
-
12
- DEFAULT_CSV_PATH = "Soft_Skills__Top_5000.csv" # auto-load if present
13
- DEFAULT_MAX_WORDS = 400
14
- DEFAULT_BG = "white"
15
-
16
-
17
- # --- helpers ---
18
- LIKELY_SKILL_COLS = {"skill", "skills", "soft skill", "soft skills", "name", "keyword", "term"}
19
- LIKELY_COUNT_COLS = {"count", "counts", "frequency", "freq", "weight", "n"}
20
-
21
-
22
-
23
-
24
- def _find_column(cols, candidates):
25
- low = {c.lower(): c for c in cols}
26
- for cand in candidates:
27
- for c in cols:
28
- if cand == c.lower():
29
- return c
30
- # fuzzy contains
31
- for key, orig in low.items():
32
- if cand in key:
33
- return orig
34
- return None
35
-
36
-
37
-
38
-
39
def _load_csv(file_obj: Optional[gr.File]) -> pd.DataFrame:
    """Load a DataFrame from the uploaded file, else from DEFAULT_CSV_PATH.

    Raises:
        FileNotFoundError: when no upload is given and the default CSV is
        not present next to the app.
    """
    if file_obj and hasattr(file_obj, "name") and file_obj.name:
        return pd.read_csv(file_obj.name)
    if os.path.exists(DEFAULT_CSV_PATH):
        return pd.read_csv(DEFAULT_CSV_PATH)
    # Bug fix: the old message named "Soft_Skills__Top_5000_.csv", which does
    # not match DEFAULT_CSV_PATH ("Soft_Skills__Top_5000.csv"); build the
    # message from the constant so they can never drift apart again.
    raise FileNotFoundError(
        f"CSV not provided. Upload a file or add {DEFAULT_CSV_PATH} to the repo."
    )
46
-
47
-
48
-
49
-
50
def _guess_columns(df: pd.DataFrame) -> Tuple[Optional[str], Optional[str]]:
    """Best-effort guess of the (skill, count) columns of *df*.

    The skill column falls back to the first object-dtype column; returns
    None for it when the frame has no text-like column at all.
    """
    skill_col = _find_column(df.columns, LIKELY_SKILL_COLS)
    if not skill_col:
        # Bug fix: the old one-liner indexed [0] on the (possibly empty)
        # list of object-dtype columns, raising IndexError for all-numeric
        # frames; guard and return None instead.
        object_cols = df.select_dtypes(include=["object"]).columns.tolist()
        skill_col = object_cols[0] if object_cols else None
    count_col = _find_column(df.columns, LIKELY_COUNT_COLS)
    return skill_col, count_col
54
-
55
-
56
-
57
-
58
- def _clean_text(s: str) -> str:
59
- if not isinstance(s, str):
60
- s = str(s)
61
- s = s.strip()
62
- # preserve Thai/letters/numbers, replace other punctuation with spaces
63
- s = re.sub(r"[^\w\u0E00-\u0E7F\s\-]+", " ", s)
64
- s = re.sub(r"\s+", " ", s)
65
- return s
66
-
67
-
68
-
69
-
70
- def build_frequencies(df: pd.DataFrame, skill_col: str, count_col: Optional[str], stopwords_text: str) -> pd.DataFrame:
71
- if skill_col not in df.columns:
72
- raise ValueError(f"Skill column '{skill_col}' not found.")
73
-
74
-
75
- tmp = df.copy()
76
- tmp[skill_col] = tmp[skill_col].map(_clean_text)
77
- tmp = tmp.dropna(subset=[skill_col])
78
-
79
-
80
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \
2
  import os
3
  import io
4
+ import numpy as np
 
 
 
 
5
  import pandas as pd
6
+ from PIL import Image
7
+ import gradio as gr
8
  from wordcloud import WordCloud
9
 
10
+ DEFAULT_CSV = "Soft_Skills__Top_5000_.csv" # Put this file in the Space repo root
11
+ DEFAULT_TEXT_COL = "ทักษะ"
12
+ DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"
13
+
14
+ def _load_dataframe(file):
15
+ """
16
+ Load a CSV either from the uploaded file or from DEFAULT_CSV if present.
17
+ """
18
+ if file is not None:
19
+ return pd.read_csv(file.name if hasattr(file, "name") else file)
20
+ if os.path.exists(DEFAULT_CSV):
21
+ return pd.read_csv(DEFAULT_CSV)
22
+ raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")
23
+
24
def _detect_columns(df):
    """Pick the (text, frequency) column pair for *df*.

    Prefers the Thai default column names when both exist; otherwise falls
    back to the frame's first two columns. Raises gr.Error for frames with
    fewer than two columns.
    """
    cols = df.columns
    defaults = (DEFAULT_TEXT_COL, DEFAULT_FREQ_COL)
    if all(name in cols for name in defaults):
        return defaults
    if len(cols) >= 2:
        return cols[0], cols[1]
    raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")
31
+
32
def generate_wordcloud(
    csv_file,
    text_col,
    freq_col,
    max_words,
    background_color,
    colormap,
    width,
    height,
    scale,
    prefer_horizontal,
    collocations,
    stopwords_text,
    mask_image,
    random_state
):
    """Render a word cloud plus a preview table from a frequency CSV.

    Returns:
        (PIL.Image.Image, pandas.DataFrame): the rendered cloud and the
        top words/frequencies actually used.
    Raises:
        gr.Error: on missing CSV, unknown columns, empty result, or an
        unreadable mask image.
    """
    df = _load_dataframe(csv_file)

    # Resolve "auto" column choices from the data itself.
    if text_col == "auto" or freq_col == "auto":
        auto_text, auto_freq = _detect_columns(df)
        if text_col == "auto":
            text_col = auto_text
        if freq_col == "auto":
            freq_col = auto_freq

    if text_col not in df.columns or freq_col not in df.columns:
        raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")

    # Parse stopwords first so they can be applied to the data directly.
    # Bug fix: WordCloud.generate_from_frequencies() ignores the `stopwords`
    # constructor argument (it only applies in generate()/process_text()),
    # so the old code's stopword textbox had no effect at all.
    stopwords = set()
    if stopwords_text:
        for w in stopwords_text.splitlines():
            w = w.strip()
            if w:
                stopwords.add(w)

    # Clean and build the frequency dict.
    sub = df[[text_col, freq_col]].dropna()
    # Coerce frequency to numeric; unparseable cells become 0.
    sub[freq_col] = pd.to_numeric(sub[freq_col], errors="coerce").fillna(0).astype(float)
    # Drop stopworded rows BEFORE taking the top N, so the cloud still holds
    # up to `max_words` real words after filtering.
    if stopwords:
        sub = sub[~sub[text_col].astype(str).str.strip().isin(stopwords)]
    sub = sub.sort_values(freq_col, ascending=False).head(int(max_words))

    frequencies = {str(k): float(v) for k, v in zip(sub[text_col], sub[freq_col]) if str(k).strip()}

    if not frequencies:
        raise gr.Error("No words found after processing. Please check your CSV columns.")

    # Optional mask image, converted to a grayscale numpy array.
    mask = None
    if mask_image is not None:
        try:
            pil = Image.open(mask_image.name if hasattr(mask_image, "name") else mask_image).convert("L")
            mask = np.array(pil)
        except Exception as e:
            raise gr.Error(f"Failed to read mask image: {e}")

    # Prefer a Thai-capable font when one is bundled; wordcloud's default
    # font cannot render Thai glyphs.
    font_path = None
    for cand in ["NotoSansThai-Regular.ttf", "NotoSansThai.ttf", "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf"]:
        if os.path.exists(cand):
            font_path = cand
            break

    wc = WordCloud(
        width=int(width),              # sliders may yield floats; WordCloud wants ints
        height=int(height),
        scale=int(scale),
        max_words=int(max_words),      # bug fix: WordCloud's own cap defaults to 200,
                                       # silently truncating the 10-1000 slider range
        background_color=background_color,
        colormap=None if colormap == "default" else colormap,
        prefer_horizontal=prefer_horizontal,
        collocations=collocations,
        stopwords=stopwords,
        mask=mask,
        font_path=font_path,
        random_state=int(random_state),
    )

    wc.generate_from_frequencies(frequencies)
    img = wc.to_image()

    # Preview table of the words actually used.
    preview = sub.rename(columns={text_col: "word", freq_col: "frequency"})
    return img, preview
117
+
118
# Gradio UI: left column holds all generation controls, right column shows
# the rendered cloud plus the table of words actually used.
with gr.Blocks(title="Soft Skills WordCloud") as demo:
    gr.Markdown("# Soft Skills Word Cloud\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")

    with gr.Row():
        with gr.Column(scale=1):
            csv_file = gr.File(label="Upload CSV (optional)", file_count="single", file_types=[".csv"])
            # Column dropdowns start at "auto"; generate_wordcloud resolves
            # them against the uploaded CSV at run time.
            text_col = gr.Dropdown(choices=["auto"], value="auto", label="Text/Skill column")
            freq_col = gr.Dropdown(choices=["auto"], value="auto", label="Frequency column")
            max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
            background_color = gr.Dropdown(
                choices=["white", "black"],
                value="white",
                label="Background color"
            )
            colormap = gr.Dropdown(
                choices=["default","viridis","plasma","inferno","magma","cividis","terrain","tab20","tab10","Pastel1","Set3"],
                value="default",
                label="Colormap"
            )
            width = gr.Slider(400, 2000, value=1200, step=50, label="Width (px)")
            height = gr.Slider(300, 1500, value=700, step=50, label="Height (px)")
            scale = gr.Slider(1, 5, value=2, step=1, label="Scale")
            prefer_horizontal = gr.Checkbox(value=True, label="Prefer horizontal layout")
            collocations = gr.Checkbox(value=False, label="Allow collocations (word pairs)")
            stopwords_text = gr.Textbox(lines=4, label="Stopwords (one per line)", placeholder="e.g.\nและ\nกับ\nของ\nthe\nand")
            # Bug fix: the `tool=` parameter was removed from gr.Image in
            # Gradio 4.x; passing tool=None raises TypeError at startup
            # under the pinned gradio>=4.0.0.
            mask_image = gr.Image(type="filepath", label="Mask image (optional)")
            random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")

            run_btn = gr.Button("Generate Word Cloud", variant="primary")

        with gr.Column(scale=1):
            out_img = gr.Image(label="Word Cloud", type="pil")
            out_table = gr.Dataframe(label="Top words used", wrap=True)

    run_btn.click(
        fn=generate_wordcloud,
        inputs=[csv_file, text_col, freq_col, max_words, background_color, colormap, width, height, scale, prefer_horizontal, collocations, stopwords_text, mask_image, random_state],
        outputs=[out_img, out_table],
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- gradio>=4.44.0
2
- pandas>=2.1.0
3
  wordcloud>=1.9.3
4
- matplotlib>=3.8.0
5
- Pillow>=10.0.0
 
1
+ gradio>=4.0.0
2
+ pandas>=2.0.0
3
  wordcloud>=1.9.3
4
+ numpy
5
+ Pillow