File size: 6,557 Bytes
be8ff6e
67d52df
a79f725
67d52df
a79f725
 
67d52df
f8c1ed5
611a545
 
 
 
 
a79f725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611a545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a79f725
 
 
 
 
 
 
 
 
 
 
 
 
611a545
 
 
a79f725
 
 
 
611a545
 
a79f725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611a545
a79f725
 
 
611a545
 
 
 
 
 
 
 
a79f725
 
 
 
 
 
 
611a545
a79f725
 
 
 
 
 
 
 
 
 
 
ad31e83
a79f725
 
611a545
 
 
 
 
a79f725
 
 
611a545
a79f725
 
 
611a545
 
 
a79f725
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import os
import io
import numpy as np
import pandas as pd
from PIL import Image
import gradio as gr
from wordcloud import WordCloud

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Fallback CSV (relative path) that is read when the user uploads no file.
DEFAULT_CSV = "Soft_Skills__Top_5000_.csv"
# Column headers looked for by the "auto" column detection. They are Thai
# header strings (roughly "skill" and "frequency count found"); they must
# match the CSV header exactly, so do not translate or normalize them.
DEFAULT_TEXT_COL = "ทักษะ"
DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"

def _load_dataframe(file):
    if file is not None:
        return pd.read_csv(file.name if hasattr(file, "name") else file)
    if os.path.exists(DEFAULT_CSV):
        return pd.read_csv(DEFAULT_CSV)
    raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")

def _detect_columns(df):
    """Choose the (text, frequency) column pair for ``df``.

    The default Thai headers win when both are present; otherwise the first
    two columns are used in order.

    Raises:
        gr.Error: If the defaults are absent and ``df`` has fewer than two
            columns.
    """
    columns = df.columns
    has_default_headers = DEFAULT_TEXT_COL in columns and DEFAULT_FREQ_COL in columns
    if has_default_headers:
        return DEFAULT_TEXT_COL, DEFAULT_FREQ_COL
    if len(columns) < 2:
        raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")
    return columns[0], columns[1]

def _to_bar_chart_image(df_words, top_k):
    """Render the first ``top_k`` rows of ``df_words`` as a horizontal bar chart.

    Args:
        df_words: DataFrame with ``"word"`` and ``"frequency"`` columns. Rows
            are taken in the order given (via ``head``), so callers are
            expected to pass them most-frequent-first.
        top_k: Number of leading rows to plot; coerced with ``int()``.

    Returns:
        A PIL image decoded from the in-memory PNG of the chart.
    """
    # Reverse so the first (largest) row ends up at the top of the chart.
    sub = df_words.head(int(top_k)).iloc[::-1]
    labels = [str(word) for word in sub["word"]]
    # Plot against integer positions instead of the labels themselves:
    # a categorical axis would merge duplicate words into a single bar and
    # would place numeric "words" at their numeric value.
    positions = list(range(len(labels)))

    fig = plt.figure(figsize=(8, max(3, 0.35 * len(sub))))
    try:
        ax = fig.add_subplot(111)
        ax.barh(positions, sub["frequency"])
        ax.set_yticks(positions)
        ax.set_yticklabels(labels)
        ax.set_xlabel("Frequency")
        ax.set_ylabel("Word")
        ax.set_title(f"Top {len(sub)} Words by Frequency")
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format="png", dpi=160, bbox_inches="tight")
    finally:
        # Always release the figure; pyplot keeps every open figure alive,
        # so a failed render would otherwise leak in a long-running server.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)

def _parse_stopwords(stopwords_text):
    """Return the set of non-empty, whitespace-stripped lines of ``stopwords_text``."""
    if not stopwords_text:
        return set()
    stripped = (line.strip() for line in stopwords_text.splitlines())
    return {word for word in stripped if word}


def _load_mask(mask_image):
    """Load ``mask_image`` as a grayscale NumPy array, or return None if absent."""
    if mask_image is None:
        return None
    path = getattr(mask_image, "name", mask_image)
    try:
        # ``with`` closes the underlying file once the pixels are converted.
        with Image.open(path) as img:
            return np.array(img.convert("L"))
    except Exception as e:
        # Boundary with user-supplied files: surface any decode failure in the UI.
        raise gr.Error(f"Failed to read mask image: {e}") from e


def _find_thai_font():
    """Return the first existing Noto Sans Thai font candidate path, or None."""
    candidates = (
        "NotoSansThai-Regular.ttf",
        "NotoSansThai.ttf",
        "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf",
    )
    return next((path for path in candidates if os.path.exists(path)), None)


def generate_wordcloud_and_bar(
    csv_file,
    text_col,
    freq_col,
    max_words,
    background_color,
    colormap,
    width,
    height,
    scale,
    prefer_horizontal,
    collocations,
    stopwords_text,
    mask_image,
    random_state,
    show_bar,
    topk_bar
):
    """Build a word cloud (and optionally a bar chart) from a word/frequency CSV.

    Args:
        csv_file: Uploaded CSV (object with ``.name`` or a path), or ``None``
            to use the default CSV.
        text_col: Name of the word column, or ``"auto"`` to detect it.
        freq_col: Name of the frequency column, or ``"auto"`` to detect it.
        max_words: Maximum number of rows/words kept; coerced to ``int``.
        background_color: Background color forwarded to ``WordCloud``.
        colormap: Matplotlib colormap name, or ``"default"`` for the
            library's own default.
        width: Canvas width in pixels; coerced to ``int``.
        height: Canvas height in pixels; coerced to ``int``.
        scale: Scale factor forwarded to ``WordCloud``.
        prefer_horizontal: Forwarded unchanged to ``WordCloud``.
        collocations: Forwarded to ``WordCloud``; the library ignores it
            when the cloud is generated from frequencies.
        stopwords_text: Newline-separated words to exclude.
        mask_image: Optional mask image (object with ``.name`` or a path).
        random_state: Seed for the layout; coerced to ``int`` unless ``None``.
        show_bar: Whether to render the bar chart.
        topk_bar: Number of rows shown in the bar chart.

    Returns:
        Tuple ``(cloud_image, bar_image_or_None, preview_dataframe)`` where
        the preview has columns renamed to ``"word"`` and ``"frequency"``.

    Raises:
        gr.Error: If the columns cannot be resolved, the two columns are the
            same, no usable words remain, or the mask image cannot be read.
    """
    df = _load_dataframe(csv_file)
    if text_col == "auto" or freq_col == "auto":
        auto_text, auto_freq = _detect_columns(df)
        if text_col == "auto":
            text_col = auto_text
        if freq_col == "auto":
            freq_col = auto_freq

    if text_col not in df.columns or freq_col not in df.columns:
        raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")
    if text_col == freq_col:
        # Selecting the same column twice would yield a frame with duplicate
        # column labels and fail later with an obscure pandas error.
        raise gr.Error("Text and frequency columns must be different.")

    # UI sliders may deliver floats; head() and WordCloud need integers.
    max_words = int(max_words)
    stopwords = _parse_stopwords(stopwords_text)

    # .copy() so the column assignment below never writes into a view of df.
    sub = df[[text_col, freq_col]].dropna().copy()
    sub[freq_col] = pd.to_numeric(sub[freq_col], errors="coerce").fillna(0).astype(float)

    # WordCloud ignores its ``stopwords`` argument for frequency input, so
    # stopwords are removed here. Blank words and non-positive frequencies
    # are dropped as well; an all-zero input would otherwise divide by zero
    # when WordCloud normalizes by the maximum frequency.
    words = sub[text_col].astype(str).str.strip()
    usable = (words != "") & ~words.isin(list(stopwords)) & (sub[freq_col] > 0)
    sub = sub[usable].sort_values(freq_col, ascending=False).head(max_words)

    frequencies = {str(k): float(v) for k, v in zip(sub[text_col], sub[freq_col])}
    if not frequencies:
        raise gr.Error("No words found after processing. Please check your CSV columns.")

    mask = _load_mask(mask_image)
    font_path = _find_thai_font()

    wc = WordCloud(
        width=int(width),
        height=int(height),
        scale=scale,
        # Without this WordCloud silently caps the cloud at its default of 200.
        max_words=max_words,
        background_color=background_color,
        colormap=None if colormap == "default" else colormap,
        prefer_horizontal=prefer_horizontal,
        collocations=collocations,
        stopwords=stopwords,
        mask=mask,
        font_path=font_path,
        # WordCloud only seeds its RNG from an actual int.
        random_state=None if random_state is None else int(random_state),
    )
    wc.generate_from_frequencies(frequencies)
    img_cloud = wc.to_image()

    preview = sub.rename(columns={text_col: "word", freq_col: "frequency"})
    img_bar = _to_bar_chart_image(preview, topk_bar) if show_bar else None

    return img_cloud, img_bar, preview

# Gradio UI: left column holds the inputs, right column the outputs.
with gr.Blocks(title="Soft Skills Word Cloud + Bar Chart") as demo:
    gr.Markdown("# Soft Skills Word Cloud + Bar Chart\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")

    with gr.Row():
        with gr.Column(scale=1):
            csv_file = gr.File(label="Upload CSV (optional)", file_count="single", file_types=[".csv"])
            # The only preset choice is "auto"; allow_custom_value lets the
            # user type an actual column name, which the callback supports.
            text_col = gr.Dropdown(
                choices=["auto"],
                value="auto",
                allow_custom_value=True,
                label='Text/Skill column ("auto" or type a column name)',
            )
            freq_col = gr.Dropdown(
                choices=["auto"],
                value="auto",
                allow_custom_value=True,
                label='Frequency column ("auto" or type a column name)',
            )
            max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
            background_color = gr.Dropdown(choices=["white", "black"], value="white", label="Background color")
            colormap = gr.Dropdown(
                choices=["default","viridis","plasma","inferno","magma","cividis","terrain","tab20","tab10","Pastel1","Set3"],
                value="default",
                label="Colormap"
            )
            width = gr.Slider(400, 2000, value=1200, step=50, label="Width (px)")
            height = gr.Slider(300, 1500, value=700, step=50, label="Height (px)")
            scale = gr.Slider(1, 5, value=2, step=1, label="Scale")
            prefer_horizontal = gr.Checkbox(value=True, label="Prefer horizontal layout")
            collocations = gr.Checkbox(value=False, label="Allow collocations (word pairs)")
            stopwords_text = gr.Textbox(lines=4, label="Stopwords (one per line)", placeholder="e.g.\nและ\nกับ\nของ\nthe\nand")
            mask_image = gr.Image(type="filepath", label="Mask image (optional)")
            random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")

            gr.Markdown("### Bar Chart Options")
            show_bar = gr.Checkbox(value=True, label="Show bar chart")
            topk_bar = gr.Slider(5, 50, value=20, step=1, label="Top K for bar chart")

            run_btn = gr.Button("Generate", variant="primary")

        with gr.Column(scale=1):
            out_img = gr.Image(label="Word Cloud", type="pil")
            out_bar = gr.Image(label="Bar Chart (Top K)", type="pil")
            out_table = gr.Dataframe(label="Top words used", wrap=True)

    # Input order must match the parameter order of generate_wordcloud_and_bar.
    run_btn.click(
        fn=generate_wordcloud_and_bar,
        inputs=[csv_file, text_col, freq_col, max_words, background_color, colormap, width, height, scale, prefer_horizontal, collocations, stopwords_text, mask_image, random_state, show_bar, topk_bar],
        outputs=[out_img, out_bar, out_table],
    )

if __name__ == "__main__":
    demo.launch()