Spaces:
Sleeping
Sleeping
File size: 6,557 Bytes
be8ff6e 67d52df a79f725 67d52df a79f725 67d52df f8c1ed5 611a545 a79f725 611a545 a79f725 611a545 a79f725 611a545 a79f725 611a545 a79f725 611a545 a79f725 611a545 a79f725 ad31e83 a79f725 611a545 a79f725 611a545 a79f725 611a545 a79f725 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
import os
import io
import numpy as np
import pandas as pd
from PIL import Image
import gradio as gr
from wordcloud import WordCloud
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# CSV read by _load_dataframe when no file is uploaded; the path is relative
# to the process working directory.
DEFAULT_CSV = "Soft_Skills__Top_5000_.csv"
# Column names _detect_columns looks for before falling back to the first two
# columns of the CSV (Thai headers; roughly "skill" and "frequency found").
DEFAULT_TEXT_COL = "ทักษะ"
DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"
def _load_dataframe(file):
    """Read the word/frequency CSV into a DataFrame.

    Args:
        file: An uploaded file (either an object exposing ``.name`` or a
            plain path), or ``None`` to fall back to ``DEFAULT_CSV``.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        gr.Error: If nothing was uploaded and ``DEFAULT_CSV`` does not exist.
    """
    if file is None:
        if not os.path.exists(DEFAULT_CSV):
            raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")
        source = DEFAULT_CSV
    else:
        # Upload wrappers carry the temp-file path in ``.name``; anything
        # without that attribute is handed to pandas unchanged.
        source = getattr(file, "name", file)
    return pd.read_csv(source)
def _detect_columns(df):
    """Choose the (text, frequency) column pair to use for *df*.

    The configured default column names win when both are present;
    otherwise the first two columns of the frame are used.

    Returns:
        A ``(text_column, frequency_column)`` tuple.

    Raises:
        gr.Error: If the defaults are absent and the frame has fewer than
            two columns.
    """
    columns = df.columns
    has_defaults = DEFAULT_TEXT_COL in columns and DEFAULT_FREQ_COL in columns
    if has_defaults:
        return DEFAULT_TEXT_COL, DEFAULT_FREQ_COL
    if len(columns) < 2:
        raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")
    return columns[0], columns[1]
def _to_bar_chart_image(df_words, top_k, font_path=None):
    """Render the first ``top_k`` rows of *df_words* as a horizontal bar chart.

    Args:
        df_words: DataFrame with ``"word"`` and ``"frequency"`` columns,
            already ordered the way the bars should appear (top row first).
        top_k: Number of leading rows to plot; coerced with ``int()``.
        font_path: Optional path to a TTF font used for the word labels.
            When omitted, the same Noto Sans Thai candidates that the word
            cloud looks for are tried, so Thai labels are not drawn with
            matplotlib's default font.

    Returns:
        A ``PIL.Image.Image`` holding the chart rendered as PNG.
    """
    # barh draws from the bottom up, so reverse to keep the first row on top.
    top = df_words.head(int(top_k)).iloc[::-1]

    if font_path is None:
        candidates = (
            "NotoSansThai-Regular.ttf",
            "NotoSansThai.ttf",
            "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf",
        )
        font_path = next((p for p in candidates if os.path.exists(p)), None)

    fig = plt.figure(figsize=(8, max(3, 0.35 * len(top))))
    try:
        ax = fig.add_subplot(111)
        # Labels are coerced to str so the y axis is always categorical.
        ax.barh(top["word"].astype(str), top["frequency"])
        ax.set_xlabel("Frequency")
        ax.set_ylabel("Word")
        ax.set_title(f"Top {len(top)} Words by Frequency")
        if font_path is not None:
            from matplotlib.font_manager import FontProperties

            label_font = FontProperties(fname=font_path)
            for tick_label in ax.get_yticklabels():
                tick_label.set_fontproperties(label_font)
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format="png", dpi=160, bbox_inches="tight")
    finally:
        # pyplot keeps every figure registered until it is closed; close it
        # even when plotting or saving fails so the server does not leak them.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
def generate_wordcloud_and_bar(
    csv_file,
    text_col,
    freq_col,
    max_words,
    background_color,
    colormap,
    width,
    height,
    scale,
    prefer_horizontal,
    collocations,
    stopwords_text,
    mask_image,
    random_state,
    show_bar,
    topk_bar
):
    """Build a word cloud (and optionally a bar chart) from a word/frequency CSV.

    Args:
        csv_file: Uploaded CSV, or ``None`` to use the default file.
        text_col: Name of the word column, or ``"auto"`` to detect it.
        freq_col: Name of the frequency column, or ``"auto"`` to detect it.
        max_words: Maximum number of words kept (highest frequency first).
        background_color: Background colour name passed to ``WordCloud``.
        colormap: Matplotlib colormap name, or ``"default"`` for the
            library default.
        width: Canvas width in pixels.
        height: Canvas height in pixels.
        scale: Render scale factor passed to ``WordCloud``.
        prefer_horizontal: Forwarded to ``WordCloud`` as a ratio; a boolean
            ``True``/``False`` becomes ``1.0``/``0.0``.
        collocations: Forwarded to ``WordCloud``.
        stopwords_text: Words to exclude, one per line (case-insensitive).
        mask_image: Optional mask image (path or object exposing ``.name``).
        random_state: Seed for the layout.
        show_bar: Whether to render the bar chart as well.
        topk_bar: Number of words shown in the bar chart.

    Returns:
        ``(cloud_image, bar_image_or_None, table)`` where ``table`` is a
        DataFrame with ``"word"`` and ``"frequency"`` columns listing exactly
        the words that were fed to the cloud.

    Raises:
        gr.Error: If the columns cannot be resolved, no usable word remains
            after cleaning, or the mask image cannot be read.
    """
    df = _load_dataframe(csv_file)

    if text_col == "auto" or freq_col == "auto":
        auto_text, auto_freq = _detect_columns(df)
        if text_col == "auto":
            text_col = auto_text
        if freq_col == "auto":
            freq_col = auto_freq
    if text_col not in df.columns or freq_col not in df.columns:
        raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")

    stopwords = {
        line.strip() for line in (stopwords_text or "").splitlines() if line.strip()
    }
    blocked = {word.lower() for word in stopwords}

    # Build a fresh two-column frame instead of assigning into a slice of df.
    table = pd.DataFrame(
        {
            "word": df[text_col],
            "frequency": pd.to_numeric(df[freq_col], errors="coerce"),
        }
    ).dropna()
    table = table.assign(word=table["word"].astype(str).str.strip())

    # WordCloud ignores ``stopwords`` when fed frequencies, so the filtering
    # has to happen here. Non-positive counts are dropped as well: they carry
    # no weight and an all-zero input makes the library divide by zero.
    usable = (
        (table["word"] != "")
        & (table["frequency"] > 0)
        & ~table["word"].str.lower().isin(blocked)
    )
    table = (
        table[usable]
        .astype({"frequency": float})
        .sort_values("frequency", ascending=False, kind="stable")
        # A repeated word keeps its highest count rather than whichever row
        # happened to come last.
        .drop_duplicates(subset="word", keep="first")
        .head(int(max_words))
    )
    if table.empty:
        raise gr.Error("No words found after processing. Please check your CSV columns.")
    frequencies = {
        word: float(count) for word, count in zip(table["word"], table["frequency"])
    }

    mask = None
    if mask_image is not None:
        mask_path = mask_image.name if hasattr(mask_image, "name") else mask_image
        try:
            # Context manager releases the file handle once pixels are copied.
            with Image.open(mask_path) as mask_file:
                mask = np.array(mask_file.convert("L"))
        except Exception as e:
            raise gr.Error(f"Failed to read mask image: {e}") from e

    # Use a Thai-capable font if one is available next to the app or system-wide.
    font_path = None
    for cand in ["NotoSansThai-Regular.ttf", "NotoSansThai.ttf", "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf"]:
        if os.path.exists(cand):
            font_path = cand
            break

    wc = WordCloud(
        width=int(width),
        height=int(height),
        scale=scale,
        # Without this the library default (200) silently caps the cloud no
        # matter how many words were selected above.
        max_words=int(max_words),
        background_color=background_color,
        colormap=None if colormap == "default" else colormap,
        # 1.0 places every word horizontally; 0.0 tries vertical first.
        prefer_horizontal=float(prefer_horizontal),
        collocations=collocations,
        stopwords=stopwords,
        mask=mask,
        font_path=font_path,
        random_state=int(random_state),
    )
    wc.generate_from_frequencies(frequencies)
    img_cloud = wc.to_image()

    img_bar = None
    if show_bar:
        img_bar = _to_bar_chart_image(table, topk_bar)
    return img_cloud, img_bar, table
# Gradio UI: all controls sit in the left column, rendered outputs in the right.
with gr.Blocks(title="Soft Skills Word Cloud + Bar Chart") as demo:
    gr.Markdown("# Soft Skills Word Cloud + Bar Chart\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")
    with gr.Row():
        with gr.Column(scale=1):
            csv_file = gr.File(label="Upload CSV (optional)", file_count="single", file_types=[".csv"])
            # Only "auto" is offered as a choice, so custom values must be
            # allowed or an explicit column name could never be supplied.
            text_col = gr.Dropdown(
                choices=["auto"],
                value="auto",
                label="Text/Skill column",
                info='Keep "auto" or type a column name from your CSV.',
                allow_custom_value=True,
            )
            freq_col = gr.Dropdown(
                choices=["auto"],
                value="auto",
                label="Frequency column",
                info='Keep "auto" or type a column name from your CSV.',
                allow_custom_value=True,
            )
            max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
            background_color = gr.Dropdown(choices=["white", "black"], value="white", label="Background color")
            colormap = gr.Dropdown(
                choices=["default","viridis","plasma","inferno","magma","cividis","terrain","tab20","tab10","Pastel1","Set3"],
                value="default",
                label="Colormap"
            )
            width = gr.Slider(400, 2000, value=1200, step=50, label="Width (px)")
            height = gr.Slider(300, 1500, value=700, step=50, label="Height (px)")
            scale = gr.Slider(1, 5, value=2, step=1, label="Scale")
            prefer_horizontal = gr.Checkbox(value=True, label="Prefer horizontal layout")
            collocations = gr.Checkbox(value=False, label="Allow collocations (word pairs)")
            stopwords_text = gr.Textbox(lines=4, label="Stopwords (one per line)", placeholder="e.g.\nและ\nกับ\nของ\nthe\nand")
            mask_image = gr.Image(type="filepath", label="Mask image (optional)")
            random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")
            gr.Markdown("### Bar Chart Options")
            show_bar = gr.Checkbox(value=True, label="Show bar chart")
            topk_bar = gr.Slider(5, 50, value=20, step=1, label="Top K for bar chart")
            run_btn = gr.Button("Generate", variant="primary")
        with gr.Column(scale=1):
            out_img = gr.Image(label="Word Cloud", type="pil")
            out_bar = gr.Image(label="Bar Chart (Top K)", type="pil")
            out_table = gr.Dataframe(label="Top words used", wrap=True)
    # The input order must match the parameter order of generate_wordcloud_and_bar.
    run_btn.click(
        fn=generate_wordcloud_and_bar,
        inputs=[csv_file, text_col, freq_col, max_words, background_color, colormap, width, height, scale, prefer_horizontal, collocations, stopwords_text, mask_image, random_state, show_bar, topk_bar],
        outputs=[out_img, out_bar, out_table],
    )

# Start the server only when the file is executed directly, not on import.
if __name__ == "__main__":
    demo.launch()
|