Spaces:

Nucha
/

WordCloud_BarChart

Sleeping

App Files Files Community

Nucha committed on Aug 29, 2025

Commit

611a545

verified ·

1 Parent(s): ad31e83

Upload app.py

Browse files

Files changed (1) hide show

app.py +47 -35

app.py CHANGED Viewed

@@ -6,14 +6,15 @@ from PIL import Image
 import gradio as gr
 from wordcloud import WordCloud
-DEFAULT_CSV = "Soft_Skills__Top_5000.csv"  # Put this file in the Space repo root
 DEFAULT_TEXT_COL = "ทักษะ"
 DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"
 def _load_dataframe(file):
-    """
-    Load a CSV either from the uploaded file or from DEFAULT_CSV if present.
-    """
     if file is not None:
         return pd.read_csv(file.name if hasattr(file, "name") else file)
     if os.path.exists(DEFAULT_CSV):
@@ -21,14 +22,30 @@ def _load_dataframe(file):
     raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")
 def _detect_columns(df):
-    # Try defaults first; else guess the first two columns
     if DEFAULT_TEXT_COL in df.columns and DEFAULT_FREQ_COL in df.columns:
         return DEFAULT_TEXT_COL, DEFAULT_FREQ_COL
     if len(df.columns) >= 2:
         return df.columns[0], df.columns[1]
     raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")
-def generate_wordcloud(
     csv_file,
     text_col,
     freq_col,
@@ -42,34 +59,27 @@ def generate_wordcloud(
     collocations,
     stopwords_text,
     mask_image,
-    random_state
 ):
-    # Load data
     df = _load_dataframe(csv_file)
-    # Auto-pick columns if "auto"
     if text_col == "auto" or freq_col == "auto":
         auto_text, auto_freq = _detect_columns(df)
-        if text_col == "auto":
-            text_col = auto_text
-        if freq_col == "auto":
-            freq_col = auto_freq
     if text_col not in df.columns or freq_col not in df.columns:
         raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")
-    # Clean and build frequency dict
     sub = df[[text_col, freq_col]].dropna()
-    # Coerce frequency to numeric
     sub[freq_col] = pd.to_numeric(sub[freq_col], errors="coerce").fillna(0).astype(float)
-    # Keep top N by frequency
     sub = sub.sort_values(freq_col, ascending=False).head(max_words)
     frequencies = {str(k): float(v) for k, v in zip(sub[text_col], sub[freq_col]) if str(k).strip()}
     if not frequencies:
         raise gr.Error("No words found after processing. Please check your CSV columns.")
-    # Stopwords
     stopwords = set()
     if stopwords_text:
         for w in stopwords_text.splitlines():
@@ -77,7 +87,6 @@ def generate_wordcloud(
             if w:
                 stopwords.add(w)
-    # Optional mask
     mask = None
     if mask_image is not None:
         try:
@@ -86,7 +95,6 @@ def generate_wordcloud(
         except Exception as e:
             raise gr.Error(f"Failed to read mask image: {e}")
-    # Try to use Thai-capable font if present
     font_path = None
     for cand in ["NotoSansThai-Regular.ttf", "NotoSansThai.ttf", "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf"]:
         if os.path.exists(cand):
@@ -106,16 +114,19 @@ def generate_wordcloud(
         font_path=font_path,
         random_state=random_state,
     )
     wc.generate_from_frequencies(frequencies)
-    img = wc.to_image()
-    # Return image and also a CSV preview (top words used)
     preview = sub.rename(columns={text_col: "word", freq_col: "frequency"})
-    return img, preview
-with gr.Blocks(title="Soft Skills WordCloud") as demo:
-    gr.Markdown("# Soft Skills Word Cloud\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")
     with gr.Row():
         with gr.Column(scale=1):
@@ -123,11 +134,7 @@ with gr.Blocks(title="Soft Skills WordCloud") as demo:
             text_col = gr.Dropdown(choices=["auto"], value="auto", label="Text/Skill column")
             freq_col = gr.Dropdown(choices=["auto"], value="auto", label="Frequency column")
             max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
-            background_color = gr.Dropdown(
-                choices=["white", "black"],
-                value="white",
-                label="Background color"
-            )
             colormap = gr.Dropdown(
                 choices=["default","viridis","plasma","inferno","magma","cividis","terrain","tab20","tab10","Pastel1","Set3"],
                 value="default",
@@ -142,16 +149,21 @@ with gr.Blocks(title="Soft Skills WordCloud") as demo:
             mask_image = gr.Image(type="filepath", label="Mask image (optional)")
             random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")
-            run_btn = gr.Button("Generate Word Cloud", variant="primary")
         with gr.Column(scale=1):
             out_img = gr.Image(label="Word Cloud", type="pil")
             out_table = gr.Dataframe(label="Top words used", wrap=True)
     run_btn.click(
-        fn=generate_wordcloud,
-        inputs=[csv_file, text_col, freq_col, max_words, background_color, colormap, width, height, scale, prefer_horizontal, collocations, stopwords_text, mask_image, random_state],
-        outputs=[out_img, out_table],
     )
 if __name__ == "__main__":

 import gradio as gr
 from wordcloud import WordCloud
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+DEFAULT_CSV = "Soft_Skills__Top_5000_.csv"
 DEFAULT_TEXT_COL = "ทักษะ"
 DEFAULT_FREQ_COL = "จำนวนความถี่ที่พบ"
 def _load_dataframe(file):
     if file is not None:
         return pd.read_csv(file.name if hasattr(file, "name") else file)
     if os.path.exists(DEFAULT_CSV):
     raise gr.Error("No CSV provided and default file not found. Please upload a CSV.")
 def _detect_columns(df):
     if DEFAULT_TEXT_COL in df.columns and DEFAULT_FREQ_COL in df.columns:
         return DEFAULT_TEXT_COL, DEFAULT_FREQ_COL
     if len(df.columns) >= 2:
         return df.columns[0], df.columns[1]
     raise gr.Error("CSV must have at least 2 columns: [text/skill, frequency].")
+def _to_bar_chart_image(df_words, top_k):
+    sub = df_words.head(int(top_k)).copy()
+    sub = sub.iloc[::-1]
+    fig = plt.figure(figsize=(8, max(3, 0.35*len(sub))))
+    ax = fig.add_subplot(111)
+    ax.barh(sub["word"], sub["frequency"])
+    ax.set_xlabel("Frequency")
+    ax.set_ylabel("Word")
+    ax.set_title(f"Top {len(sub)} Words by Frequency")
+    fig.tight_layout()
+    import io as _io
+    buf = _io.BytesIO()
+    fig.savefig(buf, format="png", dpi=160, bbox_inches="tight")
+    plt.close(fig)
+    buf.seek(0)
+    return Image.open(buf)
+def generate_wordcloud_and_bar(
     csv_file,
     text_col,
     freq_col,
     collocations,
     stopwords_text,
     mask_image,
+    random_state,
+    show_bar,
+    topk_bar
 ):
     df = _load_dataframe(csv_file)
     if text_col == "auto" or freq_col == "auto":
         auto_text, auto_freq = _detect_columns(df)
+        if text_col == "auto": text_col = auto_text
+        if freq_col == "auto": freq_col = auto_freq
     if text_col not in df.columns or freq_col not in df.columns:
         raise gr.Error(f"Columns not found. Available columns: {list(df.columns)}")
     sub = df[[text_col, freq_col]].dropna()
     sub[freq_col] = pd.to_numeric(sub[freq_col], errors="coerce").fillna(0).astype(float)
     sub = sub.sort_values(freq_col, ascending=False).head(max_words)
     frequencies = {str(k): float(v) for k, v in zip(sub[text_col], sub[freq_col]) if str(k).strip()}
     if not frequencies:
         raise gr.Error("No words found after processing. Please check your CSV columns.")
     stopwords = set()
     if stopwords_text:
         for w in stopwords_text.splitlines():
             if w:
                 stopwords.add(w)
     mask = None
     if mask_image is not None:
         try:
         except Exception as e:
             raise gr.Error(f"Failed to read mask image: {e}")
     font_path = None
     for cand in ["NotoSansThai-Regular.ttf", "NotoSansThai.ttf", "/usr/share/fonts/truetype/noto/NotoSansThai-Regular.ttf"]:
         if os.path.exists(cand):
         font_path=font_path,
         random_state=random_state,
     )
     wc.generate_from_frequencies(frequencies)
+    img_cloud = wc.to_image()
     preview = sub.rename(columns={text_col: "word", freq_col: "frequency"})
+    img_bar = None
+    if show_bar:
+        img_bar = _to_bar_chart_image(preview, topk_bar)
+    return img_cloud, img_bar, preview
+with gr.Blocks(title="Soft Skills Word Cloud + Bar Chart") as demo:
+    gr.Markdown("# Soft Skills Word Cloud + Bar Chart\nUpload a CSV or place **Soft_Skills__Top_5000_.csv** in the repo.")
     with gr.Row():
         with gr.Column(scale=1):
             text_col = gr.Dropdown(choices=["auto"], value="auto", label="Text/Skill column")
             freq_col = gr.Dropdown(choices=["auto"], value="auto", label="Frequency column")
             max_words = gr.Slider(10, 1000, value=300, step=10, label="Max words")
+            background_color = gr.Dropdown(choices=["white", "black"], value="white", label="Background color")
             colormap = gr.Dropdown(
                 choices=["default","viridis","plasma","inferno","magma","cividis","terrain","tab20","tab10","Pastel1","Set3"],
                 value="default",
             mask_image = gr.Image(type="filepath", label="Mask image (optional)")
             random_state = gr.Slider(0, 100, value=42, step=1, label="Random state (for reproducibility)")
+            gr.Markdown("### Bar Chart Options")
+            show_bar = gr.Checkbox(value=True, label="Show bar chart")
+            topk_bar = gr.Slider(5, 50, value=20, step=1, label="Top K for bar chart")
+            run_btn = gr.Button("Generate", variant="primary")
         with gr.Column(scale=1):
             out_img = gr.Image(label="Word Cloud", type="pil")
+            out_bar = gr.Image(label="Bar Chart (Top K)", type="pil")
             out_table = gr.Dataframe(label="Top words used", wrap=True)
     run_btn.click(
+        fn=generate_wordcloud_and_bar,
+        inputs=[csv_file, text_col, freq_col, max_words, background_color, colormap, width, height, scale, prefer_horizontal, collocations, stopwords_text, mask_image, random_state, show_bar, topk_bar],
+        outputs=[out_img, out_bar, out_table],
     )
 if __name__ == "__main__":