Spaces:

Corin1998
/

multilingual_sns_analyzerV2

Sleeping

App Files Files Community

Corin1998 committed on Sep 3, 2025

Commit

770385f

verified ·

1 Parent(s): c207743

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -26

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import os
 os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
-from typing import List, Dict, Any, Tuple
 import json
 import pandas as pd
 import gradio as gr
@@ -12,21 +12,31 @@ from lib.utils import now_utc_str, load_sample_df
 APP_NAME = "🔎 SNS Analyzer v3（クラスタ＋要約＋感情＋重複排除）— OpenAI API"
-# ---------- 追加: CSV ロバスト読み込みユーティリティ ----------
 def read_csv_flex(file_obj) -> pd.DataFrame:
-    """
-    文字コード（utf-8, utf-8-sig, cp932/shift_jis, latin1）と
-    区切り（自動推測 / , / タブ / ; / |）を総当たりで読み込み。
-    """
     encodings = ["utf-8", "utf-8-sig", "cp932", "shift_jis", "latin1"]
     seps = [None, ",", "\t", ";", "|"]
     last_err = None
     for enc in encodings:
         for sep in seps:
             try:
-                df = pd.read_csv(file_obj.name, encoding=enc, sep=sep, engine="python")
-                # 列が1つも無いのはNG
                 if isinstance(df, pd.DataFrame) and df.shape[1] >= 1:
                     return df
             except Exception as e:
@@ -35,10 +45,6 @@ def read_csv_flex(file_obj) -> pd.DataFrame:
     raise RuntimeError(f"CSV 読み込みに失敗しました（encoding/区切りの自動判定に失敗）: {last_err}")
 def guess_text_col(df: pd.DataFrame, hint: str | None) -> str:
-    """
-    列名ヒントがあればそれを優先。
-    無ければ、よくある列名 → 最初の object 列 → 先頭列 の順で選択。
-    """
     if hint and hint in df.columns:
         return hint
     candidates = [
@@ -48,14 +54,12 @@ def guess_text_col(df: pd.DataFrame, hint: str | None) -> str:
     for c in candidates:
         if c in df.columns:
             return c
-    # 最初の文字列(=object)列
     for c in df.columns:
         if df[c].dtype == "object":
             return c
-    # どうしても見つからなければ先頭列
     return df.columns[0]
-# ---------------------------------------------------------------
 def run_analysis(
     text_blob: str,
@@ -67,28 +71,24 @@ def run_analysis(
     csv_file,
     csv_text_col: str,
     out_lang_ui: str,   # "日本語"/"English"/"自動"
-) -> Tuple[str, Any, pd.DataFrame, str]:
-    # 言語コードに変換
     lang_map = {"日本語": "ja", "English": "en", "自動": "auto"}
     out_lang = lang_map.get(out_lang_ui, "ja")
-    # 入力収集（CSV優先。列名未指定でも自動解決）
     texts: List[str] = []
     if csv_file is not None:
         try:
             df = read_csv_flex(csv_file)
             col = guess_text_col(df, csv_text_col.strip() if csv_text_col else None)
             if col not in df.columns:
-                return f"⚠️ CSVに列 `{col}` が見つかりません。列名を指定するか、対象列を確認してください。", None, pd.DataFrame(), ""
-            # 文字列化＆欠損除去
-            series = df[col].astype(str).fillna("")
-            texts = [s.strip() for s in series.tolist() if str(s).strip()]
         except Exception as e:
             return f"⚠️ CSV読み込みでエラー: {e}", None, pd.DataFrame(), ""
     else:
         texts = [t.strip() for t in (text_blob or "").split("\n") if t.strip()]
-    # 上限
     if max_items and max_items > 0:
         texts = texts[:max_items]
     if len(texts) < 2:
@@ -104,7 +104,7 @@ def run_analysis(
         output_lang=out_lang,
     )
-    # 右ペイン（要約）
     lines = []
     if result.get("dedup", {}).get("removed", 0) > 0:
         kept = result["dedup"]["kept"]
@@ -112,7 +112,7 @@ def run_analysis(
         lines.append(f"> 近似重複を {removed} 件除外（残り {kept} 件）\n")
     for c in result["clusters"]:
-        title = c.get("label") or c.get("summary", {}).get("title") or f"Cluster {c['id']}"
         lines.append(f"### クラスタ {c['id']}  (size={c['size']})")
         lines.append(f"**{title}**")
         if c.get("summary", {}).get("overview"):
@@ -141,10 +141,12 @@ def run_analysis(
     dl_json = json.dumps(result, ensure_ascii=False, indent=2)
     return "\n".join(lines), fig, df_out, dl_json
 def on_load_sample():
     """Return the sample data's ``text`` column as one newline-joined string.

     Loads the frame via ``load_sample_df()``, casts every ``text`` value to
     ``str`` and places one value per line.
     """
     sample_texts = load_sample_df()["text"].astype(str)
     return "\n".join(sample_texts.tolist())
 def apply_labels(json_str: str, edited_df: pd.DataFrame) -> str:
     if not json_str:
         return json_str
@@ -154,7 +156,6 @@ def apply_labels(json_str: str, edited_df: pd.DataFrame) -> str:
         return json_str
     if not isinstance(edited_df, pd.DataFrame) or "id" not in edited_df or "label" not in edited_df:
         return json_str
     id2label = {int(r["id"]): str(r["label"]) for _, r in edited_df.iterrows() if str(r["label"]).strip() != ""}
     for c in payload.get("clusters", []):
         cid = int(c["id"])
@@ -164,6 +165,7 @@ def apply_labels(json_str: str, edited_df: pd.DataFrame) -> str:
             c["summary"]["title"] = id2label[cid]
     return json.dumps(payload, ensure_ascii=False, indent=2)
 def to_json_file(json_str: str):
     if not json_str:
         return gr.File.update(visible=False)
@@ -172,6 +174,7 @@ def to_json_file(json_str: str):
         f.write(json_str)
     return gr.File.update(value=path, visible=True)
 def to_csv_file(json_str: str):
     if not json_str:
         return gr.File.update(visible=False)
@@ -191,6 +194,7 @@ def to_csv_file(json_str: str):
     except Exception:
         return gr.File.update(visible=False)
 with gr.Blocks(title=APP_NAME, theme=gr.themes.Soft()) as demo:
     gr.Markdown(f"# {APP_NAME}")

 import os
 os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
+from typing import List, Tuple
 import json
 import pandas as pd
 import gradio as gr
 APP_NAME = "🔎 SNS Analyzer v3（クラスタ＋要約＋感情＋重複排除）— OpenAI API"
+# ----------------- CSV ロバスト読み込み -----------------
+def _file_path(file_obj) -> str | None:
+    # gr.File はパス(str)か、{'name','path'}のdict、または file-like を返すことがある
+    if file_obj is None:
+        return None
+    if isinstance(file_obj, str):
+        return file_obj
+    if hasattr(file_obj, "name"):
+        return file_obj.name
+    if isinstance(file_obj, dict):
+        return file_obj.get("path") or file_obj.get("name")
+    return None
 def read_csv_flex(file_obj) -> pd.DataFrame:
+    path = _file_path(file_obj)
+    if not path or not os.path.exists(path):
+        raise RuntimeError("アップロードされた CSV のパスが取得できませんでした。")
     encodings = ["utf-8", "utf-8-sig", "cp932", "shift_jis", "latin1"]
     seps = [None, ",", "\t", ";", "|"]
     last_err = None
     for enc in encodings:
         for sep in seps:
             try:
+                df = pd.read_csv(path, encoding=enc, sep=sep, engine="python", on_bad_lines="skip")
                 if isinstance(df, pd.DataFrame) and df.shape[1] >= 1:
                     return df
             except Exception as e:
     raise RuntimeError(f"CSV 読み込みに失敗しました（encoding/区切りの自動判定に失敗）: {last_err}")
 def guess_text_col(df: pd.DataFrame, hint: str | None) -> str:
     if hint and hint in df.columns:
         return hint
     candidates = [
     for c in candidates:
         if c in df.columns:
             return c
     for c in df.columns:
         if df[c].dtype == "object":
             return c
     return df.columns[0]
+# --------------------------------------------------------
 def run_analysis(
     text_blob: str,
     csv_file,
     csv_text_col: str,
     out_lang_ui: str,   # "日本語"/"English"/"自動"
+) -> Tuple[str, any, pd.DataFrame, str]:
     lang_map = {"日本語": "ja", "English": "en", "自動": "auto"}
     out_lang = lang_map.get(out_lang_ui, "ja")
+    # 入力収集
     texts: List[str] = []
     if csv_file is not None:
         try:
             df = read_csv_flex(csv_file)
             col = guess_text_col(df, csv_text_col.strip() if csv_text_col else None)
             if col not in df.columns:
+                return f"⚠️ CSVに列 `{col}` が見つかりません。列名を指定してください。", None, pd.DataFrame(), ""
+            texts = [str(s).strip() for s in df[col].astype(str).fillna("").tolist() if str(s).strip()]
         except Exception as e:
             return f"⚠️ CSV読み込みでエラー: {e}", None, pd.DataFrame(), ""
     else:
         texts = [t.strip() for t in (text_blob or "").split("\n") if t.strip()]
     if max_items and max_items > 0:
         texts = texts[:max_items]
     if len(texts) < 2:
         output_lang=out_lang,
     )
+    # 要約の表示（右ペイン）
     lines = []
     if result.get("dedup", {}).get("removed", 0) > 0:
         kept = result["dedup"]["kept"]
         lines.append(f"> 近似重複を {removed} 件除外（残り {kept} 件）\n")
     for c in result["clusters"]:
+        title = c.get("label") or c.get("summary", {}).get("title") or f"クラスタ {c['id']}"
         lines.append(f"### クラスタ {c['id']}  (size={c['size']})")
         lines.append(f"**{title}**")
         if c.get("summary", {}).get("overview"):
     dl_json = json.dumps(result, ensure_ascii=False, indent=2)
     return "\n".join(lines), fig, df_out, dl_json
 def on_load_sample():
     """Return the sample data's ``text`` column as one newline-joined string.

     Loads the frame via ``load_sample_df()``, casts every ``text`` value to
     ``str`` and places one value per line.
     """
     sample_texts = load_sample_df()["text"].astype(str)
     return "\n".join(sample_texts.tolist())
 def apply_labels(json_str: str, edited_df: pd.DataFrame) -> str:
     if not json_str:
         return json_str
         return json_str
     if not isinstance(edited_df, pd.DataFrame) or "id" not in edited_df or "label" not in edited_df:
         return json_str
     id2label = {int(r["id"]): str(r["label"]) for _, r in edited_df.iterrows() if str(r["label"]).strip() != ""}
     for c in payload.get("clusters", []):
         cid = int(c["id"])
             c["summary"]["title"] = id2label[cid]
     return json.dumps(payload, ensure_ascii=False, indent=2)
 def to_json_file(json_str: str):
     if not json_str:
         return gr.File.update(visible=False)
         f.write(json_str)
     return gr.File.update(value=path, visible=True)
 def to_csv_file(json_str: str):
     if not json_str:
         return gr.File.update(visible=False)
     except Exception:
         return gr.File.update(visible=False)
 with gr.Blocks(title=APP_NAME, theme=gr.themes.Soft()) as demo:
     gr.Markdown(f"# {APP_NAME}")