Corin1998 committed on
Commit
770385f
·
verified ·
1 Parent(s): c207743

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -26
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
3
 
4
- from typing import List, Dict, Any, Tuple
5
  import json
6
  import pandas as pd
7
  import gradio as gr
@@ -12,21 +12,31 @@ from lib.utils import now_utc_str, load_sample_df
12
 
13
  APP_NAME = "🔎 SNS Analyzer v3(クラスタ+要約+感情+重複排除)— OpenAI API"
14
 
15
- # ---------- 追加: CSV ロバスト読み込みユーティリティ ----------
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def read_csv_flex(file_obj) -> pd.DataFrame:
18
- """
19
- 文字コード(utf-8, utf-8-sig, cp932/shift_jis, latin1)と
20
- 区切り(自動推測 / , / タブ / ; / |)を総当たり読み込み
21
- """
22
  encodings = ["utf-8", "utf-8-sig", "cp932", "shift_jis", "latin1"]
23
  seps = [None, ",", "\t", ";", "|"]
24
  last_err = None
25
  for enc in encodings:
26
  for sep in seps:
27
  try:
28
- df = pd.read_csv(file_obj.name, encoding=enc, sep=sep, engine="python")
29
- # 列が1つも無いのはNG
30
  if isinstance(df, pd.DataFrame) and df.shape[1] >= 1:
31
  return df
32
  except Exception as e:
@@ -35,10 +45,6 @@ def read_csv_flex(file_obj) -> pd.DataFrame:
35
  raise RuntimeError(f"CSV 読み込みに失敗しました(encoding/区切りの自動判定に失敗): {last_err}")
36
 
37
  def guess_text_col(df: pd.DataFrame, hint: str | None) -> str:
38
- """
39
- 列名ヒントがあればそれを優先。
40
- 無ければ、よくある列名 → 最初の object 列 → 先頭列 の順で選択。
41
- """
42
  if hint and hint in df.columns:
43
  return hint
44
  candidates = [
@@ -48,14 +54,12 @@ def guess_text_col(df: pd.DataFrame, hint: str | None) -> str:
48
  for c in candidates:
49
  if c in df.columns:
50
  return c
51
- # 最初の文字列(=object)列
52
  for c in df.columns:
53
  if df[c].dtype == "object":
54
  return c
55
- # どうしても見つからなければ先頭列
56
  return df.columns[0]
 
57
 
58
- # ---------------------------------------------------------------
59
 
60
  def run_analysis(
61
  text_blob: str,
@@ -67,28 +71,24 @@ def run_analysis(
67
  csv_file,
68
  csv_text_col: str,
69
  out_lang_ui: str, # "日本語"/"English"/"自動"
70
- ) -> Tuple[str, Any, pd.DataFrame, str]:
71
- # 言語コードに変換
72
  lang_map = {"日本語": "ja", "English": "en", "自動": "auto"}
73
  out_lang = lang_map.get(out_lang_ui, "ja")
74
 
75
- # 入力収集(CSV優先。列名未指定でも自動解決)
76
  texts: List[str] = []
77
  if csv_file is not None:
78
  try:
79
  df = read_csv_flex(csv_file)
80
  col = guess_text_col(df, csv_text_col.strip() if csv_text_col else None)
81
  if col not in df.columns:
82
- return f"⚠️ CSVに列 `{col}` が見つかりません。列名を指定するか、対象列を確認してください。", None, pd.DataFrame(), ""
83
- # 文字列化&欠損除去
84
- series = df[col].astype(str).fillna("")
85
- texts = [s.strip() for s in series.tolist() if str(s).strip()]
86
  except Exception as e:
87
  return f"⚠️ CSV読み込みでエラー: {e}", None, pd.DataFrame(), ""
88
  else:
89
  texts = [t.strip() for t in (text_blob or "").split("\n") if t.strip()]
90
 
91
- # 上限
92
  if max_items and max_items > 0:
93
  texts = texts[:max_items]
94
  if len(texts) < 2:
@@ -104,7 +104,7 @@ def run_analysis(
104
  output_lang=out_lang,
105
  )
106
 
107
- # 右ペイン(要約
108
  lines = []
109
  if result.get("dedup", {}).get("removed", 0) > 0:
110
  kept = result["dedup"]["kept"]
@@ -112,7 +112,7 @@ def run_analysis(
112
  lines.append(f"> 近似重複を {removed} 件除外(残り {kept} 件)\n")
113
 
114
  for c in result["clusters"]:
115
- title = c.get("label") or c.get("summary", {}).get("title") or f"Cluster {c['id']}"
116
  lines.append(f"### クラスタ {c['id']} (size={c['size']})")
117
  lines.append(f"**{title}**")
118
  if c.get("summary", {}).get("overview"):
@@ -141,10 +141,12 @@ def run_analysis(
141
  dl_json = json.dumps(result, ensure_ascii=False, indent=2)
142
  return "\n".join(lines), fig, df_out, dl_json
143
 
 
144
  def on_load_sample():
145
  df = load_sample_df()
146
  return "\n".join(df["text"].astype(str).tolist())
147
 
 
148
  def apply_labels(json_str: str, edited_df: pd.DataFrame) -> str:
149
  if not json_str:
150
  return json_str
@@ -154,7 +156,6 @@ def apply_labels(json_str: str, edited_df: pd.DataFrame) -> str:
154
  return json_str
155
  if not isinstance(edited_df, pd.DataFrame) or "id" not in edited_df or "label" not in edited_df:
156
  return json_str
157
-
158
  id2label = {int(r["id"]): str(r["label"]) for _, r in edited_df.iterrows() if str(r["label"]).strip() != ""}
159
  for c in payload.get("clusters", []):
160
  cid = int(c["id"])
@@ -164,6 +165,7 @@ def apply_labels(json_str: str, edited_df: pd.DataFrame) -> str:
164
  c["summary"]["title"] = id2label[cid]
165
  return json.dumps(payload, ensure_ascii=False, indent=2)
166
 
 
167
  def to_json_file(json_str: str):
168
  if not json_str:
169
  return gr.File.update(visible=False)
@@ -172,6 +174,7 @@ def to_json_file(json_str: str):
172
  f.write(json_str)
173
  return gr.File.update(value=path, visible=True)
174
 
 
175
  def to_csv_file(json_str: str):
176
  if not json_str:
177
  return gr.File.update(visible=False)
@@ -191,6 +194,7 @@ def to_csv_file(json_str: str):
191
  except Exception:
192
  return gr.File.update(visible=False)
193
 
 
194
  with gr.Blocks(title=APP_NAME, theme=gr.themes.Soft()) as demo:
195
  gr.Markdown(f"# {APP_NAME}")
196
 
 
1
  import os
2
  os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
3
 
4
+ from typing import List, Tuple
5
  import json
6
  import pandas as pd
7
  import gradio as gr
 
12
 
13
  APP_NAME = "🔎 SNS Analyzer v3(クラスタ+要約+感情+重複排除)— OpenAI API"
14
 
15
+
16
+ # ----------------- CSV ロバスト読み込み -----------------
17
+ def _file_path(file_obj) -> str | None:
18
+ # gr.File はパス(str)か、{'name','path'}のdict、または file-like を返すことがある
19
+ if file_obj is None:
20
+ return None
21
+ if isinstance(file_obj, str):
22
+ return file_obj
23
+ if hasattr(file_obj, "name"):
24
+ return file_obj.name
25
+ if isinstance(file_obj, dict):
26
+ return file_obj.get("path") or file_obj.get("name")
27
+ return None
28
 
29
  def read_csv_flex(file_obj) -> pd.DataFrame:
30
+ path = _file_path(file_obj)
31
+ if not path or not os.path.exists(path):
32
+ raise RuntimeError("アップロードされた CSV のパスが取得きませんでした")
 
33
  encodings = ["utf-8", "utf-8-sig", "cp932", "shift_jis", "latin1"]
34
  seps = [None, ",", "\t", ";", "|"]
35
  last_err = None
36
  for enc in encodings:
37
  for sep in seps:
38
  try:
39
+ df = pd.read_csv(path, encoding=enc, sep=sep, engine="python", on_bad_lines="skip")
 
40
  if isinstance(df, pd.DataFrame) and df.shape[1] >= 1:
41
  return df
42
  except Exception as e:
 
45
  raise RuntimeError(f"CSV 読み込みに失敗しました(encoding/区切りの自動判定に失敗): {last_err}")
46
 
47
  def guess_text_col(df: pd.DataFrame, hint: str | None) -> str:
 
 
 
 
48
  if hint and hint in df.columns:
49
  return hint
50
  candidates = [
 
54
  for c in candidates:
55
  if c in df.columns:
56
  return c
 
57
  for c in df.columns:
58
  if df[c].dtype == "object":
59
  return c
 
60
  return df.columns[0]
61
+ # --------------------------------------------------------
62
 
 
63
 
64
  def run_analysis(
65
  text_blob: str,
 
71
  csv_file,
72
  csv_text_col: str,
73
  out_lang_ui: str, # "日本語"/"English"/"自動"
74
+ ) -> Tuple[str, any, pd.DataFrame, str]:
 
75
  lang_map = {"日本語": "ja", "English": "en", "自動": "auto"}
76
  out_lang = lang_map.get(out_lang_ui, "ja")
77
 
78
+ # 入力収集
79
  texts: List[str] = []
80
  if csv_file is not None:
81
  try:
82
  df = read_csv_flex(csv_file)
83
  col = guess_text_col(df, csv_text_col.strip() if csv_text_col else None)
84
  if col not in df.columns:
85
+ return f"⚠️ CSVに列 `{col}` が見つかりません。列名を指定してください。", None, pd.DataFrame(), ""
86
+ texts = [str(s).strip() for s in df[col].astype(str).fillna("").tolist() if str(s).strip()]
 
 
87
  except Exception as e:
88
  return f"⚠️ CSV読み込みでエラー: {e}", None, pd.DataFrame(), ""
89
  else:
90
  texts = [t.strip() for t in (text_blob or "").split("\n") if t.strip()]
91
 
 
92
  if max_items and max_items > 0:
93
  texts = texts[:max_items]
94
  if len(texts) < 2:
 
104
  output_lang=out_lang,
105
  )
106
 
107
+ # 要約の表示(右ペイン)
108
  lines = []
109
  if result.get("dedup", {}).get("removed", 0) > 0:
110
  kept = result["dedup"]["kept"]
 
112
  lines.append(f"> 近似重複を {removed} 件除外(残り {kept} 件)\n")
113
 
114
  for c in result["clusters"]:
115
+ title = c.get("label") or c.get("summary", {}).get("title") or f"クラスタ {c['id']}"
116
  lines.append(f"### クラスタ {c['id']} (size={c['size']})")
117
  lines.append(f"**{title}**")
118
  if c.get("summary", {}).get("overview"):
 
141
  dl_json = json.dumps(result, ensure_ascii=False, indent=2)
142
  return "\n".join(lines), fig, df_out, dl_json
143
 
144
+
145
  def on_load_sample():
146
  df = load_sample_df()
147
  return "\n".join(df["text"].astype(str).tolist())
148
 
149
+
150
  def apply_labels(json_str: str, edited_df: pd.DataFrame) -> str:
151
  if not json_str:
152
  return json_str
 
156
  return json_str
157
  if not isinstance(edited_df, pd.DataFrame) or "id" not in edited_df or "label" not in edited_df:
158
  return json_str
 
159
  id2label = {int(r["id"]): str(r["label"]) for _, r in edited_df.iterrows() if str(r["label"]).strip() != ""}
160
  for c in payload.get("clusters", []):
161
  cid = int(c["id"])
 
165
  c["summary"]["title"] = id2label[cid]
166
  return json.dumps(payload, ensure_ascii=False, indent=2)
167
 
168
+
169
  def to_json_file(json_str: str):
170
  if not json_str:
171
  return gr.File.update(visible=False)
 
174
  f.write(json_str)
175
  return gr.File.update(value=path, visible=True)
176
 
177
+
178
  def to_csv_file(json_str: str):
179
  if not json_str:
180
  return gr.File.update(visible=False)
 
194
  except Exception:
195
  return gr.File.update(visible=False)
196
 
197
+
198
  with gr.Blocks(title=APP_NAME, theme=gr.themes.Soft()) as demo:
199
  gr.Markdown(f"# {APP_NAME}")
200