cormort committed on
Commit
aac27e4
·
verified ·
1 Parent(s): e133104

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -150
app.py CHANGED
@@ -15,7 +15,7 @@ from pdfminer.high_level import extract_text
15
  from docx import Document
16
  from bs4 import BeautifulSoup
17
 
18
- # --- 新增:翻譯套件引用 (容錯處理) ---
19
  try:
20
  from deep_translator import GoogleTranslator
21
  HAS_TRANSLATOR = True
@@ -23,7 +23,7 @@ except ImportError:
23
  HAS_TRANSLATOR = False
24
  print("Warning: deep-translator not installed. Translation features will be limited.")
25
 
26
- # Global lock for file access to prevent race conditions during log/load
27
  data_lock = threading.Lock()
28
 
29
  # Config
@@ -31,7 +31,10 @@ DATA_FSMM = "fsmm_data.json"
31
  DATA_PROPOSALS = "proposals_data.json"
32
  DATA_QUERIES = "user_queries.json"
33
 
34
- # --- FSMM Extraction Logic ---
 
 
 
35
  def extract_from_pdf(file_path):
36
  try:
37
  text = extract_text(file_path)
@@ -48,14 +51,76 @@ def extract_from_docx(file_path):
48
  print(f"Error reading DOCX {file_path}: {e}")
49
  return ""
50
 
51
- def parse_fsmm_filename(filename):
52
  """
53
- 解析檔名以獲取年份類型。
54
- 包含針對特定重要文件的硬編對照表。
 
 
55
  """
56
- fn_lower = filename.lower()
57
 
58
- # 1. 針對已知的重要 APEC 文件建立關鍵字對照表
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  known_docs = {
60
  "putrajaya vision": (2020, "Leaders' Declaration"),
61
  "trujillo principles": (2024, "FSMM Principles"),
@@ -66,35 +131,30 @@ def parse_fsmm_filename(filename):
66
  "bangkok goals": (2022, "Leaders' Declaration")
67
  }
68
 
69
- # 檢查是否命中特定文件
70
  for key, (year, doc_type) in known_docs.items():
71
  if key in fn_lower:
72
  return year, doc_type
73
 
74
- # 2. 原有的正規表達式邏輯 (YY_fsmm_type)
75
  match = re.search(r'(\d+)_fsmm_(\w+)', filename)
76
  if match:
77
  year_short = match.group(1)
78
  type_code = match.group(2)
79
  year = int(year_short) + 2000 if len(year_short) == 2 else int(year_short)
80
-
81
  type_map = {
82
- 'jms': 'Joint Ministerial Statement (聯合部長聲明)',
83
- 'stmt': 'Statement (聲明)',
84
- 'declaration': 'Declaration (宣言)',
85
- 'roadmap': 'Roadmap (路徑圖)'
86
  }
87
  type_name = type_map.get(type_code.lower(), type_code.upper())
88
  return year, type_name
89
 
90
- # 3. 如果都失敗,嘗試抓取年份作為最後手段
91
  year_match = re.search(r'(20\d{2})|19\d{2}', filename)
92
  if year_match:
93
  return int(year_match.group(0)), "Other Document"
94
 
95
  return None, None
96
 
97
- # --- Proposal Extraction Logic ---
98
  def parse_proposal(file_path):
99
  try:
100
  with open(file_path, 'r', encoding='utf-8') as f:
@@ -115,7 +175,10 @@ def parse_proposal(file_path):
115
  print(f"Error parsing proposal {file_path}: {e}")
116
  return None
117
 
118
- # --- Data Management ---
 
 
 
119
  def load_json(filepath):
120
  with data_lock:
121
  if not os.path.exists(filepath):
@@ -136,13 +199,13 @@ def log_query(query):
136
  return
137
  logs = load_json(DATA_QUERIES)
138
  now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
139
- logs.append({
140
- "timestamp": now,
141
- "query": str(query).strip()
142
- })
143
  save_json(DATA_QUERIES, logs)
144
 
145
- # --- Upload Logic (Enhanced for complex APEC docs) ---
 
 
 
146
  def handle_unified_upload(file_objs):
147
  if not file_objs:
148
  return "請選擇檔案上傳。"
@@ -172,7 +235,6 @@ def handle_unified_upload(file_objs):
172
  # --- 處理文件 (PDF/DOCX) ---
173
  else:
174
  year, doc_type = parse_fsmm_filename(original_filename)
175
-
176
  if not year:
177
  results.append(f"⚠️ {original_filename}: 無法辨識年份,已跳過。")
178
  continue
@@ -190,40 +252,8 @@ def handle_unified_upload(file_objs):
190
  results.append(f"❌ {original_filename}: 內容提取失敗。")
191
  continue
192
 
193
- # === [升級版] 文字清理與切分邏輯 ===
194
-
195
- # 1. 清理頁碼與雜訊
196
- content = re.sub(r'(?m)^\s*\d+\s*$', '', content)
197
- content = re.sub(r'--- PAGE \d+ ---', '', content)
198
-
199
- # 2. 統一換行符號
200
- lines = content.split('\n')
201
- clean_lines = [l.strip() for l in lines if l.strip()]
202
- full_text = '\n'.join(clean_lines)
203
-
204
- # 3. 智慧切分 Pattern
205
- split_pattern = r'(?m)^((?:PART\s+[A-Z]+|SECTION\s+[A-Z]|[IVX]+\.|[a-z]\.|(?<!\d)\d{1,2}\.)\s+)'
206
-
207
- parts = re.split(split_pattern, full_text)
208
-
209
- composed_paras = []
210
-
211
- if parts[0].strip():
212
- composed_paras.append(parts[0].strip())
213
-
214
- i = 1
215
- while i < len(parts) - 1:
216
- header_marker = parts[i].strip()
217
- body_text = parts[i+1].strip() if i + 1 < len(parts) else ""
218
- full_para = f"{header_marker} {body_text}"
219
- composed_paras.append(full_para)
220
- i += 2
221
-
222
- # 4. Fallback slicing
223
- if len(composed_paras) < 3 and len(full_text) > 1000:
224
- chunk_size = 800
225
- composed_paras = [full_text[j:j+chunk_size] for j in range(0, len(full_text), chunk_size)]
226
- composed_paras = [f"[Auto-Segment {idx+1}] {t}" for idx, t in enumerate(composed_paras)]
227
 
228
  new_entries = []
229
  for idx, para in enumerate(composed_paras):
@@ -244,23 +274,18 @@ def handle_unified_upload(file_objs):
244
 
245
  return "\n".join(results)
246
 
247
- # --- Translation Logic (New) ---
 
 
 
248
  def perform_translation(text, target_lang='zh-TW'):
249
- """
250
- 執行翻譯功能。如果 deep-translator 未安裝,返回提示。
251
- """
252
- if not text:
253
- return ""
254
-
255
- # 建立 Google Translate 連結 (Fallback)
256
  encoded_text = urllib.parse.quote(text)
257
  google_trans_url = f"https://translate.google.com/?sl=auto&tl={target_lang}&text={encoded_text}&op=translate"
258
 
259
  trans_result = ""
260
-
261
  if HAS_TRANSLATOR:
262
  try:
263
- # 限制長度以防 API 報錯,若太長則建議用連結
264
  if len(text) > 4500:
265
  trans_result = "⚠️ 文本過長,請使用下方按鈕前往 Google 翻譯。"
266
  else:
@@ -270,42 +295,44 @@ def perform_translation(text, target_lang='zh-TW'):
270
  trans_result = f"⚠️ 翻譯服務暫時不可用 ({str(e)})。請使用下方按鈕。"
271
  else:
272
  trans_result = "⚠️ 伺服器未安裝 deep-translator 套件。請使用下方按鈕。"
273
-
274
  return trans_result, google_trans_url
275
 
276
  def translate_ui_action(text):
277
  t_text, t_url = perform_translation(text)
278
- # 返回: 翻譯結果文本, 顯示按鈕(HTML)
279
- btn_html = f"""
280
- <div style="margin-top: 10px;">
281
- <a href="{t_url}" target="_blank" style="
282
- background-color: #4285F4; color: white; padding: 8px 16px;
283
- text-decoration: none; border-radius: 4px; font-weight: bold;
284
- display: inline-block;">
285
- 🌍 在 Google 翻譯中開啟
286
- </a>
287
- </div>
288
- """
289
  return t_text, btn_html
290
 
291
- # --- Search Logic ---
 
 
 
 
 
 
 
 
292
  def search_proposals(query, year, economy):
 
 
 
 
293
  log_query(f"Proposals Q:{query}|Y:{year}|E:{economy}")
294
  data = load_json(DATA_PROPOSALS)
295
 
296
  filtered = []
297
- q = query.lower() if query else ""
 
298
  y = str(year) if year else ""
299
  e = str(economy) if economy else ""
300
 
301
  for item in data:
302
- if q:
303
- found_q = False
304
- for k, v in item.items():
305
- if q in str(v).lower():
306
- found_q = True
307
- break
308
- if not found_q: continue
309
 
310
  if y and str(item.get('Project Year', '')).strip() != y:
311
  continue
@@ -319,9 +346,19 @@ def search_proposals(query, year, economy):
319
 
320
  html = ""
321
  for p in filtered[:20]:
 
 
 
 
 
 
 
 
 
 
322
  html += f"""
323
  <div style="border: 1px solid #cbd5e1; padding: 15px; border-radius: 8px; margin-bottom: 15px; background: #fff; box-shadow: 0 1px 3px rgba(0,0,0,0.1);">
324
- <div style="font-weight: bold; color: #1e293b; margin-bottom: 8px; font-size: 1.1em;">{p.get('Project Title', '無標題')}</div>
325
  <div style="display: flex; gap: 8px; flex-wrap: wrap; font-size: 0.85em; margin-bottom: 10px;">
326
  <span style="background: #e2e8f0; padding: 2px 8px; border-radius: 4px;">{p.get('Project No.', '無編號')}</span>
327
  <span style="background: #f1f5f9; padding: 2px 8px; border-radius: 4px;">{p.get('Proposing Economy(ies)', '未知經濟體')}</span>
@@ -329,7 +366,7 @@ def search_proposals(query, year, economy):
329
  <details style="font-size: 0.9em; color: #475569;">
330
  <summary style="cursor: pointer; color: #2563eb;">詳細資訊</summary>
331
  <div style="margin-top: 10px; display: grid; grid-template-columns: 1fr 2fr; gap:5px;">
332
- {"".join([f"<b>{k}:</b> <div>{v}</div>" for k, v in p.items() if k not in ['Project Title', 'Project No.']])}
333
  </div>
334
  </details>
335
  </div>
@@ -337,13 +374,24 @@ def search_proposals(query, year, economy):
337
  return html
338
 
339
  def search_fsmm(query, year, doc_type, filename_filter):
 
 
 
340
  log_query(f"FSMM {query}")
341
  data = load_json(DATA_FSMM)
342
  filtered = []
343
- q = query.lower() if query else ""
 
 
344
 
345
  for item in data:
346
- if q and q not in item['content'].lower(): continue
 
 
 
 
 
 
347
  if year and str(item['year']) != year: continue
348
  if doc_type and item['type'] != doc_type: continue
349
  if filename_filter and item['filename'] != filename_filter: continue
@@ -356,18 +404,11 @@ def search_fsmm(query, year, doc_type, filename_filter):
356
  else:
357
  filtered.sort(key=lambda x: (x['year'], x['type'], x['paragraph_index']), reverse=True)
358
 
359
- def highlight_text(text, keyword):
360
- if not keyword:
361
- return text
362
- pattern = re.compile(re.escape(keyword), re.IGNORECASE)
363
- return pattern.sub(
364
- f'<span style="background: #fef08a; padding: 1px 3px; border-radius: 3px; font-weight: bold;">{keyword}</span>',
365
- text
366
- )
367
-
368
  html = ""
369
  for item in filtered[:100]:
370
- content = highlight_text(item['content'], query) if query else item['content']
 
 
371
  html += f"""
372
  <div style="border: 1px solid #e2e8f0; padding: 15px; border-radius: 8px; margin-bottom: 15px; background: white; box-shadow: 0 1px 3px rgba(0,0,0,0.1);">
373
  <div style="display: flex; gap: 8px; margin-bottom: 8px; flex-wrap: wrap;">
@@ -397,7 +438,10 @@ def get_fsmm_full_text(filename):
397
 
398
  return full_text
399
 
400
- # --- Dashboard Logic ---
 
 
 
401
  def get_dashboard_stats():
402
  proposals = load_json(DATA_PROPOSALS)
403
  fsmm = load_json(DATA_FSMM)
@@ -465,12 +509,10 @@ def get_dashboard_stats():
465
 
466
  return summary_html, fig_p_year, fig_p_econ, fig_queries, df_ye_stats, df_ec_stats, df_qu_stats
467
 
468
- # --- UI Helpers ---
469
  def get_p_choices_raw(year_filter=None):
470
  try:
471
  data = load_json(DATA_PROPOSALS)
472
- if not data:
473
- return ([""],), ([""],)
474
  years = sorted(list(set(str(it.get('Project Year', '')).strip() for it in data if it.get('Project Year'))), reverse=True)
475
  if year_filter:
476
  econs = sorted(list(set(str(it.get('Proposing Economy(ies)', '')).strip()
@@ -478,21 +520,18 @@ def get_p_choices_raw(year_filter=None):
478
  else:
479
  econs = sorted(list(set(str(it.get('Proposing Economy(ies)', '')).strip() for it in data if it.get('Proposing Economy(ies)'))))
480
  return ([""] + years), ([""] + econs)
481
- except Exception as e:
482
- print(f"Error in get_p_choices_raw: {e}")
483
  return ([""],), ([""],)
484
 
485
  def get_fsmm_choices_raw():
486
  try:
487
  data = load_json(DATA_FSMM)
488
- if not data:
489
- return ([""],), ([""],), ([""],)
490
  years = sorted(list(set(str(it.get('year', '')) for it in data if it.get('year'))), reverse=True)
491
  types = sorted(list(set(str(it.get('type', '')) for it in data if it.get('type'))))
492
  filenames = sorted(list(set(str(it.get('filename', '')) for it in data if it.get('filename'))), reverse=True)
493
  return ([""] + years), ([""] + types), ([""] + filenames)
494
- except Exception as e:
495
- print(f"Error in get_fsmm_choices_raw: {e}")
496
  return ([""],), ([""],), ([""],)
497
 
498
  def refresh_p_choices(year_filter=None):
@@ -517,42 +556,22 @@ def handle_drilldown(evt: gr.SelectData):
517
 
518
  res_fsmm = search_fsmm(selected_val, "", "", "")
519
  return gr.update(selected="fsmm_tab"), res_fsmm, ""
520
- except Exception as e:
521
- print(f"Error in handle_drilldown: {e}")
522
  return gr.update(), "", ""
523
 
524
- # --- UI UI UI ---
 
 
525
  DASHBOARD_CSS = """
526
- .drilldown-df table tr:hover {
527
- cursor: pointer !important;
528
- background-color: #f0f9ff !important;
529
- position: relative;
530
- }
531
- .drilldown-df table tr:hover::after {
532
- content: "🖱️ 點擊跳轉查詢";
533
- position: absolute;
534
- right: 10px;
535
- top: 50%;
536
- transform: translateY(-50%);
537
- font-size: 0.8em;
538
- color: #3b82f6;
539
- background: #fff;
540
- padding: 2px 6px;
541
- border-radius: 4px;
542
- box-shadow: 0 2px 4px rgba(0,0,0,0.1);
543
- pointer-events: none;
544
- }
545
- .drilldown-df table tr:hover td {
546
- color: #2563eb !important;
547
- font-weight: bold;
548
- }
549
  """
550
 
551
  with gr.Blocks(title="APEC 綜合查詢系統", css=DASHBOARD_CSS) as demo:
552
  gr.Markdown("# 🌐 APEC 綜合查詢系統")
553
  gr.Markdown("整合 APEC 提案項目與重要宣言內容的統一檢索平台。")
554
 
555
- # --- 新增:快速翻譯工具區塊 ---
556
  with gr.Accordion("🛠️ 快速翻譯工具箱 (Translation Tool)", open=False):
557
  gr.Markdown("將英文內容貼在此處,可快速翻譯成繁體中文。若內容過長建議使用 Google 翻譯按鈕。")
558
  with gr.Row():
@@ -561,9 +580,7 @@ with gr.Blocks(title="APEC 綜合查詢系統", css=DASHBOARD_CSS) as demo:
561
  with gr.Row():
562
  t_btn = gr.Button("🔄 執行翻譯 (Translate)", variant="secondary")
563
  t_link_html = gr.HTML()
564
-
565
  t_btn.click(translate_ui_action, inputs=[t_input], outputs=[t_output, t_link_html])
566
- # ---------------------------
567
 
568
  with gr.Tabs() as tabs:
569
  with gr.Tab("📊 統計儀表板") as dash_tab:
@@ -588,7 +605,7 @@ with gr.Blocks(title="APEC 綜合查詢系統", css=DASHBOARD_CSS) as demo:
588
  with gr.Tab("🍎 政策文件查詢", id="fsmm_tab") as fsmm_tab:
589
  gr.Markdown("### 步驟 1:設定篩選條件 (FSMM / 領袖宣言 / 路線圖)")
590
  with gr.Row():
591
- f_query = gr.Textbox(label="🔍 關鍵字搜尋", placeholder="例如:resilience, climate, women...")
592
  f_year = gr.Dropdown(label="📅 年份", choices=[""])
593
  f_type = gr.Dropdown(label="📝 類型", choices=[""])
594
  f_doc = gr.Dropdown(label="📄 特定檔案 (必選以查看全文)", choices=[""])
@@ -601,7 +618,6 @@ with gr.Blocks(title="APEC 綜合查詢系統", css=DASHBOARD_CSS) as demo:
601
  gr.Markdown("---")
602
  f_out = gr.HTML(label="輸出區域")
603
 
604
- # Events
605
  f_btn.click(search_fsmm, inputs=[f_query, f_year, f_type, f_doc], outputs=[f_out])
606
  f_full_btn.click(get_fsmm_full_text, inputs=[f_doc], outputs=[f_out])
607
  f_refresh_btn.click(refresh_fsmm_choices, outputs=[f_year, f_type, f_doc])
@@ -609,7 +625,7 @@ with gr.Blocks(title="APEC 綜合查詢系統", css=DASHBOARD_CSS) as demo:
609
 
610
  with gr.Tab("📊 APEC 提案查詢", id="proposal_tab") as proposal_tab:
611
  with gr.Row():
612
- p_query = gr.Textbox(label="關鍵字搜尋所有欄位", placeholder="例如:Thailand, agriculture, energy...")
613
  p_year = gr.Dropdown(label="📅 年份", choices=[""])
614
  p_econ = gr.Dropdown(label="Proposing Economy", choices=[""])
615
 
@@ -619,7 +635,6 @@ with gr.Blocks(title="APEC 綜合查詢系統", css=DASHBOARD_CSS) as demo:
619
 
620
  p_out = gr.HTML()
621
 
622
- # Events
623
  p_btn.click(search_proposals, inputs=[p_query, p_year, p_econ], outputs=[p_out])
624
  p_refresh_btn.click(refresh_p_choices, outputs=[p_year, p_econ])
625
  p_year.change(lambda y: refresh_p_choices(y)[1], inputs=[p_year], outputs=[p_econ])
@@ -634,19 +649,16 @@ with gr.Blocks(title="APEC 綜合查詢系統", css=DASHBOARD_CSS) as demo:
634
  u_status = gr.Textbox(label="處理結果")
635
  u_btn.click(handle_unified_upload, inputs=[u_file], outputs=[u_status])
636
 
637
- # Drill-down Events: outputs=[tabs, f_out, p_out]
638
  d_ye_df.select(handle_drilldown, outputs=[tabs, f_out, p_out])
639
  d_ec_df.select(handle_drilldown, outputs=[tabs, f_out, p_out])
640
  d_qu_df.select(handle_drilldown, outputs=[tabs, f_out, p_out])
641
 
642
- # Initial load of choices
643
  def init_choices():
644
  try:
645
  fc = refresh_fsmm_choices()
646
  pc = refresh_p_choices()
647
  return fc + pc
648
- except Exception as e:
649
- print(f"Error in init_choices: {e}")
650
  return (gr.update(), gr.update(), gr.update(), gr.update(), gr.update())
651
 
652
  demo.load(init_choices, outputs=[f_year, f_type, f_doc, p_year, p_econ])
 
15
  from docx import Document
16
  from bs4 import BeautifulSoup
17
 
18
+ # --- 翻譯套件引用 (容錯處理) ---
19
  try:
20
  from deep_translator import GoogleTranslator
21
  HAS_TRANSLATOR = True
 
23
  HAS_TRANSLATOR = False
24
  print("Warning: deep-translator not installed. Translation features will be limited.")
25
 
26
+ # Global lock for file access
27
  data_lock = threading.Lock()
28
 
29
  # Config
 
31
  DATA_PROPOSALS = "proposals_data.json"
32
  DATA_QUERIES = "user_queries.json"
33
 
34
+ # ==========================================
35
+ # 核心邏輯區:文字處理與智能分段
36
+ # ==========================================
37
+
38
  def extract_from_pdf(file_path):
39
  try:
40
  text = extract_text(file_path)
 
51
  print(f"Error reading DOCX {file_path}: {e}")
52
  return ""
53
 
54
def process_and_segment_text(raw_text):
    """
    Clean text extracted from a PDF/Word document and split it into paragraphs.

    Steps:
      1. Remove lines that consist only of digits (page numbers) and
         ``--- PAGE n ---`` markers.
      2. Undo hard line wraps: a line is glued to the previous one unless the
         previous one ends with sentence-final punctuation or a blank line
         separates them. A trailing hyphen is treated as a word broken across
         lines and is removed when joining.
      3. Split the merged text at lines that start with a structural marker
         (``PART X``, ``SECTION X``, roman numerals, ``1.``, ``(a)``).
      4. If step 3 produces fewer than three segments, fall back to one
         segment per merged line (lines longer than 20 characters), but only
         when that actually yields more segments.

    Args:
        raw_text: The extracted document text; may be empty or None.

    Returns:
        A list of paragraph strings; an empty list when there is no text.
    """
    if not raw_text:
        return []

    # 1. Strip basic noise: digit-only lines (page numbers) and page markers.
    text = re.sub(r'(?m)^\s*\d+\s*$', '', raw_text)
    text = re.sub(r'--- PAGE \d+ ---', '', text)

    # 2. Merge hard-wrapped lines. A buffer that does not end with one of
    #    these characters is assumed to continue on the next line.
    sentence_enders = ('.', '!', '?', ':', ';', '"', '”', '’')
    merged_lines = []
    buffer = ""

    for line in text.split('\n'):
        line = line.strip()
        if not line:
            # A blank line closes the paragraph currently being collected.
            if buffer:
                merged_lines.append(buffer)
                buffer = ""
            continue

        if not buffer:
            buffer = line
        elif buffer.endswith(sentence_enders):
            merged_lines.append(buffer)
            buffer = line
        elif buffer.endswith('-'):
            # Hyphenated line break, e.g. "co-\noperate" -> "cooperate".
            buffer = buffer[:-1] + line
        else:
            buffer += " " + line

    if buffer:
        merged_lines.append(buffer)

    # 3. Structural split. The single capturing group makes re.split keep the
    #    marker lines themselves in the result.
    full_text = "\n".join(merged_lines)
    split_pattern = r'(?m)^((?:PART\s+[A-Z]+|SECTION\s+[A-Z]|[IVX]+\.|(?:\d{1,2}\.)|(?:\([a-z]\)))\s+.*)'
    parts = re.split(split_pattern, full_text)

    final_segments = []
    if parts[0].strip():
        final_segments.append(parts[0].strip())

    for part in parts[1:]:
        s = part.strip()
        if len(s) > 10:  # drop fragments too short to be real content
            final_segments.append(s)

    # 4. Fallback: one segment per merged line. It is only adopted when it
    #    yields more segments than the structural split; otherwise a short
    #    document (every line <= 20 chars) would be reduced to nothing.
    if len(final_segments) < 3:
        line_segments = [p.strip() for p in full_text.split('\n') if len(p.strip()) > 20]
        if len(line_segments) > len(final_segments):
            final_segments = line_segments

    return final_segments
120
+
121
+ def parse_fsmm_filename(filename):
122
+ """解析檔名以獲取年份與類型"""
123
+ fn_lower = filename.lower()
124
  known_docs = {
125
  "putrajaya vision": (2020, "Leaders' Declaration"),
126
  "trujillo principles": (2024, "FSMM Principles"),
 
131
  "bangkok goals": (2022, "Leaders' Declaration")
132
  }
133
 
 
134
  for key, (year, doc_type) in known_docs.items():
135
  if key in fn_lower:
136
  return year, doc_type
137
 
 
138
  match = re.search(r'(\d+)_fsmm_(\w+)', filename)
139
  if match:
140
  year_short = match.group(1)
141
  type_code = match.group(2)
142
  year = int(year_short) + 2000 if len(year_short) == 2 else int(year_short)
 
143
  type_map = {
144
+ 'jms': 'Joint Ministerial Statement',
145
+ 'stmt': 'Statement',
146
+ 'declaration': 'Declaration',
147
+ 'roadmap': 'Roadmap'
148
  }
149
  type_name = type_map.get(type_code.lower(), type_code.upper())
150
  return year, type_name
151
 
 
152
  year_match = re.search(r'(20\d{2})|19\d{2}', filename)
153
  if year_match:
154
  return int(year_match.group(0)), "Other Document"
155
 
156
  return None, None
157
 
 
158
  def parse_proposal(file_path):
159
  try:
160
  with open(file_path, 'r', encoding='utf-8') as f:
 
175
  print(f"Error parsing proposal {file_path}: {e}")
176
  return None
177
 
178
+ # ==========================================
179
+ # 資料存取區
180
+ # ==========================================
181
+
182
  def load_json(filepath):
183
  with data_lock:
184
  if not os.path.exists(filepath):
 
199
  return
200
  logs = load_json(DATA_QUERIES)
201
  now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
202
+ logs.append({"timestamp": now, "query": str(query).strip()})
 
 
 
203
  save_json(DATA_QUERIES, logs)
204
 
205
+ # ==========================================
206
+ # 上傳處理區 (應用智能分段)
207
+ # ==========================================
208
+
209
  def handle_unified_upload(file_objs):
210
  if not file_objs:
211
  return "請選擇檔案上傳。"
 
235
  # --- 處理文件 (PDF/DOCX) ---
236
  else:
237
  year, doc_type = parse_fsmm_filename(original_filename)
 
238
  if not year:
239
  results.append(f"⚠️ {original_filename}: 無法辨識年份,已跳過。")
240
  continue
 
252
  results.append(f"❌ {original_filename}: 內容提取失敗。")
253
  continue
254
 
255
+ # 使用新的智能邏輯
256
+ composed_paras = process_and_segment_text(content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
  new_entries = []
259
  for idx, para in enumerate(composed_paras):
 
274
 
275
  return "\n".join(results)
276
 
277
+ # ==========================================
278
+ # 搜尋與工具邏輯 (多關鍵字支援)
279
+ # ==========================================
280
+
281
  def perform_translation(text, target_lang='zh-TW'):
282
+ if not text: return ""
 
 
 
 
 
 
283
  encoded_text = urllib.parse.quote(text)
284
  google_trans_url = f"https://translate.google.com/?sl=auto&tl={target_lang}&text={encoded_text}&op=translate"
285
 
286
  trans_result = ""
 
287
  if HAS_TRANSLATOR:
288
  try:
 
289
  if len(text) > 4500:
290
  trans_result = "⚠️ 文本過長,請使用下方按鈕前往 Google 翻譯。"
291
  else:
 
295
  trans_result = f"⚠️ 翻譯服務暫時不可用 ({str(e)})。請使用下方按鈕。"
296
  else:
297
  trans_result = "⚠️ 伺服器未安裝 deep-translator 套件。請使用下方按鈕。"
 
298
  return trans_result, google_trans_url
299
 
300
  def translate_ui_action(text):
301
  t_text, t_url = perform_translation(text)
302
+ btn_html = f"""<div style="margin-top: 10px;"><a href="{t_url}" target="_blank" style="background-color: #4285F4; color: white; padding: 8px 16px; text-decoration: none; border-radius: 4px; font-weight: bold; display: inline-block;">🌍 在 Google 翻譯中開啟</a></div>"""
 
 
 
 
 
 
 
 
 
 
303
  return t_text, btn_html
304
 
305
def highlight_keywords(text, keywords):
    """
    Wrap every occurrence of the given keywords in a yellow highlight span.

    Matching is case-insensitive and the matched text keeps its original
    casing. All keywords are applied in a single pass, so a keyword can never
    match inside the ``<span ...>`` markup inserted for another keyword
    (e.g. searching for "span", "bold" or "font").

    Args:
        text: The value to highlight; converted with ``str()`` before matching.
        keywords: An iterable of keyword strings.

    Returns:
        The highlighted string, or ``text`` unchanged when either ``text`` or
        ``keywords`` is empty.
    """
    if not keywords or not text:
        return text

    val = str(text)
    # Ignore empty keywords (an empty pattern would match at every position)
    # and try longer keywords first so they win over their own prefixes.
    terms = sorted({k for k in keywords if k}, key=lambda k: (-len(k), k))
    if not terms:
        return val

    pattern = re.compile("|".join(re.escape(k) for k in terms), re.IGNORECASE)
    # A callable replacement avoids any escape-sequence handling in the template.
    return pattern.sub(
        lambda m: f'<span style="background: #fef08a; font-weight: bold;">{m.group(0)}</span>',
        val,
    )
313
+
314
  def search_proposals(query, year, economy):
315
+ """
316
+ 提案搜尋:支援多關鍵字 AND 搜尋
317
+ 例如輸入 "Thailand agriculture" -> 找出同時包含 Thailand 和 agriculture 的提案
318
+ """
319
  log_query(f"Proposals Q:{query}|Y:{year}|E:{economy}")
320
  data = load_json(DATA_PROPOSALS)
321
 
322
  filtered = []
323
+ # 1. 關鍵字預處理
324
+ keywords = query.lower().split() if query else []
325
  y = str(year) if year else ""
326
  e = str(economy) if economy else ""
327
 
328
  for item in data:
329
+ # 建立全文字串以供檢查
330
+ full_text_search_content = " ".join([str(v) for v in item.values()]).lower()
331
+
332
+ # 2. 關鍵字 AND 邏輯檢查
333
+ if keywords:
334
+ if not all(k in full_text_search_content for k in keywords):
335
+ continue
336
 
337
  if y and str(item.get('Project Year', '')).strip() != y:
338
  continue
 
346
 
347
  html = ""
348
  for p in filtered[:20]:
349
+ # 3. 顯示時 Highlight 標題
350
+ title = highlight_keywords(p.get('Project Title', '無標題'), keywords)
351
+
352
+ # 4. 組裝詳細資訊 HTML (含 Highlight)
353
+ details_html = ""
354
+ for k, v in p.items():
355
+ if k not in ['Project Title', 'Project No.']:
356
+ val_highlighted = highlight_keywords(str(v), keywords)
357
+ details_html += f"<b>{k}:</b> <div>{val_highlighted}</div>"
358
+
359
  html += f"""
360
  <div style="border: 1px solid #cbd5e1; padding: 15px; border-radius: 8px; margin-bottom: 15px; background: #fff; box-shadow: 0 1px 3px rgba(0,0,0,0.1);">
361
+ <div style="font-weight: bold; color: #1e293b; margin-bottom: 8px; font-size: 1.1em;">{title}</div>
362
  <div style="display: flex; gap: 8px; flex-wrap: wrap; font-size: 0.85em; margin-bottom: 10px;">
363
  <span style="background: #e2e8f0; padding: 2px 8px; border-radius: 4px;">{p.get('Project No.', '無編號')}</span>
364
  <span style="background: #f1f5f9; padding: 2px 8px; border-radius: 4px;">{p.get('Proposing Economy(ies)', '未知經濟體')}</span>
 
366
  <details style="font-size: 0.9em; color: #475569;">
367
  <summary style="cursor: pointer; color: #2563eb;">詳細資訊</summary>
368
  <div style="margin-top: 10px; display: grid; grid-template-columns: 1fr 2fr; gap:5px;">
369
+ {details_html}
370
  </div>
371
  </details>
372
  </div>
 
374
  return html
375
 
376
  def search_fsmm(query, year, doc_type, filename_filter):
377
+ """
378
+ 文件搜尋:支援多關鍵字 AND 搜尋
379
+ """
380
  log_query(f"FSMM {query}")
381
  data = load_json(DATA_FSMM)
382
  filtered = []
383
+
384
+ # 1. 關鍵字預處理
385
+ keywords = query.lower().split() if query else []
386
 
387
  for item in data:
388
+ content_lower = item['content'].lower()
389
+
390
+ # 2. AND 邏輯檢查
391
+ if keywords:
392
+ if not all(k in content_lower for k in keywords):
393
+ continue
394
+
395
  if year and str(item['year']) != year: continue
396
  if doc_type and item['type'] != doc_type: continue
397
  if filename_filter and item['filename'] != filename_filter: continue
 
404
  else:
405
  filtered.sort(key=lambda x: (x['year'], x['type'], x['paragraph_index']), reverse=True)
406
 
 
 
 
 
 
 
 
 
 
407
  html = ""
408
  for item in filtered[:100]:
409
+ # 3. Highlight 內容
410
+ content = highlight_keywords(item['content'], keywords)
411
+
412
  html += f"""
413
  <div style="border: 1px solid #e2e8f0; padding: 15px; border-radius: 8px; margin-bottom: 15px; background: white; box-shadow: 0 1px 3px rgba(0,0,0,0.1);">
414
  <div style="display: flex; gap: 8px; margin-bottom: 8px; flex-wrap: wrap;">
 
438
 
439
  return full_text
440
 
441
+ # ==========================================
442
+ # Dashboard & UI Helpers
443
+ # ==========================================
444
+
445
  def get_dashboard_stats():
446
  proposals = load_json(DATA_PROPOSALS)
447
  fsmm = load_json(DATA_FSMM)
 
509
 
510
  return summary_html, fig_p_year, fig_p_econ, fig_queries, df_ye_stats, df_ec_stats, df_qu_stats
511
 
 
512
  def get_p_choices_raw(year_filter=None):
513
  try:
514
  data = load_json(DATA_PROPOSALS)
515
+ if not data: return ([""],), ([""],)
 
516
  years = sorted(list(set(str(it.get('Project Year', '')).strip() for it in data if it.get('Project Year'))), reverse=True)
517
  if year_filter:
518
  econs = sorted(list(set(str(it.get('Proposing Economy(ies)', '')).strip()
 
520
  else:
521
  econs = sorted(list(set(str(it.get('Proposing Economy(ies)', '')).strip() for it in data if it.get('Proposing Economy(ies)'))))
522
  return ([""] + years), ([""] + econs)
523
+ except:
 
524
  return ([""],), ([""],)
525
 
526
  def get_fsmm_choices_raw():
527
  try:
528
  data = load_json(DATA_FSMM)
529
+ if not data: return ([""],), ([""],), ([""],)
 
530
  years = sorted(list(set(str(it.get('year', '')) for it in data if it.get('year'))), reverse=True)
531
  types = sorted(list(set(str(it.get('type', '')) for it in data if it.get('type'))))
532
  filenames = sorted(list(set(str(it.get('filename', '')) for it in data if it.get('filename'))), reverse=True)
533
  return ([""] + years), ([""] + types), ([""] + filenames)
534
+ except:
 
535
  return ([""],), ([""],), ([""],)
536
 
537
  def refresh_p_choices(year_filter=None):
 
556
 
557
  res_fsmm = search_fsmm(selected_val, "", "", "")
558
  return gr.update(selected="fsmm_tab"), res_fsmm, ""
559
+ except:
 
560
  return gr.update(), "", ""
561
 
562
+ # ==========================================
563
+ # Main UI
564
+ # ==========================================
565
  DASHBOARD_CSS = """
566
+ .drilldown-df table tr:hover { cursor: pointer !important; background-color: #f0f9ff !important; position: relative; }
567
+ .drilldown-df table tr:hover::after { content: "🖱️ 點擊跳轉查詢"; position: absolute; right: 10px; top: 50%; transform: translateY(-50%); font-size: 0.8em; color: #3b82f6; background: #fff; padding: 2px 6px; border-radius: 4px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); pointer-events: none; }
568
+ .drilldown-df table tr:hover td { color: #2563eb !important; font-weight: bold; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
569
  """
570
 
571
  with gr.Blocks(title="APEC 綜合查詢系統", css=DASHBOARD_CSS) as demo:
572
  gr.Markdown("# 🌐 APEC 綜合查詢系統")
573
  gr.Markdown("整合 APEC 提案項目與重要宣言內容的統一檢索平台。")
574
 
 
575
  with gr.Accordion("🛠️ 快速翻譯工具箱 (Translation Tool)", open=False):
576
  gr.Markdown("將英文內容貼在此處,可快速翻譯成繁體中文。若內容過長建議使用 Google 翻譯按鈕。")
577
  with gr.Row():
 
580
  with gr.Row():
581
  t_btn = gr.Button("🔄 執行翻譯 (Translate)", variant="secondary")
582
  t_link_html = gr.HTML()
 
583
  t_btn.click(translate_ui_action, inputs=[t_input], outputs=[t_output, t_link_html])
 
584
 
585
  with gr.Tabs() as tabs:
586
  with gr.Tab("📊 統計儀表板") as dash_tab:
 
605
  with gr.Tab("🍎 政策文件查詢", id="fsmm_tab") as fsmm_tab:
606
  gr.Markdown("### 步驟 1:設定篩選條件 (FSMM / 領袖宣言 / 路線圖)")
607
  with gr.Row():
608
+ f_query = gr.Textbox(label="🔍 關鍵字搜尋 (支援多詞組, 如: climate resilience)", placeholder="例如:resilience climate women...")
609
  f_year = gr.Dropdown(label="📅 年份", choices=[""])
610
  f_type = gr.Dropdown(label="📝 類型", choices=[""])
611
  f_doc = gr.Dropdown(label="📄 特定檔案 (必選以查看全文)", choices=[""])
 
618
  gr.Markdown("---")
619
  f_out = gr.HTML(label="輸出區域")
620
 
 
621
  f_btn.click(search_fsmm, inputs=[f_query, f_year, f_type, f_doc], outputs=[f_out])
622
  f_full_btn.click(get_fsmm_full_text, inputs=[f_doc], outputs=[f_out])
623
  f_refresh_btn.click(refresh_fsmm_choices, outputs=[f_year, f_type, f_doc])
 
625
 
626
  with gr.Tab("📊 APEC 提案查詢", id="proposal_tab") as proposal_tab:
627
  with gr.Row():
628
+ p_query = gr.Textbox(label="關鍵字搜尋 (支援多詞組, 如: Thailand agriculture)", placeholder="例如:Thailand agriculture energy...")
629
  p_year = gr.Dropdown(label="📅 年份", choices=[""])
630
  p_econ = gr.Dropdown(label="Proposing Economy", choices=[""])
631
 
 
635
 
636
  p_out = gr.HTML()
637
 
 
638
  p_btn.click(search_proposals, inputs=[p_query, p_year, p_econ], outputs=[p_out])
639
  p_refresh_btn.click(refresh_p_choices, outputs=[p_year, p_econ])
640
  p_year.change(lambda y: refresh_p_choices(y)[1], inputs=[p_year], outputs=[p_econ])
 
649
  u_status = gr.Textbox(label="處理結果")
650
  u_btn.click(handle_unified_upload, inputs=[u_file], outputs=[u_status])
651
 
 
652
  d_ye_df.select(handle_drilldown, outputs=[tabs, f_out, p_out])
653
  d_ec_df.select(handle_drilldown, outputs=[tabs, f_out, p_out])
654
  d_qu_df.select(handle_drilldown, outputs=[tabs, f_out, p_out])
655
 
 
656
  def init_choices():
657
  try:
658
  fc = refresh_fsmm_choices()
659
  pc = refresh_p_choices()
660
  return fc + pc
661
+ except:
 
662
  return (gr.update(), gr.update(), gr.update(), gr.update(), gr.update())
663
 
664
  demo.load(init_choices, outputs=[f_year, f_type, f_doc, p_year, p_econ])