File size: 29,075 Bytes
1258e51
 
 
 
 
 
 
 
5a819ff
1258e51
 
5a819ff
935326a
1258e51
 
5a819ff
4a6cd1b
1258e51
 
 
 
 
 
 
 
 
 
 
d25eff9
ee74030
1258e51
 
 
 
 
 
 
 
 
 
 
 
5a819ff
4a6cd1b
 
1258e51
 
 
 
 
4a6cd1b
1258e51
 
4a6cd1b
1258e51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a819ff
1258e51
 
 
 
 
 
 
 
 
 
5a819ff
1258e51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95daccd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a674033
 
 
 
 
 
 
 
 
 
 
 
bc261ee
a674033
 
 
 
 
 
 
 
 
 
 
dde2f3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a674033
1258e51
4a6cd1b
1258e51
 
4a6cd1b
935326a
5a819ff
1258e51
 
 
 
 
22db8a4
1258e51
 
 
 
22db8a4
 
1258e51
 
 
22db8a4
1258e51
 
 
 
 
 
 
 
 
22db8a4
95daccd
78fcf69
dde2f3c
 
 
 
 
 
95daccd
15679af
622282a
 
dde2f3c
 
 
 
 
 
 
 
 
 
 
 
 
935326a
1258e51
935326a
1258e51
 
 
 
 
 
935326a
 
 
1258e51
 
 
 
 
 
 
5a819ff
 
 
 
 
 
 
 
 
 
1258e51
5a819ff
 
1258e51
 
 
 
 
5a819ff
1258e51
 
5a819ff
1258e51
 
 
 
 
 
 
 
 
5a819ff
1258e51
 
5a819ff
1258e51
4a6cd1b
ee74030
1258e51
4a6cd1b
5a819ff
ee74030
 
1258e51
53077f6
ee74030
 
 
 
1258e51
ee74030
 
1258e51
ee74030
1258e51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a819ff
1258e51
 
 
5a819ff
1258e51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a819ff
1258e51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a819ff
1258e51
 
 
 
 
 
 
 
 
4a6cd1b
1258e51
 
 
5a819ff
1258e51
 
 
 
 
 
 
 
 
 
4a6cd1b
1258e51
 
 
 
 
 
5a819ff
1258e51
 
 
 
5a819ff
1258e51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a6cd1b
1258e51
 
4a6cd1b
1258e51
 
 
 
 
 
 
 
 
4a6cd1b
1258e51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a6cd1b
1258e51
 
 
 
4a6cd1b
1258e51
 
 
 
 
 
 
5a819ff
 
1258e51
5a819ff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
# app.py
# 法遵自評覆核(Hugging Face Spaces 版)
# - PDF 讀取:先用 PyMuPDF 擷取文字;若該頁無文字則以 PaddleOCR(lang="ch")進行 OCR(含繁體)
# - LLM:本地 Hugging Face Transformers(預設 meta-llama/Meta-Llama-3.1-8B-Instruct),首次自動下載到 /data/hf
# - 自評檢核規則:預設從 config/config.json 的 DEFAULT_RULES 載入(可在 UI 中編輯)
# - 每個函式皆加上 try/except 與註解;重要步驟 print log 便於除錯
# - UI:內規多檔上傳(可預覽)、自評規則(必填)、自評文字或自評附檔(二擇一)、Debug Log 顯示切換

import os
import io
import re
import json
import time
from typing import List, Dict, Tuple, Optional

import gradio as gr
import fitz  # PyMuPDF
import numpy as np
from PIL import Image

# Hugging Face cache locations (persisted across restarts once downloaded to /data)
os.environ.setdefault("TRANSFORMERS_CACHE", "/data/hf")
os.environ.setdefault("HF_HOME", "/data/hf")

# -----------------------------
# Config loading (with defaults and error handling)
# -----------------------------
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): directory is capitalized "Config" here while the file header
# says "config/config.json" — confirm this matches on case-sensitive filesystems.
CONFIG_PATH = os.path.join(BASE_DIR, "Config", "config.json")

# Fallback defaults (used when config.json is missing or unreadable)
_default_cfg = {
    "DEFAULT_RULES": "【請在此貼上你的檢核規則】",
    "DEMO_PROMPT_INSTRUCTIONS": (
        "你是一位法遵/合規覆核專家。請依『檢核規範』逐點比對『文件全文』,並嚴格以 JSON 回覆:"
        '{"符合情況":"符合|不符合|部分符合","原因":[],"改進建議":[],"規則逐點檢核":[{"規則編號":"","規則內容":"",'
        '"判斷":"符合|不符合|部分符合","理由":"","建議":""}]}'
    ),
    "SYSTEM_MESSAGE": "你是嚴謹的法遵覆核專家,請以審計可追溯為原則回覆並僅輸出 JSON。",
    "temperature": 0.2,
    "MAX_TOKENS_INPUT": 100000
}

try:
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        cfg = json.load(f) or {}
    print(f"[CONFIG] Loaded: {CONFIG_PATH}")
except FileNotFoundError:
    print(f"[CONFIG][WARN] {CONFIG_PATH} not found, using defaults.")
    cfg = dict(_default_cfg)
except Exception as e:
    print(f"[CONFIG][ERROR] Failed to load config.json: {e}. Using defaults.")
    cfg = dict(_default_cfg)

# Effective settings: config values win; empty/missing values fall back to defaults.
DEFAULT_RULES = (cfg.get("DEFAULT_RULES") or _default_cfg["DEFAULT_RULES"]).strip()
DEMO_PROMPT_INSTRUCTIONS = (cfg.get("DEMO_PROMPT_INSTRUCTIONS") or _default_cfg["DEMO_PROMPT_INSTRUCTIONS"]).strip()
SYSTEM_MESSAGE = (cfg.get("SYSTEM_MESSAGE") or _default_cfg["SYSTEM_MESSAGE"]).strip()
TEMPERATURE = float(cfg.get("temperature", _default_cfg["temperature"]))
# NOTE(review): used as a character cap on input text downstream, not a true token count.
MAX_TOKENS_INPUT = int(cfg.get("MAX_TOKENS_INPUT", _default_cfg["MAX_TOKENS_INPUT"]))

# -----------------------------
# PaddleOCR 初始化(lang='ch' 同時支援簡/繁)
# -----------------------------
import torch
from paddleocr import PaddleOCR

OCR = None
def _init_ocr() -> Optional[PaddleOCR]:
    """Build the PaddleOCR engine (lang='ch' covers Simplified and Traditional).

    Returns:
        The engine instance, or None when initialization fails.
    """
    try:
        print("[OCR] Initializing PaddleOCR (lang='ch', PP-OCRv4)")
        engine = PaddleOCR(
            lang="ch",
            use_angle_cls=True,
            use_gpu=torch.cuda.is_available(),  # GPU if present, CPU otherwise
            ocr_version="PP-OCRv4",
            show_log=False,
        )
        print("[OCR] Ready.")
    except Exception as exc:
        print(f"[OCR][ERROR] init failed: {exc}")
        return None
    return engine

# -----------------------------
# 本地 LLaMA(Transformers)
# -----------------------------
from transformers import AutoModelForCausalLM, AutoTokenizer

LOCAL_MODEL_ID = os.getenv("LOCAL_MODEL_ID", "meta-llama/Meta-Llama-3.1-8B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN", None)

_hf_tok = None
_hf_model = None

def _ensure_local_model(logs: Optional[List[str]] = None) -> None:
    """Lazily load the tokenizer/model pair into module globals.

    No-op when both are already loaded; otherwise downloads/loads into the
    TRANSFORMERS_CACHE directory.  Re-raises on failure after logging.
    """
    global _hf_tok, _hf_model
    try:
        if not (_hf_tok is None or _hf_model is None):
            return  # already loaded
        if logs is not None:
            logs.append(f"[LOCAL LLM] Loading model: {LOCAL_MODEL_ID}")
        print(f"[LLM] Loading {LOCAL_MODEL_ID} (cache={os.environ.get('TRANSFORMERS_CACHE')})")
        cache_dir = os.environ["TRANSFORMERS_CACHE"]
        _hf_tok = AutoTokenizer.from_pretrained(
            LOCAL_MODEL_ID, use_fast=True, cache_dir=cache_dir, token=HF_TOKEN
        )
        dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
        _hf_model = AutoModelForCausalLM.from_pretrained(
            LOCAL_MODEL_ID,
            torch_dtype=dtype,
            device_map="auto",
            cache_dir=cache_dir,
            token=HF_TOKEN,
        )
        if logs is not None:
            logs.append("[LOCAL LLM] Model ready.")
        print("[LLM] Ready.")
    except Exception as e:
        if logs is not None:
            logs.append(f"[LOCAL LLM][ERROR] load failed: {e}")
        print(f"[LLM][ERROR] load failed: {e}")
        raise
# ---------- Robust JSON parsing helpers ----------
def _strip_code_fences(s: str) -> str:
    s = s.strip()
    if s.startswith("```"):
        s = s[3:]
        if "```" in s:
            s = s.split("```", 1)[0]
    s = s.replace("```json", "").replace("```JSON", "").strip("` \n\r\t")
    return s

def _extract_first_brace_block(s: str) -> str:
    start = s.find("{")
    if start == -1:
        return s
    depth = 0
    for i in range(start, len(s)):
        if s[i] == "{":
            depth += 1
        elif s[i] == "}":
            depth -= 1
            if depth == 0:
                return s[start:i+1]
    return s

def safe_parse_json(text: str) -> dict:
    """Parse (possibly noisy) model output into a dict.

    Strategy, in order:
      1. strict ``json.loads`` on the raw text;
      2. strip code fences / markdown, isolate the first balanced {...};
      3. optional ``json5`` (single quotes, trailing commas) when installed;
      4. normalize curly quotes and BOM — converting single quotes to double
         quotes when no double quotes are present — then a final strict parse
         (which may raise).
    """
    import json as _json

    # 1) raw text as-is
    try:
        return _json.loads(text)
    except Exception:
        pass

    cleaned = _extract_first_brace_block(_strip_code_fences(text))

    # 2) fenced / markdown-wrapped payload
    try:
        return _json.loads(cleaned)
    except Exception:
        pass

    # 3) json5 is optional — skip silently when it is not installed
    try:
        import json5  # type: ignore
        return json5.loads(cleaned)
    except Exception:
        pass

    # 4) repair fancy quotes / BOM, then one last strict attempt
    fixed = (
        cleaned.replace("\u201c", '"').replace("\u201d", '"')
               .replace("\u2018", "'").replace("\u2019", "'")
               .replace("\ufeff", "").strip()
    )
    if "'" in fixed and '"' not in fixed:
        fixed = fixed.replace("'", '"')
    return _json.loads(fixed)
def extract_model_reply(full_text, prompt):
    """Strip the echoed prompt/system/user preamble from raw model output.

    Finds the last occurrence of any known marker (role labels or the prompt
    itself) and returns everything after it; falls back to the whole text when
    no marker matches or on any error.

    Bug fix: the slice offset now uses the length of the marker that actually
    matched; previously it always used ``len(markers[-1])`` (the prompt), so a
    match on a short marker like "user" cut off the wrong amount of text.
    """
    try:
        markers = ["user", "User", "使用者", prompt.strip()]
        last_pos = -1
        matched_len = 0
        for m in markers:
            pos = full_text.rfind(m)
            if pos > last_pos:
                last_pos = pos
                matched_len = len(m)

        # Take everything after the last marker found (if any).
        if last_pos != -1:
            reply = full_text[last_pos + matched_len:]
        else:
            reply = full_text

        return reply.strip()
    except Exception as e:
        print(f"[extract_model_reply 錯誤] {e}")
        return full_text.strip()

# === Placed after safe_parse_json: extract the LAST complete JSON object
# from full_text, using regex recursion when available ===
try:
    import regex as re2  # third-party `regex`, supports recursive (?R) patterns
except Exception:
    re2 = None

def extract_last_json_block(text: str) -> Optional[str]:
    """Find the last balanced {...} JSON object inside *text*.

    Prefers the third-party ``regex`` module's recursive ``(?R)`` pattern for
    balanced-brace matching; falls back to a manual depth counter when that
    module is unavailable.

    Returns:
        The matched object as a string, or None when nothing matches.
    """
    try:
        s = _strip_code_fences(text)

        # 1) regex recursion: { ... possibly nested { ... } ... }
        if re2 is not None:
            rx = re2.compile(r"\{(?:[^{}]|(?R))*\}", flags=re2.DOTALL)
            found = [m.group(0) for m in rx.finditer(s)]
            return found[-1] if found else None

        # 2) no `regex` module → manual balanced-brace scan
        candidates = []
        depth, begin = 0, None
        for idx, ch in enumerate(s):
            if ch == "{":
                if depth == 0:
                    begin = idx
                depth += 1
            elif ch == "}" and depth > 0:
                depth -= 1
                if depth == 0 and begin is not None:
                    candidates.append(s[begin:idx + 1])
                    begin = None
        return candidates[-1] if candidates else None
    except Exception as e:
        print(f"[JSON-EXTRACT][ERROR] {e}")
        return None

        
def call_llm(messages: List[dict], model: str, logs: List[str]) -> dict:
    """Run the local LLaMA model over chat *messages* and return a parsed dict.

    Keeps the historical name ``call_llm`` but runs the local Transformers
    model rather than a remote API.  The model is instructed to emit a single
    JSON object; the last balanced {...} block is extracted from the raw
    output and parsed robustly.  On any failure a fallback result dict with
    the same schema is returned instead of raising.

    Args:
        messages: chat turns; ``messages[0]`` is the system turn and
            ``messages[1]`` the user turn (missing entries → empty strings).
        model: unused placeholder kept for interface compatibility.
        logs: mutable debug log; entries are appended in place.

    Fixes: removed an unreachable ``return data`` after the inner try/except
    (both branches already return), and the outer except now logs a model
    error instead of falsely reporting a JSON parse failure.
    """
    start_time = time.time()  # for the elapsed-time entry appended in `finally`
    try:
        _ensure_local_model(logs)
        # Build the chat prompt, appending a strict JSON-only constraint.
        sys_txt = messages[0].get("content", "") if messages else ""
        usr_txt = messages[1].get("content", "") if len(messages) > 1 else ""
        extra_rules = "\n\n請務必只輸出單一 JSON 物件,不得包含任何 JSON 之外的文字或符號。"
        print('準備 chat prompt(加上 JSON 輸出約束)')
        chat = [
            {"role": "system", "content": sys_txt},
            {"role": "user", "content": usr_txt + extra_rules}
        ]
        print(f"user content:{usr_txt + extra_rules}")

        prompt = _hf_tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

        inputs = _hf_tok(prompt, return_tensors="pt").to(_hf_model.device)
        print("inputs")
        with torch.no_grad():
            out_ids = _hf_model.generate(
                **inputs,
                max_new_tokens=1024,
                temperature=float(TEMPERATURE),
                do_sample=(float(TEMPERATURE) > 0),  # greedy when temperature == 0
                eos_token_id=_hf_tok.eos_token_id,
                pad_token_id=_hf_tok.eos_token_id
            )
        print("torch.no_grad")
        # Decode the full generation (prompt echo included).
        full_text = _hf_tok.decode(out_ids[0], skip_special_tokens=True)

        # Extract the LAST complete JSON object from full_text (regex/stack).
        candidate = extract_last_json_block(full_text)
        # Fall back to the raw text; safe_parse_json will still try to salvage it.
        gen_text = candidate if candidate is not None else full_text
        logs.append(f"[LOCAL LLM] raw_len={len(gen_text)}")
        logs.append(f"[LOCAL LLM] gen_text={gen_text}")
        logs.append(f"[LOCAL LLM] prompt={prompt}")
        logs.append(f"[LOCAL LLM] full_text={full_text}")
        # Robust parsing with a schema-shaped fallback on failure.
        try:
            data = safe_parse_json(gen_text)
            logs.append("[LOCAL LLM] JSON 解析成功")
            return data
        except Exception as jerr:
            logs.append(f"[LOCAL LLM] JSON 解析失敗:{jerr}")
            return {
                "符合情況": "部分符合",
                "原因": [f"模型輸出非合法 JSON:{str(jerr)}"],
                "改進建議": ["請調整提示詞,要求嚴格輸出 JSON(雙引號、無註解、無多餘文字)。"],
                "規則逐點檢核": []
            }
    except Exception as e:
        # Model loading / generation failure (not a JSON parse problem).
        logs.append(f"[LOCAL LLM][ERROR] 本地模型錯誤:{e}")
        return {
            "符合情況": "部分符合",
            "原因": [f"本地模型錯誤:{e}"],
            "改進建議": ["請檢查 Hugging Face 權杖與模型權限、或改用較小模型。"],
            "規則逐點檢核": []
        }
    finally:
        elapsed = time.time() - start_time
        logs.append(f"[LOCAL LLM] 本次執行耗時:{elapsed:.2f} 秒")
# -----------------------------
# 檔案讀取:TXT / PDF(含 OCR fallback)
# -----------------------------
def _read_text_file(path: str) -> str:
    """讀取 TXT:嘗試多種常見編碼(台灣環境可能遇到 cp950/big5)。"""
    try:
        print(f"[READ] TXT: {os.path.basename(path)}")
        encodings = ("utf-8", "utf-8-sig", "cp950", "big5", "latin1")
        for enc in encodings:
            try:
                with open(path, "r", encoding=enc, errors="ignore") as f:
                    return f.read()
            except Exception:
                continue
        with open(path, "rb") as f:
            return f.read().decode("utf-8", errors="ignore")
    except Exception as e:
        print(f"[READ][ERROR] TXT {path}: {e}")
        return ""

def _ensure_ocr_ready():
    """Create the global OCR engine on first use (lazy, to avoid idle resource cost)."""
    global OCR
    if OCR is not None:
        return
    OCR = _init_ocr()

def _ocr_page_text(page) -> str:
    """Render a PDF page to an image and OCR it.

    Returns the recognized text, or "" when OCR is unavailable or fails.
    """
    try:
        _ensure_ocr_ready()
        if OCR is None:
            return ""
        # 240 dpi: trade-off between OCR accuracy and render time.
        pix = page.get_pixmap(dpi=240)
        rgb = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
        result = OCR.ocr(np.array(rgb), cls=True)
        if not result or not result[0]:
            return ""
        recognized = [item[1][0] for item in result[0]]
        return "\n".join(recognized).strip()
    except Exception as e:
        print(f"[OCR][ERROR] {e}")
        return ""

def _read_pdf_text(path: str) -> Tuple[str, int]:
    """Extract text from a PDF, OCR-ing pages that look like scanned images.

    Pages whose extracted text is shorter than 20 characters are treated as
    image-only pages and routed through OCR instead.

    Returns:
        (full text, page count); ("", 0) on any error.
    """
    try:
        print(f"[READ] PDF: {os.path.basename(path)}")
        chunks: List[str] = []
        with fitz.open(path) as doc:
            for page in doc:
                extracted = (page.get_text("text") or "").strip()
                # < 20 chars → assume a scanned page; fall back to OCR
                chunks.append(extracted if len(extracted) >= 20 else _ocr_page_text(page))
            return "\n".join(chunks).strip(), len(doc)
    except Exception as e:
        print(f"[READ][ERROR] PDF {path}: {e}")
        return "", 0

def _read_file_to_text(file_path: Optional[str]) -> Tuple[str, str]:
    """統一入口:根據副檔名分派至 TXT/PDF 讀取函式。"""
    try:
        if not file_path:
            raise ValueError("未收到檔案路徑")
        file_path = os.fspath(file_path)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"檔案不存在:{file_path}")
        fname = os.path.basename(file_path)
        ext = os.path.splitext(fname)[1].lower()
        if ext == ".txt":
            content = _read_text_file(file_path)
            return content, f"讀取 TXT:{fname}(長度:{len(content)})"
        elif ext == ".pdf":
            content, pages = _read_pdf_text(file_path)
            return content, f"讀取 PDF:{fname}(頁數:{pages};長度:{len(content)})"
        else:
            raise ValueError("僅支援 .txt 或 .pdf")
    except Exception as e:
        print(f"[READ][ERROR] {e}")
        return "", f"[ERROR] {e}"

# -----------------------------
# 多檔輔助:清單摘要 / 快取與預覽
# -----------------------------
def _read_multi(paths: Optional[List[str]]) -> Dict[str, str]:
    """批次讀取多檔,回傳 {檔名: 文字內容},忽略讀取錯誤。"""
    cache: Dict[str, str] = {}
    try:
        if not paths:
            return cache
        for p in paths:
            if not p:
                continue
            try:
                name = os.path.basename(os.fspath(p))
                text, _ = _read_file_to_text(p)
                cache[name] = text
            except Exception as e:
                print(f"[READ][WARN] skip {p}: {e}")
                continue
        return cache
    except Exception as e:
        print(f"[READ_MULTI][ERROR] {e}")
        return cache

def _summarize_paths(paths: Optional[List[str]]) -> str:
    """列出上傳檔案清單(含大小),供 UI 顯示。"""
    try:
        if not paths:
            return "(尚未上傳任何檔案)"
        lines = []
        for p in paths:
            try:
                name = os.path.basename(os.fspath(p))
                size = os.path.getsize(os.fspath(p))
                lines.append(f"- {name}{size} bytes)")
            except Exception:
                lines.append(f"- {os.path.basename(str(p))}")
        return "\n".join(lines)
    except Exception as e:
        print(f"[SUMMARY][ERROR] {e}")
        return "(清單生成失敗)"

def on_files_change(paths: Optional[List[str]]):
    """Gradio callback: file list changed → refresh summary text, preview
    checkboxes, preview area (re-hidden), and the per-file content cache."""
    try:
        cache = _read_multi(paths)
        return (
            _summarize_paths(paths),
            gr.update(choices=list(cache.keys()), value=[]),
            gr.update(value="", visible=False),
            cache,
        )
    except Exception as e:
        print(f"[FILES_CHANGE][ERROR] {e}")
        return "(更新失敗)", gr.update(), gr.update(), {}

def on_show_toggle(selected_names: List[str], cache: Dict[str, str]):
    """Gradio callback: show the (possibly truncated) content of checked files."""
    try:
        if not selected_names:
            return gr.update(value="", visible=False)
        limit = max(2000, MAX_TOKENS_INPUT // 5)  # per-file preview cap
        sections = []
        for name in selected_names:
            body = cache.get(name, "")
            if len(body) > limit:
                body = body[:limit] + "\n...[內容過長,已截斷顯示]"
            sections.append(f"===== {name} =====\n{body}")
        return gr.update(value="\n\n".join(sections), visible=True)
    except Exception as e:
        print(f"[SHOW_TOGGLE][ERROR] {e}")
        return gr.update(value=f"[ERROR] {e}", visible=True)

# -----------------------------
# 檢核前驗證 / Prompt 構建
# -----------------------------
def validate_before_run(
    policy_files: List[str],
    self_rules: str,
    self_text: str,
    self_files: List[str],
) -> Tuple[bool, str]:
    """Pre-flight validation before running a check.

    Requires at least one policy file, non-empty checking rules, and either
    self-assessment text or at least one attachment.

    Returns:
        (ok, newline-joined error messages; "" when ok).
    """
    try:
        problems = []
        if not policy_files:
            problems.append("請上傳『單位適用內規(可多檔)』至少一個檔案。")
        if not (self_rules and self_rules.strip()):
            problems.append("請輸入『自評檢核規則』。")
        has_text = bool(self_text and self_text.strip())
        has_files = bool(self_files)
        if not has_text and not has_files:
            problems.append("請提供『單位自評』文字,或上傳『單位自評附檔(可多檔)』至少一個。")
        return not problems, "\n".join(problems)
    except Exception as e:
        print(f"[VALIDATE][ERROR] {e}")
        return False, f"驗證失敗:{e}"

def build_prompt(rules: str, doc_text: str) -> List[dict]:
    """Assemble chat messages from the checking rules and the document text."""
    try:
        return [
            {"role": "system", "content": SYSTEM_MESSAGE},
            {
                "role": "user",
                "content": f"{DEMO_PROMPT_INSTRUCTIONS}\n\n[檢核規範]\n{rules}\n\n[文件全文]\n{doc_text}",
            },
        ]
    except Exception as e:
        print(f"[PROMPT][ERROR] {e}")
        return [{"role": "system", "content": SYSTEM_MESSAGE},
                {"role": "user", "content": "(構建 Prompt 失敗)"}]

# -----------------------------
# 主流程:執行檢核
# -----------------------------
def run_check_with_log(

    policy_files_paths,  # policy files, multiple allowed (required)

    self_rules_text,     # self-assessment checking rules (required; defaults to DEFAULT_RULES)

    self_text,           # self-assessment free text (may be empty)

    self_files_paths     # self-assessment attachments, multiple allowed (may be empty)

):
    """Main pipeline: validate inputs, gather the self-assessment text, build
    the prompt, run the local LLM, and shape the result for the UI.

    Returns a 4-tuple for the Gradio outputs: (info/status message, pretty
    JSON string or None, per-rule table rows or None, joined debug log).
    """
    logs: List[str] = []
    try:
        # 1) Pre-flight validation (policy files + rules + text-or-attachments).
        ok, msg = validate_before_run(policy_files_paths, self_rules_text, self_text, self_files_paths)
        if not ok:
            msg_display = msg.replace("\n", ";")
            logs.append(f"[VALIDATE] {msg_display}")
            return msg, None, None, "\n".join(logs)

        # 2) Read policy files (currently logged only, as an audit trail).
        for p in (policy_files_paths or []):
            t, info = _read_file_to_text(p)
            logs.append(f"[POLICY] {info}")

        # 3) Collect the self-assessment content (textbox wins; otherwise merge attachments).
        doc_text = ""
        if self_text and self_text.strip():
            doc_text = self_text.strip()
            logs.append(f"[SELF] from textbox, len={len(doc_text)}")
        else:
            pieces = []
            for p in (self_files_paths or []):
                t, info = _read_file_to_text(p)
                logs.append(f"[SELF_FILE] {info}")
                if t.strip():
                    pieces.append(t.strip())
            doc_text = "\n\n".join(pieces).strip()
            if not doc_text:
                return "未取得任何自評內容(文字與附檔皆為空)。", None, None, "\n".join(logs)

        # 4) Conservative length cap (character count, not a true token count).
        if len(doc_text) > MAX_TOKENS_INPUT:
            logs.append(f"[WARN] self text too long ({len(doc_text)}) → truncate to {MAX_TOKENS_INPUT}")
            doc_text = doc_text[:MAX_TOKENS_INPUT]

        # 5) Build chat messages and run the local LLM.
        msgs = build_prompt(self_rules_text.strip(), doc_text)
        logs.append("=== Prompt ===")
        logs.append(f"[system] len={len(msgs[0]['content']) if msgs else 0}")
        logs.append(f"[user] len={len(msgs[1]['content']) if len(msgs)>1 else 0}")

        result_dict = call_llm(msgs, model="local", logs=logs)
        logs.append("[LLM] done.")

        # 6) Shape output: pretty JSON plus one table row per checked rule.
        pretty = json.dumps(result_dict, ensure_ascii=False, indent=2)
        table = [
            [i.get("規則編號",""), i.get("規則內容",""), i.get("判斷",""), i.get("理由",""), i.get("建議","")]
            for i in result_dict.get("規則逐點檢核", [])
        ]
        info = f"自評內容長度:{len(doc_text)};自評檢核規則長度:{len(self_rules_text.strip())}"
        return info, pretty, table, "\n".join(logs)
    except Exception as e:
        logs.append(f"[RUN][ERROR] {e}")
        return f"執行失敗:{e}", None, None, "\n".join(logs)

def toggle_debug(current_visibility: bool):
    """Flip the Debug Log panel's visibility; returns (component update, new state)."""
    try:
        flipped = not current_visibility
        return gr.update(visible=flipped), flipped
    except Exception as e:
        print(f"[DEBUG_TOGGLE][ERROR] {e}")
        return gr.update(), current_visibility

# -----------------------------
# Gradio 介面
# -----------------------------
with gr.Blocks(title="法遵自評覆核(PaddleOCR + LLaMA 本地)") as demo:
    gr.Markdown("# 法遵自評覆核(支援 PDF OCR:繁/簡中文;本地 LLaMA 推論)")
    gr.Markdown("步驟:上傳『單位適用內規(可多檔)』 → 輸入『自評檢核規則』(預設載入 config) → 輸入『單位自評』或上傳『自評附檔(可多檔)』 → 開始檢核。")

    # Policy files (multi-upload) + file list + per-file preview toggles + preview area (hidden by default)
    with gr.Row():
        policy_files_in = gr.Files(
            label="上傳單位適用內規(可多檔,.txt 或 .pdf)",
            file_types=[".txt", ".pdf"],
            type="filepath"
        )
    policy_list_md = gr.Markdown("(尚未上傳任何檔案)")
    policy_show_chk = gr.CheckboxGroup(label="顯示哪些內規內容(個別開關,預設不勾選)", choices=[])
    policy_show_area = gr.Textbox(label="內規檔案內容預覽(僅顯示勾選者,可能截斷)", value="", lines=15, visible=False)
    policy_cache_state = gr.State({})  # cache: {filename: extracted text}

    # Self-assessment checking rules (loaded from config by default; editable; required)
    with gr.Row():
        self_rules_in = gr.Textbox(
            label="自評檢核規則(必填;預設載入自 config)",
            value=DEFAULT_RULES,
            lines=8,
            placeholder="請貼上要檢核自評內容的規則;例如:需包含抽檢紀錄、會辦流程、簽核證明等。"
        )

    # Self-assessment free text (optional — attachments may be used instead)
    with gr.Row():
        self_text_in = gr.Textbox(
            label="單位自評(文字輸入;可留白改上傳自評附檔)",
            value="",
            lines=10,
            placeholder="請輸入自評文字;或留白並在下方上傳自評附檔(可多檔)"
        )

    # Self-assessment attachments (multi-upload) + list + preview toggles + preview area (hidden by default)
    with gr.Row():
        self_files_in = gr.Files(
            label="上傳單位自評附檔(可多檔,.txt 或 .pdf)",
            file_types=[".txt", ".pdf"],
            type="filepath"
        )
    self_list_md = gr.Markdown("(尚未上傳任何檔案)")
    self_show_chk = gr.CheckboxGroup(label="顯示哪些自評附檔內容(個別開關,預設不勾選)", choices=[])
    self_show_area = gr.Textbox(label="自評附檔內容預覽(僅顯示勾選者,可能截斷)", value="", lines=15, visible=False)
    self_cache_state = gr.State({})  # cache: {filename: extracted text}

    # Control row (the run button starts disabled until inputs validate)
    with gr.Row():
        run_btn = gr.Button("開始檢核", variant="primary", interactive=False)
        toggle_debug_btn = gr.Button("切換 Debug Log 顯示/隱藏")

    info_out = gr.Markdown(label="讀檔/輸入資訊")
    json_out = gr.Code(label="檢核結果(JSON)", language="json")
    table_out = gr.Dataframe(headers=["規則編號","規則內容","判斷","理由","建議"], wrap=True)
    debug_log = gr.Textbox(label="Debug Log", lines=14, visible=False)
    debug_state = gr.State(False)  # whether the debug log is currently visible

    # Policy files: list + checkbox selection + preview wiring
    policy_files_in.change(
        fn=on_files_change,
        inputs=[policy_files_in],
        outputs=[policy_list_md, policy_show_chk, policy_show_area, policy_cache_state]
    )
    policy_show_chk.change(
        fn=on_show_toggle,
        inputs=[policy_show_chk, policy_cache_state],
        outputs=[policy_show_area]
    )

    # Self-assessment attachments: list + checkbox selection + preview wiring
    self_files_in.change(
        fn=on_files_change,
        inputs=[self_files_in],
        outputs=[self_list_md, self_show_chk, self_show_area, self_cache_state]
    )
    self_show_chk.change(
        fn=on_show_toggle,
        inputs=[self_show_chk, self_cache_state],
        outputs=[self_show_area]
    )

    # Run-button enablement: policy file(s) + rules + (text or attachment) all present
    def _ready(policy_files, self_rules, self_text, self_files):
        """Enable the run button only when validate_before_run passes."""
        try:
            ok, _ = validate_before_run(policy_files, self_rules, self_text, self_files)
            return gr.update(interactive=ok)
        except Exception as e:
            print(f"[READY][ERROR] {e}")
            return gr.update(interactive=False)

    policy_files_in.change(_ready, [policy_files_in, self_rules_in, self_text_in, self_files_in], [run_btn])
    self_rules_in.change(_ready, [policy_files_in, self_rules_in, self_text_in, self_files_in], [run_btn])
    self_text_in.change(_ready, [policy_files_in, self_rules_in, self_text_in, self_files_in], [run_btn])
    self_files_in.change(_ready, [policy_files_in, self_rules_in, self_text_in, self_files_in], [run_btn])

    # Run the check
    run_btn.click(
        fn=run_check_with_log,
        inputs=[policy_files_in, self_rules_in, self_text_in, self_files_in],
        outputs=[info_out, json_out, table_out, debug_log]
    )
    toggle_debug_btn.click(fn=toggle_debug, inputs=[debug_state], outputs=[debug_log, debug_state])

if __name__ == "__main__":
    # Spaces requires binding 0.0.0.0; the same settings also work locally.
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), share=False)