# app.py
# 法遵自評覆核(Hugging Face Spaces 版)
# - PDF 讀取:先用 PyMuPDF 擷取文字;若該頁無文字則以 PaddleOCR(lang="ch")進行 OCR(含繁體)
# - LLM:本地 Hugging Face Transformers(預設 meta-llama/Meta-Llama-3.1-8B-Instruct),首次自動下載到 /data/hf
# - 自評檢核規則:預設從 config/config.json 的 DEFAULT_RULES 載入(可在 UI 中編輯)
# - 每個函式皆加上 try/except 與註解;重要步驟 print log 便於除錯
# - UI:內規多檔上傳(可預覽)、自評規則(必填)、自評文字或自評附檔(二擇一)、Debug Log 顯示切換
import os
import io
import re
import json
import time
from typing import List, Dict, Tuple, Optional
import gradio as gr
import fitz # PyMuPDF
import numpy as np
from PIL import Image
# Hugging Face cache location; /data persists across Space restarts when a
# persistent volume is attached, so models are downloaded only once.
os.environ.setdefault("TRANSFORMERS_CACHE", "/data/hf")
os.environ.setdefault("HF_HOME", "/data/hf")
# -----------------------------
# Config loading (with defaults and error handling)
# -----------------------------
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): the file header says "config/config.json" but this path uses
# "Config" — on a case-sensitive filesystem (HF Spaces runs Linux) these are
# different directories; confirm the actual on-disk name.
CONFIG_PATH = os.path.join(BASE_DIR, "Config", "config.json")
# Fallback defaults used when config.json is missing or unreadable.
_default_cfg = {
    "DEFAULT_RULES": "【請在此貼上你的檢核規則】",
    "DEMO_PROMPT_INSTRUCTIONS": (
        "你是一位法遵/合規覆核專家。請依『檢核規範』逐點比對『文件全文』,並嚴格以 JSON 回覆:"
        '{"符合情況":"符合|不符合|部分符合","原因":[],"改進建議":[],"規則逐點檢核":[{"規則編號":"","規則內容":"",'
        '"判斷":"符合|不符合|部分符合","理由":"","建議":""}]}'
    ),
    "SYSTEM_MESSAGE": "你是嚴謹的法遵覆核專家,請以審計可追溯為原則回覆並僅輸出 JSON。",
    "temperature": 0.2,
    "MAX_TOKENS_INPUT": 100000
}
try:
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        cfg = json.load(f) or {}
    print(f"[CONFIG] Loaded: {CONFIG_PATH}")
except FileNotFoundError:
    print(f"[CONFIG][WARN] {CONFIG_PATH} not found, using defaults.")
    cfg = dict(_default_cfg)
except Exception as e:
    print(f"[CONFIG][ERROR] Failed to load config.json: {e}. Using defaults.")
    cfg = dict(_default_cfg)
# Effective settings: config values win; empty/missing values fall back to defaults.
DEFAULT_RULES = (cfg.get("DEFAULT_RULES") or _default_cfg["DEFAULT_RULES"]).strip()
DEMO_PROMPT_INSTRUCTIONS = (cfg.get("DEMO_PROMPT_INSTRUCTIONS") or _default_cfg["DEMO_PROMPT_INSTRUCTIONS"]).strip()
SYSTEM_MESSAGE = (cfg.get("SYSTEM_MESSAGE") or _default_cfg["SYSTEM_MESSAGE"]).strip()
TEMPERATURE = float(cfg.get("temperature", _default_cfg["temperature"]))
# NOTE: despite the name, this is applied as a *character* cap on document
# text (see run_check_with_log), not a tokenizer token count.
MAX_TOKENS_INPUT = int(cfg.get("MAX_TOKENS_INPUT", _default_cfg["MAX_TOKENS_INPUT"]))
# -----------------------------
# PaddleOCR initialization (lang='ch' covers both Simplified and Traditional)
# -----------------------------
import torch
from paddleocr import PaddleOCR
OCR = None  # lazily-initialized PaddleOCR singleton; see _ensure_ocr_ready()
def _init_ocr() -> Optional[PaddleOCR]:
    """Construct the PaddleOCR engine (lang='ch', PP-OCRv4).

    Returns the ready engine, or ``None`` when initialization fails so that
    callers can degrade gracefully (pages simply yield no OCR text).
    """
    try:
        print("[OCR] Initializing PaddleOCR (lang='ch', PP-OCRv4)")
        engine = PaddleOCR(
            lang="ch",
            ocr_version="PP-OCRv4",
            use_angle_cls=True,
            use_gpu=torch.cuda.is_available(),  # GPU when present, else CPU
            show_log=False,
        )
        print("[OCR] Ready.")
        return engine
    except Exception as exc:
        print(f"[OCR][ERROR] init failed: {exc}")
        return None
# -----------------------------
# Local LLaMA (Transformers)
# -----------------------------
from transformers import AutoModelForCausalLM, AutoTokenizer
# Model id is overridable via env; HF_TOKEN is required for gated repos
# such as the Meta-Llama family.
LOCAL_MODEL_ID = os.getenv("LOCAL_MODEL_ID", "meta-llama/Meta-Llama-3.1-8B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN", None)
_hf_tok = None    # lazily-loaded tokenizer singleton (see _ensure_local_model)
_hf_model = None  # lazily-loaded model singleton (see _ensure_local_model)
def _ensure_local_model(logs: Optional[List[str]] = None) -> None:
    """Load tokenizer and model exactly once (downloaded to /data/hf on first use).

    Appends progress lines to *logs* when given; re-raises any load failure so
    the caller can surface it.
    """
    global _hf_tok, _hf_model
    try:
        if _hf_tok is not None and _hf_model is not None:
            return  # already initialized
        if logs is not None:
            logs.append(f"[LOCAL LLM] Loading model: {LOCAL_MODEL_ID}")
        print(f"[LLM] Loading {LOCAL_MODEL_ID} (cache={os.environ.get('TRANSFORMERS_CACHE')})")
        cache_dir = os.environ["TRANSFORMERS_CACHE"]
        _hf_tok = AutoTokenizer.from_pretrained(
            LOCAL_MODEL_ID, use_fast=True, cache_dir=cache_dir, token=HF_TOKEN
        )
        # bf16 on GPU keeps memory in check; fall back to fp32 on CPU.
        dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
        _hf_model = AutoModelForCausalLM.from_pretrained(
            LOCAL_MODEL_ID,
            torch_dtype=dtype,
            device_map="auto",
            cache_dir=cache_dir,
            token=HF_TOKEN,
        )
        if logs is not None:
            logs.append("[LOCAL LLM] Model ready.")
        print("[LLM] Ready.")
    except Exception as exc:
        if logs is not None:
            logs.append(f"[LOCAL LLM][ERROR] load failed: {exc}")
        print(f"[LLM][ERROR] load failed: {exc}")
        raise
# ---------- Robust JSON parsing helpers ----------
def _strip_code_fences(s: str) -> str:
s = s.strip()
if s.startswith("```"):
s = s[3:]
if "```" in s:
s = s.split("```", 1)[0]
s = s.replace("```json", "").replace("```JSON", "").strip("` \n\r\t")
return s
def _extract_first_brace_block(s: str) -> str:
start = s.find("{")
if start == -1:
return s
depth = 0
for i in range(start, len(s)):
if s[i] == "{":
depth += 1
elif s[i] == "}":
depth -= 1
if depth == 0:
return s[start:i+1]
return s
def safe_parse_json(text: str) -> dict:
    """Parse model output into a dict, tolerating common formatting noise.

    Attempts, in order:
      1. strict ``json.loads`` on the raw text;
      2. strip code fences and isolate the first balanced ``{...}`` block;
      3. optional ``json5`` (single quotes, trailing commas) if installed;
      4. repair curly quotes / BOM, and swap single for double quotes when
         the text contains no double quotes at all.
    Raises the final JSON error when every attempt fails.
    """
    import json as _json
    try:
        return _json.loads(text)
    except Exception:
        pass
    candidate = _extract_first_brace_block(_strip_code_fences(text))
    try:
        return _json.loads(candidate)
    except Exception:
        pass
    # Optional json5 pass — silently skipped when the package is absent.
    try:
        import json5  # type: ignore
        return json5.loads(candidate)
    except Exception:
        pass
    # Last resort: normalize fancy quotes and BOM.
    fixed = (
        candidate.replace("\u201c", '"').replace("\u201d", '"')
        .replace("\u2018", "'").replace("\u2019", "'")
        .replace("\ufeff", "").strip()
    )
    if "'" in fixed and '"' not in fixed:
        fixed = fixed.replace("'", '"')
    return _json.loads(fixed)
def extract_model_reply(full_text, prompt):
    """Strip the echoed prompt / role preamble from a raw model decode.

    Finds the marker (role keyword or the prompt itself) whose *last*
    occurrence lies furthest into the text and returns everything after it;
    returns the whole text (stripped) when no marker is found or on error.
    """
    try:
        markers = ["user", "User", "使用者", prompt.strip()]
        last_pos = -1
        matched = ""
        for m in markers:
            pos = full_text.rfind(m)
            if pos > last_pos:
                last_pos = pos
                matched = m
        if last_pos != -1:
            # BUG FIX: skip the length of the marker that actually matched.
            # The old code always used len(markers[-1]) (the prompt), which
            # sliced at the wrong offset whenever a role keyword matched last.
            reply = full_text[last_pos + len(matched):]
        else:
            reply = full_text
        return reply.strip()
    except Exception as e:
        print(f"[extract_model_reply 錯誤] {e}")
        return full_text.strip()
# === 放在 safe_parse_json 之後:用「正則」擷取 full_text 中最後一個完整 JSON 物件 ===
try:
import regex as re2 # 第三方 regex,支援遞迴 (?R)
except Exception:
re2 = None
def extract_last_json_block(text: str) -> Optional[str]:
    """Return the last balanced ``{...}`` JSON object found in *text*.

    Prefers the third-party ``regex`` module (recursive ``(?R)`` pattern);
    when unavailable, falls back to a manual brace-depth scan. Returns
    ``None`` when no complete object exists or on any internal error.
    """
    try:
        cleaned = _strip_code_fences(text)
        # Path 1: recursive regex matches every balanced {...} span.
        if re2 is not None:
            pattern = re2.compile(r"\{(?:[^{}]|(?R))*\}", flags=re2.DOTALL)
            found = [m.group(0) for m in pattern.finditer(cleaned)]
            return found[-1] if found else None
        # Path 2: manual scan — record each top-level {...} span.
        spans: List[str] = []
        depth = 0
        opened_at = None
        for pos, ch in enumerate(cleaned):
            if ch == "{":
                if depth == 0:
                    opened_at = pos
                depth += 1
            elif ch == "}" and depth > 0:
                depth -= 1
                if depth == 0 and opened_at is not None:
                    spans.append(cleaned[opened_at:pos + 1])
                    opened_at = None
        return spans[-1] if spans else None
    except Exception as exc:
        print(f"[JSON-EXTRACT][ERROR] {exc}")
        return None
def call_llm(messages: List[dict], model: str, logs: List[str]) -> dict:
    """Run the local LLaMA on chat *messages* and return the parsed JSON dict.

    The name ``call_llm`` and the ``model`` parameter are kept for backward
    compatibility with the previous API-based version; ``model`` is ignored.
    On any failure a fallback dict with the same schema is returned so the UI
    always has something to render. Timing is appended to *logs* in all cases.
    """
    start_time = time.time()  # wall-clock timing for the debug log
    try:
        _ensure_local_model(logs)
        # Build the chat prompt, appending a strict JSON-only constraint.
        sys_txt = messages[0].get("content", "") if messages else ""
        usr_txt = messages[1].get("content", "") if len(messages) > 1 else ""
        extra_rules = "\n\n請務必只輸出單一 JSON 物件,不得包含任何 JSON 之外的文字或符號。"
        print('準備 chat prompt(加上 JSON 輸出約束)')
        chat = [
            {"role": "system", "content": sys_txt},
            {"role": "user", "content": usr_txt + extra_rules}
        ]
        print(f"user content:{usr_txt + extra_rules}")
        prompt = _hf_tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        inputs = _hf_tok(prompt, return_tensors="pt").to(_hf_model.device)
        print("inputs")
        with torch.no_grad():
            out_ids = _hf_model.generate(
                **inputs,
                max_new_tokens=1024,
                temperature=float(TEMPERATURE),
                do_sample=(float(TEMPERATURE) > 0),  # greedy when temperature == 0
                eos_token_id=_hf_tok.eos_token_id,
                pad_token_id=_hf_tok.eos_token_id
            )
        print("torch.no_grad")
        # Decode the full sequence (prompt + generation), then pull out the
        # LAST complete JSON object — the model's answer follows the prompt.
        full_text = _hf_tok.decode(out_ids[0], skip_special_tokens=True)
        candidate = extract_last_json_block(full_text)
        gen_text = candidate if candidate is not None else full_text
        logs.append(f"[LOCAL LLM] raw_len={len(gen_text)}")
        logs.append(f"[LOCAL LLM] gen_text={gen_text}")
        logs.append(f"[LOCAL LLM] prompt={prompt}")
        logs.append(f"[LOCAL LLM] full_text={full_text}")
        # Robust parse; on failure return a schema-compatible fallback.
        try:
            data = safe_parse_json(gen_text)
            logs.append("[LOCAL LLM] JSON 解析成功")
            return data
        except Exception as jerr:
            logs.append(f"[LOCAL LLM] JSON 解析失敗:{jerr}")
            return {
                "符合情況": "部分符合",
                "原因": [f"模型輸出非合法 JSON:{str(jerr)}"],
                "改進建議": ["請調整提示詞,要求嚴格輸出 JSON(雙引號、無註解、無多餘文字)。"],
                "規則逐點檢核": []
            }
        # BUG FIX: a dead `return data` used to sit here after both branches
        # above had already returned; removed.
    except Exception as e:
        # BUG FIX: this branch handles load/generation errors, not parse
        # errors — the old log line mislabeled them as "JSON 解析失敗".
        logs.append(f"[LOCAL LLM][ERROR] {e}")
        return {
            "符合情況": "部分符合",
            "原因": [f"本地模型錯誤:{e}"],
            "改進建議": ["請檢查 Hugging Face 權杖與模型權限、或改用較小模型。"],
            "規則逐點檢核": []
        }
    finally:
        elapsed = time.time() - start_time
        logs.append(f"[LOCAL LLM] 本次執行耗時:{elapsed:.2f} 秒")
# -----------------------------
# 檔案讀取:TXT / PDF(含 OCR fallback)
# -----------------------------
def _read_text_file(path: str) -> str:
"""讀取 TXT:嘗試多種常見編碼(台灣環境可能遇到 cp950/big5)。"""
try:
print(f"[READ] TXT: {os.path.basename(path)}")
encodings = ("utf-8", "utf-8-sig", "cp950", "big5", "latin1")
for enc in encodings:
try:
with open(path, "r", encoding=enc, errors="ignore") as f:
return f.read()
except Exception:
continue
with open(path, "rb") as f:
return f.read().decode("utf-8", errors="ignore")
except Exception as e:
print(f"[READ][ERROR] TXT {path}: {e}")
return ""
def _ensure_ocr_ready():
    """Lazily initialize the module-level OCR engine on first use.

    Keeps PaddleOCR out of memory until a scanned PDF page actually needs it.
    Note: if _init_ocr fails it returns None, so this is retried on the next
    call rather than cached as a permanent failure.
    """
    global OCR
    if OCR is None:
        OCR = _init_ocr()
def _ocr_page_text(page) -> str:
    """Rasterize one PDF page and OCR it; return "" when OCR is unavailable
    or fails.

    The page is rendered at 240 dpi — assumed to balance recognition quality
    against speed; adjust if scans come out unreadable.
    """
    try:
        _ensure_ocr_ready()
        if OCR is None:
            return ""
        pix = page.get_pixmap(dpi=240)
        image = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
        result = OCR.ocr(np.array(image), cls=True)
        if not result or not result[0]:
            return ""
        # Each entry's [1][0] holds the recognized text string.
        recognized = [entry[1][0] for entry in result[0]]
        return "\n".join(recognized).strip()
    except Exception as exc:
        print(f"[OCR][ERROR] {exc}")
        return ""
def _read_pdf_text(path: str) -> Tuple[str, int]:
    """Extract text from a PDF, OCR-ing pages that look like scanned images.

    A page yielding fewer than 20 characters of embedded text is treated as
    an image page and sent through OCR instead.
    Returns (full_text, page_count); ("", 0) on failure.
    """
    try:
        print(f"[READ] PDF: {os.path.basename(path)}")
        chunks: List[str] = []
        with fitz.open(path) as doc:
            page_count = len(doc)
            for page in doc:
                embedded = (page.get_text("text") or "").strip()
                if len(embedded) >= 20:
                    chunks.append(embedded)
                else:
                    chunks.append(_ocr_page_text(page))
        return "\n".join(chunks).strip(), page_count
    except Exception as exc:
        print(f"[READ][ERROR] PDF {path}: {exc}")
        return "", 0
def _read_file_to_text(file_path: Optional[str]) -> Tuple[str, str]:
"""統一入口:根據副檔名分派至 TXT/PDF 讀取函式。"""
try:
if not file_path:
raise ValueError("未收到檔案路徑")
file_path = os.fspath(file_path)
if not os.path.exists(file_path):
raise FileNotFoundError(f"檔案不存在:{file_path}")
fname = os.path.basename(file_path)
ext = os.path.splitext(fname)[1].lower()
if ext == ".txt":
content = _read_text_file(file_path)
return content, f"讀取 TXT:{fname}(長度:{len(content)})"
elif ext == ".pdf":
content, pages = _read_pdf_text(file_path)
return content, f"讀取 PDF:{fname}(頁數:{pages};長度:{len(content)})"
else:
raise ValueError("僅支援 .txt 或 .pdf")
except Exception as e:
print(f"[READ][ERROR] {e}")
return "", f"[ERROR] {e}"
# -----------------------------
# 多檔輔助:清單摘要 / 快取與預覽
# -----------------------------
def _read_multi(paths: Optional[List[str]]) -> Dict[str, str]:
"""批次讀取多檔,回傳 {檔名: 文字內容},忽略讀取錯誤。"""
cache: Dict[str, str] = {}
try:
if not paths:
return cache
for p in paths:
if not p:
continue
try:
name = os.path.basename(os.fspath(p))
text, _ = _read_file_to_text(p)
cache[name] = text
except Exception as e:
print(f"[READ][WARN] skip {p}: {e}")
continue
return cache
except Exception as e:
print(f"[READ_MULTI][ERROR] {e}")
return cache
def _summarize_paths(paths: Optional[List[str]]) -> str:
"""列出上傳檔案清單(含大小),供 UI 顯示。"""
try:
if not paths:
return "(尚未上傳任何檔案)"
lines = []
for p in paths:
try:
name = os.path.basename(os.fspath(p))
size = os.path.getsize(os.fspath(p))
lines.append(f"- {name}({size} bytes)")
except Exception:
lines.append(f"- {os.path.basename(str(p))}")
return "\n".join(lines)
except Exception as e:
print(f"[SUMMARY][ERROR] {e}")
return "(清單生成失敗)"
def on_files_change(paths: Optional[List[str]]):
    """Gradio callback: uploads changed → refresh list text, checkbox
    choices, hide the preview, and rebuild the content cache."""
    try:
        contents = _read_multi(paths)
        summary = _summarize_paths(paths)
        checkbox = gr.update(choices=list(contents.keys()), value=[])
        preview = gr.update(value="", visible=False)
        return summary, checkbox, preview, contents
    except Exception as exc:
        print(f"[FILES_CHANGE][ERROR] {exc}")
        return "(更新失敗)", gr.update(), gr.update(), {}
def on_show_toggle(selected_names: List[str], cache: Dict[str, str]):
    """Gradio callback: show the merged (and possibly truncated) content of
    the checked files; hide the preview when nothing is checked."""
    try:
        if not selected_names:
            return gr.update(value="", visible=False)
        limit = max(2000, MAX_TOKENS_INPUT // 5)  # cap each file's preview
        sections = []
        for name in selected_names:
            body = cache.get(name, "")
            if len(body) > limit:
                body = body[:limit] + "\n...[內容過長,已截斷顯示]"
            sections.append(f"===== {name} =====\n{body}")
        return gr.update(value="\n\n".join(sections), visible=True)
    except Exception as exc:
        print(f"[SHOW_TOGGLE][ERROR] {exc}")
        return gr.update(value=f"[ERROR] {exc}", visible=True)
# -----------------------------
# 檢核前驗證 / Prompt 構建
# -----------------------------
def validate_before_run(
    policy_files: List[str],
    self_rules: str,
    self_text: str,
    self_files: List[str],
) -> Tuple[bool, str]:
    """Pre-run validation: ≥1 policy file, non-empty rules, and either
    self-assessment text or at least one attachment.

    Returns (ok, newline-joined error messages — empty when ok).
    """
    try:
        problems = []
        has_policy = bool(policy_files)
        has_rules = bool(self_rules and self_rules.strip())
        has_self = bool(self_text and self_text.strip()) or bool(self_files)
        if not has_policy:
            problems.append("請上傳『單位適用內規(可多檔)』至少一個檔案。")
        if not has_rules:
            problems.append("請輸入『自評檢核規則』。")
        if not has_self:
            problems.append("請提供『單位自評』文字,或上傳『單位自評附檔(可多檔)』至少一個。")
        return not problems, "\n".join(problems)
    except Exception as exc:
        print(f"[VALIDATE][ERROR] {exc}")
        return False, f"驗證失敗:{exc}"
def build_prompt(rules: str, doc_text: str) -> List[dict]:
    """Assemble the chat messages (system + user) from the check rules and
    the full document text."""
    try:
        user_content = f"{DEMO_PROMPT_INSTRUCTIONS}\n\n[檢核規範]\n{rules}\n\n[文件全文]\n{doc_text}"
        return [
            {"role": "system", "content": SYSTEM_MESSAGE},
            {"role": "user", "content": user_content},
        ]
    except Exception as exc:
        print(f"[PROMPT][ERROR] {exc}")
        return [{"role": "system", "content": SYSTEM_MESSAGE},
                {"role": "user", "content": "(構建 Prompt 失敗)"}]
# -----------------------------
# 主流程:執行檢核
# -----------------------------
def run_check_with_log(
    policy_files_paths,   # internal-policy files (multiple; required)
    self_rules_text,      # self-assessment check rules (required; UI preloads DEFAULT_RULES)
    self_text,            # self-assessment free text (may be empty)
    self_files_paths      # self-assessment attachments (multiple; may be empty)
):
    """Main pipeline: validate inputs, read files, build the prompt, call the
    local LLM, and return (info_markdown, pretty_json, table_rows, debug_log).
    """
    logs: List[str] = []
    try:
        # 1) Input validation — bail out early with the error text as the result.
        ok, msg = validate_before_run(policy_files_paths, self_rules_text, self_text, self_files_paths)
        if not ok:
            msg_display = msg.replace("\n", ";")
            logs.append(f"[VALIDATE] {msg_display}")
            return msg, None, None, "\n".join(logs)
        # 2) Read policy files — currently only logged for audit trail; their
        #    text is NOT fed into the prompt.
        for p in (policy_files_paths or []):
            t, info = _read_file_to_text(p)
            logs.append(f"[POLICY] {info}")
        # 3) Collect self-assessment content: textbox wins; otherwise merge
        #    the non-empty attachment texts.
        doc_text = ""
        if self_text and self_text.strip():
            doc_text = self_text.strip()
            logs.append(f"[SELF] from textbox, len={len(doc_text)}")
        else:
            pieces = []
            for p in (self_files_paths or []):
                t, info = _read_file_to_text(p)
                logs.append(f"[SELF_FILE] {info}")
                if t.strip():
                    pieces.append(t.strip())
            doc_text = "\n\n".join(pieces).strip()
        if not doc_text:
            return "未取得任何自評內容(文字與附檔皆為空)。", None, None, "\n".join(logs)
        # 4) Conservative length cap — NOTE this truncates by *characters*,
        #    despite the MAX_TOKENS_INPUT name.
        if len(doc_text) > MAX_TOKENS_INPUT:
            logs.append(f"[WARN] self text too long ({len(doc_text)}) → truncate to {MAX_TOKENS_INPUT}")
            doc_text = doc_text[:MAX_TOKENS_INPUT]
        # 5) Build chat messages and run the local LLM.
        msgs = build_prompt(self_rules_text.strip(), doc_text)
        logs.append("=== Prompt ===")
        logs.append(f"[system] len={len(msgs[0]['content']) if msgs else 0}")
        logs.append(f"[user] len={len(msgs[1]['content']) if len(msgs)>1 else 0}")
        result_dict = call_llm(msgs, model="local", logs=logs)
        logs.append("[LLM] done.")
        pretty = json.dumps(result_dict, ensure_ascii=False, indent=2)
        # 6) Flatten the per-rule findings into rows for the Dataframe output.
        table = [
            [i.get("規則編號",""), i.get("規則內容",""), i.get("判斷",""), i.get("理由",""), i.get("建議","")]
            for i in result_dict.get("規則逐點檢核", [])
        ]
        info = f"自評內容長度:{len(doc_text)};自評檢核規則長度:{len(self_rules_text.strip())}"
        return info, pretty, table, "\n".join(logs)
    except Exception as e:
        logs.append(f"[RUN][ERROR] {e}")
        return f"執行失敗:{e}", None, None, "\n".join(logs)
def toggle_debug(current_visibility: bool):
    """Flip the Debug Log textbox visibility and return the new state."""
    try:
        flipped = not current_visibility
        return gr.update(visible=flipped), flipped
    except Exception as exc:
        print(f"[DEBUG_TOGGLE][ERROR] {exc}")
        return gr.update(), current_visibility
# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="法遵自評覆核(PaddleOCR + LLaMA 本地)") as demo:
    gr.Markdown("# 法遵自評覆核(支援 PDF OCR:繁/簡中文;本地 LLaMA 推論)")
    gr.Markdown("步驟:上傳『單位適用內規(可多檔)』 → 輸入『自評檢核規則』(預設載入 config) → 輸入『單位自評』或上傳『自評附檔(可多檔)』 → 開始檢核。")
    # Policy files (multi-upload) + list + per-file preview toggles + preview
    # area (hidden by default).
    with gr.Row():
        policy_files_in = gr.Files(
            label="上傳單位適用內規(可多檔,.txt 或 .pdf)",
            file_types=[".txt", ".pdf"],
            type="filepath"
        )
        policy_list_md = gr.Markdown("(尚未上傳任何檔案)")
    policy_show_chk = gr.CheckboxGroup(label="顯示哪些內規內容(個別開關,預設不勾選)", choices=[])
    policy_show_area = gr.Textbox(label="內規檔案內容預覽(僅顯示勾選者,可能截斷)", value="", lines=15, visible=False)
    policy_cache_state = gr.State({})  # {filename: text} cache for previews
    # Self-assessment check rules (preloaded from config; editable; required).
    with gr.Row():
        self_rules_in = gr.Textbox(
            label="自評檢核規則(必填;預設載入自 config)",
            value=DEFAULT_RULES,
            lines=8,
            placeholder="請貼上要檢核自評內容的規則;例如:需包含抽檢紀錄、會辦流程、簽核證明等。"
        )
    # Self-assessment free text (optional — attachments may be used instead).
    with gr.Row():
        self_text_in = gr.Textbox(
            label="單位自評(文字輸入;可留白改上傳自評附檔)",
            value="",
            lines=10,
            placeholder="請輸入自評文字;或留白並在下方上傳自評附檔(可多檔)"
        )
    # Self-assessment attachments (multi-upload) + list + preview toggles.
    with gr.Row():
        self_files_in = gr.Files(
            label="上傳單位自評附檔(可多檔,.txt 或 .pdf)",
            file_types=[".txt", ".pdf"],
            type="filepath"
        )
        self_list_md = gr.Markdown("(尚未上傳任何檔案)")
    self_show_chk = gr.CheckboxGroup(label="顯示哪些自評附檔內容(個別開關,預設不勾選)", choices=[])
    self_show_area = gr.Textbox(label="自評附檔內容預覽(僅顯示勾選者,可能截斷)", value="", lines=15, visible=False)
    self_cache_state = gr.State({})  # {filename: text} cache for previews
    # Control row: run button starts disabled until inputs validate.
    with gr.Row():
        run_btn = gr.Button("開始檢核", variant="primary", interactive=False)
        toggle_debug_btn = gr.Button("切換 Debug Log 顯示/隱藏")
    info_out = gr.Markdown(label="讀檔/輸入資訊")
    json_out = gr.Code(label="檢核結果(JSON)", language="json")
    table_out = gr.Dataframe(headers=["規則編號","規則內容","判斷","理由","建議"], wrap=True)
    debug_log = gr.Textbox(label="Debug Log", lines=14, visible=False)
    debug_state = gr.State(False)  # tracks Debug Log visibility
    # Policy files: wire upload → list / checkbox / preview / cache.
    policy_files_in.change(
        fn=on_files_change,
        inputs=[policy_files_in],
        outputs=[policy_list_md, policy_show_chk, policy_show_area, policy_cache_state]
    )
    policy_show_chk.change(
        fn=on_show_toggle,
        inputs=[policy_show_chk, policy_cache_state],
        outputs=[policy_show_area]
    )
    # Self-assessment attachments: same wiring as policy files.
    self_files_in.change(
        fn=on_files_change,
        inputs=[self_files_in],
        outputs=[self_list_md, self_show_chk, self_show_area, self_cache_state]
    )
    self_show_chk.change(
        fn=on_show_toggle,
        inputs=[self_show_chk, self_cache_state],
        outputs=[self_show_area]
    )
    # Run-button enablement: policy file(s) + rules + self text-or-files.
    def _ready(policy_files, self_rules, self_text, self_files):
        """Recompute the run button's interactivity from current inputs."""
        try:
            ok, _ = validate_before_run(policy_files, self_rules, self_text, self_files)
            return gr.update(interactive=ok)
        except Exception as e:
            print(f"[READY][ERROR] {e}")
            return gr.update(interactive=False)
    policy_files_in.change(_ready, [policy_files_in, self_rules_in, self_text_in, self_files_in], [run_btn])
    self_rules_in.change(_ready, [policy_files_in, self_rules_in, self_text_in, self_files_in], [run_btn])
    self_text_in.change(_ready, [policy_files_in, self_rules_in, self_text_in, self_files_in], [run_btn])
    self_files_in.change(_ready, [policy_files_in, self_rules_in, self_text_in, self_files_in], [run_btn])
    # Run the full check pipeline.
    run_btn.click(
        fn=run_check_with_log,
        inputs=[policy_files_in, self_rules_in, self_text_in, self_files_in],
        outputs=[info_out, json_out, table_out, debug_log]
    )
    toggle_debug_btn.click(fn=toggle_debug, inputs=[debug_state], outputs=[debug_log, debug_state])
if __name__ == "__main__":
    # Spaces requires binding 0.0.0.0; the same settings also work locally.
    # PORT env overrides the default 7860.
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), share=False)