File size: 1,450 Bytes
ddbc0c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from pathlib import Path
import json

from legalrag.config import AppConfig
from legalrag.utils.logger import get_logger

# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)

# Keyword groups consulted by classify_segment(). Matching is a plain
# substring test, and the groups are tried in the order rule -> condition ->
# effect, so a segment that contains keywords from several groups receives
# the label of the first group that matches.

# Normative wording ("shall", "may", "must not", "prohibited", "shall be ...").
RULE_KWS = ["应当", "可以", "不得", "禁止", "应予"]
# Conditional wording ("if", "where", "upon/through", "in the following
# circumstances", "where any of the following circumstances exist",
# "meets the following conditions").
# NOTE(review): "经" is a single, very common character, so it will also match
# inside unrelated words -- confirm this breadth is intended.
CONDITION_KWS = ["如果", "若", "经", "在下列情形", "有下列情形", "符合下列条件"]
# Legal-consequence wording ("invalid", "lapses", "takes effect", "rescind",
# "bear", "compensate", "no longer perform").
EFFECT_KWS = ["无效", "失效", "生效", "解除", "承担", "赔偿", "不再履行"]


def classify_segment(text: str) -> str:
    """Label a text segment by the first keyword group it matches.

    The groups are tried in a fixed order -- ``RULE_KWS``, then
    ``CONDITION_KWS``, then ``EFFECT_KWS`` -- and a group matches when any of
    its keywords occurs as a substring of the whitespace-stripped text.

    Args:
        text: The segment text to classify.

    Returns:
        ``"rule"``, ``"condition"`` or ``"effect"`` for the first matching
        group, or ``"other"`` when no keyword is found.
    """
    stripped = text.strip()
    # Order matters: earlier entries take precedence over later ones.
    labelled_groups = (
        ("rule", RULE_KWS),
        ("condition", CONDITION_KWS),
        ("effect", EFFECT_KWS),
    )
    for label, keywords in labelled_groups:
        for keyword in keywords:
            if keyword in stripped:
                return label
    return "other"


def main() -> None:
    """Annotate every record of the processed JSONL file with a segment type.

    Reads the file named by ``cfg.retrieval.processed_file`` as one JSON
    object per line, classifies each object's ``"text"`` value with
    ``classify_segment``, and writes the object back out with two added keys:

    * ``"segment_type"`` -- the label returned by ``classify_segment``.
    * ``"role_tags"`` -- ``[label]``, or ``[]`` when the label is ``"other"``.

    The result is written to ``contract_law_structured.jsonl`` in the same
    directory as the input file. Blank input lines are skipped, and a missing
    or null ``"text"`` is classified as an empty string.

    Raises:
        ValueError: If the configured input file is the output file itself.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    cfg = AppConfig.load()
    in_path: Path = Path(cfg.retrieval.processed_file)
    out_path: Path = in_path.parent / "contract_law_structured.jsonl"
    if out_path == in_path:
        # Opening the output in "w" mode would truncate the input before a
        # single record is read, silently destroying the data.
        raise ValueError(f"Input and output are the same file: {in_path}")

    logger.info("Enriching structure for %s", in_path)
    n = 0
    with in_path.open("r", encoding="utf-8") as fin, out_path.open("w", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) are not records,
                # and json.loads would raise on them.
                continue
            obj = json.loads(line)
            # `or ""` also covers an explicit JSON null, which .get() would
            # otherwise pass through as None.
            seg_type = classify_segment(obj.get("text") or "")
            obj["segment_type"] = seg_type
            obj["role_tags"] = [seg_type] if seg_type != "other" else []
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n += 1
    logger.info("Saved %d structured records to %s", n, out_path)


# Run the enrichment only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()