| from pathlib import Path |
| import json |
|
|
| from legalrag.config import AppConfig |
| from legalrag.utils.logger import get_logger |
|
|
# Module-level logger, obtained from the project's get_logger helper using
# this module's name; main() uses it for its progress messages.
logger = get_logger(__name__)
|
|
# Keyword lists used by classify_segment(). A segment matches a list when it
# contains at least one of the list's entries as a substring.
RULE_KWS = ["应当", "可以", "不得", "禁止", "应予"]
# NOTE(review): "经" is a single character and will also match inside longer
# words; confirm that this breadth is intended.
CONDITION_KWS = ["如果", "若", "经", "在下列情形", "有下列情形", "符合下列条件"]
EFFECT_KWS = ["无效", "失效", "生效", "解除", "承担", "赔偿", "不再履行"]


def classify_segment(text: str) -> str:
    """Return the segment type of *text* based on keyword matching.

    The keyword lists are checked in a fixed priority order -- rule, then
    condition, then effect -- and the label of the first list with at least
    one keyword occurring in the text is returned.

    Args:
        text: The segment text. ``None`` or any other non-string value is
            tolerated and classified as ``"other"`` instead of raising.

    Returns:
        One of ``"rule"``, ``"condition"``, ``"effect"`` or ``"other"``
        (``"other"`` when no keyword matches).
    """
    # Records read from JSON may carry a null or non-string "text"; calling
    # .strip() on those would raise AttributeError.
    if not isinstance(text, str):
        return "other"

    stripped = text.strip()
    # Order matters: an earlier entry wins when several lists match.
    labelled_keywords = (
        ("rule", RULE_KWS),
        ("condition", CONDITION_KWS),
        ("effect", EFFECT_KWS),
    )
    for label, keywords in labelled_keywords:
        if any(keyword in stripped for keyword in keywords):
            return label
    return "other"
|
|
|
|
def main() -> None:
    """Add segment-type annotations to every record of the processed file.

    Loads the application config, reads the JSONL file named by
    ``cfg.retrieval.processed_file`` one line at a time, and writes each
    record to ``contract_law_structured.jsonl`` in the same directory with
    two added keys:

    * ``segment_type`` -- the result of ``classify_segment`` on the
      record's ``text`` value (a missing or null ``text`` is treated as an
      empty string).
    * ``role_tags`` -- ``[segment_type]``, or an empty list when the type
      is ``"other"``.

    Blank lines in the input are skipped rather than parsed.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    cfg = AppConfig.load()
    in_path: Path = Path(cfg.retrieval.processed_file)
    out_path: Path = in_path.parent / "contract_law_structured.jsonl"

    logger.info("Enriching structure for %s", in_path)
    n = 0
    with in_path.open("r", encoding="utf-8") as fin, \
            out_path.open("w", encoding="utf-8") as fout:
        for line in fin:
            # A blank line (commonly a trailing one) is not a record; passing
            # it to json.loads would abort the run after the output file has
            # already been truncated.
            if not line.strip():
                continue
            obj = json.loads(line)
            # `or ""` covers records whose "text" key is present but null.
            seg_type = classify_segment(obj.get("text") or "")
            obj["segment_type"] = seg_type
            obj["role_tags"] = [seg_type] if seg_type != "other" else []
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n += 1
    logger.info("Saved %d structured records to %s", n, out_path)
|
|
|
|
# Run the enrichment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
|