"""Annotate processed legal-text records with a coarse segment type.

Reads the JSONL file named by ``cfg.retrieval.processed_file``, classifies the
``text`` field of every record by keyword matching, and writes the enriched
records to ``contract_law_structured.jsonl`` in the same directory.
"""
import json
from pathlib import Path

from legalrag.config import AppConfig
from legalrag.utils.logger import get_logger

logger = get_logger(__name__)

# Keyword lists used by classify_segment. A segment is assigned the first
# category (in the order rule -> condition -> effect) for which any keyword
# occurs as a substring of the text.
RULE_KWS = ["应当", "可以", "不得", "禁止", "应予"]
CONDITION_KWS = ["如果", "若", "经", "在下列情形", "有下列情形", "符合下列条件"]
EFFECT_KWS = ["无效", "失效", "生效", "解除", "承担", "赔偿", "不再履行"]


def classify_segment(text: str) -> str:
    """Return the segment type of *text* based on keyword substring matches.

    Categories are tested in a fixed priority order, so a text containing both
    a rule keyword and an effect keyword is labelled ``"rule"``.

    Args:
        text: The segment text to classify.

    Returns:
        ``"rule"``, ``"condition"`` or ``"effect"`` for the first category with
        a matching keyword, or ``"other"`` when nothing matches.
    """
    t = text.strip()
    # Order matters: the first matching category wins.
    categories = (
        ("rule", RULE_KWS),
        ("condition", CONDITION_KWS),
        ("effect", EFFECT_KWS),
    )
    for label, keywords in categories:
        if any(k in t for k in keywords):
            return label
    return "other"


def main() -> None:
    """Enrich every record of the processed JSONL file with segment metadata.

    Each output record is the input record plus two keys:

    * ``segment_type`` - the label returned by :func:`classify_segment`.
    * ``role_tags`` - ``[segment_type]``, or an empty list for ``"other"``.

    Blank lines in the input are skipped, and a missing or null ``text`` field
    is classified as an empty string (i.e. ``"other"``).

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    cfg = AppConfig.load()
    in_path: Path = Path(cfg.retrieval.processed_file)
    out_path: Path = in_path.parent / "contract_law_structured.jsonl"

    logger.info("Enriching structure for %s", in_path)

    n = 0
    with in_path.open("r", encoding="utf-8") as fin, out_path.open(
        "w", encoding="utf-8"
    ) as fout:
        for line in fin:
            # JSONL files commonly end with (or contain) empty lines;
            # json.loads would raise on them, so skip instead of aborting.
            if not line.strip():
                continue
            obj = json.loads(line)
            # `or ""` also covers records where "text" is present but null.
            seg_type = classify_segment(obj.get("text") or "")
            obj["segment_type"] = seg_type
            obj["role_tags"] = [seg_type] if seg_type != "other" else []
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n += 1

    logger.info("Saved %d structured records to %s", n, out_path)


if __name__ == "__main__":
    main()