Legal-RAG / scripts /enrich_structure.py
flora-l's picture
recreate the space for demo
ddbc0c8
from pathlib import Path
import json
from legalrag.config import AppConfig
from legalrag.utils.logger import get_logger
# Module-level logger obtained from the project's logging helper, keyed by
# this module's name.
logger = get_logger(__name__)
# Keyword groups used by classify_segment(). A segment matches a group when
# any one of the group's keywords occurs in it as a plain substring.

# Normative wording: "shall", "may", "must not", "prohibited", "shall be".
RULE_KWS = [
    "应当",
    "可以",
    "不得",
    "禁止",
    "应予",
]

# Conditional wording: "if", "if", "upon/through", "in the following
# circumstances", "where the following circumstances exist", "meets the
# following conditions".
CONDITION_KWS = [
    "如果",
    "若",
    "经",
    "在下列情形",
    "有下列情形",
    "符合下列条件",
]

# Legal-consequence wording: "invalid", "lapse", "take effect", "rescind",
# "bear (liability)", "compensate", "no longer perform".
EFFECT_KWS = [
    "无效",
    "失效",
    "生效",
    "解除",
    "承担",
    "赔偿",
    "不再履行",
]
def classify_segment(text: str) -> str:
    """Return the label of the first keyword group found in *text*.

    The groups are tried in a fixed priority order -- ``RULE_KWS`` ("rule"),
    then ``CONDITION_KWS`` ("condition"), then ``EFFECT_KWS`` ("effect") --
    so a segment containing keywords from several groups gets the label of
    the earliest one. Matching is plain substring containment.

    Args:
        text: The segment text to classify.

    Returns:
        One of ``"rule"``, ``"condition"``, ``"effect"``, or ``"other"``
        when no keyword of any group occurs in the text.
    """
    stripped = text.strip()

    # Order matters: the first group with a hit wins.
    prioritized_groups = (
        ("rule", RULE_KWS),
        ("condition", CONDITION_KWS),
        ("effect", EFFECT_KWS),
    )
    for label, keywords in prioritized_groups:
        for keyword in keywords:
            if keyword in stripped:
                return label
    return "other"
def main() -> None:
    """Annotate every record of the processed JSONL file with a segment type.

    Reads the file named by ``cfg.retrieval.processed_file`` line by line,
    parses each non-blank line with ``json.loads``, classifies the record's
    ``"text"`` value with ``classify_segment``, and writes the record --
    extended with ``"segment_type"`` and ``"role_tags"`` -- as one JSON line
    to ``contract_law_structured.jsonl`` in the same directory as the input.

    ``"role_tags"`` is ``[segment_type]``, or an empty list when the segment
    type is ``"other"``.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    cfg = AppConfig.load()
    in_path: Path = Path(cfg.retrieval.processed_file)
    out_path: Path = in_path.parent / "contract_law_structured.jsonl"
    logger.info("Enriching structure for %s", in_path)

    n = 0
    with in_path.open("r", encoding="utf-8") as fin, \
            out_path.open("w", encoding="utf-8") as fout:
        for line in fin:
            # Blank lines (typically a trailing newline at end of file) are
            # not records; json.loads would raise on them and abort the run.
            if not line.strip():
                continue
            obj = json.loads(line)
            # `or ""` covers both a missing key and an explicit JSON null,
            # which .get("text", "") would pass through as None.
            seg_type = classify_segment(obj.get("text") or "")
            obj["segment_type"] = seg_type
            obj["role_tags"] = [seg_type] if seg_type != "other" else []
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n += 1
    logger.info("Saved %d structured records to %s", n, out_path)
# Run the enrichment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()