import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Token patterns for each clause type, keyed by the label reported for a match.
# Every inner list is one pattern; every dict in it describes a single token.
#
# NOTE: the English tokenizer splits a hyphen that sits between letters into
# its own token ("non-compete" -> "non", "-", "compete"), so a single-token
# pattern such as {"LOWER": "non-compete"} can never match. Hyphenated terms
# are therefore written as multi-token patterns with an optional "-" token,
# which matches both the hyphenated and the space-separated spelling.
clause_patterns = {
    "CONFIDENTIALITY": [
        [{"LOWER": "confidentiality"}],
        [{"LOWER": "non"}, {"ORTH": "-", "OP": "?"}, {"LOWER": "disclosure"}],
        [{"LOWER": "nda"}],
        [{"LOWER": "proprietary"}, {"LOWER": "information"}],
    ],
    "TERMINATION": [
        [{"LOWER": "termination"}],
        [{"LOWER": "end"}, {"LOWER": "of"}, {"LOWER": "agreement"}],
        [{"LOWER": "terminate"}, {"LOWER": "this"}, {"LOWER": "agreement"}],
    ],
    "NON_COMPETE": [
        [{"LOWER": "non"}, {"ORTH": "-", "OP": "?"}, {"LOWER": "compete"}],
        [{"LOWER": "competition"}, {"LOWER": "restriction"}],
    ],
    "GOVERNING_LAW": [
        [{"LOWER": "governing"}, {"LOWER": "law"}],
        [{"LOWER": "jurisdiction"}],
        [{"LOWER": "choice"}, {"LOWER": "of"}, {"LOWER": "law"}],
    ],
    "SEVERABILITY": [
        [{"LOWER": "severability"}],
        # {"OP": "?"} is a wildcard: at most one arbitrary token may sit
        # between "invalidity" and "provision".
        [{"LOWER": "invalidity"}, {"OP": "?"}, {"LOWER": "provision"}],
        [{"LOWER": "severable"}],
    ],
    "LIABILITY": [
        [{"LOWER": "liability"}],
        [{"LOWER": "limitation"}, {"LOWER": "of"}, {"LOWER": "liability"}],
        [{"LOWER": "indemnification"}],
        [{"LOWER": "cap"}, {"LOWER": "on"}, {"LOWER": "damages"}],
    ],
    "FORCE_MAJEURE": [
        [{"LOWER": "force"}, {"LOWER": "majeure"}],
        [{"LOWER": "acts"}, {"LOWER": "of"}, {"LOWER": "god"}],
        [{"LOWER": "unforeseen"}, {"LOWER": "events"}],
        [{"LOWER": "pandemic"}],
    ],
    "PAYMENT_TERMS": [
        [{"LOWER": "payment"}, {"LOWER": "terms"}],
        [
            {"LOWER": "due"},
            {"LOWER": "within"},
            {"IS_DIGIT": True},
            {"LOWER": {"IN": ["days", "weeks"]}},
        ],
        [{"LOWER": "invoice"}, {"LOWER": "shall"}, {"LOWER": "be"}],
        [{"LOWER": "net"}, {"IS_DIGIT": True}],
    ],
}

# Register every pattern under its clause label (one call per label).
for label, patterns in clause_patterns.items():
    matcher.add(label, patterns)

print("Clause matcher loaded with extended patterns.")


def find_clauses(text: str, window_size: int = 30) -> list:
    """Find clause keywords in *text* and return each hit with its context.

    Args:
        text: Raw text to scan.
        window_size: Number of tokens of context to include on each side of
            the matched span (clipped at the document boundaries).

    Returns:
        A list of ``(label, context, start, end)`` tuples, one per match, in
        the order the matcher reports them. ``label`` is the key from
        ``clause_patterns``, ``context`` is the text of the matched span plus
        the surrounding window, and ``start``/``end`` are the token indices
        of the matched span within the parsed document (``end`` exclusive).
        Overlapping patterns each produce their own entry, e.g. "limitation
        of liability" yields one hit for the full phrase and one for
        "liability".
    """
    doc = nlp(text)
    results = []
    for match_id, start, end in matcher(doc):
        # The matcher reports the label as a hash; map it back to its string.
        label = nlp.vocab.strings[match_id]

        # Widen the span by window_size tokens, clamped to the document.
        ctx_start = max(0, start - window_size)
        ctx_end = min(len(doc), end + window_size)

        results.append((label, doc[ctx_start:ctx_end].text, start, end))
    return results