# Spaces: Sleeping
import spacy
from spacy.matcher import Matcher

# Shared pipeline objects: the English model parses incoming text, and the
# matcher is built on that same pipeline's vocabulary so the two agree on
# string IDs.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Token patterns for each clause type, keyed by the label the matcher reports.
# Each inner list is one spaCy token pattern (one dict per token); a clause
# label fires when any of its patterns matches.
#
# NOTE: the default English tokenizer splits intra-word hyphens into their own
# tokens ("non-compete" -> "non", "-", "compete"), so hyphenated terms are
# written as three-token patterns with an explicit "-" token. A single-token
# {"LOWER": "non-compete"} pattern would never match.
clause_patterns = {
    "CONFIDENTIALITY": [
        [{"LOWER": "confidentiality"}],
        [{"LOWER": "non"}, {"ORTH": "-"}, {"LOWER": "disclosure"}],
        [{"LOWER": "nda"}],
        [{"LOWER": "proprietary"}, {"LOWER": "information"}],
    ],
    "TERMINATION": [
        [{"LOWER": "termination"}],
        [{"LOWER": "end"}, {"LOWER": "of"}, {"LOWER": "agreement"}],
        [{"LOWER": "terminate"}, {"LOWER": "this"}, {"LOWER": "agreement"}],
    ],
    "NON_COMPETE": [
        [{"LOWER": "non"}, {"ORTH": "-"}, {"LOWER": "compete"}],
        [{"LOWER": "non"}, {"LOWER": "compete"}],
        [{"LOWER": "competition"}, {"LOWER": "restriction"}],
    ],
    "GOVERNING_LAW": [
        [{"LOWER": "governing"}, {"LOWER": "law"}],
        [{"LOWER": "jurisdiction"}],
        [{"LOWER": "choice"}, {"LOWER": "of"}, {"LOWER": "law"}],
    ],
    "SEVERABILITY": [
        [{"LOWER": "severability"}],
        # {"OP": "?"} with no attributes is an optional wildcard: it allows at
        # most one arbitrary token between "invalidity" and "provision".
        [{"LOWER": "invalidity"}, {"OP": "?"}, {"LOWER": "provision"}],
        [{"LOWER": "severable"}],
    ],
    "LIABILITY": [
        [{"LOWER": "liability"}],
        [{"LOWER": "limitation"}, {"LOWER": "of"}, {"LOWER": "liability"}],
        [{"LOWER": "indemnification"}],
        [{"LOWER": "cap"}, {"LOWER": "on"}, {"LOWER": "damages"}],
    ],
    "FORCE_MAJEURE": [
        [{"LOWER": "force"}, {"LOWER": "majeure"}],
        [{"LOWER": "acts"}, {"LOWER": "of"}, {"LOWER": "god"}],
        [{"LOWER": "unforeseen"}, {"LOWER": "events"}],
        [{"LOWER": "pandemic"}],
    ],
    "PAYMENT_TERMS": [
        [{"LOWER": "payment"}, {"LOWER": "terms"}],
        [
            {"LOWER": "due"},
            {"LOWER": "within"},
            {"IS_DIGIT": True},
            {"LOWER": {"IN": ["days", "weeks"]}},
        ],
        [{"LOWER": "invoice"}, {"LOWER": "shall"}, {"LOWER": "be"}],
        [{"LOWER": "net"}, {"IS_DIGIT": True}],
    ],
}
# Register each clause label once, handing the matcher every pattern that
# belongs to it in a single call.
for clause_label, token_patterns in clause_patterns.items():
    matcher.add(clause_label, token_patterns)

print("Clause matcher loaded with extended patterns.")
def find_clauses(text, window_size=30):
    """Run the clause matcher over ``text`` and return each hit with context.

    Args:
        text: Raw text to parse with the module-level ``nlp`` pipeline.
        window_size: Number of tokens to include on each side of a match
            when building its context string.

    Returns:
        A list of ``(label, context, start, end)`` tuples, one per match.
        ``label`` is the key the matching pattern was registered under,
        ``context`` is the text of the widened token window, and ``start`` /
        ``end`` are the token offsets of the match in the parsed document
        (``doc[start:end]`` is the matched span).
    """
    doc = nlp(text)
    token_count = len(doc)
    label_of = nlp.vocab.strings  # maps the matcher's hash IDs back to label strings

    return [
        (
            label_of[match_id],
            # Widen the match by window_size tokens per side, clamped to the
            # document bounds.
            doc[max(0, start - window_size):min(token_count, end + window_size)].text,
            start,
            end,
        )
        for match_id, start, end in matcher(doc)
    ]