Clause_Lense / spacy_matcher.py
solfedge's picture
Upload 4 files
1f8cd6e verified
import spacy
from spacy.matcher import Matcher
# Load the "en_core_web_sm" pipeline once at import time; it is reused by
# find_clauses() to parse every input text.
nlp = spacy.load("en_core_web_sm")
# Token-pattern matcher built on the loaded pipeline's vocabulary.
matcher = Matcher(nlp.vocab)
# Token patterns for each clause label, in spaCy Matcher format: every inner
# list is one pattern, and every dict in it constrains one token.
#
# NOTE: spaCy's default English tokenizer splits hyphens between letters, so
# "non-disclosure" becomes the three tokens "non", "-", "disclosure". A
# single-token pattern such as {"LOWER": "non-disclosure"} therefore cannot
# match on its own; the explicit three-token variants below cover that case.
# The single-token forms are kept for pipelines whose tokenizer leaves
# hyphenated words intact.
clause_patterns = {
    "CONFIDENTIALITY": [
        [{"LOWER": "confidentiality"}],
        [{"LOWER": "non-disclosure"}],
        [{"LOWER": "non"}, {"ORTH": "-"}, {"LOWER": "disclosure"}],
        [{"LOWER": "nda"}],
        [{"LOWER": "proprietary"}, {"LOWER": "information"}],
    ],
    "TERMINATION": [
        [{"LOWER": "termination"}],
        [{"LOWER": "end"}, {"LOWER": "of"}, {"LOWER": "agreement"}],
        [{"LOWER": "terminate"}, {"LOWER": "this"}, {"LOWER": "agreement"}],
    ],
    "NON_COMPETE": [
        [{"LOWER": "non-compete"}],
        [{"LOWER": "non"}, {"ORTH": "-"}, {"LOWER": "compete"}],
        [{"LOWER": "non"}, {"LOWER": "compete"}],
        [{"LOWER": "competition"}, {"LOWER": "restriction"}],
    ],
    "GOVERNING_LAW": [
        [{"LOWER": "governing"}, {"LOWER": "law"}],
        [{"LOWER": "jurisdiction"}],
        [{"LOWER": "choice"}, {"LOWER": "of"}, {"LOWER": "law"}],
    ],
    "SEVERABILITY": [
        [{"LOWER": "severability"}],
        # The middle {"OP": "?"} token has no attribute constraint, so it
        # optionally matches any single token (e.g. "invalidity of provision").
        [{"LOWER": "invalidity"}, {"OP": "?"}, {"LOWER": "provision"}],
        [{"LOWER": "severable"}],
    ],
    "LIABILITY": [
        [{"LOWER": "liability"}],
        [{"LOWER": "limitation"}, {"LOWER": "of"}, {"LOWER": "liability"}],
        [{"LOWER": "indemnification"}],
        [{"LOWER": "cap"}, {"LOWER": "on"}, {"LOWER": "damages"}],
    ],
    "FORCE_MAJEURE": [
        [{"LOWER": "force"}, {"LOWER": "majeure"}],
        [{"LOWER": "acts"}, {"LOWER": "of"}, {"LOWER": "god"}],
        [{"LOWER": "unforeseen"}, {"LOWER": "events"}],
        [{"LOWER": "pandemic"}],
    ],
    "PAYMENT_TERMS": [
        [{"LOWER": "payment"}, {"LOWER": "terms"}],
        [
            {"LOWER": "due"},
            {"LOWER": "within"},
            {"IS_DIGIT": True},
            {"LOWER": {"IN": ["days", "weeks"]}},
        ],
        [{"LOWER": "invoice"}, {"LOWER": "shall"}, {"LOWER": "be"}],
        [{"LOWER": "net"}, {"IS_DIGIT": True}],
    ],
}
# Register every label's pattern list with the matcher, one call per label.
for clause_label in clause_patterns:
    matcher.add(clause_label, clause_patterns[clause_label])

print("Clause matcher loaded with extended patterns.")
def find_clauses(text: str, window_size: int = 30):
    """Find clause keywords in ``text`` and return each hit with its context.

    Args:
        text: Raw text to scan; it is parsed with the module-level ``nlp``
            pipeline.
        window_size: Number of tokens of context to include on each side of
            a match. Negative values are treated as 0 so that the context
            always contains the matched tokens.

    Returns:
        A list of ``(label, context, start, end)`` tuples, one per match
        reported by the module-level ``matcher``. ``label`` is the name the
        matching pattern was registered under, ``context`` is the text of
        the token window around the match, and ``start``/``end`` are the
        token offsets of the match within the parsed document.
    """
    doc = nlp(text)
    # A negative window would move the slice bounds inside the match itself
    # and truncate (or empty) the context, so clamp it at zero.
    window = max(0, window_size)
    token_count = len(doc)

    results = []
    for match_id, start, end in matcher(doc):
        # The matcher reports an ID; the vocab's string store maps it back
        # to the label the pattern was registered under.
        label = nlp.vocab.strings[match_id]
        # Clamp the window to the document bounds.
        ctx_start = max(0, start - window)
        ctx_end = min(token_count, end + window)
        results.append((label, doc[ctx_start:ctx_end].text, start, end))
    return results