Spaces:
Sleeping
Sleeping
| # utils/metadata.py | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline | |
| import re | |
| import dateparser | |
# Load an advanced English NER model (RoBERTa-large fine-tuned for NER).
# NOTE: these run at import time and download weights on first use.
model_name = "Jean-Baptiste/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# Build the NER pipeline; aggregation_strategy="simple" merges sub-word
# tokens into whole-entity spans (one entry per detected entity).
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def clean_text(text):
    """
    Normalize whitespace in contract text for better NER and regex performance.

    Collapses every run of whitespace (newlines, tabs, repeated spaces)
    into a single space and strips leading/trailing whitespace.

    Args:
        text: Raw contract text.

    Returns:
        The cleaned, single-line text.
    """
    # BUG FIX: the old chained .replace("  ", " ") only halved runs of
    # spaces, so 3+ consecutive spaces left residual gaps; one regex pass
    # collapses every whitespace run completely.
    return re.sub(r"\s+", " ", text).strip()
def extract_effective_date(text):
    """
    Extract a natural-language effective date (e.g. 'as of August 28, 2025').

    Args:
        text: Contract text to search.

    Returns:
        A one-element list with the date normalized to YYYY-MM-DD, or []
        when no parseable date follows an 'as of' phrase.
    """
    # BUG FIX: the old terminator (\.|,|\n) stopped at the FIRST comma, so
    # 'as of August 28, 2025' was truncated to 'August 28' (year lost), and
    # a date at the very end of the text never matched at all. A comma now
    # terminates only when it is NOT followed by a 4-digit year, and
    # end-of-string is accepted as a terminator.
    match = re.search(r"(?i)as of (.+?)(?:[.\n]|,(?!\s*\d{4})|$)", text)
    if match:
        raw_date = match.group(1).strip()
        parsed = dateparser.parse(raw_date)
        if parsed:
            return [parsed.strftime("%Y-%m-%d")]
    return []
def extract_parties(text):
    """
    Pull the two contracting parties from a 'by and between X and Y' clause.

    Args:
        text: Contract text to search.

    Returns:
        A two-element list [first_party, second_party], or [] when no
        such clause is found.
    """
    between_clause = re.search(
        r"(?i)by and between[:\s\n]+(.+?)\s+and\s+(.+?)\s*(\(|\n|$)",
        text,
        re.DOTALL,
    )
    if between_clause is None:
        return []
    first_party = between_clause.group(1)
    second_party = between_clause.group(2)
    return [first_party.strip(), second_party.strip()]
def extract_governing_law(text):
    """
    Capture the governing-law jurisdiction, even when phrased indirectly.

    Args:
        text: Contract text to search.

    Returns:
        A one-element list with the matched jurisdiction text, or []
        when neither phrasing appears.
    """
    law_patterns = (
        r"(?i)governed by the laws of ([\w\s,]+)",
        r"(?i)under the laws of ([\w\s,]+)",
    )
    for law_pattern in law_patterns:
        hit = re.search(law_pattern, text)
        if hit is not None:
            return [hit.group(1).strip()]
    return []
def extract_venue(text):
    """
    Find the dispute-resolution venue, e.g. 'submitted to the courts in XYZ'.

    Args:
        text: Contract text to search.

    Returns:
        A one-element list with the venue text, or [] when absent.
    """
    venue_match = re.search(r"(?i)submitted to.*?in ([\w\s,]+)", text)
    if venue_match is None:
        return []
    return [venue_match.group(1).strip()]
def extract_metadata(text):
    """
    Extract structured contract metadata using hybrid rule-based + NER extraction.

    Args:
        text: Raw contract text.

    Returns:
        dict mapping metadata field names (EFFECTIVE_DATE, PARTIES,
        GOVERNING_LAW, JURISDICTION, VENUE) to lists of extracted strings,
        or {"error": ...} when the input is empty/whitespace.
    """
    if not text.strip():
        return {"error": "No input provided."}

    text = clean_text(text)

    # Chunk by words so each chunk stays within the model's 512-token
    # context window (approximate: word count used as a proxy for tokens).
    max_chunk_length = 512
    words = text.split()
    chunks = [" ".join(words[i:i + max_chunk_length])
              for i in range(0, len(words), max_chunk_length)]

    # BUG FIX: "VENUE" must be initialized here — it was previously absent,
    # so the fallback read of ner_metadata["VENUE"] below raised KeyError
    # whenever extract_venue() found nothing.
    ner_metadata = {
        "EFFECTIVE_DATE": [],
        "PARTIES": [],
        "GOVERNING_LAW": [],
        "JURISDICTION": [],
        "VENUE": [],
    }
    label_mapping = {
        "DATE": "EFFECTIVE_DATE",
        "PERSON": "PARTIES",
        "ORGANIZATION": "PARTIES",
        "LOCATION": "GOVERNING_LAW",
    }

    # Collect deduplicated NER entities per mapped category.
    for chunk in chunks:
        for ent in ner_pipeline(chunk):
            custom_label = label_mapping.get(ent["entity_group"])
            if custom_label and ent["word"] not in ner_metadata[custom_label]:
                ner_metadata[custom_label].append(ent["word"])

    # Rule-based extraction takes precedence; NER results are the fallback.
    ner_metadata["PARTIES"] = extract_parties(text) or ner_metadata["PARTIES"]
    ner_metadata["EFFECTIVE_DATE"] = extract_effective_date(text) or ner_metadata["EFFECTIVE_DATE"]
    ner_metadata["GOVERNING_LAW"] = extract_governing_law(text) or ner_metadata["GOVERNING_LAW"]
    venue = extract_venue(text)  # call once; reused for both fields below
    ner_metadata["VENUE"] = venue or ner_metadata["VENUE"]
    ner_metadata["JURISDICTION"] = venue or ner_metadata["JURISDICTION"]
    return ner_metadata