Spaces:
Sleeping
Sleeping
| # utils/metadata.py | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline | |
| import re | |
| import dateparser | |
# Load an advanced English NER model (RoBERTa-large fine-tuned for NER).
# NOTE: these run at import time and download weights on first use.
model_name = "Jean-Baptiste/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# Build the NER pipeline; aggregation_strategy="simple" merges sub-word
# tokens into whole-entity spans (one entry per detected entity).
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def clean_text(text):
    """
    Normalize whitespace in contract text for better NER and regex performance.

    Collapses every run of whitespace (newlines, tabs, repeated spaces)
    into a single space and strips leading/trailing whitespace.

    Args:
        text: Raw contract text.

    Returns:
        The cleaned, single-line text.
    """
    # BUG FIX: the old chained .replace("  ", " ") only halved runs of
    # spaces, so 3+ consecutive spaces left residual gaps; one regex pass
    # collapses every whitespace run completely.
    return re.sub(r"\s+", " ", text).strip()
def extract_effective_date(text):
    """
    Extract a natural-language effective date (e.g. 'as of August 28, 2025').

    Args:
        text: Contract text to search.

    Returns:
        A one-element list with the date normalized to YYYY-MM-DD, or []
        when no parseable date follows an 'as of' phrase.
    """
    # BUG FIX: the old terminator (\.|,|\n) stopped at the FIRST comma, so
    # 'as of August 28, 2025' was truncated to 'August 28' (year lost), and
    # a date at the very end of the text never matched at all. A comma now
    # terminates only when it is NOT followed by a 4-digit year, and
    # end-of-string is accepted as a terminator.
    match = re.search(r"(?i)as of (.+?)(?:[.\n]|,(?!\s*\d{4})|$)", text)
    if match:
        raw_date = match.group(1).strip()
        parsed = dateparser.parse(raw_date)
        if parsed:
            return [parsed.strftime("%Y-%m-%d")]
    return []
def extract_parties(text):
    """
    Pull the two contracting parties from a 'by and between X and Y' clause.

    Args:
        text: Contract text to search.

    Returns:
        A two-element list [first_party, second_party], or [] when no
        such clause is found.
    """
    between_clause = re.search(
        r"(?i)by and between[:\s\n]+(.+?)\s+and\s+(.+?)\s*(\(|\n|$)",
        text,
        re.DOTALL,
    )
    if between_clause is None:
        return []
    first_party = between_clause.group(1)
    second_party = between_clause.group(2)
    return [first_party.strip(), second_party.strip()]
def extract_governing_law(text):
    """
    Capture the governing-law jurisdiction, even when phrased indirectly.

    Args:
        text: Contract text to search.

    Returns:
        A one-element list with the matched jurisdiction text, or []
        when neither phrasing appears.
    """
    law_patterns = (
        r"(?i)governed by the laws of ([\w\s,]+)",
        r"(?i)under the laws of ([\w\s,]+)",
    )
    for law_pattern in law_patterns:
        hit = re.search(law_pattern, text)
        if hit is not None:
            return [hit.group(1).strip()]
    return []
def extract_venue(text):
    """
    Find the dispute-resolution venue, e.g. 'submitted to the courts in XYZ'.

    Args:
        text: Contract text to search.

    Returns:
        A one-element list with the venue text, or [] when absent.
    """
    venue_match = re.search(r"(?i)submitted to.*?in ([\w\s,]+)", text)
    if venue_match is None:
        return []
    return [venue_match.group(1).strip()]
def extract_metadata(text):
    """
    Extract structured contract metadata using hybrid rule-based + NER extraction.

    Args:
        text: Raw contract text.

    Returns:
        dict mapping metadata field names (EFFECTIVE_DATE, PARTIES,
        GOVERNING_LAW, JURISDICTION, VENUE) to lists of extracted strings,
        or {"error": ...} when the input is empty/whitespace.
    """
    if not text.strip():
        return {"error": "No input provided."}

    text = clean_text(text)

    # Chunk by words so each chunk stays within the model's 512-token
    # context window (approximate: word count used as a proxy for tokens).
    max_chunk_length = 512
    words = text.split()
    chunks = [" ".join(words[i:i + max_chunk_length])
              for i in range(0, len(words), max_chunk_length)]

    # BUG FIX: "VENUE" must be initialized here — it was previously absent,
    # so the fallback read of ner_metadata["VENUE"] below raised KeyError
    # whenever extract_venue() found nothing.
    ner_metadata = {
        "EFFECTIVE_DATE": [],
        "PARTIES": [],
        "GOVERNING_LAW": [],
        "JURISDICTION": [],
        "VENUE": [],
    }
    label_mapping = {
        "DATE": "EFFECTIVE_DATE",
        "PERSON": "PARTIES",
        "ORGANIZATION": "PARTIES",
        "LOCATION": "GOVERNING_LAW",
    }

    # Collect deduplicated NER entities per mapped category.
    for chunk in chunks:
        for ent in ner_pipeline(chunk):
            custom_label = label_mapping.get(ent["entity_group"])
            if custom_label and ent["word"] not in ner_metadata[custom_label]:
                ner_metadata[custom_label].append(ent["word"])

    # Rule-based extraction takes precedence; NER results are the fallback.
    ner_metadata["PARTIES"] = extract_parties(text) or ner_metadata["PARTIES"]
    ner_metadata["EFFECTIVE_DATE"] = extract_effective_date(text) or ner_metadata["EFFECTIVE_DATE"]
    ner_metadata["GOVERNING_LAW"] = extract_governing_law(text) or ner_metadata["GOVERNING_LAW"]
    venue = extract_venue(text)  # call once; reused for both fields below
    ner_metadata["VENUE"] = venue or ner_metadata["VENUE"]
    ner_metadata["JURISDICTION"] = venue or ner_metadata["JURISDICTION"]
    return ner_metadata