Komal133 commited on
Commit
dcbd7b1
·
verified ·
1 Parent(s): e353374

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -39
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import PyPDF2
3
  import nltk
4
- from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
5
  import seaborn as sns
6
  import matplotlib.pyplot as plt
7
  from reportlab.lib.pagesizes import letter
@@ -10,7 +9,6 @@ import json
10
  import os
11
  from io import BytesIO
12
  import numpy as np
13
- import torch
14
  import logging
15
 
16
  # Set up logging
@@ -20,23 +18,17 @@ logger = logging.getLogger(__name__)
20
  # Download NLTK data
21
  nltk.download('punkt')
22
 
23
- # Initialize BERT model and tokenizer
24
- model_name = "nlpaueb/legal-bert-base-uncased"
25
- tokenizer = AutoTokenizer.from_pretrained(model_name)
26
- model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) # 3 labels: penalty, obligation, delay
27
- classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)
28
-
29
- # Map model labels to clause types (adjust based on actual model labels after fine-tuning)
30
- LABEL_MAP = {
31
- "LABEL_0": "penalty",
32
- "LABEL_1": "obligation",
33
- "LABEL_2": "delay"
34
- }
35
-
36
  # Clause types and risk scoring logic
37
  CLAUSE_TYPES = ["penalty", "obligation", "delay"]
38
  RISK_WEIGHTS = {"penalty": 0.8, "obligation": 0.5, "delay": 0.6}
39
 
 
 
 
 
 
 
 
40
  def extract_text_from_pdf(pdf_file):
41
  """Extract text from uploaded PDF file."""
42
  try:
@@ -55,7 +47,7 @@ def extract_text_from_pdf(pdf_file):
55
  return f"Error extracting text: {str(e)}"
56
 
57
  def parse_contract(text):
58
- """Parse contract text into clauses and classify risks."""
59
  # Light cleanup: collapse doubled newlines (single pass) and convert tabs to spaces
60
  text = text.replace("\n\n", "\n").replace("\t", " ")
61
  sentences = nltk.sent_tokenize(text)
@@ -70,31 +62,30 @@ def parse_contract(text):
70
  if len(sentence) < 10: # Skip short sentences
71
  logger.debug(f"Skipping short sentence (length {len(sentence)}): {sentence}")
72
  continue
73
- # Classify clause
74
- try:
75
- classification = classifier(sentence)
76
- logger.debug(f"Classification for sentence {idx}: {classification}")
77
- # Map model labels to clause types
78
- top_label = max(classification[0], key=lambda x: x['score'])['label']
79
- clause_type = LABEL_MAP.get(top_label, None)
80
- if clause_type not in CLAUSE_TYPES:
81
- logger.debug(f"Clause type {clause_type} not in {CLAUSE_TYPES}, skipping.")
82
- continue
83
-
84
- # Calculate risk score
85
- score = classification[0][[label for label in LABEL_MAP if LABEL_MAP[label] == clause_type][0]]['score'] * RISK_WEIGHTS[clause_type]
86
- results.append({
87
- "clause_id": idx,
88
- "text": sentence,
89
- "clause_type": clause_type,
90
- "risk_score": round(score, 2)
91
- })
92
- risk_scores.append(score)
93
- logger.info(f"Detected clause {idx}: {clause_type} with risk score {score}")
94
- except Exception as e:
95
- logger.error(f"Error classifying sentence {idx}: {str(e)}")
96
  continue
97
 
 
 
 
 
 
 
 
 
 
 
 
98
  return results, risk_scores
99
 
100
  def generate_heatmap(risk_scores):
 
1
  import gradio as gr
2
  import PyPDF2
3
  import nltk
 
4
  import seaborn as sns
5
  import matplotlib.pyplot as plt
6
  from reportlab.lib.pagesizes import letter
 
9
  import os
10
  from io import BytesIO
11
  import numpy as np
 
12
  import logging
13
 
14
  # Set up logging
 
18
  # Download NLTK data
19
  nltk.download('punkt')
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # Clause types and risk scoring logic
22
  CLAUSE_TYPES = ["penalty", "obligation", "delay"]
23
  RISK_WEIGHTS = {"penalty": 0.8, "obligation": 0.5, "delay": 0.6}
24
 
25
+ # Keyword lists for heuristic clause classification; types are checked in dict order and the first match wins
26
+ KEYWORD_MAP = {
27
+ "penalty": ["penalty", "fee", "fine", "charge", "incur"],
28
+ "obligation": ["shall", "must", "obligated", "required", "responsible"],
29
+ "delay": ["delay", "late", "beyond", "postpone", "deferred"]
30
+ }
31
+
32
  def extract_text_from_pdf(pdf_file):
33
  """Extract text from uploaded PDF file."""
34
  try:
 
47
  return f"Error extracting text: {str(e)}"
48
 
49
  def parse_contract(text):
50
+ """Parse contract text into clauses and classify risks using keyword-based heuristic."""
51
  # Light cleanup: collapse doubled newlines (single pass) and convert tabs to spaces
52
  text = text.replace("\n\n", "\n").replace("\t", " ")
53
  sentences = nltk.sent_tokenize(text)
 
62
  if len(sentence) < 10: # Skip short sentences
63
  logger.debug(f"Skipping short sentence (length {len(sentence)}): {sentence}")
64
  continue
65
+
66
+ # Heuristic classification based on keywords
67
+ sentence_lower = sentence.lower()
68
+ clause_type = None
69
+ for c_type, keywords in KEYWORD_MAP.items():
70
+ if any(keyword in sentence_lower for keyword in keywords):
71
+ clause_type = c_type
72
+ break
73
+
74
+ if clause_type not in CLAUSE_TYPES:
75
+ logger.debug(f"No relevant clause type for sentence {idx}: {sentence}")
 
 
 
 
 
 
 
 
 
 
 
 
76
  continue
77
 
78
+ # Assign a dummy score based on keyword presence (simulating model confidence)
79
+ score = RISK_WEIGHTS[clause_type] * 0.9 # 0.9 as a dummy confidence score
80
+ results.append({
81
+ "clause_id": idx,
82
+ "text": sentence,
83
+ "clause_type": clause_type,
84
+ "risk_score": round(score, 2)
85
+ })
86
+ risk_scores.append(score)
87
+ logger.info(f"Detected clause {idx}: {clause_type} with risk score {score}")
88
+
89
  return results, risk_scores
90
 
91
  def generate_heatmap(risk_scores):