Komal133 commited on
Commit
f20ba38
·
verified ·
1 Parent(s): 4fe62f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -141
app.py CHANGED
@@ -1,148 +1,140 @@
1
- import dash
2
- from dash import dcc, html
3
- import dash_bootstrap_components as dbc
4
- from transformers import pipeline
5
  import PyPDF2
6
- import docx
 
 
7
  import matplotlib.pyplot as plt
8
- import numpy as np
9
- import pandas as pd
10
- import requests
11
  import json
 
 
 
 
 
 
 
12
 
13
- # Initialize the BERT-based NLP pipeline
14
- model_name = "dbmdz/bert-large-cased-finetuned-conll03-english" # Example, replace with your model
15
- nlp_pipeline = pipeline("ner", model=model_name)
16
-
17
- # Initialize Dash App
18
- app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
19
-
20
- # Define app layout
21
- app.layout = html.Div([
22
- dbc.Row([
23
- dbc.Col(html.H1("Contract Risk Analyzer", style={'textAlign': 'center'})),
24
- ]),
25
- dbc.Row([
26
- dbc.Col(html.Div([
27
- html.Label("Upload Contract"),
28
- dcc.Upload(
29
- id='upload-data',
30
- children=html.Button('Upload File'),
31
- multiple=False
32
- ),
33
- html.Div(id='file-upload-status'),
34
- ]), width=12),
35
- ]),
36
- dbc.Row([
37
- dbc.Col(html.Div(id='output-text'), width=12),
38
- ]),
39
- dbc.Row([
40
- dbc.Col(dcc.Graph(id='risk-heatmap'), width=12),
41
- ]),
42
- ])
43
-
44
-
45
- # Function to analyze contract text
46
- def analyze_contract(contract_text):
47
  try:
48
- # Run the contract through the NLP pipeline
49
- results = nlp_pipeline(contract_text)
50
-
51
- # Parse and score clauses (this is a simplified version)
52
- risk_score = 0
53
- high_risk_clauses = []
54
-
55
- for result in results:
56
- # This assumes 'labels' are risk-related; adjust as per model output
57
- if result['label'] in ["PENALTY", "OBLIGATION", "DELAY"]: # Customize as per your model's tags
58
- high_risk_clauses.append(result['word'])
59
- risk_score += 10 # Example scoring logic, modify as needed
60
-
61
- return {
62
- "high_risk_clauses": high_risk_clauses,
63
- "risk_score": risk_score
64
- }
65
  except Exception as e:
66
- return {"error": str(e)}
67
-
68
-
69
- # Function to parse uploaded contract
70
- def parse_contract(file_content, file_type):
71
- contract_text = ""
72
- if file_type == "application/pdf":
73
- try:
74
- pdf_reader = PyPDF2.PdfReader(file_content)
75
- for page in pdf_reader.pages:
76
- contract_text += page.extract_text()
77
- except Exception as e:
78
- return f"Error reading PDF: {str(e)}"
79
-
80
- elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
81
- try:
82
- doc = docx.Document(file_content)
83
- for para in doc.paragraphs:
84
- contract_text += para.text
85
- except Exception as e:
86
- return f"Error reading DOCX: {str(e)}"
87
-
88
- elif file_type == "text/plain":
89
- contract_text = file_content.decode("utf-8")
90
-
91
- return contract_text
92
-
93
-
94
- # Callback to handle file upload
95
- @app.callback(
96
- [dash.dependencies.Output('file-upload-status', 'children'),
97
- dash.dependencies.Output('output-text', 'children'),
98
- dash.dependencies.Output('risk-heatmap', 'figure')],
99
- [dash.dependencies.Input('upload-data', 'contents'),
100
- dash.dependencies.State('upload-data', 'filename'),
101
- dash.dependencies.State('upload-data', 'type')]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  )
103
- def update_output(file_contents, filename, file_type):
104
- if file_contents is not None:
105
- # Parse the contract
106
- contract_text = parse_contract(file_contents, file_type)
107
-
108
- if contract_text:
109
- # Analyze the contract
110
- analysis_results = analyze_contract(contract_text)
111
-
112
- if "error" in analysis_results:
113
- return "Error", f"An error occurred during analysis: {analysis_results['error']}", {}
114
-
115
- # Display high-risk clauses and overall risk score
116
- high_risk_clauses = analysis_results["high_risk_clauses"]
117
- risk_score = analysis_results["risk_score"]
118
-
119
- high_risk_text = f"High Risk Clauses: {', '.join(high_risk_clauses)}"
120
- risk_score_text = f"Overall Risk Score: {risk_score}"
121
-
122
- # Generate the risk heatmap (simplified here)
123
- fig, ax = plt.subplots()
124
- ax.barh(['Contract'], [risk_score], color='red')
125
- ax.set_xlim(0, 100) # Assuming risk score ranges from 0 to 100
126
- ax.set_xlabel("Risk Score")
127
-
128
- # Returning results for display
129
- return "File Uploaded Successfully", [high_risk_text, risk_score_text], {
130
- 'data': [{
131
- 'x': ['Contract'],
132
- 'y': [risk_score],
133
- 'type': 'bar',
134
- 'name': 'Risk Score',
135
- 'marker': {'color': 'red'}
136
- }],
137
- 'layout': {
138
- 'title': 'Risk Heatmap',
139
- 'xaxis': {'title': 'Risk Score'},
140
- 'yaxis': {'title': 'Contract'}
141
- }
142
- }
143
-
144
- return "No File Uploaded", "", {}
145
-
146
-
147
- if __name__ == '__main__':
148
- app.run_server(debug=True)
 
1
+ import gradio as gr
 
 
 
2
  import PyPDF2
3
+ import nltk
4
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
5
+ import seaborn as sns
6
  import matplotlib.pyplot as plt
7
+ from reportlab.lib.pagesizes import letter
8
+ from reportlab.pdfgen import canvas
 
9
  import json
10
+ import os
11
+ from io import BytesIO
12
+ import numpy as np
13
+ import torch
14
+
15
# Download the sentence tokenizer model used by parse_contract.
nltk.download('punkt')

# Initialize BERT model and tokenizer.
# NOTE(review): this base checkpoint has no sequence-classification head,
# so the 3-label head is freshly (randomly) initialized — scores are
# meaningless until the model is fine-tuned. Confirm whether a fine-tuned
# checkpoint was intended here.
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Map label ids to the clause names the rest of the app checks against.
# Without id2label the pipeline emits "LABEL_0"/"LABEL_1"/"LABEL_2",
# which never match CLAUSE_TYPES, so every clause would be skipped.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,  # 3 labels: penalty, obligation, delay
    id2label={0: "penalty", 1: "obligation", 2: "delay"},
    label2id={"penalty": 0, "obligation": 1, "delay": 2},
)
# return_all_scores=True is deprecated in recent transformers (top_k=None
# is the replacement); kept as-is for compatibility with the pinned version.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)

# Clause types and risk scoring logic.
CLAUSE_TYPES = ["penalty", "obligation", "delay"]
RISK_WEIGHTS = {"penalty": 0.8, "obligation": 0.5, "delay": 0.6}
27
+
28
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Any failure (unreadable file, malformed PDF, ...) is reported as a
    string beginning with "Error extracting text:" rather than raised —
    process_contract keys off that prefix.
    """
    try:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # extract_text() may return None for image-only pages; treat as "".
        return "".join(page.extract_text() or "" for page in pages)
    except Exception as exc:
        return f"Error extracting text: {str(exc)}"
38
+
39
def parse_contract(text):
    """Parse contract text into clauses and classify risks.

    Returns (results, risk_scores): results is a list of dicts with keys
    clause_id, text, clause_type, risk_score; risk_scores is the parallel
    list of raw (unrounded) weighted scores used by the heatmap.
    """
    sentences = nltk.sent_tokenize(text)
    results = []
    risk_scores = []

    for idx, sentence in enumerate(sentences):
        if len(sentence.strip()) < 10:  # Skip fragments too short to be a clause
            continue
        # With return_all_scores=True the pipeline yields, per input, a
        # list of {label, score} dicts covering every label.
        scores = classifier(sentence)[0]
        best = max(scores, key=lambda entry: entry['score'])
        clause_type = best['label']
        # NOTE(review): this membership test only passes if the model's
        # id2label maps to these names; default "LABEL_n" labels would
        # cause every sentence to be skipped — confirm model config.
        if clause_type not in CLAUSE_TYPES:
            continue

        # Use the winning label's own score. The previous code re-indexed
        # the score list by CLAUSE_TYPES position, silently assuming the
        # pipeline emits labels in exactly that order.
        score = best['score'] * RISK_WEIGHTS[clause_type]
        results.append({
            "clause_id": idx,
            "text": sentence,
            "clause_type": clause_type,
            "risk_score": round(score, 2),
        })
        risk_scores.append(score)

    return results, risk_scores
65
+
66
def generate_heatmap(risk_scores):
    """Render per-clause risk scores as a 1xN heatmap PNG.

    Returns a BytesIO positioned at offset 0 containing the PNG image,
    or None when there are no scores to plot.
    """
    if not risk_scores:
        return None
    row = np.array(risk_scores).reshape(1, -1)  # single-row matrix for seaborn
    plt.figure(figsize=(10, 2))
    sns.heatmap(row, cmap="YlOrRd", annot=True, fmt=".2f",
                cbar_kws={'label': 'Risk Score'})
    plt.title("Contract Risk Heatmap")
    plt.xlabel("Clause Index")
    plt.ylabel("Risk")
    png = BytesIO()
    plt.savefig(png, format="png", bbox_inches="tight")
    plt.close()  # free the figure so repeated uploads don't accumulate state
    png.seek(0)
    return png
81
+
82
def generate_pdf_report(results, heatmap_buffer):
    """Build a one-page PDF report (summary + heatmap) as a BytesIO.

    results        -- clause dicts produced by parse_contract
    heatmap_buffer -- PNG BytesIO from generate_heatmap, or None
    """
    # Local import keeps the extra reportlab dependency next to its only use.
    from reportlab.lib.utils import ImageReader

    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    c.setFont("Helvetica", 12)
    c.drawString(50, 750, "Contract Risk Analysis Report")

    # Summary of the highest-listed clauses (capped at 5 to fit the page).
    c.drawString(50, 720, "Summary of Risk-Prone Clauses:")
    y = 700
    for result in results[:5]:  # Limit to top 5 for brevity
        text = f"Clause {result['clause_id']}: {result['clause_type'].capitalize()} (Risk: {result['risk_score']})"
        c.drawString(50, y, text[:80] + "..." if len(text) > 80 else text)
        y -= 20

    # Embed heatmap. Canvas.drawImage accepts a filename or ImageReader;
    # the previous code passed a raw BytesIO, which reportlab rejects.
    if heatmap_buffer:
        c.drawImage(ImageReader(heatmap_buffer), 50, y - 200, width=500, height=100)

    c.showPage()
    c.save()
    buffer.seek(0)
    return buffer
105
+
106
def process_contract(pdf_file):
    """Run the full analysis pipeline on an uploaded contract PDF.

    Returns (json_text, heatmap_png_buffer, pdf_report_buffer, summary_dict).
    On failure the first element is a message and the others are None.
    """
    text = extract_text_from_pdf(pdf_file)
    # extract_text_from_pdf signals failure with an "Error ..." string.
    # NOTE(review): a contract whose body contains the word "Error" would
    # false-positive here — consider a sentinel or exception instead.
    if "Error" in text:
        return text, None, None, None

    results, risk_scores = parse_contract(text)
    if not results:
        return "No relevant clauses detected.", None, None, None

    heatmap_buffer = generate_heatmap(risk_scores)
    return (
        json.dumps(results, indent=2),
        heatmap_buffer,
        generate_pdf_report(results, heatmap_buffer),
        {"Summary": f"Detected {len(results)} risk-prone clauses."},
    )
124
+
125
# Gradio interface: wires process_contract to one file-upload input and
# four outputs (JSON text, heatmap image, downloadable PDF, summary dict).
# NOTE(review): process_contract returns BytesIO buffers for the Image and
# File outputs — confirm the installed Gradio version accepts in-memory
# buffers (recent releases expect file paths / PIL / numpy for gr.Image).
iface = gr.Interface(
    fn=process_contract,
    inputs=gr.File(label="Upload Contract PDF"),
    outputs=[
        gr.Textbox(label="JSON Output"),
        gr.Image(label="Risk Heatmap"),
        gr.File(label="Download PDF Report"),
        gr.JSON(label="Summary")
    ],
    title="Contract Risk Analyzer",
    description="Upload a contract PDF to analyze risk-prone clauses and visualize results."
)

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()