Komal133 commited on
Commit
6eba542
·
verified ·
1 Parent(s): d35516f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -74
app.py CHANGED
@@ -11,6 +11,11 @@ import os
11
  from io import BytesIO
12
  import numpy as np
13
  import torch
 
 
 
 
 
14
 
15
  # Download NLTK data
16
  nltk.download('punkt')
@@ -21,6 +26,13 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
21
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) # 3 labels: penalty, obligation, delay
22
  classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)
23
 
 
 
 
 
 
 
 
24
  # Clause types and risk scoring logic
25
  CLAUSE_TYPES = ["penalty", "obligation", "delay"]
26
  RISK_WEIGHTS = {"penalty": 0.8, "obligation": 0.5, "delay": 0.6}
@@ -31,41 +43,64 @@ def extract_text_from_pdf(pdf_file):
31
  reader = PyPDF2.PdfReader(pdf_file)
32
  text = ""
33
  for page in reader.pages:
34
- text += page.extract_text() or ""
 
 
 
 
 
35
  return text
36
  except Exception as e:
 
37
  return f"Error extracting text: {str(e)}"
38
 
39
def parse_contract(text):
    """Parse contract text into sentences, classify each, and score risk.

    Args:
        text: Raw contract text (e.g. the output of extract_text_from_pdf).

    Returns:
        Tuple (results, risk_scores): results is a list of dicts with keys
        clause_id, text, clause_type and risk_score (rounded to 2 dp);
        risk_scores is the parallel list of raw, unrounded scores.
    """
    # The sequence-classification head was loaded with num_labels=3 and no
    # id2label config, so the pipeline emits generic labels LABEL_0..LABEL_2.
    # NOTE(review): ordering assumed to match the fine-tuning setup
    # (penalty, obligation, delay) — confirm against the model's config.
    label_map = {"LABEL_0": "penalty", "LABEL_1": "obligation", "LABEL_2": "delay"}

    sentences = nltk.sent_tokenize(text)
    results = []
    risk_scores = []

    for idx, sentence in enumerate(sentences):
        if len(sentence.strip()) < 10:  # Skip short/noise sentences
            continue
        # With return_all_scores=True the pipeline returns
        # [[{'label': ..., 'score': ...}, ...]] for a single input.
        scores = classifier(sentence)[0]
        top = max(scores, key=lambda x: x['score'])
        # BUG FIX: the raw model labels ("LABEL_0"...) never matched
        # CLAUSE_TYPES, so every sentence was silently skipped — translate
        # the label before the membership check.
        clause_type = label_map.get(top['label'], top['label'])
        if clause_type not in CLAUSE_TYPES:
            continue

        # Risk score = top-label confidence weighted by clause severity.
        # (The old CLAUSE_TYPES.index() lookup assumed label order matched
        # CLAUSE_TYPES; the top entry already carries the right score.)
        score = top['score'] * RISK_WEIGHTS[clause_type]
        results.append({
            "clause_id": idx,
            "text": sentence,
            "clause_type": clause_type,
            "risk_score": round(score, 2)
        })
        risk_scores.append(score)

    return results, risk_scores
65
 
66
  def generate_heatmap(risk_scores):
67
  """Generate heatmap for risk scores."""
68
  if not risk_scores:
 
69
  return None
70
  data = np.array(risk_scores).reshape(1, -1)
71
  plt.figure(figsize=(10, 2))
@@ -79,62 +114,4 @@ def generate_heatmap(risk_scores):
79
  buffer.seek(0)
80
  return buffer
81
 
82
def generate_pdf_report(results, heatmap_buffer):
    """Generate a one-page PDF report with a clause summary and the heatmap.

    Args:
        results: List of clause dicts (clause_id, clause_type, risk_score, ...).
        heatmap_buffer: BytesIO holding the heatmap PNG, or None/empty.

    Returns:
        BytesIO positioned at 0 containing the rendered PDF.
    """
    # Local import keeps the fix self-contained; ImageReader is reportlab's
    # supported adapter for drawing from an in-memory buffer.
    from reportlab.lib.utils import ImageReader

    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    c.setFont("Helvetica", 12)
    c.drawString(50, 750, "Contract Risk Analysis Report")

    # Summary — top 5 clauses only, to keep the report on a single page.
    c.drawString(50, 720, "Summary of Risk-Prone Clauses:")
    y = 700
    for result in results[:5]:  # Limit to top 5 for brevity
        text = f"Clause {result['clause_id']}: {result['clause_type'].capitalize()} (Risk: {result['risk_score']})"
        c.drawString(50, y, text[:80] + "..." if len(text) > 80 else text)
        y -= 20

    # Embed heatmap. BUG FIX: canvas.drawImage() accepts a file path or an
    # ImageReader — passing a raw BytesIO (the old
    # drawImage(BytesIO(heatmap_buffer.read()), ...)) raised at runtime.
    if heatmap_buffer:
        heatmap_buffer.seek(0)  # a prior consumer may have advanced the cursor
        c.drawImage(ImageReader(heatmap_buffer), 50, y - 200, width=500, height=100)

    c.showPage()
    c.save()
    buffer.seek(0)
    return buffer
105
-
106
def process_contract(pdf_file):
    """End-to-end pipeline for an uploaded contract PDF.

    Args:
        pdf_file: File-like object / path from the Gradio File input.

    Returns:
        4-tuple (json_output, heatmap_buffer, pdf_report, summary_dict);
        on failure the first element is an error/status string and the
        remaining three are None.
    """
    # Extract text
    text = extract_text_from_pdf(pdf_file)
    # BUG FIX: `"Error" in text` false-positives on any contract that merely
    # contains the word "Error". The helper's failure messages all *start*
    # with "Error", so test the prefix instead.
    if text.startswith("Error"):
        return text, None, None, None

    # Parse and classify
    results, risk_scores = parse_contract(text)
    if not results:
        return "No relevant clauses detected.", None, None, None

    # Generate outputs
    json_output = json.dumps(results, indent=2)
    heatmap_buffer = generate_heatmap(risk_scores)
    pdf_report = generate_pdf_report(results, heatmap_buffer)

    return json_output, heatmap_buffer, pdf_report, {"Summary": f"Detected {len(results)} risk-prone clauses."}
124
-
125
# Gradio interface: wire the processing pipeline to file-in / four-outputs.
_output_components = [
    gr.Textbox(label="JSON Output"),
    gr.Image(label="Risk Heatmap"),
    gr.File(label="Download PDF Report"),
    gr.JSON(label="Summary"),
]

iface = gr.Interface(
    fn=process_contract,
    inputs=gr.File(label="Upload Contract PDF"),
    outputs=_output_components,
    title="Contract Risk Analyzer",
    description="Upload a contract PDF to analyze risk-prone clauses and visualize results.",
)

# Launch only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()
 
11
  from io import BytesIO
12
  import numpy as np
13
  import torch
14
+ import logging
15
+
16
+ # Set up logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
 
20
  # Download NLTK data
21
  nltk.download('punkt')
 
26
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) # 3 labels: penalty, obligation, delay
27
  classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)
28
 
29
+ # Map model labels to clause types (adjust based on actual model labels after fine-tuning)
30
+ LABEL_MAP = {
31
+ "LABEL_0": "penalty",
32
+ "LABEL_1": "obligation",
33
+ "LABEL_2": "delay"
34
+ }
35
+
36
  # Clause types and risk scoring logic
37
  CLAUSE_TYPES = ["penalty", "obligation", "delay"]
38
  RISK_WEIGHTS = {"penalty": 0.8, "obligation": 0.5, "delay": 0.6}
 
43
  reader = PyPDF2.PdfReader(pdf_file)
44
  text = ""
45
  for page in reader.pages:
46
+ page_text = page.extract_text() or ""
47
+ text += page_text + "\n"
48
+ logger.info(f"Extracted text length: {len(text)} characters")
49
+ logger.debug(f"Extracted text sample: {text[:500]}")
50
+ if not text.strip():
51
+ return "Error: No text extracted from PDF."
52
  return text
53
  except Exception as e:
54
+ logger.error(f"Text extraction error: {str(e)}")
55
  return f"Error extracting text: {str(e)}"
56
 
57
def parse_contract(text):
    """Parse contract text into clauses and classify risks.

    Args:
        text: Raw contract text (output of extract_text_from_pdf).

    Returns:
        Tuple (results, risk_scores): results is a list of clause dicts
        (clause_id, text, clause_type, risk_score), risk_scores the
        parallel list of unrounded weighted scores.
    """
    # Clean text: collapse doubled newlines and tabs left by PDF extraction.
    text = text.replace("\n\n", "\n").replace("\t", " ")
    sentences = nltk.sent_tokenize(text)
    logger.info(f"Number of sentences tokenized: {len(sentences)}")
    logger.debug(f"Sample sentences: {sentences[:3]}")

    results = []
    risk_scores = []

    for idx, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if len(sentence) < 10:  # Skip short sentences
            logger.debug(f"Skipping short sentence (length {len(sentence)}): {sentence}")
            continue
        # Classify clause
        try:
            classification = classifier(sentence)
            logger.debug(f"Classification for sentence {idx}: {classification}")
            # classification[0] is a list of {'label', 'score'} dicts
            # (return_all_scores=True); pick the highest-confidence entry.
            top = max(classification[0], key=lambda x: x['score'])
            clause_type = LABEL_MAP.get(top['label'])
            if clause_type not in CLAUSE_TYPES:
                logger.debug(f"Clause type {clause_type} not in {CLAUSE_TYPES}, skipping.")
                continue

            # BUG FIX: the old code indexed the score *list* with a label
            # *string* (classification[0]["LABEL_n"]['score']), raising
            # TypeError on every sentence — which the except below swallowed,
            # dropping all clauses. The top entry already carries the score.
            score = top['score'] * RISK_WEIGHTS[clause_type]
            results.append({
                "clause_id": idx,
                "text": sentence,
                "clause_type": clause_type,
                "risk_score": round(score, 2)
            })
            risk_scores.append(score)
            logger.info(f"Detected clause {idx}: {clause_type} with risk score {score}")
        except Exception as e:
            logger.error(f"Error classifying sentence {idx}: {str(e)}")
            continue

    return results, risk_scores
99
 
100
  def generate_heatmap(risk_scores):
101
  """Generate heatmap for risk scores."""
102
  if not risk_scores:
103
+ logger.warning("No risk scores to generate heatmap.")
104
  return None
105
  data = np.array(risk_scores).reshape(1, -1)
106
  plt.figure(figsize=(10, 2))
 
114
  buffer.seek(0)
115
  return buffer
116
 
117
+ def generate_pdf_report(results, heatmap