MrTsp committed on
Commit
0b4ec01
·
1 Parent(s): d6cf053

added contract analyzer

Browse files
analyze_contract.py CHANGED
@@ -1,21 +1,15 @@
1
- import fitz # PyMuPDF
2
  import torch
3
  import pandas as pd
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
5
 
6
- # ============================
7
- # LOAD TRAINED MODEL
8
- # ============================
9
-
10
  model_path = "risk_clause_model"
11
-
12
  tokenizer = AutoTokenizer.from_pretrained(model_path)
13
  model = AutoModelForSequenceClassification.from_pretrained(model_path)
14
 
15
- # ============================
16
- # RISK WEIGHTS
17
- # ============================
18
-
19
  risk_weights = {
20
  "Termination": 3,
21
  "Liability": 3,
@@ -28,25 +22,20 @@ risk_weights = {
28
  "Intellectual Property": 2
29
  }
30
 
31
- # ============================
32
- # EXTRACT TEXT FROM PDF
33
- # ============================
34
 
 
35
  def extract_text_from_pdf(pdf_path):
36
 
37
  doc = fitz.open(pdf_path)
38
- text = ""
39
 
 
40
  for page in doc:
41
  text += page.get_text()
42
 
43
  return text
44
 
45
 
46
- # ============================
47
- # SPLIT INTO CLAUSES
48
- # ============================
49
-
50
  def split_clauses(text):
51
 
52
  clauses = text.split(".")
@@ -55,10 +44,7 @@ def split_clauses(text):
55
  return clauses
56
 
57
 
58
- # ============================
59
- # PREDICT CLAUSE CATEGORY
60
- # ============================
61
-
62
  def predict_clause(text):
63
 
64
  inputs = tokenizer(
@@ -73,55 +59,47 @@ def predict_clause(text):
73
 
74
  probs = torch.nn.functional.softmax(outputs.logits, dim=1)
75
 
76
- predicted_class_id = torch.argmax(probs).item()
77
 
78
- category = model.config.id2label[predicted_class_id]
79
 
80
- confidence = probs[0][predicted_class_id].item()
81
 
82
  return category, confidence
83
 
84
 
85
- # ============================
86
- # MAIN ANALYZER
87
- # ============================
88
-
89
  def analyze_contract(pdf_path):
90
 
91
- text = extract_text_from_pdf(pdf_path)
92
 
 
93
  clauses = split_clauses(text)
94
 
95
- print("\nContract Analysis Results:\n")
96
-
97
  results = []
98
-
99
- total_risk_score = 0
100
 
101
  for i, clause in enumerate(clauses):
102
 
103
  category, confidence = predict_clause(clause)
104
 
105
- risk_score = risk_weights.get(category, 1)
106
-
107
- total_risk_score += risk_score
108
 
109
- print(f"Clause {i+1}: {category} (confidence: {confidence:.2f})")
 
110
 
111
  results.append({
112
  "Clause_Number": i+1,
113
- "Clause_Text": clause,
114
  "Category": category,
115
  "Confidence": round(confidence, 3),
116
- "Risk_Score": risk_score
 
117
  })
118
 
119
 
120
- # ============================
121
- # FINAL VERDICT LOGIC
122
- # ============================
123
-
124
- avg_risk = total_risk_score / len(clauses)
125
 
126
  if avg_risk >= 2.5:
127
  verdict = "HIGH RISK"
@@ -131,27 +109,29 @@ def analyze_contract(pdf_path):
131
  verdict = "LOW RISK"
132
 
133
 
134
- print("\n===========================")
135
- print("TOTAL CLAUSES:", len(clauses))
136
- print("TOTAL RISK SCORE:", total_risk_score)
137
- print("AVERAGE RISK:", round(avg_risk, 2))
138
- print("FINAL VERDICT:", verdict)
139
- print("===========================\n")
140
 
141
 
142
- # ============================
143
- # SAVE CSV REPORT
144
- # ============================
145
 
146
- df = pd.DataFrame(results)
 
 
147
 
148
- df.to_csv("contract_analysis_results.csv", index=False)
 
 
 
 
 
 
149
 
150
- print("Saved report → contract_analysis_results.csv")
151
 
 
 
 
152
 
153
- # ============================
154
- # RUN ANALYSIS
155
- # ============================
156
 
 
157
  analyze_contract("contract.pdf")
 
1
+ import fitz
2
  import torch
3
  import pandas as pd
4
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
+ from collections import Counter
6
 
7
+ # load model
 
 
 
8
  model_path = "risk_clause_model"
 
9
  tokenizer = AutoTokenizer.from_pretrained(model_path)
10
  model = AutoModelForSequenceClassification.from_pretrained(model_path)
11
 
12
+ # risk weights
 
 
 
13
  risk_weights = {
14
  "Termination": 3,
15
  "Liability": 3,
 
22
  "Intellectual Property": 2
23
  }
24
 
 
 
 
25
 
26
+ # extract pdf text
27
  def extract_text_from_pdf(pdf_path):
28
 
29
  doc = fitz.open(pdf_path)
 
30
 
31
+ text = ""
32
  for page in doc:
33
  text += page.get_text()
34
 
35
  return text
36
 
37
 
38
+ # split clauses
 
 
 
39
  def split_clauses(text):
40
 
41
  clauses = text.split(".")
 
44
  return clauses
45
 
46
 
47
+ # predict
 
 
 
48
  def predict_clause(text):
49
 
50
  inputs = tokenizer(
 
59
 
60
  probs = torch.nn.functional.softmax(outputs.logits, dim=1)
61
 
62
+ pred_id = torch.argmax(probs).item()
63
 
64
+ category = model.config.id2label[pred_id]
65
 
66
+ confidence = probs[0][pred_id].item()
67
 
68
  return category, confidence
69
 
70
 
71
+ # analyzer
 
 
 
72
  def analyze_contract(pdf_path):
73
 
74
+ print("\nAnalyzing contract...\n")
75
 
76
+ text = extract_text_from_pdf(pdf_path)
77
  clauses = split_clauses(text)
78
 
 
 
79
  results = []
80
+ total_risk = 0
81
+ categories = []
82
 
83
  for i, clause in enumerate(clauses):
84
 
85
  category, confidence = predict_clause(clause)
86
 
87
+ risk = risk_weights.get(category, 1)
 
 
88
 
89
+ total_risk += risk
90
+ categories.append(category)
91
 
92
  results.append({
93
  "Clause_Number": i+1,
 
94
  "Category": category,
95
  "Confidence": round(confidence, 3),
96
+ "Risk_Score": risk,
97
+ "Clause_Text": clause
98
  })
99
 
100
 
101
+ # summary
102
+ avg_risk = total_risk / len(clauses)
 
 
 
103
 
104
  if avg_risk >= 2.5:
105
  verdict = "HIGH RISK"
 
109
  verdict = "LOW RISK"
110
 
111
 
112
+ category_counts = Counter(categories)
 
 
 
 
 
113
 
114
 
115
+ # clean output
116
+ print("========== CONTRACT SUMMARY ==========\n")
 
117
 
118
+ print("Total Clauses:", len(clauses))
119
+ print("Average Risk Score:", round(avg_risk, 2))
120
+ print("Final Verdict:", verdict)
121
 
122
+ print("\nClause Category Distribution:")
123
+
124
+ for cat, count in category_counts.items():
125
+ print(f"{cat}: {count}")
126
+
127
+ print("\nFull results saved in contract_analysis_results.csv")
128
+ print("=====================================\n")
129
 
 
130
 
131
+ # save csv
132
+ df = pd.DataFrame(results)
133
+ df.to_csv("contract_analysis_results.csv", index=False)
134
 
 
 
 
135
 
136
+ # run
137
  analyze_contract("contract.pdf")
contract_analysis_results.csv CHANGED
The diff for this file is too large to render. See raw diff