deepthi6 committed on
Commit
9c30488
·
verified ·
1 Parent(s): 3f3fd40

Update util.py

Browse files
Files changed (1) hide show
  1. util.py +211 -86
util.py CHANGED
@@ -1,96 +1,221 @@
1
- import re
2
- from pypdf import PdfReader
3
- import docx
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
 
5
 
6
# -------------------------------------------------------
# ✅ Extract text from uploaded files
# -------------------------------------------------------
def extract_text_from_file(uploaded):
    """Return the plain text of an uploaded PDF, TXT or DOCX file.

    Args:
        uploaded: File-like object exposing a ``name`` attribute and
            ``read()``; it is also handed directly to ``PdfReader`` and
            ``docx.Document``.

    Returns:
        The extracted text, or ``""`` when the file extension is not one of
        ``.pdf``, ``.txt`` or ``.docx`` (matched case-insensitively).
    """
    name = uploaded.name.lower()

    if name.endswith(".pdf"):
        reader = PdfReader(uploaded)
        # Join once instead of repeated ``+=``; ``or ""`` guards against a
        # page whose extraction yields nothing usable.
        return "".join(page.extract_text() or "" for page in reader.pages)

    if name.endswith(".txt"):
        # Undecodable bytes become U+FFFD rather than aborting the upload.
        return uploaded.read().decode("utf-8", errors="replace")

    if name.endswith(".docx"):
        document = docx.Document(uploaded)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    return ""
27
-
28
# -------------------------------------------------------
# ✅ Split into clauses
# -------------------------------------------------------
def split_into_clauses(text):
    """Split *text* into clause-sized fragments.

    The text is cut at runs of newlines and at a period followed by
    whitespace (the period itself is consumed by the split). Fragments whose
    stripped length is 20 characters or fewer are discarded.

    Args:
        text: The document text; ``None`` or ``""`` yields an empty list.

    Returns:
        A list of stripped clause strings, in document order.
    """
    if not text:
        return []
    fragments = (piece.strip() for piece in re.split(r"\n+|\.\s+", text))
    return [piece for piece in fragments if len(piece) > 20]
33
-
34
# -------------------------------------------------------
# ✅ Clause Simplifier
# -------------------------------------------------------
def _shorten(text, limit):
    """Return *text* cut to *limit* characters, adding "..." only when cut."""
    if len(text) <= limit:
        return text
    return text[:limit] + "..."


def simplify_clause(clause, lang, mode):
    """Wrap *clause* in a short template chosen by *mode*.

    Args:
        clause: The clause text to present.
        lang: Accepted for interface compatibility; not used here.
        mode: Mode label. A label containing ``"Explain Like"`` gives a
            120-character excerpt, one containing ``"Professional"`` gives the
            full clause, and anything else gives a 160-character excerpt.

    Returns:
        The templated string. An ellipsis is appended only when the clause
        was actually truncated, so short clauses are not shown as if text
        were missing.
    """
    if "Explain Like" in mode:
        return f"This clause basically means: {_shorten(clause, 120)}"
    if "Professional" in mode:
        return f"A more formal interpretation: {clause}"
    return f"Simply put: {_shorten(clause, 160)}"
43
-
44
# -------------------------------------------------------
# ✅ Risk Analyzer
# -------------------------------------------------------
def get_risks(text):
    """Return the list of risk labels shown for a document.

    Args:
        text: The document text. It is not inspected; the same fixed list is
            returned for every input.

    Returns:
        A new list of five risk label strings.
    """
    return [
        "Broad confidentiality scope",
        "Long-term obligations",
        "Unilateral termination rights",
        "Strong liability clause",
        "Missing dispute resolution",
    ]
55
 
56
# -------------------------------------------------------
# Fairness Meter
# -------------------------------------------------------
def get_fairness_score(text):
    """Return the pair of fairness scores for a document.

    Args:
        text: The document text. It is not inspected; the result is constant.

    Returns:
        The tuple ``(40, 80)``.
    """
    return 40, 80
61
-
62
# -------------------------------------------------------
# ✅ Entities
# -------------------------------------------------------
def extract_entities(text):
    """Pull candidate names and dates out of *text* with regular expressions.

    Names are two consecutive capitalised words ("John Smith"); dates are
    "<1-2 digits> <word> <4 digits>" ("12 March 2024").

    Args:
        text: The document text to scan.

    Returns:
        A dict with keys ``"names"`` and ``"dates"``, each holding at most
        five distinct matches in order of first appearance.
    """
    names = re.findall(r"[A-Z][a-z]+ [A-Z][a-z]+", text)
    dates = re.findall(r"\b\d{1,2} \w+ \d{4}\b", text)
    # De-duplicate (order preserved) before capping, so a name repeated
    # throughout the document does not use up all five slots.
    unique_names = list(dict.fromkeys(names))
    unique_dates = list(dict.fromkeys(dates))
    return {"names": unique_names[:5], "dates": unique_dates[:5]}
69
-
70
# -------------------------------------------------------
# ✅ Alternative Clause Suggestions
# -------------------------------------------------------
def suggest_alternatives(text):
    """Return suggested alternative clauses for an NDA.

    Args:
        text: The document text. It is not inspected; the same fixed
            suggestions are returned for every input.

    Returns:
        A new list of three suggestion strings.
    """
    return [
        "Consider adding a mutual confidentiality clause.",
        "Limit NDA duration to 2–3 years.",
        "Specify permitted disclosures clearly.",
    ]
79
 
80
# -------------------------------------------------------
# Load Local Chat Model
# -------------------------------------------------------
def load_chat_model():
    """Load the ``distilgpt2`` causal language model and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model_name = "distilgpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Reuse the end-of-sequence token for padding so generation has a
    # pad token id to work with.
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

    return model, tokenizer
88
 
89
# -------------------------------------------------------
# Chat with Model
# -------------------------------------------------------
def chat_with_model(model, tokenizer, user_text, history):
    """Generate the model's reply to *user_text* given the earlier turns.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Matching tokenizer exposing ``encode``, ``decode`` and
            ``eos_token_id``.
        user_text: The new user message.
        history: Iterable of ``(speaker, message)`` pairs for earlier turns.

    Returns:
        The newly generated text only (the prompt is not echoed back), or
        ``""`` when there is nothing to respond to.
    """
    max_prompt_tokens = 512  # most recent context kept when history is long
    max_new_tokens = 100     # budget reserved for the reply itself

    turns = [f"{speaker}:{message}" for speaker, message in history]
    turns.append(user_text)
    # Joining all turns together puts a separator between the last history
    # entry and the new message as well.
    prompt = " ".join(turns)
    if not prompt.strip():
        return ""

    inputs = tokenizer.encode(prompt, return_tensors="pt")
    # Drop the oldest tokens so a long conversation cannot crowd out the reply.
    inputs = inputs[:, -max_prompt_tokens:]

    # max_new_tokens bounds only the continuation; a total-length cap would
    # leave no room to generate once the prompt alone reaches it.
    outputs = model.generate(
        inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Slice off the prompt so the caller receives just the reply.
    reply_ids = outputs[0][inputs.shape[-1]:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
 
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
+ from multilingual import UI_TEXT, translate_text
5
+ from util import extract_text, split_into_clauses, simplify_clause, chat_with_model
6
 
7
# ---------------------------------------------------
# ✅ PAGE CONFIG
# ---------------------------------------------------
st.set_page_config(
    page_title="ClauseWise – NDA Assistant",
    layout="wide"
)

# Centered page heading rendered as raw HTML.
st.markdown(
    "<h2 style='text-align:center;'>ClauseWise Multilingual NDA Legal Assistant</h2>",
    unsafe_allow_html=True
)

# ---------------------------------------------------
# ✅ LANGUAGE HANDLING
# ---------------------------------------------------
# Display label -> language code used to index UI_TEXT entries.
LANGUAGES = {
    "English": "en",
    "हिन्दी (Hindi)": "hi",
    "தமிழ் (Tamil)": "ta",
    "తెలుగు (Telugu)": "te",
    "ಕನ್ನಡ (Kannada)": "kn"
}

if "lang" not in st.session_state:
    st.session_state.lang = "en"

selected_label = st.selectbox("🌐 Language", list(LANGUAGES.keys()))
st.session_state.lang = LANGUAGES[selected_label]

# T maps each UI text key to its string in the selected language.
# NOTE(review): a UI_TEXT entry without the selected language code raises
# KeyError here -- confirm every entry covers all five codes.
T = {k: v[st.session_state.lang] for k, v in UI_TEXT.items()}
37
+
38
# ---------------------------------------------------
# ✅ LOAD CHAT MODEL (DistilGPT2 – HF SAFE)
# ---------------------------------------------------
@st.cache_resource
def load_chat_model():
    """Load the ``distilgpt2`` model and tokenizer, cached by Streamlit.

    Returns:
        A ``(model, tokenizer)`` tuple with the end-of-sequence token reused
        as the padding token on both objects.
    """
    model_name = "distilgpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Use EOS for padding on the tokenizer and in the model config.
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

    return model, tokenizer
51
 
52
+
53
model, tokenizer = load_chat_model()

# Chat history: list of (role, message) tuples kept across reruns.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []


# ---------------------------------------------------
# ✅ FILE UPLOAD
# ---------------------------------------------------
st.subheader(T["upload_title"])
uploaded = st.file_uploader(T["upload_instruction"], type=["pdf", "txt", "docx"])
65
+
66
if uploaded:
    # Imported at the top of the flow (it was buried inside the entities
    # tab) because the NDA check below needs it as well.
    import re

    st.info("⏳ Reading file...")
    # An empty/None extraction result is treated as empty text so the length
    # check below rejects it instead of raising.
    text = extract_text(uploaded) or ""

    # ---------------------------------------------------
    # ✅ STRICT NDA DETECTION
    # ---------------------------------------------------
    NDA_KEYWORDS = [
        "non-disclosure", "non disclosure", "nda",
        "confidential information", "disclosing party",
        "receiving party", "confidentiality",
        "confidential materials", "protected information"
    ]

    # Match keywords on word boundaries: a plain substring test lets "nda"
    # match inside words such as "standard" or "agenda", which would accept
    # almost any document.
    lowered_text = text.lower()
    looks_like_nda = any(
        re.search(rf"\b{re.escape(keyword.lower())}\b", lowered_text)
        for keyword in NDA_KEYWORDS
    )

    if len(text) < 50 or not looks_like_nda:
        st.error(T["error_not_nda"])
        st.stop()

    st.success(T["success_nda"])

    # Clauses are shared by several tabs, so compute them once up front.
    clauses = split_into_clauses(text)

    # ---------------------------------------------------
    # ✅ ANALYSIS TABS
    # ---------------------------------------------------
    st.subheader(T["analysis_title"])
    tabs = st.tabs([
        T["tab_clauses"],
        T["tab_risks"],
        T["tab_fairness"],
        T["tab_entities"],
        T["tab_alternatives"],
        T["tab_chat"],
    ])

    # ===================================================
    # ✅ TAB 1 — CLAUSE SIMPLIFICATION
    # ===================================================
    with tabs[0]:
        st.markdown(f"### {T['clause_simplify']}")

        # Options are (mode_id, label) pairs; the label is displayed and the
        # id of the chosen pair is kept.
        mode = st.radio(
            T["choose_mode"],
            [("eli5", T["eli5"]), ("simple", T["simple"]), ("pro", T["pro"])],
            format_func=lambda x: x[1]
        )[0]

        for i, c in enumerate(clauses):
            with st.expander(f"Clause {i+1}"):
                st.write("**Original:**")
                st.write(c)

                st.write("**Explanation:**")
                # NOTE(review): called as (clause, mode) -- confirm this
                # matches the signature of util.simplify_clause.
                st.write(simplify_clause(c, mode))

    # ===================================================
    # ✅ TAB 2 — RISK ANALYSIS
    # ===================================================
    with tabs[1]:
        st.markdown(f"### {T['risk_title']}")

        # Simple risk detector: label -> lowercase trigger phrases.
        RISK_PATTERNS = {
            "Broad confidentiality definition": ["broad", "all information", "any information"],
            "Unlimited liability": ["unlimited", "full liability", "all damages"],
            "One-sided obligations": ["shall not", "only the receiving party"],
            "Long duration (>5 years)": ["5 years", "7 years", "perpetual"],
            "No termination rights": ["cannot terminate", "no termination"]
        }

        risks_found = []

        for clause in clauses:
            lower_c = clause.lower()
            for risk_label, kws in RISK_PATTERNS.items():
                if any(k in lower_c for k in kws):
                    risks_found.append(risk_label)

        # De-duplicate while keeping first-seen order, then cap.
        risks_found = list(dict.fromkeys(risks_found))[:5]  # top 5

        if not risks_found:
            st.success("✅ No major risks detected.")
        else:
            for r in risks_found:
                st.error("⚠️ " + r)

    # ===================================================
    # ✅ TAB 3 — FAIRNESS METER
    # ===================================================
    with tabs[2]:
        st.markdown(f"### {T['fairness_title']}")

        # Start from 50 and subtract 7 per detected risk, clamped to [20, 90].
        fairness_score = max(20, min(90, 50 - len(risks_found) * 7))

        st.write(f"**{T['your_position']}:** {fairness_score}%")
        st.write(f"**{T['company_position']}:** {100 - fairness_score}%")

        st.progress(fairness_score / 100)

    # ===================================================
    # ✅ TAB 4 — ENTITIES
    # ===================================================
    with tabs[3]:
        st.markdown(f"### {T['entities_title']}")

        parties = []
        dates = []
        money = []

        for clause in clauses:
            if "party" in clause.lower():
                parties.append(clause[:80] + "...")

            money.extend(re.findall(r"\$[\d,]+", clause))
            dates.extend(re.findall(r"\b(?:\d{1,2}\/\d{1,2}\/\d{2,4}|20\d{2})\b", clause))

        # dict.fromkeys de-duplicates like set() but keeps document order, so
        # the lists do not reshuffle between reruns.
        st.write("**Parties:**", list(dict.fromkeys(parties)))
        st.write("**Dates:**", list(dict.fromkeys(dates)))
        st.write("**Amounts:**", list(dict.fromkeys(money)))

    # ===================================================
    # ✅ TAB 5 — ALTERNATIVE CLAUSES
    # ===================================================
    with tabs[4]:
        st.markdown(f"### {T['alt_title']}")

        ALTS = [
            "A mutual confidentiality clause where both parties share equal protection.",
            "A time-limited confidentiality period of 2–3 years.",
            "Liability capped at a fixed reasonable amount."
        ]

        for alt in ALTS:
            st.info(alt)

    # ===================================================
    # ✅ TAB 6 — LEGAL CHAT ASSISTANT
    # ===================================================
    with tabs[5]:
        st.markdown(f"### {T['chat_title']}")

        user_input = st.text_input(T["chat_placeholder"])

        # The script reruns on every widget interaction while the text box
        # keeps its value; only answer when the message has changed, so the
        # same exchange is not regenerated and appended again on each rerun.
        if user_input and user_input != st.session_state.get("last_chat_input"):
            st.session_state.last_chat_input = user_input
            reply = chat_with_model(model, tokenizer, user_input, st.session_state.chat_history)

            st.session_state.chat_history.append(("User", user_input))
            st.session_state.chat_history.append(("AI", reply))

        # Show only the ten most recent messages.
        for role, msg in st.session_state.chat_history[-10:]:
            if role == "User":
                st.markdown(f"🧑 **You:** {msg}")
            else:
                st.markdown(f"🤖 **ClauseWise:** {msg}")