deepthi6 commited on
Commit
4612bd8
·
verified ·
1 Parent(s): 55ae994

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +206 -80
app.py CHANGED
@@ -38,38 +38,52 @@ LANG_NAMES = list(LANG_MAP.keys())
38
  # -----------------------------
39
  @st.cache_resource
40
  def load_models():
41
- simplify_model_name = "mrm8488/t5-small-finetuned-text-simplification"
42
- tokenizer_simplify = AutoTokenizer.from_pretrained(simplify_model_name)
43
- simplify_model = AutoModelForSeq2SeqLM.from_pretrained(simplify_model_name)
 
 
44
 
45
- gen_model_id = "microsoft/phi-2"
46
- gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_id)
47
- gen_model = AutoModelForCausalLM.from_pretrained(gen_model_id)
48
 
49
- # ✅ Auto-download SpaCy if missing
50
- try:
51
- nlp = spacy.load("en_core_web_sm")
52
- except OSError:
53
- from spacy.cli import download
54
- download("en_core_web_sm")
55
- nlp = spacy.load("en_core_web_sm")
 
 
 
 
 
 
 
 
56
 
57
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
58
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
59
- return tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer
 
 
60
 
61
- tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer = load_models()
62
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
63
- gen_model.to(DEVICE)
 
64
 
65
  # -----------------------------
66
  # UTILITIES
67
  # -----------------------------
68
  def extract_text(file):
 
69
  if not file:
70
  return ""
71
  name = file.name.lower()
72
- with tempfile.NamedTemporaryFile(delete=False) as tmp:
73
  tmp.write(file.read())
74
  tmp_path = tmp.name
75
  text = ""
@@ -82,67 +96,153 @@ def extract_text(file):
82
  text += t + "\n"
83
  elif name.endswith(".docx"):
84
  doc = Document(tmp_path)
85
- text = "\n".join([p.text for p in doc.paragraphs])
86
  else:
87
- text = open(tmp_path, "r", encoding="utf-8", errors="ignore").read()
 
88
  except Exception as e:
89
  st.error(f"Error reading file: {e}")
90
  finally:
91
- os.remove(tmp_path)
 
92
  return text.strip()
93
 
94
  def translate_text(text, target_lang):
95
- lang_code = LANG_MAP[target_lang]
 
 
 
 
96
  if lang_code == "en":
97
  return text
 
98
  try:
 
 
99
  translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")
100
- return translator(text[:1000])[0]["translation_text"]
101
- except Exception:
102
- return f"(Translation unavailable for {target_lang})"
 
 
103
 
104
  def text_to_speech(text, lang):
 
 
 
 
105
  try:
106
- lang_code = LANG_MAP[lang]
107
- tts = gTTS(text=text, lang=lang_code)
 
 
108
  audio_fp = BytesIO()
109
  tts.write_to_fp(audio_fp)
110
  audio_fp.seek(0)
111
  return audio_fp
112
- except Exception:
113
- st.warning("Audio unavailable for this language.")
114
  return None
115
 
116
  def clause_simplification(text, mode):
117
- prefix = {
 
 
 
 
118
  "Simplified": "simplify: ",
119
  "Explain like I'm 5": "explain like I'm 5: ",
120
  "Professional": "rephrase professionally: "
121
- }.get(mode, "simplify: ")
122
- inputs = tokenizer_simplify(prefix + text, return_tensors="pt", truncation=True, max_length=512)
123
- outputs = simplify_model.generate(**inputs, max_length=256, num_beams=4, early_stopping=True)
124
- return tokenizer_simplify.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  def fairness_score_visual(text, lang):
127
- pos = len(re.findall(r"(mutual|both parties|shared)", text, re.I))
128
- neg = len(re.findall(r"(sole|unilateral|exclusive right)", text, re.I))
129
- score = max(0, min(100, 70 + pos - 2 * neg))
 
 
 
 
 
 
130
 
131
  st.subheader("⚖️ Fairness Balance Meter")
 
132
  fairness_df = pd.DataFrame({
133
  "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
134
- "Score": [100 - score, score // 2, score]
135
  })
136
- fig = px.bar(fairness_df, x="Score", y="Aspect", orientation="h", text="Score", color="Aspect")
137
- fig.update_layout(showlegend=False, xaxis_title="Score", yaxis_title="")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  st.plotly_chart(fig, use_container_width=True)
139
- st.info(translate_text(f"Fairness Score: {score}% (Approximate)", lang))
 
 
 
 
140
 
141
  def chat_response(prompt, lang):
142
- inputs = gen_tokenizer(prompt, return_tensors="pt").to(DEVICE)
143
- outputs = gen_model.generate(**inputs, max_new_tokens=300, temperature=0.7, top_p=0.9, do_sample=True)
144
- response = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
145
- return translate_text(response, lang)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  # -----------------------------
148
  # MAIN STREAMLIT APP FUNCTION
@@ -159,72 +259,98 @@ def main():
159
  # TAB 1: ANALYZER
160
  with tab1:
161
  st.subheader("📁 Upload or Paste Legal Document")
162
- lang = st.selectbox("Select Language:", LANG_NAMES, index=0)
163
  file = st.file_uploader("Upload a Legal Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
164
- text_input = st.text_area("Or Paste Text Here:", height=200)
165
 
166
  if file or text_input:
167
  text = extract_text(file) if file else text_input
168
 
169
- mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
 
 
 
170
 
171
- if st.button("🧾 Simplify Clauses"):
172
- with st.spinner("Simplifying..."):
173
- simplified = clause_simplification(text, mode)
174
- translated = translate_text(simplified, lang)
175
- st.success(translated)
176
- audio_data = text_to_speech(translated, lang)
177
- if audio_data:
178
- st.audio(audio_data, format="audio/mp3")
 
179
 
180
- if st.button("⚖️ Fairness Analysis"):
181
- fairness_score_visual(text, lang)
 
182
 
183
  # TAB 2: TRANSLATION + AUDIO
184
  with tab2:
185
  st.subheader("🌐 Translate & Listen")
186
- text_input = st.text_area("Enter text:", height=200)
187
- lang = st.selectbox("Translate to:", LANG_NAMES, index=4)
188
 
189
  if st.button("Translate"):
190
- translated = translate_text(text_input, lang)
191
- st.success(translated)
 
 
 
 
 
192
  if st.button("🎧 Generate Audio"):
193
- audio_data = text_to_speech(text_input, lang)
194
- if audio_data:
195
- st.audio(audio_data, format="audio/mp3")
 
 
 
 
196
 
197
  # TAB 3: CHATBOT
198
  with tab3:
199
  st.subheader("💬 Chat with ClauseWise (Multilingual)")
200
- lang = st.selectbox("Chat Language:", LANG_NAMES, index=4)
201
- query = st.text_area("Ask about clauses, fairness, or legal meaning:", height=150)
 
202
  if st.button("Ask"):
203
- with st.spinner("Thinking..."):
204
- response = chat_response(f"You are a legal assistant. Answer helpfully: {query}", lang)
205
- st.success(response)
206
- audio_data = text_to_speech(response, lang)
207
- if audio_data:
208
- st.audio(audio_data, format="audio/mp3")
 
 
 
 
209
 
210
  # TAB 4: ABOUT
211
  with tab4:
212
  st.markdown("""
213
  ### ⚖️ About ClauseWise
214
  ClauseWise is a multilingual AI-powered legal assistant that helps users:
215
- - Simplify complex clauses
216
- - Translate and listen in 10+ languages
217
- - Assess fairness visually
218
- - Chat interactively
219
 
220
  **Languages Supported:**
221
  English, French, Spanish, German, Hindi, Tamil, Telugu, Kannada, Marathi, Gujarati, Bengali
222
 
223
- **Disclaimer:** Educational purposes only, not legal advice.
 
 
 
 
 
 
 
224
  """)
225
 
226
  # -----------------------------
227
  # RUN STREAMLIT APP SAFELY
228
  # -----------------------------
229
  if __name__ == "__main__":
230
- main()
 
# -----------------------------
@st.cache_resource
def load_models():
    """Load every model the app depends on.

    Returns a 7-tuple ``(tokenizer_simplify, simplify_model, gen_tokenizer,
    gen_model, nlp, classifier, summarizer)``, or a tuple of seven ``None``
    values when any download/initialization fails.
    """
    try:
        # T5 model + tokenizer used for clause simplification.
        t5_id = "mrm8488/t5-small-finetuned-text-simplification"
        simplify_tok = AutoTokenizer.from_pretrained(t5_id)
        simplify_mdl = AutoModelForSeq2SeqLM.from_pretrained(t5_id)

        # Phi-2 causal LM powering the chatbot tab.
        # NOTE(review): trust_remote_code=True executes model-repo code — keep
        # the model id pinned to a trusted source.
        phi_id = "microsoft/phi-2"
        chat_tok = AutoTokenizer.from_pretrained(phi_id, trust_remote_code=True)
        chat_mdl = AutoModelForCausalLM.from_pretrained(phi_id, trust_remote_code=True)

        # SpaCy English pipeline; downloaded on first run if it is missing.
        try:
            nlp = spacy.load("en_core_web_sm")
        except OSError:
            from spacy.cli import download
            download("en_core_web_sm")
            nlp = spacy.load("en_core_web_sm")

        zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        summarize = pipeline("summarization", model="facebook/bart-large-cnn")

        return simplify_tok, simplify_mdl, chat_tok, chat_mdl, nlp, zero_shot, summarize
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return (None,) * 7

# Load models once (st.cache_resource memoizes across reruns) and bail out of
# the script early if loading failed.
model_data = load_models()
if model_data[0] is None:
    st.error("Failed to load models. Please check your internet connection and try again.")
    st.stop()

tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer = model_data
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if gen_model is not None:
    gen_model.to(DEVICE)
78
  # -----------------------------
79
  # UTILITIES
80
  # -----------------------------
81
  def extract_text(file):
82
+ """Extract text from uploaded file"""
83
  if not file:
84
  return ""
85
  name = file.name.lower()
86
+ with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(name)[1]) as tmp:
87
  tmp.write(file.read())
88
  tmp_path = tmp.name
89
  text = ""
 
96
  text += t + "\n"
97
  elif name.endswith(".docx"):
98
  doc = Document(tmp_path)
99
+ text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
100
  else:
101
+ with open(tmp_path, "r", encoding="utf-8", errors="ignore") as f:
102
+ text = f.read()
103
  except Exception as e:
104
  st.error(f"Error reading file: {e}")
105
  finally:
106
+ if os.path.exists(tmp_path):
107
+ os.remove(tmp_path)
108
  return text.strip()
109
 
110
def translate_text(text, target_lang):
    """Translate English *text* into the language named by *target_lang*.

    Args:
        text: Source text (assumed English). Falsy input yields "".
        target_lang: Display name from LANG_NAMES; unknown names fall back
            to English (i.e. the text is returned unchanged).

    Returns:
        The translated text; the original text when the target is English or
        when translation fails; "" for empty input.
    """
    if not text:
        return ""

    lang_code = LANG_MAP.get(target_lang, "en")
    if lang_code == "en":
        return text

    try:
        # Cache one translation pipeline per language code: the previous code
        # re-instantiated the heavy Helsinki-NLP model on every single call.
        cache = getattr(translate_text, "_translators", None)
        if cache is None:
            cache = translate_text._translators = {}
        translator = cache.get(lang_code)
        if translator is None:
            translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")
            cache[lang_code] = translator

        # Truncate text to manageable size for the model.
        result = translator(text[:500], max_length=512)
        return result[0]["translation_text"]
    except Exception as e:
        st.warning(f"Translation unavailable for {target_lang}: {str(e)}")
        return text  # Return original text if translation fails
128
 
129
def text_to_speech(text, lang):
    """Render *text* as spoken MP3 audio via gTTS.

    Args:
        text: Text to synthesize; capped at 1000 characters for TTS.
        lang: Display name from LANG_NAMES; unknown names fall back to English.

    Returns:
        A BytesIO positioned at the start of the MP3 data, or None when the
        input is empty or synthesis fails.
    """
    if not text:
        return None
    try:
        code = LANG_MAP.get(lang, "en")
        speech = gTTS(text=text[:1000], lang=code, slow=False)  # limit length for TTS
        buffer = BytesIO()
        speech.write_to_fp(buffer)
        buffer.seek(0)
        return buffer
    except Exception as e:
        st.warning(f"Audio generation unavailable: {str(e)}")
        return None
146
 
147
def clause_simplification(text, mode):
    """Rewrite legal text with the T5 simplification model according to *mode*.

    Args:
        text: Clause text to rewrite; returned unchanged when empty or when
            the model failed to load.
        mode: One of "Simplified", "Explain like I'm 5", "Professional";
            unknown modes fall back to plain simplification.

    Returns:
        The rewritten clause, or the original text on any error.
    """
    if not text or simplify_model is None:
        return text

    prefix = {
        "Simplified": "simplify: ",
        "Explain like I'm 5": "explain like I'm 5: ",
        "Professional": "rephrase professionally: "
    }.get(mode, "simplify: ")

    try:
        # Let the tokenizer enforce the length limit: truncating at 512
        # *tokens* keeps far more context than the previous hard cut at
        # 500 *characters* (which was redundant with tokenizer truncation).
        inputs = tokenizer_simplify(
            prefix + text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        # Inference only — no_grad avoids building an autograd graph.
        with torch.no_grad():
            outputs = simplify_model.generate(
                **inputs,
                max_length=256,
                num_beams=4,
                early_stopping=True,
            )
        return tokenizer_simplify.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        st.error(f"Simplification error: {e}")
        return text
178
 
179
def _fairness_score(text):
    """Heuristic fairness score in [0, 100] from balanced vs. one-sided keywords."""
    pos = len(re.findall(r"\b(mutual|both parties|shared|equal|fair|balanced)\b", text, re.I))
    neg = len(re.findall(r"\b(sole|unilateral|exclusive right|one-sided|only)\b", text, re.I))
    # Start at a neutral 50 and move 5 points per keyword hit, clamped to [0, 100].
    return max(0, min(100, 50 + (pos * 5) - (neg * 5)))

def fairness_score_visual(text, lang):
    """Compute a keyword-based fairness score and render it in Streamlit.

    Args:
        text: Document text to analyze; warns and returns early when empty.
        lang: Target display language for the textual summary.
    """
    if not text:
        st.warning("No text to analyze.")
        return

    score = _fairness_score(text)

    st.subheader("⚖️ Fairness Balance Meter")

    # score is already clamped to [0, 100] by _fairness_score, so the extra
    # max(0, ...) / min(100, ...) wrappers previously applied here were dead code.
    fairness_df = pd.DataFrame({
        "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
        "Score": [100 - score, score, score]
    })

    fig = px.bar(
        fairness_df,
        x="Score",
        y="Aspect",
        orientation="h",
        text="Score",
        color="Aspect",
        color_discrete_sequence=["#ff6b6b", "#4ecdc4", "#95e1d3"]
    )
    fig.update_layout(
        showlegend=False,
        xaxis_title="Score",
        yaxis_title="",
        height=300
    )
    st.plotly_chart(fig, use_container_width=True)

    # Localize the textual summary for the user's selected language.
    fairness_text = f"Fairness Score: {score}% (Approximate - based on keyword analysis)"
    translated_result = translate_text(fairness_text, lang)
    st.info(translated_result)
218
 
219
def chat_response(prompt, lang):
    """Answer *prompt* with the Phi-2 model and translate the reply to *lang*.

    Returns a fixed fallback string when the prompt is empty, the model is
    unavailable, or generation raises.
    """
    if not prompt or gen_model is None:
        return "Unable to generate response. Please try again."

    try:
        framed = f"You are a helpful legal assistant. Answer the following question: {prompt}\n\nAnswer:"
        encoded = gen_tokenizer(framed, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
        generated = gen_model.generate(
            **encoded,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=gen_tokenizer.eos_token_id,
        )
        decoded = gen_tokenizer.decode(generated[0], skip_special_tokens=True)

        # Keep only the text after the final "Answer:" marker, if present.
        _, marker, tail = decoded.rpartition("Answer:")
        if marker:
            decoded = tail.strip()

        return translate_text(decoded, lang)
    except Exception as e:
        st.error(f"Chat error: {e}")
        return "I'm having trouble generating a response. Please try rephrasing your question."
246
 
247
  # -----------------------------
248
  # MAIN STREAMLIT APP FUNCTION
 
259
  # TAB 1: ANALYZER
260
  with tab1:
261
  st.subheader("📁 Upload or Paste Legal Document")
262
+ lang = st.selectbox("Select Language:", LANG_NAMES, index=0, key="analyzer_lang")
263
  file = st.file_uploader("Upload a Legal Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
264
+ text_input = st.text_area("Or Paste Text Here:", height=200, key="analyzer_text")
265
 
266
  if file or text_input:
267
  text = extract_text(file) if file else text_input
268
 
269
+ if not text.strip():
270
+ st.warning("Please provide some text to analyze.")
271
+ else:
272
+ mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
273
 
274
+ if st.button("🧾 Simplify Clauses"):
275
+ with st.spinner("Simplifying..."):
276
+ simplified = clause_simplification(text, mode)
277
+ translated = translate_text(simplified, lang)
278
+ st.success(translated)
279
+
280
+ audio_data = text_to_speech(translated, lang)
281
+ if audio_data:
282
+ st.audio(audio_data, format="audio/mp3")
283
 
284
+ if st.button("⚖️ Fairness Analysis"):
285
+ with st.spinner("Analyzing fairness..."):
286
+ fairness_score_visual(text, lang)
287
 
288
  # TAB 2: TRANSLATION + AUDIO
289
  with tab2:
290
  st.subheader("🌐 Translate & Listen")
291
+ text_input = st.text_area("Enter text:", height=200, key="translate_text")
292
+ lang = st.selectbox("Translate to:", LANG_NAMES, index=4, key="translate_lang")
293
 
294
  if st.button("Translate"):
295
+ if text_input.strip():
296
+ with st.spinner("Translating..."):
297
+ translated = translate_text(text_input, lang)
298
+ st.success(translated)
299
+ else:
300
+ st.warning("Please enter some text to translate.")
301
+
302
  if st.button("🎧 Generate Audio"):
303
+ if text_input.strip():
304
+ with st.spinner("Generating audio..."):
305
+ audio_data = text_to_speech(text_input, lang)
306
+ if audio_data:
307
+ st.audio(audio_data, format="audio/mp3")
308
+ else:
309
+ st.warning("Please enter some text for audio generation.")
310
 
311
  # TAB 3: CHATBOT
312
  with tab3:
313
  st.subheader("💬 Chat with ClauseWise (Multilingual)")
314
+ lang = st.selectbox("Chat Language:", LANG_NAMES, index=0, key="chat_lang")
315
+ query = st.text_area("Ask about clauses, fairness, or legal meaning:", height=150, key="chat_query")
316
+
317
  if st.button("Ask"):
318
+ if query.strip():
319
+ with st.spinner("Thinking..."):
320
+ response = chat_response(query, lang)
321
+ st.success(response)
322
+
323
+ audio_data = text_to_speech(response, lang)
324
+ if audio_data:
325
+ st.audio(audio_data, format="audio/mp3")
326
+ else:
327
+ st.warning("Please enter a question.")
328
 
329
  # TAB 4: ABOUT
330
  with tab4:
331
  st.markdown("""
332
  ### ⚖️ About ClauseWise
333
  ClauseWise is a multilingual AI-powered legal assistant that helps users:
334
+ - **Simplify complex clauses** into easy-to-understand language
335
+ - **Translate and listen** in 10+ languages
336
+ - **Assess fairness** visually with keyword analysis
337
+ - **Chat interactively** about legal concepts
338
 
339
  **Languages Supported:**
340
  English, French, Spanish, German, Hindi, Tamil, Telugu, Kannada, Marathi, Gujarati, Bengali
341
 
342
+ **Technologies Used:**
343
+ - Hugging Face Transformers (T5, Phi-2, BART)
344
+ - SpaCy for NLP
345
+ - Google Text-to-Speech (gTTS)
346
+ - Plotly for visualizations
347
+
348
+ **⚠️ Disclaimer:** This tool is for educational purposes only and does not constitute legal advice.
349
+ Always consult with a qualified legal professional for legal matters.
350
  """)
351
 
352
  # -----------------------------
353
  # RUN STREAMLIT APP SAFELY
354
  # -----------------------------
355
  if __name__ == "__main__":
356
+ main()