deepthi6 committed on
Commit
5e6f621
Β·
verified Β·
1 Parent(s): 4dfe5da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -199
app.py CHANGED
@@ -1,218 +1,231 @@
1
  import os
2
- import json
3
  import re
4
  import io
5
- from typing import List, Dict, Tuple, Optional, Any
6
-
7
  import torch
8
- from transformers import AutoModelForCausalLM, AutoTokenizer
9
- from PyPDF2 import PdfReader # βœ… PyPDF2 instead of pypdf (lighter, preinstalled on HF)
10
- import docx
 
 
 
 
 
 
 
 
11
  import spacy
12
- import gradio as gr
 
13
 
14
# -----------------------------
# Model (Granite or lightweight fallback)
# -----------------------------
# Try IBM Granite first; any failure (download error, gated repo, memory)
# falls back to the much smaller phi-2 so the app still boots.
try:
    MODEL_ID = "ibm-granite/granite-3.2-2b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        # bfloat16 only when a GPU is present; CPU stays in float32
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None
    )
except Exception:
    MODEL_ID = "microsoft/phi-2"  # lightweight fallback if Granite fails
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# Pick the device once at import time and move the model there.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)
 
 
 
 
32
 
33
# -----------------------------
# Load spaCy model
# -----------------------------
# en_core_web_sm is not bundled with spaCy itself; download it on first
# run, then load it again.
try:
    nlp = spacy.load("en_core_web_sm")
except Exception:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
 
 
 
 
 
 
 
42
 
43
# -----------------------------
# Helper: Prompt builder
# -----------------------------
def build_chat_prompt(system_prompt: str, user_prompt: str) -> str:
    """Assemble a chat-formatted prompt for the model.

    Prefers the tokenizer's own chat template; if that is unavailable
    for any reason, falls back to a plain bracket-tagged layout.
    """
    chat = [{"role": "system", "content": system_prompt}] if system_prompt else []
    chat.append({"role": "user", "content": user_prompt})
    try:
        return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    except Exception:
        # Tokenizer has no chat template (or is not loaded) — use tags.
        return f"[SYSTEM]\n{system_prompt}\n[USER]\n{user_prompt}\n[ASSISTANT]\n"
55
-
56
# -----------------------------
# Text generation
# -----------------------------
def llm_generate(system_prompt: str, user_prompt: str, max_new_tokens=512) -> str:
    """Generate a completion for the given prompts and strip the prompt echo.

    Returns only the assistant's reply, whichever prompt format was used.
    """
    prompt = build_chat_prompt(system_prompt, user_prompt)
    encoded = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.3,
            pad_token_id=tokenizer.eos_token_id
        )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    # Fallback prompt format: the reply follows the last [ASSISTANT] tag.
    if "[ASSISTANT]" in decoded:
        return decoded.split("[ASSISTANT]")[-1].strip()
    # Chat-template format: drop the echoed prompt prefix when present.
    if decoded.startswith(prompt):
        return decoded[len(prompt):].strip()
    return decoded.strip()
77
-
78
# -----------------------------
# File loaders
# -----------------------------
def load_text_from_pdf(file_obj) -> str:
    """Concatenate the extracted text of every page of a PDF."""
    pdf = PdfReader(file_obj)
    chunks = []
    for page in pdf.pages:
        # extract_text() may return None for image-only pages
        chunks.append(page.extract_text() or "")
    return "\n".join(chunks).strip()
85
-
86
def load_text_from_docx(file_obj) -> str:
    """Join all paragraph texts from an uploaded .docx stream."""
    raw = file_obj.read()
    file_obj.seek(0)  # leave the stream rewound for any later reader
    document = docx.Document(io.BytesIO(raw))
    return "\n".join(paragraph.text for paragraph in document.paragraphs).strip()
93
-
94
def load_text_from_txt(file_obj) -> str:
    """Read a text upload, decoding bytes as UTF-8 and ignoring bad bytes."""
    raw = file_obj.read()
    decoded = raw.decode("utf-8", errors="ignore") if isinstance(raw, bytes) else raw
    return str(decoded).strip()
99
-
100
def load_document(file: Optional[gr.File]) -> str:
    """Extract text from an uploaded file, picking a loader by extension.

    Unknown extensions are probed: PDF first, then DOCX, then plain text.
    Returns "" when no file was supplied.
    """
    if not file:
        return ""
    name = (file.name or "").lower()
    for suffix, loader in ((".pdf", load_text_from_pdf),
                           (".docx", load_text_from_docx),
                           (".txt", load_text_from_txt)):
        if name.endswith(suffix):
            return loader(file)
    # Unknown extension: try loaders from strictest to most permissive.
    try:
        return load_text_from_pdf(file)
    except Exception:
        try:
            return load_text_from_docx(file)
        except Exception:
            return load_text_from_txt(file)
118
-
119
# -----------------------------
# Clause extraction
# -----------------------------
def split_into_clauses(text: str, min_len=40) -> List[str]:
    r"""Split a document into candidate clauses.

    The primary pass splits on numbered headings ("1.", "2.3)"), lettered
    headings ("A.", "B)") at line starts, and (optionally semicolon-
    terminated) line breaks. If that yields fewer than two parts, fall
    back to sentence-ish boundaries after '.' or ';'. Fragments shorter
    than min_len are dropped, and case/whitespace-insensitive duplicates
    removed, keeping the first occurrence.

    Fix: the heading patterns were mangled — `\d+(?:\.\d+)[.)]` required a
    dotted sub-number (so plain "1." never matched) and `^\s[A-Z]` required
    exactly one whitespace char. Quantifiers restored to `(?:\.\d+)*` and
    `^\s*`.
    """
    if not text:
        return []
    parts = re.split(
        r"(?:(?:^\s*\d+(?:\.\d+)*[.)]\s+)|(?:^\s*[A-Z]\s*[.)]\s+)|(?:;?\s*\n))",
        text,
        flags=re.MULTILINE,
    )
    if len(parts) < 2:
        # Fallback: split after sentence/clause terminators.
        parts = re.split(r"(?<=[.;])\s+\n?\s*", text)
    clauses = [p.strip() for p in parts if len(p.strip()) >= min_len]
    seen, unique = set(), []
    for c in clauses:
        key = re.sub(r"\s+", " ", c.lower())
        if key not in seen:
            seen.add(key)
            unique.append(c)
    return unique
136
-
137
# -----------------------------
# Simplify clause
# -----------------------------
def simplify_clause(clause: str) -> str:
    """Rewrite a single contract clause in plain English via the LLM."""
    return llm_generate(
        "You are a legal assistant simplifying contract clauses for clarity.",
        f"Rewrite this clause in plain English:\n\n{clause}",
        max_new_tokens=300,
    )
144
-
145
# -----------------------------
# Named Entity Recognition
# -----------------------------
def ner_entities(text: str) -> Dict[str, List[str]]:
    """Group spaCy entities by label; values are de-duplicated and sorted."""
    if not text:
        return {}
    grouped: Dict[str, List[str]] = {}
    for entity in nlp(text).ents:
        grouped.setdefault(entity.label_, []).append(entity.text)
    return {label: sorted(set(names)) for label, names in grouped.items()}
156
-
157
# -----------------------------
# Document classification
# -----------------------------
DOC_TYPES = [
    "Non-Disclosure Agreement (NDA)",
    "Lease Agreement",
    "Employment Contract",
    "Service Agreement",
    "Sales Agreement",
    "Consulting Agreement",
    "Terms of Service"
]

def classify_document(text: str) -> str:
    """Ask the LLM to label the document with one of DOC_TYPES.

    Only the first 4000 characters are sent. Returns "Unclassified" when
    the model's reply does not mention any known type.
    """
    system = "You are a legal document classifier."
    labels = "\n".join(f"- {t}" for t in DOC_TYPES)
    user = f"Classify the document into one of these types:\n{labels}\n\nDocument:\n{text[:4000]}"
    resp = llm_generate(system, user, max_new_tokens=200)
    lowered = resp.lower()
    # First DOC_TYPES entry mentioned in the reply wins.
    return next((t for t in DOC_TYPES if t.lower() in lowered), "Unclassified")
179
-
180
# -----------------------------
# Input handler
# -----------------------------
def get_text_from_inputs(file: Optional[gr.File], text: str) -> str:
    """Pick whichever input source (upload vs. pasted text) yields more content."""
    from_file = load_document(file) if file else ""
    pasted = (text or "").strip()
    return from_file if len(from_file) > len(pasted) else pasted
187
-
188
# -----------------------------
# Gradio Interface
# -----------------------------
def analyze_document(file, text):
    """Run the full pipeline over one document.

    Returns a tuple for the three Gradio outputs:
    (summary text, entity dict, first five clauses).
    """
    content = get_text_from_inputs(file, text)
    if not content:
        return "No content found.", {}, []
    clauses = split_into_clauses(content)
    entities = ner_entities(content)
    doc_type = classify_document(content)
    sample = simplify_clause(clauses[0]) if clauses else "No clause to simplify."
    report = (
        f"Found {len(clauses)} clauses."
        + f"\n\nDocument Type: {doc_type}\n\nSample Simplified Clause:\n{sample}"
    )
    return report, entities, clauses[:5]
201
-
202
# Single-page UI: one handler (analyze_document) mapping two inputs
# (file upload OR pasted text) to three outputs.
iface = gr.Interface(
    fn=analyze_document,
    inputs=[
        gr.File(label="Upload a Legal Document (PDF/DOCX/TXT)"),
        gr.Textbox(label="...or Paste Text", lines=5)
    ],
    outputs=[
        gr.Textbox(label="Analysis Summary"),
        gr.JSON(label="Entities Found"),
        gr.Textbox(label="Extracted Clauses (first 5)", lines=10)
    ],
    title="βš–οΈ ClauseWise: Legal Document Analyzer",
    description="Upload a contract or paste text to extract clauses, identify entities, and simplify content."
)

if __name__ == "__main__":
    iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
2
  import re
3
  import io
4
+ import tempfile
 
5
  import torch
6
+ import pandas as pd
7
+ import plotly.express as px
8
+ import streamlit as st
9
+ from transformers import (
10
+ AutoTokenizer,
11
+ AutoModelForCausalLM,
12
+ AutoModelForSeq2SeqLM,
13
+ pipeline
14
+ )
15
+ from PyPDF2 import PdfReader
16
+ from docx import Document
17
  import spacy
18
+ from gtts import gTTS
19
+ from io import BytesIO
20
 
21
# -----------------------------
# PAGE CONFIG
# -----------------------------
# set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="βš–οΈ ClauseWise: Multilingual Legal AI Assistant", page_icon="βš–οΈ", layout="wide")

st.title("βš–οΈ ClauseWise: Multilingual Legal AI Assistant")
st.markdown("""
ClauseWise helps you **simplify, translate, and understand legal documents** in your preferred language.
Upload contracts, extract clauses, check fairness, and chat with your AI legal assistant β€” all multilingual and with audio output.
---
""")
32
 
33
# -----------------------------
# LANGUAGE MAP
# -----------------------------
# Display name -> ISO 639-1 code; the codes feed both the Helsinki-NLP
# translation model ids (opus-mt-en-<code>) and the gTTS voices.
LANG_MAP = {
    "English": "en", "French": "fr", "Spanish": "es", "German": "de",
    "Hindi": "hi", "Tamil": "ta", "Telugu": "te", "Kannada": "kn",
    "Marathi": "mr", "Gujarati": "gu", "Bengali": "bn"
}
# Selectbox options, in insertion order.
LANG_NAMES = list(LANG_MAP.keys())
42
+
43
# -----------------------------
# LOAD MODELS
# -----------------------------
@st.cache_resource
def load_all_models():
    """Load every model once per server process (cached by Streamlit).

    Returns (tokenizer_simplify, simplify_model, gen_tokenizer, gen_model,
    nlp, classifier, summarizer).
    """
    # Seq2seq T5 fine-tuned for text simplification.
    simplify_model_name = "mrm8488/t5-small-finetuned-text-simplification"
    tokenizer_simplify = AutoTokenizer.from_pretrained(simplify_model_name)
    simplify_model = AutoModelForSeq2SeqLM.from_pretrained(simplify_model_name)

    # Causal LM used by the chatbot tab.
    gen_model_id = "microsoft/phi-2"
    gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_id)
    gen_model = AutoModelForCausalLM.from_pretrained(gen_model_id)

    nlp = spacy.load("en_core_web_sm")
    # NOTE(review): classifier and summarizer are loaded but not referenced
    # by any visible code path — confirm they are needed before keeping them.
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer

tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer = load_all_models()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
gen_model.to(DEVICE)  # only the chat model is moved to the device explicitly
64
 
65
# -----------------------------
# UTILS
# -----------------------------
def extract_text(file):
    """Extract plain text from an uploaded PDF/DOCX/TXT file.

    The upload is spooled to a temp file so the path-based readers
    (PdfReader / Document) can open it; the temp file is always removed.
    Read failures are reported via st.error and yield "".
    """
    name = file.name.lower()
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(file.read())
        tmp_path = tmp.name
    text = ""
    try:
        if name.endswith(".pdf"):
            reader = PdfReader(tmp_path)
            for page in reader.pages:
                t = page.extract_text()
                if t:
                    text += t + "\n"
        elif name.endswith(".docx"):
            doc = Document(tmp_path)
            text = "\n".join(p.text for p in doc.paragraphs)
        else:
            # Fix: close the handle — the original leaked an open file,
            # which also makes os.remove fail on Windows.
            with open(tmp_path, "r", encoding="utf-8", errors="ignore") as fh:
                text = fh.read()
    except Exception as e:
        st.error(f"Failed to read file: {e}")
    finally:
        os.remove(tmp_path)
    return text.strip()
91
+
92
# Cache translators so each Helsinki-NLP model is loaded once per language.
# The original built a fresh pipeline (a full model download/load) on every
# call, which made each non-English interaction pay the load cost again.
_TRANSLATORS = {}

def translate_text(text, target_lang):
    """Translate English text into target_lang (a LANG_MAP key).

    English passes through unchanged; only the first 1000 characters are
    translated. Any failure returns a placeholder message instead of raising.
    """
    lang_code = LANG_MAP[target_lang]
    if lang_code == "en":
        return text
    try:
        translator = _TRANSLATORS.get(lang_code)
        if translator is None:
            translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")
            _TRANSLATORS[lang_code] = translator
        return translator(text[:1000])[0]["translation_text"]
    except Exception:
        return f"(Translation unavailable for {target_lang})"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
def text_to_speech(text, lang):
    """Render text as spoken MP3 audio in memory; None when synthesis fails."""
    code = LANG_MAP[lang]
    try:
        buffer = BytesIO()
        gTTS(text=text, lang=code).write_to_fp(buffer)
        buffer.seek(0)  # rewind so st.audio reads from the start
        return buffer
    except Exception:
        st.warning("Speech generation failed for this language.")
        return None
113
+
114
def clause_simplification(text, mode):
    """Run the T5 simplifier over text with a mode-specific task prefix."""
    prefixes = {
        "Simplified": "simplify: ",
        "Explain like I'm 5": "explain like I'm 5: ",
        "Professional": "rephrase professionally: "
    }
    task = prefixes.get(mode, "simplify: ")
    encoded = tokenizer_simplify(task + text, return_tensors="pt", truncation=True, max_length=512)
    generated_ids = simplify_model.generate(**encoded, max_length=256, num_beams=4, early_stopping=True)
    return tokenizer_simplify.decode(generated_ids[0], skip_special_tokens=True)
123
+
124
def fairness_score_visual(text, lang):
    """Render a keyword-based fairness estimate as a horizontal bar chart.

    The score is a simple heuristic (base 70, +1 per mutual-sounding term,
    -2 per one-sided term, clamped to 0..100) — educational only.
    """
    balanced_hits = len(re.findall(r"(mutual|both parties|shared)", text, re.I))
    one_sided_hits = len(re.findall(r"(sole|unilateral|exclusive right)", text, re.I))
    score = max(0, min(100, 70 + balanced_hits - 2 * one_sided_hits))

    st.subheader("βš–οΈ Fairness Balance Meter")
    chart_data = pd.DataFrame({
        "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
        "Score": [100 - score, score // 2, score],
    })
    fig = px.bar(
        chart_data, x="Score", y="Aspect", orientation="h",
        color="Aspect", text="Score", title="Fairness Score Representation"
    )
    fig.update_layout(showlegend=False, xaxis_title="Score", yaxis_title="")
    st.plotly_chart(fig, use_container_width=True)

    st.info(translate_text(f"Fairness Score: {score}% (Educational Estimate Only)", lang))
141
+
142
def chat_response(prompt, lang):
    """Generate an answer with the causal LM and translate it to the UI language."""
    encoded = gen_tokenizer(prompt, return_tensors="pt").to(DEVICE)
    generated = gen_model.generate(**encoded, max_new_tokens=350, do_sample=True, temperature=0.7, top_p=0.9)
    answer = gen_tokenizer.decode(generated[0], skip_special_tokens=True)
    return translate_text(answer, lang)
147
+
148
# -----------------------------
# MAIN TABS
# -----------------------------
tab1, tab2, tab3, tab4 = st.tabs(["πŸ“„ Analyzer", "🌐 Translate & Audio", "πŸ’¬ Chatbot", "βš™οΈ About"])

# -----------------------------
# TAB 1: Analyzer
# -----------------------------
# Upload (or paste) a document, then simplify clauses or run the fairness
# heuristic on it. The whole document text is fed to both actions.
with tab1:
    st.subheader("πŸ“ Upload or Paste Legal Document")
    lang = st.selectbox("Select Working Language:", LANG_NAMES, index=0)
    file = st.file_uploader("Upload Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
    text_input = st.text_area("Or Paste Text Here:", height=200)

    if file or text_input:
        # Upload takes precedence over pasted text.
        text = extract_text(file) if file else text_input
        st.markdown("---")
        col1, col2 = st.columns(2)
        with col1:
            mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
            if st.button("🧾 Simplify Clauses"):
                with st.spinner("Simplifying..."):
                    simplified = clause_simplification(text, mode)
                    translated_output = translate_text(simplified, lang)
                    st.success(translated_output)
                    audio_data = text_to_speech(translated_output, lang)
                    if audio_data:
                        st.audio(audio_data, format="audio/mp3")

        with col2:
            if st.button("βš–οΈ Fairness Analysis"):
                fairness_score_visual(text, lang)

# -----------------------------
# TAB 2: Translate & Audio
# -----------------------------
# Free-form text translation and TTS. Note: audio is generated from the
# ORIGINAL text_input, not the translation.
with tab2:
    st.subheader("🌐 Translate & Hear Content")
    text_input = st.text_area("Enter text to translate or listen:", height=200)
    lang = st.selectbox("Choose Translation Language:", LANG_NAMES, index=4)
    if st.button("Translate Text"):
        translated = translate_text(text_input, lang)
        st.success(translated)
    if st.button("🎧 Generate Audio"):
        audio_data = text_to_speech(text_input, lang)
        if audio_data:
            st.audio(audio_data, format="audio/mp3")

# -----------------------------
# TAB 3: Chatbot
# -----------------------------
# One-shot Q&A (no chat history is kept between questions).
with tab3:
    st.subheader("πŸ’¬ ClauseWise Multilingual Chatbot")
    lang = st.selectbox("Chatbot Language:", LANG_NAMES, index=4)
    st.markdown("Ask questions about contract clauses, fairness, or legal basics. *(Educational only β€” not legal advice.)*")
    query = st.text_area("Your question:", height=150)
    if st.button("Ask ClauseWise"):
        with st.spinner("Thinking..."):
            response = chat_response(f"Answer this like a legal assistant: {query}", lang)
            st.success(response)
            audio_data = text_to_speech(response, lang)
            if audio_data:
                st.audio(audio_data, format="audio/mp3")

# -----------------------------
# TAB 4: About
# -----------------------------
with tab4:
    st.markdown("""
    ### 🌍 About ClauseWise
    ClauseWise is an **AI-powered multilingual legal document assistant** that helps users:
    - Simplify complex legal clauses
    - Translate and listen in **10+ languages**
    - Analyze fairness visually
    - Ask questions interactively in any supported language

    **Supported Languages:**
    English, French, Spanish, German, Hindi, Tamil, Telugu, Kannada, Marathi, Gujarati, Bengali

    **Disclaimer:**
    ClauseWise provides educational insights only and does not offer legal advice.
    """)

st.markdown("<p style='text-align:center; color:gray;'>Β© 2025 ClauseWise | Multilingual Legal AI Assistant</p>", unsafe_allow_html=True)