# clausewise / app.py
# bhoomi19's picture
# Update app.py
# 08707ed verified
import os
import re
import io
import tempfile
import torch
import pandas as pd
import plotly.express as px
import streamlit as st
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
pipeline
)
from PyPDF2 import PdfReader
from docx import Document
from gtts import gTTS
from io import BytesIO
import spacy
import subprocess
# -----------------------------
# Hugging Face Spaces bootstrap: relaunch this file through `streamlit run`
# -----------------------------
def running_inside_streamlit():
    """Return True when this script is already being executed by a Streamlit server."""
    try:
        from streamlit import runtime

        return runtime.exists()
    except (ImportError, AttributeError):
        # Streamlit missing or too old to expose `runtime.exists`: behave as the
        # plain-Python launcher did before this check existed.
        return False


# Streamlit executes scripts with __name__ == "__main__" as well, so the env
# check alone would make the relaunched app spawn yet another server and quit
# before rendering anything. Only relaunch when no Streamlit runtime is active.
if (
    __name__ == "__main__"
    and os.environ.get("SYSTEM") == "spaces"
    and not running_inside_streamlit()
):
    # Block on the child and forward its exit status: if this launcher exits
    # right away, the server it just started is left without a parent process.
    raise SystemExit(
        subprocess.call(
            [
                "streamlit", "run", "app.py",
                "--server.port", "7860",
                "--server.address", "0.0.0.0",
            ]
        )
    )
# -----------------------------
# Page config
# -----------------------------
# Kept ahead of every other Streamlit call in this script.
# The title/icon use the real "scales" emoji (U+2696); the previous literal was
# the mis-decoded byte sequence of that character, which is not a valid icon.
st.set_page_config(page_title="⚖ ClauseWise", page_icon="⚖", layout="wide")
# -----------------------------
# Language Map
# -----------------------------
# Display name -> ISO 639-1 code. Insertion order matters: the UI selects
# default languages by position in LANG_NAMES.
LANG_MAP = {
    "English": "en",
    "French": "fr",
    "Spanish": "es",
    "German": "de",
    "Hindi": "hi",
    "Tamil": "ta",
    "Telugu": "te",
    "Kannada": "kn",
    "Marathi": "mr",
    "Gujarati": "gu",
    "Bengali": "bn",
}
LANG_NAMES = [display_name for display_name in LANG_MAP]
# -----------------------------
# Model Loading (cached)
# -----------------------------
@st.cache_resource
def load_models():
    """Load every model the app uses; cached through ``st.cache_resource``.

    Returns:
        tuple: ``(tokenizer_simplify, simplify_model, gen_tokenizer, gen_model,
        nlp, classifier, summarizer)`` in exactly that order.
    """
    simplifier_repo = "mrm8488/t5-small-finetuned-text-simplification"
    chat_repo = "microsoft/phi-2"

    # Seq2seq model used for clause simplification.
    simplifier_tokenizer = AutoTokenizer.from_pretrained(simplifier_repo)
    simplifier = AutoModelForSeq2SeqLM.from_pretrained(simplifier_repo)

    # Causal LM behind the chatbot. NOTE(review): trust_remote_code=True allows
    # code shipped in the model repo to run locally; confirm it is still needed.
    chat_tokenizer = AutoTokenizer.from_pretrained(chat_repo, trust_remote_code=True)
    chat_model = AutoModelForCausalLM.from_pretrained(chat_repo, trust_remote_code=True)

    # spaCy English pipeline; fetch the package on first use if it is missing.
    try:
        spacy_nlp = spacy.load("en_core_web_sm")
    except OSError:
        from spacy.cli import download

        download("en_core_web_sm")
        spacy_nlp = spacy.load("en_core_web_sm")

    zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    abstractive = pipeline("summarization", model="facebook/bart-large-cnn")

    return (
        simplifier_tokenizer,
        simplifier,
        chat_tokenizer,
        chat_model,
        spacy_nlp,
        zero_shot,
        abstractive,
    )
# Unpack the cached bundle into the module-level names the helpers below read.
(
    tokenizer_simplify,
    simplify_model,
    gen_tokenizer,
    gen_model,
    nlp,
    classifier,
    summarizer,
) = load_models()

# Only the chat model is moved to the selected device here.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
gen_model.to(DEVICE)
# -----------------------------
# Utility Functions
# -----------------------------
def extract_text(file):
    """Return the plain text of an uploaded PDF, DOCX, or text file.

    The upload is parsed entirely in memory, so no temporary file can be left
    behind, and the stream is rewound first so calling this twice on the same
    object yields the same text instead of an empty string.

    Args:
        file: File-like object exposing ``name`` and ``read()`` (such as the
            value returned by ``st.file_uploader``), or a falsy value.

    Returns:
        str: Extracted text with surrounding whitespace stripped. ``""`` when
        ``file`` is falsy. On a read/parse error the error is shown with
        ``st.error`` and whatever was extracted before the failure is returned.
    """
    if not file:
        return ""
    name = file.name.lower()
    pieces = []
    try:
        # A previous read may have left the cursor at EOF.
        rewind = getattr(file, "seek", None)
        if callable(rewind):
            try:
                rewind(0)
            except (OSError, ValueError):
                pass  # Non-seekable stream: read from the current position.
        payload = file.read()

        if name.endswith(".pdf"):
            for page in PdfReader(BytesIO(payload)).pages:
                page_text = page.extract_text()
                if page_text:
                    pieces.append(page_text + "\n")
        elif name.endswith(".docx"):
            doc = Document(BytesIO(payload))
            pieces.append("\n".join(p.text for p in doc.paragraphs if p.text.strip()))
        else:
            # TextIOWrapper keeps the text-mode semantics of the former
            # open(..., "r"): UTF-8, undecodable bytes dropped, universal newlines.
            with io.TextIOWrapper(BytesIO(payload), encoding="utf-8", errors="ignore") as reader:
                pieces.append(reader.read())
    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing the app.
        st.error(f"Error reading file: {e}")
    return "".join(pieces).strip()
@st.cache_resource
def _load_translator(lang_code):
    """Build the English -> ``lang_code`` translation pipeline (cached per language)."""
    return pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")


def translate_text(text, target_lang):
    """Translate English ``text`` into ``target_lang`` on a best-effort basis.

    Args:
        text: Text to translate. Only the first 1000 characters are sent to the
            model; anything beyond that is not translated.
        target_lang: Display name looked up in ``LANG_MAP``; unknown names are
            treated as English.

    Returns:
        str: The translation; ``""`` for empty input; the untouched ``text``
        when the target is English or when loading/running the model fails.
    """
    if not text:
        return ""
    lang_code = LANG_MAP.get(target_lang, "en")
    if lang_code == "en":
        return text
    try:
        # The pipeline comes from a cached loader so the model is not
        # re-instantiated on every call.
        translator = _load_translator(lang_code)
        return translator(text[:1000])[0]["translation_text"]
    except Exception:
        # Deliberate fallback: with no usable model the caller still gets the
        # original text rather than an error.
        return text
def text_to_speech(text, lang):
    """Synthesize speech for ``text`` with gTTS.

    Args:
        text: Text to speak; only the first 1000 characters are used.
        lang: Display name looked up in ``LANG_MAP`` (falls back to ``"en"``).

    Returns:
        BytesIO positioned at offset 0 holding the audio, or ``None`` if
        anything in the synthesis raised.
    """
    try:
        code = LANG_MAP.get(lang, "en")
        speech = gTTS(text=text[:1000], lang=code)
        buffer = BytesIO()
        speech.write_to_fp(buffer)
        buffer.seek(0)
    except Exception:
        return None
    return buffer
def clause_simplification(text, mode):
    """Rewrite ``text`` with the simplification model in the requested ``mode``.

    Args:
        text: Source text; only the first 500 characters are used.
        mode: One of "Simplified", "Explain like I'm 5", "Professional".
            Any other value uses the "simplify: " prefix.

    Returns:
        str: The decoded model output without special tokens.
    """
    task_prefixes = {
        "Simplified": "simplify: ",
        "Explain like I'm 5": "explain like I'm 5: ",
        "Professional": "rephrase professionally: ",
    }
    if mode in task_prefixes:
        prefix = task_prefixes[mode]
    else:
        prefix = "simplify: "

    model_input = prefix + text[:500]
    encoded = tokenizer_simplify(
        model_input,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    generated = simplify_model.generate(
        **encoded,
        max_length=256,
        num_beams=4,
        early_stopping=True,
    )
    best_sequence = generated[0]
    return tokenizer_simplify.decode(best_sequence, skip_special_tokens=True)
def fairness_score_visual(text, lang):
    """Render a keyword-based fairness chart and score for ``text``.

    The score starts at 50, gains 5 per "balanced" keyword hit, loses 5 per
    "one-sided" keyword hit, and is clamped to 0..100. Output is drawn with
    Streamlit (subheader, Plotly bar chart, info box); nothing is returned.

    Args:
        text: Document text to scan (case-insensitive keyword match).
        lang: Display name of the language for the final score message.
    """
    pos = len(re.findall(r"\b(mutual|both parties|shared|equal|fair|balanced)\b", text, re.I))
    neg = len(re.findall(r"\b(sole|unilateral|exclusive right|one-sided|only)\b", text, re.I))
    score = max(0, min(100, 50 + (pos * 5) - (neg * 5)))

    # Heading uses the real "scales" emoji (U+2696) instead of its mis-decoded bytes.
    st.subheader("⚖ Fairness Balance Meter")

    # NOTE(review): "Balanced" and "Party B Favored" always receive the same
    # value (score is already within 0..100) - confirm this is the intended chart.
    fairness_df = pd.DataFrame({
        "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
        "Score": [max(0, 100 - score), score, min(100, score)],
    })
    fig = px.bar(
        fairness_df,
        x="Score",
        y="Aspect",
        orientation="h",
        text="Score",
        color="Aspect",
        color_discrete_sequence=["#ff6b6b", "#4ecdc4", "#95e1d3"],
    )
    fig.update_layout(showlegend=False, xaxis_title="Score", yaxis_title="", height=300)
    st.plotly_chart(fig, use_container_width=True)
    st.info(translate_text(f"Fairness Score: {score}% (Approximate)", lang))
def chat_response(prompt, lang, history):
    """Generate the assistant's reply to ``prompt`` using recent chat history.

    Args:
        prompt: The user's new message.
        lang: Display name of the language the reply is translated into.
        history: Sequence of ``(user_message, ai_reply)`` pairs; only the last
            three are placed in the model prompt.

    Returns:
        str: The reply, passed through ``translate_text`` for ``lang``.
    """
    context = "\n".join(f"User: {u}\nAI: {a}" for u, a in history[-3:])
    full_prompt = f"You are a helpful multilingual legal assistant. {context}\nUser: {prompt}\nAI:"
    inputs = gen_tokenizer(full_prompt, return_tensors="pt").to(DEVICE)
    outputs = gen_model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        # Explicit pad id so generation does not depend on a tokenizer pad token.
        pad_token_id=gen_tokenizer.eos_token_id,
    )

    # Decode only the tokens produced after the prompt. Splitting the full text
    # on "AI:" picked the *last* marker, which is wrong when the model goes on
    # to invent further turns or when the user's own text contains "AI:".
    prompt_length = inputs["input_ids"].shape[-1]
    reply = gen_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Stop at the first speaker tag the model generates for a follow-up turn.
    reply = re.split(r"\n\s*(?:User|AI):", reply, maxsplit=1)[0].strip()
    return translate_text(reply, lang)
# -----------------------------
# Main Streamlit App
# -----------------------------
def main():
    """Render the ClauseWise UI: analyzer, translate/audio, chatbot and about tabs.

    All emoji and dash literals are the real Unicode characters; they were
    previously stored as mis-decoded byte sequences.
    """
    st.title("⚖ ClauseWise: Multilingual Legal AI Assistant")
    st.markdown("Simplify, translate, and analyze legal documents with AI — in your language.")
    st.divider()
    tab1, tab2, tab3, tab4 = st.tabs(["📄 Analyzer", "🌐 Translate & Audio", "💬 Chatbot", "ℹ About"])

    with tab1:
        # NOTE(review): the original glyph here was partly lost; the memo emoji is assumed.
        st.subheader("📝 Upload or Paste Legal Document")
        lang = st.selectbox("Select Language:", LANG_NAMES, index=0)
        file = st.file_uploader("Upload a Legal Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
        text_input = st.text_area("Or Paste Text Here:", height=200)
        if file or text_input:
            # An uploaded file takes precedence over pasted text.
            text = extract_text(file) if file else text_input
            if not text:
                st.warning("No content found.")
            else:
                mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
                if st.button("🧾 Simplify Clauses"):
                    with st.spinner("Simplifying..."):
                        simplified = clause_simplification(text, mode)
                        translated = translate_text(simplified, lang)
                        st.success(translated)
                        audio = text_to_speech(translated, lang)
                        if audio:
                            st.audio(audio, format="audio/mp3")
                if st.button("⚖ Fairness Analysis"):
                    fairness_score_visual(text, lang)

    with tab2:
        st.subheader("🌐 Translate & Listen")
        text_input = st.text_area("Enter text:", height=200)
        lang = st.selectbox("Translate to:", LANG_NAMES, index=4)
        if st.button("Translate"):
            if text_input.strip():
                st.success(translate_text(text_input, lang))
            else:
                st.warning("Please enter some text to translate.")
        if st.button("🎧 Generate Audio"):
            if text_input.strip():
                # Speak the translation: reading the untranslated input with the
                # target-language voice does not match the selected language.
                audio = text_to_speech(translate_text(text_input, lang), lang)
                if audio:
                    st.audio(audio, format="audio/mp3")
                else:
                    st.warning("Audio could not be generated.")
            else:
                st.warning("Please enter some text to convert to audio.")

    with tab3:
        st.subheader("💬 Chat with ClauseWise (Memory Enabled)")
        lang = st.selectbox("Chat Language:", LANG_NAMES, index=0)
        query = st.text_area("Ask your question:", height=150)
        # Conversation persists across reruns via session state.
        if "chat_history" not in st.session_state:
            st.session_state.chat_history = []
        if st.button("Ask"):
            if query.strip():
                with st.spinner("Thinking..."):
                    response = chat_response(query, lang, st.session_state.chat_history)
                    st.session_state.chat_history.append((query, response))
                    st.success(response)
                    audio = text_to_speech(response, lang)
                    if audio:
                        st.audio(audio, format="audio/mp3")
            else:
                st.warning("Please enter a question.")

        # Reserve the history slot above the button, but process "Clear Chat"
        # first so a cleared conversation is not drawn during the same run.
        history_area = st.container()
        if st.session_state.chat_history and st.button("Clear Chat"):
            st.session_state.chat_history = []
            st.info("Chat cleared.")
        with history_area:
            if st.session_state.chat_history:
                st.markdown("### 🧠 Chat History")
                for q, a in st.session_state.chat_history[-5:]:
                    st.markdown(f"*You:* {q}")
                    st.markdown(f"*ClauseWise:* {a}")

    with tab4:
        st.markdown("""
### ⚖ About ClauseWise
ClauseWise is a multilingual AI-powered legal assistant that helps users:
- Simplify legal language
- Translate and listen in 10+ languages
- Assess fairness visually
- Chat interactively with memory
---
*Disclaimer:* Educational use only — not legal advice.
""")
# Build the UI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()