Spaces:

bhoomi19
/

clausewise

Runtime error

App Files Files Community

clausewise / app.py

bhoomi19

Update app.py

08707ed verified 3 months ago

raw

history blame contribute delete

9.91 kB

	import os
	import re
	import io
	import tempfile
	import torch
	import pandas as pd
	import plotly.express as px
	import streamlit as st
	from transformers import (
	AutoTokenizer,
	AutoModelForCausalLM,
	AutoModelForSeq2SeqLM,
	pipeline
	)
	from PyPDF2 import PdfReader
	from docx import Document
	from gtts import gTTS
	from io import BytesIO
	import spacy
	import subprocess

	# -----------------------------
	# Hugging Face fix: ensure Streamlit runs properly
	# -----------------------------
	# When the Space starts this file with plain `python app.py`, hand over to
	# `streamlit run`. Streamlit itself also executes the script with
	# __name__ == "__main__", so the relaunch must be skipped once a Streamlit
	# runtime exists; otherwise every script run would spawn another server and
	# exit before rendering anything.
	if __name__ == "__main__" and os.environ.get("SYSTEM") == "spaces":
	    from streamlit import runtime as _st_runtime

	    if not _st_runtime.exists():
	        # Block on the server instead of Popen + exit(): exiting right away
	        # ends the launcher process while the server it started is still
	        # coming up, leaving that server orphaned.
	        _server = subprocess.run(
	            ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
	        )
	        raise SystemExit(_server.returncode)

	# -----------------------------
	# Page config
	# -----------------------------
	# Browser tab title/icon and a full-width layout for the whole app.
	st.set_page_config(
	    page_title="⚖ ClauseWise",
	    page_icon="⚖",
	    layout="wide",
	)

	# -----------------------------
	# Language Map
	# -----------------------------
	# Display name -> two-letter language code. Insertion order matters: the
	# select boxes list LANG_NAMES in this order and pick defaults by index.
	LANG_MAP = dict(
	    (
	        ("English", "en"),
	        ("French", "fr"),
	        ("Spanish", "es"),
	        ("German", "de"),
	        ("Hindi", "hi"),
	        ("Tamil", "ta"),
	        ("Telugu", "te"),
	        ("Kannada", "kn"),
	        ("Marathi", "mr"),
	        ("Gujarati", "gu"),
	        ("Bengali", "bn"),
	    )
	)
	LANG_NAMES = [*LANG_MAP]

	# -----------------------------
	# Model Loading (cached)
	# -----------------------------
	@st.cache_resource
	def load_models():
	    """Load every model the app needs; wrapped in ``st.cache_resource``.

	    Returns:
	        A 7-tuple ``(tokenizer_simplify, simplify_model, gen_tokenizer,
	        gen_model, nlp, classifier, summarizer)``.
	    """
	    # Seq2seq tokenizer/model pair for text simplification.
	    simplify_id = "mrm8488/t5-small-finetuned-text-simplification"
	    simplify_tok = AutoTokenizer.from_pretrained(simplify_id)
	    simplify_net = AutoModelForSeq2SeqLM.from_pretrained(simplify_id)

	    # Causal LM tokenizer/model pair for free-form generation.
	    chat_id = "microsoft/phi-2"
	    chat_tok = AutoTokenizer.from_pretrained(chat_id, trust_remote_code=True)
	    chat_net = AutoModelForCausalLM.from_pretrained(chat_id, trust_remote_code=True)

	    # spaCy English pipeline: if loading raises OSError, download the
	    # package and load it again.
	    spacy_package = "en_core_web_sm"
	    try:
	        nlp_pipeline = spacy.load(spacy_package)
	    except OSError:
	        from spacy.cli import download

	        download(spacy_package)
	        nlp_pipeline = spacy.load(spacy_package)

	    zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
	    summary = pipeline("summarization", model="facebook/bart-large-cnn")

	    return (
	        simplify_tok,
	        simplify_net,
	        chat_tok,
	        chat_net,
	        nlp_pipeline,
	        zero_shot,
	        summary,
	    )


	# Unpack the loaded models into the module-level names the helpers use.
	(
	    tokenizer_simplify,
	    simplify_model,
	    gen_tokenizer,
	    gen_model,
	    nlp,
	    classifier,
	    summarizer,
	) = load_models()

	# Use the GPU when torch reports one; only the generation model is moved.
	if torch.cuda.is_available():
	    DEVICE = "cuda"
	else:
	    DEVICE = "cpu"
	gen_model.to(DEVICE)

	# -----------------------------
	# Utility Functions
	# -----------------------------
	def extract_text(file):
	    """Return the plain text of an uploaded PDF, DOCX or text file.

	    The upload is parsed from memory instead of being copied to a temporary
	    file first, so nothing is left on disk when reading or parsing fails.

	    Args:
	        file: File-like upload exposing ``.name`` and ``.read()``; a falsy
	            value means nothing was uploaded.

	    Returns:
	        The extracted text with surrounding whitespace stripped. An empty
	        string when there is no upload. If reading or parsing fails, the
	        error is shown with ``st.error`` and whatever was extracted before
	        the failure (possibly nothing) is returned.
	    """
	    if not file:
	        return ""

	    name = file.name.lower()
	    chunks = []
	    try:
	        data = file.read()
	        if name.endswith(".pdf"):
	            for page in PdfReader(io.BytesIO(data)).pages:
	                page_text = page.extract_text()
	                # Pages without extractable text are skipped.
	                if page_text:
	                    chunks.append(page_text + "\n")
	        elif name.endswith(".docx"):
	            paragraphs = Document(io.BytesIO(data)).paragraphs
	            chunks.append("\n".join(p.text for p in paragraphs if p.text.strip()))
	        else:
	            # Same decoding as opening the file in text mode: UTF-8 with
	            # undecodable bytes dropped and universal newlines applied.
	            with io.TextIOWrapper(io.BytesIO(data), encoding="utf-8", errors="ignore") as reader:
	                chunks.append(reader.read())
	    except Exception as e:
	        st.error(f"Error reading file: {e}")
	    return "".join(chunks).strip()


	@st.cache_resource(show_spinner=False)
	def _load_translator(lang_code):
	    """Build the English -> ``lang_code`` translation pipeline (cached per code)."""
	    return pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")


	def translate_text(text, target_lang):
	    """Translate English ``text`` into ``target_lang``, best effort.

	    Args:
	        text: Text to translate; only the first 1000 characters are sent to
	            the translation model.
	        target_lang: Display name looked up in ``LANG_MAP``; unknown names
	            are treated as English.

	    Returns:
	        ``""`` for empty input, ``text`` unchanged when the target is
	        English, the translated text otherwise. If the translation model
	        cannot be loaded or run, the original ``text`` is returned.
	    """
	    if not text:
	        return ""
	    lang_code = LANG_MAP.get(target_lang, "en")
	    if lang_code == "en":
	        return text
	    try:
	        # The pipeline is cached per language so the model is not rebuilt
	        # on every call.
	        translator = _load_translator(lang_code)
	        return translator(text[:1000])[0]["translation_text"]
	    except Exception:
	        # Deliberate fallback: showing untranslated text beats failing the
	        # page when no model is available for this language.
	        return text


	def text_to_speech(text, lang):
	    """Synthesize speech for ``text`` with gTTS, best effort.

	    Args:
	        text: Text to speak; only the first 1000 characters are used.
	        lang: Display name looked up in ``LANG_MAP`` (English if unknown).

	    Returns:
	        A ``BytesIO`` rewound to the start of the audio data, or ``None``
	        when anything raises along the way.
	    """
	    try:
	        speech = gTTS(text=text[:1000], lang=LANG_MAP.get(lang, "en"))
	        buffer = BytesIO()
	        speech.write_to_fp(buffer)
	        buffer.seek(0)
	    except Exception:
	        return None
	    return buffer


	def clause_simplification(text, mode):
	    """Rewrite the start of ``text`` with the simplification model.

	    Args:
	        text: Source text; only the first 500 characters are used.
	        mode: One of "Simplified", "Explain like I'm 5" or "Professional";
	            any other value falls back to the "simplify: " prefix.

	    Returns:
	        The decoded first sequence produced by beam search.
	    """
	    mode_prefixes = {
	        "Simplified": "simplify: ",
	        "Explain like I'm 5": "explain like I'm 5: ",
	        "Professional": "rephrase professionally: ",
	    }
	    model_input = mode_prefixes.get(mode, "simplify: ") + text[:500]
	    encoded = tokenizer_simplify(model_input, return_tensors="pt", truncation=True, max_length=512)
	    generated = simplify_model.generate(**encoded, max_length=256, num_beams=4, early_stopping=True)
	    first_sequence = generated[0]
	    return tokenizer_simplify.decode(first_sequence, skip_special_tokens=True)


	# Whole-word cue phrases counted by the fairness heuristic (case-insensitive).
	# The alternation bars must be unescaped: r"\|" matches a literal "|" and
	# would make these patterns never match ordinary text.
	_BALANCED_TERMS = re.compile(r"\b(mutual|both parties|shared|equal|fair|balanced)\b", re.I)
	_ONE_SIDED_TERMS = re.compile(r"\b(sole|unilateral|exclusive right|one-sided|only)\b", re.I)


	def _fairness_score(text):
	    """Return a 0-100 score: 50, plus 5 per balanced cue, minus 5 per one-sided cue."""
	    balanced = len(_BALANCED_TERMS.findall(text))
	    one_sided = len(_ONE_SIDED_TERMS.findall(text))
	    return max(0, min(100, 50 + 5 * (balanced - one_sided)))


	def fairness_score_visual(text, lang):
	    """Render the keyword-based fairness meter for ``text``.

	    Draws a horizontal bar chart and an info box with the score; the info
	    message is passed through ``translate_text`` for ``lang``.

	    Args:
	        text: Document text to scan for cue phrases.
	        lang: Display name of the language for the summary message.

	    Returns:
	        None; output is written to the Streamlit page.
	    """
	    score = _fairness_score(text)

	    st.subheader("⚖ Fairness Balance Meter")
	    # NOTE(review): "Party B Favored" uses the same value as "Balanced"
	    # (score is already clamped to 0-100) — confirm this is intended.
	    fairness_df = pd.DataFrame({
	        "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
	        "Score": [max(0, 100 - score), score, min(100, score)],
	    })
	    fig = px.bar(
	        fairness_df, x="Score", y="Aspect", orientation="h", text="Score", color="Aspect",
	        color_discrete_sequence=["#ff6b6b", "#4ecdc4", "#95e1d3"],
	    )
	    fig.update_layout(showlegend=False, xaxis_title="Score", yaxis_title="", height=300)
	    st.plotly_chart(fig, use_container_width=True)
	    st.info(translate_text(f"Fairness Score: {score}% (Approximate)", lang))


	def chat_response(prompt, lang, history):
	    """Generate the assistant's reply to ``prompt`` with short-term memory.

	    Args:
	        prompt: The user's new message.
	        lang: Display name of the reply language (see ``LANG_MAP``).
	        history: List of ``(user_message, ai_reply)`` pairs; only the last
	            three are included as context.

	    Returns:
	        The generated reply, passed through ``translate_text`` for ``lang``.
	    """
	    context = "\n".join(f"User: {u}\nAI: {a}" for u, a in history[-3:])
	    full_prompt = f"You are a helpful multilingual legal assistant. {context}\nUser: {prompt}\nAI:"
	    inputs = gen_tokenizer(full_prompt, return_tensors="pt").to(DEVICE)
	    outputs = gen_model.generate(**inputs, max_new_tokens=200, temperature=0.7, top_p=0.9, do_sample=True)

	    # Decode only the tokens generated after the prompt. Splitting the full
	    # text on "AI:" picks the wrong segment whenever the user's message or
	    # the history contains "AI:", or the model writes further turns.
	    prompt_length = inputs["input_ids"].shape[1]
	    response = gen_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
	    # Drop any follow-up "User:" turn the model invents after its answer.
	    response = response.split("\nUser:")[0].strip()
	    return translate_text(response, lang)


	# -----------------------------
	# Main Streamlit App
	# -----------------------------
	def _render_analyzer_tab():
	    """Analyzer tab: upload or paste a document, then simplify it or score its fairness."""
	    st.subheader("📁 Upload or Paste Legal Document")
	    lang = st.selectbox("Select Language:", LANG_NAMES, index=0)
	    file = st.file_uploader("Upload a Legal Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
	    text_input = st.text_area("Or Paste Text Here:", height=200)

	    if not (file or text_input):
	        return

	    # An uploaded file takes precedence over pasted text.
	    text = extract_text(file) if file else text_input
	    if not text:
	        st.warning("No content found.")
	        return

	    mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
	    if st.button("🧾 Simplify Clauses"):
	        with st.spinner("Simplifying..."):
	            simplified = clause_simplification(text, mode)
	            translated = translate_text(simplified, lang)
	            st.success(translated)
	            audio = text_to_speech(translated, lang)
	            if audio:
	                st.audio(audio, format="audio/mp3")

	    if st.button("⚖ Fairness Analysis"):
	        fairness_score_visual(text, lang)


	def _render_translate_tab():
	    """Translate tab: translate free text and, separately, read the entered text aloud."""
	    st.subheader("🌐 Translate & Listen")
	    text_input = st.text_area("Enter text:", height=200)
	    lang = st.selectbox("Translate to:", LANG_NAMES, index=4)
	    if st.button("Translate"):
	        st.success(translate_text(text_input, lang))
	    if st.button("🎧 Generate Audio"):
	        # NOTE(review): audio is generated from the text as entered, not
	        # from its translation — confirm that is the intended behavior.
	        audio = text_to_speech(text_input, lang)
	        if audio:
	            st.audio(audio, format="audio/mp3")


	def _render_chat_tab():
	    """Chat tab: ask questions, with the conversation kept in ``st.session_state``."""
	    st.subheader("💬 Chat with ClauseWise (Memory Enabled)")
	    lang = st.selectbox("Chat Language:", LANG_NAMES, index=0)
	    query = st.text_area("Ask your question:", height=150)

	    # Maintain persistent conversation
	    if "chat_history" not in st.session_state:
	        st.session_state.chat_history = []

	    if st.button("Ask") and query.strip():
	        with st.spinner("Thinking..."):
	            response = chat_response(query, lang, st.session_state.chat_history)
	            st.session_state.chat_history.append((query, response))
	            st.success(response)
	            audio = text_to_speech(response, lang)
	            if audio:
	                st.audio(audio, format="audio/mp3")

	    # Reserve the history's place above the button, but fill it only after
	    # the button is handled, so a cleared history disappears in the same
	    # run instead of lingering until the next interaction.
	    history_area = st.container()
	    if st.button("Clear Chat"):
	        st.session_state.chat_history = []
	        st.info("Chat cleared.")

	    if st.session_state.chat_history:
	        with history_area:
	            st.markdown("### 🧠 Chat History")
	            for q, a in st.session_state.chat_history[-5:]:
	                st.markdown(f"You: {q}")
	                st.markdown(f"ClauseWise: {a}")


	def _render_about_tab():
	    """About tab: static description and disclaimer."""
	    st.markdown("""
	        ### ⚖ About ClauseWise
	        ClauseWise is a multilingual AI-powered legal assistant that helps users:
	        - Simplify legal language
	        - Translate and listen in 10+ languages
	        - Assess fairness visually
	        - Chat interactively with memory
	        ---
	        Disclaimer: Educational use only — not legal advice.
	        """)


	def main():
	    """Render the ClauseWise page: title, intro, and the four feature tabs."""
	    st.title("⚖ ClauseWise: Multilingual Legal AI Assistant")
	    st.markdown("Simplify, translate, and analyze legal documents with AI — in your language.")
	    st.divider()

	    analyzer_tab, translate_tab, chat_tab, about_tab = st.tabs(
	        ["📄 Analyzer", "🌐 Translate & Audio", "💬 Chatbot", "ℹ About"]
	    )

	    with analyzer_tab:
	        _render_analyzer_tab()
	    with translate_tab:
	        _render_translate_tab()
	    with chat_tab:
	        _render_chat_tab()
	    with about_tab:
	        _render_about_tab()


	# Entry point: build the page when this file is executed as the main module.
	if __name__ == "__main__":
	    main()