# app.py — ClauseWise: multilingual legal AI assistant (Streamlit app)
import os
import re
import io
import tempfile
import torch
import pandas as pd
import plotly.express as px
import streamlit as st
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
pipeline
)
from PyPDF2 import PdfReader
from docx import Document
from gtts import gTTS
from io import BytesIO
import spacy
# -----------------------------
# STREAMLIT PAGE CONFIG
# -----------------------------
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(page_title="βš–οΈ ClauseWise", page_icon="βš–οΈ", layout="wide")
# -----------------------------
# LANGUAGE MAP
# -----------------------------
# Display name -> ISO 639-1 code. The codes are used both for the
# Helsinki-NLP translation model ids (opus-mt-en-<code>) and for gTTS.
LANG_MAP = {
    "English": "en", "French": "fr", "Spanish": "es", "German": "de",
    "Hindi": "hi", "Tamil": "ta", "Telugu": "te", "Kannada": "kn",
    "Marathi": "mr", "Gujarati": "gu", "Bengali": "bn"
}
# Ordered list of display names for selectbox widgets.
LANG_NAMES = list(LANG_MAP.keys())
# -----------------------------
# MODEL LOADING (with caching)
# -----------------------------
@st.cache_resource
def load_models():
    """Load every model the app depends on, or a tuple of Nones on failure.

    Cached via st.cache_resource so the heavyweight downloads happen only
    once per server process.

    Returns:
        (tokenizer_simplify, simplify_model, gen_tokenizer, gen_model,
         nlp, classifier, summarizer) — all None if any load step failed.
    """
    try:
        # T5 model fine-tuned for text simplification (Analyzer tab).
        simplify_name = "mrm8488/t5-small-finetuned-text-simplification"
        simplify_tok = AutoTokenizer.from_pretrained(simplify_name)
        simplify_mdl = AutoModelForSeq2SeqLM.from_pretrained(simplify_name)

        # Phi-2 causal LM (Chatbot tab); trust_remote_code required by the repo.
        chat_name = "microsoft/phi-2"
        chat_tok = AutoTokenizer.from_pretrained(chat_name, trust_remote_code=True)
        chat_mdl = AutoModelForCausalLM.from_pretrained(chat_name, trust_remote_code=True)

        # SpaCy English pipeline; auto-download on first run if missing.
        try:
            nlp_pipeline = spacy.load("en_core_web_sm")
        except OSError:
            from spacy.cli import download
            download("en_core_web_sm")
            nlp_pipeline = spacy.load("en_core_web_sm")

        zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        condense = pipeline("summarization", model="facebook/bart-large-cnn")

        return simplify_tok, simplify_mdl, chat_tok, chat_mdl, nlp_pipeline, zero_shot, condense
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None, None, None, None, None, None
# Load the (cached) models at import time; abort the whole app on failure.
model_data = load_models()
if model_data[0] is None:
    st.error("Failed to load models. Please check your internet connection and try again.")
    st.stop()  # halts script execution — nothing below this line runs
tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer = model_data
# Place the Phi-2 generator on GPU when one is available; the other
# pipelines stay on their default device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if gen_model is not None:
    gen_model.to(DEVICE)
# -----------------------------
# UTILITIES
# -----------------------------
def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, or plain-text file.

    Reads the upload directly into memory — PdfReader and docx.Document both
    accept binary streams, so no temporary file on disk is needed (the
    original implementation round-tripped through tempfile).

    Args:
        file: A file-like object exposing ``.name`` and ``.read()`` (e.g. a
            Streamlit ``UploadedFile``), or a falsy value.

    Returns:
        Extracted text with surrounding whitespace stripped; "" when no file
        is given or extraction fails (the error is surfaced via st.error).
    """
    if not file:
        return ""
    name = file.name.lower()
    data = file.read()
    text = ""
    try:
        if name.endswith(".pdf"):
            reader = PdfReader(BytesIO(data))
            # extract_text() may return None for image-only pages; skip those.
            pages = [page.extract_text() for page in reader.pages]
            text = "\n".join(t for t in pages if t)
        elif name.endswith(".docx"):
            doc = Document(BytesIO(data))
            text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        else:
            # Anything else is treated as plain text.
            text = data.decode("utf-8", errors="ignore")
    except Exception as e:
        st.error(f"Error reading file: {e}")
    return text.strip()
# One translation pipeline per target language, created lazily. The original
# code built a fresh pipeline (a full model load) on EVERY call, which is
# extremely slow; caching makes repeat translations near-instant.
_TRANSLATORS = {}

def translate_text(text, target_lang):
    """Translate English *text* into *target_lang* (a LANG_MAP display name).

    Returns the input unchanged for English / unknown languages or empty
    input, and falls back to the untranslated text (with a st.warning) when
    no translation model is available for the language.
    """
    if not text:
        return ""
    lang_code = LANG_MAP.get(target_lang, "en")
    if lang_code == "en":
        return text
    try:
        translator = _TRANSLATORS.get(lang_code)
        if translator is None:
            translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")
            _TRANSLATORS[lang_code] = translator
        # Cap the input so it stays within the model's context window.
        result = translator(text[:500], max_length=512)
        return result[0]["translation_text"]
    except Exception as e:
        st.warning(f"Translation unavailable for {target_lang}: {str(e)}")
        return text
def text_to_speech(text, lang):
    """Render *text* as spoken MP3 audio via Google TTS.

    Returns an in-memory BytesIO positioned at the start of the MP3 data,
    or None when *text* is empty or audio generation fails.
    """
    if not text:
        return None
    try:
        code = LANG_MAP.get(lang, "en")
        # gTTS handles only moderately sized inputs; cap at 1000 characters.
        speech = gTTS(text=text[:1000], lang=code, slow=False)
        buffer = BytesIO()
        speech.write_to_fp(buffer)
        buffer.seek(0)
        return buffer
    except Exception as exc:
        st.warning(f"Audio generation unavailable: {str(exc)}")
        return None
def clause_simplification(text, mode):
    """Rewrite a legal clause with the T5 simplification model.

    *mode* selects the task prefix ("Simplified", "Explain like I'm 5", or
    "Professional"; anything else falls back to plain simplification).
    Returns the original *text* unchanged when input is empty, the model is
    unavailable, or generation raises.
    """
    if not text or simplify_model is None:
        return text
    prefixes = {
        "Simplified": "simplify: ",
        "Explain like I'm 5": "explain like I'm 5: ",
        "Professional": "rephrase professionally: ",
    }
    task_prefix = prefixes.get(mode, "simplify: ")
    try:
        # Cap input length; the tokenizer truncates to the model's window.
        encoded = tokenizer_simplify(
            task_prefix + text[:500],
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        generated = simplify_model.generate(
            **encoded, max_length=256, num_beams=4, early_stopping=True
        )
        return tokenizer_simplify.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        st.error(f"Simplification error: {e}")
        return text
def fairness_score_visual(text, lang):
    """Render a keyword-heuristic fairness meter for *text* plus a caption.

    Score = 50 + 5 per "balanced" keyword - 5 per "one-sided" keyword,
    clamped to [0, 100]. The caption is translated into *lang* and shown
    via st.info. Purely a rough visual aid, not a legal assessment.
    """
    if not text:
        st.warning("No text to analyze.")
        return
    # Case-insensitive whole-word keyword counts.
    pos = len(re.findall(r"\b(mutual|both parties|shared|equal|fair|balanced)\b", text, re.I))
    neg = len(re.findall(r"\b(sole|unilateral|exclusive right|one-sided|only)\b", text, re.I))
    score = max(0, min(100, 50 + (pos * 5) - (neg * 5)))
    st.subheader("βš–οΈ Fairness Balance Meter")
    # NOTE(review): score is already clamped to [0, 100], so
    # min(100, score) == score — "Balanced" and "Party B Favored" always plot
    # the same bar. That looks unintended; confirm the intended chart.
    fairness_df = pd.DataFrame({
        "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
        "Score": [max(0, 100 - score), score, min(100, score)]
    })
    fig = px.bar(fairness_df, x="Score", y="Aspect", orientation="h", text="Score",
                 color="Aspect", color_discrete_sequence=["#ff6b6b", "#4ecdc4", "#95e1d3"])
    fig.update_layout(showlegend=False, xaxis_title="Score", yaxis_title="", height=300)
    st.plotly_chart(fig, use_container_width=True)
    fairness_text = f"Fairness Score: {score}% (Approximate - based on keyword analysis)"
    translated_result = translate_text(fairness_text, lang)
    st.info(translated_result)
def chat_response(prompt, lang):
    """Answer a legal question with Phi-2, translated into *lang*.

    Returns a canned apology string when the prompt is empty, the model is
    unavailable, or generation fails.
    """
    if not prompt or gen_model is None:
        return "Unable to generate response. Please try again."
    try:
        full_prompt = f"You are a helpful legal assistant. Answer the following question: {prompt}\n\nAnswer:"
        encoded = gen_tokenizer(
            full_prompt, return_tensors="pt", truncation=True, max_length=512
        ).to(DEVICE)
        generated = gen_model.generate(
            **encoded,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=gen_tokenizer.eos_token_id,
        )
        decoded = gen_tokenizer.decode(generated[0], skip_special_tokens=True)
        # Keep only the model's completion after the final "Answer:" marker.
        marker = "Answer:"
        if marker in decoded:
            decoded = decoded.split(marker)[-1].strip()
        return translate_text(decoded, lang)
    except Exception as e:
        st.error(f"Chat error: {e}")
        return "I'm having trouble generating a response. Please try rephrasing your question."
# -----------------------------
# MAIN APP
# -----------------------------
def main():
    """Build the four-tab Streamlit UI: Analyzer, Translate & Audio, Chatbot, About."""
    st.title("βš–οΈ ClauseWise: Multilingual Legal AI Assistant")
    st.markdown("**Simplify**, **translate**, and **analyze** legal documents with AI β€” in your language.\n---")
    tab1, tab2, tab3, tab4 = st.tabs(["πŸ“„ Analyzer", "🌐 Translate & Audio", "πŸ’¬ Chatbot", "ℹ️ About"])
    # TAB 1: ANALYZER — upload/paste a document, simplify it, check fairness.
    with tab1:
        st.subheader("πŸ“ Upload or Paste Legal Document")
        lang = st.selectbox("Select Language:", LANG_NAMES, index=0, key="analyzer_lang")
        file = st.file_uploader("Upload a Legal Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
        text_input = st.text_area("Or Paste Text Here:", height=200, key="analyzer_text")
        if file or text_input:
            # Uploaded file wins over pasted text when both are provided.
            text = extract_text(file) if file else text_input
            if text.strip():
                mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
                if st.button("🧾 Simplify Clauses"):
                    with st.spinner("Simplifying..."):
                        simplified = clause_simplification(text, mode)
                        translated = translate_text(simplified, lang)
                        st.success(translated)
                        # Optional spoken version of the simplified clause.
                        audio_data = text_to_speech(translated, lang)
                        if audio_data:
                            st.audio(audio_data, format="audio/mp3")
                if st.button("βš–οΈ Fairness Analysis"):
                    with st.spinner("Analyzing fairness..."):
                        fairness_score_visual(text, lang)
            else:
                st.warning("Please provide some text to analyze.")
    # TAB 2: TRANSLATION + AUDIO — free-form text translation and TTS.
    with tab2:
        st.subheader("🌐 Translate & Listen")
        text_input = st.text_area("Enter text:", height=200, key="translate_text")
        # index=4 defaults the selector to Hindi (5th entry of LANG_NAMES).
        lang = st.selectbox("Translate to:", LANG_NAMES, index=4, key="translate_lang")
        if st.button("Translate"):
            if text_input.strip():
                with st.spinner("Translating..."):
                    translated = translate_text(text_input, lang)
                    st.success(translated)
            else:
                st.warning("Please enter some text to translate.")
        if st.button("🎧 Generate Audio"):
            if text_input.strip():
                with st.spinner("Generating audio..."):
                    audio_data = text_to_speech(text_input, lang)
                    if audio_data:
                        st.audio(audio_data, format="audio/mp3")
            else:
                st.warning("Please enter some text for audio generation.")
    # TAB 3: CHATBOT — Q&A via the Phi-2 generator, with optional audio reply.
    with tab3:
        st.subheader("πŸ’¬ Chat with ClauseWise (Multilingual)")
        lang = st.selectbox("Chat Language:", LANG_NAMES, index=0, key="chat_lang")
        query = st.text_area("Ask about clauses, fairness, or legal meaning:", height=150, key="chat_query")
        if st.button("Ask"):
            if query.strip():
                with st.spinner("Thinking..."):
                    response = chat_response(query, lang)
                    st.success(response)
                    audio_data = text_to_speech(response, lang)
                    if audio_data:
                        st.audio(audio_data, format="audio/mp3")
            else:
                st.warning("Please enter a question.")
    # TAB 4: ABOUT — static project description and disclaimer.
    with tab4:
        st.markdown("""
### βš–οΈ About ClauseWise
ClauseWise is a multilingual AI-powered legal assistant that helps users:
- **Simplify complex clauses** into easy-to-understand language
- **Translate and listen** in 10+ languages
- **Assess fairness** visually
- **Chat interactively** about legal concepts
**Languages Supported:**
English, French, Spanish, German, Hindi, Tamil, Telugu, Kannada, Marathi, Gujarati, Bengali
**Technologies Used:**
Hugging Face Transformers (T5, Phi-2, BART), SpaCy, gTTS, Plotly
⚠️ *Disclaimer:* Educational use only β€” not legal advice.
""")
# -----------------------------
# βœ… CORRECT HUGGING FACE LAUNCHER
# -----------------------------
# Standard entry point; Streamlit also runs the module with
# __name__ == "__main__", so this works under `streamlit run app.py`.
if __name__ == "__main__":
    main()