"""ClauseWise: a multilingual Streamlit app for simplifying, translating and
analysing legal text.

Run with ``streamlit run <this file>``. Models are downloaded from the
Hugging Face Hub on first start.
"""

import io
import os
import re
import tempfile
from io import BytesIO
from typing import Optional

import pandas as pd
import plotly.express as px
import spacy
import streamlit as st
import torch
from docx import Document
from gtts import gTTS
from PyPDF2 import PdfReader
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)

# -----------------------------
# STREAMLIT PAGE CONFIG
# -----------------------------
st.set_page_config(page_title="⚖️ ClauseWise", page_icon="⚖️", layout="wide")

# -----------------------------
# LANGUAGE MAP
# -----------------------------
# Display name -> language code. The code is used both as the gTTS ``lang``
# argument and as the suffix of the "Helsinki-NLP/opus-mt-en-<code>" model id.
LANG_MAP = {
    "English": "en",
    "French": "fr",
    "Spanish": "es",
    "German": "de",
    "Hindi": "hi",
    "Tamil": "ta",
    "Telugu": "te",
    "Kannada": "kn",
    "Marathi": "mr",
    "Gujarati": "gu",
    "Bengali": "bn",
}
LANG_NAMES = list(LANG_MAP.keys())

# Keyword patterns for the fairness heuristic, compiled once at import time.
_FAIR_TERMS = re.compile(
    r"\b(mutual|both parties|shared|equal|fair|balanced)\b", re.I
)
_UNFAIR_TERMS = re.compile(
    r"\b(sole|unilateral|exclusive right|one-sided|only)\b", re.I
)

# Kept at column 0 so the markdown is never interpreted as an indented block.
_ABOUT_MARKDOWN = """
### ⚖️ About ClauseWise
ClauseWise is a multilingual AI-powered legal assistant that helps users:
- **Simplify complex clauses** into easy-to-understand language
- **Translate and listen** in 10+ languages
- **Assess fairness** visually
- **Chat interactively** about legal concepts

**Languages Supported:**
English, French, Spanish, German, Hindi, Tamil, Telugu, Kannada, Marathi, Gujarati, Bengali

**Technologies Used:**
Hugging Face Transformers (T5, Phi-2, BART), SpaCy, gTTS, Plotly

⚠️ *Disclaimer:* Educational use only — not legal advice.
"""


# -----------------------------
# MODEL LOADING (with caching)
# -----------------------------
@st.cache_resource
def _load_models_cached():
    """Load every model the app uses and return them as a 7-tuple.

    Raises on any failure instead of returning placeholders, so that a failed
    load is not memoized by ``st.cache_resource`` and the next rerun retries.
    """
    simplify_model_name = "mrm8488/t5-small-finetuned-text-simplification"
    tokenizer_simplify = AutoTokenizer.from_pretrained(simplify_model_name)
    simplify_model = AutoModelForSeq2SeqLM.from_pretrained(simplify_model_name)

    gen_model_id = "microsoft/phi-2"
    gen_tokenizer = AutoTokenizer.from_pretrained(
        gen_model_id, trust_remote_code=True
    )
    gen_model = AutoModelForCausalLM.from_pretrained(
        gen_model_id, trust_remote_code=True
    )

    # ✅ Auto-download SpaCy if missing
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        from spacy.cli import download

        download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")

    # NOTE(review): nlp, classifier and summarizer are loaded but not used
    # anywhere in this file; they are kept so the return shape is unchanged.
    classifier = pipeline(
        "zero-shot-classification", model="facebook/bart-large-mnli"
    )
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    return (
        tokenizer_simplify,
        simplify_model,
        gen_tokenizer,
        gen_model,
        nlp,
        classifier,
        summarizer,
    )


def load_models():
    """Load all required models with error handling.

    Returns:
        ``(tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp,
        classifier, summarizer)``, or a tuple of seven ``None`` values when
        loading failed (the error is shown in the UI).
    """
    try:
        return _load_models_cached()
    except Exception as e:  # UI boundary: report and let the caller stop.
        st.error(f"Error loading models: {e}")
        return (None,) * 7


@st.cache_resource(max_entries=3)
def _get_translator(lang_code):
    """Return a cached English -> ``lang_code`` translation pipeline.

    ``max_entries`` bounds how many translation models stay resident at once.
    Raises whatever ``pipeline`` raises when no such model can be loaded.
    """
    return pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")


model_data = load_models()
if model_data[0] is None:
    st.error(
        "Failed to load models. Please check your internet connection and try again."
    )
    st.stop()

(
    tokenizer_simplify,
    simplify_model,
    gen_tokenizer,
    gen_model,
    nlp,
    classifier,
    summarizer,
) = model_data

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if gen_model is not None:
    gen_model.to(DEVICE)


# -----------------------------
# UTILITIES
# -----------------------------
def extract_text(file) -> str:
    """Extract plain text from an uploaded PDF, DOCX or text file.

    Args:
        file: An uploaded-file object exposing ``.name`` and ``.read()``, or a
            falsy value.

    Returns:
        The extracted text with surrounding whitespace stripped; ``""`` when
        ``file`` is falsy. If reading fails, the error is shown in the UI and
        whatever text was extracted before the failure is returned.
    """
    if not file:
        return ""

    name = file.name.lower()
    suffix = os.path.splitext(name)[1]
    tmp_path = None
    text = ""
    try:
        # The temp file is created inside the try block so it is removed even
        # if writing the upload to disk fails part-way.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp_path = tmp.name
            tmp.write(file.read())

        if name.endswith(".pdf"):
            reader = PdfReader(tmp_path)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        elif name.endswith(".docx"):
            doc = Document(tmp_path)
            text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        else:
            with open(tmp_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
    except Exception as e:
        st.error(f"Error reading file: {e}")
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)
    return text.strip()


def translate_text(text: str, target_lang: str) -> str:
    """Translate English ``text`` into ``target_lang``.

    Args:
        text: Source text. Only the first 500 characters are translated.
        target_lang: A key of ``LANG_MAP``; unknown names fall back to English.

    Returns:
        The translation; ``text`` unchanged when the target is English or when
        translation fails (a warning is shown); ``""`` for empty input.
    """
    if not text:
        return ""
    lang_code = LANG_MAP.get(target_lang, "en")
    if lang_code == "en":
        return text
    try:
        translator = _get_translator(lang_code)
        result = translator(text[:500], max_length=512)
        return result[0]["translation_text"]
    except Exception as e:
        st.warning(f"Translation unavailable for {target_lang}: {str(e)}")
        return text


def text_to_speech(text: str, lang: str) -> Optional[BytesIO]:
    """Synthesize speech for ``text`` with gTTS.

    Args:
        text: Text to speak. Only the first 1000 characters are used.
        lang: A key of ``LANG_MAP``; unknown names fall back to English.

    Returns:
        An in-memory audio buffer rewound to the start, or ``None`` when
        ``text`` is empty or synthesis fails (a warning is shown).
    """
    if not text:
        return None
    try:
        lang_code = LANG_MAP.get(lang, "en")
        tts = gTTS(text=text[:1000], lang=lang_code, slow=False)
        audio_fp = BytesIO()
        tts.write_to_fp(audio_fp)
        audio_fp.seek(0)
        return audio_fp
    except Exception as e:
        st.warning(f"Audio generation unavailable: {str(e)}")
        return None


def clause_simplification(text: str, mode: str) -> str:
    """Rewrite ``text`` with the simplification model.

    Args:
        text: Clause text. Only the first 500 characters are processed.
        mode: One of "Simplified", "Explain like I'm 5" or "Professional";
            any other value uses the "simplify: " prefix.

    Returns:
        The model output, or ``text`` unchanged when it is empty, the model is
        unavailable, or generation fails (the error is shown in the UI).
    """
    if not text or simplify_model is None:
        return text

    prefix_map = {
        "Simplified": "simplify: ",
        "Explain like I'm 5": "explain like I'm 5: ",
        "Professional": "rephrase professionally: ",
    }
    prefix = prefix_map.get(mode, "simplify: ")
    try:
        inputs = tokenizer_simplify(
            prefix + text[:500],
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        outputs = simplify_model.generate(
            **inputs, max_length=256, num_beams=4, early_stopping=True
        )
        return tokenizer_simplify.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        st.error(f"Simplification error: {e}")
        return text


def fairness_score_visual(text: str, lang: str) -> None:
    """Render a keyword-based fairness chart and summary for ``text``.

    The score starts at 50, gains 5 per "fair" keyword match, loses 5 per
    "one-sided" keyword match, and is clamped to 0..100.

    Args:
        text: Document text to scan.
        lang: A key of ``LANG_MAP`` used to translate the summary line.
    """
    if not text:
        st.warning("No text to analyze.")
        return

    pos = len(_FAIR_TERMS.findall(text))
    neg = len(_UNFAIR_TERMS.findall(text))
    score = max(0, min(100, 50 + (pos * 5) - (neg * 5)))

    st.subheader("⚖️ Fairness Balance Meter")
    # NOTE(review): "Balanced" and "Party B Favored" plot the same value;
    # confirm whether the third bar was meant to show something else.
    fairness_df = pd.DataFrame(
        {
            "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
            "Score": [100 - score, score, score],
        }
    )
    fig = px.bar(
        fairness_df,
        x="Score",
        y="Aspect",
        orientation="h",
        text="Score",
        color="Aspect",
        color_discrete_sequence=["#ff6b6b", "#4ecdc4", "#95e1d3"],
    )
    fig.update_layout(
        showlegend=False, xaxis_title="Score", yaxis_title="", height=300
    )
    st.plotly_chart(fig, use_container_width=True)

    fairness_text = (
        f"Fairness Score: {score}% (Approximate - based on keyword analysis)"
    )
    st.info(translate_text(fairness_text, lang))


def chat_response(prompt: str, lang: str) -> str:
    """Answer ``prompt`` with the generative model, translated into ``lang``.

    Args:
        prompt: The user's question.
        lang: A key of ``LANG_MAP`` for the reply language.

    Returns:
        The translated answer, or a fallback message when the prompt is empty,
        the model is unavailable, nothing was generated, or generation fails.
    """
    if not prompt or gen_model is None:
        return "Unable to generate response. Please try again."
    try:
        full_prompt = (
            "You are a helpful legal assistant. "
            f"Answer the following question: {prompt}\n\nAnswer:"
        )
        inputs = gen_tokenizer(
            full_prompt, return_tensors="pt", truncation=True, max_length=512
        ).to(DEVICE)
        prompt_len = inputs["input_ids"].shape[1]
        outputs = gen_model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=gen_tokenizer.eos_token_id,
        )
        # Decode only the newly generated tokens. Splitting the full decoded
        # text on "Answer:" returns the wrong span when the model emits
        # "Answer:" again or when truncation removed the cue from the prompt.
        response = gen_tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()
        if not response:
            return "Unable to generate response. Please try again."
        return translate_text(response, lang)
    except Exception as e:
        st.error(f"Chat error: {e}")
        return (
            "I'm having trouble generating a response. "
            "Please try rephrasing your question."
        )


# -----------------------------
# MAIN APP
# -----------------------------
def main():
    """Build the four-tab ClauseWise user interface."""
    st.title("⚖️ ClauseWise: Multilingual Legal AI Assistant")
    st.markdown(
        "**Simplify**, **translate**, and **analyze** legal documents with AI — in your language.\n---"
    )

    tab1, tab2, tab3, tab4 = st.tabs(
        ["📄 Analyzer", "🌐 Translate & Audio", "💬 Chatbot", "ℹ️ About"]
    )

    # TAB 1: ANALYZER
    with tab1:
        st.subheader("📁 Upload or Paste Legal Document")
        lang = st.selectbox(
            "Select Language:", LANG_NAMES, index=0, key="analyzer_lang"
        )
        file = st.file_uploader(
            "Upload a Legal Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"]
        )
        text_input = st.text_area(
            "Or Paste Text Here:", height=200, key="analyzer_text"
        )

        if file or text_input:
            # An uploaded file takes precedence over pasted text.
            text = extract_text(file) if file else text_input
            if text.strip():
                mode = st.radio(
                    "Simplify Mode",
                    ["Explain like I'm 5", "Simplified", "Professional"],
                )
                if st.button("🧾 Simplify Clauses"):
                    with st.spinner("Simplifying..."):
                        simplified = clause_simplification(text, mode)
                        translated = translate_text(simplified, lang)
                        st.success(translated)
                        audio_data = text_to_speech(translated, lang)
                        if audio_data:
                            st.audio(audio_data, format="audio/mp3")

                if st.button("⚖️ Fairness Analysis"):
                    with st.spinner("Analyzing fairness..."):
                        fairness_score_visual(text, lang)
            else:
                st.warning("Please provide some text to analyze.")

    # TAB 2: TRANSLATION + AUDIO
    with tab2:
        st.subheader("🌐 Translate & Listen")
        text_input = st.text_area("Enter text:", height=200, key="translate_text")
        lang = st.selectbox(
            "Translate to:", LANG_NAMES, index=4, key="translate_lang"
        )

        if st.button("Translate"):
            if text_input.strip():
                with st.spinner("Translating..."):
                    translated = translate_text(text_input, lang)
                    st.success(translated)
            else:
                st.warning("Please enter some text to translate.")

        if st.button("🎧 Generate Audio"):
            if text_input.strip():
                with st.spinner("Generating audio..."):
                    # Speak the translation, not the raw input: the voice is
                    # selected by the target language, so the text must be in
                    # that language too. The translation is recomputed here
                    # because this branch runs in a separate script rerun from
                    # the "Translate" button.
                    spoken_text = translate_text(text_input, lang)
                    audio_data = text_to_speech(spoken_text, lang)
                    if audio_data:
                        st.audio(audio_data, format="audio/mp3")
            else:
                st.warning("Please enter some text for audio generation.")

    # TAB 3: CHATBOT
    with tab3:
        st.subheader("💬 Chat with ClauseWise (Multilingual)")
        lang = st.selectbox("Chat Language:", LANG_NAMES, index=0, key="chat_lang")
        query = st.text_area(
            "Ask about clauses, fairness, or legal meaning:",
            height=150,
            key="chat_query",
        )

        if st.button("Ask"):
            if query.strip():
                with st.spinner("Thinking..."):
                    response = chat_response(query, lang)
                    st.success(response)
                    audio_data = text_to_speech(response, lang)
                    if audio_data:
                        st.audio(audio_data, format="audio/mp3")
            else:
                st.warning("Please enter a question.")

    # TAB 4: ABOUT
    with tab4:
        st.markdown(_ABOUT_MARKDOWN)


# -----------------------------
# ✅ CORRECT HUGGING FACE LAUNCHER
# -----------------------------
if __name__ == "__main__":
    main()