# Spaces: deepthi6 / clausewise_full_project
# Runtime error
# File size: 11,732 Bytes

import os
import re
import io
import tempfile
import torch
import pandas as pd
import plotly.express as px
import streamlit as st
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    pipeline
)
from PyPDF2 import PdfReader
from docx import Document
from gtts import gTTS
from io import BytesIO
import spacy

# -----------------------------
# STREAMLIT PAGE CONFIG
# -----------------------------
# Browser-tab title and icon, plus a full-width layout for every tab.
st.set_page_config(
    page_title="⚖️ ClauseWise",
    page_icon="⚖️",
    layout="wide",
)

# -----------------------------
# LANGUAGE MAP
# -----------------------------
# Display name -> two-letter code. The code is used both to pick the
# translation model and as the gTTS voice. Insertion order matters:
# the select boxes index into LANG_NAMES by position.
LANG_MAP = dict(
    English="en",
    French="fr",
    Spanish="es",
    German="de",
    Hindi="hi",
    Tamil="ta",
    Telugu="te",
    Kannada="kn",
    Marathi="mr",
    Gujarati="gu",
    Bengali="bn",
)
LANG_NAMES = [*LANG_MAP]

# -----------------------------
# MODEL LOADING (with caching)
# -----------------------------
@st.cache_resource
def _load_models_cached():
    """Load every model the app needs and raise on any failure.

    Only this inner loader is cached. Because it raises instead of returning
    a sentinel, a failed attempt is never stored by ``st.cache_resource`` and
    the next rerun tries again.

    Returns:
        A 7-tuple ``(tokenizer_simplify, simplify_model, gen_tokenizer,
        gen_model, nlp, classifier, summarizer)``.
    """
    simplify_model_name = "mrm8488/t5-small-finetuned-text-simplification"
    tokenizer_simplify = AutoTokenizer.from_pretrained(simplify_model_name)
    simplify_model = AutoModelForSeq2SeqLM.from_pretrained(simplify_model_name)

    # NOTE(review): trust_remote_code executes code shipped with the model
    # repository; confirm it is still required for this checkpoint.
    gen_model_id = "microsoft/phi-2"
    gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_id, trust_remote_code=True)
    gen_model = AutoModelForCausalLM.from_pretrained(gen_model_id, trust_remote_code=True)

    # Download the spaCy English model on first use if it is not installed.
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        from spacy.cli import download
        download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")

    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    return tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer


def load_models():
    """Return the app's models, or seven ``None`` values if loading failed.

    Successful loads are served from the ``st.cache_resource`` cache of
    ``_load_models_cached``. Failures are reported with ``st.error`` and are
    not cached, so a transient problem (for example a network error during
    download) does not leave the app stuck until the process restarts.

    Returns:
        ``(tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp,
        classifier, summarizer)`` on success, otherwise a tuple of seven
        ``None`` values.
    """
    try:
        return _load_models_cached()
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None, None, None, None, None, None


# Fetch the models (cached after the first successful load) and halt the
# script run early when they are unavailable.
model_data = load_models()
if model_data[0] is None:
    st.error("Failed to load models. Please check your internet connection and try again.")
    st.stop()

(
    tokenizer_simplify,
    simplify_model,
    gen_tokenizer,
    gen_model,
    nlp,
    classifier,
    summarizer,
) = model_data

# Only the generative chat model is moved to the selected device.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
if gen_model is not None:
    gen_model.to(DEVICE)

# -----------------------------
# UTILITIES
# -----------------------------
def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX or text file.

    The upload is parsed from memory, so no temporary file is written and
    nothing can be left behind on disk if parsing fails.

    Args:
        file: A binary file-like object with a ``name`` attribute (such as a
            Streamlit upload), or a falsy value when nothing was uploaded.

    Returns:
        The extracted text with surrounding whitespace stripped. Returns
        ``""`` when ``file`` is falsy. Read or parse errors are reported with
        ``st.error`` and whatever was extracted before the error is returned.
    """
    if not file:
        return ""
    name = file.name.lower()
    text = ""
    try:
        # Rewind first: if the same upload object was already consumed by an
        # earlier call, read() would otherwise return nothing.
        if hasattr(file, "seek"):
            file.seek(0)
        data = file.read()

        if name.endswith(".pdf"):
            reader = PdfReader(io.BytesIO(data))
            for page in reader.pages:
                t = page.extract_text()
                if t:
                    text += t + "\n"
        elif name.endswith(".docx"):
            doc = Document(io.BytesIO(data))
            text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        else:
            # TextIOWrapper gives the same decoding as open(..., "r"):
            # UTF-8, undecodable bytes dropped, universal newlines.
            with io.TextIOWrapper(io.BytesIO(data), encoding="utf-8", errors="ignore") as f:
                text = f.read()
    except Exception as e:
        st.error(f"Error reading file: {e}")
    return text.strip()


@st.cache_resource
def _get_translator(lang_code):
    """Build the English-to-``lang_code`` translation pipeline, cached per code."""
    return pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")


def translate_text(text, target_lang, max_chars=500):
    """Translate English ``text`` into ``target_lang``.

    The translation pipeline for each language is created once and reused,
    instead of being reloaded on every call.

    Args:
        text: English source text.
        target_lang: A display name from ``LANG_MAP`` (for example
            ``"Hindi"``). Unknown names are treated as English.
        max_chars: Only the first ``max_chars`` characters are translated;
            the rest of ``text`` is dropped.

    Returns:
        The translated text. Returns ``""`` for empty input, and the original
        ``text`` unchanged when the target is English or when translation
        fails (a warning is shown with ``st.warning`` in that case).
    """
    if not text:
        return ""
    lang_code = LANG_MAP.get(target_lang, "en")
    if lang_code == "en":
        return text
    try:
        translator = _get_translator(lang_code)
        result = translator(text[:max_chars], max_length=512)
        return result[0]["translation_text"]
    except Exception as e:
        # Best effort: for example, no model may be published for this
        # language pair.
        st.warning(f"Translation unavailable for {target_lang}: {str(e)}")
        return text


def text_to_speech(text, lang):
    """Synthesize speech for ``text`` with gTTS and return it as an in-memory file.

    Args:
        text: Text to speak; only the first 1000 characters are used.
        lang: A display name from ``LANG_MAP``; unknown names fall back to
            English.

    Returns:
        A ``BytesIO`` positioned at the start of the audio data written by
        gTTS, or ``None`` when ``text`` is empty or synthesis fails (a
        warning is shown with ``st.warning`` on failure).
    """
    if not text:
        return None

    try:
        voice = LANG_MAP.get(lang, "en")
        speech = gTTS(text=text[:1000], lang=voice, slow=False)
        buffer = BytesIO()
        speech.write_to_fp(buffer)
        # Rewind so the caller reads from the beginning.
        buffer.seek(0)
    except Exception as err:
        st.warning(f"Audio generation unavailable: {str(err)}")
        return None
    return buffer


def clause_simplification(text, mode):
    """Rewrite ``text`` with the simplification model in the requested style.

    Args:
        text: Clause text; only the first 500 characters are processed.
        mode: One of ``"Simplified"``, ``"Explain like I'm 5"`` or
            ``"Professional"``. Any other value uses the ``"simplify: "``
            prefix.

    Returns:
        The generated rewrite. ``text`` is returned unchanged when it is
        empty, when the model is unavailable, or when generation fails (the
        error is shown with ``st.error``).
    """
    if not text:
        return text
    if simplify_model is None:
        return text

    # Task prefix prepended to the model input for each mode.
    prefixes = {
        "Simplified": "simplify: ",
        "Explain like I'm 5": "explain like I'm 5: ",
        "Professional": "rephrase professionally: ",
    }
    prefix = prefixes.get(mode, "simplify: ")

    try:
        model_input = prefix + text[:500]
        encoded = tokenizer_simplify(
            model_input,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        generated = simplify_model.generate(
            **encoded,
            max_length=256,
            num_beams=4,
            early_stopping=True,
        )
        return tokenizer_simplify.decode(generated[0], skip_special_tokens=True)
    except Exception as err:
        st.error(f"Simplification error: {err}")
        return text


def fairness_score_visual(text, lang):
    """Render a keyword-based fairness chart and score for ``text``.

    The score starts at 50, gains 5 for every "balanced" keyword match and
    loses 5 for every "one-sided" keyword match, clamped to 0..100. The
    result is drawn as a horizontal bar chart, followed by an info box whose
    message is translated into ``lang``.

    Args:
        text: Document text to scan; a warning is shown when it is empty.
        lang: Display name from ``LANG_MAP`` for the result message.
    """
    if not text:
        st.warning("No text to analyze.")
        return

    balanced_pattern = r"\b(mutual|both parties|shared|equal|fair|balanced)\b"
    one_sided_pattern = r"\b(sole|unilateral|exclusive right|one-sided|only)\b"
    balanced_hits = sum(1 for _ in re.finditer(balanced_pattern, text, re.IGNORECASE))
    one_sided_hits = sum(1 for _ in re.finditer(one_sided_pattern, text, re.IGNORECASE))

    raw_score = 50 + 5 * balanced_hits - 5 * one_sided_hits
    score = min(100, raw_score)
    score = max(0, score)

    st.subheader("⚖️ Fairness Balance Meter")

    # NOTE(review): "Party B Favored" always equals "Balanced" here, and
    # "Party A Favored" is just its complement; confirm the intended metric.
    aspects = ["Party A Favored", "Balanced", "Party B Favored"]
    values = [100 - score, score, score]
    fairness_df = pd.DataFrame({"Aspect": aspects, "Score": values})

    fig = px.bar(
        fairness_df,
        x="Score",
        y="Aspect",
        orientation="h",
        text="Score",
        color="Aspect",
        color_discrete_sequence=["#ff6b6b", "#4ecdc4", "#95e1d3"],
    )
    fig.update_layout(showlegend=False, xaxis_title="Score", yaxis_title="", height=300)
    st.plotly_chart(fig, use_container_width=True)

    summary = f"Fairness Score: {score}% (Approximate - based on keyword analysis)"
    st.info(translate_text(summary, lang))


def chat_response(prompt, lang):
    """Answer a legal question with the generative model, translated to ``lang``.

    Args:
        prompt: The user's question.
        lang: Display name from ``LANG_MAP`` for the reply language.

    Returns:
        The model's answer passed through ``translate_text``. A fixed
        fallback message is returned when ``prompt`` is empty, the model is
        unavailable, or generation fails (the error is shown with
        ``st.error``).
    """
    if not prompt or gen_model is None:
        return "Unable to generate response. Please try again."
    try:
        full_prompt = f"You are a helpful legal assistant. Answer the following question: {prompt}\n\nAnswer:"
        inputs = gen_tokenizer(full_prompt, return_tensors="pt", truncation=True,
                               max_length=512).to(DEVICE)
        outputs = gen_model.generate(**inputs, max_new_tokens=200, temperature=0.7,
                                     top_p=0.9, do_sample=True,
                                     pad_token_id=gen_tokenizer.eos_token_id)
        # A causal LM returns the prompt tokens followed by the continuation.
        # Decode only the continuation instead of searching the text for
        # "Answer:". That marker can reappear in the generated text, or be
        # missing when the prompt was truncated, and either case yields the
        # wrong span.
        prompt_len = inputs["input_ids"].shape[1]
        response = gen_tokenizer.decode(outputs[0][prompt_len:],
                                        skip_special_tokens=True).strip()
        return translate_text(response, lang)
    except Exception as e:
        st.error(f"Chat error: {e}")
        return "I'm having trouble generating a response. Please try rephrasing your question."


# -----------------------------
# MAIN APP
# -----------------------------
def _render_analyzer_tab():
    """Draw the Analyzer tab: document input, clause simplification, fairness chart."""
    st.subheader("📁 Upload or Paste Legal Document")
    lang = st.selectbox("Select Language:", LANG_NAMES, index=0, key="analyzer_lang")
    file = st.file_uploader("Upload a Legal Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
    text_input = st.text_area("Or Paste Text Here:", height=200, key="analyzer_text")

    if not (file or text_input):
        return

    # An uploaded file takes precedence over pasted text.
    text = extract_text(file) if file else text_input
    if not text.strip():
        st.warning("Please provide some text to analyze.")
        return

    mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
    if st.button("🧾 Simplify Clauses"):
        with st.spinner("Simplifying..."):
            simplified = clause_simplification(text, mode)
            translated = translate_text(simplified, lang)
            st.success(translated)
            audio_data = text_to_speech(translated, lang)
            if audio_data:
                st.audio(audio_data, format="audio/mp3")
    if st.button("⚖️ Fairness Analysis"):
        with st.spinner("Analyzing fairness..."):
            fairness_score_visual(text, lang)


def _render_translate_tab():
    """Draw the Translate & Audio tab: translate text and speak the translation."""
    st.subheader("🌐 Translate & Listen")
    text_input = st.text_area("Enter text:", height=200, key="translate_text")
    lang = st.selectbox("Translate to:", LANG_NAMES, index=4, key="translate_lang")

    if st.button("Translate"):
        if text_input.strip():
            with st.spinner("Translating..."):
                translated = translate_text(text_input, lang)
                st.success(translated)
        else:
            st.warning("Please enter some text to translate.")

    if st.button("🎧 Generate Audio"):
        if text_input.strip():
            with st.spinner("Generating audio..."):
                # Speak the translation, as the other tabs do. Feeding the
                # untranslated input to the target-language voice produced
                # mismatched audio.
                spoken_text = translate_text(text_input, lang)
                audio_data = text_to_speech(spoken_text, lang)
                if audio_data:
                    st.audio(audio_data, format="audio/mp3")
        else:
            st.warning("Please enter some text for audio generation.")


def _render_chat_tab():
    """Draw the Chatbot tab: answer a question and read the answer aloud."""
    st.subheader("💬 Chat with ClauseWise (Multilingual)")
    lang = st.selectbox("Chat Language:", LANG_NAMES, index=0, key="chat_lang")
    query = st.text_area("Ask about clauses, fairness, or legal meaning:", height=150, key="chat_query")

    if st.button("Ask"):
        if query.strip():
            with st.spinner("Thinking..."):
                response = chat_response(query, lang)
                st.success(response)
                audio_data = text_to_speech(response, lang)
                if audio_data:
                    st.audio(audio_data, format="audio/mp3")
        else:
            st.warning("Please enter a question.")


def _render_about_tab():
    """Draw the static About tab."""
    st.markdown("""
        ### ⚖️ About ClauseWise
        ClauseWise is a multilingual AI-powered legal assistant that helps users:
        - **Simplify complex clauses** into easy-to-understand language  
        - **Translate and listen** in 10+ languages  
        - **Assess fairness** visually  
        - **Chat interactively** about legal concepts  

        **Languages Supported:**  
        English, French, Spanish, German, Hindi, Tamil, Telugu, Kannada, Marathi, Gujarati, Bengali  

        **Technologies Used:**  
        Hugging Face Transformers (T5, Phi-2, BART), SpaCy, gTTS, Plotly  

        ⚠️ *Disclaimer:* Educational use only — not legal advice.
        """)


def main():
    """Render the ClauseWise page: title, intro and the four feature tabs."""
    st.title("⚖️ ClauseWise: Multilingual Legal AI Assistant")
    st.markdown("**Simplify**, **translate**, and **analyze** legal documents with AI — in your language.\n---")

    tab1, tab2, tab3, tab4 = st.tabs(["📄 Analyzer", "🌐 Translate & Audio", "💬 Chatbot", "ℹ️ About"])

    # Each helper draws its widgets inside the matching tab container.
    with tab1:
        _render_analyzer_tab()
    with tab2:
        _render_translate_tab()
    with tab3:
        _render_chat_tab()
    with tab4:
        _render_about_tab()

# -----------------------------
# ✅ CORRECT HUGGING FACE LAUNCHER
# -----------------------------
# Render the app only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()