deepthi6 committed on
Commit
5e6f621
Β·
verified Β·
1 Parent(s): 4dfe5da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -199
app.py CHANGED
@@ -1,218 +1,231 @@
1
  import os
2
- import json
3
  import re
4
  import io
5
- from typing import List, Dict, Tuple, Optional, Any
6
-
7
  import torch
8
- from transformers import AutoModelForCausalLM, AutoTokenizer
9
- from PyPDF2 import PdfReader # βœ… PyPDF2 instead of pypdf (lighter, preinstalled on HF)
10
- import docx
 
 
 
 
 
 
 
 
11
  import spacy
12
- import gradio as gr
 
13
 
14
# -----------------------------
# Model (Granite or lightweight fallback)
# -----------------------------
# Try IBM Granite first; any failure (download error, gated repo, memory)
# falls back to the much smaller phi-2 so the app still boots.
try:
    MODEL_ID = "ibm-granite/granite-3.2-2b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        # bfloat16 only when a GPU is present; CPU stays in float32
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map="auto" if torch.cuda.is_available() else None
    )
except Exception:
    MODEL_ID = "microsoft/phi-2"  # lightweight fallback if Granite fails
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# Pick the device once at import time and move the model there.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)
 
 
 
 
32
 
33
# -----------------------------
# Load spaCy model
# -----------------------------
# en_core_web_sm is not bundled with spaCy itself; download it on first
# run, then load it again.
try:
    nlp = spacy.load("en_core_web_sm")
except Exception:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
 
 
 
 
 
 
 
42
 
43
# -----------------------------
# Helper: Prompt builder
# -----------------------------
def build_chat_prompt(system_prompt: str, user_prompt: str) -> str:
    """Assemble a chat-formatted prompt for the model.

    Prefers the tokenizer's own chat template; if that is unavailable
    for any reason, falls back to a plain bracket-tagged layout.
    """
    chat = [{"role": "system", "content": system_prompt}] if system_prompt else []
    chat.append({"role": "user", "content": user_prompt})
    try:
        return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    except Exception:
        # Tokenizer has no chat template (or is not loaded) — use tags.
        return f"[SYSTEM]\n{system_prompt}\n[USER]\n{user_prompt}\n[ASSISTANT]\n"
55
-
56
# -----------------------------
# Text generation
# -----------------------------
def llm_generate(system_prompt: str, user_prompt: str, max_new_tokens=512) -> str:
    """Generate a completion for the given prompts and strip the prompt echo.

    Returns only the assistant's reply, whichever prompt format was used.
    """
    prompt = build_chat_prompt(system_prompt, user_prompt)
    encoded = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.3,
            pad_token_id=tokenizer.eos_token_id
        )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    # Fallback prompt format: the reply follows the last [ASSISTANT] tag.
    if "[ASSISTANT]" in decoded:
        return decoded.split("[ASSISTANT]")[-1].strip()
    # Chat-template format: drop the echoed prompt prefix when present.
    if decoded.startswith(prompt):
        return decoded[len(prompt):].strip()
    return decoded.strip()
77
-
78
# -----------------------------
# File loaders
# -----------------------------
def load_text_from_pdf(file_obj) -> str:
    """Concatenate the extracted text of every page of a PDF."""
    pdf = PdfReader(file_obj)
    chunks = []
    for page in pdf.pages:
        # extract_text() may return None for image-only pages
        chunks.append(page.extract_text() or "")
    return "\n".join(chunks).strip()
85
-
86
def load_text_from_docx(file_obj) -> str:
    """Join all paragraph texts from an uploaded .docx stream."""
    raw = file_obj.read()
    file_obj.seek(0)  # leave the stream rewound for any later reader
    document = docx.Document(io.BytesIO(raw))
    return "\n".join(paragraph.text for paragraph in document.paragraphs).strip()
93
-
94
def load_text_from_txt(file_obj) -> str:
    """Read a text upload, decoding bytes as UTF-8 and ignoring bad bytes."""
    raw = file_obj.read()
    decoded = raw.decode("utf-8", errors="ignore") if isinstance(raw, bytes) else raw
    return str(decoded).strip()
99
-
100
def load_document(file: Optional[gr.File]) -> str:
    """Extract text from an uploaded file, picking a loader by extension.

    Unknown extensions are probed: PDF first, then DOCX, then plain text.
    Returns "" when no file was supplied.
    """
    if not file:
        return ""
    name = (file.name or "").lower()
    for suffix, loader in ((".pdf", load_text_from_pdf),
                           (".docx", load_text_from_docx),
                           (".txt", load_text_from_txt)):
        if name.endswith(suffix):
            return loader(file)
    # Unknown extension: try loaders from strictest to most permissive.
    try:
        return load_text_from_pdf(file)
    except Exception:
        try:
            return load_text_from_docx(file)
        except Exception:
            return load_text_from_txt(file)
118
-
119
# -----------------------------
# Clause extraction
# -----------------------------
def split_into_clauses(text: str, min_len=40) -> List[str]:
    r"""Split a document into candidate clauses.

    The primary pass splits on numbered headings ("1.", "2.3)"), lettered
    headings ("A.", "B)") at line starts, and (optionally semicolon-
    terminated) line breaks. If that yields fewer than two parts, fall
    back to sentence-ish boundaries after '.' or ';'. Fragments shorter
    than min_len are dropped, and case/whitespace-insensitive duplicates
    removed, keeping the first occurrence.

    Fix: the heading patterns were mangled — `\d+(?:\.\d+)[.)]` required a
    dotted sub-number (so plain "1." never matched) and `^\s[A-Z]` required
    exactly one whitespace char. Quantifiers restored to `(?:\.\d+)*` and
    `^\s*`.
    """
    if not text:
        return []
    parts = re.split(
        r"(?:(?:^\s*\d+(?:\.\d+)*[.)]\s+)|(?:^\s*[A-Z]\s*[.)]\s+)|(?:;?\s*\n))",
        text,
        flags=re.MULTILINE,
    )
    if len(parts) < 2:
        # Fallback: split after sentence/clause terminators.
        parts = re.split(r"(?<=[.;])\s+\n?\s*", text)
    clauses = [p.strip() for p in parts if len(p.strip()) >= min_len]
    seen, unique = set(), []
    for c in clauses:
        key = re.sub(r"\s+", " ", c.lower())
        if key not in seen:
            seen.add(key)
            unique.append(c)
    return unique
136
-
137
# -----------------------------
# Simplify clause
# -----------------------------
def simplify_clause(clause: str) -> str:
    """Rewrite a single contract clause in plain English via the LLM."""
    return llm_generate(
        "You are a legal assistant simplifying contract clauses for clarity.",
        f"Rewrite this clause in plain English:\n\n{clause}",
        max_new_tokens=300,
    )
144
-
145
# -----------------------------
# Named Entity Recognition
# -----------------------------
def ner_entities(text: str) -> Dict[str, List[str]]:
    """Group spaCy entities by label; values are de-duplicated and sorted."""
    if not text:
        return {}
    grouped: Dict[str, List[str]] = {}
    for entity in nlp(text).ents:
        grouped.setdefault(entity.label_, []).append(entity.text)
    return {label: sorted(set(names)) for label, names in grouped.items()}
156
-
157
# -----------------------------
# Document classification
# -----------------------------
DOC_TYPES = [
    "Non-Disclosure Agreement (NDA)",
    "Lease Agreement",
    "Employment Contract",
    "Service Agreement",
    "Sales Agreement",
    "Consulting Agreement",
    "Terms of Service"
]

def classify_document(text: str) -> str:
    """Ask the LLM to label the document with one of DOC_TYPES.

    Only the first 4000 characters are sent. Returns "Unclassified" when
    the model's reply does not mention any known type.
    """
    system = "You are a legal document classifier."
    labels = "\n".join(f"- {t}" for t in DOC_TYPES)
    user = f"Classify the document into one of these types:\n{labels}\n\nDocument:\n{text[:4000]}"
    resp = llm_generate(system, user, max_new_tokens=200)
    lowered = resp.lower()
    # First DOC_TYPES entry mentioned in the reply wins.
    return next((t for t in DOC_TYPES if t.lower() in lowered), "Unclassified")
179
-
180
# -----------------------------
# Input handler
# -----------------------------
def get_text_from_inputs(file: Optional[gr.File], text: str) -> str:
    """Pick whichever input source (upload vs. pasted text) yields more content."""
    from_file = load_document(file) if file else ""
    pasted = (text or "").strip()
    return from_file if len(from_file) > len(pasted) else pasted
187
-
188
# -----------------------------
# Gradio Interface
# -----------------------------
def analyze_document(file, text):
    """Run the full pipeline over one document.

    Returns a tuple for the three Gradio outputs:
    (summary text, entity dict, first five clauses).
    """
    content = get_text_from_inputs(file, text)
    if not content:
        return "No content found.", {}, []
    clauses = split_into_clauses(content)
    entities = ner_entities(content)
    doc_type = classify_document(content)
    sample = simplify_clause(clauses[0]) if clauses else "No clause to simplify."
    report = (
        f"Found {len(clauses)} clauses."
        + f"\n\nDocument Type: {doc_type}\n\nSample Simplified Clause:\n{sample}"
    )
    return report, entities, clauses[:5]
201
-
202
# Single-page UI: one handler (analyze_document) mapping two inputs
# (file upload OR pasted text) to three outputs.
iface = gr.Interface(
    fn=analyze_document,
    inputs=[
        gr.File(label="Upload a Legal Document (PDF/DOCX/TXT)"),
        gr.Textbox(label="...or Paste Text", lines=5)
    ],
    outputs=[
        gr.Textbox(label="Analysis Summary"),
        gr.JSON(label="Entities Found"),
        gr.Textbox(label="Extracted Clauses (first 5)", lines=10)
    ],
    title="βš–οΈ ClauseWise: Legal Document Analyzer",
    description="Upload a contract or paste text to extract clauses, identify entities, and simplify content."
)

if __name__ == "__main__":
    iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
2
  import re
3
  import io
4
+ import tempfile
 
5
  import torch
6
+ import pandas as pd
7
+ import plotly.express as px
8
+ import streamlit as st
9
+ from transformers import (
10
+ AutoTokenizer,
11
+ AutoModelForCausalLM,
12
+ AutoModelForSeq2SeqLM,
13
+ pipeline
14
+ )
15
+ from PyPDF2 import PdfReader
16
+ from docx import Document
17
  import spacy
18
+ from gtts import gTTS
19
+ from io import BytesIO
20
 
21
# -----------------------------
# PAGE CONFIG
# -----------------------------
# set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="βš–οΈ ClauseWise: Multilingual Legal AI Assistant", page_icon="βš–οΈ", layout="wide")

st.title("βš–οΈ ClauseWise: Multilingual Legal AI Assistant")
st.markdown("""
ClauseWise helps you **simplify, translate, and understand legal documents** in your preferred language.
Upload contracts, extract clauses, check fairness, and chat with your AI legal assistant β€” all multilingual and with audio output.
---
""")
32
 
33
# -----------------------------
# LANGUAGE MAP
# -----------------------------
# Display name -> ISO 639-1 code; the codes feed both the Helsinki-NLP
# translation model ids (opus-mt-en-<code>) and the gTTS voices.
LANG_MAP = {
    "English": "en", "French": "fr", "Spanish": "es", "German": "de",
    "Hindi": "hi", "Tamil": "ta", "Telugu": "te", "Kannada": "kn",
    "Marathi": "mr", "Gujarati": "gu", "Bengali": "bn"
}
# Selectbox options, in insertion order.
LANG_NAMES = list(LANG_MAP.keys())
42
+
43
# -----------------------------
# LOAD MODELS
# -----------------------------
@st.cache_resource
def load_all_models():
    """Load every model once per server process (cached by Streamlit).

    Returns (tokenizer_simplify, simplify_model, gen_tokenizer, gen_model,
    nlp, classifier, summarizer).
    """
    # Seq2seq T5 fine-tuned for text simplification.
    simplify_model_name = "mrm8488/t5-small-finetuned-text-simplification"
    tokenizer_simplify = AutoTokenizer.from_pretrained(simplify_model_name)
    simplify_model = AutoModelForSeq2SeqLM.from_pretrained(simplify_model_name)

    # Causal LM used by the chatbot tab.
    gen_model_id = "microsoft/phi-2"
    gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_id)
    gen_model = AutoModelForCausalLM.from_pretrained(gen_model_id)

    nlp = spacy.load("en_core_web_sm")
    # NOTE(review): classifier and summarizer are loaded but not referenced
    # by any visible code path — confirm they are needed before keeping them.
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer

tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer = load_all_models()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
gen_model.to(DEVICE)  # only the chat model is moved to the device explicitly
64
 
65
# -----------------------------
# UTILS
# -----------------------------
def extract_text(file):
    """Extract plain text from an uploaded PDF/DOCX/TXT file.

    The upload is spooled to a temp file so the path-based readers
    (PdfReader / Document) can open it; the temp file is always removed.
    Read failures are reported via st.error and yield "".
    """
    name = file.name.lower()
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(file.read())
        tmp_path = tmp.name
    text = ""
    try:
        if name.endswith(".pdf"):
            reader = PdfReader(tmp_path)
            for page in reader.pages:
                t = page.extract_text()
                if t:
                    text += t + "\n"
        elif name.endswith(".docx"):
            doc = Document(tmp_path)
            text = "\n".join(p.text for p in doc.paragraphs)
        else:
            # Fix: close the handle — the original leaked an open file,
            # which also makes os.remove fail on Windows.
            with open(tmp_path, "r", encoding="utf-8", errors="ignore") as fh:
                text = fh.read()
    except Exception as e:
        st.error(f"Failed to read file: {e}")
    finally:
        os.remove(tmp_path)
    return text.strip()
91
+
92
# Cache translators so each Helsinki-NLP model is loaded once per language.
# The original built a fresh pipeline (a full model download/load) on every
# call, which made each non-English interaction pay the load cost again.
_TRANSLATORS = {}

def translate_text(text, target_lang):
    """Translate English text into target_lang (a LANG_MAP key).

    English passes through unchanged; only the first 1000 characters are
    translated. Any failure returns a placeholder message instead of raising.
    """
    lang_code = LANG_MAP[target_lang]
    if lang_code == "en":
        return text
    try:
        translator = _TRANSLATORS.get(lang_code)
        if translator is None:
            translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")
            _TRANSLATORS[lang_code] = translator
        return translator(text[:1000])[0]["translation_text"]
    except Exception:
        return f"(Translation unavailable for {target_lang})"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
def text_to_speech(text, lang):
    """Render text as spoken MP3 audio in memory; None when synthesis fails."""
    code = LANG_MAP[lang]
    try:
        buffer = BytesIO()
        gTTS(text=text, lang=code).write_to_fp(buffer)
        buffer.seek(0)  # rewind so st.audio reads from the start
        return buffer
    except Exception:
        st.warning("Speech generation failed for this language.")
        return None
113
+
114
def clause_simplification(text, mode):
    """Run the T5 simplifier over text with a mode-specific task prefix."""
    prefixes = {
        "Simplified": "simplify: ",
        "Explain like I'm 5": "explain like I'm 5: ",
        "Professional": "rephrase professionally: "
    }
    task = prefixes.get(mode, "simplify: ")
    encoded = tokenizer_simplify(task + text, return_tensors="pt", truncation=True, max_length=512)
    generated_ids = simplify_model.generate(**encoded, max_length=256, num_beams=4, early_stopping=True)
    return tokenizer_simplify.decode(generated_ids[0], skip_special_tokens=True)
123
+
124
def fairness_score_visual(text, lang):
    """Render a keyword-based fairness estimate as a horizontal bar chart.

    The score is a simple heuristic (base 70, +1 per mutual-sounding term,
    -2 per one-sided term, clamped to 0..100) — educational only.
    """
    balanced_hits = len(re.findall(r"(mutual|both parties|shared)", text, re.I))
    one_sided_hits = len(re.findall(r"(sole|unilateral|exclusive right)", text, re.I))
    score = max(0, min(100, 70 + balanced_hits - 2 * one_sided_hits))

    st.subheader("βš–οΈ Fairness Balance Meter")
    chart_data = pd.DataFrame({
        "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
        "Score": [100 - score, score // 2, score],
    })
    fig = px.bar(
        chart_data, x="Score", y="Aspect", orientation="h",
        color="Aspect", text="Score", title="Fairness Score Representation"
    )
    fig.update_layout(showlegend=False, xaxis_title="Score", yaxis_title="")
    st.plotly_chart(fig, use_container_width=True)

    st.info(translate_text(f"Fairness Score: {score}% (Educational Estimate Only)", lang))
141
+
142
def chat_response(prompt, lang):
    """Generate an answer with the causal LM and translate it to the UI language."""
    encoded = gen_tokenizer(prompt, return_tensors="pt").to(DEVICE)
    generated = gen_model.generate(**encoded, max_new_tokens=350, do_sample=True, temperature=0.7, top_p=0.9)
    answer = gen_tokenizer.decode(generated[0], skip_special_tokens=True)
    return translate_text(answer, lang)
147
+
148
# -----------------------------
# MAIN TABS
# -----------------------------
tab1, tab2, tab3, tab4 = st.tabs(["πŸ“„ Analyzer", "🌐 Translate & Audio", "πŸ’¬ Chatbot", "βš™οΈ About"])

# -----------------------------
# TAB 1: Analyzer
# -----------------------------
# Upload (or paste) a document, then simplify clauses or run the fairness
# heuristic on it. The whole document text is fed to both actions.
with tab1:
    st.subheader("πŸ“ Upload or Paste Legal Document")
    lang = st.selectbox("Select Working Language:", LANG_NAMES, index=0)
    file = st.file_uploader("Upload Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
    text_input = st.text_area("Or Paste Text Here:", height=200)

    if file or text_input:
        # Upload takes precedence over pasted text.
        text = extract_text(file) if file else text_input
        st.markdown("---")
        col1, col2 = st.columns(2)
        with col1:
            mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
            if st.button("🧾 Simplify Clauses"):
                with st.spinner("Simplifying..."):
                    simplified = clause_simplification(text, mode)
                    translated_output = translate_text(simplified, lang)
                    st.success(translated_output)
                    audio_data = text_to_speech(translated_output, lang)
                    if audio_data:
                        st.audio(audio_data, format="audio/mp3")

        with col2:
            if st.button("βš–οΈ Fairness Analysis"):
                fairness_score_visual(text, lang)

# -----------------------------
# TAB 2: Translate & Audio
# -----------------------------
# Free-form text translation and TTS. Note: audio is generated from the
# ORIGINAL text_input, not the translation.
with tab2:
    st.subheader("🌐 Translate & Hear Content")
    text_input = st.text_area("Enter text to translate or listen:", height=200)
    lang = st.selectbox("Choose Translation Language:", LANG_NAMES, index=4)
    if st.button("Translate Text"):
        translated = translate_text(text_input, lang)
        st.success(translated)
    if st.button("🎧 Generate Audio"):
        audio_data = text_to_speech(text_input, lang)
        if audio_data:
            st.audio(audio_data, format="audio/mp3")

# -----------------------------
# TAB 3: Chatbot
# -----------------------------
# One-shot Q&A (no chat history is kept between questions).
with tab3:
    st.subheader("πŸ’¬ ClauseWise Multilingual Chatbot")
    lang = st.selectbox("Chatbot Language:", LANG_NAMES, index=4)
    st.markdown("Ask questions about contract clauses, fairness, or legal basics. *(Educational only β€” not legal advice.)*")
    query = st.text_area("Your question:", height=150)
    if st.button("Ask ClauseWise"):
        with st.spinner("Thinking..."):
            response = chat_response(f"Answer this like a legal assistant: {query}", lang)
            st.success(response)
            audio_data = text_to_speech(response, lang)
            if audio_data:
                st.audio(audio_data, format="audio/mp3")

# -----------------------------
# TAB 4: About
# -----------------------------
with tab4:
    st.markdown("""
    ### 🌍 About ClauseWise
    ClauseWise is an **AI-powered multilingual legal document assistant** that helps users:
    - Simplify complex legal clauses
    - Translate and listen in **10+ languages**
    - Analyze fairness visually
    - Ask questions interactively in any supported language

    **Supported Languages:**
    English, French, Spanish, German, Hindi, Tamil, Telugu, Kannada, Marathi, Gujarati, Bengali

    **Disclaimer:**
    ClauseWise provides educational insights only and does not offer legal advice.
    """)

st.markdown("<p style='text-align:center; color:gray;'>Β© 2025 ClauseWise | Multilingual Legal AI Assistant</p>", unsafe_allow_html=True)