Spaces:

bhoomi19
/

clausewise

Runtime error

App Files Files Community

bhoomi19 committed on Nov 5, 2025

Commit

7dd21c8

verified ·

1 Parent(s): e631f5a

Update app.py

Browse files

Files changed (1) hide show

app.py +247 -343

app.py CHANGED Viewed

@@ -1,361 +1,265 @@
-import streamlit as st
 import os
 import re
-import json
-from typing import List, Dict
-import torch
-from transformers import pipeline
-from pypdf import PdfReader
-import docx
 import io
-# Set page config FIRST - this is critical for Streamlit
-st.set_page_config(
-    page_title="ClauseWise Legal Assistant",
-    page_icon="⚖️",
-    layout="wide"
 )
-# Use a small, reliable model
-MODEL_ID = "microsoft/DialoGPT-small"  # 334M parameters - fits in Spaces memory
-@st.cache_resource(show_spinner=False)
-def load_model():
-    """Load a small model that actually works on Spaces"""
-    try:
-        # Use a simple pipeline - much more memory efficient
-        generator = pipeline(
-            "text-generation",
-            model=MODEL_ID,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="auto" if torch.cuda.is_available() else None,
-            max_length=512
-        )
-        return generator
-    except Exception as e:
-        st.error(f"Model loading failed: {e}")
-        return None
-def simple_llm_generate(prompt: str, max_length=200) -> str:
-    """Simple generation with error handling"""
-    generator = load_model()
-    if generator is None:
-        return "Model not available. Using demo mode."
     try:
-        result = generator(
-            prompt,
-            max_length=max_length,
-            num_return_sequences=1,
-            temperature=0.7,
-            do_sample=True,
-            pad_token_id=50256
-        )
-        generated = result[0]['generated_text']
-        # Remove the prompt from the response
-        if generated.startswith(prompt):
-            return generated[len(prompt):].strip()
-        return generated.strip()
-    except Exception as e:
-        return f"Generation error: {str(e)}"
-# Document loading functions
-def load_text_from_pdf(file_obj):
     try:
-        # Read the file content
-        file_content = file_obj.read()
-        file_obj.seek(0)  # Reset file pointer
-        reader = PdfReader(io.BytesIO(file_content))
-        text = ""
-        for page in reader.pages:
-            page_text = page.extract_text()
-            if page_text:
-                text += page_text + "\n"
-        return text.strip()
     except Exception as e:
-        return f"Error reading PDF: {str(e)}"
-def load_text_from_docx(file_obj):
     try:
-        file_content = file_obj.read()
-        file_obj.seek(0)
-        doc = docx.Document(io.BytesIO(file_content))
-        return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
-    except Exception as e:
-        return f"Error reading DOCX: {str(e)}"
-def load_text_from_txt(file_obj):
     try:
-        file_content = file_obj.read()
-        file_obj.seek(0)
-        if isinstance(file_content, bytes):
-            return file_content.decode('utf-8', errors='ignore')
-        return str(file_content)
-    except Exception as e:
-        return f"Error reading TXT: {str(e)}"
-def load_document(file):
-    """Universal document loader"""
-    if not file:
-        return ""
-    filename = file.name.lower()
-    if filename.endswith('.pdf'):
-        return load_text_from_pdf(file)
-    elif filename.endswith('.docx'):
-        return load_text_from_docx(file)
-    elif filename.endswith('.txt'):
-        return load_text_from_txt(file)
-    else:
-        # Try all formats
-        for loader in [load_text_from_pdf, load_text_from_docx, load_text_from_txt]:
-            try:
-                result = loader(file)
-                if result and not result.startswith("Error"):
-                    return result
-            except:
-                continue
-        return "Could not read document"
-# FIXED regex patterns - simple and working
-def extract_clauses_simple(text: str) -> List[str]:
-    """Simple clause extraction using reliable regex"""
-    if not text:
-        return []
-    # Multiple splitting strategies
-    clauses = []
-    # Strategy 1: Split by common clause separators
-    clauses1 = re.split(r'[.;!?]\s+', text)
-    # Strategy 2: Split by line breaks followed by numbers or bullets
-    clauses2 = re.split(r'\n\s*(?:\d+\.|\*|\-)\s+', text)
-    # Strategy 3: Split by section markers
-    clauses3 = re.split(r'\n\s*[A-Z][A-Za-z\s]+\:', text)
-    # Combine all strategies and clean up
-    all_clauses = clauses1 + clauses2 + clauses3
-    cleaned_clauses = []
-    for clause in all_clauses:
-        clause = clause.strip()
-        # Only keep meaningful clauses
-        if (len(clause) > 30 and
-            len(clause) < 1000 and
-            not clause.isspace()):
-            # Simple deduplication
-            simple_clause = re.sub(r'\s+', ' ', clause.lower())
-            if simple_clause not in [re.sub(r'\s+', ' ', c.lower()) for c in cleaned_clauses]:
-                cleaned_clauses.append(clause)
-    return cleaned_clauses[:20]  # Limit to 20 clauses
-def rule_based_analysis(text):
-    """Rule-based analysis without AI"""
-    results = {}
-    # Basic statistics
-    results['character_count'] = len(text)
-    results['word_count'] = len(text.split())
-    # Clause analysis
-    clauses = extract_clauses_simple(text)
-    results['clauses_found'] = len(clauses)
-    # Risk word detection
-    risk_words = {
-        'high_risk': ['liable', 'indemnify', 'damages', 'breach', 'termination', 'penalty'],
-        'medium_risk': ['confidential', 'proprietary', 'warranty', 'obligation'],
-        'low_risk': ['agree', 'party', 'contract', 'term']
-    }
-    found_risks = {}
-    text_lower = text.lower()
-    for risk_level, words in risk_words.items():
-        found = [word for word in words if word in text_lower]
-        if found:
-            found_risks[risk_level] = found
-    results['risk_terms'] = found_risks
-    # Simple document type detection
-    text_lower = text.lower()
-    doc_type_scores = {
-        "Non-Disclosure Agreement": len(re.findall(r'confidential|non.?disclosure|nda', text_lower)),
-        "Employment Contract": len(re.findall(r'employ|salary|duties|terminat', text_lower)),
-        "Lease Agreement": len(re.findall(r'lease|tenant|rent|property', text_lower)),
-        "Service Agreement": len(re.findall(r'service|provider|client|deliverable', text_lower)),
-        "Sales Agreement": len(re.findall(r'sale|purchase|price|payment', text_lower))
-    }
-    best_type = max(doc_type_scores.items(), key=lambda x: x[1])
-    results['doc_type'] = best_type[0] if best_type[1] > 0 else "General Contract"
-    results['doc_type_confidence'] = min(100, best_type[1] * 20)  # Simple confidence score
-    return results, clauses
-# Initialize session state
-if 'text_data' not in st.session_state:
-    st.session_state.text_data = ""
-if 'analysis_results' not in st.session_state:
-    st.session_state.analysis_results = {}
-if 'clauses' not in st.session_state:
-    st.session_state.clauses = []
-# UI Layout
-st.title("⚖️ ClauseWise Legal Assistant")
-st.markdown("**Lightweight legal document analysis**")
-# Sidebar
-with st.sidebar:
-    st.header("📁 Document Input")
-    uploaded_file = st.file_uploader(
-        "Upload Document",
-        type=["pdf", "docx", "txt"],
-        help="Supported formats: PDF, Word, Text"
     )
-    pasted_text = st.text_area("Or paste text below:", height=150, placeholder="Paste your legal text here...")
-    process_btn = st.button("📊 Analyze Document", type="primary", use_container_width=True)
-    if process_btn:
-        if uploaded_file:
-            with st.spinner("Reading document..."):
-                st.session_state.text_data = load_document(uploaded_file)
-        elif pasted_text.strip():
-            st.session_state.text_data = pasted_text.strip()
-        else:
-            st.error("Please upload a file or paste some text")
-        if st.session_state.text_data and not st.session_state.text_data.startswith("Error"):
-            st.success(f"✅ Loaded {len(st.session_state.text_data)} characters")
-            with st.spinner("Analyzing content..."):
-                st.session_state.analysis_results, st.session_state.clauses = rule_based_analysis(st.session_state.text_data)
-        else:
-            st.error("Failed to load document text")
-# Main content area
-if st.session_state.text_data and not st.session_state.text_data.startswith("Error"):
-    # Document preview
-    with st.expander("📄 Document Preview", expanded=False):
-        preview_text = st.session_state.text_data
-        if len(preview_text) > 1500:
-            st.text_area("", preview_text[:1500] + "...", height=200, label_visibility="collapsed")
-            st.caption(f"Preview truncated. Full document: {len(preview_text)} characters")
-        else:
-            st.text_area("", preview_text, height=200, label_visibility="collapsed")
-    # Analysis results
-    if st.session_state.analysis_results:
-        results = st.session_state.analysis_results
-        st.subheader("📊 Analysis Results")
-        # Key metrics
-        col1, col2, col3, col4 = st.columns(4)
-        with col1:
-            st.metric("Document Type", results['doc_type'])
-        with col2:
-            st.metric("Confidence", f"{results['doc_type_confidence']}%")
-        with col3:
-            st.metric("Clauses Found", results['clauses_found'])
-        with col4:
-            st.metric("Word Count", results['word_count'])
-        # Risk analysis
-        if results['risk_terms']:
-            st.subheader("⚠️ Risk Analysis")
-            for risk_level, terms in results['risk_terms'].items():
-                risk_display = risk_level.replace('_', ' ').title()
-                color = {
-                    'high_risk': 'red',
-                    'medium_risk': 'orange',
-                    'low_risk': 'green'
-                }.get(risk_level, 'gray')
-                st.write(f"**{risk_display}**: {', '.join(terms)}")
-        # Clauses display
-        if st.session_state.clauses:
-            st.subheader(f"📑 Extracted Clauses ({len(st.session_state.clauses)})")
-            for i, clause in enumerate(st.session_state.clauses[:10], 1):
-                with st.expander(f"Clause {i} ({len(clause)} chars)"):
-                    st.write(clause)
-            if len(st.session_state.clauses) > 10:
-                st.info(f"Showing first 10 of {len(st.session_state.clauses)} clauses")
-        # AI Analysis Section (optional)
-        st.subheader("🤖 AI Analysis (Optional)")
-        if st.button("Generate AI Summary", key="ai_summary"):
-            if len(st.session_state.text_data) > 100:
-                with st.spinner("AI is analyzing..."):
-                    prompt = f"Provide a concise summary of this legal document:\n\n{st.session_state.text_data[:1000]}"
-                    ai_summary = simple_llm_generate(prompt, max_length=300)
-                    st.write(ai_summary)
             else:
-                st.warning("Document too short for AI analysis")
-else:
-    # Welcome screen
-    st.markdown("""
-    ## 👋 Welcome to ClauseWise!
-    A lightweight legal document analyzer optimized for Hugging Face Spaces.
-    ### 🚀 How to use:
-    1. **Upload a document** (PDF, DOCX, TXT) in the sidebar **OR**
-    2. **Paste your legal text** in the text area
-    3. Click **"Analyze Document"** to process
-    4. Review the automated analysis results
-    ### 📋 What it analyzes:
-    - **Document type** (NDA, Employment, Lease, etc.)
-    - **Risk terms** and potential issues
-    - **Clause extraction** and organization
-    - **Basic statistics** and metrics
-    ### 🧪 Try this sample text:
-    ```
-    This Non-Disclosure Agreement (the "Agreement") is entered into between
-    Company ABC ("Disclosing Party") and John Smith ("Receiving Party").
-    The Receiving Party agrees to maintain the confidentiality of all
-    proprietary information disclosed under this Agreement for a period
-    of three years following termination. Any breach of this Agreement
-    may result in legal action and liability for damages.
-    ```
-    ### ⚠️ Important Notes:
-    - Uses rule-based analysis for reliability
-    - Optional AI features use small, fast models
-    - Works best with clear legal text
-    - Free and open source
-    """)
-# Footer
-st.markdown("---")
-st.caption("🔒 ClauseWise Demo | Optimized for Hugging Face Spaces | No data stored")
-# Add some custom CSS to make it look nicer
-st.markdown("""
-<style>
-    .main .block-container {
-        padding-top: 2rem;
-    }
-    .stButton button {
-        width: 100%;
-    }
-</style>
-""", unsafe_allow_html=True)

 import os
 import re
 import io
+import tempfile
+import torch
+import pandas as pd
+import plotly.express as px
+import streamlit as st
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM,
+    pipeline
 )
+from PyPDF2 import PdfReader
+from docx import Document
+from gtts import gTTS
+from io import BytesIO
+import spacy
+import subprocess
+# -----------------------------
+# Hugging Face fix: ensure Streamlit runs properly
+# -----------------------------
+# When the Space starts this file with plain `python app.py`, relaunch it under
+# `streamlit run`. Streamlit itself executes scripts with
+# __name__ == "__main__", so the relaunch is skipped once a Streamlit runtime
+# exists; otherwise every relaunched process would spawn another one.
+from streamlit import runtime as _st_runtime
+if (
+    __name__ == "__main__"
+    and os.environ.get("SYSTEM") == "spaces"
+    and not _st_runtime.exists()
+):
+    # Wait for the server instead of detaching from it, and hand its exit
+    # status back to whoever started this process.
+    raise SystemExit(
+        subprocess.call(
+            ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
+        )
+    )
+# -----------------------------
+# Page config
+# -----------------------------
+# Browser-tab title, icon and wide layout for the Streamlit page.
+st.set_page_config(
+    page_title="⚖ ClauseWise",
+    page_icon="⚖",
+    layout="wide",
+)
+# -----------------------------
+# Language Map
+# -----------------------------
+# Display name -> language code. translate_text() and text_to_speech() look
+# codes up here and fall back to "en" for unknown names.
+LANG_MAP = {
+    "English": "en",
+    "French": "fr",
+    "Spanish": "es",
+    "German": "de",
+    "Hindi": "hi",
+    "Tamil": "ta",
+    "Telugu": "te",
+    "Kannada": "kn",
+    "Marathi": "mr",
+    "Gujarati": "gu",
+    "Bengali": "bn",
+}
+# Display names in insertion order; used as the options of the select boxes.
+LANG_NAMES = list(LANG_MAP)
+# -----------------------------
+# Model Loading (cached)
+# -----------------------------
+@st.cache_resource
+def load_models():
+    """Load all models used by the app (wrapped in ``st.cache_resource``).
+
+    Returns:
+        A 7-tuple ``(tokenizer_simplify, simplify_model, gen_tokenizer,
+        gen_model, nlp, classifier, summarizer)``.
+    """
+    # Seq2seq model behind clause_simplification().
+    t5_name = "mrm8488/t5-small-finetuned-text-simplification"
+    t5_tokenizer = AutoTokenizer.from_pretrained(t5_name)
+    t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_name)
+    # Causal LM behind chat_response().
+    # NOTE(review): trust_remote_code=True lets the model repository run its
+    # own Python code on load - confirm this is intended.
+    phi_id = "microsoft/phi-2"
+    phi_tokenizer = AutoTokenizer.from_pretrained(phi_id, trust_remote_code=True)
+    phi_model = AutoModelForCausalLM.from_pretrained(phi_id, trust_remote_code=True)
+    # ✅ Load SpaCy; when spacy.load raises OSError the package is downloaded
+    # and the load is retried once.
     try:
+        spacy_nlp = spacy.load("en_core_web_sm")
+    except OSError:
+        from spacy.cli import download
+        download("en_core_web_sm")
+        spacy_nlp = spacy.load("en_core_web_sm")
+    # NOTE(review): nlp, classifier and summarizer are not referenced by any
+    # other code visible in this file - verify they are still needed.
+    zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+    bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+    return (
+        t5_tokenizer,
+        t5_model,
+        phi_tokenizer,
+        phi_model,
+        spacy_nlp,
+        zero_shot,
+        bart_summarizer,
+    )
+(
+    tokenizer_simplify,
+    simplify_model,
+    gen_tokenizer,
+    gen_model,
+    nlp,
+    classifier,
+    summarizer,
+) = load_models()
+# Run the chat model on the GPU when one is available, otherwise on the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+gen_model.to(DEVICE)
+# -----------------------------
+# Utility Functions
+# -----------------------------
+def extract_text(file):
+    """Return the plain text of an uploaded PDF, DOCX or text file.
+
+    The upload is parsed from memory; nothing is written to disk.
+
+    Args:
+        file: Object exposing ``name`` and ``read()`` (for example the value
+            returned by ``st.file_uploader``), or a falsy value.
+
+    Returns:
+        The extracted text with surrounding whitespace stripped. An empty
+        string when ``file`` is falsy. If parsing fails, the error is shown
+        with ``st.error`` and whatever was extracted so far is returned.
+    """
+    if not file:
+        return ""
+    name = file.name.lower()
+    # Rewind first in case the stream was already consumed by an earlier read.
+    if hasattr(file, "seek"):
+        file.seek(0)
+    data = file.read()
+    parts = []
     try:
+        if name.endswith(".pdf"):
+            reader = PdfReader(BytesIO(data))
+            for page in reader.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    parts.append(page_text + "\n")
+        elif name.endswith(".docx"):
+            doc = Document(BytesIO(data))
+            parts.append("\n".join(p.text for p in doc.paragraphs if p.text.strip()))
+        else:
+            # TextIOWrapper decodes like a text-mode open(): UTF-8, undecodable
+            # bytes dropped, universal newlines.
+            with io.TextIOWrapper(BytesIO(data), encoding="utf-8", errors="ignore") as f:
+                parts.append(f.read())
     except Exception as e:
+        st.error(f"Error reading file: {e}")
+    return "".join(parts).strip()
+@st.cache_resource(show_spinner=False, max_entries=2)
+def _get_translator(lang_code):
+    """Build the English -> ``lang_code`` translation pipeline (cached, at most two kept)."""
+    return pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")
+def translate_text(text, target_lang):
+    """Translate English ``text`` into ``target_lang`` on a best-effort basis.
+
+    Args:
+        text: Text to translate; only the first 1000 characters are sent to
+            the model.
+        target_lang: Display name of the target language, a key of
+            ``LANG_MAP``. Unknown names are treated as English.
+
+    Returns:
+        ``""`` for empty input; ``text`` unchanged when the target is English
+        or when loading/running the translation model fails; otherwise the
+        translated text.
+    """
+    if not text:
+        return ""
+    lang_code = LANG_MAP.get(target_lang, "en")
+    if lang_code == "en":
+        return text
     try:
+        # Reuse the cached pipeline instead of reloading the model per call.
+        translator = _get_translator(lang_code)
+        return translator(text[:1000])[0]["translation_text"]
+    except Exception:
+        # Deliberate fallback: show the untranslated text rather than fail.
+        return text
+def text_to_speech(text, lang):
+    """Synthesize speech for the first 1000 characters of ``text`` with gTTS.
+
+    Args:
+        text: Text to speak.
+        lang: Display name of the language, a key of ``LANG_MAP``; unknown
+            names fall back to ``"en"``.
+
+    Returns:
+        A ``BytesIO`` positioned at the start of the audio data, or ``None``
+        if anything goes wrong.
+    """
     try:
+        buffer = BytesIO()
+        gTTS(text=text[:1000], lang=LANG_MAP.get(lang, "en")).write_to_fp(buffer)
+        buffer.seek(0)
+    except Exception:
+        return None
+    return buffer
+def clause_simplification(text, mode):
+    """Rewrite the first 500 characters of ``text`` with the simplification model.
+
+    Args:
+        text: Clause text to rewrite.
+        mode: One of ``"Simplified"``, ``"Explain like I'm 5"`` or
+            ``"Professional"``; any other value uses the ``"simplify: "``
+            prefix.
+
+    Returns:
+        The decoded model output with special tokens removed.
+    """
+    if mode == "Explain like I'm 5":
+        instruction = "explain like I'm 5: "
+    elif mode == "Professional":
+        instruction = "rephrase professionally: "
+    else:
+        # Covers "Simplified" as well as any unrecognised mode.
+        instruction = "simplify: "
+    encoded = tokenizer_simplify(
+        instruction + text[:500],
+        return_tensors="pt",
+        truncation=True,
+        max_length=512,
+    )
+    generated = simplify_model.generate(
+        **encoded,
+        max_length=256,
+        num_beams=4,
+        early_stopping=True,
+    )
+    return tokenizer_simplify.decode(generated[0], skip_special_tokens=True)
+def fairness_score_visual(text, lang):
+    """Render a keyword-based fairness score for ``text`` as a bar chart.
+
+    The score starts at 50, gains 5 per match of a "balanced" term, loses 5
+    per match of a "one-sided" term, and is clamped to 0..100.
+
+    Args:
+        text: Document text to scan (case-insensitive).
+        lang: Display name of the language for the caption below the chart.
+    """
+    balanced_terms = r"\b(mutual|both parties|shared|equal|fair|balanced)\b"
+    one_sided_terms = r"\b(sole|unilateral|exclusive right|one-sided|only)\b"
+    pos = sum(1 for _ in re.finditer(balanced_terms, text, re.I))
+    neg = sum(1 for _ in re.finditer(one_sided_terms, text, re.I))
+    score = min(100, max(0, 50 + 5 * pos - 5 * neg))
+    st.subheader("⚖ Fairness Balance Meter")
+    # NOTE(review): "Balanced" and "Party B Favored" always get the same value
+    # (score); only "Party A Favored" differs (100 - score). Confirm that this
+    # is the intended breakdown.
+    fairness_df = pd.DataFrame({
+        "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
+        "Score": [100 - score, score, score],
+    })
+    fig = px.bar(
+        fairness_df,
+        x="Score",
+        y="Aspect",
+        orientation="h",
+        text="Score",
+        color="Aspect",
+        color_discrete_sequence=["#ff6b6b", "#4ecdc4", "#95e1d3"],
     )
+    fig.update_layout(showlegend=False, xaxis_title="Score", yaxis_title="", height=300)
+    st.plotly_chart(fig, use_container_width=True)
+    caption = f"Fairness Score: {score}% (Approximate)"
+    st.info(translate_text(caption, lang))
+def chat_response(prompt, lang, history):
+    """Generate a chat reply that takes the recent conversation into account.
+
+    Args:
+        prompt: The user's current question.
+        lang: Display name of the language the reply is translated into.
+        history: Sequence of ``(user_message, ai_reply)`` pairs; only the
+            last three are included in the model prompt.
+
+    Returns:
+        The model's reply, passed through ``translate_text``.
+    """
+    # Combine chat history context (last 3 exchanges).
+    context = "\n".join(f"User: {u}\nAI: {a}" for u, a in history[-3:])
+    full_prompt = f"You are a helpful multilingual legal assistant. {context}\nUser: {prompt}\nAI:"
+    inputs = gen_tokenizer(full_prompt, return_tensors="pt").to(DEVICE)
+    outputs = gen_model.generate(
+        **inputs,
+        max_new_tokens=200,
+        temperature=0.7,
+        top_p=0.9,
+        do_sample=True,
+        # Explicit padding id; the EOS token is reused for it.
+        pad_token_id=gen_tokenizer.eos_token_id,
+    )
+    # The returned sequence starts with the prompt tokens. Decode only what
+    # was generated after them, so an "AI:" inside the user's text or the
+    # history cannot be mistaken for the start of the reply.
+    prompt_len = inputs["input_ids"].shape[1]
+    response = gen_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
+    # Drop any follow-up turn the model invented after its own answer.
+    response = response.split("\nUser:")[0].strip()
+    return translate_text(response, lang)
+# -----------------------------
+# Main Streamlit App
+# -----------------------------
+def main():
+    """Render the ClauseWise page: analyzer, translate/audio, chatbot and about tabs."""
+    st.title("⚖ ClauseWise: Multilingual Legal AI Assistant")
+    st.markdown("Simplify, translate, and analyze legal documents with AI — in your language.")
+    st.divider()
+    tab1, tab2, tab3, tab4 = st.tabs(["📄 Analyzer", "🌐 Translate & Audio", "💬 Chatbot", "ℹ About"])
+    with tab1:
+        st.subheader("📁 Upload or Paste Legal Document")
+        lang = st.selectbox("Select Language:", LANG_NAMES, index=0)
+        file = st.file_uploader("Upload a Legal Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
+        text_input = st.text_area("Or Paste Text Here:", height=200)
+        if file or text_input:
+            # An uploaded file takes precedence over pasted text.
+            text = extract_text(file) if file else text_input
+            if not text:
+                st.warning("No content found.")
             else:
+                mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
+                if st.button("🧾 Simplify Clauses"):
+                    with st.spinner("Simplifying..."):
+                        simplified = clause_simplification(text, mode)
+                        translated = translate_text(simplified, lang)
+                        st.success(translated)
+                        audio = text_to_speech(translated, lang)
+                        if audio:
+                            st.audio(audio, format="audio/mp3")
+                if st.button("⚖ Fairness Analysis"):
+                    fairness_score_visual(text, lang)
+    with tab2:
+        st.subheader("🌐 Translate & Listen")
+        text_input = st.text_area("Enter text:", height=200)
+        lang = st.selectbox("Translate to:", LANG_NAMES, index=4)
+        if st.button("Translate"):
+            translated = translate_text(text_input, lang)
+            st.success(translated)
+        if st.button("🎧 Generate Audio"):
+            # Speak the translation: reading the untranslated input with the
+            # target language's voice would not match the selected language.
+            audio = text_to_speech(translate_text(text_input, lang), lang)
+            if audio:
+                st.audio(audio, format="audio/mp3")
+    with tab3:
+        st.subheader("💬 Chat with ClauseWise (Memory Enabled)")
+        lang = st.selectbox("Chat Language:", LANG_NAMES, index=0)
+        query = st.text_area("Ask your question:", height=150)
+        # Maintain persistent conversation
+        if "chat_history" not in st.session_state:
+            st.session_state.chat_history = []
+        if st.button("Ask"):
+            if query.strip():
+                with st.spinner("Thinking..."):
+                    response = chat_response(query, lang, st.session_state.chat_history)
+                    st.session_state.chat_history.append((query, response))
+                    st.success(response)
+                    audio = text_to_speech(response, lang)
+                    if audio:
+                        st.audio(audio, format="audio/mp3")
+        # Reserve the history slot above the "Clear Chat" button but fill it
+        # only after the button has been handled, so a cleared conversation
+        # is not drawn again during the same run.
+        history_area = st.container()
+        if st.button("Clear Chat"):
+            st.session_state.chat_history = []
+            st.info("Chat cleared.")
+        # Display conversation history (last 5 exchanges)
+        if st.session_state.chat_history:
+            with history_area:
+                st.markdown("### 🧠 Chat History")
+                for q, a in st.session_state.chat_history[-5:]:
+                    st.markdown(f"*You:* {q}")
+                    st.markdown(f"*ClauseWise:* {a}")
+    with tab4:
+        st.markdown("""
+        ### ⚖ About ClauseWise
+        ClauseWise is a multilingual AI-powered legal assistant that helps users:
+        - Simplify legal language
+        - Translate and listen in 10+ languages
+        - Assess fairness visually
+        - Chat interactively with memory
+        ---
+        *Disclaimer:* Educational use only — not legal advice.
+        """)
+# Script entry point. The dunder names need double underscores; `_name_` is
+# an undefined variable and raises NameError as soon as the file is executed.
+if __name__ == "__main__":
+    main()