deepthi6 commited on
Commit
4612bd8
·
verified ·
1 Parent(s): 55ae994

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +206 -80
app.py CHANGED
@@ -38,38 +38,52 @@ LANG_NAMES = list(LANG_MAP.keys())
38
  # -----------------------------
39
  @st.cache_resource
40
  def load_models():
41
- simplify_model_name = "mrm8488/t5-small-finetuned-text-simplification"
42
- tokenizer_simplify = AutoTokenizer.from_pretrained(simplify_model_name)
43
- simplify_model = AutoModelForSeq2SeqLM.from_pretrained(simplify_model_name)
 
 
44
 
45
- gen_model_id = "microsoft/phi-2"
46
- gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_id)
47
- gen_model = AutoModelForCausalLM.from_pretrained(gen_model_id)
48
 
49
- # ✅ Auto-download SpaCy if missing
50
- try:
51
- nlp = spacy.load("en_core_web_sm")
52
- except OSError:
53
- from spacy.cli import download
54
- download("en_core_web_sm")
55
- nlp = spacy.load("en_core_web_sm")
 
 
 
 
 
 
 
 
56
 
57
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
58
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
59
- return tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer
 
 
60
 
61
- tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer = load_models()
62
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
63
- gen_model.to(DEVICE)
 
64
 
65
  # -----------------------------
66
  # UTILITIES
67
  # -----------------------------
68
  def extract_text(file):
 
69
  if not file:
70
  return ""
71
  name = file.name.lower()
72
- with tempfile.NamedTemporaryFile(delete=False) as tmp:
73
  tmp.write(file.read())
74
  tmp_path = tmp.name
75
  text = ""
@@ -82,67 +96,153 @@ def extract_text(file):
82
  text += t + "\n"
83
  elif name.endswith(".docx"):
84
  doc = Document(tmp_path)
85
- text = "\n".join([p.text for p in doc.paragraphs])
86
  else:
87
- text = open(tmp_path, "r", encoding="utf-8", errors="ignore").read()
 
88
  except Exception as e:
89
  st.error(f"Error reading file: {e}")
90
  finally:
91
- os.remove(tmp_path)
 
92
  return text.strip()
93
 
94
  def translate_text(text, target_lang):
95
- lang_code = LANG_MAP[target_lang]
 
 
 
 
96
  if lang_code == "en":
97
  return text
 
98
  try:
 
 
99
  translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")
100
- return translator(text[:1000])[0]["translation_text"]
101
- except Exception:
102
- return f"(Translation unavailable for {target_lang})"
 
 
103
 
104
  def text_to_speech(text, lang):
 
 
 
 
105
  try:
106
- lang_code = LANG_MAP[lang]
107
- tts = gTTS(text=text, lang=lang_code)
 
 
108
  audio_fp = BytesIO()
109
  tts.write_to_fp(audio_fp)
110
  audio_fp.seek(0)
111
  return audio_fp
112
- except Exception:
113
- st.warning("Audio unavailable for this language.")
114
  return None
115
 
116
  def clause_simplification(text, mode):
117
- prefix = {
 
 
 
 
118
  "Simplified": "simplify: ",
119
  "Explain like I'm 5": "explain like I'm 5: ",
120
  "Professional": "rephrase professionally: "
121
- }.get(mode, "simplify: ")
122
- inputs = tokenizer_simplify(prefix + text, return_tensors="pt", truncation=True, max_length=512)
123
- outputs = simplify_model.generate(**inputs, max_length=256, num_beams=4, early_stopping=True)
124
- return tokenizer_simplify.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  def fairness_score_visual(text, lang):
127
- pos = len(re.findall(r"(mutual|both parties|shared)", text, re.I))
128
- neg = len(re.findall(r"(sole|unilateral|exclusive right)", text, re.I))
129
- score = max(0, min(100, 70 + pos - 2 * neg))
 
 
 
 
 
 
130
 
131
  st.subheader("⚖️ Fairness Balance Meter")
 
132
  fairness_df = pd.DataFrame({
133
  "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
134
- "Score": [100 - score, score // 2, score]
135
  })
136
- fig = px.bar(fairness_df, x="Score", y="Aspect", orientation="h", text="Score", color="Aspect")
137
- fig.update_layout(showlegend=False, xaxis_title="Score", yaxis_title="")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  st.plotly_chart(fig, use_container_width=True)
139
- st.info(translate_text(f"Fairness Score: {score}% (Approximate)", lang))
 
 
 
 
140
 
141
  def chat_response(prompt, lang):
142
- inputs = gen_tokenizer(prompt, return_tensors="pt").to(DEVICE)
143
- outputs = gen_model.generate(**inputs, max_new_tokens=300, temperature=0.7, top_p=0.9, do_sample=True)
144
- response = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
145
- return translate_text(response, lang)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  # -----------------------------
148
  # MAIN STREAMLIT APP FUNCTION
@@ -159,72 +259,98 @@ def main():
159
  # TAB 1: ANALYZER
160
  with tab1:
161
  st.subheader("📁 Upload or Paste Legal Document")
162
- lang = st.selectbox("Select Language:", LANG_NAMES, index=0)
163
  file = st.file_uploader("Upload a Legal Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
164
- text_input = st.text_area("Or Paste Text Here:", height=200)
165
 
166
  if file or text_input:
167
  text = extract_text(file) if file else text_input
168
 
169
- mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
 
 
 
170
 
171
- if st.button("🧾 Simplify Clauses"):
172
- with st.spinner("Simplifying..."):
173
- simplified = clause_simplification(text, mode)
174
- translated = translate_text(simplified, lang)
175
- st.success(translated)
176
- audio_data = text_to_speech(translated, lang)
177
- if audio_data:
178
- st.audio(audio_data, format="audio/mp3")
 
179
 
180
- if st.button("⚖️ Fairness Analysis"):
181
- fairness_score_visual(text, lang)
 
182
 
183
  # TAB 2: TRANSLATION + AUDIO
184
  with tab2:
185
  st.subheader("🌐 Translate & Listen")
186
- text_input = st.text_area("Enter text:", height=200)
187
- lang = st.selectbox("Translate to:", LANG_NAMES, index=4)
188
 
189
  if st.button("Translate"):
190
- translated = translate_text(text_input, lang)
191
- st.success(translated)
 
 
 
 
 
192
  if st.button("🎧 Generate Audio"):
193
- audio_data = text_to_speech(text_input, lang)
194
- if audio_data:
195
- st.audio(audio_data, format="audio/mp3")
 
 
 
 
196
 
197
  # TAB 3: CHATBOT
198
  with tab3:
199
  st.subheader("💬 Chat with ClauseWise (Multilingual)")
200
- lang = st.selectbox("Chat Language:", LANG_NAMES, index=4)
201
- query = st.text_area("Ask about clauses, fairness, or legal meaning:", height=150)
 
202
  if st.button("Ask"):
203
- with st.spinner("Thinking..."):
204
- response = chat_response(f"You are a legal assistant. Answer helpfully: {query}", lang)
205
- st.success(response)
206
- audio_data = text_to_speech(response, lang)
207
- if audio_data:
208
- st.audio(audio_data, format="audio/mp3")
 
 
 
 
209
 
210
  # TAB 4: ABOUT
211
  with tab4:
212
  st.markdown("""
213
  ### ⚖️ About ClauseWise
214
  ClauseWise is a multilingual AI-powered legal assistant that helps users:
215
- - Simplify complex clauses
216
- - Translate and listen in 10+ languages
217
- - Assess fairness visually
218
- - Chat interactively
219
 
220
  **Languages Supported:**
221
  English, French, Spanish, German, Hindi, Tamil, Telugu, Kannada, Marathi, Gujarati, Bengali
222
 
223
- **Disclaimer:** Educational purposes only, not legal advice.
 
 
 
 
 
 
 
224
  """)
225
 
226
  # -----------------------------
227
  # RUN STREAMLIT APP SAFELY
228
  # -----------------------------
229
  if __name__ == "__main__":
230
- main()
 
# -----------------------------
@st.cache_resource
def load_models():
    """Load every model the app depends on.

    Returns a 7-tuple ``(tokenizer_simplify, simplify_model, gen_tokenizer,
    gen_model, nlp, classifier, summarizer)``, or a tuple of seven ``None``
    values when any download/initialization fails.
    """
    try:
        # T5 model + tokenizer used for clause simplification.
        t5_id = "mrm8488/t5-small-finetuned-text-simplification"
        simplify_tok = AutoTokenizer.from_pretrained(t5_id)
        simplify_mdl = AutoModelForSeq2SeqLM.from_pretrained(t5_id)

        # Phi-2 causal LM powering the chatbot tab.
        # NOTE(review): trust_remote_code=True executes model-repo code — keep
        # the model id pinned to a trusted source.
        phi_id = "microsoft/phi-2"
        chat_tok = AutoTokenizer.from_pretrained(phi_id, trust_remote_code=True)
        chat_mdl = AutoModelForCausalLM.from_pretrained(phi_id, trust_remote_code=True)

        # SpaCy English pipeline; downloaded on first run if it is missing.
        try:
            nlp = spacy.load("en_core_web_sm")
        except OSError:
            from spacy.cli import download
            download("en_core_web_sm")
            nlp = spacy.load("en_core_web_sm")

        zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        summarize = pipeline("summarization", model="facebook/bart-large-cnn")

        return simplify_tok, simplify_mdl, chat_tok, chat_mdl, nlp, zero_shot, summarize
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return (None,) * 7

# Load models once (st.cache_resource memoizes across reruns) and bail out of
# the script early if loading failed.
model_data = load_models()
if model_data[0] is None:
    st.error("Failed to load models. Please check your internet connection and try again.")
    st.stop()

tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer = model_data
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if gen_model is not None:
    gen_model.to(DEVICE)
78
  # -----------------------------
79
  # UTILITIES
80
  # -----------------------------
81
  def extract_text(file):
82
+ """Extract text from uploaded file"""
83
  if not file:
84
  return ""
85
  name = file.name.lower()
86
+ with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(name)[1]) as tmp:
87
  tmp.write(file.read())
88
  tmp_path = tmp.name
89
  text = ""
 
96
  text += t + "\n"
97
  elif name.endswith(".docx"):
98
  doc = Document(tmp_path)
99
+ text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
100
  else:
101
+ with open(tmp_path, "r", encoding="utf-8", errors="ignore") as f:
102
+ text = f.read()
103
  except Exception as e:
104
  st.error(f"Error reading file: {e}")
105
  finally:
106
+ if os.path.exists(tmp_path):
107
+ os.remove(tmp_path)
108
  return text.strip()
109
 
110
def translate_text(text, target_lang):
    """Translate English *text* into the language named by *target_lang*.

    Args:
        text: Source text (assumed English). Falsy input yields "".
        target_lang: Display name from LANG_NAMES; unknown names fall back
            to English (i.e. the text is returned unchanged).

    Returns:
        The translated text; the original text when the target is English or
        when translation fails; "" for empty input.
    """
    if not text:
        return ""

    lang_code = LANG_MAP.get(target_lang, "en")
    if lang_code == "en":
        return text

    try:
        # Cache one translation pipeline per language code: the previous code
        # re-instantiated the heavy Helsinki-NLP model on every single call.
        cache = getattr(translate_text, "_translators", None)
        if cache is None:
            cache = translate_text._translators = {}
        translator = cache.get(lang_code)
        if translator is None:
            translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")
            cache[lang_code] = translator

        # Truncate text to manageable size for the model.
        result = translator(text[:500], max_length=512)
        return result[0]["translation_text"]
    except Exception as e:
        st.warning(f"Translation unavailable for {target_lang}: {str(e)}")
        return text  # Return original text if translation fails
128
 
129
def text_to_speech(text, lang):
    """Render *text* as spoken MP3 audio via gTTS.

    Args:
        text: Text to synthesize; capped at 1000 characters for TTS.
        lang: Display name from LANG_NAMES; unknown names fall back to English.

    Returns:
        A BytesIO positioned at the start of the MP3 data, or None when the
        input is empty or synthesis fails.
    """
    if not text:
        return None
    try:
        code = LANG_MAP.get(lang, "en")
        speech = gTTS(text=text[:1000], lang=code, slow=False)  # limit length for TTS
        buffer = BytesIO()
        speech.write_to_fp(buffer)
        buffer.seek(0)
        return buffer
    except Exception as e:
        st.warning(f"Audio generation unavailable: {str(e)}")
        return None
146
 
147
def clause_simplification(text, mode):
    """Rewrite legal text with the T5 simplification model according to *mode*.

    Args:
        text: Clause text to rewrite; returned unchanged when empty or when
            the model failed to load.
        mode: One of "Simplified", "Explain like I'm 5", "Professional";
            unknown modes fall back to plain simplification.

    Returns:
        The rewritten clause, or the original text on any error.
    """
    if not text or simplify_model is None:
        return text

    prefix = {
        "Simplified": "simplify: ",
        "Explain like I'm 5": "explain like I'm 5: ",
        "Professional": "rephrase professionally: "
    }.get(mode, "simplify: ")

    try:
        # Let the tokenizer enforce the length limit: truncating at 512
        # *tokens* keeps far more context than the previous hard cut at
        # 500 *characters* (which was redundant with tokenizer truncation).
        inputs = tokenizer_simplify(
            prefix + text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        # Inference only — no_grad avoids building an autograd graph.
        with torch.no_grad():
            outputs = simplify_model.generate(
                **inputs,
                max_length=256,
                num_beams=4,
                early_stopping=True,
            )
        return tokenizer_simplify.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        st.error(f"Simplification error: {e}")
        return text
178
 
179
def _fairness_score(text):
    """Heuristic fairness score in [0, 100] from balanced vs. one-sided keywords."""
    pos = len(re.findall(r"\b(mutual|both parties|shared|equal|fair|balanced)\b", text, re.I))
    neg = len(re.findall(r"\b(sole|unilateral|exclusive right|one-sided|only)\b", text, re.I))
    # Start at a neutral 50 and move 5 points per keyword hit, clamped to [0, 100].
    return max(0, min(100, 50 + (pos * 5) - (neg * 5)))

def fairness_score_visual(text, lang):
    """Compute a keyword-based fairness score and render it in Streamlit.

    Args:
        text: Document text to analyze; warns and returns early when empty.
        lang: Target display language for the textual summary.
    """
    if not text:
        st.warning("No text to analyze.")
        return

    score = _fairness_score(text)

    st.subheader("⚖️ Fairness Balance Meter")

    # score is already clamped to [0, 100] by _fairness_score, so the extra
    # max(0, ...) / min(100, ...) wrappers previously applied here were dead code.
    fairness_df = pd.DataFrame({
        "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
        "Score": [100 - score, score, score]
    })

    fig = px.bar(
        fairness_df,
        x="Score",
        y="Aspect",
        orientation="h",
        text="Score",
        color="Aspect",
        color_discrete_sequence=["#ff6b6b", "#4ecdc4", "#95e1d3"]
    )
    fig.update_layout(
        showlegend=False,
        xaxis_title="Score",
        yaxis_title="",
        height=300
    )
    st.plotly_chart(fig, use_container_width=True)

    # Localize the textual summary for the user's selected language.
    fairness_text = f"Fairness Score: {score}% (Approximate - based on keyword analysis)"
    translated_result = translate_text(fairness_text, lang)
    st.info(translated_result)
218
 
219
def chat_response(prompt, lang):
    """Answer *prompt* with the Phi-2 model and translate the reply to *lang*.

    Returns a fixed fallback string when the prompt is empty, the model is
    unavailable, or generation raises.
    """
    if not prompt or gen_model is None:
        return "Unable to generate response. Please try again."

    try:
        framed = f"You are a helpful legal assistant. Answer the following question: {prompt}\n\nAnswer:"
        encoded = gen_tokenizer(framed, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
        generated = gen_model.generate(
            **encoded,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=gen_tokenizer.eos_token_id,
        )
        decoded = gen_tokenizer.decode(generated[0], skip_special_tokens=True)

        # Keep only the text after the final "Answer:" marker, if present.
        _, marker, tail = decoded.rpartition("Answer:")
        if marker:
            decoded = tail.strip()

        return translate_text(decoded, lang)
    except Exception as e:
        st.error(f"Chat error: {e}")
        return "I'm having trouble generating a response. Please try rephrasing your question."
246
 
247
  # -----------------------------
248
  # MAIN STREAMLIT APP FUNCTION
 
259
  # TAB 1: ANALYZER
260
  with tab1:
261
  st.subheader("📁 Upload or Paste Legal Document")
262
+ lang = st.selectbox("Select Language:", LANG_NAMES, index=0, key="analyzer_lang")
263
  file = st.file_uploader("Upload a Legal Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
264
+ text_input = st.text_area("Or Paste Text Here:", height=200, key="analyzer_text")
265
 
266
  if file or text_input:
267
  text = extract_text(file) if file else text_input
268
 
269
+ if not text.strip():
270
+ st.warning("Please provide some text to analyze.")
271
+ else:
272
+ mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
273
 
274
+ if st.button("🧾 Simplify Clauses"):
275
+ with st.spinner("Simplifying..."):
276
+ simplified = clause_simplification(text, mode)
277
+ translated = translate_text(simplified, lang)
278
+ st.success(translated)
279
+
280
+ audio_data = text_to_speech(translated, lang)
281
+ if audio_data:
282
+ st.audio(audio_data, format="audio/mp3")
283
 
284
+ if st.button("⚖️ Fairness Analysis"):
285
+ with st.spinner("Analyzing fairness..."):
286
+ fairness_score_visual(text, lang)
287
 
288
  # TAB 2: TRANSLATION + AUDIO
289
  with tab2:
290
  st.subheader("🌐 Translate & Listen")
291
+ text_input = st.text_area("Enter text:", height=200, key="translate_text")
292
+ lang = st.selectbox("Translate to:", LANG_NAMES, index=4, key="translate_lang")
293
 
294
  if st.button("Translate"):
295
+ if text_input.strip():
296
+ with st.spinner("Translating..."):
297
+ translated = translate_text(text_input, lang)
298
+ st.success(translated)
299
+ else:
300
+ st.warning("Please enter some text to translate.")
301
+
302
  if st.button("🎧 Generate Audio"):
303
+ if text_input.strip():
304
+ with st.spinner("Generating audio..."):
305
+ audio_data = text_to_speech(text_input, lang)
306
+ if audio_data:
307
+ st.audio(audio_data, format="audio/mp3")
308
+ else:
309
+ st.warning("Please enter some text for audio generation.")
310
 
311
  # TAB 3: CHATBOT
312
  with tab3:
313
  st.subheader("💬 Chat with ClauseWise (Multilingual)")
314
+ lang = st.selectbox("Chat Language:", LANG_NAMES, index=0, key="chat_lang")
315
+ query = st.text_area("Ask about clauses, fairness, or legal meaning:", height=150, key="chat_query")
316
+
317
  if st.button("Ask"):
318
+ if query.strip():
319
+ with st.spinner("Thinking..."):
320
+ response = chat_response(query, lang)
321
+ st.success(response)
322
+
323
+ audio_data = text_to_speech(response, lang)
324
+ if audio_data:
325
+ st.audio(audio_data, format="audio/mp3")
326
+ else:
327
+ st.warning("Please enter a question.")
328
 
329
  # TAB 4: ABOUT
330
  with tab4:
331
  st.markdown("""
332
  ### ⚖️ About ClauseWise
333
  ClauseWise is a multilingual AI-powered legal assistant that helps users:
334
+ - **Simplify complex clauses** into easy-to-understand language
335
+ - **Translate and listen** in 10+ languages
336
+ - **Assess fairness** visually with keyword analysis
337
+ - **Chat interactively** about legal concepts
338
 
339
  **Languages Supported:**
340
  English, French, Spanish, German, Hindi, Tamil, Telugu, Kannada, Marathi, Gujarati, Bengali
341
 
342
+ **Technologies Used:**
343
+ - Hugging Face Transformers (T5, Phi-2, BART)
344
+ - SpaCy for NLP
345
+ - Google Text-to-Speech (gTTS)
346
+ - Plotly for visualizations
347
+
348
+ **⚠️ Disclaimer:** This tool is for educational purposes only and does not constitute legal advice.
349
+ Always consult with a qualified legal professional for legal matters.
350
  """)
351
 
352
  # -----------------------------
353
  # RUN STREAMLIT APP SAFELY
354
  # -----------------------------
355
  if __name__ == "__main__":
356
+ main()