# ClauseWise — Streamlit app. (The original paste carried Hugging Face
# Spaces error-page residue here: "Spaces:" / "Runtime error" — not code.)
# Standard library
import io
import os
import re
import tempfile
from io import BytesIO

# Third-party
import pandas as pd
import plotly.express as px
import spacy
import streamlit as st
import torch
from docx import Document
from gtts import gTTS
from PyPDF2 import PdfReader
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)

# -----------------------------
# STREAMLIT PAGE CONFIG
# (must be the first Streamlit call in the script)
# -----------------------------
st.set_page_config(page_title="βοΈ ClauseWise", page_icon="βοΈ", layout="wide")

# -----------------------------
# LANGUAGE MAP
# Display name -> ISO 639-1 code, used by translation and TTS helpers.
# -----------------------------
LANG_MAP = {
    "English": "en",
    "French": "fr",
    "Spanish": "es",
    "German": "de",
    "Hindi": "hi",
    "Tamil": "ta",
    "Telugu": "te",
    "Kannada": "kn",
    "Marathi": "mr",
    "Gujarati": "gu",
    "Bengali": "bn",
}
LANG_NAMES = list(LANG_MAP.keys())
| # ----------------------------- | |
| # MODEL LOADING (with caching) | |
| # ----------------------------- | |
@st.cache_resource(show_spinner="Loading AI models...")
def load_models():
    """Load and cache all required models with error handling.

    FIX: the surrounding section header advertised "with caching", but no
    caching was applied, so every Streamlit rerun reloaded every model.
    ``@st.cache_resource`` makes the heavy model objects load once per
    process and be shared across reruns/sessions.

    Returns:
        Tuple of (tokenizer_simplify, simplify_model, gen_tokenizer,
        gen_model, nlp, classifier, summarizer), or a tuple of seven
        ``None`` values when any load fails.
    """
    try:
        # T5 model fine-tuned for text simplification.
        simplify_model_name = "mrm8488/t5-small-finetuned-text-simplification"
        tokenizer_simplify = AutoTokenizer.from_pretrained(simplify_model_name)
        simplify_model = AutoModelForSeq2SeqLM.from_pretrained(simplify_model_name)

        # Phi-2 causal LM for the chatbot (requires trust_remote_code).
        gen_model_id = "microsoft/phi-2"
        gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_id, trust_remote_code=True)
        gen_model = AutoModelForCausalLM.from_pretrained(gen_model_id, trust_remote_code=True)

        # Auto-download the SpaCy English model if it is missing locally.
        try:
            nlp = spacy.load("en_core_web_sm")
        except OSError:
            from spacy.cli import download
            download("en_core_web_sm")
            nlp = spacy.load("en_core_web_sm")

        classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        return tokenizer_simplify, simplify_model, gen_tokenizer, gen_model, nlp, classifier, summarizer
    except Exception as e:
        # Surface the failure in the UI; callers detect it via the None tuple.
        st.error(f"Error loading models: {e}")
        return None, None, None, None, None, None, None
# Load all models once at startup; abort the whole app when loading failed
# (the first tuple slot is None exactly in the failure case).
model_data = load_models()
if model_data[0] is None:
    st.error("Failed to load models. Please check your internet connection and try again.")
    st.stop()

(tokenizer_simplify, simplify_model, gen_tokenizer,
 gen_model, nlp, classifier, summarizer) = model_data

# Run the causal generator on GPU when one is available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if gen_model is not None:
    gen_model.to(DEVICE)
| # ----------------------------- | |
| # UTILITIES | |
| # ----------------------------- | |
def extract_text(file):
    """Return the plain text of an uploaded PDF/DOCX/TXT file.

    The upload is spooled to a named temporary file (removed afterwards)
    because the PDF/DOCX readers are handed a filesystem path. Unknown
    extensions fall back to a lenient UTF-8 text read. Returns "" for a
    missing file or on a read error (the error is shown via st.error).
    """
    if not file:
        return ""

    lowered = file.name.lower()
    suffix = os.path.splitext(lowered)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
        handle.write(file.read())
        temp_path = handle.name

    extracted = ""
    try:
        if lowered.endswith(".pdf"):
            pieces = []
            for page in PdfReader(temp_path).pages:
                page_text = page.extract_text()
                if page_text:
                    pieces.append(page_text + "\n")
            extracted = "".join(pieces)
        elif lowered.endswith(".docx"):
            paragraphs = Document(temp_path).paragraphs
            extracted = "\n".join(p.text for p in paragraphs if p.text.strip())
        else:
            # Fallback: treat anything else as plain text.
            with open(temp_path, "r", encoding="utf-8", errors="ignore") as fh:
                extracted = fh.read()
    except Exception as e:
        st.error(f"Error reading file: {e}")
    finally:
        # Always clean up the spooled copy.
        if os.path.exists(temp_path):
            os.remove(temp_path)
    return extracted.strip()
# Per-language cache of translation pipelines. FIX: the original rebuilt the
# Helsinki-NLP pipeline (a full model load) on every single call; now each
# language's translator is constructed once and reused.
_TRANSLATORS = {}


def translate_text(text, target_lang):
    """Translate English *text* into *target_lang* (a LANG_MAP key).

    Only the first 500 characters are translated (model input limit kept
    from the original). Returns "" for empty input, the text unchanged for
    English or when the translation model is unavailable (a st.warning is
    shown in that case).
    """
    if not text:
        return ""
    lang_code = LANG_MAP.get(target_lang, "en")
    if lang_code == "en":
        return text
    try:
        text_to_translate = text[:500]
        translator = _TRANSLATORS.get(lang_code)
        if translator is None:
            translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{lang_code}")
            _TRANSLATORS[lang_code] = translator
        result = translator(text_to_translate, max_length=512)
        return result[0]["translation_text"]
    except Exception as e:
        # Best-effort: fall back to the untranslated text.
        st.warning(f"Translation unavailable for {target_lang}: {str(e)}")
        return text
def text_to_speech(text, lang):
    """Synthesize up to the first 1000 characters of *text* as MP3 audio.

    Returns an in-memory BytesIO positioned at the start, or None for
    empty input or when gTTS fails (a st.warning is shown in that case).
    """
    if not text:
        return None
    try:
        code = LANG_MAP.get(lang, "en")
        speech = gTTS(text=text[:1000], lang=code, slow=False)
        buffer = BytesIO()
        speech.write_to_fp(buffer)
        buffer.seek(0)  # rewind so st.audio reads from the beginning
        return buffer
    except Exception as e:
        st.warning(f"Audio generation unavailable: {str(e)}")
        return None
def clause_simplification(text, mode):
    """Rewrite *text* with the T5 simplification model per *mode*.

    Modes map to instruction prefixes; unknown modes fall back to plain
    "simplify: ". Only the first 500 characters are processed. Returns the
    input unchanged for empty text, a missing model, or on error.
    """
    if not text or simplify_model is None:
        return text
    prefixes = {
        "Simplified": "simplify: ",
        "Explain like I'm 5": "explain like I'm 5: ",
        "Professional": "rephrase professionally: ",
    }
    chosen = prefixes.get(mode, "simplify: ")
    try:
        snippet = text[:500]
        encoded = tokenizer_simplify(
            chosen + snippet,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        generated = simplify_model.generate(
            **encoded, max_length=256, num_beams=4, early_stopping=True
        )
        return tokenizer_simplify.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        st.error(f"Simplification error: {e}")
        return text
def fairness_score_visual(text, lang):
    """Render a keyword-based fairness chart for *text*, plus an info box
    translated into *lang*.

    The score starts neutral at 50, rises 5 per "balanced" keyword and
    drops 5 per "one-sided" keyword, clamped to [0, 100]. This is a crude
    heuristic (as the displayed disclaimer says), not a legal analysis.
    """
    if not text:
        st.warning("No text to analyze.")
        return
    pos = len(re.findall(r"\b(mutual|both parties|shared|equal|fair|balanced)\b", text, re.I))
    neg = len(re.findall(r"\b(sole|unilateral|exclusive right|one-sided|only)\b", text, re.I))
    score = max(0, min(100, 50 + (pos * 5) - (neg * 5)))
    st.subheader("βοΈ Fairness Balance Meter")
    # BUG FIX: the original plotted "Party B Favored" as min(100, score),
    # which always equals the "Balanced" bar (score is already clamped to
    # 100). Keyword counts carry no direction, so the residual imbalance
    # (100 - score) is now split evenly between the two parties.
    imbalance = 100 - score
    fairness_df = pd.DataFrame({
        "Aspect": ["Party A Favored", "Balanced", "Party B Favored"],
        "Score": [imbalance // 2, score, imbalance - imbalance // 2],
    })
    fig = px.bar(fairness_df, x="Score", y="Aspect", orientation="h", text="Score",
                 color="Aspect", color_discrete_sequence=["#ff6b6b", "#4ecdc4", "#95e1d3"])
    fig.update_layout(showlegend=False, xaxis_title="Score", yaxis_title="", height=300)
    st.plotly_chart(fig, use_container_width=True)
    fairness_text = f"Fairness Score: {score}% (Approximate - based on keyword analysis)"
    translated_result = translate_text(fairness_text, lang)
    st.info(translated_result)
def chat_response(prompt, lang):
    """Answer *prompt* with the Phi-2 generator and translate into *lang*.

    Builds a legal-assistant prompt, samples up to 200 new tokens, strips
    everything up to the final "Answer:" marker, then routes the result
    through translate_text. Returns a fixed apology string on empty input,
    a missing model, or a generation error.
    """
    if not prompt or gen_model is None:
        return "Unable to generate response. Please try again."
    try:
        question = f"You are a helpful legal assistant. Answer the following question: {prompt}\n\nAnswer:"
        encoded = gen_tokenizer(
            question, return_tensors="pt", truncation=True, max_length=512
        ).to(DEVICE)
        generated = gen_model.generate(
            **encoded,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=gen_tokenizer.eos_token_id,
        )
        answer = gen_tokenizer.decode(generated[0], skip_special_tokens=True)
        # The model echoes the prompt; keep only what follows "Answer:".
        if "Answer:" in answer:
            answer = answer.split("Answer:")[-1].strip()
        return translate_text(answer, lang)
    except Exception as e:
        st.error(f"Chat error: {e}")
        return "I'm having trouble generating a response. Please try rephrasing your question."
# -----------------------------
# MAIN APP
# -----------------------------
def main():
    """Top-level Streamlit UI: analyzer, translate/audio, chatbot, about tabs."""
    st.title("βοΈ ClauseWise: Multilingual Legal AI Assistant")
    st.markdown("**Simplify**, **translate**, and **analyze** legal documents with AI β in your language.\n---")

    analyzer_tab, translate_tab, chat_tab, about_tab = st.tabs(
        ["π Analyzer", "π Translate & Audio", "π¬ Chatbot", "βΉοΈ About"]
    )

    # TAB 1: ANALYZER — upload/paste a document, simplify it, score fairness.
    with analyzer_tab:
        st.subheader("π Upload or Paste Legal Document")
        lang = st.selectbox("Select Language:", LANG_NAMES, index=0, key="analyzer_lang")
        uploaded = st.file_uploader("Upload a Legal Document (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
        pasted = st.text_area("Or Paste Text Here:", height=200, key="analyzer_text")
        if uploaded or pasted:
            document_text = extract_text(uploaded) if uploaded else pasted
            if document_text.strip():
                mode = st.radio("Simplify Mode", ["Explain like I'm 5", "Simplified", "Professional"])
                if st.button("π§Ύ Simplify Clauses"):
                    with st.spinner("Simplifying..."):
                        simplified = clause_simplification(document_text, mode)
                        translated = translate_text(simplified, lang)
                        st.success(translated)
                        audio = text_to_speech(translated, lang)
                        if audio:
                            st.audio(audio, format="audio/mp3")
                if st.button("βοΈ Fairness Analysis"):
                    with st.spinner("Analyzing fairness..."):
                        fairness_score_visual(document_text, lang)
            else:
                st.warning("Please provide some text to analyze.")

    # TAB 2: TRANSLATION + AUDIO — free-form text in, translation/audio out.
    with translate_tab:
        st.subheader("π Translate & Listen")
        source_text = st.text_area("Enter text:", height=200, key="translate_text")
        target_lang = st.selectbox("Translate to:", LANG_NAMES, index=4, key="translate_lang")
        if st.button("Translate"):
            if source_text.strip():
                with st.spinner("Translating..."):
                    translated = translate_text(source_text, target_lang)
                    st.success(translated)
            else:
                st.warning("Please enter some text to translate.")
        if st.button("π§ Generate Audio"):
            if source_text.strip():
                with st.spinner("Generating audio..."):
                    audio = text_to_speech(source_text, target_lang)
                    if audio:
                        st.audio(audio, format="audio/mp3")
            else:
                st.warning("Please enter some text for audio generation.")

    # TAB 3: CHATBOT — question in, generated answer (plus audio) out.
    with chat_tab:
        st.subheader("π¬ Chat with ClauseWise (Multilingual)")
        chat_lang = st.selectbox("Chat Language:", LANG_NAMES, index=0, key="chat_lang")
        question = st.text_area("Ask about clauses, fairness, or legal meaning:", height=150, key="chat_query")
        if st.button("Ask"):
            if question.strip():
                with st.spinner("Thinking..."):
                    answer = chat_response(question, chat_lang)
                    st.success(answer)
                    audio = text_to_speech(answer, chat_lang)
                    if audio:
                        st.audio(audio, format="audio/mp3")
            else:
                st.warning("Please enter a question.")

    # TAB 4: ABOUT — static project description.
    with about_tab:
        st.markdown("""
### βοΈ About ClauseWise
ClauseWise is a multilingual AI-powered legal assistant that helps users:
- **Simplify complex clauses** into easy-to-understand language
- **Translate and listen** in 10+ languages
- **Assess fairness** visually
- **Chat interactively** about legal concepts
**Languages Supported:**
English, French, Spanish, German, Hindi, Tamil, Telugu, Kannada, Marathi, Gujarati, Bengali
**Technologies Used:**
Hugging Face Transformers (T5, Phi-2, BART), SpaCy, gTTS, Plotly
β οΈ *Disclaimer:* Educational use only β not legal advice.
""")


# -----------------------------
# ENTRY POINT
# -----------------------------
if __name__ == "__main__":
    main()