"""Streamlit app: summarize or run NER over law-related documents.

The user uploads a .txt file (or pastes text), the app verifies the content
is law-related via zero-shot classification, then either summarizes it in
chunks or lists named entities.
"""

import chardet
import streamlit as st
from transformers import pipeline

st.title("Legal Document Analysis")

# Sidebar for uploading the document
st.sidebar.header("Upload Document")
uploaded_file = st.sidebar.file_uploader("Choose a document", type=["txt"])

# Text input box for user-provided text (alternative to file upload)
user_input = st.text_area("Or enter text directly for analysis:")

# Sidebar for selecting task
st.sidebar.header("Select Task")
task = st.sidebar.selectbox(
    "Choose the task you want to perform:",
    ("Summarization", "Named Entity Recognition (NER)"),
)

# Summarization parameters (only shown if summarization is selected).
# These names are only referenced inside the Summarization branch below,
# so leaving them undefined for the NER task is safe.
if task == "Summarization":
    st.sidebar.header("Summarization Parameters")
    max_length = st.sidebar.slider("Max Length", min_value=50, max_value=500, value=150)
    min_length = st.sidebar.slider("Min Length", min_value=10, max_value=100, value=40)
    do_sample = st.sidebar.checkbox("Use Sampling", value=False)


@st.cache_resource
def load_pipeline(task_name, model_name, **kwargs):
    """Build and cache a transformers pipeline.

    Streamlit re-executes the whole script on every interaction; without
    caching, each rerun would re-instantiate (and possibly re-download)
    multi-hundred-MB models. `st.cache_resource` keeps one instance per
    (task_name, model_name, kwargs) combination for the session lifetime.
    """
    return pipeline(task_name, model=model_name, **kwargs)


def detect_encoding(file):
    """Detect the character encoding of an uploaded file.

    Only the first 1 KiB is sampled for speed; the caller is responsible
    for calling ``seek(0)`` before reading the full content afterwards.
    Returns ``None`` when chardet cannot determine an encoding.
    """
    raw_data = file.read(1024)  # Read a small chunk of the file
    result = chardet.detect(raw_data)
    return result["encoding"]


def chunk_text(text, chunk_size=1000):
    """Split *text* into fixed-size character chunks.

    The last chunk may be shorter than *chunk_size*. Chunks are cut at
    arbitrary character positions, not at sentence boundaries.
    """
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def classify_text(text):
    """Return True when zero-shot classification labels *text* law-related.

    Only the first 512 characters are sent to the model to stay within
    its input limits.
    """
    classifier = load_pipeline("zero-shot-classification", "facebook/bart-large-mnli")
    candidate_labels = ["law-related", "not law-related"]
    result = classifier(text[:512], candidate_labels=candidate_labels)
    return result["labels"][0] == "law-related"


# Main area - Display content and perform tasks.
# .strip() prevents whitespace-only pasted input from being sent to the
# classifier (which would fail or give meaningless output).
if uploaded_file is not None or user_input.strip():
    try:
        # If a file is uploaded, read the file content; otherwise use the
        # directly entered text.
        if uploaded_file:
            encoding = detect_encoding(uploaded_file)
            if encoding is None:
                encoding = "utf-8"  # Fallback to default encoding
            uploaded_file.seek(0)  # Reset file pointer to the beginning
            text = uploaded_file.read().decode(encoding)
        else:
            text = user_input

        # Gate: only analyze content the classifier considers law-related.
        if classify_text(text):
            st.write("This document is classified as law-related.")

            # Chunk the text so long documents fit the models' input limits.
            chunks = chunk_text(text, chunk_size=1000)

            if task == "Summarization":
                summarizer = load_pipeline("summarization", "facebook/bart-large-cnn")
                summarized_text = ""
                for chunk in chunks:
                    # Skip chunks with too few words to summarize; asking for
                    # a summary longer than the input raises or yields noise.
                    if len(chunk.split()) > min_length:
                        summary = summarizer(
                            chunk,
                            max_length=max_length,
                            min_length=min_length,
                            do_sample=do_sample,
                        )
                        summarized_text += summary[0]["summary_text"] + " "
                st.subheader("Summary:")
                st.write(summarized_text)

            elif task == "Named Entity Recognition (NER)":
                ner = load_pipeline("ner", "dslim/bert-base-NER", grouped_entities=True)
                st.subheader("Named Entities:")
                for chunk in chunks:
                    entities = ner(chunk)
                    for entity in entities:
                        st.write(
                            f"{entity['entity_group']} - {entity['word']} (Score: {entity['score']:.2f})"
                        )
        else:
            st.warning("The uploaded document or entered text does not contain law-related content. Please provide relevant content.")
    except IndexError as e:
        st.error(f"IndexError: {e}. Ensure the text is long enough and parameters are set correctly.")
    except UnicodeDecodeError as e:
        st.error(f"Encoding error: {e}. Please upload a file with valid encoding.")
    except Exception as e:
        # Top-level boundary for the Streamlit rerun: surface unexpected
        # failures in the UI instead of crashing the app.
        st.error(f"An unexpected error occurred: {e}")
else:
    st.info("Please upload a document or enter text to analyze.")