Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from transformers import pipeline | |
| import chardet | |
# --- Page layout: title, upload/input widgets, task picker ---------------
st.title("Legal Document Analysis")

sidebar = st.sidebar

# Document source: either an uploaded .txt file...
sidebar.header("Upload Document")
uploaded_file = sidebar.file_uploader("Choose a document", type=["txt"])

# ...or text pasted straight into the main area.
user_input = st.text_area("Or enter text directly for analysis:")

# Task selection drives which pipeline runs below.
sidebar.header("Select Task")
task = sidebar.selectbox(
    "Choose the task you want to perform:",
    ("Summarization", "Named Entity Recognition (NER)"),
)

# Summarization tuning knobs are only rendered (and thus only defined)
# when the summarization task is active; the branch below that reads
# them is guarded by the same condition.
if task == "Summarization":
    sidebar.header("Summarization Parameters")
    max_length = sidebar.slider("Max Length", min_value=50, max_value=500, value=150)
    min_length = sidebar.slider("Min Length", min_value=10, max_value=100, value=40)
    do_sample = sidebar.checkbox("Use Sampling", value=False)
| # Function to detect file encoding | |
def detect_encoding(file, sample_size=1024):
    """Sniff the character encoding of a binary file-like object.

    Only the first ``sample_size`` bytes are read, so large uploads are
    not pulled into memory just to detect the encoding.  NOTE: this
    advances the file pointer; the caller must ``seek(0)`` before
    reading the full contents (the call site below does this).

    Args:
        file: Binary file-like object (e.g. Streamlit's UploadedFile).
        sample_size: Number of bytes to sample for detection
            (default 1024, matching the original behavior).

    Returns:
        The detected encoding name, or ``None`` when chardet cannot
        decide — callers should fall back to a default such as 'utf-8'.
    """
    sample = file.read(sample_size)
    return chardet.detect(sample)['encoding']
| # Function to split text into chunks | |
def chunk_text(text, chunk_size=1000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Returns an empty list for empty input; the final piece may be
    shorter than *chunk_size*.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
| # Function to classify text as law-related or not using zero-shot classification | |
def classify_text(text):
    """Return True when *text* is classified as law-related.

    Uses zero-shot classification (facebook/bart-large-mnli) with two
    candidate labels and checks whether "law-related" wins.  Only the
    first 512 characters are sent to the model to bound latency.

    The pipeline is built lazily and memoized on the function object:
    previously it was reconstructed on every call, which re-loaded the
    large BART model on each Streamlit rerun.  Memoizing this way needs
    no extra imports and keeps the signature unchanged.
    """
    if getattr(classify_text, "_classifier", None) is None:
        classify_text._classifier = pipeline(
            "zero-shot-classification", model="facebook/bart-large-mnli"
        )
    candidate_labels = ["law-related", "not law-related"]
    result = classify_text._classifier(text[:512], candidate_labels=candidate_labels)
    return result['labels'][0] == "law-related"
| # Main area - Display content and perform tasks | |
# --- Main area: read input, gate on relevance, run the chosen task -------
if uploaded_file is not None or user_input:
    try:
        # Prefer the uploaded file; fall back to pasted text.
        if uploaded_file:
            detected = detect_encoding(uploaded_file)
            if detected is None:
                detected = 'utf-8'  # chardet gave up — assume UTF-8
            uploaded_file.seek(0)  # rewind: detection consumed the first bytes
            text = uploaded_file.read().decode(detected)
        else:
            text = user_input

        # Gate: only analyze documents the zero-shot classifier deems legal.
        if classify_text(text):
            st.write("This document is classified as law-related.")
            # Long documents are processed in fixed-size character chunks.
            text_chunks = chunk_text(text, chunk_size=1000)

            if task == "Summarization":
                summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
                summary_parts = []
                for piece in text_chunks:
                    # Skip fragments shorter than the requested minimum,
                    # which the summarizer could not usefully compress.
                    if len(piece.split()) > min_length:
                        result = summarizer(
                            piece,
                            max_length=max_length,
                            min_length=min_length,
                            do_sample=do_sample,
                        )
                        summary_parts.append(result[0]['summary_text'] + " ")
                st.subheader("Summary:")
                st.write("".join(summary_parts))

            elif task == "Named Entity Recognition (NER)":
                ner = pipeline("ner", grouped_entities=True, model="dslim/bert-base-NER")
                st.subheader("Named Entities:")
                for piece in text_chunks:
                    for entity in ner(piece):
                        st.write(f"{entity['entity_group']} - {entity['word']} (Score: {entity['score']:.2f})")
        else:
            st.warning("The uploaded document or entered text does not contain law-related content. Please provide relevant content.")

    except IndexError as e:
        st.error(f"IndexError: {e}. Ensure the text is long enough and parameters are set correctly.")
    except UnicodeDecodeError as e:
        st.error(f"Encoding error: {e}. Please upload a file with valid encoding.")
    except Exception as e:
        # Top-level boundary: surface anything unexpected in the UI.
        st.error(f"An unexpected error occurred: {e}")
else:
    st.info("Please upload a document or enter text to analyze.")