File size: 4,165 Bytes
b3a0493
 
 
 
 
 
 
 
 
 
cff475f
 
 
b3a0493
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80aa59b
 
 
 
 
 
 
b3a0493
cff475f
b3a0493
cff475f
 
 
 
 
 
 
 
 
 
 
b3a0493
80aa59b
 
 
 
 
 
b3a0493
80aa59b
 
 
b3a0493
80aa59b
 
 
 
 
b3a0493
80aa59b
 
b3a0493
80aa59b
 
 
b3a0493
80aa59b
 
 
 
 
cff475f
b3a0493
 
 
 
 
 
 
 
 
 
cff475f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import streamlit as st
from transformers import pipeline
import chardet

# --- Page and sidebar setup (runs top-to-bottom on every Streamlit rerun) ---
st.title("Legal Document Analysis")

# Sidebar for uploading the document; only plain-text files are accepted.
# The main branch below decodes the bytes with a chardet-detected encoding.
st.sidebar.header("Upload Document")
uploaded_file = st.sidebar.file_uploader("Choose a document", type=["txt"])

# Add text input box for user-provided text. An uploaded file takes
# precedence over this box in the main branch below.
user_input = st.text_area("Or enter text directly for analysis:")

# Sidebar for selecting which analysis to run.
st.sidebar.header("Select Task")
task = st.sidebar.selectbox("Choose the task you want to perform:", ("Summarization", "Named Entity Recognition (NER)"))

# Sidebar for setting summarization parameters (only shown if summarization is selected).
# NOTE: max_length / min_length / do_sample exist at module level ONLY when
# task == "Summarization"; the main block only reads them inside that branch.
if task == "Summarization":
    st.sidebar.header("Summarization Parameters")
    max_length = st.sidebar.slider("Max Length", min_value=50, max_value=500, value=150)
    min_length = st.sidebar.slider("Min Length", min_value=10, max_value=100, value=40)
    do_sample = st.sidebar.checkbox("Use Sampling", value=False)

def detect_encoding(file):
    """Guess the character encoding of an uploaded file-like object.

    Samples only the first 1 KiB and runs chardet over it; the caller is
    responsible for rewinding the file afterwards. Returns the encoding
    name, or None when chardet cannot make a confident guess.
    """
    sample = file.read(1024)
    return chardet.detect(sample)['encoding']

def chunk_text(text, chunk_size=1000):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Returns a list of substrings in original order; an empty input yields
    an empty list. The final piece may be shorter than *chunk_size*.
    """
    pieces = []
    start = 0
    total = len(text)
    while start < total:
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces

# Function to classify text as law-related or not using zero-shot classification
def classify_text(text):
    """Return True when *text* is judged to be law-related.

    Runs zero-shot classification over the first 512 characters (truncated
    to stay within the model's input limit) against two candidate labels
    and checks whether "law-related" ranks first.

    The underlying pipeline is heavyweight; the original re-instantiated it
    on every call — i.e. on every Streamlit rerun — so it is now built once
    and cached on the function object. Interface is unchanged.
    """
    if getattr(classify_text, "_classifier", None) is None:
        classify_text._classifier = pipeline(
            "zero-shot-classification", model="facebook/bart-large-mnli"
        )
    candidate_labels = ["law-related", "not law-related"]
    # result['labels'] is sorted by descending score, so index 0 is the top label.
    result = classify_text._classifier(text[:512], candidate_labels=candidate_labels)
    return result['labels'][0] == "law-related"

# Main area - Display content and perform tasks.
# Runs when either input source is non-empty; an uploaded file wins over
# the text box. Everything is wrapped in one try so any failure surfaces
# as a Streamlit error box instead of a traceback.
if uploaded_file is not None or user_input:
    try:
        # If a file is uploaded, read the file content
        if uploaded_file:
            # detect_encoding consumes the first 1 KiB of the stream.
            encoding = detect_encoding(uploaded_file)
            if encoding is None:
                encoding = 'utf-8'  # Fallback to default encoding

            uploaded_file.seek(0)  # Reset file pointer to the beginning
            text = uploaded_file.read().decode(encoding)
        else:
            # If no file is uploaded, use user input
            text = user_input

        # Gate: only proceed when the zero-shot classifier deems the text
        # law-related; otherwise warn and do nothing.
        if classify_text(text):
            st.write("This document is classified as law-related.")
            
            # Chunk the text if it is too long (character-based chunks;
            # NOTE(review): 1000 chars is a heuristic proxy for the models'
            # token limits — confirm against the models' max input sizes).
            chunks = chunk_text(text, chunk_size=1000)

            if task == "Summarization":
                # max_length / min_length / do_sample come from the sidebar
                # sliders, which exist because task == "Summarization".
                summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
                summarized_text = ""

                # Summarize each chunk and combine the results
                for chunk in chunks:
                    # Skip chunks with too few words to summarize; if every
                    # chunk is skipped the summary shown below is empty.
                    if len(chunk.split()) > min_length:
                        summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=do_sample)
                        summarized_text += summary[0]['summary_text'] + " "

                st.subheader("Summary:")
                st.write(summarized_text)

            elif task == "Named Entity Recognition (NER)":
                # grouped_entities=True merges word pieces into whole entities.
                ner = pipeline("ner", grouped_entities=True, model="dslim/bert-base-NER")
                st.subheader("Named Entities:")

                # One line per entity: type, surface form, confidence.
                for chunk in chunks:
                    entities = ner(chunk)
                    for entity in entities:
                        st.write(f"{entity['entity_group']} - {entity['word']} (Score: {entity['score']:.2f})")
        else:
            st.warning("The uploaded document or entered text does not contain law-related content. Please provide relevant content.")

    except IndexError as e:
        st.error(f"IndexError: {e}. Ensure the text is long enough and parameters are set correctly.")
    
    except UnicodeDecodeError as e:
        st.error(f"Encoding error: {e}. Please upload a file with valid encoding.")
    
    # Catch-all boundary so model/IO failures render as an error box.
    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")
else:
    st.info("Please upload a document or enter text to analyze.")