WAQASCHANNA committed on
Commit
b3a0493
·
verified ·
1 Parent(s): ba11a78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -0
app.py CHANGED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import pipeline
3
+ import chardet
4
+
5
st.title("Legal Document Analysis")

# Sidebar for uploading the document
st.sidebar.header("Upload Document")
uploaded_file = st.sidebar.file_uploader("Choose a document", type=["txt"])

# Sidebar for selecting task
st.sidebar.header("Select Task")
task = st.sidebar.selectbox(
    "Choose the task you want to perform:",
    ("Summarization", "Named Entity Recognition (NER)"),
)

# Sidebar for setting summarization parameters (only shown if summarization is selected).
# NOTE: max_length, min_length and do_sample are only defined when the
# summarization task is selected; code that reads them must check `task` first.
if task == "Summarization":
    st.sidebar.header("Summarization Parameters")
    max_length = st.sidebar.slider("Max Length", min_value=50, max_value=500, value=150)
    # Cap the upper bound of "Min Length" below the chosen "Max Length" so the
    # user cannot request an infeasible combination (min_length >= max_length).
    # max_length is at least 50, so the cap is at least 49 and the default of
    # 40 always stays inside the slider's range.
    min_length = st.sidebar.slider(
        "Min Length",
        min_value=10,
        max_value=min(100, max_length - 1),
        value=40,
    )
    do_sample = st.sidebar.checkbox("Use Sampling", value=False)
21
+
22
+ # Function to detect file encoding
23
def detect_encoding(file, sample_size=1024):
    """Guess the text encoding of a binary file from its leading bytes.

    Reads up to ``sample_size`` bytes from the file's current position and
    hands them to ``chardet.detect``. The read advances the file pointer, so
    the caller must rewind (``file.seek(0)``) before reading the content.

    Args:
        file: Binary file-like object exposing ``read(n)``.
        sample_size: Maximum number of bytes sampled for detection.

    Returns:
        The encoding name reported by chardet, or ``None`` when chardet
        reports no encoding. A sample detected as ASCII is returned as
        ``"utf-8"``.
    """
    raw_data = file.read(sample_size)  # Read a small chunk of the file
    encoding = chardet.detect(raw_data)['encoding']

    # Only a prefix of the file is inspected. If that prefix happens to be
    # pure ASCII, decoding the *whole* file as ASCII would fail on the first
    # non-ASCII byte further in. UTF-8 decodes every ASCII byte identically,
    # so it is a strictly safer answer for the same sample.
    if encoding is not None and encoding.lower() == 'ascii':
        return 'utf-8'
    return encoding
28
+
29
+ # Function to split text into chunks
30
def chunk_text(text, chunk_size=1000):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The split is purely positional (by character offset); the last piece may
    be shorter than ``chunk_size``. An empty ``text`` yields an empty list.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece; must be >= 1.

    Returns:
        A list of substrings that concatenate back to ``text``.

    Raises:
        ValueError: If ``chunk_size`` is less than 1. Without this check a
            negative size would silently return ``[]`` and drop all the text.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
32
+
33
+ # Function to classify text as law-related or not using zero-shot classification
34
@st.cache_resource
def _load_zero_shot_classifier():
    """Build the zero-shot classification pipeline once and reuse it across reruns."""
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


def classify_text(text):
    """Decide whether ``text`` is law-related using zero-shot classification.

    Only the first 512 characters of ``text`` are classified. The raw
    classifier output is also written to the Streamlit page.

    Args:
        text: The document text to classify.

    Returns:
        ``True`` when the first label in the classifier result is
        ``"law-related"``, otherwise ``False``. Blank input returns
        ``False`` without invoking the model.
    """
    excerpt = text[:512]

    # Nothing to classify: skip the model call instead of letting the
    # pipeline fail on an empty sequence.
    if not excerpt.strip():
        return False

    # Loading the model is expensive; the cached helper avoids rebuilding it
    # every time Streamlit reruns the script.
    classifier = _load_zero_shot_classifier()

    # Define the candidate labels
    candidate_labels = ["law-related", "not law-related"]

    # Run the classifier with the candidate labels
    result = classifier(excerpt, candidate_labels=candidate_labels)

    st.write(f"Classification result: {result}")

    # Check if the highest-scoring label is "law-related"
    return result['labels'][0] == "law-related"
48
+
49
@st.cache_resource
def _load_pipeline(task_name, model_name, **pipeline_kwargs):
    """Create a transformers pipeline once and reuse it across Streamlit reruns."""
    return pipeline(task_name, model=model_name, **pipeline_kwargs)


# Main area - Display content and perform tasks
if uploaded_file is not None:
    try:
        # Detect and decode the file content
        encoding = detect_encoding(uploaded_file)
        if encoding is None:
            encoding = 'utf-8'  # Fallback to default encoding

        uploaded_file.seek(0)  # Reset file pointer to the beginning
        text = uploaded_file.read().decode(encoding)

        if not text.strip():
            # An empty document cannot be classified or analyzed; say so
            # explicitly rather than surfacing a model error.
            st.warning("The uploaded file is empty. Please upload a document with text content.")
        else:
            st.write("File content loaded successfully!")  # Debugging: Confirm file loading

            # Classify the text
            if classify_text(text):
                st.write("This document is classified as law-related.")  # Debugging: Confirm classification
                chunks = chunk_text(text, chunk_size=1000)

                if task == "Summarization":
                    summarizer = _load_pipeline("summarization", "facebook/bart-large-cnn")

                    # Summarize each chunk that has more words than min_length;
                    # shorter chunks are skipped.
                    summaries = [
                        summarizer(
                            chunk,
                            max_length=max_length,
                            min_length=min_length,
                            do_sample=do_sample,
                        )[0]['summary_text']
                        for chunk in chunks
                        if len(chunk.split()) > min_length
                    ]

                    st.subheader("Summary:")
                    if summaries:
                        st.write(" ".join(summaries))
                    else:
                        st.info("No part of the document was long enough to summarize with the current Min Length.")

                elif task == "Named Entity Recognition (NER)":
                    # aggregation_strategy="simple" is the documented
                    # replacement for the deprecated grouped_entities=True.
                    ner = _load_pipeline("ner", "dslim/bert-base-NER", aggregation_strategy="simple")
                    st.subheader("Named Entities:")

                    for chunk in chunks:
                        for entity in ner(chunk):
                            st.write(f"{entity['entity_group']} - {entity['word']} (Score: {entity['score']:.2f})")
            else:
                st.warning("The uploaded document does not contain law-related content. Please upload a legal document.")

    except IndexError as e:
        st.error(f"IndexError: {e}. Ensure the text is long enough and parameters are set correctly.")

    except UnicodeDecodeError as e:
        st.error(f"Encoding error: {e}. Please upload a file with valid encoding.")

    except Exception as e:
        # Top-level UI boundary: report anything unexpected to the user.
        st.error(f"An unexpected error occurred: {e}")
else:
    st.info("Please upload a document to analyze.")