kritsadaK committed on
Commit
67b15b5
·
verified ·
1 Parent(s): 5ac3ea3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +232 -10
app.py CHANGED
@@ -1,15 +1,237 @@
1
  import streamlit as st
2
 
3
- # Set the title of the app
4
- st.title("My Simple Streamlit App")
5
 
6
- # Add a text input
7
- user_input = st.text_input("Enter some text:")
 
 
8
 
9
- # Display user input
10
- if user_input:
11
- st.write(f"You entered: {user_input}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Add a button
14
- if st.button("Click Me!"):
15
- st.write("Button clicked!")
 
1
  import streamlit as st
2
 
3
+ # Set page configuration first
4
+ st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
5
 
6
+ import spacy
7
+ import pandas as pd
8
+ import re
9
+ from transformers import pipeline
10
 
11
# Load each model at most once per session. Streamlit re-executes this script
# from the top on every widget interaction, so loading unconditionally would
# repeat the expensive spaCy / Hugging Face initialisation on every rerun even
# though the result was only ever stored the first time.
if "nlp" not in st.session_state:
    try:
        # Requires the "en_core_web_sm" spaCy model to be installed.
        st.session_state["nlp"] = spacy.load("en_core_web_sm")
        st.write("spaCy model loaded successfully!")
    except OSError:
        # Model is missing: remember the failure so later code can report it.
        st.session_state["nlp"] = None
        st.write("Failed to load spaCy model.")

if "summarizer" not in st.session_state:
    try:
        # Summarization model hosted on the Hugging Face Model Hub.
        online_model_path = "kritsadaK/bart-financial-summarization"
        st.session_state["summarizer"] = pipeline(
            "summarization",
            model=online_model_path,
            tokenizer=online_model_path,
        )
        st.write("Online summarization model loaded successfully!")
    except Exception:
        # Deliberately broad: any failure while fetching or building the
        # pipeline should degrade to "no summarizer" instead of crashing the app.
        st.session_state["summarizer"] = None
        st.write("Failed to load online summarization model.")

# Keep the module-level names bound, as the original script did.
nlp = st.session_state["nlp"]
summarizer = st.session_state["summarizer"]
33
+
34
+ # # Load the summarization model locally
35
+ # try:
36
+ # local_model_path = "./local_models/bart-financial"
37
+ # summarizer = pipeline("summarization", model=local_model_path, tokenizer=local_model_path)
38
+ # st.write("Local summarization model loaded successfully!")
39
+ # except Exception as e:
40
+ # summarizer = None # Handle case where model is missing
41
+ # st.write("Failed to load local summarization model.")
42
+
43
+
44
+ # Define regex patterns to extract structured data
45
# Field label -> regex. Capture group 1 of each pattern is the extracted value.
patterns = {
    "Fund Name": r"^(.*?) Fund",  # Extracts the name before "Fund"
    "CUSIP": r"CUSIP\s+(\d+)",
    # The value class is [\w \t] rather than [\w\s\d]: \s also matches
    # newlines, which let the capture run on into the following lines.
    "Inception Date": r"Inception Date\s+([\w \t]+)",
    "Benchmark": r"Benchmark\s+([\w \t]+)",
    "Expense Ratio": r"Expense Information.*?(\d+\.\d+%)",
    "Total Assets": r"Total Assets\s+USD\s+([\d,]+)",
    "Portfolio Turnover": r"Portfolio Holdings Turnover.*?(\d+\.\d+%)",
    "Cash Allocation": r"% of Portfolio in Cash\s+(\d+\.\d+%)",
    "Alpha": r"Alpha\s+(-?\d+\.\d+%)",
    "Standard Deviation": r"Standard Deviation\s+(\d+\.\d+%)",
}
57
+
58
+ # Set the title and layout
59
# Page heading, followed by a link to a folder of sample documents.
_example_docs_link = "[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)"
st.title("FinBrief: Financial Document Insights")
st.markdown(_example_docs_link)

# Cosmetic CSS overrides for the sidebar, buttons and text areas. The sheet is
# injected as raw HTML, which is why unsafe_allow_html must be enabled.
_custom_css = """
<style>
.sidebar .sidebar-content {
background-color: #f7f7f7;
color: #333;
}
.css-1d391kg {
background-color: #f0f4f8;
}
.stButton>button {
background-color: #4CAF50;
color: white;
padding: 10px 20px;
border-radius: 5px;
font-size: 16px;
}
.stTextArea textarea {
border: 2px solid #4CAF50;
border-radius: 5px;
padding: 10px;
}
</style>
"""
st.markdown(_custom_css, unsafe_allow_html=True)
89
+
90
+ # Function to extract text and tables using pdfplumber
91
def extract_text_tables_pdfplumber(pdf_file):
    """Extract the text and tables of every page of a PDF with pdfplumber.

    Args:
        pdf_file: Binary file-like object holding the PDF. Its whole content
            is read into memory.

    Returns:
        A ``(text, tables)`` tuple. ``text`` is the text of every page that
        yielded any, each followed by a newline; ``tables`` is the list of all
        tables returned by ``page.extract_tables()``, in page order. When no
        page yields non-whitespace text the result is ``(None, None)``.
    """
    import io

    import pdfplumber

    print("\n🔹 PDFPlumber: Extracting text and tables...")

    # Rewind first: a stream that has already been read once would otherwise
    # hand back empty bytes and make pdfplumber fail on a valid upload.
    seekable = getattr(pdf_file, "seekable", None)
    if callable(seekable) and seekable():
        pdf_file.seek(0)

    page_texts = []
    all_tables = []
    with pdfplumber.open(io.BytesIO(pdf_file.read())) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)

            # Extract tables
            all_tables.extend(page.extract_tables())

    # Join once instead of growing a string page by page.
    all_text = "".join(f"{page_text}\n" for page_text in page_texts)

    if not all_text.strip():
        print("No text extracted. The PDF might be image-based.")
        return None, None

    print(all_text[:1000])  # Print first 1000 characters for verification
    return all_text, all_tables
115
+
116
+ # Ensure session state is initialized
117
# Give the extraction results a default slot in the session state so the
# display code below can read them before any PDF has been uploaded.
for _state_key, _initial_value in (("pdf_text", ""), ("pdf_tables", [])):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# Step 0: Upload PDF
st.sidebar.header("Upload Your Financial Document")
uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    st.sidebar.write(f"You uploaded: {uploaded_file.name}")

    # Run the extraction; a (None, None) result means no text was found.
    pdf_text, pdf_tables = extract_text_tables_pdfplumber(uploaded_file)

    if pdf_text is None:
        st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
        st.error("No text extracted from the uploaded PDF.")
    else:
        # Persist text and tables under separate session-state keys.
        st.session_state["pdf_text"] = pdf_text
        st.session_state["pdf_tables"] = pdf_tables

        st.sidebar.success("PDF uploaded and text extracted!")

# Step 1: Display Extracted Text
st.subheader("Extracted Text")
if not st.session_state["pdf_text"]:
    st.warning("No text extracted yet. Upload a PDF to start.")
else:
    st.text_area("Document Text", st.session_state["pdf_text"], height=400)


# Step 2: Display Extracted Tables
st.subheader("Extracted Tables")
if not st.session_state["pdf_tables"]:
    st.info("No tables extracted.")
else:
    # Tables are numbered from 1 and rendered as DataFrames.
    for table_number, table_rows in enumerate(st.session_state["pdf_tables"], start=1):
        st.write(f"Table {table_number}")
        st.write(pd.DataFrame(table_rows))
158
+
159
+ # Retrieve variables from session state
160
def _extract_structured_fields(text, field_patterns):
    """Return ``{field: value}`` for every pattern, using "N/A" when unmatched.

    ``value`` is capture group 1 of the first case-insensitive match in *text*.
    """
    fields = {}
    for field, pattern in field_patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        fields[field] = match.group(1) if match else "N/A"
    return fields


def _summary_length_bounds(text, max_cap=1024, min_floor=100):
    """Return ``(min_length, max_length)`` for summarizing *text*.

    ``max_length`` is the whitespace word count of *text*, capped at *max_cap*
    and never below 1. ``min_length`` is *min_floor*, lowered when necessary
    so that it never exceeds ``max_length``.
    """
    max_length = max(1, min(len(text.split()), max_cap))
    return min(min_floor, max_length), max_length


# Retrieve variables from session state
nlp = st.session_state["nlp"]
summarizer = st.session_state["summarizer"]
pdf_text = st.session_state["pdf_text"]
pdf_tables = st.session_state["pdf_tables"]

# Ensure that the models are loaded
if nlp is None or summarizer is None:
    st.error("Models are not properly loaded. Please check your model paths and installation.")
else:
    # Step 3: Named Entity Recognition (NER)
    st.subheader("NER Analysis")

    # Pre-filled with the full extracted text; the user may edit or replace it.
    example_text = st.text_area(
        "Enter or paste text for analysis",
        height=400,
        value=pdf_text or "",
    )

    if st.button("Analyze"):
        # Prefer what is in the text area so user edits are honoured; fall
        # back to the extracted PDF text when the box is empty.
        text_for_analysis = example_text.strip() or pdf_text.strip()

        if text_for_analysis:
            with st.spinner("Analyzing text..."):
                # Structured fields via regex. Every field is reported, with
                # "N/A" for those whose pattern does not match.
                extracted_data = _extract_structured_fields(text_for_analysis, patterns)

                # Use spaCy to extract additional financial terms.
                doc = nlp(text_for_analysis)
                wanted_labels = {"MONEY", "PERCENT", "ORG", "DATE"}
                financial_entities = [
                    (ent.text, ent.label_)
                    for ent in doc.ents
                    if ent.label_ in wanted_labels
                ]

                # Store extracted data in a structured dictionary
                structured_data = {**extracted_data, "Named Entities Extracted": financial_entities}

                # Display results
                st.write("Entities Found:")
                st.write(pd.DataFrame(financial_entities, columns=["Entity", "Label"]))

                st.write("Structured Data Extracted:")
                st.write(pd.DataFrame([structured_data]))

        else:
            st.error("Please provide some text for analysis.")

    # Step 4: Summarization
    st.subheader("Summarization")

    # Pre-filled with the full extracted text; the user may edit or replace it.
    input_text = st.text_area(
        "Enter text to summarize",
        height=400,
        value=pdf_text or "",
    )

    if st.button("Summarize"):
        # Same precedence as for the analysis: text area first, PDF text second.
        text_to_summarize = input_text.strip() or pdf_text.strip()

        if text_to_summarize:
            with st.spinner("Generating summary..."):
                # Keep min_length <= max_length even for texts under 100 words.
                min_length, max_length = _summary_length_bounds(text_to_summarize)
                summary = summarizer(
                    text_to_summarize,
                    max_length=max_length,
                    min_length=min_length,
                    do_sample=False,
                    # Truncate inputs that exceed the model's maximum input
                    # length instead of failing on long documents.
                    truncation=True,
                )
                st.write("Summary:")
                st.success(summary[0]["summary_text"])

        else:
            st.error("Please provide text to summarize.")
237