Spaces:

kritsadaK
/

FinBrief

Sleeping

App Files Files Community

kritsadaK committed on Feb 23, 2025

Commit

67b15b5

verified ·

1 Parent(s): 5ac3ea3

Update app.py

Browse files

Files changed (1) hide show

app.py +232 -10

app.py CHANGED Viewed

@@ -1,15 +1,237 @@
 import streamlit as st
-# Set the title of the app
-st.title("My Simple Streamlit App")
-# Add a text input
-user_input = st.text_input("Enter some text:")
-# Display user input
-if user_input:
-    st.write(f"You entered: {user_input}")
-# Add a button
-if st.button("Click Me!"):
-    st.write("Button clicked!")

 import streamlit as st
+# Configure the page (browser tab title, wide layout) before anything else.
+# The remaining imports are placed after this call so that it stays the first
+# st.* command this script executes.
+st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
+import spacy
+import pandas as pd
+import re
+from transformers import pipeline
+# Load the spaCy model once at the start
+try:
+    nlp = spacy.load("en_core_web_sm")  # Ensure spaCy's model is installed
+    st.write("spaCy model loaded successfully!")
+except OSError:
+    nlp = None  # Handle the case where the model is missing
+    st.write("Failed to load spaCy model.")
+# Load the summarization model from Hugging Face Model Hub
+try:
+    online_model_path = "kritsadaK/bart-financial-summarization"
+    summarizer = pipeline("summarization", model=online_model_path, tokenizer=online_model_path)
+    st.write("Online summarization model loaded successfully!")
+except Exception as e:
+    summarizer = None  # Handle case where model is missing
+    st.write("Failed to load online summarization model.")
+# Initialize models in session state if not already loaded
+if "nlp" not in st.session_state:
+    st.session_state["nlp"] = nlp
+if "summarizer" not in st.session_state:
+    st.session_state["summarizer"] = summarizer
+# # Load the summarization model locally
+# try:
+#     local_model_path = "./local_models/bart-financial"
+#     summarizer = pipeline("summarization", model=local_model_path, tokenizer=local_model_path)
+#     st.write("Local summarization model loaded successfully!")
+# except Exception as e:
+#     summarizer = None  # Handle case where model is missing
+#     st.write("Failed to load local summarization model.")
+# Regex patterns used to pull structured fields out of the document text.
+# Each value has exactly one capture group; the "Analyze" step below runs
+# re.search(pattern, text, re.IGNORECASE) and reads group(1).
+# NOTE(review): no re.MULTILINE / re.DOTALL flag is passed at the call site, so
+# "^" only anchors at the very start of the text and "." never crosses a line
+# break.
+patterns = {
+    # Text preceding the first " Fund" on the first line of the document
+    "Fund Name": r"^(.*?) Fund",  # Extracts the name before "Fund"
+    # Digits only. NOTE(review): presumably CUSIPs in the target documents are
+    # all-numeric; alphanumeric identifiers would be cut short - confirm.
+    "CUSIP": r"CUSIP\s+(\d+)",
+    # The character class includes \s, so these two captures may continue over
+    # line breaks, and they stop at the first punctuation character (e.g. a comma).
+    "Inception Date": r"Inception Date\s+([\w\s\d]+)",
+    "Benchmark": r"Benchmark\s+([\w\s\d]+)",
+    # First percentage with a decimal point that follows the label on the same line
+    "Expense Ratio": r"Expense Information.*?(\d+\.\d+%)",
+    # Amount with thousands separators, expected right after "USD"
+    "Total Assets": r"Total Assets\s+USD\s+([\d,]+)",
+    "Portfolio Turnover": r"Portfolio Holdings Turnover.*?(\d+\.\d+%)",
+    "Cash Allocation": r"% of Portfolio in Cash\s+(\d+\.\d+%)",
+    # Alpha is the only percentage that allows a leading minus sign
+    "Alpha": r"Alpha\s+(-?\d+\.\d+%)",
+    "Standard Deviation": r"Standard Deviation\s+(\d+\.\d+%)"
+}
+# Set the title and layout
+st.title("FinBrief: Financial Document Insights")
+st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
+# Custom styling (this remains unchanged)
+st.markdown(
+    """
+    <style>
+    .sidebar .sidebar-content {
+        background-color: #f7f7f7;
+        color: #333;
+    }
+    .css-1d391kg {
+        background-color: #f0f4f8;
+    }
+    .stButton>button {
+        background-color: #4CAF50;
+        color: white;
+        padding: 10px 20px;
+        border-radius: 5px;
+        font-size: 16px;
+    }
+    .stTextArea textarea {
+        border: 2px solid #4CAF50;
+        border-radius: 5px;
+        padding: 10px;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
+# Function to extract text and tables using pdfplumber
+def extract_text_tables_pdfplumber(pdf_file):
+    import io
+    import pdfplumber
+    print("\n🔹 PDFPlumber: Extracting text and tables...")
+    with pdfplumber.open(io.BytesIO(pdf_file.read())) as pdf:
+        all_text = ""
+        all_tables = []
+        for page in pdf.pages:
+            page_text = page.extract_text()
+            if page_text:
+                all_text += page_text + "\n"
+            # Extract tables
+            tables = page.extract_tables()
+            all_tables.extend(tables)  # Store all tables
+        if all_text.strip():
+            print(all_text[:1000])  # Print first 1000 characters for verification
+            return all_text, all_tables
+        else:
+            print("No text extracted. The PDF might be image-based.")
+            return None, None
+# Ensure session state is initialized
+if "pdf_text" not in st.session_state:
+    st.session_state["pdf_text"] = ""
+if "pdf_tables" not in st.session_state:
+    st.session_state["pdf_tables"] = []  # Initialize as an empty list
+# Step 0: Upload PDF
+st.sidebar.header("Upload Your Financial Document")
+uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")
+if uploaded_file is not None:
+    st.sidebar.write(f"You uploaded: {uploaded_file.name}")
+    # Extract text and tables
+    pdf_text, pdf_tables = extract_text_tables_pdfplumber(uploaded_file)
+    if pdf_text is not None:
+        # Store results in session state
+        st.session_state["pdf_text"] = pdf_text
+        st.session_state["pdf_tables"] = pdf_tables  # Save tables separately
+        st.sidebar.success("PDF uploaded and text extracted!")
+    else:
+        st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
+        st.error("No text extracted from the uploaded PDF.")
+# Step 1: Display Extracted Text
+st.subheader("Extracted Text")
+if st.session_state["pdf_text"]:
+    st.text_area("Document Text", st.session_state["pdf_text"], height=400)
+else:
+    st.warning("No text extracted yet. Upload a PDF to start.")
+# Step 2: Display Extracted Tables (Fixed Error)
+st.subheader("Extracted Tables")
+if st.session_state["pdf_tables"]:  # Check if tables exist
+    for idx, table in enumerate(st.session_state["pdf_tables"]):
+        st.write(f"Table {idx+1}")
+        st.write(pd.DataFrame(table))  # Display tables as DataFrames
+else:
+    st.info("No tables extracted.")
+# Retrieve variables from session state
+nlp = st.session_state["nlp"]
+summarizer = st.session_state["summarizer"]
+pdf_text = st.session_state["pdf_text"]
+pdf_tables = st.session_state["pdf_tables"]
+# Ensure that the models are loaded
+if nlp is None or summarizer is None:
+    st.error("Models are not properly loaded. Please check your model paths and installation.")
+else:
+    # Step 3: Named Entity Recognition (NER)
+    st.subheader("NER Analysis")
+    # Display full extracted text, not just first 1000 characters
+    example_text = st.text_area(
+        "Enter or paste text for analysis",
+        height=400,
+        value=st.session_state["pdf_text"] if st.session_state["pdf_text"] else ""
+    )
+    if st.button("Analyze"):
+        # Ensure full extracted text is used for analysis
+        text_for_analysis = st.session_state["pdf_text"].strip() if st.session_state["pdf_text"] else example_text.strip()
+        if text_for_analysis:
+            with st.spinner("Analyzing text..."):
+                # Extract structured financial data using regex (Now using full text)
+                extracted_data = {
+                    key: (match.group(1) if match else "N/A")
+                    for key, pattern in patterns.items()
+                    if (match := re.search(pattern, text_for_analysis, re.IGNORECASE))
+                }
+                # Use spaCy to extract additional financial terms (Now using full text)
+                doc = nlp(text_for_analysis)
+                financial_entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ["MONEY", "PERCENT", "ORG", "DATE"]]
+                # Store extracted data in a structured dictionary
+                structured_data = {**extracted_data, "Named Entities Extracted": financial_entities}
+                # Display results
+                st.write("Entities Found:")
+                st.write(pd.DataFrame(financial_entities, columns=["Entity", "Label"]))
+                st.write("Structured Data Extracted:")
+                st.write(pd.DataFrame([structured_data]))
+        else:
+            st.error("Please provide some text for analysis.")
+    # Step 4: Summarization
+    st.subheader("Summarization")
+    # Display full extracted text, not just first 1000 characters
+    input_text = st.text_area(
+        "Enter text to summarize",
+        height=400,
+        value=st.session_state["pdf_text"] if st.session_state["pdf_text"] else ""
+    )
+    if st.button("Summarize"):
+        # Ensure full extracted text is used for summarization
+        text_to_summarize = st.session_state["pdf_text"].strip() if st.session_state["pdf_text"] else input_text.strip()
+        if text_to_summarize:
+            with st.spinner("Generating summary..."):
+                summary = summarizer(
+                    text_to_summarize,
+                    max_length=min(len(text_to_summarize.split()), 1024),
+                    min_length=100,
+                    do_sample=False
+                )
+                st.write("Summary:")
+                st.success(summary[0]["summary_text"])
+        else:
+            st.error("Please provide text to summarize.")