| import os |
| import requests |
| import streamlit as st |
| import streamlit.components.v1 as components |
| from streamlit_extras.add_vertical_space import add_vertical_space |
| from bs4 import BeautifulSoup |
| from dotenv import load_dotenv |
| from warnings import filterwarnings |
| filterwarnings('ignore') |
|
|
|
|
def streamlit_config():
    """Apply the app-wide Streamlit page setup: title, transparent header,
    and the centered main heading."""

    st.set_page_config(page_title='Document Classification', layout='centered')

    # Make the built-in Streamlit header transparent so the page background
    # shows through.
    hide_header_css = """
    <style>

    [data-testid="stHeader"]
    {
    background: rgba(0,0,0,0);
    }

    </style>
    """
    st.markdown(hide_header_css, unsafe_allow_html=True)

    # Centered page title, followed by a little breathing room.
    st.markdown(f'<h1 style="text-align: center;">Financial Document Classification</h1>',
                unsafe_allow_html=True)
    add_vertical_space(2)
|
|
|
|
def display_html_document(input_file):
    """Show the uploaded HTML file inside a fixed-size, scrollable preview box.

    Parameters
    ----------
    input_file : uploaded file object
        The file returned by ``st.file_uploader``; read via ``getvalue`` so the
        stream position is left untouched for later readers.
    """

    # Decode the raw upload into text without advancing the file pointer.
    raw_markup = input_file.getvalue().decode("utf-8")

    # Wrap the document in a bordered, scrollable container so long files
    # stay inside a 610x300 viewport.
    framed_markup = f"""
    <div style="width: 610px; height: 300px;
    overflow: auto; border: 1px solid #ddd;
    padding: 10px; background-color: white;
    color: black; white-space: normal;
    display: block;">
    {raw_markup}
    </div>
    """

    components.html(framed_markup, height=320, width=650, scrolling=False)
|
|
|
|
def text_extract_from_html(html_file):
    """Extract plain text from an uploaded HTML file.

    Parameters
    ----------
    html_file : uploaded file object
        A binary file-like object containing UTF-8 HTML.

    Returns
    -------
    str
        The document's visible text with all whitespace collapsed to
        single spaces.
    """

    # Rewind defensively: a previous consumer may have advanced the stream,
    # and read() from EOF would silently return an empty document.
    html_file.seek(0)
    html_content = html_file.read().decode('utf-8')

    # Parse the markup and pull out only the visible text.
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text()

    # split() with no arguments drops all runs of whitespace, so a separate
    # per-token strip() is unnecessary.
    return ' '.join(text.split())
|
|
|
|
def classify_text_with_huggingface_api(extracted_text):
    """Classify text with the hosted Hugging Face inference endpoint.

    Parameters
    ----------
    extracted_text : str
        Plain text to classify (callers truncate it to the model's limit).

    Returns
    -------
    list[dict] | None
        The list of ``{'label': ..., 'score': ...}`` candidates on success,
        or ``None`` on any HTTP error, timeout, or connection failure.
    """

    # Pick up HUGGINGFACE_TOKEN from a local .env file if present.
    load_dotenv()
    hf_token = os.getenv("HUGGINGFACE_TOKEN")

    API_URL = "https://api-inference.huggingface.co/models/gopiashokan/Financial-Document-Classification-using-Deep-Learning"
    HEADERS = {"Authorization": f"Bearer {hf_token}"}

    try:
        # A missing timeout would hang the Streamlit app indefinitely if the
        # inference endpoint stalls; fail after 30s and fall through to None.
        response = requests.post(API_URL, headers=HEADERS,
                                 json={"inputs": extracted_text}, timeout=30)
    except requests.exceptions.RequestException:
        # Network failure / timeout — reuse the existing "try again" path.
        return None

    if response.status_code == 200:
        result = response.json()
        return result[0]
    else:
        return None
|
|
| |
def prediction(input_file):
    """Run the full classification pipeline on an uploaded HTML file and
    render the predicted class (or a retry message) in the Streamlit UI.

    Parameters
    ----------
    input_file : uploaded file object
        The HTML file returned by ``st.file_uploader``.
    """

    extracted_text = text_extract_from_html(input_file)

    # The hosted model accepts limited input; keep only the first 512
    # characters of the extracted text.
    extracted_text = extracted_text[0:512]

    result = classify_text_with_huggingface_api(extracted_text)

    if result is not None:

        # Highest-scoring candidate. Renamed from `prediction`, which
        # shadowed this function's own name.
        top_prediction = max(result, key=lambda x: x['score'])

        # Map the model's raw LABEL_* outputs to human-readable classes.
        label_mapping = {'LABEL_0':'Others', 'LABEL_1':'Balance Sheets', 'LABEL_2':'Notes', 'LABEL_3':'Cash Flow', 'LABEL_4':'Income Statement'}

        predicted_class = label_mapping[top_prediction['label']]
        confidence = top_prediction['score'] * 100

        add_vertical_space(1)
        st.markdown(f"""
        <div style="text-align: center; line-height: 1; padding: 0px;">
        <h4 style="color: orange; margin: 0px; padding: 0px;">{confidence:.2f}% Match Found</h4>
        <h3 style="color: green; margin-top: 10px; padding: 0px;">Predicted Class = {predicted_class}</h3>
        </div>
        """, unsafe_allow_html=True)

    else:
        # API call failed (network error or non-200) — ask the user to retry.
        add_vertical_space(1)
        st.markdown(f'<h4 style="text-align: center; color: orange; margin-top: 10px;">Refresh the Page and Try Again</h4>',
                    unsafe_allow_html=True)
|
|
|
|
|
|
| |
# --- Application entry point -------------------------------------------------
streamlit_config()

try:
    # Only HTML uploads are accepted; the uploader returns None until the
    # user supplies a file.
    uploaded_file = st.file_uploader('Upload an HTML file', type='html')

    if uploaded_file is not None:
        # Preview the document, then classify it with a progress spinner.
        display_html_document(uploaded_file)

        with st.spinner('Processing'):
            prediction(uploaded_file)
            add_vertical_space(2)

except Exception as e:
    # Top-level UI boundary: surface any unexpected failure on the page
    # instead of crashing the app.
    st.markdown(f'<h3 style="text-align: center;">{e}</h3>', unsafe_allow_html=True)
|
|