# Hugging Face Space: Financial Ratio Extractor from IFRS Reports (Streamlit app)
# --- Imports and configuration ----------------------------------------------
import os
import re  # hoisted here: used by parse_metrics() below

import pandas as pd
import streamlit as st
import fitz  # PyMuPDF, for PDF text extraction
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Hugging Face access token for the gated Llama checkpoint. Set HF_API_TOKEN
# in the host environment; None if unset (downloads of gated models will fail).
hf_token = os.getenv("HF_API_TOKEN")
# Load the model (Meta-Llama 3.1 8B) once per server process.
@st.cache_resource
def load_model():
    """Load Meta-Llama 3.1 8B Instruct and cache it across Streamlit reruns.

    Streamlit re-executes this script on every interaction; without
    st.cache_resource the multi-GB checkpoint would be re-instantiated
    each time. `token=` replaces the deprecated `use_auth_token=`.
    """
    return AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", token=hf_token
    )

model = load_model()
# Function to extract text from PDF
def extract_pdf_text(file):
    """Return the concatenated text of every page of an uploaded PDF.

    Parameters
    ----------
    file : file-like object with .read() (e.g. a Streamlit UploadedFile).

    Returns
    -------
    str : text of all pages in page order ("" for an empty document).
    """
    doc = fitz.open(stream=file.read(), filetype="pdf")
    try:
        # join() instead of repeated += (linear, not quadratic, concatenation)
        return "".join(page.get_text("text") for page in doc)
    finally:
        doc.close()  # BUGFIX: the document was never closed (resource leak)
# Function to chunk text into smaller sections
def chunk_text(text, max_tokens=1000):
    """Split *text* into period-terminated chunks of at most ~max_tokens words.

    Sentences are approximated by splitting on '.', and "tokens" by
    whitespace words. Consecutive sentences are packed into a chunk until
    adding the next one would exceed *max_tokens*.

    Fixes over the original:
    - the sentence that overflows a chunk kept its trailing '.' dropped;
    - empty fragments (e.g. after the final period, or "" input) produced
      spurious "." chunks.

    Returns
    -------
    list[str] : non-empty, stripped chunks ([] for empty/blank input).
    """
    chunks = []
    current_chunk = ""
    current_token_count = 0
    for sentence in text.split('.'):
        if not sentence.strip():
            continue  # skip empty fragments so we never emit "." chunks
        token_count = len(sentence.split())
        if current_token_count + token_count > max_tokens:
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            # BUGFIX: keep the sentence terminator when starting a new chunk
            current_chunk = sentence + "."
            current_token_count = token_count
        else:
            current_chunk += sentence + "."
            current_token_count += token_count
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
# Prompt generation for extracting financial data

# IFRS metrics the model is asked to extract, in presentation order.
_PROMPT_METRICS = (
    "Revenue",
    "Net Income",
    "Total Assets",
    "Total Liabilities",
    "Shareholders' Equity",
    "Current Assets",
    "Current Liabilities",
)


def generate_extraction_prompt(chunk):
    """Return the instruction prompt asking the LLM for IFRS metrics in *chunk*."""
    bullet_list = "\n".join(f"- {metric}" for metric in _PROMPT_METRICS)
    return (
        "\n"
        "From the following text, please extract the following financial metrics in IFRS format:\n"
        f"{bullet_list}\n"
        "If the information is not found in the text, return 'Not Available'.\n"
        f"Text: {chunk}\n"
    )
# Function to query Meta-Llama for each chunk
from functools import lru_cache


@lru_cache(maxsize=1)
def _get_generation_pipeline():
    """Build the text-generation pipeline once and reuse it for every chunk.

    BUGFIX: the original re-downloaded and re-instantiated the 8B model AND
    its tokenizer on every single chunk, which is prohibitively slow and
    memory-hungry; lru_cache makes this a one-time cost per process.
    """
    llm = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=hf_token
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=hf_token
    )
    return pipeline("text-generation", model=llm, tokenizer=tokenizer)


def extract_financial_metrics_from_chunk(chunk):
    """Run the extraction prompt for one chunk and return the raw model text."""
    prompt = generate_extraction_prompt(chunk)
    nlp = _get_generation_pipeline()
    response = nlp(prompt)
    return response[0]['generated_text']
# Process the PDF text through the model
def process_pdf_text_for_metrics(text):
    """Chunk *text* and query the model for each chunk.

    Returns the raw model responses as a list, in chunk order.
    """
    return [
        extract_financial_metrics_from_chunk(piece)
        for piece in chunk_text(text)
    ]
# Function to parse the metrics from the model response
import re

# Metric labels in match-priority order (mirrors the original if/elif chain:
# the first label found on a line claims that line).
_METRIC_LABELS = (
    "Revenue",
    "Net Income",
    "Total Assets",
    "Total Liabilities",
    "Shareholders' Equity",
    "Current Assets",
    "Current Liabilities",
)

# Matches integers/decimals with optional thousands separators, e.g.
# "1,234" or "56.7". BUGFIX: the original r'\d+' split "1,234" into
# ["1", "234"] and "56.7" into ["56", "7"].
_NUMBER_RE = re.compile(r'\d[\d,]*(?:\.\d+)?')


def parse_metrics(extracted_text):
    """Scan a model response line by line and collect numbers per metric label.

    Returns
    -------
    dict : metric label -> list of number strings (thousands commas
    stripped). Labels never mentioned in the text are absent.
    """
    metrics = {}
    for line in extracted_text.split("\n"):
        for label in _METRIC_LABELS:
            if label in line:
                metrics[label] = [
                    number.replace(",", "") for number in _NUMBER_RE.findall(line)
                ]
                break  # first matching label wins, like the elif chain
    return metrics
# Function to aggregate metrics from all chunks
def aggregate_metrics(extracted_metrics):
    """Merge per-chunk model responses into one metrics dict.

    Each response is parsed with parse_metrics(); for every metric the first
    truthy (non-empty) value encountered wins. Metrics never found stay None.
    """
    merged = dict.fromkeys((
        "Revenue",
        "Net Income",
        "Total Assets",
        "Total Liabilities",
        "Shareholders' Equity",
        "Current Assets",
        "Current Liabilities",
    ))
    for raw_response in extracted_metrics:
        for label, numbers in parse_metrics(raw_response).items():
            if not merged[label]:
                merged[label] = numbers
    return merged
# Function to calculate financial ratios
def calculate_financial_ratios(metrics):
    """Compute liquidity, leverage and profitability ratios from parsed metrics.

    Parameters
    ----------
    metrics : dict mapping metric label -> list of number strings (the first
        element of each list is used), as produced by aggregate_metrics().

    Returns
    -------
    dict of ratio name -> float on success; otherwise a human-readable error
    string (the UI renders whichever it gets).
    """
    def first_value(label):
        # float() instead of int(): model output like "56.7" crashed int().
        return float(metrics[label][0])

    try:
        equity = first_value("Shareholders' Equity")
        return {
            'Current Ratio': first_value('Current Assets') / first_value('Current Liabilities'),
            'Debt to Equity': first_value('Total Liabilities') / equity,
            'Return on Assets (ROA)': first_value('Net Income') / first_value('Total Assets'),
            'Return on Equity (ROE)': first_value('Net Income') / equity,
        }
    except (TypeError, KeyError, IndexError, ValueError, ZeroDivisionError):
        # BUGFIX: also catch ValueError (non-numeric strings) and
        # ZeroDivisionError (zero denominators), which previously crashed.
        return "Some metrics were not extracted properly or are missing."
# --- Streamlit UI ------------------------------------------------------------
st.title("Financial Ratio Extractor from IFRS Reports")
st.write("""
Upload an IFRS financial report (PDF), and this app will automatically extract key financial metrics such as Revenue,
Net Income, Total Assets, and calculate important financial ratios like ROA, ROE, and Debt-to-Equity Ratio.
You can also ask questions about the financial data using Meta-Llama.
""")

# File uploader for PDF
uploaded_file = st.file_uploader("Upload your IFRS report (PDF)", type=["pdf"])

# If a PDF is uploaded
if uploaded_file:
    st.write("Processing your document, please wait...")

    # Extract text, run the model over each chunk, then merge the results.
    pdf_text = extract_pdf_text(uploaded_file)
    extracted_metrics = process_pdf_text_for_metrics(pdf_text)
    aggregated_metrics = aggregate_metrics(extracted_metrics)
    financial_ratios = calculate_financial_ratios(aggregated_metrics)

    # Display extracted financial ratios (dict) or the error message (str).
    st.subheader("Extracted Financial Ratios:")
    if isinstance(financial_ratios, dict):
        st.table(pd.DataFrame(financial_ratios.items(), columns=["Ratio", "Value"]))
    else:
        st.write(financial_ratios)

    # Free-form Q&A about the extracted data
    st.subheader("Ask Meta-Llama about the extracted financial data:")
    question = st.text_input("Enter your question here")
    if st.button("Ask Meta-Llama"):
        if question:
            # BUGFIX: the raw AutoModelForCausalLM object is not callable with
            # a plain string; wrap it in a text-generation pipeline first.
            tokenizer = AutoTokenizer.from_pretrained(
                "meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=hf_token
            )
            qa = pipeline("text-generation", model=model, tokenizer=tokenizer)
            response = qa(question)
            st.write("Meta-Llama's Response:")
            st.write(response[0]['generated_text'])