# Hugging Face Space: Financial Ratio Extractor from IFRS Reports (Streamlit app)
# --- Imports and configuration ----------------------------------------------
import os
import re  # hoisted here: used by parse_metrics() below

import pandas as pd
import streamlit as st
import fitz  # PyMuPDF, for PDF text extraction
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Hugging Face access token for the gated Llama checkpoint. Set HF_API_TOKEN
# in the host environment; None if unset (downloads of gated models will fail).
hf_token = os.getenv("HF_API_TOKEN")
# Load the model (Meta-Llama 3.1 8B) once per server process.
@st.cache_resource
def load_model():
    """Load Meta-Llama 3.1 8B Instruct and cache it across Streamlit reruns.

    Streamlit re-executes this script on every interaction; without
    st.cache_resource the multi-GB checkpoint would be re-instantiated
    each time. `token=` replaces the deprecated `use_auth_token=`.
    """
    return AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", token=hf_token
    )

model = load_model()
# Function to extract text from PDF
def extract_pdf_text(file):
    """Return the concatenated text of every page of an uploaded PDF.

    Parameters
    ----------
    file : file-like object with .read() (e.g. a Streamlit UploadedFile).

    Returns
    -------
    str : text of all pages in page order ("" for an empty document).
    """
    doc = fitz.open(stream=file.read(), filetype="pdf")
    try:
        # join() instead of repeated += (linear, not quadratic, concatenation)
        return "".join(page.get_text("text") for page in doc)
    finally:
        doc.close()  # BUGFIX: the document was never closed (resource leak)
# Function to chunk text into smaller sections
def chunk_text(text, max_tokens=1000):
    """Split *text* into period-terminated chunks of at most ~max_tokens words.

    Sentences are approximated by splitting on '.', and "tokens" by
    whitespace words. Consecutive sentences are packed into a chunk until
    adding the next one would exceed *max_tokens*.

    Fixes over the original:
    - the sentence that overflows a chunk kept its trailing '.' dropped;
    - empty fragments (e.g. after the final period, or "" input) produced
      spurious "." chunks.

    Returns
    -------
    list[str] : non-empty, stripped chunks ([] for empty/blank input).
    """
    chunks = []
    current_chunk = ""
    current_token_count = 0
    for sentence in text.split('.'):
        if not sentence.strip():
            continue  # skip empty fragments so we never emit "." chunks
        token_count = len(sentence.split())
        if current_token_count + token_count > max_tokens:
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            # BUGFIX: keep the sentence terminator when starting a new chunk
            current_chunk = sentence + "."
            current_token_count = token_count
        else:
            current_chunk += sentence + "."
            current_token_count += token_count
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
# Prompt generation for extracting financial data

# IFRS metrics the model is asked to extract, in presentation order.
_PROMPT_METRICS = (
    "Revenue",
    "Net Income",
    "Total Assets",
    "Total Liabilities",
    "Shareholders' Equity",
    "Current Assets",
    "Current Liabilities",
)


def generate_extraction_prompt(chunk):
    """Return the instruction prompt asking the LLM for IFRS metrics in *chunk*."""
    bullet_list = "\n".join(f"- {metric}" for metric in _PROMPT_METRICS)
    return (
        "\n"
        "From the following text, please extract the following financial metrics in IFRS format:\n"
        f"{bullet_list}\n"
        "If the information is not found in the text, return 'Not Available'.\n"
        f"Text: {chunk}\n"
    )
# Function to query Meta-Llama for each chunk
from functools import lru_cache


@lru_cache(maxsize=1)
def _get_generation_pipeline():
    """Build the text-generation pipeline once and reuse it for every chunk.

    BUGFIX: the original re-downloaded and re-instantiated the 8B model AND
    its tokenizer on every single chunk, which is prohibitively slow and
    memory-hungry; lru_cache makes this a one-time cost per process.
    """
    llm = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=hf_token
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=hf_token
    )
    return pipeline("text-generation", model=llm, tokenizer=tokenizer)


def extract_financial_metrics_from_chunk(chunk):
    """Run the extraction prompt for one chunk and return the raw model text."""
    prompt = generate_extraction_prompt(chunk)
    nlp = _get_generation_pipeline()
    response = nlp(prompt)
    return response[0]['generated_text']
# Process the PDF text through the model
def process_pdf_text_for_metrics(text):
    """Chunk *text* and query the model for each chunk.

    Returns the raw model responses as a list, in chunk order.
    """
    return [
        extract_financial_metrics_from_chunk(piece)
        for piece in chunk_text(text)
    ]
# Function to parse the metrics from the model response
import re

# Metric labels in match-priority order (mirrors the original if/elif chain:
# the first label found on a line claims that line).
_METRIC_LABELS = (
    "Revenue",
    "Net Income",
    "Total Assets",
    "Total Liabilities",
    "Shareholders' Equity",
    "Current Assets",
    "Current Liabilities",
)

# Matches integers/decimals with optional thousands separators, e.g.
# "1,234" or "56.7". BUGFIX: the original r'\d+' split "1,234" into
# ["1", "234"] and "56.7" into ["56", "7"].
_NUMBER_RE = re.compile(r'\d[\d,]*(?:\.\d+)?')


def parse_metrics(extracted_text):
    """Scan a model response line by line and collect numbers per metric label.

    Returns
    -------
    dict : metric label -> list of number strings (thousands commas
    stripped). Labels never mentioned in the text are absent.
    """
    metrics = {}
    for line in extracted_text.split("\n"):
        for label in _METRIC_LABELS:
            if label in line:
                metrics[label] = [
                    number.replace(",", "") for number in _NUMBER_RE.findall(line)
                ]
                break  # first matching label wins, like the elif chain
    return metrics
# Function to aggregate metrics from all chunks
def aggregate_metrics(extracted_metrics):
    """Merge per-chunk model responses into one metrics dict.

    Each response is parsed with parse_metrics(); for every metric the first
    truthy (non-empty) value encountered wins. Metrics never found stay None.
    """
    merged = dict.fromkeys((
        "Revenue",
        "Net Income",
        "Total Assets",
        "Total Liabilities",
        "Shareholders' Equity",
        "Current Assets",
        "Current Liabilities",
    ))
    for raw_response in extracted_metrics:
        for label, numbers in parse_metrics(raw_response).items():
            if not merged[label]:
                merged[label] = numbers
    return merged
# Function to calculate financial ratios
def calculate_financial_ratios(metrics):
    """Compute liquidity, leverage and profitability ratios from parsed metrics.

    Parameters
    ----------
    metrics : dict mapping metric label -> list of number strings (the first
        element of each list is used), as produced by aggregate_metrics().

    Returns
    -------
    dict of ratio name -> float on success; otherwise a human-readable error
    string (the UI renders whichever it gets).
    """
    def first_value(label):
        # float() instead of int(): model output like "56.7" crashed int().
        return float(metrics[label][0])

    try:
        equity = first_value("Shareholders' Equity")
        return {
            'Current Ratio': first_value('Current Assets') / first_value('Current Liabilities'),
            'Debt to Equity': first_value('Total Liabilities') / equity,
            'Return on Assets (ROA)': first_value('Net Income') / first_value('Total Assets'),
            'Return on Equity (ROE)': first_value('Net Income') / equity,
        }
    except (TypeError, KeyError, IndexError, ValueError, ZeroDivisionError):
        # BUGFIX: also catch ValueError (non-numeric strings) and
        # ZeroDivisionError (zero denominators), which previously crashed.
        return "Some metrics were not extracted properly or are missing."
# --- Streamlit UI ------------------------------------------------------------
st.title("Financial Ratio Extractor from IFRS Reports")
st.write("""
Upload an IFRS financial report (PDF), and this app will automatically extract key financial metrics such as Revenue,
Net Income, Total Assets, and calculate important financial ratios like ROA, ROE, and Debt-to-Equity Ratio.
You can also ask questions about the financial data using Meta-Llama.
""")

# File uploader for PDF
uploaded_file = st.file_uploader("Upload your IFRS report (PDF)", type=["pdf"])

# If a PDF is uploaded
if uploaded_file:
    st.write("Processing your document, please wait...")

    # Extract text, run the model over each chunk, then merge the results.
    pdf_text = extract_pdf_text(uploaded_file)
    extracted_metrics = process_pdf_text_for_metrics(pdf_text)
    aggregated_metrics = aggregate_metrics(extracted_metrics)
    financial_ratios = calculate_financial_ratios(aggregated_metrics)

    # Display extracted financial ratios (dict) or the error message (str).
    st.subheader("Extracted Financial Ratios:")
    if isinstance(financial_ratios, dict):
        st.table(pd.DataFrame(financial_ratios.items(), columns=["Ratio", "Value"]))
    else:
        st.write(financial_ratios)

    # Free-form Q&A about the extracted data
    st.subheader("Ask Meta-Llama about the extracted financial data:")
    question = st.text_input("Enter your question here")
    if st.button("Ask Meta-Llama"):
        if question:
            # BUGFIX: the raw AutoModelForCausalLM object is not callable with
            # a plain string; wrap it in a text-generation pipeline first.
            tokenizer = AutoTokenizer.from_pretrained(
                "meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=hf_token
            )
            qa = pipeline("text-generation", model=model, tokenizer=tokenizer)
            response = qa(question)
            st.write("Meta-Llama's Response:")
            st.write(response[0]['generated_text'])