Spaces:

Gowthamvemula
/

ITC_Financial_Assistant

Sleeping

App Files Files Community

ITC_Financial_Assistant / src /streamlit_app.py

Gowthamvemula

Update src/streamlit_app.py

009a93d verified 8 months ago

raw

history blame contribute delete

6.41 kB

	import streamlit as st
	import sqlite3
	import os
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import Chroma
	from langchain_core.prompts import ChatPromptTemplate
	from langchain_community.llms import Ollama
	from langchain_core.output_parsers import StrOutputParser
	from sentence_transformers import SentenceTransformer
	from langchain_community.document_loaders import PyPDFLoader
	from langchain.docstore.document import Document

	# Initialize models
	@st.cache_resource
	def load_models():
	    """Create the Ollama LLM client and the sentence-transformer encoder.

	    Wrapped in ``st.cache_resource``.

	    Returns:
	        A ``(llm, sentence_transformer)`` tuple.
	    """
	    chat_model = Ollama(model="llama3")
	    encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
	    return chat_model, encoder

	# Bind the models to module-level names; the embedding helper and the
	# query chain defined below read ``sentence_transformer`` and ``llm``.
	llm, sentence_transformer = load_models()

	# Custom embedding function
	def sentence_transformer_embedding(texts):
	    """Encode ``texts`` with the module-level ``sentence_transformer``.

	    Returns:
	        The encoder output converted with ``.tolist()``.
	    """
	    vectors = sentence_transformer.encode(texts)
	    return vectors.tolist()

	# Streamlit UI
	# Page header and introduction.
	st.title("📊 ITC Financial Analysis Assistant")
	intro_text = """
	Analyze ITC's financial documents using local AI (Llama 3).
	Upload annual reports, presentations, or paste text below.
	"""
	st.markdown(intro_text)

	# File upload section: several PDF/TXT files may be selected at once.
	accepted_types = ["pdf", "txt"]
	uploaded_files = st.file_uploader(
	    "Upload financial documents (PDF or TXT)",
	    type=accepted_types,
	    accept_multiple_files=True,
	)

	# Text input alternative to uploading a file.
	manual_text = st.text_area("Or paste financial text directly:")

	# Database setup
	def init_database(db_path='itc_finance.db'):
	    """Create the ``documents`` table if it does not already exist.

	    The table has an auto-incrementing ``id`` plus ``source``, ``content``
	    and ``embedding_id`` text columns. Calling this repeatedly is safe
	    because the statement uses ``CREATE TABLE IF NOT EXISTS``.

	    Args:
	        db_path: Path of the SQLite database file. Defaults to the file
	            the rest of the app opens.
	    """
	    from contextlib import closing

	    # closing() releases the connection even when the DDL raises; the inner
	    # ``with conn`` commits on success and rolls back on error.
	    with closing(sqlite3.connect(db_path)) as conn:
	        with conn:
	            conn.execute('''
	                CREATE TABLE IF NOT EXISTS documents (
	                    id INTEGER PRIMARY KEY AUTOINCREMENT,
	                    source TEXT,
	                    content TEXT,
	                    embedding_id TEXT
	                )
	            ''')

	# Process uploaded files
	class _SentenceTransformerEmbeddings:
	    """Adapter exposing the embedding methods the Chroma wrapper calls.

	    Chroma's ``embedding_function`` is used as an object with
	    ``embed_documents`` and ``embed_query`` methods, so a bare callable
	    cannot be passed directly. This delegates both to
	    ``sentence_transformer_embedding``.
	    """

	    def embed_documents(self, texts):
	        """Return one embedding per entry of ``texts``."""
	        return sentence_transformer_embedding(list(texts))

	    def embed_query(self, text):
	        """Return the embedding of a single query string."""
	        return sentence_transformer_embedding([text])[0]


	def _load_pages(uploaded_file):
	    """Return the ``Document`` pages contained in one uploaded file."""
	    name = uploaded_file.name

	    if not name.lower().endswith('.pdf'):
	        # Decode the upload in memory with an explicit encoding instead of
	        # round-tripping through disk with the platform default encoding.
	        text = uploaded_file.getvalue().decode("utf-8", errors="replace")
	        return [Document(page_content=text)]

	    # PyPDFLoader needs a path on disk. basename() keeps the temp file in
	    # the working directory even if the upload name contains separators.
	    file_path = f"./temp_{os.path.basename(name)}"
	    try:
	        with open(file_path, "wb") as f:
	            f.write(uploaded_file.getbuffer())
	        return PyPDFLoader(file_path).load_and_split()
	    finally:
	        # Remove the temp file even when writing or PDF parsing fails.
	        if os.path.exists(file_path):
	            os.remove(file_path)


	def _store_chunks(cursor, chroma_db, source, chunks):
	    """Insert ``chunks`` into SQLite and add them to Chroma in one batch."""
	    if not chunks:
	        return

	    row_ids = []
	    for chunk in chunks:
	        cursor.execute(
	            "INSERT INTO documents (source, content) VALUES (?, ?)",
	            (source, chunk)
	        )
	        row_ids.append(cursor.lastrowid)

	    # Reuse the SQL row id as the Chroma id so the ``embedding_id`` column
	    # written below refers to the stored vector.
	    chroma_db.add_texts(
	        texts=chunks,
	        metadatas=[{"source": source, "sql_id": row_id} for row_id in row_ids],
	        ids=[str(row_id) for row_id in row_ids]
	    )

	    cursor.executemany(
	        "UPDATE documents SET embedding_id = ? WHERE id = ?",
	        [(str(row_id), row_id) for row_id in row_ids]
	    )


	@st.cache_resource
	def _process_documents_cached(_uploaded_files, manual_text, file_fingerprint):
	    """Cached implementation behind ``process_documents``.

	    ``file_fingerprint`` is not read in the body. Streamlit skips hashing
	    of underscore-prefixed parameters, so the fingerprint is what makes the
	    cache key change when the set of uploads changes.
	    """
	    init_database()
	    conn = sqlite3.connect('itc_finance.db')
	    try:
	        cursor = conn.cursor()

	        text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=1000,
	            chunk_overlap=200
	        )

	        chroma_db = Chroma(
	            embedding_function=_SentenceTransformerEmbeddings(),
	            persist_directory="./chroma_db"
	        )

	        documents = []

	        # Process uploaded files
	        for uploaded_file in _uploaded_files:
	            source = uploaded_file.name
	            for page in _load_pages(uploaded_file):
	                chunks = text_splitter.split_text(page.page_content)
	                _store_chunks(cursor, chroma_db, source, chunks)
	            documents.append(source)

	        # Process manual text
	        if manual_text:
	            chunks = text_splitter.split_text(manual_text)
	            _store_chunks(cursor, chroma_db, "Manual Input", chunks)
	            documents.append("Manual Input")

	        conn.commit()
	    finally:
	        # Always release the connection; uncommitted rows are discarded
	        # when an error interrupts processing.
	        conn.close()

	    return chroma_db, documents


	# Process uploaded files
	def process_documents(_uploaded_files, manual_text=""):
	    """Chunk, store and embed the uploaded files and the pasted text.

	    Each chunk is written to the ``documents`` table of ``itc_finance.db``
	    and added to the Chroma store persisted under ``./chroma_db``.

	    Args:
	        _uploaded_files: Uploaded file objects exposing ``name``,
	            ``getvalue()`` and ``getbuffer()``; may be empty or ``None``.
	        manual_text: Optional pasted text, stored under "Manual Input".

	    Returns:
	        ``(chroma_db, documents)`` where ``documents`` lists the processed
	        source names.
	    """
	    files = list(_uploaded_files or [])
	    # Name and size identify the current uploads for the cache key.
	    file_fingerprint = tuple(
	        (f.name, getattr(f, "size", None)) for f in files
	    )
	    return _process_documents_cached(files, manual_text, file_fingerprint)

	# Query engine
	def get_query_engine(chroma_db):
	    """Build the retrieval-augmented chain that answers from ``chroma_db``.

	    Args:
	        chroma_db: Vector store; queried through ``as_retriever`` for the
	            3 closest chunks.

	    Returns:
	        A runnable to be invoked with ``{"question": <str>}``. It fills the
	        prompt, calls the module-level ``llm`` and parses the output with
	        ``StrOutputParser``.
	    """
	    from operator import itemgetter

	    # The citation example must not contain a single-braced name: the
	    # template parser would treat it as a required input variable, and the
	    # chain only supplies "context" and "question".
	    prompt = ChatPromptTemplate.from_template("""
	[INST] <<SYS>>
	You are an expert financial analyst for ITC Limited.
	Use only the provided context to answer.
	Cite sources like: [Source: <source name>, page X]
	<</SYS>>

	Context: {context}

	Question: {question}[/INST]
	""")

	    def format_docs(docs):
	        """Join retrieved documents into one context string with sources."""
	        return "\n\n".join(
	            f"Document Excerpt: {doc.page_content}\n"
	            f"Source: {doc.metadata.get('source', 'unknown')}"
	            for doc in docs
	        )

	    retriever = chroma_db.as_retriever(search_kwargs={"k": 3})

	    return (
	        {
	            # The chain input is a dict, but the retriever expects the
	            # query string, so extract the question first.
	            "context": itemgetter("question") | retriever | format_docs,
	            "question": itemgetter("question"),
	        }
	        | prompt
	        | llm
	        | StrOutputParser()
	    )

	# Main app logic
	# Nothing to analyse yet: prompt the user. Otherwise ingest and answer.
	if not (uploaded_files or manual_text):
	    st.info("Please upload documents or enter text to begin analysis")
	else:
	    with st.spinner("Processing documents..."):
	        chroma_db, processed_docs = process_documents(uploaded_files, manual_text)

	    processed_count = len(processed_docs)
	    st.success(f"Processed {processed_count} documents")
	    query_engine = get_query_engine(chroma_db)

	    # Query interface
	    st.divider()
	    question = st.text_input(
	        "Ask about ITC's finances:",
	        placeholder="E.g. What was the revenue growth in 2023?",
	    )

	    if question:
	        with st.spinner("Analyzing..."):
	            answer = query_engine.invoke({"question": question})

	        st.subheader("Analysis Result")
	        st.markdown(answer)

	        # Show the chunks the vector store returns for the same question.
	        with st.expander("View source documents"):
	            matching_docs = chroma_db.similarity_search(question)
	            st.write(matching_docs)

	# Sidebar with info
	with st.sidebar:
	    # Each entry is a heading followed by its markdown body.
	    help_sections = (
	        ("## How to Use", """
	1. Upload PDF reports/presentations
	2. Or paste financial text
	3. Ask questions about the data
	"""),
	        ("## Sample Questions", """
	- What was ITC's net profit in 2023?
	- Compare revenue between 2022-2024
	- Show me key financial ratios
	"""),
	    )
	    for heading, body in help_sections:
	        st.markdown(heading)
	        st.markdown(body)

	    st.markdown("## System Info")
	    st.code("Using: Llama 3 (local)\nEmbeddings: sentence-transformers/all-MiniLM-L6-v2")