|
|
import streamlit as st |
|
|
import sqlite3 |
|
|
import os |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
from langchain_community.vectorstores import Chroma |
|
|
from langchain_core.prompts import ChatPromptTemplate |
|
|
from langchain_community.llms import Ollama |
|
|
from langchain_core.output_parsers import StrOutputParser |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from langchain_community.document_loaders import PyPDFLoader |
|
|
from langchain.docstore.document import Document |
|
|
|
|
|
|
|
|
@st.cache_resource
def load_models():
    """Create the local LLM client and the sentence-embedding model.

    Decorated with ``st.cache_resource`` so the two objects are built once and
    reused instead of being re-created on each call.

    Returns:
        tuple: ``(llm, sentence_transformer)`` - the ``Ollama`` client for the
        ``llama3`` model and the ``all-MiniLM-L6-v2`` ``SentenceTransformer``.
    """
    # The tuple is evaluated left to right, so the LLM client is still
    # constructed before the embedding model.
    return (
        Ollama(model="llama3"),
        SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2'),
    )
|
|
|
|
|
# Module-level handles used by the embedding helper and the query chain below.
(llm, sentence_transformer) = load_models()
|
|
|
|
|
|
|
|
def sentence_transformer_embedding(texts):
    """Encode ``texts`` with the module-level sentence-transformer model.

    Args:
        texts: Input passed straight to ``sentence_transformer.encode``.

    Returns:
        The encoder output converted with ``.tolist()``.
    """
    encoded = sentence_transformer.encode(texts)
    return encoded.tolist()
|
|
|
|
|
|
|
|
# Page header: title plus a short description of what the app does.
# NOTE(review): the leading "π" in the title looks like a mis-decoded emoji -
# confirm the intended glyph before changing it.
st.title("π ITC Financial Analysis Assistant")
st.markdown(
    "\n"
    "Analyze ITC's financial documents using local AI (Llama 3).\n"
    "Upload annual reports, presentations, or paste text below.\n"
)
|
|
|
|
|
|
|
|
# Input widgets: documents can be supplied as uploaded files, pasted text, or both.
uploaded_files = st.file_uploader(
    label="Upload financial documents (PDF or TXT)",
    type=["pdf", "txt"],
    accept_multiple_files=True,
)

manual_text = st.text_area(label="Or paste financial text directly:")
|
|
|
|
|
|
|
|
def init_database():
    """Create the ``documents`` table in ``itc_finance.db`` if it is missing.

    The table has an autoincrement ``id`` plus ``source``, ``content`` and
    ``embedding_id`` text columns. Calling this repeatedly is harmless because
    the statement uses ``CREATE TABLE IF NOT EXISTS``.
    """
    conn = sqlite3.connect('itc_finance.db')
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                source TEXT,
                content TEXT,
                embedding_id TEXT
            )
        ''')
        conn.commit()
    finally:
        # Close even when the DDL fails so the database handle is never leaked.
        conn.close()
|
|
|
|
|
|
|
|
# Upper bound on how many chunks are embedded and written per vector-store call.
_INGEST_BATCH_SIZE = 128


class _SentenceTransformerEmbeddings:
    """Adapter giving Chroma the ``embed_documents`` / ``embed_query`` methods it calls.

    Chroma invokes those two methods on its ``embedding_function``; a bare
    function does not have them, so the module-level helper is wrapped here.
    """

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per entry of ``texts``."""
        return sentence_transformer_embedding(list(texts))

    def embed_query(self, text):
        """Return the embedding of a single query string."""
        return sentence_transformer_embedding([text])[0]


def _load_upload_pages(uploaded_file):
    """Return ``(text, extra_metadata)`` pairs for one uploaded PDF or TXT file."""
    import tempfile

    # Case-insensitive so "REPORT.PDF" is not mistaken for a text file.
    if not uploaded_file.name.lower().endswith(".pdf"):
        # Decode in memory with a fixed encoding instead of relying on the
        # platform default; undecodable bytes are replaced rather than fatal.
        text = bytes(uploaded_file.getbuffer()).decode("utf-8-sig", errors="replace")
        # Normalise line endings the way a text-mode read would have.
        text = text.replace("\r\n", "\n").replace("\r", "\n")
        return [(text, {})]

    # PyPDFLoader needs a path on disk. A generated temp name avoids both
    # collisions and using the client-supplied file name as a filesystem path.
    temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        with temp_pdf:
            temp_pdf.write(uploaded_file.getbuffer())
        # load() yields one Document per page; chunking is done by the caller,
        # so the loader's own default splitter is not applied on top.
        pages = PyPDFLoader(temp_pdf.name).load()
    finally:
        os.remove(temp_pdf.name)

    results = []
    for page in pages:
        page_number = page.metadata.get("page")
        # Chroma metadata values must be scalars, so only keep a real integer.
        extra = {"page": page_number} if isinstance(page_number, int) else {}
        results.append((page.page_content, extra))
    return results


def _store_chunks(cursor, chroma_db, source, records):
    """Write ``(chunk, extra_metadata)`` records for one source to SQLite and Chroma."""
    for start in range(0, len(records), _INGEST_BATCH_SIZE):
        texts, metadatas, row_ids = [], [], []
        for chunk, extra in records[start:start + _INGEST_BATCH_SIZE]:
            cursor.execute(
                "INSERT INTO documents (source, content) VALUES (?, ?)",
                (source, chunk),
            )
            row_ids.append(cursor.lastrowid)
            texts.append(chunk)
            metadatas.append({"source": source, "sql_id": cursor.lastrowid, **extra})

        # add_texts returns the vector-store ids, which is what the
        # embedding_id column is meant to reference.
        embedding_ids = chroma_db.add_texts(texts=texts, metadatas=metadatas)
        cursor.executemany(
            "UPDATE documents SET embedding_id = ? WHERE id = ?",
            zip(embedding_ids, row_ids),
        )


@st.cache_resource
def _ingest_documents(_uploaded_files, file_fingerprints, manual_text):
    """Chunk, embed and store the given inputs; cached per distinct input set.

    ``_uploaded_files`` is skipped by Streamlit's argument hashing (leading
    underscore), so ``file_fingerprints`` carries the files' identity into the
    cache key.
    """
    init_database()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    chroma_db = Chroma(
        embedding_function=_SentenceTransformerEmbeddings(),
        persist_directory="./chroma_db",
    )

    documents = []
    conn = sqlite3.connect('itc_finance.db')
    try:
        cursor = conn.cursor()

        for uploaded_file in _uploaded_files:
            records = [
                (chunk, extra)
                for text, extra in _load_upload_pages(uploaded_file)
                for chunk in text_splitter.split_text(text)
            ]
            _store_chunks(cursor, chroma_db, uploaded_file.name, records)
            # Commit per source so SQLite keeps matching what Chroma already holds
            # if a later document fails.
            conn.commit()
            documents.append(uploaded_file.name)

        if manual_text:
            records = [(chunk, {}) for chunk in text_splitter.split_text(manual_text)]
            _store_chunks(cursor, chroma_db, "Manual Input", records)
            conn.commit()
            documents.append("Manual Input")
    finally:
        conn.close()

    return chroma_db, documents


def process_documents(_uploaded_files, manual_text=""):
    """Ingest uploaded files and/or pasted text into SQLite and the Chroma store.

    Each input is split into 1000-character chunks (200 overlap); every chunk
    becomes a row in ``documents`` and a vector in Chroma, cross-referenced via
    the ``sql_id`` metadata and the ``embedding_id`` column.

    Args:
        _uploaded_files: Uploaded file objects exposing ``name`` and
            ``getbuffer()``; may be empty or ``None``.
        manual_text: Optional pasted text, stored under the source name
            ``"Manual Input"``.

    Returns:
        tuple: ``(chroma_db, documents)`` - the vector store and the list of
        source names that were processed.

    NOTE(review): results are cached per distinct set of inputs, but every new
    set is appended to the persisted store again, so changing the inputs
    re-adds documents that were already ingested - confirm whether
    de-duplication is wanted.
    """
    import hashlib

    uploads = list(_uploaded_files or ())
    # Name + content digest identifies each upload for the cache key, so adding
    # or replacing a file triggers ingestion instead of returning a stale store.
    fingerprints = tuple(
        (uploaded.name, hashlib.sha256(uploaded.getbuffer()).hexdigest())
        for uploaded in uploads
    )
    return _ingest_documents(uploads, fingerprints, manual_text)


# Keep the cache-reset hook reachable under the public name.
process_documents.clear = _ingest_documents.clear
|
|
|
|
|
|
|
|
def get_query_engine(chroma_db):
    """Build the retrieval-augmented question-answering chain.

    Args:
        chroma_db: Vector store exposing ``as_retriever``.

    Returns:
        A runnable that is invoked with ``{"question": <str>}`` and returns the
        model's answer as a string. The three most similar chunks are retrieved
        for the question and injected into the prompt as context.
    """
    from operator import itemgetter

    # The citation example uses <...> rather than {...}: braces would be parsed
    # as a template variable that the chain never supplies.
    prompt = ChatPromptTemplate.from_template("""
[INST] <<SYS>>
You are an expert financial analyst for ITC Limited.
Use only the provided context to answer.
Cite sources like: [Source: <source name>, page X]
<</SYS>>

Context: {context}

Question: {question}[/INST]
""")

    def format_docs(docs):
        """Render retrieved documents as excerpt/source blocks for the prompt."""
        blocks = []
        for doc in docs:
            origin = doc.metadata.get("source", "unknown")
            page = doc.metadata.get("page")
            if isinstance(page, int):
                # PyPDFLoader numbers pages from 0; show a 1-based page.
                origin = f"{origin}, page {page + 1}"
            blocks.append(f"Document Excerpt: {doc.page_content}\nSource: {origin}")
        return "\n\n".join(blocks)

    retriever = chroma_db.as_retriever(search_kwargs={"k": 3})
    question_of = itemgetter("question")

    return (
        {
            # Every branch of the mapping receives the whole input dict, so the
            # question string has to be extracted before it reaches the retriever.
            "context": question_of | retriever | format_docs,
            "question": question_of,
        }
        | prompt
        | llm
        | StrOutputParser()
    )
|
|
|
|
|
|
|
|
# Number of excerpts listed under "View source documents". Kept equal to the
# k=3 used by the retriever in get_query_engine so the list matches the context
# the model was given (similarity_search alone would use its own default).
SOURCE_PREVIEW_K = 3

if uploaded_files or manual_text:
    # Ingest whatever the user supplied, then build the question-answering chain.
    with st.spinner("Processing documents..."):
        chroma_db, processed_docs = process_documents(uploaded_files, manual_text)

    st.success(f"Processed {len(processed_docs)} documents")
    query_engine = get_query_engine(chroma_db)

    st.divider()
    question = st.text_input(
        "Ask about ITC's finances:",
        placeholder="E.g. What was the revenue growth in 2023?",
    )

    if question:
        with st.spinner("Analyzing..."):
            answer = query_engine.invoke({"question": question})

        st.subheader("Analysis Result")
        st.markdown(answer)

        with st.expander("View source documents"):
            st.write(chroma_db.similarity_search(question, k=SOURCE_PREVIEW_K))
else:
    # Nothing to analyse yet: prompt for input instead of showing the question box.
    st.info("Please upload documents or enter text to begin analysis")
|
|
|
|
|
|
|
|
with st.sidebar:
    # Static help content: each entry is a heading followed by its markdown body.
    for _heading, _body in (
        (
            "## How to Use",
            "\n"
            "1. Upload PDF reports/presentations\n"
            "2. Or paste financial text\n"
            "3. Ask questions about the data\n",
        ),
        (
            "## Sample Questions",
            "\n"
            "- What was ITC's net profit in 2023?\n"
            "- Compare revenue between 2022-2024\n"
            "- Show me key financial ratios\n",
        ),
    ):
        st.markdown(_heading)
        st.markdown(_body)

    # Model details are shown as a code block.
    st.markdown("## System Info")
    st.code("Using: Llama 3 (local)\nEmbeddings: sentence-transformers/all-MiniLM-L6-v2")