# Source: Hugging Face Space "Gowthamvemula/ITC_Financial_Assistant"
# (page header captured with the file: Space status "Sleeping",
# file size 6,412 Bytes)

import streamlit as st
import sqlite3
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader
from langchain.docstore.document import Document

# Initialize models
@st.cache_resource
def load_models():
    """Create the language model client and the sentence-embedding model.

    Returns:
        A ``(llm, sentence_transformer)`` tuple: an ``Ollama`` client
        configured for the ``llama3`` model, and a ``SentenceTransformer``
        loaded from ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    chat_model = Ollama(model="llama3")
    embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    return (chat_model, embedder)

# Bind the shared model objects at module level; load_models() is wrapped in
# st.cache_resource, and the functions below read these two globals.
llm, sentence_transformer = load_models()

# Custom embedding function
def sentence_transformer_embedding(texts):
    """Embed ``texts`` with the module-level sentence-transformer model.

    Args:
        texts: The input passed straight to ``sentence_transformer.encode``.

    Returns:
        The encoder output converted to plain Python lists via ``.tolist()``.
    """
    encoded = sentence_transformer.encode(texts)
    return encoded.tolist()

# Streamlit UI
# Page heading and a short description of what the app does.
st.title("📊 ITC Financial Analysis Assistant")
st.markdown("""
Analyze ITC's financial documents using local AI (Llama 3).  
Upload annual reports, presentations, or paste text below.
""")

# File upload section
# Restricted to PDF and TXT; multiple files may be selected at once.
uploaded_files = st.file_uploader(
    "Upload financial documents (PDF or TXT)",
    type=["pdf", "txt"],
    accept_multiple_files=True
)

# Text input alternative
# Either this text or the uploads above (or both) trigger processing below.
manual_text = st.text_area("Or paste financial text directly:")

# Database setup
def init_database(db_path='itc_finance.db'):
    """Create the ``documents`` table if it does not exist yet.

    The table stores one row per text chunk with the columns ``id``
    (autoincrementing primary key), ``source``, ``content`` and
    ``embedding_id``. Calling this repeatedly is safe because of
    ``CREATE TABLE IF NOT EXISTS``.

    Args:
        db_path: Path of the SQLite database file. Defaults to the file the
            rest of the app uses.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        # "with conn" commits on success and rolls back on error; it does not
        # close the connection, hence the surrounding try/finally.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    source TEXT,
                    content TEXT,
                    embedding_id TEXT
                )
            ''')
    finally:
        conn.close()

# Process uploaded files
class _SentenceTransformerEmbeddings:
    """Adapter giving Chroma the ``embed_documents``/``embed_query`` methods.

    The Chroma wrapper calls these two methods on its ``embedding_function``;
    a bare function does not provide them.
    """

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per text."""
        return sentence_transformer_embedding(list(texts))

    def embed_query(self, text):
        """Return the embedding for a single query string."""
        return sentence_transformer_embedding([text])[0]


def _read_pdf_pages(uploaded_file):
    """Return the text of each page chunk that PyPDFLoader yields for the upload."""
    import tempfile

    # PyPDFLoader needs a real path. Use a generated temp name rather than the
    # user-supplied file name, and always remove the file afterwards.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        tmp_path = tmp.name
    try:
        pages = PyPDFLoader(tmp_path).load_and_split()
    finally:
        os.remove(tmp_path)
    return [page.page_content for page in pages]


def _store_chunks(cursor, chroma_db, source, chunks):
    """Insert ``chunks`` into SQLite and Chroma, linking the two by id."""
    if not chunks:
        return

    row_ids = []
    for chunk in chunks:
        cursor.execute(
            "INSERT INTO documents (source, content) VALUES (?, ?)",
            (source, chunk)
        )
        row_ids.append(cursor.lastrowid)

    # Use the SQL row id as the Chroma id so that documents.embedding_id
    # really identifies the stored vector. One call per source lets the
    # encoder embed all of its chunks in a single batch.
    embedding_ids = [str(row_id) for row_id in row_ids]
    chroma_db.add_texts(
        texts=chunks,
        metadatas=[{"source": source, "sql_id": row_id} for row_id in row_ids],
        ids=embedding_ids
    )

    cursor.executemany(
        "UPDATE documents SET embedding_id = ? WHERE id = ?",
        list(zip(embedding_ids, row_ids))
    )


@st.cache_resource
def _process_documents_cached(_uploaded_files, manual_text, file_fingerprint):
    """Index the given inputs once per distinct (text, file fingerprint) pair.

    ``_uploaded_files`` is underscore-prefixed so Streamlit does not try to
    hash the file objects; ``file_fingerprint`` exists only to make the cache
    key change when the set of uploaded files changes.
    """
    init_database()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )

    chroma_db = Chroma(
        embedding_function=_SentenceTransformerEmbeddings(),
        persist_directory="./chroma_db"
    )

    documents = []
    conn = sqlite3.connect('itc_finance.db')
    try:
        cursor = conn.cursor()

        # Process uploaded files
        for uploaded_file in _uploaded_files:
            # Case-insensitive so "REPORT.PDF" is not parsed as plain text.
            if uploaded_file.name.lower().endswith('.pdf'):
                page_texts = _read_pdf_pages(uploaded_file)
            else:
                # Decode explicitly instead of relying on the locale encoding.
                page_texts = [
                    uploaded_file.getvalue().decode("utf-8", errors="replace")
                ]

            for page_text in page_texts:
                _store_chunks(
                    cursor,
                    chroma_db,
                    uploaded_file.name,
                    text_splitter.split_text(page_text)
                )
            documents.append(uploaded_file.name)

        # Process manual text
        if manual_text:
            _store_chunks(
                cursor,
                chroma_db,
                "Manual Input",
                text_splitter.split_text(manual_text)
            )
            documents.append("Manual Input")

        conn.commit()
    finally:
        # Close even when parsing or embedding fails; uncommitted rows are
        # discarded with the connection.
        conn.close()

    return chroma_db, documents


def process_documents(_uploaded_files, manual_text=""):
    """Split, store and embed the uploaded files and/or pasted text.

    Each chunk is written to the ``documents`` table of ``itc_finance.db`` and
    added to the Chroma collection persisted under ``./chroma_db``.

    Args:
        _uploaded_files: Iterable of Streamlit uploaded files (PDF or text);
            may be empty or ``None``.
        manual_text: Optional pasted text, indexed under the source name
            ``"Manual Input"``.

    Returns:
        A ``(chroma_db, documents)`` tuple: the Chroma vector store and the
        list of source names that were processed.
    """
    import hashlib

    files = list(_uploaded_files or [])
    # Content-based fingerprint: without it the cache key would consist of
    # manual_text alone and newly uploaded files would be silently ignored.
    fingerprint = tuple(
        (f.name, hashlib.sha256(f.getvalue()).hexdigest()) for f in files
    )
    return _process_documents_cached(files, manual_text, fingerprint)

# Query engine
def get_query_engine(chroma_db):
    """Build the retrieval-augmented question-answering chain.

    Args:
        chroma_db: Vector store providing ``as_retriever``.

    Returns:
        A runnable that accepts ``{"question": <str>}``, retrieves the three
        most similar chunks, fills the prompt, calls the module-level ``llm``
        and returns the answer as a string.
    """
    from operator import itemgetter

    # The citation example must not contain a "{...}" placeholder: every such
    # name becomes a required prompt variable, and only "context" and
    # "question" are supplied below. The context carries no page numbers, so
    # the example asks for the source name only.
    prompt = ChatPromptTemplate.from_template("""
    [INST] <<SYS>>
    You are an expert financial analyst for ITC Limited.
    Use only the provided context to answer.
    Cite sources like: [Source: <source name>]
    <</SYS>>
    
    Context: {context}
    
    Question: {question}[/INST]
    """)

    def format_docs(docs):
        """Join retrieved chunks into one context string with their sources."""
        return "\n\n".join(
            f"Document Excerpt: {doc.page_content}\n"
            f"Source: {doc.metadata.get('source', 'Unknown')}"
            for doc in docs
        )

    retriever = chroma_db.as_retriever(search_kwargs={"k": 3})
    pick_question = itemgetter("question")

    return (
        {
            # The retriever expects the query string, not the whole input
            # dict, so extract the question before retrieving.
            "context": pick_question | retriever | format_docs,
            "question": pick_question
        }
        | prompt
        | llm
        | StrOutputParser()
    )

# Main app logic
# Run the pipeline only once the user has supplied files or pasted text.
if uploaded_files or manual_text:
    with st.spinner("Processing documents..."):
        chroma_db, processed_docs = process_documents(uploaded_files, manual_text)
    
    st.success(f"Processed {len(processed_docs)} documents")
    query_engine = get_query_engine(chroma_db)
    
    # Query interface
    st.divider()
    question = st.text_input("Ask about ITC's finances:", placeholder="E.g. What was the revenue growth in 2023?")
    
    if question:
        with st.spinner("Analyzing..."):
            answer = query_engine.invoke({"question": question})
        
        st.subheader("Analysis Result")
        st.markdown(answer)
        
        with st.expander("View source documents"):
            # Use the same k as the retriever in get_query_engine (k=3) so the
            # chunks shown here are the ones the answer was based on; the
            # default k of similarity_search would list an extra chunk.
            st.write(chroma_db.similarity_search(question, k=3))
else:
    st.info("Please upload documents or enter text to begin analysis")

# Sidebar with info
# Static help content rendered in the sidebar on every run.
with st.sidebar:
    st.markdown("## How to Use")
    st.markdown("""
    1. Upload PDF reports/presentations
    2. Or paste financial text
    3. Ask questions about the data
    """)
    
    st.markdown("## Sample Questions")
    st.markdown("""
    - What was ITC's net profit in 2023?
    - Compare revenue between 2022-2024
    - Show me key financial ratios
    """)
    
    st.markdown("## System Info")
    # Plain string: the original f-string had no placeholders to interpolate.
    st.code("Using: Llama 3 (local)\nEmbeddings: sentence-transformers/all-MiniLM-L6-v2")