""" ╔══════════════════════════════════════════════════════════════════════╗ ║ TaxBot AI — app.py ║ ║ Multimodal Financial Co-pilot for Indian MSMEs & Taxpayers ║ ║ Built for Hugging Face Spaces · Powered by Streamlit + LangChain ║ ╚══════════════════════════════════════════════════════════════════════╝ ARCHITECTURE OVERVIEW (Triple-Engine Hybrid RAG): Engine 1 — Knowledge Base (RAG): PDF → ChromaDB → Retrieval Engine 2 — Generative Reasoning: Retrieved context → Claude → Answer Engine 3 — Notice Interpreter: Image/PDF upload → GPT-4o Vision → Summary DEPLOYMENT: 1. Upload this file + requirements.txt to a Hugging Face Space (Streamlit SDK). 2. Set secrets: ANTHROPIC_API_KEY, OPENAI_API_KEY in HF Space Settings. 3. ChromaDB runs in-memory (no external DB needed for the pilot). """ # ───────────────────────────────────────────── # SECTION 0: Imports & Page Configuration # ───────────────────────────────────────────── import os import io import base64 import tempfile import streamlit as st # LangChain — the orchestration backbone from langchain_community.document_loaders import PyPDFLoader from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_community.vectorstores import Chroma from langchain_openai import OpenAIEmbeddings # Embeddings via OpenAI from langchain_anthropic import ChatAnthropic # LLM via Anthropic Claude from langchain_core.prompts import PromptTemplate # Moved to langchain_core # RetrievalQA — try modern location first, fall back to legacy try: from langchain.chains import RetrievalQA except ImportError: from langchain_community.chains import RetrievalQA # OpenAI Vision (for Notice Interpreter) from openai import OpenAI # ── Streamlit Page Setup (must be FIRST Streamlit call) ────────────── st.set_page_config( page_title="TaxBot AI · Indian Tax Co-pilot", page_icon="⚖️", layout="wide", initial_sidebar_state="expanded", ) # ───────────────────────────────────────────── # SECTION 1: Custom CSS — Visual Identity # 
# ─────────────────────────────────────────────
# Design Direction: "Legal Ink on Digital Paper"
# Crisp deep navy + saffron accent. Monospaced touches for the "statutory" feel.
# Evokes trust, authority, and precision — appropriate for a compliance tool.
# NOTE(review): the CSS payload below is empty in this revision — the custom
# styles appear to have been lost; confirm against the deployed Space.
st.markdown("""
""", unsafe_allow_html=True)


# ─────────────────────────────────────────────
# SECTION 2: API Client Initialisation
# ─────────────────────────────────────────────
# Business Purpose: Securely load API keys from Hugging Face Secrets
# (or .env locally). Never hard-code keys in source code.
# All three factories are cached once per app instance via st.cache_resource.

@st.cache_resource
def get_llm():
    """
    Return a LangChain-wrapped Anthropic Claude chat model, or None when
    ANTHROPIC_API_KEY is unset (a sidebar warning is shown in that case).

    Claude handles all the statutory reasoning and answer generation.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not api_key:
        # Degrade gracefully: the UI stays up, only the reasoning engine is off.
        st.warning("⚠️ ANTHROPIC_API_KEY not set. Reasoning engine offline.", icon="⚠️")
        return None
    return ChatAnthropic(
        model="claude-sonnet-4-5",  # Claude Sonnet 4.5 — smart + efficient
        api_key=api_key,
        temperature=0.1,            # Low temp = more deterministic legal answers
        max_tokens=1500,
    )


@st.cache_resource
def get_openai_client():
    """
    Return an OpenAI client used exclusively for Vision-based notice parsing
    (GPT-4o's multimodal capability), or None when OPENAI_API_KEY is unset.
    """
    api_key = os.environ.get("OPENAI_API_KEY", "")
    if not api_key:
        return None
    return OpenAI(api_key=api_key)


@st.cache_resource
def get_embeddings():
    """
    Return an OpenAI Embeddings model for converting text chunks into
    vectors stored in ChromaDB, or None when OPENAI_API_KEY is unset.
    """
    api_key = os.environ.get("OPENAI_API_KEY", "")
    if not api_key:
        return None
    return OpenAIEmbeddings(
        model="text-embedding-3-small",  # Cost-effective, high quality
        api_key=api_key
    )


# ─────────────────────────────────────────────
# SECTION 3: Session State Initialisation
# ─────────────────────────────────────────────
# Think of session_state as the app's short-term memory per user session.
def init_session_state():
    """Seed st.session_state with this app's per-session defaults (idempotent)."""
    defaults = {
        "chat_history": [],     # List of {"role": "user"/"bot", "content": "..."}
        "vectorstore": None,    # ChromaDB instance (built when user uploads PDFs)
        "kb_doc_count": 0,      # Number of chunks indexed
        "kb_file_names": [],    # Names of uploaded files for display
        "notice_result": None,  # Last parsed notice result
    }
    for key, val in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = val


init_session_state()


# ─────────────────────────────────────────────
# SECTION 3b: AUTO-PRELOAD on Startup
# ─────────────────────────────────────────────
# Business Purpose: When deployed on Hugging Face Spaces, all PDFs found in
# the 'docs/' folder are ingested automatically so the Knowledge Base is
# ready on first load — no manual uploads needed.

def _new_splitter() -> RecursiveCharacterTextSplitter:
    """Single source of truth for the chunking policy shared by all ingest paths.

    Why chunk? LLMs have context limits. Smaller chunks = more precise retrieval.
    chunk_size=800 chars ≈ ~200 tokens. Overlap=150 prevents context loss at edges.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
        separators=["\n\n", "\n", ".", " "],  # Prefer splitting at paragraphs
    )


def _ingest_pdf_folder(folder_path: str, collection_name: str, warn: bool = False):
    """
    Shared ingestion pipeline: load every PDF in *folder_path*, tag pages with
    their source filename (for citations), chunk, embed, and store in ChromaDB.

    Args:
        folder_path: Folder to scan for *.pdf files.
        collection_name: ChromaDB collection to write into.
        warn: When True, surface per-file load failures via st.warning;
              otherwise skip unreadable PDFs silently (best-effort).

    Returns:
        (vectorstore, chunk_count, pdf_file_names), or (None, 0, []) when the
        embeddings key, the folder, or any loadable PDFs are missing.
    """
    embeddings = get_embeddings()
    if embeddings is None:
        return None, 0, []
    if not os.path.exists(folder_path):
        return None, 0, []

    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]
    if not pdf_files:
        return None, 0, []

    all_pages = []
    for pdf_name in pdf_files:
        try:
            loader = PyPDFLoader(os.path.join(folder_path, pdf_name))
            pages = loader.load()
            for page in pages:
                page.metadata["source"] = pdf_name  # citation tracking
            all_pages.extend(pages)
        except Exception as e:
            if warn:
                st.warning(f"Could not load {pdf_name}: {e}")
            continue  # best-effort: one bad PDF must not abort the whole build

    if not all_pages:
        return None, 0, []

    chunks = _new_splitter().split_documents(all_pages)
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection_name=collection_name,
    )
    return vectorstore, len(chunks), pdf_files


@st.cache_resource(show_spinner=False)
def autoload_knowledge_base():
    """
    Cached — runs only ONCE per app instance (not per user session).
    Loads all PDFs from the docs/ folder into ChromaDB.

    Returns:
        (vectorstore, chunk_count, file_names) or (None, 0, []) if no docs found.
    """
    # Relative path — matches the HF Space folder structure.
    return _ingest_pdf_folder("docs", "taxbot_preloaded_kb", warn=False)


# Run the auto-preload and populate session state (only if KB not already set)
if st.session_state["vectorstore"] is None:
    _vs, _count, _files = autoload_knowledge_base()
    if _vs is not None:
        st.session_state["vectorstore"] = _vs
        st.session_state["kb_doc_count"] = _count
        st.session_state["kb_file_names"] = _files
        st.session_state["kb_preloaded"] = True  # Flag to show "Pre-loaded" badge


# ─────────────────────────────────────────────
# SECTION 4: ENGINE 1 — Knowledge Base Builder
# ─────────────────────────────────────────────
# Business Logic:
# User uploads PDF circulars/acts → we split them into manageable chunks →
# embed each chunk as a vector → store in ChromaDB.
# Later, when a user asks a question, we retrieve the top-K most relevant
# chunks as "context" for Claude (this is RAG).

def build_knowledge_base(uploaded_files: list) -> tuple[Chroma | None, int]:
    """
    Ingest a list of uploaded PDF files into a ChromaDB vector store.

    Args:
        uploaded_files: List of Streamlit UploadedFile objects.

    Returns:
        (vectorstore, chunk_count) — the ChromaDB instance and total chunks
        indexed, or (None, 0) when the embeddings API key is missing.
    """
    embeddings = get_embeddings()
    if embeddings is None:
        st.error("OpenAI API key required for building the Knowledge Base.")
        return None, 0

    all_chunks = []

    # ── Step 1: Load and parse each PDF ──────────────────────────────
    with st.spinner("📄 Reading and parsing PDFs..."):
        for uploaded_file in uploaded_files:
            # Save to temp file (PyPDFLoader requires a file path)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp.write(uploaded_file.getvalue())
                tmp_path = tmp.name
            try:
                loader = PyPDFLoader(tmp_path)
                raw_pages = loader.load()
                # Add source metadata to each page for citation tracking
                for page in raw_pages:
                    page.metadata["source"] = uploaded_file.name
                all_chunks.extend(raw_pages)
            finally:
                # FIX: clean up even when parsing raises — the previous code
                # leaked the temp file on any PyPDFLoader failure.
                os.unlink(tmp_path)

    # ── Step 2: Chunk the text (shared policy — see _new_splitter) ───
    with st.spinner("✂️ Chunking documents into retrievable segments..."):
        chunks = _new_splitter().split_documents(all_chunks)

    # ── Step 3: Embed and store in ChromaDB ──────────────────────────
    with st.spinner(f"🧠 Embedding {len(chunks)} chunks into vector database..."):
        vectorstore = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            collection_name="taxbot_knowledge_base",
        )

    return vectorstore, len(chunks)


def build_knowledge_base_from_folder(folder_path: str = "docs") -> tuple:
    """
    AUTO-PRELOAD FUNCTION: Ingest all PDFs from a local folder on startup.

    Business Purpose: Judges open the app → KB is already loaded with all
    CBDT Circulars and Finance Act → they can ask questions immediately.

    Args:
        folder_path: Path to the folder containing pre-loaded PDFs.
                     On Hugging Face this is the 'docs/' folder in the Space repo.

    Returns:
        (vectorstore, chunk_count, file_names) tuple.
    """
    # Delegates to the shared pipeline (previously a near-duplicate of the
    # autoload path); per-file load failures are surfaced via st.warning here.
    return _ingest_pdf_folder(folder_path, "taxbot_knowledge_base", warn=True)


# ─────────────────────────────────────────────
# SECTION 5: ENGINE 2 — Generative Reasoning Chain
# ─────────────────────────────────────────────
# Business Logic:
# When a user asks a tax question, we:
#   (a) Retrieve the top-K most relevant document chunks from ChromaDB.
#   (b) Feed those chunks + the user's question into Claude via a
#       carefully crafted prompt that enforces statutory accuracy.
# ── The System Prompt — this is the "personality" of TaxBot AI ─────── TAXBOT_PROMPT_TEMPLATE = """ You are TaxBot AI, an expert Indian tax compliance assistant for MSMEs and individual taxpayers. Your answers must be: 1. GROUNDED: Only use information from the provided context (retrieved statutory excerpts). 2. PLAIN-LANGUAGE: Explain complex legal provisions in simple business terms. 3. STRUCTURED: Use bullet points and section references where helpful. 4. HONEST: If the context does not contain enough information, say so clearly. Never fabricate section numbers or circular references. 5. ACTIONABLE: End with a clear "What you should do" recommendation. RETRIEVED STATUTORY CONTEXT: ────────────────────────────── {context} ────────────────────────────── USER QUESTION: {question} TAXBOT AI RESPONSE: """ TAXBOT_PROMPT = PromptTemplate( input_variables=["context", "question"], template=TAXBOT_PROMPT_TEMPLATE ) def get_tax_answer(question: str, vectorstore: Chroma) -> dict: """ Runs the RAG pipeline: retrieve relevant law chunks, then ask Claude. Args: question: The user's tax query string. vectorstore: The populated ChromaDB instance. Returns: dict with keys: "answer" (str), "sources" (list of source filenames) """ llm = get_llm() if llm is None: return {"answer": "⚠️ LLM not configured. 
Please set ANTHROPIC_API_KEY.", "sources": []} # Build a RetrievalQA chain with our custom prompt qa_chain = RetrievalQA.from_chain_type( llm=llm, chain_type="stuff", # "stuff" = all chunks stuffed into one prompt retriever=vectorstore.as_retriever( search_type="similarity", search_kwargs={"k": 4} # Retrieve top 4 most relevant chunks ), chain_type_kwargs={"prompt": TAXBOT_PROMPT}, return_source_documents=True, # We want to show citations ) result = qa_chain.invoke({"query": question}) # Extract unique source file names for citation display sources = list({ doc.metadata.get("source", "Unknown") for doc in result.get("source_documents", []) }) return { "answer": result.get("result", "No answer generated."), "sources": sources } # ───────────────────────────────────────────── # SECTION 6: ENGINE 3 — Notice Interpreter (Vision AI) # ───────────────────────────────────────────── # Business Logic: # User uploads a scanned tax notice image or PDF. # We encode it as base64 and send to GPT-4o Vision. # The model extracts key details: notice type, demands, deadlines, required action. NOTICE_SYSTEM_PROMPT = """ You are an expert Indian tax notice analyst. When given a tax notice image, extract and summarize the following in a structured JSON format: { "notice_type": "e.g., Section 143(1) Intimation / GST ASMT-10 / etc.", "assessment_year": "AY 20XX-XX", "taxpayer_pan": "PAN or GSTIN if visible", "key_discrepancy": "Plain-language description of what the department found", "amount_involved": "₹ amount of demand or refund", "deadline": "Date by which taxpayer must respond or pay", "required_action": "Specific steps the taxpayer must take", "severity": "LOW / MEDIUM / HIGH", "severity_reason": "Brief reason for severity classification" } If any field is not visible in the notice, set it to "Not specified". """ def parse_tax_notice(image_bytes: bytes, file_type: str = "image/jpeg") -> dict | None: """ Sends a notice image to GPT-4o Vision and returns a structured summary. 
Args: image_bytes: Raw bytes of the uploaded notice image. file_type: MIME type of the image (image/jpeg, image/png, etc.) Returns: Parsed dict with notice details, or None on failure. """ import json client = get_openai_client() if client is None: return {"error": "OpenAI API key not configured. Vision engine offline."} # Encode image to base64 for the Vision API b64_image = base64.b64encode(image_bytes).decode("utf-8") with st.spinner("🔍 Analysing notice with Vision AI..."): response = client.chat.completions.create( model="gpt-4o", messages=[ { "role": "system", "content": NOTICE_SYSTEM_PROMPT }, { "role": "user", "content": [ { "type": "image_url", "image_url": { "url": f"data:{file_type};base64,{b64_image}", "detail": "high" # High detail for text-heavy docs } }, { "type": "text", "text": "Please analyse this Indian tax notice and extract the structured information as specified." } ] } ], max_tokens=1000, temperature=0.0, # Zero temp for factual extraction ) raw_text = response.choices[0].message.content.strip() # Strip markdown code fences if present (GPT sometimes wraps JSON) raw_text = raw_text.replace("```json", "").replace("```", "").strip() try: return json.loads(raw_text) except json.JSONDecodeError: # If JSON parsing fails, return the raw text in a wrapper return {"notice_type": "Parsed (raw)", "raw_response": raw_text} # ───────────────────────────────────────────── # SECTION 7: UI LAYOUT — Sidebar # ───────────────────────────────────────────── with st.sidebar: st.markdown("## ⚖️ TaxBot AI") st.markdown("*Your Indian Tax Co-pilot*") st.markdown("---") # ── Knowledge Base Builder ──────────────────────────────────────── st.markdown("### 📚 Knowledge Base") st.caption("Upload CBDT Circulars, Finance Acts, GST notifications, or any tax PDF.") uploaded_pdfs = st.file_uploader( "Upload Tax Documents (PDF)", type=["pdf"], accept_multiple_files=True, help="These will be ingested into ChromaDB to power the RAG engine." 
) if st.button("⚡ Build Knowledge Base", key="build_kb"): if not uploaded_pdfs: st.warning("Please upload at least one PDF first.") else: vectorstore, chunk_count = build_knowledge_base(uploaded_pdfs) if vectorstore: st.session_state["vectorstore"] = vectorstore st.session_state["kb_doc_count"] = chunk_count st.session_state["kb_file_names"] = [f.name for f in uploaded_pdfs] st.success(f"✅ Knowledge Base ready! {chunk_count} chunks indexed.") # Show current KB status if st.session_state["vectorstore"]: is_preloaded = st.session_state.get("kb_preloaded", False) label = "● Pre-loaded KB Active" if is_preloaded else "● KB Active" st.markdown( f'{label} — {st.session_state["kb_doc_count"]} chunks', unsafe_allow_html=True ) if is_preloaded: st.caption("✅ Core tax documents loaded automatically.") st.caption("Sources:") for fname in st.session_state["kb_file_names"]: st.caption(f" • {fname}") else: st.markdown('● KB Not Built', unsafe_allow_html=True) st.caption("No docs/ folder found. Upload PDFs above to build manually.") st.markdown("---") # ── Settings ───────────────────────────────────────────────────── st.markdown("### ⚙️ Settings") show_sources = st.toggle("Show source citations", value=True) retrieval_k = st.slider("Chunks to retrieve (k)", min_value=2, max_value=8, value=4, help="More chunks = broader context. May increase latency.") st.markdown("---") st.caption("Built for SIH1285 · Hackathon Demo") st.caption("Claude Sonnet 4.5 + GPT-4o Vision") # ───────────────────────────────────────────── # SECTION 8: UI LAYOUT — Main Panel # ───────────────────────────────────────────── # ── Header ──────────────────────────────────── st.markdown("""
Multimodal Financial Co-pilot · Statutory-accurate guidance for Indian MSMEs & Taxpayers
""", unsafe_allow_html=True) # ── Three Tabs: Chat | Notice Interpreter | Audit Risk ─────────────── tab_chat, tab_notice, tab_audit = st.tabs([ "💬 Tax Advisory Chat", "📋 Notice Interpreter", "📊 Audit Risk Scanner" ]) # ══════════════════════════════════════════ # TAB 1: TAX ADVISORY CHAT # ══════════════════════════════════════════ with tab_chat: # ── Display chat history ────────────────────────────────────────── chat_container = st.container() with chat_container: if not st.session_state["chat_history"]: st.markdown("""Assessment Year: {r.get('assessment_year', 'N/A')}
PAN / GSTIN: {r.get('taxpayer_pan', 'N/A')}
🔍 Discrepancy Found:
{r.get('key_discrepancy', 'N/A')}
💰 Amount Involved: {r.get('amount_involved', 'N/A')}
✅ What You Must Do:
{r.get('required_action', 'N/A')}