Spaces:

DRokhade
/

TaxBot_AI

Sleeping

App Files Files Community

DRokhade commited on Mar 13

Commit

67860b3

verified ·

1 Parent(s): cc2f53b

Upload 8 files

Browse files

Files changed (9) hide show

.gitattributes +6 -0
Notices.pdf +3 -0
app.py +1029 -0
circular-11-2025.pdf +3 -0
circular-15-2025.pdf +3 -0
circular-no-14-2025.pdf +3 -0
income-tax-act-1961-as-amended-by-finance-act-2025.pdf +3 -0
requirements.txt +29 -0
waiver-of-interest-circular-no-13-2025.pdf +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+circular-11-2025.pdf filter=lfs diff=lfs merge=lfs -text
+circular-15-2025.pdf filter=lfs diff=lfs merge=lfs -text
+circular-no-14-2025.pdf filter=lfs diff=lfs merge=lfs -text
+income-tax-act-1961-as-amended-by-finance-act-2025.pdf filter=lfs diff=lfs merge=lfs -text
+Notices.pdf filter=lfs diff=lfs merge=lfs -text
+waiver-of-interest-circular-no-13-2025.pdf filter=lfs diff=lfs merge=lfs -text

Notices.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ea284c144b77486b1efb466c77112f2f8fccb2c9465e46a965ddc975207a316
+size 118105

app.py ADDED Viewed

	@@ -0,0 +1,1029 @@

+"""
+╔══════════════════════════════════════════════════════════════════════╗
+║                     TaxBot AI — app.py                              ║
+║   Multimodal Financial Co-pilot for Indian MSMEs & Taxpayers        ║
+║   Built for Hugging Face Spaces · Powered by Streamlit + LangChain  ║
+╚══════════════════════════════════════════════════════════════════════╝
+ARCHITECTURE OVERVIEW (Triple-Engine Hybrid RAG):
+  Engine 1 — Knowledge Base (RAG):   PDF → ChromaDB → Retrieval
+  Engine 2 — Generative Reasoning:   Retrieved context → Claude → Answer
+  Engine 3 — Notice Interpreter:     Image/PDF upload → GPT-4o Vision → Summary
+DEPLOYMENT:
+  1. Upload this file + requirements.txt to a Hugging Face Space (Streamlit SDK).
+  2. Set secrets: ANTHROPIC_API_KEY, OPENAI_API_KEY in HF Space Settings.
+  3. ChromaDB runs in-memory (no external DB needed for the pilot).
+"""
+# ─────────────────────────────────────────────
+# SECTION 0: Imports & Page Configuration
+# ─────────────────────────────────────────────
+import os
+import io
+import base64
+import tempfile
+import streamlit as st
+# LangChain — the orchestration backbone
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain_openai import OpenAIEmbeddings          # Embeddings via OpenAI
+from langchain_anthropic import ChatAnthropic           # LLM via Anthropic Claude
+from langchain.chains import RetrievalQA
+from langchain.prompts import PromptTemplate
+# OpenAI Vision (for Notice Interpreter)
+from openai import OpenAI
+# ── Streamlit Page Setup (must be FIRST Streamlit call) ──────────────
+st.set_page_config(
+    page_title="TaxBot AI · Indian Tax Co-pilot",
+    page_icon="⚖️",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+# ─────────────────────────────────────────────
+# SECTION 1: Custom CSS — Visual Identity
+# ─────────────────────────────────────────────
+# Design Direction: "Legal Ink on Digital Paper"
+# Crisp deep navy + saffron accent. Monospaced touches for the "statutory" feel.
+# Evokes trust, authority, and precision — appropriate for a compliance tool.
+st.markdown("""
+<style>
+  @import url('https://fonts.googleapis.com/css2?family=DM+Serif+Display:ital@0;1&family=JetBrains+Mono:wght@400;600&family=DM+Sans:wght@400;500;600&display=swap');
+  /* ── Root Variables ── */
+  :root {
+    --navy:    #0d1b2a;
+    --saffron: #e8851a;
+    --cream:   #f5f0e8;
+    --teal:    #1a7a6e;
+    --red:     #c0392b;
+    --text:    #1a1a2e;
+    --muted:   #6b7280;
+    --border:  #d4c9b0;
+    --card-bg: #fdfaf5;
+  }
+  /* ── Global Reset ── */
+  html, body, [class*="css"] {
+    font-family: 'DM Sans', sans-serif;
+    background-color: var(--cream) !important;
+    color: var(--text);
+  }
+  /* ── Sidebar ── */
+  [data-testid="stSidebar"] {
+    background: var(--navy) !important;
+    border-right: 3px solid var(--saffron);
+  }
+  [data-testid="stSidebar"] * { color: var(--cream) !important; }
+  [data-testid="stSidebar"] h1,
+  [data-testid="stSidebar"] h2,
+  [data-testid="stSidebar"] h3 { color: var(--saffron) !important; }
+  [data-testid="stSidebar"] .stButton > button {
+    background: var(--saffron) !important;
+    color: var(--navy) !important;
+    font-weight: 700 !important;
+    border: none !important;
+    border-radius: 4px !important;
+    width: 100% !important;
+    font-family: 'JetBrains Mono', monospace !important;
+    letter-spacing: 0.05em;
+  }
+  [data-testid="stSidebar"] .stButton > button:hover {
+    background: #f0972a !important;
+    transform: translateY(-1px);
+    box-shadow: 0 4px 12px rgba(232,133,26,0.4) !important;
+  }
+  /* ── Main Header ── */
+  .taxbot-header {
+    display: flex;
+    align-items: center;
+    gap: 1rem;
+    padding: 1.5rem 0 0.5rem;
+    border-bottom: 2px solid var(--saffron);
+    margin-bottom: 1.5rem;
+  }
+  .taxbot-header h1 {
+    font-family: 'DM Serif Display', serif;
+    font-size: 2.4rem;
+    color: var(--navy);
+    margin: 0;
+    letter-spacing: -0.03em;
+  }
+  .taxbot-header .badge {
+    background: var(--saffron);
+    color: var(--navy);
+    font-family: 'JetBrains Mono', monospace;
+    font-size: 0.65rem;
+    font-weight: 700;
+    padding: 3px 10px;
+    border-radius: 2px;
+    letter-spacing: 0.12em;
+    text-transform: uppercase;
+    align-self: flex-start;
+    margin-top: 0.6rem;
+  }
+  .taxbot-subtitle {
+    color: var(--muted);
+    font-size: 0.95rem;
+    margin-bottom: 1.5rem;
+  }
+  /* ── Chat Messages ── */
+  .chat-user {
+    background: var(--navy);
+    color: var(--cream);
+    border-radius: 12px 12px 2px 12px;
+    padding: 1rem 1.25rem;
+    margin: 0.75rem 0 0.75rem 3rem;
+    font-size: 0.95rem;
+    line-height: 1.6;
+    box-shadow: 0 2px 8px rgba(13,27,42,0.15);
+  }
+  .chat-bot {
+    background: var(--card-bg);
+    border: 1px solid var(--border);
+    border-left: 4px solid var(--teal);
+    border-radius: 2px 12px 12px 12px;
+    padding: 1rem 1.25rem;
+    margin: 0.75rem 3rem 0.75rem 0;
+    font-size: 0.95rem;
+    line-height: 1.7;
+    box-shadow: 0 2px 8px rgba(0,0,0,0.06);
+  }
+  .chat-bot .source-tag {
+    font-family: 'JetBrains Mono', monospace;
+    font-size: 0.7rem;
+    color: var(--teal);
+    background: rgba(26,122,110,0.1);
+    padding: 2px 8px;
+    border-radius: 3px;
+    display: inline-block;
+    margin-top: 0.75rem;
+    margin-right: 0.4rem;
+  }
+  /* ── Notice Summary Card ── */
+  .notice-card {
+    background: #fff8f0;
+    border: 1.5px solid var(--saffron);
+    border-radius: 8px;
+    padding: 1.25rem;
+    margin: 1rem 0;
+  }
+  .notice-card h4 {
+    font-family: 'DM Serif Display', serif;
+    color: var(--navy);
+    margin: 0 0 0.5rem;
+    font-size: 1.1rem;
+  }
+  .notice-card .deadline {
+    background: var(--red);
+    color: white;
+    font-family: 'JetBrains Mono', monospace;
+    font-size: 0.75rem;
+    font-weight: 600;
+    padding: 3px 10px;
+    border-radius: 3px;
+    display: inline-block;
+    margin-top: 0.5rem;
+  }
+  /* ── Status Pills ── */
+  .status-ok   { background:#d4edda; color:#1a5e31; padding:3px 10px; border-radius:12px; font-size:0.8rem; font-weight:600; }
+  .status-warn { background:#fff3cd; color:#856404; padding:3px 10px; border-radius:12px; font-size:0.8rem; font-weight:600; }
+  .status-err  { background:#f8d7da; color:#721c24; padding:3px 10px; border-radius:12px; font-size:0.8rem; font-weight:600; }
+  /* ── Input Area ── */
+  .stTextInput > div > div > input,
+  .stTextArea > div > div > textarea {
+    border: 1.5px solid var(--border) !important;
+    border-radius: 6px !important;
+    font-family: 'DM Sans', sans-serif !important;
+    background: white !important;
+  }
+  .stTextInput > div > div > input:focus,
+  .stTextArea > div > div > textarea:focus {
+    border-color: var(--teal) !important;
+    box-shadow: 0 0 0 3px rgba(26,122,110,0.15) !important;
+  }
+  .stButton > button {
+    background: var(--teal) !important;
+    color: white !important;
+    border: none !important;
+    border-radius: 6px !important;
+    font-weight: 600 !important;
+    padding: 0.5rem 1.5rem !important;
+  }
+  .stButton > button:hover {
+    background: #155f55 !important;
+    transform: translateY(-1px);
+    box-shadow: 0 4px 12px rgba(26,122,110,0.3) !important;
+  }
+  /* ── Tabs ── */
+  .stTabs [data-baseweb="tab-list"] { border-bottom: 2px solid var(--border); }
+  .stTabs [data-baseweb="tab"] {
+    font-family: 'JetBrains Mono', monospace;
+    font-size: 0.8rem;
+    letter-spacing: 0.08em;
+    color: var(--muted) !important;
+  }
+  .stTabs [aria-selected="true"] {
+    color: var(--navy) !important;
+    border-bottom: 2px solid var(--saffron) !important;
+  }
+  /* ── Divider ── */
+  hr { border-color: var(--border) !important; }
+</style>
+""", unsafe_allow_html=True)
+# ─────────────────────────────────────────────
+# SECTION 2: API Client Initialisation
+# ─────────────────────────────────────────────
+# Business Purpose: Securely load API keys from Hugging Face Secrets
+# (or .env locally). Never hard-code keys in source code.
+@st.cache_resource
+def get_llm():
+    """
+    Returns a LangChain-wrapped Claude 3.5 Sonnet instance.
+    Claude handles all the statutory reasoning and answer generation.
+    """
+    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+    if not api_key:
+        st.warning("⚠️ ANTHROPIC_API_KEY not set. Reasoning engine offline.", icon="⚠️")
+        return None
+    return ChatAnthropic(
+        model="claude-sonnet-4-5",   # Claude Sonnet 4.5 — smart + efficient
+        api_key=api_key,
+        temperature=0.1,             # Low temp = more deterministic legal answers
+        max_tokens=1500,
+    )
+@st.cache_resource
+def get_openai_client():
+    """
+    Returns an OpenAI client used exclusively for Vision-based notice parsing
+    (GPT-4o's multimodal capability).
+    """
+    api_key = os.environ.get("OPENAI_API_KEY", "")
+    if not api_key:
+        return None
+    return OpenAI(api_key=api_key)
+@st.cache_resource
+def get_embeddings():
+    """
+    Returns an OpenAI Embeddings model for converting text chunks
+    into vectors stored in ChromaDB.
+    """
+    api_key = os.environ.get("OPENAI_API_KEY", "")
+    if not api_key:
+        return None
+    return OpenAIEmbeddings(
+        model="text-embedding-3-small",  # Cost-effective, high quality
+        api_key=api_key
+    )
+# ─────────────────────────────────────────────
+# SECTION 3: Session State Initialisation
+# ─────────────────────────────────────────────
+# Think of session_state as the app's short-term memory per user session.
+def init_session_state():
+    defaults = {
+        "chat_history":    [],   # List of {"role": "user"/"bot", "content": "..."}
+        "vectorstore":     None, # ChromaDB instance (built when user uploads PDFs)
+        "kb_doc_count":    0,    # Number of chunks indexed
+        "kb_file_names":   [],   # Names of uploaded files for display
+        "notice_result":   None, # Last parsed notice result
+    }
+    for key, val in defaults.items():
+        if key not in st.session_state:
+            st.session_state[key] = val
+init_session_state()
+# ─────────────────────────────────────────────
+# SECTION 3b: AUTO-PRELOAD on Startup
+# ─────────────────────────────────────────────
+# Business Purpose: When deployed on Hugging Face Spaces, this block
+# runs ONCE per session and loads all PDFs from the 'docs/' folder
+# automatically. Judges see a fully-ready Knowledge Base on first load.
+# The @st.cache_resource on embeddings/LLM ensures this is efficient.
+@st.cache_resource(show_spinner=False)
+def autoload_knowledge_base():
+    """
+    Cached function — runs only ONCE per app instance (not per user session).
+    Loads all PDFs from the docs/ folder into ChromaDB.
+    Returns (vectorstore, chunk_count, file_names) or (None, 0, []) if no docs found.
+    """
+    docs_folder = "docs"  # Relative path — matches your HF Space folder structure
+    embeddings  = get_embeddings()
+    if embeddings is None:
+        return None, 0, []
+    if not os.path.exists(docs_folder):
+        return None, 0, []
+    pdf_files = [f for f in os.listdir(docs_folder) if f.lower().endswith(".pdf")]
+    if not pdf_files:
+        return None, 0, []
+    all_pages = []
+    for pdf_name in pdf_files:
+        try:
+            loader = PyPDFLoader(os.path.join(docs_folder, pdf_name))
+            pages  = loader.load()
+            for page in pages:
+                page.metadata["source"] = pdf_name
+            all_pages.extend(pages)
+        except Exception:
+            continue
+    if not all_pages:
+        return None, 0, []
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=800, chunk_overlap=150,
+        separators=["\n\n", "\n", ".", " "]
+    )
+    chunks = splitter.split_documents(all_pages)
+    vectorstore = Chroma.from_documents(
+        documents=chunks,
+        embedding=embeddings,
+        collection_name="taxbot_preloaded_kb",
+    )
+    return vectorstore, len(chunks), pdf_files
+# Run the auto-preload and populate session state (only if KB not already set)
+if st.session_state["vectorstore"] is None:
+    # autoload_knowledge_base() returns (None, 0, []) when there is nothing to load
+    _vs, _count, _files = autoload_knowledge_base()
+    if _vs is not None:
+        # Copy the cached result into this session's state
+        st.session_state["vectorstore"]   = _vs
+        st.session_state["kb_doc_count"]  = _count
+        st.session_state["kb_file_names"] = _files
+        st.session_state["kb_preloaded"]  = True   # Flag to show "Pre-loaded" badge
+# ─────────────────────────────────────────────
+# SECTION 4: ENGINE 1 — Knowledge Base Builder
+# ─────────────────────────────────────────────
+# Business Logic:
+#   User uploads PDF circulars/acts → we split them into manageable chunks →
+#   embed each chunk as a vector → store in ChromaDB.
+#   Later, when a user asks a question, we retrieve the top-K most relevant
+#   chunks as "context" for Claude (this is RAG).
+def build_knowledge_base(uploaded_files: list) -> tuple[Chroma | None, int]:
+    """
+    Ingests a list of uploaded PDF files into a ChromaDB vector store.
+    Args:
+        uploaded_files: List of Streamlit UploadedFile objects.
+    Returns:
+        (vectorstore, chunk_count) — the ChromaDB instance and total chunks indexed.
+    """
+    embeddings = get_embeddings()
+    if embeddings is None:
+        st.error("OpenAI API key required for building the Knowledge Base.")
+        return None, 0
+    all_chunks = []
+    # ── Step 1: Load and parse each PDF ��─────────────────────────────
+    with st.spinner("📄 Reading and parsing PDFs..."):
+        for uploaded_file in uploaded_files:
+            # Save to temp file (PyPDFLoader requires a file path)
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+                tmp.write(uploaded_file.getvalue())
+                tmp_path = tmp.name
+            loader = PyPDFLoader(tmp_path)
+            raw_pages = loader.load()
+            # Add source metadata to each page for citation tracking
+            for page in raw_pages:
+                page.metadata["source"] = uploaded_file.name
+            all_chunks.extend(raw_pages)
+            os.unlink(tmp_path)  # Clean up temp file
+    # ── Step 2: Chunk the text ────────────────────────────────────────
+    # Why chunk? LLMs have context limits. Smaller chunks = more precise retrieval.
+    # chunk_size=800 chars ≈ ~200 tokens. Overlap=150 prevents context loss at edges.
+    with st.spinner("✂️ Chunking documents into retrievable segments..."):
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=800,
+            chunk_overlap=150,
+            separators=["\n\n", "\n", ".", " "],  # Prefer splitting at paragraphs
+        )
+        chunks = splitter.split_documents(all_chunks)
+    # ── Step 3: Embed and store in ChromaDB ──────────────────────────
+    with st.spinner(f"🧠 Embedding {len(chunks)} chunks into vector database..."):
+        vectorstore = Chroma.from_documents(
+            documents=chunks,
+            embedding=embeddings,
+            collection_name="taxbot_knowledge_base",
+        )
+    return vectorstore, len(chunks)
+def build_knowledge_base_from_folder(folder_path: str = "docs") -> tuple:
+    """
+    AUTO-PRELOAD FUNCTION: Ingests all PDFs from a local folder on startup.
+    This runs automatically when the app launches on Hugging Face Spaces,
+    so the Knowledge Base is ready without any manual uploads.
+    Business Purpose: Judges open the app → KB is already loaded with all
+    CBDT Circulars and Finance Act → they can ask questions immediately.
+    Args:
+        folder_path: Path to the folder containing pre-loaded PDFs.
+                     On Hugging Face this is the 'docs/' folder in your Space repo.
+    Returns:
+        (vectorstore, chunk_count, file_names) tuple
+    """
+    embeddings = get_embeddings()
+    if embeddings is None:
+        return None, 0, []
+    # Check if the docs folder exists
+    if not os.path.exists(folder_path):
+        return None, 0, []
+    # Find all PDFs in the folder
+    pdf_files = [
+        f for f in os.listdir(folder_path)
+        if f.lower().endswith(".pdf")
+    ]
+    if not pdf_files:
+        return None, 0, []
+    all_pages = []
+    # ── Load each PDF from disk (no temp files needed — we have direct paths) ──
+    for pdf_name in pdf_files:
+        pdf_path = os.path.join(folder_path, pdf_name)
+        try:
+            loader = PyPDFLoader(pdf_path)
+            pages  = loader.load()
+            # Tag each page with its source filename for citations
+            for page in pages:
+                page.metadata["source"] = pdf_name
+            all_pages.extend(pages)
+        except Exception as e:
+            st.warning(f"Could not load {pdf_name}: {e}")
+            continue
+    if not all_pages:
+        return None, 0, []
+    # ── Chunk ────────────────────────────────────────────────────────
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=800,
+        chunk_overlap=150,
+        separators=["\n\n", "\n", ".", " "],
+    )
+    chunks = splitter.split_documents(all_pages)
+    # ── Embed into ChromaDB ──────────────────────────────────────────
+    vectorstore = Chroma.from_documents(
+        documents=chunks,
+        embedding=embeddings,
+        collection_name="taxbot_knowledge_base",
+    )
+    return vectorstore, len(chunks), pdf_files
+# ─────────────────────────────────────────────
+# SECTION 5: ENGINE 2 — Generative Reasoning Chain
+# ─────────────────────────────────────────────
+# Business Logic:
+#   When a user asks a tax question, we:
+#   (a) Retrieve top-4 most relevant document chunks from ChromaDB.
+#   (b) Feed those chunks + the user's question into Claude via a
+#       carefully crafted prompt that enforces statutory accuracy.
+# ── The System Prompt — this is the "personality" of TaxBot AI ───────
+TAXBOT_PROMPT_TEMPLATE = """
+You are TaxBot AI, an expert Indian tax compliance assistant for MSMEs and individual taxpayers.
+Your answers must be:
+1. GROUNDED: Only use information from the provided context (retrieved statutory excerpts).
+2. PLAIN-LANGUAGE: Explain complex legal provisions in simple business terms.
+3. STRUCTURED: Use bullet points and section references where helpful.
+4. HONEST: If the context does not contain enough information, say so clearly.
+   Never fabricate section numbers or circular references.
+5. ACTIONABLE: End with a clear "What you should do" recommendation.
+RETRIEVED STATUTORY CONTEXT:
+──────────────────────────────
+{context}
+──────────────────────────────
+USER QUESTION: {question}
+TAXBOT AI RESPONSE:
+"""
+TAXBOT_PROMPT = PromptTemplate(
+    input_variables=["context", "question"],
+    template=TAXBOT_PROMPT_TEMPLATE
+)
+def get_tax_answer(question: str, vectorstore: Chroma) -> dict:
+    """
+    Runs the RAG pipeline: retrieve relevant law chunks, then ask Claude.
+    Args:
+        question:    The user's tax query string.
+        vectorstore: The populated ChromaDB instance.
+    Returns:
+        dict with keys: "answer" (str), "sources" (list of source filenames)
+    """
+    llm = get_llm()
+    if llm is None:
+        return {"answer": "⚠️ LLM not configured. Please set ANTHROPIC_API_KEY.", "sources": []}
+    # Build a RetrievalQA chain with our custom prompt
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",              # "stuff" = all chunks stuffed into one prompt
+        retriever=vectorstore.as_retriever(
+            search_type="similarity",
+            search_kwargs={"k": 4}       # Retrieve top 4 most relevant chunks
+        ),
+        chain_type_kwargs={"prompt": TAXBOT_PROMPT},
+        return_source_documents=True,    # We want to show citations
+    )
+    result = qa_chain.invoke({"query": question})
+    # Extract unique source file names for citation display
+    sources = list({
+        doc.metadata.get("source", "Unknown")
+        for doc in result.get("source_documents", [])
+    })
+    return {
+        "answer": result.get("result", "No answer generated."),
+        "sources": sources
+    }
+# ─────────────────────────────────────────────
+# SECTION 6: ENGINE 3 — Notice Interpreter (Vision AI)
+# ─────────────────────────────────────────────
+# Business Logic:
+#   User uploads a scanned tax notice image or PDF.
+#   We encode it as base64 and send to GPT-4o Vision.
+#   The model extracts key details: notice type, demands, deadlines, required action.
+NOTICE_SYSTEM_PROMPT = """
+You are an expert Indian tax notice analyst. When given a tax notice image,
+extract and summarize the following in a structured JSON format:
+{
+  "notice_type": "e.g., Section 143(1) Intimation / GST ASMT-10 / etc.",
+  "assessment_year": "AY 20XX-XX",
+  "taxpayer_pan": "PAN or GSTIN if visible",
+  "key_discrepancy": "Plain-language description of what the department found",
+  "amount_involved": "₹ amount of demand or refund",
+  "deadline": "Date by which taxpayer must respond or pay",
+  "required_action": "Specific steps the taxpayer must take",
+  "severity": "LOW / MEDIUM / HIGH",
+  "severity_reason": "Brief reason for severity classification"
+}
+If any field is not visible in the notice, set it to "Not specified".
+"""
+def parse_tax_notice(image_bytes: bytes, file_type: str = "image/jpeg") -> dict | None:
+    """
+    Sends a notice image to GPT-4o Vision and returns a structured summary.
+    Args:
+        image_bytes: Raw bytes of the uploaded notice image.
+        file_type:   MIME type of the image (image/jpeg, image/png, etc.)
+    Returns:
+        Parsed dict with notice details, or None on failure.
+    """
+    import json
+    client = get_openai_client()
+    if client is None:
+        return {"error": "OpenAI API key not configured. Vision engine offline."}
+    # Encode image to base64 for the Vision API
+    b64_image = base64.b64encode(image_bytes).decode("utf-8")
+    with st.spinner("🔍 Analysing notice with Vision AI..."):
+        response = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {
+                    "role": "system",
+                    "content": NOTICE_SYSTEM_PROMPT
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:{file_type};base64,{b64_image}",
+                                "detail": "high"    # High detail for text-heavy docs
+                            }
+                        },
+                        {
+                            "type": "text",
+                            "text": "Please analyse this Indian tax notice and extract the structured information as specified."
+                        }
+                    ]
+                }
+            ],
+            max_tokens=1000,
+            temperature=0.0,  # Zero temp for factual extraction
+        )
+    raw_text = response.choices[0].message.content.strip()
+    # Strip markdown code fences if present (GPT sometimes wraps JSON)
+    raw_text = raw_text.replace("```json", "").replace("```", "").strip()
+    try:
+        return json.loads(raw_text)
+    except json.JSONDecodeError:
+        # If JSON parsing fails, return the raw text in a wrapper
+        return {"notice_type": "Parsed (raw)", "raw_response": raw_text}
+# ─────────────────────────────────────────────
+# SECTION 7: UI LAYOUT — Sidebar
+# ─────────────────────────────────────────────
+with st.sidebar:
+    st.markdown("## ⚖️ TaxBot AI")
+    st.markdown("*Your Indian Tax Co-pilot*")
+    st.markdown("---")
+    # ── Knowledge Base Builder ────────────────────────────────────────
+    st.markdown("### 📚 Knowledge Base")
+    st.caption("Upload CBDT Circulars, Finance Acts, GST notifications, or any tax PDF.")
+    uploaded_pdfs = st.file_uploader(
+        "Upload Tax Documents (PDF)",
+        type=["pdf"],
+        accept_multiple_files=True,
+        help="These will be ingested into ChromaDB to power the RAG engine."
+    )
+    if st.button("⚡ Build Knowledge Base", key="build_kb"):
+        if not uploaded_pdfs:
+            st.warning("Please upload at least one PDF first.")
+        else:
+            vectorstore, chunk_count = build_knowledge_base(uploaded_pdfs)
+            if vectorstore:
+                st.session_state["vectorstore"]   = vectorstore
+                st.session_state["kb_doc_count"]  = chunk_count
+                st.session_state["kb_file_names"] = [f.name for f in uploaded_pdfs]
+                st.success(f"✅ Knowledge Base ready! {chunk_count} chunks indexed.")
+    # Show current KB status
+    if st.session_state["vectorstore"]:
+        is_preloaded = st.session_state.get("kb_preloaded", False)
+        label = "● Pre-loaded KB Active" if is_preloaded else "● KB Active"
+        st.markdown(
+            f'<span class="status-ok">{label} — {st.session_state["kb_doc_count"]} chunks</span>',
+            unsafe_allow_html=True
+        )
+        if is_preloaded:
+            st.caption("✅ Core tax documents loaded automatically.")
+        st.caption("Sources:")
+        for fname in st.session_state["kb_file_names"]:
+            st.caption(f"  • {fname}")
+    else:
+        st.markdown('<span class="status-warn">● KB Not Built</span>', unsafe_allow_html=True)
+        st.caption("No docs/ folder found. Upload PDFs above to build manually.")
+    st.markdown("---")
+    # ── Settings ─────────────────────────────────────────────────────
+    st.markdown("### ⚙️ Settings")
+    show_sources = st.toggle("Show source citations", value=True)
+    retrieval_k  = st.slider("Chunks to retrieve (k)", min_value=2, max_value=8, value=4,
+                              help="More chunks = broader context. May increase latency.")
+    st.markdown("---")
+    st.caption("Built for SIH1285 · Hackathon Demo")
+    st.caption("Claude Sonnet 4.5 + GPT-4o Vision")
+# ─────────────────────────────────────────────
+# SECTION 8: UI LAYOUT — Main Panel
+# ─────────────────────────────────────────────
+# ── Header ────────────────────────────────────
+st.markdown("""
+<div class="taxbot-header">
+  <h1>TaxBot AI</h1>
+  <span class="badge">Beta · SIH1285</span>
+</div>
+<p class="taxbot-subtitle">
+  Multimodal Financial Co-pilot · Statutory-accurate guidance for Indian MSMEs & Taxpayers
+</p>
+""", unsafe_allow_html=True)
+# ── Three Tabs: Chat | Notice Interpreter | Audit Risk ───────────────
+tab_chat, tab_notice, tab_audit = st.tabs([
+    "💬  Tax Advisory Chat",
+    "📋  Notice Interpreter",
+    "📊  Audit Risk Scanner"
+])
+# ══════════════════════════════════════════
+# TAB 1: TAX ADVISORY CHAT
+# ════════════════════════════��═════════════
+with tab_chat:
+    # ── Display chat history ──────────────────────────────────────────
+    chat_container = st.container()
+    with chat_container:
+        if not st.session_state["chat_history"]:
+            st.markdown("""
+            <div style="text-align:center; padding: 3rem 1rem; color: #9ca3af;">
+              <div style="font-size: 2.5rem; margin-bottom: 1rem;">⚖️</div>
+              <div style="font-family: 'DM Serif Display', serif; font-size: 1.2rem;
+                          color: #374151; margin-bottom: 0.5rem;">
+                Ask me anything about Indian Tax Law
+              </div>
+              <div style="font-size: 0.9rem;">
+                Upload PDFs to the Knowledge Base, then ask queries like:<br>
+                <em>"What are the due dates under Circular 15/2025?"</em><br>
+                <em>"Explain the 87A rebate changes under Finance Act 2025."</em><br>
+                <em>"What is the penalty for late TDS payment?"</em>
+              </div>
+            </div>
+            """, unsafe_allow_html=True)
+        for msg in st.session_state["chat_history"]:
+            if msg["role"] == "user":
+                st.markdown(
+                    f'<div class="chat-user">🧑 {msg["content"]}</div>',
+                    unsafe_allow_html=True
+                )
+            else:
+                sources_html = ""
+                if show_sources and msg.get("sources"):
+                    for src in msg["sources"]:
+                        sources_html += f'<span class="source-tag">📎 {src}</span>'
+                st.markdown(
+                    f'<div class="chat-bot">⚖️ {msg["content"]}{sources_html}</div>',
+                    unsafe_allow_html=True
+                )
+    # ── Input row ────────────────────────────────────────────────────
+    st.markdown("<br>", unsafe_allow_html=True)
+    col_input, col_btn = st.columns([5, 1])
+    with col_input:
+        user_query = st.text_input(
+            "Your tax question",
+            placeholder="e.g. What is the deadline for filing ITR for AY 2025-26?",
+            label_visibility="collapsed",
+            key="chat_input"
+        )
+    with col_btn:
+        send_clicked = st.button("Send →", key="send_btn")
+    # ── Handle query submission ───────────────────────────────────────
+    if send_clicked and user_query.strip():
+        # Add user message to history
+        st.session_state["chat_history"].append({
+            "role": "user",
+            "content": user_query
+        })
+        # Route to appropriate engine
+        if st.session_state["vectorstore"] is None:
+            # No KB built — use Claude without RAG (fallback mode)
+            llm = get_llm()
+            if llm:
+                with st.spinner("🤔 Thinking (no Knowledge Base — using Claude's base knowledge)..."):
+                    fallback_prompt = f"""
+You are TaxBot AI, an expert Indian tax assistant. Answer the following question
+based on your knowledge of Indian Income Tax Act 1961, GST laws, and CBDT circulars.
+Be specific, structured, and cite relevant sections. End with actionable advice.
+Question: {user_query}
+"""
+                    response = llm.invoke(fallback_prompt)
+                    answer = response.content
+            else:
+                answer = "⚠️ Both the Knowledge Base and LLM are unavailable. Please check your API keys."
+            st.session_state["chat_history"].append({
+                "role": "bot",
+                "content": answer,
+                "sources": ["Claude base knowledge (no RAG)"]
+            })
+        else:
+            # RAG mode — retrieve from ChromaDB, then reason with Claude
+            with st.spinner("🔎 Searching knowledge base + reasoning..."):
+                result = get_tax_answer(user_query, st.session_state["vectorstore"])
+            st.session_state["chat_history"].append({
+                "role":    "bot",
+                "content": result["answer"],
+                "sources": result["sources"]
+            })
+        st.rerun()
+    # ── Clear chat button ─────────────────────────────────────────────
+    if st.session_state["chat_history"]:
+        if st.button("🗑 Clear Chat", key="clear_chat"):
+            st.session_state["chat_history"] = []
+            st.rerun()
+# ══════════════════════════════════════════
+# TAB 2: NOTICE INTERPRETER
+# ══════════════════════════════════════════
+with tab_notice:
+    st.markdown("### 📋 Tax Notice Interpreter")
+    st.markdown(
+        "Upload a scanned or digital tax notice (image or PDF screenshot). "
+        "The Vision AI engine will extract key information and explain what action you need to take."
+    )
+    uploaded_notice = st.file_uploader(
+        "Upload Notice (Image: JPG/PNG)",
+        type=["jpg", "jpeg", "png"],
+        help="For PDF notices, take a screenshot of the main page and upload as PNG/JPG.",
+        key="notice_uploader"
+    )
+    if uploaded_notice:
+        col_preview, col_result = st.columns([1, 1])
+        with col_preview:
+            st.markdown("**Preview:**")
+            st.image(uploaded_notice, use_container_width=True)
+        with col_result:
+            if st.button("🔍 Analyse Notice", key="analyse_notice"):
+                image_bytes = uploaded_notice.getvalue()
+                file_type = f"image/{uploaded_notice.type.split('/')[-1]}"
+                result = parse_tax_notice(image_bytes, file_type)
+                st.session_state["notice_result"] = result
+        # Display results if available
+        if st.session_state.get("notice_result"):
+            r = st.session_state["notice_result"]
+            st.markdown("---")
+            if "error" in r:
+                st.error(r["error"])
+            elif "raw_response" in r:
+                st.info("Raw extraction (structured parsing unavailable):")
+                st.write(r["raw_response"])
+            else:
+                # Severity color mapping
+                severity_class = {
+                    "HIGH":   "status-err",
+                    "MEDIUM": "status-warn",
+                    "LOW":    "status-ok"
+                }.get(r.get("severity", "MEDIUM"), "status-warn")
+                st.markdown(f"""
+                <div class="notice-card">
+                  <h4>{r.get('notice_type', 'Tax Notice')}</h4>
+                  <p><b>Assessment Year:</b> {r.get('assessment_year', 'N/A')}</p>
+                  <p><b>PAN / GSTIN:</b> {r.get('taxpayer_pan', 'N/A')}</p>
+                  <hr style="margin: 0.5rem 0;">
+                  <p><b>🔍 Discrepancy Found:</b><br>{r.get('key_discrepancy', 'N/A')}</p>
+                  <p><b>💰 Amount Involved:</b> {r.get('amount_involved', 'N/A')}</p>
+                  <p><b>✅ What You Must Do:</b><br>{r.get('required_action', 'N/A')}</p>
+                  <span class="{severity_class}">
+                    {r.get('severity', 'MEDIUM')} PRIORITY
+                  </span>
+                  &nbsp;
+                  <span class="deadline">DEADLINE: {r.get('deadline', 'Check notice')}</span>
+                </div>
+                """, unsafe_allow_html=True)
+                # Offer to explain further via chat
+                st.info(
+                    "💡 Switch to the **Tax Advisory Chat** tab and ask "
+                    f"\"Explain {r.get('notice_type', 'this notice type')} and my options\" "
+                    "for detailed statutory guidance."
+                )
+# ══════════════════════════════════════════
+# TAB 3: AUDIT RISK SCANNER (Placeholder)
+# ══════════════════════════════════════════
+with tab_audit:
+    st.markdown("### 📊 Audit Risk Scanner")
+    st.markdown(
+        "Enter your key financial figures. The ML model (Random Forest) "
+        "will estimate your audit risk score based on anomaly patterns."
+    )
+    st.info(
+        "🔧 **Engine Status:** ML model placeholder. "
+        "In the full build, a Scikit-Learn Random Forest model trained on "
+        "historical audit trigger patterns will power this scanner.",
+        icon="ℹ️"
+    )
+    col1, col2 = st.columns(2)
+    with col1:
+        turnover      = st.number_input("Annual Turnover (₹ Lakhs)",     min_value=0.0, value=50.0, step=1.0)
+        gross_profit  = st.number_input("Gross Profit (₹ Lakhs)",        min_value=0.0, value=8.0,  step=0.5)
+        tds_claimed   = st.number_input("TDS Claimed (₹ Lakhs)",         min_value=0.0, value=2.0,  step=0.1)
+    with col2:
+        tax_paid      = st.number_input("Total Tax Paid (₹ Lakhs)",      min_value=0.0, value=3.5,  step=0.1)
+        deductions_80c = st.number_input("80C/80D Deductions (₹ Lakhs)", min_value=0.0, value=1.5,  step=0.1)
+        cash_deposits = st.number_input("Cash Deposits in FY (₹ Lakhs)", min_value=0.0, value=5.0,  step=0.5)
+    if st.button("⚡ Run Audit Risk Scan", key="audit_scan"):
+        # ── Placeholder Rule-Based Score (replace with sklearn RF model) ──
+        risk_score = 0
+        flags = []
+        gp_ratio = (gross_profit / turnover * 100) if turnover > 0 else 0
+        if gp_ratio < 8:
+            risk_score += 25
+            flags.append(f"Low gross profit ratio ({gp_ratio:.1f}%) — industry avg ~10-15%")
+        if deductions_80c > 1.5:
+            risk_score += 20
+            flags.append(f"80C deductions (₹{deductions_80c}L) exceed ₹1.5L limit")
+        if cash_deposits > turnover * 0.3:
+            risk_score += 30
+            flags.append(f"High cash deposit ratio ({cash_deposits/turnover*100:.0f}% of turnover)")
+        if tds_claimed > tax_paid * 0.8:
+            risk_score += 15
+            flags.append("High TDS-to-tax-paid ratio — possible TDS mismatch")
+        risk_score = min(risk_score, 100)
+        # Display result
+        if risk_score >= 60:
+            color, label = "#c0392b", "HIGH RISK"
+        elif risk_score >= 30:
+            color, label = "#e8851a", "MEDIUM RISK"
+        else:
+            color, label = "#1a7a6e", "LOW RISK"
+        st.markdown(f"""
+        <div style="background:{color}15; border: 2px solid {color}; border-radius:8px;
+                    padding:1.5rem; margin:1rem 0; text-align:center;">
+          <div style="font-family:'DM Serif Display',serif; font-size:2rem;
+                      color:{color}; font-weight:bold;">{risk_score} / 100</div>
+          <div style="color:{color}; font-weight:700; font-family:'JetBrains Mono',monospace;
+                      font-size:0.9rem; letter-spacing:0.1em;">{label}</div>
+        </div>
+        """, unsafe_allow_html=True)
+        if flags:
+            st.markdown("**⚠️ Risk Flags Detected:**")
+            for flag in flags:
+                st.markdown(f"- {flag}")
+        else:
+            st.success("✅ No significant risk flags detected in your financial profile.")
+        st.caption(
+            "Note: This score is based on heuristic rules for the demo. "
+            "The production version uses a Random Forest model trained on audit patterns."
+        )

circular-11-2025.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45a854508983ed56b3cffddb0248775f4c486c1b520feee67a7f8132c56c2fc1
+size 288282

circular-15-2025.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f95677aefb3701ec5654e679cda5a06d096eab9167461a7546a19e5509eaf3f9
+size 388985

circular-no-14-2025.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6427527295bc3b07dc75d4d64c5da89e45277d31a94c2c5faa0ab1c919e4fd4
+size 572468

income-tax-act-1961-as-amended-by-finance-act-2025.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:626134660ea5ca236d967eb5ed7d7b989d48f3240f0c4dfaa57dc09846288ff5
+size 5371727

requirements.txt ADDED Viewed

	@@ -0,0 +1,29 @@

+# TaxBot AI — Hugging Face Space Requirements
+# Python 3.10+ recommended
+# ── UI Framework ──────────────────────────
+streamlit>=1.35.0
+# ── LangChain Orchestration ───────────────
+langchain>=0.2.0
+langchain-community>=0.2.0
+langchain-anthropic>=0.1.15      # Claude via LangChain
+langchain-openai>=0.1.8          # OpenAI embeddings + GPT-4o Vision
+# ── LLM Providers ────────────────────────
+anthropic>=0.28.0                # Claude API (direct client)
+openai>=1.35.0                   # GPT-4o Vision + Embeddings
+# ── Vector Database ───────────────────────
+chromadb>=0.5.0
+# ── Document Processing ───────────────────
+pypdf>=4.0.0                     # PDF loading for LangChain
+# ── ML / Audit Risk Engine ────────────────
+scikit-learn>=1.4.0              # Random Forest audit risk model
+pandas>=2.0.0
+numpy>=1.26.0
+# ── Utilities ─────────────────────────────
+python-dotenv>=1.0.0             # .env support for local dev

waiver-of-interest-circular-no-13-2025.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1aa1afd3a54010aacffd7ed09ca1bd8b33906db98da8e4ff20cce2e20ad485ad
+size 410954