NEXAS committed on
Commit
49cf970
·
verified ·
1 Parent(s): f97ec1e

Upload 16 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image
FROM python:3.10-slim

# Set environment variables
# PYTHONUNBUFFERED: stream stdout/stderr without buffering so container logs appear live
# PYTHONDONTWRITEBYTECODE: skip .pyc generation to keep the image slim
# PATH: include the non-root user's local bin so user-level pip installs resolve
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Create a non-root user (UID 1000 is the Hugging Face Spaces convention)
RUN useradd -m -u 1000 user
WORKDIR $HOME/app

# Install system dependencies
# build-essential: compile any native wheels; libgomp1: OpenMP runtime required by faiss/fastembed
RUN apt-get update && apt-get install -y \
    build-essential \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Install requirements (runs as root, into system site-packages)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code and set ownership
COPY --chown=user:user . .

# Switch to the non-root user
USER user

# Hugging Face Spaces expect port 7860
EXPOSE 7860

# Run Streamlit with the correct port and address
CMD ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
agent/__init__.py ADDED
File without changes
agent/agent.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import hashlib
3
+ import json
4
+ import faiss
5
+ import re
6
+ import time
7
+ from typing import List, Dict, Any
8
+
9
+ from llama_index.core import (
10
+ VectorStoreIndex,
11
+ SummaryIndex,
12
+ StorageContext,
13
+ Document,
14
+ Settings,
15
+ QueryBundle,
16
+ load_index_from_storage
17
+ )
18
+ from llama_index.node_parser.docling import DoclingNodeParser
19
+ from llama_index.core.retrievers import RecursiveRetriever
20
+ from llama_index.core.query_engine import RetrieverQueryEngine
21
+ from llama_index.core.postprocessor import LLMRerank
22
+ from llama_index.llms.groq import Groq
23
+ from llama_index.embeddings.fastembed import FastEmbedEmbedding
24
+ from llama_index.vector_stores.faiss import FaissVectorStore
25
+ from llama_index.retrievers.bm25 import BM25Retriever
26
+ import shutil
27
+
28
+
29
+ # NEW: Import the refactored PDFProcessor
30
+ from processor.pdf_processor import PDFProcessor
31
+
32
class AgentRateLimitError(Exception):
    """Raised when the backing API reports a rate limit.

    Attributes:
        wait_time: Seconds the caller should wait before retrying, parsed
            from the provider's error message.
    """

    def __init__(self, wait_time: float, message: str):
        super().__init__(message)
        self.wait_time = wait_time
37
+
38
+ class LlamaPDFAgent:
39
+
40
    def __init__(self, api_key: str = None, model: str = None):
        """Configure the global LLM/embedding settings, PDF processor, and cache.

        Args:
            api_key: Groq API key; falls back to the GROQ_API_KEY env var.
            model: Groq model id; falls back to GROQ_MODEL env var or a default.
        """
        # 1. Initialize Settings with Groq and FastEmbed
        # NOTE(review): these are process-global llama_index settings — all
        # agent instances share the last-configured LLM/embedder.
        self.api_key = api_key or os.getenv("GROQ_API_KEY")
        self.model = model or os.getenv("GROQ_MODEL", "meta-llama/llama-4-scout-17b-16e-instruct")

        Settings.llm = Groq(
            model=self.model,
            api_key=self.api_key,
            streaming=True  # Global streaming support
        )
        Settings.embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")

        # 2. Use the specialized PDFProcessor
        self.pdf_processor = PDFProcessor()

        # Index/engine state; populated by ingest_pdf()
        self.vector_index = None
        self.summary_index = None
        self.recursive_query_engine = None
        self.is_loaded = False
        # Per-document persistence cache (FAISS index + docstore) lives here
        self.cache_dir = "./.llama_cache"
        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)
        self.tables = []  # Store extracted DataFrames
        # Registry maps document content hash -> original filename
        self.registry_path = os.path.join(self.cache_dir, "registry.json")
        self._init_registry()
67
+
68
+
69
+
70
    def ingest_pdf(self, pdf_file):
        """
        Ingests a PDF using Persistence: Loads from disk if already indexed.

        Args:
            pdf_file: Uploaded file object (must expose .name; content is
                hashed by the PDFProcessor to key the on-disk cache).

        Returns:
            Status string saying whether the document was loaded from the
            persisted library or freshly indexed.
        """
        file_hash = self.pdf_processor.get_pdf_hash(pdf_file)
        self.current_hash = file_hash
        doc_cache_path = os.path.join(self.cache_dir, file_hash)

        # 1. Check if already indexed (marker file written by persist() below)
        if os.path.exists(os.path.join(doc_cache_path, "default_vector_store.json")):
            storage_context = StorageContext.from_defaults(
                persist_dir=doc_cache_path,
                vector_store=FaissVectorStore.from_persist_dir(doc_cache_path)
            )
            self.vector_index = load_index_from_storage(storage_context)

            # Re-load metadata (Docling) — tables and the summary index are
            # not persisted, so the PDF is re-parsed even on a cache hit
            result = self.pdf_processor.load_docling_documents(pdf_file)
            documents = result["documents"]
            self.tables = result["tables"]
            self.summary_index = SummaryIndex.from_documents(documents)

            # Rebuild Retriever/Engine from the persisted docstore nodes
            nodes = list(self.vector_index.docstore.docs.values())
            self.bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=5)
            vector_retriever = self.vector_index.as_retriever(similarity_top_k=5)
            self.recursive_retriever = RecursiveRetriever(
                "vector",
                retriever_dict={"vector": vector_retriever},
                node_dict={node.node_id: node for node in nodes}
            )
            self.recursive_query_engine = RetrieverQueryEngine.from_args(
                self.recursive_retriever,
                node_postprocessors=[LLMRerank(top_n=3)],
                streaming=True
            )

            self.is_loaded = True
            self._save_to_registry(file_hash, pdf_file.name)
            return f"Loaded '{pdf_file.name}' from library storage."

        # 2. Fresh Ingest (Load and parse)

        # 1. Load Documents with rich metadata via Docling JSON
        result = self.pdf_processor.load_docling_documents(pdf_file)
        documents = result["documents"]
        self.tables = result["tables"]

        # 2. Advanced Node Parsing (Captures page numbers and layout)
        node_parser = DoclingNodeParser()
        nodes = node_parser.get_nodes_from_documents(documents)

        # 3. Vector Index with FAISS
        d = 384  # BGE-small-en-v1.5 embedding dimension — must match Settings.embed_model
        faiss_index = faiss.IndexFlatL2(d)
        vector_store = FaissVectorStore(faiss_index=faiss_index)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        storage_context.docstore.add_documents(nodes)

        self.vector_index = VectorStoreIndex(
            nodes,
            storage_context=storage_context
        )

        # Persist to disk so the next upload of this file is a cache hit
        self.vector_index.storage_context.persist(persist_dir=doc_cache_path)

        # 4. BM25 Retriever for Hybrid Search
        self.bm25_retriever = BM25Retriever.from_defaults(
            nodes=nodes,
            similarity_top_k=5
        )

        # 5. Recursive Retriever for Context Depth
        vector_retriever = self.vector_index.as_retriever(similarity_top_k=5)
        self.recursive_retriever = RecursiveRetriever(
            "vector",
            retriever_dict={"vector": vector_retriever},
            node_dict={node.node_id: node for node in list(nodes)},
            verbose=True
        )

        # 6. Summary Index for global overview
        self.summary_index = SummaryIndex.from_documents(documents)

        # Setup the main recursive query engine
        self.recursive_query_engine = RetrieverQueryEngine.from_args(
            self.recursive_retriever,
            node_postprocessors=[LLMRerank(top_n=3)],
            streaming=True  # Enable at engine level
        )

        self.is_loaded = True
        self._save_to_registry(file_hash, pdf_file.name)
        return f"Successfully indexed '{pdf_file.name}' and saved to library."
166
+
167
+
168
    def answer_question(self, question: str) -> Dict[str, Any]:
        """
        Returns answer and source citations including page numbers.

        Args:
            question: Natural-language query about the loaded document.

        Returns:
            Dict with 'answer_gen' (a streaming token generator) and
            'sources' (list of {'text', 'page'} snippets) — or, when no
            document is loaded, {'answer': str, 'sources': []}.

        Raises:
            AgentRateLimitError: When the underlying API error message
                contains a parseable "Please try again in X.XXXs" delay.
        """
        if not self.is_loaded: return {"answer": "No document loaded.", "sources": []}

        try:
            response = self.recursive_query_engine.query(question)
        except Exception as e:
            # Check for RateLimit (429) message: "Please try again in X.XXXs"
            # NOTE(review): the pattern requires a fractional part — a message
            # like "try again in 5s" would not match; confirm provider format.
            error_str = str(e)
            match = re.search(r"Please try again in (\d+\.\d+)s", error_str)
            if match:
                wait_time = float(match.group(1))
                raise AgentRateLimitError(wait_time, error_str)
            raise e

        sources = []
        for node in response.source_nodes:
            # metadata contains 'doc_items' which has 'prov' with 'page_no'
            page_no = node.metadata.get("page_label") or node.metadata.get("page_no")

            # Fall back to Docling provenance metadata when no direct page key
            if not page_no and "doc_items" in node.metadata:
                try:
                    doc_items = node.metadata["doc_items"]
                    if doc_items and "prov" in doc_items[0] and doc_items[0]["prov"]:
                        page_no = doc_items[0]["prov"][0].get("page_no")
                except (KeyError, IndexError, TypeError):
                    # Best-effort: leave page_no as None when provenance is malformed
                    pass

            sources.append({
                "text": node.get_content()[:250] + "...",  # Snippet for UI
                "page": page_no
            })

        return {
            "answer_gen": response.response_gen,  # Generator for streaming
            "sources": sources
        }
208
+
209
+
210
+
211
+ def get_kpi_viz_data(self):
212
+ """
213
+ Processes existing KPI text and extracts numerical pairs for charting.
214
+ """
215
+ kpi_text = self.get_deep_insights().get("key_metrics", "")
216
+ if not kpi_text:
217
+ return None
218
+
219
+ prompt = f"""
220
+ Extract key numerical metrics from the following text for visualization.
221
+ Format as a JSON list of objects with 'label' and 'value'.
222
+ Include only numerical values. If a value is a percentage, convert 10% to 10.
223
+
224
+ Text: {kpi_text}
225
+ """
226
+
227
+ try:
228
+ response = self.llm.complete(prompt)
229
+ raw_json = str(response)
230
+ if "```json" in raw_json:
231
+ raw_json = raw_json.split("```json")[1].split("```")[0].strip()
232
+ return json.loads(raw_json)
233
+ except Exception:
234
+ return None
235
+
236
+
237
+ def summarize_document(self):
238
+ if not self.is_loaded: return "No document loaded."
239
+ query_engine = self.summary_index.as_query_engine(
240
+ response_mode="tree_summarize",
241
+ streaming=True
242
+ )
243
+ response = query_engine.query("Provide a comprehensive executive summary of this document.")
244
+ return response
245
+
246
+
247
+ def get_deep_insights(self) -> Dict[str, str]:
248
+ """
249
+ Performs a multi-stage analysis to extract strategic depth.
250
+ """
251
+ if not self.is_loaded: return {}
252
+
253
+ prompts = {
254
+ "strategic_vision": "What is the primary strategic vision or long-term objective described in this document?",
255
+ "key_metrics": "Extract the top 5 most critical numerical KPIs or financial metrics mentioned. Format as a list.",
256
+ "risks_and_challenges": "Identify the most significant risks, headwinds, or challenges mentioned for the business.",
257
+ "swot_analysis": "Based on the content, provide a concise SWOT analysis (Strengths, Weaknesses, Opportunities, Threats) in valid JSON format with keys 'S', 'W', 'O', 'T'."
258
+ }
259
+
260
+ insights = {}
261
+ for key, query in prompts.items():
262
+ result = self.answer_question(query)
263
+ insights[key] = result.get("answer_text") or result.get("answer", "")
264
+
265
+ return insights
266
+
267
+
268
+ def _init_registry(self):
269
+ if not os.path.exists(self.registry_path):
270
+ with open(self.registry_path, "w") as f:
271
+ json.dump({}, f)
272
+
273
+ def _get_registry(self) -> Dict[str, str]:
274
+ try:
275
+ with open(self.registry_path, "r") as f:
276
+ return json.load(f)
277
+ except Exception:
278
+ return {}
279
+
280
+ def _save_to_registry(self, file_hash: str, filename: str):
281
+ registry = self._get_registry()
282
+ registry[file_hash] = filename
283
+ with open(self.registry_path, "w") as f:
284
+ json.dump(registry, f)
285
+
286
+ def get_library(self) -> List[Dict[str, str]]:
287
+ registry = self._get_registry()
288
+ return [{"hash": h, "filename": f} for h, f in registry.items()]
289
+
290
+ def delete_document(self, file_hash: str):
291
+ registry = self._get_registry()
292
+ if file_hash in registry:
293
+ doc_path = os.path.join(self.cache_dir, file_hash)
294
+ if os.path.exists(doc_path):
295
+ shutil.rmtree(doc_path)
296
+ del registry[file_hash]
297
+ with open(self.registry_path, "w") as f:
298
+ json.dump(registry, f)
299
+ if self.is_loaded and getattr(self, "current_hash", None) == file_hash:
300
+ self.is_loaded = False
301
+ return True
302
+ return False
303
+
304
+
305
+
agent/llm_client.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from groq import Groq
3
+ from dotenv import load_dotenv
4
+
5
+ # Load environment variables
6
+ load_dotenv()
7
+
8
class GroqClient:
    """Thin wrapper around the Groq chat-completions API.

    Reads GROQ_API_KEY / GROQ_MODEL from the environment when explicit
    values are not supplied.
    """

    def __init__(self, api_key=None, model=None):
        self.api_key = api_key or os.getenv("GROQ_API_KEY")
        self.model = model or os.getenv("GROQ_MODEL", "meta-llama/llama-4-scout-17b-16e-instruct")

        if not self.api_key:
            raise ValueError("Groq API Key not found. Please set GROQ_API_KEY in your .env file.")

        self.client = Groq(api_key=self.api_key)

    def get_completion(self, prompt: str, system_message: str = "You are a helpful AI assistant."):
        """
        Calls the Groq API to get a completion for the given prompt.

        Returns:
            The completion text, or an error string on failure (best-effort
            contract preserved from the original: never raises here).
        """
        try:
            chat_completion = self.client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": system_message,
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=self.model,
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            return f"Error calling Groq API: {e}"

    def get_json_completion(self, prompt: str, system_message: str = "You are a helpful AI assistant."):
        """
        Calls the Groq API with JSON mode enabled.

        Returns:
            A JSON string — either the model output or {"error": "..."}.
        """
        try:
            chat_completion = self.client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": system_message,
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=self.model,
                response_format={"type": "json_object"},
            )
            return chat_completion.choices[0].message.content
        except Exception as e:
            # BUG FIX: the original f-string interpolation produced invalid
            # JSON whenever the error message contained quotes or backslashes;
            # json.dumps escapes the message correctly.
            import json
            return json.dumps({"error": str(e)})

    def list_models(self):
        """
        Lists available models from Groq; empty list on failure.
        """
        try:
            models = self.client.models.list()
            return [model.id for model in models.data]
        except Exception as e:
            print(f"Error listing models: {e}")
            return []
73
+
74
if __name__ == "__main__":
    # Smoke test for the LLM client (requires a valid GROQ_API_KEY in .env)
    try:
        client = GroqClient()
        response = client.get_completion("Hello, how are you?")
        print(f"Groq Response: {response}")
    except ValueError as e:
        # Raised by GroqClient.__init__ when no API key is configured
        print(e)
app.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import pandas as pd
4
+ import json
5
+ import time
6
+
7
+ from dotenv import load_dotenv
8
+ from agent.llm_client import GroqClient
9
+ from agent.agent import LlamaPDFAgent as PDFAgent, AgentRateLimitError
10
+
11
+ # Load environment variables
12
+
13
+ load_dotenv()
14
+
15
+ # Page configuration
16
+ st.set_page_config(
17
+ page_title="Naresh AI - PDF Intelligence",
18
+ page_icon="📄",
19
+ layout="wide",
20
+ )
21
+
22
+ # Custom Styling for a Premium Dark Mode (Consistent with Challenge A)
23
+ st.markdown("""
24
+ <style>
25
+ /* Main container styling - Deep Dark Gradient */
26
+ .stApp {
27
+ background: radial-gradient(circle at top left, #1e293b 0%, #0f172a 100%) !important;
28
+ color: #f1f5f9 !important;
29
+ }
30
+
31
+ /* Header and Title styling - Neon Blue */
32
+ h1 {
33
+ color: #60a5fa !important;
34
+ font-family: 'Outfit', sans-serif;
35
+ font-weight: 800 !important;
36
+ letter-spacing: -0.05rem;
37
+ text-shadow: 0 0 20px rgba(96, 165, 250, 0.3);
38
+ }
39
+
40
+ h3 {
41
+ color: #94a3b8 !important;
42
+ font-weight: 400 !important;
43
+ }
44
+
45
+ /* Input styling - Darker Glass */
46
+ .stTextInput>div>div>input {
47
+ background-color: rgba(30, 41, 59, 0.7) !important;
48
+ color: white !important;
49
+ border: 1px solid rgba(96, 165, 250, 0.5) !important;
50
+ border-radius: 12px !important;
51
+ padding: 12px 20px !important;
52
+ font-size: 1.1rem !important;
53
+ }
54
+
55
+ /* Button styling - Glowing Blue */
56
+ .stButton>button {
57
+ background: linear-gradient(90deg, #2563eb 0%, #3b82f6 100%) !important;
58
+ color: white !important;
59
+ border: none !important;
60
+ border-radius: 12px !important;
61
+ padding: 15px 30px !important;
62
+ font-weight: 700 !important;
63
+ font-size: 1.1rem !important;
64
+ transition: all 0.3s ease !important;
65
+ box-shadow: 0 0 15px rgba(37, 99, 235, 0.4) !important;
66
+ width: 100% !important;
67
+ }
68
+
69
+ .stButton>button:hover {
70
+ transform: translateY(-2px) !important;
71
+ box-shadow: 0 0 30px rgba(59, 130, 246, 0.6) !important;
72
+ }
73
+
74
+ /* Result Card styling - Dark Inset */
75
+ .answer-container {
76
+ background-color: rgba(30, 41, 59, 0.5);
77
+ padding: 30px;
78
+ border-radius: 20px;
79
+ backdrop-filter: blur(20px);
80
+ border: 1px solid rgba(255, 255, 255, 0.1);
81
+ box-shadow: inset 0 0 20px rgba(0, 0, 0, 0.2);
82
+ border-left: 8px solid #2563eb;
83
+ margin-top: 25px;
84
+ }
85
+
86
+ /* Sidebar Dark Glass */
87
+ section[data-testid="stSidebar"] {
88
+ background-color: rgba(15, 23, 42, 0.95) !important;
89
+ backdrop-filter: blur(20px) !important;
90
+ border-right: 1px solid rgba(255, 255, 255, 0.1) !important;
91
+ }
92
+
93
+ .brand-text {
94
+ font-size: 1.5rem;
95
+ font-weight: 900;
96
+ background: linear-gradient(90deg, #60a5fa, #3b82f6);
97
+ -webkit-background-clip: text;
98
+ -webkit-text-fill-color: transparent;
99
+ margin-bottom: 20px;
100
+ }
101
+
102
+ /* Standard Text Color Fixes */
103
+ .stMarkdown, p, li {
104
+ color: #cbd5e1 !important;
105
+ }
106
+
107
+ strong {
108
+ color: #f1f5f9 !important;
109
+ }
110
+ </style>
111
+ """, unsafe_allow_html=True)
112
+
113
+ # Initialize Session State
114
+ if "pdf_agent" not in st.session_state:
115
+ st.session_state.pdf_agent = None
116
+ if "messages" not in st.session_state:
117
+ st.session_state.messages = []
118
+ if "deep_insights" not in st.session_state:
119
+ st.session_state.deep_insights = {}
120
+
121
+
122
+ # Sidebar
123
+ with st.sidebar:
124
+ st.markdown('<div class="brand-text">NARESH AI</div>', unsafe_allow_html=True)
125
+ st.title("Settings")
126
+
127
+ # API Key Input
128
+ groq_api_key = st.text_input("Groq API Key", type="password", value=os.getenv("GROQ_API_KEY", ""))
129
+
130
+ # Dynamic Model Fetching
131
+ available_models = ["meta-llama/llama-4-scout-17b-16e-instruct", "llama-3.3-70b-versatile", "mixtral-8x7b-32768"]
132
+ if groq_api_key:
133
+ try:
134
+ temp_client = GroqClient(api_key=groq_api_key)
135
+ fetched_models = temp_client.list_models()
136
+ if fetched_models:
137
+ available_models = fetched_models
138
+ except Exception:
139
+ pass
140
+
141
+ model_choice = st.selectbox(
142
+ "Model Architecture",
143
+ available_models,
144
+ index=0 if "meta-llama/llama-4-scout-17b-16e-instruct" not in available_models else available_models.index("meta-llama/llama-4-scout-17b-16e-instruct")
145
+ )
146
+
147
+
148
+ st.divider()
149
+ st.markdown("### 🗂️ Document Library")
150
+
151
+ # Initialize agent if not exist (for library access)
152
+ if "pdf_agent" in st.session_state and st.session_state.pdf_agent:
153
+ if not hasattr(st.session_state.pdf_agent, "get_library"):
154
+ st.session_state.pdf_agent = None # Clear stale object
155
+
156
+ if not st.session_state.pdf_agent:
157
+ from agent.agent import LlamaPDFAgent as PDFAgent
158
+ st.session_state.pdf_agent = PDFAgent(api_key=groq_api_key or os.getenv("GROQ_API_KEY"), model=model_choice)
159
+
160
+ library = st.session_state.pdf_agent.get_library()
161
+ if not library:
162
+ st.caption("No documents in library.")
163
+ else:
164
+ for doc in library:
165
+ col1, col2 = st.columns([0.8, 0.2])
166
+ with col1:
167
+ st.markdown(f"**{doc['filename']}**")
168
+ with col2:
169
+ if st.button("🗑️", key=f"del_{doc['hash']}", help="Delete vectors"):
170
+ if st.session_state.pdf_agent.delete_document(doc['hash']):
171
+ st.session_state.pdf_agent = None # Force re-init if active one deleted
172
+ st.rerun()
173
+ st.info("To switch document, simply upload it again. It will load instantly from the library.")
174
+
175
+ st.divider()
176
+ st.markdown("### Document Controls")
177
+ if st.button("Reset Session"):
178
+ st.session_state.pdf_agent = None
179
+ st.session_state.messages = []
180
+ st.session_state.deep_insights = {}
181
+ st.rerun()
182
+
183
+
184
+
185
+ st.divider()
186
+ st.markdown("### Profile")
187
+ st.write("**Built by:** Naresh Kumar Lahajal")
188
+ st.write("**Role:** GenAI Enthusiast")
189
+ st.info("High-speed PDF intelligence powered by Groq and FastEmbed.")
190
+
191
+ # Header
192
+ st.title("Naresh AI DocuPulse")
193
+ st.subheader("Challenge B: PDF RAG & Summarization")
194
+
195
+ # File Upload
196
+ uploaded_file = st.file_uploader("Upload a PDF document", type=["pdf"])
197
+
198
+ if uploaded_file and (st.session_state.pdf_agent is None or uploaded_file.name != st.session_state.get("last_uploaded_file")):
199
+ with st.status("Ingesting document and indexing knowledge...", expanded=True) as status:
200
+ try:
201
+ agent = PDFAgent(api_key=groq_api_key, model=model_choice)
202
+ status_msg = agent.ingest_pdf(uploaded_file)
203
+ st.session_state.pdf_agent = agent
204
+ st.session_state.last_uploaded_file = uploaded_file.name
205
+ # Sync tables for explorer
206
+ st.session_state.extracted_tables = agent.tables
207
+ # Auto-Clear History on New Upload
208
+ st.session_state.messages = []
209
+ st.session_state.deep_insights = {}
210
+ status.update(label=f"✅ {status_msg}", state="complete", expanded=False)
211
+ st.toast("Intelligence Engine Initialized", icon="🧠")
212
+
213
+ except Exception as e:
214
+ st.error(f"Error processing PDF: {e}")
215
+
216
+
217
+ # Helper for Exact Backoff
218
def run_with_exact_backoff(func, *args, **kwargs):
    """
    Runs a function and catches AgentRateLimitError to perform a precise UI countdown retry.

    Args:
        func: Callable to invoke (typically a PDFAgent method).
        *args, **kwargs: Forwarded to *func*.

    Returns:
        Whatever *func* returns on success.

    Raises:
        AgentRateLimitError: Re-raised after the final failed attempt.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            return func(*args, **kwargs)
        except AgentRateLimitError as e:
            # On the final attempt, surface the failure in the UI and re-raise
            if attempt == max_attempts - 1:
                st.error(f"Failed after {max_attempts} attempts due to Persistent Rate Limits. Please wait a few minutes.")
                raise e

            # Precise wait + 1s buffer
            wait_time = int(e.wait_time) + 1
            st.toast(f"Rate Limit Hit! Waiting {wait_time}s to retry...", icon="⏳")

            # Visual Countdown
            placeholder = st.empty()
            for remaining in range(wait_time, 0, -1):
                placeholder.warning(f"⚠️ API Cooldown: Retrying in {remaining} seconds...")
                time.sleep(1)
            placeholder.empty()
    # NOTE(review): unreachable in practice — the final attempt either returns
    # or re-raises above. The garbled source makes the original indentation of
    # this `return None` ambiguous (function level vs. inside the except);
    # function level is the only reading under which retries can happen —
    # TODO confirm against the original file.
    return None
242
+
243
+ if st.session_state.pdf_agent:
244
+
245
+ # Action Tabs
246
+ tab1, tab2, tab3, tab4 = st.tabs(["💬 Ask Questions", "📝 Auto-Summary", "🧠 Deep Intelligence", "📋 Table Explorer"])
247
+
248
+
249
+ with tab1:
250
+ st.markdown("### 💬 Document Conversation")
251
+ st.caption("Ask questions about the document and maintain a conversation thread.")
252
+
253
+ # Display Chat History
254
+ for message in st.session_state.messages:
255
+ with st.chat_message(message["role"]):
256
+ st.markdown(message["content"])
257
+ if "sources" in message and message["sources"]:
258
+ with st.expander("🔗 Sources & Citations", expanded=False):
259
+ for i, src in enumerate(message["sources"]):
260
+ page_text = f"Page {src['page']}" if src['page'] else "Unknown Page"
261
+ st.markdown(f"**[{i+1}] {page_text}**")
262
+ st.caption(f"_{src['text']}_")
263
+ st.divider()
264
+
265
+ # Chat Input
266
+ if prompt := st.chat_input("What would you like to know?"):
267
+ # Add user message to history
268
+ st.session_state.messages.append({"role": "user", "content": prompt})
269
+ with st.chat_message("user"):
270
+ st.markdown(prompt)
271
+
272
+ # Generate AI response
273
+ with st.chat_message("assistant"):
274
+ with st.spinner("Analyzing document context..."):
275
+ response_data = run_with_exact_backoff(st.session_state.pdf_agent.answer_question, prompt)
276
+ if response_data:
277
+ # Use st.write_stream for typing effect
278
+ answer = st.write_stream(response_data['answer_gen'])
279
+ sources = response_data.get("sources", [])
280
+
281
+ if sources:
282
+ with st.expander("🔗 Sources & Citations", expanded=False):
283
+ for i, src in enumerate(sources):
284
+ page_text = f"Page {src['page']}" if src['page'] else "Unknown Page"
285
+ st.markdown(f"**[{i+1}] {page_text}**")
286
+ st.caption(f"_{src['text']}_")
287
+ st.divider()
288
+
289
+ # Add assistant response to history
290
+ st.session_state.messages.append({
291
+ "role": "assistant",
292
+ "content": answer,
293
+ "sources": sources
294
+ })
295
+
296
+
297
+
298
+
299
+
300
+ with tab2:
301
+ if st.button("Generate Executive Summary"):
302
+ with st.spinner("Synthesizing document overview..."):
303
+ streaming_response = run_with_exact_backoff(st.session_state.pdf_agent.summarize_document)
304
+ if streaming_response:
305
+ st.markdown('<div class="answer-container" style="border-left: 8px solid #60a5fa;">', unsafe_allow_html=True)
306
+ st.markdown("### 📝 Document Summary")
307
+ st.write_stream(streaming_response.response_gen)
308
+ st.markdown('</div>', unsafe_allow_html=True)
309
+
310
+
311
+
312
+ with tab3:
313
+ st.markdown("### 🚀 Strategic Deep Analysis")
314
+ st.info("This mode uses multi-stage recursive retrieval to extract deep strategic insights and KPIs.")
315
+
316
+ if st.button("Run Deep Intelligence Scan"):
317
+ with st.status("Analyzing document layers...", expanded=True) as status:
318
+ st.write("🔍 Extracting Strategic Vision...")
319
+ insights = run_with_exact_backoff(st.session_state.pdf_agent.get_deep_insights)
320
+ if insights:
321
+ st.session_state.deep_insights = insights
322
+
323
+ # Fetch KPI visualization data
324
+ st.write("📊 Generating Visual Analytics...")
325
+ viz_data = run_with_exact_backoff(st.session_state.pdf_agent.get_kpi_viz_data)
326
+ st.session_state.kpi_viz_data = viz_data
327
+
328
+ status.update(label="✅ Deep Analysis Complete", state="complete", expanded=False)
329
+ else:
330
+ status.update(label="❌ Failed after retries", state="error", expanded=False)
331
+
332
+
333
+
334
+ if st.session_state.deep_insights:
335
+ insights = st.session_state.deep_insights
336
+
337
+ # 1. Strategic Vision
338
+ st.markdown('<div class="answer-container" style="border-left: 8px solid #8b5cf6;">', unsafe_allow_html=True)
339
+ st.markdown("#### 🎯 Strategic Vision")
340
+ st.write(insights.get("strategic_vision", "N/A"))
341
+ st.markdown('</div>', unsafe_allow_html=True)
342
+
343
+ col1, col2 = st.columns(2)
344
+
345
+ with col1:
346
+ # 2. Key Metrics
347
+ st.markdown("#### 📊 Key Performance Indicators")
348
+ metrics_text = insights.get("key_metrics", "")
349
+ st.markdown(metrics_text if metrics_text else "No metrics extracted.")
350
+
351
+ with col2:
352
+ # 3. Risks
353
+ st.markdown("#### ⚠️ Risks & Challenges")
354
+ risks_text = insights.get("risks_and_challenges", "")
355
+ st.markdown(risks_text if risks_text else "No risks identified.")
356
+
357
+ # Visual Dashboard Section
358
+ if st.session_state.get("kpi_viz_data"):
359
+ st.divider()
360
+ st.markdown("#### 📈 Key Trends & Metrics")
361
+ viz_df = pd.DataFrame(st.session_state.kpi_viz_data)
362
+
363
+ # Heuristic for chart type
364
+ if any("year" in str(l).lower() or "q1" in str(l).lower() or "q2" in str(l).lower() or "q3" in str(l).lower() or "q4" in str(l).lower() for l in viz_df['label']):
365
+ st.line_chart(viz_df.set_index('label'), color="#3b82f6")
366
+ st.caption("Auto-detected Time Series data.")
367
+ else:
368
+ st.bar_chart(viz_df.set_index('label'), color="#60a5fa")
369
+ st.caption("Bar chart representation of extracted KPIs.")
370
+
371
+ # 4. SWOT Analysis
372
+
373
+ st.divider()
374
+ st.markdown("#### 🛠️ Automated SWOT Analysis")
375
+ swot_raw = insights.get("swot_analysis", "{}")
376
+ try:
377
+ # Attempt to clean potential markdown artifacts around JSON
378
+ if "```json" in swot_raw:
379
+ swot_raw = swot_raw.split("```json")[1].split("```")[0].strip()
380
+ elif "{" in swot_raw:
381
+ swot_raw = "{" + swot_raw.split("{", 1)[1].rsplit("}", 1)[0] + "}"
382
+
383
+ swot_data = json.loads(swot_raw)
384
+
385
+ # Display SWOT in a grid
386
+ s_col1, s_col2 = st.columns(2)
387
+ with s_col1:
388
+ st.success(f"**Strengths**\n\n{swot_data.get('S', 'N/A')}")
389
+ st.info(f"**Opportunities**\n\n{swot_data.get('O', 'N/A')}")
390
+ with s_col2:
391
+ st.warning(f"**Weaknesses**\n\n{swot_data.get('W', 'N/A')}")
392
+ st.error(f"**Threats**\n\n{swot_data.get('T', 'N/A')}")
393
+ except Exception as e:
394
+ st.write("Raw SWOT Insight:")
395
+ st.write(swot_raw)
396
+
397
+ # Report Export
398
+ st.divider()
399
+ report_md = f"""# Executive Intelligence Report: {st.session_state.last_uploaded_file}
400
+
401
+ ## 🎯 Strategic Vision
402
+ {insights.get('strategic_vision', 'N/A')}
403
+
404
+ ## 📊 Key Performance Indicators
405
+ {insights.get('key_metrics', 'N/A')}
406
+
407
+ ## ⚠️ Risks & Challenges
408
+ {insights.get('risks_and_challenges', 'N/A')}
409
+
410
+ ## 🛠️ SWOT Analysis
411
+ ### Strengths
412
+ {swot_data.get('S', 'N/A') if 'swot_data' in locals() else 'N/A'}
413
+
414
+ ### Weaknesses
415
+ {swot_data.get('W', 'N/A') if 'swot_data' in locals() else 'N/A'}
416
+
417
+ ### Opportunities
418
+ {swot_data.get('O', 'N/A') if 'swot_data' in locals() else 'N/A'}
419
+
420
+ ### Threats
421
+ {swot_data.get('T', 'N/A') if 'swot_data' in locals() else 'N/A'}
422
+
423
+ ---
424
+ *Report generated by Naresh AI DocuPulse*
425
+ """
426
+ st.download_button(
427
+ label="📥 Download Executive Intelligence Report",
428
+ data=report_md,
429
+ file_name=f"Intelligence_Report_{st.session_state.last_uploaded_file.replace('.pdf', '')}.md",
430
+ mime="text/markdown"
431
+ )
432
+
433
+ with tab4:
434
+ st.markdown("### 📋 PDF Table Explorer")
435
+ st.info("Direct extraction of tabular data from the document. Select a table to explore.")
436
+
437
+ tables = st.session_state.pdf_agent.tables
438
+ if not tables:
439
+ st.warning("No structured tables were detected in the document.")
440
+ else:
441
+ table_labels = [f"{t['label']} (Page Grounded)" for t in tables]
442
+ selected_label = st.selectbox("Select Table", table_labels)
443
+
444
+ # Find the selected table
445
+ selected_idx = table_labels.index(selected_label)
446
+ selected_table = tables[selected_idx]
447
+
448
+ st.markdown(f"#### {selected_table['label']}")
449
+ st.dataframe(selected_table['df'], use_container_width=True)
450
+
451
+ # Download as CSV
452
+ csv = selected_table['df'].to_csv(index=False).encode('utf-8')
453
+ st.download_button(
454
+ label=f"📥 Download {selected_table['label']} as CSV",
455
+ data=csv,
456
+ file_name=f"{selected_table['label'].replace(' ', '_')}.csv",
457
+ mime="text/csv"
458
+ )
459
+
460
+
461
+
462
+
463
+ else:
464
+ st.info("Please upload a PDF document to begin analysis.")
465
+
466
+
467
+ # Footer
468
+ st.divider()
469
+ st.markdown(
470
+ """
471
+ <div style="text-align: center; color: #64748b; padding: 20px;">
472
+ © 2026 <b>Naresh Kumar Lahajal</b>. All Rights Reserved.<br>
473
+ <small>Powered by Groq and Retrieval-Augmented Generation</small>
474
+ </div>
475
+ """,
476
+ unsafe_allow_html=True
477
+ )
ingestion/__init__.py ADDED
File without changes
ingestion/vector_store.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ import os
4
+ from fastembed import TextEmbedding
5
+ from typing import List, Tuple
6
+
7
class VectorStore:
    """FAISS-backed vector store with fastembed embeddings and on-disk caching."""

    def __init__(self, model_name: str = "BAAI/bge-small-en-v1.5", cache_dir: str = ".cache"):
        """
        Args:
            model_name: fastembed model used to embed chunks and queries.
            cache_dir: Directory where indexes and chunk arrays are persisted.
        """
        self.encoder = TextEmbedding(model_name=model_name)
        self.index = None   # faiss.IndexFlatL2, built lazily by build_index/load_index
        self.chunks = []    # chunk texts aligned with index row ids
        self.cache_dir = cache_dir
        # exist_ok avoids the check-then-create race of the exists()/makedirs() pair
        os.makedirs(self.cache_dir, exist_ok=True)

    def build_index(self, chunks: List[str]):
        """
        Embeds chunks and builds an L2 FAISS index over them.

        Raises:
            ValueError: If ``chunks`` is empty (an empty embedding matrix has
                no dimension, which would otherwise surface as an IndexError).
        """
        if not chunks:
            raise ValueError("Cannot build an index from an empty chunk list.")
        self.chunks = chunks
        embeddings = list(self.encoder.embed(chunks))
        embeddings_np = np.array(embeddings).astype('float32')

        dimension = embeddings_np.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(embeddings_np)

    def save_index(self, key: str):
        """Persists the FAISS index and its chunks under the given cache key."""
        if self.index is not None:
            faiss.write_index(self.index, os.path.join(self.cache_dir, f"{key}.index"))
            # Stored as an object array; load_index must pass allow_pickle=True.
            np.save(os.path.join(self.cache_dir, f"{key}_chunks.npy"), np.array(self.chunks))

    def load_index(self, key: str) -> bool:
        """
        Loads a cached index and its chunks if both files exist.

        Returns:
            True on a cache hit (index and chunks loaded), False otherwise.
        """
        index_path = os.path.join(self.cache_dir, f"{key}.index")
        chunks_path = os.path.join(self.cache_dir, f"{key}_chunks.npy")
        if os.path.exists(index_path) and os.path.exists(chunks_path):
            self.index = faiss.read_index(index_path)
            self.chunks = np.load(chunks_path, allow_pickle=True).tolist()
            return True
        return False

    def search(self, query: str, top_k: int = 4) -> List[Tuple[str, float]]:
        """
        Returns up to ``top_k`` (chunk, L2 distance) pairs most similar to ``query``.

        Returns an empty list when no index has been built or loaded yet.
        """
        if self.index is None:
            return []

        query_embedding = list(self.encoder.embed([query]))[0]
        query_embedding_np = np.array([query_embedding]).astype('float32')

        distances, indices = self.index.search(query_embedding_np, top_k)

        # FAISS pads with -1 when the index holds fewer than top_k vectors.
        results = []
        for rank, idx in enumerate(indices[0]):
            if idx != -1:
                results.append((self.chunks[idx], float(distances[0][rank])))
        return results
65
+
66
if __name__ == "__main__":
    # Quick smoke test: index two sentences and run a single query.
    store = VectorStore()
    sample_chunks = [
        "Hello, world!",
        "The quick brown fox jumps over the lazy dog.",
    ]
    store.build_index(sample_chunks)
    for chunk, distance in store.search("What animal jumps?"):
        print(f"Result: {chunk} (Distance: {distance})")
processor/__init__.py ADDED
File without changes
processor/pdf_processor.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import tempfile
3
+ import os
4
+ import io
5
+ from pathlib import Path
6
+ from typing import List, Dict
7
+ import pandas as pd
8
+ from llama_index.readers.docling import DoclingReader
9
+ from docling.document_converter import DocumentConverter
10
+
11
class PDFProcessor:
    """Converts uploaded PDF files into Docling documents and pandas tables."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        # Chunking parameters are kept for downstream node parsers.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.doc_converter = DocumentConverter()

    def get_pdf_hash(self, pdf_file) -> str:
        """
        Generates an MD5 hash of the full file contents to use as a cache key.

        The file's read position is restored before returning.
        """
        original_pos = pdf_file.tell()
        pdf_file.seek(0)
        digest = hashlib.md5(pdf_file.read()).hexdigest()
        pdf_file.seek(original_pos)
        return digest

    def load_docling_documents(self, pdf_file) -> Dict:
        """
        Runs DoclingReader (for RAG) and DocumentConverter (for table extraction).

        Returns:
            Dict with 'documents' (LlamaIndex documents) and 'tables'
            (list of {'id', 'label', 'df'} entries, one per extracted table).
        """
        # Docling works on filesystem paths, so spill the upload to a temp file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            pdf_file.seek(0)
            tmp.write(pdf_file.read())
            tmp_path = Path(tmp.name)

        try:
            # 1. Ingest for LlamaIndex RAG
            reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
            documents = reader.load_data(file_path=tmp_path)

            # 2. Extract structured tables for the DataFrame explorer
            doc = self.doc_converter.convert(tmp_path).document

            tables = []
            for table_idx, table in enumerate(doc.tables):
                try:
                    # Round-trip through HTML so pandas handles cell parsing.
                    frames = pd.read_html(io.StringIO(table.export_to_html()))
                    if frames:
                        tables.append({
                            "id": table_idx + 1,
                            "label": f"Table {table_idx + 1}",
                            "df": frames[0],
                        })
                except Exception:
                    # Best effort: one malformed table must not abort ingestion.
                    pass

            return {
                "documents": documents,
                "tables": tables
            }
        finally:
            # Always remove the temp file, even when conversion fails.
            try:
                if tmp_path.exists():
                    tmp_path.unlink()
            except Exception:
                pass
72
+
73
if __name__ == "__main__":
    # Placeholder entry point; processing is exercised via the scripts/ tests.
    pass
76
+
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ groq
2
+ pypdf
3
+ python-dotenv
4
+ streamlit
5
+ fastembed
6
+ numpy
7
+ faiss-cpu
8
+ llama-index-core
9
+ llama-index-llms-groq
10
+ llama-index-embeddings-fastembed
11
+ llama-index-readers-file
12
+ llama-index-vector-stores-faiss
13
+ docling
14
+ llama-index-readers-docling
15
+ rank-bm25
16
+ llama-index-retrievers-bm25
17
+ llama-index-node-parser-docling
18
+ pandas
19
+ pydantic
20
+ lxml
21
+ html5lib
scripts/__init__.py ADDED
File without changes
scripts/check_meta.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.readers.docling import DoclingReader
2
+ import os
3
+ from pathlib import Path
4
+
5
def check_metadata():
    """Print the metadata Docling attaches to the first documents of the sample PDF."""
    pdf_path = "nvidia_q4_fy24.pdf"
    if not os.path.exists(pdf_path):
        print("PDF not found.")
        return

    documents = DoclingReader().load_data(file_path=Path(pdf_path))

    print(f"Loaded {len(documents)} documents.")
    # The first two documents are enough to see the metadata schema.
    for idx, document in enumerate(documents[:2]):
        print(f"Doc {idx} Metadata: {document.metadata}")


if __name__ == "__main__":
    check_metadata()
scripts/inspect_nodes.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.readers.docling import DoclingReader
2
+ from llama_index.node_parser.docling import DoclingNodeParser
3
+ import os
4
+ from pathlib import Path
5
+
6
def inspect_nodes():
    """Show which metadata keys DoclingNodeParser attaches to parsed nodes."""
    pdf_path = "nvidia_q4_fy24.pdf"
    reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
    documents = reader.load_data(file_path=Path(pdf_path))

    nodes = DoclingNodeParser().get_nodes_from_documents(documents)

    if not nodes:
        print("No nodes created.")
        return
    first = nodes[0]
    print(f"Node 0 Metadata: {first.metadata.keys()}")
    print(f"Node 0 Metadata Content: {first.metadata}")


if __name__ == "__main__":
    inspect_nodes()
scripts/inspect_nodes_clean.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.readers.docling import DoclingReader
2
+ from llama_index.node_parser.docling import DoclingNodeParser
3
+ import os
4
+ from pathlib import Path
5
+ import json
6
+
7
def inspect_nodes():
    """Dump full metadata for a mid-document slice of Docling-parsed nodes."""
    pdf_path = "nvidia_q4_fy24.pdf"
    reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
    documents = reader.load_data(file_path=Path(pdf_path))

    nodes = DoclingNodeParser().get_nodes_from_documents(documents)

    if not nodes:
        print("No nodes created.")
        return
    # Skip the leading title/heading nodes; nodes 5-14 are more likely to
    # carry page-grounding metadata.
    for node in nodes[5:15]:
        print("--- METADATA START ---")
        print(json.dumps(node.metadata, indent=2))
        print("--- METADATA END ---")


if __name__ == "__main__":
    inspect_nodes()
scripts/test_agent.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ # Add project root to sys.path
6
+ root_dir = Path(__file__).parent.parent
7
+ sys.path.append(str(root_dir))
8
+
9
+ from dotenv import load_dotenv
10
+ from agent.agent import LlamaPDFAgent
11
+ import io
12
+
13
+ load_dotenv()
14
+
15
def test_agent():
    """End-to-end smoke test: ingest the NVIDIA PDF, then run Q&A and insights."""
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        print("GROQ_API_KEY not found in environment.")
        return

    agent = LlamaPDFAgent(api_key=api_key)

    # Use the downloaded NVIDIA PDF - updated path
    pdf_path = os.path.join(root_dir, "nvidia_q4_fy24.pdf")
    if not os.path.exists(pdf_path):
        print(f"PDF not found: {pdf_path}")
        return

    with open(pdf_path, "rb") as f:
        # Mocking a streamlit-like upload object
        class MockFile:
            def __init__(self, file, name):
                self.file = file
                self.name = name

            def read(self):
                return self.file.read()

            def seek(self, pos):
                self.file.seek(pos)

            def tell(self):
                return self.file.tell()

        print("Ingesting PDF...")
        print(agent.ingest_pdf(MockFile(f, pdf_path)))

        print("\n--- Testing Q&A ---")
        q = "What was the total revenue for FY24?"
        result = agent.answer_question(q)
        print(f"Q: {q}")
        print(f"A: {result['answer']}")
        print("\nSources:")
        for src in result['sources']:
            print(f"- [Page {src['page']}] {src['text'][:100]}...")

        print("\n--- Testing Deep Insights ---")
        insights = agent.get_deep_insights()

        for key, value in insights.items():
            print(f"\n[{key.upper()}]")
            print(value)


if __name__ == "__main__":
    test_agent()
scripts/verify_cite.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ # Add project root to sys.path
6
+ root_dir = Path(__file__).parent.parent
7
+ sys.path.append(str(root_dir))
8
+
9
+ from dotenv import load_dotenv
10
+ from agent.agent import LlamaPDFAgent
11
+
12
+ load_dotenv()
13
+
14
def verify_citations():
    """Ingest the sample PDF and print the answer plus page-grounded sources."""
    agent = LlamaPDFAgent()
    # Updated path to root
    pdf_path = os.path.join(root_dir, "nvidia_q4_fy24.pdf")
    # Fail fast with a clear message instead of an unhandled FileNotFoundError,
    # matching the guard used by the sibling scripts (check_meta, test_agent).
    if not os.path.exists(pdf_path):
        print(f"PDF not found: {pdf_path}")
        return

    with open(pdf_path, "rb") as f:
        class MockFile:
            """Minimal stand-in for a Streamlit UploadedFile."""
            def __init__(self, file, name):
                self.file = file
                self.name = name
            def read(self): return self.file.read()
            def seek(self, pos): self.file.seek(pos)
            def tell(self): return self.file.tell()

        mock_file = MockFile(f, pdf_path)
        agent.ingest_pdf(mock_file)

        q = "What was the revenue for Data Center in Q4?"
        result = agent.answer_question(q)
        print(f"\nQ: {q}")
        print(f"A: {result['answer']}")
        print("\nSOURCES:")
        for s in result['sources']:
            print(f"- Page {s['page']}: {s['text'][:50]}...")


if __name__ == "__main__":
    verify_citations()