{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "rkRTHAoXbOJC",
        "outputId": "ab776f45-7c6c-4b1c-87bc-7410dc1955fe"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Selecting previously unselected package cloudflared.\n",
            "(Reading database ... 126441 files and directories currently installed.)\n",
            "Preparing to unpack cloudflared-linux-amd64.deb ...\n",
            "Unpacking cloudflared (2025.9.1) ...\n",
            "Setting up cloudflared (2025.9.1) ...\n",
            "Processing triggers for man-db (2.10.2-1) ...\n",
            "cloudflared version 2025.9.1 (built 2025-09-22-13:28 UTC)\n"
          ]
        }
      ],
      "source": [
        "!pip install -r requirements.txt -q\n",
        "!pip install streamlit cloudflared -q\n",
        "!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb\n",
        "!dpkg -i cloudflared-linux-amd64.deb\n",
        "\n",
        "!cloudflared --version\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "l08lsc3SbUy2",
        "outputId": "e7c5db50-4944-4fad-bad6-fae2ec7439aa"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "βœ… CUDA is available. Using GPU: Tesla T4\n"
          ]
        }
      ],
      "source": [
        "import torch\n",
        "\n",
        "if torch.cuda.is_available():\n",
        "    print(f\"βœ… CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}\")\n",
        "    # return True\n",
        "else:\n",
        "    print(\"⚠️ CUDA not available. Falling back to CPU.\")\n",
        "    # return False\n",
        "\n",
        "\n",
        "# # Load the allocator\n",
        "# new_alloc = torch.cuda.memory.CUDAPluggableAllocator(\n",
        "#     'alloc.so', 'my_malloc', 'my_free')\n",
        "# # Swap the current allocator\n",
        "# torch.cuda.memory.change_current_allocator(new_alloc)"
      ]
    },
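    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Hedged illustration: Embeddings.py (next cell) builds a faiss.IndexFlatIP over\n",
        "# L2-normalized vectors, which makes inner-product search equivalent to cosine\n",
        "# similarity. This tiny self-contained check demonstrates that on random data;\n",
        "# it assumes faiss is available (e.g. installed via requirements.txt above).\n",
        "import numpy as np\n",
        "import faiss\n",
        "\n",
        "rng = np.random.default_rng(0)\n",
        "vecs = rng.standard_normal((4, 8)).astype(np.float32)\n",
        "faiss.normalize_L2(vecs)                  # in-place unit-normalization\n",
        "\n",
        "index = faiss.IndexFlatIP(vecs.shape[1])  # inner product over unit vectors = cosine\n",
        "index.add(vecs)\n",
        "\n",
        "scores, ids = index.search(vecs[:1].copy(), 2)\n",
        "print(scores, ids)                        # top hit is the query itself, score ~1.0\n",
        "\n",
        "# Cross-check the second hit against the direct cosine similarity\n",
        "print(float(vecs[0] @ vecs[ids[0][1]]))"
      ]
    },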
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "LHHSaPwNbZXW",
        "outputId": "a2939de4-7a06-4a35-cf6f-190ea3fec13a"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Overwriting Embeddings.py\n"
          ]
        }
      ],
      "source": [
        "%%writefile Embeddings.py\n",
        "\n",
        "import os\n",
        "import glob\n",
        "import pickle, json\n",
        "from tqdm import tqdm\n",
        "import numpy as np\n",
        "\n",
        "# Try imports with friendly errors\n",
        "try:\n",
        "    import faiss\n",
        "except Exception as e:\n",
        "    raise ImportError(\"faiss is required. Install cpu version: `pip install faiss-cpu` or install via conda for GPU (faiss-gpu).\") from e\n",
        "\n",
        "try:\n",
        "    from sentence_transformers import SentenceTransformer\n",
        "except Exception as e:\n",
        "    raise ImportError(\"sentence-transformers is required. `pip install sentence-transformers`\") from e\n",
        "\n",
        "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
        "import torch\n",
        "from google.colab import userdata\n",
        "\n",
        "\n",
        "\n",
        "# from Data_Cleaning import GetDataCleaning\n",
        "# from Logger import GetLogger\n",
        "\n",
        "\n",
        "class GetEmbeddings:\n",
        "    \"\"\"\n",
        "    Embedding pipeline for cleaned text files.\n",
        "    Generates embeddings using SentenceTransformers, builds a FAISS index,\n",
        "    and allows searching queries against the vector database.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, config_path=\"config.json\", logger=None):\n",
        "\n",
        "        with open(config_path, \"r\") as f:\n",
        "            self.config = json.load(f)\n",
        "\n",
        "        cfg_paths = self.config[\"paths\"]\n",
        "\n",
        "\n",
        "        cfg_emb = self.config[\"embedding\"]\n",
        "\n",
        "        self.root = cfg_paths[\"root\"]\n",
        "        self.cleaned_suffix = \"_cleaned_txt\"\n",
        "        self.chunk_words = cfg_emb[\"chunk_words\"]\n",
        "        self.batch_size = cfg_emb[\"batch_size\"]\n",
        "        self.faiss_index_path = cfg_paths[\"faiss_index\"]\n",
        "        self.metadata_path = cfg_paths[\"metadata\"]\n",
        "        self.embedding_model = cfg_emb[\"model\"]\n",
        "\n",
        "        # if not logger:\n",
        "        #     obj = GetLogger()\n",
        "        #     logger = obj.get_logger()\n",
        "        # self.logger = logger\n",
        "        # print(\"Initializing Embedding Pipeline...\")\n",
        "\n",
        "        # Device\n",
        "        self.device = \"cuda\" if self.check_cuda() and cfg_emb[\"use_gpu\"] else \"cpu\"\n",
        "        self.hf_token = \"your_token\"\n",
        "\n",
        "    def check_cuda(self):\n",
        "        \"\"\"Return True if CUDA is available and usable.\"\"\"\n",
        "        try:\n",
        "            if torch.cuda.is_available():\n",
        "                _ = torch.cuda.current_device()\n",
        "                print(f\"βœ… CUDA available. Device: {torch.cuda.get_device_name(0)}\")\n",
        "                return True\n",
        "            print(\"⚠️ CUDA not available. Using CPU.\")\n",
        "            return False\n",
        "        except Exception as e:\n",
        "            print(f\"Error checking CUDA, defaulting to CPU. Error: {e}\")\n",
        "            return False\n",
        "\n",
        "    def list_cleaned_files(self):\n",
        "        \"\"\"Return sorted list of cleaned text files under root/*{cleaned_suffix}/*.txt\"\"\"\n",
        "        pattern = os.path.join(self.root, f\"*{self.cleaned_suffix}\", \"*.txt\")\n",
        "        files = glob.glob(pattern)\n",
        "        files.sort()\n",
        "        return files\n",
        "\n",
        "    def read_text_file(self, path):\n",
        "        \"\"\"Read a text file and return string content.\"\"\"\n",
        "        with open(path, \"r\", encoding=\"utf-8\") as f:\n",
        "            return f.read()\n",
        "\n",
        "    def chunk_text_words(self, text):\n",
        "        \"\"\"\n",
        "        Simple word-based chunking.\n",
        "        Returns list of text chunks.\n",
        "        \"\"\"\n",
        "        words = text.split()\n",
        "        if not words:\n",
        "            return []\n",
        "        return [\" \".join(words[i:i + self.chunk_words]) for i in range(0, len(words), self.chunk_words)]\n",
        "\n",
        "    def save_index_and_metadata(self):\n",
        "        \"\"\"Save FAISS index and metadata to disk.\"\"\"\n",
        "        os.makedirs(os.path.dirname(self.faiss_index_path), exist_ok=True)\n",
        "        faiss.write_index(self.index, self.faiss_index_path)\n",
        "        with open(self.metadata_path, \"wb\") as f:\n",
        "            pickle.dump(self.metadata, f)\n",
        "        print(f\"πŸ’Ύ Saved FAISS index to {self.faiss_index_path}\")\n",
        "        print(f\"πŸ’Ύ Saved metadata to {self.metadata_path}\")\n",
        "\n",
        "    def load_index_and_metadata(self):\n",
        "        \"\"\"Load FAISS index and metadata if they exist.\"\"\"\n",
        "        if os.path.exists(self.faiss_index_path) and os.path.exists(self.metadata_path):\n",
        "            try:\n",
        "                self.index = faiss.read_index(self.faiss_index_path)\n",
        "                with open(self.metadata_path, \"rb\") as f:\n",
        "                    self.metadata = pickle.load(f)\n",
        "                print(f\"βœ… Loaded existing FAISS index + metadata from disk.\")\n",
        "                return True\n",
        "            except Exception as e:\n",
        "                print(f\"⚠️ Failed to load FAISS index/metadata, will rebuild. Error: {e}\")\n",
        "                return False\n",
        "        return False\n",
        "\n",
        "    def load_encoder(self):\n",
        "        \"\"\"Loading Encoder\"\"\"\n",
        "        self.encoder = SentenceTransformer(self.embedding_model, device=self.device)\n",
        "        print(f\"Loaded embedding model '{self.embedding_model}' on {self.device}\")\n",
        "        return self.encoder\n",
        "\n",
        "\n",
        "    def building_embeddings_index(self, files):\n",
        "        \"\"\"Build embeddings for all text chunks and return FAISS index + metadata.\"\"\"\n",
        "\n",
        "\n",
        "        all_embeddings, metadata = [], []\n",
        "        next_id = 0\n",
        "        # Iterate files and chunks\n",
        "        for fp in tqdm(files, desc=\"Files\", unit=\"file\"):\n",
        "            text = self.read_text_file(fp)\n",
        "\n",
        "            if not text.strip():\n",
        "                continue\n",
        "\n",
        "            # metadata: infer company and file from path\n",
        "            # e.g., financial_reports/Infosys_cleaned_txt/Infosys_2023_AR.txt\n",
        "            rel = os.path.relpath(fp, self.root)\n",
        "            folder = rel.split(os.sep)[0]\n",
        "            filename = os.path.basename(fp)\n",
        "\n",
        "            chunks = self.chunk_text_words(text)\n",
        "            if not chunks:\n",
        "                continue\n",
        "\n",
        "            for i in range(0, len(chunks), self.batch_size):\n",
        "                batch = chunks[i:i + self.batch_size]\n",
        "                embs = self.encoder.encode(batch, show_progress_bar=False, convert_to_numpy=True)\n",
        "                embs = embs.astype(np.float32)\n",
        "\n",
        "                for j, vec in enumerate(embs):\n",
        "                    all_embeddings.append(vec)\n",
        "                    metadata.append({\n",
        "                        \"id\": next_id,\n",
        "                        \"source_folder\": folder,\n",
        "                        \"file\": filename,\n",
        "                        \"chunk_id\": i + j,\n",
        "                        \"text\": batch[j]  # store chunk text for retrieval\n",
        "                    })\n",
        "                    next_id += 1\n",
        "\n",
        "        if not all_embeddings:\n",
        "            raise RuntimeError(\"No embeddings were produced. Check cleaned files and chunking.\")\n",
        "\n",
        "        emb_matrix = np.vstack(all_embeddings).astype(np.float32)\n",
        "        faiss.normalize_L2(emb_matrix)\n",
        "\n",
        "        # Build FAISS index (IndexFlatIP over normalized vectors = cosine similarity)\n",
        "        dim = emb_matrix.shape[1]\n",
        "        self.index = faiss.IndexFlatIP(dim)\n",
        "        self.index.add(emb_matrix)\n",
        "        self.metadata = metadata\n",
        "        print(f\"βœ… Built FAISS index with {self.index.ntotal} vectors, dim={dim}\")\n",
        "\n",
        "        return self.index, self.metadata\n",
        "\n",
        "    def run(self):\n",
        "        \"\"\"Main entry: load or build embeddings + FAISS index.\"\"\"\n",
        "        if self.load_index_and_metadata():\n",
        "            return\n",
        "\n",
        "        files = self.list_cleaned_files()\n",
        "        if not files:\n",
        "            print(\"❌ No cleaned text files found.\")\n",
        "            raise SystemExit(1)\n",
        "        self.load_encoder()\n",
        "        self.building_embeddings_index(files)\n",
        "        self.save_index_and_metadata()\n",
        "\n",
        "    def load_summarizer(self, model_name=\"google/gemma-2b\"):\n",
        "        \"\"\"\n",
        "        Load summarizer LLM once.\n",
        "        If already loaded, skip.\n",
        "        \"\"\"\n",
        "        if hasattr(self, \"summarizer_pipeline\"):\n",
        "            print(\"ℹ️ Summarizer already loaded, skipping reload.\")\n",
        "            return\n",
        "\n",
        "        try:\n",
        "            print(f\"⏳ Loading summarizer model '{model_name}'...\")\n",
        "            self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=self.hf_token)\n",
        "            self.summarizer_model = AutoModelForCausalLM.from_pretrained(\n",
        "                model_name,\n",
        "                torch_dtype=torch.float16 if self.device == \"cuda\" else torch.float32,\n",
        "                device_map=self.device,\n",
        "                token=self.hf_token\n",
        "            )\n",
        "            self.summarizer_pipeline = pipeline(\n",
        "                \"text-generation\",\n",
        "                model=self.summarizer_model,\n",
        "                tokenizer=self.tokenizer\n",
        "            )\n",
        "            print(f\"βœ… Summarizer model '{model_name}' loaded successfully.\")\n",
        "\n",
        "        except RuntimeError as e:\n",
        "            if \"CUDA out of memory\" in str(e):\n",
        "                print(\"⚠️ CUDA OOM while loading summarizer. Retrying on CPU...\")\n",
        "                self.device = \"cpu\"\n",
        "                torch.cuda.empty_cache()\n",
        "                return self.load_summarizer(model_name=model_name)\n",
        "            else:\n",
        "                print(f\"❌ Failed to load summarizer: {e}\")\n",
        "                raise\n",
        "\n",
        "    def summarize_chunks(self, chunks, max_content_tokens=2048, max_output_tokens=256):\n",
        "        \"\"\"\n",
        "        Summarize list of text chunks using LLM.\n",
        "        - Chunks are joined until they fit into max_context_tokens\n",
        "        - Generates a concise summary.\n",
        "        \"\"\"\n",
        "\n",
        "        if not hasattr(self, \"summarizer_pipeline\"):\n",
        "            self.load_summarizer()\n",
        "            print(\"Summarizer not initialized. Called load_summarizer(). pipeline will work with default parameters.\")\n",
        "\n",
        "        # Join chunks into one context, respecting token budget\n",
        "        context = \" \".join(chunks)\n",
        "        input_tokens = len(self.tokenizer.encode(context))\n",
        "\n",
        "        if input_tokens > max_content_tokens:\n",
        "            # Trim to fit context window\n",
        "            context = \" \".join(context.split()[:max_content_tokens])\n",
        "            print(\"⚠️ Context truncated to fit within model token limit.\")\n",
        "\n",
        "        # Build summarization prompt\n",
        "        prompt = f\"\"\"\n",
        "            Summarize the following financial report excerpts into a concise answer.\n",
        "            Keep it factual, short, and grounded in the text.\n",
        "\n",
        "            Excerpts:\n",
        "            {context}\n",
        "\n",
        "            Summary:\n",
        "            \"\"\"\n",
        "\n",
        "        try:\n",
        "            output = self.summarizer_pipeline(\n",
        "                prompt,\n",
        "                max_new_tokens=max_output_tokens,\n",
        "                do_sample=False\n",
        "            )[0][\"generated_text\"]\n",
        "\n",
        "            if \"Summary:\" in output:\n",
        "                summary = output.split(\"Summary:\")[-1].strip()\n",
        "            else:\n",
        "                summary = output.strip()\n",
        "\n",
        "            return summary\n",
        "\n",
        "        except RuntimeError as e:\n",
        "            if \"CUDA out of memory\" in str(e):\n",
        "                print(\"⚠️ CUDA OOM during summarization. Retrying on CPU...\")\n",
        "                self.device = \"cpu\"\n",
        "                torch.cuda.empty_cache()\n",
        "                return self.summarize_chunks(chunks, max_content_tokens, max_output_tokens)\n",
        "            else:\n",
        "                print(f\"❌ Summarizer failed: {e}. Falling back to raw chunks.\")\n",
        "                return \" \".join(chunks[:2])  # fallback: return first 2 chunks\n",
        "\n",
        "\n",
        "    def answer_query(self, query, top_k=3):\n",
        "        \"\"\"\n",
        "        End-to-end QA:\n",
        "        - Retrieve relevant chunks from FAISS\n",
        "        - Summarize into a final answer.\n",
        "        \"\"\"\n",
        "        try:\n",
        "            #step 1: Retrieve\n",
        "            print(f\"πŸ” searching vector DB for query: {query}\")\n",
        "            q_emb = self.encoder.encode(query, show_progress_bar=False, convert_to_numpy=True).reshape(1, -1)\n",
        "            faiss.normalize_L2(q_emb)\n",
        "\n",
        "            scores, idxs = self.index.search(q_emb, k=top_k)\n",
        "            chunks = [self.metadata[idx][\"text\"] for idx in idxs[0]]\n",
        "\n",
        "            # Step 2: Summarize\n",
        "            summary = self.summarize_chunks(chunks)\n",
        "\n",
        "            # Log results\n",
        "            print(f\"βœ… Final Answer: {summary}\")\n",
        "            return summary\n",
        "\n",
        "        except Exception as e:\n",
        "            print(f\"Error in answer_query: {e}\")\n",
        "            return None\n",
        "\n",
        "\n",
        "# Example\n",
        "# ge = GetEmbeddings()\n",
        "# ge.run()\n",
        "# # NEW STEP\n",
        "# ge.load_summarizer(\"google/gemma-2b\")\n",
        "# answer = ge.answer_query(\"What are the key highlights from Q2 financial report?\")\n",
        "# print(answer)"
      ]
    },
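    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Hedged sketch: config.json is read by Embeddings.py, Evaluator.py, and app.py\n",
        "# but is never created in this notebook. The keys below are exactly the ones\n",
        "# those modules read; the values are illustrative assumptions (the root folder\n",
        "# matches the example path mentioned in Embeddings.py).\n",
        "import json\n",
        "\n",
        "config = {\n",
        "    'paths': {\n",
        "        'root': 'financial_reports',             # holds *_cleaned_txt folders (assumed)\n",
        "        'faiss_index': 'vector_db/faiss.index',  # assumed output location\n",
        "        'metadata': 'vector_db/metadata.pkl',\n",
        "        'eval_dataset': 'eval_dataset.json',\n",
        "        'eval_results': 'eval_results.json',\n",
        "        'failed_queries': 'failed_queries.json'\n",
        "    },\n",
        "    'embedding': {\n",
        "        'model': 'sentence-transformers/all-MiniLM-L6-v2',  # assumed encoder choice\n",
        "        'chunk_words': 200,\n",
        "        'batch_size': 32,\n",
        "        'use_gpu': True\n",
        "    }\n",
        "}\n",
        "\n",
        "with open('config.json', 'w') as f:\n",
        "    json.dump(config, f, indent=2)\n",
        "print('Wrote config.json')"
      ]
    },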
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "SrZwOeGPba8Q",
        "outputId": "b14f3d67-54d7-4db1-c030-702ab670bc90"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Writing Evaluator.py\n"
          ]
        }
      ],
      "source": [
        "%%writefile Evaluator.py\n",
        "import os\n",
        "import json\n",
        "import time\n",
        "import numpy as np\n",
        "from tqdm import tqdm\n",
        "\n",
        "# from Logger import GetLogger, MetricsLogger\n",
        "# from Embeddings import GetEmbeddings\n",
        "\n",
        "# Metrics\n",
        "from sklearn.metrics.pairwise import cosine_similarity\n",
        "from rouge_score import rouge_scorer\n",
        "from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction\n",
        "from bert_score import score as bert_score\n",
        "\n",
        "class Evaluator:\n",
        "    \"\"\"\n",
        "    Evaluation pipeline for financial QA Agent.\n",
        "    Uses eval_dataset.json to run queries, collect answers, and compute metrics.\n",
        "    \"\"\"\n",
        "    def __init__(self, config_path=\"config.json\", logger=None):\n",
        "        with open(config_path, \"r\") as f:\n",
        "            self.config = json.load(f)\n",
        "        self.paths = self.config[\"paths\"]\n",
        "\n",
        "\n",
        "        # if not logger:\n",
        "        #     obj = GetLogger()\n",
        "        #     logger = obj.get_logger()\n",
        "        # self.logger = logger\n",
        "\n",
        "\t\t# # Metrics logger\n",
        "        # self.metrics_logger = MetricsLogger(logger=self.logger)\n",
        "\n",
        "        # Initialize Agent\n",
        "        self.agent = GetEmbeddings(config_path=config_path, logger=None)\n",
        "        self.agent.run()    # Load or rebuild FAISS + embeddings\n",
        "        self.agent.load_summarizer()    # Load summarizer\n",
        "        self.encoder = self.agent.load_encoder()\n",
        "\n",
        "        # Load Dataset\n",
        "        self.dataset = self.load_dataset()\n",
        "        self.results = []\n",
        "        self.failed_queries = []\n",
        "\n",
        "    def load_dataset(self):\n",
        "        path = self.paths[\"eval_dataset\"]\n",
        "        if not os.path.exists(path):\n",
        "            raise FileNotFoundError(f\"Dataset not found: {path}\")\n",
        "        with open(path, \"r\", encoding=\"utf-8\") as f:\n",
        "            return json.load(f)\n",
        "\n",
        "    def measure_latency(self, func, *args, **kwargs):\n",
        "        \"\"\"Helper: measure time taken by a function call.\"\"\"\n",
        "        start = time.time()\n",
        "        result = func(*args, **kwargs)\n",
        "        latency = time.time() - start\n",
        "        return result, latency\n",
        "\n",
        "    def evaluate_query(self, query, reference):\n",
        "        \"\"\"Run one query, compare answer vs. reference, compute metrics.\"\"\"\n",
        "        # try:\n",
        "        # Run pipeline\n",
        "        system_answer, latency = self.measure_latency(self.agent.answer_query, query)\n",
        "\n",
        "        # 1. Embedding similarity (proxy retrieval quality)\n",
        "        ref_emb = self.encoder.encode([reference], convert_to_numpy=True)\n",
        "        ans_emb = self.encoder.encode([system_answer], convert_to_numpy=True)\n",
        "        retrieval_quality = float(cosine_similarity(ref_emb, ans_emb)[0][0])\n",
        "\n",
        "        # 2. ROUGE-L\n",
        "        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)\n",
        "        rouge_score = scorer.score(reference, system_answer)['rougeL'].fmeasure\n",
        "\n",
        "        # 3. BLEU (with smoothing for short texts)\n",
        "        smoothie = SmoothingFunction().method4\n",
        "        bleu = sentence_bleu([reference.split()], system_answer.split(), smoothing_function=smoothie)\n",
        "\n",
        "        # 4. BERTScore (semantic similarity)\n",
        "        P, R, F1 = bert_score([system_answer], [reference], lang=\"en\")\n",
        "        bert_f1 = float(F1.mean())\n",
        "\n",
        "        metrics = {\n",
        "            \"query\": query,\n",
        "            \"reference\": reference,\n",
        "            \"system_answer\": system_answer,\n",
        "            \"retrieval_quality\": retrieval_quality,\n",
        "            \"rougeL\": rouge_score,\n",
        "            \"bleu\": bleu,\n",
        "            \"bertscore_f1\": bert_f1,\n",
        "            \"latency_sec\": latency\n",
        "        }\n",
        "\n",
        "        # Log into metrics logger\n",
        "        # self.metrics_logger.log_query_metrics(query, metrics)\n",
        "\n",
        "        return metrics\n",
        "\n",
        "        # except Exception as e:\n",
        "        #     print(f\"Error evaluating query '{query}': {e}\")\n",
        "        #     return None\n",
        "\n",
        "\n",
        "    def run(self):\n",
        "        \"\"\"Run evaluation on entire dataset.\"\"\"\n",
        "        print(\"Starting Evaluation...\")\n",
        "\n",
        "        for item in tqdm(self.dataset, desc=\"Queries\"):\n",
        "            query = item[\"query\"]\n",
        "            reference = item[\"reference\"]\n",
        "            result = self.evaluate_query(query, reference)\n",
        "            if result:\n",
        "                self.results.append(result)\n",
        "\n",
        "\n",
        "        # Save result\n",
        "        with open(self.paths[\"eval_results\"], \"w\", encoding=\"utf-8\") as f:\n",
        "            json.dump(self.results, f, indent=2)\n",
        "\n",
        "        if self.failed_queries:\n",
        "            with open(self.paths[\"failed_queries\"], \"w\", encoding=\"utf-8\") as f:\n",
        "                json.dump(self.failed_queries, f, indent=2)\n",
        "\n",
        "\n",
        "        # Save metrics summary\n",
        "        # summary = self.metrics_logger.save()\n",
        "        summary = None\n",
        "        print(f\"Evaluation Complete.\")\n",
        "        print(f\"πŸ“Š Evaluation summary: {summary}\")\n",
        "\n",
        "        return self.results, summary\n",
        "\n",
        "\n",
        "if __name__ == \"__main__\":\n",
        "    evaluator = Evaluator()\n",
        "    results, summary = evaluator.run()\n",
        "\n",
        "    print(\"\\n=== Sample Results ===\")\n",
        "    print(json.dumps(results[:2], indent=2))\n",
        "    print(\"\\n=== Summary ===\")\n",
        "    print(json.dumps(summary, indent=2))\n"
      ]
    },
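    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Hedged sketch: Evaluator.run() expects eval_dataset.json to be a list of\n",
        "# objects with 'query' and 'reference' keys (see evaluate_query above). The\n",
        "# entries below are illustrative placeholders, not real ground-truth answers.\n",
        "import json\n",
        "\n",
        "eval_dataset = [\n",
        "    {\n",
        "        'query': 'What are the main risk factors in 2023?',\n",
        "        'reference': 'Key risks include currency fluctuation, client concentration, and regulatory change.'\n",
        "    },\n",
        "    {\n",
        "        'query': 'What are the key highlights from the Q2 financial report?',\n",
        "        'reference': 'Revenue grew year over year with stable operating margins.'\n",
        "    }\n",
        "]\n",
        "\n",
        "with open('eval_dataset.json', 'w', encoding='utf-8') as f:\n",
        "    json.dump(eval_dataset, f, indent=2)\n",
        "print(f'Wrote eval_dataset.json with {len(eval_dataset)} items')"
      ]
    },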
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "_SgMUhSJbdcu",
        "outputId": "c79fe42b-517f-40b7-cc2b-71ddaae05084"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Overwriting app.py\n"
          ]
        }
      ],
      "source": [
        "%%writefile app.py\n",
        "import streamlit as st\n",
        "import json\n",
        "import faiss\n",
        "import numpy as np\n",
        "import re\n",
        "from Embeddings import GetEmbeddings\n",
        "from Logger import GetLogger\n",
        "\n",
        "# ================================\n",
        "# Load Config\n",
        "# ================================\n",
        "with open(\"config.json\", \"r\") as f:\n",
        "    config = json.load(f)\n",
        "\n",
        "# Initialize Logger\n",
        "log_obj = GetLogger()\n",
        "logger = log_obj.get_logger()\n",
        "\n",
        "# Initialize QA Agent\n",
        "@st.cache_resource\n",
        "def load_agent():\n",
        "    agent = GetEmbeddings(config_path=\"config.json\", logger=logger)\n",
        "    agent.run()  # load or build FAISS index\n",
        "    encoder = agent.load_encoder()\n",
        "    agent.load_summarizer()\n",
        "    return agent, encoder\n",
        "\n",
        "agent, encoder = load_agent()\n",
        "\n",
        "# ================================\n",
        "# Streamlit UI\n",
        "# ================================\n",
        "st.set_page_config(page_title=\"Financial QA Agent\", layout=\"wide\")\n",
        "\n",
        "# --- Header ---\n",
        "st.title(\"πŸ’Ή Financial Report QA Agent\")\n",
        "st.markdown(\n",
        "    \"\"\"\n",
        "    Welcome!\n",
        "    This tool lets you **query annual financial reports** (Infosys, ICICI Bank, etc.)\n",
        "    and get **summarized answers** with supporting evidence from the text.\n",
        "    \"\"\"\n",
        ")\n",
        "\n",
        "# Sidebar - Settings\n",
        "st.sidebar.header(\"βš™οΈ Settings\")\n",
        "top_k = st.sidebar.slider(\"Top K Chunks\", 1, 10, 3)\n",
        "max_output_tokens = st.sidebar.slider(\"Max Summary Tokens\", 64, 512, 256)\n",
        "\n",
        "# --- Keyword highlighting ---\n",
        "def highlight_keywords(text, keywords=[\"risk\", \"revenue\", \"profit\", \"growth\", \"loss\"]):\n",
        "    pattern = re.compile(r\"\\b(\" + \"|\".join(keywords) + r\")\\b\", re.IGNORECASE)\n",
        "    return pattern.sub(lambda m: f\"**{m.group(0)}**\", text)\n",
        "\n",
        "# --- Session State for Query History ---\n",
        "if \"history\" not in st.session_state:\n",
        "    st.session_state[\"history\"] = []\n",
        "\n",
        "# --- Query input ---\n",
        "query = st.text_input(\"πŸ” Enter your question:\", placeholder=\"e.g., What are the main risk factors in 2023?\")\n",
        "\n",
        "if st.button(\"Get Answer\"):\n",
        "    if query.strip() == \"\":\n",
        "        st.warning(\"Please enter a query.\")\n",
        "    else:\n",
        "        with st.spinner(\"Searching reports...\"):\n",
        "            try:\n",
        "                # Retrieve + summarize\n",
        "                answer = agent.answer_query(query, top_k=top_k)\n",
        "\n",
        "                # --- Display final answer ---\n",
        "                st.subheader(\"πŸ“Œ Answer\")\n",
        "                st.success(answer)\n",
        "\n",
        "                # --- Show supporting chunks ---\n",
        "                st.subheader(\"πŸ“‚ Supporting Chunks\")\n",
        "                q_emb = encoder.encode(query, convert_to_numpy=True).reshape(1, -1)\n",
        "                faiss.normalize_L2(q_emb)\n",
        "                scores, idxs = agent.index.search(q_emb.astype(np.float32), k=top_k)\n",
        "\n",
        "                for score, idx in zip(scores[0], idxs[0]):\n",
        "                    meta = agent.metadata[idx]\n",
        "                    with st.expander(f\"πŸ“„ {meta['file']} | Chunk {meta['chunk_id']} | Score: {score:.4f}\"):\n",
        "                        chunk_text = highlight_keywords(meta['text'][:1000])\n",
        "                        st.markdown(chunk_text)\n",
        "\n",
        "                # --- Save Query & Answer to History ---\n",
        "                st.session_state[\"history\"].append({\"query\": query, \"answer\": answer})\n",
        "\n",
        "                # --- Log query + answer ---\n",
        "                logger.info(f\"User Query: {query}\")\n",
        "                logger.info(f\"System Answer: {answer}\")\n",
        "\n",
        "                # --- Save persistent history JSON ---\n",
        "                with open(\"ui_query_history.json\", \"w\", encoding=\"utf-8\") as f:\n",
        "                    json.dump(st.session_state[\"history\"], f, indent=2)\n",
        "\n",
        "            except Exception as e:\n",
        "                st.error(f\"Error: {e}\")\n",
        "                logger.error(f\"Streamlit UI error: {e}\")\n",
        "\n",
        "# --- Show History in Sidebar ---\n",
        "if st.session_state[\"history\"]:\n",
        "    st.sidebar.subheader(\"πŸ•˜ Query History\")\n",
        "    for item in st.session_state[\"history\"][-5:]:  # show last 5 queries\n",
        "        st.sidebar.write(f\"**Q:** {item['query']}\")\n",
        "        st.sidebar.write(f\"**A:** {item['answer'][:100]}...\")\n",
        "        st.sidebar.markdown(\"---\")\n"
      ]
    },
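    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Optional smoke test (hedged sketch): exercise the pipeline once before exposing\n",
        "# it through the tunnel. Mirrors the commented example at the end of Embeddings.py;\n",
        "# assumes cleaned text files exist under the configured root folder and that the\n",
        "# HF_TOKEN secret grants access to the gated gemma model.\n",
        "from Embeddings import GetEmbeddings\n",
        "\n",
        "ge = GetEmbeddings()\n",
        "ge.run()              # load or build the FAISS index + metadata (also loads the encoder)\n",
        "ge.load_summarizer()  # default: google/gemma-2b\n",
        "print(ge.answer_query(\"What are the key highlights from Q2 financial report?\"))"
      ]
    },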
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6UAnlclVckzM",
        "outputId": "bb65eead-5953-4a4f-f838-14fadc1469dd"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\u001b[90m2025-09-29T13:35:21Z\u001b[0m \u001b[32mINF\u001b[0m Thank you for trying Cloudflare Tunnel. Doing so, without a Cloudflare account, is a quick way to experiment and try it out. However, be aware that these account-less Tunnels have no uptime guarantee, are subject to the Cloudflare Online Services Terms of Use (https://www.cloudflare.com/website-terms/), and Cloudflare reserves the right to investigate your use of Tunnels for violations of such terms. If you intend to use Tunnels in production you should use a pre-created named tunnel by following: https://developers.cloudflare.com/cloudflare-one/connections/connect-apps\n",
            "\u001b[90m2025-09-29T13:35:21Z\u001b[0m \u001b[32mINF\u001b[0m Requesting new quick Tunnel on trycloudflare.com...\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m +--------------------------------------------------------------------------------------------+\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m |  Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):  |\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m |  https://ease-library-cases-gibraltar.trycloudflare.com                                    |\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m +--------------------------------------------------------------------------------------------+\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Cannot determine default configuration path. No file [config.yml config.yaml] in [~/.cloudflared ~/.cloudflare-warp ~/cloudflare-warp /etc/cloudflared /usr/local/etc/cloudflared]\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Version 2025.9.1 (Checksum 3dc1dc4252eae3c691861f926e2b8640063a2ce534b07b7a3f4ec2de439ecfe3)\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m GOOS: linux, GOVersion: go1.24.4, GoArch: amd64\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Settings: map[ha-connections:1 no-autoupdate:true protocol:quic url:http://localhost:8501]\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m cloudflared will not automatically update if installed by a package manager.\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Generated Connector ID: b7e0104f-71af-4b1e-a366-b3b15b2c86d9\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Initial protocol quic\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m ICMP proxy will use 172.28.0.12 as source for IPv4\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m ICMP proxy will use :: as source for IPv6\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m Cannot determine default origin certificate path. No file cert.pem in [~/.cloudflared ~/.cloudflare-warp ~/cloudflare-warp /etc/cloudflared /usr/local/etc/cloudflared]. You need to specify the origin certificate path by specifying the origincert option in the configuration file, or set TUNNEL_ORIGIN_CERT environment variable \u001b[36moriginCertPath=\u001b[0m\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m ICMP proxy will use 172.28.0.12 as source for IPv4\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m ICMP proxy will use :: as source for IPv6\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Starting metrics server on 127.0.0.1:20241/metrics\n",
            "\u001b[90m2025-09-29T13:35:25Z\u001b[0m \u001b[32mINF\u001b[0m Tunnel connection curve preferences: [X25519MLKEM768 CurveP256] \u001b[36mconnIndex=\u001b[0m0 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113\n",
            "2025/09/29 13:35:25 failed to sufficiently increase receive buffer size (was: 208 kiB, wanted: 7168 kiB, got: 416 kiB). See https://github.com/quic-go/quic-go/wiki/UDP-Buffer-Sizes for details.\n",
            "\u001b[90m2025-09-29T13:35:26Z\u001b[0m \u001b[32mINF\u001b[0m Registered tunnel connection \u001b[36mconnIndex=\u001b[0m0 \u001b[36mconnection=\u001b[0mc535a197-93c0-4941-a9ab-b32533b50549 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113 \u001b[36mlocation=\u001b[0msin02 \u001b[36mprotocol=\u001b[0mquic\n",
            "\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[32mINF\u001b[0m Initiating graceful shutdown due to signal interrupt ...\n",
            "\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m failed to run the datagram handler \u001b[31merror=\u001b[0m\u001b[31m\"context canceled\"\u001b[0m \u001b[36mconnIndex=\u001b[0m0 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113\n",
            "\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m failed to serve tunnel connection \u001b[31merror=\u001b[0m\u001b[31m\"accept stream listener encountered a failure while serving\"\u001b[0m \u001b[36mconnIndex=\u001b[0m0 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113\n",
            "\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m Serve tunnel error \u001b[31merror=\u001b[0m\u001b[31m\"accept stream listener encountered a failure while serving\"\u001b[0m \u001b[36mconnIndex=\u001b[0m0 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113\n",
            "\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[32mINF\u001b[0m Retrying connection in up to 1s \u001b[36mconnIndex=\u001b[0m0 \u001b[36mevent=\u001b[0m0 \u001b[36mip=\u001b[0m198.41.200.113\n",
            "\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m Connection terminated \u001b[36mconnIndex=\u001b[0m0\n",
            "\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[1m\u001b[31mERR\u001b[0m\u001b[0m no more connections active and exiting\n",
            "\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[32mINF\u001b[0m Tunnel server stopped\n",
            "\u001b[90m2025-09-29T13:38:58Z\u001b[0m \u001b[32mINF\u001b[0m Metrics server stopped\n"
          ]
        }
      ],
      "source": [
        "import threading, os\n",
        "\n",
        "# Kill anything on port 8501 (just in case)\n",
        "os.system(\"kill -9 $(lsof -t -i:8501) 2>/dev/null\")\n",
        "\n",
        "# Run Streamlit in background\n",
        "def run_app():\n",
        "    os.system(\"streamlit run app.py --server.port 8501\")\n",
        "\n",
        "thread = threading.Thread(target=run_app)\n",
        "thread.start()\n",
        "\n",
        "# Start cloudflared tunnel\n",
        "!cloudflared tunnel --url http://localhost:8501 --no-autoupdate\n"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "machine_shape": "hm",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}