Spaces:

rajkstats
/

FilingFinder

Sleeping

App Files Files Community

rajkstats committed on May 2, 2024

Commit

d50df56

1 Parent(s): e7b26d6

Adding all files

Browse files

Files changed (7) hide show

.gitattributes +35 -0
.gitignore +160 -0
Dockerfile +11 -0
app.py +98 -0
chainlit.md +24 -0
notebook/meta_filing_langchain_rag_prototype.ipynb +375 -0
requirements.txt +13 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,160 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

Dockerfile ADDED Viewed

	@@ -0,0 +1,11 @@

+FROM python:3.11
+# Create and switch to an unprivileged user (UID 1000) so the app does not run as root.
+RUN useradd -m -u 1000 user
+USER user
+# Put the user's pip install location on PATH so the `chainlit` entry point is found.
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Copy only the dependency manifest first so the pip layer stays cached until
+# requirements.txt changes. COPY does not expand "~", so the destination is
+# spelled with $HOME instead.
+COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the application source last, owned by the runtime user.
+COPY --chown=user . $HOME/app
+CMD ["chainlit", "run", "app.py", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,98 @@

+# Importing Python libraries
+import os
+import asyncio
+from dotenv import load_dotenv
+import chainlit as cl
+from langchain.chains import ConversationalRetrievalChain
+from langchain.memory import ChatMessageHistory, ConversationBufferMemory
+from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.vectorstores import Qdrant
+from langchain_openai import ChatOpenAI
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import tiktoken
+# Load environment variables from a .env file
+load_dotenv()
+@cl.on_chat_start
+async def start_chat():
+    # Notify the user that the system is setting up the vector store
+    await cl.Message(content="Setting up Qdrant vector store. Please wait...").send()
+    # Load documents using PyMuPDFLoader from the specified URL
+    docs = PyMuPDFLoader("https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load()
+    # Define a function to calculate the token length using tiktoken
+    def tiktoken_len(text):
+        tokens = tiktoken.encoding_for_model("gpt-3.5-turbo").encode(text)
+        return len(tokens)
+    # Configure a text splitter that handles large documents
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size = 1000,
+        chunk_overlap = 0,  # Ensure there is no cutoff at the edges of chunks
+        length_function = tiktoken_len,
+    )
+    # Split the document into manageable chunks
+    split_chunks = text_splitter.split_documents(docs)
+    # Set up the embedding model for document encoding
+    embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
+    # Asynchronously create a Qdrant vector store with the document chunks
+    qdrant_vectorstore = await cl.make_async(Qdrant.from_documents)(
+        split_chunks,
+        embedding_model,
+        location=":memory:",  # Use in-memory storage for vectors
+        collection_name="meta_10k"  # Name of the collection in Qdrant
+    )
+    # Initialize a retriever from the Qdrant vector store
+    qdrant_retriever = qdrant_vectorstore.as_retriever()
+    # Notify the user that setup is complete
+    await cl.Message(content="Qdrant setup complete. You can now start asking questions!").send()
+    # Initialize a message history to track the conversation
+    message_history = ChatMessageHistory()
+    # Set up memory to hold the conversation context and return answers
+    memory = ConversationBufferMemory(
+        memory_key="chat_history",
+        output_key="answer",
+        chat_memory=message_history,
+        return_messages=True,
+    )
+    # Configure the LLM for generating responses
+    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True)
+    # Create a retrieval chain combining the LLM and the retriever
+    chain = ConversationalRetrievalChain.from_llm(
+        llm,
+        retriever=qdrant_retriever,
+        chain_type="stuff",  # Specify the type of chain (customizable based on application)
+        memory=memory,
+        return_source_documents=True
+    )
+    # Store the configured chain in the user session
+    cl.user_session.set("chain", chain)
+@cl.on_message
+async def main(message: cl.Message):
+    # Retrieve the conversational chain from the user session
+    chain = cl.user_session.get("chain")
+    # Define a callback handler for asynchronous operations
+    cb = cl.AsyncLangchainCallbackHandler()
+    # Process the incoming message using the conversational chain
+    res = await chain.acall(message.content, callbacks=[cb])
+    answer = res["answer"]  # Extract the answer from the response
+    # Send the processed answer back to the user
+    await cl.Message(content=answer).send()

chainlit.md ADDED Viewed

	@@ -0,0 +1,24 @@

+# Welcome to FilingFinder! 📊📄
+Ready to unlock the secrets held within Meta's financial filings? You've come to the right place. FilingFinder leverages cutting-edge language models to help you quickly extract and understand critical financial data directly from Meta's 10-K documents.
+## How It Works 🚀
+FilingFinder is simple to use:
+1. Enter your query related to Meta's financials—be it about cash reserves, director listings, or other specific details.
+2. Our system analyzes the text from the latest 10-K filing to provide accurate and detailed answers.
+## Features 🌟
+- **Instant Retrieval:** Get real-time answers from Meta's financial documents.
+- **Accurate Data:** Powered by advanced NLP, ensuring precision in data extraction.
+- **User-Friendly Interface:** Designed for ease of use, regardless of your tech background.
+## Need Assistance? 🛠️
+If you encounter any issues or have questions, we're here to help:
+- **Support Channel:** Reach out by creating an issue on the GitHub repository.
+## Let's Get Started! 🌐
+Begin your financial discovery now. FilingFinder is here to guide you through Meta's extensive financial data, helping you make informed decisions with ease.

notebook/meta_filing_langchain_rag_prototype.ipynb ADDED Viewed

	@@ -0,0 +1,375 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Midterm Challenge: Building and Deploying a RAG Application"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Build 🏗️\n",
+    "\n",
+    "- Data: Meta 10-k Filings\n",
+    "- LLM: OpenAI GPT-3.5-turbo\n",
+    "- Embedding Model: text-embedding-3-small\n",
+    "- Infrastructure: LangChain or LlamaIndex (you choose)\n",
+    "- Vector Store: Qdrant\n",
+    "- Deployment: Chainlit, Hugging Face\n",
+    "\n",
+    "#### Ship 🚢\n",
+    "\n",
+    "Evaluate your answers to the following questions\n",
+    "- \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n",
+    "- \"Who are Meta's 'Directors' (i.e., members of the Board of Directors)?\"\n",
+    "- Record <10 min loom video walkthrough\n",
+    "- Extra Credit: Baseline retrieval performance w/ RAGAS, change something about your RAG system to improve it, then show the improvement quantitatively!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Installing Required Libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 170,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -qU langchain langchain-core langchain-community langchain-openai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 172,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -qU qdrant-client\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 171,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -qU tiktoken pymupdf"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Set Environment Variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import getpass\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Data Collection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 173,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import PyMuPDFLoader\n",
+    "\n",
+    "docs = PyMuPDFLoader(\"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf\").load()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Chunking our Meta-10k Filing Document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 174,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+    "import tiktoken\n",
+    "\n",
+    "enc = tiktoken.encoding_for_model(\"gpt-3.5-turbo\")\n",
+    "\n",
+    "def tiktoken_len(text):\n",
+    "    tokens = tiktoken.encoding_for_model(\"gpt-3.5-turbo\").encode(\n",
+    "        text,\n",
+    "    )\n",
+    "    return len(tokens)\n",
+    "\n",
+    "text_splitter = RecursiveCharacterTextSplitter(\n",
+    "    chunk_size = 200,\n",
+    "    chunk_overlap = 0, # No overlap: consecutive chunks share no text\n",
+    "    length_function = tiktoken_len,\n",
+    ")\n",
+    "\n",
+    "split_chunks = text_splitter.split_documents(docs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 175,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "663"
+      ]
+     },
+     "execution_count": 175,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(split_chunks)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We now have 663 chunks, each roughly 200 tokens long at most."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Embeddings and Vector Storage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 176,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.vectorstores import Qdrant\n",
+    "\n",
+    "from langchain_openai.embeddings import OpenAIEmbeddings\n",
+    "\n",
+    "embedding_model = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
+    "\n",
+    "qdrant_vectorstore = Qdrant.from_documents(\n",
+    "    split_chunks,\n",
+    "    embedding_model,\n",
+    "    location=\":memory:\",\n",
+    "    collection_name=\"meta_10k_filings\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Setting up our retriever using Langchain retriever method"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 177,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "qdrant_retriever = qdrant_vectorstore.as_retriever()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Setting up our Langchain based RAG"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Setting up our Prompt template"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 154,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.prompts import ChatPromptTemplate\n",
+    "\n",
+    "RAG_PROMPT = \"\"\"\n",
+    "CONTEXT:\n",
+    "{context}\n",
+    "\n",
+    "QUERY:\n",
+    "{question}\n",
+    "\n",
+    "RESPONSE:\n",
+    "- If the QUERY is directly related to the provided CONTEXT, generate a detailed, structured answer using the information from the CONTEXT.\n",
+    "- If the QUERY does not pertain to the provided CONTEXT, state that the question is unrelated and suggest checking the appropriate source or document for the correct information.\n",
+    "\"\"\"\n",
+    "\n",
+    "rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### RAG Chain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 155,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from operator import itemgetter\n",
+    "from langchain.schema.output_parser import StrOutputParser\n",
+    "from langchain.schema.runnable import RunnablePassthrough\n",
+    "from langchain_openai import ChatOpenAI\n",
+    "\n",
+    "# Chat model that generates the final answer (GPT-3.5-turbo, as listed in the Build section)\n",
+    "openai_chat_model = ChatOpenAI(model=\"gpt-3.5-turbo\")\n",
+    "\n",
+    "retrieval_augmented_qa_chain = (\n",
+    "    # INVOKE CHAIN WITH: {\"question\" : \"<>\"}\n",
+    "    # \"question\" : populated by getting the value of the \"question\" key\n",
+    "    # \"context\"  : populated by getting the value of the \"question\" key and chaining it into the base_retriever\n",
+    "    {\"context\": itemgetter(\"question\") | qdrant_retriever, \"question\": itemgetter(\"question\")}\n",
+    "    # \"context\"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)\n",
+    "    #              by getting the value of the \"context\" key from the previous step\n",
+    "    | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n",
+    "    # \"response\" : the \"context\" and \"question\" values are used to format our prompt object and then piped\n",
+    "    #              into the LLM and stored in a key called \"response\"\n",
+    "    # \"context\"  : populated by getting the value of the \"context\" key from the previous step\n",
+    "    | {\"response\": rag_prompt | openai_chat_model, \"context\": itemgetter(\"context\")}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 156,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "question= \"What was the total value of 'Cash and cash equivalents' as of December 31, 2023?\"\n",
+    "response = retrieval_augmented_qa_chain.invoke({\"question\" :question})\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 147,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41.862 billion. This information can be found in the document on page 107 under the section 'Inputs (Level 3).' \n",
+      "\n",
+      "Please verify this information on page 107 of the document provided.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(response[\"response\"].content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 135,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for context in response[\"context\"]:\n",
+    "#   print(\"Context:\")\n",
+    "#   print(context)\n",
+    "#   print(\"----\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 159,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "question= \"Who are Meta's 'Directors' (i.e., members of the Board of Directors)?\"\n",
+    "response = retrieval_augmented_qa_chain.invoke({\"question\" :question})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 160,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The members of Meta's Board of Directors are as follows:\n",
+      "1. Peggy Alford\n",
+      "2. Marc L. Andreessen\n",
+      "3. Andrew W. Houston\n",
+      "4. Nancy Killefer\n",
+      "5. Robert M. Kimmitt\n",
+      "6. Sheryl K. Sandberg\n",
+      "7. Tracey T. Travis\n",
+      "8. Tony Xu\n",
+      "\n",
+      "These names were listed on page 132 of the document provided in the CONTEXT.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(response[\"response\"].content)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "llmops-course",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+chainlit==0.7.700
+openai==1.25.0
+tiktoken
+python-dotenv==1.0.0
+qdrant-client
+pymupdf
+langchain==0.1.16
+langchain-community==0.0.34
+langchain-core==0.1.46
+langchain-openai==0.1.4
+langchain-text-splitters==0.0.1
+langchainhub==0.1.15
+langsmith==0.1.51