Upload 6 files
Browse files- .gitattributes +1 -0
- README.md +17 -10
- app.py +29 -0
- chainlit.md +14 -0
- imdb_datasets.csv +3 -0
- rag.py +104 -0
- requirements.txt +87 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
imdb_datasets.csv filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,10 +1,17 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# RAG Llm Project
|
| 2 |
+
|
| 3 |
+
This is a Python RAG (retrieval-augmented generation) project built on an LLM. It uses movie data to return relevant information in response to user queries.
|
| 4 |
+
|
| 5 |
+
## Installation
|
| 6 |
+
|
| 7 |
+
To use this project, follow these steps:
|
| 8 |
+
|
| 9 |
+
1. Clone the repository: `git clone https://github.com/your-username/rag-llm-project.git`
|
| 10 |
+
2. Install the required dependencies: `pip install -r requirements.txt`
|
| 11 |
+
3. Set up your environment variables by creating a `.env` file containing your `OPENAI_API_KEY` and `ASSISTANT_ID`.
|
| 12 |
+
|
| 13 |
+
## Usage
|
| 14 |
+
|
| 15 |
+
To run the project, execute the following command:
|
| 16 |
+
|
| 17 |
+
Start the Chainlit app by running app.py.
|
app.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
from rag import RAGModel
|
| 4 |
+
from langchain.chat_models import ChatOpenAI
|
| 5 |
+
from langchain.prompts import ChatPromptTemplate
|
| 6 |
+
from langchain.schema import StrOutputParser
|
| 7 |
+
from langchain.chains import LLMChain
|
| 8 |
+
|
| 9 |
+
import chainlit as cl
|
| 10 |
+
|
| 11 |
+
# Get the value of OPENAI_API_KEY from the environment
|
| 12 |
+
|
| 13 |
+
rag = RAGModel(os.getenv("OPENAI_API_KEY"))
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@cl.on_chat_start
async def on_chat_start():
    """Greet the user when a new chat session starts.

    Sends a short "warming up" message first, then edits that same
    message in place to show the actual greeting/prompt.
    """
    startup_msg = cl.Message(content="Firing up the research info bot...")
    await startup_msg.send()
    # Replace the placeholder text with the real greeting.
    startup_msg.content = "Hi, welcome to research info bot. What is your query?"
    await startup_msg.update()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@cl.on_message
async def on_message(message: cl.Message):
    """Answer each incoming user message via the module-level RAG instance."""
    # Retrieval + generation are delegated entirely to RAGModel.query.
    response_text = rag.query(question=message.content)
    await cl.Message(content=response_text).send()
|
| 28 |
+
|
| 29 |
+
|
chainlit.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Welcome to Chainlit! 🚀🤖
|
| 2 |
+
|
| 3 |
+
Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
|
| 4 |
+
|
| 5 |
+
## Useful Links 🔗
|
| 6 |
+
|
| 7 |
+
- **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
|
| 8 |
+
- **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
|
| 9 |
+
|
| 10 |
+
We can't wait to see what you create with Chainlit! Happy coding! 💻😊
|
| 11 |
+
|
| 12 |
+
## Welcome screen
|
| 13 |
+
|
| 14 |
+
To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
|
imdb_datasets.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82a55444797da9627738b3f2a8985dc0e6fa5ddb50516ee66e2c3e2e8b389b7a
|
| 3 |
+
size 10753100
|
rag.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain_core.runnables.base import RunnableSequence
|
| 3 |
+
from langchain_core.runnables.passthrough import RunnablePassthrough
|
| 4 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 5 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 6 |
+
from langchain_openai import ChatOpenAI
|
| 7 |
+
from langchain_community.document_loaders import CSVLoader
|
| 8 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 9 |
+
from langchain_openai import OpenAIEmbeddings
|
| 10 |
+
from langchain.embeddings import CacheBackedEmbeddings
|
| 11 |
+
from langchain.storage import LocalFileStore
|
| 12 |
+
from langchain_community.vectorstores import FAISS
|
| 13 |
+
|
| 14 |
+
# PIP Install the following packages:
|
| 15 |
+
# pip install -q langchain
|
| 16 |
+
# pip install -q langchain_openai
|
| 17 |
+
# pip install -q faiss-cpu tiktoken
|
| 18 |
+
# pip install -q -U langchain
|
| 19 |
+
# pip install -U langchain-community
|
| 20 |
+
|
| 21 |
+
class RAGModel:
    """Retrieval-augmented generation over a CSV of movie data.

    At construction time the CSV is loaded, chunked, embedded (with a
    local disk cache so unchanged chunks are not re-embedded on later
    runs) and stored in a FAISS index. `query()` then answers questions
    using retrieved chunks as context for the chat model.

    NOTE: building the index makes OpenAI API calls for every uncached
    chunk, so constructing an instance requires network access.
    """

    def __init__(self, api_key, csv_file="imdb_datasets.csv",
                 embedding_model_name="text-embedding-3-large",
                 chat_model_name="gpt-3.5-turbo-0125",
                 index_path="faiss_index"):
        """Build the vector store and the RAG chain.

        Args:
            api_key: OpenAI API key, used for both embeddings and chat.
            csv_file: Path to the source CSV (default preserves the
                original hard-coded behavior).
            embedding_model_name: OpenAI embedding model name.
            chat_model_name: OpenAI chat model name.
            index_path: Directory where the FAISS index is persisted.
        """
        self.api_key = api_key
        retriever = self._build_retriever(csv_file, embedding_model_name, index_path)
        self.runnable_chain = self._build_chain(retriever, chat_model_name)

    def _build_retriever(self, csv_file, embedding_model_name, index_path):
        """Load + chunk the CSV, embed the chunks, and return a retriever."""
        # 1. Load the dataset and split it into overlapping chunks so
        #    each chunk fits comfortably in the prompt context.
        csv_data = CSVLoader(csv_file).load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        chunked_documents = text_splitter.split_documents(csv_data)
        print(f"Number of documents: {len(chunked_documents)}")

        # 2. Create embeddings, cached on local disk so repeated runs do
        #    not re-pay the OpenAI embedding cost for unchanged chunks.
        embedding_model = OpenAIEmbeddings(model=embedding_model_name, openai_api_key=self.api_key)
        print("Created embeddings")
        local_store = LocalFileStore("./cache/")
        cached_embedder = CacheBackedEmbeddings.from_bytes_store(
            embedding_model, local_store, namespace=embedding_model.model
        )
        print("Created cache backed embeddings")

        # 3. Store the chunk embeddings in a FAISS vector store, persist
        #    it, and expose it as a retriever for the chain.
        self.vector_store = FAISS.from_documents(chunked_documents, cached_embedder)
        self.vector_store.save_local(index_path)
        return self.vector_store.as_retriever()

    def _build_chain(self, retriever, chat_model_name):
        """Wire retriever -> prompt -> chat model -> parser into one runnable."""
        # The prompt has a slot for the retrieved context and for the
        # user's question.
        prompt_template = ChatPromptTemplate.from_messages(
            [
                ("system", "You are an excellent movie critic who always includes great movie recommendations in your response. If the answer is not in the context let the user know "),
                ("human", "Using this context: {context}, please answer this question: {question}")
            ]
        )
        chat_model = ChatOpenAI(model=chat_model_name, temperature=0, api_key=self.api_key)
        parser = StrOutputParser()
        return (
            {
                "context": retriever,
                "question": RunnablePassthrough(),
            }
            | prompt_template
            | chat_model
            | parser
        )

    def query(self, question) -> str:
        """Answer `question` using retrieved movie context.

        Returns:
            The model's answer as plain text.
        """
        print(f"Querying the RAG instance with the question: {question}")
        # StrOutputParser already yields a single string, so the previous
        # ''.join(...) over it was a no-op char-by-char rebuild.
        return self.runnable_chain.invoke(question)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# def main():
|
| 84 |
+
# Create an instance of RAG class
|
| 85 |
+
# api_key = os.getenv("OPENAI_API_KEY")
|
| 86 |
+
# rag = RAGModel(api_key=api_key)
|
| 87 |
+
|
| 88 |
+
# while True:
|
| 89 |
+
# # Take input from command line
|
| 90 |
+
# question = input("Enter your question (or type 'exit' to quit): ")
|
| 91 |
+
|
| 92 |
+
# # Check if user wants to exit
|
| 93 |
+
# if question.lower() == "exit":
|
| 94 |
+
# break
|
| 95 |
+
|
| 96 |
+
# # Query the RAG instance
|
| 97 |
+
# answer = rag.query(question)
|
| 98 |
+
|
| 99 |
+
# # Print the answer
|
| 100 |
+
# print("Answer:", answer)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# if __name__ == "__main__":
|
| 104 |
+
# main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiofiles==23.2.1
|
| 2 |
+
aiohttp==3.9.5
|
| 3 |
+
aiosignal==1.3.1
|
| 4 |
+
annotated-types==0.7.0
|
| 5 |
+
anyio==3.7.1
|
| 6 |
+
asyncer==0.0.2
|
| 7 |
+
attrs==23.2.0
|
| 8 |
+
bidict==0.23.1
|
| 9 |
+
certifi==2024.2.2
|
| 10 |
+
chainlit==1.1.101
|
| 11 |
+
charset-normalizer==3.3.2
|
| 12 |
+
chevron==0.14.0
|
| 13 |
+
click==8.1.7
|
| 14 |
+
dataclasses-json==0.5.14
|
| 15 |
+
Deprecated==1.2.14
|
| 16 |
+
distro==1.9.0
|
| 17 |
+
faiss-cpu==1.8.0
|
| 18 |
+
fastapi==0.110.3
|
| 19 |
+
fastapi-socketio==0.0.10
|
| 20 |
+
filetype==1.2.0
|
| 21 |
+
frozenlist==1.4.1
|
| 22 |
+
googleapis-common-protos==1.63.0
|
| 23 |
+
greenlet==3.0.3
|
| 24 |
+
grpcio==1.64.0
|
| 25 |
+
h11==0.14.0
|
| 26 |
+
httpcore==1.0.5
|
| 27 |
+
httpx==0.27.0
|
| 28 |
+
idna==3.7
|
| 29 |
+
importlib-metadata==7.0.0
|
| 30 |
+
jsonpatch==1.33
|
| 31 |
+
jsonpointer==2.4
|
| 32 |
+
langchain==0.2.0
|
| 33 |
+
langchain-community==0.2.0
|
| 34 |
+
langchain-core==0.2.0
|
| 35 |
+
langchain-openai==0.1.7
|
| 36 |
+
langchain-text-splitters==0.2.0
|
| 37 |
+
langsmith==0.1.60
|
| 38 |
+
Lazify==0.4.0
|
| 39 |
+
literalai==0.0.601
|
| 40 |
+
marshmallow==3.21.2
|
| 41 |
+
multidict==6.0.5
|
| 42 |
+
mypy-extensions==1.0.0
|
| 43 |
+
nest-asyncio==1.6.0
|
| 44 |
+
numpy==1.26.4
|
| 45 |
+
openai==1.30.1
|
| 46 |
+
opentelemetry-api==1.24.0
|
| 47 |
+
opentelemetry-exporter-otlp==1.24.0
|
| 48 |
+
opentelemetry-exporter-otlp-proto-common==1.24.0
|
| 49 |
+
opentelemetry-exporter-otlp-proto-grpc==1.24.0
|
| 50 |
+
opentelemetry-exporter-otlp-proto-http==1.24.0
|
| 51 |
+
opentelemetry-instrumentation==0.45b0
|
| 52 |
+
opentelemetry-proto==1.24.0
|
| 53 |
+
opentelemetry-sdk==1.24.0
|
| 54 |
+
opentelemetry-semantic-conventions==0.45b0
|
| 55 |
+
orjson==3.10.3
|
| 56 |
+
packaging==23.2
|
| 57 |
+
protobuf==4.25.3
|
| 58 |
+
pydantic==2.7.1
|
| 59 |
+
pydantic_core==2.18.2
|
| 60 |
+
PyJWT==2.8.0
|
| 61 |
+
python-dotenv==1.0.1
|
| 62 |
+
python-engineio==4.9.1
|
| 63 |
+
python-multipart==0.0.9
|
| 64 |
+
python-socketio==5.11.2
|
| 65 |
+
PyYAML==6.0.1
|
| 66 |
+
regex==2024.5.15
|
| 67 |
+
requests==2.32.0
|
| 68 |
+
setuptools==69.5.1
|
| 69 |
+
simple-websocket==1.0.0
|
| 70 |
+
sniffio==1.3.1
|
| 71 |
+
SQLAlchemy==2.0.30
|
| 72 |
+
starlette==0.37.2
|
| 73 |
+
syncer==2.0.3
|
| 74 |
+
tenacity==8.3.0
|
| 75 |
+
tiktoken==0.7.0
|
| 76 |
+
tomli==2.0.1
|
| 77 |
+
tqdm==4.66.4
|
| 78 |
+
typing-inspect==0.9.0
|
| 79 |
+
typing_extensions==4.11.0
|
| 80 |
+
uptrace==1.24.0
|
| 81 |
+
urllib3==2.2.1
|
| 82 |
+
uvicorn==0.25.0
|
| 83 |
+
watchfiles==0.20.0
|
| 84 |
+
wrapt==1.16.0
|
| 85 |
+
wsproto==1.2.0
|
| 86 |
+
yarl==1.9.4
|
| 87 |
+
zipp==3.18.2
|