RajMoon committed on
Commit
0539069
·
verified ·
1 Parent(s): 5a99ce4

Upload 6 files

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. README.md +17 -10
  3. app.py +29 -0
  4. chainlit.md +14 -0
  5. imdb_datasets.csv +3 -0
  6. rag.py +104 -0
  7. requirements.txt +87 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ imdb_datasets.csv filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,10 +1,17 @@
1
- ---
2
- title: Codepath
3
- emoji: 📉
4
- colorFrom: indigo
5
- colorTo: blue
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
+ # RAG LLM Project
2
+
3
+ This is a Python project for the RAG LLM project. It utilizes movie data to return relevant information.
4
+
5
+ ## Installation
6
+
7
+ To use this project, follow these steps:
8
+
9
+ 1. Clone the repository: `git clone https://github.com/your-username/rag-llm-project.git`
10
+ 2. Install the required dependencies: `pip install -r requirements.txt`
11
+ 3. Set up your environment variables by creating a `.env` file with your `OPENAI_API_KEY` and `ASSISTANT_ID`.
12
+
13
+ ## Usage
14
+
15
+ To run the project, execute the following command:
16
+
17
+ Start the Chainlit app by running app.py.
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ from rag import RAGModel
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.prompts import ChatPromptTemplate
6
+ from langchain.schema import StrOutputParser
7
+ from langchain.chains import LLMChain
8
+
9
+ import chainlit as cl
10
+
11
# Build the RAG model once at module import time, authenticating with the
# OPENAI_API_KEY environment variable (os.getenv returns None if it is unset).
rag = RAGModel(os.getenv("OPENAI_API_KEY"))
14
+
15
+
16
@cl.on_chat_start
async def on_chat_start():
    """Show a startup notice, then replace it with the greeting prompt."""
    startup_msg = cl.Message(content="Firing up the research info bot...")
    await startup_msg.send()
    # Edit the already-sent message in place rather than sending a second one.
    startup_msg.content = "Hi, welcome to research info bot. What is your query?"
    await startup_msg.update()
22
+
23
+
24
@cl.on_message
async def on_message(message: cl.Message):
    """Answer each incoming user message through the module-level RAG model."""
    response = rag.query(question=message.content)
    await cl.Message(content=response).send()
28
+
29
+
chainlit.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Welcome to Chainlit! 🚀🤖
2
+
3
+ Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
4
+
5
+ ## Useful Links 🔗
6
+
7
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚
8
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬
9
+
10
+ We can't wait to see what you create with Chainlit! Happy coding! 💻😊
11
+
12
+ ## Welcome screen
13
+
14
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
imdb_datasets.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82a55444797da9627738b3f2a8985dc0e6fa5ddb50516ee66e2c3e2e8b389b7a
3
+ size 10753100
rag.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_core.runnables.base import RunnableSequence
3
+ from langchain_core.runnables.passthrough import RunnablePassthrough
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from langchain_openai import ChatOpenAI
7
+ from langchain_community.document_loaders import CSVLoader
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from langchain_openai import OpenAIEmbeddings
10
+ from langchain.embeddings import CacheBackedEmbeddings
11
+ from langchain.storage import LocalFileStore
12
+ from langchain_community.vectorstores import FAISS
13
+
14
+ # PIP Install the following packages:
15
+ # pip install -q langchain
16
+ # pip install -q langchain_openai
17
+ # pip install -q faiss-cpu tiktoken
18
+ # pip install -q -U langchain
19
+ # pip install -U langchain-community
20
+
21
class RAGModel:
    """Retrieval-Augmented Generation model over an IMDB movie dataset.

    At construction time this loads ``imdb_datasets.csv``, chunks it, embeds
    the chunks (with an on-disk embedding cache under ``./cache/``), indexes
    them in a FAISS vector store persisted to ``faiss_index``, and wires up a
    retriever -> prompt -> chat-model -> parser chain used by :meth:`query`.

    NOTE(review): building the index in ``__init__`` performs network calls to
    the OpenAI embeddings endpoint and disk I/O; construction is expensive.
    """

    def __init__(self, api_key):
        """Build the full RAG pipeline.

        Args:
            api_key: OpenAI API key used for both embeddings and chat calls.
        """
        self.api_key = api_key

        # 1. Load the dataset and split it into overlapping text chunks.
        csv_file = "imdb_datasets.csv"
        loader = CSVLoader(csv_file)
        csv_data = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        chunked_documents = text_splitter.split_documents(csv_data)
        print(f"Number of documents: {len(chunked_documents)}")

        # 2. Create embeddings, cached on local disk so repeated runs do not
        #    re-embed (and re-bill) identical chunks.
        embedding_model = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=self.api_key)
        print("Created embeddings")

        local_store = LocalFileStore("./cache/")
        cached_embedder = CacheBackedEmbeddings.from_bytes_store(
            embedding_model, local_store, namespace=embedding_model.model
        )
        print("Created cache backed embeddings")

        # 3. Index the chunks in a FAISS vector store and persist it to disk.
        self.vector_store = FAISS.from_documents(chunked_documents, cached_embedder)
        self.vector_store.save_local("faiss_index")

        # 4. Build the chain. The retriever fills {context}; the user's raw
        #    question passes straight through to {question}.
        retriever = self.vector_store.as_retriever()

        prompt_template = ChatPromptTemplate.from_messages(
            [
                ("system", "You are an excellent movie critic who always includes great movie recommendations in your response. If the answer is not in the context let the user know "),
                ("human", "Using this context: {context}, please answer this question: {question}"),
            ]
        )

        chat_model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0, api_key=api_key)
        parser = StrOutputParser()

        self.runnable_chain = (
            {
                "context": retriever,
                "question": RunnablePassthrough(),
            }
            | prompt_template
            | chat_model
            | parser
        )

    def query(self, question) -> str:
        """Answer *question* using the retrieval chain; returns plain text."""
        print(f"Querying the RAG instance with the question: {question}")
        # StrOutputParser already produces a single str, so return it directly
        # (the original ``''.join(...)`` over that str was a no-op).
        return self.runnable_chain.invoke(question)
81
+
82
+
83
+ # def main():
84
+ # Create an instance of RAG class
85
+ # api_key = os.getenv("OPENAI_API_KEY")
86
+ # rag = RAGModel(api_key=api_key)
87
+
88
+ # while True:
89
+ # # Take input from command line
90
+ # question = input("Enter your question (or type 'exit' to quit): ")
91
+
92
+ # # Check if user wants to exit
93
+ # if question.lower() == "exit":
94
+ # break
95
+
96
+ # # Query the RAG instance
97
+ # answer = rag.query(question)
98
+
99
+ # # Print the answer
100
+ # print("Answer:", answer)
101
+
102
+
103
+ # if __name__ == "__main__":
104
+ # main()
requirements.txt ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.9.5
3
+ aiosignal==1.3.1
4
+ annotated-types==0.7.0
5
+ anyio==3.7.1
6
+ asyncer==0.0.2
7
+ attrs==23.2.0
8
+ bidict==0.23.1
9
+ certifi==2024.2.2
10
+ chainlit==1.1.101
11
+ charset-normalizer==3.3.2
12
+ chevron==0.14.0
13
+ click==8.1.7
14
+ dataclasses-json==0.5.14
15
+ Deprecated==1.2.14
16
+ distro==1.9.0
17
+ faiss-cpu==1.8.0
18
+ fastapi==0.110.3
19
+ fastapi-socketio==0.0.10
20
+ filetype==1.2.0
21
+ frozenlist==1.4.1
22
+ googleapis-common-protos==1.63.0
23
+ greenlet==3.0.3
24
+ grpcio==1.64.0
25
+ h11==0.14.0
26
+ httpcore==1.0.5
27
+ httpx==0.27.0
28
+ idna==3.7
29
+ importlib-metadata==7.0.0
30
+ jsonpatch==1.33
31
+ jsonpointer==2.4
32
+ langchain==0.2.0
33
+ langchain-community==0.2.0
34
+ langchain-core==0.2.0
35
+ langchain-openai==0.1.7
36
+ langchain-text-splitters==0.2.0
37
+ langsmith==0.1.60
38
+ Lazify==0.4.0
39
+ literalai==0.0.601
40
+ marshmallow==3.21.2
41
+ multidict==6.0.5
42
+ mypy-extensions==1.0.0
43
+ nest-asyncio==1.6.0
44
+ numpy==1.26.4
45
+ openai==1.30.1
46
+ opentelemetry-api==1.24.0
47
+ opentelemetry-exporter-otlp==1.24.0
48
+ opentelemetry-exporter-otlp-proto-common==1.24.0
49
+ opentelemetry-exporter-otlp-proto-grpc==1.24.0
50
+ opentelemetry-exporter-otlp-proto-http==1.24.0
51
+ opentelemetry-instrumentation==0.45b0
52
+ opentelemetry-proto==1.24.0
53
+ opentelemetry-sdk==1.24.0
54
+ opentelemetry-semantic-conventions==0.45b0
55
+ orjson==3.10.3
56
+ packaging==23.2
57
+ protobuf==4.25.3
58
+ pydantic==2.7.1
59
+ pydantic_core==2.18.2
60
+ PyJWT==2.8.0
61
+ python-dotenv==1.0.1
62
+ python-engineio==4.9.1
63
+ python-multipart==0.0.9
64
+ python-socketio==5.11.2
65
+ PyYAML==6.0.1
66
+ regex==2024.5.15
67
+ requests==2.32.0
68
+ setuptools==69.5.1
69
+ simple-websocket==1.0.0
70
+ sniffio==1.3.1
71
+ SQLAlchemy==2.0.30
72
+ starlette==0.37.2
73
+ syncer==2.0.3
74
+ tenacity==8.3.0
75
+ tiktoken==0.7.0
76
+ tomli==2.0.1
77
+ tqdm==4.66.4
78
+ typing-inspect==0.9.0
79
+ typing_extensions==4.11.0
80
+ uptrace==1.24.0
81
+ urllib3==2.2.1
82
+ uvicorn==0.25.0
83
+ watchfiles==0.20.0
84
+ wrapt==1.16.0
85
+ wsproto==1.2.0
86
+ yarl==1.9.4
87
+ zipp==3.18.2