chirag0107 commited on
Commit
1bae510
·
verified ·
1 Parent(s): 56138db

Update langchain_movie_search.py

Browse files

Commented out the code that generates embeddings

Files changed (1) hide show
  1. langchain_movie_search.py +160 -160
langchain_movie_search.py CHANGED
@@ -1,160 +1,160 @@
1
- import os
2
- from typing import List
3
- import argparse
4
- from dotenv import load_dotenv
5
- import pymongo
6
- from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
7
- from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
8
- from langchain.chains import create_retrieval_chain
9
- from langchain.chains.combine_documents import create_stuff_documents_chain
10
- from langchain_core.documents import Document
11
- from langchain_core.prompts import PromptTemplate
12
- import gradio as gr
13
- from gradio.themes.base import Base
14
- from flask import Flask
15
-
16
- __author__ = "Chirag Kamble"
17
-
18
-
19
- # Flask App
20
- # app = Flask(__name__)
21
-
22
-
23
- class MoviesSearch:
24
- """
25
- Class to perform Vector Index Search using MongoDB and LLM search using Langchain on Movies
26
- """
27
-
28
- def __init__(self):
29
- """
30
- Initializing method
31
- """
32
- # Load environment variables
33
- load_dotenv()
34
- transformer_model_name: str = os.getenv("TRANSFORMER_MODEL_NAME")
35
- mongodb_connection_url: str = os.getenv("MONGODB_CONNECTION_URL")
36
- mongodb_db_name: str = os.getenv("MONGODB_DB_NAME")
37
- mongodb_collection_name: str = os.getenv("MONGODB_COLLECTION_NAME")
38
- self.huggingface_repo: str = os.getenv("HF_REPO")
39
- self.huggingface_api_token: str = os.getenv("HF_TOKEN")
40
- self.huggingface_text_generation_model: str = os.getenv("HUGGINGFACE_TEXT_GENERATION_MODEL")
41
-
42
- # Setup MongoDB connection
43
- self.client: pymongo.synchronous.mongo_client.MongoClient = pymongo.MongoClient(mongodb_connection_url,
44
- serverSelectionTimeoutMS=50000)
45
- db: str = mongodb_db_name
46
- collection_name: str = mongodb_collection_name
47
- self.langchain_movies_collection: pymongo.synchronous.collection.Collection = self.client[db][collection_name]
48
-
49
- self.sample_movies_collection: pymongo.synchronous.collection.Collection = self.client.sample_mflix.movies
50
-
51
- self.hf_plot_embedding = HuggingFaceEmbeddings(
52
- model_name=transformer_model_name,
53
- show_progress=True,
54
- )
55
-
56
- self.retrieve_vector_store = MongoDBAtlasVectorSearch(collection=self.langchain_movies_collection,
57
- embedding=self.hf_plot_embedding,
58
- embedding_key="embedding",
59
- index_name="langchain_movies_vector_index",
60
- text_key="text",
61
- )
62
-
63
- def generate_insert_embeddings(self):
64
- """
65
- Generate vector embeddings
66
- """
67
- new_doc_list: List[Document] = []
68
- for doc in self.sample_movies_collection.find({"fullplot": {"$exists": True}}).limit(1000):
69
- new_doc: Document = Document(
70
- page_content=doc["fullplot"],
71
- metadata={"source": "Collection sample_mflix",
72
- "movie-title": doc["title"],
73
- "movie-plot": doc["fullplot"],
74
- "text": doc["fullplot"]}
75
- )
76
- new_doc_list.append(new_doc)
77
- self.retrieve_vector_store.from_documents(
78
- documents=new_doc_list,
79
- embedding=self.hf_plot_embedding,
80
- collection=self.langchain_movies_collection
81
- )
82
-
83
- def query_data(self, query: str):
84
- """
85
- Query data from Atlas Vector Search
86
- :param query: A user query to search
87
- :return: String answer generated by the LLM
88
- """
89
- hf_llm: HuggingFaceEndpoint = HuggingFaceEndpoint(
90
- repo_id=self.huggingface_text_generation_model,
91
- huggingfacehub_api_token=self.huggingface_api_token,
92
- temperature=0.1,
93
- task="text-generation",
94
- # max_new_tokens=100,
95
- verbose=True,
96
- return_full_text=True,
97
- )
98
-
99
- retriever = self.retrieve_vector_store.as_retriever()
100
- prompt = PromptTemplate.from_template(template="{context}", template_format="f-string")
101
- combine_docs = create_stuff_documents_chain(llm=hf_llm, prompt=prompt, )
102
-
103
- retrival_chain = create_retrieval_chain(retriever=retriever, combine_docs_chain=combine_docs)
104
- hf_llm_retriever_output = retrival_chain.invoke({"input": query})
105
-
106
- llm_answer = hf_llm_retriever_output.get("answer")
107
-
108
- return llm_answer
109
-
110
- def run_website(self):
111
- with gr.Blocks(theme=Base(), title="Movie plot search App using Vector Search + RAG") as v_search:
112
- gr.Markdown("Movie plot search App using Vector Search + RAG")
113
- textbox = gr.Textbox(label="Enter your question:", lines=1)
114
- with gr.Row():
115
- button = gr.Button("Submit", variant="primary")
116
- with gr.Column():
117
- output = gr.Textbox(lines=1, max_lines=10, interactive=False,
118
- label="""Output generated by chaining Atlas Vector Search with Langchain's RAG""",)
119
-
120
- button.click(fn=self.query_data, inputs=textbox, outputs=[output])
121
-
122
- v_search.launch(share=True)
123
-
124
- def close_client(self):
125
- self.client.close()
126
-
127
-
128
- # @app.route("/", methods=["GET"])
129
- def gradio_interface(cmd=None):
130
- movie_search = MoviesSearch()
131
- movie_search.generate_insert_embeddings()
132
- movie_search.run_website()
133
-
134
- # if cmd == "generate_embeddings":
135
- # movie_search.generate_insert_embeddings()
136
- # elif cmd == "run":
137
- # movie_search.run_website()
138
-
139
-
140
- if __name__ == "__main__":
141
- # Create the parser
142
- # parser = argparse.ArgumentParser(description='Script to suggest movies based on user description/query')
143
- #
144
- # # Add arguments
145
- # parser.add_argument("-g", "--generate_embeddings", action="store_true", help="Generate/Re-generate Embeddings")
146
- # parser.add_argument("-r", "--run", action="store_true", help="Age of the person")
147
- #
148
- # # Parse arguments
149
- # args = parser.parse_args()
150
- #
151
- # if args.generate_embeddings:
152
- # gradio_interface(cmd="generate_embeddings")
153
- # elif args.run:
154
- # gradio_interface(cmd="run")
155
-
156
- # app.run(host="0.0.0.0", port=os.getenv("PORT", 5000), debug=True)
157
- # app.run(host="0.0.0.0", debug=True)
158
- # app.run(debug=True)
159
-
160
- gradio_interface()
 
1
+ import os
2
+ from typing import List
3
+ import argparse
4
+ from dotenv import load_dotenv
5
+ import pymongo
6
+ from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
7
+ from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
8
+ from langchain.chains import create_retrieval_chain
9
+ from langchain.chains.combine_documents import create_stuff_documents_chain
10
+ from langchain_core.documents import Document
11
+ from langchain_core.prompts import PromptTemplate
12
+ import gradio as gr
13
+ from gradio.themes.base import Base
14
+ from flask import Flask
15
+
16
+ __author__ = "Chirag Kamble"
17
+
18
+
19
+ # Flask App
20
+ # app = Flask(__name__)
21
+
22
+
23
class MoviesSearch:
    """
    Vector-index search over movie plots using MongoDB Atlas Vector Search,
    combined with a HuggingFace LLM through LangChain's RAG chains.
    """

    def __init__(self):
        """
        Load configuration from the environment and wire up the MongoDB
        client, the embedding model, and the Atlas vector store.
        """
        # Load environment variables from a local .env file, if present.
        load_dotenv()
        transformer_model_name: str = os.getenv("TRANSFORMER_MODEL_NAME")
        mongodb_connection_url: str = os.getenv("MONGODB_CONNECTION_URL")
        mongodb_db_name: str = os.getenv("MONGODB_DB_NAME")
        mongodb_collection_name: str = os.getenv("MONGODB_COLLECTION_NAME")
        self.huggingface_repo: str = os.getenv("HF_REPO")
        self.huggingface_api_token: str = os.getenv("HF_TOKEN")
        self.huggingface_text_generation_model: str = os.getenv("HUGGINGFACE_TEXT_GENERATION_MODEL")

        # MongoDB connection; a generous server-selection timeout because
        # Atlas cold starts can be slow.
        self.client: pymongo.MongoClient = pymongo.MongoClient(
            mongodb_connection_url,
            serverSelectionTimeoutMS=50000,
        )
        # Target collection for the LangChain-managed embeddings.
        self.langchain_movies_collection = self.client[mongodb_db_name][mongodb_collection_name]

        # Source collection with the raw movie documents (sample_mflix demo data).
        self.sample_movies_collection = self.client.sample_mflix.movies

        # Embedding model used both for indexing and for query-time search.
        self.hf_plot_embedding = HuggingFaceEmbeddings(
            model_name=transformer_model_name,
            show_progress=True,
        )

        # Vector store bound to the LangChain movies collection; the same
        # key/index configuration is used for inserts and retrieval.
        self.retrieve_vector_store = MongoDBAtlasVectorSearch(
            collection=self.langchain_movies_collection,
            embedding=self.hf_plot_embedding,
            embedding_key="embedding",
            index_name="langchain_movies_vector_index",
            text_key="text",
        )

    def generate_insert_embeddings(self):
        """
        Generate vector embeddings for up to 1000 movie plots from the
        sample_mflix collection and insert them into the vector store.
        """
        new_doc_list: List[Document] = [
            Document(
                page_content=doc["fullplot"],
                metadata={
                    "source": "Collection sample_mflix",
                    "movie-title": doc["title"],
                    "movie-plot": doc["fullplot"],
                    "text": doc["fullplot"],
                },
            )
            for doc in self.sample_movies_collection.find(
                {"fullplot": {"$exists": True}}
            ).limit(1000)
        ]
        # Fix: use add_documents() on the already-configured store. The old
        # code called the classmethod constructor from_documents() through an
        # instance, discarding the new store it returned and bypassing this
        # store's embedding_key/index_name configuration.
        self.retrieve_vector_store.add_documents(new_doc_list)

    def query_data(self, query: str):
        """
        Answer a user query with RAG: retrieve similar plots from Atlas
        Vector Search and let a HuggingFace LLM generate the answer.

        :param query: A user query to search
        :return: String answer generated by the LLM
        """
        hf_llm: HuggingFaceEndpoint = HuggingFaceEndpoint(
            repo_id=self.huggingface_text_generation_model,
            huggingfacehub_api_token=self.huggingface_api_token,
            temperature=0.1,
            task="text-generation",
            # max_new_tokens=100,
            verbose=True,
            return_full_text=True,
        )

        retriever = self.retrieve_vector_store.as_retriever()
        # The prompt simply stuffs the retrieved documents into the context.
        prompt = PromptTemplate.from_template(template="{context}", template_format="f-string")
        combine_docs = create_stuff_documents_chain(llm=hf_llm, prompt=prompt)

        # Typo fix: "retrival" -> "retrieval" (local name only).
        retrieval_chain = create_retrieval_chain(retriever=retriever, combine_docs_chain=combine_docs)
        hf_llm_retriever_output = retrieval_chain.invoke({"input": query})

        return hf_llm_retriever_output.get("answer")

    def run_website(self):
        """
        Launch the Gradio UI that forwards user questions to query_data().
        """
        with gr.Blocks(theme=Base(), title="Movie plot search App using Vector Search + RAG") as v_search:
            gr.Markdown("Movie plot search App using Vector Search + RAG")
            textbox = gr.Textbox(label="Enter your question:", lines=1)
            with gr.Row():
                button = gr.Button("Submit", variant="primary")
            with gr.Column():
                output = gr.Textbox(
                    lines=1,
                    max_lines=10,
                    interactive=False,
                    label="""Output generated by chaining Atlas Vector Search with Langchain's RAG""",
                )

            button.click(fn=self.query_data, inputs=textbox, outputs=[output])

        # share=True exposes a public Gradio link in addition to localhost.
        v_search.launch(share=True)

    def close_client(self):
        """Close the underlying MongoDB client connection."""
        self.client.close()
126
+
127
+
128
# @app.route("/", methods=["GET"])
def gradio_interface(cmd=None):
    """
    Build the movie-search app and launch the Gradio website.

    :param cmd: Optional command selector; currently unused — the command
        dispatch below is kept commented out for reference.
    """
    movie_search = MoviesSearch()
    # Embedding generation is deliberately disabled here; re-enable (or run
    # it manually) when the vector collection needs to be (re)built.
    # movie_search.generate_insert_embeddings()
    movie_search.run_website()

    # if cmd == "generate_embeddings":
    #     movie_search.generate_insert_embeddings()
    # elif cmd == "run":
    #     movie_search.run_website()
138
+
139
+
140
if __name__ == "__main__":
    # Command-line dispatch kept commented out for reference; the script
    # currently always launches the Gradio website directly.
    # parser = argparse.ArgumentParser(description='Script to suggest movies based on user description/query')
    #
    # parser.add_argument("-g", "--generate_embeddings", action="store_true", help="Generate/Re-generate Embeddings")
    # parser.add_argument("-r", "--run", action="store_true", help="Run the Gradio website")
    #
    # args = parser.parse_args()
    #
    # if args.generate_embeddings:
    #     gradio_interface(cmd="generate_embeddings")
    # elif args.run:
    #     gradio_interface(cmd="run")

    # Earlier Flask entry points, retained for reference:
    # app.run(host="0.0.0.0", port=os.getenv("PORT", 5000), debug=True)
    # app.run(host="0.0.0.0", debug=True)
    # app.run(debug=True)

    gradio_interface()