chirag0107 committed on
Commit
56138db
·
verified ·
1 Parent(s): 6690e37

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. .gitignore +16 -0
  2. README.md +4 -9
  3. langchain_movie_search.py +160 -0
  4. requirements.txt +138 -0
  5. temp_req.txt +103 -0
.gitignore ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IDE Settings
2
+ *.pyc
3
+ .vscode
4
+ .gradio
5
+ .idea
6
+
7
+ # Ignore environment file
8
+ .env
9
+
10
+ # Ignore Virtual Environment
11
+ vsmpy-env/
12
+ build/
13
+ dist/
14
+
15
+ # Ignore build file
16
+ langchain_movie_search.spec
README.md CHANGED
@@ -1,12 +1,7 @@
1
  ---
2
- title: Movie Search
3
- emoji: 🐢
4
- colorFrom: green
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 5.12.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Movie-Search
3
+ app_file: langchain_movie_search.py
 
 
4
  sdk: gradio
5
+ sdk_version: 5.3.0
 
 
6
  ---
7
+ # Vector-Search-Movies-Python
 
langchain_movie_search.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+ import argparse
4
+ from dotenv import load_dotenv
5
+ import pymongo
6
+ from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
7
+ from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
8
+ from langchain.chains import create_retrieval_chain
9
+ from langchain.chains.combine_documents import create_stuff_documents_chain
10
+ from langchain_core.documents import Document
11
+ from langchain_core.prompts import PromptTemplate
12
+ import gradio as gr
13
+ from gradio.themes.base import Base
14
+ from flask import Flask
15
+
16
+ __author__ = "Chirag Kamble"
17
+
18
+
19
+ # Flask App
20
+ # app = Flask(__name__)
21
+
22
+
23
class MoviesSearch:
    """
    Class to perform Vector Index Search using MongoDB and LLM search using Langchain on Movies
    """

    # Prompt sent to the LLM: the retrieved plots first, then the user's question.
    # "{context}" is filled by the stuff-documents chain, "{input}" by the retrieval chain.
    _PROMPT_TEMPLATE = "{context}\n\nQuestion: {input}\nAnswer:"

    # Upper bound on the number of sample movies embedded per call.
    _MAX_MOVIES = 1000

    # Reply returned when the user submits an empty or whitespace-only question.
    _EMPTY_QUERY_MESSAGE = "Please enter a question."

    @staticmethod
    def _require_env(name: str) -> str:
        """
        Read a mandatory environment variable
        :param name: Name of the environment variable
        :return: The non-empty value of the variable
        :raises RuntimeError: If the variable is unset or empty
        """
        value = os.getenv(name)
        if not value:
            raise RuntimeError(f"Required environment variable '{name}' is not set")
        return value

    def __init__(self):
        """
        Load the configuration from the environment, connect to MongoDB and build the
        embedding model and the Atlas vector store used for retrieval
        :raises RuntimeError: If a required environment variable is missing
        """
        # Load environment variables (values from a local .env file, if present)
        load_dotenv()
        transformer_model_name: str = self._require_env("TRANSFORMER_MODEL_NAME")
        # NOTE(review): left optional as in the original; when unset, None is handed to
        # MongoClient - confirm whether that fallback is intended.
        mongodb_connection_url = os.getenv("MONGODB_CONNECTION_URL")
        mongodb_db_name: str = self._require_env("MONGODB_DB_NAME")
        mongodb_collection_name: str = self._require_env("MONGODB_COLLECTION_NAME")
        self.huggingface_repo = os.getenv("HF_REPO")
        self.huggingface_api_token = os.getenv("HF_TOKEN")
        self.huggingface_text_generation_model = os.getenv("HUGGINGFACE_TEXT_GENERATION_MODEL")

        # Setup MongoDB connection
        self.client = pymongo.MongoClient(mongodb_connection_url, serverSelectionTimeoutMS=50000)

        # Collection that stores the embedded documents searched by the vector index
        self.langchain_movies_collection = self.client[mongodb_db_name][mongodb_collection_name]

        # Source collection the movie plots are read from
        self.sample_movies_collection = self.client.sample_mflix.movies

        self.hf_plot_embedding = HuggingFaceEmbeddings(
            model_name=transformer_model_name,
            show_progress=True,
        )

        self.retrieve_vector_store = MongoDBAtlasVectorSearch(
            collection=self.langchain_movies_collection,
            embedding=self.hf_plot_embedding,
            embedding_key="embedding",
            index_name="langchain_movies_vector_index",
            text_key="text",
        )

    def generate_insert_embeddings(self):
        """
        Generate vector embeddings for the movie plots and insert them into the vector store
        """
        cursor = self.sample_movies_collection.find({"fullplot": {"$exists": True}}).limit(self._MAX_MOVIES)
        new_doc_list: List[Document] = [
            Document(
                page_content=doc["fullplot"],
                metadata={
                    "source": "Collection sample_mflix",
                    # A movie without a title must not abort the whole batch
                    "movie-title": doc.get("title", ""),
                    "movie-plot": doc["fullplot"],
                    "text": doc["fullplot"],
                },
            )
            for doc in cursor
        ]

        # Nothing to embed: skip the call to the vector store
        if not new_doc_list:
            return

        # Insert through the already configured store so the text/embedding keys
        # match the ones used at query time.
        self.retrieve_vector_store.add_documents(new_doc_list)

    def query_data(self, query: str):
        """
        Query data from Atlas Vector Search
        :param query: A user query to search
        :return: String answer generated by the LLM
        """
        # An empty question has nothing to retrieve for; avoid the embedding and LLM calls
        if not query or not query.strip():
            return self._EMPTY_QUERY_MESSAGE

        hf_llm = HuggingFaceEndpoint(
            repo_id=self.huggingface_text_generation_model,
            huggingfacehub_api_token=self.huggingface_api_token,
            temperature=0.1,
            task="text-generation",
            verbose=True,
            return_full_text=True,
        )

        retriever = self.retrieve_vector_store.as_retriever()
        # The prompt must contain "{input}", otherwise the LLM only sees the retrieved
        # plots and never the question it is supposed to answer.
        prompt = PromptTemplate.from_template(template=self._PROMPT_TEMPLATE, template_format="f-string")
        combine_docs = create_stuff_documents_chain(llm=hf_llm, prompt=prompt)

        retrieval_chain = create_retrieval_chain(retriever=retriever, combine_docs_chain=combine_docs)
        hf_llm_retriever_output = retrieval_chain.invoke({"input": query})

        return hf_llm_retriever_output.get("answer")

    def run_website(self):
        """
        Build the Gradio page (question box, submit button, answer box) and launch it
        """
        with gr.Blocks(theme=Base(), title="Movie plot search App using Vector Search + RAG") as v_search:
            gr.Markdown("Movie plot search App using Vector Search + RAG")
            textbox = gr.Textbox(label="Enter your question:", lines=1)
            with gr.Row():
                button = gr.Button("Submit", variant="primary")
            with gr.Column():
                output = gr.Textbox(
                    lines=1,
                    max_lines=10,
                    interactive=False,
                    label="""Output generated by chaining Atlas Vector Search with Langchain's RAG""",
                )

            # Clicking "Submit" sends the question through query_data and shows its answer
            button.click(fn=self.query_data, inputs=textbox, outputs=[output])

        v_search.launch(share=True)

    def close_client(self):
        """
        Close the MongoDB client connection
        """
        self.client.close()
126
+
127
+
128
+ # @app.route("/", methods=["GET"])
129
def gradio_interface(cmd=None):
    """
    Create the movie search service, then generate the embeddings and/or start the web page
    :param cmd: "generate_embeddings" to only generate and insert the embeddings,
                "run" to only start the web page; None (default) or any other value does both
    """
    movie_search = MoviesSearch()

    # Generate embeddings unless the caller asked only for the web page
    if cmd != "run":
        movie_search.generate_insert_embeddings()

    # Start the web page unless the caller asked only for the embeddings
    if cmd != "generate_embeddings":
        movie_search.run_website()
138
+
139
+
140
if __name__ == "__main__":
    # Script entry point. Command-line parsing (argparse) and the Flask runner are
    # disabled, so the script always calls gradio_interface() without arguments.
    gradio_interface()
requirements.txt ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Install all packages to run the code
2
+
3
+ # MongoDB
4
+ dnspython==2.7.0
5
+ pymongo==4.10.1
6
+
7
+ # Requests
8
+ certifi==2024.8.30
9
+ charset-normalizer==3.4.0
10
+ idna==3.10
11
+ requests==2.32.3
12
+ urllib3==2.2.3
13
+
14
+ #Environment
15
+ python-dotenv==1.0.1
16
+
17
+ # Sentence-Transformer
18
+ safetensors==0.4.5
19
+ scikit-learn==1.5.2
20
+ scipy==1.14.1
21
+ sentence-transformers==3.2.1
22
+ sympy==1.13.1
23
+ threadpoolctl==3.5.0
24
+ tokenizers==0.20.1
25
+ torch==2.5.0
26
+ tqdm==4.66.5
27
+ transformers==4.45.2
28
+ typing_extensions==4.12.2
29
+ certifi==2024.8.30
30
+ charset-normalizer==3.4.0
31
+ colorama==0.4.6
32
+ dnspython==2.7.0
33
+ filelock==3.16.1
34
+ fsspec==2024.10.0
35
+ huggingface-hub==0.26.1
36
+ idna==3.10
37
+ Jinja2==3.1.4
38
+ joblib==1.4.2
39
+ # MarkupSafe==3.0.2
40
+ mpmath==1.3.0
41
+ networkx==3.4.2
42
+ packaging==24.1
43
+ # pillow==11.0.0
44
+
45
+ # Setup tools
46
+ setuptools==75.2.0
47
+ PyYAML==6.0.2
48
+ regex==2024.9.11
49
+ # numpy==2.1.2
50
+
51
+ # Langchain
52
+ SQLAlchemy==2.0.36
53
+ aiohappyeyeballs==2.4.3
54
+ aiohttp==3.10.10
55
+ aiosignal==1.3.1
56
+ annotated-types==0.7.0
57
+ anyio==4.6.2.post1
58
+ attrs==24.2.0
59
+ frozenlist==1.5.0
60
+ greenlet==3.1.1
61
+ h11==0.14.0
62
+ httpcore==1.0.6
63
+ httpx==0.27.2
64
+ jsonpatch==1.33
65
+ jsonpointer==3.0.0
66
+ langchain==0.3.4
67
+ langchain-core==0.3.12
68
+ langchain-text-splitters==0.3.0
69
+ langsmith==0.1.137
70
+ multidict==6.1.0
71
+ numpy==1.26.4
72
+ orjson==3.10.10
73
+ propcache==0.2.0
74
+ pydantic==2.9.2
75
+ pydantic-core==2.23.4
76
+ requests-toolbelt==1.0.0
77
+ sniffio==1.3.1
78
+ tenacity==9.0.0
79
+ yarl==1.16.0
80
+
81
+ # Langchain HuggingFace
82
+ langchain-HuggingFace==0.1.0
83
+
84
+ # Langchain MongoDB
85
+ langchain-mongodb==0.2.0
86
+
87
+ # Langchain Community
88
+ dataclasses-json==0.6.7
89
+ langchain_community==0.3.3
90
+ marshmallow==3.23.0
91
+ mypy-extensions==1.0.0
92
+ pydantic-settings==2.6.0
93
+ typing-inspect==0.9.0
94
+
95
+ # Gradio
96
+ aiofiles==23.2.1
97
+ click==8.1.7
98
+ fastapi==0.115.3
99
+ ffmpy==0.4.0
100
+ gradio==5.3.0
101
+ gradio-client==1.4.2
102
+ markdown-it-py==3.0.0
103
+ markupsafe==2.1.5
104
+ mdurl==0.1.2
105
+ pandas==2.2.3
106
+ pillow==10.4.0
107
+ pydub==0.25.1
108
+ pygments==2.18.0
109
+ python-dateutil==2.9.0.post0
110
+ python-multipart==0.0.12
111
+ pytz==2024.2
112
+ rich==13.9.3
113
+ ruff==0.7.1
114
+ semantic-version==2.10.0
115
+ shellingham==1.5.4
116
+ six==1.16.0
117
+ starlette==0.41.0
118
+ tomlkit==0.12.0
119
+ typer==0.12.5
120
+ tzdata==2024.2
121
+ uvicorn==0.32.0
122
+ websockets==12.0
123
+
124
+ # Flask
125
+ Werkzeug==3.0.6
126
+ blinker==1.8.2
127
+ flask==3.0.3
128
+ itsdangerous==2.2.0
129
+
130
+ # Pyinstaller
131
+ altgraph==0.17.4
132
+ pefile==2023.2.7
133
+ pyinstaller==6.11.0
134
+ pyinstaller-hooks-contrib==2024.9
135
+ pywin32-ctypes==0.2.3
136
+
137
+ # Gunicorn
138
+ gunicorn==23.0.0
temp_req.txt ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohappyeyeballs==2.4.3
3
+ aiohttp==3.10.10
4
+ aiosignal==1.3.1
5
+ altgraph==0.17.4
6
+ annotated-types==0.7.0
7
+ anyio==4.6.2.post1
8
+ attrs==24.2.0
9
+ blinker==1.8.2
10
+ certifi==2024.8.30
11
+ charset-normalizer==3.4.0
12
+ click==8.1.7
13
+ colorama==0.4.6
14
+ dataclasses-json==0.6.7
15
+ dnspython==2.7.0
16
+ fastapi==0.115.3
17
+ ffmpy==0.4.0
18
+ filelock==3.16.1
19
+ Flask==3.0.3
20
+ frozenlist==1.5.0
21
+ fsspec==2024.10.0
22
+ gradio==5.3.0
23
+ gradio_client==1.4.2
24
+ greenlet==3.1.1
25
+ h11==0.14.0
26
+ httpcore==1.0.6
27
+ httpx==0.27.2
28
+ huggingface-hub==0.26.1
29
+ idna==3.10
30
+ itsdangerous==2.2.0
31
+ Jinja2==3.1.4
32
+ joblib==1.4.2
33
+ jsonpatch==1.33
34
+ jsonpointer==3.0.0
35
+ langchain==0.3.4
36
+ langchain-community==0.3.3
37
+ langchain-core==0.3.12
38
+ langchain-huggingface==0.1.0
39
+ langchain-mongodb==0.2.0
40
+ langchain-text-splitters==0.3.0
41
+ langsmith==0.1.137
42
+ markdown-it-py==3.0.0
43
+ MarkupSafe==2.1.5
44
+ marshmallow==3.23.0
45
+ mdurl==0.1.2
46
+ mpmath==1.3.0
47
+ multidict==6.1.0
48
+ mypy-extensions==1.0.0
49
+ networkx==3.4.2
50
+ numpy==1.26.4
51
+ orjson==3.10.10
52
+ packaging==24.1
53
+ pandas==2.2.3
54
+ pefile==2023.2.7
55
+ pillow==10.4.0
56
+ propcache==0.2.0
57
+ pydantic==2.9.2
58
+ pydantic-settings==2.6.0
59
+ pydantic_core==2.23.4
60
+ pydub==0.25.1
61
+ Pygments==2.18.0
62
+ pyinstaller==6.11.0
63
+ pyinstaller-hooks-contrib==2024.9
64
+ pymongo==4.10.1
65
+ python-dateutil==2.9.0.post0
66
+ python-dotenv==1.0.1
67
+ python-multipart==0.0.12
68
+ pytz==2024.2
69
+ pywin32-ctypes==0.2.3
70
+ PyYAML==6.0.2
71
+ regex==2024.9.11
72
+ requests==2.32.3
73
+ requests-toolbelt==1.0.0
74
+ rich==13.9.3
75
+ ruff==0.7.1
76
+ safetensors==0.4.5
77
+ scikit-learn==1.5.2
78
+ scipy==1.14.1
79
+ semantic-version==2.10.0
80
+ sentence-transformers==3.2.1
81
+ setuptools==75.2.0
82
+ shellingham==1.5.4
83
+ six==1.16.0
84
+ sniffio==1.3.1
85
+ SQLAlchemy==2.0.36
86
+ starlette==0.41.0
87
+ sympy==1.13.1
88
+ tenacity==9.0.0
89
+ threadpoolctl==3.5.0
90
+ tokenizers==0.20.1
91
+ tomlkit==0.12.0
92
+ torch==2.5.0
93
+ tqdm==4.66.5
94
+ transformers==4.45.2
95
+ typer==0.12.5
96
+ typing-inspect==0.9.0
97
+ typing_extensions==4.12.2
98
+ tzdata==2024.2
99
+ urllib3==2.2.3
100
+ uvicorn==0.32.0
101
+ websockets==12.0
102
+ Werkzeug==3.0.6
103
+ yarl==1.16.0