Prathamesh Sable committed on
Commit
3e086dd
·
1 Parent(s): 096e25a

working RAG pipeline

Browse files
Files changed (2) hide show
  1. app.py +119 -59
  2. templates/index.html +71 -5
app.py CHANGED
@@ -10,7 +10,6 @@ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
10
  import google.generativeai as genai
11
 
12
  from langchain_chroma import Chroma
13
- from chromadb import Client
14
 
15
  import os
16
  from dotenv import load_dotenv
@@ -29,39 +28,56 @@ GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
29
  CHROMA_PATH = "chroma"
30
  UPLOAD_FOLDER = "uploads"
31
  SESSION_TIMEOUT = 30 * 60 # 30 minutes
 
 
 
 
32
 
33
- # Initialize Hugging Face embedding
 
 
 
 
 
 
 
 
34
  hugging_face_ef = HuggingFaceInferenceAPIEmbeddings(
35
  api_key=HF_TOKEN,
36
  model_name="sentence-transformers/all-MiniLM-L6-v2"
37
  )
 
38
  genai.configure(api_key=GOOGLE_API_KEY)
39
  llm_model = genai.GenerativeModel("gemini-1.5-flash")
40
 
41
  app = Flask(__name__)
42
 
43
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
44
- app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
45
-
46
-
47
  app.config["SESSION_PERMANENT"] = True
48
  app.config["SESSION_TYPE"] = "filesystem"
49
-
50
  Session(app)
51
 
52
  # Initialize ChromaDB client
53
- client = Client()
54
  active_sessions = dict()
55
 
56
 
57
- def remove_file_from_chroma(file_id,CHROMA_PATH=CHROMA_PATH):
58
- db = Chroma(persist_directory=CHROMA_PATH)
 
59
 
60
- # delete chunks from db where metadata file_id is equal to file_id
61
- db.delete(ids=db.get(where={"file_id": file_id},include=[])['ids'])
62
- return True
63
-
64
-
 
 
 
 
 
 
 
65
  def add_file_to_chroma(file_path, file_id, session_id):
66
  """Add file chunks to ChromaDB."""
67
  extension = file_path.split(".")[-1]
@@ -91,45 +107,48 @@ def add_file_to_chroma(file_path, file_id, session_id):
91
  text.metadata.update({"file_id": file_id, "session_id": session_id})
92
 
93
  # Save to ChromaDB
94
- db = Chroma(persist_directory=CHROMA_PATH, embedding_function=hugging_face_ef)
95
- db.add_documents(texts)
96
  logger.info(f"Added file '{file_path}' to ChromaDB for session '{session_id}'.")
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
- # Clean up expired files and ChromaDB collections
100
- def cleanup_resources():
101
- """Clean up expired files and ChromaDB collections."""
102
- now = time.time()
103
- for session_id, session_data in list(active_sessions.items()):
104
- if now - session_data['last_accessed'] > SESSION_TIMEOUT:
105
- # Remove files
106
- files = session_data.get('files', {})
107
- for file_id, (file_path, filename) in files.items():
108
- if os.path.exists(file_path):
109
- os.remove(file_path)
110
- logger.info(f"Deleted file: {file_path}")
111
-
112
- # Remove ChromaDB chunks
113
- db = Chroma(persist_directory=CHROMA_PATH)
114
- db.delete(where={"session_id": session_id})
115
- logger.info(f"Deleted ChromaDB chunks for session: {session_id}")
116
-
117
- # Remove session
118
- del active_sessions[session_id]
119
-
120
- # Start the scheduler
121
- scheduler = BackgroundScheduler()
122
- scheduler.add_job(cleanup_resources, 'interval', minutes=5) # Run every 5 minutes
123
- scheduler.start()
124
-
125
- @app.route('/')
126
- def index():
127
  session_id = session.sid
128
  if session_id not in active_sessions:
129
  active_sessions[session_id] = {
130
  'last_accessed': time.time(),
131
  'files': dict()
132
  }
 
 
 
 
 
 
 
133
  return render_template('index.html') # Serve the HTML file we created
134
 
135
  # add files
@@ -165,39 +184,80 @@ def upload_file():
165
  def get_files():
166
  return jsonify({"files":active_sessions[session.sid]['files']}),200
167
 
168
- @app.route('/chroma-status',methods=["GET"])
169
- def chroma_status():
170
  # return all data from chroma db
171
- db = Chroma(persist_directory=CHROMA_PATH)
172
- return jsonify({"data":db.get()}),200
 
 
173
 
174
  @app.route('/remove-file',methods=["POST"])
175
  def remove_file():
176
  file_id = request.form.get('file_id')
177
- if file_id in active_sessions[session.sid]['files']:
178
- file_path = active_sessions[session.sid]['files'][file_id][0]
 
179
  # remove file from upload folder
180
  if os.path.exists(file_path):
181
  os.remove(file_path)
182
  logger.info(f"Deleted file: {file_path}")
183
  # Remove file from session
184
- del active_sessions[session.sid]['files'][file_id]
185
  else:
186
  logger.info(f"File not found in session: {file_id}")
187
 
188
- remove_file_from_chroma(file_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
- return jsonify({
191
- 'message': 'File deleted successfully',
192
- 'status': 'success'
193
- }), 200
 
 
 
 
194
 
195
  # Ensure scheduler stops on app exit
196
  @app.teardown_appcontext
197
  def shutdown_scheduler(exception=None):
198
- scheduler.shutdown()
 
199
 
200
  if __name__ == "__main__":
201
- app.run(host="0.0.0.0",port=8000,debug=True)
202
 
203
 
 
10
  import google.generativeai as genai
11
 
12
  from langchain_chroma import Chroma
 
13
 
14
  import os
15
  from dotenv import load_dotenv
 
28
  CHROMA_PATH = "chroma"
29
  UPLOAD_FOLDER = "uploads"
30
  SESSION_TIMEOUT = 30 * 60 # 30 minutes
31
+ PROMPT_TEMPLATE = """
32
+ Answer the given query based only on the context given below.
33
+ context:
34
+ {context}
35
 
36
+ ---
37
+ based on the context above answer the following query:
38
+ {query}
39
+ """
40
+
41
+ # make required folder for files to upload
42
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
43
+
44
+ # Initialize Hugging Face embeddings Model
45
  hugging_face_ef = HuggingFaceInferenceAPIEmbeddings(
46
  api_key=HF_TOKEN,
47
  model_name="sentence-transformers/all-MiniLM-L6-v2"
48
  )
49
+ # initialize LLM
50
  genai.configure(api_key=GOOGLE_API_KEY)
51
  llm_model = genai.GenerativeModel("gemini-1.5-flash")
52
 
53
  app = Flask(__name__)
54
 
55
+ # initialize session
 
 
 
56
  app.config["SESSION_PERMANENT"] = True
57
  app.config["SESSION_TYPE"] = "filesystem"
 
58
  Session(app)
59
 
60
  # Initialize ChromaDB client
61
+ db = Chroma(persist_directory=CHROMA_PATH, embedding_function=hugging_face_ef)
62
  active_sessions = dict()
63
 
64
 
65
def remove_file_from_chroma(file_id, session_id):
    """Delete the stored chunks of one file within one session.

    Args:
        file_id: Identifier of the file; it is converted to ``str`` before
            being compared with each chunk's ``file_id`` metadata value.
        session_id: Session whose chunks are searched.

    Returns:
        True if at least one chunk was deleted, False if nothing matched.
    """
    # Fetch every chunk of the session, then narrow down by file in Python.
    session_chunks = db.get(where={"session_id": session_id})
    wanted = str(file_id)

    # Walk ids and metadata in lockstep instead of indexing by position.
    # .get() keeps a chunk that carries no 'file_id' from raising KeyError.
    ids_to_del = [
        chunk_id
        for chunk_id, metadata in zip(session_chunks['ids'], session_chunks['metadatas'])
        if (metadata or {}).get('file_id') == wanted
    ]

    if not ids_to_del:
        return False

    db.delete(ids=ids_to_del)
    return True
80
+
81
  def add_file_to_chroma(file_path, file_id, session_id):
82
  """Add file chunks to ChromaDB."""
83
  extension = file_path.split(".")[-1]
 
107
  text.metadata.update({"file_id": file_id, "session_id": session_id})
108
 
109
  # Save to ChromaDB
110
+ db.add_documents(texts,embedding=hugging_face_ef)
 
111
  logger.info(f"Added file '{file_path}' to ChromaDB for session '{session_id}'.")
112
 
113
def generate_query_response(query):
    """Answer ``query`` using the current session's indexed chunks.

    Up to four chunks filtered by the current ``session.sid`` are retrieved
    with their relevance scores, joined into a context string, inserted
    into ``PROMPT_TEMPLATE`` and sent to ``llm_model``.

    Args:
        query: The question text to answer.

    Returns:
        A dict with the keys:
            ``is_relevant``: True when the best-scoring chunk has a score of
                at least 0.6; False when it does not or nothing was found.
            ``answer``: Text of the model's response.
            ``sources``: One dict per retrieved chunk with ``page_content``,
                ``score`` and ``metadata``.
    """
    top_related = db.similarity_search_with_relevance_scores(
        query, filter={"session_id": session.sid}, k=4
    )

    # A session without any indexed chunk yields an empty list; indexing
    # [0] unconditionally would raise IndexError in that case.
    is_relevant = bool(top_related) and bool(top_related[0][1] >= 0.6)

    context = "\n".join(document.page_content for document, _score in top_related)
    prompt = PROMPT_TEMPLATE.format(context=context, query=query)

    # Diagnostic output goes through the module logger rather than stdout.
    logger.debug("Retrieved chunks for query: %s", top_related)

    return {
        'is_relevant': is_relevant,
        'answer': llm_model.generate_content(prompt).text,
        'sources': [
            {
                "page_content": document.page_content,
                "score": score,
                "metadata": document.metadata,
            }
            for document, score in top_related
        ],
    }
136
 
137
@app.before_request
def update_session():
    """Record activity for the requesting session before each request.

    A session that is already tracked gets its ``last_accessed`` timestamp
    refreshed; an unknown one gets a new entry with an empty ``files`` dict.
    """
    sid = session.sid

    if sid in active_sessions:
        # Known session: only bump the activity timestamp.
        active_sessions[sid]['last_accessed'] = time.time()
        return

    # First request seen for this session id: create its bookkeeping entry.
    active_sessions[sid] = {
        'last_accessed': time.time(),
        'files': dict()
    }
    logger.info(f"CREATED NEW SESSION with ID {sid}")
148
+
149
+
150
@app.route('/')
def index():
    """Render and return the 'index.html' template for the root URL."""
    page = render_template('index.html')
    return page
153
 
154
  # add files
 
184
  def get_files():
185
  return jsonify({"files":active_sessions[session.sid]['files']}),200
186
 
187
@app.route('/status', methods=["GET"])
def status():
    """Return the tracked sessions and the full ChromaDB contents as JSON."""
    # NOTE(review): this returns every session's data with no access check
    # visible here - confirm the route is intended for debugging only.
    payload = {
        "Active_sessions": active_sessions,
        "chroma_data": db.get(),
    }
    return jsonify(payload), 200
194
 
195
@app.route('/remove-file', methods=["POST"])
def remove_file():
    """Remove one uploaded file of the current session.

    Reads ``file_id`` from the form data. If the session tracks that file,
    the file is deleted from disk (when it exists) and dropped from the
    session's file table. The file's chunks are then removed from ChromaDB;
    the response is 200 when chunks were deleted and 404 otherwise.
    """
    file_id = request.form.get('file_id')
    session_id = session.sid
    session_files = active_sessions[session_id]['files']

    if file_id not in session_files:
        logger.info(f"File not found in session: {file_id}")
    else:
        # The first element of the stored entry is the path on disk.
        file_path = session_files[file_id][0]
        if os.path.exists(file_path):
            os.remove(file_path)
            logger.info(f"Deleted file: {file_path}")
        # Forget the file in the session bookkeeping.
        del session_files[file_id]

    if not remove_file_from_chroma(file_id, session_id):
        failure = {
            'message': 'File Not Found',
            'status': 'fail'
        }
        return jsonify(failure), 404

    success = {
        'message': 'File deleted successfully',
        'status': 'success'
    }
    return jsonify(success), 200
220
+
221
+ # Clean up expired files and ChromaDB collections
222
def cleanup_resources():
    """Remove files, ChromaDB chunks and bookkeeping of timed-out sessions.

    A session counts as timed out when more than ``SESSION_TIMEOUT`` seconds
    have passed since its ``last_accessed`` timestamp.
    """
    now = time.time()

    # Iterate over a snapshot so entries can be deleted inside the loop.
    for session_id, session_data in list(active_sessions.items()):
        idle_for = now - session_data['last_accessed']
        if idle_for <= SESSION_TIMEOUT:
            continue

        # Delete the session's uploaded files from disk.
        for file_path, _filename in session_data.get('files', {}).values():
            if os.path.exists(file_path):
                os.remove(file_path)
                logger.info(f"Deleted file: {file_path}")

        # Drop every chunk stored for this session.
        db.delete(where={"session_id": session_id})
        logger.info(f"Deleted ChromaDB chunks for session: {session_id}")

        # Finally forget the session itself.
        del active_sessions[session_id]
240
+
241
@app.route("/ask_query", methods=['POST'])
def ask_query():
    """Answer a question posted in the ``query`` form field.

    Returns:
        The JSON produced from ``generate_query_response`` with status 200,
        or a JSON error with status 400 when ``query`` is missing or blank.
    """
    query = (request.form.get("query") or "").strip()

    # Reject missing/blank input here instead of sending None or an empty
    # string on to the vector search and the LLM.
    if not query:
        return jsonify({
            'message': 'Query must not be empty',
            'status': 'fail'
        }), 400

    resp = generate_query_response(query)
    return jsonify(resp), 200
248
+
249
# Start the scheduler
scheduler = BackgroundScheduler()
scheduler.add_job(cleanup_resources, 'interval', minutes=5)  # Run every 5 minutes
scheduler.start()


def shutdown_scheduler(exception=None):
    """Stop the background cleanup scheduler if it is still running.

    Args:
        exception: Unused; kept so the function can still be called with
            the argument a Flask teardown hook would pass.
    """
    if scheduler.running:
        scheduler.shutdown()


# Ensure scheduler stops on app exit.
# The function used to be registered with app.teardown_appcontext, but those
# handlers run every time an application context is popped - i.e. after each
# request - so the scheduler was stopped after the first request and expired
# sessions were never cleaned up. atexit runs it once, at interpreter exit.
import atexit  # imported here so this block does not rely on the file header

atexit.register(shutdown_scheduler)
259
 
260
  if __name__ == "__main__":
261
+ app.run(host="0.0.0.0",port=8000,debug=True,threaded=True)
262
 
263
 
templates/index.html CHANGED
@@ -22,7 +22,8 @@
22
  <body class="bg-gray-100 dark:bg-gray-900 text-gray-900 dark:text-gray-100">
23
 
24
 
25
- <div id="loader" class="fixed top-0 left-0 w-full h-full" style="background-color: rgba(50, 50, 50, 0.5);z-index: 1000;">
 
26
  <div class="text-center" style="position: absolute;top: 50%;left: 50%;transform: translate(-50%, -50%);">
27
  <div role="status">
28
  <svg aria-hidden="true"
@@ -115,6 +116,13 @@
115
  <!-- Chat Messages -->
116
  <div class="flex-1 overflow-y-auto p-4 space-y-4" id="chat-messages">
117
  <div class="flex items-start space-x-2">
 
 
 
 
 
 
 
118
  <div class="bg-gray-200 dark:bg-gray-700 p-3 rounded-lg">
119
  <p>
120
  Hello! How can I assist you today?
@@ -131,7 +139,7 @@
131
  I need help with my account.
132
  </p>
133
  </div>
134
- </div>
135
  </div>
136
  <!-- Message Input -->
137
  <form id="message-form">
@@ -166,6 +174,7 @@
166
  const dropZone = document.getElementById('dropzone'); // label of input
167
  const fileInput = document.getElementById('file-upload'); // input element
168
  const fileList = document.getElementById('file-list'); // div to display files uploaded
 
169
 
170
  var file_list = [];
171
  var file_count = 0;
@@ -187,6 +196,46 @@
187
  }
188
  });
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
 
192
  // message send
@@ -194,9 +243,26 @@
194
  e.preventDefault();
195
  const messageInput = document.getElementById('message-input');
196
  const message = messageInput.value.trim();
 
 
 
197
  if (message) {
198
- console.log('Sending message:', message);
199
- messageInput.value = '';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  }
201
  });
202
 
@@ -263,7 +329,7 @@
263
  <i class="fas fa-times"></i>
264
  </button>`;
265
  fileList.appendChild(listItem);
266
- set_loading("Uploading File "+ file.name);
267
 
268
  var formData = new FormData();
269
  formData.append('file', file);
 
22
  <body class="bg-gray-100 dark:bg-gray-900 text-gray-900 dark:text-gray-100">
23
 
24
 
25
+ <div id="loader" class="fixed top-0 left-0 w-full h-full"
26
+ style="background-color: rgba(50, 50, 50, 0.5);z-index: 1000;">
27
  <div class="text-center" style="position: absolute;top: 50%;left: 50%;transform: translate(-50%, -50%);">
28
  <div role="status">
29
  <svg aria-hidden="true"
 
116
  <!-- Chat Messages -->
117
  <div class="flex-1 overflow-y-auto p-4 space-y-4" id="chat-messages">
118
  <div class="flex items-start space-x-2">
119
+ <div class="bg-gray-200 dark:bg-gray-700 p-3 rounded-lg">
120
+ <p>
121
+ Upload your files, And you are ready to ask questions about them🫡.
122
+ </p>
123
+ </div>
124
+ </div>
125
+ <!-- <div class="flex items-start space-x-2">
126
  <div class="bg-gray-200 dark:bg-gray-700 p-3 rounded-lg">
127
  <p>
128
  Hello! How can I assist you today?
 
139
  I need help with my account.
140
  </p>
141
  </div>
142
+ </div> -->
143
  </div>
144
  <!-- Message Input -->
145
  <form id="message-form">
 
174
  const dropZone = document.getElementById('dropzone'); // label of input
175
  const fileInput = document.getElementById('file-upload'); // input element
176
  const fileList = document.getElementById('file-list'); // div to display files uploaded
177
+ const chat = document.getElementById('chat-messages');
178
 
179
  var file_list = [];
180
  var file_count = 0;
 
196
  }
197
  });
198
 
199
// Append the user's message to the chat as a right-aligned bubble.
// The text is set through textContent so anything the user typed is shown
// literally instead of being parsed as HTML (prevents markup/script injection).
function add_user_message(msg) {
    var item = document.createElement('div');
    item.className = "flex items-start space-x-2 flex-row-reverse";

    var bubble = document.createElement('div');
    bubble.className = "bg-blue-500 text-white p-3 rounded-lg";

    var text = document.createElement('p');
    text.textContent = msg;

    bubble.appendChild(text);
    item.appendChild(bubble);
    chat.appendChild(item);
}
209
+
210
// Append the assistant's answer to the chat, followed by either the list of
// source chunks (when resp.is_relevant is truthy) or a notice that the
// uploaded documents were not sufficient.
// All server-provided text is inserted as text nodes / textContent, never as
// HTML, so model output and file names cannot inject markup into the page.
function add_ai_response(resp) {
    var item = document.createElement('div');
    item.className = "flex items-start space-x-2";

    var bubble = document.createElement('div');
    bubble.className = "bg-gray-200 dark:bg-gray-700 p-3 rounded-lg";

    var answer = document.createElement('p');
    answer.textContent = resp.answer;
    bubble.appendChild(answer);

    // Local variable: the previous version assigned an undeclared `sources`,
    // which leaked into the global scope.
    var sources = document.createElement('p');
    sources.className = "text-sm text-gray-500 dark:text-gray-400 mt-2";

    if (resp.is_relevant) {
        sources.appendChild(document.createTextNode("Sources: "));
        (resp.sources || []).forEach(function (source) {
            var metadata = source.metadata || {};

            // Use the last path segment as the label and accept both "\" and
            // "/" separators; taking index 1 after splitting on "\" only
            // produced "undefined" for "/"-separated paths.
            var label = String(metadata.source || "").split(/[\\/]/).pop();
            if (metadata.page != undefined) {
                label += ", Page:" + metadata.page;
            }

            var link = document.createElement('a');
            link.className = "text-blue-500";
            link.href = "#";
            link.style.cssText = "background: #00000085;border-radius: 4px;padding: 1px 3px;margin: 2px;";
            link.textContent = label;

            sources.appendChild(link);
            sources.appendChild(document.createTextNode(","));
        });
    } else {
        sources.textContent = "The Documents provided are not sufficient to answer the query provided, So response might not be accurate.";
    }

    bubble.appendChild(sources);
    item.appendChild(bubble);
    chat.appendChild(item);
}
239
 
240
 
241
  // message send
 
243
  e.preventDefault();
244
  const messageInput = document.getElementById('message-input');
245
  const message = messageInput.value.trim();
246
+
247
+ add_user_message(message)
248
+
249
  if (message) {
250
+ $.ajax({
251
+ url: "/ask_query",
252
+ method: "POST",
253
+ data: {
254
+ query: message
255
+ },
256
+ success: function (response) {
257
+ resp = JSON.stringify(response)
258
+ resp = JSON.parse(resp)
259
+ console.log(resp);
260
+ add_ai_response(resp);
261
+ },
262
+ error: function (xhr, status, error) {
263
+ console.log(error);
264
+ }
265
+ })
266
  }
267
  });
268
 
 
329
  <i class="fas fa-times"></i>
330
  </button>`;
331
  fileList.appendChild(listItem);
332
+ set_loading("Uploading File " + file.name);
333
 
334
  var formData = new FormData();
335
  formData.append('file', file);