Prathamesh Sable committed on
Commit
0b42653
·
1 Parent(s): a30cf03

updated for flask but session not working

Browse files
Files changed (5) hide show
  1. .gitignore +2 -1
  2. __pycache__/utils.cpython-312.pyc +0 -0
  3. app.py +112 -125
  4. requirements.txt +3 -1
  5. utils.py +80 -12
.gitignore CHANGED
@@ -6,4 +6,5 @@ chroma
6
  /trash
7
  uploads/
8
  /flask_session
9
- log.txt
 
 
6
  /trash
7
  uploads/
8
  /flask_session
9
+ log.txt
10
+ *.db
__pycache__/utils.cpython-312.pyc ADDED
Binary file (3.91 kB). View file
 
app.py CHANGED
@@ -4,20 +4,23 @@ from flask_session import Session
4
  from werkzeug.utils import secure_filename
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
 
7
- from langchain.text_splitter import RecursiveCharacterTextSplitter
8
- from langchain_community.document_loaders import PyPDFLoader,UnstructuredWordDocumentLoader,TextLoader,UnstructuredHTMLLoader,UnstructuredMarkdownLoader
9
  from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
10
  import google.generativeai as genai
11
 
12
  from langchain_chroma import Chroma
13
 
 
 
 
 
14
  import os
15
  from dotenv import load_dotenv
16
  import time
17
  import shutil
18
  import logging
 
19
 
20
- logging.basicConfig(filename='log.txt',filemode='a', level=logging.DEBUG,
21
  format='%(asctime)s - %(levelname)s - %(message)s')
22
  logger = logging.getLogger()
23
 
@@ -52,104 +55,84 @@ llm_model = genai.GenerativeModel("gemini-1.5-flash")
52
 
53
  app = Flask(__name__)
54
 
55
- # initialize session
56
- app.config["SESSION_PERMANENT"] = True
57
- app.config["PERMANENT_SESSION_LIFETIME"] = SESSION_TIMEOUT
58
- app.config["SESSION_TYPE"] = "filesystem"
59
- Session(app)
60
 
61
- # Initialize ChromaDB client
62
- db = Chroma(persist_directory=CHROMA_PATH, embedding_function=hugging_face_ef)
63
- active_sessions = dict()
 
 
 
 
64
 
 
 
 
65
 
66
- def remove_file_from_chroma(file_id,session_id):
67
- # Get chunks for session
68
- session_chunks = db.get(where={"session_id": session_id})
69
-
70
- # Further filter by file_id
71
- ids_to_del = []
72
- for i in range(len(session_chunks['ids'])):
73
- if session_chunks['metadatas'][i]['file_id'] == str(file_id):
74
- ids_to_del.append(session_chunks['ids'][i])
75
-
76
- # delete chunks from db where metadata file_id is equal to file_id if there are ;)
77
- if len(ids_to_del) > 0:
78
- db.delete(ids=ids_to_del)
79
- return True
80
- return False
81
-
82
- def add_file_to_chroma(file_path, file_id, session_id):
83
- """Add file chunks to ChromaDB."""
84
- extension = file_path.split(".")[-1]
85
- loader_map = {
86
- "pdf": PyPDFLoader,
87
- "docx": UnstructuredWordDocumentLoader,
88
- "txt": TextLoader,
89
- "html": UnstructuredHTMLLoader,
90
- "md": UnstructuredMarkdownLoader,
91
- }
92
- if extension not in loader_map:
93
- raise ValueError(f"Unsupported file type: {extension}")
94
-
95
- loader = loader_map[extension](file_path)
96
- documents = loader.load()
97
-
98
- text_splitter = RecursiveCharacterTextSplitter(
99
- chunk_size=1500,
100
- chunk_overlap=200,
101
- length_function=len,
102
- add_start_index=True
103
- )
104
- texts = text_splitter.split_documents(documents)
105
-
106
- # Add metadata
107
- for text in texts:
108
- text.metadata.update({"file_id": file_id, "session_id": session_id})
109
-
110
- # Save to ChromaDB
111
- db.add_documents(texts,embedding=hugging_face_ef)
112
- logger.info(f"Added file '{file_path}' to ChromaDB for session '{session_id}'.")
113
 
114
- def generate_query_response(query):
115
- response = dict()
116
- top_related = db.similarity_search_with_relevance_scores(query,filter={"session_id": session.sid},k=4)
117
-
118
- response['is_relevant'] = top_related[0][1] >= 0.6
119
-
120
- # filter chunks with score > 0.3
121
- # top_related = [chunk for chunk in top_related if chunk[1] > 0.3]
122
-
123
- context = "\n".join([chunk[0].page_content for chunk in top_related])
124
-
125
- prompt = PROMPT_TEMPLATE.format(context = context,query = query)
126
-
127
- # print(top_related)
128
-
129
- response['answer'] = llm_model.generate_content(prompt).text
130
- response['sources'] = [{
131
- "page_content":chunk[0].page_content,
132
- "score" : chunk[1],
133
- "metadata":chunk[0].metadata
134
- } for chunk in top_related]
135
-
136
- return response
137
 
138
- @app.before_request
139
- def update_session():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  session_id = session.sid
141
- if session_id not in active_sessions:
142
- active_sessions[session_id] = {
143
- 'last_accessed': time.time(),
144
- 'files': dict()
145
- }
146
  logger.info(f"CREATED NEW SESSION with ID {session_id}")
147
  else:
148
- active_sessions[session_id]['last_accessed'] = time.time()
 
 
 
 
 
149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
  @app.route('/')
152
  def index():
 
153
  return render_template('index.html') # Serve the HTML file we created
154
 
155
  # add files
@@ -171,44 +154,43 @@ def upload_file():
171
  file_path = os.path.join(UPLOAD_FOLDER, filename)
172
  file.save(file_path)
173
 
174
- # Update session data
175
- active_sessions[session_id]['files'][file_id] = (file_path,file.filename)
176
- active_sessions[session_id]['last_accessed'] = time.time()
177
-
178
  # Add file chunks to ChromaDB
179
- add_file_to_chroma(file_path, file_id, session_id)
180
 
181
  return jsonify({'message': 'File uploaded successfully', 'status': 'success'}), 200
182
 
183
 
184
  @app.route('/get-files',methods=["GET"])
185
  def get_files():
186
- return jsonify({"files":active_sessions[session.sid]['files']}),200
 
187
 
188
  @app.route('/status',methods=["GET"])
189
  def status():
 
190
  # return all data from chroma db
191
  return jsonify({
192
- "Active_sessions":active_sessions,
193
- "chroma_data":db.get()
194
  }),200
 
 
 
 
195
 
196
  @app.route('/remove-file',methods=["POST"])
197
  def remove_file():
198
  file_id = request.form.get('file_id')
199
  session_id = session.sid
200
- if file_id in active_sessions[session_id]['files']:
201
- file_path = active_sessions[session_id]['files'][file_id][0]
202
- # remove file from upload folder
203
- if os.path.exists(file_path):
204
- os.remove(file_path)
205
- logger.info(f"Deleted file: {file_path}")
206
- # Remove file from session
207
- del active_sessions[session_id]['files'][file_id]
208
- else:
209
- logger.info(f"File not found in session: {file_id}")
210
 
211
- if remove_file_from_chroma(file_id,session_id):
 
 
 
 
212
  return jsonify({
213
  'message': 'File deleted successfully',
214
  'status': 'success'
@@ -223,30 +205,32 @@ def remove_file():
223
  def cleanup_resources():
224
  """Clean up expired files and ChromaDB collections."""
225
  now = time.time()
226
- for session_id, session_data in list(active_sessions.items()):
227
- if now - session_data['last_accessed'] > SESSION_TIMEOUT:
228
- # Remove files
229
- files = session_data.get('files', {})
230
- for file_id, (file_path, filename) in files.items():
231
- if os.path.exists(file_path):
232
- os.remove(file_path)
233
- logger.info(f"Deleted file: {file_path}")
234
-
235
- # Remove ChromaDB chunks
236
- db.delete(where={"session_id": session_id})
237
- logger.info(f"Deleted ChromaDB chunks for session: {session_id}")
238
-
239
- # Remove session
240
- del active_sessions[session_id]
241
-
 
242
  @app.route("/ask_query", methods=['POST'])
243
  def ask_query():
244
  query = request.form.get("query")
245
 
246
- resp = generate_query_response(query)
247
 
248
  return jsonify(resp),200
249
 
 
250
  # Start the scheduler
251
  scheduler = BackgroundScheduler()
252
  scheduler.add_job(cleanup_resources, 'interval', minutes=5) # Run every 5 minutes
@@ -255,9 +239,12 @@ scheduler.start()
255
  # Ensure scheduler stops on app exit
256
  @app.teardown_appcontext
257
  def shutdown_scheduler(exception=None):
 
 
258
  if scheduler.running:
259
  scheduler.shutdown()
260
 
 
261
  if __name__ == "__main__":
262
  app.run(host="0.0.0.0",port=8000,debug=True,threaded=True)
263
 
 
4
  from werkzeug.utils import secure_filename
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
 
 
 
7
  from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
8
  import google.generativeai as genai
9
 
10
  from langchain_chroma import Chroma
11
 
12
+ from utils import add_file_to_chroma,remove_file_from_chroma,generate_query_response,remove_session_data_from_chroma
13
+
14
+ import sqlite3
15
+
16
  import os
17
  from dotenv import load_dotenv
18
  import time
19
  import shutil
20
  import logging
21
+ from flask_cors import CORS
22
 
23
+ logging.basicConfig(filename='log.txt',filemode='w', level=logging.DEBUG,
24
  format='%(asctime)s - %(levelname)s - %(message)s')
25
  logger = logging.getLogger()
26
 
 
55
 
56
  app = Flask(__name__)
57
 
58
+ app.secret_key = os.getenv('SECRET_KEY', 'default_secret_key')
59
+ CORS(app,supports_credentials=True)
 
 
 
60
 
61
+ # # initialize session
62
+ # app.config["SESSION_PERMANENT"] = True
63
+ # # app.config["SESSION_TYPE"] = "filesystem"
64
+ # app.config['SESSION_COOKIE_SECURE'] = False # Set to True if using HTTPS
65
+ # app.config['SESSION_COOKIE_HTTPONLY'] = True
66
+ # app.config['SESSION_COOKIE_SAMESITE'] = 'Lax'
67
+ # app.config["SESSION_USE_SIGNER"] = True
68
 
69
+ app.config["SESSION_TYPE"] = "sqlalchemy"
70
+ app.config["SESSION_SQLALCHEMY_TABLE"] = "flask_session"
71
+ app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///flask_session.db"
72
 
73
+ Session(app)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
+ # Initialize ChromaDB client
76
+ db = Chroma(persist_directory=CHROMA_PATH, embedding_function=hugging_face_ef)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
+ sqldb = sqlite3.connect("sessions.db",check_same_thread=False)
79
+ cursor = sqldb.cursor()
80
+
81
+ def init_db(sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
82
+ cursor.execute("""
83
+ CREATE TABLE IF NOT EXISTS sessions (
84
+ session_id TEXT PRIMARY KEY,
85
+ last_accessed DATETIME DEFAULT CURRENT_TIMESTAMP,
86
+ CREATED_AT DATETIME DEFAULT CURRENT_TIMESTAMP
87
+ );""")
88
+ cursor.execute("""CREATE TABLE IF NOT EXISTS files (
89
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
90
+ session_id TEXT,
91
+ file_id TEXT,
92
+ file_path TEXT,
93
+ file_name TEXT,
94
+ FOREIGN KEY (session_id) REFERENCES sessions(session_id) ON DELETE CASCADE
95
+ );""")
96
+ sqldb.commit()
97
+
98
+ init_db(sqldb,cursor)
99
+
100
+
101
def create_or_update_session():
    """Register the current Flask session in SQLite or refresh its timestamp.

    Reads the session id from ``session.sid``. A new id gets a row in the
    ``sessions`` table (``last_accessed`` defaults to CURRENT_TIMESTAMP);
    a known id has ``last_accessed`` bumped to CURRENT_TIMESTAMP.
    """
    session_id = session.sid

    # INSERT OR IGNORE makes "create if missing" a single statement, so two
    # concurrent requests for the same new session cannot both pass an
    # existence check and then collide on the primary key. Using
    # sqldb.execute() gives this call its own cursor instead of sharing the
    # module-level one with other request threads.
    inserted = sqldb.execute(
        "INSERT OR IGNORE INTO sessions (session_id) VALUES (?)",
        (session_id,),
    ).rowcount

    if inserted:
        sqldb.commit()
        logger.info(f"CREATED NEW SESSION with ID {session_id}")
    else:
        sqldb.execute(
            "UPDATE sessions SET last_accessed = CURRENT_TIMESTAMP WHERE session_id = ?",
            (session_id,),
        )
        sqldb.commit()
        logger.info(f"UPDATED SESSION with ID {session_id}")
111
+
112
+ def pure_update_session(session_id,cursor):
113
+ cursor.execute("UPDATE sessions SET last_accessed = CURRENT_TIMESTAMP WHERE session_id = ?", (session_id,))
114
 
115
+ def add_file_to_session(session_id, file_id, file_path, file_name,sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
116
+ cursor.execute("INSERT INTO files (session_id, file_id, file_path, file_name) VALUES (?, ?, ?, ?)", (session_id, file_id, file_path, file_name))
117
+ pure_update_session(session_id,cursor)
118
+ sqldb.commit()
119
+ logger.info(f"ADDED FILE with ID {file_id} to SESSION with ID {session_id}")
120
+
121
+ def remove_file_from_session(session_id, file_id,sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
122
+ cursor.execute("DELETE FROM files WHERE session_id = ? AND file_id = ?", (session_id, file_id))
123
+ pure_update_session(session_id,cursor)
124
+ sqldb.commit()
125
+ logger.info(f"REMOVED FILE with ID {file_id} from SESSION with ID {session_id}")
126
+
127
+
128
+ def get_file_list(session_id,sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
129
+ cursor.execute("SELECT file_id, file_path, file_name FROM files WHERE session_id = ?", (session_id,))
130
+ return cursor.fetchall()
131
+
132
 
133
  @app.route('/')
134
  def index():
135
+ create_or_update_session()
136
  return render_template('index.html') # Serve the HTML file we created
137
 
138
  # add files
 
154
  file_path = os.path.join(UPLOAD_FOLDER, filename)
155
  file.save(file_path)
156
 
157
+ # Update session data
158
+ add_file_to_session(session_id, file_id, file_path, filename,sqldb,cursor)
159
+
 
160
  # Add file chunks to ChromaDB
161
+ add_file_to_chroma(file_path, file_id, session_id,hugging_face_ef,db,logger)
162
 
163
  return jsonify({'message': 'File uploaded successfully', 'status': 'success'}), 200
164
 
165
 
166
  @app.route('/get-files',methods=["GET"])
167
  def get_files():
168
+ return jsonify({"files":get_file_list(session.sid,sqldb,cursor)}),200
169
+
170
 
171
  @app.route('/status',methods=["GET"])
172
  def status():
173
+ print(request.cookies.keys())
174
  # return all data from chroma db
175
  return jsonify({
176
+ "current_session":session.sid,
177
+ "z-chroma_data":db.get()
178
  }),200
179
+ @app.after_request
180
+ def check_response_cookie(response):
181
+ logger.debug(f"Response Cookies: {response.headers.get('Set-Cookie')}")
182
+ return response
183
 
184
  @app.route('/remove-file',methods=["POST"])
185
  def remove_file():
186
  file_id = request.form.get('file_id')
187
  session_id = session.sid
 
 
 
 
 
 
 
 
 
 
188
 
189
+ # remove file entry from session
190
+ remove_file_from_session(session_id, file_id,sqldb,cursor)
191
+
192
+ # remove file chunks from chroma
193
+ if remove_file_from_chroma(file_id,session_id,db):
194
  return jsonify({
195
  'message': 'File deleted successfully',
196
  'status': 'success'
 
205
def cleanup_resources():
    """Delete sessions idle for longer than SESSION_TIMEOUT, with their data.

    For every expired session this removes its rows from the ``files`` and
    ``sessions`` tables and then asks ChromaDB to drop the chunks tagged
    with that session id. Does nothing beyond logging when no session has
    expired.
    """
    # last_accessed is written by SQLite's CURRENT_TIMESTAMP, i.e. UTC text
    # in "YYYY-MM-DD HH:MM:SS" form. The cutoff therefore has to be produced
    # by SQLite in the same form; comparing the column with a time.time()
    # float never matches any row. SESSION_TIMEOUT is treated as seconds,
    # as in the original arithmetic against time.time().
    cutoff_modifier = f"-{int(SESSION_TIMEOUT)} seconds"
    rows = sqldb.execute(
        "SELECT session_id FROM sessions WHERE last_accessed < datetime('now', ?)",
        (cutoff_modifier,),
    ).fetchall()

    # fetchall() yields 1-tuples; everything downstream needs plain ids.
    expired_sessions = [row[0] for row in rows]
    logger.info(f"Expired sessions: {expired_sessions}")

    if not expired_sessions:
        return

    id_params = [(session_id,) for session_id in expired_sessions]

    # Delete file rows explicitly: the ON DELETE CASCADE on files only fires
    # when "PRAGMA foreign_keys = ON" has been issued on the connection.
    sqldb.executemany("DELETE FROM files WHERE session_id = ?", id_params)
    sqldb.executemany("DELETE FROM sessions WHERE session_id = ?", id_params)
    sqldb.commit()

    # Remove the expired sessions' chunks from ChromaDB.
    remove_session_data_from_chroma(expired_sessions, db, logger)
223
+
224
+
225
@app.route("/ask_query", methods=['POST'])
def ask_query():
    """Answer a question against the documents of the current session.

    Expects a ``query`` form field. Returns the JSON produced by
    ``generate_query_response`` with status 200, or a 400 JSON error when
    the field is missing or blank.
    """
    query = request.form.get("query")

    # Reject empty input up front instead of passing None / "" to the
    # vector search and the LLM.
    if query is None or not query.strip():
        return jsonify({'message': 'Query is required', 'status': 'error'}), 400

    resp = generate_query_response(query, session.sid, db, llm_model, PROMPT_TEMPLATE)

    return jsonify(resp), 200
232
 
233
+ """
234
  # Start the scheduler
235
  scheduler = BackgroundScheduler()
236
  scheduler.add_job(cleanup_resources, 'interval', minutes=5) # Run every 5 minutes
 
239
  # Ensure scheduler stops on app exit
240
  @app.teardown_appcontext
241
  def shutdown_scheduler(exception=None):
242
+ if exception is not None:
243
+ logger.error("Scheduler shutdown failed", exc_info=exception)
244
  if scheduler.running:
245
  scheduler.shutdown()
246
 
247
+ """
248
  if __name__ == "__main__":
249
  app.run(host="0.0.0.0",port=8000,debug=True,threaded=True)
250
 
requirements.txt CHANGED
@@ -17,4 +17,6 @@ flask
17
  werkzeug
18
  Flask-Session
19
  apscheduler
20
- gunicorn
 
 
 
17
  werkzeug
18
  Flask-Session
19
  apscheduler
20
+ gunicorn
21
+ flask-cors
22
+ flask_sqlalchemy
utils.py CHANGED
@@ -1,16 +1,84 @@
 
 
 
1
 
2
- @app.route('/wait',methods=["POST"])
3
- def wait():
4
- time.sleep(int(request.form.get("time")))
5
- return jsonify({"status":"ok"}),200
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- @app.route("/ai",methods=["POST"])
8
- def aiPost():
9
- print("Post /ai called")
10
- json_content = request.json
11
- query = json_content.get("query")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- print("Query:",query)
 
 
 
 
 
14
 
15
- response_answer = llm_model.generate_content(query)
16
- return response_answer.text
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain_community.document_loaders import PyPDFLoader,UnstructuredWordDocumentLoader,TextLoader,UnstructuredHTMLLoader,UnstructuredMarkdownLoader
3
+ import os
4
 
5
def remove_file_from_chroma(file_id, session_id, db):
    """Delete every stored chunk of one file within one session.

    Args:
        file_id: Identifier of the file; compared as ``str(file_id)``
            against each chunk's ``file_id`` metadata value.
        session_id: Session whose chunks are searched.
        db: Vector store exposing ``get(where=...)`` and ``delete(ids=...)``.

    Returns:
        True if at least one chunk was deleted, False otherwise.
    """
    # Fetch all chunks of the session, then narrow down by file id.
    session_chunks = db.get(where={"session_id": session_id})
    wanted = str(file_id)

    # Chunks with missing metadata or without a "file_id" key are skipped
    # rather than raising.
    ids_to_del = [
        chunk_id
        for chunk_id, metadata in zip(session_chunks['ids'], session_chunks['metadatas'])
        if (metadata or {}).get('file_id') == wanted
    ]

    if not ids_to_del:
        return False

    db.delete(ids=ids_to_del)
    return True
20
 
21
def remove_session_data_from_chroma(session_ids, db, logger):
    """Delete all chunks belonging to the given sessions from the vector store.

    Args:
        session_ids: Iterable of session ids. Items may be plain ids or
            one-element rows (tuples/lists) as returned by
            ``cursor.fetchall()``; rows are unwrapped to their first column.
        db: Vector store exposing ``delete(where=...)``.
        logger: Logger used to report the deletion.
    """
    ids = [
        item[0] if isinstance(item, (tuple, list)) else item
        for item in session_ids
    ]

    # Nothing to delete: skip the call instead of sending an empty "$in"
    # filter to the store.
    if not ids:
        return

    db.delete(where={"session_id": {"$in": ids}})
    logger.info(f"Deleted ChromaDB chunks for sessions: {ids}")
24
+
25
+
26
def add_file_to_chroma(file_path, file_id, session_id, hugging_face_ef, db, logger):
    """Load a file, split it into chunks and store them in ChromaDB.

    Every chunk is tagged with ``file_id`` and ``session_id`` metadata.
    The file at ``file_path`` is removed from disk afterwards, whether or
    not indexing succeeded.

    Args:
        file_path: Path of the uploaded file on disk.
        file_id: Identifier stored in each chunk's metadata.
        session_id: Session identifier stored in each chunk's metadata.
        hugging_face_ef: Embedding function forwarded to ``db.add_documents``.
        db: Vector store exposing ``add_documents``.
        logger: Logger used to report success.

    Raises:
        ValueError: If the file extension is not one of
            pdf, docx, txt, html, md.
    """
    loader_map = {
        "pdf": PyPDFLoader,
        "docx": UnstructuredWordDocumentLoader,
        "txt": TextLoader,
        "html": UnstructuredHTMLLoader,
        "md": UnstructuredMarkdownLoader,
    }

    try:
        # splitext handles paths without a dot (empty extension) and
        # lower() accepts upper-case extensions such as "REPORT.PDF".
        extension = os.path.splitext(file_path)[1].lstrip(".").lower()
        if extension not in loader_map:
            raise ValueError(f"Unsupported file type: {extension}")

        documents = loader_map[extension](file_path).load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=200,
            length_function=len,
            add_start_index=True,
        )
        texts = text_splitter.split_documents(documents)

        # Tag each chunk so it can later be found per file and per session.
        for text in texts:
            text.metadata.update({"file_id": file_id, "session_id": session_id})

        # Save to ChromaDB
        db.add_documents(texts, embedding=hugging_face_ef)
    finally:
        # The upload is only needed for indexing; remove it even when
        # loading or indexing fails so failed uploads do not pile up.
        if os.path.exists(file_path):
            os.remove(file_path)

    logger.info(f"Added file '{file_path}' to ChromaDB for session '{session_id}'.")
61
+
62
def generate_query_response(query, session_id, db, llm_model, PROMPT_TEMPLATE):
    """Answer ``query`` using the session's most similar stored chunks.

    Args:
        query: The user's question.
        session_id: Only chunks tagged with this session id are searched.
        db: Vector store exposing ``similarity_search_with_relevance_scores``.
        llm_model: Model exposing ``generate_content(prompt).text``.
        PROMPT_TEMPLATE: Format string with ``{context}`` and ``{query}``
            placeholders.

    Returns:
        dict with keys:
            ``is_relevant`` - True when the best chunk scores at least 0.6
            (False when no chunk was found),
            ``answer`` - text generated by the model,
            ``sources`` - list of ``page_content`` / ``score`` / ``metadata``
            dicts, one per retrieved chunk.
    """
    top_related = db.similarity_search_with_relevance_scores(
        query, filter={"session_id": session_id}, k=4
    )

    # A session without indexed chunks yields an empty result; guard the
    # [0] access and coerce to a plain bool for JSON serialisation.
    is_relevant = bool(top_related and top_related[0][1] >= 0.6)

    context = "\n".join(document.page_content for document, _score in top_related)
    prompt = PROMPT_TEMPLATE.format(context=context, query=query)

    return {
        'is_relevant': is_relevant,
        'answer': llm_model.generate_content(prompt).text,
        'sources': [
            {
                "page_content": document.page_content,
                "score": score,
                "metadata": document.metadata,
            }
            for document, score in top_related
        ],
    }