Spaces:

prathameshks
/

Multi-File-Chatting

Sleeping

App Files Files Community

Prathamesh Sable committed on Dec 21, 2024

Commit

0b42653

1 Parent(s): a30cf03

updated for flask but session not working

Browse files

Files changed (5) hide show

.gitignore +2 -1
__pycache__/utils.cpython-312.pyc +0 -0
app.py +112 -125
requirements.txt +3 -1
utils.py +80 -12

.gitignore CHANGED Viewed

@@ -6,4 +6,5 @@ chroma
 /trash
 uploads/
 /flask_session
-log.txt

 /trash
 uploads/
 /flask_session
+log.txt
+*.db

__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (3.91 kB). View file

app.py CHANGED Viewed

@@ -4,20 +4,23 @@ from flask_session import Session
 from werkzeug.utils import secure_filename
 from apscheduler.schedulers.background import BackgroundScheduler
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyPDFLoader,UnstructuredWordDocumentLoader,TextLoader,UnstructuredHTMLLoader,UnstructuredMarkdownLoader
 from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
 import google.generativeai as genai
 from langchain_chroma import Chroma
 import os
 from dotenv import load_dotenv
 import time
 import shutil
 import logging
-logging.basicConfig(filename='log.txt',filemode='a', level=logging.DEBUG,
                     format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger()
@@ -52,104 +55,84 @@ llm_model = genai.GenerativeModel("gemini-1.5-flash")
 app = Flask(__name__)
-# initialize session
-app.config["SESSION_PERMANENT"] = True
-app.config["PERMANENT_SESSION_LIFETIME"] = SESSION_TIMEOUT
-app.config["SESSION_TYPE"] = "filesystem"
-Session(app)
-# Initialize ChromaDB client
-db = Chroma(persist_directory=CHROMA_PATH, embedding_function=hugging_face_ef)
-active_sessions = dict()
-def remove_file_from_chroma(file_id,session_id):
-    # Get chunks for session
-    session_chunks = db.get(where={"session_id": session_id})
-    # Further filter by file_id
-    ids_to_del = []
-    for i in range(len(session_chunks['ids'])):
-        if session_chunks['metadatas'][i]['file_id'] == str(file_id):
-            ids_to_del.append(session_chunks['ids'][i])
-    # delete chunks from db where metadata file_id is equal to file_id if there are ;)
-    if len(ids_to_del) > 0:
-        db.delete(ids=ids_to_del)
-        return True
-    return False
-def add_file_to_chroma(file_path, file_id, session_id):
-    """Add file chunks to ChromaDB."""
-    extension = file_path.split(".")[-1]
-    loader_map = {
-        "pdf": PyPDFLoader,
-        "docx": UnstructuredWordDocumentLoader,
-        "txt": TextLoader,
-        "html": UnstructuredHTMLLoader,
-        "md": UnstructuredMarkdownLoader,
-    }
-    if extension not in loader_map:
-        raise ValueError(f"Unsupported file type: {extension}")
-    loader = loader_map[extension](file_path)
-    documents = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=1500,
-        chunk_overlap=200,
-        length_function=len,
-        add_start_index=True
-    )
-    texts = text_splitter.split_documents(documents)
-    # Add metadata
-    for text in texts:
-        text.metadata.update({"file_id": file_id, "session_id": session_id})
-    # Save to ChromaDB
-    db.add_documents(texts,embedding=hugging_face_ef)
-    logger.info(f"Added file '{file_path}' to ChromaDB for session '{session_id}'.")
-def generate_query_response(query):
-    response = dict()
-    top_related = db.similarity_search_with_relevance_scores(query,filter={"session_id": session.sid},k=4)
-    response['is_relevant'] = top_related[0][1] >= 0.6
-    # filter chunks with score > 0.3
-    # top_related = [chunk for chunk in top_related if chunk[1] > 0.3]
-    context = "\n".join([chunk[0].page_content for chunk in top_related])
-    prompt = PROMPT_TEMPLATE.format(context = context,query = query)
-    # print(top_related)
-    response['answer'] = llm_model.generate_content(prompt).text
-    response['sources'] = [{
-                               "page_content":chunk[0].page_content,
-                               "score" : chunk[1],
-                               "metadata":chunk[0].metadata
-                           } for chunk in top_related]
-    return response
-@app.before_request
-def update_session():
     session_id = session.sid
-    if session_id not in active_sessions:
-        active_sessions[session_id] = {
-            'last_accessed': time.time(),
-            'files': dict()
-        }
         logger.info(f"CREATED NEW SESSION with ID {session_id}")
     else:
-        active_sessions[session_id]['last_accessed'] = time.time()
 @app.route('/')
 def index():
     return render_template('index.html')  # Serve the HTML file we created
 # add files
@@ -171,44 +154,43 @@ def upload_file():
     file_path = os.path.join(UPLOAD_FOLDER, filename)
     file.save(file_path)
-    # Update session data
-    active_sessions[session_id]['files'][file_id] = (file_path,file.filename)
-    active_sessions[session_id]['last_accessed'] = time.time()
     # Add file chunks to ChromaDB
-    add_file_to_chroma(file_path, file_id, session_id)
     return jsonify({'message': 'File uploaded successfully', 'status': 'success'}), 200
 @app.route('/get-files',methods=["GET"])
 def get_files():
-    return jsonify({"files":active_sessions[session.sid]['files']}),200
 @app.route('/status',methods=["GET"])
 def status():
     # return all data from chroma db
     return jsonify({
-        "Active_sessions":active_sessions,
-        "chroma_data":db.get()
     }),200
 @app.route('/remove-file',methods=["POST"])
 def remove_file():
     file_id = request.form.get('file_id')
     session_id = session.sid
-    if file_id in active_sessions[session_id]['files']:
-        file_path = active_sessions[session_id]['files'][file_id][0]
-        # remove file from upload folder
-        if os.path.exists(file_path):
-            os.remove(file_path)
-            logger.info(f"Deleted file: {file_path}")
-        # Remove file from session
-        del active_sessions[session_id]['files'][file_id]
-    else:
-        logger.info(f"File not found in session: {file_id}")
-    if remove_file_from_chroma(file_id,session_id):
         return jsonify({
             'message': 'File deleted successfully',
             'status': 'success'
@@ -223,30 +205,32 @@ def remove_file():
 def cleanup_resources():
     """Clean up expired files and ChromaDB collections."""
     now = time.time()
-    for session_id, session_data in list(active_sessions.items()):
-        if now - session_data['last_accessed'] > SESSION_TIMEOUT:
-            # Remove files
-            files = session_data.get('files', {})
-            for file_id, (file_path, filename) in files.items():
-                if os.path.exists(file_path):
-                    os.remove(file_path)
-                    logger.info(f"Deleted file: {file_path}")
-            # Remove ChromaDB chunks
-            db.delete(where={"session_id": session_id})
-            logger.info(f"Deleted ChromaDB chunks for session: {session_id}")
-            # Remove session
-            del active_sessions[session_id]
 @app.route("/ask_query", methods=['POST'])
 def ask_query():
     query = request.form.get("query")
-    resp = generate_query_response(query)
     return jsonify(resp),200
 # Start the scheduler
 scheduler = BackgroundScheduler()
 scheduler.add_job(cleanup_resources, 'interval', minutes=5)  # Run every 5 minutes
@@ -255,9 +239,12 @@ scheduler.start()
 # Ensure scheduler stops on app exit
 @app.teardown_appcontext
 def shutdown_scheduler(exception=None):
     if scheduler.running:
         scheduler.shutdown()
 if __name__ == "__main__":
     app.run(host="0.0.0.0",port=8000,debug=True,threaded=True)

 from werkzeug.utils import secure_filename
 from apscheduler.schedulers.background import BackgroundScheduler
 from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
 import google.generativeai as genai
 from langchain_chroma import Chroma
+from utils import add_file_to_chroma,remove_file_from_chroma,generate_query_response,remove_session_data_from_chroma
+import sqlite3
 import os
 from dotenv import load_dotenv
 import time
 import shutil
 import logging
+from flask_cors import CORS
+logging.basicConfig(filename='log.txt',filemode='w', level=logging.DEBUG,
                     format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger()
 app = Flask(__name__)
+app.secret_key = os.getenv('SECRET_KEY', 'default_secret_key')
+CORS(app,supports_credentials=True)
+# # initialize session
+# app.config["SESSION_PERMANENT"] = True
+# # app.config["SESSION_TYPE"] = "filesystem"
+# app.config['SESSION_COOKIE_SECURE'] = False  # Set to True if using HTTPS
+# app.config['SESSION_COOKIE_HTTPONLY'] = True
+# app.config['SESSION_COOKIE_SAMESITE'] = 'Lax'
+# app.config["SESSION_USE_SIGNER"] = True
+app.config["SESSION_TYPE"] = "sqlalchemy"
+app.config["SESSION_SQLALCHEMY_TABLE"] = "flask_session"
+app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///flask_session.db"
+Session(app)
+# Initialize ChromaDB client
+db = Chroma(persist_directory=CHROMA_PATH, embedding_function=hugging_face_ef)
+sqldb = sqlite3.connect("sessions.db",check_same_thread=False)
+cursor = sqldb.cursor()
def init_db(sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
    """Create the ``sessions`` and ``files`` tables if they do not exist yet.

    Args:
        sqldb: Connection that is committed once both statements have run.
        cursor: Cursor used to execute the two CREATE TABLE statements.
    """
    # NOTE(review): SQLite only enforces FOREIGN KEY / ON DELETE CASCADE when
    # "PRAGMA foreign_keys = ON" is issued on the connection; nothing in this
    # block does so - confirm whether the cascade is expected to fire.
    sessions_ddl = """
        CREATE TABLE IF NOT EXISTS sessions (
            session_id TEXT PRIMARY KEY,
            last_accessed DATETIME DEFAULT CURRENT_TIMESTAMP,
            CREATED_AT DATETIME DEFAULT CURRENT_TIMESTAMP
        );"""
    files_ddl = """CREATE TABLE IF NOT EXISTS files (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            session_id TEXT,
            file_id TEXT,
            file_path TEXT,
            file_name TEXT,
            FOREIGN KEY (session_id) REFERENCES sessions(session_id) ON DELETE CASCADE
        );"""
    for ddl in (sessions_ddl, files_ddl):
        cursor.execute(ddl)
    sqldb.commit()
+init_db(sqldb,cursor)
def create_or_update_session():
    """Record the current Flask session in the ``sessions`` table.

    Looks the session id up first: an existing row gets its
    ``last_accessed`` column refreshed, otherwise a new row is inserted.
    Either way the change is committed and logged.
    """
    sid = session.sid
    known = cursor.execute("SELECT * FROM sessions WHERE session_id = ?", (sid,)).fetchone()
    if known is not None:
        # Already tracked: only bump the access timestamp.
        cursor.execute("UPDATE sessions SET last_accessed = CURRENT_TIMESTAMP WHERE session_id = ?", (sid,))
        sqldb.commit()
        logger.info(f"UPDATED SESSION with ID {sid}")
        return
    cursor.execute("INSERT INTO sessions (session_id) VALUES (?)", (sid,))
    sqldb.commit()
    logger.info(f"CREATED NEW SESSION with ID {sid}")
def pure_update_session(session_id,cursor):
    """Set ``last_accessed`` to CURRENT_TIMESTAMP for one session row.

    Does not commit; the caller is expected to commit on its connection.
    """
    touch_sql = "UPDATE sessions SET last_accessed = CURRENT_TIMESTAMP WHERE session_id = ?"
    params = (session_id,)
    cursor.execute(touch_sql, params)
def add_file_to_session(session_id, file_id, file_path, file_name,sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
    """Insert one uploaded file into the ``files`` table for a session.

    Also refreshes the session's ``last_accessed`` timestamp via
    ``pure_update_session`` and commits both changes together.
    """
    insert_sql = "INSERT INTO files (session_id, file_id, file_path, file_name) VALUES (?, ?, ?, ?)"
    row = (session_id, file_id, file_path, file_name)
    cursor.execute(insert_sql, row)
    pure_update_session(session_id,cursor)
    sqldb.commit()
    logger.info(f"ADDED FILE with ID {file_id} to SESSION with ID {session_id}")
def remove_file_from_session(session_id, file_id,sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
    """Delete a file's row(s) from the ``files`` table for a session.

    Also refreshes the session's ``last_accessed`` timestamp via
    ``pure_update_session`` and commits both changes together.
    """
    delete_sql = "DELETE FROM files WHERE session_id = ? AND file_id = ?"
    key = (session_id, file_id)
    cursor.execute(delete_sql, key)
    pure_update_session(session_id,cursor)
    sqldb.commit()
    logger.info(f"REMOVED FILE with ID {file_id} from SESSION with ID {session_id}")
def get_file_list(session_id,sqldb:sqlite3.Connection,cursor:sqlite3.Cursor):
    """Return ``(file_id, file_path, file_name)`` rows stored for a session.

    ``sqldb`` is accepted for signature symmetry with the other helpers but
    is not used by the query.
    """
    query = "SELECT file_id, file_path, file_name FROM files WHERE session_id = ?"
    # sqlite3's execute() returns the cursor itself, so the fetch can be chained.
    return cursor.execute(query, (session_id,)).fetchall()
 @app.route('/')
 def index():
     """Register or refresh the visitor's session, then render ``index.html``."""
     create_or_update_session()
     page = render_template('index.html')
     return page
 # add files
     file_path = os.path.join(UPLOAD_FOLDER, filename)
     file.save(file_path)
+    # Update session data
+    add_file_to_session(session_id, file_id, file_path, filename,sqldb,cursor)
     # Add file chunks to ChromaDB
+    add_file_to_chroma(file_path, file_id, session_id,hugging_face_ef,db,logger)
     return jsonify({'message': 'File uploaded successfully', 'status': 'success'}), 200
 @app.route('/get-files',methods=["GET"])
 def get_files():
     """Return the current session's file rows as JSON under the ``files`` key."""
     files = get_file_list(session.sid,sqldb,cursor)
     return jsonify({"files":files}),200
 @app.route('/status',methods=["GET"])
 def status():
     """Debug endpoint: report the current session id and the result of ``db.get()``."""
     # Prints the names of the cookies sent with this request to stdout.
     print(request.cookies.keys())
     payload = {
         "current_session":session.sid,
         "z-chroma_data":db.get()
     }
     return jsonify(payload),200
@app.after_request
def check_response_cookie(response):
    """Log the outgoing ``Set-Cookie`` header at DEBUG level and pass the response on."""
    set_cookie = response.headers.get('Set-Cookie')
    logger.debug(f"Response Cookies: {set_cookie}")
    return response
 @app.route('/remove-file',methods=["POST"])
 def remove_file():
     file_id = request.form.get('file_id')
     session_id = session.sid
+    # remove file entry from session
+    remove_file_from_session(session_id, file_id,sqldb,cursor)
+    # remove file chunks from chroma
+    if remove_file_from_chroma(file_id,session_id,db):
         return jsonify({
             'message': 'File deleted successfully',
             'status': 'success'
 def cleanup_resources():
     """Purge sessions idle for longer than SESSION_TIMEOUT seconds.

     Removes the expired rows from the ``sessions`` and ``files`` tables and
     then asks ``remove_session_data_from_chroma`` to drop their chunks.
     Does nothing beyond logging when no session has expired.
     """
     # last_accessed is filled by CURRENT_TIMESTAMP, i.e. UTC text shaped like
     # "YYYY-MM-DD HH:MM:SS". Build the cutoff in that same shape: comparing
     # the text column with a float epoch never matches any row in SQLite.
     cutoff = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(time.time() - SESSION_TIMEOUT))
     # Use a private cursor so this job cannot disturb a result set that a
     # request handler still has pending on the shared module-level cursor.
     cur = sqldb.cursor()
     cur.execute("SELECT session_id FROM sessions WHERE last_accessed < ?", (cutoff,))
     # fetchall() yields 1-tuples; unwrap them into plain session ids.
     expired_sessions = [row[0] for row in cur.fetchall()]
     logger.info(f"Expired sessions: {expired_sessions}")
     if not expired_sessions:
         return
     # A list cannot be bound to a single "?" placeholder, so delete per id.
     id_params = [(sid,) for sid in expired_sessions]
     # File rows are removed explicitly: the ON DELETE CASCADE on files only
     # fires when foreign-key enforcement is enabled on the connection.
     cur.executemany("DELETE FROM files WHERE session_id = ?", id_params)
     cur.executemany("DELETE FROM sessions WHERE session_id = ?", id_params)
     sqldb.commit()
     # Remove the expired sessions' chunks from Chroma.
     remove_session_data_from_chroma(expired_sessions,db,logger)
 @app.route("/ask_query", methods=['POST'])
 def ask_query():
     """Answer the ``query`` form field for the current session and return it as JSON."""
     question = request.form.get("query")
     answer = generate_query_response(question,session.sid,db,llm_model,PROMPT_TEMPLATE)
     return jsonify(answer),200
+"""
 # Start the scheduler
 scheduler = BackgroundScheduler()
 scheduler.add_job(cleanup_resources, 'interval', minutes=5)  # Run every 5 minutes
 # Ensure scheduler stops on app exit
 @app.teardown_appcontext
 def shutdown_scheduler(exception=None):
+    if exception is not None:
+        logger.error("Scheduler shutdown failed", exc_info=exception)
     if scheduler.running:
         scheduler.shutdown()
+"""
 if __name__ == "__main__":
     app.run(host="0.0.0.0",port=8000,debug=True,threaded=True)

requirements.txt CHANGED Viewed

@@ -17,4 +17,6 @@ flask
 werkzeug
 Flask-Session
 apscheduler
-gunicorn

 werkzeug
 Flask-Session
 apscheduler
+gunicorn
+flask-cors
+flask_sqlalchemy

utils.py CHANGED Viewed

@@ -1,16 +1,84 @@
-@app.route('/wait',methods=["POST"])
-def wait():
-    time.sleep(int(request.form.get("time")))
-    return jsonify({"status":"ok"}),200
-@app.route("/ai",methods=["POST"])
-def aiPost():
-    print("Post /ai called")
-    json_content = request.json
-    query = json_content.get("query")
-    print("Query:",query)
-    response_answer = llm_model.generate_content(query)
-    return response_answer.text

+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import PyPDFLoader,UnstructuredWordDocumentLoader,TextLoader,UnstructuredHTMLLoader,UnstructuredMarkdownLoader
+import os
def remove_file_from_chroma(file_id,session_id,db):
    """Delete every stored chunk that belongs to one file of one session.

    Fetches the session's records with ``db.get(where={"session_id": ...})``,
    keeps the ids whose ``file_id`` metadata equals ``str(file_id)`` and
    hands them to ``db.delete``.

    Returns:
        True when at least one chunk id was deleted, False otherwise.
    """
    records = db.get(where={"session_id": session_id})
    wanted = str(file_id)
    doomed = [
        chunk_id
        for pos, chunk_id in enumerate(records['ids'])
        if records['metadatas'][pos]['file_id'] == wanted
    ]
    # Nothing matched this file: leave the store untouched.
    if not doomed:
        return False
    db.delete(ids=doomed)
    return True
def remove_session_data_from_chroma(session_ids,db,logger):
    """Delete all chunks whose ``session_id`` metadata is in ``session_ids``.

    Args:
        session_ids: Iterable of session ids. Entries may be plain ids or
            single-column DB rows (1-tuples as produced by ``fetchall()``);
            rows are unwrapped to their first element. ``None`` or an empty
            iterable means there is nothing to delete.
        db: Vector store exposing ``delete(where=...)``.
        logger: Logger used to report the deleted session ids.
    """
    ids = [
        entry[0] if isinstance(entry, (tuple, list)) else entry
        for entry in (session_ids or [])
    ]
    # NOTE(review): Chroma's filter validation rejects an empty "$in" list,
    # so skip the call entirely when no session has to be purged.
    if not ids:
        return
    db.delete(where={"session_id": {"$in": ids}})
    logger.info(f"Deleted ChromaDB chunks for sessions: {ids}")
def add_file_to_chroma(file_path, file_id, session_id,hugging_face_ef,db,logger):
    """Split a file into chunks, tag them, and add them to ChromaDB.

    The file at ``file_path`` is removed from disk afterwards, whether or not
    indexing succeeded, so a failed upload does not leave a stray file behind.

    Args:
        file_path: Path of the uploaded file; its suffix selects the loader.
        file_id: Stored as ``file_id`` metadata on every chunk.
        session_id: Stored as ``session_id`` metadata on every chunk.
        hugging_face_ef: Passed to ``db.add_documents`` as ``embedding``.
        db: Vector store exposing ``add_documents``.
        logger: Logger used to report the successful import.

    Raises:
        ValueError: If the file suffix is not one of pdf/docx/txt/html/md.
    """
    loader_map = {
        "pdf": PyPDFLoader,
        "docx": UnstructuredWordDocumentLoader,
        "txt": TextLoader,
        "html": UnstructuredHTMLLoader,
        "md": UnstructuredMarkdownLoader,
    }
    try:
        # splitext looks only at the final path component and lower() makes
        # "REPORT.PDF" match; a path without a suffix yields "".
        extension = os.path.splitext(file_path)[1].lstrip(".").lower()
        if extension not in loader_map:
            raise ValueError(f"Unsupported file type: {extension}")

        documents = loader_map[extension](file_path).load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=200,
            length_function=len,
            add_start_index=True
        )
        texts = text_splitter.split_documents(documents)

        # Tag every chunk so it can later be filtered by file and session.
        for text in texts:
            text.metadata.update({"file_id": file_id, "session_id": session_id})

        # Save to ChromaDB
        db.add_documents(texts,embedding=hugging_face_ef)
    finally:
        # The upload is only needed for indexing; drop it on every exit path.
        if os.path.exists(file_path):
            os.remove(file_path)

    logger.info(f"Added file '{file_path}' to ChromaDB for session '{session_id}'.")
def generate_query_response(query,session_id,db,llm_model,PROMPT_TEMPLATE):
    """Answer ``query`` using the chunks stored for ``session_id``.

    Args:
        query: The user's question.
        session_id: Restricts the similarity search to this session's chunks.
        db: Vector store exposing ``similarity_search_with_relevance_scores``.
        llm_model: Object whose ``generate_content(prompt)`` result has ``.text``.
        PROMPT_TEMPLATE: Format string with ``{context}`` and ``{query}`` fields.

    Returns:
        dict with ``is_relevant`` (top hit scored >= 0.6; False when there are
        no hits), ``answer`` (the model's text) and ``sources`` (one dict per
        hit with ``page_content``, ``score`` and ``metadata``).
    """
    top_related = db.similarity_search_with_relevance_scores(query,filter={"session_id": session_id},k=4)

    # The search can return nothing (e.g. no chunks stored for this session);
    # indexing [0] unconditionally would raise IndexError in that case.
    is_relevant = bool(top_related) and top_related[0][1] >= 0.6

    context = "\n".join(doc.page_content for doc, _score in top_related)
    prompt = PROMPT_TEMPLATE.format(context = context,query = query)
    answer = llm_model.generate_content(prompt).text

    sources = [
        {
            "page_content": doc.page_content,
            "score": score,
            "metadata": doc.metadata,
        }
        for doc, score in top_related
    ]
    return {
        'is_relevant': is_relevant,
        'answer': answer,
        'sources': sources,
    }