Spaces:
Build error
Build error
Update utils/file_utils.py
Browse files
- utils/file_utils.py +50 -48
utils/file_utils.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
import pickle
|
| 3 |
import pandas as pd
|
| 4 |
-
|
| 5 |
|
| 6 |
def load_database(file_path: str) -> pd.DataFrame:
|
| 7 |
"""
|
|
@@ -43,24 +43,7 @@ def load_pickle(file_path: str):
|
|
| 43 |
except Exception as e:
|
| 44 |
print(f"Error reading Pickle file '{file_path}': {e}")
|
| 45 |
return None
|
| 46 |
-
|
| 47 |
-
# def save_pickle(embeddings: dict, file_path: str) -> None:
|
| 48 |
-
# """
|
| 49 |
-
# Saves to a pickle file safely.
|
| 50 |
-
|
| 51 |
-
# Args:
|
| 52 |
-
# embeddings (dict): The embeddings to be saved.
|
| 53 |
-
# file_path (str): The file path where the embeddings will be saved.
|
| 54 |
-
|
| 55 |
-
# Returns:
|
| 56 |
-
# None
|
| 57 |
-
# """
|
| 58 |
-
# try:
|
| 59 |
-
# with open(file_path, "wb") as file:
|
| 60 |
-
# pickle.dump(embeddings, file)
|
| 61 |
-
# except Exception as e:
|
| 62 |
-
# print(f"Error saving embeddings to '{file_path}': {e}")
|
| 63 |
-
|
| 64 |
def load_file(file_path: str) -> str:
|
| 65 |
"""
|
| 66 |
Reads the text from a file safely.
|
|
@@ -81,6 +64,23 @@ def load_file(file_path: str) -> str:
|
|
| 81 |
print(f"Error reading file '{file_path}': {e}")
|
| 82 |
return ""
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
# def save_timestamp(timestamp: float, file_path: str):
|
| 85 |
# """
|
| 86 |
# Saves the timestamp to a file to persist across sessions.
|
|
@@ -98,43 +98,43 @@ def load_file(file_path: str) -> str:
|
|
| 98 |
# except Exception as e:
|
| 99 |
# print(f"Error saving timestamp: {e}")
|
| 100 |
|
| 101 |
-
# def
|
| 102 |
# """
|
| 103 |
-
#
|
| 104 |
|
| 105 |
# Args:
|
| 106 |
-
#
|
|
|
|
| 107 |
|
| 108 |
# Returns:
|
| 109 |
-
#
|
| 110 |
# """
|
| 111 |
-
# timestamp_str = load_file(file_path) # Use load_file function to read the file content
|
| 112 |
# try:
|
| 113 |
-
#
|
| 114 |
-
#
|
| 115 |
-
#
|
| 116 |
-
#
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
#
|
| 130 |
-
|
| 131 |
|
| 132 |
-
#
|
| 133 |
-
#
|
| 134 |
|
| 135 |
-
|
| 136 |
|
| 137 |
-
|
| 138 |
"""
|
| 139 |
Loads embeddings for the given database. If the database has been updated
|
| 140 |
since the last time embeddings were generated, new embeddings are created
|
|
@@ -150,14 +150,12 @@ def load_file(file_path: str) -> str:
|
|
| 150 |
"""
|
| 151 |
# # Get the timestamp of the last modification of the database file
|
| 152 |
# database_timestamp = os.path.getmtime(database_filepath)
|
| 153 |
-
|
| 154 |
# # Get the stored timestamp of the last database for which embeddings were generated
|
| 155 |
# timestamp_filepath = '/home/user/app/data/db_update_timestamp.txt'
|
| 156 |
# previous_timestamp = load_timestamp(timestamp_filepath)
|
| 157 |
# print("Prev timestamp", previous_timestamp)
|
| 158 |
# print("DB timestamp", database_timestamp)
|
| 159 |
-
# database_embeddings = load_pickle(embeddings_filepath)
|
| 160 |
-
# print("Embeddings loaded.")
|
| 161 |
|
| 162 |
# # Check if the timestamp of the database file is different from the stored timestamp (DB_UPDATE_TIMESTAMP)
|
| 163 |
# if database_timestamp == previous_timestamp:
|
|
@@ -172,5 +170,9 @@ def load_file(file_path: str) -> str:
|
|
| 172 |
# # Update the stored timestamp
|
| 173 |
# save_timestamp(database_timestamp, timestamp_filepath)
|
| 174 |
# print("Embeddings updated.")
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import pickle
|
| 3 |
import pandas as pd
|
| 4 |
+
from utils.embedding_generation import compute_doc_embeddings
|
| 5 |
|
| 6 |
def load_database(file_path: str) -> pd.DataFrame:
|
| 7 |
"""
|
|
|
|
| 43 |
except Exception as e:
|
| 44 |
print(f"Error reading Pickle file '{file_path}': {e}")
|
| 45 |
return None
|
| 46 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
def load_file(file_path: str) -> str:
|
| 48 |
"""
|
| 49 |
Reads the text from a file safely.
|
|
|
|
| 64 |
print(f"Error reading file '{file_path}': {e}")
|
| 65 |
return ""
|
| 66 |
|
| 67 |
+
# def load_timestamp(file_path: str) -> float:
|
| 68 |
+
# """
|
| 69 |
+
# Loads the timestamp from a file.
|
| 70 |
+
|
| 71 |
+
# Args:
|
| 72 |
+
# file_path (str): The file path from which the timestamp will be read.
|
| 73 |
+
|
| 74 |
+
# Returns:
|
| 75 |
+
# float: The timestamp read from the file. Returns 0.0 if there is an error or no timestamp is found.
|
| 76 |
+
# """
|
| 77 |
+
# timestamp_str = load_file(file_path) # Use load_file function to read the file content
|
| 78 |
+
# try:
|
| 79 |
+
# return float(timestamp_str) # Convert the string to a float
|
| 80 |
+
# except ValueError:
|
| 81 |
+
# print(f"Error: The content in '{file_path}' is not a valid float.")
|
| 82 |
+
# return 0.0 # Return a default value if the content is not valid
|
| 83 |
+
|
| 84 |
# def save_timestamp(timestamp: float, file_path: str):
|
| 85 |
# """
|
| 86 |
# Saves the timestamp to a file to persist across sessions.
|
|
|
|
| 98 |
# except Exception as e:
|
| 99 |
# print(f"Error saving timestamp: {e}")
|
| 100 |
|
| 101 |
+
# def save_pickle(embeddings: dict, file_path: str) -> None:
|
| 102 |
# """
|
| 103 |
+
# Saves to a pickle file safely.
|
| 104 |
|
| 105 |
# Args:
|
| 106 |
+
# embeddings (dict): The embeddings to be saved.
|
| 107 |
+
# file_path (str): The file path where the embeddings will be saved.
|
| 108 |
|
| 109 |
# Returns:
|
| 110 |
+
# None
|
| 111 |
# """
|
|
|
|
| 112 |
# try:
|
| 113 |
+
# with open(file_path, "wb") as file:
|
| 114 |
+
# pickle.dump(embeddings, file)
|
| 115 |
+
# except Exception as e:
|
| 116 |
+
# print(f"Error saving embeddings to '{file_path}': {e}")
|
| 117 |
|
| 118 |
+
def update_embeddings(database:pd.DataFrame, embeddings_filepath: str):
    """
    Recompute the embeddings for the given database and hand them back.

    Args:
        database (pd.DataFrame): The database to embed; it is passed
            unchanged to compute_doc_embeddings.
        embeddings_filepath (str): Intended destination for the pickled
            embeddings. Currently unused because the save step is disabled.

    Returns:
        The value returned by compute_doc_embeddings(database).
    """
    # NOTE: persisting the result is switched off for now; the disabled call is
    #   save_pickle(database_embeddings, embeddings_filepath)
    fresh_embeddings = compute_doc_embeddings(database)
    return fresh_embeddings
|
| 136 |
|
| 137 |
+
def load_embeddings(database, database_filepath, embeddings_filepath):
|
| 138 |
"""
|
| 139 |
Loads embeddings for the given database. If the database has been updated
|
| 140 |
since the last time embeddings were generated, new embeddings are created
|
|
|
|
| 150 |
"""
|
| 151 |
# # Get the timestamp of the last modification of the database file
|
| 152 |
# database_timestamp = os.path.getmtime(database_filepath)
|
| 153 |
+
|
| 154 |
# # Get the stored timestamp of the last database for which embeddings were generated
|
| 155 |
# timestamp_filepath = '/home/user/app/data/db_update_timestamp.txt'
|
| 156 |
# previous_timestamp = load_timestamp(timestamp_filepath)
|
| 157 |
# print("Prev timestamp", previous_timestamp)
|
| 158 |
# print("DB timestamp", database_timestamp)
|
|
|
|
|
|
|
| 159 |
|
| 160 |
# # Check if the timestamp of the database file is different from the stored timestamp (DB_UPDATE_TIMESTAMP)
|
| 161 |
# if database_timestamp == previous_timestamp:
|
|
|
|
| 170 |
# # Update the stored timestamp
|
| 171 |
# save_timestamp(database_timestamp, timestamp_filepath)
|
| 172 |
# print("Embeddings updated.")
|
| 173 |
+
|
| 174 |
+
print("Embeddings updating.....")
|
| 175 |
+
database_embeddings = update_embeddings(database, embeddings_filepath)
|
| 176 |
+
print("Embeddings updated.")
|
| 177 |
+
|
| 178 |
+
return database_embeddings
|