AashitaK committed on
Commit
edd3ed5
·
verified ·
1 Parent(s): 5fa76e3

Update utils/file_utils.py

Browse files
Files changed (1) hide show
  1. utils/file_utils.py +86 -87
utils/file_utils.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  import pickle
3
  import pandas as pd
4
- from utils.embedding_generation import compute_doc_embeddings
5
 
6
  def load_database(file_path: str) -> pd.DataFrame:
7
  """
@@ -44,22 +44,22 @@ def load_pickle(file_path: str):
44
  print(f"Error reading Pickle file '{file_path}': {e}")
45
  return None
46
 
47
- def save_pickle(embeddings: dict, file_path: str) -> None:
48
- """
49
- Saves to a pickle file safely.
50
 
51
- Args:
52
- embeddings (dict): The embeddings to be saved.
53
- file_path (str): The file path where the embeddings will be saved.
54
 
55
- Returns:
56
- None
57
- """
58
- try:
59
- with open(file_path, "wb") as file:
60
- pickle.dump(embeddings, file)
61
- except Exception as e:
62
- print(f"Error saving embeddings to '{file_path}': {e}")
63
 
64
  def load_file(file_path: str) -> str:
65
  """
@@ -81,60 +81,60 @@ def load_file(file_path: str) -> str:
81
  print(f"Error reading file '{file_path}': {e}")
82
  return ""
83
 
84
- def save_timestamp(timestamp: float, file_path: str):
85
- """
86
- Saves the timestamp to a file to persist across sessions.
87
-
88
- Args:
89
- timestamp (float): The timestamp representing the last update time of the database.
90
- file_path (str): The file path where the timestamp will be stored.
91
-
92
- Returns:
93
- None
94
- """
95
- try:
96
- with open(file_path, 'w') as f:
97
- f.write(str(timestamp)) # Convert timestamp to string before saving
98
- except Exception as e:
99
- print(f"Error saving timestamp: {e}")
100
-
101
- def load_timestamp(file_path: str) -> float:
102
- """
103
- Loads the timestamp from a file.
104
-
105
- Args:
106
- file_path (str): The file path from which the timestamp will be read.
107
-
108
- Returns:
109
- float: The timestamp read from the file. Returns 0.0 if there is an error or no timestamp is found.
110
- """
111
- timestamp_str = load_file(file_path) # Use load_file function to read the file content
112
- try:
113
- return float(timestamp_str) # Convert the string to a float
114
- except ValueError:
115
- print(f"Error: The content in '{file_path}' is not a valid float.")
116
- return 0.0 # Return a default value if the content is not valid
117
-
118
- def update_embeddings(database:pd.DataFrame, embeddings_filepath: str):
119
- """
120
- Generates new embeddings for the updated database and saves them as a pickle file.
121
-
122
- Args:
123
- database (pd.DataFrame): The updated database (e.g., a DataFrame).
124
- embeddings_filepath (str): The file path where the embeddings will be saved.
125
-
126
- Returns:
127
- database_embeddings: The newly generated embeddings for the database.
128
- """
129
- # Compute embeddings for the updated database
130
- database_embeddings = compute_doc_embeddings(database)
131
-
132
- # Save the newly computed embeddings to a pickle file
133
- save_pickle(database_embeddings, embeddings_filepath)
134
-
135
- return database_embeddings
136
 
137
- def load_embeddings(database, database_filepath, embeddings_filepath):
138
  """
139
  Loads embeddings for the given database. If the database has been updated
140
  since the last time embeddings were generated, new embeddings are created
@@ -148,30 +148,29 @@ def load_embeddings(database, database_filepath, embeddings_filepath):
148
  Returns:
149
  database_embeddings: The embeddings for the database, either newly generated or loaded from the pickle file.
150
  """
151
- # Get the timestamp of the last modification of the database file
152
- database_timestamp = os.path.getmtime(database_filepath)
153
 
154
- # Get the stored timestamp of the last database for which embeddings were generated
155
- timestamp_filepath = '/home/user/app/data/db_update_timestamp.txt'
156
- previous_timestamp = load_timestamp(timestamp_filepath)
157
  # print("Prev timestamp", previous_timestamp)
158
  # print("DB timestamp", database_timestamp)
159
  # database_embeddings = load_pickle(embeddings_filepath)
160
  # print("Embeddings loaded.")
161
 
162
- # Check if the timestamp of the database file is different from the stored timestamp (DB_UPDATE_TIMESTAMP)
163
- if database_timestamp == previous_timestamp:
164
- # If the database file has not been updated, load the existing embeddings from the pickle file
165
- database_embeddings = load_pickle(embeddings_filepath)
166
- print("Embeddings loaded.")
167
- else:
168
- print("Embeddings updating.....")
169
- # If the database file has been updated, generate new embeddings and save them to the embeddings file
170
- database_embeddings = update_embeddings(database, embeddings_filepath)
171
-
172
- # Update the stored timestamp
173
- save_timestamp(database_timestamp, timestamp_filepath)
174
- print("Embeddings updated.")
175
 
 
 
 
176
 
177
- return database_embeddings
 
1
  import os
2
  import pickle
3
  import pandas as pd
4
+ # from utils.embedding_generation import compute_doc_embeddings
5
 
6
  def load_database(file_path: str) -> pd.DataFrame:
7
  """
 
44
  print(f"Error reading Pickle file '{file_path}': {e}")
45
  return None
46
 
47
+ # def save_pickle(embeddings: dict, file_path: str) -> None:
48
+ # """
49
+ # Saves to a pickle file safely.
50
 
51
+ # Args:
52
+ # embeddings (dict): The embeddings to be saved.
53
+ # file_path (str): The file path where the embeddings will be saved.
54
 
55
+ # Returns:
56
+ # None
57
+ # """
58
+ # try:
59
+ # with open(file_path, "wb") as file:
60
+ # pickle.dump(embeddings, file)
61
+ # except Exception as e:
62
+ # print(f"Error saving embeddings to '{file_path}': {e}")
63
 
64
  def load_file(file_path: str) -> str:
65
  """
 
81
  print(f"Error reading file '{file_path}': {e}")
82
  return ""
83
 
84
+ # def save_timestamp(timestamp: float, file_path: str):
85
+ # """
86
+ # Saves the timestamp to a file to persist across sessions.
87
+
88
+ # Args:
89
+ # timestamp (float): The timestamp representing the last update time of the database.
90
+ # file_path (str): The file path where the timestamp will be stored.
91
+
92
+ # Returns:
93
+ # None
94
+ # """
95
+ # try:
96
+ # with open(file_path, 'w') as f:
97
+ # f.write(str(timestamp)) # Convert timestamp to string before saving
98
+ # except Exception as e:
99
+ # print(f"Error saving timestamp: {e}")
100
+
101
+ # def load_timestamp(file_path: str) -> float:
102
+ # """
103
+ # Loads the timestamp from a file.
104
+
105
+ # Args:
106
+ # file_path (str): The file path from which the timestamp will be read.
107
+
108
+ # Returns:
109
+ # float: The timestamp read from the file. Returns 0.0 if there is an error or no timestamp is found.
110
+ # """
111
+ # timestamp_str = load_file(file_path) # Use load_file function to read the file content
112
+ # try:
113
+ # return float(timestamp_str) # Convert the string to a float
114
+ # except ValueError:
115
+ # print(f"Error: The content in '{file_path}' is not a valid float.")
116
+ # return 0.0 # Return a default value if the content is not valid
117
+
118
+ # def update_embeddings(database:pd.DataFrame, embeddings_filepath: str):
119
+ # """
120
+ # Generates new embeddings for the updated database and saves them as a pickle file.
121
+
122
+ # Args:
123
+ # database (pd.DataFrame): The updated database (e.g., a DataFrame).
124
+ # embeddings_filepath (str): The file path where the embeddings will be saved.
125
+
126
+ # Returns:
127
+ # database_embeddings: The newly generated embeddings for the database.
128
+ # """
129
+ # # Compute embeddings for the updated database
130
+ # database_embeddings = compute_doc_embeddings(database)
131
+
132
+ # # Save the newly computed embeddings to a pickle file
133
+ # save_pickle(database_embeddings, embeddings_filepath)
134
+
135
+ # return database_embeddings
136
 
137
+ # def load_embeddings(database, database_filepath, embeddings_filepath):
138
  """
139
  Loads embeddings for the given database. If the database has been updated
140
  since the last time embeddings were generated, new embeddings are created
 
148
  Returns:
149
  database_embeddings: The embeddings for the database, either newly generated or loaded from the pickle file.
150
  """
151
+ # # Get the timestamp of the last modification of the database file
152
+ # database_timestamp = os.path.getmtime(database_filepath)
153
 
154
+ # # Get the stored timestamp of the last database for which embeddings were generated
155
+ # timestamp_filepath = '/home/user/app/data/db_update_timestamp.txt'
156
+ # previous_timestamp = load_timestamp(timestamp_filepath)
157
  # print("Prev timestamp", previous_timestamp)
158
  # print("DB timestamp", database_timestamp)
159
  # database_embeddings = load_pickle(embeddings_filepath)
160
  # print("Embeddings loaded.")
161
 
162
+ # # Check if the timestamp of the database file is different from the stored timestamp (DB_UPDATE_TIMESTAMP)
163
+ # if database_timestamp == previous_timestamp:
164
+ # # If the database file has not been updated, load the existing embeddings from the pickle file
165
+ # database_embeddings = load_pickle(embeddings_filepath)
166
+ # print("Embeddings loaded.")
167
+ # else:
168
+ # print("Embeddings updating.....")
169
+ # # If the database file has been updated, generate new embeddings and save them to the embeddings file
170
+ # database_embeddings = update_embeddings(database, embeddings_filepath)
 
 
 
 
171
 
172
+ # # Update the stored timestamp
173
+ # save_timestamp(database_timestamp, timestamp_filepath)
174
+ # print("Embeddings updated.")
175
 
176
+ # return database_embeddings