AashitaK committed on
Commit
cfb87a4
·
verified ·
1 Parent(s): 97bb69f

Update file_utils.py

Browse files
Files changed (1) hide show
  1. file_utils.py +123 -10
file_utils.py CHANGED
@@ -1,11 +1,13 @@
 
1
  import pickle
2
  import pandas as pd
 
3
 
4
- def load_service_data(file_path: str) -> pd.DataFrame:
5
  """
6
  Loads a CSV file into a Pandas DataFrame and sets the index to the 'service' column.
7
 
8
- Parameters:
9
  file_path (str): Path to the CSV file.
10
 
11
  Returns:
@@ -23,7 +25,15 @@ def load_service_data(file_path: str) -> pd.DataFrame:
23
  return pd.DataFrame()
24
 
25
  def load_pickle(file_path: str):
26
- """Loads and returns data from a Pickle (.pkl) file."""
 
 
 
 
 
 
 
 
27
  try:
28
  with open(file_path, "rb") as file: # Open in 'rb' (read binary) mode
29
  return pickle.load(file)
@@ -34,8 +44,33 @@ def load_pickle(file_path: str):
34
  print(f"Error reading Pickle file '{file_path}': {e}")
35
  return None
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def load_file(file_path: str) -> str:
38
- """Reads the text from a file safely."""
 
 
 
 
 
 
 
 
39
  try:
40
  with open(file_path, "r", encoding="utf-8") as file:
41
  return file.read()
@@ -46,11 +81,89 @@ def load_file(file_path: str) -> str:
46
  print(f"Error reading file '{file_path}': {e}")
47
  return ""
48
 
49
- def save_embeddings(embeddings: dict, file_path: str) -> None:
50
- """Saves embeddings to a pickle file safely."""
 
 
 
 
 
 
 
 
 
51
  try:
52
- with open(file_path, "wb") as file:
53
- pickle.dump(embeddings, file)
54
  except Exception as e:
55
- print(f"Error saving embeddings to '{file_path}': {e}")
56
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import pickle
3
  import pandas as pd
4
+ from embedding_generation import compute_doc_embeddings
5
 
6
+ def load_database(file_path: str) -> pd.DataFrame:
7
  """
8
  Loads a CSV file into a Pandas DataFrame and sets the index to the 'service' column.
9
 
10
+ Args:
11
  file_path (str): Path to the CSV file.
12
 
13
  Returns:
 
25
  return pd.DataFrame()
26
 
27
  def load_pickle(file_path: str):
28
+ """
29
+ Loads and returns data from a Pickle (.pkl) file.
30
+
31
+ Args:
32
+ file_path (str): Path to the Pickle file.
33
+
34
+ Returns:
35
+ object: The data loaded from the Pickle file, or None if loading failed.
36
+ """
37
  try:
38
  with open(file_path, "rb") as file: # Open in 'rb' (read binary) mode
39
  return pickle.load(file)
 
44
  print(f"Error reading Pickle file '{file_path}': {e}")
45
  return None
46
 
47
def save_pickle(embeddings: dict, file_path: str) -> None:
    """
    Saves an object to a pickle file safely.

    Any exception raised while opening the file or serializing the data is
    caught and reported on stdout; nothing is raised to the caller.

    Args:
        embeddings (dict): The embeddings (or other picklable data) to be saved.
        file_path (str): The file path where the data will be saved.

    Returns:
        None
    """
    try:
        with open(file_path, "wb") as file:  # 'wb': pickle is a binary format
            pickle.dump(embeddings, file)
    except Exception as e:
        # Best-effort save: report the failure instead of propagating it.
        print(f"Error saving embeddings to '{file_path}': {e}")
63
+
64
  def load_file(file_path: str) -> str:
65
+ """
66
+ Reads the text from a file safely.
67
+
68
+ Args:
69
+ file_path (str): Path to the text file.
70
+
71
+ Returns:
72
+ str: The content of the file, or an empty string if an error occurred.
73
+ """
74
  try:
75
  with open(file_path, "r", encoding="utf-8") as file:
76
  return file.read()
 
81
  print(f"Error reading file '{file_path}': {e}")
82
  return ""
83
 
84
def save_timestamp(timestamp: float, file_path: str) -> None:
    """
    Saves the timestamp to a file to persist across sessions.

    Any exception raised while writing is caught and reported on stdout;
    nothing is raised to the caller.

    Args:
        timestamp (float): The timestamp representing the last update time of the database.
        file_path (str): The file path where the timestamp will be stored.

    Returns:
        None
    """
    try:
        # Explicit UTF-8 so the file is written with the same encoding that
        # load_file() uses when reading it back.
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(str(timestamp))  # Convert timestamp to string before saving
    except Exception as e:
        print(f"Error saving timestamp: {e}")
100
+
101
def load_timestamp(file_path: str) -> float:
    """
    Loads the timestamp from a file.

    Args:
        file_path (str): The file path from which the timestamp will be read.

    Returns:
        float: The timestamp read from the file. Returns 0.0 if the file has no
        content (nothing could be read) or the content is not a valid float.
    """
    timestamp_str = load_file(file_path)  # Use load_file function to read the file content

    # Empty or whitespace-only content means "no timestamp stored yet"; this is
    # not a malformed value, so return the default without an error message.
    if not timestamp_str.strip():
        return 0.0

    try:
        return float(timestamp_str)  # Convert the string to a float
    except ValueError:
        print(f"Error: The content in '{file_path}' is not a valid float.")
        return 0.0  # Return a default value if the content is not valid
117
+
118
def update_embeddings(database: pd.DataFrame, embeddings_filepath: str):
    """
    Generates new embeddings for the updated database and saves them as a pickle file.

    Args:
        database (pd.DataFrame): The updated database (e.g., a DataFrame).
        embeddings_filepath (str): The file path where the embeddings will be saved.

    Returns:
        database_embeddings: The newly generated embeddings for the database,
        exactly as returned by compute_doc_embeddings.
    """
    # Compute embeddings for the updated database
    database_embeddings = compute_doc_embeddings(database)

    # Save the newly computed embeddings to a pickle file
    # (fixed: the helper is save_pickle, not the undefined name save_picle)
    save_pickle(database_embeddings, embeddings_filepath)

    return database_embeddings
136
+
137
def load_embeddings(
    database: pd.DataFrame,
    database_filepath: str,
    embeddings_filepath: str,
    timestamp_filepath: str = "db_update_timestamp.txt",
):
    """
    Loads embeddings for the given database. If the database has been updated
    since the last time embeddings were generated, new embeddings are created
    and saved. If the database hasn't changed, previously saved embeddings are
    loaded; if they cannot be loaded, they are regenerated.

    Args:
        database (pd.DataFrame): The database (e.g., a DataFrame) for which embeddings need to be generated or loaded.
        database_filepath (str): The file path of the database (CSV file or similar).
        embeddings_filepath (str): The file path where the embeddings are saved (pickle file).
        timestamp_filepath (str): The file path where the modification time of the
            database used for the last embedding run is stored. Defaults to
            "db_update_timestamp.txt".

    Returns:
        database_embeddings: The embeddings for the database, either newly generated or loaded from the pickle file.

    Raises:
        OSError: If the modification time of database_filepath cannot be read.
    """
    # Get the timestamp of the last modification of the database file
    database_timestamp = os.path.getmtime(database_filepath)

    # Get the stored timestamp of the last database for which embeddings were generated
    previous_timestamp = load_timestamp(timestamp_filepath)

    # Exact float equality is intended: the stored value is the str() of an
    # earlier getmtime() result, so an unchanged file compares equal.
    if database_timestamp == previous_timestamp:
        # The database file has not been updated; try the existing pickle file
        database_embeddings = load_pickle(embeddings_filepath)
        if database_embeddings is not None:
            return database_embeddings
        # The cached embeddings are missing or unreadable: fall through and
        # regenerate them rather than handing None back to the caller.

    # Generate new embeddings and save them to the embeddings file
    database_embeddings = update_embeddings(database, embeddings_filepath)

    # Update the stored timestamp
    save_timestamp(database_timestamp, timestamp_filepath)

    return database_embeddings