Spaces:
Build error
Build error
| import os | |
| import pickle | |
| import pandas as pd | |
| from utils.embedding_generation import compute_doc_embeddings | |
def load_database(file_path: str) -> pd.DataFrame:
    """
    Read a CSV file and index the resulting DataFrame by its 'service' column.

    This loader is best-effort: any failure is reported on stdout and an
    empty DataFrame is returned instead of raising.

    Args:
        file_path (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The CSV contents indexed by 'service', or an empty
        DataFrame if the file is missing or could not be loaded (for
        example when it has no 'service' column).
    """
    try:
        # Parse and re-index in one expression; both steps share the handlers below.
        return pd.read_csv(file_path).set_index("service")
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as err:
        print(f"Error loading CSV file '{file_path}': {err}")
    # Reached only after a failure has been reported above.
    return pd.DataFrame()
def load_pickle(file_path: str):
    """
    Deserialize and return the object stored in a Pickle (.pkl) file.

    Failures are reported on stdout rather than raised.

    Warning: unpickling can execute arbitrary code, so this should only be
    pointed at files from a trusted source.

    Args:
        file_path (str): Location of the Pickle file.

    Returns:
        object: The unpickled data, or None if the file is missing or could
        not be read.
    """
    loaded = None  # stays None unless unpickling completes
    try:
        # Pickle data is binary, hence the 'rb' mode.
        with open(file_path, "rb") as stream:
            loaded = pickle.load(stream)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as err:
        print(f"Error reading Pickle file '{file_path}': {err}")
    return loaded
def load_file(file_path: str) -> str:
    """
    Return the full text of a UTF-8 encoded file without raising on failure.

    Args:
        file_path (str): Location of the text file.

    Returns:
        str: Everything read from the file, or an empty string if the file
        is missing or could not be read (the problem is printed to stdout).
    """
    text = ""  # fallback returned when reading fails
    try:
        with open(file_path, mode="r", encoding="utf-8") as reader:
            text = reader.read()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as err:
        print(f"Error reading file '{file_path}': {err}")
    return text
| # def load_timestamp(file_path: str) -> float: | |
| # """ | |
| # Loads the timestamp from a file. | |
| # Args: | |
| # file_path (str): The file path from which the timestamp will be read. | |
| # Returns: | |
| # float: The timestamp read from the file. Returns 0.0 if there is an error or no timestamp is found. | |
| # """ | |
| # timestamp_str = load_file(file_path) # Use load_file function to read the file content | |
| # try: | |
| # return float(timestamp_str) # Convert the string to a float | |
| # except ValueError: | |
| # print(f"Error: The content in '{file_path}' is not a valid float.") | |
| # return 0.0 # Return a default value if the content is not valid | |
| # def save_timestamp(timestamp: float, file_path: str): | |
| # """ | |
| # Saves the timestamp to a file to persist across sessions. | |
| # Args: | |
| # timestamp (float): The timestamp representing the last update time of the database. | |
| # file_path (str): The file path where the timestamp will be stored. | |
| # Returns: | |
| # None | |
| # """ | |
| # try: | |
| # with open(file_path, 'w') as f: | |
| # f.write(str(timestamp)) # Convert timestamp to string before saving | |
| # except Exception as e: | |
| # print(f"Error saving timestamp: {e}") | |
| # def save_pickle(embeddings: dict, file_path: str) -> None: | |
| # """ | |
| # Saves to a pickle file safely. | |
| # Args: | |
| # embeddings (dict): The embeddings to be saved. | |
| # file_path (str): The file path where the embeddings will be saved. | |
| # Returns: | |
| # None | |
| # """ | |
| # try: | |
| # with open(file_path, "wb") as file: | |
| # pickle.dump(embeddings, file) | |
| # except Exception as e: | |
| # print(f"Error saving embeddings to '{file_path}': {e}") | |
def update_embeddings(database: pd.DataFrame, embeddings_filepath: str):
    """
    Compute fresh embeddings for the given database.

    Args:
        database (pd.DataFrame): The database to embed.
        embeddings_filepath (str): Intended destination for the pickled
            embeddings. Currently unused, because saving is disabled.

    Returns:
        database_embeddings: Whatever compute_doc_embeddings returns for
        the database.
    """
    # NOTE: persisting the result with
    # save_pickle(database_embeddings, embeddings_filepath) is currently
    # disabled, so nothing is written to disk here.
    return compute_doc_embeddings(database)
def load_embeddings(database, database_filepath, embeddings_filepath):
    """
    Return embeddings for the given database.

    The embeddings are regenerated on every call via update_embeddings;
    progress messages are printed before and after.

    Args:
        database (pd.DataFrame): The database to generate embeddings for.
        database_filepath (str): Path of the database file (CSV or similar).
            Not used while the timestamp-based cache is disabled.
        embeddings_filepath (str): Path of the embeddings pickle file;
            forwarded to update_embeddings.

    Returns:
        database_embeddings: The newly generated embeddings for the database.
    """
    # NOTE: a timestamp-based cache used to live here and is currently
    # disabled. It worked as follows:
    #   1. Read the database file's modification time with
    #      os.path.getmtime(database_filepath).
    #   2. Read the previously stored timestamp with load_timestamp from
    #      '/home/user/app/data/db_update_timestamp.txt'.
    #   3. If the two were equal, return load_pickle(embeddings_filepath).
    #   4. Otherwise regenerate with update_embeddings and store the new
    #      timestamp with save_timestamp.
    # Until that is restored, embeddings are always recomputed.
    print("Embeddings updating.....")
    fresh_embeddings = update_embeddings(database, embeddings_filepath)
    print("Embeddings updated.")
    return fresh_embeddings