anumaurya114exp commited on
Commit
522ce80
·
1 Parent(s): 7cf13a6

Persist tables data in Hugging Face persistent storage

Browse files
Files changed (2) hide show
  1. persistStorage.py +68 -11
  2. utils.py +9 -1
persistStorage.py CHANGED
@@ -1,26 +1,26 @@
1
- from huggingface_hub import HfFileSystem
2
 
3
- from datetime import datetime
4
  import pytz
5
  import os
6
  from config import HUGGING_FACE_TOKEN
 
7
  import csv
8
 
 
9
  logsDir = os.getenv("HF_HOME", "/data")
10
 
11
- # # Create a new file
12
- # with open(os.path.join(data_dir, "my_data.txt"), "a") as f:
13
- # f.write("Hello World! From pesistent storage line 2")
14
 
15
- # # Read the data from the file
16
- # with open(os.path.join(data_dir, "my_data.txt"), "r") as f:
17
- # data = f.read()
18
- # # Print the data
19
- # print(data)
20
 
21
  # Set the time zone to Pacific Time Zone
22
  TIME_ZONE = 'US/Pacific'
23
  TIMEZONE_OBJ = pytz.timezone(TIME_ZONE)
 
24
 
25
  def append_dict_to_csv(file_path, row_data):
26
  fieldnames = row_data.keys()
@@ -54,4 +54,61 @@ def getAllLogFilesPaths():
54
  print(logFiles,"avaiable logs")
55
 
56
  downloadableFilesPaths = [os.path.join(os.path.abspath(logsDir), logFilePath) for logFilePath in logFiles]
57
- return downloadableFilesPaths
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
 
3
+ from datetime import datetime, timedelta
4
  import pytz
5
  import os
6
  from config import HUGGING_FACE_TOKEN
7
+ import pandas as pd
8
  import csv
9
 
10
+
11
  logsDir = os.getenv("HF_HOME", "/data")
12
 
13
+ TABLES_DATA_DIR = os.path.join(os.getenv("HF_HOME", "/data"), "tablesData")
 
 
14
 
15
# Ensure the cache directory for table snapshots exists. exist_ok=True already
# makes this a no-op when the directory is present, so only real OS errors
# (permissions, read-only filesystem) can surface here. Keep the original
# best-effort behavior (don't crash at import time) but don't hide the reason.
try:
    os.makedirs(TABLES_DATA_DIR, exist_ok=True)
except OSError as e:
    print(f"Couldn't create tables data dir '{TABLES_DATA_DIR}': {e}")
 
19
 
20
  # Set the time zone to Pacific Time Zone
21
  TIME_ZONE = 'US/Pacific'
22
  TIMEZONE_OBJ = pytz.timezone(TIME_ZONE)
23
+ CACHE_TIME_EXPIRE = 5 #days
24
 
25
  def append_dict_to_csv(file_path, row_data):
26
  fieldnames = row_data.keys()
 
54
  print(logFiles,"avaiable logs")
55
 
56
  downloadableFilesPaths = [os.path.join(os.path.abspath(logsDir), logFilePath) for logFilePath in logFiles]
57
+ return downloadableFilesPaths
58
+
59
def getLocalDbFileName():
    """Return the filename of the cached SQLite db in TABLES_DATA_DIR, or None.

    The cache directory is expected to hold at most one db file named after
    its creation date ('YYYY-MM-DD.db', e.g. '2023-12-03.db'). If several
    files are present, the first directory entry wins (os.listdir order is
    arbitrary).
    """
    # List the directory once instead of twice: avoids a second syscall and a
    # race between the emptiness check and the [0] index access.
    entries = os.listdir(TABLES_DATA_DIR)
    if not entries:
        return None
    return entries[0]  # '2023-12-03.db' — YYYY-MM-DD
64
+
65
def isTablesCacheValid():
    """Return True if the cached db is younger than CACHE_TIME_EXPIRE days.

    The creation date is parsed from the cached db's filename
    ('YYYY-MM-DD.db'). Returns False when no cache file exists at all
    (the original crashed with AttributeError on None in that case).
    """
    localDbName = getLocalDbFileName()
    if localDbName is None:
        # No cache file -> nothing valid to serve.
        return False
    timeCreatedStr = localDbName.split('.')[0]
    timeCreated = datetime.strptime(timeCreatedStr, '%Y-%m-%d')
    # NOTE(review): the filename date is written in US/Pacific (see
    # saveTablesDataToLocalDB) but compared against naive server-local now();
    # near the expiry boundary this can be off by the tz offset — confirm.
    return timeCreated + timedelta(days=CACHE_TIME_EXPIRE) > datetime.now()
72
+
73
def removeFile(fileNameWithPath):
    """Delete the given file if it exists, logging the outcome either way."""
    # Guard clause: nothing to do when the file is already gone.
    if not os.path.exists(fileNameWithPath):
        print(f"File '{fileNameWithPath}' does not exist.")
        return
    os.remove(fileNameWithPath)
    print(f"File '{fileNameWithPath}' deleted successfully.")
79
+
80
+
81
def saveTablesDataToLocalDB(tablesData):
    """Persist {table_name: DataFrame} into a fresh SQLite db under TABLES_DATA_DIR.

    Any previously cached db files are deleted first, then every frame is
    written to a new db named after today's date in US/Pacific
    ('YYYY-MM-DD.db') — the name is how isTablesCacheValid later recovers the
    creation date.
    """
    # Drop all previous snapshots so the directory holds exactly one db.
    for prevDbs in os.listdir(TABLES_DATA_DIR):
        removeFile(os.path.join(TABLES_DATA_DIR, prevDbs))
    newLocalDb = datetime.now(TIMEZONE_OBJ).strftime('%Y-%m-%d') + '.db'
    localDbNameWithPath = os.path.join(TABLES_DATA_DIR, newLocalDb)
    print(f"saving to local db {localDbNameWithPath}")
    conn = sqlite3.connect(localDbNameWithPath)
    try:
        for tableName, frame in tablesData.items():
            frame.to_sql(tableName, conn, if_exists='replace', index=False)
    finally:
        # Close even when a write raises, so the db file isn't left locked
        # (the original leaked the connection on a to_sql failure).
        conn.close()
91
+
92
def retrieveTablesDataFromLocalDb(tablesList):
    """Load the requested tables from the local SQLite cache.

    Returns {table_name: DataFrame} for every name in tablesList, or {} when
    there is no cache file, the cache has expired (the stale file is deleted
    in that case), or any single table cannot be read — a partial cache is
    treated as invalid so the caller falls back to the real database.
    """
    print("retreving tables from localDb")
    localDbName = getLocalDbFileName()
    if localDbName is None:
        return {}
    localDbNameWithPath = os.path.join(TABLES_DATA_DIR, localDbName)
    if not isTablesCacheValid():
        # Stale cache: drop the file so the next save starts clean.
        removeFile(localDbNameWithPath)
        return {}

    conn = sqlite3.connect(localDbNameWithPath)
    try:
        data = {}
        for tableName in tablesList:
            try:
                # Quote the identifier — table names can't be parameterized in SQL.
                sql = f'SELECT * FROM "{tableName}"'
                df = pd.read_sql_query(sql, con=conn)
                data[tableName] = df
            except Exception:
                # Narrowed from a bare except (which also caught
                # KeyboardInterrupt/SystemExit).
                print(f"Couldn't read {tableName} from localDb. Advise to read all the tables.")
                return {}
        return data
    finally:
        # Single close point replaces the duplicated conn.close() calls.
        conn.close()
utils.py CHANGED
@@ -1,6 +1,7 @@
1
  import psycopg2
2
  import re
3
  import pandas as pd
 
4
 
5
  class DataWrapper:
6
  def __init__(self, data):
@@ -111,15 +112,22 @@ def getAllTablesInfo(dbEngine, schemaName):
111
  return tablesAndCols
112
 
113
  def getSampleDataForTablesAndCols(dbEngine, schemaName, tablesAndCols, maxRows):
114
- data = {}
 
 
 
 
115
  dbEngine.connect()
116
  conn = dbEngine.getConnection()
 
117
  for table in tablesAndCols.keys():
118
  try:
119
  sqlQuery = f"""select * from {schemaName}.{table} limit {maxRows}"""
120
  data[table] = pd.read_sql_query(sqlQuery, con=conn)
121
  except:
122
  print(f"couldn't read table data. Table: {table}")
 
 
123
  return data
124
 
125
  # Function to test the generated sql query
 
1
  import psycopg2
2
  import re
3
  import pandas as pd
4
+ from persistStorage import retrieveTablesDataFromLocalDb, saveTablesDataToLocalDB
5
 
6
  class DataWrapper:
7
  def __init__(self, data):
 
112
  return tablesAndCols
113
 
114
def getSampleDataForTablesAndCols(dbEngine, schemaName, tablesAndCols, maxRows):
    """Return {table_name: DataFrame} of up to maxRows sample rows per table.

    Tries the local SQLite cache first; on a miss, queries the warehouse
    through dbEngine and refreshes the cache with whatever was fetched.
    """
    data = retrieveTablesDataFromLocalDb(list(tablesAndCols.keys()))
    if data != {}:
        # BUGFIX: the original printed "Didn't find any cache/valid cache."
        # here — i.e. on a cache HIT. Log the hit and serve the cache.
        print("Serving tables data from local cache.")
        return data
    print("Didn't find any cache/valid cache.")

    dbEngine.connect()
    conn = dbEngine.getConnection()
    print("Getting data from aws redshift")
    for table in tablesAndCols.keys():
        try:
            sqlQuery = f"""select * from {schemaName}.{table} limit {maxRows}"""
            data[table] = pd.read_sql_query(sqlQuery, con=conn)
        except Exception:
            # Best-effort: an unreadable table gets an empty frame so callers
            # and the cache still see a consistent key set.
            print(f"couldn't read table data. Table: {table}")
            data[table] = pd.DataFrame({})
    saveTablesDataToLocalDB(data)
    return data
132
 
133
  # Function to test the generated sql query