Spaces:
Sleeping
Sleeping
Mustehson committed on
Commit ·
98f0179
1
Parent(s): 6dda383
Data Summary
Browse files
app.py
CHANGED
|
@@ -15,11 +15,15 @@ warnings.filterwarnings("ignore", category=DeprecationWarning)
|
|
| 15 |
|
| 16 |
# Height of the Tabs Text Area
|
| 17 |
TAB_LINES = 8
|
| 18 |
-
|
|
|
|
|
|
|
| 19 |
md_token = os.getenv('MD_TOKEN')
|
| 20 |
-
# Connect to DB
|
| 21 |
conn = duckdb.connect(f"md:my_db?motherduck_token={md_token}", read_only=True)
|
|
|
|
|
|
|
| 22 |
|
|
|
|
| 23 |
models = ["Qwen/Qwen2.5-72B-Instruct","meta-llama/Meta-Llama-3-70B-Instruct",
|
| 24 |
"meta-llama/Llama-3.1-70B-Instruct"]
|
| 25 |
|
|
@@ -35,13 +39,13 @@ for model in models:
|
|
| 35 |
continue
|
| 36 |
|
| 37 |
llm = ChatHuggingFace(llm=endpoint).bind_tools(tools=[], max_tokens=8192)
|
|
|
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
prompt_autogenerate = hub.pull("autogenerate-rules-testworkflow")
|
| 42 |
prompt_user_input = hub.pull("usergenerate-rules-testworkflow")
|
| 43 |
|
| 44 |
-
|
| 45 |
# Get Databases
|
| 46 |
def get_schemas():
|
| 47 |
schemas = conn.execute("""
|
|
@@ -67,10 +71,18 @@ def get_data_df(schema):
|
|
| 67 |
|
| 68 |
|
| 69 |
def format_prompt(df):
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
def process_inputs(inputs) :
|
| 76 |
print(inputs)
|
|
@@ -167,6 +179,10 @@ def statistics(df):
|
|
| 167 |
df_alerts = pd.DataFrame(alerts_list, columns=['Data Quality Issue', 'Category'])
|
| 168 |
|
| 169 |
return df_statistics, df_alerts
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
# Main Function
|
| 171 |
def main(table):
|
| 172 |
schema = get_table_schema(table)
|
|
|
|
| 15 |
|
| 16 |
# Height of the Tabs Text Area
|
| 17 |
TAB_LINES = 8
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#----------CONNECT TO DATABASE----------
|
| 21 |
md_token = os.getenv('MD_TOKEN')
|
|
|
|
| 22 |
conn = duckdb.connect(f"md:my_db?motherduck_token={md_token}", read_only=True)
|
| 23 |
+
#---------------------------------------
|
| 24 |
+
|
| 25 |
|
| 26 |
+
#-------LOAD HUGGINGFACE-------
|
| 27 |
models = ["Qwen/Qwen2.5-72B-Instruct","meta-llama/Meta-Llama-3-70B-Instruct",
|
| 28 |
"meta-llama/Llama-3.1-70B-Instruct"]
|
| 29 |
|
|
|
|
| 39 |
continue
|
| 40 |
|
| 41 |
llm = ChatHuggingFace(llm=endpoint).bind_tools(tools=[], max_tokens=8192)
|
| 42 |
+
#---------------------------------------
|
| 43 |
|
| 44 |
+
#-----LOAD PROMPT FROM LANGCHAIN HUB-----
|
|
|
|
| 45 |
prompt_autogenerate = hub.pull("autogenerate-rules-testworkflow")
|
| 46 |
prompt_user_input = hub.pull("usergenerate-rules-testworkflow")
|
| 47 |
|
| 48 |
+
#--------------ALL UTILS----------------
|
| 49 |
# Get Databases
|
| 50 |
def get_schemas():
|
| 51 |
schemas = conn.execute("""
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
def format_prompt(df):
    # Build a per-column summary (max, min, most-frequent value, cardinality,
    # non-null count, dtype) and feed it — together with a JSON sample of the
    # first rows — into the auto-generate-rules prompt pulled from LangChain Hub.
    stats = {
        "max": df.max(),
        "min": df.min(),
        "top": df.mode().iloc[0],
        "nunique": df.nunique(),
        "count": df.count(),
        "dtype": df.dtypes.astype(str),
    }
    summary_frame = (
        pd.DataFrame(stats)
        .reset_index()
        .rename(columns={"index": "column"})
    )
    sample_json = df.head().to_json(orient='records')
    return prompt_autogenerate.format_prompt(data=sample_json,
                                             summary=summary_frame.to_json(orient='records'))
|
| 84 |
+
def format_user_prompt(df):
    # Format the user-input prompt with a JSON sample of the DataFrame's first rows.
    sample_json = df.head().to_json(orient='records')
    return prompt_user_input.format_prompt(data=sample_json)
|
| 86 |
|
| 87 |
def process_inputs(inputs) :
|
| 88 |
print(inputs)
|
|
|
|
| 179 |
df_alerts = pd.DataFrame(alerts_list, columns=['Data Quality Issue', 'Category'])
|
| 180 |
|
| 181 |
return df_statistics, df_alerts
|
| 182 |
+
#---------------------------------------
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
|
| 186 |
# Main Function
|
| 187 |
def main(table):
|
| 188 |
schema = get_table_schema(table)
|