Spaces:

asoria
/

auto-dataset-analyst-creator

Build error

App Files Files Community

asoria committed on Aug 21, 2024

Commit

a093cd2

1 Parent(s): 9f3ff28

Adding outlines for prompts

Browse files

Files changed (5) hide show

app.py +4 -59
requirements.txt +2 -1
utils/__init__.py +0 -0
utils/__pycache__/prompts.cpython-310.pyc +0 -0
utils/prompts.py +47 -0

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ import json
 import re
 import pandas as pd
 from gradio.data_classes import FileData
 """
 TODOs:
@@ -48,62 +48,6 @@ def get_compatible_libraries(dataset: str):
     return resp.json()
-def generate_mapping_prompt(code):
-    logging.info("Generating mapping prompt")
-    logging.info(code)
-    format_instructions = "Format the following python code to a list of cells to be used in a jupyter notebook:\n"
-    format_instructions += code
-    format_instructions += """
-The output should be a markdown code snippet formatted in the
-following schema, including the leading and trailing "```json" and "```":
-```json
-[
-    {
-        "cell_type": string  // This refers either is a markdown or code cell type.
-        "source": list of string separated by comma // This is the list of text or python code.
-    }
-]
-```
-"""
-    return format_instructions
-def generate_eda_prompt(columns_info, df, first_code):
-    sample_data = df.head(5).to_dict(orient="records")
-    prompt = """
-You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
-Columns and Data Types:
-{columns_info}
-Sample Data:
-{sample_data}
-Please create a pandas EDA notebook that includes the following:
-1. Summary statistics for numerical columns.
-2. Distribution plots for numerical columns.
-3. Bar plots or count plots for categorical columns.
-4. Correlation matrix and heatmap for numerical columns.
-5. Any additional relevant visualizations or analyses you deem appropriate.
-Ensure the notebook is well-organized, with explanations for each step.
-It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
-{first_code}
-"""
-    return prompt.format(
-        columns_info=columns_info,
-        sample_data=sample_data,
-        first_code=first_code,
-    )
 def create_notebook_file(cell_commands, notebook_name):
     nb = nbf.v4.new_notebook()
     nb["cells"] = [
@@ -205,7 +149,8 @@ def generate_cells(dataset_id):
     first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
     logging.info(f"First split file: {first_file}")
     features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
-    prompt = generate_eda_prompt(features, df, first_code)
     messages = [gr.ChatMessage(role="user", content=prompt)]
     yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
@@ -226,7 +171,7 @@ def generate_cells(dataset_id):
         yield messages
     yield messages
-    logging.info("---> FOrmated prompt")
     formatted_prompt = generate_mapping_prompt(GENERATED_TEXT)
     logging.info(formatted_prompt)
     prompt_messages = [{"role": "user", "content": formatted_prompt}]

 import re
 import pandas as pd
 from gradio.data_classes import FileData
+from utils.prompts import generate_mapping_prompt, generate_eda_prompt
 """
 TODOs:
     return resp.json()
 def create_notebook_file(cell_commands, notebook_name):
     nb = nbf.v4.new_notebook()
     nb["cells"] = [
     first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
     logging.info(f"First split file: {first_file}")
     features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
+    sample_data = df.head(5).to_dict(orient="records")
+    prompt = generate_eda_prompt(features, sample_data, first_code)
     messages = [gr.ChatMessage(role="user", content=prompt)]
     yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
         yield messages
     yield messages
+    logging.info("---> Formated prompt")
     formatted_prompt = generate_mapping_prompt(GENERATED_TEXT)
     logging.info(formatted_prompt)
     prompt_messages = [{"role": "user", "content": formatted_prompt}]

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 gradio_huggingfacehub_search==0.0.7
 huggingface_hub
 nbformat
-httpx

 gradio_huggingfacehub_search==0.0.7
 huggingface_hub
 nbformat
+httpx
+outlines

utils/__init__.py ADDED Viewed

File without changes

utils/__pycache__/prompts.cpython-310.pyc ADDED Viewed

Binary file (1.86 kB). View file

utils/prompts.py ADDED Viewed

@@ -0,0 +1,47 @@

+import outlines
+@outlines.prompt
+def generate_mapping_prompt(code):
+    """Format the following python code to a list of cells to be used in a jupyter notebook:
+    {{ code }}
+    The output should be a markdown code snippet formatted in the
+    following schema, including the leading and trailing "```json" and "```":
+    ```json
+    [
+        {
+            "cell_type": string  // This refers either is a markdown or code cell type.
+            "source": list of string separated by comma // This is the list of text or python code.
+        }
+    ]
+    ```
+    """
+@outlines.prompt
+def generate_eda_prompt(columns_info, sample_data, first_code):
+    """You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
+    Columns and Data Types:
+    {{ columns_info }}
+    Sample Data:
+    {{ sample_data }}
+    Please create a pandas EDA notebook that includes the following:
+    1. Summary statistics for numerical columns.
+    2. Distribution plots for numerical columns.
+    3. Bar plots or count plots for categorical columns.
+    4. Correlation matrix and heatmap for numerical columns.
+    5. Any additional relevant visualizations or analyses you deem appropriate.
+    Ensure the notebook is well-organized, with explanations for each step.
+    It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
+    {{ first_code }}
+    """