Spaces:

asoria
/

auto-dataset-analyst-creator

Build error

App Files Files Community

asoria committed on Aug 20, 2024

Commit

777edd0

1 Parent(s): 44cdaf2

Adding second layer to parse code to cells

Browse files

Files changed (1) hide show

app.py +137 -40

app.py CHANGED Viewed

@@ -8,6 +8,8 @@ from huggingface_hub import InferenceClient
 import json
 import re
 import pandas as pd
 """
 TODOs:
@@ -30,6 +32,8 @@ TODOs:
 # Configuration
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
 client = Client(headers=HEADERS)
 inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
@@ -44,9 +48,12 @@ def get_compatible_libraries(dataset: str):
     return resp.json()
-def generate_eda_prompt(columns_info, df, first_code):
-    sample_data = df.head(5).to_dict(orient="records")
-    format_instructions = """
 The output should be a markdown code snippet formatted in the
 following schema, including the leading and trailing "```json" and "```":
@@ -58,7 +65,13 @@ following schema, including the leading and trailing "```json" and "```":
     }
 ]
 ```
-"""
     prompt = """
 You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
@@ -83,13 +96,11 @@ It is mandatory that you use the following code to load the dataset, DO NOT try
 {first_code}
-{format_instructions}
 """
     return prompt.format(
         columns_info=columns_info,
         sample_data=sample_data,
         first_code=first_code,
-        format_instructions=format_instructions,
     )
@@ -141,40 +152,40 @@ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
     return features_dict, first_rows_df
 def content_from_output(output):
     pattern = r"`json(.*?)`"
-    logging.info("--------> Getting data from output")
     match = re.search(pattern, output, re.DOTALL)
     if not match:
         pattern = r"```(.*?)```"
-        logging.info("--------> Getting data from output, second try")
         match = re.search(pattern, output, re.DOTALL)
         if not match:
             raise Exception("Unable to generate jupyter notebook.")
-    extracted_text = match.group(1)
-    logging.info(extracted_text)
-    content = json.loads(extracted_text)
-    logging.info(content)
-    return content
-def get_notebook_cells(prompt):
-    messages = [{"role": "user", "content": prompt}]
-    output = inference_client.chat_completion(messages=messages, max_tokens=2500)
-    output = output.choices[0].message.content
-    return content_from_output(output)
-def generate_notebook(dataset_id):
     try:
         libraries = get_compatible_libraries(dataset_id)
     except Exception as err:
         gr.Error("Unable to retrieve dataset info from HF Hub.")
         logging.error(f"Failed to fetch compatible libraries: {err}")
-        return None
     if not libraries:
-        gr.Warning("Dataset not compatible with pandas library.")
         logging.error(f"Dataset not compatible with pandas library")
         return gr.File(visible=False), gr.Row.update(visible=False)
@@ -183,29 +194,103 @@ def generate_notebook(dataset_id):
         None,
     )
     if not pandas_library:
-        gr.Warning("Dataset not compatible with pandas library.")
-        logging.error(f"Dataset not compatible with pandas library")
-        return gr.File(visible=False), gr.Row.update(visible=False)
     first_config_loading_code = pandas_library["loading_codes"][0]
     first_code = first_config_loading_code["code"]
     first_config = first_config_loading_code["config_name"]
     first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
     logging.info(f"First config: {first_config} - first split: {first_split}")
     first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
     logging.info(f"First split file: {first_file}")
-    html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
     features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
     prompt = generate_eda_prompt(features, df, first_code)
-    logging.info(f"Prompt: {prompt}")
-    commands = get_notebook_cells(prompt)
     # Adding dataset viewer on the first part
-    commands.insert(0, {"cell_type": "code", "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))'})
     commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
     notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
     create_notebook_file(commands, notebook_name=notebook_name)
-    return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
 with gr.Blocks() as demo:
@@ -231,8 +316,24 @@ with gr.Blocks() as demo:
         """
         return gr.HTML(value=html_code)
-    generate_btn = gr.Button("Generate notebook")
-    download_link = gr.File(label="Download notebook", visible=False)
     with gr.Row(visible=False) as auth_page:
         with gr.Column():
             gr.Markdown(
@@ -246,11 +347,7 @@ with gr.Blocks() as demo:
     push_btn = gr.Button("Push notebook to hub", visible=False)
     output_lbl = gr.HTML(value="", visible=False)
-    generate_btn.click(
-        generate_notebook,
-        inputs=[dataset_name],
-        outputs=[download_link, auth_page],
-    )
     def auth(token):
         if not token:
@@ -271,7 +368,7 @@ with gr.Blocks() as demo:
     push_btn.click(
         push_notebook,
-        inputs=[download_link, dataset_name, token_box],
         outputs=output_lbl,
     )

 import json
 import re
 import pandas as pd
+from gradio.data_classes import FileData
 """
 TODOs:
 # Configuration
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
+GENERATED_TEXT = ""
 client = Client(headers=HEADERS)
 inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
     return resp.json()
+def generate_mapping_prompt(code):
+    logging.info("Generating mapping prompt")
+    logging.info(code)
+    format_instructions = "Format the following python code to a list of cells to be used in a jupyter notebook:\n"
+    format_instructions += code
+    format_instructions += """
 The output should be a markdown code snippet formatted in the
 following schema, including the leading and trailing "```json" and "```":
     }
 ]
 ```
+"""
+    return format_instructions
+def generate_eda_prompt(columns_info, df, first_code):
+    sample_data = df.head(5).to_dict(orient="records")
     prompt = """
 You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
 {first_code}
 """
     return prompt.format(
         columns_info=columns_info,
         sample_data=sample_data,
         first_code=first_code,
     )
     return features_dict, first_rows_df
+def get_txt_from_output(output):
+    extracted_text = content_from_output(output)
+    content = json.loads(extracted_text)
+    logging.info(content)
+    return content
 def content_from_output(output):
     """Extract the payload of a fenced code block from a model response.

     Strategies are tried in order and the first hit wins:

     1. a complete ```json ... ``` fence,
     2. the legacy single-backtick form (`json ... `),
     3. any complete untagged ``` ... ``` fence,
     4. everything after an opening ```json fence that was never closed
        (e.g. a response that stops before the closing fence).

     Args:
         output: Raw text returned by the model.

     Returns:
         The text found inside the fence, neither stripped nor parsed.

     Raises:
         Exception: If none of the strategies finds a fenced block.
     """
     # The complete fence is tried first: the lazy single-backtick pattern stops
     # at the first backtick it meets, which truncates payloads that contain
     # inline code such as `df`.
     fence_patterns = (r"```json(.*?)```", r"`json(.*?)`", r"```(.*?)```")
     for fence_pattern in fence_patterns:
         match = re.search(fence_pattern, output, re.DOTALL)
         if match:
             return match.group(1)

     # Last resort: an opening fence with no closing one. str.find avoids the
     # try/except that previously swallowed every exception type.
     opening_fence = "```json"
     index = output.find(opening_fence)
     if index != -1:
         logging.info(f"Index: {index}")
         return output[index + len(opening_fence) :]
     raise Exception("Unable to generate jupyter notebook.")
+def generate_cells(dataset_id):
     try:
         libraries = get_compatible_libraries(dataset_id)
     except Exception as err:
         gr.Error("Unable to retrieve dataset info from HF Hub.")
         logging.error(f"Failed to fetch compatible libraries: {err}")
+        return []
     if not libraries:
+        gr.Error("Dataset not compatible with pandas library.")
         logging.error(f"Dataset not compatible with pandas library")
         return gr.File(visible=False), gr.Row.update(visible=False)
         None,
     )
     if not pandas_library:
+        gr.Error("Dataset not compatible with pandas library.")
+        return []
     first_config_loading_code = pandas_library["loading_codes"][0]
     first_code = first_config_loading_code["code"]
     first_config = first_config_loading_code["config_name"]
     first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
     logging.info(f"First config: {first_config} - first split: {first_split}")
     first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
     logging.info(f"First split file: {first_file}")
     features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
     prompt = generate_eda_prompt(features, df, first_code)
+    messages = [gr.ChatMessage(role="user", content=prompt)]
+    yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
+    prompt_messages = [{"role": "user", "content": prompt}]
+    output = inference_client.chat_completion(
+        messages=prompt_messages, stream=True, max_tokens=2500
+    )
+    global GENERATED_TEXT
+    GENERATED_TEXT = ""
+    current_line = ""
+    for chunk in output:
+        current_line += chunk.choices[0].delta.content
+        if current_line.endswith("\n"):
+            GENERATED_TEXT += current_line
+            messages.append(gr.ChatMessage(role="assistant", content=current_line))
+            current_line = ""
+        yield messages
+    yield messages
+    logging.info("---> FOrmated prompt")
+    formatted_prompt = generate_mapping_prompt(GENERATED_TEXT)
+    logging.info(formatted_prompt)
+    prompt_messages = [{"role": "user", "content": formatted_prompt}]
+    yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Generating notebook..._")]
+    output = inference_client.chat_completion(
+        messages=prompt_messages, stream=False, max_tokens=2500
+    )
+    cells_txt = output.choices[0].message.content
+    logging.info("---> Model output")
+    logging.info(cells_txt)
+    commands = get_txt_from_output(cells_txt)
+    html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
     # Adding dataset viewer on the first part
+    commands.insert(
+        0,
+        {
+            "cell_type": "code",
+            "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
+        },
+    )
+    commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
+    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
+    create_notebook_file(commands, notebook_name=notebook_name)
+    messages.append(
+        gr.ChatMessage(role="user", content="Here is the generated notebook")
+    )
+    yield messages
+    messages.append(
+        gr.ChatMessage(
+            role="user",
+            content=FileData(path=notebook_name, mime_type="application/x-ipynb+json"),
+        )
+    )
+    yield messages
+def write_notebook_file(dataset_id, history):
+    if not GENERATED_TEXT:
+        raise Exception("No generated notebook")
+    commands = get_txt_from_output(GENERATED_TEXT)
+    html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
+    # Adding dataset viewer on the first part
+    commands.insert(
+        0,
+        {
+            "cell_type": "code",
+            "source": f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
+        },
+    )
     commands.insert(0, {"cell_type": "markdown", "source": "# Dataset Viewer"})
     notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
     create_notebook_file(commands, notebook_name=notebook_name)
+    history.append(
+        gr.ChatMessage(role="user", content="Here is the generated notebook")
+    )
+    history.append(
+        gr.ChatMessage(
+            role="user",
+            content=FileData(path=notebook_name, mime_type="application/x-ipynb+json"),
+        )
+    )
+    return history
 with gr.Blocks() as demo:
         """
         return gr.HTML(value=html_code)
+    generate_cells_btn = gr.Button("Generate notebook")
+    chatbot = gr.Chatbot(
+        label="Results",
+        type="messages",
+        avatar_images=(
+            None,
+            None,
+        ),
+    )
+    generate_cells_btn.click(
+        generate_cells,
+        inputs=[dataset_name],
+        outputs=[chatbot],
+    )
     with gr.Row(visible=False) as auth_page:
         with gr.Column():
             gr.Markdown(
     push_btn = gr.Button("Push notebook to hub", visible=False)
     output_lbl = gr.HTML(value="", visible=False)
     def auth(token):
         if not token:
     push_btn.click(
         push_notebook,
+        inputs=[dataset_name, token_box],
         outputs=output_lbl,
     )