Spaces:
Sleeping
Sleeping
File size: 7,556 Bytes
e4d28dd becf5f6 e4d28dd becf5f6 e4d28dd 02d09cb e4d28dd 1e43cc3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
# NCTC
import os
import shutil
import gradio as gr
from transformers import ReactCodeAgent, HfEngine, Tool
import pandas as pd
from gradio import Chatbot
from transformers.agents import stream_to_gradio
from huggingface_hub import login
from gradio.data_classes import FileData
# Authenticate with the Hugging Face Hub using the token read from the
# HUGGINGFACEHUB_API_TOKEN environment variable.
login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

# Engine wrapping the hosted Llama 3.1 70B Instruct model that drives the agent.
llm_engine = HfEngine("meta-llama/Meta-Llama-3.1-70B-Instruct")

# Code-writing ReAct agent: no extra tools, at most 10 iterations, and an
# explicit allow-list of modules the generated code may import.
agent = ReactCodeAgent(
    llm_engine=llm_engine,
    tools=[],
    max_iterations=10,
    additional_authorized_imports=[
        "numpy",
        "pandas",
        "matplotlib.pyplot",
        "seaborn",
        "scipy.stats",
    ],
)
# base_prompt = """You are an expert data analyst of National Customs Targeting Center. You will be uploaded with CSV file with multiple columns of numerical , categorical and text variables.
# According to the features you have and the data structure given below, determine which feature should be the target.
# Then list 3 interesting questions that could be asked on this data, for instance about specific correlations with target variable.
# Then answer these questions one by one, by finding the relevant numbers.
# Meanwhile, plot some figures using matplotlib/seaborn and save them to the (already existing) folder './figures/': take care to clear each figure with plt.clf() before doing another plot.
# In your final answer: summarize these correlations and trends
# After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".
# Your final answer should be a long string with at least 3 numbered and detailed parts.
# Structure of the data:
# {structure_notes}
# The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
# DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!
# """
# Prompt template sent to the agent. It is filled in with
# `base_prompt.format(structure_notes=...)`, so `{structure_notes}` is the only
# placeholder; any literal curly brace added to this text must be doubled.
base_prompt = """You are an expert data analyst at the National Customs Targeting Center. You will be provided with a CSV file containing multiple columns of numerical, categorical, and text variables.
Your tasks are:
1. **Target Identification**:
- Determine which feature(s) should be the target for analysis. Focus primarily on numerical and categorical columns, and avoid using unstructured text columns as targets.
2. **Generate Interesting Questions**:
- Based on the identified target features, list at least 3 interesting questions that could be asked. For instance, explore specific correlations with the target variable(s), trends, or patterns.
3. **Answer the Questions**:
- Answer these questions one by one by analyzing the data and finding relevant numbers.
- Generate insights from these answers. For example: "Correlation between `is_december` and `boredness` is 1.3453, suggesting that people are more bored in winter."
4. **Generate Outlier Insights**:
- Identify outliers for each variable in the dataset.
- Provide insights into the outliers, including printing the outlier records and explaining their significance.
5. **Visualization**:
- Plot multiple figures using matplotlib or seaborn.
- Generate plots for various target columns, covering both numerical and categorical columns.
- Ensure each figure is saved to the './figures/' folder and clear each figure with `plt.clf()` before generating the next plot.
- Include relevant plots that visualize correlations, trends, distributions, and outliers.
6. **Final Summary**:
- Summarize the correlations, trends, and outlier insights in a detailed manner. Provide at least 3 numbered and detailed parts in the summary.
Structure of the data:
{structure_notes}
The data file is passed to you as the variable `data_file`, which is a pandas dataframe, and you can use it directly. DO NOT try to load `data_file`, as it is already pre-loaded in your Python interpreter!
Your final output should include:
1. The identified target feature(s).
2. Three interesting questions with detailed answers and real-world insights.
3. Outlier insights for each variable, including the outlier records.
4. Multiple saved plots in the './figures/' folder.
5. A long, detailed final summary.
"""
# Free-text description of the bundled sample CSV; used as the pre-filled
# "additional notes" value of the example row in the UI.
example_notes="""This data is about a sample Customs dataset with products imports (IMP_DESC),Importer ID( IEC No.), SUPPLIER ID , (item unit price) ITEM_UPI , CTH for product classification (Declared CTH), declared BCD Notification benefit (BCD Notification No. Declared) and value of import (ITEM_ASSESS_VAL)"""
def get_images_in_directory(directory):
    """Return the paths of all image files found under *directory*.

    The directory tree is walked recursively with ``os.walk``. A file counts
    as an image when its extension, compared case-insensitively, is one of
    .png, .jpg, .jpeg, .gif, .bmp or .tiff. Each returned path is the walked
    folder joined with the file name. A directory that does not exist yields
    an empty list.
    """
    recognised_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(directory)
        for filename in filenames
        if os.path.splitext(filename)[1].lower() in recognised_suffixes
    ]
def interact_with_agent(file_input, additional_notes):
    """Run the analysis agent on an uploaded CSV and stream the chat transcript.

    Generator used as the Gradio click handler. Each yielded value is the
    complete list of chat messages to display so far; intermediate yields end
    with a temporary progress message, the final yield does not.

    Args:
        file_input: Path or file-like object of the CSV, handed to
            ``pandas.read_csv``. ``None`` when nothing was uploaded.
        additional_notes: Optional free-text description of the data. It is
            appended to the prompt when non-empty.

    Yields:
        list: ``gr.ChatMessage`` objects — the prompt, the messages streamed
        by the agent, and one image message per figure found in
        ``./figures``.

    Raises:
        gr.Error: If no file was provided.
    """
    import mimetypes  # local import: only needed to label figure attachments

    if file_input is None:
        # Fail with a readable UI message instead of an opaque pandas error.
        raise gr.Error("Please upload a CSV file before running the analysis.")

    figures_dir = "./figures"
    # Start each run with an empty figures folder. The folder may not exist
    # yet (first run), so only remove it when it is actually there.
    if os.path.isdir(figures_dir):
        shutil.rmtree(figures_dir)
    os.makedirs(figures_dir, exist_ok=True)

    data_file = pd.read_csv(file_input)
    data_structure_notes = f"""- Description (output of .describe()):
{data_file.describe()}
- Columns with dtypes:
{data_file.dtypes}"""
    prompt = base_prompt.format(structure_notes=data_structure_notes)
    if additional_notes:
        prompt += "\nAdditional notes on the data:\n" + additional_notes

    messages = [gr.ChatMessage(role="user", content=prompt)]
    yield messages + [
        gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
    ]

    # Figures already attached to the chat, so each one is shown only once.
    shown_images = set()
    for msg in stream_to_gradio(agent, prompt, data_file=data_file):
        messages.append(msg)
        # After every agent message, attach any figure that appeared on disk.
        for image_path in get_images_in_directory(figures_dir):
            if image_path in shown_images:
                continue
            shown_images.add(image_path)
            # The scan also returns jpg/gif/bmp/tiff, so derive the MIME type
            # from the file name and fall back to PNG when it is unknown.
            mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
            messages.append(
                gr.ChatMessage(
                    role="assistant",
                    content=FileData(path=image_path, mime_type=mime_type),
                )
            )
        yield messages + [
            gr.ChatMessage(role="assistant", content="⏳ _Still processing..._")
        ]
    # Final transcript, without the temporary progress message.
    yield messages
import gradio as gr
# Page layout: header, file upload, notes box, run button, chat transcript
# and one clickable example. Components render in the order they are created.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.green,  # Changing to a fresh green
        secondary_hue=gr.themes.colors.purple,  # Adding a touch of regal purple
    )
) as demo:
    gr.Markdown("""
<h1 style='color: darkblue; font-size: 2.5em;'>NCTC Llama-3.1 Data Analysis Agent ππ€</h1>
<p><b>NCTC's attempt to use LLM-based ReAct Autonomous Agents to assist in smart customs data analysis</b></p>
<p>Drop a .csv file below, add notes to describe this data if needed, and Llama-3.1-70B will analyze the file content and draw figures for you!</p>
""")

    # Inputs passed to the handler.
    file_input = gr.File(label="Your file to analyze")
    text_input = gr.Textbox(label="Additional notes to support the analysis")
    submit = gr.Button("Run analysis!", variant="primary")

    # Transcript of the run; the second avatar is the assistant's.
    chatbot = gr.Chatbot(
        label="Data Analyst Agent",
        type="messages",
        avatar_images=(
            None,
            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
        ),
    )

    # One example row that fills both inputs; results are not cached.
    gr.Examples(
        examples=[["./example/sample_customs_data_anonymised.csv", example_notes]],
        inputs=[file_input, text_input],
        cache_examples=False,
    )

    # Clicking the button streams the handler's yielded messages to the chat.
    submit.click(
        fn=interact_with_agent,
        inputs=[file_input, text_input],
        outputs=[chatbot],
    )
if __name__ == "__main__":
    # Launch the Gradio app only when this file is executed as a script.
    demo.launch()