File size: 7,556 Bytes
e4d28dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
becf5f6
 
e4d28dd
 
 
becf5f6
e4d28dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02d09cb
e4d28dd
 
 
 
 
 
 
1e43cc3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
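# NCTC (National Customs Targeting Center) data analysis agent: a Gradio app that lets an LLM agent analyse an uploaded CSV file.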
import os
import shutil
import gradio as gr
from transformers import ReactCodeAgent, HfEngine, Tool
import pandas as pd

from gradio import Chatbot
from transformers.agents import stream_to_gradio
from huggingface_hub import login
from gradio.data_classes import FileData

# Authenticate with the Hugging Face Hub using the token read from the
# HUGGINGFACEHUB_API_TOKEN environment variable.
# NOTE(review): os.getenv returns None when the variable is unset — confirm
# how login() behaves in that case in the deployment environment.
login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

# LLM engine handed to the agent below, selected by its Hub model id.
llm_engine = HfEngine("meta-llama/Meta-Llama-3.1-70B-Instruct")

# Module-level agent shared by every call to interact_with_agent().
# It is given no extra tools; `additional_authorized_imports` presumably lists
# the modules the agent's generated code is allowed to import, and
# `max_iterations` presumably caps its reasoning/acting steps — verify against
# the transformers agents documentation.
agent = ReactCodeAgent(
    tools=[],
    llm_engine=llm_engine,
    additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "scipy.stats"],
    max_iterations=10,
)

# base_prompt = """You are an expert data analyst of National Customs Targeting Center. You will be uploaded with CSV file with multiple columns of numerical , categorical and text variables.
# According to the features you have and the data structure given below, determine which feature should be the target.
# Then list 3 interesting questions that could be asked on this data, for instance about specific correlations with target variable.
# Then answer these questions one by one, by finding the relevant numbers.
# Meanwhile, plot some figures using matplotlib/seaborn and save them to the (already existing) folder './figures/': take care to clear each figure with plt.clf() before doing another plot.

# In your final answer: summarize these correlations and trends
# After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".
# Your final answer should be a long string with at least 3 numbered and detailed parts.

# Structure of the data:
# {structure_notes}

# The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
# DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!
# """
# Task prompt sent to the agent. It is a str.format template: the single
# placeholder {structure_notes} is filled in by interact_with_agent() with the
# dataframe's describe() output and column dtypes. Because the text goes
# through str.format, any literal curly braces added here must be doubled.
base_prompt = """You are an expert data analyst at the National Customs Targeting Center. You will be provided with a CSV file containing multiple columns of numerical, categorical, and text variables.

Your tasks are:
1. **Target Identification**:
   - Determine which feature(s) should be the target for analysis. Focus primarily on numerical and categorical columns, and avoid using unstructured text columns as targets.

2. **Generate Interesting Questions**:
   - Based on the identified target features, list at least 3 interesting questions that could be asked. For instance, explore specific correlations with the target variable(s), trends, or patterns.

3. **Answer the Questions**:
   - Answer these questions one by one by analyzing the data and finding relevant numbers.
   - Generate insights from these answers. For example: "Correlation between `is_december` and `boredness` is 1.3453, suggesting that people are more bored in winter."

4. **Generate Outlier Insights**:
   - Identify outliers for each variable in the dataset.
   - Provide insights into the outliers, including printing the outlier records and explaining their significance.

5. **Visualization**:
   - Plot multiple figures using matplotlib or seaborn.
   - Generate plots for various target columns, covering both numerical and categorical columns.
   - Ensure each figure is saved to the './figures/' folder and clear each figure with `plt.clf()` before generating the next plot.
   - Include relevant plots that visualize correlations, trends, distributions, and outliers.

6. **Final Summary**:
   - Summarize the correlations, trends, and outlier insights in a detailed manner. Provide at least 3 numbered and detailed parts in the summary.

Structure of the data:
{structure_notes}

The data file is passed to you as the variable `data_file`, which is a pandas dataframe, and you can use it directly. DO NOT try to load `data_file`, as it is already pre-loaded in your Python interpreter!

Your final output should include:
1. The identified target feature(s).
2. Three interesting questions with detailed answers and real-world insights.
3. Outlier insights for each variable, including the outlier records.
4. Multiple saved plots in the './figures/' folder.
5. A long, detailed final summary.
"""
# Notes describing the bundled sample dataset; used as the pre-filled
# "additional notes" value of the gr.Examples entry in the UI below.
example_notes="""This data is about a sample Customs dataset with products imports (IMP_DESC),Importer ID( IEC No.), SUPPLIER ID , (item unit price) ITEM_UPI , CTH for product classification (Declared CTH), declared BCD Notification benefit (BCD Notification No. Declared) and value of import (ITEM_ASSESS_VAL)"""

def get_images_in_directory(directory):
    """Recursively collect the paths of image files found under *directory*.

    A file counts as an image when its extension, compared
    case-insensitively, is one of .png, .jpg, .jpeg, .gif, .bmp or .tiff.

    Args:
        directory: Root folder to walk.

    Returns:
        A list of paths (each joined onto the folder it was found in), in
        the order ``os.walk`` visits them. Empty when nothing matches or the
        directory does not exist.
    """
    recognised_suffixes = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'}
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(directory)
        for filename in filenames
        if os.path.splitext(filename)[1].lower() in recognised_suffixes
    ]

def interact_with_agent(file_input, additional_notes):
    """Run the analysis agent on an uploaded CSV and stream chat updates.

    This is a generator used as a Gradio event handler: every ``yield``
    replaces the chatbot content with the messages produced so far.

    Args:
        file_input: The uploaded CSV, as provided by the ``gr.File``
            component (anything ``pandas.read_csv`` accepts).
        additional_notes: Optional free-text description of the data that is
            appended to the prompt when non-empty.

    Yields:
        The list of ``gr.ChatMessage`` objects to display. While the agent
        is still running, a temporary "processing" message is appended;
        the last yield contains only the real messages.

    Raises:
        gr.Error: If no file was uploaded.
    """
    import mimetypes  # local import: only needed to label streamed figures

    if file_input is None:
        # Without this guard pd.read_csv(None) fails with an opaque error.
        raise gr.Error("Please upload a CSV file before running the analysis.")

    # Start every run from an empty figures folder. The folder may not exist
    # yet (first run), so a missing directory must not abort the analysis.
    # NOTE(review): the folder is a fixed relative path, so concurrent runs
    # would share it — confirm whether that matters for this deployment.
    shutil.rmtree("./figures", ignore_errors=True)
    os.makedirs("./figures", exist_ok=True)

    data_file = pd.read_csv(file_input)
    data_structure_notes = f"""- Description (output of .describe()):
    {data_file.describe()}
    - Columns with dtypes:
    {data_file.dtypes}"""

    prompt = base_prompt.format(structure_notes=data_structure_notes)

    if additional_notes:
        prompt += "\nAdditional notes on the data:\n" + additional_notes

    messages = [gr.ChatMessage(role="user", content=prompt)]
    yield messages + [
        gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
    ]

    # Figures already shown in the chat, so each new file is posted only once.
    shown_image_paths = set()
    for msg in stream_to_gradio(agent, prompt, data_file=data_file):
        messages.append(msg)
        # After each agent message, surface any figure it has saved since.
        for image_path in get_images_in_directory("./figures"):
            if image_path in shown_image_paths:
                continue
            shown_image_paths.add(image_path)
            # Label the file with its real type (.jpg, .gif, ...) instead of
            # always claiming PNG; fall back to PNG if the type is unknown.
            mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
            messages.append(
                gr.ChatMessage(
                    role="assistant",
                    content=FileData(path=image_path, mime_type=mime_type),
                )
            )
        yield messages + [
            gr.ChatMessage(role="assistant", content="⏳ _Still processing..._")
        ]
    yield messages


import gradio as gr

# Build the web UI: a file upload, an optional notes box, a run button and a
# chatbot panel that displays the agent's streamed output.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.green,  # Changing to a fresh green
        secondary_hue=gr.themes.colors.purple,  # Adding a touch of regal purple
    )
) as demo:
    # The emoji in the title were previously stored as mis-decoded UTF-8
    # bytes and rendered as garbage characters.
    gr.Markdown("""
    <h1 style='color: darkblue; font-size: 2.5em;'>NCTC Llama-3.1 Data Analysis Agent 📊🤔</h1>
    <p><b>NCTC's attempt to use LLM-based ReAct Autonomous Agents to assist in smart customs data analysis</b></p>
    <p>Drop a .csv file below, add notes to describe this data if needed, and Llama-3.1-70B will analyze the file content and draw figures for you!</p>
    """)

    file_input = gr.File(label="Your file to analyze")
    text_input = gr.Textbox(
        label="Additional notes to support the analysis"
    )
    submit = gr.Button("Run analysis!", variant="primary")
    chatbot = gr.Chatbot(
        label="Data Analyst Agent",
        type="messages",
        avatar_images=(
            None,
            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
        ),
    )

    # One clickable example that fills in the sample CSV and its notes.
    gr.Examples(
        examples=[["./example/sample_customs_data_anonymised.csv", example_notes]],
        inputs=[file_input, text_input],
        cache_examples=False
    )

    # interact_with_agent is a generator, so the chatbot updates on each yield.
    submit.click(interact_with_agent, [file_input, text_input], [chatbot])

# Launch the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()