NCTCMumbai commited on
Commit
e4d28dd
·
verified ·
1 Parent(s): bf6e14b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -132
app.py CHANGED
@@ -1,133 +1,164 @@
1
- import os
2
- import shutil
3
- import gradio as gr
4
- from transformers import ReactCodeAgent, HfEngine, Tool
5
- import pandas as pd
6
-
7
- from gradio import Chatbot
8
- from transformers.agents import stream_to_gradio
9
- from huggingface_hub import login
10
- from gradio.data_classes import FileData
11
-
12
- login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))
13
-
14
- llm_engine = HfEngine("meta-llama/Meta-Llama-3.1-70B-Instruct")
15
-
16
- agent = ReactCodeAgent(
17
- tools=[],
18
- llm_engine=llm_engine,
19
- additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "scipy.stats"],
20
- max_iterations=10,
21
- )
22
-
23
- base_prompt = """You are an expert data analyst.
24
- According to the features you have and the data structure given below, determine which feature should be the target.
25
- Then list 3 interesting questions that could be asked on this data, for instance about specific correlations with target variable.
26
- Then answer these questions one by one, by finding the relevant numbers.
27
- Meanwhile, plot some figures using matplotlib/seaborn and save them to the (already existing) folder './figures/': take care to clear each figure with plt.clf() before doing another plot.
28
-
29
- In your final answer: summarize these correlations and trends
30
- After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".
31
- Your final answer should be a long string with at least 3 numbered and detailed parts.
32
-
33
- Structure of the data:
34
- {structure_notes}
35
-
36
- The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
37
- DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!
38
- """
39
-
40
- example_notes="""This data is about the Titanic wreck in 1912.
41
- The target figure is the survival of passengers, notes by 'Survived'
42
- pclass: A proxy for socio-economic status (SES)
43
- 1st = Upper
44
- 2nd = Middle
45
- 3rd = Lower
46
- age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5
47
- sibsp: The dataset defines family relations in this way...
48
- Sibling = brother, sister, stepbrother, stepsister
49
- Spouse = husband, wife (mistresses and fiancés were ignored)
50
- parch: The dataset defines family relations in this way...
51
- Parent = mother, father
52
- Child = daughter, son, stepdaughter, stepson
53
- Some children travelled only with a nanny, therefore parch=0 for them."""
54
-
55
- def get_images_in_directory(directory):
56
- image_extensions = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'}
57
-
58
- image_files = []
59
- for root, dirs, files in os.walk(directory):
60
- for file in files:
61
- if os.path.splitext(file)[1].lower() in image_extensions:
62
- image_files.append(os.path.join(root, file))
63
- return image_files
64
-
65
- def interact_with_agent(file_input, additional_notes):
66
- shutil.rmtree("./figures")
67
- os.makedirs("./figures")
68
-
69
- data_file = pd.read_csv(file_input)
70
- data_structure_notes = f"""- Description (output of .describe()):
71
- {data_file.describe()}
72
- - Columns with dtypes:
73
- {data_file.dtypes}"""
74
-
75
- prompt = base_prompt.format(structure_notes=data_structure_notes)
76
-
77
- if additional_notes and len(additional_notes) > 0:
78
- prompt += "\nAdditional notes on the data:\n" + additional_notes
79
-
80
- messages = [gr.ChatMessage(role="user", content=prompt)]
81
- yield messages + [
82
- gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
83
- ]
84
-
85
- plot_image_paths = {}
86
- for msg in stream_to_gradio(agent, prompt, data_file=data_file):
87
- messages.append(msg)
88
- for image_path in get_images_in_directory("./figures"):
89
- if image_path not in plot_image_paths:
90
- image_message = gr.ChatMessage(
91
- role="assistant",
92
- content=FileData(path=image_path, mime_type="image/png"),
93
- )
94
- plot_image_paths[image_path] = True
95
- messages.append(image_message)
96
- yield messages + [
97
- gr.ChatMessage(role="assistant", content="⏳ _Still processing..._")
98
- ]
99
- yield messages
100
-
101
-
102
- with gr.Blocks(
103
- theme=gr.themes.Soft(
104
- primary_hue=gr.themes.colors.yellow,
105
- secondary_hue=gr.themes.colors.blue,
106
- )
107
- ) as demo:
108
- gr.Markdown("""# Llama-3.1 Data analyst 📊🤔
109
-
110
- Drop a `.csv` file below, add notes to describe this data if needed, and **Llama-3.1-70B will analyze the file content and draw figures for you!**""")
111
- file_input = gr.File(label="Your file to analyze")
112
- text_input = gr.Textbox(
113
- label="Additional notes to support the analysis"
114
- )
115
- submit = gr.Button("Run analysis!", variant="primary")
116
- chatbot = gr.Chatbot(
117
- label="Data Analyst Agent",
118
- type="messages",
119
- avatar_images=(
120
- None,
121
- "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
122
- ),
123
- )
124
- gr.Examples(
125
- examples=[["./example/titanic.csv", example_notes]],
126
- inputs=[file_input, text_input],
127
- cache_examples=False
128
- )
129
-
130
- submit.click(interact_with_agent, [file_input, text_input], [chatbot])
131
-
132
- if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  demo.launch()
 
# NCTC
import os
import shutil
import gradio as gr
from transformers import ReactCodeAgent, HfEngine, Tool
import pandas as pd

from gradio import Chatbot
from transformers.agents import stream_to_gradio
from huggingface_hub import login
from gradio.data_classes import FileData

# Authenticate against the Hugging Face Hub with the token from the environment.
# NOTE(review): if HUGGINGFACEHUB_API_TOKEN is unset this passes None to login() —
# confirm the deployment always provides the variable.
login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

# LLM backend driving the ReAct agent.
llm_engine = HfEngine("meta-llama/Meta-Llama-3.1-70B-Instruct")

# Code-writing agent with no extra tools; its generated Python may import only
# the analysis libraries listed below. Capped at 10 reasoning iterations.
agent = ReactCodeAgent(
    tools=[],
    llm_engine=llm_engine,
    additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn", "scipy.stats"],
    max_iterations=10,
)
# Instruction template for the agent. {structure_notes} is filled per upload
# with the dataframe's .describe() output and dtypes.
# (The previous, shorter prompt that used to live here as commented-out code
# has been removed as dead code.)
base_prompt = """You are an expert data analyst at the National Customs Targeting Center. You will be provided with a CSV file containing multiple columns of numerical, categorical, and text variables.

Your tasks are:
1. **Target Identification**:
- Determine which feature(s) should be the target for analysis. Focus primarily on numerical and categorical columns, and avoid using unstructured text columns as targets.

2. **Generate Interesting Questions**:
- Based on the identified target features, list at least 3 interesting questions that could be asked. For instance, explore specific correlations with the target variable(s), trends, or patterns.

3. **Answer the Questions**:
- Answer these questions one by one by analyzing the data and finding relevant numbers.
- Generate insights from these answers. For example: "Correlation between `is_december` and `boredness` is 1.3453, suggesting that people are more bored in winter."

4. **Generate Outlier Insights**:
- Identify outliers for each variable in the dataset.
- Provide insights into the outliers, including printing the outlier records and explaining their significance.

5. **Visualization**:
- Plot multiple figures using matplotlib or seaborn.
- Generate plots for various target columns, covering both numerical and categorical columns.
- Ensure each figure is saved to the './figures/' folder and clear each figure with `plt.clf()` before generating the next plot.
- Include relevant plots that visualize correlations, trends, distributions, and outliers.

6. **Final Summary**:
- Summarize the correlations, trends, and outlier insights in a detailed manner. Provide at least 3 numbered and detailed parts in the summary.

Structure of the data:
{structure_notes}

The data file is passed to you as the variable `data_file`, which is a pandas dataframe, and you can use it directly. DO NOT try to load `data_file`, as it is already pre-loaded in your Python interpreter!

Your final output should include:
1. The identified target feature(s).
2. Three interesting questions with detailed answers and real-world insights.
3. Outlier insights for each variable, including the outlier records.
4. Multiple saved plots in the './figures/' folder.
5. A long, detailed final summary.
"""

# Sample dataset description offered in the UI (see gr.Examples below).
example_notes = """This data is about a sample Customs dataset with products imports (IMP_DESC),Importer ID( IEC No.), SUPPLIER ID , (item unit price) ITEM_UPI , CTH for product classification (Declared CTH), declared BCD Notification benefit (BCD Notification No. Declared) and value of import (ITEM_ASSESS_VAL)"""
def get_images_in_directory(directory):
    """Recursively collect the paths of all image files under *directory*.

    A file counts as an image when its extension (case-insensitive) is one of
    the common raster formats below. Returns a list of joined paths.
    """
    image_extensions = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'}
    return [
        os.path.join(root, filename)
        for root, _dirs, filenames in os.walk(directory)
        for filename in filenames
        if os.path.splitext(filename)[1].lower() in image_extensions
    ]
def interact_with_agent(file_input, additional_notes):
    """Run the data-analysis agent on an uploaded CSV, streaming chat updates.

    Parameters:
        file_input: path (or file-like object accepted by pandas.read_csv)
            of the CSV to analyze.
        additional_notes: optional free-text notes appended to the prompt.

    Yields:
        Growing lists of gr.ChatMessage objects for the Chatbot component,
        including any figures the agent saves under ./figures/, with a
        trailing status message while the agent is still working.
    """
    # Start from an empty figures directory. ignore_errors/exist_ok make this
    # safe on the first run, when ./figures may not exist yet (the bare
    # rmtree previously raised FileNotFoundError in that case).
    shutil.rmtree("./figures", ignore_errors=True)
    os.makedirs("./figures", exist_ok=True)

    data_file = pd.read_csv(file_input)
    data_structure_notes = f"""- Description (output of .describe()):
{data_file.describe()}
- Columns with dtypes:
{data_file.dtypes}"""

    prompt = base_prompt.format(structure_notes=data_structure_notes)

    if additional_notes:  # None or empty string -> nothing to append
        prompt += "\nAdditional notes on the data:\n" + additional_notes

    messages = [gr.ChatMessage(role="user", content=prompt)]
    yield messages + [
        gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
    ]

    # Track figures already sent so each plot is surfaced exactly once.
    plot_image_paths = {}
    for msg in stream_to_gradio(agent, prompt, data_file=data_file):
        messages.append(msg)
        for image_path in get_images_in_directory("./figures"):
            if image_path not in plot_image_paths:
                image_message = gr.ChatMessage(
                    role="assistant",
                    content=FileData(path=image_path, mime_type="image/png"),
                )
                plot_image_paths[image_path] = True
                messages.append(image_message)
        yield messages + [
            gr.ChatMessage(role="assistant", content="⏳ _Still processing..._")
        ]
    yield messages
# Gradio UI: file upload + notes textbox feeding the agent, results streamed
# into a Chatbot. (A redundant duplicate `import gradio as gr` that sat here
# mid-file has been removed; gr is imported once at the top of the module.)
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.yellow,
        secondary_hue=gr.themes.colors.blue,
    )
) as demo:
    gr.Markdown("""
<h1 style='color: darkblue; font-size: 2.5em;'>Llama-3.1 Data Analyst 📊🤔</h1>
<p><b>NCTC's attempt to use LLM-based ReAct Autonomous Agents to assist in smart customs data analysis</b></p>
<p>Drop a .csv file below, add notes to describe this data if needed, and Llama-3.1-70B will analyze the file content and draw figures for you!</p>
""")

    file_input = gr.File(label="Your file to analyze")
    text_input = gr.Textbox(
        label="Additional notes to support the analysis"
    )
    submit = gr.Button("Run analysis!", variant="primary")
    chatbot = gr.Chatbot(
        label="Data Analyst Agent",
        type="messages",
        avatar_images=(
            None,
            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
        ),
    )

    # NOTE(review): the module-level `example_notes` (customs schema) is no
    # longer referenced here; the example pairs titanic.csv with placeholder
    # notes instead — confirm which pairing is intended.
    gr.Examples(
        examples=[["./example/titanic.csv", "Example notes on Titanic dataset."]],
        inputs=[file_input, text_input],
        cache_examples=False
    )

    # interact_with_agent is a generator, so the chatbot updates as it yields.
    submit.click(interact_with_agent, [file_input, text_input], [chatbot])

if __name__ == "__main__":
    demo.launch()