jzou19950715 committed on
Commit
47e9852
·
verified ·
1 Parent(s): 28c222c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +245 -329
app.py CHANGED
@@ -1,345 +1,261 @@
1
  """
2
- Advanced Data Analysis Assistant with Interactive Visualizations
3
- Integrates smolagents, GPT-4, and interactive Plotly visualizations.
4
  """
5
 
6
import json
import logging
import os
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, Tuple

import gradio as gr
import pandas as pd
from smolagents import CodeAgent, LiteLLMModel

# Import our custom tools
from tools import (
    create_time_series_plot,
    create_correlation_heatmap,
    create_statistical_summary,
    detect_outliers,
    validate_dataframe,
    get_numeric_columns,
    get_temporal_columns,
    AnalysisError
)
27
 
28
  # Constants
29
- SUPPORTED_FILE_TYPES = [".csv", ".xlsx", ".xls"]
30
- DEFAULT_MODEL = "gpt-4o-mini"
31
- HISTORY_FILE = "analysis_history.json"
32
-
33
@dataclass
class VisualizationConfig:
    """Default rendering options for generated Plotly visualizations.

    Plain value object; callers may override any field at construction time.
    """
    width: int = 800                 # figure width in pixels
    height: int = 500                # figure height in pixels
    template: str = "plotly_white"   # Plotly layout template name
    show_grid: bool = True           # whether axis grid lines are drawn
    interactive: bool = True         # hint: interactive vs. static rendering
41
-
42
class DataPreprocessor:
    """Handles data preprocessing and validation."""

    @staticmethod
    def preprocess_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Validate *df*, collect metadata, and fill missing values.

        Args:
            df: Input dataframe to analyze.

        Returns:
            Tuple of (preprocessed dataframe, metadata dict). The metadata
            records the original shape, per-column missing-value counts,
            dtypes, and the numeric/categorical/temporal column groupings.

        Raises:
            ValueError: If validate_dataframe reports the frame as invalid.
        """
        # First validate the dataframe
        is_valid, error_msg = validate_dataframe(df)
        if not is_valid:
            raise ValueError(error_msg)

        metadata = {
            "original_shape": df.shape,
            "missing_values": df.isnull().sum().to_dict(),
            "dtypes": df.dtypes.astype(str).to_dict(),
            "numeric_columns": get_numeric_columns(df),
            "categorical_columns": df.select_dtypes(include=['object']).columns.tolist(),
            "temporal_columns": get_temporal_columns(df)
        }

        # Handle missing values: forward-fill, then back-fill leading NaNs.
        # FIX: fillna(method=...) is deprecated and removed in pandas 2.x;
        # use the dedicated ffill()/bfill() methods (same semantics).
        df = df.ffill().bfill()

        return df, metadata
66
-
67
class DataAnalysisAssistant:
    """Enhanced data analysis assistant with visualization capabilities.

    Wraps a smolagents CodeAgent backed by a LiteLLM model and the custom
    plotting/analysis tools imported from `tools`. Results of each run are
    recorded in an AnalysisHistory instance.
    """

    def __init__(self, api_key: str):
        # LiteLLM model wrapper; DEFAULT_MODEL is the module-level constant.
        self.model = LiteLLMModel(
            model_id=DEFAULT_MODEL,
            api_key=api_key
        )
        self.history = AnalysisHistory()

        # Initialize agent with tools and our custom analysis tools
        self.agent = CodeAgent(
            model=self.model,
            tools=[
                create_time_series_plot,
                create_correlation_heatmap,
                create_statistical_summary,
                detect_outliers
            ],
            # Imports the agent-generated code is permitted to use.
            additional_authorized_imports=[
                'pandas', 'numpy', 'plotly.express', 'plotly.graph_objects',
                'seaborn', 'scipy', 'statsmodels'
            ],
        )

    def analyze(self, df: pd.DataFrame, query: str) -> str:
        """Run the agent on *df* for *query*.

        Returns the HTML-wrapped agent response, or an inline
        "Analysis failed: ..." string on any exception.
        """
        try:
            # Preprocess data (validation + missing-value fill)
            df, metadata = DataPreprocessor.preprocess_dataframe(df)

            # Create context for the agent
            context = self._create_analysis_context(df, metadata, query)

            # Get analysis plan and execute; the frame is passed to the
            # agent's sandbox as the variable 'df'.
            response = self.agent.run(context, additional_args={"df": df})

            # Save to history (persisted to disk by AnalysisHistory)
            self.history.add_entry(query, str(response))

            return self._format_results(response)

        except Exception as e:
            # Any failure (validation, agent run, history I/O) is reported
            # inline rather than raised, so the UI always gets a string.
            return f"Analysis failed: {str(e)}"

    def _create_analysis_context(self, df: pd.DataFrame, metadata: Dict, query: str) -> str:
        """Build the prompt for the agent from frame metadata and the query."""
        tools_description = """
        Available analysis tools:
        - create_time_series_plot: Create interactive time series visualizations
        - create_correlation_heatmap: Generate correlation analysis with heatmap
        - create_statistical_summary: Compute statistical summaries with visualizations
        - detect_outliers: Identify and visualize outliers
        """

        return f"""
        Analyze the following data with interactive visualizations.

        DataFrame Information:
        - Shape: {metadata['original_shape']}
        - Numeric columns: {', '.join(metadata['numeric_columns'])}
        - Categorical columns: {', '.join(metadata['categorical_columns'])}
        - Temporal columns: {', '.join(metadata['temporal_columns'])}

        {tools_description}

        User Query: {query}

        Guidelines:
        1. Use the provided analysis tools for visualizations
        2. Include clear titles and labels
        3. Handle errors gracefully
        4. Chain multiple analyses when needed
        5. Provide insights along with visualizations

        The DataFrame is available as 'df'.
        """

    def _format_results(self, response: str) -> str:
        """Wrap the agent response in an HTML div for display in the UI."""
        return f'<div class="analysis-text">{response}</div>'
148
-
149
class AnalysisHistory:
    """
    Manages analysis history and persistence.

    Attributes:
        history_file (Path): Path to the JSON file storing analysis history
        history (List[Dict]): List of historical analysis entries

    Each history entry is a dictionary containing:
    - timestamp: ISO format timestamp
    - query: The user's analysis query
    - result: The analysis result/response
    """

    def __init__(self, history_file: str = "analysis_history.json"):
        """
        Initialize the analysis history manager.

        Args:
            history_file (str): Path to the history JSON file. The default
                equals the module-level HISTORY_FILE value; the literal is
                used so the class works standalone.
        """
        self.history_file = Path(history_file)
        self.history = self._load_history()

    def _load_history(self) -> List[Dict]:
        """
        Load analysis history from file.

        Returns:
            List[Dict]: Historical analysis entries; [] if the file is
            missing, unreadable, or contains invalid JSON.
        """
        if self.history_file.exists():
            try:
                with self.history_file.open('r') as f:
                    return json.load(f)
            except json.JSONDecodeError:
                # FIX: `logger` was never defined in this module (NameError
                # on every error path); use the stdlib logging module.
                logging.getLogger(__name__).error(f"Invalid JSON in history file: {self.history_file}")
                return []
            except Exception as e:
                logging.getLogger(__name__).error(f"Error loading history file: {str(e)}")
                return []
        return []

    def add_entry(self, query: str, result: str) -> None:
        """
        Add a new analysis entry to history and persist it to disk.

        Args:
            query (str): The analysis query
            result (str): The analysis result/response
        """
        entry = {
            'timestamp': datetime.now().isoformat(),
            'query': query,
            'result': result
        }
        self.history.append(entry)
        self._save_history()

    def get_recent_analyses(self, limit: int = 5) -> List[Dict]:
        """
        Get the most recent analysis entries.

        Args:
            limit (int): Maximum number of entries to return. Defaults to 5.

        Returns:
            List[Dict]: Entries sorted by timestamp, newest first.
        """
        return sorted(
            self.history,
            key=lambda x: x['timestamp'],
            reverse=True
        )[:limit]

    def _save_history(self) -> None:
        """Persist history to the JSON file; failures are logged, not raised."""
        try:
            with self.history_file.open('w') as f:
                json.dump(self.history, f, indent=2)
        except Exception as e:
            logging.getLogger(__name__).error(f"Failed to save history: {str(e)}")

    def clear_history(self) -> None:
        """Clear all analysis history, in memory and on disk."""
        self.history = []
        self._save_history()

    def get_history_by_date(self, start_date: datetime, end_date: datetime) -> List[Dict]:
        """
        Get analysis history within a date range (inclusive on both ends).

        Args:
            start_date (datetime): Start of date range
            end_date (datetime): End of date range

        Returns:
            List[Dict]: Entries whose timestamp falls inside the range;
            entries with unparseable timestamps are logged and skipped.
        """
        filtered_history = []
        for entry in self.history:
            try:
                entry_date = datetime.fromisoformat(entry['timestamp'])
                if start_date <= entry_date <= end_date:
                    filtered_history.append(entry)
            except Exception as e:
                logging.getLogger(__name__).error(f"Error parsing entry date: {str(e)}")
                continue
        return filtered_history

    def search_history(self, search_term: str) -> List[Dict]:
        """
        Case-insensitive substring search over queries and results.

        Args:
            search_term (str): Term to search for in queries and results

        Returns:
            List[Dict]: Matching analysis entries
        """
        search_term = search_term.lower()
        return [
            entry for entry in self.history
            if search_term in entry['query'].lower()
            or search_term in entry['result'].lower()
        ]

    def get_statistics(self) -> Dict[str, Any]:
        """
        Get statistics about the analysis history.

        Returns:
            Dict[str, Any]: total_entries, date_range (None when empty),
            and average_entries_per_day.
        """
        if not self.history:
            return {
                "total_entries": 0,
                "date_range": None,
                "average_entries_per_day": 0
            }

        dates = [datetime.fromisoformat(entry['timestamp']) for entry in self.history]
        first_date = min(dates)
        last_date = max(dates)
        # A same-day history would give a 0-day span; treat it as one day
        # to avoid division by zero below.
        days_span = (last_date - first_date).days or 1

        return {
            "total_entries": len(self.history),
            "date_range": {
                "first": first_date.isoformat(),
                "last": last_date.isoformat()
            },
            "average_entries_per_day": len(self.history) / days_span
        }

    def export_history(self, format: str = 'json') -> str:
        """
        Export analysis history in the specified format.

        Args:
            format (str): Export format ('json' or 'csv'). Defaults to 'json'.

        Returns:
            str: Path to the exported file (timestamped, in the CWD).

        Raises:
            ValueError: If format is not supported
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        if format == 'json':
            export_path = f'analysis_history_{timestamp}.json'
            with open(export_path, 'w') as f:
                json.dump(self.history, f, indent=2)
            return export_path
        elif format == 'csv':
            export_path = f'analysis_history_{timestamp}.csv'
            df = pd.DataFrame(self.history)
            df.to_csv(export_path, index=False)
            return export_path
        else:
            raise ValueError(f"Unsupported export format: {format}")
330
-
331
- def process_file(file: gr.File) -> Optional[pd.DataFrame]:
332
- """Process uploaded file into DataFrame."""
333
- [Previous process_file implementation remains the same]
334
-
335
- def analyze_data(file: gr.File, query: str, api_key: str) -> str:
336
- """Main analysis function for Gradio interface."""
337
- [Previous analyze_data implementation remains the same]
338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  def create_interface():
340
- """Create enhanced Gradio interface."""
341
- [Previous create_interface implementation remains the same]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
 
343
# Script entry point: build the Gradio interface and start serving it.
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
 
 
 
 
 
1
  """
2
+ Gradio Interactive Chat App for Educational Information Collection.
 
3
  """
4
 
5
+ # Imports
 
 
 
 
 
6
  import gradio as gr
7
+ import openai
8
+ import json
9
+ from datetime import datetime
10
+ from typing import List, Optional, Dict
11
+ from pydantic import BaseModel, Field
 
 
 
 
 
 
 
 
 
12
 
13
# Constants
# System prompt steering the model to collect education-history fields in a
# fixed order; sent as the "system" message on every chat completion call.
SYSTEM_PROMPT = """You are an educational information collection assistant. Your task is to systematically collect the following information in a conversational manner:

Required Information (collect in this order):
1. Institution Details:
   - Name
   - Type (e.g., university, college, etc.)
   - Location
2. Degree Information:
   - Type (e.g., Bachelor's, Master's, etc.)
   - Field of Study
   - Status (e.g., completed, ongoing)
3. Attendance Dates (start and end)
4. Academic Performance:
   - GPA (if provided)
   - Honors or awards
5. Activities:
   - Extracurricular activities, roles, and durations

Always maintain a friendly, professional tone while systematically collecting this information."""
33
+
34
# Data Models
# Pydantic models mirroring the JSON schema requested in generate_json().
# NOTE(review): nothing in this file references these models — generate_json
# parses the LLM output with json.loads without validating against them;
# confirm whether validation was intended.

class Institution(BaseModel):
    name: str       # institution name
    type: str       # e.g. university, college
    location: str


class Degree(BaseModel):
    type: str       # e.g. Bachelor's, Master's
    field: str      # field of study
    status: str     # e.g. completed, ongoing


class Dates(BaseModel):
    start: str      # attendance start (free-form string)
    end: str        # attendance end (free-form string)


class Activity(BaseModel):
    name: str
    description: str
    duration: str


class Academic(BaseModel):
    gpa: Optional[float] = None                      # omitted when not provided
    honors: List[str] = Field(default_factory=list)
    achievements: List[str] = Field(default_factory=list)


class Education(BaseModel):
    institution: Institution
    degree: Degree
    dates: Dates
    academic: Academic
    activities: List[Activity] = Field(default_factory=list)


class EducationRecord(BaseModel):
    education: List[Education] = Field(default_factory=list)
    metadata: Dict[str, str]    # e.g. timestamp, source
75
+
76
+
77
# Assistant Logic
class EducationAssistant:
    """
    Handles conversation state, chat interactions, and JSON generation.

    NOTE(review): uses the legacy module-level `openai.ChatCompletion` API
    (openai < 1.0). The 1.x SDK removed it — confirm the pinned openai
    version before upgrading dependencies.
    """

    def __init__(self):
        # Full role/content message history, excluding the system prompt.
        self.conversation_history = []
        # Sentinel marking that initialize_chat() has configured the API key;
        # None until the first chat() call.
        self.client = None
        self.system_prompt = SYSTEM_PROMPT

    def initialize_chat(self, api_key: str) -> str:
        """Configure the OpenAI API key and return the opening greeting."""
        try:
            openai.api_key = api_key
            # BUG FIX: self.client was never assigned, so chat() saw
            # `not self.client` on every call and only ever replayed the
            # greeting without reaching the model. Mark initialization done.
            self.client = openai
            return "Hello! Let's record your educational history. What is the name of your most recent educational institution?"
        except Exception as e:
            return f"Error initializing chat: {str(e)}"

    def chat(self, message: str, api_key: str) -> Dict[str, str]:
        """Process one user message and return the assistant reply as a
        {"role", "content"} dict (errors are returned inline, not raised)."""
        # First interaction: initialize and greet instead of querying the model.
        if not self.client:
            first_message = self.initialize_chat(api_key)
            self.conversation_history.append({"role": "assistant", "content": first_message})
            return {"role": "assistant", "content": first_message}

        try:
            # Append user message to history
            self.conversation_history.append({"role": "user", "content": message})

            # Generate response with the system prompt prepended each call.
            response = openai.ChatCompletion.create(
                model="gpt-4o-mini",
                messages=[{"role": "system", "content": self.system_prompt}] + self.conversation_history,
                temperature=0.7,
                max_tokens=300
            )

            # Parse and store assistant response
            assistant_message = {"role": "assistant", "content": response.choices[0].message.content}
            self.conversation_history.append(assistant_message)

            return assistant_message

        except Exception as e:
            return {"role": "assistant", "content": f"Error: {str(e)}"}

    def generate_json(self) -> Optional[str]:
        """Generate a structured JSON file from the conversation history.

        Returns:
            The written filename, or None when generation/parsing fails.
        """
        try:
            json_prompt = """Based on our conversation, generate a structured JSON containing the educational information shared. Format it as follows:
            {
                "education": [
                    {
                        "institution": {
                            "name": string,
                            "type": string,
                            "location": string
                        },
                        "degree": {
                            "type": string,
                            "field": string,
                            "status": string
                        },
                        "dates": {
                            "start": string,
                            "end": string
                        },
                        "academic": {
                            "gpa": float (if provided),
                            "honors": [string],
                            "achievements": [string]
                        },
                        "activities": [
                            {
                                "name": string,
                                "description": string,
                                "duration": string
                            }
                        ]
                    }
                ],
                "metadata": {
                    "timestamp": string,
                    "source": "Education Information Assistant"
                }
            }
            Respond ONLY with the JSON."""

            # Generate JSON based on the conversation history
            response = openai.ChatCompletion.create(
                model="gpt-4o-mini",
                messages=[{"role": "system", "content": self.system_prompt}] + self.conversation_history +
                         [{"role": "user", "content": json_prompt}],
                temperature=0.1,
                max_tokens=1500
            )

            # ROBUSTNESS: models often wrap JSON in markdown code fences even
            # when told not to; strip them before parsing.
            content = response.choices[0].message.content.strip()
            if content.startswith("```"):
                content = content.strip("`").lstrip()
                if content[:4].lower() == "json":
                    content = content[4:]

            # Parse response and write JSON file
            json_data = json.loads(content)
            filename = f"education_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
            with open(filename, "w") as f:
                json.dump(json_data, f, indent=2)

            return filename

        except Exception as e:
            print(f"Error generating JSON: {str(e)}")
            return None
186
+
187
+
188
# Gradio Interface
def create_interface():
    """Build and return the Gradio Blocks UI for the education assistant."""
    # A single assistant instance is shared by every session of this
    # interface (conversation state is global, not per-user).
    assistant = EducationAssistant()

    with gr.Blocks() as demo:
        gr.Markdown("# 📘 Educational Information Collection Assistant")

        with gr.Row():
            api_key = gr.Textbox(
                label="OpenAI API Key",
                type="password",
                placeholder="Enter your OpenAI API Key",
                info="Required for using OpenAI's GPT model."
            )

        # NOTE(review): the handlers below append role/content dicts; on
        # Gradio >= 4 the Chatbot must be created with type="messages" to
        # accept that format — confirm the installed Gradio version.
        chatbot = gr.Chatbot(label="Assistant", height=400)

        with gr.Row():
            user_input = gr.Textbox(
                label="Your Message",
                placeholder="Type your message here...",
                lines=2
            )
            send_button = gr.Button("Send", variant="primary")

        generate_button = gr.Button("Generate JSON")
        download_file = gr.File(label="Generated JSON")

        # Event Handlers
        def handle_send(message, history, api_key):
            # Guard: an API key is required before any chat round-trip.
            if not api_key.strip():
                return history + [{"role": "assistant", "content": "Please provide your OpenAI API key to continue."}]

            # Ignore empty submissions.
            if not message.strip():
                return history

            response = assistant.chat(message, api_key)
            # NOTE(review): only the chatbot is listed in outputs, so the
            # input textbox is never cleared after sending.
            return history + [{"role": "user", "content": message}, response]

        def handle_generate():
            filename = assistant.generate_json()
            if filename:
                return filename
            # NOTE(review): this error string feeds a gr.File output, which
            # expects a file path — it will not render as a user message.
            return "Error generating JSON. Please ensure all required information is collected."

        # Button Actions
        send_button.click(
            handle_send,
            inputs=[user_input, chatbot, api_key],
            outputs=[chatbot]
        )

        user_input.submit(
            handle_send,
            inputs=[user_input, chatbot, api_key],
            outputs=[chatbot]
        )

        generate_button.click(
            handle_generate,
            outputs=[download_file]
        )

    return demo
252
+
253
 
254
# Main Execution
if __name__ == "__main__":
    demo = create_interface()
    # NOTE(review): share=True opens a public Gradio tunnel, and binding on
    # 0.0.0.0 exposes the app beyond localhost — appropriate for a hosted
    # demo (e.g. a Space), but review before running elsewhere.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )