Aka18 committed on
Commit
fa623b2
·
verified ·
1 Parent(s): 24f5365

Update data_analysis_agent.py

Browse files
Files changed (1) hide show
  1. data_analysis_agent.py +219 -155
data_analysis_agent.py CHANGED
@@ -10,19 +10,19 @@ import warnings
10
  import traceback
11
  import time
12
  import random
13
- import httpx
 
 
 
14
  warnings.filterwarnings('ignore')
15
 
16
  from typing import Dict, List, Any, Optional, TypedDict
17
- import json
18
  from datetime import datetime
19
  import logging
20
 
21
- # LangGraph and LLM imports
22
  from langgraph.graph import StateGraph, END
23
- from langchain_groq import ChatGroq
24
  from langchain_core.messages import HumanMessage, SystemMessage
25
- from langchain_core.prompts import ChatPromptTemplate
26
 
27
  # Configure logging
28
  logging.basicConfig(level=logging.INFO)
@@ -41,86 +41,136 @@ class AnalysisState(TypedDict):
41
 
42
  class DataAnalysisAgent:
43
  def __init__(self, groq_api_key: str, model_name: str = "llama3-70b-8192"):
44
- """Initialize the Data Analysis Agent with HF Spaces networking fix"""
45
 
46
- # Detect if running in Hugging Face Spaces
47
- self.is_hf_spaces = os.environ.get('SPACE_ID') is not None
48
  self.groq_api_key = groq_api_key
49
  self.model_name = model_name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  if self.is_hf_spaces:
52
- logger.info("🚀 Initializing for Hugging Face Spaces...")
53
- # Hugging Face Spaces specific configuration
54
- self.llm = ChatGroq(
55
- groq_api_key=groq_api_key,
56
- model_name=model_name,
57
- temperature=0.1,
58
- max_tokens=2000,
59
- timeout=180, # Longer timeout for HF Spaces
60
- max_retries=0, # Disable retries, handle manually
61
- http_client=httpx.Client(
62
- timeout=httpx.Timeout(180.0),
63
- limits=httpx.Limits(max_connections=5, max_keepalive_connections=2),
64
- headers={
65
- "User-Agent": "Mozilla/5.0 (compatible; DataAnalysisAgent/1.0)",
66
- "Accept": "application/json",
67
- "Connection": "close" # Important for HF Spaces
68
- }
69
- )
70
- )
71
  else:
72
- logger.info("💻 Initializing for local/Streamlit Cloud...")
73
- # Normal configuration for local/Streamlit Cloud
74
- self.llm = ChatGroq(
75
- groq_api_key=groq_api_key,
76
- model_name=model_name,
77
- temperature=0.1,
78
- max_tokens=2000
79
- )
80
 
81
  # Set up the analysis workflow graph
82
  self.workflow = self._create_workflow()
83
 
84
- def _hf_spaces_llm_call(self, prompt: str) -> str:
85
- """LLM call optimized for Hugging Face Spaces"""
 
 
86
 
87
- if not self.is_hf_spaces:
88
- # Normal call for local/Streamlit Cloud
89
- response = self.llm.invoke([HumanMessage(content=prompt)])
90
- return response.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- # Hugging Face Spaces specific handling
93
- max_attempts = 3
94
  for attempt in range(max_attempts):
95
  try:
96
- # Add random delay to avoid rate limiting
97
  if attempt > 0:
98
- delay = random.uniform(2, 5) * attempt
99
- logger.info(f"⏳ HF Spaces: Waiting {delay:.1f}s before retry {attempt + 1}")
 
100
  time.sleep(delay)
101
 
102
- # Make the call
103
- logger.info(f"🤖 HF Spaces: LLM attempt {attempt + 1}/{max_attempts}")
104
- response = self.llm.invoke([HumanMessage(content=prompt)])
105
 
106
- if response and response.content:
107
- logger.info("✅ HF Spaces: LLM call successful")
108
- return response.content
 
 
109
 
110
- except Exception as e:
111
- error_str = str(e).lower()
112
- logger.warning(f"⚠️ HF Spaces: Attempt {attempt + 1} failed: {str(e)}")
 
 
 
 
 
 
 
113
 
114
- if "connection" in error_str or "timeout" in error_str:
115
- # Network issue - retry with longer delay
116
- if attempt < max_attempts - 1:
117
- continue
 
 
 
 
 
 
 
 
 
 
 
118
  else:
119
- # Other error - might be permanent
120
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- # All attempts failed
123
- raise ConnectionError(f"HF Spaces: Failed after {max_attempts} attempts")
124
 
125
  def _create_workflow(self) -> StateGraph:
126
  """Create the LangGraph workflow for data analysis"""
@@ -167,21 +217,19 @@ class DataAnalysisAgent:
167
  "datetime_columns": df.select_dtypes(include=['datetime64']).columns.tolist()
168
  }
169
 
170
- # Use LLM to generate initial insights about the dataset
171
- prompt = f"""
172
- Analyze this dataset profile and provide initial observations:
173
-
174
- Dataset Shape: {dataset_info['shape']}
175
- Columns: {dataset_info['columns']}
176
- Data Types: {dataset_info['dtypes']}
177
- Missing Values: {dataset_info['null_counts']}
178
- Duplicate Rows: {dataset_info['duplicate_rows']}
179
-
180
- Provide a brief analysis of the dataset structure, data quality issues, and potential analysis opportunities.
181
- """
182
 
183
- # Use HF Spaces optimized call
184
- response_content = self._hf_spaces_llm_call(prompt)
185
  dataset_info["llm_profile"] = response_content
186
 
187
  state["dataset_info"] = dataset_info
@@ -275,22 +323,17 @@ class DataAnalysisAgent:
275
 
276
  column_analysis[column] = analysis
277
 
278
- # Use LLM to interpret column analysis
279
- prompt = f"""
280
- Analyze these column statistics and identify patterns, anomalies, and insights:
281
-
282
- {json.dumps(column_analysis, indent=2, default=str)}
283
-
284
- Focus on:
285
- 1. Data quality issues
286
- 2. Distribution patterns
287
- 3. Potential relationships between columns
288
- 4. Outliers or anomalies
289
- 5. Business insights
290
- """
291
 
292
- # Use HF Spaces optimized call
293
- response_content = self._hf_spaces_llm_call(prompt)
294
  column_analysis["llm_interpretation"] = response_content
295
 
296
  state["column_analysis"] = column_analysis
@@ -337,23 +380,40 @@ class DataAnalysisAgent:
337
  })
338
  correlations["high_correlations"] = high_correlations
339
 
340
- # Use LLM to generate comprehensive insights
341
- prompt = f"""
342
- Based on the dataset analysis, generate key insights and findings:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
- Dataset Info: {json.dumps(dataset_info, indent=2, default=str)}
345
- High Correlations: {json.dumps(correlations, indent=2, default=str)}
346
 
347
- Generate 5-10 specific, actionable insights that would be valuable for business decision-making.
348
- Focus on trends, patterns, anomalies, and opportunities.
349
- """
 
 
 
 
350
 
351
- # Use HF Spaces optimized call
352
- response_content = self._hf_spaces_llm_call(prompt)
353
- insights = response_content.split('\n')
354
- insights = [insight.strip() for insight in insights if insight.strip()]
355
 
356
- state["insights"] = insights
357
  state["current_step"] = "insight_generator"
358
 
359
  except Exception as e:
@@ -380,33 +440,23 @@ class DataAnalysisAgent:
380
  dataset_info["numeric_columns"] = df.select_dtypes(include=[np.number]).columns.tolist()
381
  dataset_info["categorical_columns"] = df.select_dtypes(include=['object', 'category']).columns.tolist()
382
 
383
- # Use LLM to plan visualizations
384
- prompt = f"""
385
- Plan the most effective visualizations for this dataset:
386
-
387
- Dataset Info: {json.dumps(dataset_info, indent=2, default=str)}
388
- Key Insights: {insights}
389
-
390
- Suggest 5-8 different visualization types with:
391
- 1. Chart type (histogram, scatter, bar, line, heatmap, etc.)
392
- 2. Columns to use
393
- 3. Purpose/insight to communicate
394
- 4. Title and description
395
-
396
- Return as a JSON list with this structure:
397
- [
398
- {{
399
- "type": "histogram",
400
- "columns": ["column_name"],
401
- "title": "Distribution of...",
402
- "description": "Shows the...",
403
- "purpose": "Understand distribution"
404
- }}
405
- ]
406
- """
407
-
408
- # Use HF Spaces optimized call
409
- response_content = self._hf_spaces_llm_call(prompt)
410
  try:
411
  # Extract JSON from response
412
  json_start = response_content.find('[')
@@ -540,7 +590,7 @@ class DataAnalysisAgent:
540
  plt.close()
541
 
542
  except Exception as e:
543
- logger.warning(f"Failed to create {viz['type']} chart: {str(e)}")
544
  plt.close()
545
  continue
546
 
@@ -562,28 +612,42 @@ class DataAnalysisAgent:
562
  insights = state["insights"]
563
  dataset_info = state["dataset_info"]
564
 
565
- # Use LLM to generate recommendations
566
- prompt = f"""
567
- Based on the complete data analysis, generate specific, actionable recommendations:
568
-
569
- Dataset Info: {json.dumps(dataset_info, indent=2, default=str)}
570
- Key Insights: {insights}
571
-
572
- Generate 5-10 specific recommendations that include:
573
- 1. Data quality improvements
574
- 2. Business opportunities
575
- 3. Further analysis suggestions
576
- 4. Action items for stakeholders
577
-
578
- Make recommendations specific, measurable, and actionable.
579
- """
580
-
581
- # Use HF Spaces optimized call
582
- response_content = self._hf_spaces_llm_call(prompt)
583
- recommendations = response_content.split('\n')
584
- recommendations = [rec.strip() for rec in recommendations if rec.strip()]
585
-
586
- state["recommendations"] = recommendations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
587
  state["current_step"] = "recommendation_engine"
588
 
589
  except Exception as e:
 
10
  import traceback
11
  import time
12
  import random
13
+ import requests
14
+ import json
15
+ from urllib3.util.retry import Retry
16
+ from requests.adapters import HTTPAdapter
17
  warnings.filterwarnings('ignore')
18
 
19
  from typing import Dict, List, Any, Optional, TypedDict
 
20
  from datetime import datetime
21
  import logging
22
 
23
+ # LangGraph imports
24
  from langgraph.graph import StateGraph, END
 
25
  from langchain_core.messages import HumanMessage, SystemMessage
 
26
 
27
  # Configure logging
28
  logging.basicConfig(level=logging.INFO)
 
41
 
42
  class DataAnalysisAgent:
43
  def __init__(self, groq_api_key: str, model_name: str = "llama3-70b-8192"):
44
+ """Initialize with direct Groq API calls to bypass HF Spaces blocks"""
45
 
 
 
46
  self.groq_api_key = groq_api_key
47
  self.model_name = model_name
48
+ self.is_hf_spaces = os.environ.get('SPACE_ID') is not None
49
+
50
+ # Configure requests session with aggressive retry strategy
51
+ self.session = requests.Session()
52
+ retry_strategy = Retry(
53
+ total=5,
54
+ backoff_factor=3,
55
+ status_forcelist=[429, 500, 502, 503, 504],
56
+ allowed_methods=["POST"]
57
+ )
58
+ adapter = HTTPAdapter(max_retries=retry_strategy)
59
+ self.session.mount("http://", adapter)
60
+ self.session.mount("https://", adapter)
61
+
62
+ # Set session headers to mimic browser/curl
63
+ self.session.headers.update({
64
+ "User-Agent": "curl/7.68.0",
65
+ "Accept": "*/*",
66
+ "Accept-Encoding": "gzip, deflate",
67
+ "Connection": "close"
68
+ })
69
 
70
  if self.is_hf_spaces:
71
+ logger.info("🚀 HF Spaces: Using direct Groq API calls")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  else:
73
+ logger.info("💻 Local: Using direct Groq API calls")
 
 
 
 
 
 
 
74
 
75
  # Set up the analysis workflow graph
76
  self.workflow = self._create_workflow()
77
 
78
+ def _direct_groq_call(self, prompt: str) -> str:
79
+ """Direct Groq API call bypassing LangChain completely"""
80
+
81
+ url = "https://api.groq.com/openai/v1/chat/completions"
82
 
83
+ headers = {
84
+ "Authorization": f"Bearer {self.groq_api_key}",
85
+ "Content-Type": "application/json",
86
+ "User-Agent": "curl/7.68.0",
87
+ "Accept": "*/*",
88
+ "Connection": "close"
89
+ }
90
+
91
+ data = {
92
+ "messages": [
93
+ {"role": "user", "content": prompt}
94
+ ],
95
+ "model": self.model_name,
96
+ "max_tokens": 1500,
97
+ "temperature": 0.1,
98
+ "stream": False
99
+ }
100
+
101
+ max_attempts = 5 if self.is_hf_spaces else 3
102
 
 
 
103
  for attempt in range(max_attempts):
104
  try:
 
105
  if attempt > 0:
106
+ # Exponential backoff with jitter
107
+ delay = (2 ** attempt) + random.uniform(1, 3)
108
+ logger.info(f"⏳ Waiting {delay:.1f}s before attempt {attempt + 1}")
109
  time.sleep(delay)
110
 
111
+ logger.info(f"🤖 Direct Groq API attempt {attempt + 1}/{max_attempts}")
 
 
112
 
113
+ # Try different approaches for HF Spaces
114
+ if self.is_hf_spaces and attempt > 1:
115
+ # Try with different headers
116
+ headers["User-Agent"] = f"DataAnalysisAgent/1.{attempt}"
117
+ headers["X-Forwarded-For"] = "127.0.0.1"
118
 
119
+ response = self.session.post(
120
+ url,
121
+ headers=headers,
122
+ json=data,
123
+ timeout=120,
124
+ verify=True,
125
+ allow_redirects=True
126
+ )
127
+
128
+ logger.info(f"📡 Response status: {response.status_code}")
129
 
130
+ if response.status_code == 200:
131
+ result = response.json()
132
+ content = result["choices"][0]["message"]["content"]
133
+ logger.info("✅ Direct Groq API call successful")
134
+ return content
135
+
136
+ elif response.status_code == 429:
137
+ logger.warning("⚠️ Rate limited, retrying...")
138
+ time.sleep(10)
139
+ continue
140
+
141
+ elif response.status_code in [500, 502, 503, 504]:
142
+ logger.warning(f"⚠️ Server error {response.status_code}, retrying...")
143
+ continue
144
+
145
  else:
146
+ logger.error(f"❌ API error {response.status_code}: {response.text}")
147
+ if attempt == max_attempts - 1:
148
+ raise Exception(f"Groq API error: {response.status_code}")
149
+ continue
150
+
151
+ except requests.exceptions.ConnectTimeout:
152
+ logger.warning(f"⚠️ Connection timeout on attempt {attempt + 1}")
153
+ continue
154
+
155
+ except requests.exceptions.ReadTimeout:
156
+ logger.warning(f"⚠️ Read timeout on attempt {attempt + 1}")
157
+ continue
158
+
159
+ except requests.exceptions.ConnectionError as e:
160
+ logger.warning(f"⚠️ Connection error on attempt {attempt + 1}: {str(e)}")
161
+ # Try with different session for HF Spaces
162
+ if self.is_hf_spaces and attempt > 2:
163
+ logger.info("🔄 Creating new session...")
164
+ self.session = requests.Session()
165
+ continue
166
+
167
+ except Exception as e:
168
+ logger.error(f"❌ Unexpected error on attempt {attempt + 1}: {str(e)}")
169
+ if attempt == max_attempts - 1:
170
+ raise
171
+ continue
172
 
173
+ raise ConnectionError(f"Failed to connect to Groq API after {max_attempts} attempts")
 
174
 
175
  def _create_workflow(self) -> StateGraph:
176
  """Create the LangGraph workflow for data analysis"""
 
217
  "datetime_columns": df.select_dtypes(include=['datetime64']).columns.tolist()
218
  }
219
 
220
+ # Simpler prompt for better success rate
221
+ prompt = f"""Analyze this dataset profile:
222
+
223
+ Dataset: {dataset_info['shape'][0]} rows × {dataset_info['shape'][1]} columns
224
+ Missing values: {sum(dataset_info['null_counts'].values())} total
225
+ Duplicates: {dataset_info['duplicate_rows']}
226
+ Numeric columns: {len(dataset_info['numeric_columns'])}
227
+ Categorical columns: {len(dataset_info['categorical_columns'])}
228
+
229
+ Provide a brief professional assessment of data quality and analysis potential in 2-3 sentences."""
 
 
230
 
231
+ # Use direct Groq API call
232
+ response_content = self._direct_groq_call(prompt)
233
  dataset_info["llm_profile"] = response_content
234
 
235
  state["dataset_info"] = dataset_info
 
323
 
324
  column_analysis[column] = analysis
325
 
326
+ # Simplified prompt for column analysis
327
+ prompt = f"""Analyze these column statistics and identify key patterns:
328
+
329
+ Total columns analyzed: {len(column_analysis)}
330
+ Numeric columns: {len([c for c in column_analysis if 'mean' in column_analysis[c]])}
331
+ Text columns: {len([c for c in column_analysis if 'top_values' in column_analysis[c]])}
332
+
333
+ Provide 2-3 key observations about data patterns and quality issues."""
 
 
 
 
 
334
 
335
+ # Use direct Groq API call
336
+ response_content = self._direct_groq_call(prompt)
337
  column_analysis["llm_interpretation"] = response_content
338
 
339
  state["column_analysis"] = column_analysis
 
380
  })
381
  correlations["high_correlations"] = high_correlations
382
 
383
+ # Simplified prompt for insights
384
+ prompt = f"""Generate exactly 5 specific insights for this dataset:
385
+
386
+ Dataset: {dataset_info.get('shape', [0])[0]:,} rows, {dataset_info.get('shape', [0])[1]} columns
387
+ Missing values: {sum(dataset_info.get('null_counts', {}).values()):,}
388
+ Numeric variables: {len(numeric_cols)}
389
+ Categorical variables: {len(dataset_info.get('categorical_columns', []))}
390
+ Strong correlations found: {len(correlations.get('high_correlations', []))}
391
+
392
+ Format as:
393
+ **Insight 1:** [specific finding]
394
+ **Insight 2:** [specific finding]
395
+ **Insight 3:** [specific finding]
396
+ **Insight 4:** [specific finding]
397
+ **Insight 5:** [specific finding]
398
+
399
+ Focus on data quality, patterns, and business value."""
400
 
401
+ # Use direct Groq API call
402
+ response_content = self._direct_groq_call(prompt)
403
 
404
+ # Parse insights from response
405
+ insights = []
406
+ lines = response_content.split('\n')
407
+ for line in lines:
408
+ line = line.strip()
409
+ if line and ('**Insight' in line or line.startswith(('1.', '2.', '3.', '4.', '5.'))):
410
+ insights.append(line)
411
 
412
+ # If parsing failed, split by lines and take meaningful ones
413
+ if len(insights) < 3:
414
+ insights = [line.strip() for line in response_content.split('\n') if len(line.strip()) > 20]
 
415
 
416
+ state["insights"] = insights[:10] # Limit to 10 insights
417
  state["current_step"] = "insight_generator"
418
 
419
  except Exception as e:
 
440
  dataset_info["numeric_columns"] = df.select_dtypes(include=[np.number]).columns.tolist()
441
  dataset_info["categorical_columns"] = df.select_dtypes(include=['object', 'category']).columns.tolist()
442
 
443
+ # Simplified prompt for visualization planning
444
+ prompt = f"""Plan 5 effective visualizations for this dataset:
445
+
446
+ Numeric columns: {len(dataset_info.get('numeric_columns', []))}
447
+ Categorical columns: {len(dataset_info.get('categorical_columns', []))}
448
+
449
+ Return as JSON array:
450
+ [
451
+ {{"type": "histogram", "columns": ["col1"], "title": "Distribution of col1", "description": "Shows distribution", "purpose": "Understand patterns"}},
452
+ {{"type": "bar", "columns": ["col2"], "title": "Frequency of col2", "description": "Shows counts", "purpose": "Category analysis"}}
453
+ ]
454
+
455
+ Use types: histogram, bar, scatter, heatmap, line"""
456
+
457
+ # Use direct Groq API call
458
+ response_content = self._direct_groq_call(prompt)
459
+
 
 
 
 
 
 
 
 
 
 
460
  try:
461
  # Extract JSON from response
462
  json_start = response_content.find('[')
 
590
  plt.close()
591
 
592
  except Exception as e:
593
+ logger.warning(f"Failed to create {viz.get('type', 'unknown')} chart: {str(e)}")
594
  plt.close()
595
  continue
596
 
 
612
  insights = state["insights"]
613
  dataset_info = state["dataset_info"]
614
 
615
+ # Simplified prompt for recommendations
616
+ prompt = f"""Based on this data analysis, generate exactly 5 specific recommendations:
617
+
618
+ Dataset: {dataset_info.get('shape', [0])[0]:,} rows, {dataset_info.get('shape', [0])[1]} columns
619
+ Missing values: {sum(dataset_info.get('null_counts', {}).values()):,}
620
+ Key insights available: {len(insights)}
621
+
622
+ Format as:
623
+ **Recommendation 1:** [specific action]
624
+ **Recommendation 2:** [specific action]
625
+ **Recommendation 3:** [specific action]
626
+ **Recommendation 4:** [specific action]
627
+ **Recommendation 5:** [specific action]
628
+
629
+ Focus on:
630
+ - Data quality improvements
631
+ - Business opportunities
632
+ - Further analysis suggestions
633
+ - Actionable next steps"""
634
+
635
+ # Use direct Groq API call
636
+ response_content = self._direct_groq_call(prompt)
637
+
638
+ # Parse recommendations from response
639
+ recommendations = []
640
+ lines = response_content.split('\n')
641
+ for line in lines:
642
+ line = line.strip()
643
+ if line and ('**Recommendation' in line or line.startswith(('1.', '2.', '3.', '4.', '5.'))):
644
+ recommendations.append(line)
645
+
646
+ # If parsing failed, split by lines and take meaningful ones
647
+ if len(recommendations) < 3:
648
+ recommendations = [line.strip() for line in response_content.split('\n') if len(line.strip()) > 20]
649
+
650
+ state["recommendations"] = recommendations[:10] # Limit to 10
651
  state["current_step"] = "recommendation_engine"
652
 
653
  except Exception as e: