Update data_analysis_agent.py
Browse files- data_analysis_agent.py +148 -147
data_analysis_agent.py
CHANGED
|
@@ -7,6 +7,7 @@ import plotly.express as px
|
|
| 7 |
import plotly.graph_objects as go
|
| 8 |
from plotly.subplots import make_subplots
|
| 9 |
import warnings
|
|
|
|
| 10 |
warnings.filterwarnings('ignore')
|
| 11 |
|
| 12 |
from typing import Dict, List, Any, Optional, TypedDict
|
|
@@ -19,7 +20,6 @@ from langgraph.graph import StateGraph, END
|
|
| 19 |
from langchain_groq import ChatGroq
|
| 20 |
from langchain_core.messages import HumanMessage, SystemMessage
|
| 21 |
from langchain_core.prompts import ChatPromptTemplate
|
| 22 |
-
os.environ['GROQ_TIMEOUT'] = '60'
|
| 23 |
|
| 24 |
# Configure logging
|
| 25 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -39,12 +39,13 @@ class AnalysisState(TypedDict):
|
|
| 39 |
class DataAnalysisAgent:
|
| 40 |
def __init__(self, groq_api_key: str, model_name: str = "llama3-70b-8192"):
|
| 41 |
"""Initialize the Data Analysis Agent"""
|
| 42 |
-
# Fixed: Use correct model name format
|
| 43 |
self.llm = ChatGroq(
|
| 44 |
groq_api_key=groq_api_key,
|
| 45 |
-
model_name=model_name,
|
| 46 |
temperature=0.1,
|
| 47 |
-
max_tokens=2000
|
|
|
|
| 48 |
)
|
| 49 |
|
| 50 |
# Set up the analysis workflow graph
|
|
@@ -518,153 +519,153 @@ class DataAnalysisAgent:
|
|
| 518 |
return state
|
| 519 |
|
| 520 |
def analyze_dataset(self, dataset_path: str) -> Dict[str, Any]:
|
| 521 |
-
"""
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
try:
|
| 525 |
-
# Load dataset
|
| 526 |
-
logger.info("π Loading dataset...")
|
| 527 |
-
if dataset_path.endswith('.csv'):
|
| 528 |
-
df = pd.read_csv(dataset_path)
|
| 529 |
-
elif dataset_path.endswith(('.xlsx', '.xls')):
|
| 530 |
-
df = pd.read_excel(dataset_path)
|
| 531 |
-
elif dataset_path.endswith('.json'):
|
| 532 |
-
df = pd.read_json(dataset_path)
|
| 533 |
-
else:
|
| 534 |
-
raise ValueError("Unsupported file format. Use CSV, Excel, or JSON.")
|
| 535 |
-
|
| 536 |
-
logger.info(f"β
Dataset loaded: {df.shape}")
|
| 537 |
|
| 538 |
-
# Initialize state with all required fields
|
| 539 |
-
initial_state = AnalysisState(
|
| 540 |
-
dataset=df,
|
| 541 |
-
dataset_info={},
|
| 542 |
-
column_analysis={},
|
| 543 |
-
insights=[],
|
| 544 |
-
visualizations=[],
|
| 545 |
-
recommendations=[],
|
| 546 |
-
current_step="",
|
| 547 |
-
error_messages=[]
|
| 548 |
-
)
|
| 549 |
-
|
| 550 |
-
logger.info("π Starting workflow execution...")
|
| 551 |
-
|
| 552 |
-
# Test each step individually to find the failure point
|
| 553 |
try:
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
try:
|
| 563 |
-
logger.info("π Step 2: Column analysis...")
|
| 564 |
-
state = self._analyze_columns(state)
|
| 565 |
-
logger.info(f"β
Column analysis completed. Columns analyzed: {len(state.get('column_analysis', {}))}")
|
| 566 |
-
except Exception as e:
|
| 567 |
-
logger.error(f"β Column analysis failed: {str(e)}")
|
| 568 |
-
state['error_messages'].append(f"Column analysis failed: {str(e)}")
|
| 569 |
-
|
| 570 |
-
try:
|
| 571 |
-
logger.info("π‘ Step 3: Generating insights...")
|
| 572 |
-
state = self._generate_insights(state)
|
| 573 |
-
insights_count = len(state.get('insights', []))
|
| 574 |
-
logger.info(f"β
Insights generation completed. Generated: {insights_count} insights")
|
| 575 |
-
if insights_count > 0:
|
| 576 |
-
logger.info(f"First insight: {state['insights'][0][:100]}...")
|
| 577 |
else:
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
logger.
|
| 581 |
-
|
| 582 |
-
#
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
try:
|
| 599 |
-
logger.info("π¨ Step 5: Creating charts...")
|
| 600 |
-
state = self._create_charts(state)
|
| 601 |
-
logger.info("β
Chart creation completed")
|
| 602 |
-
except Exception as e:
|
| 603 |
-
logger.error(f"β Chart creation failed: {str(e)}")
|
| 604 |
-
state['error_messages'].append(f"Chart creation failed: {str(e)}")
|
| 605 |
-
|
| 606 |
-
try:
|
| 607 |
-
logger.info("π― Step 6: Generating recommendations...")
|
| 608 |
-
state = self._generate_recommendations(state)
|
| 609 |
-
rec_count = len(state.get('recommendations', []))
|
| 610 |
-
logger.info(f"β
Recommendations generation completed. Generated: {rec_count} recommendations")
|
| 611 |
-
if rec_count > 0:
|
| 612 |
-
logger.info(f"First recommendation: {state['recommendations'][0][:100]}...")
|
| 613 |
-
else:
|
| 614 |
-
logger.warning("β οΈ No recommendations were generated!")
|
| 615 |
-
except Exception as e:
|
| 616 |
-
logger.error(f"β Recommendations generation failed: {str(e)}")
|
| 617 |
-
state['error_messages'].append(f"Recommendations generation failed: {str(e)}")
|
| 618 |
-
# Add fallback recommendations
|
| 619 |
-
state['recommendations'] = [
|
| 620 |
-
"Conduct detailed data quality assessment",
|
| 621 |
-
"Implement data monitoring processes",
|
| 622 |
-
"Consider advanced analytics for business insights",
|
| 623 |
-
"Review data collection and validation procedures"
|
| 624 |
-
]
|
| 625 |
-
|
| 626 |
-
# Clean up temp file
|
| 627 |
-
if os.path.exists(dataset_path):
|
| 628 |
try:
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
|
| 669 |
def _generate_report(self, results: Dict[str, Any], dataset_path: str):
|
| 670 |
"""Generate a comprehensive analysis report"""
|
|
|
|
| 7 |
import plotly.graph_objects as go
|
| 8 |
from plotly.subplots import make_subplots
|
| 9 |
import warnings
|
| 10 |
+
import traceback
|
| 11 |
warnings.filterwarnings('ignore')
|
| 12 |
|
| 13 |
from typing import Dict, List, Any, Optional, TypedDict
|
|
|
|
| 20 |
from langchain_groq import ChatGroq
|
| 21 |
from langchain_core.messages import HumanMessage, SystemMessage
|
| 22 |
from langchain_core.prompts import ChatPromptTemplate
|
|
|
|
| 23 |
|
| 24 |
# Configure logging
|
| 25 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 39 |
class DataAnalysisAgent:
|
| 40 |
def __init__(self, groq_api_key: str, model_name: str = "llama3-70b-8192"):
|
| 41 |
"""Initialize the Data Analysis Agent"""
|
| 42 |
+
# Fixed: Use correct model name format and add timeout
|
| 43 |
self.llm = ChatGroq(
|
| 44 |
groq_api_key=groq_api_key,
|
| 45 |
+
model_name=model_name,
|
| 46 |
temperature=0.1,
|
| 47 |
+
max_tokens=2000,
|
| 48 |
+
timeout=60 # Added timeout for container environments
|
| 49 |
)
|
| 50 |
|
| 51 |
# Set up the analysis workflow graph
|
|
|
|
| 519 |
return state
|
| 520 |
|
| 521 |
def analyze_dataset(self, dataset_path: str) -> Dict[str, Any]:
|
| 522 |
+
"""DIAGNOSTIC VERSION: Main method to analyze a dataset with detailed logging"""
|
| 523 |
+
logger.info(f"π Starting analysis of dataset: {dataset_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
try:
|
| 526 |
+
# Load dataset
|
| 527 |
+
logger.info("π Loading dataset...")
|
| 528 |
+
if dataset_path.endswith('.csv'):
|
| 529 |
+
df = pd.read_csv(dataset_path)
|
| 530 |
+
elif dataset_path.endswith(('.xlsx', '.xls')):
|
| 531 |
+
df = pd.read_excel(dataset_path)
|
| 532 |
+
elif dataset_path.endswith('.json'):
|
| 533 |
+
df = pd.read_json(dataset_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
else:
|
| 535 |
+
raise ValueError("Unsupported file format. Use CSV, Excel, or JSON.")
|
| 536 |
+
|
| 537 |
+
logger.info(f"β
Dataset loaded: {df.shape}")
|
| 538 |
+
|
| 539 |
+
# Initialize state with all required fields
|
| 540 |
+
initial_state = AnalysisState(
|
| 541 |
+
dataset=df,
|
| 542 |
+
dataset_info={},
|
| 543 |
+
column_analysis={},
|
| 544 |
+
insights=[],
|
| 545 |
+
visualizations=[],
|
| 546 |
+
recommendations=[],
|
| 547 |
+
current_step="",
|
| 548 |
+
error_messages=[]
|
| 549 |
+
)
|
| 550 |
+
|
| 551 |
+
logger.info("π Starting workflow execution...")
|
| 552 |
+
|
| 553 |
+
# Test each step individually to find the failure point
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
try:
|
| 555 |
+
logger.info("π Step 1: Data profiling...")
|
| 556 |
+
state = self._profile_dataset(initial_state)
|
| 557 |
+
logger.info(f"β
Data profiling completed. Info keys: {list(state.get('dataset_info', {}).keys())}")
|
| 558 |
+
except Exception as e:
|
| 559 |
+
logger.error(f"β Data profiling failed: {str(e)}")
|
| 560 |
+
state = initial_state
|
| 561 |
+
state['error_messages'] = [f"Data profiling failed: {str(e)}"]
|
| 562 |
+
|
| 563 |
+
try:
|
| 564 |
+
logger.info("π Step 2: Column analysis...")
|
| 565 |
+
state = self._analyze_columns(state)
|
| 566 |
+
logger.info(f"β
Column analysis completed. Columns analyzed: {len(state.get('column_analysis', {}))}")
|
| 567 |
+
except Exception as e:
|
| 568 |
+
logger.error(f"β Column analysis failed: {str(e)}")
|
| 569 |
+
state['error_messages'].append(f"Column analysis failed: {str(e)}")
|
| 570 |
+
|
| 571 |
+
try:
|
| 572 |
+
logger.info("π‘ Step 3: Generating insights...")
|
| 573 |
+
state = self._generate_insights(state)
|
| 574 |
+
insights_count = len(state.get('insights', []))
|
| 575 |
+
logger.info(f"β
Insights generation completed. Generated: {insights_count} insights")
|
| 576 |
+
if insights_count > 0:
|
| 577 |
+
logger.info(f"First insight: {state['insights'][0][:100]}...")
|
| 578 |
+
else:
|
| 579 |
+
logger.warning("β οΈ No insights were generated!")
|
| 580 |
+
except Exception as e:
|
| 581 |
+
logger.error(f"β Insights generation failed: {str(e)}")
|
| 582 |
+
state['error_messages'].append(f"Insights generation failed: {str(e)}")
|
| 583 |
+
# Add fallback insights
|
| 584 |
+
state['insights'] = [
|
| 585 |
+
"Basic dataset analysis completed",
|
| 586 |
+
f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns",
|
| 587 |
+
"Manual review recommended for detailed insights"
|
| 588 |
+
]
|
| 589 |
+
|
| 590 |
+
try:
|
| 591 |
+
logger.info("π Step 4: Planning visualizations...")
|
| 592 |
+
state = self._plan_visualizations(state)
|
| 593 |
+
viz_count = len(state.get('visualizations', []))
|
| 594 |
+
logger.info(f"β
Visualization planning completed. Planned: {viz_count} visualizations")
|
| 595 |
+
except Exception as e:
|
| 596 |
+
logger.error(f"β Visualization planning failed: {str(e)}")
|
| 597 |
+
state['error_messages'].append(f"Visualization planning failed: {str(e)}")
|
| 598 |
+
|
| 599 |
+
try:
|
| 600 |
+
logger.info("π¨ Step 5: Creating charts...")
|
| 601 |
+
state = self._create_charts(state)
|
| 602 |
+
logger.info("β
Chart creation completed")
|
| 603 |
+
except Exception as e:
|
| 604 |
+
logger.error(f"β Chart creation failed: {str(e)}")
|
| 605 |
+
state['error_messages'].append(f"Chart creation failed: {str(e)}")
|
| 606 |
+
|
| 607 |
+
try:
|
| 608 |
+
logger.info("π― Step 6: Generating recommendations...")
|
| 609 |
+
state = self._generate_recommendations(state)
|
| 610 |
+
rec_count = len(state.get('recommendations', []))
|
| 611 |
+
logger.info(f"β
Recommendations generation completed. Generated: {rec_count} recommendations")
|
| 612 |
+
if rec_count > 0:
|
| 613 |
+
logger.info(f"First recommendation: {state['recommendations'][0][:100]}...")
|
| 614 |
+
else:
|
| 615 |
+
logger.warning("β οΈ No recommendations were generated!")
|
| 616 |
+
except Exception as e:
|
| 617 |
+
logger.error(f"β Recommendations generation failed: {str(e)}")
|
| 618 |
+
state['error_messages'].append(f"Recommendations generation failed: {str(e)}")
|
| 619 |
+
# Add fallback recommendations
|
| 620 |
+
state['recommendations'] = [
|
| 621 |
+
"Conduct detailed data quality assessment",
|
| 622 |
+
"Implement data monitoring processes",
|
| 623 |
+
"Consider advanced analytics for business insights",
|
| 624 |
+
"Review data collection and validation procedures"
|
| 625 |
+
]
|
| 626 |
+
|
| 627 |
+
# Clean up temp file
|
| 628 |
+
if os.path.exists(dataset_path):
|
| 629 |
+
try:
|
| 630 |
+
os.remove(dataset_path)
|
| 631 |
+
logger.info("π§Ή Temporary file cleaned up")
|
| 632 |
+
except:
|
| 633 |
+
pass
|
| 634 |
+
|
| 635 |
+
# Prepare results
|
| 636 |
+
results = {
|
| 637 |
+
"dataset_info": state.get("dataset_info", {}),
|
| 638 |
+
"column_analysis": state.get("column_analysis", {}),
|
| 639 |
+
"insights": state.get("insights", []),
|
| 640 |
+
"visualizations": state.get("visualizations", []),
|
| 641 |
+
"recommendations": state.get("recommendations", []),
|
| 642 |
+
"analysis_timestamp": datetime.now().isoformat(),
|
| 643 |
+
"errors": state.get("error_messages", [])
|
| 644 |
+
}
|
| 645 |
+
|
| 646 |
+
# Log final results
|
| 647 |
+
logger.info(f"π Analysis completed!")
|
| 648 |
+
logger.info(f"π Final results: {len(results['insights'])} insights, {len(results['recommendations'])} recommendations")
|
| 649 |
+
logger.info(f"β Errors encountered: {len(results['errors'])}")
|
| 650 |
+
|
| 651 |
+
for error in results['errors']:
|
| 652 |
+
logger.error(f"Error details: {error}")
|
| 653 |
+
|
| 654 |
+
return results
|
| 655 |
+
|
| 656 |
+
except Exception as e:
|
| 657 |
+
logger.error(f"π₯ Critical analysis failure: {str(e)}")
|
| 658 |
+
logger.error(traceback.format_exc())
|
| 659 |
+
return {
|
| 660 |
+
"error": str(e),
|
| 661 |
+
"dataset_info": {},
|
| 662 |
+
"insights": [f"Analysis failed: {str(e)}"],
|
| 663 |
+
"recommendations": ["Please check logs and try again"],
|
| 664 |
+
"visualizations": [],
|
| 665 |
+
"column_analysis": {},
|
| 666 |
+
"analysis_timestamp": datetime.now().isoformat(),
|
| 667 |
+
"errors": [str(e)]
|
| 668 |
+
}
|
| 669 |
|
| 670 |
def _generate_report(self, results: Dict[str, Any], dataset_path: str):
|
| 671 |
"""Generate a comprehensive analysis report"""
|