Update data_analysis_agent.py
Browse files
- data_analysis_agent.py +82 -22
data_analysis_agent.py
CHANGED
|
@@ -612,42 +612,102 @@ Use types: histogram, bar, scatter, heatmap, line"""
|
|
| 612 |
insights = state["insights"]
|
| 613 |
dataset_info = state["dataset_info"]
|
| 614 |
|
| 615 |
-
#
|
| 616 |
-
prompt = f"""Based on this data analysis, generate exactly 5 specific recommendations:
|
| 617 |
|
| 618 |
Dataset: {dataset_info.get('shape', [0])[0]:,} rows, {dataset_info.get('shape', [0])[1]} columns
|
| 619 |
Missing values: {sum(dataset_info.get('null_counts', {}).values()):,}
|
| 620 |
Key insights available: {len(insights)}
|
| 621 |
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
|
|
|
| 628 |
|
| 629 |
-
|
| 630 |
-
- Data quality improvements
|
| 631 |
-
- Business opportunities
|
| 632 |
-
- Further analysis suggestions
|
| 633 |
-
- Actionable next steps"""
|
| 634 |
|
| 635 |
# Use direct Groq API call
|
| 636 |
response_content = self._direct_groq_call(prompt)
|
|
|
|
| 637 |
|
| 638 |
-
#
|
| 639 |
recommendations = []
|
| 640 |
-
lines = response_content.split('\n')
|
| 641 |
-
for line in lines:
|
| 642 |
-
line = line.strip()
|
| 643 |
-
if line and ('**Recommendation' in line or line.startswith(('1.', '2.', '3.', '4.', '5.'))):
|
| 644 |
-
recommendations.append(line)
|
| 645 |
|
| 646 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 647 |
if len(recommendations) < 3:
|
| 648 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 649 |
|
| 650 |
-
state["recommendations"] = recommendations[:
|
| 651 |
state["current_step"] = "recommendation_engine"
|
| 652 |
|
| 653 |
except Exception as e:
|
|
|
|
| 612 |
insights = state["insights"]
|
| 613 |
dataset_info = state["dataset_info"]
|
| 614 |
|
| 615 |
+
# Enhanced prompt for better recommendation generation
|
| 616 |
+
prompt = f"""Based on this data analysis, generate exactly 5 specific actionable recommendations:
|
| 617 |
|
| 618 |
Dataset: {dataset_info.get('shape', [0])[0]:,} rows, {dataset_info.get('shape', [0])[1]} columns
|
| 619 |
Missing values: {sum(dataset_info.get('null_counts', {}).values()):,}
|
| 620 |
Key insights available: {len(insights)}
|
| 621 |
|
| 622 |
+
Please provide exactly 5 recommendations in this format:
|
| 623 |
+
|
| 624 |
+
1. [First recommendation about data quality or collection]
|
| 625 |
+
2. [Second recommendation about analysis or modeling]
|
| 626 |
+
3. [Third recommendation about business opportunities]
|
| 627 |
+
4. [Fourth recommendation about process improvements]
|
| 628 |
+
5. [Fifth recommendation about next steps or monitoring]
|
| 629 |
|
| 630 |
+
Each recommendation should be specific, measurable, and actionable."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
|
| 632 |
# Use direct Groq API call
|
| 633 |
response_content = self._direct_groq_call(prompt)
|
| 634 |
+
logger.info(f"🎯 Raw recommendations response: {response_content[:200]}...")
|
| 635 |
|
| 636 |
+
# Enhanced parsing for recommendations
|
| 637 |
recommendations = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
|
| 639 |
+
# Method 1: Look for numbered items (1., 2., etc.)
|
| 640 |
+
import re
|
| 641 |
+
numbered_pattern = r'(\d+\.)\s*(.+?)(?=\d+\.|$)'
|
| 642 |
+
numbered_matches = re.findall(numbered_pattern, response_content, re.DOTALL)
|
| 643 |
+
|
| 644 |
+
if numbered_matches and len(numbered_matches) >= 3:
|
| 645 |
+
for num, content in numbered_matches:
|
| 646 |
+
clean_content = content.strip().replace('\n', ' ').replace(' ', ' ')
|
| 647 |
+
if len(clean_content) > 15:
|
| 648 |
+
recommendations.append(f"**Recommendation {num[0]}:** {clean_content}")
|
| 649 |
+
logger.info(f"✅ Found {len(recommendations)} numbered recommendations")
|
| 650 |
+
|
| 651 |
+
# Method 2: Look for **Recommendation** format
|
| 652 |
if len(recommendations) < 3:
|
| 653 |
+
rec_pattern = r'\*\*Recommendation\s*(\d+):\*\*\s*(.+?)(?=\*\*Recommendation\s*\d+:|\n\n|$)'
|
| 654 |
+
rec_matches = re.findall(rec_pattern, response_content, re.DOTALL)
|
| 655 |
+
|
| 656 |
+
recommendations = []
|
| 657 |
+
for num, content in rec_matches:
|
| 658 |
+
clean_content = content.strip().replace('\n', ' ').replace(' ', ' ')
|
| 659 |
+
if len(clean_content) > 15:
|
| 660 |
+
recommendations.append(f"**Recommendation {num}:** {clean_content}")
|
| 661 |
+
logger.info(f"✅ Found {len(recommendations)} **Recommendation** format items")
|
| 662 |
+
|
| 663 |
+
# Method 3: Split by lines and filter meaningful content
|
| 664 |
+
if len(recommendations) < 3:
|
| 665 |
+
lines = response_content.split('\n')
|
| 666 |
+
recommendations = []
|
| 667 |
+
for line in lines:
|
| 668 |
+
line = line.strip()
|
| 669 |
+
# Look for lines that start with numbers or contain recommendation keywords
|
| 670 |
+
if (line and
|
| 671 |
+
(re.match(r'^\d+\.', line) or
|
| 672 |
+
'recommend' in line.lower() or
|
| 673 |
+
any(keyword in line.lower() for keyword in ['implement', 'consider', 'develop', 'improve', 'analyze', 'monitor', 'establish'])) and
|
| 674 |
+
len(line) > 25):
|
| 675 |
+
# Clean up the line
|
| 676 |
+
clean_line = re.sub(r'^\d+\.\s*', '', line) # Remove leading numbers
|
| 677 |
+
if not clean_line.startswith('**Recommendation'):
|
| 678 |
+
clean_line = f"**Recommendation {len(recommendations)+1}:** {clean_line}"
|
| 679 |
+
recommendations.append(clean_line)
|
| 680 |
+
if len(recommendations) >= 5:
|
| 681 |
+
break
|
| 682 |
+
logger.info(f"✅ Found {len(recommendations)} recommendations via line parsing")
|
| 683 |
+
|
| 684 |
+
# Method 4: Fallback - create structured recommendations
|
| 685 |
+
if len(recommendations) < 3:
|
| 686 |
+
logger.warning("⚠️ Parsing failed, creating fallback recommendations")
|
| 687 |
+
missing_pct = (sum(dataset_info.get('null_counts', {}).values()) /
|
| 688 |
+
max(1, dataset_info.get('shape', [1, 1])[0] * dataset_info.get('shape', [1, 1])[1]) * 100)
|
| 689 |
+
|
| 690 |
+
recommendations = [
|
| 691 |
+
"**Recommendation 1:** Implement comprehensive data quality assessment and validation procedures to ensure data integrity",
|
| 692 |
+
"**Recommendation 2:** Develop automated monitoring dashboards for key metrics and performance indicators",
|
| 693 |
+
"**Recommendation 3:** Consider advanced statistical modeling and machine learning techniques for predictive insights",
|
| 694 |
+
"**Recommendation 4:** Establish regular data collection and analysis workflows for ongoing business intelligence",
|
| 695 |
+
"**Recommendation 5:** Create stakeholder reporting mechanisms to communicate findings and track implementation progress"
|
| 696 |
+
]
|
| 697 |
+
|
| 698 |
+
if missing_pct > 10:
|
| 699 |
+
recommendations[0] = f"**Recommendation 1:** Address {missing_pct:.1f}% missing data through improved collection processes or imputation strategies"
|
| 700 |
+
|
| 701 |
+
# Ensure we have exactly 5 recommendations
|
| 702 |
+
while len(recommendations) < 5:
|
| 703 |
+
recommendations.append(f"**Recommendation {len(recommendations)+1}:** Conduct further analysis to identify additional optimization opportunities")
|
| 704 |
+
|
| 705 |
+
# Log final count
|
| 706 |
+
logger.info(f"🎯 Final recommendations count: {len(recommendations[:5])}")
|
| 707 |
+
for i, rec in enumerate(recommendations[:5], 1):
|
| 708 |
+
logger.info(f"Rec {i}: {rec[:100]}...")
|
| 709 |
|
| 710 |
+
state["recommendations"] = recommendations[:5] # Exactly 5 recommendations
|
| 711 |
state["current_step"] = "recommendation_engine"
|
| 712 |
|
| 713 |
except Exception as e:
|