Aka18 committed on
Commit
63ef20b
·
verified ·
1 Parent(s): fa623b2

Update data_analysis_agent.py

Browse files
Files changed (1) hide show
  1. data_analysis_agent.py +82 -22
data_analysis_agent.py CHANGED
@@ -612,42 +612,102 @@ Use types: histogram, bar, scatter, heatmap, line"""
612
  insights = state["insights"]
613
  dataset_info = state["dataset_info"]
614
 
615
- # Simplified prompt for recommendations
616
- prompt = f"""Based on this data analysis, generate exactly 5 specific recommendations:
617
 
618
  Dataset: {dataset_info.get('shape', [0])[0]:,} rows, {dataset_info.get('shape', [0])[1]} columns
619
  Missing values: {sum(dataset_info.get('null_counts', {}).values()):,}
620
  Key insights available: {len(insights)}
621
 
622
- Format as:
623
- **Recommendation 1:** [specific action]
624
- **Recommendation 2:** [specific action]
625
- **Recommendation 3:** [specific action]
626
- **Recommendation 4:** [specific action]
627
- **Recommendation 5:** [specific action]
 
628
 
629
- Focus on:
630
- - Data quality improvements
631
- - Business opportunities
632
- - Further analysis suggestions
633
- - Actionable next steps"""
634
 
635
  # Use direct Groq API call
636
  response_content = self._direct_groq_call(prompt)
 
637
 
638
- # Parse recommendations from response
639
  recommendations = []
640
- lines = response_content.split('\n')
641
- for line in lines:
642
- line = line.strip()
643
- if line and ('**Recommendation' in line or line.startswith(('1.', '2.', '3.', '4.', '5.'))):
644
- recommendations.append(line)
645
 
646
- # If parsing failed, split by lines and take meaningful ones
 
 
 
 
 
 
 
 
 
 
 
 
647
  if len(recommendations) < 3:
648
- recommendations = [line.strip() for line in response_content.split('\n') if len(line.strip()) > 20]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
649
 
650
- state["recommendations"] = recommendations[:10] # Limit to 10
651
  state["current_step"] = "recommendation_engine"
652
 
653
  except Exception as e:
 
612
  insights = state["insights"]
613
  dataset_info = state["dataset_info"]
614
 
615
+ # Enhanced prompt for better recommendation generation
616
+ prompt = f"""Based on this data analysis, generate exactly 5 specific actionable recommendations:
617
 
618
  Dataset: {dataset_info.get('shape', [0])[0]:,} rows, {dataset_info.get('shape', [0])[1]} columns
619
  Missing values: {sum(dataset_info.get('null_counts', {}).values()):,}
620
  Key insights available: {len(insights)}
621
 
622
+ Please provide exactly 5 recommendations in this format:
623
+
624
+ 1. [First recommendation about data quality or collection]
625
+ 2. [Second recommendation about analysis or modeling]
626
+ 3. [Third recommendation about business opportunities]
627
+ 4. [Fourth recommendation about process improvements]
628
+ 5. [Fifth recommendation about next steps or monitoring]
629
 
630
+ Each recommendation should be specific, measurable, and actionable."""
 
 
 
 
631
 
632
  # Use direct Groq API call
633
  response_content = self._direct_groq_call(prompt)
634
+ logger.info(f"🎯 Raw recommendations response: {response_content[:200]}...")
635
 
636
+ # Enhanced parsing for recommendations
637
  recommendations = []
 
 
 
 
 
638
 
639
+ # Method 1: Look for numbered items (1., 2., etc.)
640
+ import re
641
+ numbered_pattern = r'(\d+\.)\s*(.+?)(?=\d+\.|$)'
642
+ numbered_matches = re.findall(numbered_pattern, response_content, re.DOTALL)
643
+
644
+ if numbered_matches and len(numbered_matches) >= 3:
645
+ for num, content in numbered_matches:
646
+ clean_content = content.strip().replace('\n', ' ').replace(' ', ' ')
647
+ if len(clean_content) > 15:
648
+ recommendations.append(f"**Recommendation {num[0]}:** {clean_content}")
649
+ logger.info(f"✅ Found {len(recommendations)} numbered recommendations")
650
+
651
+ # Method 2: Look for **Recommendation** format
652
  if len(recommendations) < 3:
653
+ rec_pattern = r'\*\*Recommendation\s*(\d+):\*\*\s*(.+?)(?=\*\*Recommendation\s*\d+:|\n\n|$)'
654
+ rec_matches = re.findall(rec_pattern, response_content, re.DOTALL)
655
+
656
+ recommendations = []
657
+ for num, content in rec_matches:
658
+ clean_content = content.strip().replace('\n', ' ').replace(' ', ' ')
659
+ if len(clean_content) > 15:
660
+ recommendations.append(f"**Recommendation {num}:** {clean_content}")
661
+ logger.info(f"✅ Found {len(recommendations)} **Recommendation** format items")
662
+
663
+ # Method 3: Split by lines and filter meaningful content
664
+ if len(recommendations) < 3:
665
+ lines = response_content.split('\n')
666
+ recommendations = []
667
+ for line in lines:
668
+ line = line.strip()
669
+ # Look for lines that start with numbers or contain recommendation keywords
670
+ if (line and
671
+ (re.match(r'^\d+\.', line) or
672
+ 'recommend' in line.lower() or
673
+ any(keyword in line.lower() for keyword in ['implement', 'consider', 'develop', 'improve', 'analyze', 'monitor', 'establish'])) and
674
+ len(line) > 25):
675
+ # Clean up the line
676
+ clean_line = re.sub(r'^\d+\.\s*', '', line) # Remove leading numbers
677
+ if not clean_line.startswith('**Recommendation'):
678
+ clean_line = f"**Recommendation {len(recommendations)+1}:** {clean_line}"
679
+ recommendations.append(clean_line)
680
+ if len(recommendations) >= 5:
681
+ break
682
+ logger.info(f"✅ Found {len(recommendations)} recommendations via line parsing")
683
+
684
+ # Method 4: Fallback - create structured recommendations
685
+ if len(recommendations) < 3:
686
+ logger.warning("⚠️ Parsing failed, creating fallback recommendations")
687
+ missing_pct = (sum(dataset_info.get('null_counts', {}).values()) /
688
+ max(1, dataset_info.get('shape', [1, 1])[0] * dataset_info.get('shape', [1, 1])[1]) * 100)
689
+
690
+ recommendations = [
691
+ "**Recommendation 1:** Implement comprehensive data quality assessment and validation procedures to ensure data integrity",
692
+ "**Recommendation 2:** Develop automated monitoring dashboards for key metrics and performance indicators",
693
+ "**Recommendation 3:** Consider advanced statistical modeling and machine learning techniques for predictive insights",
694
+ "**Recommendation 4:** Establish regular data collection and analysis workflows for ongoing business intelligence",
695
+ "**Recommendation 5:** Create stakeholder reporting mechanisms to communicate findings and track implementation progress"
696
+ ]
697
+
698
+ if missing_pct > 10:
699
+ recommendations[0] = f"**Recommendation 1:** Address {missing_pct:.1f}% missing data through improved collection processes or imputation strategies"
700
+
701
+ # Ensure we have exactly 5 recommendations
702
+ while len(recommendations) < 5:
703
+ recommendations.append(f"**Recommendation {len(recommendations)+1}:** Conduct further analysis to identify additional optimization opportunities")
704
+
705
+ # Log final count
706
+ logger.info(f"🎯 Final recommendations count: {len(recommendations[:5])}")
707
+ for i, rec in enumerate(recommendations[:5], 1):
708
+ logger.info(f"Rec {i}: {rec[:100]}...")
709
 
710
+ state["recommendations"] = recommendations[:5] # Exactly 5 recommendations
711
  state["current_step"] = "recommendation_engine"
712
 
713
  except Exception as e: