Spaces:

Aka18
/

AIDA

Running

App Files Community

Aka18 committed on Jul 6, 2025

Commit

63ef20b

verified ·

1 Parent(s): fa623b2

Update data_analysis_agent.py

Browse files

Files changed (1) hide show

data_analysis_agent.py +82 -22

data_analysis_agent.py CHANGED Viewed

@@ -612,42 +612,102 @@ Use types: histogram, bar, scatter, heatmap, line"""
             insights = state["insights"]
             dataset_info = state["dataset_info"]
-            # Simplified prompt for recommendations
-            prompt = f"""Based on this data analysis, generate exactly 5 specific recommendations:
 Dataset: {dataset_info.get('shape', [0])[0]:,} rows, {dataset_info.get('shape', [0])[1]} columns
 Missing values: {sum(dataset_info.get('null_counts', {}).values()):,}
 Key insights available: {len(insights)}
-Format as:
-**Recommendation 1:** [specific action]
-**Recommendation 2:** [specific action]
-**Recommendation 3:** [specific action]
-**Recommendation 4:** [specific action]
-**Recommendation 5:** [specific action]
-Focus on:
-- Data quality improvements
-- Business opportunities
-- Further analysis suggestions
-- Actionable next steps"""
             # Use direct Groq API call
             response_content = self._direct_groq_call(prompt)
-            # Parse recommendations from response
             recommendations = []
-            lines = response_content.split('\n')
-            for line in lines:
-                line = line.strip()
-                if line and ('**Recommendation' in line or line.startswith(('1.', '2.', '3.', '4.', '5.'))):
-                    recommendations.append(line)
-            # If parsing failed, split by lines and take meaningful ones
             if len(recommendations) < 3:
-                recommendations = [line.strip() for line in response_content.split('\n') if len(line.strip()) > 20]
-            state["recommendations"] = recommendations[:10]  # Limit to 10
             state["current_step"] = "recommendation_engine"
         except Exception as e:

             insights = state["insights"]
             dataset_info = state["dataset_info"]
+            # Enhanced prompt for better recommendation generation
+            prompt = f"""Based on this data analysis, generate exactly 5 specific actionable recommendations:
 Dataset: {dataset_info.get('shape', [0])[0]:,} rows, {dataset_info.get('shape', [0])[1]} columns
 Missing values: {sum(dataset_info.get('null_counts', {}).values()):,}
 Key insights available: {len(insights)}
+Please provide exactly 5 recommendations in this format:
+1. [First recommendation about data quality or collection]
+2. [Second recommendation about analysis or modeling]
+3. [Third recommendation about business opportunities]
+4. [Fourth recommendation about process improvements]
+5. [Fifth recommendation about next steps or monitoring]
+Each recommendation should be specific, measurable, and actionable."""
             # Use direct Groq API call
             response_content = self._direct_groq_call(prompt)
+            logger.info(f"🎯 Raw recommendations response: {response_content[:200]}...")
+            # Enhanced parsing for recommendations
             recommendations = []
+            # Method 1: Look for numbered items (1., 2., etc.)
+            import re
+            numbered_pattern = r'(\d+\.)\s*(.+?)(?=\d+\.|$)'
+            numbered_matches = re.findall(numbered_pattern, response_content, re.DOTALL)
+            if numbered_matches and len(numbered_matches) >= 3:
+                for num, content in numbered_matches:
+                    clean_content = content.strip().replace('\n', ' ').replace('  ', ' ')
+                    if len(clean_content) > 15:
+                        recommendations.append(f"**Recommendation {num[0]}:** {clean_content}")
+                logger.info(f"✅ Found {len(recommendations)} numbered recommendations")
+            # Method 2: Look for **Recommendation** format
             if len(recommendations) < 3:
+                rec_pattern = r'\*\*Recommendation\s*(\d+):\*\*\s*(.+?)(?=\*\*Recommendation\s*\d+:|\n\n|$)'
+                rec_matches = re.findall(rec_pattern, response_content, re.DOTALL)
+                recommendations = []
+                for num, content in rec_matches:
+                    clean_content = content.strip().replace('\n', ' ').replace('  ', ' ')
+                    if len(clean_content) > 15:
+                        recommendations.append(f"**Recommendation {num}:** {clean_content}")
+                logger.info(f"✅ Found {len(recommendations)} **Recommendation** format items")
+            # Method 3: Split by lines and filter meaningful content
+            if len(recommendations) < 3:
+                lines = response_content.split('\n')
+                recommendations = []
+                for line in lines:
+                    line = line.strip()
+                    # Look for lines that start with numbers or contain recommendation keywords
+                    if (line and
+                        (re.match(r'^\d+\.', line) or
+                         'recommend' in line.lower() or
+                         any(keyword in line.lower() for keyword in ['implement', 'consider', 'develop', 'improve', 'analyze', 'monitor', 'establish'])) and
+                        len(line) > 25):
+                        # Clean up the line
+                        clean_line = re.sub(r'^\d+\.\s*', '', line)  # Remove leading numbers
+                        if not clean_line.startswith('**Recommendation'):
+                            clean_line = f"**Recommendation {len(recommendations)+1}:** {clean_line}"
+                        recommendations.append(clean_line)
+                        if len(recommendations) >= 5:
+                            break
+                logger.info(f"✅ Found {len(recommendations)} recommendations via line parsing")
+            # Method 4: Fallback - create structured recommendations
+            if len(recommendations) < 3:
+                logger.warning("⚠️ Parsing failed, creating fallback recommendations")
+                missing_pct = (sum(dataset_info.get('null_counts', {}).values()) /
+                             max(1, dataset_info.get('shape', [1, 1])[0] * dataset_info.get('shape', [1, 1])[1]) * 100)
+                recommendations = [
+                    "**Recommendation 1:** Implement comprehensive data quality assessment and validation procedures to ensure data integrity",
+                    "**Recommendation 2:** Develop automated monitoring dashboards for key metrics and performance indicators",
+                    "**Recommendation 3:** Consider advanced statistical modeling and machine learning techniques for predictive insights",
+                    "**Recommendation 4:** Establish regular data collection and analysis workflows for ongoing business intelligence",
+                    "**Recommendation 5:** Create stakeholder reporting mechanisms to communicate findings and track implementation progress"
+                ]
+                if missing_pct > 10:
+                    recommendations[0] = f"**Recommendation 1:** Address {missing_pct:.1f}% missing data through improved collection processes or imputation strategies"
+            # Ensure we have exactly 5 recommendations
+            while len(recommendations) < 5:
+                recommendations.append(f"**Recommendation {len(recommendations)+1}:** Conduct further analysis to identify additional optimization opportunities")
+            # Log final count
+            logger.info(f"🎯 Final recommendations count: {len(recommendations[:5])}")
+            for i, rec in enumerate(recommendations[:5], 1):
+                logger.info(f"Rec {i}: {rec[:100]}...")
+            state["recommendations"] = recommendations[:5]  # Exactly 5 recommendations
             state["current_step"] = "recommendation_engine"
         except Exception as e: