Aka18 committed on
Commit
fa623b2
·
verified ·
1 Parent(s): 24f5365

Update data_analysis_agent.py

Browse files
Files changed (1) hide show
  1. data_analysis_agent.py +219 -155
data_analysis_agent.py CHANGED
@@ -10,19 +10,19 @@ import warnings
10
  import traceback
11
  import time
12
  import random
13
- import httpx
 
 
 
14
  warnings.filterwarnings('ignore')
15
 
16
  from typing import Dict, List, Any, Optional, TypedDict
17
- import json
18
  from datetime import datetime
19
  import logging
20
 
21
- # LangGraph and LLM imports
22
  from langgraph.graph import StateGraph, END
23
- from langchain_groq import ChatGroq
24
  from langchain_core.messages import HumanMessage, SystemMessage
25
- from langchain_core.prompts import ChatPromptTemplate
26
 
27
  # Configure logging
28
  logging.basicConfig(level=logging.INFO)
@@ -41,86 +41,136 @@ class AnalysisState(TypedDict):
41
 
42
  class DataAnalysisAgent:
43
  def __init__(self, groq_api_key: str, model_name: str = "llama3-70b-8192"):
44
- """Initialize the Data Analysis Agent with HF Spaces networking fix"""
45
 
46
- # Detect if running in Hugging Face Spaces
47
- self.is_hf_spaces = os.environ.get('SPACE_ID') is not None
48
  self.groq_api_key = groq_api_key
49
  self.model_name = model_name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  if self.is_hf_spaces:
52
- logger.info("🚀 Initializing for Hugging Face Spaces...")
53
- # Hugging Face Spaces specific configuration
54
- self.llm = ChatGroq(
55
- groq_api_key=groq_api_key,
56
- model_name=model_name,
57
- temperature=0.1,
58
- max_tokens=2000,
59
- timeout=180, # Longer timeout for HF Spaces
60
- max_retries=0, # Disable retries, handle manually
61
- http_client=httpx.Client(
62
- timeout=httpx.Timeout(180.0),
63
- limits=httpx.Limits(max_connections=5, max_keepalive_connections=2),
64
- headers={
65
- "User-Agent": "Mozilla/5.0 (compatible; DataAnalysisAgent/1.0)",
66
- "Accept": "application/json",
67
- "Connection": "close" # Important for HF Spaces
68
- }
69
- )
70
- )
71
  else:
72
- logger.info("💻 Initializing for local/Streamlit Cloud...")
73
- # Normal configuration for local/Streamlit Cloud
74
- self.llm = ChatGroq(
75
- groq_api_key=groq_api_key,
76
- model_name=model_name,
77
- temperature=0.1,
78
- max_tokens=2000
79
- )
80
 
81
  # Set up the analysis workflow graph
82
  self.workflow = self._create_workflow()
83
 
84
- def _hf_spaces_llm_call(self, prompt: str) -> str:
85
- """LLM call optimized for Hugging Face Spaces"""
 
 
86
 
87
- if not self.is_hf_spaces:
88
- # Normal call for local/Streamlit Cloud
89
- response = self.llm.invoke([HumanMessage(content=prompt)])
90
- return response.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- # Hugging Face Spaces specific handling
93
- max_attempts = 3
94
  for attempt in range(max_attempts):
95
  try:
96
- # Add random delay to avoid rate limiting
97
  if attempt > 0:
98
- delay = random.uniform(2, 5) * attempt
99
- logger.info(f"⏳ HF Spaces: Waiting {delay:.1f}s before retry {attempt + 1}")
 
100
  time.sleep(delay)
101
 
102
- # Make the call
103
- logger.info(f"🤖 HF Spaces: LLM attempt {attempt + 1}/{max_attempts}")
104
- response = self.llm.invoke([HumanMessage(content=prompt)])
105
 
106
- if response and response.content:
107
- logger.info("✅ HF Spaces: LLM call successful")
108
- return response.content
 
 
109
 
110
- except Exception as e:
111
- error_str = str(e).lower()
112
- logger.warning(f"⚠️ HF Spaces: Attempt {attempt + 1} failed: {str(e)}")
 
 
 
 
 
 
 
113
 
114
- if "connection" in error_str or "timeout" in error_str:
115
- # Network issue - retry with longer delay
116
- if attempt < max_attempts - 1:
117
- continue
 
 
 
 
 
 
 
 
 
 
 
118
  else:
119
- # Other error - might be permanent
120
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- # All attempts failed
123
- raise ConnectionError(f"HF Spaces: Failed after {max_attempts} attempts")
124
 
125
  def _create_workflow(self) -> StateGraph:
126
  """Create the LangGraph workflow for data analysis"""
@@ -167,21 +217,19 @@ class DataAnalysisAgent:
167
  "datetime_columns": df.select_dtypes(include=['datetime64']).columns.tolist()
168
  }
169
 
170
- # Use LLM to generate initial insights about the dataset
171
- prompt = f"""
172
- Analyze this dataset profile and provide initial observations:
173
-
174
- Dataset Shape: {dataset_info['shape']}
175
- Columns: {dataset_info['columns']}
176
- Data Types: {dataset_info['dtypes']}
177
- Missing Values: {dataset_info['null_counts']}
178
- Duplicate Rows: {dataset_info['duplicate_rows']}
179
-
180
- Provide a brief analysis of the dataset structure, data quality issues, and potential analysis opportunities.
181
- """
182
 
183
- # Use HF Spaces optimized call
184
- response_content = self._hf_spaces_llm_call(prompt)
185
  dataset_info["llm_profile"] = response_content
186
 
187
  state["dataset_info"] = dataset_info
@@ -275,22 +323,17 @@ class DataAnalysisAgent:
275
 
276
  column_analysis[column] = analysis
277
 
278
- # Use LLM to interpret column analysis
279
- prompt = f"""
280
- Analyze these column statistics and identify patterns, anomalies, and insights:
281
-
282
- {json.dumps(column_analysis, indent=2, default=str)}
283
-
284
- Focus on:
285
- 1. Data quality issues
286
- 2. Distribution patterns
287
- 3. Potential relationships between columns
288
- 4. Outliers or anomalies
289
- 5. Business insights
290
- """
291
 
292
- # Use HF Spaces optimized call
293
- response_content = self._hf_spaces_llm_call(prompt)
294
  column_analysis["llm_interpretation"] = response_content
295
 
296
  state["column_analysis"] = column_analysis
@@ -337,23 +380,40 @@ class DataAnalysisAgent:
337
  })
338
  correlations["high_correlations"] = high_correlations
339
 
340
- # Use LLM to generate comprehensive insights
341
- prompt = f"""
342
- Based on the dataset analysis, generate key insights and findings:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
- Dataset Info: {json.dumps(dataset_info, indent=2, default=str)}
345
- High Correlations: {json.dumps(correlations, indent=2, default=str)}
346
 
347
- Generate 5-10 specific, actionable insights that would be valuable for business decision-making.
348
- Focus on trends, patterns, anomalies, and opportunities.
349
- """
 
 
 
 
350
 
351
- # Use HF Spaces optimized call
352
- response_content = self._hf_spaces_llm_call(prompt)
353
- insights = response_content.split('\n')
354
- insights = [insight.strip() for insight in insights if insight.strip()]
355
 
356
- state["insights"] = insights
357
  state["current_step"] = "insight_generator"
358
 
359
  except Exception as e:
@@ -380,33 +440,23 @@ class DataAnalysisAgent:
380
  dataset_info["numeric_columns"] = df.select_dtypes(include=[np.number]).columns.tolist()
381
  dataset_info["categorical_columns"] = df.select_dtypes(include=['object', 'category']).columns.tolist()
382
 
383
- # Use LLM to plan visualizations
384
- prompt = f"""
385
- Plan the most effective visualizations for this dataset:
386
-
387
- Dataset Info: {json.dumps(dataset_info, indent=2, default=str)}
388
- Key Insights: {insights}
389
-
390
- Suggest 5-8 different visualization types with:
391
- 1. Chart type (histogram, scatter, bar, line, heatmap, etc.)
392
- 2. Columns to use
393
- 3. Purpose/insight to communicate
394
- 4. Title and description
395
-
396
- Return as a JSON list with this structure:
397
- [
398
- {{
399
- "type": "histogram",
400
- "columns": ["column_name"],
401
- "title": "Distribution of...",
402
- "description": "Shows the...",
403
- "purpose": "Understand distribution"
404
- }}
405
- ]
406
- """
407
-
408
- # Use HF Spaces optimized call
409
- response_content = self._hf_spaces_llm_call(prompt)
410
  try:
411
  # Extract JSON from response
412
  json_start = response_content.find('[')
@@ -540,7 +590,7 @@ class DataAnalysisAgent:
540
  plt.close()
541
 
542
  except Exception as e:
543
- logger.warning(f"Failed to create {viz['type']} chart: {str(e)}")
544
  plt.close()
545
  continue
546
 
@@ -562,28 +612,42 @@ class DataAnalysisAgent:
562
  insights = state["insights"]
563
  dataset_info = state["dataset_info"]
564
 
565
- # Use LLM to generate recommendations
566
- prompt = f"""
567
- Based on the complete data analysis, generate specific, actionable recommendations:
568
-
569
- Dataset Info: {json.dumps(dataset_info, indent=2, default=str)}
570
- Key Insights: {insights}
571
-
572
- Generate 5-10 specific recommendations that include:
573
- 1. Data quality improvements
574
- 2. Business opportunities
575
- 3. Further analysis suggestions
576
- 4. Action items for stakeholders
577
-
578
- Make recommendations specific, measurable, and actionable.
579
- """
580
-
581
- # Use HF Spaces optimized call
582
- response_content = self._hf_spaces_llm_call(prompt)
583
- recommendations = response_content.split('\n')
584
- recommendations = [rec.strip() for rec in recommendations if rec.strip()]
585
-
586
- state["recommendations"] = recommendations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
587
  state["current_step"] = "recommendation_engine"
588
 
589
  except Exception as e:
 
10
  import traceback
11
  import time
12
  import random
13
+ import requests
14
+ import json
15
+ from urllib3.util.retry import Retry
16
+ from requests.adapters import HTTPAdapter
17
  warnings.filterwarnings('ignore')
18
 
19
  from typing import Dict, List, Any, Optional, TypedDict
 
20
  from datetime import datetime
21
  import logging
22
 
23
+ # LangGraph imports
24
  from langgraph.graph import StateGraph, END
 
25
  from langchain_core.messages import HumanMessage, SystemMessage
 
26
 
27
  # Configure logging
28
  logging.basicConfig(level=logging.INFO)
 
41
 
42
  class DataAnalysisAgent:
43
  def __init__(self, groq_api_key: str, model_name: str = "llama3-70b-8192"):
44
+ """Initialize with direct Groq API calls to bypass HF Spaces blocks"""
45
 
 
 
46
  self.groq_api_key = groq_api_key
47
  self.model_name = model_name
48
+ self.is_hf_spaces = os.environ.get('SPACE_ID') is not None
49
+
50
+ # Configure requests session with aggressive retry strategy
51
+ self.session = requests.Session()
52
+ retry_strategy = Retry(
53
+ total=5,
54
+ backoff_factor=3,
55
+ status_forcelist=[429, 500, 502, 503, 504],
56
+ allowed_methods=["POST"]
57
+ )
58
+ adapter = HTTPAdapter(max_retries=retry_strategy)
59
+ self.session.mount("http://", adapter)
60
+ self.session.mount("https://", adapter)
61
+
62
+ # Set session headers to mimic browser/curl
63
+ self.session.headers.update({
64
+ "User-Agent": "curl/7.68.0",
65
+ "Accept": "*/*",
66
+ "Accept-Encoding": "gzip, deflate",
67
+ "Connection": "close"
68
+ })
69
 
70
  if self.is_hf_spaces:
71
+ logger.info("🚀 HF Spaces: Using direct Groq API calls")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  else:
73
+ logger.info("💻 Local: Using direct Groq API calls")
 
 
 
 
 
 
 
74
 
75
  # Set up the analysis workflow graph
76
  self.workflow = self._create_workflow()
77
 
78
+ def _direct_groq_call(self, prompt: str) -> str:
79
+ """Direct Groq API call bypassing LangChain completely"""
80
+
81
+ url = "https://api.groq.com/openai/v1/chat/completions"
82
 
83
+ headers = {
84
+ "Authorization": f"Bearer {self.groq_api_key}",
85
+ "Content-Type": "application/json",
86
+ "User-Agent": "curl/7.68.0",
87
+ "Accept": "*/*",
88
+ "Connection": "close"
89
+ }
90
+
91
+ data = {
92
+ "messages": [
93
+ {"role": "user", "content": prompt}
94
+ ],
95
+ "model": self.model_name,
96
+ "max_tokens": 1500,
97
+ "temperature": 0.1,
98
+ "stream": False
99
+ }
100
+
101
+ max_attempts = 5 if self.is_hf_spaces else 3
102
 
 
 
103
  for attempt in range(max_attempts):
104
  try:
 
105
  if attempt > 0:
106
+ # Exponential backoff with jitter
107
+ delay = (2 ** attempt) + random.uniform(1, 3)
108
+ logger.info(f"⏳ Waiting {delay:.1f}s before attempt {attempt + 1}")
109
  time.sleep(delay)
110
 
111
+ logger.info(f"🤖 Direct Groq API attempt {attempt + 1}/{max_attempts}")
 
 
112
 
113
+ # Try different approaches for HF Spaces
114
+ if self.is_hf_spaces and attempt > 1:
115
+ # Try with different headers
116
+ headers["User-Agent"] = f"DataAnalysisAgent/1.{attempt}"
117
+ headers["X-Forwarded-For"] = "127.0.0.1"
118
 
119
+ response = self.session.post(
120
+ url,
121
+ headers=headers,
122
+ json=data,
123
+ timeout=120,
124
+ verify=True,
125
+ allow_redirects=True
126
+ )
127
+
128
+ logger.info(f"📡 Response status: {response.status_code}")
129
 
130
+ if response.status_code == 200:
131
+ result = response.json()
132
+ content = result["choices"][0]["message"]["content"]
133
+ logger.info("✅ Direct Groq API call successful")
134
+ return content
135
+
136
+ elif response.status_code == 429:
137
+ logger.warning("⚠️ Rate limited, retrying...")
138
+ time.sleep(10)
139
+ continue
140
+
141
+ elif response.status_code in [500, 502, 503, 504]:
142
+ logger.warning(f"⚠️ Server error {response.status_code}, retrying...")
143
+ continue
144
+
145
  else:
146
+ logger.error(f"❌ API error {response.status_code}: {response.text}")
147
+ if attempt == max_attempts - 1:
148
+ raise Exception(f"Groq API error: {response.status_code}")
149
+ continue
150
+
151
+ except requests.exceptions.ConnectTimeout:
152
+ logger.warning(f"⚠️ Connection timeout on attempt {attempt + 1}")
153
+ continue
154
+
155
+ except requests.exceptions.ReadTimeout:
156
+ logger.warning(f"⚠️ Read timeout on attempt {attempt + 1}")
157
+ continue
158
+
159
+ except requests.exceptions.ConnectionError as e:
160
+ logger.warning(f"⚠️ Connection error on attempt {attempt + 1}: {str(e)}")
161
+ # Try with different session for HF Spaces
162
+ if self.is_hf_spaces and attempt > 2:
163
+ logger.info("🔄 Creating new session...")
164
+ self.session = requests.Session()
165
+ continue
166
+
167
+ except Exception as e:
168
+ logger.error(f"❌ Unexpected error on attempt {attempt + 1}: {str(e)}")
169
+ if attempt == max_attempts - 1:
170
+ raise
171
+ continue
172
 
173
+ raise ConnectionError(f"Failed to connect to Groq API after {max_attempts} attempts")
 
174
 
175
  def _create_workflow(self) -> StateGraph:
176
  """Create the LangGraph workflow for data analysis"""
 
217
  "datetime_columns": df.select_dtypes(include=['datetime64']).columns.tolist()
218
  }
219
 
220
+ # Simpler prompt for better success rate
221
+ prompt = f"""Analyze this dataset profile:
222
+
223
+ Dataset: {dataset_info['shape'][0]} rows × {dataset_info['shape'][1]} columns
224
+ Missing values: {sum(dataset_info['null_counts'].values())} total
225
+ Duplicates: {dataset_info['duplicate_rows']}
226
+ Numeric columns: {len(dataset_info['numeric_columns'])}
227
+ Categorical columns: {len(dataset_info['categorical_columns'])}
228
+
229
+ Provide a brief professional assessment of data quality and analysis potential in 2-3 sentences."""
 
 
230
 
231
+ # Use direct Groq API call
232
+ response_content = self._direct_groq_call(prompt)
233
  dataset_info["llm_profile"] = response_content
234
 
235
  state["dataset_info"] = dataset_info
 
323
 
324
  column_analysis[column] = analysis
325
 
326
+ # Simplified prompt for column analysis
327
+ prompt = f"""Analyze these column statistics and identify key patterns:
328
+
329
+ Total columns analyzed: {len(column_analysis)}
330
+ Numeric columns: {len([c for c in column_analysis if 'mean' in column_analysis[c]])}
331
+ Text columns: {len([c for c in column_analysis if 'top_values' in column_analysis[c]])}
332
+
333
+ Provide 2-3 key observations about data patterns and quality issues."""
 
 
 
 
 
334
 
335
+ # Use direct Groq API call
336
+ response_content = self._direct_groq_call(prompt)
337
  column_analysis["llm_interpretation"] = response_content
338
 
339
  state["column_analysis"] = column_analysis
 
380
  })
381
  correlations["high_correlations"] = high_correlations
382
 
383
+ # Simplified prompt for insights
384
+ prompt = f"""Generate exactly 5 specific insights for this dataset:
385
+
386
+ Dataset: {dataset_info.get('shape', [0])[0]:,} rows, {dataset_info.get('shape', [0])[1]} columns
387
+ Missing values: {sum(dataset_info.get('null_counts', {}).values()):,}
388
+ Numeric variables: {len(numeric_cols)}
389
+ Categorical variables: {len(dataset_info.get('categorical_columns', []))}
390
+ Strong correlations found: {len(correlations.get('high_correlations', []))}
391
+
392
+ Format as:
393
+ **Insight 1:** [specific finding]
394
+ **Insight 2:** [specific finding]
395
+ **Insight 3:** [specific finding]
396
+ **Insight 4:** [specific finding]
397
+ **Insight 5:** [specific finding]
398
+
399
+ Focus on data quality, patterns, and business value."""
400
 
401
+ # Use direct Groq API call
402
+ response_content = self._direct_groq_call(prompt)
403
 
404
+ # Parse insights from response
405
+ insights = []
406
+ lines = response_content.split('\n')
407
+ for line in lines:
408
+ line = line.strip()
409
+ if line and ('**Insight' in line or line.startswith(('1.', '2.', '3.', '4.', '5.'))):
410
+ insights.append(line)
411
 
412
+ # If parsing failed, split by lines and take meaningful ones
413
+ if len(insights) < 3:
414
+ insights = [line.strip() for line in response_content.split('\n') if len(line.strip()) > 20]
 
415
 
416
+ state["insights"] = insights[:10] # Limit to 10 insights
417
  state["current_step"] = "insight_generator"
418
 
419
  except Exception as e:
 
440
  dataset_info["numeric_columns"] = df.select_dtypes(include=[np.number]).columns.tolist()
441
  dataset_info["categorical_columns"] = df.select_dtypes(include=['object', 'category']).columns.tolist()
442
 
443
+ # Simplified prompt for visualization planning
444
+ prompt = f"""Plan 5 effective visualizations for this dataset:
445
+
446
+ Numeric columns: {len(dataset_info.get('numeric_columns', []))}
447
+ Categorical columns: {len(dataset_info.get('categorical_columns', []))}
448
+
449
+ Return as JSON array:
450
+ [
451
+ {{"type": "histogram", "columns": ["col1"], "title": "Distribution of col1", "description": "Shows distribution", "purpose": "Understand patterns"}},
452
+ {{"type": "bar", "columns": ["col2"], "title": "Frequency of col2", "description": "Shows counts", "purpose": "Category analysis"}}
453
+ ]
454
+
455
+ Use types: histogram, bar, scatter, heatmap, line"""
456
+
457
+ # Use direct Groq API call
458
+ response_content = self._direct_groq_call(prompt)
459
+
 
 
 
 
 
 
 
 
 
 
460
  try:
461
  # Extract JSON from response
462
  json_start = response_content.find('[')
 
590
  plt.close()
591
 
592
  except Exception as e:
593
+ logger.warning(f"Failed to create {viz.get('type', 'unknown')} chart: {str(e)}")
594
  plt.close()
595
  continue
596
 
 
612
  insights = state["insights"]
613
  dataset_info = state["dataset_info"]
614
 
615
+ # Simplified prompt for recommendations
616
+ prompt = f"""Based on this data analysis, generate exactly 5 specific recommendations:
617
+
618
+ Dataset: {dataset_info.get('shape', [0])[0]:,} rows, {dataset_info.get('shape', [0])[1]} columns
619
+ Missing values: {sum(dataset_info.get('null_counts', {}).values()):,}
620
+ Key insights available: {len(insights)}
621
+
622
+ Format as:
623
+ **Recommendation 1:** [specific action]
624
+ **Recommendation 2:** [specific action]
625
+ **Recommendation 3:** [specific action]
626
+ **Recommendation 4:** [specific action]
627
+ **Recommendation 5:** [specific action]
628
+
629
+ Focus on:
630
+ - Data quality improvements
631
+ - Business opportunities
632
+ - Further analysis suggestions
633
+ - Actionable next steps"""
634
+
635
+ # Use direct Groq API call
636
+ response_content = self._direct_groq_call(prompt)
637
+
638
+ # Parse recommendations from response
639
+ recommendations = []
640
+ lines = response_content.split('\n')
641
+ for line in lines:
642
+ line = line.strip()
643
+ if line and ('**Recommendation' in line or line.startswith(('1.', '2.', '3.', '4.', '5.'))):
644
+ recommendations.append(line)
645
+
646
+ # If parsing failed, split by lines and take meaningful ones
647
+ if len(recommendations) < 3:
648
+ recommendations = [line.strip() for line in response_content.split('\n') if len(line.strip()) > 20]
649
+
650
+ state["recommendations"] = recommendations[:10] # Limit to 10
651
  state["current_step"] = "recommendation_engine"
652
 
653
  except Exception as e: