Spaces:

holistic-ai
/

AgentGraph

Running

File size: 9,055 Bytes

"""
Unified Knowledge Extraction Method (1-Task Approach)

Copied from core/agent_monitoring_unified.py and adapted for evaluation framework.
Uses the unified 1-task CrewAI approach with a single agent that performs all
knowledge extraction tasks in one step.
"""

# NOTE: the LiteLLM stop-param patch is applied by the `utils.fix_litellm_stop_param`
# star-import at the end of this import block (i.e. after crewai is imported, not first).
import os
import sys

# Add the parent directory to the path to ensure imports work correctly
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))
import json
import logging
import time
from datetime import datetime
from typing import Any, Dict

from crewai import Agent, Crew, Process, Task

from evaluation.knowledge_extraction.baselines.base_method import BaseKnowledgeExtractionMethod
from evaluation.knowledge_extraction.utils.models import KnowledgeGraph

# Import shared prompt templates
from evaluation.knowledge_extraction.utils.prompts import (
    ENTITY_EXTRACTION_INSTRUCTION_PROMPT,
    ENTITY_EXTRACTION_SYSTEM_PROMPT,
    GRAPH_BUILDER_SYSTEM_PROMPT,
    RELATION_EXTRACTION_INSTRUCTION_PROMPT,
    RELATION_EXTRACTION_SYSTEM_PROMPT,
)
from utils.fix_litellm_stop_param import *  # This applies the patches  # noqa: F403

# Root logging is configured at INFO; this module logs through its own named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Chatty third-party loggers are capped at WARNING.
for _noisy_name in ("openai", "httpx", "litellm", "chromadb"):
    logging.getLogger(_noisy_name).setLevel(logging.WARNING)
del _noisy_name  # keep the loop variable out of the module namespace

# Default verbosity level (0 means bool(verbose_level) is False).
verbose_level = 0

# Model name published through the process environment.
os.environ["OPENAI_MODEL_NAME"] = "gpt-5-mini"

class UnifiedKnowledgeExtractionMethod(BaseKnowledgeExtractionMethod):
    """Unified 1-task knowledge extraction method using CrewAI.

    One agent with one task extracts entities and relations and builds the
    knowledge graph in a single crew run. The agent, task and crew are created
    in ``__init__`` and the same crew object is used by ``process_text``.
    """

    def __init__(self, **kwargs):
        """Register the method as ``"unified_method"`` and build the crew.

        Args:
            **kwargs: Forwarded unchanged to ``BaseKnowledgeExtractionMethod``.
        """
        super().__init__("unified_method", **kwargs)
        self._setup_agent_and_task()

    def _setup_agent_and_task(self):
        """Set up the CrewAI agent, its single task, and the crew that runs it."""

        # Create unified agent; its backstory concatenates the three shared
        # system prompts (entity extraction, relation extraction, graph builder).
        self.unified_knowledge_graph_agent = Agent(
            role="Unified Knowledge Graph Analyst",
            goal="Create comprehensive knowledge graphs from agent system data in a single analysis pass",
            backstory=f"""{ENTITY_EXTRACTION_SYSTEM_PROMPT}

{RELATION_EXTRACTION_SYSTEM_PROMPT}

{GRAPH_BUILDER_SYSTEM_PROMPT}.""",
            verbose=bool(verbose_level),
            llm=os.environ["OPENAI_MODEL_NAME"]
        )

        # Create unified task; the output is requested as a KnowledgeGraph model.
        self.unified_knowledge_graph_task = Task(
            description=f"""
            Extract entities:
            {ENTITY_EXTRACTION_INSTRUCTION_PROMPT}
            
            Also extract relationships:
            {RELATION_EXTRACTION_INSTRUCTION_PROMPT}
            
            Finally, build the knowledge graph:
            """,
            agent=self.unified_knowledge_graph_agent,
            expected_output="A complete knowledge graph with entities, relations, and metadata",
            output_pydantic=KnowledgeGraph,
        )

        # Create crew: one agent, one task, sequential, no memory or planning.
        self.unified_agent_monitoring_crew = Crew(
            agents=[self.unified_knowledge_graph_agent],
            tasks=[self.unified_knowledge_graph_task],
            verbose=bool(verbose_level),
            memory=False,
            planning=False,
            process=Process.sequential,
        )

    @staticmethod
    def _kg_data_from_result(result: Any) -> Any:
        """Convert a crew result into plain knowledge-graph data.

        Prefers the structured ``pydantic`` attribute; otherwise tries to parse
        the ``raw`` attribute as JSON. When neither works, an empty graph
        carrying an ``"error"`` key is returned instead of raising.

        Args:
            result: Object returned by ``Crew.kickoff``.

        Returns:
            The decoded data. This is a dict for the pydantic and fallback
            paths, but may be any JSON value when ``raw`` is parsed (the caller
            validates the type).
        """
        structured = getattr(result, "pydantic", None)
        if structured:
            # Pydantic v2 deprecates ``.dict()`` in favour of ``.model_dump()``;
            # fall back to ``.dict()`` so v1-style models keep working.
            dump = getattr(structured, "model_dump", None) or structured.dict
            return dump()

        if hasattr(result, "raw"):
            try:
                return json.loads(result.raw)
            except (TypeError, ValueError) as parse_error:
                # TypeError: raw is not str/bytes (e.g. None);
                # ValueError: covers json.JSONDecodeError.
                logger.warning("Failed to parse raw crew output as JSON: %s", parse_error)
                return {"entities": [], "relations": [], "error": "Failed to parse result"}

        return {"entities": [], "relations": [], "error": "Unknown result format"}

    def process_text(self, text: str) -> Dict[str, Any]:
        """
        Process input text using the unified 1-task CrewAI approach.

        Any exception raised while running the crew or validating its output is
        caught and reported through ``success=False``; this method does not
        raise for those errors. Note that an unparseable or unrecognised crew
        result still yields ``success=True`` with an empty graph whose
        ``kg_data`` carries an ``"error"`` key.

        Args:
            text: Input text to process

        Returns:
            Dictionary with kg_data, metadata, success, and optional error
        """
        start_time = time.time()

        try:
            logger.info("process_text called with text length: %d", len(text))
            logger.info("text first 200 chars: %r", text[:200])

            logger.info("Starting crew execution with input_data...")

            # Run the crew with proper input mechanism
            result = self.unified_agent_monitoring_crew.kickoff(inputs={"input_data": text})

            logger.info("Crew execution completed, result type: %s", type(result))

            processing_time = time.time() - start_time

            # Extract the knowledge graph from the result
            kg_data = self._kg_data_from_result(result)

            # Validate kg_data structure
            if not isinstance(kg_data, dict):
                raise ValueError("kg_data is not a dict after parsing")

            if "entities" not in kg_data or "relations" not in kg_data:
                raise ValueError("kg_data missing 'entities' or 'relations'")

            # Add metadata. A present-but-None (or otherwise non-dict) metadata
            # value is replaced too, so the assignment below cannot fail and
            # discard an otherwise valid graph.
            if not isinstance(kg_data.get("metadata"), dict):
                kg_data["metadata"] = {}

            kg_data["metadata"]["processing_info"] = {
                "method": "unified_single_task",
                "processing_time_seconds": processing_time,
                "processed_at": datetime.now().isoformat(),
                "agent_count": 1,
                "task_count": 1,
                "api_calls": 1
            }

            # Calculate statistics; ``or []`` tolerates explicit None values.
            entity_count = len(kg_data.get("entities") or [])
            relation_count = len(kg_data.get("relations") or [])

            return {
                "success": True,
                "kg_data": kg_data,
                "metadata": {
                    "approach": "unified_1_task",
                    "tasks_executed": 1,
                    "agents_used": 1,
                    "method": self.method_name,
                    "processing_time_seconds": processing_time,
                    "entity_count": entity_count,
                    "relation_count": relation_count,
                    "entities_per_second": entity_count / processing_time if processing_time > 0 else 0,
                    "relations_per_second": relation_count / processing_time if processing_time > 0 else 0,
                    "api_calls": 1
                }
            }

        except Exception as e:
            processing_time = time.time() - start_time
            # logger.exception logs at ERROR level and appends the traceback.
            logger.exception("Error in unified knowledge extraction method: %s", e)
            logger.error("Error type: %s", type(e).__name__)
            return {
                "success": False,
                "error": str(e),
                "kg_data": {"entities": [], "relations": []},
                "metadata": {
                    "approach": "unified_1_task",
                    "tasks_executed": 0,
                    "agents_used": 0,
                    "method": self.method_name,
                    "processing_time_seconds": processing_time,
                    "api_calls": 1
                }
            }

    def extract_knowledge_graph(self, trace_data: str) -> Dict[str, Any]:
        """
        Extract knowledge graph from trace data.

        Thin wrapper around ``process_text`` that returns only the graph and
        substitutes an empty graph whenever processing fails.

        Args:
            trace_data: Agent trace data as JSON string

        Returns:
            Dictionary with entities and relations
        """
        try:
            # Debug logging
            logger.info("extract_knowledge_graph called with trace_data type: %s", type(trace_data))
            if isinstance(trace_data, str):
                logger.info("trace_data length: %d", len(trace_data))
                logger.info("trace_data first 200 chars: %r", trace_data[:200])

            # Pass the JSON string directly to process_text without re-encoding
            result = self.process_text(trace_data)

            # Return just the knowledge graph data
            if result.get("success", False):
                return result.get("kg_data", {"entities": [], "relations": []})

            # Return empty knowledge graph on failure
            return {"entities": [], "relations": []}

        except Exception as e:
            logger.error("Error in extract_knowledge_graph: %s", e)
            logger.error("trace_data type: %s", type(trace_data))
            if isinstance(trace_data, str):
                logger.error("trace_data content (first 200 chars): %r", trace_data[:200])
            return {"entities": [], "relations": []}