Spaces:

Leon4gr45
/

hyp

Sleeping

hyp

File size: 16,974 Bytes

# Main application entry point for AI Co-Scientist

import os
import argparse
import logging
import json
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional

from .agents.agent_factory import AgentFactory
from .tools.search_tool import SearchTool
from .tools.reasoning_tool import ReasoningTool
from .tools.citation_tool import CitationTool
from .utils.logger import setup_logging
from .utils.validators import validate_research_goal, validate_hypothesis
from .utils.exceptions import ConfigError, ValidationError

from .config.config import (
    AGENT_DEFAULT_MODEL,
    AGENT_DEFAULT_TEMPERATURE,
    LOG_LEVEL,
    LOG_TO_FILE,
    MAX_ITERATIONS,
    MAX_TOKENS
)

class AICoScientist:
    """Coordinate the multi-agent scientific research workflow.

    The class wires together a supervisor agent, a set of tools and the
    specialized agents obtained from ``AgentFactory``, and drives a research
    goal through hypothesis generation, ranking, refinement and reporting.

    Attributes:
        logger: Logger returned by ``setup_logging``.
        config: Effective configuration (defaults merged with any overrides).
        agent_factory: Factory used to obtain the specialized agents.
        tools: Mapping of tool name to tool instance.
        supervisor: Supervisor agent created by the factory.
        research_goal: Current research goal, or ``None`` until one is set.
        hypotheses: Hypotheses produced by ``generate_hypotheses``.
        ranked_hypotheses: Hypotheses as returned by ``rank_hypotheses`` and
            later updated in place by ``refine_hypotheses``.
        final_report: Report produced by ``generate_research_report``.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the AI Co-Scientist system.

        Args:
            config: Optional configuration dictionary whose entries override
                the defaults. The dictionary is copied, never modified.
        """
        # Initialize logger
        self.logger = setup_logging(log_level=LOG_LEVEL, log_to_file=LOG_TO_FILE)
        self.logger.info("Initializing AI Co-Scientist system")

        # Start from the module-level defaults
        self.config = {
            "model": AGENT_DEFAULT_MODEL,
            "temperature": AGENT_DEFAULT_TEMPERATURE,
            "max_iterations": MAX_ITERATIONS,
            "max_tokens": MAX_TOKENS,
        }

        # Override with provided config
        if config:
            # Work on a copy so the caller's dictionary is left untouched.
            overrides = dict(config)
            # Ensure model is never None
            if overrides.get("model") is None:
                overrides["model"] = AGENT_DEFAULT_MODEL

            self.config.update(overrides)
            self.logger.info("Configuration overridden with custom values")

        self.logger.info(f"Using model: {self.config['model']}")

        # Initialize agent factory
        self.agent_factory = AgentFactory()

        # Initialize tools
        self.tools = {
            "search": SearchTool(),
            "reasoning": ReasoningTool(),
            "citation": CitationTool(),
        }

        # Create supervisor agent
        self.supervisor = self.agent_factory.get_supervisor(
            model=self.config["model"],
            temperature=self.config["temperature"],
        )

        # Initialize research state
        self.research_goal = None
        self.hypotheses = []
        self.ranked_hypotheses = []
        self.final_report = None

    @staticmethod
    def _hypothesis_text(hypothesis: Dict[str, Any]) -> Optional[str]:
        """Return the hypothesis text, preferring 'hypothesis' over 'statement'."""
        return hypothesis.get("hypothesis", hypothesis.get("statement"))

    def set_research_goal(self, goal: str) -> bool:
        """Validate and store the research goal.

        Args:
            goal: The research goal to pursue

        Returns:
            True once the goal has been stored

        Raises:
            ValidationError: If ``validate_research_goal`` rejects the goal
        """
        self.logger.info(f"Setting research goal: {goal}")

        # Validate the research goal
        is_valid, issues = validate_research_goal(goal)

        if not is_valid:
            error_msg = f"Invalid research goal: {', '.join(issues)}"
            self.logger.error(error_msg)
            raise ValidationError(error_msg)

        self.research_goal = goal
        self.logger.info("Research goal set successfully")
        return True

    def generate_hypotheses(self, count: int = 5) -> List[Dict[str, Any]]:
        """Generate scientific hypotheses based on the research goal.

        Each usable entry gets a 'statement' key (copied from 'hypothesis'
        when present) and a templated 'research_questions' list. Entries that
        are not dictionaries or carry no hypothesis text are skipped with a
        warning instead of aborting the whole run.

        Args:
            count: Number of hypotheses to request from the generation agent

        Returns:
            List of generated hypotheses with metadata

        Raises:
            ValueError: If research goal is not set
        """
        if not self.research_goal:
            raise ValueError("Research goal must be set before generating hypotheses")

        self.logger.info(f"Generating {count} hypotheses for research goal: {self.research_goal}")

        # Get generation agent
        generation_agent = self.agent_factory.get_agent(
            "generation",
            model=self.config["model"],
            temperature=self.config["temperature"] + 0.2,  # Slightly higher temperature for creativity
        )

        # Generate hypotheses; tolerate an agent that returns nothing
        generated = generation_agent.generate_hypotheses(self.research_goal, count=count) or []

        hypotheses = []
        for h in generated:
            if not isinstance(h, dict):
                self.logger.warning("Skipping generated entry that is not a dictionary")
                continue

            # Convert 'hypothesis' key to 'statement' for downstream compatibility
            if "hypothesis" in h:
                h["statement"] = h["hypothesis"]

            area = h.get("statement")
            if not area:
                # Without text there is nothing to rank, refine or report on.
                self.logger.warning("Skipping generated entry without hypothesis text")
                continue

            # Simple transformation: templated research questions per area
            h["research_questions"] = [
                f"What are the key mechanisms underlying: {area}?",
                f"How could {area} be investigated in different populations or contexts?",
                f"What are the potential implications of {area} for future research or applications?",
            ]
            hypotheses.append(h)

        self.hypotheses = hypotheses
        self.logger.info(f"Generated {len(hypotheses)} areas of interest")

        return hypotheses

    def rank_hypotheses(self) -> List[Dict[str, Any]]:
        """Rank the generated hypotheses using the ranking agent.

        Returns:
            The list returned by the ranking agent

        Raises:
            ValueError: If no hypotheses have been generated
        """
        if not self.hypotheses:
            raise ValueError("No hypotheses have been generated to rank")

        self.logger.info(f"Ranking {len(self.hypotheses)} hypotheses")

        # Get ranking agent
        ranking_agent = self.agent_factory.get_agent("ranking")

        # Rank hypotheses
        ranked_hypotheses = ranking_agent.rank_hypotheses(
            self.research_goal,
            self.hypotheses,
        )

        self.ranked_hypotheses = ranked_hypotheses
        self.logger.info(f"Ranked {len(ranked_hypotheses)} hypotheses")

        return ranked_hypotheses

    def refine_hypotheses(
        self,
        iterations: int = 3,
        top_n: int = 3,
        proximity_threshold: float = 0.7,
    ) -> List[Dict[str, Any]]:
        """Refine the top hypotheses through multiple iterations.

        In every iteration each selected hypothesis is reviewed, evolved from
        the review feedback, and scored for proximity to the research goal.
        The evolved version replaces the current one only when its proximity
        score reaches ``proximity_threshold``; otherwise the current version
        is kept. The refined entries are written back to the front of
        ``self.ranked_hypotheses``.

        Args:
            iterations: Number of refinement iterations
            top_n: How many of the best-ranked hypotheses to refine
            proximity_threshold: Minimum proximity score for an evolved
                hypothesis to be accepted

        Returns:
            List of refined hypotheses

        Raises:
            ValueError: If no ranked hypotheses are available or ``top_n`` is
                smaller than 1
        """
        if not self.ranked_hypotheses:
            raise ValueError("No ranked hypotheses are available for refinement")
        if top_n < 1:
            raise ValueError("top_n must be at least 1")

        # Slicing already yields a new list, so the ranked list is not aliased.
        refined_hypotheses = list(self.ranked_hypotheses[:top_n])
        self.logger.info(f"Refining top {len(refined_hypotheses)} hypotheses through {iterations} iterations")

        # Get agents for refinement
        reflection_agent = self.agent_factory.get_agent("reflection")
        evolution_agent = self.agent_factory.get_agent("evolution")
        proximity_agent = self.agent_factory.get_agent("proximity")

        for i in range(iterations):
            self.logger.info(f"Refinement iteration {i+1}/{iterations}")

            for j, hypothesis in enumerate(refined_hypotheses):
                current_text = self._hypothesis_text(hypothesis)

                # Get feedback from reflection agent
                feedback = reflection_agent.review_hypothesis(current_text, self.research_goal)

                # Evolve hypothesis based on feedback
                evolved = evolution_agent.evolve_hypothesis(
                    current_text,
                    feedback,
                    self.research_goal,
                )
                evolved_text = self._hypothesis_text(evolved)

                # Check proximity to research goal
                proximity_result = proximity_agent.evaluate_proximity(
                    evolved_text,
                    self.research_goal,
                )
                score = proximity_result["proximity_score"]

                # Update hypothesis with refined version if it's relevant
                if score >= proximity_threshold:
                    evolved["proximity"] = proximity_result
                    evolved["feedback"] = feedback
                    evolved["iteration"] = i + 1
                    refined_hypotheses[j] = evolved
                    # Guard the slice below against a missing/None text.
                    main_text = evolved_text or ""
                    self.logger.info(f"Refined hypothesis {j+1}: {main_text[:100]}...")
                else:
                    self.logger.warning(f"Refined hypothesis {j+1} was too far from research goal (score: {score})")

        # Update ranked hypotheses with refined versions
        self.ranked_hypotheses[:len(refined_hypotheses)] = refined_hypotheses

        return refined_hypotheses

    def generate_research_report(self) -> Dict[str, Any]:
        """Generate a research report from the ranked hypotheses.

        Returns:
            The report returned by the meta-review agent

        Raises:
            ValueError: If no hypotheses have been processed
        """
        if not self.ranked_hypotheses:
            raise ValueError("No hypotheses have been processed for report generation")

        self.logger.info("Generating comprehensive research report")

        # Get meta-review agent
        meta_review_agent = self.agent_factory.get_agent("metareview")

        # Generate report
        report = meta_review_agent.generate_report(
            self.research_goal,
            self.ranked_hypotheses,
            max_hypotheses=5,  # Include top 5 hypotheses in report
        )

        self.final_report = report
        self.logger.info(f"Generated research report with {report.get('hypothesis_count', 'unknown')} hypotheses")

        return report

    def save_results(self, output_dir: Optional[str] = None) -> str:
        """Save the research results to disk as a JSON file.

        Args:
            output_dir: Optional directory to save results. If None, a
                'results' directory next to the directory containing this
                file is used.

        Returns:
            Path to the saved results file

        Raises:
            ValueError: If no final report is available
        """
        if not self.final_report:
            raise ValueError("No final report available to save")

        # Determine output directory
        if output_dir is None:
            # Parent of the directory that contains this file
            target_dir = Path(__file__).parent.parent / "results"
        else:
            target_dir = Path(output_dir)

        # Create output directory (and any missing parents) if needed
        target_dir.mkdir(parents=True, exist_ok=True)

        # One clock reading so the file name and the metadata always agree.
        now = datetime.now()

        # Create results dictionary
        results = {
            "research_goal": self.research_goal,
            "configuration": self.config,
            "hypotheses": {
                "initial": self.hypotheses,
                "ranked": self.ranked_hypotheses,
            },
            "final_report": self.final_report,
            "meta": {
                "model": self.config["model"],
                "timestamp": now.isoformat(),
            },
        }

        # Save results to file; explicit UTF-8 keeps the output independent of
        # the platform's default encoding and non-ASCII text readable.
        results_file = target_dir / f"research_results_{now.strftime('%Y%m%d_%H%M%S')}.json"
        with open(results_file, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        self.logger.info(f"Saved research results to {results_file}")
        return str(results_file)

    def run_full_workflow(
        self,
        research_goal: str,
        iterations: int = 3,
        output_dir: Optional[str] = None,
        hypothesis_count: int = 7,
    ) -> Dict[str, Any]:
        """Run the complete research workflow from goal to final report.

        Args:
            research_goal: The research goal to pursue
            iterations: Number of refinement iterations
            output_dir: Optional directory to save results; nothing is
                written when it is not given
            hypothesis_count: Number of initial hypotheses to generate

        Returns:
            Dictionary with 'research_goal', 'hypotheses' (the ranked list),
            'report', and 'results_file' (path of the saved JSON file, or
            None when nothing was saved)

        Raises:
            ValidationError: If research goal is invalid
            Exception: Any error raised by a workflow step is logged and
                re-raised unchanged
        """
        self.logger.info(f"Starting full research workflow for goal: {research_goal}")

        try:
            self.set_research_goal(research_goal)
            self.generate_hypotheses(count=hypothesis_count)
            self.rank_hypotheses()
            self.refine_hypotheses(iterations=iterations)
            self.generate_research_report()

            # Save results if output directory is specified
            results_file = self.save_results(output_dir) if output_dir else None
        except Exception as e:
            # logger.exception records the traceback alongside the message.
            self.logger.exception(f"Error in research workflow: {e}")
            raise

        self.logger.info("Research workflow completed successfully")

        # Return final results
        return {
            "research_goal": self.research_goal,
            "hypotheses": self.ranked_hypotheses,
            "report": self.final_report,
            "results_file": results_file,
        }


def main() -> int:
    """Command-line entry point for the AI Co-Scientist application.

    Parses the command-line options. Without ``--goal`` only the help text is
    printed and nothing is initialized. With a goal, the full workflow is run
    and a summary of the top hypotheses and the executive summary is printed.

    Returns:
        0 on success or when only the help text was printed, 1 if the
        workflow raised an exception.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="AI Co-Scientist: Multi-agent system for scientific hypothesis generation")
    parser.add_argument("--goal", "-g", type=str, help="Research goal to pursue")
    parser.add_argument("--model", "-m", type=str, default=AGENT_DEFAULT_MODEL, help="LLM model to use")
    parser.add_argument("--temp", "-t", type=float, default=AGENT_DEFAULT_TEMPERATURE, help="Temperature for LLM generation")
    parser.add_argument("--iterations", "-i", type=int, default=3, help="Number of refinement iterations")
    parser.add_argument("--output", "-o", type=str, help="Output directory for results")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    # Nothing to run: show usage without constructing agents or tools.
    if not args.goal:
        parser.print_help()
        return 0

    # Configure logging level
    # NOTE(review): AICoScientist.__init__ calls setup_logging again with
    # LOG_LEVEL; confirm that this does not undo --verbose.
    log_level = "DEBUG" if args.verbose else LOG_LEVEL
    logger = setup_logging(log_level=log_level, log_to_file=LOG_TO_FILE)

    # Initialize configuration
    config = {
        "model": args.model,
        "temperature": args.temp,
        "max_iterations": MAX_ITERATIONS,
        "max_tokens": MAX_TOKENS,
    }

    # Initialize AI Co-Scientist
    acs = AICoScientist(config=config)

    try:
        results = acs.run_full_workflow(
            research_goal=args.goal,
            iterations=args.iterations,
            output_dir=args.output,
        )

        # Print summary results
        print("\n" + "=" * 80)
        print(f"RESEARCH GOAL: {args.goal}")
        print("=" * 80)
        print("\nTOP HYPOTHESES:")
        for i, h in enumerate(results["hypotheses"][:3]):
            print(f"\n{i+1}. {h.get('hypothesis', h.get('statement', ''))}")
            print(f"   Score: {h.get('score', 'N/A')}")

        # A report without an executive summary must not turn a successful
        # run into a reported failure.
        report = results.get("report") or {}
        print("\n" + "=" * 80)
        print("EXECUTIVE SUMMARY:")
        print(report.get("executive_summary", "No executive summary available"))
        print("=" * 80 + "\n")

        # Print where the results were saved: the exact file when the
        # workflow result names one, otherwise the requested directory.
        if args.output:
            saved_to = results.get("results_file") or args.output
            print(f"Complete results saved to: {saved_to}")

    except Exception as e:
        logger.error(f"Error in workflow execution: {str(e)}")
        print(f"\nError: {str(e)}")
        return 1

    return 0


if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status so that a
    # failed workflow is visible to shells and schedulers.
    raise SystemExit(main())