"""Statistical analysis service using Modal code execution.
This module provides Modal-based statistical analysis WITHOUT depending on
agent_framework. This allows it to be used in the simple orchestrator mode
without requiring the magentic optional dependency.
The AnalysisAgent (in src/agents/) wraps this service for magentic mode.
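
Example (minimal sketch, run inside an async context; assumes Modal credentials
and the LLM model configuration are already set up):

    from src.services.statistical_analyzer import get_statistical_analyzer

    analyzer = get_statistical_analyzer()
    result = await analyzer.analyze(
        query="Does metformin reduce HbA1c in type 2 diabetes?",
        evidence=[],  # list[Evidence] from the retrieval layer
    )
    print(result.verdict, result.confidence)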
"""
import asyncio
import re
from functools import lru_cache, partial
from typing import Any, Literal

from pydantic import BaseModel, Field
from pydantic_ai import Agent

from src.agent_factory.judges import get_model
from src.tools.code_execution import (
    CodeExecutionError,
    get_code_executor,
    get_sandbox_library_prompt,
)
from src.utils.models import Evidence

# Type alias for verdict values
VerdictType = Literal["SUPPORTED", "REFUTED", "INCONCLUSIVE"]


class AnalysisResult(BaseModel):
"""Result of statistical analysis."""
verdict: VerdictType = Field(
description="SUPPORTED, REFUTED, or INCONCLUSIVE",
)
confidence: float = Field(ge=0.0, le=1.0, description="Confidence in verdict (0-1)")
statistical_evidence: str = Field(
description="Summary of statistical findings from code execution"
)
code_generated: str = Field(description="Python code that was executed")
execution_output: str = Field(description="Output from code execution")
key_findings: list[str] = Field(default_factory=list, description="Key takeaways")
limitations: list[str] = Field(default_factory=list, description="Limitations")
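
    # Example instance (illustrative values only):
    #   AnalysisResult(
    #       verdict="SUPPORTED",
    #       confidence=0.90,
    #       statistical_evidence="Welch t-test: t=3.12, p=0.004",
    #       code_generated="from scipy import stats\n...",
    #       execution_output="p-value: 0.0040",
    #       key_findings=["p-value: 0.0040 (significant)"],
    #       limitations=["Analysis based on summary data only"],
    #   )

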
class StatisticalAnalyzer:
"""Performs statistical analysis using Modal code execution.
This service:
1. Generates Python code for statistical analysis using LLM
2. Executes code in Modal sandbox
3. Interprets results
4. Returns verdict (SUPPORTED/REFUTED/INCONCLUSIVE)
Note: This class has NO agent_framework dependency, making it safe
to use in the simple orchestrator without the magentic extra.
"""
def __init__(self) -> None:
"""Initialize the analyzer."""
self._code_executor: Any = None
self._agent: Agent[None, str] | None = None

    def _get_code_executor(self) -> Any:
"""Lazy initialization of code executor."""
if self._code_executor is None:
self._code_executor = get_code_executor()
return self._code_executor

    def _get_agent(self) -> Agent[None, str]:
"""Lazy initialization of LLM agent for code generation."""
if self._agent is None:
library_versions = get_sandbox_library_prompt()
self._agent = Agent(
model=get_model(),
output_type=str,
system_prompt=f"""You are a biomedical data scientist.
Generate Python code to analyze research evidence and test hypotheses.
Guidelines:
1. Use pandas, numpy, scipy.stats for analysis
2. Print clear, interpretable results
3. Include statistical tests (t-tests, chi-square, etc.)
4. Calculate effect sizes and confidence intervals
5. Keep code concise (<50 lines)
6. Set 'result' variable to SUPPORTED, REFUTED, or INCONCLUSIVE
Available libraries:
{library_versions}
Output format: Return ONLY executable Python code, no explanations.""",
)
return self._agent
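
    # Illustrative example of code the agent above might generate (hypothetical
    # model output; the only hard convention is the final 'result' variable):
    #
    #   from scipy import stats
    #   treated = [30.2, 28.5, 31.0]
    #   control = [22.1, 24.3, 23.8]
    #   t, p = stats.ttest_ind(treated, control)
    #   print(f"p-value: {p:.4f}")
    #   result = "SUPPORTED" if p < 0.05 else "INCONCLUSIVE"
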
async def analyze(
self,
query: str,
evidence: list[Evidence],
hypothesis: dict[str, Any] | None = None,
) -> AnalysisResult:
"""Run statistical analysis on evidence.
Args:
query: The research question
evidence: List of Evidence objects to analyze
hypothesis: Optional hypothesis dict with drug, target, pathway, effect
Returns:
AnalysisResult with verdict and statistics
"""
# Build analysis prompt (method handles slicing internally)
evidence_summary = self._summarize_evidence(evidence)
hypothesis_text = ""
if hypothesis:
hypothesis_text = (
f"\nHypothesis: {hypothesis.get('drug', 'Unknown')} → "
f"{hypothesis.get('target', '?')} → "
f"{hypothesis.get('pathway', '?')} → "
f"{hypothesis.get('effect', '?')}\n"
f"Confidence: {hypothesis.get('confidence', 0.5):.0%}\n"
)
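        # Hypothesis dict shape assumed by the .get() calls above (illustrative values):
        #   {"drug": "metformin", "target": "AMPK", "pathway": "gluconeogenesis",
        #    "effect": "lower hepatic glucose output", "confidence": 0.7}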
prompt = f"""Generate Python code to statistically analyze:
**Research Question**: {query}
{hypothesis_text}
**Evidence Summary**:
{evidence_summary}
Generate executable Python code to analyze this evidence."""
try:
# Generate code
agent = self._get_agent()
code_result = await agent.run(prompt)
generated_code = code_result.output
# Execute in Modal sandbox
loop = asyncio.get_running_loop()
executor = self._get_code_executor()
execution = await loop.run_in_executor(
None, partial(executor.execute, generated_code, timeout=120)
)
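            # The executor result is treated as a dict of the shape
            # (inferred from the accesses below, not a documented contract):
            #   {"success": bool, "stdout": str, "stderr": str, "error": str | None}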
if not execution["success"]:
return AnalysisResult(
verdict="INCONCLUSIVE",
confidence=0.0,
statistical_evidence=(
f"Execution failed: {execution.get('error', 'Unknown error')}"
),
code_generated=generated_code,
execution_output=execution.get("stderr", ""),
key_findings=[],
limitations=["Code execution failed"],
)
# Interpret results
return self._interpret_results(generated_code, execution)
except CodeExecutionError as e:
return AnalysisResult(
verdict="INCONCLUSIVE",
confidence=0.0,
statistical_evidence=str(e),
code_generated="",
execution_output="",
key_findings=[],
limitations=[f"Analysis error: {e}"],
)

    def _summarize_evidence(self, evidence: list[Evidence]) -> str:
"""Summarize evidence for code generation prompt."""
if not evidence:
return "No evidence available."
lines = []
for i, ev in enumerate(evidence[:5], 1):
content = ev.content
truncated = content[:200] + ("..." if len(content) > 200 else "")
lines.append(f"{i}. {truncated}")
lines.append(f" Source: {ev.citation.title}")
lines.append(f" Relevance: {ev.relevance:.0%}\n")
return "\n".join(lines)
def _interpret_results(
self,
code: str,
execution: dict[str, Any],
) -> AnalysisResult:
"""Interpret code execution results."""
stdout = execution["stdout"]
stdout_upper = stdout.upper()
# Extract verdict with robust word-boundary matching
verdict: VerdictType = "INCONCLUSIVE"
if re.search(r"\bSUPPORTED\b", stdout_upper) and not re.search(
r"\b(?:NOT|UN)SUPPORTED\b", stdout_upper
):
verdict = "SUPPORTED"
elif re.search(r"\bREFUTED\b", stdout_upper):
verdict = "REFUTED"
# Extract key findings
key_findings = []
for line in stdout.split("\n"):
line_lower = line.lower()
if any(kw in line_lower for kw in ["p-value", "significant", "effect", "mean"]):
key_findings.append(line.strip())
# Calculate confidence from p-values
confidence = self._calculate_confidence(stdout)
return AnalysisResult(
verdict=verdict,
confidence=confidence,
statistical_evidence=stdout.strip(),
code_generated=code,
execution_output=stdout,
key_findings=key_findings[:5],
limitations=[
"Analysis based on summary data only",
"Limited to available evidence",
"Statistical tests assume data independence",
],
)

    def _calculate_confidence(self, output: str) -> float:
"""Calculate confidence based on statistical results."""
        # Also capture scientific notation (e.g. "p-value: 1.2e-05")
        p_values = re.findall(r"p[-\s]?value[:\s]+(\d+\.?\d*(?:e-?\d+)?)", output.lower())
if p_values:
try:
min_p = min(float(p) for p in p_values)
if min_p < 0.001:
return 0.95
elif min_p < 0.01:
return 0.90
elif min_p < 0.05:
return 0.80
else:
return 0.60
except ValueError:
pass
return 0.70 # Default


@lru_cache(maxsize=1)
def get_statistical_analyzer() -> StatisticalAnalyzer:
"""Get or create singleton StatisticalAnalyzer instance (thread-safe via lru_cache)."""
return StatisticalAnalyzer()