Spaces:

cryogenic22
/

doc_knowledge_base

Runtime error

App Files Files Community

cryogenic22 committed on Apr 19, 2025

Commit

6ed286a

verified ·

1 Parent(s): 3ab5528

Create llm_interface.py

Browse files

Files changed (1) hide show

llm_interface.py +531 -0

llm_interface.py ADDED Viewed

	@@ -0,0 +1,531 @@

+"""
+Interface for interacting with Anthropic Claude API for:
+1. Extracting structured data from document sections
+2. Generating content for authoring
+3. Answering questions about documents via RAG
+"""
+import os
+import json
+import anthropic
+from typing import Dict, List, Any, Optional, Union
+import time
+class LLMInterface:
+    """Interface for interacting with LLMs, specifically Claude."""
+    def __init__(self, api_key=None):
+        """Initialize the interface with an API key."""
+        if api_key:
+            self.api_key = api_key
+        else:
+            # Get from environment variable
+            self.api_key = os.environ.get("ANTHROPIC_API_KEY")
+        if not self.api_key:
+            raise ValueError("Anthropic API Key is required")
+        self.client = anthropic.Anthropic(api_key=self.api_key)
+    def _call_claude(self, prompt: str, system: str = None, max_tokens: int = 4000,
+                    temperature: float = 0.2, model: str = "claude-3-sonnet-20240229") -> str:
+        """
+        Make a call to Claude API.
+        Args:
+            prompt: The prompt to send to Claude
+            system: Optional system prompt
+            max_tokens: Maximum tokens in the response
+            temperature: Temperature setting (0-1)
+            model: Model to use
+        Returns:
+            Claude's response as a string
+        """
+        try:
+            messages = [{"role": "user", "content": prompt}]
+            response = self.client.messages.create(
+                model=model,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                system=system,
+                messages=messages
+            )
+            return response.content[0].text
+        except Exception as e:
+            print(f"Error calling Claude API: {e}")
+            # Wait and retry once on rate limiting
+            if "rate" in str(e).lower() or "timeout" in str(e).lower():
+                print("Rate limit hit, waiting 5 seconds...")
+                time.sleep(5)
+                try:
+                    response = self.client.messages.create(
+                        model=model,
+                        max_tokens=max_tokens,
+                        temperature=temperature,
+                        system=system,
+                        messages=messages
+                    )
+                    return response.content[0].text
+                except Exception as retry_e:
+                    print(f"Retry failed: {retry_e}")
+                    return f"Error: {retry_e}"
+            return f"Error: {e}"
+    def _parse_json_from_response(self, response: str) -> Dict:
+        """
+        Extract and parse JSON from Claude's response.
+        Args:
+            response: Claude's text response
+        Returns:
+            Parsed JSON as a dictionary
+        """
+        try:
+            # Find JSON in the response (it might be wrapped in ```json or just be part of the text)
+            json_start = response.find('{')
+            json_end = response.rfind('}') + 1
+            if json_start >= 0 and json_end > json_start:
+                json_str = response[json_start:json_end]
+                return json.loads(json_str)
+            else:
+                print("No JSON found in response")
+                return {}
+        except json.JSONDecodeError as e:
+            print(f"Error parsing JSON: {e}")
+            print(f"Response was: {response}")
+            return {}
+    def extract_study_info(self, protocol_text: str) -> Dict:
+        """
+        Extract basic study information from protocol text.
+        Args:
+            protocol_text: Text from the protocol
+        Returns:
+            Dictionary with study information
+        """
+        system = """
+        You are an expert in clinical trial protocols with the specific task of extracting
+        structured data from protocol text. Extract only the information that is explicitly
+        stated in the text. If information is not available, use null or empty strings.
+        Return a valid JSON object.
+        """
+        prompt = """
+        Extract the following study information from the provided protocol text.
+        Return a valid JSON object with these keys:
+        {
+            "protocol_id": "string",       // The protocol identifier/number
+            "title": "string",             // The full protocol title
+            "phase": "string",             // Clinical trial phase
+            "status": "string",            // Protocol status if mentioned
+            "design_type": "string",       // Study design description (e.g., "Randomized, Double-Blind...")
+            "compound_id": "string",       // Investigational product identifier/name
+            "indication": "string",        // Disease or condition being studied
+            "planned_enrollment": "string" // Number of planned subjects/participants
+        }
+        Protocol text:
+        """
+        response = self._call_claude(prompt + protocol_text[:20000], system=system)
+        return self._parse_json_from_response(response)
+    def extract_objectives_and_endpoints(self, section_text: str, protocol_id: str) -> Dict:
+        """
+        Extract objectives and their corresponding endpoints from protocol text.
+        Args:
+            section_text: Text from the objectives/endpoints section
+            protocol_id: Protocol ID for reference
+        Returns:
+            Dictionary with objectives and endpoints
+        """
+        system = """
+        You are an expert in clinical trial protocols with the specific task of extracting
+        structured data about objectives and endpoints. Extract only the information that
+        is explicitly stated in the text. Return the data as a valid JSON object.
+        """
+        prompt = f"""
+        Extract the objectives and endpoints from the following protocol section text.
+        The protocol ID is: {protocol_id}
+        Return a valid JSON object with these keys:
+        {{
+            "objectives": [
+                {{
+                    "type": "string",        // "Primary", "Secondary", or "Exploratory"
+                    "description": "string", // The full text description of the objective
+                    "id": "string"           // A generated identifier (e.g., "OBJ1", "OBJ2")
+                }}
+            ],
+            "endpoints": [
+                {{
+                    "type": "string",        // "Primary", "Secondary", or "Exploratory"
+                    "name": "string",        // Short name of the endpoint
+                    "definition": "string",  // Full definition
+                    "objective_id": "string" // Reference to which objective this endpoint measures (if clear)
+                }}
+            ]
+        }}
+        Section text:
+        """
+        response = self._call_claude(prompt + section_text, system=system)
+        return self._parse_json_from_response(response)
+    def extract_population_criteria(self, section_text: str, protocol_id: str) -> Dict:
+        """
+        Extract inclusion and exclusion criteria from protocol text.
+        Args:
+            section_text: Text from the population/criteria section
+            protocol_id: Protocol ID for reference
+        Returns:
+            Dictionary with inclusion and exclusion criteria
+        """
+        system = """
+        You are an expert in clinical trial protocols with the specific task of extracting
+        structured data about inclusion and exclusion criteria. Extract the criteria
+        exactly as stated in the text, preserving numbering and formatting. Return the
+        data as a valid JSON object.
+        """
+        prompt = f"""
+        Extract the inclusion and exclusion criteria from the following protocol section.
+        The protocol ID is: {protocol_id}
+        Return a valid JSON object with these keys:
+        {{
+            "inclusion_criteria": [
+                {{
+                    "number": number or null,  // The criterion number if available (e.g., 1, 2)
+                    "text": "string",          // The full text of the criterion
+                    "attribute": "string",     // The characteristic being evaluated, if clear (e.g., "Age", "BMI")
+                    "operator": "string",      // The comparison operator if applicable (e.g., ">", "<", "=")
+                    "value": "string"          // The threshold value if applicable (e.g., "18 years")
+                }}
+            ],
+            "exclusion_criteria": [
+                {{
+                    "number": number or null,
+                    "text": "string",
+                    "attribute": "string",
+                    "operator": "string",
+                    "value": "string"
+                }}
+            ]
+        }}
+        Section text:
+        """
+        response = self._call_claude(prompt + section_text, system=system)
+        return self._parse_json_from_response(response)
+    def extract_study_design(self, section_text: str, protocol_id: str) -> Dict:
+        """
+        Extract study design information from protocol text.
+        Args:
+            section_text: Text from the study design section
+            protocol_id: Protocol ID for reference
+        Returns:
+            Dictionary with study design information
+        """
+        system = """
+        You are an expert in clinical trial protocols with the specific task of extracting
+        structured data about study design. Extract only information that is explicitly
+        stated in the text. Return the data as a valid JSON object.
+        """
+        prompt = f"""
+        Extract the study design information from the following protocol section.
+        The protocol ID is: {protocol_id}
+        Return a valid JSON object with these keys:
+        {{
+            "design_type": "string",          // E.g., "Randomized, Double-blind, Placebo-controlled"
+            "study_parts": [                  // List of different parts/cohorts if applicable
+                {{
+                    "part": "string",           // Identifier (e.g., "Part A", "Cohort 1")
+                    "description": "string",    // Description
+                    "population": "string",     // E.g., "Healthy Volunteers" or "T2DM Patients"
+                    "planned_n": "string"       // Planned number of subjects
+                }}
+            ],
+            "randomization": "string",        // Description of randomization process
+            "blinding": "string",             // Description of blinding (e.g., "Double-blind")
+            "duration": "string",             // Study duration information
+            "dose_info": "string"             // Information about dosing if mentioned
+        }}
+        Section text:
+        """
+        response = self._call_claude(prompt + section_text, system=system)
+        return self._parse_json_from_response(response)
+    def extract_statistical_methods(self, section_text: str, protocol_id: str) -> Dict:
+        """
+        Extract statistical analysis methods from SAP or protocol text.
+        Args:
+            section_text: Text from the statistical methods section
+            protocol_id: Protocol ID for reference
+        Returns:
+            Dictionary with statistical methods information
+        """
+        system = """
+        You are an expert in clinical trial statistics with the specific task of extracting
+        structured data about statistical methods from protocols or SAPs. Return the data
+        as a valid JSON object.
+        """
+        prompt = f"""
+        Extract the statistical methods information from the following section.
+        The protocol ID is: {protocol_id}
+        Return a valid JSON object with these keys:
+        {{
+            "analysis_populations": [
+                {{
+                    "name": "string",        // E.g., "Full Analysis Set", "Safety Population"
+                    "definition": "string"   // Definition of the population
+                }}
+            ],
+            "primary_analysis": {{
+                "endpoint": "string",        // Primary endpoint being analyzed
+                "method": "string",          // Statistical method (e.g., "MMRM", "t-test")
+                "covariates": ["string"],    // List of covariates if mentioned
+                "handling_missing": "string" // How missing data is handled
+            }},
+            "secondary_analyses": [
+                {{
+                    "endpoint": "string",
+                    "method": "string",
+                    "covariates": ["string"],
+                    "handling_missing": "string"
+                }}
+            ],
+            "multiplicity": "string",        // How multiplicity is addressed
+            "sample_size_justification": "string" // Sample size rationale
+        }}
+        Section text:
+        """
+        response = self._call_claude(prompt + section_text, system=system)
+        return self._parse_json_from_response(response)
+    def extract_assessments(self, section_text: str, protocol_id: str) -> Dict:
+        """
+        Extract assessment information from protocol text.
+        Args:
+            section_text: Text from the assessments section
+            protocol_id: Protocol ID for reference
+        Returns:
+            Dictionary with assessment information
+        """
+        system = """
+        You are an expert in clinical trial protocols with the specific task of extracting
+        structured data about assessments and procedures. Return the data as a valid JSON object.
+        """
+        prompt = f"""
+        Extract information about assessments and procedures from the following protocol section.
+        The protocol ID is: {protocol_id}
+        Return a valid JSON object with these keys:
+        {{
+            "assessments": [
+                {{
+                    "name": "string",         // Name of assessment (e.g., "OGTT", "ECG")
+                    "type": "string",         // Type (e.g., "Safety", "PK", "PD")
+                    "description": "string",  // Description of the procedure
+                    "timing": "string",       // When it's performed
+                    "analytes": ["string"]    // Measured analytes if applicable
+                }}
+            ]
+        }}
+        Section text:
+        """
+        response = self._call_claude(prompt + section_text, system=system)
+        return self._parse_json_from_response(response)
+    def generate_content_from_knowledge(self, section_type: str, context: List[Dict],
+                                       protocol_id: str = None, style_guide: str = None) -> str:
+        """
+        Generate document content based on knowledge extracted from similar documents.
+        Args:
+            section_type: Type of section to generate (e.g., "Introduction", "Study Design")
+            context: List of relevant text chunks from knowledge base
+            protocol_id: Optional protocol ID for reference
+            style_guide: Optional style guide instructions
+        Returns:
+            Generated content as a string
+        """
+        system = """
+        You are an expert medical writer who specializes in pharmaceutical R&D documents
+        like protocols, SAPs, and CSRs. Your task is to draft high-quality content
+        based on similar examples, following the conventions of scientific/medical writing
+        and any provided style guides.
+        """
+        # Prepare context text
+        context_text = ""
+        for i, chunk in enumerate(context):
+            context_text += f"\nEXAMPLE {i+1} (Source: {chunk.get('metadata', {}).get('source', 'Unknown')})\n"
+            context_text += chunk.get('page_content', '')
+            context_text += "\n" + "-"*50 + "\n"
+        protocol_ref = f"for protocol {protocol_id}" if protocol_id else ""
+        style_instructions = f"\nFollow these style guidelines:\n{style_guide}" if style_guide else ""
+        prompt = f"""
+        Please draft a {section_type} section {protocol_ref} for a clinical study document.
+        The content should be:
+        1. Well-structured and professionally written
+        2. Scientifically accurate and precise
+        3. Appropriate for a regulatory/scientific audience
+        4. In line with typical conventions for pharmaceutical documents{style_instructions}
+        Here are examples of similar content from other documents to guide your writing:
+        {context_text}
+        Please draft a complete {section_type} section that follows these examples in style and
+        structure but is original.
+        """
+        # Use a higher max tokens for content generation
+        response = self._call_claude(prompt, system=system, max_tokens=4000, temperature=0.3)
+        return response
+    def answer_protocol_question(self, question: str, context: List[Dict],
+                               chat_history: List[Dict] = None) -> str:
+        """
+        Answer a question about protocols using retrieved context.
+        Args:
+            question: User's question
+            context: List of relevant text chunks from knowledge base
+            chat_history: Optional list of previous interactions
+        Returns:
+            Answer as a string
+        """
+        system = """
+        You are a Protocol Coach, an expert assistant specializing in pharmaceutical R&D documents.
+        Your role is to answer questions about clinical study protocols, SAPs, and other related documents
+        using the specific context provided. Base your answers strictly on the provided context and
+        indicate when information might not be available in the provided excerpts.
+        Always cite the source documents when answering questions.
+        """
+        # Prepare context text
+        context_text = ""
+        for i, chunk in enumerate(context):
+            source = chunk.get('metadata', {}).get('source', 'Unknown')
+            section = chunk.get('metadata', {}).get('section', 'Unknown section')
+            context_text += f"\nCONTEXT {i+1} [Source: {source}, Section: {section}]\n"
+            context_text += chunk.get('page_content', '')
+            context_text += "\n" + "-"*50 + "\n"
+        # Prepare chat history if available
+        history_text = ""
+        if chat_history and len(chat_history) > 0:
+            history_text = "\nPrevious conversation:\n"
+            for entry in chat_history[-3:]:  # Only use last 3 exchanges for context
+                if 'user' in entry:
+                    history_text += f"User: {entry['user']}\n"
+                if 'assistant' in entry:
+                    history_text += f"Assistant: {entry['assistant']}\n"
+            history_text += "\n"
+        prompt = f"""
+        {history_text}
+        User question: {question}
+        Please answer the question based on the following context from clinical documents:
+        {context_text}
+        Answer the question comprehensively using only the information in the provided context.
+        If the context doesn't contain sufficient information to provide a complete answer,
+        clearly state which aspects you can and cannot address based on the available information.
+        """
+        response = self._call_claude(prompt, system=system, max_tokens=2000, temperature=0.2)
+        return response
+    def find_document_connections(self, source_doc_info: Dict, target_doc_info: Dict,
+                                entity_pairs: List[Dict]) -> str:
+        """
+        Analyze connections between two documents based on entity pairs.
+        Args:
+            source_doc_info: Information about the source document
+            target_doc_info: Information about the target document
+            entity_pairs: List of potentially matching entities from both documents
+        Returns:
+            Analysis of connections as a string
+        """
+        system = """
+        You are an expert in pharmaceutical R&D document analysis, specialized in
+        identifying relationships, consistency, and traceability between related
+        documents like protocols and SAPs. Your task is to analyze potential
+        matches between entities in different documents and assess their alignment.
+        """
+        # Convert entity pairs to formatted text
+        entity_pairs_text = ""
+        for i, pair in enumerate(entity_pairs):
+            entity_pairs_text += f"\nCOMPARISON {i+1}:\n"
+            entity_pairs_text += f"Source: {pair.get('source_text', 'Not available')}\n"
+            entity_pairs_text += f"Target: {pair.get('target_text', 'Not available')}\n"
+            entity_pairs_text += f"Entity Type: {pair.get('entity_type', 'Unknown')}\n"
+            entity_pairs_text += "-"*50 + "\n"
+        prompt = f"""
+        Analyze the connections between these two pharmaceutical documents:
+        SOURCE DOCUMENT: {source_doc_info.get('title', 'Unknown')} (Type: {source_doc_info.get('type', 'Unknown')})
+        TARGET DOCUMENT: {target_doc_info.get('title', 'Unknown')} (Type: {target_doc_info.get('type', 'Unknown')})
+        I'll provide pairs of potentially related elements from both documents. For each pair, assess:
+        1. Whether they refer to the same entity or concept
+        2. The level of consistency between them (High/Medium/Low)
+        3. Any notable differences or potential issues
+        Here are the element pairs to analyze:
+        {entity_pairs_text}
+        Provide:
+        1. A summary of the overall consistency between documents
+        2. Specific observations about each compared element
+        3. Potential implications of any inconsistencies
+        4. Recommendations for improving alignment
+        """
+        response = self._call_claude(prompt, system=system, max_tokens=3000, temperature=0.2)
+        return response