Spaces:
Runtime error
Runtime error
"""
Interface for interacting with the Anthropic Claude API for:
1. Extracting structured data from document sections
2. Generating content for authoring
3. Answering questions about documents via RAG
"""
| import os | |
| import json | |
| import anthropic | |
| from typing import Dict, List, Any, Optional, Union | |
| import time | |
class LLMInterface:
    """Interface for interacting with LLMs, specifically Claude.

    The class wraps an ``anthropic.Anthropic`` client and exposes three groups
    of helpers: ``extract_*`` methods that ask the model for a JSON object and
    parse it into a dict, ``generate_content_from_knowledge`` for drafting
    document sections, and ``answer_protocol_question`` /
    ``find_document_connections`` for free-text answers.

    API failures never propagate out of ``_call_claude``; they are returned as
    a string starting with ``"Error: "``. The ``extract_*`` methods therefore
    return an empty dict when the call fails, because no JSON can be parsed
    from such a string.
    """

    # Seconds to wait before the single retry performed by ``_call_claude``.
    RETRY_DELAY_SECONDS = 5
    # Maximum number of protocol characters sent by ``extract_study_info``.
    MAX_PROTOCOL_CHARS = 20000

    def __init__(self, api_key=None):
        """Initialize the interface with an API key.

        Args:
            api_key: Anthropic API key. When omitted (or empty) the
                ``ANTHROPIC_API_KEY`` environment variable is used.

        Raises:
            ValueError: If no key is passed and the environment variable is
                unset or empty.
        """
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not self.api_key:
            raise ValueError("Anthropic API Key is required")
        self.client = anthropic.Anthropic(api_key=self.api_key)

    def _send_request(self, request: Dict[str, Any]) -> str:
        """Send one Messages API request and return the first block's text."""
        response = self.client.messages.create(**request)
        return response.content[0].text

    @staticmethod
    def _is_retryable(error: Exception) -> bool:
        """Return True when ``error`` looks like a rate-limit or timeout failure.

        The check is deliberately duck-typed (status code, class name, message
        markers) so it does not depend on a particular SDK exception hierarchy.
        A bare ``"rate"`` substring is not used because it also matches words
        such as "temperature" or "generate".
        """
        if getattr(error, "status_code", None) == 429:
            return True
        kind = type(error).__name__.lower()
        if "ratelimit" in kind or "timeout" in kind:
            return True
        message = str(error).lower()
        markers = ("rate limit", "rate_limit", "rate-limit",
                   "too many requests", "timeout", "timed out")
        return any(marker in message for marker in markers)

    def _call_claude(self, prompt: str, system: Optional[str] = None, max_tokens: int = 4000,
                     temperature: float = 0.2, model: str = "claude-3-sonnet-20240229") -> str:
        """
        Make a call to Claude API, retrying once on rate limiting or timeout.

        Args:
            prompt: The prompt to send to Claude
            system: Optional system prompt; omitted from the request when None
            max_tokens: Maximum tokens in the response
            temperature: Temperature setting (0-1)
            model: Model to use.
                NOTE(review): the default snapshot may have been retired by
                Anthropic - confirm against the current model list.

        Returns:
            Claude's response as a string. On failure the string
            ``"Error: <exception text>"`` is returned instead of raising.
        """
        request: Dict[str, Any] = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "messages": [{"role": "user", "content": prompt}],
        }
        # Only send ``system`` when one was supplied: a null value is not a
        # valid system prompt for the Messages API.
        if system is not None:
            request["system"] = system

        try:
            return self._send_request(request)
        except Exception as e:  # boundary: callers expect an error string
            print(f"Error calling Claude API: {e}")
            if not self._is_retryable(e):
                return f"Error: {e}"

        # Wait and retry once on rate limiting / timeout.
        print(f"Rate limit hit, waiting {self.RETRY_DELAY_SECONDS} seconds...")
        time.sleep(self.RETRY_DELAY_SECONDS)
        try:
            return self._send_request(request)
        except Exception as retry_e:
            print(f"Retry failed: {retry_e}")
            return f"Error: {retry_e}"

    def _parse_json_from_response(self, response: str) -> Dict:
        """
        Extract and parse a JSON object from Claude's response.

        The text from the first ``{`` to the last ``}`` is tried first, which
        handles JSON wrapped in a code fence or surrounded by prose. If that
        slice is not valid JSON (for example because text after the object
        contains another brace), each ``{`` is tried in turn and the first
        object that decodes is returned.

        Args:
            response: Claude's text response

        Returns:
            Parsed JSON as a dictionary, or an empty dict when the response is
            empty, is not a string, or contains no decodable JSON object
        """
        if not response or not isinstance(response, str):
            print("No JSON found in response")
            return {}

        start = response.find('{')
        if start < 0:
            print("No JSON found in response")
            return {}

        last_error = None
        end = response.rfind('}') + 1
        if end > start:
            try:
                return json.loads(response[start:end])
            except json.JSONDecodeError as e:
                last_error = e

        # Fallback: decode an object starting at each '{', ignoring whatever
        # text follows it.
        decoder = json.JSONDecoder()
        position = start
        while position >= 0:
            try:
                candidate, _ = decoder.raw_decode(response, position)
            except json.JSONDecodeError as e:
                if last_error is None:
                    last_error = e
            else:
                if isinstance(candidate, dict):
                    return candidate
            position = response.find('{', position + 1)

        print(f"Error parsing JSON: {last_error}")
        print(f"Response was: {response}")
        return {}

    def _extract_json(self, prompt: str, system: str) -> Dict:
        """Send ``prompt`` with ``system`` and parse the reply as a JSON object."""
        response = self._call_claude(prompt, system=system)
        return self._parse_json_from_response(response)

    def extract_study_info(self, protocol_text: str) -> Dict:
        """
        Extract basic study information from protocol text.

        Only the first ``MAX_PROTOCOL_CHARS`` characters of the text are sent.

        Args:
            protocol_text: Text from the protocol

        Returns:
            Dictionary with study information (empty if the call or parsing fails)
        """
        system = """
        You are an expert in clinical trial protocols with the specific task of extracting
        structured data from protocol text. Extract only the information that is explicitly
        stated in the text. If information is not available, use null or empty strings.
        Return a valid JSON object.
        """
        # Plain (non-f) string: the braces below are literal JSON braces.
        prompt = """
        Extract the following study information from the provided protocol text.
        Return a valid JSON object with these keys:
        {
            "protocol_id": "string", // The protocol identifier/number
            "title": "string", // The full protocol title
            "phase": "string", // Clinical trial phase
            "status": "string", // Protocol status if mentioned
            "design_type": "string", // Study design description (e.g., "Randomized, Double-Blind...")
            "compound_id": "string", // Investigational product identifier/name
            "indication": "string", // Disease or condition being studied
            "planned_enrollment": "string" // Number of planned subjects/participants
        }
        Protocol text:
        """
        return self._extract_json(prompt + protocol_text[:self.MAX_PROTOCOL_CHARS], system)

    def extract_objectives_and_endpoints(self, section_text: str, protocol_id: str) -> Dict:
        """
        Extract objectives and their corresponding endpoints from protocol text.

        Args:
            section_text: Text from the objectives/endpoints section
            protocol_id: Protocol ID for reference

        Returns:
            Dictionary with objectives and endpoints (empty if the call or parsing fails)
        """
        system = """
        You are an expert in clinical trial protocols with the specific task of extracting
        structured data about objectives and endpoints. Extract only the information that
        is explicitly stated in the text. Return the data as a valid JSON object.
        """
        prompt = f"""
        Extract the objectives and endpoints from the following protocol section text.
        The protocol ID is: {protocol_id}
        Return a valid JSON object with these keys:
        {{
            "objectives": [
                {{
                    "type": "string", // "Primary", "Secondary", or "Exploratory"
                    "description": "string", // The full text description of the objective
                    "id": "string" // A generated identifier (e.g., "OBJ1", "OBJ2")
                }}
            ],
            "endpoints": [
                {{
                    "type": "string", // "Primary", "Secondary", or "Exploratory"
                    "name": "string", // Short name of the endpoint
                    "definition": "string", // Full definition
                    "objective_id": "string" // Reference to which objective this endpoint measures (if clear)
                }}
            ]
        }}
        Section text:
        """
        return self._extract_json(prompt + section_text, system)

    def extract_population_criteria(self, section_text: str, protocol_id: str) -> Dict:
        """
        Extract inclusion and exclusion criteria from protocol text.

        Args:
            section_text: Text from the population/criteria section
            protocol_id: Protocol ID for reference

        Returns:
            Dictionary with inclusion and exclusion criteria (empty if the call or parsing fails)
        """
        system = """
        You are an expert in clinical trial protocols with the specific task of extracting
        structured data about inclusion and exclusion criteria. Extract the criteria
        exactly as stated in the text, preserving numbering and formatting. Return the
        data as a valid JSON object.
        """
        prompt = f"""
        Extract the inclusion and exclusion criteria from the following protocol section.
        The protocol ID is: {protocol_id}
        Return a valid JSON object with these keys:
        {{
            "inclusion_criteria": [
                {{
                    "number": number or null, // The criterion number if available (e.g., 1, 2)
                    "text": "string", // The full text of the criterion
                    "attribute": "string", // The characteristic being evaluated, if clear (e.g., "Age", "BMI")
                    "operator": "string", // The comparison operator if applicable (e.g., ">", "<", "=")
                    "value": "string" // The threshold value if applicable (e.g., "18 years")
                }}
            ],
            "exclusion_criteria": [
                {{
                    "number": number or null,
                    "text": "string",
                    "attribute": "string",
                    "operator": "string",
                    "value": "string"
                }}
            ]
        }}
        Section text:
        """
        return self._extract_json(prompt + section_text, system)

    def extract_study_design(self, section_text: str, protocol_id: str) -> Dict:
        """
        Extract study design information from protocol text.

        Args:
            section_text: Text from the study design section
            protocol_id: Protocol ID for reference

        Returns:
            Dictionary with study design information (empty if the call or parsing fails)
        """
        system = """
        You are an expert in clinical trial protocols with the specific task of extracting
        structured data about study design. Extract only information that is explicitly
        stated in the text. Return the data as a valid JSON object.
        """
        prompt = f"""
        Extract the study design information from the following protocol section.
        The protocol ID is: {protocol_id}
        Return a valid JSON object with these keys:
        {{
            "design_type": "string", // E.g., "Randomized, Double-blind, Placebo-controlled"
            "study_parts": [ // List of different parts/cohorts if applicable
                {{
                    "part": "string", // Identifier (e.g., "Part A", "Cohort 1")
                    "description": "string", // Description
                    "population": "string", // E.g., "Healthy Volunteers" or "T2DM Patients"
                    "planned_n": "string" // Planned number of subjects
                }}
            ],
            "randomization": "string", // Description of randomization process
            "blinding": "string", // Description of blinding (e.g., "Double-blind")
            "duration": "string", // Study duration information
            "dose_info": "string" // Information about dosing if mentioned
        }}
        Section text:
        """
        return self._extract_json(prompt + section_text, system)

    def extract_statistical_methods(self, section_text: str, protocol_id: str) -> Dict:
        """
        Extract statistical analysis methods from SAP or protocol text.

        Args:
            section_text: Text from the statistical methods section
            protocol_id: Protocol ID for reference

        Returns:
            Dictionary with statistical methods information (empty if the call or parsing fails)
        """
        system = """
        You are an expert in clinical trial statistics with the specific task of extracting
        structured data about statistical methods from protocols or SAPs. Return the data
        as a valid JSON object.
        """
        prompt = f"""
        Extract the statistical methods information from the following section.
        The protocol ID is: {protocol_id}
        Return a valid JSON object with these keys:
        {{
            "analysis_populations": [
                {{
                    "name": "string", // E.g., "Full Analysis Set", "Safety Population"
                    "definition": "string" // Definition of the population
                }}
            ],
            "primary_analysis": {{
                "endpoint": "string", // Primary endpoint being analyzed
                "method": "string", // Statistical method (e.g., "MMRM", "t-test")
                "covariates": ["string"], // List of covariates if mentioned
                "handling_missing": "string" // How missing data is handled
            }},
            "secondary_analyses": [
                {{
                    "endpoint": "string",
                    "method": "string",
                    "covariates": ["string"],
                    "handling_missing": "string"
                }}
            ],
            "multiplicity": "string", // How multiplicity is addressed
            "sample_size_justification": "string" // Sample size rationale
        }}
        Section text:
        """
        return self._extract_json(prompt + section_text, system)

    def extract_assessments(self, section_text: str, protocol_id: str) -> Dict:
        """
        Extract assessment information from protocol text.

        Args:
            section_text: Text from the assessments section
            protocol_id: Protocol ID for reference

        Returns:
            Dictionary with assessment information (empty if the call or parsing fails)
        """
        system = """
        You are an expert in clinical trial protocols with the specific task of extracting
        structured data about assessments and procedures. Return the data as a valid JSON object.
        """
        prompt = f"""
        Extract information about assessments and procedures from the following protocol section.
        The protocol ID is: {protocol_id}
        Return a valid JSON object with these keys:
        {{
            "assessments": [
                {{
                    "name": "string", // Name of assessment (e.g., "OGTT", "ECG")
                    "type": "string", // Type (e.g., "Safety", "PK", "PD")
                    "description": "string", // Description of the procedure
                    "timing": "string", // When it's performed
                    "analytes": ["string"] // Measured analytes if applicable
                }}
            ]
        }}
        Section text:
        """
        return self._extract_json(prompt + section_text, system)

    def generate_content_from_knowledge(self, section_type: str, context: List[Dict],
                                        protocol_id: Optional[str] = None,
                                        style_guide: Optional[str] = None) -> str:
        """
        Generate document content based on knowledge extracted from similar documents.

        Args:
            section_type: Type of section to generate (e.g., "Introduction", "Study Design")
            context: List of relevant text chunks from knowledge base; each chunk is
                read via its 'page_content' key and its 'metadata' -> 'source' key
            protocol_id: Optional protocol ID for reference
            style_guide: Optional style guide instructions

        Returns:
            Generated content as a string (or an "Error: ..." string if the call fails)
        """
        system = """
        You are an expert medical writer who specializes in pharmaceutical R&D documents
        like protocols, SAPs, and CSRs. Your task is to draft high-quality content
        based on similar examples, following the conventions of scientific/medical writing
        and any provided style guides.
        """
        # Prepare context text; tolerate chunks whose metadata or content is None.
        parts = []
        for index, chunk in enumerate(context or [], start=1):
            source = (chunk.get('metadata') or {}).get('source', 'Unknown')
            parts.append(f"\nEXAMPLE {index} (Source: {source})\n")
            parts.append(chunk.get('page_content') or '')
            parts.append("\n" + "-" * 50 + "\n")
        context_text = "".join(parts)

        # The leading space lives in the optional fragment so that no double
        # space appears in the prompt when there is no protocol ID.
        protocol_ref = f" for protocol {protocol_id}" if protocol_id else ""
        style_instructions = f"\nFollow these style guidelines:\n{style_guide}" if style_guide else ""
        prompt = f"""
        Please draft a {section_type} section{protocol_ref} for a clinical study document.
        The content should be:
        1. Well-structured and professionally written
        2. Scientifically accurate and precise
        3. Appropriate for a regulatory/scientific audience
        4. In line with typical conventions for pharmaceutical documents{style_instructions}
        Here are examples of similar content from other documents to guide your writing:
        {context_text}
        Please draft a complete {section_type} section that follows these examples in style and
        structure but is original.
        """
        # Slightly higher temperature than the extraction calls, for drafting.
        return self._call_claude(prompt, system=system, max_tokens=4000, temperature=0.3)

    def answer_protocol_question(self, question: str, context: List[Dict],
                                 chat_history: Optional[List[Dict]] = None) -> str:
        """
        Answer a question about protocols using retrieved context.

        Args:
            question: User's question
            context: List of relevant text chunks from knowledge base; each chunk is
                read via 'page_content' and 'metadata' -> 'source' / 'section'
            chat_history: Optional list of previous interactions; entries are read
                via their 'user' and 'assistant' keys and only the last three are used

        Returns:
            Answer as a string (or an "Error: ..." string if the call fails)
        """
        system = """
        You are a Protocol Coach, an expert assistant specializing in pharmaceutical R&D documents.
        Your role is to answer questions about clinical study protocols, SAPs, and other related documents
        using the specific context provided. Base your answers strictly on the provided context and
        indicate when information might not be available in the provided excerpts.
        Always cite the source documents when answering questions.
        """
        # Prepare context text; tolerate chunks whose metadata or content is None.
        parts = []
        for index, chunk in enumerate(context or [], start=1):
            metadata = chunk.get('metadata') or {}
            source = metadata.get('source', 'Unknown')
            section = metadata.get('section', 'Unknown section')
            parts.append(f"\nCONTEXT {index} [Source: {source}, Section: {section}]\n")
            parts.append(chunk.get('page_content') or '')
            parts.append("\n" + "-" * 50 + "\n")
        context_text = "".join(parts)

        # Prepare chat history if available
        history_text = ""
        if chat_history:
            history_lines = ["\nPrevious conversation:\n"]
            for entry in chat_history[-3:]:  # Only use last 3 exchanges for context
                if 'user' in entry:
                    history_lines.append(f"User: {entry['user']}\n")
                if 'assistant' in entry:
                    history_lines.append(f"Assistant: {entry['assistant']}\n")
            history_lines.append("\n")
            history_text = "".join(history_lines)

        prompt = f"""
        {history_text}
        User question: {question}
        Please answer the question based on the following context from clinical documents:
        {context_text}
        Answer the question comprehensively using only the information in the provided context.
        If the context doesn't contain sufficient information to provide a complete answer,
        clearly state which aspects you can and cannot address based on the available information.
        """
        return self._call_claude(prompt, system=system, max_tokens=2000, temperature=0.2)

    def find_document_connections(self, source_doc_info: Dict, target_doc_info: Dict,
                                  entity_pairs: List[Dict]) -> str:
        """
        Analyze connections between two documents based on entity pairs.

        Args:
            source_doc_info: Information about the source document ('title' and 'type' are read)
            target_doc_info: Information about the target document ('title' and 'type' are read)
            entity_pairs: List of potentially matching entities from both documents;
                each pair is read via 'source_text', 'target_text' and 'entity_type'

        Returns:
            Analysis of connections as a string (or an "Error: ..." string if the call fails)
        """
        system = """
        You are an expert in pharmaceutical R&D document analysis, specialized in
        identifying relationships, consistency, and traceability between related
        documents like protocols and SAPs. Your task is to analyze potential
        matches between entities in different documents and assess their alignment.
        """
        # Convert entity pairs to formatted text
        parts = []
        for index, pair in enumerate(entity_pairs or [], start=1):
            parts.append(f"\nCOMPARISON {index}:\n")
            parts.append(f"Source: {pair.get('source_text', 'Not available')}\n")
            parts.append(f"Target: {pair.get('target_text', 'Not available')}\n")
            parts.append(f"Entity Type: {pair.get('entity_type', 'Unknown')}\n")
            parts.append("-" * 50 + "\n")
        entity_pairs_text = "".join(parts)

        prompt = f"""
        Analyze the connections between these two pharmaceutical documents:
        SOURCE DOCUMENT: {source_doc_info.get('title', 'Unknown')} (Type: {source_doc_info.get('type', 'Unknown')})
        TARGET DOCUMENT: {target_doc_info.get('title', 'Unknown')} (Type: {target_doc_info.get('type', 'Unknown')})
        I'll provide pairs of potentially related elements from both documents. For each pair, assess:
        1. Whether they refer to the same entity or concept
        2. The level of consistency between them (High/Medium/Low)
        3. Any notable differences or potential issues
        Here are the element pairs to analyze:
        {entity_pairs_text}
        Provide:
        1. A summary of the overall consistency between documents
        2. Specific observations about each compared element
        3. Potential implications of any inconsistencies
        4. Recommendations for improving alignment
        """
        return self._call_claude(prompt, system=system, max_tokens=3000, temperature=0.2)