cryogenic22 committed on
Commit
6ed286a
·
verified ·
1 Parent(s): 3ab5528

Create llm_interface.py

Browse files
Files changed (1) hide show
  1. llm_interface.py +531 -0
llm_interface.py ADDED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Interface for interacting with Anthropic Claude API for:
3
+ 1. Extracting structured data from document sections
4
+ 2. Generating content for authoring
5
+ 3. Answering questions about documents via RAG
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import anthropic
11
+ from typing import Dict, List, Any, Optional, Union
12
+ import time
13
+
14
class LLMInterface:
    """Interface for interacting with LLMs, specifically Claude.

    The public methods fall into three groups:

    * ``extract_*`` methods send a section of document text to Claude and
      return the JSON object found in the reply as a dictionary.
    * ``generate_content_from_knowledge`` drafts a document section from
      example chunks.
    * ``answer_protocol_question`` and ``find_document_connections`` return
      free-text analyses.

    API failures never raise out of this class: ``_call_claude`` returns a
    string starting with ``"Error: "`` instead, and the ``extract_*`` methods
    turn such a reply into an empty dictionary.
    """

    # Seconds to wait before the single retry after a rate-limit/timeout error.
    RETRY_DELAY_SECONDS = 5

    # Maximum number of protocol characters sent by extract_study_info.
    MAX_PROTOCOL_CHARS = 20000

    # Prefix of the string _call_claude returns when the request failed.
    _ERROR_PREFIX = "Error: "

    def __init__(self, api_key=None):
        """Initialize the interface with an API key.

        Args:
            api_key: Anthropic API key. When omitted (or empty) the value of
                the ``ANTHROPIC_API_KEY`` environment variable is used.

        Raises:
            ValueError: If no key is given and the environment variable is
                unset or empty.
        """
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")

        if not self.api_key:
            raise ValueError("Anthropic API Key is required")

        self.client = anthropic.Anthropic(api_key=self.api_key)

    # NOTE(review): the default model id below is a dated snapshot and may have
    # been retired by the provider -- confirm against the current model list.
    def _call_claude(self, prompt: str, system: Optional[str] = None, max_tokens: int = 4000,
                     temperature: float = 0.2, model: str = "claude-3-sonnet-20240229") -> str:
        """
        Make a call to Claude API.

        The request is retried once, after ``RETRY_DELAY_SECONDS``, when the
        failure looks like a rate limit or a timeout.

        Args:
            prompt: The prompt to send to Claude
            system: Optional system prompt; left out of the request when empty
            max_tokens: Maximum tokens in the response
            temperature: Temperature setting (0-1)
            model: Model to use

        Returns:
            Claude's response as a string, or ``"Error: <message>"`` when the
            request (and its retry, if any) failed. This method does not raise.
        """
        request: Dict[str, Any] = {
            "model": model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "messages": [{"role": "user", "content": prompt}],
        }
        # Only send a system prompt when there is one: an explicit None is not
        # the same as leaving the field out of the request.
        if system:
            request["system"] = system

        try:
            return self._send_request(request)
        except Exception as e:  # API boundary: callers expect a string, never a raise
            print(f"Error calling Claude API: {e}")
            if not self._is_transient_error(e):
                return f"{self._ERROR_PREFIX}{e}"

        # Wait and retry once on rate limiting / timeouts
        print(f"Rate limit or timeout hit, waiting {self.RETRY_DELAY_SECONDS} seconds...")
        time.sleep(self.RETRY_DELAY_SECONDS)
        try:
            return self._send_request(request)
        except Exception as retry_e:
            print(f"Retry failed: {retry_e}")
            return f"{self._ERROR_PREFIX}{retry_e}"

    def _send_request(self, request: Dict[str, Any]) -> str:
        """Send one prepared request and return the text of the reply.

        Args:
            request: Keyword arguments for ``client.messages.create``.

        Returns:
            The concatenated text of all text-bearing content blocks.

        Raises:
            ValueError: If the reply contains no text content.
        """
        response = self.client.messages.create(**request)
        pieces = [block.text for block in response.content if getattr(block, "text", None)]
        if not pieces:
            raise ValueError("Claude returned no text content")
        return "".join(pieces)

    @staticmethod
    def _is_transient_error(error: Exception) -> bool:
        """Return True when ``error`` looks like a rate limit or a timeout."""
        # Match on the class name so the check does not depend on the exact
        # wording of the message.
        if type(error).__name__ in ("RateLimitError", "APITimeoutError"):
            return True
        reason = str(error).lower()
        return "rate" in reason or "timeout" in reason or "timed out" in reason

    def _parse_json_from_response(self, response: str) -> Dict:
        """
        Extract and parse JSON from Claude's response.

        The first JSON object in the text is decoded; anything before it
        (for example a ```json fence) or after it (closing fence, commentary)
        is ignored.

        Args:
            response: Claude's text response

        Returns:
            Parsed JSON as a dictionary, or an empty dictionary when the
            response is not a string, is an ``"Error: ..."`` result from
            ``_call_claude``, or contains no decodable JSON object.
        """
        if not isinstance(response, str) or response.startswith(self._ERROR_PREFIX):
            # A failed call must not be mistaken for extracted data, even if
            # the error message happens to embed JSON.
            print("No JSON found in response")
            return {}

        # Find JSON in the response (it might be wrapped in ```json or just be part of the text)
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        if json_start < 0 or json_end <= json_start:
            print("No JSON found in response")
            return {}

        try:
            # raw_decode stops at the end of the first complete object, so
            # braces in trailing prose no longer corrupt the slice.
            parsed, _ = json.JSONDecoder().raw_decode(response, json_start)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            print(f"Response was: {response}")
            return {}
        return parsed

    def _extract_structured(self, prompt: str, text: str, system: str) -> Dict:
        """Append ``text`` to ``prompt``, call Claude and parse the JSON reply."""
        response = self._call_claude(prompt + (text or ""), system=system)
        return self._parse_json_from_response(response)

    def extract_study_info(self, protocol_text: str) -> Dict:
        """
        Extract basic study information from protocol text.

        Only the first ``MAX_PROTOCOL_CHARS`` characters of the text are sent.

        Args:
            protocol_text: Text from the protocol

        Returns:
            Dictionary with study information (empty on failure)
        """
        system = """
        You are an expert in clinical trial protocols with the specific task of extracting
        structured data from protocol text. Extract only the information that is explicitly
        stated in the text. If information is not available, use null or empty strings.
        Return a valid JSON object.
        """

        prompt = """
        Extract the following study information from the provided protocol text.
        Return a valid JSON object with these keys:
        {
            "protocol_id": "string", // The protocol identifier/number
            "title": "string", // The full protocol title
            "phase": "string", // Clinical trial phase
            "status": "string", // Protocol status if mentioned
            "design_type": "string", // Study design description (e.g., "Randomized, Double-Blind...")
            "compound_id": "string", // Investigational product identifier/name
            "indication": "string", // Disease or condition being studied
            "planned_enrollment": "string" // Number of planned subjects/participants
        }

        Protocol text:
        """

        excerpt = (protocol_text or "")[:self.MAX_PROTOCOL_CHARS]
        return self._extract_structured(prompt, excerpt, system)

    def extract_objectives_and_endpoints(self, section_text: str, protocol_id: str) -> Dict:
        """
        Extract objectives and their corresponding endpoints from protocol text.

        Args:
            section_text: Text from the objectives/endpoints section
            protocol_id: Protocol ID for reference

        Returns:
            Dictionary with objectives and endpoints (empty on failure)
        """
        system = """
        You are an expert in clinical trial protocols with the specific task of extracting
        structured data about objectives and endpoints. Extract only the information that
        is explicitly stated in the text. Return the data as a valid JSON object.
        """

        prompt = f"""
        Extract the objectives and endpoints from the following protocol section text.
        The protocol ID is: {protocol_id}

        Return a valid JSON object with these keys:
        {{
            "objectives": [
                {{
                    "type": "string", // "Primary", "Secondary", or "Exploratory"
                    "description": "string", // The full text description of the objective
                    "id": "string" // A generated identifier (e.g., "OBJ1", "OBJ2")
                }}
            ],
            "endpoints": [
                {{
                    "type": "string", // "Primary", "Secondary", or "Exploratory"
                    "name": "string", // Short name of the endpoint
                    "definition": "string", // Full definition
                    "objective_id": "string" // Reference to which objective this endpoint measures (if clear)
                }}
            ]
        }}

        Section text:
        """

        return self._extract_structured(prompt, section_text, system)

    def extract_population_criteria(self, section_text: str, protocol_id: str) -> Dict:
        """
        Extract inclusion and exclusion criteria from protocol text.

        Args:
            section_text: Text from the population/criteria section
            protocol_id: Protocol ID for reference

        Returns:
            Dictionary with inclusion and exclusion criteria (empty on failure)
        """
        system = """
        You are an expert in clinical trial protocols with the specific task of extracting
        structured data about inclusion and exclusion criteria. Extract the criteria
        exactly as stated in the text, preserving numbering and formatting. Return the
        data as a valid JSON object.
        """

        prompt = f"""
        Extract the inclusion and exclusion criteria from the following protocol section.
        The protocol ID is: {protocol_id}

        Return a valid JSON object with these keys:
        {{
            "inclusion_criteria": [
                {{
                    "number": number or null, // The criterion number if available (e.g., 1, 2)
                    "text": "string", // The full text of the criterion
                    "attribute": "string", // The characteristic being evaluated, if clear (e.g., "Age", "BMI")
                    "operator": "string", // The comparison operator if applicable (e.g., ">", "<", "=")
                    "value": "string" // The threshold value if applicable (e.g., "18 years")
                }}
            ],
            "exclusion_criteria": [
                {{
                    "number": number or null,
                    "text": "string",
                    "attribute": "string",
                    "operator": "string",
                    "value": "string"
                }}
            ]
        }}

        Section text:
        """

        return self._extract_structured(prompt, section_text, system)

    def extract_study_design(self, section_text: str, protocol_id: str) -> Dict:
        """
        Extract study design information from protocol text.

        Args:
            section_text: Text from the study design section
            protocol_id: Protocol ID for reference

        Returns:
            Dictionary with study design information (empty on failure)
        """
        system = """
        You are an expert in clinical trial protocols with the specific task of extracting
        structured data about study design. Extract only information that is explicitly
        stated in the text. Return the data as a valid JSON object.
        """

        prompt = f"""
        Extract the study design information from the following protocol section.
        The protocol ID is: {protocol_id}

        Return a valid JSON object with these keys:
        {{
            "design_type": "string", // E.g., "Randomized, Double-blind, Placebo-controlled"
            "study_parts": [ // List of different parts/cohorts if applicable
                {{
                    "part": "string", // Identifier (e.g., "Part A", "Cohort 1")
                    "description": "string", // Description
                    "population": "string", // E.g., "Healthy Volunteers" or "T2DM Patients"
                    "planned_n": "string" // Planned number of subjects
                }}
            ],
            "randomization": "string", // Description of randomization process
            "blinding": "string", // Description of blinding (e.g., "Double-blind")
            "duration": "string", // Study duration information
            "dose_info": "string" // Information about dosing if mentioned
        }}

        Section text:
        """

        return self._extract_structured(prompt, section_text, system)

    def extract_statistical_methods(self, section_text: str, protocol_id: str) -> Dict:
        """
        Extract statistical analysis methods from SAP or protocol text.

        Args:
            section_text: Text from the statistical methods section
            protocol_id: Protocol ID for reference

        Returns:
            Dictionary with statistical methods information (empty on failure)
        """
        system = """
        You are an expert in clinical trial statistics with the specific task of extracting
        structured data about statistical methods from protocols or SAPs. Return the data
        as a valid JSON object.
        """

        prompt = f"""
        Extract the statistical methods information from the following section.
        The protocol ID is: {protocol_id}

        Return a valid JSON object with these keys:
        {{
            "analysis_populations": [
                {{
                    "name": "string", // E.g., "Full Analysis Set", "Safety Population"
                    "definition": "string" // Definition of the population
                }}
            ],
            "primary_analysis": {{
                "endpoint": "string", // Primary endpoint being analyzed
                "method": "string", // Statistical method (e.g., "MMRM", "t-test")
                "covariates": ["string"], // List of covariates if mentioned
                "handling_missing": "string" // How missing data is handled
            }},
            "secondary_analyses": [
                {{
                    "endpoint": "string",
                    "method": "string",
                    "covariates": ["string"],
                    "handling_missing": "string"
                }}
            ],
            "multiplicity": "string", // How multiplicity is addressed
            "sample_size_justification": "string" // Sample size rationale
        }}

        Section text:
        """

        return self._extract_structured(prompt, section_text, system)

    def extract_assessments(self, section_text: str, protocol_id: str) -> Dict:
        """
        Extract assessment information from protocol text.

        Args:
            section_text: Text from the assessments section
            protocol_id: Protocol ID for reference

        Returns:
            Dictionary with assessment information (empty on failure)
        """
        system = """
        You are an expert in clinical trial protocols with the specific task of extracting
        structured data about assessments and procedures. Return the data as a valid JSON object.
        """

        prompt = f"""
        Extract information about assessments and procedures from the following protocol section.
        The protocol ID is: {protocol_id}

        Return a valid JSON object with these keys:
        {{
            "assessments": [
                {{
                    "name": "string", // Name of assessment (e.g., "OGTT", "ECG")
                    "type": "string", // Type (e.g., "Safety", "PK", "PD")
                    "description": "string", // Description of the procedure
                    "timing": "string", // When it's performed
                    "analytes": ["string"] // Measured analytes if applicable
                }}
            ]
        }}

        Section text:
        """

        return self._extract_structured(prompt, section_text, system)

    def generate_content_from_knowledge(self, section_type: str, context: List[Dict],
                                        protocol_id: str = None, style_guide: str = None) -> str:
        """
        Generate document content based on knowledge extracted from similar documents.

        Args:
            section_type: Type of section to generate (e.g., "Introduction", "Study Design")
            context: List of relevant text chunks from knowledge base; each chunk is
                read through its 'page_content' and 'metadata' -> 'source' keys
            protocol_id: Optional protocol ID for reference
            style_guide: Optional style guide instructions

        Returns:
            Generated content as a string (or an "Error: ..." string on failure)
        """
        system = """
        You are an expert medical writer who specializes in pharmaceutical R&D documents
        like protocols, SAPs, and CSRs. Your task is to draft high-quality content
        based on similar examples, following the conventions of scientific/medical writing
        and any provided style guides.
        """

        # Prepare context text
        separator = "-" * 50
        examples = []
        for number, chunk in enumerate(context or [], start=1):
            source = (chunk.get('metadata') or {}).get('source', 'Unknown')
            examples.append(
                f"\nEXAMPLE {number} (Source: {source})\n"
                f"{chunk.get('page_content', '')}"
                f"\n{separator}\n"
            )
        context_text = "".join(examples)

        protocol_ref = f"for protocol {protocol_id}" if protocol_id else ""
        style_instructions = f"\nFollow these style guidelines:\n{style_guide}" if style_guide else ""

        prompt = f"""
        Please draft a {section_type} section {protocol_ref} for a clinical study document.

        The content should be:
        1. Well-structured and professionally written
        2. Scientifically accurate and precise
        3. Appropriate for a regulatory/scientific audience
        4. In line with typical conventions for pharmaceutical documents{style_instructions}

        Here are examples of similar content from other documents to guide your writing:
        {context_text}

        Please draft a complete {section_type} section that follows these examples in style and
        structure but is original.
        """

        # Slightly higher temperature than the extraction calls for drafting prose
        return self._call_claude(prompt, system=system, max_tokens=4000, temperature=0.3)

    def answer_protocol_question(self, question: str, context: List[Dict],
                                 chat_history: List[Dict] = None) -> str:
        """
        Answer a question about protocols using retrieved context.

        Args:
            question: User's question
            context: List of relevant text chunks from knowledge base; each chunk is
                read through its 'page_content' and 'metadata' ('source', 'section') keys
            chat_history: Optional list of previous interactions, each a dict that
                may hold 'user' and/or 'assistant' text; only the last 3 are used

        Returns:
            Answer as a string (or an "Error: ..." string on failure)
        """
        system = """
        You are a Protocol Coach, an expert assistant specializing in pharmaceutical R&D documents.
        Your role is to answer questions about clinical study protocols, SAPs, and other related documents
        using the specific context provided. Base your answers strictly on the provided context and
        indicate when information might not be available in the provided excerpts.

        Always cite the source documents when answering questions.
        """

        # Prepare context text
        separator = "-" * 50
        excerpts = []
        for number, chunk in enumerate(context or [], start=1):
            metadata = chunk.get('metadata') or {}
            source = metadata.get('source', 'Unknown')
            section = metadata.get('section', 'Unknown section')
            excerpts.append(
                f"\nCONTEXT {number} [Source: {source}, Section: {section}]\n"
                f"{chunk.get('page_content', '')}"
                f"\n{separator}\n"
            )
        context_text = "".join(excerpts)

        # Prepare chat history if available
        history_text = ""
        if chat_history:
            turns = ["\nPrevious conversation:\n"]
            for entry in chat_history[-3:]:  # Only use last 3 exchanges for context
                if 'user' in entry:
                    turns.append(f"User: {entry['user']}\n")
                if 'assistant' in entry:
                    turns.append(f"Assistant: {entry['assistant']}\n")
            turns.append("\n")
            history_text = "".join(turns)

        prompt = f"""
        {history_text}
        User question: {question}

        Please answer the question based on the following context from clinical documents:
        {context_text}

        Answer the question comprehensively using only the information in the provided context.
        If the context doesn't contain sufficient information to provide a complete answer,
        clearly state which aspects you can and cannot address based on the available information.
        """

        return self._call_claude(prompt, system=system, max_tokens=2000, temperature=0.2)

    def find_document_connections(self, source_doc_info: Dict, target_doc_info: Dict,
                                  entity_pairs: List[Dict]) -> str:
        """
        Analyze connections between two documents based on entity pairs.

        Args:
            source_doc_info: Information about the source document ('title', 'type')
            target_doc_info: Information about the target document ('title', 'type')
            entity_pairs: List of potentially matching entities from both documents,
                each read through 'source_text', 'target_text' and 'entity_type'

        Returns:
            Analysis of connections as a string (or an "Error: ..." string on failure)
        """
        system = """
        You are an expert in pharmaceutical R&D document analysis, specialized in
        identifying relationships, consistency, and traceability between related
        documents like protocols and SAPs. Your task is to analyze potential
        matches between entities in different documents and assess their alignment.
        """

        # Convert entity pairs to formatted text
        separator = "-" * 50
        comparisons = []
        for number, pair in enumerate(entity_pairs or [], start=1):
            comparisons.append(
                f"\nCOMPARISON {number}:\n"
                f"Source: {pair.get('source_text', 'Not available')}\n"
                f"Target: {pair.get('target_text', 'Not available')}\n"
                f"Entity Type: {pair.get('entity_type', 'Unknown')}\n"
                f"{separator}\n"
            )
        entity_pairs_text = "".join(comparisons)

        # Missing document descriptions are treated as empty rather than crashing
        source_doc_info = source_doc_info or {}
        target_doc_info = target_doc_info or {}
        source_title = source_doc_info.get('title', 'Unknown')
        source_type = source_doc_info.get('type', 'Unknown')
        target_title = target_doc_info.get('title', 'Unknown')
        target_type = target_doc_info.get('type', 'Unknown')

        prompt = f"""
        Analyze the connections between these two pharmaceutical documents:

        SOURCE DOCUMENT: {source_title} (Type: {source_type})
        TARGET DOCUMENT: {target_title} (Type: {target_type})

        I'll provide pairs of potentially related elements from both documents. For each pair, assess:
        1. Whether they refer to the same entity or concept
        2. The level of consistency between them (High/Medium/Low)
        3. Any notable differences or potential issues

        Here are the element pairs to analyze:
        {entity_pairs_text}

        Provide:
        1. A summary of the overall consistency between documents
        2. Specific observations about each compared element
        3. Potential implications of any inconsistencies
        4. Recommendations for improving alignment
        """

        return self._call_claude(prompt, system=system, max_tokens=3000, temperature=0.2)