cryogenic22 committed on
Commit
3ab5528
·
verified ·
1 Parent(s): e30ad71

Create schemas.py

Browse files
Files changed (1) hide show
  1. schemas.py +156 -0
schemas.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Schemas for structured data extraction from pharmaceutical R&D documents.
These schemas define the structure of objects stored in the knowledge base.
"""
5
+
6
from typing import Dict, List, Optional, Union, Any, TypedDict
7
+
8
# =========================================================================
# Core Entity Types
# =========================================================================
11
+
12
class Document(TypedDict):
    """Schema for document metadata.

    Describes a single source document. The class uses TypedDict's default
    totality, so every key listed here is required by type checkers; fields
    annotated ``Optional`` may hold ``None``.
    """

    document_id: str
    protocol_id: Optional[str]
    type: str  # "Protocol", "SAP", "CSR", "IB", etc.
    version: Optional[str]
    date: Optional[str]  # stored as text; no date format is fixed by this schema
    title: str
    status: Optional[str]
    related_protocols: Optional[List[str]]  # presumably protocol IDs -- TODO confirm
    filename: str
    path: str
24
+
25
class Study(TypedDict):
    """Schema for study information.

    One record per study, keyed by ``protocol_id``. All keys are required
    (default TypedDict totality); ``Optional`` fields may hold ``None``.
    """

    protocol_id: str
    title: str
    phase: Optional[str]
    status: Optional[str]
    design_type: Optional[str]
    start_date: Optional[str]  # stored as text; no date format is fixed by this schema
    end_date: Optional[str]
    compound_id: Optional[str]
    indication: Optional[str]
    # NOTE(review): typed as str here, while StudyArm.planned_subjects is an
    # int -- confirm the difference is intentional.
    planned_enrollment: Optional[str]
37
+
38
class Compound(TypedDict):
    """Schema for investigational product information.

    All keys are required (default TypedDict totality); ``Optional`` fields
    may hold ``None``.
    """

    compound_id: str
    name: str
    mechanism_of_action: Optional[str]
    drug_class: Optional[str]
    targets: Optional[List[str]]
    indications: Optional[List[str]]
46
+
47
class StudyObjective(TypedDict):
    """Schema for study objectives.

    All keys are required (default TypedDict totality); ``Optional`` fields
    may hold ``None``.
    """

    objective_id: str
    protocol_id: str
    type: str  # "Primary", "Secondary", "Exploratory"
    description: str
    endpoints: Optional[List[str]]  # IDs of endpoints measuring this objective
54
+
55
class Endpoint(TypedDict):
    """Schema for endpoints.

    All keys are required (default TypedDict totality); ``Optional`` fields
    may hold ``None``.
    """

    endpoint_id: str
    protocol_id: str
    type: str  # "Safety", "Efficacy", "PK", "PD", "Biomarker"
    name: str
    definition: str
    measurement_unit: Optional[str]
    timepoints: Optional[List[str]]
    analysis_method: Optional[str]
    objective_id: Optional[str]  # ID of the objective this endpoint supports
66
+
67
class PopulationCriterion(TypedDict):
    """Schema for inclusion/exclusion criteria.

    Holds the full criterion ``text`` plus an optional structured
    ``attribute`` / ``operator`` / ``value`` breakdown. All keys are required
    (default TypedDict totality); ``Optional`` fields may hold ``None``.
    """

    criterion_id: str
    protocol_id: str
    criterion_type: str  # "Inclusion" or "Exclusion"
    number: Optional[int]  # E.g., Inclusion criterion #3
    text: str  # Full text of the criterion
    attribute: Optional[str]  # E.g., "Age", "BMI"
    operator: Optional[str]  # E.g., ">", "<", "="
    value: Optional[str]  # E.g., "18 years", "30 kg/m²"
77
+
78
class StudyArm(TypedDict):
    """Schema for study arms/cohorts.

    All keys are required (default TypedDict totality); ``Optional`` fields
    may hold ``None``.
    """

    arm_id: str
    protocol_id: str
    name: str
    description: str
    treatment: str  # Drug/placebo
    dose: Optional[str]
    frequency: Optional[str]
    duration: Optional[str]
    planned_subjects: Optional[int]
89
+
90
class Assessment(TypedDict):
    """Schema for study assessments/procedures.

    All keys are required (default TypedDict totality); ``Optional`` fields
    may hold ``None``.
    """

    assessment_id: str
    protocol_id: str
    name: str  # E.g., "OGTT", "ECG"
    type: str  # "Safety", "PK", "PD", etc.
    description: str
    method: Optional[str]
    timing: Optional[str]  # Simplified from complex structure
    analytes: Optional[List[str]]  # What's being measured
100
+
101
class Analyte(TypedDict):
    """Schema for analytes/biomarkers measured.

    All keys are required (default TypedDict totality); ``type`` may hold
    ``None``.
    """

    analyte_id: str
    name: str
    type: Optional[str]  # E.g., "Biomarker", "PK", "Safety"
106
+
107
# =========================================================================
# Graph State Types (for LangGraph)
# =========================================================================
110
+
111
class DocumentExtractionState(TypedDict):
    """State for document extraction workflow.

    Carries the input document, the entities extracted from it (typed with
    the entity schemas defined in this module), and the run's status/error.
    All keys are required (default TypedDict totality); ``Optional`` fields
    may hold ``None``.
    """

    # Input document and its raw/parsed content
    document_path: str
    document_text: Optional[str]
    document_metadata: Optional[Document]
    sections: Optional[Dict[str, str]]  # presumably section name -> text -- TODO confirm

    # Extracted entities
    extracted_study: Optional[Study]
    extracted_objectives: Optional[List[StudyObjective]]
    extracted_endpoints: Optional[List[Endpoint]]
    # Criteria grouped under string keys; the key set is not fixed by this schema
    extracted_population: Optional[Dict[str, List[PopulationCriterion]]]
    extracted_design: Optional[Dict]
    extracted_assessments: Optional[List[Assessment]]
    extracted_compound: Optional[Compound]

    # Workflow bookkeeping
    status: str
    error: Optional[str]
    vector_chunks: Optional[List[Dict]]
127
+
128
class ProtocolCoachState(TypedDict):
    """State for protocol coach chatbot workflow.

    All keys are required (default TypedDict totality); ``Optional`` fields
    may hold ``None``.
    """

    query: str
    retrieved_context: Optional[List[Dict]]
    chat_history: Optional[List[Dict]]
    response: Optional[str]
    error: Optional[str]
135
+
136
class ContentAuthoringState(TypedDict):
    """State for content authoring workflow.

    All keys are required (default TypedDict totality); ``Optional`` fields
    may hold ``None``.
    """

    section_type: str
    target_protocol_id: Optional[str]
    style_guide: Optional[str]
    retrieved_context: Optional[List[Dict]]
    generated_content: Optional[str]
    feedback: Optional[str]
    revised_content: Optional[str]
    error: Optional[str]
146
+
147
class TraceabilityState(TypedDict):
    """State for document traceability analysis workflow.

    Names a source and a target document and the entity type under
    comparison. All keys are required (default TypedDict totality);
    ``Optional`` fields may hold ``None``.
    """

    source_document_id: str
    target_document_id: str
    entity_type: str  # "objectives", "endpoints", "population", etc.
    source_entities: Optional[List[Dict]]
    target_entities: Optional[List[Dict]]
    matched_pairs: Optional[List[Dict]]
    analysis: Optional[str]
    error: Optional[str]