File size: 5,124 Bytes
3ab5528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
Schemas for structured data extraction from pharmaceutical R&D documents.
These schemas define the structure of objects stored in the knowledge base.
"""

from typing import Dict, List, Optional, Union, Any, TypedDict

# =========================================================================
# Core Entity Types
# =========================================================================

class Document(TypedDict):
    """Schema for document metadata.

    Describes one source document: its identifiers, its kind, and its file
    name/path. Fields annotated ``Optional[...]`` may hold ``None``; the class
    is a default (total) ``TypedDict``, so type checkers still expect every
    key to be present.
    """
    document_id: str
    protocol_id: Optional[str]  # presumably matches Study.protocol_id -- confirm
    type: str  # "Protocol", "SAP", "CSR", "IB", etc.
    version: Optional[str]
    date: Optional[str]  # kept as a string; no date format is fixed by this schema
    title: str
    status: Optional[str]
    related_protocols: Optional[List[str]]  # presumably protocol IDs -- confirm
    filename: str
    path: str

class Study(TypedDict):
    """Schema for study information.

    Only ``protocol_id`` and ``title`` are non-nullable; every other field is
    ``Optional[str]`` and may be ``None``. All keys are still required to be
    present (default ``TypedDict`` totality).
    """
    protocol_id: str
    title: str
    phase: Optional[str]
    status: Optional[str]
    design_type: Optional[str]
    # Dates are plain strings; this schema does not fix a format.
    start_date: Optional[str]
    end_date: Optional[str]
    compound_id: Optional[str]  # presumably matches Compound.compound_id -- confirm
    indication: Optional[str]
    # NOTE(review): typed as a string here, while StudyArm.planned_subjects is
    # Optional[int] -- confirm whether the difference is intentional.
    planned_enrollment: Optional[str]

class Compound(TypedDict):
    """Schema for investigational product information.

    ``compound_id`` and ``name`` are non-nullable strings; the remaining
    fields may be ``None``. ``targets`` and ``indications`` are lists of
    strings when set.
    """
    compound_id: str
    name: str
    mechanism_of_action: Optional[str]
    drug_class: Optional[str]
    targets: Optional[List[str]]
    indications: Optional[List[str]]

class StudyObjective(TypedDict):
    """Schema for study objectives.

    Each objective carries its own ID, the protocol it belongs to, a category,
    and a free-text description. ``endpoints`` optionally lists the IDs of the
    endpoints that measure the objective.
    """
    objective_id: str
    protocol_id: str  # presumably matches Study.protocol_id -- confirm
    type: str  # "Primary", "Secondary", "Exploratory"
    description: str
    endpoints: Optional[List[str]]  # IDs of endpoints measuring this objective

class Endpoint(TypedDict):
    """Schema for endpoints.

    An endpoint has an ID, the protocol it belongs to, a category, a name, and
    a definition. Measurement details and the link back to an objective are
    nullable.
    """
    endpoint_id: str
    protocol_id: str  # presumably matches Study.protocol_id -- confirm
    type: str  # "Safety", "Efficacy", "PK", "PD", "Biomarker"
    name: str
    definition: str
    measurement_unit: Optional[str]
    timepoints: Optional[List[str]]  # free-form strings; no format is fixed here
    analysis_method: Optional[str]
    objective_id: Optional[str]  # ID of the objective this endpoint supports

class PopulationCriterion(TypedDict):
    """Schema for inclusion/exclusion criteria.

    ``text`` holds the criterion verbatim. ``attribute``, ``operator`` and
    ``value`` optionally hold a structured breakdown of the same criterion
    (see the per-field examples); all three are plain strings and may be
    ``None``.
    """
    criterion_id: str
    protocol_id: str  # presumably matches Study.protocol_id -- confirm
    criterion_type: str  # "Inclusion" or "Exclusion"
    number: Optional[int]  # E.g., Inclusion criterion #3
    text: str  # Full text of the criterion
    attribute: Optional[str]  # E.g., "Age", "BMI"
    operator: Optional[str]  # E.g., ">", "<", "="
    value: Optional[str]  # E.g., "18 years", "30 kg/m²" (unit is part of the string)

class StudyArm(TypedDict):
    """Schema for study arms/cohorts.

    Identification, description and ``treatment`` are non-nullable; dosing
    details and the planned subject count may be ``None``.
    """
    arm_id: str
    protocol_id: str  # presumably matches Study.protocol_id -- confirm
    name: str
    description: str
    treatment: str  # Drug/placebo
    # Dosing details are free-form strings; no units or format are fixed here.
    dose: Optional[str]
    frequency: Optional[str]
    duration: Optional[str]
    planned_subjects: Optional[int]  # integer count (cf. Study.planned_enrollment, a string)

class Assessment(TypedDict):
    """Schema for study assessments/procedures.

    An assessment has an ID, the protocol it belongs to, a name, a category,
    and a description. ``method``, ``timing`` and ``analytes`` are nullable.
    """
    assessment_id: str
    protocol_id: str  # presumably matches Study.protocol_id -- confirm
    name: str  # E.g., "OGTT", "ECG"
    type: str  # "Safety", "PK", "PD", etc.
    description: str
    method: Optional[str]
    timing: Optional[str]  # Simplified from complex structure
    # What's being measured.
    # NOTE(review): unclear from this file whether these are analyte names or
    # Analyte.analyte_id values -- confirm against the extractor.
    analytes: Optional[List[str]]

class Analyte(TypedDict):
    """Schema for analytes/biomarkers measured.

    Unlike most other entity schemas in this module, this one has no
    ``protocol_id`` field.
    """
    analyte_id: str
    name: str
    type: Optional[str]  # E.g., "Biomarker", "PK", "Safety"

# =========================================================================
# Graph State Types (for LangGraph)
# =========================================================================

class DocumentExtractionState(TypedDict):
    """State for document extraction workflow.

    Holds the input path, the intermediate text/sections, one ``extracted_*``
    slot per entity schema, and status/error bookkeeping. Only
    ``document_path`` and ``status`` are non-nullable; every other field may
    be ``None``. The class is a default (total) ``TypedDict``, so type
    checkers expect every key to be present even when its value is ``None``.
    """
    document_path: str
    document_text: Optional[str]
    document_metadata: Optional[Document]
    sections: Optional[Dict[str, str]]  # presumably section name -> section text -- confirm
    extracted_study: Optional[Study]
    extracted_objectives: Optional[List[StudyObjective]]
    extracted_endpoints: Optional[List[Endpoint]]
    # presumably keyed by criterion type ("Inclusion"/"Exclusion") -- confirm
    extracted_population: Optional[Dict[str, List[PopulationCriterion]]]
    # NOTE(review): bare Dict -- key/value types are not specified by this schema.
    extracted_design: Optional[Dict]
    extracted_assessments: Optional[List[Assessment]]
    extracted_compound: Optional[Compound]
    status: str  # allowed values are not enumerated in this file
    error: Optional[str]
    # NOTE(review): list of untyped dicts -- chunk structure is not specified here.
    vector_chunks: Optional[List[Dict]]

class ProtocolCoachState(TypedDict):
    """State for protocol coach chatbot workflow.

    ``query`` is the only non-nullable field; context, history, response and
    error may each be ``None``.
    """
    query: str
    # Both lists hold untyped dicts; their item structure is not specified by
    # this schema.
    retrieved_context: Optional[List[Dict]]
    chat_history: Optional[List[Dict]]
    response: Optional[str]
    error: Optional[str]

class ContentAuthoringState(TypedDict):
    """State for content authoring workflow.

    ``section_type`` is the only non-nullable field. The remaining fields,
    including the generated/revised content and the feedback, may each be
    ``None``.
    """
    section_type: str
    target_protocol_id: Optional[str]  # presumably matches Study.protocol_id -- confirm
    style_guide: Optional[str]
    retrieved_context: Optional[List[Dict]]  # untyped dicts; item structure not specified here
    generated_content: Optional[str]
    feedback: Optional[str]
    revised_content: Optional[str]
    error: Optional[str]

class TraceabilityState(TypedDict):
    """State for document traceability analysis workflow.

    Identifies a source document, a target document, and the kind of entity
    being compared; the entity lists, matched pairs, analysis text and error
    may each be ``None``.
    """
    source_document_id: str  # presumably Document.document_id values -- confirm
    target_document_id: str
    entity_type: str  # "objectives", "endpoints", "population", etc.
    # The three lists hold untyped dicts; their item structure is not
    # specified by this schema.
    source_entities: Optional[List[Dict]]
    target_entities: Optional[List[Dict]]
    matched_pairs: Optional[List[Dict]]
    analysis: Optional[str]
    error: Optional[str]