Spaces:
Runtime error
Runtime error
Create schemas.py
Browse files- schemas.py +156 -0
schemas.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Schemas for structured data extraction from pharmaceutical R&D documents.
|
| 3 |
+
These schemas define the structure of objects stored in the knowledge base.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, List, Optional, Union, Any, TypedDict
|
| 7 |
+
|
| 8 |
+
# =========================================================================
|
| 9 |
+
# Core Entity Types
|
| 10 |
+
# =========================================================================
|
| 11 |
+
|
| 12 |
+
class Document(TypedDict):
    """Schema for document metadata.

    The class is declared without ``total=False``, so every key is required.
    Fields typed ``Optional[...]`` may hold ``None`` but must still be present.
    """

    document_id: str
    protocol_id: Optional[str]
    type: str  # "Protocol", "SAP", "CSR", "IB", etc.
    version: Optional[str]
    date: Optional[str]  # kept as text; no date format is enforced here
    title: str
    status: Optional[str]
    related_protocols: Optional[List[str]]  # presumably protocol IDs -- confirm
    filename: str
    path: str
|
| 24 |
+
|
| 25 |
+
class Study(TypedDict):
    """Schema for study information.

    All keys are required (default ``total=True``); ``Optional[...]`` fields
    may be ``None`` when the value is unknown.
    """

    protocol_id: str
    title: str
    phase: Optional[str]
    status: Optional[str]
    design_type: Optional[str]
    start_date: Optional[str]  # kept as text; no date format is enforced here
    end_date: Optional[str]
    compound_id: Optional[str]
    indication: Optional[str]
    # NOTE(review): stored as text, whereas StudyArm.planned_subjects is an
    # int -- confirm the difference is intended.
    planned_enrollment: Optional[str]
|
| 37 |
+
|
| 38 |
+
class Compound(TypedDict):
    """Schema for investigational product information.

    All keys are required (default ``total=True``); ``Optional[...]`` fields
    may be ``None``.
    """

    compound_id: str
    name: str
    mechanism_of_action: Optional[str]
    drug_class: Optional[str]
    targets: Optional[List[str]]
    indications: Optional[List[str]]
|
| 46 |
+
|
| 47 |
+
class StudyObjective(TypedDict):
    """Schema for study objectives.

    All keys are required (default ``total=True``); ``endpoints`` may be
    ``None``.
    """

    objective_id: str
    protocol_id: str
    type: str  # "Primary", "Secondary", "Exploratory"
    description: str
    endpoints: Optional[List[str]]  # IDs of endpoints measuring this objective
|
| 54 |
+
|
| 55 |
+
class Endpoint(TypedDict):
    """Schema for endpoints.

    All keys are required (default ``total=True``); ``Optional[...]`` fields
    may be ``None``.
    """

    endpoint_id: str
    protocol_id: str
    type: str  # "Safety", "Efficacy", "PK", "PD", "Biomarker"
    name: str
    definition: str
    measurement_unit: Optional[str]
    timepoints: Optional[List[str]]
    analysis_method: Optional[str]
    objective_id: Optional[str]  # ID of the objective this endpoint supports
|
| 66 |
+
|
| 67 |
+
class PopulationCriterion(TypedDict):
    """Schema for inclusion/exclusion criteria.

    ``text`` carries the criterion verbatim. ``attribute``/``operator``/
    ``value`` hold an optional structured form of the same criterion.
    All keys are required (default ``total=True``); ``Optional[...]`` fields
    may be ``None``.
    """

    criterion_id: str
    protocol_id: str
    criterion_type: str  # "Inclusion" or "Exclusion"
    number: Optional[int]  # E.g., Inclusion criterion #3
    text: str  # Full text of the criterion
    attribute: Optional[str]  # E.g., "Age", "BMI"
    operator: Optional[str]  # E.g., ">", "<", "="
    value: Optional[str]  # E.g., "18 years", "30 kg/m²"
|
| 77 |
+
|
| 78 |
+
class StudyArm(TypedDict):
    """Schema for study arms/cohorts.

    All keys are required (default ``total=True``); ``Optional[...]`` fields
    may be ``None``.
    """

    arm_id: str
    protocol_id: str
    name: str
    description: str
    treatment: str  # Drug/placebo
    dose: Optional[str]
    frequency: Optional[str]
    duration: Optional[str]
    planned_subjects: Optional[int]
|
| 89 |
+
|
| 90 |
+
class Assessment(TypedDict):
    """Schema for study assessments/procedures.

    All keys are required (default ``total=True``); ``Optional[...]`` fields
    may be ``None``.
    """

    assessment_id: str
    protocol_id: str
    name: str  # E.g., "OGTT", "ECG"
    type: str  # "Safety", "PK", "PD", etc.
    description: str
    method: Optional[str]
    timing: Optional[str]  # Simplified from complex structure
    analytes: Optional[List[str]]  # What's being measured
|
| 100 |
+
|
| 101 |
+
class Analyte(TypedDict):
    """Schema for analytes/biomarkers measured.

    All keys are required (default ``total=True``); ``type`` may be ``None``.
    """

    analyte_id: str
    name: str
    type: Optional[str]  # E.g., "Biomarker", "PK", "Safety"
|
| 106 |
+
|
| 107 |
+
# =========================================================================
|
| 108 |
+
# Graph State Types (for LangGraph)
|
| 109 |
+
# =========================================================================
|
| 110 |
+
|
| 111 |
+
class DocumentExtractionState(TypedDict):
    """State for document extraction workflow.

    ``document_path`` and ``status`` are plain strings; every other field is
    ``Optional[...]`` and so may be ``None``. All keys are nevertheless
    required by the type checker (default ``total=True``).
    """

    document_path: str
    document_text: Optional[str]
    document_metadata: Optional[Document]
    sections: Optional[Dict[str, str]]  # presumably section name -> text; confirm
    extracted_study: Optional[Study]
    extracted_objectives: Optional[List[StudyObjective]]
    extracted_endpoints: Optional[List[Endpoint]]
    # NOTE(review): keys are presumably the criterion type
    # ("Inclusion"/"Exclusion") -- confirm against the producer.
    extracted_population: Optional[Dict[str, List[PopulationCriterion]]]
    extracted_design: Optional[Dict]  # untyped mapping; shape not defined here
    extracted_assessments: Optional[List[Assessment]]
    extracted_compound: Optional[Compound]
    status: str
    error: Optional[str]
    vector_chunks: Optional[List[Dict]]  # untyped mappings; shape not defined here
|
| 127 |
+
|
| 128 |
+
class ProtocolCoachState(TypedDict):
    """State for protocol coach chatbot workflow.

    Only ``query`` is a plain string; the remaining fields may be ``None``.
    All keys are required by the type checker (default ``total=True``).
    """

    query: str
    retrieved_context: Optional[List[Dict]]  # untyped mappings; shape not defined here
    chat_history: Optional[List[Dict]]  # untyped mappings; shape not defined here
    response: Optional[str]
    error: Optional[str]
|
| 135 |
+
|
| 136 |
+
class ContentAuthoringState(TypedDict):
    """State for content authoring workflow.

    Only ``section_type`` is a plain string; the remaining fields may be
    ``None``. All keys are required by the type checker (default
    ``total=True``).
    """

    section_type: str
    target_protocol_id: Optional[str]
    style_guide: Optional[str]
    retrieved_context: Optional[List[Dict]]  # untyped mappings; shape not defined here
    generated_content: Optional[str]
    feedback: Optional[str]
    revised_content: Optional[str]
    error: Optional[str]
|
| 146 |
+
|
| 147 |
+
class TraceabilityState(TypedDict):
    """State for document traceability analysis workflow.

    The two document IDs and ``entity_type`` are plain strings; the remaining
    fields may be ``None``. All keys are required by the type checker
    (default ``total=True``).
    """

    source_document_id: str
    target_document_id: str
    entity_type: str  # "objectives", "endpoints", "population", etc.
    source_entities: Optional[List[Dict]]  # untyped mappings; shape not defined here
    target_entities: Optional[List[Dict]]
    matched_pairs: Optional[List[Dict]]
    analysis: Optional[str]
    error: Optional[str]
|