# Page header captured together with this file (not part of the module):
#   Spaces status: Runtime error
#   File size: 5,124 bytes
#   Ref: 3ab5528
"""
Schemas for structured data extraction from pharmaceutical R&D documents.
These schemas define the structure of objects stored in the knowledge base.
"""
from typing import Dict, List, Optional, Union, Any, TypedDict
# =========================================================================
# Core Entity Types
# =========================================================================
class Document(TypedDict):
    """Metadata record describing one source document.

    This is a total ``TypedDict``: type checkers expect every key below to
    be present. Fields annotated ``Optional[...]`` may hold ``None`` but are
    still expected as keys. Nothing is validated at runtime; instances are
    plain ``dict`` objects.
    """

    document_id: str
    protocol_id: Optional[str]
    type: str  # "Protocol", "SAP", "CSR", "IB", etc.
    version: Optional[str]
    date: Optional[str]  # stored as text; this schema does not fix a format
    title: str
    status: Optional[str]
    related_protocols: Optional[List[str]]  # presumably protocol IDs - verify
    filename: str
    path: str
class Study(TypedDict):
    """Study-level information, keyed by its protocol ID.

    Total ``TypedDict``: all keys are expected to be present; the
    ``Optional[...]`` fields may be ``None``. No runtime validation is
    performed.
    """

    protocol_id: str
    title: str
    phase: Optional[str]
    status: Optional[str]
    design_type: Optional[str]
    start_date: Optional[str]  # stored as text; no format fixed by the schema
    end_date: Optional[str]  # stored as text; no format fixed by the schema
    compound_id: Optional[str]
    indication: Optional[str]
    # NOTE(review): a string here, while StudyArm.planned_subjects is an int.
    planned_enrollment: Optional[str]
class Compound(TypedDict):
    """Investigational product (compound) information.

    Total ``TypedDict``: all keys are expected to be present; the
    ``Optional[...]`` fields may be ``None``. No runtime validation is
    performed.
    """

    compound_id: str
    name: str
    mechanism_of_action: Optional[str]
    drug_class: Optional[str]
    targets: Optional[List[str]]
    indications: Optional[List[str]]
class StudyObjective(TypedDict):
    """A single study objective belonging to a protocol.

    Total ``TypedDict``: all keys are expected to be present; ``endpoints``
    may be ``None``. No runtime validation is performed.
    """

    objective_id: str
    protocol_id: str
    type: str  # "Primary", "Secondary", "Exploratory"
    description: str
    endpoints: Optional[List[str]]  # IDs of endpoints measuring this objective
class Endpoint(TypedDict):
    """A single endpoint defined in a protocol.

    Total ``TypedDict``: all keys are expected to be present; the
    ``Optional[...]`` fields may be ``None``. No runtime validation is
    performed.
    """

    endpoint_id: str
    protocol_id: str
    type: str  # "Safety", "Efficacy", "PK", "PD", "Biomarker"
    name: str
    definition: str
    measurement_unit: Optional[str]
    timepoints: Optional[List[str]]
    analysis_method: Optional[str]
    objective_id: Optional[str]  # ID of the objective this endpoint supports
class PopulationCriterion(TypedDict):
    """One inclusion or exclusion criterion of a protocol.

    ``text`` carries the full criterion; ``attribute``, ``operator`` and
    ``value`` optionally hold a structured form of it.

    Total ``TypedDict``: all keys are expected to be present; the
    ``Optional[...]`` fields may be ``None``. No runtime validation is
    performed.
    """

    criterion_id: str
    protocol_id: str
    criterion_type: str  # "Inclusion" or "Exclusion"
    number: Optional[int]  # E.g., Inclusion criterion #3
    text: str  # Full text of the criterion
    attribute: Optional[str]  # E.g., "Age", "BMI"
    operator: Optional[str]  # E.g., ">", "<", "="
    value: Optional[str]  # E.g., "18 years", "30 kg/m²"
class StudyArm(TypedDict):
    """One arm or cohort of a study.

    Total ``TypedDict``: all keys are expected to be present; the
    ``Optional[...]`` fields may be ``None``. No runtime validation is
    performed.
    """

    arm_id: str
    protocol_id: str
    name: str
    description: str
    treatment: str  # Drug/placebo
    dose: Optional[str]
    frequency: Optional[str]
    duration: Optional[str]
    planned_subjects: Optional[int]
class Assessment(TypedDict):
    """One assessment or procedure performed in a study.

    Total ``TypedDict``: all keys are expected to be present; the
    ``Optional[...]`` fields may be ``None``. No runtime validation is
    performed.
    """

    assessment_id: str
    protocol_id: str
    name: str  # E.g., "OGTT", "ECG"
    type: str  # "Safety", "PK", "PD", etc.
    description: str
    method: Optional[str]
    timing: Optional[str]  # Simplified from complex structure
    analytes: Optional[List[str]]  # What's being measured
class Analyte(TypedDict):
    """An analyte or biomarker that is measured.

    Total ``TypedDict``: all keys are expected to be present; ``type`` may
    be ``None``. No runtime validation is performed.
    """

    analyte_id: str
    name: str
    type: Optional[str]  # E.g., "Biomarker", "PK", "Safety"
# =========================================================================
# Graph State Types (for LangGraph)
# =========================================================================
class DocumentExtractionState(TypedDict):
    """Workflow state for document extraction.

    Only ``document_path`` and ``status`` are non-optional in type; every
    other field may be ``None``. This is a total ``TypedDict``, so type
    checkers still expect all keys to be present. No runtime validation is
    performed.
    """

    # Input document
    document_path: str
    document_text: Optional[str]
    document_metadata: Optional[Document]
    sections: Optional[Dict[str, str]]  # presumably section name -> text; verify

    # Extraction results
    extracted_study: Optional[Study]
    extracted_objectives: Optional[List[StudyObjective]]
    extracted_endpoints: Optional[List[Endpoint]]
    # NOTE(review): dict keys are not specified here - confirm with producers.
    extracted_population: Optional[Dict[str, List[PopulationCriterion]]]
    extracted_design: Optional[Dict]  # untyped dict; shape not defined in this schema
    extracted_assessments: Optional[List[Assessment]]
    extracted_compound: Optional[Compound]

    # Workflow bookkeeping
    status: str
    error: Optional[str]
    vector_chunks: Optional[List[Dict]]  # untyped dicts; shape not defined in this schema
class ProtocolCoachState(TypedDict):
    """Workflow state for the protocol coach chatbot.

    Total ``TypedDict``: all keys are expected to be present; everything
    except ``query`` may be ``None``. No runtime validation is performed.
    """

    query: str
    retrieved_context: Optional[List[Dict]]  # untyped dicts; shape not defined here
    chat_history: Optional[List[Dict]]  # untyped dicts; shape not defined here
    response: Optional[str]
    error: Optional[str]
class ContentAuthoringState(TypedDict):
    """Workflow state for content authoring.

    Total ``TypedDict``: all keys are expected to be present; everything
    except ``section_type`` may be ``None``. No runtime validation is
    performed.
    """

    section_type: str
    target_protocol_id: Optional[str]
    style_guide: Optional[str]
    retrieved_context: Optional[List[Dict]]  # untyped dicts; shape not defined here
    generated_content: Optional[str]
    feedback: Optional[str]
    revised_content: Optional[str]
    error: Optional[str]
class TraceabilityState(TypedDict):
    """Workflow state for document traceability analysis.

    Holds the IDs of a source and a target document plus the entities
    collected for each and the pairs matched between them.

    Total ``TypedDict``: all keys are expected to be present; the
    ``Optional[...]`` fields may be ``None``. No runtime validation is
    performed.
    """

    source_document_id: str
    target_document_id: str
    entity_type: str  # "objectives", "endpoints", "population", etc.
    source_entities: Optional[List[Dict]]  # untyped dicts; shape not defined here
    target_entities: Optional[List[Dict]]  # untyped dicts; shape not defined here
    matched_pairs: Optional[List[Dict]]  # untyped dicts; shape not defined here
    analysis: Optional[str]
    error: Optional[str]