# doc_knowledge_base / schemas.py
# cryogenic22's picture
# Create schemas.py
# 3ab5528 verified
"""
Schemas for structured data extraction from pharmaceutical R&D documents.
These schemas define the structure of objects stored in the knowledge base.
"""
from typing import Dict, List, Optional, Union, Any, TypedDict
# =========================================================================
# Core Entity Types
# =========================================================================
class Document(TypedDict):
    """Metadata record describing a single source document.

    The class is a total ``TypedDict``: every key must be present, and the
    fields annotated ``Optional`` may carry ``None`` as their value.
    """

    document_id: str
    protocol_id: Optional[str]
    # Document category, e.g. "Protocol", "SAP", "CSR", "IB".
    type: str
    version: Optional[str]
    date: Optional[str]
    title: str
    status: Optional[str]
    related_protocols: Optional[List[str]]
    filename: str
    path: str
class Study(TypedDict):
    """Study-level information record.

    Only ``protocol_id`` and ``title`` are typed as plain strings; all other
    fields are ``Optional`` strings, so their values may be ``None`` (the
    keys themselves are still required because the class is total).
    """

    protocol_id: str
    title: str
    phase: Optional[str]
    status: Optional[str]
    design_type: Optional[str]
    start_date: Optional[str]
    end_date: Optional[str]
    compound_id: Optional[str]
    indication: Optional[str]
    # Typed as a string here, not as an integer.
    planned_enrollment: Optional[str]
class Compound(TypedDict):
    """Investigational product (compound) information record.

    ``compound_id`` and ``name`` are plain strings; the remaining fields are
    ``Optional`` and may hold ``None``.
    """

    compound_id: str
    name: str
    mechanism_of_action: Optional[str]
    drug_class: Optional[str]
    targets: Optional[List[str]]
    indications: Optional[List[str]]
class StudyObjective(TypedDict):
    """One objective of a study, tied to a protocol by ``protocol_id``."""

    objective_id: str
    protocol_id: str
    # Objective category: "Primary", "Secondary" or "Exploratory".
    type: str
    description: str
    # IDs of the endpoints that measure this objective.
    endpoints: Optional[List[str]]
class Endpoint(TypedDict):
    """One study endpoint, tied to a protocol by ``protocol_id``."""

    endpoint_id: str
    protocol_id: str
    # Endpoint category: "Safety", "Efficacy", "PK", "PD" or "Biomarker".
    type: str
    name: str
    definition: str
    measurement_unit: Optional[str]
    timepoints: Optional[List[str]]
    analysis_method: Optional[str]
    # ID of the objective this endpoint supports.
    objective_id: Optional[str]
class PopulationCriterion(TypedDict):
    """A single inclusion or exclusion criterion of a protocol.

    Besides the full criterion text, the record can carry a decomposed
    ``attribute`` / ``operator`` / ``value`` form of the same criterion.
    """

    criterion_id: str
    protocol_id: str
    # Either "Inclusion" or "Exclusion".
    criterion_type: str
    # Position of the criterion, e.g. 3 for inclusion criterion #3.
    number: Optional[int]
    # Full text of the criterion.
    text: str
    # Attribute being constrained, e.g. "Age", "BMI".
    attribute: Optional[str]
    # Comparison operator, e.g. ">", "<", "=".
    operator: Optional[str]
    # Threshold including its unit, e.g. "18 years", "30 kg/m²".
    value: Optional[str]
class StudyArm(TypedDict):
    """One arm or cohort of a study, tied to a protocol by ``protocol_id``."""

    arm_id: str
    protocol_id: str
    name: str
    description: str
    # What the arm receives: the drug or placebo.
    treatment: str
    dose: Optional[str]
    frequency: Optional[str]
    duration: Optional[str]
    # Typed as an integer count.
    planned_subjects: Optional[int]
class Assessment(TypedDict):
    """One study assessment or procedure, tied to a protocol."""

    assessment_id: str
    protocol_id: str
    # Short assessment name, e.g. "OGTT", "ECG".
    name: str
    # Assessment category: "Safety", "PK", "PD", etc.
    type: str
    description: str
    method: Optional[str]
    # Kept as a single string; simplified from a more complex structure.
    timing: Optional[str]
    # What is being measured by the assessment.
    analytes: Optional[List[str]]
class Analyte(TypedDict):
    """An analyte or biomarker that is measured."""

    analyte_id: str
    name: str
    # Analyte category, e.g. "Biomarker", "PK", "Safety".
    type: Optional[str]
# =========================================================================
# Graph State Types (for LangGraph)
# =========================================================================
class DocumentExtractionState(TypedDict):
    """Workflow state for the document extraction graph.

    ``document_path`` and ``status`` are plain strings; every other field is
    ``Optional`` and may hold ``None``. The ``extracted_*`` fields are typed
    with the entity schemas declared earlier in this module.
    """

    # Input document and its raw content.
    document_path: str
    document_text: Optional[str]
    document_metadata: Optional[Document]
    # NOTE(review): presumably section name -> section text; confirm with
    # the code that fills this field.
    sections: Optional[Dict[str, str]]

    # Structured extraction results.
    extracted_study: Optional[Study]
    extracted_objectives: Optional[List[StudyObjective]]
    extracted_endpoints: Optional[List[Endpoint]]
    # NOTE(review): the meaning of the string keys is not defined in this
    # module (possibly the criterion type); confirm with the producer.
    extracted_population: Optional[Dict[str, List[PopulationCriterion]]]
    # Untyped dict; its layout is not defined in this module.
    extracted_design: Optional[Dict]
    extracted_assessments: Optional[List[Assessment]]
    extracted_compound: Optional[Compound]

    # Workflow bookkeeping.
    status: str
    error: Optional[str]
    # Untyped dicts; their layout is not defined in this module.
    vector_chunks: Optional[List[Dict]]
class ProtocolCoachState(TypedDict):
    """Workflow state for the protocol coach chatbot graph.

    Only ``query`` is a plain string; the other fields are ``Optional`` and
    may hold ``None``.
    """

    query: str
    # Untyped dicts; their layout is not defined in this module.
    retrieved_context: Optional[List[Dict]]
    chat_history: Optional[List[Dict]]
    response: Optional[str]
    error: Optional[str]
class ContentAuthoringState(TypedDict):
    """Workflow state for the content authoring graph.

    Only ``section_type`` is a plain string; the other fields are
    ``Optional`` and may hold ``None``.
    """

    # Authoring request.
    section_type: str
    target_protocol_id: Optional[str]
    style_guide: Optional[str]
    # Untyped dicts; their layout is not defined in this module.
    retrieved_context: Optional[List[Dict]]

    # Draft, feedback on it, and the revised draft.
    generated_content: Optional[str]
    feedback: Optional[str]
    revised_content: Optional[str]
    error: Optional[str]
class TraceabilityState(TypedDict):
    """Workflow state for the document traceability analysis graph.

    The two document IDs and ``entity_type`` are plain strings; the other
    fields are ``Optional`` and may hold ``None``.
    """

    source_document_id: str
    target_document_id: str
    # Which kind of entity is compared: "objectives", "endpoints",
    # "population", etc.
    entity_type: str
    # Untyped dicts; their layout is not defined in this module.
    source_entities: Optional[List[Dict]]
    target_entities: Optional[List[Dict]]
    matched_pairs: Optional[List[Dict]]
    analysis: Optional[str]
    error: Optional[str]