"""Schemas for structured data extraction from pharmaceutical R&D documents.

These schemas define the structure of objects stored in the knowledge base.

Every schema is a ``typing.TypedDict`` declared with the default totality, so
all listed keys are required when a type checker validates a value.  A field
annotated ``Optional[...]`` must still be present but may hold ``None``.
TypedDicts perform no validation at runtime; instances are plain ``dict``s.
"""

from typing import Dict, List, Optional, Union, Any, TypedDict


# =========================================================================
# Core Entity Types
# =========================================================================

class Document(TypedDict):
    """Schema for document metadata.

    ``document_id``, ``type``, ``title``, ``filename`` and ``path`` are
    plain strings; the remaining fields may be ``None``.
    """

    document_id: str
    protocol_id: Optional[str]
    type: str  # "Protocol", "SAP", "CSR", "IB", etc.
    version: Optional[str]
    date: Optional[str]  # Kept as a string; no date format is enforced here
    title: str
    status: Optional[str]
    related_protocols: Optional[List[str]]  # Presumably protocol IDs -- confirm with producers
    filename: str
    path: str


class Study(TypedDict):
    """Schema for study information.

    Only ``protocol_id`` and ``title`` are non-nullable.
    """

    protocol_id: str
    title: str
    phase: Optional[str]
    status: Optional[str]
    design_type: Optional[str]
    start_date: Optional[str]  # Kept as a string; no date format is enforced here
    end_date: Optional[str]
    compound_id: Optional[str]
    indication: Optional[str]
    # NOTE(review): typed as a string here, whereas StudyArm.planned_subjects
    # is an int -- confirm whether the difference is intentional.
    planned_enrollment: Optional[str]


class Compound(TypedDict):
    """Schema for investigational product information."""

    compound_id: str
    name: str
    mechanism_of_action: Optional[str]
    drug_class: Optional[str]
    targets: Optional[List[str]]
    indications: Optional[List[str]]


class StudyObjective(TypedDict):
    """Schema for study objectives."""

    objective_id: str
    protocol_id: str
    type: str  # "Primary", "Secondary", "Exploratory"
    description: str
    endpoints: Optional[List[str]]  # IDs of endpoints measuring this objective


class Endpoint(TypedDict):
    """Schema for endpoints."""

    endpoint_id: str
    protocol_id: str
    type: str  # "Safety", "Efficacy", "PK", "PD", "Biomarker"
    name: str
    definition: str
    measurement_unit: Optional[str]
    timepoints: Optional[List[str]]
    analysis_method: Optional[str]
    objective_id: Optional[str]  # ID of the objective this endpoint supports


class PopulationCriterion(TypedDict):
    """Schema for inclusion/exclusion criteria.

    ``text`` carries the full criterion; ``attribute``, ``operator`` and
    ``value`` are nullable fields illustrated by the examples beside them.
    """

    criterion_id: str
    protocol_id: str
    criterion_type: str  # "Inclusion" or "Exclusion"
    number: Optional[int]  # E.g., Inclusion criterion #3
    text: str  # Full text of the criterion
    attribute: Optional[str]  # E.g., "Age", "BMI"
    operator: Optional[str]  # E.g., ">", "<", "="
    value: Optional[str]  # E.g., "18 years", "30 kg/m²"


class StudyArm(TypedDict):
    """Schema for study arms/cohorts."""

    arm_id: str
    protocol_id: str
    name: str
    description: str
    treatment: str  # Drug/placebo
    dose: Optional[str]
    frequency: Optional[str]
    duration: Optional[str]
    planned_subjects: Optional[int]


class Assessment(TypedDict):
    """Schema for study assessments/procedures."""

    assessment_id: str
    protocol_id: str
    name: str  # E.g., "OGTT", "ECG"
    type: str  # "Safety", "PK", "PD", etc.
    description: str
    method: Optional[str]
    timing: Optional[str]  # Simplified from complex structure
    analytes: Optional[List[str]]  # What's being measured


class Analyte(TypedDict):
    """Schema for analytes/biomarkers measured."""

    analyte_id: str
    name: str
    type: Optional[str]  # E.g., "Biomarker", "PK", "Safety"


# =========================================================================
# Graph State Types (for LangGraph)
# =========================================================================

class DocumentExtractionState(TypedDict):
    """State for document extraction workflow.

    Only ``document_path`` and ``status`` are non-nullable; every other
    field may be ``None``.
    """

    document_path: str
    document_text: Optional[str]
    document_metadata: Optional[Document]
    sections: Optional[Dict[str, str]]  # Presumably section name -> section text; confirm
    extracted_study: Optional[Study]
    extracted_objectives: Optional[List[StudyObjective]]
    extracted_endpoints: Optional[List[Endpoint]]
    # Criteria lists grouped under string keys; the keys are not defined in
    # this module (presumably the criterion type -- confirm with the producer).
    extracted_population: Optional[Dict[str, List[PopulationCriterion]]]
    extracted_design: Optional[Dict]  # Untyped mapping; shape not defined in this module
    extracted_assessments: Optional[List[Assessment]]
    extracted_compound: Optional[Compound]
    status: str
    error: Optional[str]
    vector_chunks: Optional[List[Dict]]  # Untyped mappings; shape not defined in this module


class ProtocolCoachState(TypedDict):
    """State for protocol coach chatbot workflow."""

    query: str
    retrieved_context: Optional[List[Dict]]
    chat_history: Optional[List[Dict]]
    response: Optional[str]
    error: Optional[str]


class ContentAuthoringState(TypedDict):
    """State for content authoring workflow."""

    section_type: str
    target_protocol_id: Optional[str]
    style_guide: Optional[str]
    retrieved_context: Optional[List[Dict]]
    generated_content: Optional[str]
    feedback: Optional[str]
    revised_content: Optional[str]
    error: Optional[str]


class TraceabilityState(TypedDict):
    """State for document traceability analysis workflow."""

    source_document_id: str
    target_document_id: str
    entity_type: str  # "objectives", "endpoints", "population", etc.
    source_entities: Optional[List[Dict]]
    target_entities: Optional[List[Dict]]
    matched_pairs: Optional[List[Dict]]
    analysis: Optional[str]
    error: Optional[str]