| """
|
| Input Normalization Layer.
|
|
|
| This module is responsible for parsing and normalizing all forms of user input
|
| into the Canonical JSON Schema (AnalysisRequest). It handles:
|
| - Free-text input parsing
|
| - File content extraction
|
| - Image processing
|
| - SMILES/CAS number extraction
|
|
|
| Design Note:
|
| This layer only EXTRACTS and NORMALIZES data. It does NOT make any
|
| scientific judgments or predictions. All interpretation is delegated
|
| to the Prompt Orchestration layer.
|
| """
|
|
|
| import re
|
| from typing import Optional, Tuple, List
|
| from pathlib import Path
|
|
|
| from schemas.canonical_schema import (
|
| AnalysisRequest,
|
| APIInput,
|
| ExcipientInput,
|
| StabilityData,
|
| )
|
| from utils.file_parsers import FileParser
|
| from utils.image_processor import ImageProcessor
|
|
|
|
|
class InputNormalizer:
    """
    Normalize user input into the canonical ``AnalysisRequest`` format.

    This class is the entry point for user input processing. It combines
    free-text parsing and uploaded-file extraction, records the first
    supplied structure image path, and returns one ``AnalysisRequest``.

    All extraction here is pattern matching and keyword lookup; the
    heuristics only decide *what text to pick up*, they do not interpret it.
    """

    # Candidate SMILES token: 10+ characters drawn from the SMILES alphabet,
    # optionally preceded by a "SMILES" label. On its own this also matches
    # long ordinary words, so callers should go through ``_extract_smiles``.
    SMILES_PATTERN = re.compile(
        r'(?:SMILES[:\s]*)?'
        r'([A-Za-z0-9@+\-\[\]\(\)\\\/=#$%&*!.:]{10,})'
    )

    # A token is only treated as an unlabelled SMILES when it shows some
    # structure: a bond/branch/bracket/stereo character, or a ring-closure
    # digit directly after an atom letter (e.g. "c1").
    _SMILES_STRUCTURE_PATTERN = re.compile(r'[=#()\[\]@\\/]|[A-Za-z]\d')

    # CAS registry number (digits-2 digits-1 digit). The look-arounds stop the
    # pattern from matching inside a longer digit run such as "2023-01-15".
    CAS_PATTERN = re.compile(
        r'(?:CAS[:\s-]*)?'
        r'(?<!\d)(\d{2,7}-\d{2}-\d)(?!\d)'
    )

    # Not referenced by any method of this class.
    MOLECULAR_FORMULA_PATTERN = re.compile(
        r'([A-Z][a-z]?\d*(?:[A-Z][a-z]?\d*)*)'
    )

    # Accepts both the ASCII colon and the full-width colon after the label.
    MOLECULAR_WEIGHT_PATTERN = re.compile(
        r'(?:MW|分子量|Molecular Weight)[:：\s]*(\d+\.?\d*)'
    )

    # Explicit excipient mentions; each pattern captures one name.
    _EXCIPIENT_PATTERNS = (
        re.compile(r'(?:辅料|excipients?)[:：\s]*([^\n,，、;；]+)', re.IGNORECASE),
        re.compile(
            r'(?:和|与|with)\s*([^\n,]+?)(?:的|的相容性|compatibility)',
            re.IGNORECASE,
        ),
        re.compile(
            r'(?:相容性|compatibility)[^与和]*(?:与|和|with)\s*([^\n,]+)',
            re.IGNORECASE,
        ),
    )

    # Storage-condition mentions; the first pattern that matches wins.
    _STABILITY_CONDITION_PATTERNS = (
        re.compile(r'(\d+[°℃]C?\s*/\s*\d+%\s*RH)'),
        re.compile(r'(加速|长期|中间)\s*(?:条件|试验)'),
    )

    # Labels that introduce an observation; the rest of the line is captured.
    _OBSERVATION_PATTERNS = (
        re.compile(
            r'(?:结果|results?|观察|observations?)[:：\s]*([^\n]+)',
            re.IGNORECASE,
        ),
        re.compile(r'(?:发现|observed)[:：\s]*([^\n]+)', re.IGNORECASE),
    )

    # Keyword -> English name. Both sides are searched for in the text; the
    # keyword (left side) is what gets reported.
    COMMON_EXCIPIENTS = {
        "无水磷酸氢钙": "DCP Anhydrous",
        "磷酸氢钙": "Dibasic Calcium Phosphate",
        "DCP": "DCP",
        "乳糖": "Lactose",
        "微晶纤维素": "MCC",
        "MCC": "MCC",
        "淀粉": "Starch",
        "甘露醇": "Mannitol",
        "HPMC": "HPMC",
        "羟丙甲纤维素": "HPMC",
        "PVP": "PVP",
        "预胶化淀粉": "Pregelatinized Starch",
        "硬脂酸镁": "Magnesium Stearate",
        "滑石粉": "Talc",
        "交联羧甲纤维素钠": "Croscarmellose Sodium",
        "交联PVP": "Crospovidone",
    }

    def __init__(self):
        """Create the file parser and image processor collaborators."""
        self.file_parser = FileParser()
        self.image_processor = ImageProcessor()

    def normalize(
        self,
        text_input: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        image_paths: Optional[List[str]] = None,
    ) -> "AnalysisRequest":
        """
        Normalize all inputs into a single AnalysisRequest.

        Values extracted from files take precedence over values extracted
        from ``text_input``; excipients from both sources are combined with
        duplicates removed.

        Args:
            text_input: Free-text description from user
            file_paths: Paths to uploaded documents (Word, Excel, PDF)
            image_paths: Paths to uploaded images (structure diagrams)

        Returns:
            AnalysisRequest: Normalized canonical representation
        """
        api_info = APIInput()
        excipients: List[ExcipientInput] = []
        stability_data: Optional[StabilityData] = None

        if text_input:
            api_info, excipients, stability_data = self._parse_text_input(text_input)

        if file_paths:
            file_api, file_excipients, file_stability = self._process_files(file_paths)
            api_info = self._merge_api_info(api_info, file_api)
            # The same excipient is often named in both the text and a file.
            excipients = self._dedupe_excipients(excipients + file_excipients)
            if file_stability:
                stability_data = file_stability

        if image_paths and not api_info.structure_image_path:
            # Only the first usable image path is recorded.
            first_image = next((path for path in image_paths if path), None)
            if first_image:
                api_info.structure_image_path = first_image

        return AnalysisRequest(
            api=api_info,
            excipients=excipients,
            stability_data=stability_data,
            analysis_focus=self._determine_analysis_focus(api_info, excipients),
        )

    def _parse_text_input(
        self,
        text: str
    ) -> "Tuple[APIInput, List[ExcipientInput], Optional[StabilityData]]":
        """
        Parse free text into API, excipient, and stability information.

        Args:
            text: Raw text typed by the user or extracted from a file.

        Returns:
            A tuple ``(api_info, excipients, stability_data)``;
            ``stability_data`` is None when no storage condition is found.
        """
        api_info = APIInput()

        smiles = self._extract_smiles(text)
        if smiles:
            api_info.smiles = smiles

        cas_match = self.CAS_PATTERN.search(text)
        if cas_match:
            api_info.cas_number = cas_match.group(1)

        mw_match = self.MOLECULAR_WEIGHT_PATTERN.search(text)
        if mw_match:
            # The captured group is always digits with an optional fraction,
            # so float() cannot fail here.
            api_info.molecular_weight = float(mw_match.group(1))

        excipients: List[ExcipientInput] = [
            ExcipientInput(name=name)
            for name in self._extract_excipient_names(text)
        ]

        stability_data: Optional[StabilityData] = None
        for pattern in self._STABILITY_CONDITION_PATTERNS:
            condition = pattern.search(text)
            if condition:
                stability_data = StabilityData(
                    conditions=condition.group(0),
                    observations=self._extract_stability_observations(text),
                )
                break

        # Give the compound a placeholder name when only a structure is known.
        if api_info.smiles and not api_info.name:
            api_info.name = f"Compound ({api_info.smiles[:20]}...)"

        return api_info, excipients, stability_data

    def _extract_smiles(self, text: str) -> Optional[str]:
        """
        Return the most plausible SMILES string in ``text``, or None.

        A token introduced by an explicit ``SMILES:`` label is trusted as
        written. Without such a label a token must look structural (see
        ``_looks_like_smiles``), so that ordinary long words and dates are
        not mistaken for a structure. Trailing ``.`` / ``:`` is treated as
        sentence punctuation and removed.
        """
        candidates = list(self.SMILES_PATTERN.finditer(text))

        for candidate in candidates:
            label = text[candidate.start():candidate.start(1)]
            if ':' in label:
                token = candidate.group(1).rstrip('.:')
                if token:
                    return token

        for candidate in candidates:
            token = candidate.group(1).rstrip('.:')
            if self._looks_like_smiles(token):
                return token

        return None

    def _looks_like_smiles(self, token: str) -> bool:
        """Heuristic: the token has a letter and some structural notation."""
        has_letter = any(char.isalpha() for char in token)
        return has_letter and bool(self._SMILES_STRUCTURE_PATTERN.search(token))

    def _extract_excipient_names(self, text: str) -> List[str]:
        """
        Collect excipient names from explicit mentions and known keywords.

        Explicit mentions come first, followed by keyword hits from
        ``_extract_common_excipients``. Names that denote the same excipient
        (same spelling ignoring case, or aliases of one table entry) are
        reported once, keeping the first spelling seen.
        """
        names: List[str] = []
        seen = set()

        def remember(name: str) -> None:
            identity = self._excipient_identity(name)
            if identity not in seen:
                seen.add(identity)
                names.append(name)

        for pattern in self._EXCIPIENT_PATTERNS:
            for raw_name in pattern.findall(text):
                # Drop surrounding whitespace and sentence-ending punctuation.
                candidate = raw_name.strip().rstrip('.。;；').strip()
                if len(candidate) > 1:
                    remember(candidate)

        for keyword in self._extract_common_excipients(text):
            remember(keyword)

        return names

    def _excipient_identity(self, name: str) -> str:
        """
        Return a comparison key for an excipient name.

        Names that equal a keyword or English name in ``COMMON_EXCIPIENTS``
        (ignoring case) map to that entry's lower-cased English name; any
        other name maps to its own stripped, lower-cased form.
        """
        folded = name.strip().lower()
        for cn_name, en_name in self.COMMON_EXCIPIENTS.items():
            if folded in (cn_name.lower(), en_name.lower()):
                return en_name.lower()
        return folded

    def _dedupe_excipients(self, excipients: "List[ExcipientInput]") -> "List[ExcipientInput]":
        """Return ``excipients`` without repeats, keeping first occurrences."""
        unique = []
        seen = set()
        for excipient in excipients:
            identity = self._excipient_identity(excipient.name)
            if identity not in seen:
                seen.add(identity)
                unique.append(excipient)
        return unique

    def _extract_common_excipients(self, text: str) -> List[str]:
        """
        Extract commonly known excipient names from text.

        This is a fallback for users who do not use an explicit
        "excipient:" label. Matching is a case-insensitive substring search
        for every keyword and English name in ``COMMON_EXCIPIENTS``.

        A hit that lies entirely inside a longer hit is ignored, so
        "预胶化淀粉" does not additionally report "淀粉". Entries sharing an
        English name are reported once.

        Returns:
            Matching table keywords, in table order.
        """
        haystack = text.lower()

        # Every occurrence of every alias as (start, end, table keyword).
        hits = []
        for cn_name, en_name in self.COMMON_EXCIPIENTS.items():
            for alias in dict.fromkeys((cn_name.lower(), en_name.lower())):
                start = haystack.find(alias)
                while start != -1:
                    hits.append((start, start + len(alias), cn_name))
                    start = haystack.find(alias, start + 1)

        # Longest hits first, so a nested shorter alias can be recognised and
        # skipped. The sort is stable, which keeps table order among ties.
        hits.sort(key=lambda hit: hit[1] - hit[0], reverse=True)

        accepted_spans = []
        matched_keywords = set()
        for start, end, keyword in hits:
            if any(lo <= start and end <= hi for lo, hi in accepted_spans):
                continue
            accepted_spans.append((start, end))
            matched_keywords.add(keyword)

        found = []
        reported_names = set()
        for cn_name, en_name in self.COMMON_EXCIPIENTS.items():
            if cn_name in matched_keywords and en_name not in reported_names:
                reported_names.add(en_name)
                found.append(cn_name)

        return found

    def _extract_stability_observations(self, text: str) -> Optional[str]:
        """
        Extract the first stability observation from text.

        Returns the remainder of the line following a result/observation
        label (ASCII or full-width colon optional), or None if no label is
        present.
        """
        for pattern in self._OBSERVATION_PATTERNS:
            observation = pattern.search(text)
            if observation:
                return observation.group(1).strip()

        return None

    def _process_files(
        self,
        file_paths: List[str]
    ) -> "Tuple[APIInput, List[ExcipientInput], Optional[StabilityData]]":
        """
        Extract information from uploaded files.

        Content extraction is delegated to ``FileParser``; the extracted
        content is then parsed exactly like free-text input. Files with an
        unsupported extension are skipped. For API fields and stability
        data, later files take precedence over earlier ones.
        """
        api_info = APIInput()
        excipients: List[ExcipientInput] = []
        stability_data: Optional[StabilityData] = None

        for file_path in file_paths:
            suffix = Path(file_path).suffix.lower()

            if suffix in ('.docx', '.doc'):
                content = self.file_parser.parse_word(file_path)
            elif suffix in ('.xlsx', '.xls'):
                content = self.file_parser.parse_excel(file_path)
            elif suffix == '.pdf':
                content = self.file_parser.parse_pdf(file_path)
            else:
                continue

            if not content:
                continue

            file_api, file_exc, file_stab = self._parse_text_input(content)
            api_info = self._merge_api_info(api_info, file_api)
            excipients.extend(file_exc)
            if file_stab:
                stability_data = file_stab

        return api_info, self._dedupe_excipients(excipients), stability_data

    def _merge_api_info(self, base: "APIInput", new: "APIInput") -> "APIInput":
        """
        Merge two APIInput objects into a new one.

        For each field the value from ``new`` is used when it is truthy,
        otherwise the value from ``base``. ``additional_info`` from both is
        concatenated.
        """
        return APIInput(
            name=new.name or base.name,
            smiles=new.smiles or base.smiles,
            structure_image_path=new.structure_image_path or base.structure_image_path,
            cas_number=new.cas_number or base.cas_number,
            molecular_formula=new.molecular_formula or base.molecular_formula,
            molecular_weight=new.molecular_weight or base.molecular_weight,
            additional_info=self._merge_text(base.additional_info, new.additional_info),
        )

    def _merge_text(self, text1: Optional[str], text2: Optional[str]) -> Optional[str]:
        """Join two optional texts with a newline; return whichever exists."""
        if text1 and text2:
            return f"{text1}\n{text2}"
        return text1 or text2

    def _determine_analysis_focus(
        self,
        api: "APIInput",
        excipients: "List[ExcipientInput]"
    ) -> List[str]:
        """
        Determine which analysis dimensions to focus on based on input.

        Returns "api_structure" when a SMILES string is present and
        "excipient_analysis" plus "compatibility" when any excipient is
        present. With neither, all three dimensions are returned.
        """
        focus = []

        if api.smiles:
            focus.append("api_structure")

        if excipients:
            focus.extend(["excipient_analysis", "compatibility"])

        return focus if focus else ["api_structure", "excipient_analysis", "compatibility"]
|
|
|