# Preformu/layers/output_normalizer.py
"""
Output Normalization Layer.
This module is responsible for normalizing LLM outputs into structured,
consistent data suitable for report generation. It handles:
1. Terminology standardization (CN/EN)
2. Tone normalization (removing casual language)
3. Uncertainty annotation
4. Structure extraction from natural language
Design Philosophy:
- The LLM output is treated as "raw material" that needs refinement
- Normalization ensures consistency regardless of model variations
- Forbidden phrases are filtered to maintain professional tone
"""
import json
import random
import re
from datetime import datetime
from typing import Any, Dict, List, Optional

from config.settings import settings
from schemas.canonical_schema import (
    AnalysisResult,
    BilingualText,
    ConfidenceLevel,
    ExcipientProfile,
    FormulationStrategy,
    ImpurityProfile,
    InteractionMechanism,
    PhysicochemicalProperties,
    PropertyType,
    ReactiveGroup,
    RiskLevel,
)
class OutputNormalizer:
"""
Normalizes LLM outputs into structured report-ready data.
This class takes raw text from LLM responses and transforms it into
the AnalysisResult schema, ensuring consistency and filtering
inappropriate content.
"""
def __init__(self):
"""Initialize the normalizer with configuration."""
self.terminology_map = settings.output.terminology_map
self.forbidden_phrases = settings.output.forbidden_phrases
self.risk_level_keywords = {
RiskLevel.NONE: ["无风险", "no risk", "不具备", "不存在"],
RiskLevel.LOW: ["低风险", "low risk", "轻微", "可控"],
RiskLevel.MEDIUM: ["中等", "需关注", "medium", "moderate", "关注"],
RiskLevel.HIGH: ["高风险", "high risk", "显著", "严重"],
}
def normalize(
self,
api_name: str,
api_smiles: Optional[str],
excipient_name: str,
raw_outputs: Dict[str, str],
) -> AnalysisResult:
"""
Normalize all raw LLM outputs into a structured AnalysisResult.
Args:
api_name: Name of the API
api_smiles: SMILES notation of the API
excipient_name: Name of the primary excipient
raw_outputs: Dictionary of raw outputs by dimension
Returns:
AnalysisResult: Fully structured and normalized result
"""
# Generate report metadata
report_id = self._generate_report_id()
date = datetime.now().strftime("%Y-%m-%d")
# Normalize each dimension
reactive_groups = self._extract_reactive_groups(
raw_outputs.get("api_structure", "")
)
physicochemical = self._extract_physicochemical(
raw_outputs.get("api_structure", "")
)
excipient_profile = self._extract_excipient_profile(
raw_outputs.get("excipient_analysis", ""),
excipient_name
)
interactions = self._extract_interactions(
raw_outputs.get("compatibility", "")
)
strategies = self._extract_formulation_strategies(
raw_outputs.get("compatibility", "") +
raw_outputs.get("synthesis", "")
)
# Extract uncertainty information
assumptions, limitations = self._extract_uncertainties(
raw_outputs.get("synthesis", "")
)
return AnalysisResult(
report_id=report_id,
date=date,
api_name=api_name,
api_smiles=api_smiles,
excipient_name=excipient_name,
reactive_groups=reactive_groups,
physicochemical=physicochemical,
excipient_profile=excipient_profile,
interactions=interactions,
formulation_strategies=strategies,
assumptions=assumptions,
limitations=limitations,
data_sources=["药典通用知识", "ICH指南原则", "结构活性关系分析"],
)
def normalize_text(self, text: str) -> str:
"""
Normalize a piece of text by applying terminology mapping
and removing forbidden phrases.
Args:
text: Raw text to normalize
Returns:
Normalized text
"""
result = text
# Apply terminology standardization
for original, replacement in self.terminology_map.items():
result = result.replace(original, f"{original} ({replacement})")
# Remove forbidden phrases
for phrase in self.forbidden_phrases:
result = re.sub(
rf'\b{re.escape(phrase)}\b',
'',
result,
flags=re.IGNORECASE
)
# Clean up extra whitespace
result = re.sub(r'\s+', ' ', result).strip()
return result
def _generate_report_id(self) -> str:
"""Generate a unique report ID."""
prefix = settings.report.id_prefix
timestamp = datetime.now().strftime("%Y-%m%d")
# Simple counter - in production would use proper sequence
import random
seq = f"X{random.randint(10, 99)}"
return f"{prefix}-{timestamp}-{seq}"
def _extract_reactive_groups(self, text: str) -> List[ReactiveGroup]:
"""Extract reactive group information from API analysis."""
groups = []
# Pattern-based extraction for common group types
group_patterns = [
(r"伯胺[基团]*\s*\(Primary\s*Amine\)", "伯胺基团", "Primary Amine", PropertyType.BASIC),
(r"仲胺[基团]*\s*\(Secondary\s*Amine\)", "仲胺基团", "Secondary Amine", PropertyType.BASIC),
(r"硫醚[基团]*\s*\(Thioether\)", "硫醚基团", "Thioether", PropertyType.NEUTRAL),
(r"酚羟基\s*\(Phenolic\s*Hydroxyl\)", "酚羟基", "Phenolic Hydroxyl", PropertyType.ACIDIC),
(r"羧基\s*\(Carboxyl\)", "羧基", "Carboxyl Group", PropertyType.ACIDIC),
(r"芳香杂环\s*\(Aromatic\s*Heterocycle", "芳香杂环", "Aromatic Heterocycle", PropertyType.BASIC),
]
for pattern, cn_name, en_name, prop_type in group_patterns:
if re.search(pattern, text, re.IGNORECASE):
# Extract potential reactions for this group
reactions = self._extract_reactions_for_group(cn_name, text)
groups.append(ReactiveGroup(
name=BilingualText(cn=cn_name, en=en_name),
property_type=prop_type,
potential_reactions=reactions,
))
# If no groups found, create a default entry
if not groups:
groups.append(ReactiveGroup(
name=BilingualText(cn="待分析", en="Pending Analysis"),
property_type=PropertyType.NEUTRAL,
potential_reactions=[],
))
return groups
def _extract_reactions_for_group(
self,
group_name: str,
text: str
) -> List[BilingualText]:
"""Extract potential reactions associated with a functional group."""
reactions = []
# Common reaction mappings
reaction_map = {
"伯胺": [
("美拉德反应", "Maillard Reaction"),
("氧化脱氨", "Oxidative Deamination"),
("席夫碱形成", "Schiff Base Formation"),
],
"硫醚": [
("氧化成亚砜", "Oxidation to Sulfoxide"),
("氧化成砜", "Oxidation to Sulfone"),
],
"酚羟基": [
("氧化", "Oxidation"),
("光氧化", "Photooxidation"),
],
}
for key, rxn_list in reaction_map.items():
if key in group_name:
for cn, en in rxn_list:
reactions.append(BilingualText(cn=cn, en=en))
return reactions
def _extract_physicochemical(self, text: str) -> Optional[PhysicochemicalProperties]:
"""Extract physicochemical properties from analysis."""
# Default values - would be parsed from text in full implementation
acidity = BilingualText(cn="碱性", en="Basic")
# Try to extract LogP
logp_match = re.search(r'LogP[:\s~约]*([0-9.]+)', text)
logp = float(logp_match.group(1)) if logp_match else None
# Try to extract H-bond donors/acceptors
hbd_match = re.search(r'氢键供体[:\s]*(\d+)', text)
hba_match = re.search(r'氢键受体[:\s]*(\d+)', text)
return PhysicochemicalProperties(
acidity_basicity=acidity,
logp=logp,
h_bond_donors=int(hbd_match.group(1)) if hbd_match else None,
h_bond_acceptors=int(hba_match.group(1)) if hba_match else None,
risk_profile="含多个碱性氮原子,硫醚对氧化敏感" if "硫醚" in text else None,
)
def _extract_excipient_profile(
self,
text: str,
excipient_name: str
) -> Optional[ExcipientProfile]:
"""Extract excipient profile from analysis."""
# Extract formula if present
formula_match = re.search(r'化学式[:\s]*([A-Za-z0-9₀-₉]+)', text)
formula = formula_match.group(1) if formula_match else None
# Extract key properties
key_properties = []
if "直接压片" in text:
key_properties.append("适合直接压片工艺")
if "低吸湿性" in text or "<1%" in text:
key_properties.append("低吸湿性(<1% at 90% RH)")
if "pH" in text:
key_properties.append("微环境pH约为6.5-7.5")
# Try to extract impurity info
impurity_profile = None
if "Fe" in text or "金属离子" in text:
impurity_profile = ImpurityProfile(
fe_ppm=10.0, # Typical value
mn_ppm=1.0, # Typical value
)
return ExcipientProfile(
name=BilingualText(
cn=excipient_name,
en=self._translate_excipient_name(excipient_name)
),
formula=formula,
key_properties=key_properties,
impurity_profile=impurity_profile,
microenvironment="弱碱性环境" if "碱性" in text else None,
compatibility_notes=self.normalize_text(text[:200]) if text else None,
)
def _translate_excipient_name(self, cn_name: str) -> str:
"""Translate common excipient names to English."""
translations = {
"无水磷酸氢钙": "DCP Anhydrous",
"磷酸氢钙": "Dibasic Calcium Phosphate",
"乳糖": "Lactose",
"微晶纤维素": "Microcrystalline Cellulose",
"硬脂酸镁": "Magnesium Stearate",
"淀粉": "Starch",
"甘露醇": "Mannitol",
}
return translations.get(cn_name, cn_name)
def _extract_interactions(self, text: str) -> List[InteractionMechanism]:
"""Extract interaction mechanisms from compatibility analysis."""
interactions = []
# Define interaction types to look for
interaction_types = [
("美拉德反应", "Maillard Reaction"),
("氧化反应", "Oxidation"),
("酸碱反应", "Acid-Base Interaction"),
("水解反应", "Hydrolysis"),
("吸附作用", "Adsorption"),
]
for cn_name, en_name in interaction_types:
# Check if this interaction type is discussed
if cn_name in text:
# Determine risk level from context
risk = self._determine_risk_level(text, cn_name)
# Extract mechanism analysis
mechanism = self._extract_mechanism_for_type(text, cn_name)
# Extract expert notes
expert_notes = self._extract_expert_notes(text, cn_name)
interactions.append(InteractionMechanism(
reaction_type=BilingualText(cn=cn_name, en=en_name),
risk_level=risk,
mechanism_analysis=mechanism,
expert_notes=expert_notes,
confidence=ConfidenceLevel.MEDIUM,
))
return interactions
def _determine_risk_level(self, text: str, reaction_type: str) -> RiskLevel:
"""Determine risk level for a specific reaction type from text."""
# Find the section about this reaction type
# Simple heuristic: look for risk keywords near the reaction type mention
# Search window around the reaction type
idx = text.find(reaction_type)
if idx == -1:
return RiskLevel.MEDIUM
window = text[max(0, idx-50):min(len(text), idx+200)]
for level, keywords in self.risk_level_keywords.items():
for keyword in keywords:
if keyword in window.lower():
return level
return RiskLevel.MEDIUM
def _extract_mechanism_for_type(self, text: str, reaction_type: str) -> str:
"""Extract mechanism analysis for a specific reaction type."""
# Look for mechanism description after the reaction type
patterns = [
rf'{reaction_type}.*?机制分析[:\s]*([^#]+?)(?=\n\n|\n###|\n##|$)',
rf'{reaction_type}.*?(?:因为|由于|主要是)([^。]+。)',
]
for pattern in patterns:
match = re.search(pattern, text, re.DOTALL)
if match:
return self.normalize_text(match.group(1)[:300])
return "请参阅详细分析报告"
def _extract_expert_notes(self, text: str, reaction_type: str) -> str:
"""Extract expert commentary for a reaction type."""
patterns = [
rf'{reaction_type}.*?专家点评[:\s]*([^#\n]+)',
rf'{reaction_type}.*?建议[:\s]*([^#\n]+)',
]
for pattern in patterns:
match = re.search(pattern, text, re.DOTALL)
if match:
return self.normalize_text(match.group(1)[:200])
return "需结合实验数据进一步评估"
def _extract_formulation_strategies(self, text: str) -> List[FormulationStrategy]:
"""Extract formulation strategy recommendations."""
strategies = []
# Look for numbered recommendations
pattern = r'(\d+)\.\s*\*\*([^*]+)\*\*[:\s]*([^\n]+)'
matches = re.findall(pattern, text)
for _, title, description in matches[:5]: # Limit to 5 strategies
strategies.append(FormulationStrategy(
title=self.normalize_text(title.strip()),
description=self.normalize_text(description.strip()),
))
# Add default strategies if none found
if not strategies:
strategies = [
FormulationStrategy(
title="辅料选择优化",
description="建议选用低金属离子规格辅料",
),
FormulationStrategy(
title="稳定剂考虑",
description="根据风险评估结果考虑添加适当稳定剂",
),
]
return strategies
def _extract_uncertainties(self, text: str) -> tuple:
"""Extract assumptions and limitations from synthesis."""
assumptions = []
limitations = []
# Extract assumptions
assumption_pattern = r'假设[:\s]*([^。\n]+)'
for match in re.finditer(assumption_pattern, text):
assumptions.append(match.group(1).strip())
# Extract limitations
limitation_pattern = r'局限[:\s]*([^。\n]+)'
for match in re.finditer(limitation_pattern, text):
limitations.append(match.group(1).strip())
# Default values if none found
if not assumptions:
assumptions = [
"分析基于SMILES结构推断",
"假设正常制剂工艺条件",
]
if not limitations:
limitations = [
"具体批次数据需COA确认",
"相容性结论需稳定性试验验证",
]
return assumptions, limitations