Kevinshh committed on
Commit
214ccb1
·
verified ·
1 Parent(s): b328949

Upload professional_analyzer.py

Browse files
Files changed (1) hide show
  1. layers/professional_analyzer.py +1126 -0
layers/professional_analyzer.py ADDED
@@ -0,0 +1,1126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Professional Compatibility Analyzer.
3
+
4
+ This module implements the dual-phase analysis approach:
5
+ Phase 1: Deep chemical reasoning (structure-bound analysis)
6
+ Phase 2: Professional report writing (CMC documentation style)
7
+
8
+ This replaces the simple template-filling approach with genuine
9
+ expert-level pharmaceutical analysis.
10
+ """
11
+
12
+ import asyncio
13
+ from typing import Optional, Dict, Any, Tuple
14
+ from datetime import datetime
15
+ import os
16
+
17
+ from prompts.professional_prompts import ProfessionalPrompts
18
+ from layers.model_invoker import ModelInvoker
19
+ from utils.molecule_renderer import MoleculeRenderer
20
+ from utils.data_parser import DataParser
21
+
22
+ from prompts.stability_modeling import StabilityModelingPrompts
23
+
24
+ class ProfessionalAnalyzer:
25
+ """
26
+ Dual-phase professional compatibility analyzer.
27
+
28
+ Phase 1: Deep reasoning - Chemical mechanism analysis
29
+ Phase 2: Report writing - CMC documentation formatting
30
+ """
31
+
32
def __init__(self, model_invoker: Optional[ModelInvoker] = None):
    """
    Set up the analyzer's collaborators.

    Args:
        model_invoker: Invoker used for every model call. When the argument
            is omitted (or is otherwise falsy) a fresh ``ModelInvoker`` is
            constructed instead.
    """
    if model_invoker:
        self.model_invoker = model_invoker
    else:
        self.model_invoker = ModelInvoker()

    # Renderer instance; analyze() reads properties, functional groups and
    # the structure image through it.
    self.renderer = MoleculeRenderer()
41
+
42
def analyze_stability_advanced(
    self,
    goal: str,
    file_paths: list,
    api_info: str = "",
    excipient_info: str = "",
    progress_callback=None,
) -> Dict[str, Any]:
    """
    Advanced stability analysis with scientific modeling (4-phase pipeline).

    The phases are chained model calls, each fed with the previous output:
      1. Data extraction and validation
      2. Kinetics modeling
      3. Arrhenius calculation and trend transfer
      4. Quantitative prediction and uncertainty assessment

    A failure in phases 1-3 aborts the pipeline with ``error`` set. A failure
    in phase 4 keeps ``success`` False but falls back to the phase 3 text as
    ``report_output``.

    Args:
        goal: User's analysis goal.
        file_paths: List of file paths to analyze.
        api_info: Optional API background info. Accepted for signature parity
            with ``analyze_stability``; this pipeline does not use it.
        excipient_info: Optional excipient info. Likewise not used here.
        progress_callback: Optional callable invoked as
            ``progress_callback(fraction, message)``.

    Returns:
        Result dictionary with the keys ``success``, ``error``,
        ``report_output``, ``phase1_data``, ``phase2_kinetics``,
        ``phase3_arrhenius``, ``phase4_predictions``, ``report_id`` and
        ``date``. Exceptions are caught and reported through ``error``.
    """
    result = {
        "success": False,
        "error": None,
        "report_output": "",
        "phase1_data": "",
        "phase2_kinetics": "",
        "phase3_arrhenius": "",
        "phase4_predictions": "",
        "report_id": self._generate_report_id(),
        "date": datetime.now().strftime("%Y-%m-%d"),
    }

    def notify(fraction, message):
        # Progress reporting is optional; do nothing without a callback.
        if progress_callback:
            progress_callback(fraction, message)

    try:
        # Parse data files
        notify(0.05, "正在解析数据文件...")

        data_contents = []
        has_data = False
        for fp in file_paths:
            content = DataParser.parse_file(fp)
            # Track real payload separately: the per-file header below is
            # never blank, so checking the joined text cannot detect files
            # that parsed to nothing.
            if content and str(content).strip():
                has_data = True
            data_contents.append(f"--- File: {os.path.basename(fp)} ---\n{content}\n")

        full_data_content = "\n".join(data_contents)

        if not has_data:
            result["error"] = "未能从上传文件中解析出有效数据"
            return result

        # ==================================================================
        # Phase 1: Data Extraction & Validation
        # ==================================================================
        notify(0.15, "Phase 1/4: 数据提取与验证...")

        sys1, usr1 = StabilityModelingPrompts.get_phase1_prompt(
            document_content=full_data_content,
            analysis_goal=goal
        )

        resp1 = self.model_invoker.invoke(sys1, usr1, temperature=0.2)
        if not resp1.success:
            result["error"] = f"Phase 1 failed: {resp1.error}"
            return result

        result["phase1_data"] = resp1.content

        # ==================================================================
        # Phase 2: Kinetics Modeling
        # ==================================================================
        notify(0.35, "Phase 2/4: 动力学建模 (计算k, R²)...")

        sys2, usr2 = StabilityModelingPrompts.get_phase2_prompt(
            extracted_data=resp1.content,
            analysis_goal=goal
        )

        resp2 = self.model_invoker.invoke(sys2, usr2, temperature=0.1)
        if not resp2.success:
            result["error"] = f"Phase 2 failed: {resp2.error}"
            return result

        result["phase2_kinetics"] = resp2.content

        # ==================================================================
        # Phase 3: Arrhenius & Trend Transfer
        # ==================================================================
        notify(0.60, "Phase 3/4: Arrhenius 计算与趋势迁移...")

        sys3, usr3 = StabilityModelingPrompts.get_phase3_prompt(
            kinetics_results=resp2.content,
            analysis_goal=goal
        )

        resp3 = self.model_invoker.invoke(sys3, usr3, temperature=0.1)
        if not resp3.success:
            result["error"] = f"Phase 3 failed: {resp3.error}"
            return result

        result["phase3_arrhenius"] = resp3.content

        # ==================================================================
        # Phase 4: Prediction & Uncertainty
        # ==================================================================
        notify(0.85, "Phase 4/4: 定量预测与不确定性评估...")

        sys4, usr4 = StabilityModelingPrompts.get_phase4_prompt(
            arrhenius_results=resp3.content,
            # Phase 1 output is forwarded as the original data (the original
            # author's note says it is used for visualization).
            original_data=resp1.content,
            analysis_goal=goal
        )

        resp4 = self.model_invoker.invoke(sys4, usr4, temperature=0.1)
        if not resp4.success:
            result["error"] = f"Phase 4 failed: {resp4.error}"
            # Fall back to the Phase 3 text; success stays False.
            result["report_output"] = resp3.content
        else:
            result["phase4_predictions"] = resp4.content
            result["report_output"] = resp4.content
            result["success"] = True

        notify(1.0, "科学建模完成!")

    except Exception as e:
        # Top-level boundary: report the failure through the result dict
        # instead of propagating it to the caller.
        result["error"] = str(e)
        import traceback
        traceback.print_exc()

    return result
182
+
183
def format_stability_report(
    self,
    analysis_result: Dict[str, Any],
    goal: str,
) -> str:
    """
    Format the stability analysis result into a standalone HTML page.

    Light Markdown conversion is applied to the report text: ``**bold**``
    becomes ``<b>`` and lines that start with 1-6 ``#`` characters become
    ``<h3>`` headings. Every other non-blank line is wrapped in ``<p>``.

    Args:
        analysis_result: Result from analyze_stability() or
            analyze_stability_advanced(). The keys ``report_output``,
            ``report_id`` and ``date`` are read; missing keys fall back to
            an empty report / ``"N/A"``.
        goal: Analysis goal, shown in the metadata bar (HTML-escaped).

    Returns:
        HTML string.
    """
    import re
    from html import escape

    report_text = analysis_result.get("report_output", "") or ""

    # Basic markdown cleaning
    report_text = re.sub(r'\*\*([^*]+)\*\*', r'<b>\1</b>', report_text)
    # Anchor headings to the start of a line so that a '#' in running text
    # (e.g. "Batch #3") is not turned into a heading inside a paragraph.
    report_text = re.sub(
        r'^[ \t]*#{1,6}[ \t]*(.*)$',
        r'<h3>\1</h3>',
        report_text,
        flags=re.MULTILINE,
    )

    # Convert non-blank lines to headings / paragraphs.
    # NOTE(review): the report body itself is inserted as-is (not escaped),
    # as before; escaping it would also neutralize any markup the model
    # emits on purpose. Confirm whether that is desired.
    blocks = []
    for line in report_text.split('\n'):
        line = line.strip()
        if not line:
            continue
        blocks.append(line if line.startswith('<h3>') else f'<p>{line}</p>')
    content_html = "".join(blocks)

    # Metadata values are plain text and must not be able to inject markup.
    safe_report_id = escape(str(analysis_result.get("report_id", "N/A")))
    safe_date = escape(str(analysis_result.get("date", "N/A")))
    safe_goal = escape(str(goal))

    page = f'''<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<style>
body {{ font-family: 'Segoe UI', Arial, sans-serif; line-height: 1.6; color: #333; }}
.header {{ border-bottom: 2px solid #003366; padding-bottom: 10px; margin-bottom: 20px; }}
.title {{ font-size: 24px; color: #003366; font-weight: bold; }}
.meta {{ background: #f5f7fa; padding: 10px; border-radius: 4px; font-size: 12px; margin-bottom: 20px; }}
.content {{ background: white; padding: 20px; border: 1px solid #e0e4e8; border-radius: 8px; }}
h3 {{ color: #0066cc; margin-top: 20px; border-bottom: 1px solid #eee; padding-bottom: 5px; }}
p {{ margin-bottom: 10px; text-align: justify; }}
</style>
</head>
<body>
<div class="header">
<div class="title">药物稳定性研究报告</div>
<div style="color:#666;font-size:14px;">Pharmaceutical Stability Analysis Report</div>
</div>

<div class="meta">
<b>REPORT ID:</b> {safe_report_id} |
<b>DATE:</b> {safe_date} |
<b>GOAL:</b> {safe_goal}
</div>

<div class="content">
{content_html}
</div>

<div style="margin-top:20px;font-size:10px;color:#999;text-align:center;">
Generated by Drug Stability Skill | AI-Assisted Data Analysis
</div>
</body>
</html>'''
    return page
251
+
252
def analyze_stability(
    self,
    goal: str,
    file_paths: list,
    api_info: str = "",
    excipient_info: str = "",
    progress_callback=None,
) -> Dict[str, Any]:
    """
    Run the two-step stability data analysis pipeline.

    Step one asks the model for a statistical assessment of the parsed
    data; step two asks it to write the summary report from that
    assessment. If the report step fails, the statistical assessment is
    returned as ``report_output`` with ``success`` left False and ``error``
    set.

    Args:
        goal: User's analysis goal.
        file_paths: List of file paths to analyze.
        api_info: Optional API background info, forwarded to the prompt.
        excipient_info: Optional excipient info, forwarded to the prompt.
        progress_callback: Optional callable invoked as
            ``progress_callback(fraction, message)``.

    Returns:
        Result dictionary with the keys ``success``, ``error``,
        ``report_output``, ``analysis_summary``, ``data_content``,
        ``report_id`` and ``date``. Exceptions are caught and reported
        through ``error``.
    """
    result = {
        "success": False,
        "error": None,
        "report_output": "",
        "analysis_summary": "",
        "data_content": "",
        "report_id": self._generate_report_id(),
        "date": datetime.now().strftime("%Y-%m-%d"),
    }

    def notify(fraction, message):
        # Progress reporting is optional; do nothing without a callback.
        if progress_callback:
            progress_callback(fraction, message)

    try:
        # Step 1: Parse data files
        notify(0.1, "正在解析上传的数据文件...")

        data_contents = []
        has_data = False
        for fp in file_paths:
            content = DataParser.parse_file(fp)
            # The per-file header below is never blank, so emptiness has to
            # be judged on the parsed payload itself.
            if content and str(content).strip():
                has_data = True
            # os.path.basename handles every separator the platform accepts
            # and matches the sibling advanced pipeline.
            data_contents.append(f"--- File: {os.path.basename(fp)} ---\n{content}\n")

        full_data_content = "\n".join(data_contents)
        result["data_content"] = full_data_content

        if not has_data:
            result["error"] = "未能从上传文件中解析出有效数据"
            return result

        # Step 2: Phase 1 - Statistical analysis
        notify(0.3, "Phase 1: 统计评估与趋势分析...")

        stat_system, stat_user = ProfessionalPrompts.get_stability_data_prompt(
            goal=goal,
            data_content=full_data_content,
            api_info=api_info,
            excipient_info=excipient_info
        )

        stat_response = self.model_invoker.invoke(stat_system, stat_user, temperature=0.3)
        if not stat_response.success:
            result["error"] = f"Statistical analysis failed: {stat_response.error}"
            return result

        result["analysis_summary"] = stat_response.content

        # Step 3: Phase 2 - Report generation
        notify(0.7, "Phase 2: 撰写稳定性总结报告...")

        # Only the first 2000 characters of the data are given as context.
        if len(full_data_content) > 2000:
            data_snippet = full_data_content[:2000] + "..."
        else:
            data_snippet = full_data_content

        rep_system, rep_user = ProfessionalPrompts.get_stability_report_prompt(
            goal=goal,
            analysis_summary=stat_response.content,
            data_snippet=data_snippet
        )

        rep_response = self.model_invoker.invoke(rep_system, rep_user, temperature=0.2)
        if not rep_response.success:
            # Fallback: hand back the statistical assessment; success stays False.
            result["report_output"] = stat_response.content
            result["error"] = f"Report generation failed: {rep_response.error}"
        else:
            result["report_output"] = rep_response.content
            result["success"] = True

        notify(1.0, "分析完成!")

    except Exception as e:
        # Top-level boundary: report the failure through the result dict
        # instead of propagating it to the caller.
        result["error"] = str(e)
        import traceback
        traceback.print_exc()

    return result
348
+
349
def analyze(
    self,
    smiles: str,
    excipient_name: str,
    api_name: Optional[str] = None,
    excipient_type: str = "填充剂",
    excipient_properties: str = "",
    progress_callback=None,
) -> Dict[str, Any]:
    """
    Run the full dual-phase compatibility analysis.

    The renderer first supplies molecular properties, functional groups and
    a structure image. Phase 1 then asks the model for mechanism reasoning,
    and Phase 2 asks it to turn that reasoning into a report. If Phase 2
    fails, the raw reasoning is used as the report, ``error`` is set and
    ``success`` is still reported as True.

    Args:
        smiles: API SMILES notation.
        excipient_name: Excipient name.
        api_name: Optional API name; the first 50 characters of ``smiles``
            are used when it is not provided.
        excipient_type: Type of excipient (filler, binder, etc.).
        excipient_properties: Known properties of the excipient.
        progress_callback: Optional callable invoked as
            ``progress_callback(fraction, message)``.

    Returns:
        Dictionary containing:
            - success / error: outcome flags
            - smiles: the input SMILES
            - reasoning_output: raw reasoning from Phase 1
            - report_output: formatted report from Phase 2
            - structure_image: value returned by the renderer's get_data_uri
            - properties: calculated molecular properties
            - functional_groups / functional_groups_summary
            - report_id: generated report ID
            - date: analysis date
    """
    outcome = {
        "success": False,
        "error": None,
        "smiles": smiles,  # kept so the HTML report can show it
        "reasoning_output": "",
        "report_output": "",
        "structure_image": None,
        "properties": {},
        "functional_groups": [],
        "functional_groups_summary": "",
        "report_id": self._generate_report_id(),
        "date": datetime.now().strftime("%Y-%m-%d"),
    }

    api_name = api_name or smiles[:50]

    def report_progress(fraction, message):
        # Progress reporting is optional; do nothing without a callback.
        if progress_callback:
            progress_callback(fraction, message)

    try:
        # Step 1: molecular properties from the renderer
        report_progress(0.1, "正在解析分子结构...")
        mol_props = self.renderer.calculate_properties(smiles)
        if not mol_props:
            # Placeholder values when the renderer returns nothing usable;
            # outcome["properties"] stays empty in that case.
            mol_props = {"molecular_weight": 0, "logp": 0, "hbd": 0, "hba": 0}
        else:
            outcome["properties"] = mol_props

        # Step 2: functional groups
        report_progress(0.15, "正在识别反应活性基团...")
        outcome["functional_groups"] = self.renderer.identify_functional_groups(smiles)
        summary_text = self.renderer.get_functional_groups_summary(smiles)
        outcome["functional_groups_summary"] = summary_text

        # Step 3: structure image
        report_progress(0.2, "正在生成分子结构图...")
        outcome["structure_image"] = self.renderer.get_data_uri(smiles)

        # Step 4: Phase 1 - deep reasoning
        report_progress(0.3, "Phase 1: 深度机理推理中...")
        phase1_system, phase1_user = ProfessionalPrompts.get_reasoning_prompt(
            smiles=smiles,
            functional_groups=summary_text,
            mw=mol_props.get("molecular_weight", 0),
            logp=mol_props.get("logp", 0),
            hbd=mol_props.get("hbd", 0),
            hba=mol_props.get("hba", 0),
            excipient_name=excipient_name,
            excipient_type=excipient_type,
            excipient_properties=excipient_properties,
        )
        phase1 = self.model_invoker.invoke(phase1_system, phase1_user, temperature=0.3)
        if not phase1.success:
            outcome["error"] = f"Phase 1 failed: {phase1.error}"
            return outcome
        outcome["reasoning_output"] = phase1.content

        # Step 5: Phase 2 - professional report writing
        report_progress(0.7, "Phase 2: 专业报告撰写中...")
        phase2_system, phase2_user = ProfessionalPrompts.get_writing_prompt(
            api_name=api_name,
            excipient_name=excipient_name,
            report_id=outcome["report_id"],
            date=outcome["date"],
            reasoning_content=phase1.content,
        )
        # Lower temperature than Phase 1 for more consistent formatting.
        phase2 = self.model_invoker.invoke(phase2_system, phase2_user, temperature=0.2)
        if phase2.success:
            outcome["report_output"] = phase2.content
        else:
            # The writing phase failed: keep the raw reasoning as the report.
            outcome["report_output"] = phase1.content
            outcome["error"] = f"Phase 2 failed (using raw reasoning): {phase2.error}"

        report_progress(1.0, "分析完成!")
        outcome["success"] = True

    except Exception as exc:
        outcome["error"] = str(exc)
        import traceback
        traceback.print_exc()

    return outcome
488
+
489
+ def _generate_report_id(self) -> str:
490
+ """Generate a unique report ID."""
491
+ import random
492
+ timestamp = datetime.now().strftime("%Y%m%d")
493
+ seq = f"{random.randint(100, 999)}"
494
+ return f"PRE-{timestamp}-{seq}"
495
+
496
def format_html_report(
    self,
    analysis_result: Dict[str, Any],
    api_name: str,
    excipient_name: str,
) -> str:
    """
    Format the analysis result into a professional HTML report.

    The page has a header with report metadata, a first section showing the
    structure image, SMILES, property table, functional-group cards and
    risk matrix, then the sections parsed from the report text.

    Plain-text values (API name, excipient name, SMILES, report id, date and
    the image URI) are HTML-escaped before interpolation. The fragments
    returned by the helper methods are inserted as-is.

    Args:
        analysis_result: Result from the analyze() method. The keys
            ``report_output``, ``structure_image``, ``smiles``,
            ``functional_groups``, ``properties``, ``report_id`` and ``date``
            are read; missing keys fall back to empty values / ``"N/A"``.
        api_name: API name (also the SMILES fallback when the result has none).
        excipient_name: Excipient name.

    Returns:
        HTML string.
    """
    from html import escape

    # Clean the report output - remove any residual Markdown
    report_text = analysis_result.get("report_output", "")
    report_text = self._clean_markdown(report_text)

    # Parse report sections for better formatting
    sections_html = self._parse_report_sections_qbd(report_text)

    structure_img = analysis_result.get("structure_image", "")
    smiles = analysis_result.get("smiles", api_name)

    groups = analysis_result.get("functional_groups", [])
    groups_html = self._build_functional_groups_html(groups)

    props = analysis_result.get("properties", {})
    props_html = self._build_properties_html(props)

    risk_matrix_html = self._build_risk_matrix(groups)

    # Escape everything that is plain text so that characters such as
    # '<', '&' or '"' cannot break (or inject into) the markup.
    safe_api = escape(str(api_name))
    safe_excipient = escape(str(excipient_name))
    safe_smiles = escape(str(smiles))
    safe_report_id = escape(str(analysis_result.get("report_id", "N/A")))
    safe_date = escape(str(analysis_result.get("date", "N/A")))

    if structure_img:
        structure_html = (
            f'<img src="{escape(str(structure_img), quote=True)}" '
            'alt="Molecular Structure" class="structure-image">'
        )
    else:
        structure_html = '<div style="padding:40px;color:#999;">Structure Generating...</div>'

    page = f'''<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>药物-辅料相容性评估报告 | {safe_api}</title>
<style>
:root {{
    --primary-color: #003366; /* Deep pharmaceutical blue */
    --secondary-color: #0066cc;
    --accent-color: #0099ff;
    --text-primary: #333333;
    --text-secondary: #666666;
    --bg-light: #f5f7fa;
    --border-color: #e0e4e8;
    --risk-none: #28a745;
    --risk-low: #17a2b8;
    --risk-medium: #ffc107;
    --risk-high: #dc3545;
}}

* {{ margin: 0; padding: 0; box-sizing: border-box; }}

body {{
    font-family: 'Segoe UI', 'Microsoft YaHei', Arial, sans-serif;
    line-height: 1.6;
    color: var(--text-primary);
    background: #ffffff;
    font-size: 14px; /* Optimized for A4 reading */
}}

/* A4 Page Setup mimicking proper PDF layout */
.report-page {{
    width: 210mm;
    min-height: 297mm;
    margin: 0 auto;
    background: white;
    padding: 20mm;
    box-shadow: 0 0 10px rgba(0,0,0,0.1);
}}

.report-header {{
    border-bottom: 3px solid var(--primary-color);
    padding-bottom: 15px;
    margin-bottom: 25px;
}}

.report-title {{
    font-size: 24px;
    font-weight: bold;
    color: var(--primary-color);
    text-transform: uppercase;
    letter-spacing: 1px;
}}

.report-subtitle {{
    font-size: 14px;
    color: var(--text-secondary);
    margin-top: 5px;
    font-weight: 500;
}}

.report-meta-grid {{
    display: grid;
    grid-template-columns: repeat(4, 1fr);
    gap: 15px;
    margin-top: 15px;
    background: var(--bg-light);
    padding: 10px;
    border-radius: 4px;
    font-size: 12px;
}}

.meta-label {{ font-weight: bold; color: var(--secondary-color); display: block; }}

/* Section Styling */
.section {{ margin-bottom: 25px; page-break-inside: avoid; }}

.section-title {{
    font-size: 16px;
    font-weight: bold;
    color: var(--primary-color);
    margin-bottom: 15px;
    padding-bottom: 5px;
    border-bottom: 2px solid var(--border-color);
    display: flex;
    align-items: center;
}}

.section-number {{
    background: var(--primary-color);
    color: white;
    width: 24px;
    height: 24px;
    border-radius: 50%;
    display: inline-flex;
    align-items: center;
    justify-content: center;
    margin-right: 10px;
    font-size: 12px;
}}

/* QbD Components */
.qbd-grid {{
    display: grid;
    grid-template-columns: 2fr 1fr;
    gap: 20px;
}}

.structure-card {{
    border: 1px solid var(--border-color);
    border-radius: 8px;
    padding: 15px;
    text-align: center;
    background: white;
}}

.structure-image {{ max-width: 100%; max-height: 180px; object-fit: contain; }}

.risk-matrix-container {{
    border: 1px solid var(--border-color);
    border-radius: 8px;
    padding: 15px;
    background: white;
}}

.risk-badge {{
    display: inline-block;
    padding: 3px 8px;
    border-radius: 4px;
    font-size: 11px;
    font-weight: bold;
    color: white;
    margin-right: 5px;
}}
.bg-high {{ background-color: var(--risk-high); }}
.bg-medium {{ background-color: var(--risk-medium); color: #333; }}
.bg-low {{ background-color: var(--risk-low); }}
.bg-none {{ background-color: var(--risk-none); }}

/* Action Checklist Style for Control Strategy */
.action-list {{ list-style: none; }}
.action-item {{
    margin-bottom: 8px;
    padding: 8px 12px;
    border-left: 3px solid transparent;
    background: #fcfcfc;
    border: 1px solid #eee;
    border-radius: 4px;
}}
.action-must {{ border-left-color: var(--risk-high); background: #fff5f5; }}
.action-suggest {{ border-left-color: var(--risk-medium); background: #fffbf0; }}
.action-check {{ border-left-color: var(--risk-low); background: #f0f8ff; }}

.highlight-tag {{
    font-size: 10px;
    text-transform: uppercase;
    padding: 2px 4px;
    border-radius: 2px;
    margin-right: 8px;
    color: white;
    font-weight: bold;
}}

/* Print Optimization */
@media print {{
    body {{ background: white; }}
    .report-page {{
        width: 100%;
        margin: 0;
        padding: 0;
        box-shadow: none;
    }}
    .section {{ break-inside: avoid; }}
    h2 {{ break-after: avoid; }}
}}

.properties-table, .risk-table {{ width: 100%; border-collapse: collapse; font-size: 12px; }}
.properties-table th {{ text-align: left; color: var(--text-secondary); width: 40%; padding: 6px; }}
.properties-table td {{ padding: 6px; font-weight: 500; }}

/* Functional Group Cards - QbD Style */
.group-card {{
    background: #f8f9fa;
    border: 1px solid var(--border-color);
    border-radius: 6px;
    padding: 12px;
    margin-bottom: 10px;
}}
.group-header {{
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 5px;
}}
.group-name {{
    font-weight: 600;
    color: var(--primary-color);
    font-size: 13px;
}}
.group-property {{
    font-size: 10px;
    padding: 2px 6px;
    border-radius: 3px;
    font-weight: 500;
}}
.property-acidic {{ background: #ffe0e0; color: #c62828; }}
.property-basic {{ background: #e0f0ff; color: #1565c0; }}
.property-neutral {{ background: #e8e8e8; color: #666; }}

/* Reaction Tags - Overflow Prevention */
.group-reactions {{
    display: flex;
    flex-wrap: wrap;
    gap: 4px;
    margin-top: 8px;
}}
.reaction-tag {{
    display: inline-block;
    background: var(--secondary-color);
    color: white;
    padding: 2px 8px;
    border-radius: 3px;
    font-size: 10px;
    white-space: nowrap;
}}

/* Section Content - Prevent Overflow */
.section-content {{
    word-wrap: break-word;
    overflow-wrap: break-word;
}}

p {{ margin-bottom: 10px; text-align: justify; word-wrap: break-word; }}
</style>
</head>
<body>
<div class="report-page">
<header class="report-header">
<h1 class="report-title">药物-辅料相容性评估报告</h1>
<p class="report-subtitle">Pharmaceutical Product Development - Compatibility Assessment Report</p>
<div class="report-meta-grid">
<div><span class="meta-label">REPORT ID</span>{safe_report_id}</div>
<div><span class="meta-label">DATE</span>{safe_date}</div>
<div><span class="meta-label">API</span>{safe_api}</div>
<div><span class="meta-label">EXCIPIENT</span>{safe_excipient}</div>
</div>
</header>

<!-- Section 1: Molecule & Risks (QbD Visuals) -->
<section class="section">
<div class="section-title">
<span class="section-number">01</span>
API结构特征与关键质量属性 (CQA Analysis)
</div>

<div class="qbd-grid">
<!-- Left: Structure & Properties -->
<div class="structure-card">
{structure_html}
<div style="font-family:monospace;font-size:10px;color:#666;margin-top:10px;word-break:break-all;">{safe_smiles}</div>
<div style="margin-top:15px;text-align:left;">
{props_html}
</div>
</div>

<!-- Right: Risk Matrix Summary -->
<div class="risk-matrix-container">
<h4 style="color:var(--primary-color);margin-bottom:10px;border-bottom:1px solid #eee;padding-bottom:5px;">反应活性概览</h4>
{groups_html}
{risk_matrix_html}
</div>
</div>
</section>

<!-- Sections parsed from the report text -->
{sections_html}

<footer style="margin-top:40px;border-top:1px solid #eee;padding-top:10px;font-size:10px;color:#999;text-align:center;">
Confidential - Pharmaceutical R&amp;D Use Only | Generated by Drug Stability Skill
</footer>
</div>
</body>
</html>'''
    return page
825
+
826
+ def _build_functional_groups_html(self, groups: list) -> str:
827
+ """Build HTML for functional groups with proper reaction tags."""
828
+ if not groups:
829
+ return '<div class="group-card"><div class="group-name">未检测到特征官能团</div></div>'
830
+
831
+ html_parts = []
832
+ for g in groups:
833
+ # Determine property class
834
+ prop_type = g.get("property_type", "中性")
835
+ if "酸" in prop_type:
836
+ prop_class = "property-acidic"
837
+ elif "碱" in prop_type:
838
+ prop_class = "property-basic"
839
+ else:
840
+ prop_class = "property-neutral"
841
+
842
+ # Build reaction tags
843
+ reactions = g.get("potential_reactions", [])
844
+ if reactions:
845
+ reactions_html = "".join([
846
+ f'<span class="reaction-tag">{r}</span>' for r in reactions
847
+ ])
848
+ else:
849
+ reactions_html = '<span style="color:#999;font-size:11px;">暂无特定反应风险</span>'
850
+
851
+ count_badge = f' ×{g["count"]}' if g.get("count", 1) > 1 else ""
852
+
853
+ html_parts.append(f'''
854
+ <div class="group-card">
855
+ <div class="group-header">
856
+ <span class="group-name">{g["name_cn"]}{count_badge}</span>
857
+ <span class="group-property {prop_class}">{prop_type}</span>
858
+ </div>
859
+ <div style="font-size:12px;color:#666;margin-bottom:8px;">({g["name_en"]})</div>
860
+ <div class="group-reactions">{reactions_html}</div>
861
+ </div>
862
+ ''')
863
+
864
+ return "".join(html_parts)
865
+
866
+ def _build_properties_html(self, props: dict) -> str:
867
+ """Build enhanced properties table HTML."""
868
+ mw = props.get("molecular_weight", "-")
869
+ logp = props.get("logp", "-")
870
+ hbd = props.get("hbd", "-")
871
+ hba = props.get("hba", "-")
872
+ tpsa = props.get("tpsa", "-")
873
+
874
+ # Add lipophilicity assessment
875
+ lipophilicity = "-"
876
+ if isinstance(logp, (int, float)):
877
+ if logp < 1:
878
+ lipophilicity = "亲水性"
879
+ elif logp < 3:
880
+ lipophilicity = "中等亲脂性"
881
+ elif logp < 5:
882
+ lipophilicity = "亲脂性"
883
+ else:
884
+ lipophilicity = "高亲脂性"
885
+
886
+ return f'''
887
+ <table class="properties-table">
888
+ <tr><th>分子量 (MW)</th><td>{mw} g/mol</td></tr>
889
+ <tr><th>LogP</th><td>{logp} ({lipophilicity})</td></tr>
890
+ <tr><th>氢键供体 (HBD)</th><td>{hbd}</td></tr>
891
+ <tr><th>氢键受体 (HBA)</th><td>{hba}</td></tr>
892
+ <tr><th>TPSA</th><td>{tpsa} Ų</td></tr>
893
+ </table>
894
+ '''
895
+
896
+ def _parse_report_sections_qbd(self, text: str) -> str:
897
+ """Parse report text into QbD-styled sections with action checklists."""
898
+ import re
899
+ sections = re.split(r'\n(?=[一二三四五六七八九十]、)', text)
900
+ html_parts = []
901
+
902
+ for section in sections:
903
+ section = section.strip()
904
+ if not section: continue
905
+
906
+ lines = section.split('\n', 1)
907
+ title_raw = lines[0].strip()
908
+ content = lines[1].strip() if len(lines) > 1 else ""
909
+
910
+ # Extract section number (Chinese)
911
+ match = re.match(r'([一二三四��六七八九十])、(.*)', title_raw)
912
+ if match:
913
+ sec_num_cn = match.group(1)
914
+ title = match.group(2)
915
+ # Map Chinese numeral to int for display
916
+ cn_map = {'一':1, '二':2, '三':3, '四':4, '五':5, '六':6}
917
+ sec_num = f"{cn_map.get(sec_num_cn, 0):02d}"
918
+ else:
919
+ sec_num = "00"
920
+ title = title_raw
921
+
922
+ # Special handling for "Control Strategy" section
923
+ if "控制" in title or "策略" in title or "建议" in title:
924
+ content_html = self._format_control_strategy(content)
925
+ else:
926
+ # Normal paragraph formatting
927
+ paras = [p.strip() for p in content.split('\n') if p.strip()]
928
+ content_html = ""
929
+ for p in paras:
930
+ if re.match(r'^[0-9]+\.[0-9]+', p):
931
+ content_html += f'<h4 style="color:var(--secondary-color);margin:12px 0 5px 0;font-size:13px;">{p}</h4>'
932
+ else:
933
+ content_html += f'<p>{p}</p>'
934
+
935
+ html_parts.append(f'''
936
+ <section class="section">
937
+ <div class="section-title">
938
+ <span class="section-number">{sec_num}</span>
939
+ {title}
940
+ </div>
941
+ <div class="section-content">
942
+ {content_html}
943
+ </div>
944
+ </section>
945
+ ''')
946
+
947
+ return "".join(html_parts)
948
+
949
+ def _format_control_strategy(self, text: str) -> str:
950
+ """Format control strategy as an action checklist."""
951
+ lines = [l.strip() for l in text.split('\n') if l.strip()]
952
+ html = '<ul class="action-list">'
953
+
954
+ for line in lines:
955
+ if line.startswith('[必须]'):
956
+ cls = 'action-must'
957
+ tag = '<span class="highlight-tag bg-high">必须 (Critical)</span>'
958
+ content = line.replace('[必须]', '').strip()
959
+ elif line.startswith('[建议]'):
960
+ cls = 'action-suggest'
961
+ tag = '<span class="highlight-tag bg-medium">建议 (Recommended)</span>'
962
+ content = line.replace('[建议]', '').strip()
963
+ elif line.startswith('[考察]'):
964
+ cls = 'action-check'
965
+ tag = '<span class="highlight-tag bg-low">考察 (Investigation)</span>'
966
+ content = line.replace('[考察]', '').strip()
967
+ else:
968
+ cls = 'action-item'
969
+ tag = ''
970
+ content = line
971
+
972
+ html += f'<li class="action-item {cls}">{tag}{content}</li>'
973
+
974
+ html += '</ul>'
975
+ return html
976
+
977
+ def _build_risk_matrix(self, groups: list) -> str:
978
+ """Build a comprehensive QbD-style visual risk assessment matrix."""
979
+ # Categorize risks by type
980
+ risk_summary = {
981
+ "oxidation": {"count": 0, "severity": "medium", "label": "氧化风险"},
982
+ "hydrolysis": {"count": 0, "severity": "medium", "label": "水解风险"},
983
+ "maillard": {"count": 0, "severity": "high", "label": "美拉德反应"},
984
+ "acid_base": {"count": 0, "severity": "low", "label": "酸碱反应"},
985
+ "adsorption": {"count": 0, "severity": "low", "label": "吸附作用"},
986
+ }
987
+
988
+ for g in groups:
989
+ for r in g.get("potential_reactions", []):
990
+ r_lower = r.lower()
991
+ if "氧化" in r or "oxidation" in r_lower:
992
+ risk_summary["oxidation"]["count"] += 1
993
+ if "水解" in r or "hydrolysis" in r_lower:
994
+ risk_summary["hydrolysis"]["count"] += 1
995
+ if "美拉德" in r or "maillard" in r_lower or "schiff" in r_lower:
996
+ risk_summary["maillard"]["count"] += 1
997
+ if "酸" in r or "碱" in r or "acid" in r_lower or "base" in r_lower:
998
+ risk_summary["acid_base"]["count"] += 1
999
+ if "吸附" in r or "adsorption" in r_lower:
1000
+ risk_summary["adsorption"]["count"] += 1
1001
+
1002
+ # Build risk bars
1003
+ risk_bars_html = ""
1004
+ for key, data in risk_summary.items():
1005
+ if data["count"] > 0:
1006
+ # Determine color based on severity
1007
+ if data["severity"] == "high":
1008
+ color = "var(--risk-high)"
1009
+ width = min(data["count"] * 40, 100)
1010
+ elif data["severity"] == "medium":
1011
+ color = "var(--risk-medium)"
1012
+ width = min(data["count"] * 30, 100)
1013
+ else:
1014
+ color = "var(--risk-low)"
1015
+ width = min(data["count"] * 20, 100)
1016
+
1017
+ risk_bars_html += f'''
1018
+ <div style="margin-bottom:8px;">
1019
+ <div style="display:flex;justify-content:space-between;font-size:11px;margin-bottom:3px;">
1020
+ <span>{data["label"]}</span>
1021
+ <span style="color:{color};font-weight:bold;">×{data["count"]}</span>
1022
+ </div>
1023
+ <div style="background:#eee;height:6px;border-radius:3px;overflow:hidden;">
1024
+ <div style="width:{width}%;background:{color};height:100%;"></div>
1025
+ </div>
1026
+ </div>
1027
+ '''
1028
+
1029
+ # Calculate overall risk score
1030
+ total_risks = sum(d["count"] for d in risk_summary.values())
1031
+ high_risk_count = sum(d["count"] for d in risk_summary.values() if d["severity"] == "high" and d["count"] > 0)
1032
+
1033
+ if high_risk_count > 0:
1034
+ overall_risk = "HIGH"
1035
+ risk_color = "var(--risk-high)"
1036
+ risk_icon = "⚠️"
1037
+ elif total_risks > 2:
1038
+ overall_risk = "MEDIUM"
1039
+ risk_color = "var(--risk-medium)"
1040
+ risk_icon = "⚡"
1041
+ elif total_risks > 0:
1042
+ overall_risk = "LOW"
1043
+ risk_color = "var(--risk-low)"
1044
+ risk_icon = "✓"
1045
+ else:
1046
+ overall_risk = "MINIMAL"
1047
+ risk_color = "var(--risk-none)"
1048
+ risk_icon = "✓"
1049
+
1050
+ return f'''
1051
+ <div style="margin-top:15px;">
1052
+ <div style="background:{risk_color};color:white;padding:8px 12px;border-radius:4px;margin-bottom:12px;text-align:center;">
1053
+ <span style="font-size:16px;">{risk_icon}</span>
1054
+ <span style="font-weight:bold;margin-left:5px;">综合风险等级: {overall_risk}</span>
1055
+ </div>
1056
+
1057
+ <div style="font-size:11px;color:#666;margin-bottom:10px;">
1058
+ <div style="display:flex;justify-content:space-between;margin-bottom:3px;">
1059
+ <span>检测到活性基团:</span> <b>{len(groups)}</b>
1060
+ </div>
1061
+ <div style="display:flex;justify-content:space-between;">
1062
+ <span>潜在反应类型:</span> <b>{total_risks}</b>
1063
+ </div>
1064
+ </div>
1065
+
1066
+ <div style="border-top:1px solid #eee;padding-top:10px;">
1067
+ <div style="font-size:11px;font-weight:bold;color:var(--primary-color);margin-bottom:8px;">风险分布</div>
1068
+ {risk_bars_html if risk_bars_html else '<div style="color:#999;font-size:11px;">暂无高风险反应</div>'}
1069
+ </div>
1070
+ </div>
1071
+ '''
1072
+
1073
+ def _clean_markdown(self, text: str) -> str:
1074
+ """Remove any residual Markdown formatting from text."""
1075
+ import re
1076
+
1077
+ # Remove bold markers (** and __)
1078
+ text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
1079
+ text = re.sub(r'__([^_]+)__', r'\1', text)
1080
+
1081
+ # Remove italic markers (* and _) - be careful not to break chemistry notation
1082
+ text = re.sub(r'(?<!\w)\*([^*\n]+)\*(?!\w)', r'\1', text)
1083
+ text = re.sub(r'(?<!\w)_([^_\n]+)_(?!\w)', r'\1', text)
1084
+
1085
+ # Remove strikethrough
1086
+ text = re.sub(r'~~([^~]+)~~', r'\1', text)
1087
+
1088
+ # Remove headers (but preserve Chinese numeral headers)
1089
+ text = re.sub(r'^#{1,6}\s*', '', text, flags=re.MULTILINE)
1090
+
1091
+ # Remove list markers (-, *, +) at start of line
1092
+ text = re.sub(r'^\s*[-*+]\s+', '• ', text, flags=re.MULTILINE)
1093
+
1094
+ # Remove numbered list markers and replace with proper formatting
1095
+ text = re.sub(r'^\s*(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)
1096
+
1097
+ # Remove inline code backticks
1098
+ text = re.sub(r'`([^`]+)`', r'\1', text)
1099
+
1100
+ # Remove blockquotes
1101
+ text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
1102
+
1103
+ # Remove horizontal rules
1104
+ text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
1105
+
1106
+ # Remove link formatting [text](url) -> text
1107
+ text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
1108
+
1109
+ # Remove image formatting ![alt](url)
1110
+ text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', '', text)
1111
+
1112
+ # Clean up any remaining stray asterisks that are formatting (not chemistry)
1113
+ # This targets isolated asterisks not part of chemical structures
1114
+ text = re.sub(r'(?<![a-zA-Z])\*(?![a-zA-Z*])', '', text)
1115
+
1116
+ # Clean up multiple newlines
1117
+ text = re.sub(r'\n{3,}', '\n\n', text)
1118
+
1119
+ # Clean up extra spaces
1120
+ text = re.sub(r' +', ' ', text)
1121
+
1122
+ return text.strip()
1123
+
1124
+
1125
+ # Create the shared module-level ProfessionalAnalyzer instance (constructed once, when this module is imported)
1126
+ professional_analyzer = ProfessionalAnalyzer()