manpreet88 committed on
Commit
320fafc
·
1 Parent(s): f55ed04

Create gradio_interface.py

Browse files
Files changed (1) hide show
  1. PolyAgent/gradio_interface.py +1417 -0
PolyAgent/gradio_interface.py ADDED
@@ -0,0 +1,1417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+ from typing import Any, Dict, List, Tuple, Optional
8
+ from urllib.parse import urlparse
9
+
10
+ # Load .env if present so OPENAI_API_KEY/OPENAI_MODEL are available
11
+ try:
12
+ from dotenv import load_dotenv
13
+
14
+ load_dotenv()
15
+ except Exception:
16
+ pass
17
+
18
+ import gradio as gr
19
+
20
+ try:
21
+ from orchestrator_updated import PolymerOrchestrator, OrchestratorConfig
22
+ except Exception as e:
23
+ raise ImportError(
24
+ "Could not import PolymerOrchestrator from orchestrator_updated.py. "
25
+ "Ensure the updated orchestrator file is present. "
26
+ f"Original error: {e}"
27
+ )
28
+
29
+
30
+ # =============================================================================
31
+ # DOI NORMALIZATION HELPERS (UI layer must match orchestrator behavior)
32
+ # =============================================================================
33
_DOI_RE = re.compile(r"^10\.\d{4,9}/\S+$", re.IGNORECASE)

def normalize_doi(raw: str) -> Optional[str]:
    """
    Normalize a free-form DOI string to the bare '10.xxxx/...' form.

    Strips a leading doi.org / dx.doi.org resolver URL or a 'doi:' label and
    trailing punctuation commonly glued on by prose, then validates against
    the DOI pattern. Returns None for non-strings or non-DOI text.
    """
    if not isinstance(raw, str):
        return None
    candidate = raw.strip()
    if not candidate:
        return None
    # Peel off common prefixes: full resolver URL, then a "doi:" label.
    candidate = re.sub(r"^(?:https?://(?:dx\.)?doi\.org/)", "", candidate, flags=re.IGNORECASE)
    candidate = re.sub(r"^doi:\s*", "", candidate, flags=re.IGNORECASE)
    # Drop trailing punctuation that often trails DOIs in running text.
    candidate = candidate.rstrip(").,;]}")
    if _DOI_RE.match(candidate):
        return candidate
    return None
45
+
46
def doi_to_url(doi: str) -> str:
    """Return the canonical https://doi.org resolver URL for a bare DOI."""
    return "https://doi.org/" + doi
48
+
49
+ # -----------------------------------------------------------------------------
50
+ # Console defaults (no UI controls for these)
51
+ # -----------------------------------------------------------------------------
52
+ DEFAULT_CASE_BRIEF = (
53
+ "We are developing a polymer film for high-barrier flexible packaging (food-contact). "
54
+ "We need improved oxygen and water-vapor barrier while maintaining practical melt-processability "
55
+ "(film extrusion/cast). Please use web_search to ground your recommendations in recent literature "
56
+ "(last 5–10 years) on barrier improvement strategies (e.g., copolymerization, aromatic content, "
57
+ "rigid side groups, crystallinity control, chain stiffness, and compatibilization). "
58
+ "Constraints: avoid halogens; prioritize monomers with existing commercial suppliers; "
59
+ "avoid overly brittle formulations."
60
+ )
61
+
62
+ DEFAULT_PROPERTY_NAME = "glass transition"
63
+ DEFAULT_SEED_PSMILES = "[*]CC(=O)OCCOCCOC(=O)C[*]"
64
+ DEFAULT_LITERATURE_QUERY = (
65
+ "high barrier flexible packaging polyester copolymer Tg tuning oxygen permeability water vapor "
66
+ "rigid aromatic units side groups 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025"
67
+ )
68
+ DEFAULT_TARGET_VALUE = 60.0
69
+ DEFAULT_NUM_GEN_SAMPLES = 6
70
+ DEFAULT_FETCH_TOP_N = 3
71
+
72
+ # Increased to help ensure >=10 citations in typical runs
73
+ DEFAULT_SEARCH_ROWS = 12
74
+
75
# Property-specific fallback targets (ONLY used when generation is requested but target not found in questions)
# Keys must match the canonical property names produced by
# _infer_property_from_questions(); values are example placeholders, not
# validated literature defaults.
DEFAULT_TARGET_BY_PROPERTY = {
    "glass transition": 60.0,  # °C (example placeholder)
    "density": 1.20,  # g/cm^3 (example placeholder)
    "melting": 150.0,  # °C (example placeholder)
    "specific volume": 0.85,  # cm^3/g (example placeholder)
    "thermal decomposition": 350.0,  # °C (example placeholder)
}
83
+
84
+ # -----------------------------------------------------------------------------
85
+ # NEW: Run instructions bubble (shown on load and retained)
86
+ # -----------------------------------------------------------------------------
87
+ RUN_INSTRUCTIONS_MD = (
88
+ "### How to run PolyAgent (one-time setup)\n"
89
+ "\n"
90
+ "**1) Environment**\n"
91
+ "- Activate your conda/venv where this repository is installed.\n"
92
+ "- Install required Python packages (project-specific; example):\n"
93
+ " - `pip install gradio torch transformers numpy joblib sentencepiece requests beautifulsoup4`\n"
94
+ " - If you want PNG visuals and RDKit validation: install RDKit (recommended via conda-forge).\n"
95
+ "\n"
96
+ "**2) Required model/artifact paths**\n"
97
+ "- Ensure these paths exist and contain the expected artifacts (as configured in `OrchestratorConfig`):\n"
98
+ " - `cl_weights_path` (CL encoder weights)\n"
99
+ " - `DOWNSTREAM_BESTWEIGHTS_5M_DIR` (property heads)\n"
100
+ " - `INVERSE_DESIGN_5M_DIR` (inverse design generator bundles)\n"
101
+ " - `spm_5M.model` (SentencePiece model)\n"
102
+ "\n"
103
+ "**3) Required environment variables**\n"
104
+ "- `OPENAI_API_KEY` (required for planning + composed answers)\n"
105
+ "- Optional:\n"
106
+ " - `OPENAI_MODEL` (defaults to `gpt-4.1` in config)\n"
107
+ " - `HF_TOKEN` (recommended for `materials.selfies-ted` model downloads)\n"
108
+ " - `SPRINGER_NATURE_API_KEY`, `SEMANTIC_SCHOLAR_API_KEY` (improves web_search coverage)\n"
109
+ "\n"
110
+ "**4) Start the interface**\n"
111
+ "- Run:\n"
112
+ " - `python gradio_interface_for_polymer_orchestrator.py --server-name 0.0.0.0 --server-port 7860`\n"
113
+ "\n"
114
+ "**5) How to prompt in the Console**\n"
115
+ "- To trigger inverse design: include a generation intent (e.g., “generate”, “inverse design”) and a target value.\n"
116
+ "- You can specify `target_value` in text (examples): `target_value=60`, `target: 60`, `Tg 60`.\n"
117
+ "- To seed with a polymer, include a pSMILES in a code block or via `seed_psmiles:`.\n"
118
+ "- To control citation count: ask explicitly (e.g., “cite 10 papers”).\n"
119
+ "\n"
120
+ "**Notes**\n"
121
+ "- Tool facts are cited as `[T]`.\n"
122
+ "- Literature/web/RAG citations appear inline as clickable DOI links (e.g., `[https://doi.org/...](https://doi.org/...)`) next to the claim.\n"
123
+ )
124
+
125
def pretty_json(x: Any) -> str:
    """Render *x* as 2-space-indented JSON; fall back to str() when not serializable."""
    try:
        rendered = json.dumps(x, indent=2, ensure_ascii=False)
    except Exception:
        # Non-JSON-serializable values (sets, objects, ...) degrade to repr-ish text.
        rendered = str(x)
    return rendered
130
+
131
+
132
+ # -----------------------------------------------------------------------------
133
+ # Display normalization (MINIMAL): convert bracketed [At] endpoints to [*]
134
+ # -----------------------------------------------------------------------------
135
+ _AT_BRACKET_RE = re.compile(r"\[(at)\]", flags=re.IGNORECASE)
136
+
137
+
138
+ def _convert_at_to_star(psmiles: str) -> str:
139
+ """
140
+ Minimal, display-only conversion:
141
+ - "[At]" / "[AT]" / ... -> "[*]"
142
+ """
143
+ if not isinstance(psmiles, str) or not psmiles:
144
+ return psmiles
145
+ return _AT_BRACKET_RE.sub("[*]", psmiles)
146
+
147
+
148
def _normalize_seed_inputs_for_display(obj: Any) -> Any:
    """
    Recursively normalize pSMILES-like strings for display by rewriting
    bracketed '[At]' endpoint tokens to '[*]'.

    Only strings that plausibly contain an '[At]'-style token are touched,
    so tool-returned strings without such tokens are preserved exactly.

    Fix vs. the original: the dict branch special-cased the keys
    ('psmiles', 'seed_psmiles', 'seed_psmiles_used', 'canonical_psmiles'),
    but both the if- and else-arm performed the identical recursive call —
    the dead distinction has been removed; behavior is unchanged.
    """
    if isinstance(obj, str):
        # Cheap pre-filter before running the regex substitution.
        if "[" in obj and "]" in obj and ("At" in obj or "AT" in obj or "at" in obj):
            return _convert_at_to_star(obj)
        return obj

    if isinstance(obj, list):
        return [_normalize_seed_inputs_for_display(x) for x in obj]

    if isinstance(obj, dict):
        return {k: _normalize_seed_inputs_for_display(v) for k, v in obj.items()}

    return obj
171
+
172
+
173
+ # -----------------------------------------------------------------------------
174
+ # Markdown safety: keep polymer endpoint token "[*]" from being rendered as "[]"
175
+ # -----------------------------------------------------------------------------
176
+ # The regex below matches the literal endpoint token "[*]".
177
+ _ENDPOINT_TOKEN_RE = re.compile(r"\[\*\]")
178
+
179
+ def _escape_endpoint_tokens_for_markdown(text: str) -> str:
180
+ """
181
+ Escape '[*]' ONLY outside code blocks and inline code.
182
+ This avoids turning '[*]' into '[\\*]' inside ```...``` where the backslash would show.
183
+ """
184
+ if not isinstance(text, str) or not text:
185
+ return text
186
+
187
+ # Split by fenced code blocks, keep delimiters
188
+ parts = re.split(r"(```[\s\S]*?```)", text)
189
+ out_parts = []
190
+
191
+ for part in parts:
192
+ # If this is a fenced code block, leave untouched
193
+ if part.startswith("```") and part.endswith("```"):
194
+ out_parts.append(part)
195
+ continue
196
+
197
+ # Split by inline code, keep delimiters
198
+ subparts = re.split(r"(`[^`]*`)", part)
199
+ for i, sp in enumerate(subparts):
200
+ if sp.startswith("`") and sp.endswith("`"):
201
+ continue
202
+ subparts[i] = _ENDPOINT_TOKEN_RE.sub(r"[\\*]", sp)
203
+
204
+ out_parts.append("".join(subparts))
205
+
206
+ return "".join(out_parts)
207
+
208
+ # -----------------------------------------------------------------------------
209
+ # NEW: Auto-detect property / target_value / seed from Questions (NO GUI CHANGES)
210
+ # -----------------------------------------------------------------------------
211
+ _NUM_RE = r"[-+]?\d+(?:\.\d+)?"
212
+
213
+
214
+ def _infer_property_from_questions(q: str) -> Optional[str]:
215
+ """
216
+ Infer canonical property name from free-text questions.
217
+ Canonical keys must match orchestrator's PROPERTY_HEAD_PATHS/GENERATOR_DIRS keys.
218
+ """
219
+ s = (q or "").lower()
220
+
221
+ # Allow explicit "property:" forms
222
+ m = re.search(r"\bproperty\b\s*[:=]\s*([a-zA-Z _-]+)", s)
223
+ if m:
224
+ cand = m.group(1).strip().lower()
225
+ # map common variants
226
+ if "glass" in cand or re.search(r"\btg\b", cand):
227
+ return "glass transition"
228
+ if "density" in cand or re.search(r"\brho\b", cand):
229
+ return "density"
230
+ if "melting" in cand or re.search(r"\btm\b", cand):
231
+ return "melting"
232
+ if "specific" in cand or re.search(r"\bsv\b", cand):
233
+ return "specific volume"
234
+ if "decomp" in cand or "decomposition" in cand or re.search(r"\btd\b", cand):
235
+ return "thermal decomposition"
236
+
237
+ # Token-based inference
238
+ if "thermal decomposition" in s or "decomposition temperature" in s or "decomposition" in s or re.search(r"\btd\b", s):
239
+ return "thermal decomposition"
240
+ if "specific volume" in s or re.search(r"\bsv\b", s):
241
+ return "specific volume"
242
+ if "glass transition" in s or "glass-transition" in s or re.search(r"\btg\b", s):
243
+ return "glass transition"
244
+ if "melting" in s or "melt temperature" in s or re.search(r"\btm\b", s):
245
+ return "melting"
246
+ if "density" in s or re.search(r"\brho\b", s):
247
+ return "density"
248
+
249
+ return None
250
+
251
+
252
+ def _infer_target_value_from_questions(q: str, prop: Optional[str]) -> Optional[float]:
253
+ """
254
+ Infer numeric target_value from free-text questions.
255
+ - supports explicit: target_value=..., target: ..., tgt ...
256
+ - supports property-attached: Tg 60, density 1.25, Td=380, sv 0.85, Tm 180
257
+ """
258
+ sl = (q or "").lower()
259
+
260
+ # Explicit
261
+ m = re.search(rf"\b(target_value|target|tgt)\b\s*[:=]?\s*({_NUM_RE})", sl)
262
+ if m:
263
+ try:
264
+ return float(m.group(2))
265
+ except Exception:
266
+ pass
267
+
268
+ prop = (prop or "").strip().lower()
269
+ prop_patterns: List[str] = []
270
+
271
+ if prop == "glass transition":
272
+ prop_patterns = [rf"\b(tg|glass\s*transition)\b\s*[:=]?\s*({_NUM_RE})"]
273
+ elif prop == "density":
274
+ prop_patterns = [rf"\b(density|rho)\b\s*[:=]?\s*({_NUM_RE})"]
275
+ elif prop == "melting":
276
+ prop_patterns = [rf"\b(tm|melting)\b\s*[:=]?\s*({_NUM_RE})"]
277
+ elif prop == "specific volume":
278
+ prop_patterns = [rf"\b(specific\s*volume|sv)\b\s*[:=]?\s*({_NUM_RE})"]
279
+ elif prop == "thermal decomposition":
280
+ prop_patterns = [rf"\b(td|thermal\s*decomposition|decomposition)\b\s*[:=]?\s*({_NUM_RE})"]
281
+
282
+ for pat in prop_patterns:
283
+ m = re.search(pat, sl)
284
+ if m:
285
+ try:
286
+ return float(m.group(m.lastindex))
287
+ except Exception:
288
+ pass
289
+
290
+ # Token-near-number fallback: pick first number within 80 chars after property token
291
+ tokens: List[str] = []
292
+ if prop == "glass transition":
293
+ tokens = ["tg", "glass transition"]
294
+ elif prop == "density":
295
+ tokens = ["density", "rho"]
296
+ elif prop == "melting":
297
+ tokens = ["tm", "melting"]
298
+ elif prop == "specific volume":
299
+ tokens = ["specific volume", "sv"]
300
+ elif prop == "thermal decomposition":
301
+ tokens = ["td", "thermal decomposition", "decomposition"]
302
+
303
+ for tok in tokens:
304
+ for mt in re.finditer(re.escape(tok), sl):
305
+ window = sl[mt.end():mt.end() + 80]
306
+ mn = re.search(rf"({_NUM_RE})", window)
307
+ if mn:
308
+ try:
309
+ return float(mn.group(1))
310
+ except Exception:
311
+ pass
312
+
313
+ return None
314
+
315
+
316
+ def _infer_generate_intent(q: str) -> bool:
317
+ """
318
+ Decide if the user is asking for inverse design / generation.
319
+ Conservative: only true when generation-ish verbs appear.
320
+ """
321
+ s = (q or "").lower()
322
+ triggers = [
323
+ "generate",
324
+ "inverse design",
325
+ "inverse-design",
326
+ "design candidates",
327
+ "propose candidates",
328
+ "suggest candidates",
329
+ "design polymer",
330
+ "design polymers",
331
+ "synthesize candidates",
332
+ "optimize",
333
+ ]
334
+ return any(t in s for t in triggers)
335
+
336
+
337
+ def _infer_seed_psmiles_from_questions(q: str) -> Optional[str]:
338
+ """
339
+ Best-effort extraction of seed pSMILES from the Questions text without GUI changes.
340
+ Supports:
341
+ - seed_psmiles: <token>
342
+ - psmiles=...
343
+ - smiles=...
344
+ - code block containing a single pSMILES/SMILES line
345
+ """
346
+ text = (q or "").strip()
347
+ if not text:
348
+ return None
349
+
350
+ # 1) Prefer code block content (first non-empty line)
351
+ code_blocks = re.findall(r"```(?:\w+)?\s*([\s\S]*?)```", text)
352
+ for block in code_blocks:
353
+ for line in (block or "").splitlines():
354
+ line = line.strip()
355
+ if not line:
356
+ continue
357
+ # Heuristic: polymer pSMILES often includes [*] or [At]
358
+ if "[*]" in line or "[At]" in line or "[AT]" in line or "*" in line or "[" in line:
359
+ return line
360
+
361
+ # 2) Keyed patterns
362
+ m = re.search(r"(seed_psmiles|seed|psmiles|smiles)\s*[:=]\s*([^\s]+)", text, flags=re.IGNORECASE)
363
+ if m:
364
+ return m.group(2).strip()
365
+
366
+ return None
367
+
368
+
369
+ # -----------------------------------------------------------------------------
370
+ # Domain normalization: show ROOT domain like nature.com, springer.com, etc.
371
+ # -----------------------------------------------------------------------------
372
+ _SECOND_LEVEL_TLDS = {
373
+ "co.uk",
374
+ "ac.uk",
375
+ "gov.uk",
376
+ "org.uk",
377
+ "co.jp",
378
+ "ne.jp",
379
+ "or.jp",
380
+ "com.au",
381
+ "net.au",
382
+ "org.au",
383
+ "edu.au",
384
+ "co.in",
385
+ "com.br",
386
+ "com.cn",
387
+ }
388
+
389
+
390
+ def _root_domain(netloc: str) -> str:
391
+ netloc = (netloc or "").strip().lower()
392
+ if netloc.startswith("www."):
393
+ netloc = netloc[4:]
394
+ parts = [p for p in netloc.split(".") if p]
395
+ if len(parts) <= 2:
396
+ return netloc
397
+ last2 = ".".join(parts[-2:])
398
+ last3 = ".".join(parts[-3:])
399
+ # handle second-level public suffixes
400
+ if last2 in _SECOND_LEVEL_TLDS and len(parts) >= 3:
401
+ return last3
402
+ if ".".join(parts[-2:]) in _SECOND_LEVEL_TLDS and len(parts) >= 3:
403
+ return last3
404
+ # if suffix looks like co.uk style
405
+ if last2 in _SECOND_LEVEL_TLDS:
406
+ return last3
407
+ if last2.endswith(".uk") and len(parts) >= 3:
408
+ if ".".join(parts[-2:]) in _SECOND_LEVEL_TLDS:
409
+ return last3
410
+ return last2
411
+
412
+
413
+ def _url_to_domain(url: str) -> Optional[str]:
414
+ if not isinstance(url, str) or not url.strip():
415
+ return None
416
+ try:
417
+ u = url.strip()
418
+ if not (u.startswith("http://") or u.startswith("https://")):
419
+ return None
420
+ netloc = urlparse(u).netloc.strip().lower()
421
+ if not netloc:
422
+ return None
423
+ return _root_domain(netloc)
424
+ except Exception:
425
+ return None
426
+
427
+
428
def _attach_source_domains(obj: Any) -> Any:
    """
    Recursively annotate dicts that carry a URL with a short 'source_domain'
    field (root domain of the first URL field that parses). Lists are walked
    element-wise; scalars pass through unchanged. Existing 'source_domain'
    values are never overwritten.
    """
    if isinstance(obj, list):
        return [_attach_source_domains(item) for item in obj]
    if not isinstance(obj, dict):
        return obj

    annotated: Dict[str, Any] = {key: _attach_source_domains(val) for key, val in obj.items()}

    # The first URL-ish field whose root domain parses wins.
    for url_key in ("url", "landing_page", "landingPage", "doi_url", "pdf_url", "link", "href"):
        value = annotated.get(url_key)
        domain = _url_to_domain(value) if isinstance(value, str) else None
        if domain:
            annotated.setdefault("source_domain", domain)
            break

    return annotated
450
+
451
+
452
def _index_citable_sources(report: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build a compact citation index for web_search + rag retrieval items.

    Requirement:
    - Tag format is STRICTLY: COMPLETE DOI URL (https://doi.org/...) when DOI exists,
      otherwise the best available http(s) URL.
    - No numbered citations.

    Walks the whole report, stamps a 'cite_tag' onto every citable dict, and
    collects the tagged items into report['citation_index']['sources'].
    The walk mutates nothing in place: it rebuilds dicts/lists and returns
    the tagged copy (except in the non-dict edge case at the bottom, where
    the original report is annotated and returned).
    """
    # Shared accumulator mutated by walk_and_tag() during the recursion.
    citation_index: Dict[str, Any] = {"sources": []}

    def is_citable_item(d: Dict[str, Any]) -> bool:
        # A dict is citable when it exposes any http(s) URL field or a DOI.
        if not isinstance(d, dict):
            return False
        for k in ("url", "landing_page", "landingPage", "doi_url", "pdf_url", "link", "href"):
            if isinstance(d.get(k), str) and (d[k].startswith("http://") or d[k].startswith("https://")):
                return True
        if isinstance(d.get("doi"), str) and d["doi"].strip():
            return True
        return False

    def get_best_url(d: Dict[str, Any]) -> Optional[str]:
        # DOI-first: a valid DOI beats any raw URL field.
        doi = normalize_doi(d.get("doi", ""))
        if doi:
            return doi_to_url(doi)
        # Otherwise take the first http(s) URL in the fixed key order.
        for k in ("url", "landing_page", "landingPage", "doi_url", "pdf_url", "link", "href"):
            v = d.get(k)
            if isinstance(v, str) and (v.startswith("http://") or v.startswith("https://")):
                return v
        return None

    def walk_and_tag(node: Any) -> Any:
        # Depth-first rebuild; children are tagged before their parent is
        # inspected, so nested citable dicts each get their own entry.
        if isinstance(node, list):
            return [walk_and_tag(x) for x in node]

        if isinstance(node, dict):
            out = {k: walk_and_tag(v) for k, v in node.items()}

            if is_citable_item(out):
                url = get_best_url(out)
                dom = out.get("source_domain") or (_url_to_domain(url) if url else None) or "source"
                tag = url.strip() if isinstance(url, str) and url.strip() else "source"
                # cite_tag must be DOI URL or URL fallback; keep an existing
                # tag only when it is already a proper http(s) URL.
                cur = out.get("cite_tag")
                if not (isinstance(cur, str) and cur.strip().startswith(("http://", "https://"))):
                    out["cite_tag"] = tag

                citation_index["sources"].append(
                    {
                        "tag": out.get("cite_tag"),
                        "domain": dom,
                        "title": out.get("title") or out.get("name") or "Untitled",
                        "url": url,
                        "doi": out.get("doi"),
                    }
                )
            return out

        return node

    tagged = walk_and_tag(report)
    if isinstance(tagged, dict):
        tagged["citation_index"] = citation_index
        return tagged

    # Defensive fallback: walk returned a non-dict (shouldn't happen for a
    # dict report) — annotate and return the original object instead.
    report["citation_index"] = citation_index
    return report
520
+
521
+
522
def ensure_orch(state: Dict[str, Any]) -> Tuple[PolymerOrchestrator, Dict[str, Any]]:
    """
    Lazily construct the orchestrator (and its context dict) inside the
    per-session state, creating both on first use.

    Returns the (orchestrator, context) pair stored in *state*.
    """
    if state.get("orch") is None:
        state["orch"] = PolymerOrchestrator(OrchestratorConfig())
        state["ctx"] = {}
        # Surface (but do not fail on) a missing/unusable OpenAI setup.
        diagnostic = getattr(state["orch"], "_openai_unavailable_reason", None)
        if diagnostic:
            print("[OpenAI diagnostic]", diagnostic)
    state.setdefault("ctx", {})
    return state["orch"], state["ctx"]
533
+
534
+
535
+ # -----------------------------------------------------------------------------
536
+ # NEW: extract tool output so the PLAN (execute_plan) drives the final report
537
+ # -----------------------------------------------------------------------------
538
+ def _extract_tool_output(exec_res: Dict[str, Any], tool_name: str) -> Optional[Any]:
539
+ """
540
+ Best-effort extraction of a tool output from execute_plan() results.
541
+
542
+ Supports a variety of common shapes:
543
+ exec_res["steps"] = [{"tool": "...", "output": {...}}, ...]
544
+ exec_res["steps"] = [{"tool": "...", "result": {...}}, ...]
545
+ exec_res["steps"] = [{"tool": "...", "data": {...}}, ...]
546
+ """
547
+ if not isinstance(exec_res, dict):
548
+ return None
549
+ steps = exec_res.get("steps")
550
+ if not isinstance(steps, list):
551
+ return None
552
+
553
+ tool_name = (tool_name or "").strip()
554
+ if not tool_name:
555
+ return None
556
+
557
+ for s in steps:
558
+ if not isinstance(s, dict):
559
+ continue
560
+ t = str(s.get("tool") or s.get("name") or "").strip()
561
+ if t != tool_name:
562
+ continue
563
+ for k in ("output", "result", "data", "payload"):
564
+ if k in s:
565
+ return s.get(k)
566
+ # fallback: sometimes the step dict itself is the output
567
+ return s
568
+
569
+ return None
570
+
571
+
572
def _compose_planner_prompt(
    case_brief: str,
    questions: str,
    property_name: str,
    seed_psmiles: str,
    literature_query: str,
    target_value: Optional[float],
) -> str:
    """
    Assemble the planner prompt sent to the orchestrator's planning LLM.

    Planner prompt updated to enforce:
    - per-question coverage
    - explicit mapping Qi -> steps
    - report_generation included as a planned step

    Args:
        case_brief: Free-text case/context description; omitted when blank.
        questions: User questions; a default Q1-Q4 set is used when blank.
        property_name: Primary property of interest; omitted when blank.
        seed_psmiles: Seed polymer SMILES; omitted when blank.
        literature_query: Optional literature query hint; omitted when blank.
        target_value: Inverse-design target; omitted when None.

    Returns:
        The full prompt as a single newline-joined string.
    """
    lines = []
    # Context section: every field is optional and skipped when blank/None.
    lines.append("### CASE / CONTEXT (POLYMER SYSTEM)")
    if case_brief.strip():
        lines.append(case_brief.strip())
    if seed_psmiles.strip():
        lines.append(f"Seed pSMILES: {seed_psmiles.strip()}")
    if property_name.strip():
        lines.append(f"Primary property of interest: {property_name.strip()}")
    if target_value is not None:
        lines.append(f"Inverse-design target_value (required for generation): {target_value}")
    if literature_query.strip():
        lines.append(f"Literature query hint (optional): {literature_query.strip()}")

    lines.append("\n### USER QUESTIONS (ANSWER THESE)")
    q = questions.strip()
    if q:
        lines.append(q)
    else:
        # Default question set keeps the plan well-formed when the user
        # submits an empty Questions box.
        lines.append(
            "Q1. Interpret the current formulation and key properties.\n"
            "Q2. Analyze structure–property relationships and root causes.\n"
            "Q3. Propose and (if possible) generate candidate polymers.\n"
            "Q4. Summarize evidence, limitations, and next experiments."
        )

    lines.append("\n### TOOLING REQUIREMENTS")
    lines.append(
        "- Select from tools: data_extraction, cl_encoding, property_prediction, polymer_generation,\n"
        "  rag_retrieval, web_search, report_generation, and PNG-only visual tools.\n"
        "- Plan a small, ordered tool chain (2–10 steps) that answers the USER QUESTIONS.\n"
        "- Ensure property_prediction uses cl_encoding output when possible.\n"
        "- polymer_generation is inverse design and REQUIRES target_value.\n"
        "- Do NOT answer the scientific questions yourself; only plan which tools to run."
    )

    # Critical: make the plan sensitive to the questions, not a fixed recipe
    lines.append("\n### PLANNING RULES (STRICT)")
    lines.append(
        "- Create an explicit mapping: for each question Qi, list the step numbers that address it.\n"
        "- Every planned step must contribute to at least one Qi.\n"
        "- If a Qi needs literature evidence, include web_search and/or rag_retrieval steps.\n"
        "- Include a final report_generation step that synthesizes tool outputs into answers for each Qi.\n"
        "- If a Qi cannot be answered from tools, plan to state 'not available' for missing numeric values "
        "and provide clearly labeled qualitative expectations where appropriate."
    )

    return "\n".join(lines)
633
+
634
+
635
+ def _seed_inputs(
636
+ property_name: str,
637
+ seed_psmiles: str,
638
+ literature_query: str,
639
+ target_value: Optional[float],
640
+ questions: str,
641
+ ) -> Dict[str, Any]:
642
+ """
643
+ Provide user_inputs to execute_plan(). Include questions so the orchestrator/tools
644
+ can condition retrieval and synthesis on the actual user ask.
645
+ """
646
+ payload: Dict[str, Any] = {}
647
+ if property_name.strip():
648
+ payload["property"] = property_name.strip()
649
+ if seed_psmiles.strip():
650
+ payload["psmiles"] = seed_psmiles.strip()
651
+ if literature_query.strip():
652
+ payload["literature_query"] = literature_query.strip()
653
+ payload["query"] = literature_query.strip()
654
+ if target_value is not None:
655
+ payload["target_value"] = float(target_value)
656
+ payload["num_samples"] = int(DEFAULT_NUM_GEN_SAMPLES)
657
+ if isinstance(questions, str) and questions.strip():
658
+ payload["questions"] = questions.strip()
659
+ return payload
660
+
661
+
662
def _maybe_add_artifacts(
    orch: PolymerOrchestrator,
    report: Dict[str, Any],
    seed_psmiles_fallback: Optional[str] = None,
    property_name_fallback: Optional[str] = None,
) -> Tuple[List[str], Dict[str, Any]]:
    """
    Best-effort attachment of PNG artifacts (generation grid, seed molecule
    render, property-attribution heatmap) for the final report.

    Each artifact is attempted independently; failures never raise and are
    recorded in the returned extras dict under *_error keys instead.

    NOTE(review): orch._run_gen_grid / _run_mol_render / _run_prop_attribution
    are orchestrator-internal tool runners — assumed to return dicts carrying
    a 'png_path' on success; only the guards below confirm this.

    Returns:
        (imgs, extras) where imgs is a list of PNG paths that exist on disk
        and extras holds the raw tool payloads and/or error strings.
    """
    imgs: List[str] = []
    extras: Dict[str, Any] = {}

    # Generation grid (unchanged)
    try:
        gen = (report.get("summary", {}) or {}).get("generation", {})
        if isinstance(gen, dict) and gen.get("generated_psmiles"):
            grid = orch._run_gen_grid({}, {"polymer_generation": gen})
            # Only attach when the tool produced a PNG that actually exists.
            if isinstance(grid, dict) and grid.get("png_path") and Path(grid["png_path"]).exists():
                imgs.append(grid["png_path"])
                extras["gen_grid"] = grid
    except Exception as e:
        extras["gen_grid_error"] = str(e)

    # Molecule render (seed) (unchanged but you may also want fallback)
    try:
        seed_psmiles = ((report.get("summary", {}) or {}).get("property_prediction", {}) or {}).get("psmiles")
        if not seed_psmiles:
            seed_psmiles = seed_psmiles_fallback
        if seed_psmiles:
            mol_png = orch._run_mol_render({}, {"psmiles": seed_psmiles, "view": "2d"})
            if isinstance(mol_png, dict) and mol_png.get("png_path") and Path(mol_png["png_path"]).exists():
                imgs.append(mol_png["png_path"])
                extras["mol_render"] = mol_png
    except Exception as e:
        extras["mol_render_error"] = str(e)

    # Explainability heatmap (NOW ALWAYS ATTEMPTED WHEN WE HAVE ANY pSMILES)
    try:
        summary = report.get("summary", {}) or {}
        tool_outputs = report.get("tool_outputs", {}) or {}

        prop_pred = summary.get("property_prediction", {}) or {}
        data_ex = summary.get("data_extraction", {}) or tool_outputs.get("data_extraction", {}) or {}

        # Resolve the pSMILES from prediction output, extraction output,
        # or the caller-provided fallback — first non-empty wins.
        seed_psmiles = (
            prop_pred.get("psmiles")
            or data_ex.get("canonical_psmiles")
            or seed_psmiles_fallback
        )

        prop_name = (
            prop_pred.get("property")
            or property_name_fallback
            or DEFAULT_PROPERTY_NAME
        )

        if seed_psmiles:
            expl_payload = {"psmiles": seed_psmiles, "top_k_atoms": 12, "property": prop_name}
            expl = orch._run_prop_attribution({}, expl_payload)
            if isinstance(expl, dict) and expl.get("png_path") and Path(expl["png_path"]).exists():
                imgs.append(expl["png_path"])
                extras["prop_attribution"] = expl
            else:
                extras["prop_attribution_error"] = expl.get("error") if isinstance(expl, dict) else "unknown"
        else:
            extras["prop_attribution_error"] = "No seed pSMILES available for attribution."
    except Exception as e:
        extras["prop_attribution_error"] = str(e)

    return imgs, extras
729
+
730
+ def _requested_citation_count(questions: str, default_n: int = 10) -> int:
731
+ """
732
+ If the user explicitly asks for N citations/papers/sources/references, honor that.
733
+ Otherwise, default to 10.
734
+ """
735
+ q = (questions or "").lower()
736
+
737
+ patterns = [
738
+ r"(?:at\s+least\s+)?(\d{1,3})\s*(?:citations|citation|papers|paper|sources|source|references|reference)\b",
739
+ r"\bcite\s+(\d{1,3})\s*(?:papers|paper|sources|source|references|reference|citations|citation)\b",
740
+ r"\b(\d{1,3})\s*(?:papers|paper|sources|source|references|reference|citations|citation)\s*(?:minimum|min)\b",
741
+ ]
742
+ for pat in patterns:
743
+ m = re.search(pat, q, flags=re.IGNORECASE)
744
+ if m:
745
+ try:
746
+ n = int(m.group(1))
747
+ return max(1, min(n, 200))
748
+ except Exception:
749
+ pass
750
+ return max(1, default_n)
751
+
752
+
753
def _collect_citations(report: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Collect citations from report['citation_index']['sources'] if present; otherwise walk the report.
    Deduplicate by DOI (preferred) or URL.

    Returns a list of dicts with keys: domain, title, url, doi, tag — where
    "domain" deliberately holds the complete citation URL (see inline note).
    """
    if not isinstance(report, dict):
        return []

    # Preferred path: a pre-built citation index produced upstream.
    sources = []
    ci = report.get("citation_index")
    if isinstance(ci, dict) and isinstance(ci.get("sources"), list):
        for s in ci["sources"]:
            if isinstance(s, dict):
                sources.append(s)

    # fallback walk
    if not sources:
        # Depth-first scan of the whole report for any dict that carries a
        # "url" or "doi" key. Note: a matching dict is both recorded AND
        # recursed into, so nested citation dicts may be collected twice;
        # the dedupe pass below absorbs duplicates.
        def walk(node: Any):
            if isinstance(node, dict):
                if "url" in node or "doi" in node:
                    # normalize_doi / doi_to_url are project helpers; presumably
                    # they canonicalize a DOI string and build its https://doi.org
                    # URL — TODO confirm against their definitions.
                    doi = normalize_doi(node.get("doi", "")) or ""
                    url = None
                    if doi:
                        url = doi_to_url(doi)
                    else:
                        url = node.get("url")
                    sources.append({
                        "domain": node.get("source_domain") or _url_to_domain(node.get("url") or ""),
                        "title": node.get("title") or node.get("name") or "Untitled",
                        "url": url,
                        "doi": doi,
                        "tag": url,
                    })
                for v in node.values():
                    walk(v)
            elif isinstance(node, list):
                for x in node:
                    walk(x)
        walk(report)

    # normalize + dedupe
    # Later entries with the same key overwrite earlier ones (last one wins).
    dedup: Dict[str, Dict[str, Any]] = {}
    for s in sources:
        if not isinstance(s, dict):
            continue
        url = s.get("url")
        doi = normalize_doi(s.get("doi", "")) or ""

        # Requirement: label should be COMPLETE DOI URL (preferred) else URL.
        tag = s.get("tag")
        if doi:
            cite_url = doi_to_url(doi)
        elif isinstance(url, str) and url.strip():
            cite_url = url.strip()
        else:
            # Neither a DOI nor a usable URL: not citable, drop it.
            continue

        # Dedupe key: DOI when available (case-insensitive), else the URL.
        key = None
        if doi:
            key = "doi:" + doi.lower()
        elif isinstance(cite_url, str) and cite_url.strip():
            key = "url:" + cite_url.strip()
        else:
            continue

        title = s.get("title") or "Untitled"

        dedup[key] = {
            # Keep key name "domain" for UI compatibility, but it now holds the DOI URL / URL text requirement.
            "domain": cite_url,
            "title": title,
            "url": cite_url,
            "doi": doi,
            "tag": cite_url if isinstance(cite_url, str) else tag,
        }

    # stable-ish ordering: prefer items that have a URL and non-generic domain
    # Sort key: (is-generic-domain, lacks-http-url, domain text) — ascending,
    # so concrete, linkable sources come first; Python's sort is stable.
    def _rank(x: Dict[str, Any]) -> Tuple[int, int, str]:
        dom = (x.get("domain") or "").lower()
        url = x.get("url") or ""
        generic = int(dom in ("source", "doi.org"))
        has_url = 0 if (isinstance(url, str) and url.startswith("http")) else 1
        return (generic, has_url, dom)

    out = list(dedup.values())
    out.sort(key=_rank)
    return out
840
+
841
+
842
+ def _build_sources_section(citations: List[Dict[str, Any]], n_needed: int) -> str:
843
+ """
844
+ Deterministic clickable source list.
845
+
846
+ Requirement:
847
+ - link text must be the COMPLETE DOI URL (preferred) else URL.
848
+ Bullet format:
849
+ - [https://doi.org/...](https://doi.org/...) — Title
850
+ """
851
+ if n_needed < 1:
852
+ n_needed = 1
853
+
854
+ picked: List[Dict[str, Any]] = []
855
+ seen_urls: set = set()
856
+ for c in citations:
857
+ url = c.get("url")
858
+ if not isinstance(url, str) or not url.startswith("http"):
859
+ continue
860
+ if url in seen_urls:
861
+ continue
862
+ seen_urls.add(url)
863
+ picked.append(c)
864
+ if len(picked) >= n_needed:
865
+ break
866
+
867
+ lines = []
868
+ lines.append("\n\n---\n\n### Sources (clickable)\n")
869
+ if not picked:
870
+ lines.append("_No citable web/RAG sources were available in the report output._\n")
871
+ return "".join(lines)
872
+
873
+ if len(picked) < n_needed:
874
+ lines.append(f"_Only {len(picked)} unique sources were available; target was {n_needed}._\n\n")
875
+
876
+ for c in picked:
877
+ cite_text = (c.get("domain") or c.get("url") or "source").strip()
878
+ url = c.get("url")
879
+ title = (c.get("title") or "Untitled").strip()
880
+ lines.append(f"- [{cite_text}]({url}) — {title}\n")
881
+
882
+ return "".join(lines)
883
+
884
+
885
def _augment_questions_for_grounding(questions: str, n_citations: int) -> str:
    """
    Updated grounding constraints:
    - Tool citations MUST be [T] only.
    - Paper citations MUST be clickable hyperlinks whose link text is the COMPLETE DOI URL (preferred).
    - Ensure at least n_citations unique citations unless user asked otherwise.
    - Do not repeat the same DOI/URL more than once.

    Returns *questions* (right-stripped) with the constraint block appended.
    """
    # NOTE: this text is injected verbatim into the LLM prompt at runtime;
    # do not reword it casually — the answer composer relies on these rules.
    constraints = (
        "\n\nCONSTRAINTS FOR THE ANSWER:\n"
        "- Do NOT manufacture DOIs or sources. Use only URLs/DOIs present in the provided report.\n"
        "- Tool-derived facts: cite inline using [T] (exactly; do NOT use [T1], [T2], etc.).\n"
        "- Literature/web/RAG citations: cite as clickable hyperlinks where the bracket text is the COMPLETE DOI URL "
        "(https://doi.org/...) when DOI is available; otherwise use the best available URL.\n"
        "- Do NOT use numbered bracket citations like [1], [2].\n"
        "- You are FORBIDDEN from adding a separate references list/section (e.g., 'References', 'Sources').\n"
        "- All literature citations must be inline hyperlinks: [https://doi.org/...](https://doi.org/...) placed immediately after the claim.\n"
        "- Distribute citations across the answer (do not cluster them in one place).\n"
        "- NON-DUPLICATES: Do not repeat the same paper link. Each DOI/URL may appear at most once in the entire answer.\n"
        "- Each major section should include at least 1 inline literature citation when relevant.\n"
        "- Numeric values: only use numeric values that appear in tool outputs; otherwise state 'not available'.\n"
        "- Qualitative expectations are allowed when numeric outputs are not available; label them clearly as qualitative.\n"
        "- When presenting polymer_generation outputs (e.g., generated_psmiles), reproduce them verbatim exactly as returned.\n"
        "- Polymer endpoint tokens: preserve attachment-point placeholders exactly as '[*]' in any pSMILES/SMILES shown.\n"
        "  Do NOT drop the '*' or render it as empty brackets '[]'.\n"
        f"- Citation minimum: include at least {int(n_citations)} NON-DUPLICATE literature citations (unique by URL/DOI), "
        "unless the user explicitly requested a different number.\n"
    )
    q = (questions or "").rstrip()
    return q + constraints
915
+
916
+
917
+ def _assign_tool_tags(plan: Dict[str, Any], exec_res: Dict[str, Any], report: Dict[str, Any]) -> None:
918
+ """
919
+ Tool tags are ALWAYS [T] (single tag only).
920
+ """
921
+ try:
922
+ steps_executed = (exec_res or {}).get("steps", []) or []
923
+ for s in steps_executed:
924
+ if isinstance(s, dict):
925
+ s["cite_tag"] = "[T]"
926
+ except Exception:
927
+ pass
928
+
929
+ try:
930
+ summary = report.get("summary", {}) if isinstance(report, dict) else {}
931
+ if isinstance(summary, dict):
932
+ for k, v in list(summary.items()):
933
+ if isinstance(v, dict):
934
+ v["cite_tag"] = "[T]"
935
+ except Exception:
936
+ pass
937
+
938
+ try:
939
+ tool_outputs = report.get("tool_outputs", {}) if isinstance(report, dict) else {}
940
+ if isinstance(tool_outputs, dict):
941
+ for _, v in tool_outputs.items():
942
+ if isinstance(v, dict):
943
+ v["cite_tag"] = "[T]"
944
+ except Exception:
945
+ pass
946
+
947
+
948
+ # -----------------------------------------------------------------------------
949
+ # PolyAgent Console: corrected run (plan drives tools; report comes from execute_plan)
950
+ # -----------------------------------------------------------------------------
951
def run_agent(state: Dict[str, Any], questions: str) -> Tuple[str, List[str]]:
    """
    End-to-end PolyAgent console run.

    Infers property/seed/target inputs from the free-text *questions*, plans
    and executes tools via the orchestrator, builds (or falls back to
    building) the report, then composes the final Markdown answer.

    Returns (final_markdown, png_image_paths).
    """
    orch, ctx = ensure_orch(state)

    # ---------- AUTO-DETECTION (NO GUI CHANGES) ----------
    qtxt = questions or ""

    inferred_prop = _infer_property_from_questions(qtxt) or DEFAULT_PROPERTY_NAME

    # Seed pSMILES: user-provided if detected, else the module default;
    # either way '@' attachment tokens are converted to '*'.
    inferred_seed = _infer_seed_psmiles_from_questions(qtxt)
    seed_psmiles = _convert_at_to_star(inferred_seed) if inferred_seed else _convert_at_to_star(DEFAULT_SEED_PSMILES)

    want_generation = _infer_generate_intent(qtxt)

    inferred_target = _infer_target_value_from_questions(qtxt, inferred_prop)

    # Only default a target when the user appears to want generation but omitted an explicit value
    if inferred_target is None and want_generation:
        inferred_target = float(DEFAULT_TARGET_BY_PROPERTY.get(inferred_prop, DEFAULT_TARGET_VALUE))

    target_value: Optional[float] = float(inferred_target) if inferred_target is not None else None

    # Literature query: keep your existing behavior (fallback to default unless questions long enough)
    literature_query_default = DEFAULT_LITERATURE_QUERY
    case_brief = DEFAULT_CASE_BRIEF
    property_name = inferred_prop

    # Planner prompt
    planner_prompt = _compose_planner_prompt(
        case_brief=case_brief,
        questions=qtxt,
        property_name=property_name,
        seed_psmiles=seed_psmiles,
        literature_query=literature_query_default,
        target_value=target_value,
    )
    plan = orch.analyze_query(planner_prompt)
    ctx["last_plan"] = plan

    # Execute plan with inferred inputs
    exec_inputs = _seed_inputs(
        property_name=property_name,
        seed_psmiles=seed_psmiles,
        literature_query=literature_query_default,
        target_value=target_value,
        questions=qtxt,
    )
    exec_res = orch.execute_plan(plan, user_inputs=exec_inputs)
    ctx["last_exec"] = exec_res

    # IMPORTANT: Prefer report_generation output from execute_plan (plan-driven)
    report = _extract_tool_output(exec_res, "report_generation")

    # Fallback if orchestrator didn't include report_generation in the executed plan
    if report is None:
        # Use the question text itself as the literature query when it is
        # long enough (>= 20 chars) to be a meaningful search string.
        qhint = (qtxt or "").strip()
        if len(qhint) >= 20:
            lit_query = qhint
        else:
            lit_query = literature_query_default

        rep_inputs: Dict[str, Any] = {
            "questions": qtxt,
            "literature_query": lit_query,
            "query": lit_query,
            "psmiles": seed_psmiles,
            "property": property_name,
            "rows": int(DEFAULT_SEARCH_ROWS),
            "fetch_top_n": int(DEFAULT_FETCH_TOP_N),
            "fetch_top_n_arxiv": 1,
            "num_samples": int(DEFAULT_NUM_GEN_SAMPLES),
        }

        # Only request generation if we have a target_value (or generation intent + fallback target above)
        if target_value is not None:
            rep_inputs["generate"] = True
            rep_inputs["target_value"] = float(target_value)

        report = orch.generate_report(rep_inputs)

    # Last-resort shape guard: wrap non-dict results so downstream code
    # can always treat the report as a dict.
    if not isinstance(report, dict):
        report = {"summary": {"report_generation": {"text": str(report)}}}

    # Attach domains/citations; do NOT normalize generation outputs here
    report = _attach_source_domains(report)
    report = _index_citable_sources(report)

    # Tool tags: ALWAYS [T]
    _assign_tool_tags(plan=plan, exec_res=exec_res, report=report)

    # Normalize seed-related pSMILES for display only
    report = _normalize_seed_inputs_for_display(report)
    ctx["last_report"] = report

    # Artifacts
    imgs, extras = _maybe_add_artifacts(
        orch,
        report,
        seed_psmiles_fallback=seed_psmiles,
        property_name_fallback=property_name,
    )
    ctx.update(extras)

    # Decide required citation count (default 10 unless user asked otherwise)
    n_citations = _requested_citation_count(qtxt, default_n=10)
    ctx["required_citations"] = n_citations

    # Collect citations deterministically for an explicit clickable list
    citations = _collect_citations(report)
    ctx["citations_collected"] = len(citations)

    # Compose final answer with strict constraints
    guarded_questions = _augment_questions_for_grounding(qtxt, n_citations=n_citations)
    final_md, composer_imgs = orch.compose_gpt_style_answer(
        report,
        case_brief=case_brief,
        questions=guarded_questions,
    )

    final_md = _escape_endpoint_tokens_for_markdown(final_md)

    # Append deterministic source list to GUARANTEE explicit clickable citations
    # final_md = final_md.rstrip() + _build_sources_section(citations, n_needed=n_citations)

    # Merge composer-produced PNGs, skipping duplicates and missing files.
    for p in composer_imgs:
        if p not in imgs and Path(p).exists():
            imgs.append(p)

    return final_md, imgs
1079
+
1080
+
1081
+ # ----------------------------- Advanced Tools (optional tab) ----------------------------- #
1082
def tool_data_extraction(state: Dict[str, Any], psmiles: str) -> Tuple[str, List[str]]:
    """Run data extraction on a pSMILES; return (pretty JSON, PNG artifact paths).

    When a canonical pSMILES is produced, a 2D molecule render and a top-12
    atom attribution heatmap are also attempted; only PNGs that exist on disk
    are returned.
    """
    orch, ctx = ensure_orch(state)

    def _existing_png(result: Any) -> Optional[str]:
        # Accept only dict results whose png_path points at a real file.
        if isinstance(result, dict):
            path = result.get("png_path")
            if path and Path(path).exists():
                return path
        return None

    out = orch._run_data_extraction({"step": 1}, {"psmiles": _convert_at_to_star(psmiles)})
    ctx["data_extraction"] = out

    images: List[str] = []
    canonical = out.get("canonical_psmiles") if isinstance(out, dict) else None
    if canonical:
        for result in (
            orch._run_mol_render({}, {"psmiles": canonical, "view": "2d"}),
            orch._run_prop_attribution({}, {"psmiles": canonical, "top_k_atoms": 12}),
        ):
            png = _existing_png(result)
            if png:
                images.append(png)

    return pretty_json(out), images
1099
+
1100
+
1101
def tool_property_prediction(state: Dict[str, Any], property_name: str, psmiles: Optional[str]) -> str:
    """Predict a polymer property, reusing cached extraction/encoding context.

    Returns the prediction result as pretty-printed JSON text.
    """
    orch, ctx = ensure_orch(state)

    request: Dict[str, Any] = {"property": property_name}
    if psmiles:
        request["psmiles"] = _convert_at_to_star(psmiles)
    # Forward previously computed context (if any) so the predictor can reuse it.
    for key in ("data_extraction", "cl_encoding"):
        if ctx.get(key):
            request[key] = ctx[key]

    prediction = orch._run_property_prediction({"step": 3}, request)
    ctx["property_prediction"] = prediction
    return pretty_json(prediction)
1113
+
1114
+
1115
def tool_polymer_generation(
    state: Dict[str, Any], property_name: str, target_value: float, num_samples: int
) -> Tuple[str, List[str]]:
    """Run inverse design toward a target property value.

    Returns (pretty JSON of generation output, generation-grid PNG paths).
    Grid rendering is best-effort: failures are swallowed and the JSON
    result is returned regardless.
    """
    orch, ctx = ensure_orch(state)

    result = orch._run_polymer_generation(
        {"step": 4},
        {
            "property": property_name,
            "target_value": float(target_value),
            "num_samples": int(num_samples),
        },
    )
    ctx["polymer_generation"] = result

    images: List[str] = []
    try:
        grid = orch._run_gen_grid({}, {"polymer_generation": result})
        path = grid.get("png_path") if isinstance(grid, dict) else None
        if path and Path(path).exists():
            images.append(path)
    except Exception:
        pass

    return pretty_json(result), images
1136
+
1137
+
1138
def tool_web_search(state: Dict[str, Any], source: str, query: str, rows: int) -> Tuple[str, List[str]]:
    """Query a literature source and cache the annotated result per source.

    Returns (pretty JSON text, empty image list — the gallery slot is unused).
    """
    orch, ctx = ensure_orch(state)

    result = orch._run_web_search({"step": 5}, {"source": source, "query": query, "rows": rows})
    result = _attach_source_domains(result)
    if isinstance(result, dict):
        result = _index_citable_sources(result)

    ctx.setdefault("web_search", {})[source] = result
    return pretty_json(result), []
1145
+
1146
+
1147
def tool_rag_retrieval(state: Dict[str, Any], query: str) -> str:
    """Retrieve from the local polymer knowledge base via RAG.

    Returns the annotated retrieval result as pretty-printed JSON text.
    """
    orch, ctx = ensure_orch(state)

    result = orch._run_rag_retrieval({"step": 7}, {"query": query})
    result = _attach_source_domains(result)
    if isinstance(result, dict):
        result = _index_citable_sources(result)

    ctx["rag_retrieval"] = result
    return pretty_json(result)
1154
+
1155
+
1156
def tool_explainability(state: Dict[str, Any], psmiles: str, property_name: str) -> Tuple[str, List[str]]:
    """Run top-K atom occlusion attribution for a pSMILES.

    Returns (pretty JSON of attribution data, heatmap PNG paths that exist).
    """
    orch, _ctx = ensure_orch(state)

    request: Dict[str, Any] = {
        "psmiles": _convert_at_to_star(psmiles),
        "top_k_atoms": 12,
    }
    if property_name:
        request["property"] = property_name

    attribution = orch._run_prop_attribution({"step": 8}, request)

    images: List[str] = []
    if isinstance(attribution, dict):
        png = attribution.get("png_path")
        if png and Path(png).exists():
            images.append(png)
    return pretty_json(attribution), images
1167
+
1168
+
1169
def tool_openai_probe(state: Dict[str, Any]) -> str:
    """Ping the OpenAI chat endpoint with a trivial JSON request.

    Returns the model's raw JSON reply on success, or a pretty-printed
    error object when the client is unavailable or the call fails.
    """
    orch, _ = ensure_orch(state)
    # getattr(..., None) covers both "attribute missing" and "attribute is None",
    # so the previous second `is None` check was redundant.
    if getattr(orch, "openai_client", None) is None:
        return pretty_json({"ok": False, "reason": getattr(orch, "_openai_unavailable_reason", "OpenAI client not available")})

    try:
        resp = orch.openai_client.chat.completions.create(
            model=orch.config.model,
            messages=[
                {"role": "system", "content": 'Return a tiny JSON object {"ok":true} and nothing else.'},
                {"role": "user", "content": "ping"},
            ],
            response_format={"type": "json_object"},
        )
        # Coalesce a None message body to "" so the declared `-> str` holds
        # (consistent with gpt_only_answer).
        return resp.choices[0].message.content or ""
    except Exception as e:
        return pretty_json({"ok": False, "error": str(e)})
1186
+
1187
+
1188
+ # ----------------------------- GPT-only ----------------------------- #
1189
+ def gpt_only_answer(state: Dict[str, Any], prompt: str) -> str:
1190
+ """
1191
+ Pure GPT-only responses. This function will not call orchestrator tools or perform web search.
1192
+ """
1193
+ orch, _ = ensure_orch(state)
1194
+ if getattr(orch, "openai_client", None) is None or orch.openai_client is None:
1195
+ return pretty_json({"ok": False, "reason": getattr(orch, "_openai_unavailable_reason", "OpenAI client not available")})
1196
+
1197
+ p = (prompt or "").strip()
1198
+ if not p:
1199
+ return "Please provide a prompt."
1200
+
1201
+ try:
1202
+ resp = orch.openai_client.chat.completions.create(
1203
+ model=orch.config.model,
1204
+ messages=[
1205
+ {
1206
+ "role": "system",
1207
+ "content": (
1208
+ "You are a polymer R&D assistant. Answer directly and clearly. "
1209
+ "Do not call tools or run web searches. If you are uncertain, state uncertainty."
1210
+ ),
1211
+ },
1212
+ {"role": "user", "content": p},
1213
+ ],
1214
+ )
1215
+ return resp.choices[0].message.content or ""
1216
+ except Exception as e:
1217
+ return pretty_json({"ok": False, "error": str(e)})
1218
+
1219
+
1220
+ # ----------------------------- Other LLMs (Hugging Face Inference) ----------------------------- #
1221
+ def llm_only_answer(state: Dict[str, Any], model_name: str, prompt: str) -> str:
1222
+ """
1223
+ LLM-only responses using Hugging Face Inference API for non-GPT models.
1224
+ """
1225
+ ensure_orch(state)
1226
+
1227
+ import os
1228
+ from huggingface_hub import InferenceClient
1229
+
1230
+ HF_TOKEN = (os.getenv("HF_TOKEN") or "").strip()
1231
+ if not HF_TOKEN:
1232
+ return pretty_json({"ok": False, "error": "HF_TOKEN is not set. Add HF_TOKEN=hf_... to your .env or env vars."})
1233
+
1234
+ HF_MODEL_MAP = {
1235
+ "mixtral-8x22b-instruct": "mistralai/Mixtral-8x22B-Instruct-v0.1",
1236
+ "llama-3.1-8b-instruct": "meta-llama/Llama-3.1-8B-Instruct",
1237
+ }
1238
+
1239
+ m = (model_name or "").strip()
1240
+ p = (prompt or "").strip()
1241
+ if not p:
1242
+ return "Please provide a prompt."
1243
+ if not m:
1244
+ return "Please select a model."
1245
+
1246
+ model_id = HF_MODEL_MAP.get(m)
1247
+ if not model_id:
1248
+ return pretty_json({"ok": False, "error": f"Unsupported model selection: {m}", "supported": list(HF_MODEL_MAP.keys())})
1249
+
1250
+ client = InferenceClient(model=model_id, token=HF_TOKEN)
1251
+
1252
+ try:
1253
+ resp = client.chat_completion(
1254
+ messages=[
1255
+ {
1256
+ "role": "system",
1257
+ "content": (
1258
+ "You are a polymer R&D assistant. Answer directly and clearly. "
1259
+ "Do not call tools or run web searches. If you are uncertain, state uncertainty."
1260
+ ),
1261
+ },
1262
+ {"role": "user", "content": p},
1263
+ ],
1264
+ max_tokens=900,
1265
+ temperature=0.7,
1266
+ )
1267
+ return resp.choices[0].message.content or ""
1268
+ except Exception as e:
1269
+ return pretty_json({"ok": False, "error": str(e), "model_id": model_id})
1270
+
1271
+
1272
def build_ui() -> gr.Blocks:
    """Construct the Gradio Blocks UI: console, tools, and other-LLMs tabs."""
    with gr.Blocks(
        css="""
        .mono {font-family: ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,'Liberation Mono','Courier New',monospace}
        .info-bubble {
            border: 1px solid rgba(15, 23, 42, 0.18);
            background: rgba(15, 23, 42, 0.04);
            border-radius: 18px;
            padding: 16px 18px;
            margin: 10px 0 14px 0;
        }
        """
    ) as demo:
        # Shared per-session context dict passed into every handler.
        state = gr.State({})

        gr.Markdown("## PolyAgent\n")

        # Big bubble shown on load and retained (no dismiss / no state gating).
        gr.Markdown(RUN_INSTRUCTIONS_MD, elem_classes=["info-bubble"])

        with gr.Tabs():
            # --- Main console: one question box, one structured answer ---
            with gr.Tab("PolyAgent Console"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Questions")
                        questions = gr.Textbox(
                            label="Ask your questions",
                            lines=16,
                            placeholder=(
                                "Example:\n"
                                "1) For high-barrier flexible packaging films, what polymer design strategies improve OTR/WVTR?\n"
                                "2) What recent (2015–2025) literature supports these strategies? (cite 10 papers)\n"
                                "3) Suggest candidate polyester families and practical next experiments.\n"
                            ),
                        )
                        btn_run = gr.Button("Run PolyAgent", variant="primary")

                    with gr.Column(scale=1):
                        gr.Markdown("### PolyAgent Answer")
                        final_answer = gr.Markdown("PolyAgent will respond here with a single structured answer.")
                        gr.Markdown("### PNG Artifacts (Molecule, Grid, Explainability)")
                        ev_imgs = gr.Gallery(label="", columns=3, height=260)

                btn_run.click(
                    fn=run_agent,
                    inputs=[state, questions],
                    outputs=[final_answer, ev_imgs],
                )

            # --- Individual tool runners for debugging / ad-hoc use ---
            with gr.Tab("Tools"):
                gr.Markdown("Run individual tools for debugging/ad-hoc usage. Visuals are PNG-only.")

                with gr.Accordion("Data Extraction", open=True):
                    psm_in = gr.Textbox(label="pSMILES")
                    btn_ex = gr.Button("Extract", variant="primary")
                    ex_json = gr.Code(label="Output", language="json", elem_classes=["mono"])
                    ex_imgs = gr.Gallery(label="PNG (molecule + explainability)", columns=3, height=220)
                    btn_ex.click(tool_data_extraction, [state, psm_in], [ex_json, ex_imgs])

                with gr.Accordion("Property Prediction", open=False):
                    prop = gr.Dropdown(
                        label="Property",
                        choices=["density", "glass transition", "melting", "specific volume", "thermal decomposition"],
                        value="glass transition",
                    )
                    psm_pred = gr.Textbox(label="Optional pSMILES (if not using previous extraction)")
                    btn_pred = gr.Button("Predict", variant="primary")
                    pred_json = gr.Code(label="Output", language="json", elem_classes=["mono"])
                    btn_pred.click(tool_property_prediction, [state, prop, psm_pred], [pred_json])

                with gr.Accordion("Polymer Generation (inverse design)", open=False):
                    prop_g = gr.Dropdown(
                        label="Property (select generator)",
                        choices=["density", "glass transition", "melting", "specific volume", "thermal decomposition"],
                        value="glass transition",
                    )
                    tgt = gr.Number(label="target_value (required)", value=60.0, precision=4)
                    ns = gr.Slider(1, 24, value=4, step=1, label="# Samples")
                    btn_gen = gr.Button("Generate", variant="primary")
                    gen_json = gr.Code(label="Output", language="json", elem_classes=["mono"])
                    gen_imgs = gr.Gallery(label="PNG (generation grid)", columns=3, height=220)
                    btn_gen.click(tool_polymer_generation, [state, prop_g, tgt, ns], [gen_json, gen_imgs])

                with gr.Accordion("Web / RAG", open=False):
                    src = gr.Dropdown(
                        label="Source",
                        choices=["crossref", "openalex", "epmc", "arxiv", "semanticscholar", "springer", "internetarchive", "all"],
                        value="all",
                    )
                    query = gr.Textbox(label="Query")
                    rows = gr.Slider(1, 50, value=12, step=1, label="rows")
                    btn_ws = gr.Button("Search", variant="primary")
                    ws_json = gr.Code(label="Output", language="json", elem_classes=["mono"])
                    # Gallery kept only so the handler's (json, images) signature has a slot.
                    ws_imgs = gr.Gallery(label="(not used)", columns=3, height=10)
                    btn_ws.click(tool_web_search, [state, src, query, rows], [ws_json, ws_imgs])

                    rag_q = gr.Textbox(label="RAG query (local polymer KB)")
                    btn_rag = gr.Button("Retrieve (RAG)", variant="secondary")
                    rag_json = gr.Code(label="Output", language="json", elem_classes=["mono"])
                    btn_rag.click(tool_rag_retrieval, [state, rag_q], [rag_json])

                with gr.Accordion("Explainability (top-K atom occlusion)", open=False):
                    psm_expl = gr.Textbox(label="pSMILES")
                    prop_expl = gr.Dropdown(
                        label="Property (for attribution)",
                        choices=["density", "glass transition", "melting", "specific volume", "thermal decomposition"],
                        value="glass transition",
                    )
                    btn_expl = gr.Button("Explain", variant="primary")
                    expl_json = gr.Code(label="Attribution data (JSON)", language="json", elem_classes=["mono"])
                    expl_imgs = gr.Gallery(label="PNG (heatmap)", columns=2, height=220)
                    btn_expl.click(tool_explainability, [state, psm_expl, prop_expl], [expl_json, expl_imgs])

                with gr.Accordion("Diagnostics", open=False):
                    btn_probe = gr.Button("Probe OpenAI (JSON ping)")
                    probe_json = gr.Code(label="Result", language="json", elem_classes=["mono"])
                    btn_probe.click(tool_openai_probe, [state], [probe_json])

            # --- Direct LLM responses via Hugging Face Inference (no tools) ---
            with gr.Tab("Other LLMs"):
                gr.Markdown("Run a direct LLM-only response (no tools, no web search) using a non-GPT model name.")

                llm_model = gr.Dropdown(
                    label="Model",
                    choices=["mixtral-8x22b-instruct", "llama-3.1-8b-instruct"],
                    value="mixtral-8x22b-instruct",
                )
                llm_prompt = gr.Textbox(label="Prompt", lines=10, placeholder="Enter your polymer question/prompt.")
                llm_btn = gr.Button("Run LLM", variant="primary")
                llm_out = gr.Markdown("The model response will appear here.")
                llm_btn.click(fn=llm_only_answer, inputs=[state, llm_model, llm_prompt], outputs=[llm_out])

    return demo
1404
+
1405
+
1406
def main():
    """CLI entry point: parse optional server overrides and launch the UI."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--server-name", type=str, default=None)
    cli.add_argument("--server-port", type=int, default=None)
    opts = cli.parse_args()

    build_ui().launch(
        server_name=opts.server_name,
        server_port=opts.server_port,
        show_api=False,
        share=True,
    )
1414
+
1415
+
1416
if __name__ == "__main__":
    # Launch the Gradio app only when executed as a script (not on import).
    main()