Spaces:
Running
Running
Update format_transplant.py
Browse files- format_transplant.py +234 -124
format_transplant.py
CHANGED
|
@@ -313,16 +313,71 @@ class LLMProvider(Enum):
|
|
| 313 |
|
| 314 |
|
| 315 |
# Per-provider defaults β base_url=None means the provider uses its own SDK
|
|
|
|
| 316 |
PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
|
| 317 |
-
"openai":
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
"
|
| 325 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
}
|
| 327 |
|
| 328 |
|
|
@@ -342,6 +397,7 @@ class LLMConfig:
|
|
| 342 |
# Retry settings
|
| 343 |
max_retries: int = 5
|
| 344 |
retry_delay_s: float = 5.0
|
|
|
|
| 345 |
|
| 346 |
|
| 347 |
def llm_config_from_args(
|
|
@@ -370,6 +426,7 @@ def llm_config_from_args(
|
|
| 370 |
api_key=resolved_key,
|
| 371 |
base_url=defaults.get("base_url"),
|
| 372 |
para_batch_size=defaults.get("batch_size", 15),
|
|
|
|
| 373 |
)
|
| 374 |
|
| 375 |
|
|
@@ -2011,54 +2068,72 @@ class MultiProviderLLMClient:
|
|
| 2011 |
|
| 2012 |
def complete(self, system: str, user: str, config: LLMConfig) -> str:
|
| 2013 |
"""Send a chat completion and return the assistant's text."""
|
| 2014 |
-
|
| 2015 |
-
|
| 2016 |
-
|
| 2017 |
-
|
| 2018 |
-
|
| 2019 |
-
|
| 2020 |
-
|
| 2021 |
-
|
| 2022 |
-
|
| 2023 |
-
|
| 2024 |
-
|
| 2025 |
-
|
| 2026 |
-
|
| 2027 |
-
|
| 2028 |
-
|
| 2029 |
-
|
| 2030 |
-
|
| 2031 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2032 |
|
| 2033 |
-
|
| 2034 |
-
|
| 2035 |
-
|
| 2036 |
-
|
| 2037 |
-
|
| 2038 |
-
|
| 2039 |
-
|
| 2040 |
-
|
| 2041 |
-
|
| 2042 |
-
|
| 2043 |
-
|
| 2044 |
-
|
| 2045 |
-
|
| 2046 |
-
|
| 2047 |
-
|
| 2048 |
-
|
| 2049 |
-
|
| 2050 |
-
|
| 2051 |
-
|
| 2052 |
-
|
| 2053 |
-
|
| 2054 |
-
|
| 2055 |
-
|
| 2056 |
-
|
| 2057 |
-
|
| 2058 |
-
|
| 2059 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2060 |
raise RuntimeError(
|
| 2061 |
-
f"[LLM] All
|
| 2062 |
)
|
| 2063 |
|
| 2064 |
def get_available_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
|
|
@@ -2345,39 +2420,52 @@ def extract_blueprint_text(doc: Document, max_chars: int = 40_000) -> str:
|
|
| 2345 |
# ============================================================================
|
| 2346 |
|
| 2347 |
_SG_SYSTEM = """\
|
| 2348 |
-
You are an expert scholarly editor
|
| 2349 |
-
Your output
|
| 2350 |
-
|
| 2351 |
-
|
| 2352 |
-
|
| 2353 |
-
|
| 2354 |
-
|
| 2355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2356 |
"""
|
| 2357 |
|
| 2358 |
_SG_USER_TMPL = """\
|
| 2359 |
-
Below is
|
|
|
|
| 2360 |
|
| 2361 |
-
DOCUMENT EXCERPT:
|
| 2362 |
ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2363 |
{blueprint_text}
|
| 2364 |
ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2365 |
{extra_section}
|
| 2366 |
-
|
| 2367 |
-
|
| 2368 |
-
|
| 2369 |
-
|
| 2370 |
-
|
| 2371 |
-
|
| 2372 |
-
|
| 2373 |
-
|
| 2374 |
-
|
| 2375 |
-
|
| 2376 |
-
|
| 2377 |
-
|
| 2378 |
-
|
| 2379 |
-
|
| 2380 |
-
Aim for
|
| 2381 |
"""
|
| 2382 |
|
| 2383 |
|
|
@@ -2472,21 +2560,20 @@ def parse_md_runs(text: str) -> List["RunData"]:
|
|
| 2472 |
|
| 2473 |
_FMT_SYSTEM = """\
|
| 2474 |
You are a scholarly editor applying a strict editorial style guide to existing text.
|
| 2475 |
-
Your
|
| 2476 |
|
| 2477 |
-
|
| 2478 |
-
1.
|
| 2479 |
-
2.
|
| 2480 |
-
3. DO NOT
|
| 2481 |
-
4. DO NOT
|
| 2482 |
-
5. REPRODUCE the text EXACTLY as given, only adding Markdown markers for formatting.
|
| 2483 |
|
| 2484 |
Use Markdown for inline formatting:
|
| 2485 |
*italic* for italic text
|
| 2486 |
**bold** for bold text
|
| 2487 |
***bold italic*** for bold + italic
|
| 2488 |
-
No other Markdown
|
| 2489 |
-
Return EXACTLY one
|
| 2490 |
"""
|
| 2491 |
|
| 2492 |
_PARA_USER_TMPL = """\
|
|
@@ -2495,12 +2582,18 @@ STYLE GUIDE:
|
|
| 2495 |
{styleguide}
|
| 2496 |
ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2497 |
|
| 2498 |
-
|
| 2499 |
-
|
| 2500 |
-
|
| 2501 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2502 |
|
| 2503 |
-
PARAGRAPHS:
|
| 2504 |
{content}
|
| 2505 |
"""
|
| 2506 |
|
|
@@ -2510,14 +2603,18 @@ STYLE GUIDE:
|
|
| 2510 |
{styleguide}
|
| 2511 |
ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2512 |
|
| 2513 |
-
|
| 2514 |
-
|
| 2515 |
-
|
| 2516 |
-
|
| 2517 |
-
|
| 2518 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2519 |
|
| 2520 |
-
FOOTNOTES:
|
| 2521 |
{content}
|
| 2522 |
"""
|
| 2523 |
|
|
@@ -2576,6 +2673,8 @@ class LLMContentFormatter:
|
|
| 2576 |
len(to_format), mode, config.para_batch_size,
|
| 2577 |
)
|
| 2578 |
|
|
|
|
|
|
|
| 2579 |
for batch_start in range(0, len(to_format), config.para_batch_size):
|
| 2580 |
# Inter-batch delay to stay under rate limits
|
| 2581 |
if batch_start > 0:
|
|
@@ -2586,14 +2685,17 @@ class LLMContentFormatter:
|
|
| 2586 |
time.sleep(batch_delay)
|
| 2587 |
|
| 2588 |
batch = to_format[batch_start: batch_start + config.para_batch_size]
|
| 2589 |
-
|
| 2590 |
-
|
| 2591 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2592 |
tmpl = _FN_USER_TMPL if mode == "footnote" else _PARA_USER_TMPL
|
| 2593 |
user_msg = tmpl.format(
|
| 2594 |
styleguide=styleguide,
|
| 2595 |
n=len(batch),
|
| 2596 |
-
sep=_BATCH_SEP,
|
| 2597 |
content=content,
|
| 2598 |
)
|
| 2599 |
|
|
@@ -2604,40 +2706,48 @@ class LLMContentFormatter:
|
|
| 2604 |
|
| 2605 |
try:
|
| 2606 |
response = self.client.complete(_FMT_SYSTEM, user_msg, config)
|
| 2607 |
-
parsed = self.
|
| 2608 |
except Exception as exc:
|
| 2609 |
logger.error("[LLM-FMT] Batch failed, using originals: %s", exc)
|
| 2610 |
-
parsed =
|
| 2611 |
|
| 2612 |
for pd, formatted in zip(batch, parsed):
|
| 2613 |
if formatted.strip():
|
| 2614 |
result[id(pd)] = formatted
|
| 2615 |
logger.debug(
|
| 2616 |
-
"[LLM-FMT]
|
| 2617 |
-
pd.get_text(), formatted,
|
| 2618 |
)
|
| 2619 |
|
| 2620 |
return result
|
| 2621 |
|
| 2622 |
# ------------------------------------------------------------------
|
| 2623 |
@staticmethod
|
| 2624 |
-
def
|
| 2625 |
"""
|
| 2626 |
-
|
| 2627 |
-
Falls back to originals for any missing entries.
|
| 2628 |
"""
|
| 2629 |
-
|
| 2630 |
-
|
| 2631 |
-
|
| 2632 |
-
|
| 2633 |
-
|
| 2634 |
-
|
| 2635 |
-
|
| 2636 |
-
|
| 2637 |
-
|
| 2638 |
-
|
| 2639 |
-
|
| 2640 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2641 |
|
| 2642 |
|
| 2643 |
# ============================================================================
|
|
|
|
| 313 |
|
| 314 |
|
| 315 |
# Per-provider defaults — base_url=None means the provider uses its own SDK
# Added top 5 fallback models for each provider
# Each entry supplies:
#   base_url   - OpenAI-compatible endpoint, or None for a native SDK
#   env        - name of the environment variable holding the API key
#   model      - primary model id
#   fallbacks  - models tried in order when the primary fails
#   batch_size - paragraphs/footnotes sent per formatting request
PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
    "openai": {
        "base_url": "https://api.openai.com/v1",
        "env": "OPENAI_API_KEY",
        "model": "gpt-4o",
        "fallbacks": ["gpt-4o-2024-08-06", "gpt-4o-mini", "gpt-4-turbo", "gpt-3.5-turbo"],
        "batch_size": 15
    },
    "anthropic": {
        "base_url": None,
        "env": "ANTHROPIC_API_KEY",
        "model": "claude-3-7-sonnet-20250219",
        "fallbacks": ["claude-3-5-sonnet-20241022", "claude-3-5-haiku-20241022", "claude-3-opus-20240229", "claude-2.1"],
        "batch_size": 15
    },
    "groq": {
        "base_url": "https://api.groq.com/openai/v1",
        "env": "GROQ_API_KEY",
        "model": "llama-3.3-70b-versatile",
        "fallbacks": ["llama-3.1-70b-versatile", "llama-3.1-8b-instant", "mixtral-8x7b-32768", "gemma2-9b-it"],
        # NOTE(review): groq alone uses a smaller batch — presumably for rate
        # limits; confirm before changing.
        "batch_size": 5
    },
    "nebius": {
        "base_url": "https://api.studio.nebius.ai/v1",
        "env": "NEBIUS_API_KEY",
        "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
        "fallbacks": ["meta-llama/Meta-Llama-3.1-8B-Instruct", "meta-llama/Llama-Guard-3-8B"],
        "batch_size": 15
    },
    "scaleway": {
        "base_url": "https://api.scaleway.ai/v1",
        "env": "SCALEWAY_API_KEY",  # Updated to match .env
        "model": "llama-3.3-70b-instruct",
        "fallbacks": ["deepseek-r1-distill-llama-70b", "llama-3.1-8b-instruct", "mistral-nemo-instruct-2407", "pixtral-12b-2409"],
        "batch_size": 15
    },
    "openrouter": {
        "base_url": "https://openrouter.ai/api/v1",
        "env": "OPENROUTER_API_KEY",
        "model": "meta-llama/llama-3.3-70b-instruct",
        "fallbacks": ["anthropic/claude-3.5-sonnet", "google/gemini-pro-1.5", "mistralai/mistral-large", "qwen/qwen-2.5-72b-instruct"],
        "batch_size": 15
    },
    "mistral": {
        "base_url": "https://api.mistral.ai/v1",
        "env": "MISTRAL_API_KEY",
        "model": "mistral-large-latest",
        "fallbacks": ["mistral-medium-latest", "mistral-small-latest", "codestral-latest", "open-mistral-nemo"],
        "batch_size": 15
    },
    "poe": {
        "base_url": None,
        "env": "POE_API_KEY",
        "model": "Claude-3.7-Sonnet",
        "fallbacks": ["Claude-3.5-Sonnet", "GPT-4o", "Claude-3-Opus", "Llama-3.1-405B"],
        "batch_size": 15
    },
    "ollama": {
        "base_url": "http://localhost:11434/api",
        "env": "OLLAMA_API_KEY",
        "model": "ministral-3b-instruct-2512-q4_K_M",
        "fallbacks": ["cas/llama-3.2-3b-instruct:latest", "llama3.2", "mistral", "phi3"],
        "batch_size": 15
    },
}
|
| 382 |
|
| 383 |
|
|
|
|
| 397 |
# Retry settings
|
| 398 |
max_retries: int = 5
|
| 399 |
retry_delay_s: float = 5.0
|
| 400 |
+
fallback_models: List[str] = field(default_factory=list)
|
| 401 |
|
| 402 |
|
| 403 |
def llm_config_from_args(
|
|
|
|
| 426 |
api_key=resolved_key,
|
| 427 |
base_url=defaults.get("base_url"),
|
| 428 |
para_batch_size=defaults.get("batch_size", 15),
|
| 429 |
+
fallback_models=defaults.get("fallbacks", []),
|
| 430 |
)
|
| 431 |
|
| 432 |
|
|
|
|
| 2068 |
|
| 2069 |
def complete(self, system: str, user: str, config: LLMConfig) -> str:
|
| 2070 |
"""Send a chat completion and return the assistant's text."""
|
| 2071 |
+
# Candidate models list: primary model followed by fallbacks
|
| 2072 |
+
models_to_try = [config.model] + config.fallback_models
|
| 2073 |
+
|
| 2074 |
+
last_exception = None
|
| 2075 |
+
|
| 2076 |
+
for model_id in models_to_try:
|
| 2077 |
+
current_config = copy.copy(config)
|
| 2078 |
+
current_config.model = model_id
|
| 2079 |
+
|
| 2080 |
+
logger.info("[LLM] %s: Trying model '%s'...", config.provider.value, model_id)
|
| 2081 |
+
|
| 2082 |
+
for attempt in range(1, config.max_retries + 1):
|
| 2083 |
+
try:
|
| 2084 |
+
if config.provider == LLMProvider.ANTHROPIC:
|
| 2085 |
+
return self._anthropic(system, user, current_config)
|
| 2086 |
+
elif config.provider == LLMProvider.POE:
|
| 2087 |
+
return self._poe(system, user, current_config)
|
| 2088 |
+
elif config.provider == LLMProvider.OLLAMA:
|
| 2089 |
+
return self._ollama(system, user, current_config)
|
| 2090 |
+
else:
|
| 2091 |
+
return self._openai_compat(system, user, current_config)
|
| 2092 |
+
except Exception as exc:
|
| 2093 |
+
last_exception = exc
|
| 2094 |
+
exc_str = str(exc).lower()
|
| 2095 |
+
is_rate_limit = "429" in exc_str or "rate limit" in exc_str
|
| 2096 |
+
is_model_not_found = "404" in exc_str or "not found" in exc_str or "does not exist" in exc_str
|
| 2097 |
|
| 2098 |
+
if is_model_not_found:
|
| 2099 |
+
logger.warning("[LLM] %s: Model '%s' not found. Trying next fallback...",
|
| 2100 |
+
config.provider.value, model_id)
|
| 2101 |
+
break # Exit attempt loop, try next model
|
| 2102 |
+
|
| 2103 |
+
# Exponential backoff: retry_delay * (2 ^ (attempt-1))
|
| 2104 |
+
delay = config.retry_delay_s * (2 ** (attempt - 1))
|
| 2105 |
+
header_delay = None
|
| 2106 |
+
|
| 2107 |
+
# OpenAI / Groq / OpenRouter often put it in headers
|
| 2108 |
+
if hasattr(exc, "response") and hasattr(exc.response, "headers"):
|
| 2109 |
+
retry_after = exc.response.headers.get("retry-after")
|
| 2110 |
+
if retry_after and retry_after.isdigit():
|
| 2111 |
+
header_delay = float(retry_after)
|
| 2112 |
+
|
| 2113 |
+
if header_delay:
|
| 2114 |
+
delay = max(delay, header_delay + 1.0) # Add 1s buffer
|
| 2115 |
+
elif is_rate_limit:
|
| 2116 |
+
delay *= 2 # Extra patience for rate limits
|
| 2117 |
+
|
| 2118 |
+
if is_rate_limit:
|
| 2119 |
+
logger.warning(
|
| 2120 |
+
"[LLM] %s rate limited (429) for model '%s'. Waiting %.1f seconds... (Attempt %d/%d)",
|
| 2121 |
+
config.provider.value, model_id, delay, attempt, config.max_retries
|
| 2122 |
+
)
|
| 2123 |
+
else:
|
| 2124 |
+
logger.warning(
|
| 2125 |
+
"[LLM] %s model '%s' attempt %d/%d failed: %s",
|
| 2126 |
+
config.provider.value, model_id, attempt, config.max_retries, exc,
|
| 2127 |
+
)
|
| 2128 |
+
|
| 2129 |
+
if attempt < config.max_retries:
|
| 2130 |
+
time.sleep(delay)
|
| 2131 |
+
else:
|
| 2132 |
+
logger.error("[LLM] %s: All retries failed for model '%s'.",
|
| 2133 |
+
config.provider.value, model_id)
|
| 2134 |
+
|
| 2135 |
raise RuntimeError(
|
| 2136 |
+
f"[LLM] All models and retries failed for {config.provider.value}. Last error: {last_exception}"
|
| 2137 |
)
|
| 2138 |
|
| 2139 |
def get_available_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
|
|
|
|
| 2420 |
# ============================================================================
|
| 2421 |
|
| 2422 |
_SG_SYSTEM = """\
|
| 2423 |
+
You are an expert scholarly editor and citation specialist deriving a comprehensive editorial style guide from a document.
|
| 2424 |
+
Your output must be a precise instruction set for reformatting text to match this document's exact standards.
|
| 2425 |
+
|
| 2426 |
+
USER PRIORITY RULE:
|
| 2427 |
+
If the user provides supplementary style information, those rules take ABSOLUTE PRECEDENCE over patterns you observe in the excerpt.
|
| 2428 |
+
|
| 2429 |
+
MANDATORY AREAS OF ANALYSIS:
|
| 2430 |
+
1. CITATION STYLE: Meticulously analyze footnote citations. Identify patterns for:
|
| 2431 |
+
- Book/article titles (italic? quotes?)
|
| 2432 |
+
- Author names (Full name? Surname? All caps? Roman?)
|
| 2433 |
+
- Volume/Issue/Page notation (S. 12? p. 12? 12-15? 12f?)
|
| 2434 |
+
- Punctuation between components (Commas? Colons? Slashes?)
|
| 2435 |
+
- Repeated citations (Vgl.? See? Ibid.? ebenda?)
|
| 2436 |
+
2. PUNCTUATION & SYMBOLS: Identify specific choices for:
|
| 2437 |
+
- Quotation marks (Β»...Β«, β...β, "...", '...')
|
| 2438 |
+
- Dashes (β em-dash, β en-dash)
|
| 2439 |
+
- Spaces before/after symbols
|
| 2440 |
+
3. NAMES & TERMS: Identify treatment of personal names, institutional names, and foreign terms.
|
| 2441 |
+
|
| 2442 |
+
Write the style guide as actionable, imperative rules (e.g., "Always use...", "Never italicize...").
|
| 2443 |
"""
|
| 2444 |
|
| 2445 |
_SG_USER_TMPL = """\
|
| 2446 |
+
Below is a comprehensive excerpt from the **blueprint document**, including sampled footnotes.
|
| 2447 |
+
Analyse its editorial conventions with extreme care.
|
| 2448 |
|
| 2449 |
+
DOCUMENT EXCERPT (Body & Footnotes):
|
| 2450 |
ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2451 |
{blueprint_text}
|
| 2452 |
ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2453 |
{extra_section}
|
| 2454 |
+
|
| 2455 |
+
Produce a **MASTER STYLE GUIDE** in Markdown.
|
| 2456 |
+
|
| 2457 |
+
CRITICAL: Your guide must be detailed enough to handle complex academic citations and specific punctuation (like Β»...Β« quotation marks) without ambiguity.
|
| 2458 |
+
|
| 2459 |
+
Structure your guide:
|
| 2460 |
+
1. **Absolute User Overrides** (Include any rules from the 'Additional information' section here first)
|
| 2461 |
+
2. **Language & Register**
|
| 2462 |
+
3. **Personal & Institutional Names**
|
| 2463 |
+
4. **Foreign-Language Terms & Transliteration**
|
| 2464 |
+
5. **Inline Emphasis & Special Symbols** (Meticulously specify quotation marks: Β» vs β vs ")
|
| 2465 |
+
6. **Footnote & Citation System** (Provide specific templates for books, articles, and repeats)
|
| 2466 |
+
7. **Heading & Layout Conventions**
|
| 2467 |
+
|
| 2468 |
+
Aim for scholarly perfection.
|
| 2469 |
"""
|
| 2470 |
|
| 2471 |
|
|
|
|
| 2560 |
|
| 2561 |
_FMT_SYSTEM = """\
|
| 2562 |
You are a scholarly editor applying a strict editorial style guide to existing text.
|
| 2563 |
+
Your task is to re-format the provided text to match the Style Guide's exact conventions.
|
| 2564 |
|
| 2565 |
+
CONSTRAINTS:
|
| 2566 |
+
1. SUBSTANTIVE VERBATIM: Do NOT change the substantive meaning, names, or titles.
|
| 2567 |
+
2. EDITORIAL RE-FORMATTING: You MUST change punctuation, quotation marks, and citation structure (e.g., brackets vs commas, colons vs spaces) to strictly follow the Style Guide.
|
| 2568 |
+
3. DO NOT translate, summarize, or paraphrase.
|
| 2569 |
+
4. DO NOT add any introductory remarks or commentary.
|
|
|
|
| 2570 |
|
| 2571 |
Use Markdown for inline formatting:
|
| 2572 |
*italic* for italic text
|
| 2573 |
**bold** for bold text
|
| 2574 |
***bold italic*** for bold + italic
|
| 2575 |
+
No other Markdown. Return only the re-formatted paragraph text.
|
| 2576 |
+
Return EXACTLY one response for each input paragraph.
|
| 2577 |
"""
|
| 2578 |
|
| 2579 |
_PARA_USER_TMPL = """\
|
|
|
|
| 2582 |
{styleguide}
|
| 2583 |
ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2584 |
|
| 2585 |
+
Your task is to re-format {n} separate paragraphs according to the Style Guide.
|
| 2586 |
+
Each paragraph is provided inside indexed tags like [P1]...[/P1].
|
| 2587 |
+
|
| 2588 |
+
INSTRUCTIONS:
|
| 2589 |
+
1. Process each paragraph individually.
|
| 2590 |
+
2. You MUST return each re-formatted paragraph inside matching indexed tags, e.g.:
|
| 2591 |
+
[P1]Re-formatted text of first paragraph...[/P1]
|
| 2592 |
+
[P2]Re-formatted text of second paragraph...[/P2]
|
| 2593 |
+
3. DO NOT merge paragraphs.
|
| 2594 |
+
4. DO NOT add any commentary or extra text outside the tags.
|
| 2595 |
|
| 2596 |
+
PARAGRAPHS TO PROCESS:
|
| 2597 |
{content}
|
| 2598 |
"""
|
| 2599 |
|
|
|
|
| 2603 |
{styleguide}
|
| 2604 |
ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2605 |
|
| 2606 |
+
Your task is to re-format {n} separate footnotes according to the Style Guide.
|
| 2607 |
+
Each footnote is provided inside indexed tags like [F1]...[/F1].
|
| 2608 |
+
|
| 2609 |
+
INSTRUCTIONS:
|
| 2610 |
+
1. Process each footnote individually.
|
| 2611 |
+
2. You MUST return each re-formatted footnote inside matching indexed tags, e.g.:
|
| 2612 |
+
[F1]Re-formatted text of first footnote...[/F1]
|
| 2613 |
+
[F2]Re-formatted text of second footnote...[/F2]
|
| 2614 |
+
3. DO NOT merge footnotes.
|
| 2615 |
+
4. DO NOT add any commentary or extra text outside the tags.
|
| 2616 |
|
| 2617 |
+
FOOTNOTES TO PROCESS:
|
| 2618 |
{content}
|
| 2619 |
"""
|
| 2620 |
|
|
|
|
| 2673 |
len(to_format), mode, config.para_batch_size,
|
| 2674 |
)
|
| 2675 |
|
| 2676 |
+
prefix = "P" if mode == "para" else "F"
|
| 2677 |
+
|
| 2678 |
for batch_start in range(0, len(to_format), config.para_batch_size):
|
| 2679 |
# Inter-batch delay to stay under rate limits
|
| 2680 |
if batch_start > 0:
|
|
|
|
| 2685 |
time.sleep(batch_delay)
|
| 2686 |
|
| 2687 |
batch = to_format[batch_start: batch_start + config.para_batch_size]
|
| 2688 |
+
|
| 2689 |
+
# Wrap each paragraph in indexed tags
|
| 2690 |
+
tagged_texts = []
|
| 2691 |
+
for i, p in enumerate(batch, 1):
|
| 2692 |
+
tagged_texts.append(f"[{prefix}{i}]{p.get_text()}[/{prefix}{i}]")
|
| 2693 |
+
|
| 2694 |
+
content = "\n".join(tagged_texts)
|
| 2695 |
tmpl = _FN_USER_TMPL if mode == "footnote" else _PARA_USER_TMPL
|
| 2696 |
user_msg = tmpl.format(
|
| 2697 |
styleguide=styleguide,
|
| 2698 |
n=len(batch),
|
|
|
|
| 2699 |
content=content,
|
| 2700 |
)
|
| 2701 |
|
|
|
|
| 2706 |
|
| 2707 |
try:
|
| 2708 |
response = self.client.complete(_FMT_SYSTEM, user_msg, config)
|
| 2709 |
+
parsed = self._parse_tagged_response(response, len(batch), [p.get_text() for p in batch], prefix)
|
| 2710 |
except Exception as exc:
|
| 2711 |
logger.error("[LLM-FMT] Batch failed, using originals: %s", exc)
|
| 2712 |
+
parsed = [p.get_text() for p in batch]
|
| 2713 |
|
| 2714 |
for pd, formatted in zip(batch, parsed):
|
| 2715 |
if formatted.strip():
|
| 2716 |
result[id(pd)] = formatted
|
| 2717 |
logger.debug(
|
| 2718 |
+
"[LLM-FMT] %s formatted: orig='%.50s' β fmt='%.50s'",
|
| 2719 |
+
mode.capitalize(), pd.get_text(), formatted,
|
| 2720 |
)
|
| 2721 |
|
| 2722 |
return result
|
| 2723 |
|
| 2724 |
# ------------------------------------------------------------------
|
| 2725 |
@staticmethod
|
| 2726 |
+
def _parse_tagged_response(response: str, expected: int, originals: List[str], prefix: str) -> List[str]:
|
| 2727 |
"""
|
| 2728 |
+
Extract content from [P1]...[/P1] or [F1]...[/F1] tags.
|
| 2729 |
+
Falls back to originals for any missing or unparseable entries.
|
| 2730 |
"""
|
| 2731 |
+
results = []
|
| 2732 |
+
for i in range(1, expected + 1):
|
| 2733 |
+
tag = f"{prefix}{i}"
|
| 2734 |
+
# Non-greedy match between start and end tags
|
| 2735 |
+
pattern = rf"\[{tag}\](.*?)\[\/{tag}\]"
|
| 2736 |
+
match = re.search(pattern, response, re.DOTALL)
|
| 2737 |
+
|
| 2738 |
+
if match:
|
| 2739 |
+
results.append(match.group(1).strip())
|
| 2740 |
+
else:
|
| 2741 |
+
# Try fallback: just the start tag if the LLM forgot the end tag
|
| 2742 |
+
pattern_fallback = rf"\[{tag}\](.*?)(?=\[{prefix}{i+1}\]|$)"
|
| 2743 |
+
match_fallback = re.search(pattern_fallback, response, re.DOTALL)
|
| 2744 |
+
if match_fallback:
|
| 2745 |
+
results.append(match_fallback.group(1).strip())
|
| 2746 |
+
else:
|
| 2747 |
+
logger.warning("[LLM-FMT] Could not find tag [%s] in response", tag)
|
| 2748 |
+
results.append(originals[i-1])
|
| 2749 |
+
|
| 2750 |
+
return results
|
| 2751 |
|
| 2752 |
|
| 2753 |
# ============================================================================
|