cstr committed on
Commit
2560199
Β·
verified Β·
1 Parent(s): 4f54832

Update format_transplant.py

Browse files
Files changed (1) hide show
  1. format_transplant.py +126 -38
format_transplant.py CHANGED
@@ -22,6 +22,7 @@ import argparse
22
  import asyncio
23
  import copy
24
  import logging
 
25
  import re
26
  import shutil
27
  import sys
@@ -93,6 +94,28 @@ logging.basicConfig(
93
  )
94
  logger = logging.getLogger("FormatTransplant")
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  # ============================================================================
97
  # SEMANTIC CLASSIFICATION CONSTANTS
98
  # ============================================================================
@@ -291,15 +314,15 @@ class LLMProvider(Enum):
291
 
292
  # Per-provider defaults β€” base_url=None means the provider uses its own SDK
293
  PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
294
- "openai": {"base_url": "https://api.openai.com/v1", "env": "OPENAI_API_KEY", "model": "gpt-4o"},
295
- "anthropic": {"base_url": None, "env": "ANTHROPIC_API_KEY", "model": "claude-3-5-sonnet-20241022"},
296
- "groq": {"base_url": "https://api.groq.com/openai/v1", "env": "GROQ_API_KEY", "model": "llama-3.3-70b-versatile"},
297
- "nebius": {"base_url": "https://api.studio.nebius.com/v1", "env": "NEBIUS_API_KEY", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct"},
298
- "scaleway": {"base_url": "https://api.scaleway.ai/v1", "env": "SCW_SECRET_KEY", "model": "llama-3.3-70b-instruct"},
299
- "openrouter": {"base_url": "https://openrouter.ai/api/v1", "env": "OPENROUTER_API_KEY","model": "meta-llama/llama-3.3-70b-instruct"},
300
- "mistral": {"base_url": "https://api.mistral.ai/v1", "env": "MISTRAL_API_KEY", "model": "mistral-large-latest"},
301
- "poe": {"base_url": None, "env": "POE_API_KEY", "model": "Claude-3.7-Sonnet"},
302
- "ollama": {"base_url": "http://localhost:11434/api", "env": "OLLAMA_API_KEY", "model": "llama3.2"},
303
  }
304
 
305
 
@@ -313,11 +336,11 @@ class LLMConfig:
313
  max_tokens: int = 4096
314
  temperature: float = 0.1 # low for deterministic formatting
315
  # How many chars of blueprint text to send for styleguide generation (~10 K tokens)
316
- blueprint_context_chars: int = 40_000
317
  # Source paragraphs per LLM batch
318
  para_batch_size: int = 15
319
  # Retry settings
320
- max_retries: int = 3
321
  retry_delay_s: float = 5.0
322
 
323
 
@@ -346,6 +369,7 @@ def llm_config_from_args(
346
  model=resolved_model or defaults.get("model", ""),
347
  api_key=resolved_key,
348
  base_url=defaults.get("base_url"),
 
349
  )
350
 
351
 
@@ -1769,6 +1793,11 @@ class DocumentBuilder:
1769
  for r in list(p_elem.findall(qn("w:r"))):
1770
  if r not in marker_runs:
1771
  p_elem.remove(r)
 
 
 
 
 
1772
  for rd in parse_md_runs(llm_text):
1773
  if not rd.text:
1774
  continue
@@ -1790,22 +1819,22 @@ class DocumentBuilder:
1790
  "[BUILD] LLM footnote id=%s para %d: '%.50s'",
1791
  fd.footnote_id, para_idx, llm_text,
1792
  )
1793
- continue # skip the original run-cleaning below
1794
-
1795
- # ── Original run-clean path ────────────────────────
1796
- # Apply blueprint style to <w:footnoteRef> marker run;
1797
- # strip source aesthetics from all other runs.
1798
- for r_elem in p_elem.findall(qn("w:r")):
1799
- fn_ref_check = _xpath(r_elem, ".//w:footnoteRef")
1800
- if fn_ref_check:
1801
- self._apply_fn_ref_style(r_elem)
1802
- continue
1803
- rPr = r_elem.find(qn("w:rPr"))
1804
- if rPr is not None:
1805
- for child in [c for c in rPr if c.tag not in KEEP_RPR_TAGS]:
1806
- rPr.remove(child)
1807
 
1808
  # Ensure separator after marker matches blueprint convention
 
1809
  self._normalize_fn_separator(p_elem)
1810
 
1811
  fn_root.append(fn_xml)
@@ -1993,12 +2022,41 @@ class MultiProviderLLMClient:
1993
  else:
1994
  return self._openai_compat(system, user, config)
1995
  except Exception as exc:
1996
- logger.warning(
1997
- "[LLM] %s attempt %d/%d failed: %s",
1998
- config.provider.value, attempt, config.max_retries, exc,
1999
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2000
  if attempt < config.max_retries:
2001
- time.sleep(config.retry_delay_s)
2002
  raise RuntimeError(
2003
  f"[LLM] All {config.max_retries} attempts failed for {config.provider.value}"
2004
  )
@@ -2172,9 +2230,6 @@ class MultiProviderLLMClient:
2172
  text = resp.json().get("response", "")
2173
  logger.debug("[LLM] Response: %d chars", len(text))
2174
  return text
2175
- text = resp.choices[0].message.content or ""
2176
- logger.debug("[LLM] Response: %d chars", len(text))
2177
- return text
2178
 
2179
  # ── Anthropic ─────────────────────────────────────────────────────
2180
  def _anthropic(self, system: str, user: str, config: LLMConfig) -> str:
@@ -2416,16 +2471,22 @@ def parse_md_runs(text: str) -> List["RunData"]:
2416
  # ============================================================================
2417
 
2418
  _FMT_SYSTEM = """\
2419
- You are a scholarly editor applying a strict style guide to format paragraphs.
2420
- Follow the style guide exactly. Do not paraphrase, translate, or alter any wording.
2421
- Only apply formatting as the guide specifies.
 
 
 
 
 
 
2422
 
2423
  Use Markdown for inline formatting:
2424
  *italic* for italic text
2425
  **bold** for bold text
2426
  ***bold italic*** for bold + italic
2427
- No other Markdown. Return plain paragraph text with inline markers only.
2428
- No extra commentary, no numbering, no blank lines within a paragraph.
2429
  """
2430
 
2431
  _PARA_USER_TMPL = """\
@@ -2516,6 +2577,14 @@ class LLMContentFormatter:
2516
  )
2517
 
2518
  for batch_start in range(0, len(to_format), config.para_batch_size):
 
 
 
 
 
 
 
 
2519
  batch = to_format[batch_start: batch_start + config.para_batch_size]
2520
  texts = [p.get_text() for p in batch]
2521
 
@@ -2663,6 +2732,7 @@ class LLMFormatTransplanter:
2663
  styleguide_out: Optional[Path] = None,
2664
  llm_mode: str = "both",
2665
  user_style_overrides: Optional[Dict[str, str]] = None,
 
2666
  ) -> Optional[Path]:
2667
  """
2668
  Returns the path to the saved styleguide if styleguide_out was set,
@@ -2675,6 +2745,8 @@ class LLMFormatTransplanter:
2675
  logger.info(" Output : %s", output_path)
2676
  logger.info(" Provider : %s / %s", llm_config.provider.value, llm_config.model)
2677
  logger.info(" LLM mode : %s", llm_mode)
 
 
2678
  logger.info(" Batch size : %d Context chars: %d",
2679
  llm_config.para_batch_size, llm_config.blueprint_context_chars)
2680
  logger.info("═" * 60)
@@ -2712,6 +2784,19 @@ class LLMFormatTransplanter:
2712
  extractor = ContentExtractor()
2713
  body_elements, footnotes = extractor.extract(src_doc)
2714
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2715
  # ── Phase 2-LLM: LLM content formatting ───────────────────────
2716
  formatter = LLMContentFormatter(client)
2717
  llm_para_map: Dict[int, str] = {}
@@ -2841,6 +2926,8 @@ Debug tips:
2841
  help="Blueprint chars to send for styleguide gen (default: 40000)")
2842
  llm_group.add_argument("--llm-batch", type=int, default=15, metavar="N",
2843
  help="Source paragraphs per LLM batch (default: 15)")
 
 
2844
 
2845
  args = parser.parse_args()
2846
 
@@ -2889,6 +2976,7 @@ Debug tips:
2889
  styleguide_out=sg_out,
2890
  llm_mode=args.llm_mode,
2891
  user_style_overrides=overrides,
 
2892
  )
2893
  except Exception as exc:
2894
  logger.error("Fatal error: %s", exc, exc_info=True)
 
22
  import asyncio
23
  import copy
24
  import logging
25
+ import os
26
  import re
27
  import shutil
28
  import sys
 
94
  )
95
  logger = logging.getLogger("FormatTransplant")
96
 
97
def load_dotenv(path: Optional[Path] = None) -> None:
    """Minimal .env loader (no third-party dependency).

    Reads KEY=VALUE lines from *path* (default: ``./.env``), skipping blank
    lines and ``#`` comments. Values may be wrapped in single or double
    quotes, and keys may carry a shell-style leading ``export `` prefix.

    Variables already present in the process environment are NOT
    overwritten — the real environment takes precedence over the file,
    matching the conventional python-dotenv default (override=False).
    Any I/O or parse failure is logged and ignored (best effort).
    """
    env_path = path or Path(".env")
    if not env_path.exists():
        return
    try:
        with open(env_path, "r", encoding="utf-8") as f:
            for raw in f:
                line = raw.strip()
                # Skip blanks and full-line comments.
                if not line or line.startswith("#"):
                    continue
                if "=" not in line:
                    continue
                key, _, value = line.partition("=")
                key = key.strip()
                # Tolerate shell-style "export KEY=VALUE" lines.
                if key.startswith("export "):
                    key = key[len("export "):].strip()
                # Strip surrounding quotes if present.
                value = value.strip().strip('"').strip("'")
                if key:
                    # Do not clobber variables already set in the environment.
                    os.environ.setdefault(key, value)
    except Exception as e:  # best effort: a malformed .env must not abort the run
        logger.warning(f"Failed to load .env: {e}")


# Load environment early so provider API keys from .env are visible when
# provider defaults are resolved below.
load_dotenv()
118
+
119
  # ============================================================================
120
  # SEMANTIC CLASSIFICATION CONSTANTS
121
  # ============================================================================
 
314
 
315
  # Per-provider defaults β€” base_url=None means the provider uses its own SDK
316
  PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
317
+ "openai": {"base_url": "https://api.openai.com/v1", "env": "OPENAI_API_KEY", "model": "gpt-4o", "batch_size": 15},
318
+ "anthropic": {"base_url": None, "env": "ANTHROPIC_API_KEY", "model": "claude-3-5-sonnet-20241022", "batch_size": 15},
319
+ "groq": {"base_url": "https://api.groq.com/openai/v1", "env": "GROQ_API_KEY", "model": "llama-3.3-70b-versatile", "batch_size": 5},
320
+ "nebius": {"base_url": "https://api.studio.nebius.com/v1", "env": "NEBIUS_API_KEY", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "batch_size": 15},
321
+ "scaleway": {"base_url": "https://api.scaleway.ai/v1", "env": "SCW_SECRET_KEY", "model": "llama-3.3-70b-instruct", "batch_size": 15},
322
+ "openrouter": {"base_url": "https://openrouter.ai/api/v1", "env": "OPENROUTER_API_KEY","model": "meta-llama/llama-3.3-70b-instruct", "batch_size": 15},
323
+ "mistral": {"base_url": "https://api.mistral.ai/v1", "env": "MISTRAL_API_KEY", "model": "mistral-large-latest", "batch_size": 15},
324
+ "poe": {"base_url": None, "env": "POE_API_KEY", "model": "Claude-3.7-Sonnet", "batch_size": 15},
325
+ "ollama": {"base_url": "http://localhost:11434/api", "env": "OLLAMA_API_KEY", "model": "llama3.2", "batch_size": 15},
326
  }
327
 
328
 
 
336
  max_tokens: int = 4096
337
  temperature: float = 0.1 # low for deterministic formatting
338
  # How many chars of blueprint text to send for styleguide generation (~25 K tokens)
339
+ blueprint_context_chars: int = 100_000
340
  # Source paragraphs per LLM batch
341
  para_batch_size: int = 15
342
  # Retry settings
343
+ max_retries: int = 5
344
  retry_delay_s: float = 5.0
345
 
346
 
 
369
  model=resolved_model or defaults.get("model", ""),
370
  api_key=resolved_key,
371
  base_url=defaults.get("base_url"),
372
+ para_batch_size=defaults.get("batch_size", 15),
373
  )
374
 
375
 
 
1793
  for r in list(p_elem.findall(qn("w:r"))):
1794
  if r not in marker_runs:
1795
  p_elem.remove(r)
1796
+
1797
+ # Apply blueprint marker formatting to the preserved marker runs
1798
+ for r_marker in marker_runs:
1799
+ self._apply_fn_ref_style(r_marker)
1800
+
1801
  for rd in parse_md_runs(llm_text):
1802
  if not rd.text:
1803
  continue
 
1819
  "[BUILD] LLM footnote id=%s para %d: '%.50s'",
1820
  fd.footnote_id, para_idx, llm_text,
1821
  )
1822
+ else:
1823
+ # ── Original run-clean path ────────────────────────
1824
+ # Apply blueprint style to <w:footnoteRef> marker run;
1825
+ # strip source aesthetics from all other runs.
1826
+ for r_elem in p_elem.findall(qn("w:r")):
1827
+ fn_ref_check = _xpath(r_elem, ".//w:footnoteRef")
1828
+ if fn_ref_check:
1829
+ self._apply_fn_ref_style(r_elem)
1830
+ continue
1831
+ rPr = r_elem.find(qn("w:rPr"))
1832
+ if rPr is not None:
1833
+ for child in [c for c in rPr if c.tag not in KEEP_RPR_TAGS]:
1834
+ rPr.remove(child)
 
1835
 
1836
  # Ensure separator after marker matches blueprint convention
1837
+ # (Re-applied even for LLM text to ensure tab preservation)
1838
  self._normalize_fn_separator(p_elem)
1839
 
1840
  fn_root.append(fn_xml)
 
2022
  else:
2023
  return self._openai_compat(system, user, config)
2024
  except Exception as exc:
2025
+ is_rate_limit = False
2026
+ header_delay = None
2027
+
2028
+ # Try to extract retry-after from common SDK exceptions
2029
+ exc_str = str(exc).lower()
2030
+ if "429" in exc_str or "rate limit" in exc_str:
2031
+ is_rate_limit = True
2032
+
2033
+ # OpenAI / Groq / OpenRouter often put it in headers
2034
+ if hasattr(exc, "response") and hasattr(exc.response, "headers"):
2035
+ retry_after = exc.response.headers.get("retry-after")
2036
+ if retry_after and retry_after.isdigit():
2037
+ header_delay = float(retry_after)
2038
+
2039
+ # Exponential backoff: retry_delay * (2 ^ (attempt-1))
2040
+ delay = config.retry_delay_s * (2 ** (attempt - 1))
2041
+
2042
+ if header_delay:
2043
+ delay = max(delay, header_delay + 1.0) # Add 1s buffer
2044
+ elif is_rate_limit:
2045
+ delay *= 2 # Extra patience for rate limits
2046
+
2047
+ if is_rate_limit:
2048
+ logger.warning(
2049
+ "[LLM] %s rate limited (429). Waiting %.1f seconds... (Attempt %d/%d)",
2050
+ config.provider.value, delay, attempt, config.max_retries
2051
+ )
2052
+ else:
2053
+ logger.warning(
2054
+ "[LLM] %s attempt %d/%d failed: %s",
2055
+ config.provider.value, attempt, config.max_retries, exc,
2056
+ )
2057
+
2058
  if attempt < config.max_retries:
2059
+ time.sleep(delay)
2060
  raise RuntimeError(
2061
  f"[LLM] All {config.max_retries} attempts failed for {config.provider.value}"
2062
  )
 
2230
  text = resp.json().get("response", "")
2231
  logger.debug("[LLM] Response: %d chars", len(text))
2232
  return text
 
 
 
2233
 
2234
  # ── Anthropic ─────────────────────────────────────────────────────
2235
  def _anthropic(self, system: str, user: str, config: LLMConfig) -> str:
 
2471
  # ============================================================================
2472
 
2473
  _FMT_SYSTEM = """\
2474
+ You are a scholarly editor applying a strict editorial style guide to existing text.
2475
+ Your ONLY task is to apply inline formatting (bold/italic) to the text provided.
2476
+
2477
+ CRITICAL CONSTRAINTS:
2478
+ 1. DO NOT translate the text.
2479
+ 2. DO NOT paraphrase or summarize the text.
2480
+ 3. DO NOT add any introductory remarks, commentary, or conclusions.
2481
+ 4. DO NOT change a single word or punctuation mark of the original text.
2482
+ 5. REPRODUCE the text EXACTLY as given, only adding Markdown markers for formatting.
2483
 
2484
  Use Markdown for inline formatting:
2485
  *italic* for italic text
2486
  **bold** for bold text
2487
  ***bold italic*** for bold + italic
2488
+ No other Markdown (no # headings, no lists). Return plain paragraph text with inline markers only.
2489
+ Return EXACTLY one formatted response for each input paragraph.
2490
  """
2491
 
2492
  _PARA_USER_TMPL = """\
 
2577
  )
2578
 
2579
  for batch_start in range(0, len(to_format), config.para_batch_size):
2580
+ # Inter-batch delay to stay under rate limits
2581
+ if batch_start > 0:
2582
+ batch_delay = 2.0 # 2 seconds between batches
2583
+ if config.provider == LLMProvider.GROQ:
2584
+ batch_delay = 15.0 # Extra delay for Groq (very tight limits)
2585
+ logger.info("[LLM-FMT] Inter-batch delay: %.1fs...", batch_delay)
2586
+ time.sleep(batch_delay)
2587
+
2588
  batch = to_format[batch_start: batch_start + config.para_batch_size]
2589
  texts = [p.get_text() for p in batch]
2590
 
 
2732
  styleguide_out: Optional[Path] = None,
2733
  llm_mode: str = "both",
2734
  user_style_overrides: Optional[Dict[str, str]] = None,
2735
+ debug_limit: Optional[int] = None,
2736
  ) -> Optional[Path]:
2737
  """
2738
  Returns the path to the saved styleguide if styleguide_out was set,
 
2745
  logger.info(" Output : %s", output_path)
2746
  logger.info(" Provider : %s / %s", llm_config.provider.value, llm_config.model)
2747
  logger.info(" LLM mode : %s", llm_mode)
2748
+ if debug_limit:
2749
+ logger.info(" Debug limit: %d paragraphs", debug_limit)
2750
  logger.info(" Batch size : %d Context chars: %d",
2751
  llm_config.para_batch_size, llm_config.blueprint_context_chars)
2752
  logger.info("═" * 60)
 
2784
  extractor = ContentExtractor()
2785
  body_elements, footnotes = extractor.extract(src_doc)
2786
 
2787
+ # Apply debug limit if requested
2788
+ if debug_limit:
2789
+ count = 0
2790
+ limited_body = []
2791
+ for e in body_elements:
2792
+ limited_body.append(e)
2793
+ if e.semantic_class != "table":
2794
+ count += 1
2795
+ if count >= debug_limit:
2796
+ break
2797
+ body_elements = limited_body
2798
+ logger.info("Debug limit applied: only processing first %d body paragraphs.", debug_limit)
2799
+
2800
  # ── Phase 2-LLM: LLM content formatting ───────────────────────
2801
  formatter = LLMContentFormatter(client)
2802
  llm_para_map: Dict[int, str] = {}
 
2926
  help="Blueprint chars to send for styleguide gen (default: 40000)")
2927
  llm_group.add_argument("--llm-batch", type=int, default=15, metavar="N",
2928
  help="Source paragraphs per LLM batch (default: 15)")
2929
+ llm_group.add_argument("--debug-limit", type=int, default=None, metavar="N",
2930
+ help="Process only first N paragraphs (for testing)")
2931
 
2932
  args = parser.parse_args()
2933
 
 
2976
  styleguide_out=sg_out,
2977
  llm_mode=args.llm_mode,
2978
  user_style_overrides=overrides,
2979
+ debug_limit=args.debug_limit,
2980
  )
2981
  except Exception as exc:
2982
  logger.error("Fatal error: %s", exc, exc_info=True)