cstr committed on
Commit
2560199
Β·
verified Β·
1 Parent(s): 4f54832

Update format_transplant.py

Browse files
Files changed (1) hide show
  1. format_transplant.py +126 -38
format_transplant.py CHANGED
@@ -22,6 +22,7 @@ import argparse
22
  import asyncio
23
  import copy
24
  import logging
 
25
  import re
26
  import shutil
27
  import sys
@@ -93,6 +94,28 @@ logging.basicConfig(
93
  )
94
  logger = logging.getLogger("FormatTransplant")
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  # ============================================================================
97
  # SEMANTIC CLASSIFICATION CONSTANTS
98
  # ============================================================================
@@ -291,15 +314,15 @@ class LLMProvider(Enum):
291
 
292
  # Per-provider defaults β€” base_url=None means the provider uses its own SDK
293
  PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
294
- "openai": {"base_url": "https://api.openai.com/v1", "env": "OPENAI_API_KEY", "model": "gpt-4o"},
295
- "anthropic": {"base_url": None, "env": "ANTHROPIC_API_KEY", "model": "claude-3-5-sonnet-20241022"},
296
- "groq": {"base_url": "https://api.groq.com/openai/v1", "env": "GROQ_API_KEY", "model": "llama-3.3-70b-versatile"},
297
- "nebius": {"base_url": "https://api.studio.nebius.com/v1", "env": "NEBIUS_API_KEY", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct"},
298
- "scaleway": {"base_url": "https://api.scaleway.ai/v1", "env": "SCW_SECRET_KEY", "model": "llama-3.3-70b-instruct"},
299
- "openrouter": {"base_url": "https://openrouter.ai/api/v1", "env": "OPENROUTER_API_KEY","model": "meta-llama/llama-3.3-70b-instruct"},
300
- "mistral": {"base_url": "https://api.mistral.ai/v1", "env": "MISTRAL_API_KEY", "model": "mistral-large-latest"},
301
- "poe": {"base_url": None, "env": "POE_API_KEY", "model": "Claude-3.7-Sonnet"},
302
- "ollama": {"base_url": "http://localhost:11434/api", "env": "OLLAMA_API_KEY", "model": "llama3.2"},
303
  }
304
 
305
 
@@ -313,11 +336,11 @@ class LLMConfig:
313
  max_tokens: int = 4096
314
  temperature: float = 0.1 # low for deterministic formatting
315
  # How many chars of blueprint text to send for styleguide generation (~10 K tokens)
316
- blueprint_context_chars: int = 40_000
317
  # Source paragraphs per LLM batch
318
  para_batch_size: int = 15
319
  # Retry settings
320
- max_retries: int = 3
321
  retry_delay_s: float = 5.0
322
 
323
 
@@ -346,6 +369,7 @@ def llm_config_from_args(
346
  model=resolved_model or defaults.get("model", ""),
347
  api_key=resolved_key,
348
  base_url=defaults.get("base_url"),
 
349
  )
350
 
351
 
@@ -1769,6 +1793,11 @@ class DocumentBuilder:
1769
  for r in list(p_elem.findall(qn("w:r"))):
1770
  if r not in marker_runs:
1771
  p_elem.remove(r)
 
 
 
 
 
1772
  for rd in parse_md_runs(llm_text):
1773
  if not rd.text:
1774
  continue
@@ -1790,22 +1819,22 @@ class DocumentBuilder:
1790
  "[BUILD] LLM footnote id=%s para %d: '%.50s'",
1791
  fd.footnote_id, para_idx, llm_text,
1792
  )
1793
- continue # skip the original run-cleaning below
1794
-
1795
- # ── Original run-clean path ────────────────────────
1796
- # Apply blueprint style to <w:footnoteRef> marker run;
1797
- # strip source aesthetics from all other runs.
1798
- for r_elem in p_elem.findall(qn("w:r")):
1799
- fn_ref_check = _xpath(r_elem, ".//w:footnoteRef")
1800
- if fn_ref_check:
1801
- self._apply_fn_ref_style(r_elem)
1802
- continue
1803
- rPr = r_elem.find(qn("w:rPr"))
1804
- if rPr is not None:
1805
- for child in [c for c in rPr if c.tag not in KEEP_RPR_TAGS]:
1806
- rPr.remove(child)
1807
 
1808
  # Ensure separator after marker matches blueprint convention
 
1809
  self._normalize_fn_separator(p_elem)
1810
 
1811
  fn_root.append(fn_xml)
@@ -1993,12 +2022,41 @@ class MultiProviderLLMClient:
1993
  else:
1994
  return self._openai_compat(system, user, config)
1995
  except Exception as exc:
1996
- logger.warning(
1997
- "[LLM] %s attempt %d/%d failed: %s",
1998
- config.provider.value, attempt, config.max_retries, exc,
1999
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2000
  if attempt < config.max_retries:
2001
- time.sleep(config.retry_delay_s)
2002
  raise RuntimeError(
2003
  f"[LLM] All {config.max_retries} attempts failed for {config.provider.value}"
2004
  )
@@ -2172,9 +2230,6 @@ class MultiProviderLLMClient:
2172
  text = resp.json().get("response", "")
2173
  logger.debug("[LLM] Response: %d chars", len(text))
2174
  return text
2175
- text = resp.choices[0].message.content or ""
2176
- logger.debug("[LLM] Response: %d chars", len(text))
2177
- return text
2178
 
2179
  # ── Anthropic ─────────────────────────────────────────────────────
2180
  def _anthropic(self, system: str, user: str, config: LLMConfig) -> str:
@@ -2416,16 +2471,22 @@ def parse_md_runs(text: str) -> List["RunData"]:
2416
  # ============================================================================
2417
 
2418
  _FMT_SYSTEM = """\
2419
- You are a scholarly editor applying a strict style guide to format paragraphs.
2420
- Follow the style guide exactly. Do not paraphrase, translate, or alter any wording.
2421
- Only apply formatting as the guide specifies.
 
 
 
 
 
 
2422
 
2423
  Use Markdown for inline formatting:
2424
  *italic* for italic text
2425
  **bold** for bold text
2426
  ***bold italic*** for bold + italic
2427
- No other Markdown. Return plain paragraph text with inline markers only.
2428
- No extra commentary, no numbering, no blank lines within a paragraph.
2429
  """
2430
 
2431
  _PARA_USER_TMPL = """\
@@ -2516,6 +2577,14 @@ class LLMContentFormatter:
2516
  )
2517
 
2518
  for batch_start in range(0, len(to_format), config.para_batch_size):
 
 
 
 
 
 
 
 
2519
  batch = to_format[batch_start: batch_start + config.para_batch_size]
2520
  texts = [p.get_text() for p in batch]
2521
 
@@ -2663,6 +2732,7 @@ class LLMFormatTransplanter:
2663
  styleguide_out: Optional[Path] = None,
2664
  llm_mode: str = "both",
2665
  user_style_overrides: Optional[Dict[str, str]] = None,
 
2666
  ) -> Optional[Path]:
2667
  """
2668
  Returns the path to the saved styleguide if styleguide_out was set,
@@ -2675,6 +2745,8 @@ class LLMFormatTransplanter:
2675
  logger.info(" Output : %s", output_path)
2676
  logger.info(" Provider : %s / %s", llm_config.provider.value, llm_config.model)
2677
  logger.info(" LLM mode : %s", llm_mode)
 
 
2678
  logger.info(" Batch size : %d Context chars: %d",
2679
  llm_config.para_batch_size, llm_config.blueprint_context_chars)
2680
  logger.info("═" * 60)
@@ -2712,6 +2784,19 @@ class LLMFormatTransplanter:
2712
  extractor = ContentExtractor()
2713
  body_elements, footnotes = extractor.extract(src_doc)
2714
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2715
  # ── Phase 2-LLM: LLM content formatting ───────────────────────
2716
  formatter = LLMContentFormatter(client)
2717
  llm_para_map: Dict[int, str] = {}
@@ -2841,6 +2926,8 @@ Debug tips:
2841
  help="Blueprint chars to send for styleguide gen (default: 40000)")
2842
  llm_group.add_argument("--llm-batch", type=int, default=15, metavar="N",
2843
  help="Source paragraphs per LLM batch (default: 15)")
 
 
2844
 
2845
  args = parser.parse_args()
2846
 
@@ -2889,6 +2976,7 @@ Debug tips:
2889
  styleguide_out=sg_out,
2890
  llm_mode=args.llm_mode,
2891
  user_style_overrides=overrides,
 
2892
  )
2893
  except Exception as exc:
2894
  logger.error("Fatal error: %s", exc, exc_info=True)
 
22
  import asyncio
23
  import copy
24
  import logging
25
+ import os
26
  import re
27
  import shutil
28
  import sys
 
94
  )
95
  logger = logging.getLogger("FormatTransplant")
96
 
97
def load_dotenv(path: Optional[Path] = None) -> None:
    """Minimal .env loader (no third-party dependency).

    Reads KEY=VALUE lines from *path* (default: ``./.env``), skipping blank
    lines and ``#`` comments. Values may be wrapped in single or double
    quotes, and keys may carry a shell-style leading ``export `` prefix.

    Variables already present in the process environment are NOT
    overwritten — the real environment takes precedence over the file,
    matching the conventional python-dotenv default (override=False).
    Any I/O or parse failure is logged and ignored (best effort).
    """
    env_path = path or Path(".env")
    if not env_path.exists():
        return
    try:
        with open(env_path, "r", encoding="utf-8") as f:
            for raw in f:
                line = raw.strip()
                # Skip blanks and full-line comments.
                if not line or line.startswith("#"):
                    continue
                if "=" not in line:
                    continue
                key, _, value = line.partition("=")
                key = key.strip()
                # Tolerate shell-style "export KEY=VALUE" lines.
                if key.startswith("export "):
                    key = key[len("export "):].strip()
                # Strip surrounding quotes if present.
                value = value.strip().strip('"').strip("'")
                if key:
                    # Do not clobber variables already set in the environment.
                    os.environ.setdefault(key, value)
    except Exception as e:  # best effort: a malformed .env must not abort the run
        logger.warning(f"Failed to load .env: {e}")


# Load environment early so provider API keys from .env are visible when
# provider defaults are resolved below.
load_dotenv()
118
+
119
  # ============================================================================
120
  # SEMANTIC CLASSIFICATION CONSTANTS
121
  # ============================================================================
 
314
 
315
  # Per-provider defaults β€” base_url=None means the provider uses its own SDK
316
  PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
317
+ "openai": {"base_url": "https://api.openai.com/v1", "env": "OPENAI_API_KEY", "model": "gpt-4o", "batch_size": 15},
318
+ "anthropic": {"base_url": None, "env": "ANTHROPIC_API_KEY", "model": "claude-3-5-sonnet-20241022", "batch_size": 15},
319
+ "groq": {"base_url": "https://api.groq.com/openai/v1", "env": "GROQ_API_KEY", "model": "llama-3.3-70b-versatile", "batch_size": 5},
320
+ "nebius": {"base_url": "https://api.studio.nebius.com/v1", "env": "NEBIUS_API_KEY", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "batch_size": 15},
321
+ "scaleway": {"base_url": "https://api.scaleway.ai/v1", "env": "SCW_SECRET_KEY", "model": "llama-3.3-70b-instruct", "batch_size": 15},
322
+ "openrouter": {"base_url": "https://openrouter.ai/api/v1", "env": "OPENROUTER_API_KEY","model": "meta-llama/llama-3.3-70b-instruct", "batch_size": 15},
323
+ "mistral": {"base_url": "https://api.mistral.ai/v1", "env": "MISTRAL_API_KEY", "model": "mistral-large-latest", "batch_size": 15},
324
+ "poe": {"base_url": None, "env": "POE_API_KEY", "model": "Claude-3.7-Sonnet", "batch_size": 15},
325
+ "ollama": {"base_url": "http://localhost:11434/api", "env": "OLLAMA_API_KEY", "model": "llama3.2", "batch_size": 15},
326
  }
327
 
328
 
 
336
  max_tokens: int = 4096
337
  temperature: float = 0.1 # low for deterministic formatting
338
  # How many chars of blueprint text to send for styleguide generation (~25 K tokens)
339
+ blueprint_context_chars: int = 100_000
340
  # Source paragraphs per LLM batch
341
  para_batch_size: int = 15
342
  # Retry settings
343
+ max_retries: int = 5
344
  retry_delay_s: float = 5.0
345
 
346
 
 
369
  model=resolved_model or defaults.get("model", ""),
370
  api_key=resolved_key,
371
  base_url=defaults.get("base_url"),
372
+ para_batch_size=defaults.get("batch_size", 15),
373
  )
374
 
375
 
 
1793
  for r in list(p_elem.findall(qn("w:r"))):
1794
  if r not in marker_runs:
1795
  p_elem.remove(r)
1796
+
1797
+ # Apply blueprint marker formatting to the preserved marker runs
1798
+ for r_marker in marker_runs:
1799
+ self._apply_fn_ref_style(r_marker)
1800
+
1801
  for rd in parse_md_runs(llm_text):
1802
  if not rd.text:
1803
  continue
 
1819
  "[BUILD] LLM footnote id=%s para %d: '%.50s'",
1820
  fd.footnote_id, para_idx, llm_text,
1821
  )
1822
+ else:
1823
+ # ── Original run-clean path ────────────────────────
1824
+ # Apply blueprint style to <w:footnoteRef> marker run;
1825
+ # strip source aesthetics from all other runs.
1826
+ for r_elem in p_elem.findall(qn("w:r")):
1827
+ fn_ref_check = _xpath(r_elem, ".//w:footnoteRef")
1828
+ if fn_ref_check:
1829
+ self._apply_fn_ref_style(r_elem)
1830
+ continue
1831
+ rPr = r_elem.find(qn("w:rPr"))
1832
+ if rPr is not None:
1833
+ for child in [c for c in rPr if c.tag not in KEEP_RPR_TAGS]:
1834
+ rPr.remove(child)
 
1835
 
1836
  # Ensure separator after marker matches blueprint convention
1837
+ # (Re-applied even for LLM text to ensure tab preservation)
1838
  self._normalize_fn_separator(p_elem)
1839
 
1840
  fn_root.append(fn_xml)
 
2022
  else:
2023
  return self._openai_compat(system, user, config)
2024
  except Exception as exc:
2025
+ is_rate_limit = False
2026
+ header_delay = None
2027
+
2028
+ # Try to extract retry-after from common SDK exceptions
2029
+ exc_str = str(exc).lower()
2030
+ if "429" in exc_str or "rate limit" in exc_str:
2031
+ is_rate_limit = True
2032
+
2033
+ # OpenAI / Groq / OpenRouter often put it in headers
2034
+ if hasattr(exc, "response") and hasattr(exc.response, "headers"):
2035
+ retry_after = exc.response.headers.get("retry-after")
2036
+ if retry_after and retry_after.isdigit():
2037
+ header_delay = float(retry_after)
2038
+
2039
+ # Exponential backoff: retry_delay * (2 ^ (attempt-1))
2040
+ delay = config.retry_delay_s * (2 ** (attempt - 1))
2041
+
2042
+ if header_delay:
2043
+ delay = max(delay, header_delay + 1.0) # Add 1s buffer
2044
+ elif is_rate_limit:
2045
+ delay *= 2 # Extra patience for rate limits
2046
+
2047
+ if is_rate_limit:
2048
+ logger.warning(
2049
+ "[LLM] %s rate limited (429). Waiting %.1f seconds... (Attempt %d/%d)",
2050
+ config.provider.value, delay, attempt, config.max_retries
2051
+ )
2052
+ else:
2053
+ logger.warning(
2054
+ "[LLM] %s attempt %d/%d failed: %s",
2055
+ config.provider.value, attempt, config.max_retries, exc,
2056
+ )
2057
+
2058
  if attempt < config.max_retries:
2059
+ time.sleep(delay)
2060
  raise RuntimeError(
2061
  f"[LLM] All {config.max_retries} attempts failed for {config.provider.value}"
2062
  )
 
2230
  text = resp.json().get("response", "")
2231
  logger.debug("[LLM] Response: %d chars", len(text))
2232
  return text
 
 
 
2233
 
2234
  # ── Anthropic ─────────────────────────────────────────────────────
2235
  def _anthropic(self, system: str, user: str, config: LLMConfig) -> str:
 
2471
  # ============================================================================
2472
 
2473
  _FMT_SYSTEM = """\
2474
+ You are a scholarly editor applying a strict editorial style guide to existing text.
2475
+ Your ONLY task is to apply inline formatting (bold/italic) to the text provided.
2476
+
2477
+ CRITICAL CONSTRAINTS:
2478
+ 1. DO NOT translate the text.
2479
+ 2. DO NOT paraphrase or summarize the text.
2480
+ 3. DO NOT add any introductory remarks, commentary, or conclusions.
2481
+ 4. DO NOT change a single word or punctuation mark of the original text.
2482
+ 5. REPRODUCE the text EXACTLY as given, only adding Markdown markers for formatting.
2483
 
2484
  Use Markdown for inline formatting:
2485
  *italic* for italic text
2486
  **bold** for bold text
2487
  ***bold italic*** for bold + italic
2488
+ No other Markdown (no # headings, no lists). Return plain paragraph text with inline markers only.
2489
+ Return EXACTLY one formatted response for each input paragraph.
2490
  """
2491
 
2492
  _PARA_USER_TMPL = """\
 
2577
  )
2578
 
2579
  for batch_start in range(0, len(to_format), config.para_batch_size):
2580
+ # Inter-batch delay to stay under rate limits
2581
+ if batch_start > 0:
2582
+ batch_delay = 2.0 # 2 seconds between batches
2583
+ if config.provider == LLMProvider.GROQ:
2584
+ batch_delay = 15.0 # Extra delay for Groq (very tight limits)
2585
+ logger.info("[LLM-FMT] Inter-batch delay: %.1fs...", batch_delay)
2586
+ time.sleep(batch_delay)
2587
+
2588
  batch = to_format[batch_start: batch_start + config.para_batch_size]
2589
  texts = [p.get_text() for p in batch]
2590
 
 
2732
  styleguide_out: Optional[Path] = None,
2733
  llm_mode: str = "both",
2734
  user_style_overrides: Optional[Dict[str, str]] = None,
2735
+ debug_limit: Optional[int] = None,
2736
  ) -> Optional[Path]:
2737
  """
2738
  Returns the path to the saved styleguide if styleguide_out was set,
 
2745
  logger.info(" Output : %s", output_path)
2746
  logger.info(" Provider : %s / %s", llm_config.provider.value, llm_config.model)
2747
  logger.info(" LLM mode : %s", llm_mode)
2748
+ if debug_limit:
2749
+ logger.info(" Debug limit: %d paragraphs", debug_limit)
2750
  logger.info(" Batch size : %d Context chars: %d",
2751
  llm_config.para_batch_size, llm_config.blueprint_context_chars)
2752
  logger.info("═" * 60)
 
2784
  extractor = ContentExtractor()
2785
  body_elements, footnotes = extractor.extract(src_doc)
2786
 
2787
+ # Apply debug limit if requested
2788
+ if debug_limit:
2789
+ count = 0
2790
+ limited_body = []
2791
+ for e in body_elements:
2792
+ limited_body.append(e)
2793
+ if e.semantic_class != "table":
2794
+ count += 1
2795
+ if count >= debug_limit:
2796
+ break
2797
+ body_elements = limited_body
2798
+ logger.info("Debug limit applied: only processing first %d body paragraphs.", debug_limit)
2799
+
2800
  # ── Phase 2-LLM: LLM content formatting ───────────────────────
2801
  formatter = LLMContentFormatter(client)
2802
  llm_para_map: Dict[int, str] = {}
 
2926
  help="Blueprint chars to send for styleguide gen (default: 40000)")
2927
  llm_group.add_argument("--llm-batch", type=int, default=15, metavar="N",
2928
  help="Source paragraphs per LLM batch (default: 15)")
2929
+ llm_group.add_argument("--debug-limit", type=int, default=None, metavar="N",
2930
+ help="Process only first N paragraphs (for testing)")
2931
 
2932
  args = parser.parse_args()
2933
 
 
2976
  styleguide_out=sg_out,
2977
  llm_mode=args.llm_mode,
2978
  user_style_overrides=overrides,
2979
+ debug_limit=args.debug_limit,
2980
  )
2981
  except Exception as exc:
2982
  logger.error("Fatal error: %s", exc, exc_info=True)