cstr committed on
Commit
6835ea6
·
verified ·
1 Parent(s): bec3dbc

Update format_transplant.py

Browse files
Files changed (1) hide show
  1. format_transplant.py +234 -124
format_transplant.py CHANGED
@@ -313,16 +313,71 @@ class LLMProvider(Enum):
313
 
314
 
315
  # Per-provider defaults β€” base_url=None means the provider uses its own SDK
 
316
  PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
317
- "openai": {"base_url": "https://api.openai.com/v1", "env": "OPENAI_API_KEY", "model": "gpt-4o", "batch_size": 15},
318
- "anthropic": {"base_url": None, "env": "ANTHROPIC_API_KEY", "model": "claude-3-5-sonnet-20241022", "batch_size": 15},
319
- "groq": {"base_url": "https://api.groq.com/openai/v1", "env": "GROQ_API_KEY", "model": "llama-3.3-70b-versatile", "batch_size": 5},
320
- "nebius": {"base_url": "https://api.studio.nebius.com/v1", "env": "NEBIUS_API_KEY", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "batch_size": 15},
321
- "scaleway": {"base_url": "https://api.scaleway.ai/v1", "env": "SCW_SECRET_KEY", "model": "llama-3.3-70b-instruct", "batch_size": 15},
322
- "openrouter": {"base_url": "https://openrouter.ai/api/v1", "env": "OPENROUTER_API_KEY","model": "meta-llama/llama-3.3-70b-instruct", "batch_size": 15},
323
- "mistral": {"base_url": "https://api.mistral.ai/v1", "env": "MISTRAL_API_KEY", "model": "mistral-large-latest", "batch_size": 15},
324
- "poe": {"base_url": None, "env": "POE_API_KEY", "model": "Claude-3.7-Sonnet", "batch_size": 15},
325
- "ollama": {"base_url": "http://localhost:11434/api", "env": "OLLAMA_API_KEY", "model": "llama3.2", "batch_size": 15},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  }
327
 
328
 
@@ -342,6 +397,7 @@ class LLMConfig:
342
  # Retry settings
343
  max_retries: int = 5
344
  retry_delay_s: float = 5.0
 
345
 
346
 
347
  def llm_config_from_args(
@@ -370,6 +426,7 @@ def llm_config_from_args(
370
  api_key=resolved_key,
371
  base_url=defaults.get("base_url"),
372
  para_batch_size=defaults.get("batch_size", 15),
 
373
  )
374
 
375
 
@@ -2011,54 +2068,72 @@ class MultiProviderLLMClient:
2011
 
2012
  def complete(self, system: str, user: str, config: LLMConfig) -> str:
2013
  """Send a chat completion and return the assistant's text."""
2014
- for attempt in range(1, config.max_retries + 1):
2015
- try:
2016
- if config.provider == LLMProvider.ANTHROPIC:
2017
- return self._anthropic(system, user, config)
2018
- elif config.provider == LLMProvider.POE:
2019
- return self._poe(system, user, config)
2020
- elif config.provider == LLMProvider.OLLAMA:
2021
- return self._ollama(system, user, config)
2022
- else:
2023
- return self._openai_compat(system, user, config)
2024
- except Exception as exc:
2025
- is_rate_limit = False
2026
- header_delay = None
2027
-
2028
- # Try to extract retry-after from common SDK exceptions
2029
- exc_str = str(exc).lower()
2030
- if "429" in exc_str or "rate limit" in exc_str:
2031
- is_rate_limit = True
 
 
 
 
 
 
 
 
2032
 
2033
- # OpenAI / Groq / OpenRouter often put it in headers
2034
- if hasattr(exc, "response") and hasattr(exc.response, "headers"):
2035
- retry_after = exc.response.headers.get("retry-after")
2036
- if retry_after and retry_after.isdigit():
2037
- header_delay = float(retry_after)
2038
-
2039
- # Exponential backoff: retry_delay * (2 ^ (attempt-1))
2040
- delay = config.retry_delay_s * (2 ** (attempt - 1))
2041
-
2042
- if header_delay:
2043
- delay = max(delay, header_delay + 1.0) # Add 1s buffer
2044
- elif is_rate_limit:
2045
- delay *= 2 # Extra patience for rate limits
2046
-
2047
- if is_rate_limit:
2048
- logger.warning(
2049
- "[LLM] %s rate limited (429). Waiting %.1f seconds... (Attempt %d/%d)",
2050
- config.provider.value, delay, attempt, config.max_retries
2051
- )
2052
- else:
2053
- logger.warning(
2054
- "[LLM] %s attempt %d/%d failed: %s",
2055
- config.provider.value, attempt, config.max_retries, exc,
2056
- )
2057
-
2058
- if attempt < config.max_retries:
2059
- time.sleep(delay)
 
 
 
 
 
 
 
 
 
 
2060
  raise RuntimeError(
2061
- f"[LLM] All {config.max_retries} attempts failed for {config.provider.value}"
2062
  )
2063
 
2064
  def get_available_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
@@ -2345,39 +2420,52 @@ def extract_blueprint_text(doc: Document, max_chars: int = 40_000) -> str:
2345
  # ============================================================================
2346
 
2347
  _SG_SYSTEM = """\
2348
- You are an expert scholarly editor analysing a document to derive its editorial style guide.
2349
- Your output will be used as a precise instruction set for reformatting the text of a companion document.
2350
-
2351
- Study every formatting pattern you can observe in the excerpt: how names are treated, how foreign-language
2352
- terms are handled, how citations are formatted, which punctuation conventions are used, how quotation
2353
- marks are chosen, how headings are structured, and any other recurring editorial rule.
2354
-
2355
- Write the style guide as INSTRUCTIONS TO YOURSELF β€” precise, actionable rules in the imperative mood.
 
 
 
 
 
 
 
 
 
 
 
 
2356
  """
2357
 
2358
  _SG_USER_TMPL = """\
2359
- Below is an excerpt from the **blueprint document**. Analyse its editorial conventions carefully.
 
2360
 
2361
- DOCUMENT EXCERPT:
2362
  ──────────────────────────────────────────────────
2363
  {blueprint_text}
2364
  ──────────────────────────────────────────────────
2365
  {extra_section}
2366
- Produce a **STYLE GUIDE** in Markdown. It must cover, at minimum:
2367
-
2368
- 1. **Language & register** β€” formal/informal, which languages appear, multilingual conventions
2369
- 2. **Personal names** β€” italic or not, how introduced (full name on first occurrence, surname thereafter, etc.)
2370
- 3. **Institutional & place names** β€” treatment, abbreviations
2371
- 4. **Foreign-language terms** β€” each language present: italic? quotation marks? transliteration system?
2372
- (e.g. "Arabic terms in DMG transliteration, always italic"; "Latin phrases italic"; "French loan words roman")
2373
- 5. **Inline emphasis** β€” when to use *italic* vs **bold**, what categories of content receive each
2374
- 6. **Quotation marks** β€” which style: "…" vs '…' vs «…» vs β€žβ€¦" β€” and in which contexts
2375
- 7. **Citations in footnotes** β€” author-title format, edition notation, page references
2376
- 8. **Heading conventions** β€” capitalisation, numbering if any
2377
- 9. **Any other observable rule**
2378
-
2379
- Write each rule as a precise instruction: "Always italicise…", "Use β€žβ€¦" for German quotations…", etc.
2380
- Aim for completeness β€” the guide must be self-sufficient for a human editor who has never seen the blueprint.
2381
  """
2382
 
2383
 
@@ -2472,21 +2560,20 @@ def parse_md_runs(text: str) -> List["RunData"]:
2472
 
2473
  _FMT_SYSTEM = """\
2474
  You are a scholarly editor applying a strict editorial style guide to existing text.
2475
- Your ONLY task is to apply inline formatting (bold/italic) to the text provided.
2476
 
2477
- CRITICAL CONSTRAINTS:
2478
- 1. DO NOT translate the text.
2479
- 2. DO NOT paraphrase or summarize the text.
2480
- 3. DO NOT add any introductory remarks, commentary, or conclusions.
2481
- 4. DO NOT change a single word or punctuation mark of the original text.
2482
- 5. REPRODUCE the text EXACTLY as given, only adding Markdown markers for formatting.
2483
 
2484
  Use Markdown for inline formatting:
2485
  *italic* for italic text
2486
  **bold** for bold text
2487
  ***bold italic*** for bold + italic
2488
- No other Markdown (no # headings, no lists). Return plain paragraph text with inline markers only.
2489
- Return EXACTLY one formatted response for each input paragraph.
2490
  """
2491
 
2492
  _PARA_USER_TMPL = """\
@@ -2495,12 +2582,18 @@ STYLE GUIDE:
2495
  {styleguide}
2496
  ──────────────────────────────────────────────────
2497
 
2498
- Apply this style guide to each of the {n} paragraphs below.
2499
- Return EXACTLY {n} formatted paragraphs separated by the line:
2500
- {sep}
2501
- Do NOT number them. Do NOT add any commentary or blank lines between the separator and the next paragraph.
 
 
 
 
 
 
2502
 
2503
- PARAGRAPHS:
2504
  {content}
2505
  """
2506
 
@@ -2510,14 +2603,18 @@ STYLE GUIDE:
2510
  {styleguide}
2511
  ──────────────────────────────────────────────────
2512
 
2513
- Apply this style guide to each of the {n} footnotes below.
2514
- Footnotes often contain citations, names, foreign terms and references β€”
2515
- pay special attention to the citation and name conventions in the style guide.
2516
- Return EXACTLY {n} formatted footnotes separated by the line:
2517
- {sep}
2518
- Do NOT number them. Do NOT add commentary.
 
 
 
 
2519
 
2520
- FOOTNOTES:
2521
  {content}
2522
  """
2523
 
@@ -2576,6 +2673,8 @@ class LLMContentFormatter:
2576
  len(to_format), mode, config.para_batch_size,
2577
  )
2578
 
 
 
2579
  for batch_start in range(0, len(to_format), config.para_batch_size):
2580
  # Inter-batch delay to stay under rate limits
2581
  if batch_start > 0:
@@ -2586,14 +2685,17 @@ class LLMContentFormatter:
2586
  time.sleep(batch_delay)
2587
 
2588
  batch = to_format[batch_start: batch_start + config.para_batch_size]
2589
- texts = [p.get_text() for p in batch]
2590
-
2591
- content = f"\n{_BATCH_SEP}\n".join(texts)
 
 
 
 
2592
  tmpl = _FN_USER_TMPL if mode == "footnote" else _PARA_USER_TMPL
2593
  user_msg = tmpl.format(
2594
  styleguide=styleguide,
2595
  n=len(batch),
2596
- sep=_BATCH_SEP,
2597
  content=content,
2598
  )
2599
 
@@ -2604,40 +2706,48 @@ class LLMContentFormatter:
2604
 
2605
  try:
2606
  response = self.client.complete(_FMT_SYSTEM, user_msg, config)
2607
- parsed = self._parse_response(response, len(batch), texts)
2608
  except Exception as exc:
2609
  logger.error("[LLM-FMT] Batch failed, using originals: %s", exc)
2610
- parsed = texts
2611
 
2612
  for pd, formatted in zip(batch, parsed):
2613
  if formatted.strip():
2614
  result[id(pd)] = formatted
2615
  logger.debug(
2616
- "[LLM-FMT] Para formatted: orig='%.50s' β†’ fmt='%.50s'",
2617
- pd.get_text(), formatted,
2618
  )
2619
 
2620
  return result
2621
 
2622
  # ------------------------------------------------------------------
2623
  @staticmethod
2624
- def _parse_response(response: str, expected: int, originals: List[str]) -> List[str]:
2625
  """
2626
- Split the LLM response on _BATCH_SEP and return exactly `expected` strings.
2627
- Falls back to originals for any missing entries.
2628
  """
2629
- parts = [p.strip() for p in response.split(_BATCH_SEP)]
2630
- parts = [p for p in parts if p] # remove empties
2631
-
2632
- if len(parts) != expected:
2633
- logger.warning(
2634
- "[LLM-FMT] Expected %d parts, got %d β€” padding/truncating",
2635
- expected, len(parts),
2636
- )
2637
- # Pad with originals if too short, truncate if too long
2638
- while len(parts) < expected:
2639
- parts.append(originals[len(parts)])
2640
- return parts[:expected]
 
 
 
 
 
 
 
 
2641
 
2642
 
2643
  # ============================================================================
 
313
 
314
 
315
# Per-provider defaults — base_url=None means the provider uses its own SDK
# Added top 5 fallback models for each provider
# Schema of each entry:
#   base_url   — OpenAI-compatible HTTP endpoint, or None when a native SDK path is used
#   env        — environment variable expected to hold the provider's API key
#   model      — primary model id, tried first by MultiProviderLLMClient.complete()
#   fallbacks  — model ids tried in order after the primary fails
#   batch_size — paragraphs per request; feeds LLMConfig.para_batch_size via llm_config_from_args()
PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
    "openai": {
        "base_url": "https://api.openai.com/v1",
        "env": "OPENAI_API_KEY",
        "model": "gpt-4o",
        "fallbacks": ["gpt-4o-2024-08-06", "gpt-4o-mini", "gpt-4-turbo", "gpt-3.5-turbo"],
        "batch_size": 15
    },
    "anthropic": {
        "base_url": None,
        "env": "ANTHROPIC_API_KEY",
        "model": "claude-3-7-sonnet-20250219",
        "fallbacks": ["claude-3-5-sonnet-20241022", "claude-3-5-haiku-20241022", "claude-3-opus-20240229", "claude-2.1"],
        "batch_size": 15
    },
    "groq": {
        # Lower batch size: Groq enforces tighter rate limits than the others.
        # NOTE(review): assumption inferred from the outlier value — confirm.
        "base_url": "https://api.groq.com/openai/v1",
        "env": "GROQ_API_KEY",
        "model": "llama-3.3-70b-versatile",
        "fallbacks": ["llama-3.1-70b-versatile", "llama-3.1-8b-instant", "mixtral-8x7b-32768", "gemma2-9b-it"],
        "batch_size": 5
    },
    "nebius": {
        "base_url": "https://api.studio.nebius.ai/v1",
        "env": "NEBIUS_API_KEY",
        "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
        "fallbacks": ["meta-llama/Meta-Llama-3.1-8B-Instruct", "meta-llama/Llama-Guard-3-8B"],
        "batch_size": 15
    },
    "scaleway": {
        "base_url": "https://api.scaleway.ai/v1",
        "env": "SCALEWAY_API_KEY",  # Updated to match .env
        "model": "llama-3.3-70b-instruct",
        "fallbacks": ["deepseek-r1-distill-llama-70b", "llama-3.1-8b-instruct", "mistral-nemo-instruct-2407", "pixtral-12b-2409"],
        "batch_size": 15
    },
    "openrouter": {
        "base_url": "https://openrouter.ai/api/v1",
        "env": "OPENROUTER_API_KEY",
        "model": "meta-llama/llama-3.3-70b-instruct",
        "fallbacks": ["anthropic/claude-3.5-sonnet", "google/gemini-pro-1.5", "mistralai/mistral-large", "qwen/qwen-2.5-72b-instruct"],
        "batch_size": 15
    },
    "mistral": {
        "base_url": "https://api.mistral.ai/v1",
        "env": "MISTRAL_API_KEY",
        "model": "mistral-large-latest",
        "fallbacks": ["mistral-medium-latest", "mistral-small-latest", "codestral-latest", "open-mistral-nemo"],
        "batch_size": 15
    },
    "poe": {
        "base_url": None,
        "env": "POE_API_KEY",
        "model": "Claude-3.7-Sonnet",
        "fallbacks": ["Claude-3.5-Sonnet", "GPT-4o", "Claude-3-Opus", "Llama-3.1-405B"],
        "batch_size": 15
    },
    "ollama": {
        # Local Ollama daemon; API key env is unused by a default install.
        # NOTE(review): model tag "2512" looks unusual for a Ministral build —
        # confirm the tag exists in the local Ollama registry.
        "base_url": "http://localhost:11434/api",
        "env": "OLLAMA_API_KEY",
        "model": "ministral-3b-instruct-2512-q4_K_M",
        "fallbacks": ["cas/llama-3.2-3b-instruct:latest", "llama3.2", "mistral", "phi3"],
        "batch_size": 15
    },
}
382
 
383
 
 
397
  # Retry settings
398
  max_retries: int = 5
399
  retry_delay_s: float = 5.0
400
+ fallback_models: List[str] = field(default_factory=list)
401
 
402
 
403
  def llm_config_from_args(
 
426
  api_key=resolved_key,
427
  base_url=defaults.get("base_url"),
428
  para_batch_size=defaults.get("batch_size", 15),
429
+ fallback_models=defaults.get("fallbacks", []),
430
  )
431
 
432
 
 
2068
 
2069
    def complete(self, system: str, user: str, config: LLMConfig) -> str:
        """Send a chat completion and return the assistant's text.

        Tries ``config.model`` first, then each entry of
        ``config.fallback_models`` in order.  Every model gets up to
        ``config.max_retries`` attempts with exponential backoff; a
        "model not found" style error skips straight to the next model.

        Raises:
            RuntimeError: when every model/attempt combination has failed.
        """
        # Candidate models list: primary model followed by fallbacks
        models_to_try = [config.model] + config.fallback_models

        last_exception = None  # remembered only for the final error message

        for model_id in models_to_try:
            # Shallow copy suffices: only .model is overridden below.
            current_config = copy.copy(config)
            current_config.model = model_id

            logger.info("[LLM] %s: Trying model '%s'...", config.provider.value, model_id)

            for attempt in range(1, config.max_retries + 1):
                try:
                    # Dispatch to the provider-specific transport.
                    if config.provider == LLMProvider.ANTHROPIC:
                        return self._anthropic(system, user, current_config)
                    elif config.provider == LLMProvider.POE:
                        return self._poe(system, user, current_config)
                    elif config.provider == LLMProvider.OLLAMA:
                        return self._ollama(system, user, current_config)
                    else:
                        return self._openai_compat(system, user, current_config)
                except Exception as exc:
                    last_exception = exc
                    # SDK-agnostic classification via message substrings;
                    # may occasionally misclassify unusual error texts.
                    exc_str = str(exc).lower()
                    is_rate_limit = "429" in exc_str or "rate limit" in exc_str
                    is_model_not_found = "404" in exc_str or "not found" in exc_str or "does not exist" in exc_str

                    if is_model_not_found:
                        logger.warning("[LLM] %s: Model '%s' not found. Trying next fallback...",
                                       config.provider.value, model_id)
                        break  # Exit attempt loop, try next model

                    # Exponential backoff: retry_delay * (2 ^ (attempt-1))
                    delay = config.retry_delay_s * (2 ** (attempt - 1))
                    header_delay = None

                    # OpenAI / Groq / OpenRouter often put it in headers
                    # NOTE: isdigit() accepts only integral values like "30";
                    # fractional retry-after strings are silently ignored.
                    if hasattr(exc, "response") and hasattr(exc.response, "headers"):
                        retry_after = exc.response.headers.get("retry-after")
                        if retry_after and retry_after.isdigit():
                            header_delay = float(retry_after)

                    if header_delay:
                        delay = max(delay, header_delay + 1.0)  # Add 1s buffer
                    elif is_rate_limit:
                        delay *= 2  # Extra patience for rate limits

                    if is_rate_limit:
                        logger.warning(
                            "[LLM] %s rate limited (429) for model '%s'. Waiting %.1f seconds... (Attempt %d/%d)",
                            config.provider.value, model_id, delay, attempt, config.max_retries
                        )
                    else:
                        logger.warning(
                            "[LLM] %s model '%s' attempt %d/%d failed: %s",
                            config.provider.value, model_id, attempt, config.max_retries, exc,
                        )

                    if attempt < config.max_retries:
                        time.sleep(delay)
                    else:
                        logger.error("[LLM] %s: All retries failed for model '%s'.",
                                     config.provider.value, model_id)

        raise RuntimeError(
            f"[LLM] All models and retries failed for {config.provider.value}. Last error: {last_exception}"
        )
2138
 
2139
  def get_available_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
 
2420
  # ============================================================================
2421
 
2422
  _SG_SYSTEM = """\
2423
+ You are an expert scholarly editor and citation specialist deriving a comprehensive editorial style guide from a document.
2424
+ Your output must be a precise instruction set for reformatting text to match this document's exact standards.
2425
+
2426
+ USER PRIORITY RULE:
2427
+ If the user provides supplementary style information, those rules take ABSOLUTE PRECEDENCE over patterns you observe in the excerpt.
2428
+
2429
+ MANDATORY AREAS OF ANALYSIS:
2430
+ 1. CITATION STYLE: Meticulously analyze footnote citations. Identify patterns for:
2431
+ - Book/article titles (italic? quotes?)
2432
+ - Author names (Full name? Surname? All caps? Roman?)
2433
+ - Volume/Issue/Page notation (S. 12? p. 12? 12-15? 12f?)
2434
+ - Punctuation between components (Commas? Colons? Slashes?)
2435
+ - Repeated citations (Vgl.? See? Ibid.? ebenda?)
2436
+ 2. PUNCTUATION & SYMBOLS: Identify specific choices for:
2437
+ - Quotation marks (Β»...Β«, β€ž...β€œ, "...", '...')
2438
+ - Dashes (β€” em-dash, – en-dash)
2439
+ - Spaces before/after symbols
2440
+ 3. NAMES & TERMS: Identify treatment of personal names, institutional names, and foreign terms.
2441
+
2442
+ Write the style guide as actionable, imperative rules (e.g., "Always use...", "Never italicize...").
2443
  """
2444
 
2445
  _SG_USER_TMPL = """\
2446
+ Below is a comprehensive excerpt from the **blueprint document**, including sampled footnotes.
2447
+ Analyse its editorial conventions with extreme care.
2448
 
2449
+ DOCUMENT EXCERPT (Body & Footnotes):
2450
  ──────────────────────────────────────────────────
2451
  {blueprint_text}
2452
  ──────────────────────────────────────────────────
2453
  {extra_section}
2454
+
2455
+ Produce a **MASTER STYLE GUIDE** in Markdown.
2456
+
2457
+ CRITICAL: Your guide must be detailed enough to handle complex academic citations and specific punctuation (like Β»...Β« quotation marks) without ambiguity.
2458
+
2459
+ Structure your guide:
2460
+ 1. **Absolute User Overrides** (Include any rules from the 'Additional information' section here first)
2461
+ 2. **Language & Register**
2462
+ 3. **Personal & Institutional Names**
2463
+ 4. **Foreign-Language Terms & Transliteration**
2464
+ 5. **Inline Emphasis & Special Symbols** (Meticulously specify quotation marks: Β» vs β€ž vs ")
2465
+ 6. **Footnote & Citation System** (Provide specific templates for books, articles, and repeats)
2466
+ 7. **Heading & Layout Conventions**
2467
+
2468
+ Aim for scholarly perfection.
2469
  """
2470
 
2471
 
 
2560
 
2561
  _FMT_SYSTEM = """\
2562
  You are a scholarly editor applying a strict editorial style guide to existing text.
2563
+ Your task is to re-format the provided text to match the Style Guide's exact conventions.
2564
 
2565
+ CONSTRAINTS:
2566
+ 1. SUBSTANTIVE VERBATIM: Do NOT change the substantive meaning, names, or titles.
2567
+ 2. EDITORIAL RE-FORMATTING: You MUST change punctuation, quotation marks, and citation structure (e.g., brackets vs commas, colons vs spaces) to strictly follow the Style Guide.
2568
+ 3. DO NOT translate, summarize, or paraphrase.
2569
+ 4. DO NOT add any introductory remarks or commentary.
 
2570
 
2571
  Use Markdown for inline formatting:
2572
  *italic* for italic text
2573
  **bold** for bold text
2574
  ***bold italic*** for bold + italic
2575
+ No other Markdown. Return only the re-formatted paragraph text.
2576
+ Return EXACTLY one response for each input paragraph.
2577
  """
2578
 
2579
  _PARA_USER_TMPL = """\
 
2582
  {styleguide}
2583
  ──────────────────────────────────────────────────
2584
 
2585
+ Your task is to re-format {n} separate paragraphs according to the Style Guide.
2586
+ Each paragraph is provided inside indexed tags like [P1]...[/P1].
2587
+
2588
+ INSTRUCTIONS:
2589
+ 1. Process each paragraph individually.
2590
+ 2. You MUST return each re-formatted paragraph inside matching indexed tags, e.g.:
2591
+ [P1]Re-formatted text of first paragraph...[/P1]
2592
+ [P2]Re-formatted text of second paragraph...[/P2]
2593
+ 3. DO NOT merge paragraphs.
2594
+ 4. DO NOT add any commentary or extra text outside the tags.
2595
 
2596
+ PARAGRAPHS TO PROCESS:
2597
  {content}
2598
  """
2599
 
 
2603
  {styleguide}
2604
  ──────────────────────────────────────────────────
2605
 
2606
+ Your task is to re-format {n} separate footnotes according to the Style Guide.
2607
+ Each footnote is provided inside indexed tags like [F1]...[/F1].
2608
+
2609
+ INSTRUCTIONS:
2610
+ 1. Process each footnote individually.
2611
+ 2. You MUST return each re-formatted footnote inside matching indexed tags, e.g.:
2612
+ [F1]Re-formatted text of first footnote...[/F1]
2613
+ [F2]Re-formatted text of second footnote...[/F2]
2614
+ 3. DO NOT merge footnotes.
2615
+ 4. DO NOT add any commentary or extra text outside the tags.
2616
 
2617
+ FOOTNOTES TO PROCESS:
2618
  {content}
2619
  """
2620
 
 
2673
  len(to_format), mode, config.para_batch_size,
2674
  )
2675
 
2676
+ prefix = "P" if mode == "para" else "F"
2677
+
2678
  for batch_start in range(0, len(to_format), config.para_batch_size):
2679
  # Inter-batch delay to stay under rate limits
2680
  if batch_start > 0:
 
2685
  time.sleep(batch_delay)
2686
 
2687
  batch = to_format[batch_start: batch_start + config.para_batch_size]
2688
+
2689
+ # Wrap each paragraph in indexed tags
2690
+ tagged_texts = []
2691
+ for i, p in enumerate(batch, 1):
2692
+ tagged_texts.append(f"[{prefix}{i}]{p.get_text()}[/{prefix}{i}]")
2693
+
2694
+ content = "\n".join(tagged_texts)
2695
  tmpl = _FN_USER_TMPL if mode == "footnote" else _PARA_USER_TMPL
2696
  user_msg = tmpl.format(
2697
  styleguide=styleguide,
2698
  n=len(batch),
 
2699
  content=content,
2700
  )
2701
 
 
2706
 
2707
  try:
2708
  response = self.client.complete(_FMT_SYSTEM, user_msg, config)
2709
+ parsed = self._parse_tagged_response(response, len(batch), [p.get_text() for p in batch], prefix)
2710
  except Exception as exc:
2711
  logger.error("[LLM-FMT] Batch failed, using originals: %s", exc)
2712
+ parsed = [p.get_text() for p in batch]
2713
 
2714
  for pd, formatted in zip(batch, parsed):
2715
  if formatted.strip():
2716
  result[id(pd)] = formatted
2717
  logger.debug(
2718
+ "[LLM-FMT] %s formatted: orig='%.50s' β†’ fmt='%.50s'",
2719
+ mode.capitalize(), pd.get_text(), formatted,
2720
  )
2721
 
2722
  return result
2723
 
2724
  # ------------------------------------------------------------------
2725
  @staticmethod
2726
+ def _parse_tagged_response(response: str, expected: int, originals: List[str], prefix: str) -> List[str]:
2727
  """
2728
+ Extract content from [P1]...[/P1] or [F1]...[/F1] tags.
2729
+ Falls back to originals for any missing or unparseable entries.
2730
  """
2731
+ results = []
2732
+ for i in range(1, expected + 1):
2733
+ tag = f"{prefix}{i}"
2734
+ # Non-greedy match between start and end tags
2735
+ pattern = rf"\[{tag}\](.*?)\[\/{tag}\]"
2736
+ match = re.search(pattern, response, re.DOTALL)
2737
+
2738
+ if match:
2739
+ results.append(match.group(1).strip())
2740
+ else:
2741
+ # Try fallback: just the start tag if the LLM forgot the end tag
2742
+ pattern_fallback = rf"\[{tag}\](.*?)(?=\[{prefix}{i+1}\]|$)"
2743
+ match_fallback = re.search(pattern_fallback, response, re.DOTALL)
2744
+ if match_fallback:
2745
+ results.append(match_fallback.group(1).strip())
2746
+ else:
2747
+ logger.warning("[LLM-FMT] Could not find tag [%s] in response", tag)
2748
+ results.append(originals[i-1])
2749
+
2750
+ return results
2751
 
2752
 
2753
  # ============================================================================