cstr committed on
Commit
8abde45
Β·
verified Β·
1 Parent(s): c61fd1d

Update format_transplant.py

Browse files
Files changed (1) hide show
  1. format_transplant.py +228 -49
format_transplant.py CHANGED
@@ -63,6 +63,7 @@ HAS_LXML = _check("lxml", "from lxml import etree")
63
  HAS_OPENAI = _check("openai", "from openai import OpenAI")
64
  HAS_ANTHROPIC = _check("anthropic", "import anthropic")
65
  HAS_POE = _check("fastapi-poe", "import fastapi_poe as fp")
 
66
 
67
  print("-" * 44)
68
 
@@ -79,6 +80,7 @@ from docx.oxml.shared import OxmlElement # noqa: E402
79
  from docx.shared import Emu, Pt, RGBColor # noqa: E402
80
  from docx.text.paragraph import Paragraph # noqa: E402
81
  from lxml import etree # noqa: E402
 
82
 
83
  # ============================================================================
84
  # LOGGING
@@ -278,22 +280,26 @@ class BlueprintSchema:
278
  class LLMProvider(Enum):
279
  OPENAI = "openai"
280
  ANTHROPIC = "anthropic"
 
281
  NEBIUS = "nebius"
282
  SCALEWAY = "scaleway"
283
  OPENROUTER = "openrouter"
284
  MISTRAL = "mistral"
285
  POE = "poe"
 
286
 
287
 
288
  # Per-provider defaults β€” base_url=None means the provider uses its own SDK
289
  PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
290
  "openai": {"base_url": "https://api.openai.com/v1", "env": "OPENAI_API_KEY", "model": "gpt-4o"},
291
- "anthropic": {"base_url": None, "env": "ANTHROPIC_API_KEY", "model": "claude-opus-4-5"},
 
292
  "nebius": {"base_url": "https://api.studio.nebius.com/v1", "env": "NEBIUS_API_KEY", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct"},
293
  "scaleway": {"base_url": "https://api.scaleway.ai/v1", "env": "SCW_SECRET_KEY", "model": "llama-3.3-70b-instruct"},
294
  "openrouter": {"base_url": "https://openrouter.ai/api/v1", "env": "OPENROUTER_API_KEY","model": "meta-llama/llama-3.3-70b-instruct"},
295
  "mistral": {"base_url": "https://api.mistral.ai/v1", "env": "MISTRAL_API_KEY", "model": "mistral-large-latest"},
296
  "poe": {"base_url": None, "env": "POE_API_KEY", "model": "Claude-3.7-Sonnet"},
 
297
  }
298
 
299
 
@@ -741,25 +747,34 @@ class BlueprintAnalyzer:
741
  )
742
 
743
  # ── Separator after marker ────────────────────────────
 
 
 
 
744
  if not sep_found:
745
  if ri + 1 < len(runs):
746
  next_r = runs[ri + 1]
747
  t_elems = next_r.findall(qn("w:t"))
748
  sep_text = "".join(t.text or "" for t in t_elems)
749
- schema.footnote_separator = sep_text
750
- sep_found = True
751
- label = repr(sep_text) if sep_text else "(empty)"
752
- logger.debug(
753
- "[BLUEPRINT] Footnote separator: %s (fn id=%d)",
754
- label, fn_id,
755
- )
756
- else:
757
- schema.footnote_separator = ""
758
- sep_found = True
759
- logger.debug(
760
- "[BLUEPRINT] No separator run after marker (fn id=%d)",
761
- fn_id,
762
- )
 
 
 
 
 
763
  break # found the marker in this footnote; move to next footnote
764
 
765
  if rPr_found and sep_found:
@@ -768,6 +783,15 @@ class BlueprintAnalyzer:
768
  if samples == 0:
769
  logger.debug("[BLUEPRINT] Blueprint has no numbered footnotes to sample")
770
  else:
 
 
 
 
 
 
 
 
 
771
  logger.info(
772
  "[BLUEPRINT] Footnote format: marker_rPr=%s separator=%s",
773
  "captured" if rPr_found else "none",
@@ -1847,49 +1871,62 @@ class DocumentBuilder:
1847
  _XML_SPACE_ATTR = "{http://www.w3.org/XML/1998/namespace}space"
1848
  runs = list(p_elem.findall(qn("w:r")))
1849
 
 
 
 
 
 
 
 
 
 
1850
  for ri, r_elem in enumerate(runs):
1851
  if not _xpath(r_elem, ".//w:footnoteRef"):
1852
  continue
1853
 
1854
  if ri + 1 < len(runs):
1855
- # Separator run already exists β€” normalise its text
1856
  next_r = runs[ri + 1]
1857
  t_elems = next_r.findall(qn("w:t"))
1858
  current = "".join(t.text or "" for t in t_elems)
1859
- if current == wanted:
1860
- break # already correct
1861
-
1862
- if wanted == "":
1863
- # Blueprint uses no separator β€” clear the text
1864
- for t in t_elems:
1865
- t.text = ""
1866
- logger.debug("[BUILD] Footnote separator cleared (blueprint has none)")
1867
- else:
1868
- if t_elems:
1869
- t_elems[0].text = wanted
1870
- if " " in wanted or "\t" in wanted:
1871
- t_elems[0].set(_XML_SPACE_ATTR, "preserve")
1872
- for t in t_elems[1:]:
1873
  t.text = ""
1874
- else:
1875
- t_elem = OxmlElement("w:t")
1876
- t_elem.text = wanted
1877
- if " " in wanted or "\t" in wanted:
1878
- t_elem.set(_XML_SPACE_ATTR, "preserve")
1879
- next_r.append(t_elem)
1880
- logger.debug(
1881
- "[BUILD] Footnote separator normalised: %r β†’ %r", current, wanted
1882
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1883
  elif wanted:
1884
- # No run after the marker at all, but blueprint uses a separator
1885
- sep_r = OxmlElement("w:r")
1886
- t_elem = OxmlElement("w:t")
1887
- t_elem.text = wanted
1888
- if " " in wanted or "\t" in wanted:
1889
- t_elem.set(_XML_SPACE_ATTR, "preserve")
1890
- sep_r.append(t_elem)
1891
- r_elem.addnext(sep_r)
1892
- logger.debug("[BUILD] Footnote separator run inserted: %r", wanted)
1893
  break # found the footnoteRef; done
1894
 
1895
 
@@ -1908,7 +1945,7 @@ class MultiProviderLLMClient:
1908
  """
1909
  Unified synchronous LLM client.
1910
 
1911
- OpenAI-compatible providers (OpenAI, Nebius, Scaleway, OpenRouter, Mistral)
1912
  all use `openai.OpenAI(base_url=…)`.
1913
  Anthropic uses its own SDK.
1914
  Poe uses fastapi-poe (async, wrapped synchronously).
@@ -1922,6 +1959,8 @@ class MultiProviderLLMClient:
1922
  return self._anthropic(system, user, config)
1923
  elif config.provider == LLMProvider.POE:
1924
  return self._poe(system, user, config)
 
 
1925
  else:
1926
  return self._openai_compat(system, user, config)
1927
  except Exception as exc:
@@ -1935,6 +1974,116 @@ class MultiProviderLLMClient:
1935
  f"[LLM] All {config.max_retries} attempts failed for {config.provider.value}"
1936
  )
1937
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1938
  # ── OpenAI-compatible ─────────────────────────────────────────────
1939
  def _openai_compat(self, system: str, user: str, config: LLMConfig) -> str:
1940
  if not HAS_OPENAI:
@@ -1968,6 +2117,36 @@ class MultiProviderLLMClient:
1968
  logger.debug("[LLM] Response: %d chars", len(text))
1969
  return text
1970
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1971
  # ── Anthropic ─────────────────────────────────────────────────────
1972
  def _anthropic(self, system: str, user: str, config: LLMConfig) -> str:
1973
  if not HAS_ANTHROPIC:
 
63
  HAS_OPENAI = _check("openai", "from openai import OpenAI")
64
  HAS_ANTHROPIC = _check("anthropic", "import anthropic")
65
  HAS_POE = _check("fastapi-poe", "import fastapi_poe as fp")
66
+ HAS_REQUESTS = _check("requests", "import requests")
67
 
68
  print("-" * 44)
69
 
 
80
  from docx.shared import Emu, Pt, RGBColor # noqa: E402
81
  from docx.text.paragraph import Paragraph # noqa: E402
82
  from lxml import etree # noqa: E402
83
+ import requests # noqa: E402
84
 
85
  # ============================================================================
86
  # LOGGING
 
280
class LLMProvider(Enum):
    """Closed set of supported LLM backends."""

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GROQ = "groq"
    NEBIUS = "nebius"
    SCALEWAY = "scaleway"
    OPENROUTER = "openrouter"
    MISTRAL = "mistral"
    POE = "poe"
    OLLAMA = "ollama"


# Per-provider defaults — base_url=None means the provider uses its own SDK
PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
    "openai": {
        "base_url": "https://api.openai.com/v1",
        "env": "OPENAI_API_KEY",
        "model": "gpt-4o",
    },
    "anthropic": {
        "base_url": None,
        "env": "ANTHROPIC_API_KEY",
        "model": "claude-3-5-sonnet-20241022",
    },
    "groq": {
        "base_url": "https://api.groq.com/openai/v1",
        "env": "GROQ_API_KEY",
        "model": "llama-3.3-70b-versatile",
    },
    "nebius": {
        "base_url": "https://api.studio.nebius.com/v1",
        "env": "NEBIUS_API_KEY",
        "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
    },
    "scaleway": {
        "base_url": "https://api.scaleway.ai/v1",
        "env": "SCW_SECRET_KEY",
        "model": "llama-3.3-70b-instruct",
    },
    "openrouter": {
        "base_url": "https://openrouter.ai/api/v1",
        "env": "OPENROUTER_API_KEY",
        "model": "meta-llama/llama-3.3-70b-instruct",
    },
    "mistral": {
        "base_url": "https://api.mistral.ai/v1",
        "env": "MISTRAL_API_KEY",
        "model": "mistral-large-latest",
    },
    "poe": {
        "base_url": None,
        "env": "POE_API_KEY",
        "model": "Claude-3.7-Sonnet",
    },
    "ollama": {
        "base_url": "http://localhost:11434/api",
        "env": "OLLAMA_API_KEY",
        "model": "llama3.2",
    },
}
 
305
 
 
747
  )
748
 
749
  # ── Separator after marker ────────────────────────────
750
+ # A separator run is one whose ENTIRE text content is
751
+ # whitespace (tab, space, or empty). If the next run has
752
+ # actual content, this footnote has no dedicated separator
753
+ # run β€” skip it and try the next footnote.
754
  if not sep_found:
755
  if ri + 1 < len(runs):
756
  next_r = runs[ri + 1]
757
  t_elems = next_r.findall(qn("w:t"))
758
  sep_text = "".join(t.text or "" for t in t_elems)
759
+ if sep_text.strip() == "":
760
+ # Pure whitespace β†’ this IS the separator run
761
+ schema.footnote_separator = sep_text
762
+ sep_found = True
763
+ label = repr(sep_text) if sep_text else "(empty)"
764
+ logger.debug(
765
+ "[BLUEPRINT] Footnote separator: %s (fn id=%d)",
766
+ label, fn_id,
767
+ )
768
+ else:
769
+ # Next run is actual footnote text β€” no separator
770
+ # run in this footnote; keep looking in later ones
771
+ logger.debug(
772
+ "[BLUEPRINT] Footnote id=%d: no separator run "
773
+ "(text starts immediately after marker)",
774
+ fn_id,
775
+ )
776
+ # else: no run after marker β€” keep looking
777
+
778
  break # found the marker in this footnote; move to next footnote
779
 
780
  if rPr_found and sep_found:
 
783
  if samples == 0:
784
  logger.debug("[BLUEPRINT] Blueprint has no numbered footnotes to sample")
785
  else:
786
+ # If we sampled footnotes but never found a pure-whitespace separator
787
+ # run, the blueprint uses no separator β€” record that explicitly.
788
+ if not sep_found:
789
+ schema.footnote_separator = ""
790
+ logger.debug(
791
+ "[BLUEPRINT] No separator run found across %d sampled footnote(s)"
792
+ " β€” blueprint uses no explicit separator",
793
+ samples,
794
+ )
795
  logger.info(
796
  "[BLUEPRINT] Footnote format: marker_rPr=%s separator=%s",
797
  "captured" if rPr_found else "none",
 
1871
  _XML_SPACE_ATTR = "{http://www.w3.org/XML/1998/namespace}space"
1872
  runs = list(p_elem.findall(qn("w:r")))
1873
 
1874
+ def _make_sep_run(text: str):
1875
+ sep_r = OxmlElement("w:r")
1876
+ t_elem = OxmlElement("w:t")
1877
+ t_elem.text = text
1878
+ if " " in text or "\t" in text:
1879
+ t_elem.set(_XML_SPACE_ATTR, "preserve")
1880
+ sep_r.append(t_elem)
1881
+ return sep_r
1882
+
1883
  for ri, r_elem in enumerate(runs):
1884
  if not _xpath(r_elem, ".//w:footnoteRef"):
1885
  continue
1886
 
1887
  if ri + 1 < len(runs):
 
1888
  next_r = runs[ri + 1]
1889
  t_elems = next_r.findall(qn("w:t"))
1890
  current = "".join(t.text or "" for t in t_elems)
1891
+ is_sep_run = current.strip() == "" # purely whitespace = separator run
1892
+
1893
+ if is_sep_run:
1894
+ if wanted == "":
1895
+ # Blueprint has no separator β€” clear the whitespace run
1896
+ for t in t_elems:
 
 
 
 
 
 
 
 
1897
  t.text = ""
1898
+ logger.debug("[BUILD] Footnote separator cleared")
1899
+ elif current != wanted:
1900
+ # Replace whitespace content with the blueprint's separator
1901
+ if t_elems:
1902
+ t_elems[0].text = wanted
1903
+ if " " in wanted or "\t" in wanted:
1904
+ t_elems[0].set(_XML_SPACE_ATTR, "preserve")
1905
+ for t in t_elems[1:]:
1906
+ t.text = ""
1907
+ else:
1908
+ t_elem = OxmlElement("w:t")
1909
+ t_elem.text = wanted
1910
+ if " " in wanted or "\t" in wanted:
1911
+ t_elem.set(_XML_SPACE_ATTR, "preserve")
1912
+ next_r.append(t_elem)
1913
+ logger.debug(
1914
+ "[BUILD] Footnote separator: %r β†’ %r", current, wanted
1915
+ )
1916
+ # else: already matches β€” no-op
1917
+ else:
1918
+ # Next run is actual footnote text, not a separator run.
1919
+ if wanted:
1920
+ # Blueprint uses a separator β€” insert a new run before the text
1921
+ next_r.addprevious(_make_sep_run(wanted))
1922
+ logger.debug(
1923
+ "[BUILD] Footnote separator inserted before text: %r", wanted
1924
+ )
1925
+ # else: blueprint has no separator either β€” nothing to do
1926
  elif wanted:
1927
+ # No run at all after the marker β€” insert a new separator run
1928
+ r_elem.addnext(_make_sep_run(wanted))
1929
+ logger.debug("[BUILD] Footnote separator run appended: %r", wanted)
 
 
 
 
 
 
1930
  break # found the footnoteRef; done
1931
 
1932
 
 
1945
  """
1946
  Unified synchronous LLM client.
1947
 
1948
+ OpenAI-compatible providers (OpenAI, Nebius, Scaleway, OpenRouter, Mistral, Groq, Ollama)
1949
  all use `openai.OpenAI(base_url=…)`.
1950
  Anthropic uses its own SDK.
1951
  Poe uses fastapi-poe (async, wrapped synchronously).
 
1959
  return self._anthropic(system, user, config)
1960
  elif config.provider == LLMProvider.POE:
1961
  return self._poe(system, user, config)
1962
+ elif config.provider == LLMProvider.OLLAMA:
1963
+ return self._ollama(system, user, config)
1964
  else:
1965
  return self._openai_compat(system, user, config)
1966
  except Exception as exc:
 
1974
  f"[LLM] All {config.max_retries} attempts failed for {config.provider.value}"
1975
  )
1976
 
1977
def get_available_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
    """
    Query available models from the provider's /models endpoint.

    Returns a list of model info dictionaries with parsed capabilities;
    an empty list if the query fails.
    """
    logger.info("[LLM] Querying available models for %s...", config.provider.value)
    try:
        # Poe has no queryable model listing — return a placeholder entry.
        if config.provider is LLMProvider.POE:
            return [{"id": "Poe Bots", "capabilities": "Unknown"}]
        # Providers with bespoke listing endpoints; everything else is
        # assumed to expose an OpenAI-compatible /models endpoint.
        lister = {
            LLMProvider.ANTHROPIC: self._list_anthropic_models,
            LLMProvider.OLLAMA: self._list_ollama_models,
        }.get(config.provider, self._list_openai_compat_models)
        return lister(config)
    except Exception as e:
        logger.error("[LLM] Failed to query models for %s: %s", config.provider.value, e)
        return []
1995
+
1996
def _list_openai_compat_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
    """
    List models from an OpenAI-compatible ``GET {base_url}/models`` endpoint.

    Returns model-info dicts (``id``, ``capabilities``, ``raw``) sorted by id;
    an empty list when no base URL is known or the request fails.
    """
    base_url = config.base_url or PROVIDER_DEFAULTS.get(config.provider.value, {}).get("base_url")
    if not base_url:
        return []

    headers = {"Authorization": f"Bearer {config.api_key}"}
    if config.provider == LLMProvider.OPENROUTER:
        # OpenRouter asks callers to identify their application.
        headers["X-Title"] = "CrispTranslator"

    def _describe(entry: Dict[str, Any]) -> str:
        # Build a short human-readable capability summary from whichever
        # optional fields the provider happens to include.
        parts = []
        if "context_window" in entry:
            parts.append(f"ctx: {entry['context_window']}")
        elif "context_length" in entry:
            parts.append(f"ctx: {entry['context_length']}")
        if entry.get("pricing"):
            p = entry["pricing"]
            parts.append(f"price: {p.get('prompt', '?')}/{p.get('completion', '?')}")
        return ", ".join(parts) if parts else "Available"

    try:
        resp = requests.get(f"{base_url}/models", headers=headers, timeout=10)
        if resp.status_code != 200:
            logger.error("[LLM] HTTP %d: %s", resp.status_code, resp.text)
            return []

        data = resp.json()
        # Some providers wrap the list in {"data": [...]}, others return it bare.
        raw_models = data.get("data", []) if isinstance(data, dict) else data

        models = []
        for m in raw_models:
            m_id = m.get("id")
            if not m_id:
                continue
            info = {"id": m_id, "capabilities": _describe(m), "raw": m}
            models.append(info)
            logger.debug("[LLM] Found model: %s (%s)", m_id, info["capabilities"])

        return sorted(models, key=lambda x: x["id"])
    except Exception as e:
        # Best-effort listing: log and report "no models" rather than crash.
        logger.debug("[LLM] Model listing failed: %s", e)
        return []
2042
+
2043
def _list_anthropic_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
    """
    List models via Anthropic's ``GET /v1/models`` endpoint.

    Falls back to a single hardcoded model entry when the endpoint is
    unreachable or returns a non-200 status (the endpoint is relatively new).
    """
    headers = {
        "x-api-key": config.api_key,
        "anthropic-version": "2023-06-01"
    }
    try:
        resp = requests.get("https://api.anthropic.com/v1/models", headers=headers, timeout=10)
        if resp.status_code == 200:
            data = resp.json()
            models = []
            for m in data.get("data", []):
                m_id = m.get("id")
                info = {
                    "id": m_id,
                    "capabilities": f"Display: {m.get('display_name', '')}",
                    "raw": m
                }
                models.append(info)
                logger.debug("[LLM] Found Anthropic model: %s", m_id)
            return models
    except Exception as exc:
        # Was a bare `except: pass`, which also swallowed KeyboardInterrupt
        # and SystemExit. Narrowed to Exception and logged; listing stays
        # best-effort and falls through to the hardcoded fallback.
        logger.debug("[LLM] Anthropic model listing failed: %s", exc)
    # Fallback if endpoint is not available
    return [{"id": "claude-3-5-sonnet-20241022", "capabilities": "Hardcoded Fallback"}]
2068
+
2069
def _list_ollama_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
    """
    List locally installed Ollama models via ``GET {base_url}/tags``.

    Returns an empty list when the Ollama server is not running or the
    response cannot be parsed.
    """
    base_url = config.base_url or "http://localhost:11434/api"
    try:
        resp = requests.get(f"{base_url}/tags", timeout=5)
        if resp.status_code == 200:
            data = resp.json()
            models = []
            for m in data.get("models", []):
                m_id = m.get("name")
                details = m.get("details", {})
                caps = f"{details.get('parameter_size', '?')} params, {details.get('format', '?')}"
                models.append({"id": m_id, "capabilities": caps, "raw": m})
                logger.debug("[LLM] Found Ollama model: %s (%s)", m_id, caps)
            return models
    except Exception as exc:
        # Was a bare `except: pass` (also caught KeyboardInterrupt/SystemExit).
        # Narrowed and logged; a dead local server still just means "no models".
        logger.debug("[LLM] Ollama model listing failed: %s", exc)
    return []
2086
+
2087
  # ── OpenAI-compatible ─────────────────────────────────────────────
2088
  def _openai_compat(self, system: str, user: str, config: LLMConfig) -> str:
2089
  if not HAS_OPENAI:
 
2117
  logger.debug("[LLM] Response: %d chars", len(text))
2118
  return text
2119
 
2120
# ── Ollama ────────────────────────────────────────────────────────
def _ollama(self, system: str, user: str, config: LLMConfig) -> str:
    """
    Call a local Ollama server via its ``POST /api/generate`` endpoint.

    The system prompt (if any) is prepended to the user prompt, because
    /generate takes a single flat prompt string rather than a message list.

    Raises:
        RuntimeError: on any non-200 HTTP response from the server.
    """
    base_url = config.base_url or "http://localhost:11434/api"
    logger.debug("[LLM] ollama → %s | sys=%d chars user=%d chars",
                 config.model, len(system), len(user))

    prompt = f"{system}\n\n{user}" if system else user

    resp = requests.post(
        f"{base_url}/generate",
        json={
            "model": config.model,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": config.temperature,
            }
        },
        timeout=180
    )
    if resp.status_code != 200:
        raise RuntimeError(f"Ollama error {resp.status_code}: {resp.text}")

    text = resp.json().get("response", "")
    logger.debug("[LLM] Response: %d chars", len(text))
    return text
    # NOTE: removed unreachable leftover lines that followed this return —
    # they read `resp.choices[0].message.content`, which is the OpenAI SDK
    # response shape, not a `requests.Response`, and could never execute.
2149
+
2150
  # ── Anthropic ─────────────────────────────────────────────────────
2151
  def _anthropic(self, system: str, user: str, config: LLMConfig) -> str:
2152
  if not HAS_ANTHROPIC: