cstr committed on
Commit
8abde45
Β·
verified Β·
1 Parent(s): c61fd1d

Update format_transplant.py

Browse files
Files changed (1) hide show
  1. format_transplant.py +228 -49
format_transplant.py CHANGED
@@ -63,6 +63,7 @@ HAS_LXML = _check("lxml", "from lxml import etree")
63
  HAS_OPENAI = _check("openai", "from openai import OpenAI")
64
  HAS_ANTHROPIC = _check("anthropic", "import anthropic")
65
  HAS_POE = _check("fastapi-poe", "import fastapi_poe as fp")
 
66
 
67
  print("-" * 44)
68
 
@@ -79,6 +80,7 @@ from docx.oxml.shared import OxmlElement # noqa: E402
79
  from docx.shared import Emu, Pt, RGBColor # noqa: E402
80
  from docx.text.paragraph import Paragraph # noqa: E402
81
  from lxml import etree # noqa: E402
 
82
 
83
  # ============================================================================
84
  # LOGGING
@@ -278,22 +280,26 @@ class BlueprintSchema:
278
  class LLMProvider(Enum):
279
  OPENAI = "openai"
280
  ANTHROPIC = "anthropic"
 
281
  NEBIUS = "nebius"
282
  SCALEWAY = "scaleway"
283
  OPENROUTER = "openrouter"
284
  MISTRAL = "mistral"
285
  POE = "poe"
 
286
 
287
 
288
  # Per-provider defaults β€” base_url=None means the provider uses its own SDK
289
  PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
290
  "openai": {"base_url": "https://api.openai.com/v1", "env": "OPENAI_API_KEY", "model": "gpt-4o"},
291
- "anthropic": {"base_url": None, "env": "ANTHROPIC_API_KEY", "model": "claude-opus-4-5"},
 
292
  "nebius": {"base_url": "https://api.studio.nebius.com/v1", "env": "NEBIUS_API_KEY", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct"},
293
  "scaleway": {"base_url": "https://api.scaleway.ai/v1", "env": "SCW_SECRET_KEY", "model": "llama-3.3-70b-instruct"},
294
  "openrouter": {"base_url": "https://openrouter.ai/api/v1", "env": "OPENROUTER_API_KEY","model": "meta-llama/llama-3.3-70b-instruct"},
295
  "mistral": {"base_url": "https://api.mistral.ai/v1", "env": "MISTRAL_API_KEY", "model": "mistral-large-latest"},
296
  "poe": {"base_url": None, "env": "POE_API_KEY", "model": "Claude-3.7-Sonnet"},
 
297
  }
298
 
299
 
@@ -741,25 +747,34 @@ class BlueprintAnalyzer:
741
  )
742
 
743
  # ── Separator after marker ────────────────────────────
 
 
 
 
744
  if not sep_found:
745
  if ri + 1 < len(runs):
746
  next_r = runs[ri + 1]
747
  t_elems = next_r.findall(qn("w:t"))
748
  sep_text = "".join(t.text or "" for t in t_elems)
749
- schema.footnote_separator = sep_text
750
- sep_found = True
751
- label = repr(sep_text) if sep_text else "(empty)"
752
- logger.debug(
753
- "[BLUEPRINT] Footnote separator: %s (fn id=%d)",
754
- label, fn_id,
755
- )
756
- else:
757
- schema.footnote_separator = ""
758
- sep_found = True
759
- logger.debug(
760
- "[BLUEPRINT] No separator run after marker (fn id=%d)",
761
- fn_id,
762
- )
 
 
 
 
 
763
  break # found the marker in this footnote; move to next footnote
764
 
765
  if rPr_found and sep_found:
@@ -768,6 +783,15 @@ class BlueprintAnalyzer:
768
  if samples == 0:
769
  logger.debug("[BLUEPRINT] Blueprint has no numbered footnotes to sample")
770
  else:
 
 
 
 
 
 
 
 
 
771
  logger.info(
772
  "[BLUEPRINT] Footnote format: marker_rPr=%s separator=%s",
773
  "captured" if rPr_found else "none",
@@ -1847,49 +1871,62 @@ class DocumentBuilder:
1847
  _XML_SPACE_ATTR = "{http://www.w3.org/XML/1998/namespace}space"
1848
  runs = list(p_elem.findall(qn("w:r")))
1849
 
 
 
 
 
 
 
 
 
 
1850
  for ri, r_elem in enumerate(runs):
1851
  if not _xpath(r_elem, ".//w:footnoteRef"):
1852
  continue
1853
 
1854
  if ri + 1 < len(runs):
1855
- # Separator run already exists β€” normalise its text
1856
  next_r = runs[ri + 1]
1857
  t_elems = next_r.findall(qn("w:t"))
1858
  current = "".join(t.text or "" for t in t_elems)
1859
- if current == wanted:
1860
- break # already correct
1861
-
1862
- if wanted == "":
1863
- # Blueprint uses no separator β€” clear the text
1864
- for t in t_elems:
1865
- t.text = ""
1866
- logger.debug("[BUILD] Footnote separator cleared (blueprint has none)")
1867
- else:
1868
- if t_elems:
1869
- t_elems[0].text = wanted
1870
- if " " in wanted or "\t" in wanted:
1871
- t_elems[0].set(_XML_SPACE_ATTR, "preserve")
1872
- for t in t_elems[1:]:
1873
  t.text = ""
1874
- else:
1875
- t_elem = OxmlElement("w:t")
1876
- t_elem.text = wanted
1877
- if " " in wanted or "\t" in wanted:
1878
- t_elem.set(_XML_SPACE_ATTR, "preserve")
1879
- next_r.append(t_elem)
1880
- logger.debug(
1881
- "[BUILD] Footnote separator normalised: %r β†’ %r", current, wanted
1882
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1883
  elif wanted:
1884
- # No run after the marker at all, but blueprint uses a separator
1885
- sep_r = OxmlElement("w:r")
1886
- t_elem = OxmlElement("w:t")
1887
- t_elem.text = wanted
1888
- if " " in wanted or "\t" in wanted:
1889
- t_elem.set(_XML_SPACE_ATTR, "preserve")
1890
- sep_r.append(t_elem)
1891
- r_elem.addnext(sep_r)
1892
- logger.debug("[BUILD] Footnote separator run inserted: %r", wanted)
1893
  break # found the footnoteRef; done
1894
 
1895
 
@@ -1908,7 +1945,7 @@ class MultiProviderLLMClient:
1908
  """
1909
  Unified synchronous LLM client.
1910
 
1911
- OpenAI-compatible providers (OpenAI, Nebius, Scaleway, OpenRouter, Mistral)
1912
  all use `openai.OpenAI(base_url=…)`.
1913
  Anthropic uses its own SDK.
1914
  Poe uses fastapi-poe (async, wrapped synchronously).
@@ -1922,6 +1959,8 @@ class MultiProviderLLMClient:
1922
  return self._anthropic(system, user, config)
1923
  elif config.provider == LLMProvider.POE:
1924
  return self._poe(system, user, config)
 
 
1925
  else:
1926
  return self._openai_compat(system, user, config)
1927
  except Exception as exc:
@@ -1935,6 +1974,116 @@ class MultiProviderLLMClient:
1935
  f"[LLM] All {config.max_retries} attempts failed for {config.provider.value}"
1936
  )
1937
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1938
  # ── OpenAI-compatible ─────────────────────────────────────────────
1939
  def _openai_compat(self, system: str, user: str, config: LLMConfig) -> str:
1940
  if not HAS_OPENAI:
@@ -1968,6 +2117,36 @@ class MultiProviderLLMClient:
1968
  logger.debug("[LLM] Response: %d chars", len(text))
1969
  return text
1970
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1971
  # ── Anthropic ─────────────────────────────────────────────────────
1972
  def _anthropic(self, system: str, user: str, config: LLMConfig) -> str:
1973
  if not HAS_ANTHROPIC:
 
63
  HAS_OPENAI = _check("openai", "from openai import OpenAI")
64
  HAS_ANTHROPIC = _check("anthropic", "import anthropic")
65
  HAS_POE = _check("fastapi-poe", "import fastapi_poe as fp")
66
+ HAS_REQUESTS = _check("requests", "import requests")
67
 
68
  print("-" * 44)
69
 
 
80
  from docx.shared import Emu, Pt, RGBColor # noqa: E402
81
  from docx.text.paragraph import Paragraph # noqa: E402
82
  from lxml import etree # noqa: E402
83
+ import requests # noqa: E402
84
 
85
  # ============================================================================
86
  # LOGGING
 
280
class LLMProvider(Enum):
    """Closed set of supported LLM backends."""

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    GROQ = "groq"
    NEBIUS = "nebius"
    SCALEWAY = "scaleway"
    OPENROUTER = "openrouter"
    MISTRAL = "mistral"
    POE = "poe"
    OLLAMA = "ollama"


# Per-provider defaults — base_url=None means the provider uses its own SDK
PROVIDER_DEFAULTS: Dict[str, Dict[str, Any]] = {
    "openai": {
        "base_url": "https://api.openai.com/v1",
        "env": "OPENAI_API_KEY",
        "model": "gpt-4o",
    },
    "anthropic": {
        "base_url": None,
        "env": "ANTHROPIC_API_KEY",
        "model": "claude-3-5-sonnet-20241022",
    },
    "groq": {
        "base_url": "https://api.groq.com/openai/v1",
        "env": "GROQ_API_KEY",
        "model": "llama-3.3-70b-versatile",
    },
    "nebius": {
        "base_url": "https://api.studio.nebius.com/v1",
        "env": "NEBIUS_API_KEY",
        "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
    },
    "scaleway": {
        "base_url": "https://api.scaleway.ai/v1",
        "env": "SCW_SECRET_KEY",
        "model": "llama-3.3-70b-instruct",
    },
    "openrouter": {
        "base_url": "https://openrouter.ai/api/v1",
        "env": "OPENROUTER_API_KEY",
        "model": "meta-llama/llama-3.3-70b-instruct",
    },
    "mistral": {
        "base_url": "https://api.mistral.ai/v1",
        "env": "MISTRAL_API_KEY",
        "model": "mistral-large-latest",
    },
    "poe": {
        "base_url": None,
        "env": "POE_API_KEY",
        "model": "Claude-3.7-Sonnet",
    },
    "ollama": {
        "base_url": "http://localhost:11434/api",
        "env": "OLLAMA_API_KEY",
        "model": "llama3.2",
    },
}
 
305
 
 
747
  )
748
 
749
  # ── Separator after marker ────────────────────────────
750
+ # A separator run is one whose ENTIRE text content is
751
+ # whitespace (tab, space, or empty). If the next run has
752
+ # actual content, this footnote has no dedicated separator
753
+ # run β€” skip it and try the next footnote.
754
  if not sep_found:
755
  if ri + 1 < len(runs):
756
  next_r = runs[ri + 1]
757
  t_elems = next_r.findall(qn("w:t"))
758
  sep_text = "".join(t.text or "" for t in t_elems)
759
+ if sep_text.strip() == "":
760
+ # Pure whitespace β†’ this IS the separator run
761
+ schema.footnote_separator = sep_text
762
+ sep_found = True
763
+ label = repr(sep_text) if sep_text else "(empty)"
764
+ logger.debug(
765
+ "[BLUEPRINT] Footnote separator: %s (fn id=%d)",
766
+ label, fn_id,
767
+ )
768
+ else:
769
+ # Next run is actual footnote text β€” no separator
770
+ # run in this footnote; keep looking in later ones
771
+ logger.debug(
772
+ "[BLUEPRINT] Footnote id=%d: no separator run "
773
+ "(text starts immediately after marker)",
774
+ fn_id,
775
+ )
776
+ # else: no run after marker β€” keep looking
777
+
778
  break # found the marker in this footnote; move to next footnote
779
 
780
  if rPr_found and sep_found:
 
783
  if samples == 0:
784
  logger.debug("[BLUEPRINT] Blueprint has no numbered footnotes to sample")
785
  else:
786
+ # If we sampled footnotes but never found a pure-whitespace separator
787
+ # run, the blueprint uses no separator β€” record that explicitly.
788
+ if not sep_found:
789
+ schema.footnote_separator = ""
790
+ logger.debug(
791
+ "[BLUEPRINT] No separator run found across %d sampled footnote(s)"
792
+ " β€” blueprint uses no explicit separator",
793
+ samples,
794
+ )
795
  logger.info(
796
  "[BLUEPRINT] Footnote format: marker_rPr=%s separator=%s",
797
  "captured" if rPr_found else "none",
 
1871
  _XML_SPACE_ATTR = "{http://www.w3.org/XML/1998/namespace}space"
1872
  runs = list(p_elem.findall(qn("w:r")))
1873
 
1874
+ def _make_sep_run(text: str):
1875
+ sep_r = OxmlElement("w:r")
1876
+ t_elem = OxmlElement("w:t")
1877
+ t_elem.text = text
1878
+ if " " in text or "\t" in text:
1879
+ t_elem.set(_XML_SPACE_ATTR, "preserve")
1880
+ sep_r.append(t_elem)
1881
+ return sep_r
1882
+
1883
  for ri, r_elem in enumerate(runs):
1884
  if not _xpath(r_elem, ".//w:footnoteRef"):
1885
  continue
1886
 
1887
  if ri + 1 < len(runs):
 
1888
  next_r = runs[ri + 1]
1889
  t_elems = next_r.findall(qn("w:t"))
1890
  current = "".join(t.text or "" for t in t_elems)
1891
+ is_sep_run = current.strip() == "" # purely whitespace = separator run
1892
+
1893
+ if is_sep_run:
1894
+ if wanted == "":
1895
+ # Blueprint has no separator β€” clear the whitespace run
1896
+ for t in t_elems:
 
 
 
 
 
 
 
 
1897
  t.text = ""
1898
+ logger.debug("[BUILD] Footnote separator cleared")
1899
+ elif current != wanted:
1900
+ # Replace whitespace content with the blueprint's separator
1901
+ if t_elems:
1902
+ t_elems[0].text = wanted
1903
+ if " " in wanted or "\t" in wanted:
1904
+ t_elems[0].set(_XML_SPACE_ATTR, "preserve")
1905
+ for t in t_elems[1:]:
1906
+ t.text = ""
1907
+ else:
1908
+ t_elem = OxmlElement("w:t")
1909
+ t_elem.text = wanted
1910
+ if " " in wanted or "\t" in wanted:
1911
+ t_elem.set(_XML_SPACE_ATTR, "preserve")
1912
+ next_r.append(t_elem)
1913
+ logger.debug(
1914
+ "[BUILD] Footnote separator: %r β†’ %r", current, wanted
1915
+ )
1916
+ # else: already matches β€” no-op
1917
+ else:
1918
+ # Next run is actual footnote text, not a separator run.
1919
+ if wanted:
1920
+ # Blueprint uses a separator β€” insert a new run before the text
1921
+ next_r.addprevious(_make_sep_run(wanted))
1922
+ logger.debug(
1923
+ "[BUILD] Footnote separator inserted before text: %r", wanted
1924
+ )
1925
+ # else: blueprint has no separator either β€” nothing to do
1926
  elif wanted:
1927
+ # No run at all after the marker β€” insert a new separator run
1928
+ r_elem.addnext(_make_sep_run(wanted))
1929
+ logger.debug("[BUILD] Footnote separator run appended: %r", wanted)
 
 
 
 
 
 
1930
  break # found the footnoteRef; done
1931
 
1932
 
 
1945
  """
1946
  Unified synchronous LLM client.
1947
 
1948
+ OpenAI-compatible providers (OpenAI, Nebius, Scaleway, OpenRouter, Mistral, Groq, Ollama)
1949
  all use `openai.OpenAI(base_url=…)`.
1950
  Anthropic uses its own SDK.
1951
  Poe uses fastapi-poe (async, wrapped synchronously).
 
1959
  return self._anthropic(system, user, config)
1960
  elif config.provider == LLMProvider.POE:
1961
  return self._poe(system, user, config)
1962
+ elif config.provider == LLMProvider.OLLAMA:
1963
+ return self._ollama(system, user, config)
1964
  else:
1965
  return self._openai_compat(system, user, config)
1966
  except Exception as exc:
 
1974
  f"[LLM] All {config.max_retries} attempts failed for {config.provider.value}"
1975
  )
1976
 
1977
def get_available_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
    """
    Query available models from the provider's /models endpoint.

    Returns a list of model info dictionaries with parsed capabilities;
    an empty list if the query fails.
    """
    logger.info("[LLM] Querying available models for %s...", config.provider.value)
    try:
        # Poe has no queryable model listing — return a placeholder entry.
        if config.provider is LLMProvider.POE:
            return [{"id": "Poe Bots", "capabilities": "Unknown"}]
        # Providers with bespoke listing endpoints; everything else is
        # assumed to expose an OpenAI-compatible /models endpoint.
        lister = {
            LLMProvider.ANTHROPIC: self._list_anthropic_models,
            LLMProvider.OLLAMA: self._list_ollama_models,
        }.get(config.provider, self._list_openai_compat_models)
        return lister(config)
    except Exception as e:
        logger.error("[LLM] Failed to query models for %s: %s", config.provider.value, e)
        return []
1995
+
1996
def _list_openai_compat_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
    """
    List models from an OpenAI-compatible ``GET {base_url}/models`` endpoint.

    Returns model-info dicts (``id``, ``capabilities``, ``raw``) sorted by id;
    an empty list when no base URL is known or the request fails.
    """
    base_url = config.base_url or PROVIDER_DEFAULTS.get(config.provider.value, {}).get("base_url")
    if not base_url:
        return []

    headers = {"Authorization": f"Bearer {config.api_key}"}
    if config.provider == LLMProvider.OPENROUTER:
        # OpenRouter asks callers to identify their application.
        headers["X-Title"] = "CrispTranslator"

    def _describe(entry: Dict[str, Any]) -> str:
        # Build a short human-readable capability summary from whichever
        # optional fields the provider happens to include.
        parts = []
        if "context_window" in entry:
            parts.append(f"ctx: {entry['context_window']}")
        elif "context_length" in entry:
            parts.append(f"ctx: {entry['context_length']}")
        if entry.get("pricing"):
            p = entry["pricing"]
            parts.append(f"price: {p.get('prompt', '?')}/{p.get('completion', '?')}")
        return ", ".join(parts) if parts else "Available"

    try:
        resp = requests.get(f"{base_url}/models", headers=headers, timeout=10)
        if resp.status_code != 200:
            logger.error("[LLM] HTTP %d: %s", resp.status_code, resp.text)
            return []

        data = resp.json()
        # Some providers wrap the list in {"data": [...]}, others return it bare.
        raw_models = data.get("data", []) if isinstance(data, dict) else data

        models = []
        for m in raw_models:
            m_id = m.get("id")
            if not m_id:
                continue
            info = {"id": m_id, "capabilities": _describe(m), "raw": m}
            models.append(info)
            logger.debug("[LLM] Found model: %s (%s)", m_id, info["capabilities"])

        return sorted(models, key=lambda x: x["id"])
    except Exception as e:
        # Best-effort listing: log and report "no models" rather than crash.
        logger.debug("[LLM] Model listing failed: %s", e)
        return []
2042
+
2043
def _list_anthropic_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
    """
    List models via Anthropic's ``GET /v1/models`` endpoint.

    Falls back to a single hardcoded model entry when the endpoint is
    unreachable or returns a non-200 status (the endpoint is relatively new).
    """
    headers = {
        "x-api-key": config.api_key,
        "anthropic-version": "2023-06-01"
    }
    try:
        resp = requests.get("https://api.anthropic.com/v1/models", headers=headers, timeout=10)
        if resp.status_code == 200:
            data = resp.json()
            models = []
            for m in data.get("data", []):
                m_id = m.get("id")
                info = {
                    "id": m_id,
                    "capabilities": f"Display: {m.get('display_name', '')}",
                    "raw": m
                }
                models.append(info)
                logger.debug("[LLM] Found Anthropic model: %s", m_id)
            return models
    except Exception as exc:
        # Was a bare `except: pass`, which also swallowed KeyboardInterrupt
        # and SystemExit. Narrowed to Exception and logged; listing stays
        # best-effort and falls through to the hardcoded fallback.
        logger.debug("[LLM] Anthropic model listing failed: %s", exc)
    # Fallback if endpoint is not available
    return [{"id": "claude-3-5-sonnet-20241022", "capabilities": "Hardcoded Fallback"}]
2068
+
2069
def _list_ollama_models(self, config: LLMConfig) -> List[Dict[str, Any]]:
    """
    List locally installed Ollama models via ``GET {base_url}/tags``.

    Returns an empty list when the Ollama server is not running or the
    response cannot be parsed.
    """
    base_url = config.base_url or "http://localhost:11434/api"
    try:
        resp = requests.get(f"{base_url}/tags", timeout=5)
        if resp.status_code == 200:
            data = resp.json()
            models = []
            for m in data.get("models", []):
                m_id = m.get("name")
                details = m.get("details", {})
                caps = f"{details.get('parameter_size', '?')} params, {details.get('format', '?')}"
                models.append({"id": m_id, "capabilities": caps, "raw": m})
                logger.debug("[LLM] Found Ollama model: %s (%s)", m_id, caps)
            return models
    except Exception as exc:
        # Was a bare `except: pass` (also caught KeyboardInterrupt/SystemExit).
        # Narrowed and logged; a dead local server still just means "no models".
        logger.debug("[LLM] Ollama model listing failed: %s", exc)
    return []
2086
+
2087
  # ── OpenAI-compatible ─────────────────────────────────────────────
2088
  def _openai_compat(self, system: str, user: str, config: LLMConfig) -> str:
2089
  if not HAS_OPENAI:
 
2117
  logger.debug("[LLM] Response: %d chars", len(text))
2118
  return text
2119
 
2120
# ── Ollama ────────────────────────────────────────────────────────
def _ollama(self, system: str, user: str, config: LLMConfig) -> str:
    """
    Call a local Ollama server via its ``POST /api/generate`` endpoint.

    The system prompt (if any) is prepended to the user prompt, because
    /generate takes a single flat prompt string rather than a message list.

    Raises:
        RuntimeError: on any non-200 HTTP response from the server.
    """
    base_url = config.base_url or "http://localhost:11434/api"
    logger.debug("[LLM] ollama → %s | sys=%d chars user=%d chars",
                 config.model, len(system), len(user))

    prompt = f"{system}\n\n{user}" if system else user

    resp = requests.post(
        f"{base_url}/generate",
        json={
            "model": config.model,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": config.temperature,
            }
        },
        timeout=180
    )
    if resp.status_code != 200:
        raise RuntimeError(f"Ollama error {resp.status_code}: {resp.text}")

    text = resp.json().get("response", "")
    logger.debug("[LLM] Response: %d chars", len(text))
    return text
    # NOTE: removed unreachable leftover lines that followed this return —
    # they read `resp.choices[0].message.content`, which is the OpenAI SDK
    # response shape, not a `requests.Response`, and could never execute.
2149
+
2150
  # ── Anthropic ─────────────────────────────────────────────────────
2151
  def _anthropic(self, system: str, user: str, config: LLMConfig) -> str:
2152
  if not HAS_ANTHROPIC: