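chore: pass file_extension to MarkItDown convert, remove openai_config debug prints, and delete the temp source file after conversion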
Browse files
- docsifer/service.py +15 -14
docsifer/service.py
CHANGED
|
@@ -130,6 +130,8 @@ class DocsiferService:
|
|
| 130 |
Returns:
|
| 131 |
A tuple containing a dictionary with keys "filename" and "markdown", and the token count.
|
| 132 |
"""
|
|
|
|
|
|
|
| 133 |
if source.startswith("http"):
|
| 134 |
filename = f"{scuid()}.html"
|
| 135 |
else:
|
|
@@ -160,38 +162,37 @@ class DocsiferService:
|
|
| 160 |
)
|
| 161 |
|
| 162 |
# Perform HTML cleanup if requested.
|
| 163 |
-
|
| 164 |
-
|
| 165 |
|
|
|
|
| 166 |
filename = new_filename
|
| 167 |
source = tmp_path
|
| 168 |
|
| 169 |
# Decide whether to use LLM-enhanced conversion or the basic converter.
|
| 170 |
if openai_config and openai_config.get("api_key"):
|
| 171 |
-
print("openai_config:\n", openai_config)
|
| 172 |
md_converter = self._init_markitdown_with_llm(openai_config)
|
| 173 |
else:
|
| 174 |
-
print("no openai_config")
|
| 175 |
md_converter = self._basic_markitdown
|
| 176 |
|
| 177 |
# Load cookies if provided in the HTTP config.
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
|
| 186 |
try:
|
| 187 |
-
result_obj = md_converter.convert(source=
|
| 188 |
print("result_obj:\n", result_obj.text_content)
|
| 189 |
except Exception as e:
|
| 190 |
logger.error("MarkItDown conversion failed: %s", e)
|
| 191 |
raise RuntimeError(f"Conversion failed for '{source}': {e}")
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
|
| 196 |
# Count tokens in the resulting markdown text.
|
| 197 |
token_count = self._count_tokens(result_obj.text_content)
|
|
|
|
| 130 |
Returns:
|
| 131 |
A tuple containing a dictionary with keys "filename" and "markdown", and the token count.
|
| 132 |
"""
|
| 133 |
+
file_extension = None
|
| 134 |
+
|
| 135 |
if source.startswith("http"):
|
| 136 |
filename = f"{scuid()}.html"
|
| 137 |
else:
|
|
|
|
| 162 |
)
|
| 163 |
|
| 164 |
# Perform HTML cleanup if requested.
|
| 165 |
+
if cleanup and guessed_ext.lower() in (".html", ".htm"):
|
| 166 |
+
self._maybe_cleanup_html(tmp_path)
|
| 167 |
|
| 168 |
+
file_extension = guessed_ext.lstrip(".")
|
| 169 |
filename = new_filename
|
| 170 |
source = tmp_path
|
| 171 |
|
| 172 |
# Decide whether to use LLM-enhanced conversion or the basic converter.
|
| 173 |
if openai_config and openai_config.get("api_key"):
|
|
|
|
| 174 |
md_converter = self._init_markitdown_with_llm(openai_config)
|
| 175 |
else:
|
|
|
|
| 176 |
md_converter = self._basic_markitdown
|
| 177 |
|
| 178 |
# Load cookies if provided in the HTTP config.
|
| 179 |
+
if http_config:
|
| 180 |
+
if "cookies" in http_config:
|
| 181 |
+
requests.cookies.cookiejar_from_dict(
|
| 182 |
+
http_config["cookies"],
|
| 183 |
+
requests.cookies.RequestsCookieJar,
|
| 184 |
+
overwrite=True,
|
| 185 |
+
)
|
| 186 |
|
| 187 |
try:
|
| 188 |
+
result_obj = md_converter.convert(source, file_extension=file_extension)
|
| 189 |
print("result_obj:\n", result_obj.text_content)
|
| 190 |
except Exception as e:
|
| 191 |
logger.error("MarkItDown conversion failed: %s", e)
|
| 192 |
raise RuntimeError(f"Conversion failed for '{source}': {e}")
|
| 193 |
|
| 194 |
+
if isinstance(source, Path) and source.exists():
|
| 195 |
+
source.unlink()
|
| 196 |
|
| 197 |
# Count tokens in the resulting markdown text.
|
| 198 |
token_count = self._count_tokens(result_obj.text_content)
|