dromero-nttd committed on
Commit
9c093af
·
1 Parent(s): 5e671da

Add markitdown URL markdown and ignore tasks

Browse files
.gitignore CHANGED
@@ -10,4 +10,5 @@ __pycache__/
10
  .vscode/
11
  .idea/
12
  .ddgs_search_err
13
- certif_zscaler/*
 
 
10
  .vscode/
11
  .idea/
12
  .ddgs_search_err
13
+ certif_zscaler/*
14
+ tasks/
README.md CHANGED
@@ -37,7 +37,7 @@ Example:
37
  ddgs-search "site:openai.com safety" --region us-en --safesearch moderate --timelimit m --max-results 5 --format json
38
  ```
39
 
40
- `--format json` and `--format jsonl` emit the full result objects returned by DDGS (all available fields).
41
 
42
  ## API (FastAPI)
43
 
 
37
  ddgs-search "site:openai.com safety" --region us-en --safesearch moderate --timelimit m --max-results 5 --format json
38
  ```
39
 
40
+ `--format json` and `--format jsonl` emit the full result objects returned by DDGS (all available fields), plus a `markdown` field containing the MarkItDown-converted content for each URL (PDFs are skipped).
41
 
42
  ## API (FastAPI)
43
 
ddgs_cli.py CHANGED
@@ -4,6 +4,7 @@ import json
4
  import os
5
  import sys
6
  from pathlib import Path
 
7
 
8
  from dotenv import load_dotenv
9
  from ddgs import DDGS
@@ -135,6 +136,56 @@ def _resolve_verify(verify_flag: bool) -> bool | str:
135
  return ca_bundle or True
136
 
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  def ddgs_search(
139
  query: str,
140
  *,
@@ -166,7 +217,10 @@ def ddgs_search(
166
  ddgs_init_kwargs["proxy"] = proxy
167
 
168
  with DDGS(**ddgs_init_kwargs) as ddgs:
169
- return list(ddgs.text(query, **ddgs_kwargs))
 
 
 
170
 
171
 
172
  def main(argv: list[str] | None = None) -> int:
 
4
  import os
5
  import sys
6
  from pathlib import Path
7
+ from urllib.parse import urlsplit
8
 
9
  from dotenv import load_dotenv
10
  from ddgs import DDGS
 
136
  return ca_bundle or True
137
 
138
 
139
+ def _is_pdf_url(url: str) -> bool:
140
+ try:
141
+ path = urlsplit(url).path.lower()
142
+ except ValueError:
143
+ path = url.lower()
144
+ return path.endswith(".pdf")
145
+
146
+
147
+ def _get_markdown_converter():
148
+ try:
149
+ from markitdown import MarkItDown
150
+ except ImportError:
151
+ return None
152
+ return MarkItDown()
153
+
154
+
155
+ def _convert_url_to_markdown(converter, url: str) -> str:
156
+ if hasattr(converter, "convert_uri"):
157
+ result = converter.convert_uri(url)
158
+ elif hasattr(converter, "convert_url"):
159
+ result = converter.convert_url(url)
160
+ else:
161
+ result = converter.convert(url)
162
+
163
+ text = getattr(result, "markdown", None)
164
+ if text is None:
165
+ text = getattr(result, "text_content", None)
166
+ if text is None:
167
+ raise ValueError("MarkItDown result missing markdown content")
168
+ return text
169
+
170
+
171
def _attach_markdown(results: list[dict]) -> None:
    """Annotate each result dict in place with a ``markdown`` field.

    PDF links and entries without a URL get ``markdown = None``. When
    markitdown is not installed, or a conversion fails, ``markdown`` is
    ``None`` and ``markdown_error`` records the reason.
    """
    converter = _get_markdown_converter()
    for entry in results:
        link = entry.get("href") or entry.get("url")
        if not link or _is_pdf_url(link):
            entry["markdown"] = None
        elif converter is None:
            entry["markdown"] = None
            entry["markdown_error"] = "markitdown_not_installed"
        else:
            try:
                entry["markdown"] = _convert_url_to_markdown(converter, link)
            except Exception as exc:  # noqa: BLE001 - surface conversion errors in response
                entry["markdown"] = None
                entry["markdown_error"] = str(exc)
187
+
188
+
189
  def ddgs_search(
190
  query: str,
191
  *,
 
217
  ddgs_init_kwargs["proxy"] = proxy
218
 
219
  with DDGS(**ddgs_init_kwargs) as ddgs:
220
+ results = list(ddgs.text(query, **ddgs_kwargs))
221
+
222
+ _attach_markdown(results)
223
+ return results
224
 
225
 
226
  def main(argv: list[str] | None = None) -> int:
postman_collection.json CHANGED
@@ -22,7 +22,7 @@
22
  ],
23
  "body": {
24
  "mode": "raw",
25
- "raw": "{\n \"query\": \"openai\",\n \"max_results\": 1,\n \"region\": \"us-en\",\n \"safesearch\": \"moderate\",\n \"timelimit\": \"m\",\n \"backend\": \"auto\",\n \"timeout\": 30,\n \"verify\": true\n}",
26
  "options": {
27
  "raw": {
28
  "language": "json"
 
22
  ],
23
  "body": {
24
  "mode": "raw",
25
+ "raw": "{\n \"query\": \"openai\",\n \"region\": \"us-en\",\n \"safesearch\": \"moderate\",\n \"timelimit\": \"m\",\n \"max_results\": 5,\n \"backend\": \"auto\",\n \"proxy\": null,\n \"timeout\": 30,\n \"verify\": true\n}",
26
  "options": {
27
  "raw": {
28
  "language": "json"
pyproject.toml CHANGED
@@ -11,6 +11,7 @@ dependencies = [
11
  "certifi",
12
  "huggingface_hub",
13
  "python-dotenv",
 
14
  ]
15
 
16
  [project.scripts]
 
11
  "certifi",
12
  "huggingface_hub",
13
  "python-dotenv",
14
+ "markitdown",
15
  ]
16
 
17
  [project.scripts]
resources.md CHANGED
@@ -11,3 +11,11 @@ install: pip install ddgs
11
 
12
  ## Hugging Face Spaces
13
  API docs: https://huggingface.co/docs/huggingface_hub/main/en/package_reference/hf_api#huggingface_hub.HfApi.add_space_secret
 
 
 
 
 
 
 
 
 
11
 
12
  ## Hugging Face Spaces
13
  API docs: https://huggingface.co/docs/huggingface_hub/main/en/package_reference/hf_api#huggingface_hub.HfApi.add_space_secret
14
+
15
+ # Markdown converter
16
+
17
+ ## MarkItDown
18
+
19
+ Info: https://github.com/microsoft/markitdown
20
+ Notes:
21
+ - `MarkItDown().convert_uri(url)` (or `convert_url`) returns an object with `markdown` (or `text_content`).
test_scripts/run_smoke.sh CHANGED
@@ -56,5 +56,7 @@ python - <<'PY'
56
  import json, os
57
  payload = json.loads(os.environ["DDS_OUT"])
58
  assert isinstance(payload, list)
 
 
59
  print("OK: received", len(payload), "result(s)")
60
  PY
 
56
  import json, os
57
  payload = json.loads(os.environ["DDS_OUT"])
58
  assert isinstance(payload, list)
59
+ if payload:
60
+ assert "markdown" in payload[0]
61
  print("OK: received", len(payload), "result(s)")
62
  PY
test_scripts/test_remote_api.sh CHANGED
@@ -15,3 +15,40 @@ curl -s -X POST "$BASE_URL/search" \
15
  -H "Content-Type: application/json" \
16
  -d '{"query":"openai","max_results":1,"region":"us-en","safesearch":"moderate","timelimit":"m","backend":"auto","timeout":30,"verify":true}' \
17
  | python -c 'import json,sys; payload=json.load(sys.stdin); print("OK", payload.get("count"))'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  -H "Content-Type: application/json" \
16
  -d '{"query":"openai","max_results":1,"region":"us-en","safesearch":"moderate","timelimit":"m","backend":"auto","timeout":30,"verify":true}' \
17
  | python -c 'import json,sys; payload=json.load(sys.stdin); print("OK", payload.get("count"))'
18
+
19
# Fire one search request per vendor in parallel and summarise successes.
VENDORS=(
  "OpenAI"
  "Anthropic"
  "Google"
  "Meta"
  "Microsoft"
  "Cohere"
  "Mistral"
  "AI21"
  "Perplexity"
  "xAI"
)
# Derive the total from the array so adding/removing vendors keeps the
# summary arithmetic correct (previously hard-coded to 10).
TOTAL=${#VENDORS[@]}
echo "Running $TOTAL concurrent requests..."

RESULTS_FILE="$(mktemp)"
for vendor in "${VENDORS[@]}"; do
  (
    curl -s -X POST "$BASE_URL/search" \
      -H "Authorization: Bearer $TOKEN" \
      -H "Content-Type: application/json" \
      -d "{\"query\":\"$vendor LLM\",\"max_results\":1,\"region\":\"us-en\",\"safesearch\":\"moderate\",\"timelimit\":\"m\",\"backend\":\"auto\",\"timeout\":30,\"verify\":true}" \
      | python -c 'import json,sys; payload=json.load(sys.stdin); print(payload.get("count"))' \
      >> "$RESULTS_FILE"
  ) &
done
wait

# Count lines that are a bare integer (a successful response). Note:
# `rg -c` prints NOTHING on zero matches, which left SUCCESS empty and
# broke the arithmetic below; `grep -E -c` always prints a number, and we
# still default to 0 defensively (grep exits non-zero on no match).
SUCCESS=$(grep -E -c "^[0-9]+$" "$RESULTS_FILE" || true)
SUCCESS=${SUCCESS:-0}
rm -f "$RESULTS_FILE"

FAIL=$((TOTAL - SUCCESS))
echo "Summary: total=$TOTAL success=$SUCCESS fail=$FAIL"