dromero-nttd committed on
Commit
9c093af
·
1 Parent(s): 5e671da

Add markitdown URL markdown and ignore tasks

Browse files
.gitignore CHANGED
@@ -10,4 +10,5 @@ __pycache__/
10
  .vscode/
11
  .idea/
12
  .ddgs_search_err
13
- certif_zscaler/*
 
 
10
  .vscode/
11
  .idea/
12
  .ddgs_search_err
13
+ certif_zscaler/*
14
+ tasks/
README.md CHANGED
@@ -37,7 +37,7 @@ Example:
37
  ddgs-search "site:openai.com safety" --region us-en --safesearch moderate --timelimit m --max-results 5 --format json
38
  ```
39
 
40
- `--format json` and `--format jsonl` emit the full result objects returned by DDGS (all available fields).
41
 
42
  ## API (FastAPI)
43
 
 
37
  ddgs-search "site:openai.com safety" --region us-en --safesearch moderate --timelimit m --max-results 5 --format json
38
  ```
39
 
40
+ `--format json` and `--format jsonl` emit the full result objects returned by DDGS (all available fields), plus a `markdown` field containing the MarkItDown-converted content for each URL (PDFs are skipped).
41
 
42
  ## API (FastAPI)
43
 
ddgs_cli.py CHANGED
@@ -4,6 +4,7 @@ import json
4
  import os
5
  import sys
6
  from pathlib import Path
 
7
 
8
  from dotenv import load_dotenv
9
  from ddgs import DDGS
@@ -135,6 +136,56 @@ def _resolve_verify(verify_flag: bool) -> bool | str:
135
  return ca_bundle or True
136
 
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  def ddgs_search(
139
  query: str,
140
  *,
@@ -166,7 +217,10 @@ def ddgs_search(
166
  ddgs_init_kwargs["proxy"] = proxy
167
 
168
  with DDGS(**ddgs_init_kwargs) as ddgs:
169
- return list(ddgs.text(query, **ddgs_kwargs))
 
 
 
170
 
171
 
172
  def main(argv: list[str] | None = None) -> int:
 
4
  import os
5
  import sys
6
  from pathlib import Path
7
+ from urllib.parse import urlsplit
8
 
9
  from dotenv import load_dotenv
10
  from ddgs import DDGS
 
136
  return ca_bundle or True
137
 
138
 
139
+ def _is_pdf_url(url: str) -> bool:
140
+ try:
141
+ path = urlsplit(url).path.lower()
142
+ except ValueError:
143
+ path = url.lower()
144
+ return path.endswith(".pdf")
145
+
146
+
147
+ def _get_markdown_converter():
148
+ try:
149
+ from markitdown import MarkItDown
150
+ except ImportError:
151
+ return None
152
+ return MarkItDown()
153
+
154
+
155
+ def _convert_url_to_markdown(converter, url: str) -> str:
156
+ if hasattr(converter, "convert_uri"):
157
+ result = converter.convert_uri(url)
158
+ elif hasattr(converter, "convert_url"):
159
+ result = converter.convert_url(url)
160
+ else:
161
+ result = converter.convert(url)
162
+
163
+ text = getattr(result, "markdown", None)
164
+ if text is None:
165
+ text = getattr(result, "text_content", None)
166
+ if text is None:
167
+ raise ValueError("MarkItDown result missing markdown content")
168
+ return text
169
+
170
+
171
def _attach_markdown(results: list[dict]) -> None:
    """Annotate each result dict in place with a ``markdown`` field.

    PDF links and entries without a URL get ``markdown = None``. When
    markitdown is not installed, or a conversion fails, ``markdown`` is
    ``None`` and ``markdown_error`` records the reason.
    """
    converter = _get_markdown_converter()
    for entry in results:
        link = entry.get("href") or entry.get("url")
        if not link or _is_pdf_url(link):
            entry["markdown"] = None
        elif converter is None:
            entry["markdown"] = None
            entry["markdown_error"] = "markitdown_not_installed"
        else:
            try:
                entry["markdown"] = _convert_url_to_markdown(converter, link)
            except Exception as exc:  # noqa: BLE001 - surface conversion errors in response
                entry["markdown"] = None
                entry["markdown_error"] = str(exc)
187
+
188
+
189
  def ddgs_search(
190
  query: str,
191
  *,
 
217
  ddgs_init_kwargs["proxy"] = proxy
218
 
219
  with DDGS(**ddgs_init_kwargs) as ddgs:
220
+ results = list(ddgs.text(query, **ddgs_kwargs))
221
+
222
+ _attach_markdown(results)
223
+ return results
224
 
225
 
226
  def main(argv: list[str] | None = None) -> int:
postman_collection.json CHANGED
@@ -22,7 +22,7 @@
22
  ],
23
  "body": {
24
  "mode": "raw",
25
- "raw": "{\n \"query\": \"openai\",\n \"max_results\": 1,\n \"region\": \"us-en\",\n \"safesearch\": \"moderate\",\n \"timelimit\": \"m\",\n \"backend\": \"auto\",\n \"timeout\": 30,\n \"verify\": true\n}",
26
  "options": {
27
  "raw": {
28
  "language": "json"
 
22
  ],
23
  "body": {
24
  "mode": "raw",
25
+ "raw": "{\n \"query\": \"openai\",\n \"region\": \"us-en\",\n \"safesearch\": \"moderate\",\n \"timelimit\": \"m\",\n \"max_results\": 5,\n \"backend\": \"auto\",\n \"proxy\": null,\n \"timeout\": 30,\n \"verify\": true\n}",
26
  "options": {
27
  "raw": {
28
  "language": "json"
pyproject.toml CHANGED
@@ -11,6 +11,7 @@ dependencies = [
11
  "certifi",
12
  "huggingface_hub",
13
  "python-dotenv",
 
14
  ]
15
 
16
  [project.scripts]
 
11
  "certifi",
12
  "huggingface_hub",
13
  "python-dotenv",
14
+ "markitdown",
15
  ]
16
 
17
  [project.scripts]
resources.md CHANGED
@@ -11,3 +11,11 @@ install: pip install ddgs
11
 
12
  ## Hugging Face Spaces
13
  API docs: https://huggingface.co/docs/huggingface_hub/main/en/package_reference/hf_api#huggingface_hub.HfApi.add_space_secret
 
 
 
 
 
 
 
 
 
11
 
12
  ## Hugging Face Spaces
13
  API docs: https://huggingface.co/docs/huggingface_hub/main/en/package_reference/hf_api#huggingface_hub.HfApi.add_space_secret
14
+
15
+ # Markdown converter
16
+
17
+ ## MarkItDown
18
+
19
+ Info: https://github.com/microsoft/markitdown
20
+ Notes:
21
+ - `MarkItDown().convert_uri(url)` (or `convert_url`) returns an object with `markdown` (or `text_content`).
test_scripts/run_smoke.sh CHANGED
@@ -56,5 +56,7 @@ python - <<'PY'
56
  import json, os
57
  payload = json.loads(os.environ["DDS_OUT"])
58
  assert isinstance(payload, list)
 
 
59
  print("OK: received", len(payload), "result(s)")
60
  PY
 
56
  import json, os
57
  payload = json.loads(os.environ["DDS_OUT"])
58
  assert isinstance(payload, list)
59
+ if payload:
60
+ assert "markdown" in payload[0]
61
  print("OK: received", len(payload), "result(s)")
62
  PY
test_scripts/test_remote_api.sh CHANGED
@@ -15,3 +15,40 @@ curl -s -X POST "$BASE_URL/search" \
15
  -H "Content-Type: application/json" \
16
  -d '{"query":"openai","max_results":1,"region":"us-en","safesearch":"moderate","timelimit":"m","backend":"auto","timeout":30,"verify":true}' \
17
  | python -c 'import json,sys; payload=json.load(sys.stdin); print("OK", payload.get("count"))'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  -H "Content-Type: application/json" \
16
  -d '{"query":"openai","max_results":1,"region":"us-en","safesearch":"moderate","timelimit":"m","backend":"auto","timeout":30,"verify":true}' \
17
  | python -c 'import json,sys; payload=json.load(sys.stdin); print("OK", payload.get("count"))'
18
+
19
# Fire one search request per vendor in parallel and summarise successes.
VENDORS=(
  "OpenAI"
  "Anthropic"
  "Google"
  "Meta"
  "Microsoft"
  "Cohere"
  "Mistral"
  "AI21"
  "Perplexity"
  "xAI"
)
# Derive the total from the array so adding/removing vendors keeps the
# summary arithmetic correct (previously hard-coded to 10).
TOTAL=${#VENDORS[@]}
echo "Running $TOTAL concurrent requests..."

RESULTS_FILE="$(mktemp)"
for vendor in "${VENDORS[@]}"; do
  (
    curl -s -X POST "$BASE_URL/search" \
      -H "Authorization: Bearer $TOKEN" \
      -H "Content-Type: application/json" \
      -d "{\"query\":\"$vendor LLM\",\"max_results\":1,\"region\":\"us-en\",\"safesearch\":\"moderate\",\"timelimit\":\"m\",\"backend\":\"auto\",\"timeout\":30,\"verify\":true}" \
      | python -c 'import json,sys; payload=json.load(sys.stdin); print(payload.get("count"))' \
      >> "$RESULTS_FILE"
  ) &
done
wait

# Count lines that are a bare integer (a successful response). Note:
# `rg -c` prints NOTHING on zero matches, which left SUCCESS empty and
# broke the arithmetic below; `grep -E -c` always prints a number, and we
# still default to 0 defensively (grep exits non-zero on no match).
SUCCESS=$(grep -E -c "^[0-9]+$" "$RESULTS_FILE" || true)
SUCCESS=${SUCCESS:-0}
rm -f "$RESULTS_FILE"

FAIL=$((TOTAL - SUCCESS))
echo "Summary: total=$TOTAL success=$SUCCESS fail=$FAIL"