motsobelal committed on
Commit
8628def
·
verified ·
1 Parent(s): 51e3a99

Upload 39 files

Browse files
CONTRIBUTING.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing
2
+
3
+ Please open a Discussion or Issue, or email the maintainers, to talk over any major changes before submitting a pull request.
4
+
5
+ ## IDE configuration
6
+
7
+ If you use **VSCode**, install recommended extensions (press `F1` → *Show Recommended Extensions*):
8
+
9
+ - `ms-python.python`
10
+ - `ms-python.mypy-type-checker`
11
+ - `charliermarsh.ruff`
12
+ - `usernamehw.errorlens`
13
+ - `fill-labs.dependi`
14
+
15
+ ## Development
16
+
17
+ 1. Fork the repository and clone your fork:
18
+ ```sh
19
+ git clone https://github.com/{your_profile}/ddgs
20
+ cd ddgs
21
+ ```
22
+
23
+ 2. Create and activate a virtual environment, then install development dependencies:
24
+ ```sh
25
+ python -m venv .venv
26
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
27
+ pip install -e .[dev]
28
+ ```
29
+ 3. Install pre-commit hooks (automates formatting, linting, typing):
30
+ ```sh
31
+ pre-commit install
32
+ ```
33
+ - Hooks run `ruff` and `mypy` automatically on each commit.
34
+ - To run them manually: `pre-commit run --all-files`.
35
+
36
+ 4. Create a feature branch:
37
+ ```sh
38
+ git checkout -b feat/new-feature
39
+ ```
40
+ 5. Implement your changes.
41
+ 6. Run tests locally:
42
+ ```sh
43
+ pytest
44
+ ```
45
+ 7. Commit changes (follow Conventional Commits):
46
+ ```sh
47
+ git add .
48
+ git commit -m "feat: add feature description"
49
+ ```
50
+ 8. Push your branch to your fork:
51
+ ```sh
52
+ git push origin feat/new-feature
53
+ ```
54
+ 9. Open a pull request against the upstream repository and reference any related Discussion/Issue.
55
+
56
+
57
+ ## Code style
58
+
59
+ - Formatting and linting are enforced with **ruff**.
60
+ - Static typing is checked with **mypy**.
61
+
62
+ ## PR checklist
63
+
64
+ - Tests pass: `pytest`
65
+ - pre-commit checks pass: `pre-commit run --all-files`
66
+ - Commit messages follow Conventional Commits
67
+ - PR references related Issue/Discussion and describes changes
68
+ - Add tests for new behavior where applicable
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use Python 3.11 slim image as base
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PIP_NO_CACHE_DIR=1
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
ENV PYTHONPATH=/app

# Install system dependencies including curl for healthcheck.
# NOTE(review): no HEALTHCHECK is declared in this file; presumably it is
# defined by the orchestrator (e.g. compose) - confirm.
# --no-install-recommends keeps optional packages out of the image.
RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Copy application code
COPY . .

# Install Python dependencies (including API dependencies).
# The requirement is quoted so the shell cannot treat "[api]" as a glob.
RUN pip install --no-cache-dir -e ".[api]"

# Expose port
EXPOSE 8000

# Create non-root user
RUN useradd --create-home --shell /bin/bash app \
    && chown -R app:app /app
USER app

# Run the application
CMD ["python", "start_api.py"]
LICENSE.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 deedy5
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
Makefile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Tools are taken from the project-local virtual environment created by `setup`.
PY := .venv/bin/python
PIP := .venv/bin/pip

.PHONY: help setup lint format test all clean

# Help text mirrors what each target actually runs.
help:
	@echo "Targets:"
	@echo " setup - create venv and install dependencies"
	@echo " lint - run ruff check --fix and mypy"
	@echo " format - run ruff format"
	@echo " test - run pytest"
	@echo " all - run setup, lint, format and test"
	@echo " clean - remove cache, venv and build artifacts"

# The requirement is quoted so the shell cannot treat "[dev]" as a glob.
setup:
	python3 -m venv .venv
	$(PIP) install -e ".[dev]"

lint:
	$(PY) -m ruff check --fix
	$(PY) -m mypy --install-types --non-interactive .

format:
	$(PY) -m ruff format

test:
	$(PY) -m pytest

all: setup lint format test

clean:
	rm -rf .venv/
	rm -rf .pytest_cache/
	rm -rf .mypy_cache/
	rm -rf .ruff_cache/
	rm -rf build/
	rm -rf dist/
	rm -rf *.egg-info/
	find . -name __pycache__ -exec rm -rf {} +
	rm -f uv.lock
api/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """DDGS API package."""
2
+
3
+ __version__ = "1.0.0"
api/main.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI application for DDGS API."""
2
+
3
import asyncio
import logging
from typing import Any

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field

from ddgs import DDGS
11
+
12
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create FastAPI app
app = FastAPI(
    title="DDGS API",
    description="A FastAPI wrapper for the DDGS (Dux Distributed Global Search) library",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# Add CORS middleware.
# Credentials stay disabled: the CORS rules do not allow credentialed requests
# together with wildcard origins/methods/headers, and nothing in this module
# reads cookies or auth headers. To enable credentials, list the allowed
# origins, methods and headers explicitly instead of using "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
33
+
34
+
35
+ # Pydantic models for request/response
36
class TextSearchRequest(BaseModel):
    """Request model for text search operations."""

    # Only `query` is required; every other field falls back to its default.
    query: str = Field(..., description="Search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m, y) or custom date range")
    # Annotated as optional, so an explicit null is accepted and passed through.
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")
46
+
47
+
48
class ImagesSearchRequest(BaseModel):
    """Request model for image search operations."""

    # Only `query` is required; every other field falls back to its default.
    query: str = Field(..., description="Image search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m, y) or custom date range")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")
    # Image-specific filters; None means the filter is not set by this model.
    size: str | None = Field(None, description="Image size (Small, Medium, Large, Wallpaper)")
    color: str | None = Field(
        None,
        description="Image color (Monochrome, Red, Orange, Yellow, Green, Blue, Purple, Pink, Brown, Black, Gray, Teal, White)",  # noqa: E501
    )
    type_image: str | None = Field(None, description="Image type (photo, clipart, gif, transparent, line)")
    layout: str | None = Field(None, description="Image layout (Square, Tall, Wide)")
    license_image: str | None = Field(
        None, description="Image license (any, Public, Share, ShareCommercially, Modify, ModifyCommercially)"
    )
68
+
69
+
70
class NewsSearchRequest(BaseModel):
    """Request model for news search operations."""

    # Only `query` is required; every other field falls back to its default.
    query: str = Field(..., description="Search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m, y) or custom date range")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")
80
+
81
+
82
class VideosSearchRequest(BaseModel):
    """Request model for video search operations."""

    # Only `query` is required; every other field falls back to its default.
    query: str = Field(..., description="Video search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m) or custom date range")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")
    # Video-specific filters; None means the filter is not set by this model.
    resolution: str | None = Field(None, description="Video resolution (high, standard)")
    duration: str | None = Field(None, description="Video duration (short, medium, long)")
    license_videos: str | None = Field(None, description="Video license (creativeCommon, youtube)")
95
+
96
+
97
class BooksSearchRequest(BaseModel):
    """Request model for book search operations."""

    # Unlike the other request models, this one has no region, safesearch or
    # timelimit fields.
    query: str = Field(..., description="Books search query")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")
104
+
105
+
106
class SearchResponse(BaseModel):
    """Response model for search operations."""

    # One free-form dict per result; the keys are not constrained by this model.
    results: list[dict[str, Any]]
110
+
111
+
112
class HealthResponse(BaseModel):
    """Response model for health check."""

    # All three fields are required plain strings.
    status: str
    version: str
    service: str
118
+
119
+
120
@app.get("/", response_model=HealthResponse)
async def root() -> HealthResponse:
    """Root endpoint with basic service information."""
    # Static payload; no backend is contacted.
    info = {"status": "healthy", "version": "1.0.0", "service": "DDGS API"}
    return HealthResponse(**info)
124
+
125
+
126
@app.get("/health", response_model=HealthResponse)
async def health_check() -> HealthResponse:
    """Health check endpoint."""
    # Static payload; no backend is contacted.
    info = {"status": "healthy", "version": "1.0.0", "service": "DDGS API"}
    return HealthResponse(**info)
130
+
131
+
132
@app.post("/search/text", response_model=SearchResponse)
async def search_text(request: TextSearchRequest) -> SearchResponse:
    """Perform a text search."""
    # Any exception from the search or from building the response is logged
    # and converted to an HTTP 500 carrying the exception message.
    try:
        # DDGS().text is a synchronous call; running it in a worker thread
        # keeps the event loop free to serve other requests meanwhile.
        results = await asyncio.to_thread(
            DDGS().text,
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in text search: %s", e)
        raise HTTPException(status_code=500, detail=f"Search failed: {e!s}") from e
150
+
151
+
152
@app.get("/search/text", response_model=SearchResponse)
async def search_text_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
) -> SearchResponse:
    """Perform a text search via GET request."""
    # Any exception from the search or from building the response is logged
    # and converted to an HTTP 500 carrying the exception message.
    try:
        # DDGS().text is a synchronous call; running it in a worker thread
        # keeps the event loop free to serve other requests meanwhile.
        results = await asyncio.to_thread(
            DDGS().text,
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in text search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Search failed: {e!s}") from e
178
+
179
+
180
@app.post("/search/images", response_model=SearchResponse)
async def search_images(request: ImagesSearchRequest) -> SearchResponse:
    """Perform an image search."""
    # Any exception from the search or from building the response is logged
    # and converted to an HTTP 500 carrying the exception message.
    try:
        # DDGS().images is a synchronous call; running it in a worker thread
        # keeps the event loop free to serve other requests meanwhile.
        results = await asyncio.to_thread(
            DDGS().images,
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
            size=request.size,
            color=request.color,
            type_image=request.type_image,
            layout=request.layout,
            license_image=request.license_image,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in image search: %s", e)
        raise HTTPException(status_code=500, detail=f"Image search failed: {e!s}") from e
203
+
204
+
205
@app.get("/search/images", response_model=SearchResponse)
async def search_images_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
    size: str | None = None,
    color: str | None = None,
    type_image: str | None = None,
    layout: str | None = None,
    license_image: str | None = None,
) -> SearchResponse:
    """Perform an image search via GET request."""
    # Any exception from the search or from building the response is logged
    # and converted to an HTTP 500 carrying the exception message.
    try:
        # DDGS().images is a synchronous call; running it in a worker thread
        # keeps the event loop free to serve other requests meanwhile.
        results = await asyncio.to_thread(
            DDGS().images,
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
            size=size,
            color=color,
            type_image=type_image,
            layout=layout,
            license_image=license_image,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in image search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Image search failed: {e!s}") from e
241
+
242
+
243
@app.post("/search/news", response_model=SearchResponse)
async def search_news(request: NewsSearchRequest) -> SearchResponse:
    """Perform a news search."""
    # Any exception from the search or from building the response is logged
    # and converted to an HTTP 500 carrying the exception message.
    try:
        # DDGS().news is a synchronous call; running it in a worker thread
        # keeps the event loop free to serve other requests meanwhile.
        results = await asyncio.to_thread(
            DDGS().news,
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in news search: %s", e)
        raise HTTPException(status_code=500, detail=f"News search failed: {e!s}") from e
261
+
262
+
263
@app.get("/search/news", response_model=SearchResponse)
async def search_news_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
) -> SearchResponse:
    """Perform a news search via GET request."""
    # Any exception from the search or from building the response is logged
    # and converted to an HTTP 500 carrying the exception message.
    try:
        # DDGS().news is a synchronous call; running it in a worker thread
        # keeps the event loop free to serve other requests meanwhile.
        results = await asyncio.to_thread(
            DDGS().news,
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in news search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"News search failed: {e!s}") from e
289
+
290
+
291
@app.post("/search/videos", response_model=SearchResponse)
async def search_videos(request: VideosSearchRequest) -> SearchResponse:
    """Perform a video search."""
    # Any exception from the search or from building the response is logged
    # and converted to an HTTP 500 carrying the exception message.
    try:
        # DDGS().videos is a synchronous call; running it in a worker thread
        # keeps the event loop free to serve other requests meanwhile.
        results = await asyncio.to_thread(
            DDGS().videos,
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
            resolution=request.resolution,
            duration=request.duration,
            license_videos=request.license_videos,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in video search: %s", e)
        raise HTTPException(status_code=500, detail=f"Video search failed: {e!s}") from e
312
+
313
+
314
@app.get("/search/videos", response_model=SearchResponse)
async def search_videos_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
    resolution: str | None = None,
    duration: str | None = None,
    license_videos: str | None = None,
) -> SearchResponse:
    """Perform a video search via GET request."""
    # Any exception from the search or from building the response is logged
    # and converted to an HTTP 500 carrying the exception message.
    try:
        # DDGS().videos is a synchronous call; running it in a worker thread
        # keeps the event loop free to serve other requests meanwhile.
        results = await asyncio.to_thread(
            DDGS().videos,
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
            resolution=resolution,
            duration=duration,
            license_videos=license_videos,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in video search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Video search failed: {e!s}") from e
346
+
347
+
348
@app.post("/search/books", response_model=SearchResponse)
async def search_books(request: BooksSearchRequest) -> SearchResponse:
    """Perform a book search."""
    # Any exception from the search or from building the response is logged
    # and converted to an HTTP 500 carrying the exception message.
    try:
        # DDGS().books is a synchronous call; running it in a worker thread
        # keeps the event loop free to serve other requests meanwhile.
        results = await asyncio.to_thread(
            DDGS().books,
            query=request.query,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in book search: %s", e)
        raise HTTPException(status_code=500, detail=f"Book search failed: {e!s}") from e
363
+
364
+
365
@app.get("/search/books", response_model=SearchResponse)
async def search_books_get(
    query: str,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
) -> SearchResponse:
    """Perform a book search via GET request."""
    # Any exception from the search or from building the response is logged
    # and converted to an HTTP 500 carrying the exception message.
    try:
        # DDGS().books is a synchronous call; running it in a worker thread
        # keeps the event loop free to serve other requests meanwhile.
        results = await asyncio.to_thread(
            DDGS().books,
            query=query,
            max_results=max_results,
            page=page,
            backend=backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in book search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Book search failed: {e!s}") from e
385
+
386
+
387
if __name__ == "__main__":
    # Running this module as a script starts the app under uvicorn.
    # uvicorn is imported here so merely importing the module does not need it.
    import uvicorn

    # Listens on all interfaces, port 8000; the S104 lint warning about
    # binding to 0.0.0.0 is suppressed on purpose.
    uvicorn.run(app, host="0.0.0.0", port=8000)  # noqa: S104
ddgs/__init__.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DDGS | Dux Distributed Global Search.
2
+
3
+ A metasearch library that aggregates results from diverse web search services.
4
+ """
5
+
6
+ import importlib
7
+ import logging
8
+ import threading
9
+ from typing import TYPE_CHECKING, Any, cast
10
+
11
+ __version__ = "9.10.0"
12
+ __all__ = ("DDGS",)
13
+
14
+ if TYPE_CHECKING:
15
+ from .ddgs import DDGS
16
+
17
+ # A do-nothing logging handler
18
+ # https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
19
+ logging.getLogger("ddgs").addHandler(logging.NullHandler())
20
+
21
+
22
+ class _ProxyMeta(type):
23
+ _lock: threading.Lock = threading.Lock()
24
+ _real_cls: type["DDGS"] | None = None
25
+
26
+ @classmethod
27
+ def _load_real(cls) -> type["DDGS"]:
28
+ if cls._real_cls is None:
29
+ with cls._lock:
30
+ if cls._real_cls is None:
31
+ cls._real_cls = importlib.import_module(".ddgs", package=__name__).DDGS
32
+ globals()["DDGS"] = cls._real_cls
33
+ return cls._real_cls
34
+
35
+ def __call__(cls, *args: Any, **kwargs: Any) -> "DDGS": # noqa: ANN401
36
+ real = type(cls)._load_real()
37
+ return real(*args, **kwargs)
38
+
39
+ def __getattr__(cls, name: str) -> Any: # noqa: ANN401
40
+ return getattr(type(cls)._load_real(), name)
41
+
42
+ def __dir__(cls) -> list[str]:
43
+ base = set(super().__dir__())
44
+ loaded_names = set(dir(type(cls)._load_real()))
45
+ return sorted(base | (loaded_names - base))
46
+
47
+
48
class _DDGSProxy(metaclass=_ProxyMeta):
    """Proxy class for lazy-loading the real DDGS implementation."""


# Publish the proxy under the public name. The cast makes type checkers treat
# it as the real class, which is imported above only under TYPE_CHECKING.
DDGS: type[DDGS] = cast("type[DDGS]", _DDGSProxy)  # type: ignore[no-redef]
ddgs/base.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Base class for search engines."""
2
+
3
+ import logging
4
+ from abc import ABC, abstractmethod
5
+ from collections.abc import Mapping
6
+ from functools import cached_property
7
+ from typing import Any, ClassVar, Generic, Literal, TypeVar
8
+
9
+ from lxml import html
10
+ from lxml.etree import HTMLParser as LHTMLParser
11
+
12
+ from .http_client import HttpClient
13
+ from .results import BooksResult, ImagesResult, NewsResult, TextResult, VideosResult
14
+
15
+ logger = logging.getLogger(__name__)
16
+ T = TypeVar("T")
17
+
18
+
19
class BaseSearchEngine(ABC, Generic[T]):
    """Abstract base class for all search-engine backends.

    A subclass sets the class-level configuration below and implements
    ``build_payload``. ``search`` then sends the request, parses the HTML and
    fills one result object per node matched by ``items_xpath``.
    """

    name: ClassVar[str]  # unique key, e.g. "google"
    category: ClassVar[Literal["text", "images", "videos", "news", "books"]]
    provider: ClassVar[str]  # source of the search results (e.g. "bing" for DuckDuckgo)
    disabled: ClassVar[bool] = False  # if True, the engine is disabled
    priority: ClassVar[float] = 1

    search_url: str
    search_method: ClassVar[str]  # GET or POST
    search_headers: ClassVar[Mapping[str, str]] = {}
    items_xpath: ClassVar[str]
    elements_xpath: ClassVar[Mapping[str, str]]
    elements_replace: ClassVar[Mapping[str, str]]

    def __init__(self, proxy: str | None = None, timeout: int | None = None, *, verify: bool | str = True) -> None:
        self.http_client = HttpClient(proxy=proxy, timeout=timeout, verify=verify)
        self.results: list[T] = []

    @property
    def result_type(self) -> type[T]:
        """Get result type based on category."""
        by_category = {
            "text": TextResult,
            "images": ImagesResult,
            "videos": VideosResult,
            "news": NewsResult,
            "books": BooksResult,
        }
        return by_category[self.category]

    @abstractmethod
    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int,
        **kwargs: str,
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        raise NotImplementedError

    def request(self, *args: Any, **kwargs: Any) -> str | None:  # noqa: ANN401
        """Make a request to the search engine.

        Returns the response text for a 200 status and ``None`` otherwise.
        """
        resp = self.http_client.request(*args, **kwargs)
        return resp.text if resp.status_code == 200 else None

    @cached_property
    def parser(self) -> LHTMLParser:
        """Get HTML parser."""
        return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)

    def extract_tree(self, html_text: str) -> html.Element:
        """Extract html tree from html text."""
        return html.fromstring(html_text, parser=self.parser)

    def pre_process_html(self, html_text: str) -> str:
        """Pre-process html_text before extracting results."""
        return html_text

    def extract_results(self, html_text: str) -> list[T]:
        """Extract search results from html text."""
        tree = self.extract_tree(self.pre_process_html(html_text))
        results = []
        for item in tree.xpath(self.items_xpath):
            result = self.result_type()
            for key, xpath in self.elements_xpath.items():
                # Each field is every matched string, stripped and space-joined.
                setattr(result, key, " ".join(x.strip() for x in item.xpath(xpath)))
            results.append(result)
        return results

    def post_extract_results(self, results: list[T]) -> list[T]:
        """Post-process search results."""
        return results

    def search(
        self,
        query: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        page: int = 1,
        **kwargs: str,
    ) -> list[T] | None:
        """Search the engine.

        Returns ``None`` when the request yields no text.
        """
        payload = self.build_payload(
            query=query, region=region, safesearch=safesearch, timelimit=timelimit, page=page, **kwargs
        )
        # GET sends the payload as query parameters; anything else as form data.
        payload_kwarg = "params" if self.search_method == "GET" else "data"
        html_text = self.request(
            self.search_method, self.search_url, headers=self.search_headers, **{payload_kwarg: payload}
        )
        if not html_text:
            return None
        return self.post_extract_results(self.extract_results(html_text))
ddgs/cli.py ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CLI tool for DDGS."""
2
+
3
+ import csv
4
+ import json
5
+ import logging
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from urllib.parse import unquote
10
+
11
+ import click
12
+ import primp
13
+
14
+ from . import __version__
15
+ from .ddgs import DDGS
16
+ from .utils import _expand_proxy_tb_alias
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
# Colour names indexed 0..15; `_print_data` uses them as the `fg` of each field.
COLORS = dict(
    enumerate(
        (
            "black",
            "red",
            "green",
            "yellow",
            "blue",
            "magenta",
            "cyan",
            "bright_black",
            "bright_red",
            "bright_green",
            "bright_yellow",
            "bright_blue",
            "bright_magenta",
            "bright_cyan",
            "white",
            "bright_white",
        )
    )
)
38
+
39
+
40
def _convert_tuple_to_csv(_ctx: click.Context, _param: click.Parameter, value: tuple[str] | None) -> str:
    """Join a tuple of strings with commas; any non-tuple value yields ""."""
    # The signature follows click's (ctx, param, value) callback form; the
    # first two arguments are unused.
    return ",".join(value) if isinstance(value, tuple) else ""
44
+
45
+
46
+ def _save_data(query: str, data: list[dict[str, str]], function_name: str, filename: str | None) -> None:
47
+ filename, ext = filename.rsplit(".", 1) if filename and filename.endswith((".csv", ".json")) else (None, filename)
48
+ filename = filename if filename else f"{function_name}_{query}_{datetime.now(tz=timezone.utc):%Y%m%d_%H%M%S}"
49
+ if ext == "csv":
50
+ _save_csv(f"{filename}.{ext}", data)
51
+ elif ext == "json":
52
+ _save_json(f"{filename}.{ext}", data)
53
+
54
+
55
+ def _save_json(jsonfile: str | Path, data: list[dict[str, str]]) -> None:
56
+ with Path(jsonfile).open("w", encoding="utf-8") as file:
57
+ file.write(json.dumps(data, ensure_ascii=False, indent=2))
58
+
59
+
60
+ def _save_csv(csvfile: str | Path, data: list[dict[str, str]]) -> None:
61
+ with Path(csvfile).open("w", newline="", encoding="utf-8") as file:
62
+ if data:
63
+ headers = data[0].keys()
64
+ writer = csv.DictWriter(file, fieldnames=headers, quoting=csv.QUOTE_MINIMAL)
65
+ writer.writeheader()
66
+ writer.writerows(data)
67
+
68
+
69
def _print_data(data: list[dict[str, str]], *, no_color: bool = False) -> None:
    """Print results to the terminal, one block per result.

    Each result is a numbered separator line followed by one line per field.
    After each result the function waits on ``input()`` before continuing.

    Args:
        data: Result dicts to print; nothing is printed when it is empty.
        no_color: If true, every field is printed in white instead of a
            per-field colour.
    """
    if not data:
        return

    # Field colours cycle through COLORS[1:], so a result with more fields
    # than there are colours no longer raises KeyError. Index 0 ("black") is
    # skipped because the text is drawn on a black background.
    palette_size = len(COLORS) - 1

    for i, e in enumerate(data, start=1):
        click.secho(f"{i}.\t {'=' * 78}", bg="black", fg="white")
        for j, (k, v) in enumerate(e.items(), start=1):
            if v:
                # URL-like and long-text fields get a wider wrap.
                width = 300 if k in ("content", "href", "image", "source", "thumbnail", "url") else 78
                title = "language" if k == "detected_language" else k
                text = click.wrap_text(
                    f"{v}",
                    width=width,
                    initial_indent="",
                    subsequent_indent=" " * 12,
                    preserve_paragraphs=True,
                )
            else:
                # Empty values are printed as they are, without wrapping.
                title = k
                text = v
            fg = "white" if no_color else COLORS[(j - 1) % palette_size + 1]
            click.secho(f"{title:<12}{text}", bg="black", fg=fg, overline=True)
        input()
89
+
90
+
91
def _sanitize_query(query: str) -> str:
    """Strip search operators and path-unsafe characters from *query*.

    The replacements run strictly in order, each on the output of the one
    before, so the order of the pairs below is significant.
    """
    replacements = (
        ("filetype", ""),
        (":", ""),
        ('"', "'"),
        ("site", ""),
        (" ", "_"),
        ("/", "_"),
        ("\\", "_"),
        (" ", ""),
    )
    cleaned = query
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned
102
+
103
+
104
def _download_file(url: str, dir_path: str, filename: str, proxy: str | None, *, verify: bool) -> None:
    """Download *url* into *dir_path*, best effort.

    The file is written only for a 200 response, under *filename* cut to 200
    characters. Any exception is logged at debug level and not re-raised.
    """
    try:
        client = primp.Client(proxy=proxy, impersonate="random", impersonate_os="random", timeout=10, verify=verify)
        resp = client.get(
            url,
        )
        if resp.status_code != 200:
            return
        target = Path(dir_path) / filename[:200]
        with target.open("wb") as file:
            file.write(resp.content)
    except Exception as ex:  # noqa: BLE001
        logger.debug("Error download_file url=%s: %r", url, ex)
115
+
116
+
117
def _download_results(
    query: str,
    results: list[dict[str, str]],
    function_name: str,
    proxy: str | None = None,
    threads: int | None = None,
    pathname: str | None = None,
    *,
    verify: bool = True,
) -> None:
    """Download the file behind every result, in parallel, with a progress bar.

    Files go into *pathname*, or into a directory named
    ``<function_name>_<query>_<UTC timestamp>`` when it is not given. The URL
    is read from the ``"image"`` key for image results and ``"href"`` otherwise.
    """
    target_dir = pathname or f"{function_name}_{query}_{datetime.now(tz=timezone.utc):%Y%m%d_%H%M%S}"
    Path(target_dir).mkdir(parents=True, exist_ok=True)

    url_key = "image" if function_name == "images" else "href"
    max_workers = threads if threads is not None else 10
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for index, res in enumerate(results, start=1):
            url = res[url_key]
            # Last path segment without the query string, percent-decoded.
            basename = unquote(url.split("/")[-1].split("?")[0])
            futures.append(
                executor.submit(_download_file, url, target_dir, f"{index}_{basename}", proxy, verify=verify)
            )

        with click.progressbar(
            length=len(futures),
            label="Downloading",
            show_percent=True,
            show_pos=True,
            width=50,
        ) as bar:
            for finished in as_completed(futures):
                finished.result()
                bar.update(1)
149
+
150
+
151
# Root command group: every `@cli.command()` below registers a subcommand on it.
# The group is created with chain=True (click's option for invoking several
# subcommands in a single command line).
@click.group(chain=True)
def cli() -> None:
    """DDGS CLI tool."""
154
+
155
+
156
def safe_entry_point() -> None:
    """Run the CLI tool in try-except block to catch all exceptions.

    Logging is configured at WARNING level first. Any exception raised by the
    CLI is reported as a single ``Type: repr`` line on stderr instead of a
    traceback, and the process exits with status 1 so that shells and scripts
    can detect the failure.

    Raises:
        SystemExit: With code 1 when the CLI raised an exception.

    """
    logging.basicConfig(level=logging.WARNING)
    try:
        cli()
    except Exception as ex:  # noqa: BLE001
        # Diagnostics go to stderr so they do not mix with result output on stdout.
        click.echo(f"{type(ex).__name__}: {ex!r}", err=True)
        raise SystemExit(1) from ex
163
+
164
+
165
# `version` subcommand: writes the package version to stdout and also returns it.
@cli.command()
def version() -> str:
    """Print and return version."""
    print(__version__)  # noqa: T201
    return __version__
170
+
171
+
172
@cli.command()
@click.option("-q", "--query", help="text search query")
@click.option("-k", "--keywords", help="(Deprecated) text search query")  # deprecated
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m", "y"]), help="day, week, month, year")
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
@click.option("-p", "--page", default=1, type=int, help="page number of results")
@click.option(
    "-b",
    "--backend",
    default=["auto"],
    type=click.Choice(
        [
            "auto",
            "all",
            "bing",
            "brave",
            "duckduckgo",
            "google",
            "grokipedia",
            "mojeek",
            "yandex",
            "yahoo",
            "wikipedia",
        ],
    ),
    multiple=True,
    callback=_convert_tuple_to_csv,
)
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
@click.option("-d", "--download", is_flag=True, default=False, help="download results. -dd to set custom directory")
@click.option("-dd", "--download-directory", help="Specify custom download directory")
@click.option("-th", "--threads", default=10, help="download threads, default=10")
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
def text(
    query: str | None,
    keywords: str | None,  # deprecated
    region: str,
    safesearch: str,
    timelimit: str | None,
    max_results: int | None,
    page: int,
    backend: str,
    output: str | None,
    download_directory: str | None,
    threads: int,
    proxy: str | None,
    *,
    download: bool,
    verify: bool,
    no_color: bool,
) -> None:
    """CLI function to perform a DDGS text metasearch."""
    # Expand the proxy alias once and use the same value for search and downloads.
    expanded_proxy = _expand_proxy_tb_alias(proxy)
    data = DDGS(proxy=expanded_proxy, verify=verify).text(
        query=query,
        keywords=keywords,  # deprecated
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        max_results=max_results,
        page=page,
        backend=backend,
    )
    # --query has no default, so it is None when only the deprecated --keywords
    # is supplied; use whichever term was given for the output name.
    query = _sanitize_query(keywords or query or "")
    if output:
        _save_data(query, data, "text", filename=output)
    if download:
        _download_results(
            query,
            data,
            function_name="text",
            proxy=expanded_proxy,
            threads=threads,
            verify=verify,
            pathname=download_directory,
        )
    # Print to the terminal only when the results went nowhere else.
    if not output and not download:
        _print_data(data, no_color=no_color)
253
+
254
+
255
@cli.command()
@click.option("-q", "--query", help="images search query")
@click.option("-k", "--keywords", help="(Deprecated) images search query")  # deprecated
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m", "y"]))
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
@click.option("-p", "--page", default=1, type=int, help="page number of results")
@click.option(
    "-b",
    "--backend",
    default=["auto"],
    type=click.Choice(["auto", "all", "duckduckgo"]),
    multiple=True,
    callback=_convert_tuple_to_csv,
)
@click.option("-size", "--size", type=click.Choice(["Small", "Medium", "Large", "Wallpaper"]))
@click.option(
    "-c",
    "--color",
    type=click.Choice(
        [
            "color",
            "Monochrome",
            "Red",
            "Orange",
            "Yellow",
            "Green",
            "Blue",
            "Purple",
            "Pink",
            "Brown",
            "Black",
            "Gray",
            "Teal",
            "White",
        ],
    ),
)
@click.option("-type", "--type_image", type=click.Choice(["photo", "clipart", "gif", "transparent", "line"]))
@click.option("-l", "--layout", type=click.Choice(["Square", "Tall", "Wide"]))
@click.option(
    "-lic",
    "--license_image",
    type=click.Choice(["any", "Public", "Share", "ShareCommercially", "Modify", "ModifyCommercially"]),
)
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
@click.option("-d", "--download", is_flag=True, default=False, help="download results. -dd to set custom directory")
@click.option("-dd", "--download-directory", help="Specify custom download directory")
@click.option("-th", "--threads", default=10, help="download threads, default=10")
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
def images(
    query: str | None,
    keywords: str | None,  # deprecated
    region: str,
    safesearch: str,
    timelimit: str | None,
    max_results: int | None,
    page: int,
    backend: str,
    size: str | None,
    color: str | None,
    type_image: str | None,
    layout: str | None,
    license_image: str | None,
    download_directory: str | None,
    threads: int,
    output: str | None,
    proxy: str | None,
    *,
    download: bool,
    verify: bool,
    no_color: bool,
) -> None:
    """CLI function to perform a DDGS images metasearch."""
    # Expand the proxy alias once and use the same value for search and downloads.
    expanded_proxy = _expand_proxy_tb_alias(proxy)
    data = DDGS(proxy=expanded_proxy, verify=verify).images(
        query=query,
        keywords=keywords,  # deprecated
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        max_results=max_results,
        page=page,
        backend=backend,
        size=size,
        color=color,
        type_image=type_image,
        layout=layout,
        license_image=license_image,
    )
    # --query has no default, so it is None when only the deprecated --keywords
    # is supplied; use whichever term was given for the output name.
    query = _sanitize_query(keywords or query or "")
    if output:
        _save_data(query, data, function_name="images", filename=output)
    if download:
        _download_results(
            query,
            data,
            function_name="images",
            proxy=expanded_proxy,
            threads=threads,
            verify=verify,
            pathname=download_directory,
        )
    # Print to the terminal only when the results went nowhere else.
    if not output and not download:
        _print_data(data, no_color=no_color)
362
+
363
+
364
@cli.command()
@click.option("-q", "--query", help="videos search query")
@click.option("-k", "--keywords", help="(Deprecated) videos search query")  # deprecated
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m"]), help="day, week, month")
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
@click.option("-p", "--page", default=1, type=int, help="page number of results")
@click.option(
    "-b",
    "--backend",
    default=["auto"],
    type=click.Choice(["auto", "all", "duckduckgo"]),
    multiple=True,
    callback=_convert_tuple_to_csv,
)
@click.option("-res", "--resolution", type=click.Choice(["high", "standart"]))
@click.option("-d", "--duration", type=click.Choice(["short", "medium", "long"]))
@click.option("-lic", "--license_videos", type=click.Choice(["creativeCommon", "youtube"]))
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
def videos(
    query: str | None,
    keywords: str | None,  # deprecated
    region: str,
    safesearch: str,
    timelimit: str | None,
    max_results: int | None,
    page: int,
    backend: str,
    resolution: str | None,
    duration: str | None,
    license_videos: str | None,
    output: str | None,
    proxy: str | None,
    *,
    verify: bool,
    no_color: bool,
) -> None:
    """CLI function to perform a DDGS videos metasearch."""
    data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).videos(
        query=query,
        keywords=keywords,  # deprecated
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        max_results=max_results,
        page=page,
        backend=backend,
        resolution=resolution,
        duration=duration,
        license_videos=license_videos,
    )
    # --query has no default, so it is None when only the deprecated --keywords
    # is supplied; use whichever term was given for the output name.
    query = _sanitize_query(keywords or query or "")
    if output:
        _save_data(query, data, function_name="videos", filename=output)
    else:
        _print_data(data, no_color=no_color)
424
+
425
+
426
@cli.command()
@click.option("-q", "--query", help="news search query")
@click.option("-k", "--keywords", help="(Deprecated) news search query")  # deprecated
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m", "y"]), help="day, week, month, year")
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
@click.option("-p", "--page", default=1, type=int, help="page number of results")
@click.option(
    "-b",
    "--backend",
    default=["auto"],
    type=click.Choice(["auto", "all", "bing", "duckduckgo", "yahoo"]),
    multiple=True,
    callback=_convert_tuple_to_csv,
)
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
def news(
    query: str | None,
    keywords: str | None,  # deprecated
    region: str,
    safesearch: str,
    timelimit: str | None,
    max_results: int | None,
    page: int,
    backend: str,
    output: str | None,
    proxy: str | None,
    *,
    verify: bool,
    no_color: bool,
) -> None:
    """CLI function to perform a DDGS news metasearch."""
    data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).news(
        query=query,
        keywords=keywords,  # deprecated
        region=region,
        safesearch=safesearch,
        timelimit=timelimit,
        max_results=max_results,
        page=page,
        backend=backend,
    )
    # --query has no default, so it is None when only the deprecated --keywords
    # is supplied; use whichever term was given for the output name.
    query = _sanitize_query(keywords or query or "")
    if output:
        _save_data(query, data, function_name="news", filename=output)
    else:
        _print_data(data, no_color=no_color)
477
+
478
+
479
@cli.command()
@click.option("-q", "--query", help="books search query")
@click.option("-k", "--keywords", help="(Deprecated) books search query")  # deprecated
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
@click.option("-p", "--page", default=1, type=int, help="page number of results")
@click.option(
    "-b",
    "--backend",
    default=["auto"],
    type=click.Choice(["auto", "all", "annasarchive"]),
    multiple=True,
    callback=_convert_tuple_to_csv,
)
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
def books(
    query: str | None,
    keywords: str | None,  # deprecated
    max_results: int | None,
    page: int,
    backend: str,
    output: str | None,
    proxy: str | None,
    *,
    verify: bool,
    no_color: bool,
) -> None:
    """CLI function to perform a DDGS books metasearch."""
    data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).books(
        query=query,
        keywords=keywords,  # deprecated
        max_results=max_results,
        page=page,
        backend=backend,
    )
    # Sanitize the term before it is handed to _save_data, as the other
    # commands in this module do. --query has no default, so it is None when
    # only the deprecated --keywords is supplied.
    query = _sanitize_query(keywords or query or "")
    if output:
        _save_data(query, data, function_name="books", filename=output)
    else:
        _print_data(data, no_color=no_color)
520
+
521
+
522
# Running this module as a script goes through the exception-catching wrapper
# rather than calling `cli()` directly.
if __name__ == "__main__":
    safe_entry_point()
ddgs/ddgs.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DDGS class implementation."""
2
+
3
+ import logging
4
+ import os
5
+ from concurrent.futures import ThreadPoolExecutor, wait
6
+ from math import ceil
7
+ from random import random, shuffle
8
+ from types import TracebackType
9
+ from typing import Any, ClassVar
10
+
11
+ from .base import BaseSearchEngine
12
+ from .engines import ENGINES
13
+ from .exceptions import DDGSException, TimeoutException
14
+ from .results import ResultsAggregator
15
+ from .similarity import SimpleFilterRanker
16
+ from .utils import _expand_proxy_tb_alias
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class DDGS:
    """DDGS | Dux Distributed Global Search.

    A metasearch library that aggregates results from diverse web search services.

    Args:
        proxy: The proxy to use for the search. Defaults to None, in which case
            the ``DDGS_PROXY`` environment variable is used when set.
        timeout: The timeout for the search. Defaults to 5.
        verify: bool (True to verify, False to skip) or str path to a PEM file. Defaults to True.

    Attributes:
        threads: The number of threads to use for the search. Defaults to None (automatic).
        _executor: The ThreadPoolExecutor instance, shared by all DDGS instances.

    Raises:
        DDGSException: If an error occurs during the search.

    Example:
        >>> from ddgs import DDGS
        >>> results = DDGS().text("python")

    """

    threads: ClassVar[int | None] = None
    _executor: ClassVar[ThreadPoolExecutor | None] = None

    def __init__(self, proxy: str | None = None, timeout: int | None = 5, *, verify: bool | str = True) -> None:
        self._proxy = _expand_proxy_tb_alias(proxy) or os.environ.get("DDGS_PROXY")
        self._timeout = timeout
        self._verify = verify
        self._engines_cache: dict[
            type[BaseSearchEngine[Any]], BaseSearchEngine[Any]
        ] = {}  # dict[engine_class, engine_instance]

    def __enter__(self) -> "DDGS":
        """Enter the context manager and return the DDGS instance."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None = None,
        exc_val: BaseException | None = None,
        exc_tb: TracebackType | None = None,
    ) -> None:
        """Exit the context manager."""

    @classmethod
    def get_executor(cls) -> ThreadPoolExecutor:
        """Get a ThreadPoolExecutor instance and cache it."""
        if cls._executor is None:
            cls._executor = ThreadPoolExecutor(max_workers=cls.threads, thread_name_prefix="DDGS")
        return cls._executor

    def _get_engines(
        self,
        category: str,
        backend: str,
    ) -> list[BaseSearchEngine[Any]]:
        """Retrieve a list of search engine instances for a given category and backend.

        Args:
            category: The category of search engines (e.g., 'text', 'images', etc.).
            backend: A single or comma-delimited backends. Defaults to "auto".

        Returns:
            A list of initialized search engine instances corresponding to the specified
            category and backend. Instances are cached for reuse. When a requested
            backend is not registered for the category, a warning is logged and the
            "auto" selection is returned instead.

        """
        if isinstance(backend, list):  # deprecated
            backend = ",".join(backend)
        backend_list = [x.strip() for x in backend.split(",")]
        engine_keys = list(ENGINES[category].keys())
        shuffle(engine_keys)
        if "auto" in backend_list or "all" in backend_list:
            keys = engine_keys
            if category == "text":
                # Put the encyclopedia backends first, but only those actually
                # registered: a missing one would raise KeyError below and the
                # "auto" fallback would then call itself without end.
                preferred = [k for k in ("wikipedia", "grokipedia") if k in engine_keys]
                keys = preferred + [k for k in engine_keys if k not in preferred]
        else:
            keys = backend_list

        # Only the registry lookup is guarded, so that a KeyError raised while
        # constructing an engine is not mistaken for an unknown backend.
        try:
            engine_classes = [ENGINES[category][key] for key in keys]
        except KeyError as ex:
            logger.warning(
                "%r - backend is not exist or disabled. Available: %s. Using 'auto'",
                ex,
                ", ".join(sorted(engine_keys)),
            )
            return self._get_engines(category, "auto")

        # Initialize and cache engine instances
        instances = []
        for engine_class in engine_classes:
            # If already cached, use the cached instance
            if engine_class in self._engines_cache:
                instances.append(self._engines_cache[engine_class])
            # If not cached, create a new instance
            else:
                engine_instance = engine_class(proxy=self._proxy, timeout=self._timeout, verify=self._verify)
                self._engines_cache[engine_class] = engine_instance
                instances.append(engine_instance)

        # sorting by `engine.priority`
        # NOTE(review): `random` is the function object, not a call, so the second
        # key element is the same for every engine and never affects the order;
        # engines with equal priority keep the order built above (sort is stable).
        instances.sort(key=lambda e: (e.priority, random), reverse=True)
        return instances

    def _search(  # noqa: C901
        self,
        category: str,
        query: str,
        keywords: str | None = None,  # deprecated
        *,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        max_results: int | None = 10,
        page: int = 1,
        backend: str = "auto",
        **kwargs: str,
    ) -> list[dict[str, Any]]:
        """Perform a search across engines in the given category.

        Engines are submitted to the shared executor in batches. After each batch
        the finished futures are harvested, unfinished ones are carried over, and
        the loop stops once enough results have been collected.

        Args:
            category: The category of search engines (e.g., 'text', 'images', etc.).
            query: The search query.
            keywords: Deprecated alias for `query`.
            region: The region to use for the search (e.g., us-en, uk-en, ru-ru, etc.).
            safesearch: The safesearch setting (e.g., on, moderate, off).
            timelimit: The timelimit for the search (e.g., d, w, m, y) or custom date range.
            max_results: The maximum number of results to return. Defaults to 10.
            page: The page of results to return. Defaults to 1.
            backend: A single or comma-delimited backends. Defaults to "auto".
            **kwargs: Additional keyword arguments to pass to the search engines.

        Returns:
            A list of dictionaries containing the search results.

        Raises:
            DDGSException: If no query is given, or no engine returned results.
            TimeoutException: If no results were found and the last engine error
                mentions "timed out".

        """
        query = keywords or query
        if not query:
            msg = "query is mandatory."
            raise DDGSException(msg)

        engines = self._get_engines(category, backend)
        len_unique_providers = len({engine.provider for engine in engines})
        seen_providers: set[str] = set()

        # Perform search
        results_aggregator: ResultsAggregator[set[str]] = ResultsAggregator({"href", "image", "url", "embed_url"})
        max_workers = min(len_unique_providers, ceil(max_results / 10) + 1) if max_results else len_unique_providers
        executor = self.get_executor()
        futures, err = {}, None
        for i, engine in enumerate(engines, start=1):
            # Skip engines whose provider has already delivered results.
            if engine.provider in seen_providers:
                continue
            future = executor.submit(
                engine.search,
                query,
                region=region,
                safesearch=safesearch,
                timelimit=timelimit,
                page=page,
                **kwargs,
            )
            futures[future] = engine

            if len(futures) >= max_workers or i >= max_workers:
                done, not_done = wait(futures, timeout=self._timeout, return_when="FIRST_EXCEPTION")
                for f, f_engine in futures.items():
                    if f in done:
                        try:
                            if r := f.result():
                                results_aggregator.extend(r)
                                seen_providers.add(f_engine.provider)
                        except Exception as ex:  # noqa: BLE001
                            err = ex
                            # Log the engine that owns the failed future, not the
                            # engine most recently submitted by the outer loop.
                            logger.info("Error in engine %s: %r", f_engine.name, ex)
                # Carry unfinished futures over to the next batch.
                futures = {f: futures[f] for f in not_done}

            if max_results and len(results_aggregator) >= max_results:
                break

        results = results_aggregator.extract_dicts()
        # Rank results
        ranker = SimpleFilterRanker()
        results = ranker.rank(results, query)

        if results:
            return results[:max_results] if max_results else results

        if "timed out" in f"{err}":
            raise TimeoutException(err)
        raise DDGSException(err or "No results found.")

    def text(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a text search."""
        return self._search("text", query, **kwargs)

    def images(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform an image search."""
        return self._search("images", query, **kwargs)

    def news(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a news search."""
        return self._search("news", query, **kwargs)

    def videos(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a video search."""
        return self._search("videos", query, **kwargs)

    def books(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a book search."""
        return self._search("books", query, **kwargs)
+ return self._search("books", query, **kwargs)
ddgs/engines/__init__.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Automatically build registry of search engines.
2
+
3
+ This module defines the module-level variable ENGINES, which is a dictionary
4
+ of dictionaries. The keys of the outer dictionary are the categories of search
5
+ engines, and the keys of the inner dictionaries are the names of the search
6
+ engines. The values of the inner dictionaries are the classes of the search
7
+ engines.
8
+
9
+ The search engines are automatically discovered by looking for classes in the
10
+ module that are subclasses of :class:`ddgs.base.BaseSearchEngine` and
11
+ subclasses of the base class do not have names starting with "Base", and
12
+ do not have a class attribute "disabled" set to True.
13
+
14
+ The module automatically builds the ENGINES dictionary, so it should not be
15
+ imported directly by user code.
16
+
17
+ Example of resulting dictionary ENGINES:
18
+
19
+ from .bing import Bing
20
+ from .brave import Brave
21
+ from .duckduckgo import Duckduckgo
22
+ from .duckduckgo_images import DuckduckgoImages
23
+ from .duckduckgo_news import DuckduckgoNews
24
+ from .duckduckgo_videos import DuckduckgoVideos
25
+ from .google import Google
26
+ from .mojeek import Mojeek
27
+ from .wikipedia import Wikipedia
28
+ from .yahoo import Yahoo
29
+ from .yandex import Yandex
30
+
31
+ ENGINES: dict[str, dict[str, type[BaseSearchEngine[Any]]]] = {
32
+ "text": {
33
+ "bing": Bing,
34
+ "brave": Brave,
35
+ "duckduckgo": Duckduckgo, # bing
36
+ "google": Google,
37
+ "mojeek": Mojeek,
38
+ "yahoo": Yahoo, # bing
39
+ "yandex": Yandex,
40
+ "wikipedia": Wikipedia,
41
+ },
42
+ "images": {
43
+ "duckduckgo": DuckduckgoImages,
44
+ },
45
+ "news": {
46
+ "duckduckgo": DuckduckgoNews,
47
+ },
48
+ "videos": {
49
+ "duckduckgo": DuckduckgoVideos,
50
+ },
51
+ }
52
+ """
53
+
54
+ import importlib
55
+ import inspect
56
+ import pkgutil
57
+ from collections import defaultdict
58
+ from typing import Any
59
+
60
+ from ddgs.base import BaseSearchEngine
61
+
62
# ENGINES[category][name] = class
ENGINES: dict[str, dict[str, type[BaseSearchEngine[Any]]]] = defaultdict(dict)

package_name = __name__
package = importlib.import_module(package_name)

# Import every module of this package and register the engine classes found in it.
for _finder, modname, _ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
    module = importlib.import_module(modname)
    for _, cls in inspect.getmembers(module, inspect.isclass):
        # 1) must subclass BaseSearchEngine (but not the base itself)
        if not issubclass(cls, BaseSearchEngine) or cls is BaseSearchEngine:
            continue

        # 2) skip any class whose name starts with "Base"
        if cls.__name__.startswith("Base"):
            continue

        # 3) skip disabled engines
        # NOTE(review): the getattr default is True, so a class on which no
        # `disabled` attribute can be found is skipped as well — confirm that
        # BaseSearchEngine provides a default for it.
        if getattr(cls, "disabled", True):
            continue

        # 4) ensure they provided name & category
        name = getattr(cls, "name", None)
        category = getattr(cls, "category", None)
        if not isinstance(name, str) or not isinstance(category, str):
            msg = f"{cls.__qualname__} must define class attributes 'name: str' and 'category: str'."
            raise TypeError(msg)

        ENGINES[category][name] = cls

# freeze into normal dicts
ENGINES = {cat: dict(m) for cat, m in ENGINES.items()}
ddgs/engines/annasarchive.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Anna's Archive search engine implementation."""
2
+
3
+ from collections.abc import Mapping
4
+ from typing import Any, ClassVar
5
+
6
+ from ddgs.base import BaseSearchEngine
7
+ from ddgs.results import BooksResult
8
+
9
+
10
class AnnasArchive(BaseSearchEngine[BooksResult]):
    """Books search engine that queries Anna's Archive."""

    name = "annasarchive"
    category = "books"
    provider = "annasarchive"

    search_url = "https://annas-archive.li/search"
    search_method = "GET"

    items_xpath = "//div[contains(@class, 'record-list-outer')]/div"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//a[contains(@class, 'text-lg')]//text()",
        "author": ".//a[span[contains(@class, 'user')]]//text()",
        "publisher": ".//a[span[contains(@class, 'company')]]//text()",
        "info": ".//div[contains(@class, 'text-gray-800')]/text()",
        "url": "./a/@href",
        "thumbnail": ".//img/@src",
    }

    def build_payload(
        self,
        query: str,
        region: str,  # noqa: ARG002
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Return the request parameters: the query and the page number as a string."""
        payload: dict[str, Any] = {"q": query}
        payload["page"] = f"{page}"
        return payload

    def pre_process_html(self, html_text: str) -> str:
        """Strip the HTML comment delimiters so commented-out markup becomes parseable."""
        cleaned = html_text
        for marker in ("<!--", "-->"):
            cleaned = cleaned.replace(marker, "")
        return cleaned

    def post_extract_results(self, results: list[BooksResult]) -> list[BooksResult]:
        """Prefix every result url with the site root taken from ``search_url``."""
        site_root = self.search_url.partition("/search")[0]
        for book in results:
            book.url = f"{site_root}{book.url}"
        return results
ddgs/engines/bing.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Bing search engine implementation."""
2
+
3
+ import base64
4
+ from collections.abc import Mapping
5
+ from time import time
6
+ from typing import Any, ClassVar
7
+ from urllib.parse import parse_qs, urlparse
8
+
9
+ from ddgs.base import BaseSearchEngine
10
+ from ddgs.results import TextResult
11
+
12
+
13
+ def unwrap_bing_url(raw_url: str) -> str | None:
14
+ """Decode the Bing-wrapped raw_url to extract the original url."""
15
+ parsed = urlparse(raw_url)
16
+ u_vals = parse_qs(parsed.query).get("u", [])
17
+ if not u_vals:
18
+ return None
19
+
20
+ u = u_vals[0]
21
+ if len(u) <= 2:
22
+ return None
23
+
24
+ # Drop the first two characters, pad to a multiple of 4, then decode
25
+ b64_part = u[2:]
26
+ padding = "=" * (-len(b64_part) % 4)
27
+ decoded = base64.urlsafe_b64decode(b64_part + padding)
28
+ return decoded.decode()
29
+
30
+
31
class Bing(BaseSearchEngine[TextResult]):
    """Text search engine for Bing (currently switched off via ``disabled``)."""

    disabled = True  # !!!

    name = "bing"
    category = "text"
    provider = "bing"

    search_url = "https://www.bing.com/search"
    search_method = "GET"

    items_xpath = "//li[contains(@class, 'b_algo')]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h2/a//text()",
        "href": ".//h2/a/@href",
        "body": ".//p//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Assemble the query parameters and set the region cookies on the HTTP client.

        ``region`` is lower-cased and split on "-" into exactly two parts;
        any other shape raises ValueError from the unpacking.
        """
        country, lang = region.lower().split("-")
        market = f"{lang}-{country}"
        # NOTE(review): "cc" receives the second half of the region (named
        # `lang`), while the cookies use "{lang}-{country}" — confirm this is intended.
        payload = {"q": query, "pq": query, "cc": lang}
        self.http_client.client.set_cookies(
            "https://www.bing.com",
            {
                "_EDGE_CD": f"m={market}&u={market}",
                "_EDGE_S": f"mkt={market}&ui={market}",
            },
        )

        if timelimit:
            if timelimit == "y":
                # Whole days since the epoch; the filter spans the last 365 of them.
                today = int(time() // 86400)
                code = f"ez5_{today - 365}_{today}"
            else:
                code = "ez" + {"d": "1", "w": "2", "m": "3"}[timelimit]
            payload["filters"] = f'ex1:"{code}"'

        if page > 1:
            payload["first"] = f"{(page - 1) * 10}"
            form_suffix = page - 2 if page > 2 else ""
            payload["FORM"] = f"PERE{form_suffix}"
        return payload

    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
        """Drop ``aclick`` links and replace ``ck/a`` redirect links with their target."""
        kept = []
        for item in results:
            href = item.href
            if href.startswith("https://www.bing.com/aclick?"):
                continue
            if href.startswith("https://www.bing.com/ck/a?"):
                # Fall back to the wrapped link when it cannot be decoded.
                item.href = unwrap_bing_url(href) or href
            kept.append(item)
        return kept
ddgs/engines/bing_news.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Bing news engine implementation."""
2
+
3
+ import re
4
+ from collections.abc import Mapping
5
+ from contextlib import suppress
6
+ from datetime import datetime, timedelta, timezone
7
+ from typing import Any, ClassVar
8
+
9
+ from ddgs.base import BaseSearchEngine
10
+ from ddgs.results import NewsResult
11
+
12
DATE_RE = re.compile(r"\b(\d+)\s*(days|tagen|jours|giorni|dias|días|дн\.|день)?\b", re.IGNORECASE)


def extract_date(pub_date_str: str) -> str:
    """Convert a Bing News date label into an ISO 8601 UTC timestamp.

    Absolute dates (``DD.MM.YYYY``, ``MM/DD/YYYY``, ``DD/MM/YYYY``, tried in
    that order) become midnight UTC of that day. Otherwise the first standalone
    number matched by ``DATE_RE`` is taken as "days ago" relative to now; the
    unit group in the pattern is optional, so a bare number counts as days.

    Args:
        pub_date_str: Raw date text taken from the result markup.

    Returns:
        An ISO 8601 string with a ``+00:00`` offset, or ``pub_date_str``
        unchanged when nothing usable is found.
    """
    # Try parsing the date with predefined formats.
    date_formats = ("%d.%m.%Y", "%m/%d/%Y", "%d/%m/%Y")
    for date_format in date_formats:
        with suppress(ValueError):
            parsed = datetime.strptime(pub_date_str, date_format)
            # Attach UTC rather than calling astimezone(): on a naive value
            # astimezone() assumes the host's local zone, so the resulting
            # calendar day would depend on where the code runs.
            return parsed.replace(tzinfo=timezone.utc).isoformat()

    # Search for relative date expressions.
    match = DATE_RE.search(pub_date_str)
    if match:
        # An absurdly large number would push the date out of range; fall
        # through to returning the input instead of raising.
        with suppress(OverflowError):
            days_ago = int(match.group(1))
            return (datetime.now(timezone.utc) - timedelta(days=days_ago)).replace(microsecond=0).isoformat()

    # Return the original string if no date is found.
    return pub_date_str
31
+
32
+
33
class BingNews(BaseSearchEngine[NewsResult]):
    """News search backed by Bing's infinite-scroll AJAX endpoint."""

    name = "bing"
    category = "news"
    provider = "bing"

    search_url = "https://www.bing.com/news/infinitescrollajax"
    search_method = "GET"

    items_xpath = "//div[contains(@class, 'newsitem')]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "date": ".//span[@aria-label]//@aria-label",
        "title": "@data-title",
        "body": ".//div[@class='snippet']//text()",
        "url": "@url",
        "image": ".//a[contains(@class, 'image')]//@src",
        "source": "@data-author",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Assemble the query parameters for one page of news results.

        Args:
            query: Search terms.
            region: ``"<country>-<lang>"`` code, e.g. ``"us-en"``.
            safesearch: Not used by this engine.
            timelimit: ``"d"``, ``"w"``, ``"m"``, ``"y"`` or ``None``.
            page: Page number; also sent as ``SFX``.
            **kwargs: Ignored.

        Returns:
            The query-string parameters for the request.
        """
        country, lang = region.lower().split("-")
        first_item = page * 10 + 1
        payload = {
            "q": query,
            "InfiniteScroll": "1",
            "first": f"{first_item}",
            "SFX": f"{page}",
            "cc": country,
            "setlang": lang,
        }
        if timelimit:
            intervals = {
                "d": 'interval="4"',  # no daily filter exists, so the hourly one is used
                "w": 'interval="7"',
                "m": 'interval="9"',
                "y": 'interval="9"',  # no yearly filter exists, so the monthly one is used
            }
            payload["qft"] = intervals[timelimit]
        return payload

    def post_extract_results(self, results: list[NewsResult]) -> list[NewsResult]:
        """Normalize dates and make image links absolute, in place.

        Args:
            results: Results parsed from the response.

        Returns:
            The same list, with each item's ``date`` and ``image`` rewritten.
        """
        for item in results:
            item.date = extract_date(item.date)
            if item.image:
                # Keep only the part before the first "&" and prefix the host.
                item.image = f"https://www.bing.com{item.image.split('&')[0]}"
            else:
                item.image = ""
        return results
ddgs/engines/brave.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Brave search engine implementation."""
2
+
3
+ from collections.abc import Mapping
4
+ from typing import Any, ClassVar
5
+
6
+ from ddgs.base import BaseSearchEngine
7
+ from ddgs.results import TextResult
8
+
9
+
10
class Brave(BaseSearchEngine[TextResult]):
    """Brave text search engine (scrapes ``https://search.brave.com/search``)."""

    name = "brave"
    category = "text"
    provider = "brave"

    search_url = "https://search.brave.com/search"
    search_method = "GET"

    items_xpath = "//div[@data-type='web']"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//div[(contains(@class,'title') or contains(@class,'sitename-container')) and position()=last()]//text()",  # noqa: E501
        "href": ".//a[div[contains(@class, 'title')]]/@href",
        "body": ".//div[contains(@class, 'snippet')]//div[contains(@class, 'content')]//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build the query parameters and set Brave's preference cookies.

        Args:
            query: Search terms.
            region: ``"<country>-<lang>"`` code; only the country part is used.
            safesearch: ``"on"``, ``"moderate"`` or anything else (treated as off).
            timelimit: ``"d"``, ``"w"``, ``"m"``, ``"y"`` or ``None``.
            page: 1-based result page.
            **kwargs: Ignored.

        Returns:
            The query-string parameters for the request.

        Raises:
            ValueError: If ``region`` does not split into exactly two parts.
            KeyError: If ``timelimit`` is not one of the supported codes.
        """
        payload = {"q": query, "source": "web"}
        country, _lang = region.lower().split("-")
        # The cookie is named "country". The key used to be the variable
        # itself, which produced e.g. {"us": "us"} and never set the region.
        cookies = {"country": country, "useLocation": "0"}
        if safesearch != "moderate":
            # No cookie is sent for "moderate".
            cookies["safesearch"] = "strict" if safesearch == "on" else "off"
        self.http_client.client.set_cookies("https://search.brave.com", cookies)
        if timelimit:
            payload["tf"] = {"d": "pd", "w": "pw", "m": "pm", "y": "py"}[timelimit]
        if page > 1:
            # The offset is a zero-based page index, not an item count.
            payload["offset"] = f"{page - 1}"
        return payload
ddgs/engines/duckduckgo.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Duckduckgo search engine implementation."""
2
+
3
+ from collections.abc import Mapping
4
+ from typing import Any, ClassVar, TypeVar
5
+
6
+ from fake_useragent import UserAgent
7
+
8
+ from ddgs.base import BaseSearchEngine
9
+ from ddgs.http_client2 import HttpClient2
10
+ from ddgs.results import TextResult
11
+
12
ua = UserAgent()

T = TypeVar("T")


class Duckduckgo(BaseSearchEngine[TextResult]):
    """Text search through DuckDuckGo's HTML-only endpoint."""

    name = "duckduckgo"
    category = "text"
    provider = "bing"

    search_url = "https://html.duckduckgo.com/html/"
    search_method = "POST"

    items_xpath = "//div[contains(@class, 'body')]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h2//text()",
        "href": "./a/@href",
        "body": "./a//text()",
    }

    # A single random User-Agent, picked once when the class body is executed.
    headers: ClassVar[dict[str, str]] = {"User-Agent": ua.random}

    def __init__(self, proxy: str | None = None, timeout: int | None = None, *, verify: bool = True) -> None:
        """Set up the engine on the stop-gap ``HttpClient2`` transport.

        Temporary: delete this override once ``HttpClient`` is fixed.

        Args:
            proxy: Proxy URL passed to the HTTP client.
            timeout: Timeout passed to the HTTP client.
            verify: Verification setting passed to the HTTP client.
        """
        client = HttpClient2(headers=self.headers, proxy=proxy, timeout=timeout, verify=verify)
        self.http_client = client  # type: ignore[assignment]
        self.results: list[T] = []  # type: ignore[valid-type]

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Assemble the form fields for the search POST.

        Args:
            query: Search terms.
            region: Region code, sent as ``l`` unchanged.
            safesearch: Not used by this engine.
            timelimit: Sent as ``df`` when truthy.
            page: 1-based result page.
            **kwargs: Ignored.

        Returns:
            The form payload.
        """
        form = {"q": query, "b": "", "l": region}
        if page > 1:
            # Offset is 10 for page 2 and grows by 15 per further page.
            offset = 10 + (page - 2) * 15
            form["s"] = f"{offset}"
        if timelimit:
            form["df"] = timelimit
        return form

    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
        """Return the results without DuckDuckGo ``y.js`` redirect links."""
        kept = []
        for hit in results:
            if hit.href.startswith("https://duckduckgo.com/y.js?"):
                continue
            kept.append(hit)
        return kept
ddgs/engines/duckduckgo_images.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Duckduckgo images search engine implementation."""
2
+
3
+ import json
4
+ from collections.abc import Mapping
5
+ from typing import Any, ClassVar
6
+
7
+ from ddgs.base import BaseSearchEngine
8
+ from ddgs.results import ImagesResult
9
+ from ddgs.utils import _extract_vqd
10
+
11
+
12
class DuckduckgoImages(BaseSearchEngine[ImagesResult]):
    """Image search through DuckDuckGo's ``i.js`` JSON endpoint."""

    name = "duckduckgo"
    category = "images"
    provider = "bing"

    search_url = "https://duckduckgo.com/i.js"
    search_method = "GET"
    search_headers: ClassVar[Mapping[str, str]] = {"Referer": "https://duckduckgo.com/", "Sec-Fetch-Mode": "cors"}

    # JSON key in the response -> attribute name on ImagesResult.
    elements_replace: ClassVar[Mapping[str, str]] = {
        "title": "title",
        "image": "image",
        "thumbnail": "thumbnail",
        "url": "url",
        "height": "height",
        "width": "width",
        "source": "source",
    }

    def _get_vqd(self, query: str) -> str:
        """Fetch the DuckDuckGo front page for ``query`` and extract its vqd token."""
        response = self.http_client.request("GET", "https://duckduckgo.com", params={"q": query})
        return _extract_vqd(response.content, query)

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,
    ) -> dict[str, Any]:
        """Assemble the query parameters for an image search.

        Args:
            query: Search terms.
            region: Region code, sent as ``l`` unchanged.
            safesearch: ``"on"``, ``"moderate"`` or ``"off"`` (case-insensitive).
            timelimit: ``"d"``, ``"w"``, ``"m"``, ``"y"`` or ``None``.
            page: 1-based result page; each page is offset by 100.
            **kwargs: Optional ``size``, ``color``, ``type_image``, ``layout``
                and ``license_image`` filter values.

        Returns:
            The query-string parameters, including a freshly fetched ``vqd``.
        """
        safesearch_codes = {"on": "1", "moderate": "1", "off": "-1"}
        period_names = {"d": "Day", "w": "Week", "m": "Month", "y": "Year"}

        def tagged(prefix: str, option: str) -> str:
            """Return ``prefix:value`` for a truthy keyword option, else an empty string."""
            value = kwargs.get(option)
            return f"{prefix}:{value}" if value else ""

        # The filter order is fixed; empty slots still keep their comma.
        filters = [
            f"time:{period_names[timelimit]}" if timelimit else "",
            tagged("size", "size"),
            tagged("color", "color"),
            tagged("type", "type_image"),
            tagged("layout", "layout"),
            tagged("license", "license_image"),
        ]
        payload = {
            "o": "json",
            "q": query,
            "l": region,
            "vqd": self._get_vqd(query),
            "p": safesearch_codes[safesearch.lower()],
        }
        if any(filters):
            payload["f"] = ",".join(filters)
        if page > 1:
            payload["s"] = f"{(page - 1) * 100}"
        return payload

    def extract_results(self, html_text: str) -> list[ImagesResult]:
        """Parse the JSON response body into ``ImagesResult`` objects.

        Args:
            html_text: Response body; despite the name it is JSON text.

        Returns:
            One result per entry under the ``"results"`` key. Keys missing
            from an entry are set to ``None``.
        """
        entries = json.loads(html_text).get("results", [])
        parsed = []
        for entry in entries:
            image = ImagesResult()
            for source_key, attr_name in self.elements_replace.items():
                setattr(image, attr_name, entry.get(source_key))
            parsed.append(image)
        return parsed
ddgs/engines/duckduckgo_news.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Duckduckgo news search engine implementation."""
2
+
3
+ import json
4
+ from collections.abc import Mapping
5
+ from typing import Any, ClassVar
6
+
7
+ from ddgs.base import BaseSearchEngine
8
+ from ddgs.results import NewsResult
9
+ from ddgs.utils import _extract_vqd
10
+
11
+
12
class DuckduckgoNews(BaseSearchEngine[NewsResult]):
    """News search through DuckDuckGo's ``news.js`` JSON endpoint."""

    name = "duckduckgo"
    category = "news"
    provider = "bing"

    search_url = "https://duckduckgo.com/news.js"
    search_method = "GET"

    # JSON key in the response -> attribute name on NewsResult.
    elements_replace: ClassVar[Mapping[str, str]] = {
        "date": "date",
        "title": "title",
        "excerpt": "body",
        "url": "url",
        "image": "image",
        "source": "source",
    }

    def _get_vqd(self, query: str) -> str:
        """Fetch the DuckDuckGo front page for ``query`` and extract its vqd token."""
        response = self.http_client.request("GET", "https://duckduckgo.com", params={"q": query})
        return _extract_vqd(response.content, query)

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Assemble the query parameters for a news search.

        Args:
            query: Search terms.
            region: Region code, sent as ``l`` unchanged.
            safesearch: ``"on"``, ``"moderate"`` or ``"off"`` (case-insensitive).
            timelimit: Sent as ``df`` when truthy.
            page: 1-based result page; each page is offset by 30.
            **kwargs: Ignored.

        Returns:
            The query-string parameters, including a freshly fetched ``vqd``.
        """
        safesearch_codes = {"on": "1", "moderate": "-1", "off": "-2"}
        params = {
            "l": region,
            "o": "json",
            "noamp": "1",
            "q": query,
            "vqd": self._get_vqd(query),
            "p": safesearch_codes[safesearch.lower()],
        }
        if timelimit:
            params["df"] = timelimit
        if page > 1:
            offset = (page - 1) * 30
            params["s"] = f"{offset}"
        return params

    def extract_results(self, html_text: str) -> list[NewsResult]:
        """Parse the JSON response body into ``NewsResult`` objects.

        Args:
            html_text: Response body; despite the name it is JSON text.

        Returns:
            One result per entry under the ``"results"`` key. Keys missing
            from an entry are set to ``None``.
        """
        entries = json.loads(html_text).get("results", [])
        parsed = []
        for entry in entries:
            article = NewsResult()
            for source_key, attr_name in self.elements_replace.items():
                setattr(article, attr_name, entry.get(source_key))
            parsed.append(article)
        return parsed
ddgs/engines/duckduckgo_videos.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Duckduckgo videos search engine implementation."""
2
+
3
+ import json
4
+ from collections.abc import Mapping
5
+ from typing import Any, ClassVar
6
+
7
+ from ddgs.base import BaseSearchEngine
8
+ from ddgs.results import VideosResult
9
+ from ddgs.utils import _extract_vqd
10
+
11
+
12
class DuckduckgoVideos(BaseSearchEngine[VideosResult]):
    """Video search through DuckDuckGo's ``v.js`` JSON endpoint."""

    name = "duckduckgo"
    category = "videos"
    provider = "bing"

    search_url = "https://duckduckgo.com/v.js"
    search_method = "GET"

    # JSON key in the response -> attribute name on VideosResult.
    elements_replace: ClassVar[Mapping[str, str]] = {
        "content": "content",
        "description": "description",
        "duration": "duration",
        "embed_html": "embed_html",
        "embed_url": "embed_url",
        "image_token": "image_token",
        "images": "images",
        "provider": "provider",
        "published": "published",
        "publisher": "publisher",
        "statistics": "statistics",
        "title": "title",
        "uploader": "uploader",
    }

    def _get_vqd(self, query: str) -> str:
        """Fetch the DuckDuckGo front page for ``query`` and extract its vqd token."""
        response = self.http_client.request("GET", "https://duckduckgo.com", params={"q": query})
        return _extract_vqd(response.content, query)

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,
    ) -> dict[str, Any]:
        """Assemble the query parameters for a video search.

        Args:
            query: Search terms.
            region: Region code, sent as ``l`` unchanged.
            safesearch: ``"on"``, ``"moderate"`` or ``"off"`` (case-insensitive).
            timelimit: Sent as a ``publishedAfter`` filter when truthy.
            page: 1-based result page; each page is offset by 60.
            **kwargs: Optional ``resolution``, ``duration`` and
                ``license_videos`` filter values.

        Returns:
            The query-string parameters, including a freshly fetched ``vqd``.
            The ``f`` filter field is always present, even when every slot is
            empty.
        """
        safesearch_codes = {"on": "1", "moderate": "-1", "off": "-2"}

        def tagged(prefix: str, value: str | None) -> str:
            """Return ``prefix:value`` for a truthy value, else an empty string."""
            return f"{prefix}:{value}" if value else ""

        filters = ",".join(
            [
                tagged("publishedAfter", timelimit),
                tagged("videoDefinition", kwargs.get("resolution")),
                tagged("videoDuration", kwargs.get("duration")),
                tagged("videoLicense", kwargs.get("license_videos")),
            ]
        )
        params = {
            "l": region,
            "o": "json",
            "q": query,
            "vqd": self._get_vqd(query),
            "f": filters,
            "p": safesearch_codes[safesearch.lower()],
        }
        if page > 1:
            offset = (page - 1) * 60
            params["s"] = f"{offset}"
        return params

    def extract_results(self, html_text: str) -> list[VideosResult]:
        """Parse the JSON response body into ``VideosResult`` objects.

        Args:
            html_text: Response body; despite the name it is JSON text.

        Returns:
            One result per entry under the ``"results"`` key. Keys missing
            from an entry are set to ``None``.
        """
        entries = json.loads(html_text).get("results", [])
        parsed = []
        for entry in entries:
            video = VideosResult()
            for source_key, attr_name in self.elements_replace.items():
                setattr(video, attr_name, entry.get(source_key))
            parsed.append(video)
        return parsed
ddgs/engines/google.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Google search engine implementation."""
2
+
3
+ from collections.abc import Mapping
4
+ from random import SystemRandom
5
+ from typing import Any, ClassVar
6
+
7
+ from ddgs.base import BaseSearchEngine
8
+ from ddgs.results import TextResult
9
+
10
random = SystemRandom()


def get_ua() -> str:
    """Build one randomized legacy Opera Mini / Opera Mobi User-Agent string.

    Returns:
        A User-Agent assembled from a random template and random field values,
        or a fixed fallback string if assembling it raises.
    """
    templates = (
        "Opera/9.80 (J2ME/MIDP; Opera Mini/{v}/{b}; U; {l}) Presto/{p} Version/{f}",
        "Opera/9.80 (Android; Linux; Opera Mobi/{b}; U; {l}) Presto/{p} Version/{f}",
        "Opera/9.80 (iPhone; Opera Mini/{v}/{b}; U; {l}) Presto/{p} Version/{f}",
        "Opera/9.80 (iPad; Opera Mini/{v}/{b}; U; {l}) Presto/{p} Version/{f}",
    )
    mini_versions = ("4.0", "5.0.17381", "7.1.32444", "9.80")
    mobi_builds = ("27", "447", "ADR-1011151731")
    mini_builds = ("18.678", "24.743", "503")
    presto_versions = ("2.6.35", "2.7.60", "2.8.119")
    final_versions = ("10.00", "11.10", "12.16")
    languages = ("en-US", "en-GB", "de-DE", "fr-FR", "es-ES", "ru-RU", "zh-CN")
    fallback = "Opera/9.80 (iPad; Opera Mini/5.0.17381/503; U; eu) Presto/2.6.35 Version/11.10"

    try:
        template = random.choice(templates)
        fields = {
            "l": random.choice(languages),
            "p": random.choice(presto_versions),
            "f": random.choice(final_versions),
        }
        if "{v}" in template:
            fields["v"] = random.choice(mini_versions)
        if "{b}" in template:
            # Opera Mobi draws from its own pool of build identifiers.
            build_pool = mobi_builds if "Opera Mobi" in template else mini_builds
            fields["b"] = random.choice(build_pool)
        return template.format(**fields)
    except Exception:  # noqa: BLE001
        return fallback
43
+
44
+
45
class Google(BaseSearchEngine[TextResult]):
    """Google text search engine (scrapes ``https://www.google.com/search``)."""

    name = "google"
    category = "text"
    provider = "google"

    search_url = "https://www.google.com/search"
    search_method = "GET"
    # Evaluated once, when the class body is executed.
    search_headers: ClassVar[dict[str, str]] = {"User-Agent": get_ua()}

    items_xpath = "//div[div[@data-hveid]//div[h3]]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h3//text()",
        "href": ".//a/@href",
        "body": "./div/div/div[2]//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build the query parameters for a Google search.

        Args:
            query: Search terms.
            region: ``"<country>-<lang>"`` code, e.g. ``"us-en"``.
            safesearch: ``"on"``, ``"moderate"`` or ``"off"`` (case-insensitive).
            timelimit: Sent as ``qdr:<timelimit>`` in ``tbs`` when truthy.
            page: 1-based result page; ``start`` advances by 10 per page.
            **kwargs: Ignored.

        Returns:
            The query-string parameters for the request.

        Raises:
            KeyError: If ``safesearch`` is not a supported value.
            ValueError: If ``region`` does not split into exactly two parts.
        """
        safesearch_base = {"on": "2", "moderate": "1", "off": "0"}
        start = (page - 1) * 10
        payload = {
            "q": query,
            "filter": safesearch_base[safesearch.lower()],
            "start": str(start),
        }
        country, lang = region.split("-")
        payload["hl"] = f"{lang}-{country.upper()}"  # interface language
        payload["lr"] = f"lang_{lang}"  # restricts to results written in a particular language
        payload["cr"] = f"country{country.upper()}"  # restricts to results from a particular country
        if timelimit:
            payload["tbs"] = f"qdr:{timelimit}"
        return payload

    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
        """Replace Google ``/url?q=`` redirect links with their target URL.

        The target is a percent-encoded query value, so it is decoded after
        extraction; otherwise a target carrying its own query string would
        come out as ``...watch%3Fv%3D...`` instead of ``...watch?v=...``.

        Args:
            results: Results parsed from the response page.

        Returns:
            All results, with redirect links unwrapped and decoded.
        """
        from urllib.parse import unquote  # noqa: PLC0415

        post_results = []
        for result in results:
            if result.href.startswith("/url?q="):
                target = result.href.split("?q=")[1].split("&")[0]
                result.href = unquote(target)
            post_results.append(result)
        return post_results
ddgs/engines/grokipedia.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Grokipedia text search engine."""
2
+
3
+ import json
4
+ import logging
5
+ from typing import Any
6
+
7
+ from ddgs.base import BaseSearchEngine
8
+ from ddgs.results import TextResult
9
+
10
logger = logging.getLogger(__name__)


class Grokipedia(BaseSearchEngine[TextResult]):
    """Single-result text lookup against Grokipedia's typeahead API."""

    name = "grokipedia"
    category = "text"
    provider = "grokipedia"
    priority = 1.9

    search_url = "https://grokipedia.com/api/typeahead"
    search_method = "GET"

    def build_payload(
        self,
        query: str,
        region: str,  # noqa: ARG002
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,  # noqa: ARG002
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Return the request parameters; only ``query`` is used, with a limit of one."""
        params: dict[str, Any] = {"query": query, "limit": "1"}
        return params

    def extract_results(self, html_text: str) -> list[TextResult]:
        """Turn the first entry of the JSON response into a ``TextResult``.

        Args:
            html_text: Response body; despite the name it is JSON text.

        Returns:
            A one-element list, or an empty list when ``"results"`` is missing
            or empty.
        """
        entries = json.loads(html_text).get("results", [])
        if not entries:
            return []

        top = entries[0]
        entry = TextResult()
        entry.title = top.get("title", "").strip("_")
        snippet = top.get("snippet", "")
        # Drop everything up to and including the first blank line, if any.
        _head, separator, remainder = snippet.partition("\n\n")
        entry.body = remainder if separator else snippet
        entry.href = f"https://grokipedia.com/page/{top['slug']}"
        return [entry]
ddgs/engines/mojeek.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Mojeek search engine implementation."""
2
+
3
+ from collections.abc import Mapping
4
+ from typing import Any, ClassVar
5
+
6
+ from ddgs.base import BaseSearchEngine
7
+ from ddgs.results import TextResult
8
+
9
+
10
class Mojeek(BaseSearchEngine[TextResult]):
    """Text search engine scraping ``https://www.mojeek.com/search``."""

    name = "mojeek"
    category = "text"
    provider = "mojeek"

    search_url = "https://www.mojeek.com/search"
    search_method = "GET"

    items_xpath = "//ul[contains(@class, 'results')]/li"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h2//text()",
        "href": ".//h2/a/@href",
        "body": ".//p[@class='s']//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Set the region cookies and assemble the query parameters.

        Args:
            query: Search terms.
            region: ``"<country>-<lang>"`` code; sent as the ``arc`` and ``lb`` cookies.
            safesearch: Only ``"on"`` has an effect (adds ``safe=1``).
            timelimit: Not used by this engine.
            page: 1-based result page.
            **kwargs: Ignored.

        Returns:
            The query-string parameters for the request.
        """
        country, lang = region.lower().split("-")
        self.http_client.client.set_cookies("https://www.mojeek.com", {"arc": country, "lb": lang})
        params = {
            "q": query,
            # "tlen": f"{randint(68, 128)}",  # Title length limit (default=68, max=128)  # noqa: ERA001
            # "dlen": f"{randint(160, 512)}",  # Description length limit (default=160, max=512)  # noqa: ERA001
        }
        if safesearch == "on":
            params["safe"] = "1"
        if page > 1:
            # 1-based index of the first item on the requested page.
            first_item = (page - 1) * 10 + 1
            params["s"] = f"{first_item}"
        return params
ddgs/engines/wikipedia.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Wikipedia text search engine."""
2
+
3
+ import json
4
+ import logging
5
+ from typing import Any
6
+ from urllib.parse import quote
7
+
8
+ from ddgs.base import BaseSearchEngine
9
+ from ddgs.results import TextResult
10
+
11
logger = logging.getLogger(__name__)


class Wikipedia(BaseSearchEngine[TextResult]):
    """Single-result text lookup against the Wikipedia opensearch API."""

    name = "wikipedia"
    category = "text"
    provider = "wikipedia"
    priority = 2

    search_url = "https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={query}"
    search_method = "GET"

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,  # noqa: ARG002
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Point ``search_url`` at the language-specific API and return no parameters.

        The whole query is encoded into the URL, so the returned payload is
        empty. The language is stored on the instance for ``extract_results``.

        Args:
            query: Search terms.
            region: ``"<country>-<lang>"`` code; only the language part is used.
            safesearch: Not used by this engine.
            timelimit: Not used by this engine.
            page: Not used by this engine.
            **kwargs: Ignored.

        Returns:
            An empty dict.
        """
        _country, lang = region.lower().split("-")
        search_term = quote(query)
        self.search_url = (
            f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&profile=fuzzy&limit=1&search={search_term}"
        )
        self.lang = lang  # used in extract_results
        empty: dict[str, Any] = {}
        return empty

    def extract_results(self, html_text: str) -> list[TextResult]:
        """Build one ``TextResult`` from the opensearch reply plus a second request.

        Args:
            html_text: Response body; despite the name it is JSON text.

        Returns:
            A one-element list, or an empty list when nothing matched or the
            fetched extract contains ``"may refer to:"``.
        """
        reply = json.loads(html_text)
        titles = reply[1]
        if not titles:
            return []

        article = TextResult()
        article.title = titles[0]
        article.href = reply[3][0]

        # Fetch the article extract to use as the result body.
        title_param = quote(article.title)
        extract_response = self.request(
            "GET",
            f"https://{self.lang}.wikipedia.org/w/api.php?action=query&format=json&prop=extracts&titles={title_param}&explaintext=0&exintro=0&redirects=1",
        )
        if extract_response:
            pages = json.loads(extract_response)["query"]["pages"]
            first_page = next(iter(pages.values()))
            article.body = first_page.get("extract", "")
            if "may refer to:" in article.body:
                return []

        return [article]
ddgs/engines/yahoo.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Yahoo search engine."""
2
+
3
+ from collections.abc import Mapping
4
+ from secrets import token_urlsafe
5
+ from typing import Any, ClassVar
6
+ from urllib.parse import unquote_plus
7
+
8
+ from ddgs.base import BaseSearchEngine
9
+ from ddgs.results import TextResult
10
+
11
+
12
def extract_url(u: str) -> str:
    """Pull the target URL out of a Yahoo redirect link and decode it.

    The target is the text after the first ``/RU=``, cut at the first
    ``/RK=`` and then at the first ``/RS=``.

    Raises:
        IndexError: If ``u`` contains no ``/RU=`` marker.
    """
    after_marker = u.split("/RU=", 1)[1]
    encoded_target = after_marker.partition("/RK=")[0].partition("/RS=")[0]
    return unquote_plus(encoded_target)
16
+
17
+
18
class Yahoo(BaseSearchEngine[TextResult]):
    """Text search engine scraping ``https://search.yahoo.com/search``."""

    name = "yahoo"
    category = "text"
    provider = "bing"

    search_url = "https://search.yahoo.com/search"
    search_method = "GET"

    items_xpath = "//div[contains(@class, 'relsrch')]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//div[contains(@class, 'Title')]//h3//text()",
        "href": ".//div[contains(@class, 'Title')]//a/@href",
        "body": ".//div[contains(@class, 'Text')]//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,  # noqa: ARG002
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Randomize the search URL tokens and assemble the query parameters.

        Args:
            query: Search terms.
            region: Not used by this engine.
            safesearch: Not used by this engine.
            timelimit: Sent as ``btf`` when truthy.
            page: 1-based result page; ``b`` advances by 7 per page.
            **kwargs: Ignored.

        Returns:
            The query-string parameters for the request.
        """
        # Byte counts are sized so the tokens come out 24 and 47 characters long.
        ylt_token = token_urlsafe(24 * 3 // 4)
        ylu_token = token_urlsafe(47 * 3 // 4)
        self.search_url = f"https://search.yahoo.com/search;_ylt={ylt_token};_ylu={ylu_token}"

        params = {"p": query}
        if page > 1:
            first_item = (page - 1) * 7 + 1
            params["b"] = f"{first_item}"
        if timelimit:
            params["btf"] = timelimit
        return params

    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
        """Drop Bing ad links and unwrap Yahoo redirect URLs.

        Args:
            results: Results parsed from the response page.

        Returns:
            The remaining results, with ``/RU=`` redirect links replaced by
            their decoded target.
        """
        cleaned = []
        for hit in results:
            link = hit.href
            if link.startswith("https://www.bing.com/aclick?"):
                continue
            if "/RU=" in link:
                hit.href = extract_url(link)
            cleaned.append(hit)
        return cleaned
ddgs/engines/yahoo_news.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Yahoo! News search engine."""
2
+
3
+ import logging
4
+ import re
5
+ from collections.abc import Callable, Mapping
6
+ from datetime import datetime, timedelta, timezone
7
+ from typing import Any, ClassVar
8
+ from urllib.parse import unquote_plus
9
+
10
+ from ddgs.base import BaseSearchEngine
11
+ from ddgs.results import NewsResult
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
DATE_RE = re.compile(r"\b(\d+)\s*(year|month|week|day|hour|minute)s?\b", re.IGNORECASE)
# Unit name -> function turning a count of that unit into a timedelta.
# Months and years are approximated as 30 and 365 days.
DATE_UNITS: dict[str, Callable[[int], timedelta]] = {
    "minute": lambda count: timedelta(minutes=count),
    "hour": lambda count: timedelta(hours=count),
    "day": lambda count: timedelta(days=count),
    "week": lambda count: timedelta(weeks=count),
    "month": lambda count: timedelta(days=30 * count),
    "year": lambda count: timedelta(days=365 * count),
}


def extract_date(pub_date_str: str) -> str:
    """Convert a relative age such as ``"3 hours ago"`` into an ISO 8601 UTC timestamp.

    Args:
        pub_date_str: Raw date text taken from the result markup.

    Returns:
        The current UTC time minus the matched age, without microseconds, as
        an ISO string; or ``pub_date_str`` unchanged when ``DATE_RE`` finds
        no "<number> <unit>" expression.
    """
    reference = datetime.now(timezone.utc)
    found = DATE_RE.search(pub_date_str)
    if found is None:
        return pub_date_str

    amount, unit = found.groups()
    age = DATE_UNITS[unit.lower()](int(amount))
    return (reference - age).replace(microsecond=0).isoformat()
38
+
39
+
40
def extract_url(u: str) -> str:
    """Pull the target URL out of a Yahoo News redirect link and decode it.

    The target is the text after the first ``/RU=``, cut at the first
    ``/RK=`` and then at the first literal ``?``.

    Raises:
        IndexError: If ``u`` contains no ``/RU=`` marker.
    """
    after_marker = u.split("/RU=", 1)[1]
    encoded_target = after_marker.partition("/RK=")[0].partition("?")[0]
    return unquote_plus(encoded_target)
44
+
45
+
46
def extract_image(u: str) -> str:
    """Return the part of ``u`` after its first ``-/``, or ``u`` itself if there is none."""
    _prefix, marker, remainder = u.partition("-/")
    if not marker:
        return u
    return remainder
50
+
51
+
52
def extract_source(s: str) -> str:
    """Return the text of ``s`` that precedes the first `` · via Yahoo`` suffix."""
    publisher, _marker, _rest = s.partition(" · via Yahoo")
    return publisher
55
+
56
+
57
class YahooNews(BaseSearchEngine[NewsResult]):
    """Yahoo news search engine (scrapes ``https://news.search.yahoo.com/search``)."""

    name = "yahoo"
    category = "news"
    provider = "yahoo"

    search_url = "https://news.search.yahoo.com/search"
    search_method = "GET"

    items_xpath = "//div[@id='web']//li[a]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "date": ".//span[contains(@class, 'time')]//text()",
        "title": ".//h4//text()",
        "body": ".//p//text()",
        "url": ".//h4/a/@href",
        "image": "(.//img/@data-src | .//img/@src)[1]",
        "source": ".//span[contains(@class, 'source')]//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,  # noqa: ARG002
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build the query parameters for a Yahoo news search.

        Args:
            query: Search terms.
            region: Not used by this engine.
            safesearch: Not used by this engine.
            timelimit: Sent as ``btf`` when truthy.
            page: 1-based result page; ``b`` advances by 10 per page.
            **kwargs: Ignored.

        Returns:
            The query-string parameters for the request.
        """
        payload = {"p": query}
        if page > 1:
            payload["b"] = f"{(page - 1) * 10 + 1}"
        if timelimit:
            payload["btf"] = timelimit
        return payload

    def post_extract_results(self, results: list[NewsResult]) -> list[NewsResult]:
        """Normalize date, URL, image and source of every result, in place.

        Failures are handled per result. Previously one ``try`` wrapped the
        whole loop, so a single malformed item (for example a URL without a
        ``/RU=`` marker) left every later result unprocessed.

        Args:
            results: Results parsed from the response page.

        Returns:
            The same list. A result whose clean-up raised is logged and keeps
            whichever fields had not been rewritten yet.
        """
        for result in results:
            try:
                result.date = extract_date(result.date)
                result.url = extract_url(result.url)
                result.image = extract_image(result.image)
                result.source = extract_source(result.source)
            except Exception as ex:  # noqa: BLE001, PERF203
                # Best effort: keep the raw item rather than dropping it.
                logger.warning("Error post-processing results: %r", ex)
        return results
ddgs/engines/yandex.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Yandex search engine."""
2
+
3
+ from collections.abc import Mapping
4
+ from random import SystemRandom
5
+ from typing import Any, ClassVar
6
+
7
+ from ddgs.base import BaseSearchEngine
8
+ from ddgs.results import TextResult
9
+
10
random = SystemRandom()


class Yandex(BaseSearchEngine[TextResult]):
    """Text search engine scraping ``https://yandex.com/search/site/``."""

    name = "yandex"
    category = "text"
    provider = "yandex"

    search_url = "https://yandex.com/search/site/"
    search_method = "GET"

    items_xpath = "//li[contains(@class, 'serp-item')]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h3//text()",
        "href": ".//h3//a/@href",
        "body": ".//div[contains(@class, 'text')]//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,  # noqa: ARG002
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Assemble the query parameters, using a random seven-digit ``searchid``.

        Args:
            query: Search terms.
            region: Not used by this engine.
            safesearch: Not used by this engine.
            timelimit: Not used by this engine.
            page: 1-based result page; sent zero-based as ``p`` from page 2 on.
            **kwargs: Ignored.

        Returns:
            The query-string parameters for the request.
        """
        search_id = random.randint(1000000, 9999999)
        params = {
            "text": query,
            "web": "1",
            "searchid": f"{search_id}",
        }
        if page > 1:
            zero_based_page = page - 1
            params["p"] = f"{zero_based_page}"
        return params
ddgs/exceptions.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DDGS exceptions."""
2
+
3
+
4
class DDGSException(Exception):
    """Root of the ddgs exception hierarchy; catch this to handle any ddgs error."""


class RatelimitException(DDGSException):
    """Signals that an API request was rejected because a rate limit was exceeded."""


class TimeoutException(DDGSException):
    """Signals that an API request timed out."""
ddgs/http_client.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HTTP client."""
2
+
3
+ import logging
4
+ from secrets import choice
5
+ from typing import Any, Final, Literal, get_args
6
+
7
+ import primp
8
+
9
+ from .exceptions import DDGSException, TimeoutException
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class Response:
    """Minimal HTTP response record: status code, raw body and decoded body."""

    __slots__ = ("content", "status_code", "text")

    def __init__(self, status_code: int, content: bytes, text: str) -> None:
        # __slots__ restricts instances to exactly these three attributes.
        self.status_code, self.content, self.text = status_code, content, text
23
+
24
+
25
class HttpClient:
    """Wrapper around `primp.Client` that returns `Response` objects and raises ddgs exceptions."""

    # Candidate values for primp's `impersonate` / `impersonate_os` options;
    # one of each is drawn at random for every client instance.
    _impersonates: Final = get_args(Literal[
        "chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_107",
        "chrome_108", "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118",
        "chrome_119", "chrome_120", "chrome_123", "chrome_124", "chrome_126", "chrome_127",
        "chrome_128", "chrome_129", "chrome_130", "chrome_131", "chrome_133",
        "safari_15.3", "safari_15.5", "safari_15.6.1", "safari_16", "safari_16.5",
        "safari_17.0", "safari_17.2.1", "safari_17.4.1", "safari_17.5",
        "safari_18", "safari_18.2",
        "edge_101", "edge_122", "edge_127", "edge_131",
        "firefox_109", "firefox_117", "firefox_128", "firefox_133", "firefox_135",
    ])  # fmt: skip
    _impersonates_os: Final = get_args(Literal["macos", "linux", "windows"])

    def __init__(self, proxy: str | None = None, timeout: int | None = 10, *, verify: bool | str = True) -> None:
        """Create the underlying primp client.

        Args:
            proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols.
                example: "http://user:pass@example.com:3128". Defaults to None.
            timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
            verify: (bool | str): True to verify, False to skip, or a str path to a PEM file. Defaults to True.

        """
        # A string `verify` is a CA bundle path: verification stays enabled and
        # the path is handed over separately as `ca_cert_file`.
        ca_cert_file = verify if isinstance(verify, str) else None
        verify_flag = verify if isinstance(verify, bool) else True
        self.client = primp.Client(
            proxy=proxy,
            timeout=timeout,
            impersonate=choice(self._impersonates),
            impersonate_os=choice(self._impersonates_os),
            verify=verify_flag,
            ca_cert_file=ca_cert_file,
        )

    def request(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
        """Forward the call to the primp client and wrap the outcome.

        Returns:
            A `Response` holding the status code, raw content and text.

        Raises:
            TimeoutException: if the underlying error message contains "timed out".
            DDGSException: for any other error raised while performing the request.

        """
        try:
            raw = self.client.request(*args, **kwargs)
            return Response(status_code=raw.status_code, content=raw.content, text=raw.text)
        except Exception as ex:
            # The timeout case is recognised by its message text.
            if "timed out" not in f"{ex}":
                msg = f"{type(ex).__name__}: {ex!r}"
                raise DDGSException(msg) from ex
            msg = f"Request timed out: {ex!r}"
            raise TimeoutException(msg) from ex

    def get(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
        """Shortcut for `request` with method "GET"."""
        return self.request(*args, method="GET", **kwargs)

    def post(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
        """Shortcut for `request` with method "POST"."""
        return self.request(*args, method="POST", **kwargs)
ddgs/http_client2.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Temporary HTTP client for 'backend=duckduckgo'. Delete when HttpClient is fixed."""
2
+
3
+ import logging
4
+ import ssl
5
+ from random import SystemRandom
6
+ from types import TracebackType
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ import h2
10
+ import httpcore
11
+ import httpx
12
+
13
+ from .exceptions import DDGSException, TimeoutException
14
+
15
+ if TYPE_CHECKING:
16
+ from collections.abc import Callable
17
+
18
+
19
+ logger = logging.getLogger(__name__)
20
+ random = SystemRandom()
21
+
22
+
23
class Response:
    """Plain container for the parts of an HTTP response that callers use."""

    __slots__ = ("content", "status_code", "text")

    def __init__(self, status_code: int, content: bytes, text: str) -> None:
        # Instances carry exactly the three slots declared above.
        self.text = text
        self.content = content
        self.status_code = status_code
32
+
33
+
34
class HttpClient2:
    """Temporary httpx-based HTTP client (HTTP/2 enabled, redirects not followed)."""

    def __init__(
        self,
        headers: dict[str, str] | None = None,
        proxy: str | None = None,
        timeout: int | None = 10,
        *,
        verify: bool | str = True,
    ) -> None:
        """Create the underlying httpx client.

        Args:
            headers (dict, optional): headers for the HTTP client.
            proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols.
                example: "http://user:pass@example.com:3128". Defaults to None.
            timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
            verify: (bool | str): True to verify, False to skip or str path to a PEM file. Defaults to True.

        """
        # A truthy `verify` (True or a PEM path) gets a randomised SSL context;
        # a falsy one disables verification.
        ssl_setting: ssl.SSLContext | bool = False
        if verify:
            ssl_setting = _get_random_ssl_context(verify=verify)
        self.client = httpx.Client(
            headers=headers,
            proxy=proxy,
            timeout=timeout,
            verify=ssl_setting,
            follow_redirects=False,
            http2=True,
        )

    def request(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
        """Perform a request while the HTTP/2 connection-init patch is active.

        Returns:
            A `Response` holding the status code, raw content and text.

        Raises:
            TimeoutException: if the underlying error message contains "timed out".
            DDGSException: for any other error raised while performing the request.

        """
        with Patch():
            try:
                raw = self.client.request(*args, **kwargs)
                return Response(status_code=raw.status_code, content=raw.content, text=raw.text)
            except Exception as ex:
                # The timeout case is recognised by its message text.
                if "timed out" not in f"{ex}":
                    msg = f"{type(ex).__name__}: {ex!r}"
                    raise DDGSException(msg) from ex
                msg = f"Request timed out: {ex!r}"
                raise TimeoutException(msg) from ex

    def get(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
        """Shortcut for `request` with method "GET"."""
        return self.request(*args, method="GET", **kwargs)

    def post(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
        """Shortcut for `request` with method "POST"."""
        return self.request(*args, method="POST", **kwargs)
84
+
85
+
86
+ # SSL
87
+ DEFAULT_CIPHERS = [ # https://developers.cloudflare.com/ssl/reference/cipher-suites/recommendations/
88
+ "TLS_AES_128_GCM_SHA256", "TLS_AES_256_GCM_SHA384", "TLS_CHACHA20_POLY1305_SHA256",
89
+ # Modern:
90
+ "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-CHACHA20-POLY1305", "ECDHE-RSA-AES128-GCM-SHA256",
91
+ "ECDHE-RSA-CHACHA20-POLY1305", "ECDHE-ECDSA-AES256-GCM-SHA384", "ECDHE-RSA-AES256-GCM-SHA384",
92
+ # Compatible:
93
+ "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-CHACHA20-POLY1305", "ECDHE-RSA-AES128-GCM-SHA256",
94
+ "ECDHE-RSA-CHACHA20-POLY1305", "ECDHE-ECDSA-AES256-GCM-SHA384", "ECDHE-RSA-AES256-GCM-SHA384",
95
+ "ECDHE-ECDSA-AES128-SHA256", "ECDHE-RSA-AES128-SHA256", "ECDHE-ECDSA-AES256-SHA384", "ECDHE-RSA-AES256-SHA384",
96
+ # Legacy:
97
+ "ECDHE-ECDSA-AES128-SHA", "ECDHE-RSA-AES128-SHA", "AES128-GCM-SHA256", "AES128-SHA256", "AES128-SHA",
98
+ "ECDHE-RSA-AES256-SHA", "AES256-GCM-SHA384", "AES256-SHA256", "AES256-SHA", "DES-CBC3-SHA",
99
+ ] # fmt: skip
100
+
101
+
102
+ def _get_random_ssl_context(*, verify: bool | str) -> ssl.SSLContext:
103
+ ssl_context = ssl.create_default_context(cafile=verify if isinstance(verify, str) else None)
104
+ shuffled_ciphers = random.sample(DEFAULT_CIPHERS[9:], len(DEFAULT_CIPHERS) - 9)
105
+ ssl_context.set_ciphers(":".join(DEFAULT_CIPHERS[:9] + shuffled_ciphers))
106
+ commands: list[None | Callable[[ssl.SSLContext], None]] = [
107
+ None,
108
+ lambda context: setattr(context, "maximum_version", ssl.TLSVersion.TLSv1_2),
109
+ lambda context: setattr(context, "minimum_version", ssl.TLSVersion.TLSv1_3),
110
+ lambda context: setattr(context, "options", context.options | ssl.OP_NO_TICKET),
111
+ ]
112
+ random_command = random.choice(commands)
113
+ if random_command:
114
+ random_command(ssl_context)
115
+ return ssl_context
116
+
117
+
118
class Patch:
    """Context manager that temporarily replaces `HTTP2Connection._send_connection_init`.

    While active, new HTTP/2 connections are initialised with randomised
    SETTINGS values instead of httpcore's own.

    NOTE(review): the replacement is assigned on the class itself, so it applies
    to every `HTTP2Connection` while the block is active, and overlapping use
    (nested or from several threads) could restore the wrong function on exit -
    confirm how callers use it.
    """

    def __enter__(self) -> None:
        """Install the patched method and remember the one it replaces."""
        connection_cls = httpcore._sync.http2.HTTP2Connection
        codes = h2.settings.SettingCodes

        def _send_connection_init(self: httpcore._sync.http2.HTTP2Connection, request: httpcore.Request) -> None:
            # Every value is drawn afresh for each connection that is initialised.
            initial_values = {
                codes.INITIAL_WINDOW_SIZE: random.randint(100, 200),
                codes.HEADER_TABLE_SIZE: random.randint(4000, 5000),
                codes.MAX_FRAME_SIZE: random.randint(16384, 65535),
                codes.MAX_CONCURRENT_STREAMS: random.randint(100, 200),
                codes.MAX_HEADER_LIST_SIZE: random.randint(65500, 66500),
                codes.ENABLE_CONNECT_PROTOCOL: random.randint(0, 1),
                codes.ENABLE_PUSH: random.randint(0, 1),
            }
            self._h2_state.local_settings = h2.settings.Settings(client=True, initial_values=initial_values)
            self._h2_state.initiate_connection()
            self._h2_state.increment_flow_control_window(2**24)
            self._write_outgoing_data(request)

        self.original_send_connection_init = connection_cls._send_connection_init
        connection_cls._send_connection_init = _send_connection_init  # type: ignore[method-assign]

    def __exit__(
        self,
        exc_type: type[BaseException] | None = None,
        exc_val: BaseException | None = None,
        exc_tb: TracebackType | None = None,
    ) -> None:
        """Put back the method saved by `__enter__`; exceptions are not suppressed."""
        connection_cls = httpcore._sync.http2.HTTP2Connection
        connection_cls._send_connection_init = self.original_send_connection_init  # type: ignore[method-assign]
ddgs/py.typed ADDED
@@ -0,0 +1 @@
 
 
1
+ # Marker file for PEP 561.
ddgs/results.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Result classes."""
2
+
3
+ from abc import ABC
4
+ from collections import Counter
5
+ from collections.abc import Callable, Mapping
6
+ from dataclasses import dataclass, field
7
+ from typing import Any, ClassVar, Generic, TypeVar
8
+
9
+ from .utils import _normalize_date, _normalize_text, _normalize_url
10
+
11
+ T = TypeVar("T")
12
+
13
+
14
class BaseResult:
    """Shared base of all result records; normalizes selected fields on assignment."""

    # Attribute name -> function applied to a truthy value before it is stored.
    # Attributes not listed here are stored unchanged.
    _normalizers: ClassVar[Mapping[str, Callable[[Any], str]]] = {
        "title": _normalize_text,
        "body": _normalize_text,
        "href": _normalize_url,
        "url": _normalize_url,
        "thumbnail": _normalize_url,
        "image": _normalize_url,
        "date": _normalize_date,
        "author": _normalize_text,
        "publisher": _normalize_text,
        "info": _normalize_text,
    }

    def __setattr__(self, name: str, value: str) -> None:
        """Store `value` under `name`, normalizing it first when applicable.

        Falsy values (e.g. the empty-string defaults) bypass normalization.
        """
        normalizer = self._normalizers.get(name) if value else None
        if normalizer:
            value = normalizer(value)
        object.__setattr__(self, name, value)
35
+
36
+
37
@dataclass
class TextResult(BaseResult):
    """One web (text) search hit."""

    # All three fields have a normalizer registered on BaseResult:
    # text normalization for title/body, URL normalization for href.
    title: str = ""
    href: str = ""
    body: str = ""
44
+
45
+
46
@dataclass
class ImagesResult(BaseResult):
    """One image search hit."""

    # title is text-normalized; image, thumbnail and url are URL-normalized.
    title: str = ""
    image: str = ""
    thumbnail: str = ""
    url: str = ""
    # height, width and source have no registered normalizer and are stored as given.
    height: str = ""
    width: str = ""
    source: str = ""
57
+
58
+
59
@dataclass
class NewsResult(BaseResult):
    """One news search hit."""

    # date goes through the date normalizer; title/body are text-normalized;
    # url and image are URL-normalized.
    date: str = ""
    title: str = ""
    body: str = ""
    url: str = ""
    image: str = ""
    # source has no registered normalizer and is stored as given.
    source: str = ""
69
+
70
+
71
@dataclass
class VideosResult(BaseResult):
    """One video search hit."""

    # Of these fields only title and publisher have a normalizer registered on
    # BaseResult (both text); everything else is stored as given.
    title: str = ""
    content: str = ""
    description: str = ""
    duration: str = ""
    embed_html: str = ""
    embed_url: str = ""
    image_token: str = ""
    # Dict-valued fields use a factory so instances never share one dict.
    images: dict[str, str] = field(default_factory=dict)
    provider: str = ""
    published: str = ""
    publisher: str = ""
    statistics: dict[str, str] = field(default_factory=dict)
    uploader: str = ""
88
+
89
+
90
@dataclass
class BooksResult(BaseResult):
    """One book search hit."""

    # title, author, publisher and info are text-normalized;
    # url and thumbnail are URL-normalized.
    title: str = ""
    author: str = ""
    publisher: str = ""
    info: str = ""
    url: str = ""
    thumbnail: str = ""
100
+
101
+
102
class ResultsAggregator(ABC, Generic[T]):
    """Collect results, merge duplicates and rank them by how often they were seen.

    An item's identity is the value of the first of its attributes whose name
    is in `cache_fields`. `append` counts every occurrence and keeps one
    representative item per key; `extract_dicts` returns the representatives
    ordered by descending occurrence count.
    """

    def __init__(self, cache_fields: set[str]) -> None:
        if not cache_fields:
            msg = "At least one cache_field must be provided"
            raise ValueError(msg)
        # Copy, so later changes to the caller's set do not affect this instance.
        self.cache_fields = set(cache_fields)
        self._counter: Counter[str] = Counter()
        self._cache: dict[str, T] = {}

    def _get_key(self, item: T) -> str:
        """Return the deduplication key of `item` as a string.

        Raises:
            AttributeError: if none of the item's attributes is a cache field.

        """
        for attr_name, attr_value in item.__dict__.items():
            if attr_name in self.cache_fields:
                return str(attr_value)
        msg = f"Item {item!r} has none of the cache fields {self.cache_fields}"
        raise AttributeError(msg)

    def __len__(self) -> int:
        """Return the number of distinct keys stored so far."""
        return len(self._cache)

    def append(self, item: T) -> None:
        """Register one occurrence of `item`.

        The item is stored when its key is new, or when its "body" is longer
        than the body of the item already stored under that key. The
        occurrence counter for the key is incremented in every case.
        """
        key = self._get_key(item)
        if key in self._cache:
            incoming_body = item.__dict__.get("body", "")
            stored_body = self._cache[key].__dict__.get("body", "")
            should_store = len(incoming_body) > len(stored_body)
        else:
            should_store = True
        if should_store:
            self._cache[key] = item
        self._counter[key] += 1

    def extend(self, items: list[T]) -> None:
        """Register every item of `items`, in order."""
        for entry in items:
            self.append(entry)

    def extract_dicts(self) -> list[dict[str, Any]]:
        """Return the stored items as dicts, most frequently seen first."""
        ranked_keys = [key for key, _count in self._counter.most_common()]
        return [self._cache[key].__dict__ for key in ranked_keys]
ddgs/similarity.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Simple filter ranker."""
2
+
3
+ import re
4
+ from typing import Final
5
+
6
+
7
class SimpleFilterRanker:
    """Order search results by where the query's tokens occur.

    Ranking rules, applied to each doc in input order:
      * docs whose title contains both "Category:" and "Wikimedia" are dropped;
      * docs with "wikipedia.org" in their href come first;
      * the rest follow in four groups: tokens found in title and body,
        in the title only, in the body only, in neither.
    Within each group the input order is kept.
    """

    _splitter: Final = re.compile(r"\W+")

    def __init__(self, min_token_length: int = 3) -> None:
        self.min_token_length = min_token_length

    def _extract_tokens(self, query: str) -> set[str]:
        """Lower-case the query, split it on non-word runs and drop short pieces."""
        pieces = self._splitter.split(query.lower())
        return {piece for piece in pieces if len(piece) >= self.min_token_length}

    def _has_any_token(self, text: str, tokens: set[str]) -> bool:
        """Return True when at least one token is a substring of the lower-cased text."""
        haystack = text.lower()
        for token in tokens:
            if token in haystack:
                return True
        return False

    def rank(self, docs: list[dict[str, str]], query: str) -> list[dict[str, str]]:
        """Return `docs` reordered (and filtered) for `query`; see the class docstring."""
        tokens = self._extract_tokens(query)

        wiki_hits: list[dict[str, str]] = []
        # Keyed by (tokens in title, tokens in body).
        buckets: dict[tuple[bool, bool], list[dict[str, str]]] = {
            (True, True): [],
            (True, False): [],
            (False, True): [],
            (False, False): [],
        }

        for doc in docs:
            title = doc.get("title", "")

            # Wikimedia category pages are left out of the result entirely.
            if "Category:" in title and "Wikimedia" in title:
                continue

            if "wikipedia.org" in doc.get("href", ""):
                wiki_hits.append(doc)
                continue

            # 'description' is used only when the doc has no 'body' key at all.
            body = doc.get("body", doc.get("description", ""))
            in_title = self._has_any_token(title, tokens)
            in_body = self._has_any_token(body, tokens)
            buckets[in_title, in_body].append(doc)

        return [
            *wiki_hits,
            *buckets[True, True],
            *buckets[True, False],
            *buckets[False, True],
            *buckets[False, False],
        ]
ddgs/utils.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utilities."""
2
+
3
+ import re
4
+ import unicodedata
5
+ from contextlib import suppress
6
+ from datetime import datetime, timezone
7
+ from html import unescape
8
+ from urllib.parse import unquote
9
+
10
+ from .exceptions import DDGSException
11
+
12
+ _REGEX_STRIP_TAGS = re.compile("<.*?>")
13
+
14
+
15
def _extract_vqd(html_bytes: bytes, query: str) -> str:
    """Return the vqd value found in `html_bytes`.

    Three spellings are tried in order: `vqd="..."`, `vqd=...&` and
    `vqd='...'`. The first one whose opening and closing markers are both
    present, and whose value decodes, wins.

    Args:
        html_bytes: Raw page content to search.
        query: Only used in the error message.

    Raises:
        DDGSException: if no spelling yields a value.

    """
    markers = (
        (b'vqd="', b'"'),
        (b"vqd=", b"&"),
        (b"vqd='", b"'"),
    )
    for opening, closing in markers:
        opening_at = html_bytes.find(opening)
        if opening_at < 0:
            continue
        start = opening_at + len(opening)
        end = html_bytes.find(closing, start)
        if end < 0:
            continue
        # UnicodeDecodeError is a ValueError: an undecodable candidate is
        # skipped and the next spelling is tried.
        with suppress(ValueError):
            return html_bytes[start:end].decode()

    msg = f"_extract_vqd() {query=} Could not extract vqd."
    raise DDGSException(msg)
29
+
30
+
31
def _normalize_url(url: str) -> str:
    """Percent-decode `url` and turn spaces into '+'; falsy input gives ""."""
    if not url:
        return ""
    decoded = unquote(url)
    return decoded.replace(" ", "+")
34
+
35
+
36
def _normalize_text(raw: str) -> str:
    """Normalize text.

    Strip HTML tags, unescape HTML entities, normalize Unicode to NFC,
    neutralise "C" category characters, and collapse whitespace.

    Whitespace control characters (newline, tab, carriage return, ...) are
    replaced by a space so the words around them stay separated; all other
    "C" category characters (e.g. zero-width or unassigned code points) are
    removed.

    Args:
        raw: Text to clean; a falsy value yields "".

    Returns:
        The cleaned text with single spaces between words and no leading or
        trailing whitespace.

    """
    if not raw:
        return ""

    # 1. Strip HTML tags
    text = _REGEX_STRIP_TAGS.sub("", raw)

    # 2. Unescape HTML entities
    text = unescape(text)

    # 3. Unicode normalization
    text = unicodedata.normalize("NFC", text)

    # 4. Handle "C" category characters. "\n", "\t", "\r" and friends are in
    #    category Cc; deleting them would glue neighbouring words together
    #    ("a\nb" -> "ab"), so whitespace ones become a space instead.
    c_replacements = {
        ord(ch): " " if ch.isspace() else None
        for ch in set(text)
        if unicodedata.category(ch)[0] == "C"
    }
    if c_replacements:
        text = text.translate(c_replacements)

    # 5. Collapse whitespace
    return " ".join(text.split())
61
+
62
+
63
+ def _normalize_date(date: int | str) -> str:
64
+ """Normalize date from integer to ISO format if applicable."""
65
+ return datetime.fromtimestamp(date, timezone.utc).isoformat() if isinstance(date, int) else date
66
+
67
+
68
+ def _expand_proxy_tb_alias(proxy: str | None) -> str | None:
69
+ """Expand "tb" to a full proxy URL if applicable."""
70
+ return "socks5h://127.0.0.1:9150" if proxy == "tb" else proxy
docker-compose.yml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ ddgs-api:
3
+ build: .
4
+ ports:
5
+ - "8000:8000"
6
+ environment:
7
+ - DDGS_PROXY
8
+ volumes:
9
+ - ./logs:/app/logs
10
+ restart: unless-stopped
11
+ healthcheck:
12
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
13
+ interval: 30s
14
+ timeout: 10s
15
+ retries: 3
16
+ start_period: 60s
pyproject.toml ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "ddgs"
7
+ description = "Dux Distributed Global Search. A metasearch library that aggregates results from diverse web search services."
8
+ readme = "README.md"
9
+ requires-python = ">=3.10"
10
+ license = "MIT"
11
+ keywords = ["python", "search", "metasearch"]
12
+ authors = [
13
+ {name = "deedy5"}
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 5 - Production/Stable",
17
+ "Operating System :: OS Independent",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3 :: Only",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Programming Language :: Python :: 3.14",
25
+ "Programming Language :: Python :: Implementation :: CPython",
26
+ "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
27
+ "Topic :: Software Development :: Libraries :: Python Modules",
28
+ ]
29
+ dependencies = [
30
+ "click>=8.1.8",
31
+ "primp>=0.15.0",
32
+ "lxml>=4.9.4",
33
+ "httpx[http2,socks,brotli]>=0.28.1", # temporarily
34
+ "fake-useragent>=2.2.0",
35
+ ]
36
+ dynamic = ["version"]
37
+
38
+ [project.urls] # Optional
39
+ "Homepage" = "https://github.com/deedy5/ddgs"
40
+
41
+ [project.scripts]
42
+ ddgs = "ddgs.cli:safe_entry_point"
43
+
44
+ [tool.setuptools.dynamic]
45
+ version = {attr = "ddgs.__version__"}
46
+
47
+ [tool.setuptools.packages.find]
48
+ include = ["ddgs*"]
49
+ exclude = ["api*"]
50
+
51
+ [project.optional-dependencies]
52
+ dev = [
53
+ "mypy>=1.17.1",
54
+ "pre-commit",
55
+ "pytest>=8.4.1",
56
+ "pytest-dependency>=0.6.0",
57
+ "ruff>=0.13.0",
58
+
59
+ # for mypy
60
+ "lxml-stubs",
61
+ "types-Pygments",
62
+ "types-pexpect",
63
+ "types-PyYAML",
64
+ "types-ujson"
65
+ ]
66
+ api = [
67
+ "fastapi[standard]>=0.104.0",
68
+ "fastapi-mcp>=0.4.0",
69
+ ]
70
+
71
+ [tool.ruff]
72
+ line-length = 120
73
+ exclude = ["tests"]
74
+
75
+ [tool.ruff.lint]
76
+ select = [
77
+ # Core rules
78
+ "E", # pycodestyle errors
79
+ "W", # pycodestyle warnings
80
+ "F", # pyflakes
81
+ "I", # isort
82
+
83
+ # Enhanced rules
84
+ "ERA", # eradicate
85
+ "YTT", # flake8-2020
86
+ "ANN", # flake8-annotations
87
+ "ASYNC", # flake8-async
88
+ "S", # flake8-bandit
89
+ "BLE", # flake8-blind-except
90
+ "FBT", # flake8-boolean-trap
91
+ "B", # flake8-bugbear
92
+ "A", # flake8-builtins
93
+ "COM", # flake8-commas
94
+ "C4", # flake8-comprehensions
95
+ "DTZ", # flake8-datetimez
96
+ "T10", # flake8-debugger
97
+ "EM", # flake8-errmsg
98
+ "FIX", # flake8-fixme
99
+ "FA", # flake8-future-annotations
100
+ "INT", # flake8-gettext
101
+ "ISC", # flake8-implicit-str-concat
102
+ "ICN", # flake8-import-conventions
103
+ "LOG", # flake8-logging
104
+ "G", # flake8-logging-format
105
+ "INP", # flake8-no-pep420
106
+ "PIE", # flake8-pie
107
+ "T20", # flake8-print
108
+ "PYI", # flake8-pyi
109
+ "PT", # flake8-pytest-style
110
+ "Q", # flake8-quotes
111
+ "RSE", # flake8-raise
112
+ "RET", # flake8-return
113
+ "SLF", # flake8-self
114
+ "SIM", # flake8-simplify
115
+ "SLOT", # flake8-slots
116
+ "TID", # flake8-tidy-imports
117
+ "TD", # flake8-todos
118
+ "TC", # flake8-type-checking
119
+ "ARG", # flake8-unused-arguments
120
+ "PTH", # flake8-use-pathlib
121
+ "FLY", # flynt
122
+ "C90", # mccabe
123
+ "N", # pep8-naming
124
+ "PERF", # perflint
125
+ "PGH", # pygrep-hooks
126
+ "PL", # Pylint
127
+ "UP", # pyupgrade
128
+ "FURB", # refurb
129
+ "RUF", # ruff-specific rules
130
+ "TRY", # tryceratops
131
+
132
+ # Documentation
133
+ "D", # pydocstyle
134
+ ]
135
+ ignore = [
136
+ "COM812", # Missing trailing comma (handled by formatter)
137
+ "D107", # Missing docstring in `__init__`
138
+ "D203", # incorrect-blank-line-before-class
139
+ "D213", # multi-line-summary-second-line
140
+ "N818", # Exception name {name} should be named with an Error suffix
141
+ "PLR0913", # Too many arguments to function call
142
+ "PLR2004", # Magic value used in comparison
143
+ "SLF001", # Private member accessed
144
+ ]
145
+
146
+ [tool.mypy]
147
+ python_version = "3.10"
148
+ strict = true
149
+ exclude = ["build/"]
start_api.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Start the DDGS API server.

Mounts the MCP endpoints on the FastAPI app and serves it with uvicorn on
0.0.0.0:8000.
"""

import logging
import sys

# Add current directory to Python path. This has to happen before `api` is
# imported below, otherwise it cannot influence that import.
sys.path.insert(0, ".")

import uvicorn  # noqa: E402
from fastapi_mcp import FastApiMCP  # type: ignore[import-untyped]  # noqa: E402

from api.main import app  # noqa: E402

# Without any logging configuration the INFO messages below are filtered out.
# basicConfig() does nothing if the root logger already has a handler (for
# example one installed while importing `api.main`).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# MCP server
mcp = FastApiMCP(app, name="ddgs-search", description="DDGS (Dux Distributed Global Search) MCP Server")
mcp.mount_http()
logger.info("✅ MCP server enabled at /mcp")
mcp.mount_sse()
logger.info("✅ MCP server enabled at /sse")

logger.info("🚀 Starting DDGS API server on http://0.0.0.0:8000")
uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)  # noqa: S104
start_api.sh ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# DDGS API Startup Script
#
# Creates the virtual environment on first run, installs the package together
# with its API extras, and launches the server.

set -e

echo "🚀 Starting DDGS API..."

# Check if virtual environment exists
if [ ! -d ".venv" ]; then
    echo "📦 Creating virtual environment..."
    python3 -m venv .venv
fi

# Activate virtual environment
echo "🔧 Activating virtual environment..."
source .venv/bin/activate

# Install dependencies.
# The editable install with the [api] extra already installs ddgs itself,
# so no separate `pip install -e .` is needed afterwards.
echo "📥 Installing dependencies..."
pip install -e ".[api]"

# Run the API
echo "🌐 Starting FastAPI server on http://localhost:8000"
echo "📚 API documentation available at http://localhost:8000/docs"
echo "🔍 ReDoc documentation available at http://localhost:8000/redoc"

python start_api.py
tests/cli_test.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ import shutil
3
+ import time
4
+ from pathlib import Path
5
+
6
+ import pytest
7
+ from click.testing import CliRunner
8
+
9
+ from ddgs import DDGS, __version__
10
+ from ddgs.cli import _download_results, _save_csv, _save_json, cli
11
+
12
+ runner = CliRunner()
13
+ TEXT_RESULTS = []
14
+ IMAGES_RESULTS = []
15
+
16
+
17
@pytest.fixture(autouse=True)
def pause_between_tests() -> None:
    """Wait two seconds before each test (presumably to space out live requests)."""
    delay_seconds = 2
    time.sleep(delay_seconds)
20
+
21
+
22
def test_version_command() -> None:
    """`version` prints exactly the package version."""
    printed = runner.invoke(cli, ["version"]).output
    assert printed.strip() == __version__
25
+
26
+
27
def test_text_command() -> None:
    """`text -q zebra` produces output that mentions a title field."""
    printed = runner.invoke(cli, ["text", "-q", "zebra"]).output
    assert "title" in printed
30
+
31
+
32
def test_images_command() -> None:
    """`images -q fox` produces output that mentions a title field."""
    printed = runner.invoke(cli, ["images", "-q", "fox"]).output
    assert "title" in printed
35
+
36
+
37
def test_news_command() -> None:
    """`news -q deer` produces output that mentions a title field."""
    printed = runner.invoke(cli, ["news", "-q", "deer"]).output
    assert "title" in printed
40
+
41
+
42
def test_videos_command() -> None:
    """`videos -q pig` produces output that mentions a title field."""
    printed = runner.invoke(cli, ["videos", "-q", "pig"]).output
    assert "title" in printed
45
+
46
+
47
def test_books_command() -> None:
    """`books -q bee` produces output that mentions a title field."""
    printed = runner.invoke(cli, ["books", "-q", "bee"]).output
    assert "title" in printed
50
+
51
+
52
@pytest.mark.dependency()
def test_get_text() -> None:
    """Fetch text results once and keep them in TEXT_RESULTS for the dependent tests."""
    global TEXT_RESULTS
    fetched = DDGS().text("cow", max_results=5)
    TEXT_RESULTS = fetched
    assert fetched
57
+
58
+
59
@pytest.mark.dependency()
def test_get_images() -> None:
    """Fetch image results once and keep them in IMAGES_RESULTS for the dependent tests."""
    global IMAGES_RESULTS
    fetched = DDGS().images("horse", max_results=5)
    IMAGES_RESULTS = fetched
    assert fetched
64
+
65
+
66
@pytest.mark.dependency(depends=["test_get_text"])
def test_save_csv(tmp_path: Path) -> None:
    """Saving the fetched text results as CSV creates the target file."""
    destination = tmp_path / "test_csv.csv"
    _save_csv(destination, TEXT_RESULTS)
    assert destination.exists()
71
+
72
+
73
@pytest.mark.dependency(depends=["test_get_text"])
def test_save_json(tmp_path: Path) -> None:
    """Saving the fetched text results as JSON creates the target file."""
    destination = tmp_path / "test_json.json"
    _save_json(destination, TEXT_RESULTS)
    assert destination.exists()
78
+
79
+
80
@pytest.mark.dependency(depends=["test_get_text"])
def test_text_download() -> None:
    """Downloading the fetched text results fills a directory with regular files."""
    pathname = pathlib.Path("text_downloads")
    try:
        _download_results(f"{test_text_download}", TEXT_RESULTS, function_name="text", pathname=str(pathname))
        assert pathname.is_dir()
        # iterdir() returns a generator, which is truthy even for an empty
        # directory, so collect the entries before checking that some exist.
        downloaded = list(pathname.iterdir())
        assert downloaded
        for file in downloaded:
            assert file.is_file()
    finally:
        # Clean up even when an assertion fails, so reruns start from scratch.
        shutil.rmtree(pathname, ignore_errors=True)
88
+
89
+
90
@pytest.mark.dependency(depends=["test_get_images"])
def test_images_download() -> None:
    """Downloading the fetched image results fills a directory with regular files."""
    pathname = pathlib.Path("images_downloads")
    try:
        _download_results(f"{test_images_download}", IMAGES_RESULTS, function_name="images", pathname=str(pathname))
        assert pathname.is_dir()
        # iterdir() returns a generator, which is truthy even for an empty
        # directory, so collect the entries before checking that some exist.
        downloaded = list(pathname.iterdir())
        assert downloaded
        for file in downloaded:
            assert file.is_file()
    finally:
        # Clean up even when an assertion fails, so reruns start from scratch.
        shutil.rmtree(pathname, ignore_errors=True)
tests/ddgs_test.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ import pytest
4
+
5
+ from ddgs import DDGS
6
+
7
+
8
@pytest.fixture(autouse=True)
def pause_between_tests() -> None:
    """Wait two seconds before each test (presumably to space out live requests)."""
    delay_seconds = 2
    time.sleep(delay_seconds)
11
+
12
+
13
def test_context_manager() -> None:
    """DDGS works as a context manager and returns at least one text result."""
    with DDGS() as client:
        found = client.text("python")
        assert len(found) > 0
17
+
18
+
19
def test_text_search() -> None:
    """Text search returns a non-empty list."""
    found = DDGS().text("wolf")
    assert isinstance(found, list)
    assert len(found) > 0
24
+
25
+
26
def test_images_search() -> None:
    """Image search returns a non-empty list."""
    found = DDGS().images("tiger")
    assert isinstance(found, list)
    assert len(found) > 0
31
+
32
+
33
def test_news_search() -> None:
    """News search returns a non-empty list."""
    found = DDGS().news("rabbit")
    assert isinstance(found, list)
    assert len(found) > 0
38
+
39
+
40
def test_videos_search() -> None:
    """Video search returns a non-empty list."""
    found = DDGS().videos("monkey")
    assert isinstance(found, list)
    assert len(found) > 0
45
+
46
+
47
def test_books_search() -> None:
    """Book search returns a non-empty list."""
    found = DDGS().books("mouse")
    assert isinstance(found, list)
    assert len(found) > 0