Upload 39 files

- CONTRIBUTING.md +68 -0
- Dockerfile +34 -0
- LICENSE.md +21 -0
- Makefile +40 -0
- api/__init__.py +3 -0
- api/main.py +390 -0
- ddgs/__init__.py +52 -0
- ddgs/base.py +122 -0
- ddgs/cli.py +523 -0
- ddgs/ddgs.py +234 -0
- ddgs/engines/__init__.py +94 -0
- ddgs/engines/annasarchive.py +51 -0
- ddgs/engines/bing.py +85 -0
- ddgs/engines/bing_news.py +86 -0
- ddgs/engines/brave.py +47 -0
- ddgs/engines/duckduckgo.py +56 -0
- ddgs/engines/duckduckgo_images.py +85 -0
- ddgs/engines/duckduckgo_news.py +72 -0
- ddgs/engines/duckduckgo_videos.py +84 -0
- ddgs/engines/google.py +95 -0
- ddgs/engines/grokipedia.py +49 -0
- ddgs/engines/mojeek.py +52 -0
- ddgs/engines/wikipedia.py +66 -0
- ddgs/engines/yahoo.py +64 -0
- ddgs/engines/yahoo_news.py +104 -0
- ddgs/engines/yandex.py +47 -0
- ddgs/exceptions.py +13 -0
- ddgs/http_client.py +78 -0
- ddgs/http_client2.py +151 -0
- ddgs/py.typed +1 -0
- ddgs/results.py +148 -0
- ddgs/similarity.py +72 -0
- ddgs/utils.py +70 -0
- docker-compose.yml +16 -0
- pyproject.toml +149 -0
- start_api.py +25 -0
- start_api.sh +29 -0
- tests/cli_test.py +97 -0
- tests/ddgs_test.py +51 -0
CONTRIBUTING.md
ADDED
@@ -0,0 +1,68 @@
# Contributing

Please open a Discussion, Issue, or email the maintainers to talk over any major changes before submitting a pull request.

## IDE configuration

If you use **VSCode**, install the recommended extensions (press `F1` → *Show Recommended Extensions*):

- `ms-python.python`
- `ms-python.mypy-type-checker`
- `charliermarsh.ruff`
- `usernamehw.errorlens`
- `fill-labs.dependi`

## Development

1. Fork the repository and clone your fork:
   ```sh
   git clone https://github.com/{your_profile}/ddgs
   cd ddgs
   ```

2. Create and activate a virtual environment, then install development dependencies:
   ```sh
   python -m venv .venv
   source .venv/bin/activate  # Windows: .venv\Scripts\activate
   pip install -e .[dev]
   ```
3. Install pre-commit hooks (automates formatting, linting, typing):
   ```sh
   pre-commit install
   ```
   - Hooks run `ruff` and `mypy` automatically on each commit.
   - To run them manually: `pre-commit run --all-files`.

4. Create a feature branch:
   ```sh
   git checkout -b feat/new-feature
   ```
5. Implement your changes.
6. Run tests locally:
   ```sh
   pytest
   ```
7. Commit changes (follow Conventional Commits):
   ```sh
   git add .
   git commit -m "feat: add feature description"
   ```
8. Push your branch to your fork:
   ```sh
   git push origin feat/new-feature
   ```
9. Open a pull request against the upstream repository and reference any related Discussion/Issue.

## Code style

- Formatting and linting are enforced with **ruff**.
- Static typing is checked with **mypy**.

## PR checklist

- Tests pass: `pytest`
- pre-commit checks pass: `pre-commit run --all-files`
- Commit messages follow Conventional Commits
- PR references related Issue/Discussion and describes changes
- Add tests for new behavior where applicable
Dockerfile
ADDED
@@ -0,0 +1,34 @@
# Use Python 3.11 slim image as base
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PIP_NO_CACHE_DIR=1
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
ENV PYTHONPATH=/app

# Install system dependencies including curl for healthcheck
RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y curl \
    && rm -rf /var/lib/apt/lists/*

# Copy application code
COPY . .

# Install Python dependencies (including API dependencies)
RUN pip install --no-cache-dir -e .[api]

# Expose port
EXPOSE 8000

# Create non-root user
RUN useradd --create-home --shell /bin/bash app \
    && chown -R app:app /app
USER app

# Run the application
CMD ["python", "start_api.py"]
LICENSE.md
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 deedy5

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Makefile
ADDED
@@ -0,0 +1,40 @@
PY := .venv/bin/python
PIP := .venv/bin/pip

.PHONY: help setup lint format test all clean

help:
	@echo "Targets:"
	@echo " setup  - create venv and install dependencies"
	@echo " lint   - run ruff check, ruff format and mypy"
	@echo " format - run ruff format and ruff check --fix"
	@echo " test   - run pytest"
	@echo " all    - run setup, lint, format and test"
	@echo " clean  - remove cache, venv and build artifacts"

setup:
	python3 -m venv .venv
	$(PIP) install -e .[dev]

lint:
	$(PY) -m ruff check --fix
	$(PY) -m mypy --install-types --non-interactive .

format:
	$(PY) -m ruff format

test:
	$(PY) -m pytest

all: setup lint format test

clean:
	rm -rf .venv/
	rm -rf .pytest_cache/
	rm -rf .mypy_cache/
	rm -rf .ruff_cache/
	rm -rf build/
	rm -rf dist/
	rm -rf *.egg-info/
	find . -name __pycache__ -exec rm -rf {} +
	rm -f uv.lock
api/__init__.py
ADDED
@@ -0,0 +1,3 @@
"""DDGS API package."""

__version__ = "1.0.0"
api/main.py
ADDED
@@ -0,0 +1,390 @@
"""FastAPI application for DDGS API."""

import logging
from typing import Any

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field

from ddgs import DDGS

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create FastAPI app
app = FastAPI(
    title="DDGS API",
    description="A FastAPI wrapper for the DDGS (Dux Distributed Global Search) library",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Pydantic models for request/response
class TextSearchRequest(BaseModel):
    """Request model for text search operations."""

    query: str = Field(..., description="Search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m, y) or custom date range")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")


class ImagesSearchRequest(BaseModel):
    """Request model for image search operations."""

    query: str = Field(..., description="Image search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m, y) or custom date range")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")
    size: str | None = Field(None, description="Image size (Small, Medium, Large, Wallpaper)")
    color: str | None = Field(
        None,
        description="Image color (Monochrome, Red, Orange, Yellow, Green, Blue, Purple, Pink, Brown, Black, Gray, Teal, White)",  # noqa: E501
    )
    type_image: str | None = Field(None, description="Image type (photo, clipart, gif, transparent, line)")
    layout: str | None = Field(None, description="Image layout (Square, Tall, Wide)")
    license_image: str | None = Field(
        None, description="Image license (any, Public, Share, ShareCommercially, Modify, ModifyCommercially)"
    )


class NewsSearchRequest(BaseModel):
    """Request model for news search operations."""

    query: str = Field(..., description="Search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m, y) or custom date range")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")


class VideosSearchRequest(BaseModel):
    """Request model for video search operations."""

    query: str = Field(..., description="Video search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m) or custom date range")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")
    resolution: str | None = Field(None, description="Video resolution (high, standard)")
    duration: str | None = Field(None, description="Video duration (short, medium, long)")
    license_videos: str | None = Field(None, description="Video license (creativeCommon, youtube)")


class BooksSearchRequest(BaseModel):
    """Request model for book search operations."""

    query: str = Field(..., description="Books search query")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")


class SearchResponse(BaseModel):
    """Response model for search operations."""

    results: list[dict[str, Any]]


class HealthResponse(BaseModel):
    """Response model for health check."""

    status: str
    version: str
    service: str


@app.get("/", response_model=HealthResponse)
async def root() -> HealthResponse:
    """Root endpoint with basic service information."""
    return HealthResponse(status="healthy", version="1.0.0", service="DDGS API")


@app.get("/health", response_model=HealthResponse)
async def health_check() -> HealthResponse:
    """Health check endpoint."""
    return HealthResponse(status="healthy", version="1.0.0", service="DDGS API")


@app.post("/search/text", response_model=SearchResponse)
async def search_text(request: TextSearchRequest) -> SearchResponse:
    """Perform a text search."""
    try:
        results = DDGS().text(
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in text search: %s", e)
        raise HTTPException(status_code=500, detail=f"Search failed: {e!s}") from e


@app.get("/search/text", response_model=SearchResponse)
async def search_text_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
) -> SearchResponse:
    """Perform a text search via GET request."""
    try:
        results = DDGS().text(
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in text search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Search failed: {e!s}") from e


@app.post("/search/images", response_model=SearchResponse)
async def search_images(request: ImagesSearchRequest) -> SearchResponse:
    """Perform an image search."""
    try:
        results = DDGS().images(
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
            size=request.size,
            color=request.color,
            type_image=request.type_image,
            layout=request.layout,
            license_image=request.license_image,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in image search: %s", e)
        raise HTTPException(status_code=500, detail=f"Image search failed: {e!s}") from e


@app.get("/search/images", response_model=SearchResponse)
async def search_images_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
    size: str | None = None,
    color: str | None = None,
    type_image: str | None = None,
    layout: str | None = None,
    license_image: str | None = None,
) -> SearchResponse:
    """Perform an image search via GET request."""
    try:
        results = DDGS().images(
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
            size=size,
            color=color,
            type_image=type_image,
            layout=layout,
            license_image=license_image,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in image search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Image search failed: {e!s}") from e


@app.post("/search/news", response_model=SearchResponse)
async def search_news(request: NewsSearchRequest) -> SearchResponse:
    """Perform a news search."""
    try:
        results = DDGS().news(
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in news search: %s", e)
        raise HTTPException(status_code=500, detail=f"News search failed: {e!s}") from e


@app.get("/search/news", response_model=SearchResponse)
async def search_news_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
) -> SearchResponse:
    """Perform a news search via GET request."""
    try:
        results = DDGS().news(
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in news search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"News search failed: {e!s}") from e


@app.post("/search/videos", response_model=SearchResponse)
async def search_videos(request: VideosSearchRequest) -> SearchResponse:
    """Perform a video search."""
    try:
        results = DDGS().videos(
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
            resolution=request.resolution,
            duration=request.duration,
            license_videos=request.license_videos,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in video search: %s", e)
        raise HTTPException(status_code=500, detail=f"Video search failed: {e!s}") from e


@app.get("/search/videos", response_model=SearchResponse)
async def search_videos_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
    resolution: str | None = None,
    duration: str | None = None,
    license_videos: str | None = None,
) -> SearchResponse:
    """Perform a video search via GET request."""
    try:
        results = DDGS().videos(
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
            resolution=resolution,
            duration=duration,
            license_videos=license_videos,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in video search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Video search failed: {e!s}") from e


@app.post("/search/books", response_model=SearchResponse)
async def search_books(request: BooksSearchRequest) -> SearchResponse:
    """Perform a book search."""
    try:
        results = DDGS().books(
            query=request.query,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in book search: %s", e)
        raise HTTPException(status_code=500, detail=f"Book search failed: {e!s}") from e


@app.get("/search/books", response_model=SearchResponse)
async def search_books_get(
    query: str,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
) -> SearchResponse:
    """Perform a book search via GET request."""
    try:
        results = DDGS().books(
            query=query,
            max_results=max_results,
            page=page,
            backend=backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in book search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Book search failed: {e!s}") from e


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)  # noqa: S104
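For reference, a minimal client sketch for the API above, using only the Python standard library. It assumes the server is running locally (e.g. via `python start_api.py`, so host/port `127.0.0.1:8000` here are assumptions), and the `title`/`href` keys shown are typical of DDGS text-result dicts, not a guaranteed schema:

```py
import json
from urllib.request import Request, urlopen

# Assumed local deployment; adjust host/port to your setup.
payload = {"query": "python metasearch", "region": "us-en", "max_results": 5}
req = Request(
    "http://127.0.0.1:8000/search/text",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urlopen(req) as resp:
    results = json.loads(resp.read())["results"]

for r in results:
    # Key names depend on the backend's result schema.
    print(r.get("title"), "-", r.get("href"))
```

The GET variants accept the same fields as query parameters, e.g. `GET /search/text?query=python&max_results=5`.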
ddgs/__init__.py
ADDED
@@ -0,0 +1,52 @@
"""DDGS | Dux Distributed Global Search.

A metasearch library that aggregates results from diverse web search services.
"""

import importlib
import logging
import threading
from typing import TYPE_CHECKING, Any, cast

__version__ = "9.10.0"
__all__ = ("DDGS",)

if TYPE_CHECKING:
    from .ddgs import DDGS

# A do-nothing logging handler
# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
logging.getLogger("ddgs").addHandler(logging.NullHandler())


class _ProxyMeta(type):
    _lock: threading.Lock = threading.Lock()
    _real_cls: type["DDGS"] | None = None

    @classmethod
    def _load_real(cls) -> type["DDGS"]:
        if cls._real_cls is None:
            with cls._lock:
                if cls._real_cls is None:
                    cls._real_cls = importlib.import_module(".ddgs", package=__name__).DDGS
                    globals()["DDGS"] = cls._real_cls
        return cls._real_cls

    def __call__(cls, *args: Any, **kwargs: Any) -> "DDGS":  # noqa: ANN401
        real = type(cls)._load_real()
        return real(*args, **kwargs)

    def __getattr__(cls, name: str) -> Any:  # noqa: ANN401
        return getattr(type(cls)._load_real(), name)

    def __dir__(cls) -> list[str]:
        base = set(super().__dir__())
        loaded_names = set(dir(type(cls)._load_real()))
        return sorted(base | (loaded_names - base))


class _DDGSProxy(metaclass=_ProxyMeta):
    """Proxy class for lazy-loading the real DDGS implementation."""


DDGS: type[DDGS] = cast("type[DDGS]", _DDGSProxy)  # type: ignore[no-redef]
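The metaclass defers importing the heavy `ddgs.ddgs` module until `DDGS` is first called or an attribute on it is accessed, using double-checked locking so concurrent first calls stay safe. A rough sketch of the intended observable behavior (assuming an installed package and nothing else importing `ddgs.ddgs` first):

```py
import sys

import ddgs

# Importing the package alone should not load the implementation module.
print("ddgs.ddgs" in sys.modules)  # expected: False

d = ddgs.DDGS()  # first call routes through _ProxyMeta.__call__ -> _load_real()
print("ddgs.ddgs" in sys.modules)  # expected: True
print(type(d).__name__)  # the real DDGS class, not the proxy
```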
ddgs/base.py
ADDED
@@ -0,0 +1,122 @@
"""Base class for search engines."""

import logging
from abc import ABC, abstractmethod
from collections.abc import Mapping
from functools import cached_property
from typing import Any, ClassVar, Generic, Literal, TypeVar

from lxml import html
from lxml.etree import HTMLParser as LHTMLParser

from .http_client import HttpClient
from .results import BooksResult, ImagesResult, NewsResult, TextResult, VideosResult

logger = logging.getLogger(__name__)
T = TypeVar("T")


class BaseSearchEngine(ABC, Generic[T]):
    """Abstract base class for all search-engine backends."""

    name: ClassVar[str]  # unique key, e.g. "google"
    category: ClassVar[Literal["text", "images", "videos", "news", "books"]]
    provider: ClassVar[str]  # source of the search results (e.g. "bing" for DuckDuckGo)
    disabled: ClassVar[bool] = False  # if True, the engine is disabled
    priority: ClassVar[float] = 1

    search_url: str
    search_method: ClassVar[str]  # GET or POST
    search_headers: ClassVar[Mapping[str, str]] = {}
    items_xpath: ClassVar[str]
    elements_xpath: ClassVar[Mapping[str, str]]
    elements_replace: ClassVar[Mapping[str, str]]

    def __init__(self, proxy: str | None = None, timeout: int | None = None, *, verify: bool | str = True) -> None:
        self.http_client = HttpClient(proxy=proxy, timeout=timeout, verify=verify)
        self.results: list[T] = []

    @property
    def result_type(self) -> type[T]:
        """Get result type based on category."""
        categories = {
            "text": TextResult,
            "images": ImagesResult,
            "videos": VideosResult,
            "news": NewsResult,
            "books": BooksResult,
        }
        return categories[self.category]

    @abstractmethod
    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int,
        **kwargs: str,
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        raise NotImplementedError

    def request(self, *args: Any, **kwargs: Any) -> str | None:  # noqa: ANN401
        """Make a request to the search engine."""
        resp = self.http_client.request(*args, **kwargs)
        if resp.status_code == 200:
            return resp.text
        return None

    @cached_property
    def parser(self) -> LHTMLParser:
        """Get HTML parser."""
        return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)

    def extract_tree(self, html_text: str) -> html.Element:
        """Extract html tree from html text."""
        return html.fromstring(html_text, parser=self.parser)

    def pre_process_html(self, html_text: str) -> str:
        """Pre-process html_text before extracting results."""
        return html_text

    def extract_results(self, html_text: str) -> list[T]:
        """Extract search results from html text."""
        html_text = self.pre_process_html(html_text)
        tree = self.extract_tree(html_text)
        items = tree.xpath(self.items_xpath)
        results = []
        for item in items:
            result = self.result_type()
            for key, value in self.elements_xpath.items():
                data = " ".join(x.strip() for x in item.xpath(value))
                result.__setattr__(key, data)
            results.append(result)
        return results

    def post_extract_results(self, results: list[T]) -> list[T]:
        """Post-process search results."""
        return results

    def search(
        self,
        query: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        page: int = 1,
        **kwargs: str,
    ) -> list[T] | None:
        """Search the engine."""
        payload = self.build_payload(
            query=query, region=region, safesearch=safesearch, timelimit=timelimit, page=page, **kwargs
        )
        if self.search_method == "GET":
            html_text = self.request(self.search_method, self.search_url, params=payload, headers=self.search_headers)
        else:
            html_text = self.request(self.search_method, self.search_url, data=payload, headers=self.search_headers)
        if not html_text:
            return None
        results = self.extract_results(html_text)
        return self.post_extract_results(results)
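To illustrate the contract, here is a hypothetical minimal backend; `ExampleEngine`, its URL, and the XPath expressions are all invented for illustration and do not correspond to any engine in `ddgs/engines/`:

```py
from typing import Any

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult


class ExampleEngine(BaseSearchEngine[TextResult]):
    """Hypothetical text backend sketch (not a real engine)."""

    name = "example"
    category = "text"
    provider = "example"

    search_url = "https://search.example.com/search"  # invented endpoint
    search_method = "GET"
    items_xpath = "//div[@class='result']"  # invented markup
    elements_xpath = {
        "title": ".//a/text()",
        "href": ".//a/@href",
        "body": ".//p[@class='snippet']//text()",
    }

    def build_payload(
        self, query: str, region: str, safesearch: str, timelimit: str | None, page: int, **kwargs: str
    ) -> dict[str, Any]:
        # Map the generic search arguments onto this engine's query parameters.
        return {"q": query, "p": page}
```

Only `build_payload` is abstract; `search()` then drives the request/parse pipeline (`request` → `pre_process_html` → `extract_results` → `post_extract_results`), and subclasses override those hooks as needed.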
ddgs/cli.py
ADDED
|
@@ -0,0 +1,523 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI tool for DDGS."""
|
| 2 |
+
|
| 3 |
+
import csv
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 7 |
+
from datetime import datetime, timezone
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from urllib.parse import unquote
|
| 10 |
+
|
| 11 |
+
import click
|
| 12 |
+
import primp
|
| 13 |
+
|
| 14 |
+
from . import __version__
|
| 15 |
+
from .ddgs import DDGS
|
| 16 |
+
from .utils import _expand_proxy_tb_alias
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
COLORS = {
|
| 21 |
+
0: "black",
|
| 22 |
+
1: "red",
|
| 23 |
+
2: "green",
|
| 24 |
+
3: "yellow",
|
| 25 |
+
4: "blue",
|
| 26 |
+
5: "magenta",
|
| 27 |
+
6: "cyan",
|
| 28 |
+
7: "bright_black",
|
| 29 |
+
8: "bright_red",
|
| 30 |
+
9: "bright_green",
|
| 31 |
+
10: "bright_yellow",
|
| 32 |
+
11: "bright_blue",
|
| 33 |
+
12: "bright_magenta",
|
| 34 |
+
13: "bright_cyan",
|
| 35 |
+
14: "white",
|
| 36 |
+
15: "bright_white",
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _convert_tuple_to_csv(_ctx: click.Context, _param: click.Parameter, value: tuple[str] | None) -> str:
|
| 41 |
+
if value is not None and isinstance(value, tuple):
|
| 42 |
+
return ",".join(value)
|
| 43 |
+
return ""
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _save_data(query: str, data: list[dict[str, str]], function_name: str, filename: str | None) -> None:
|
| 47 |
+
filename, ext = filename.rsplit(".", 1) if filename and filename.endswith((".csv", ".json")) else (None, filename)
|
| 48 |
+
filename = filename if filename else f"{function_name}_{query}_{datetime.now(tz=timezone.utc):%Y%m%d_%H%M%S}"
|
| 49 |
+
if ext == "csv":
|
| 50 |
+
_save_csv(f"{filename}.{ext}", data)
|
| 51 |
+
elif ext == "json":
|
| 52 |
+
_save_json(f"{filename}.{ext}", data)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _save_json(jsonfile: str | Path, data: list[dict[str, str]]) -> None:
|
| 56 |
+
with Path(jsonfile).open("w", encoding="utf-8") as file:
|
| 57 |
+
file.write(json.dumps(data, ensure_ascii=False, indent=2))
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _save_csv(csvfile: str | Path, data: list[dict[str, str]]) -> None:
|
| 61 |
+
with Path(csvfile).open("w", newline="", encoding="utf-8") as file:
|
| 62 |
+
if data:
|
| 63 |
+
headers = data[0].keys()
|
| 64 |
+
writer = csv.DictWriter(file, fieldnames=headers, quoting=csv.QUOTE_MINIMAL)
|
| 65 |
+
writer.writeheader()
|
| 66 |
+
writer.writerows(data)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _print_data(data: list[dict[str, str]], *, no_color: bool = False) -> None:
|
| 70 |
+
if data:
|
| 71 |
+
for i, e in enumerate(data, start=1):
|
| 72 |
+
click.secho(f"{i}.\t {'=' * 78}", bg="black", fg="white")
|
| 73 |
+
for j, (k, v) in enumerate(e.items(), start=1):
|
| 74 |
+
if v:
|
| 75 |
+
width = 300 if k in ("content", "href", "image", "source", "thumbnail", "url") else 78
|
| 76 |
+
title = "language" if k == "detected_language" else k
|
| 77 |
+
text = click.wrap_text(
|
| 78 |
+
f"{v}",
|
| 79 |
+
width=width,
|
| 80 |
+
initial_indent="",
|
| 81 |
+
subsequent_indent=" " * 12,
|
| 82 |
+
preserve_paragraphs=True,
|
| 83 |
+
)
|
| 84 |
+
else:
|
| 85 |
+
title = k
|
| 86 |
+
text = v
|
| 87 |
+
click.secho(f"{title:<12}{text}", bg="black", fg=COLORS[j] if not no_color else "white", overline=True)
|
| 88 |
+
input()
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _sanitize_query(query: str) -> str:
|
| 92 |
+
return (
|
| 93 |
+
query.replace("filetype", "")
|
| 94 |
+
.replace(":", "")
|
| 95 |
+
.replace('"', "'")
|
| 96 |
+
.replace("site", "")
|
| 97 |
+
.replace(" ", "_")
|
| 98 |
+
.replace("/", "_")
|
| 99 |
+
.replace("\\", "_")
|
| 100 |
+
.replace(" ", "")
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _download_file(url: str, dir_path: str, filename: str, proxy: str | None, *, verify: bool) -> None:
|
| 105 |
+
try:
|
| 106 |
+
resp = primp.Client(proxy=proxy, impersonate="random", impersonate_os="random", timeout=10, verify=verify).get(
|
| 107 |
+
url,
|
| 108 |
+
)
|
| 109 |
+
if resp.status_code == 200:
|
| 110 |
+
f = Path(dir_path) / filename[:200]
|
| 111 |
+
with f.open("wb") as file:
|
| 112 |
+
file.write(resp.content)
|
| 113 |
+
except Exception as ex: # noqa: BLE001
|
| 114 |
+
logger.debug("Error download_file url=%s: %r", url, ex)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def _download_results(
|
| 118 |
+
query: str,
|
| 119 |
+
results: list[dict[str, str]],
|
| 120 |
+
function_name: str,
|
| 121 |
+
proxy: str | None = None,
|
| 122 |
+
threads: int | None = None,
|
| 123 |
+
pathname: str | None = None,
|
| 124 |
+
*,
|
| 125 |
+
verify: bool = True,
|
| 126 |
+
) -> None:
|
| 127 |
+
path = pathname if pathname else f"{function_name}_{query}_{datetime.now(tz=timezone.utc):%Y%m%d_%H%M%S}"
|
| 128 |
+
Path(path).mkdir(parents=True, exist_ok=True)
|
| 129 |
+
|
| 130 |
+
threads = 10 if threads is None else threads
|
| 131 |
+
with ThreadPoolExecutor(max_workers=threads) as executor:
|
| 132 |
+
futures = []
|
| 133 |
+
for i, res in enumerate(results, start=1):
|
| 134 |
+
url = res["image"] if function_name == "images" else res["href"]
|
| 135 |
+
filename = unquote(url.split("/")[-1].split("?")[0])
|
| 136 |
+
f = executor.submit(_download_file, url, path, f"{i}_{filename}", proxy, verify=verify)
|
| 137 |
+
futures.append(f)
|
| 138 |
+
|
| 139 |
+
with click.progressbar(
|
| 140 |
+
length=len(futures),
|
| 141 |
+
label="Downloading",
|
| 142 |
+
show_percent=True,
|
| 143 |
+
show_pos=True,
|
| 144 |
+
width=50,
|
| 145 |
+
) as bar:
|
| 146 |
+
for future in as_completed(futures):
|
| 147 |
+
future.result()
|
| 148 |
+
bar.update(1)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@click.group(chain=True)
|
| 152 |
+
def cli() -> None:
|
| 153 |
+
"""DDGS CLI tool."""
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def safe_entry_point() -> None:
|
| 157 |
+
"""Run the CLI tool in try-except block to catch all exceptions."""
|
| 158 |
+
logging.basicConfig(level=logging.WARNING)
|
| 159 |
+
try:
|
| 160 |
+
cli()
|
| 161 |
+
except Exception as ex: # noqa: BLE001
|
| 162 |
+
click.echo(f"{type(ex).__name__}: {ex!r}")
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
@cli.command()
|
| 166 |
+
def version() -> str:
|
| 167 |
+
"""Print and return version."""
|
| 168 |
+
print(__version__) # noqa: T201
|
| 169 |
+
return __version__
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
@cli.command()
|
| 173 |
+
@click.option("-q", "--query", help="text search query")
|
| 174 |
+
@click.option("-k", "--keywords", help="(Deprecated) text search query") # deprecated
|
| 175 |
+
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
|
| 176 |
+
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
|
| 177 |
+
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m", "y"]), help="day, week, month, year")
|
| 178 |
+
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
|
| 179 |
+
@click.option("-p", "--page", default=1, type=int, help="page number of results")
|
| 180 |
+
@click.option(
|
| 181 |
+
"-b",
|
| 182 |
+
"--backend",
|
| 183 |
+
default=["auto"],
|
| 184 |
+
type=click.Choice(
|
| 185 |
+
[
|
| 186 |
+
"auto",
|
| 187 |
+
"all",
|
| 188 |
+
"bing",
|
| 189 |
+
"brave",
|
| 190 |
+
"duckduckgo",
|
| 191 |
+
"google",
|
| 192 |
+
"grokipedia",
|
| 193 |
+
"mojeek",
|
| 194 |
+
"yandex",
|
| 195 |
+
"yahoo",
|
| 196 |
+
"wikipedia",
|
| 197 |
+
],
|
| 198 |
+
),
|
| 199 |
+
multiple=True,
|
| 200 |
+
callback=_convert_tuple_to_csv,
|
| 201 |
+
)
|
| 202 |
+
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
|
| 203 |
+
@click.option("-d", "--download", is_flag=True, default=False, help="download results. -dd to set custom directory")
|
| 204 |
+
@click.option("-dd", "--download-directory", help="Specify custom download directory")
|
| 205 |
+
@click.option("-th", "--threads", default=10, help="download threads, default=10")
|
| 206 |
+
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
|
| 207 |
+
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
|
| 208 |
+
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
|
| 209 |
+
def text(
|
| 210 |
+
query: str,
|
| 211 |
+
keywords: str | None, # deprecated
|
| 212 |
+
region: str,
|
| 213 |
+
safesearch: str,
|
| 214 |
+
timelimit: str | None,
|
| 215 |
+
max_results: int | None,
|
| 216 |
+
page: int,
|
| 217 |
+
backend: str,
|
| 218 |
+
output: str | None,
|
| 219 |
+
download_directory: str | None,
|
| 220 |
+
threads: int,
|
| 221 |
+
proxy: str | None,
|
| 222 |
+
*,
|
| 223 |
+
download: bool,
|
| 224 |
+
verify: bool,
|
| 225 |
+
no_color: bool,
|
| 226 |
+
) -> None:
|
| 227 |
+
"""CLI function to perform a DDGS text metasearch."""
|
| 228 |
+
data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).text(
|
| 229 |
+
query=query,
|
| 230 |
+
keywords=keywords, # deprecated
|
| 231 |
+
region=region,
|
| 232 |
+
safesearch=safesearch,
|
| 233 |
+
timelimit=timelimit,
|
| 234 |
+
max_results=max_results,
|
| 235 |
+
page=page,
|
| 236 |
+
backend=backend,
|
| 237 |
+
)
|
| 238 |
+
query = _sanitize_query(query)
|
| 239 |
+
if output:
|
| 240 |
+
_save_data(query, data, "text", filename=output)
|
| 241 |
+
if download:
|
| 242 |
+
_download_results(
|
| 243 |
+
query,
|
| 244 |
+
data,
|
| 245 |
+
function_name="text",
|
| 246 |
+
proxy=proxy,
|
| 247 |
+
threads=threads,
|
| 248 |
+
verify=verify,
|
| 249 |
+
pathname=download_directory,
|
| 250 |
+
)
|
| 251 |
+
if not output and not download:
|
| 252 |
+
_print_data(data, no_color=no_color)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
@cli.command()
|
| 256 |
+
@click.option("-q", "--query", help="images search query")
|
| 257 |
+
@click.option("-k", "--keywords", help="(Deprecated) images search query") # deprecated
|
| 258 |
+
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
|
| 259 |
+
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
|
| 260 |
+
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m", "y"]))
|
| 261 |
+
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
|
| 262 |
+
@click.option("-p", "--page", default=1, type=int, help="page number of results")
|
| 263 |
+
@click.option(
|
| 264 |
+
"-b",
|
| 265 |
+
"--backend",
|
| 266 |
+
default=["auto"],
|
| 267 |
+
type=click.Choice(["auto", "all", "duckduckgo"]),
|
| 268 |
+
multiple=True,
|
| 269 |
+
callback=_convert_tuple_to_csv,
|
| 270 |
+
)
|
| 271 |
+
@click.option("-size", "--size", type=click.Choice(["Small", "Medium", "Large", "Wallpaper"]))
|
| 272 |
+
@click.option(
|
| 273 |
+
"-c",
|
| 274 |
+
"--color",
|
| 275 |
+
type=click.Choice(
|
| 276 |
+
[
|
| 277 |
+
"color",
|
| 278 |
+
"Monochrome",
|
| 279 |
+
"Red",
|
| 280 |
+
"Orange",
|
| 281 |
+
"Yellow",
|
| 282 |
+
"Green",
|
| 283 |
+
"Blue",
|
| 284 |
+
"Purple",
|
| 285 |
+
"Pink",
|
| 286 |
+
"Brown",
|
| 287 |
+
"Black",
|
| 288 |
+
"Gray",
|
| 289 |
+
"Teal",
|
| 290 |
+
"White",
|
| 291 |
+
],
|
| 292 |
+
),
|
| 293 |
+
)
|
| 294 |
+
@click.option("-type", "--type_image", type=click.Choice(["photo", "clipart", "gif", "transparent", "line"]))
|
| 295 |
+
@click.option("-l", "--layout", type=click.Choice(["Square", "Tall", "Wide"]))
|
| 296 |
+
@click.option(
|
| 297 |
+
"-lic",
|
| 298 |
+
"--license_image",
|
| 299 |
+
type=click.Choice(["any", "Public", "Share", "ShareCommercially", "Modify", "ModifyCommercially"]),
|
| 300 |
+
)
|
| 301 |
+
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
|
| 302 |
+
@click.option("-d", "--download", is_flag=True, default=False, help="download results. -dd to set custom directory")
|
| 303 |
+
@click.option("-dd", "--download-directory", help="Specify custom download directory")
|
| 304 |
+
@click.option("-th", "--threads", default=10, help="download threads, default=10")
|
| 305 |
+
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
|
| 306 |
+
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
|
| 307 |
+
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
|
| 308 |
+
def images(
|
| 309 |
+
query: str,
|
| 310 |
+
keywords: str | None, # deprecated
|
| 311 |
+
region: str,
|
| 312 |
+
safesearch: str,
|
| 313 |
+
timelimit: str | None,
|
| 314 |
+
max_results: int | None,
|
| 315 |
+
page: int,
|
| 316 |
+
backend: str,
|
| 317 |
+
size: str | None,
|
| 318 |
+
color: str | None,
|
| 319 |
+
type_image: str | None,
|
| 320 |
+
layout: str | None,
|
| 321 |
+
license_image: str | None,
|
| 322 |
+
download_directory: str | None,
|
| 323 |
+
threads: int,
|
| 324 |
+
output: str | None,
|
| 325 |
+
proxy: str | None,
|
| 326 |
+
*,
|
| 327 |
+
download: bool,
|
| 328 |
+
verify: bool,
|
| 329 |
+
no_color: bool,
|
| 330 |
+
) -> None:
|
| 331 |
+
"""CLI function to perform a DDGS images metasearch."""
|
| 332 |
+
data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).images(
|
| 333 |
+
query=query,
|
| 334 |
+
keywords=keywords, # deprecated
|
| 335 |
+
region=region,
|
| 336 |
+
safesearch=safesearch,
|
| 337 |
+
timelimit=timelimit,
|
| 338 |
+
max_results=max_results,
|
| 339 |
+
page=page,
|
| 340 |
+
backend=backend,
|
| 341 |
+
size=size,
|
| 342 |
+
color=color,
|
| 343 |
+
type_image=type_image,
|
| 344 |
+
layout=layout,
|
| 345 |
+
license_image=license_image,
|
| 346 |
+
)
|
| 347 |
+
query = _sanitize_query(query)
|
| 348 |
+
if output:
|
| 349 |
+
_save_data(query, data, function_name="images", filename=output)
|
| 350 |
+
if download:
|
| 351 |
+
_download_results(
|
| 352 |
+
query,
|
| 353 |
+
data,
|
| 354 |
+
function_name="images",
|
| 355 |
+
proxy=proxy,
|
| 356 |
+
threads=threads,
|
| 357 |
+
verify=verify,
|
| 358 |
+
pathname=download_directory,
|
| 359 |
+
)
|
| 360 |
+
if not output and not download:
|
| 361 |
+
_print_data(data, no_color=no_color)
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
@cli.command()
|
| 365 |
+
@click.option("-q", "--query", help="videos search query")
|
| 366 |
+
@click.option("-k", "--keywords", help="(Deprecated) videos search query") # deprecated
|
| 367 |
+
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
|
| 368 |
+
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
|
| 369 |
+
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m"]), help="day, week, month")
|
| 370 |
+
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
|
| 371 |
+
@click.option("-p", "--page", default=1, type=int, help="page number of results")
|
| 372 |
+
@click.option(
|
| 373 |
+
"-b",
|
| 374 |
+
"--backend",
|
| 375 |
+
default=["auto"],
|
| 376 |
+
type=click.Choice(["auto", "all", "duckduckgo"]),
|
| 377 |
+
multiple=True,
|
| 378 |
+
callback=_convert_tuple_to_csv,
|
| 379 |
+
)
|
| 380 |
+
@click.option("-res", "--resolution", type=click.Choice(["high", "standart"]))
|
| 381 |
+
@click.option("-d", "--duration", type=click.Choice(["short", "medium", "long"]))
|
| 382 |
+
@click.option("-lic", "--license_videos", type=click.Choice(["creativeCommon", "youtube"]))
|
| 383 |
+
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
|
| 384 |
+
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
|
| 385 |
+
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
|
| 386 |
+
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
|
| 387 |
+
def videos(
|
| 388 |
+
query: str,
|
| 389 |
+
keywords: str | None, # deprecated
|
| 390 |
+
region: str,
|
| 391 |
+
safesearch: str,
|
| 392 |
+
timelimit: str | None,
|
| 393 |
+
max_results: int | None,
|
| 394 |
+
page: int,
|
| 395 |
+
backend: str,
|
| 396 |
+
resolution: str | None,
|
| 397 |
+
duration: str | None,
|
| 398 |
+
license_videos: str | None,
|
| 399 |
+
output: str | None,
|
| 400 |
+
proxy: str | None,
|
| 401 |
+
*,
|
| 402 |
+
verify: bool,
|
| 403 |
+
no_color: bool,
|
| 404 |
+
) -> None:
|
| 405 |
+
"""CLI function to perform a DDGS videos metasearch."""
|
| 406 |
+
data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).videos(
|
| 407 |
+
query=query,
|
| 408 |
+
keywords=keywords, # deprecated
|
| 409 |
+
region=region,
|
| 410 |
+
safesearch=safesearch,
|
| 411 |
+
timelimit=timelimit,
|
| 412 |
+
max_results=max_results,
|
| 413 |
+
page=page,
|
| 414 |
+
backend=backend,
|
| 415 |
+
resolution=resolution,
|
| 416 |
+
duration=duration,
|
| 417 |
+
license_videos=license_videos,
|
| 418 |
+
)
|
| 419 |
+
query = _sanitize_query(query)
|
| 420 |
+
if output:
|
| 421 |
+
_save_data(query, data, function_name="videos", filename=output)
|
| 422 |
+
else:
|
| 423 |
+
_print_data(data, no_color=no_color)
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
@cli.command()
|
| 427 |
+
@click.option("-q", "--query", help="news search query")
|
| 428 |
+
@click.option("-k", "--keywords", help="(Deprecated) news search query") # deprecated
|
| 429 |
+
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
|
| 430 |
+
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
|
| 431 |
+
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m", "y"]), help="day, week, month, year")
|
| 432 |
+
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
|
| 433 |
+
@click.option("-p", "--page", default=1, type=int, help="page number of results")
|
| 434 |
+
@click.option(
|
| 435 |
+
"-b",
|
| 436 |
+
"--backend",
|
| 437 |
+
default=["auto"],
|
| 438 |
+
type=click.Choice(["auto", "all", "bing", "duckduckgo", "yahoo"]),
|
| 439 |
+
multiple=True,
|
| 440 |
+
callback=_convert_tuple_to_csv,
|
| 441 |
+
)
|
| 442 |
+
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
|
| 443 |
+
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
|
| 444 |
+
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
|
| 445 |
+
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
|
| 446 |
+
def news(
|
| 447 |
+
query: str,
|
| 448 |
+
keywords: str | None, # deprecated
|
| 449 |
+
region: str,
|
| 450 |
+
safesearch: str,
|
| 451 |
+
timelimit: str | None,
|
| 452 |
+
max_results: int | None,
|
| 453 |
+
page: int,
|
| 454 |
+
backend: str,
|
| 455 |
+
output: str | None,
|
| 456 |
+
proxy: str | None,
|
| 457 |
+
*,
|
| 458 |
+
verify: bool,
|
| 459 |
+
no_color: bool,
|
| 460 |
+
) -> None:
|
| 461 |
+
"""CLI function to perform a DDGS news metasearch."""
|
| 462 |
+
data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).news(
|
| 463 |
+
query=query,
|
| 464 |
+
keywords=keywords, # deprecated
|
| 465 |
+
region=region,
|
| 466 |
+
safesearch=safesearch,
|
| 467 |
+
timelimit=timelimit,
|
| 468 |
+
max_results=max_results,
|
| 469 |
+
page=page,
|
| 470 |
+
backend=backend,
|
| 471 |
+
)
|
| 472 |
+
query = _sanitize_query(query)
|
| 473 |
+
if output:
|
| 474 |
+
_save_data(query, data, function_name="news", filename=output)
|
| 475 |
+
else:
|
| 476 |
+
_print_data(data, no_color=no_color)
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
@cli.command()
|
| 480 |
+
@click.option("-q", "--query", help="books search query")
|
| 481 |
+
@click.option("-k", "--keywords", help="(Deprecated) books search query") # deprecated
|
| 482 |
+
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
|
| 483 |
+
@click.option("-p", "--page", default=1, type=int, help="page number of results")
|
| 484 |
+
@click.option(
|
| 485 |
+
"-b",
|
| 486 |
+
"--backend",
|
| 487 |
+
default=["auto"],
|
| 488 |
+
type=click.Choice(["auto", "all", "annasarchive"]),
|
| 489 |
+
multiple=True,
|
| 490 |
+
callback=_convert_tuple_to_csv,
|
| 491 |
+
)
|
| 492 |
+
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
|
| 493 |
+
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
|
| 494 |
+
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
|
| 495 |
+
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
|
| 496 |
+
def books(
|
| 497 |
+
query: str,
|
| 498 |
+
keywords: str | None, # deprecated
|
| 499 |
+
max_results: int | None,
|
| 500 |
+
page: int,
|
| 501 |
+
backend: str,
|
| 502 |
+
output: str | None,
|
| 503 |
+
proxy: str | None,
|
| 504 |
+
*,
|
| 505 |
+
verify: bool,
|
| 506 |
+
no_color: bool,
|
| 507 |
+
) -> None:
|
| 508 |
+
"""CLI function to perform a DDGS books metasearch."""
|
| 509 |
+
data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).books(
|
| 510 |
+
query=query,
|
| 511 |
+
keywords=keywords, # deprecated
|
| 512 |
+
max_results=max_results,
|
| 513 |
+
page=page,
|
| 514 |
+
backend=backend,
|
| 515 |
+
)
|
| 516 |
+
if output:
|
| 517 |
+
_save_data(query, data, function_name="books", filename=output)
|
| 518 |
+
else:
|
| 519 |
+
_print_data(data, no_color=no_color)
|
| 520 |
+
|
| 521 |
+
|
| 522 |
+
if __name__ == "__main__":
|
| 523 |
+
safe_entry_point()
|
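Because the commands above are ordinary click commands, they can be exercised in-process without a shell. A minimal sketch using click's test runner, assuming the `cli` group defined earlier in `ddgs/cli.py` and outbound network access (the invocation performs a real metasearch):

```python
# Minimal sketch: drive the `news` command in-process via click's CliRunner.
from click.testing import CliRunner

from ddgs.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["news", "-q", "climate", "-m", "5", "-b", "duckduckgo"])
print(result.exit_code)     # 0 on success
print(result.output[:500])  # first part of the rendered results table
```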
ddgs/ddgs.py
ADDED
@@ -0,0 +1,234 @@
"""DDGS class implementation."""

import logging
import os
from concurrent.futures import ThreadPoolExecutor, wait
from math import ceil
from random import random, shuffle
from types import TracebackType
from typing import Any, ClassVar

from .base import BaseSearchEngine
from .engines import ENGINES
from .exceptions import DDGSException, TimeoutException
from .results import ResultsAggregator
from .similarity import SimpleFilterRanker
from .utils import _expand_proxy_tb_alias

logger = logging.getLogger(__name__)


class DDGS:
    """DDGS | Dux Distributed Global Search.

    A metasearch library that aggregates results from diverse web search services.

    Args:
        proxy: The proxy to use for the search. Defaults to None.
        timeout: The timeout for the search. Defaults to 5.
        verify: bool (True to verify, False to skip) or str path to a PEM file. Defaults to True.

    Attributes:
        threads: The number of threads to use for the search. Defaults to None (automatic).
        _executor: The ThreadPoolExecutor instance.

    Raises:
        DDGSException: If an error occurs during the search.

    Example:
        >>> from ddgs import DDGS
        >>> results = DDGS().text("python")

    """

    threads: ClassVar[int | None] = None
    _executor: ClassVar[ThreadPoolExecutor | None] = None

    def __init__(self, proxy: str | None = None, timeout: int | None = 5, *, verify: bool | str = True) -> None:
        self._proxy = _expand_proxy_tb_alias(proxy) or os.environ.get("DDGS_PROXY")
        self._timeout = timeout
        self._verify = verify
        self._engines_cache: dict[
            type[BaseSearchEngine[Any]], BaseSearchEngine[Any]
        ] = {}  # dict[engine_class, engine_instance]

    def __enter__(self) -> "DDGS":
        """Enter the context manager and return the DDGS instance."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None = None,
        exc_val: BaseException | None = None,
        exc_tb: TracebackType | None = None,
    ) -> None:
        """Exit the context manager."""

    @classmethod
    def get_executor(cls) -> ThreadPoolExecutor:
        """Get a ThreadPoolExecutor instance and cache it."""
        if cls._executor is None:
            cls._executor = ThreadPoolExecutor(max_workers=cls.threads, thread_name_prefix="DDGS")
        return cls._executor

    def _get_engines(
        self,
        category: str,
        backend: str,
    ) -> list[BaseSearchEngine[Any]]:
        """Retrieve a list of search engine instances for a given category and backend.

        Args:
            category: The category of search engines (e.g., 'text', 'images', etc.).
            backend: A single backend or a comma-delimited list of backends. Defaults to "auto".

        Returns:
            A list of initialized search engine instances corresponding to the specified
            category and backend. Instances are cached for reuse.

        """
        if isinstance(backend, list):  # deprecated
            backend = ",".join(backend)
        backend_list = [x.strip() for x in backend.split(",")]
        engine_keys = list(ENGINES[category].keys())
        shuffle(engine_keys)
        if "auto" in backend_list or "all" in backend_list:
            keys = engine_keys
            if category == "text":
                keys = ["wikipedia", "grokipedia"] + [k for k in keys if k not in ("wikipedia", "grokipedia")]
        else:
            keys = backend_list

        try:
            engine_classes = [ENGINES[category][key] for key in keys]
            # Initialize and cache engine instances
            instances = []
            for engine_class in engine_classes:
                # If already cached, use the cached instance
                if engine_class in self._engines_cache:
                    instances.append(self._engines_cache[engine_class])
                # If not cached, create a new instance
                else:
                    engine_instance = engine_class(proxy=self._proxy, timeout=self._timeout, verify=self._verify)
                    self._engines_cache[engine_class] = engine_instance
                    instances.append(engine_instance)

            # Sort by `engine.priority`, breaking ties randomly
            instances.sort(key=lambda e: (e.priority, random()), reverse=True)
        except KeyError as ex:
            logger.warning(
                "%r - backend does not exist or is disabled. Available: %s. Using 'auto'",
                ex,
                ", ".join(sorted(engine_keys)),
            )
            return self._get_engines(category, "auto")
        else:
            return instances

    def _search(  # noqa: C901
        self,
        category: str,
        query: str,
        keywords: str | None = None,  # deprecated
        *,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        max_results: int | None = 10,
        page: int = 1,
        backend: str = "auto",
        **kwargs: str,
    ) -> list[dict[str, Any]]:
        """Perform a search across engines in the given category.

        Args:
            category: The category of search engines (e.g., 'text', 'images', etc.).
            query: The search query.
            keywords: Deprecated alias for `query`.
            region: The region to use for the search (e.g., us-en, uk-en, ru-ru, etc.).
            safesearch: The safesearch setting (e.g., on, moderate, off).
            timelimit: The timelimit for the search (e.g., d, w, m, y) or custom date range.
            max_results: The maximum number of results to return. Defaults to 10.
            page: The page of results to return. Defaults to 1.
            backend: A single backend or a comma-delimited list of backends. Defaults to "auto".
            **kwargs: Additional keyword arguments to pass to the search engines.

        Returns:
            A list of dictionaries containing the search results.

        """
        query = keywords or query
        if not query:
            msg = "query is mandatory."
            raise DDGSException(msg)

        engines = self._get_engines(category, backend)
        len_unique_providers = len({engine.provider for engine in engines})
        seen_providers: set[str] = set()

        # Perform search
        results_aggregator: ResultsAggregator[set[str]] = ResultsAggregator({"href", "image", "url", "embed_url"})
        max_workers = min(len_unique_providers, ceil(max_results / 10) + 1) if max_results else len_unique_providers
        executor = self.get_executor()
        futures, err = {}, None
        for i, engine in enumerate(engines, start=1):
            if engine.provider in seen_providers:
                continue
            future = executor.submit(
                engine.search,
                query,
                region=region,
                safesearch=safesearch,
                timelimit=timelimit,
                page=page,
                **kwargs,
            )
            futures[future] = engine

            if len(futures) >= max_workers or i >= max_workers:
                done, not_done = wait(futures, timeout=self._timeout, return_when="FIRST_EXCEPTION")
                for f, f_engine in futures.items():
                    if f in done:
                        try:
                            if r := f.result():
                                results_aggregator.extend(r)
                                seen_providers.add(f_engine.provider)
                        except Exception as ex:  # noqa: BLE001
                            err = ex
                            logger.info("Error in engine %s: %r", f_engine.name, ex)
                futures = {f: futures[f] for f in not_done}

            if max_results and len(results_aggregator) >= max_results:
                break

        results = results_aggregator.extract_dicts()
        # Rank results
        ranker = SimpleFilterRanker()
        results = ranker.rank(results, query)

        if results:
            return results[:max_results] if max_results else results

        if "timed out" in f"{err}":
            raise TimeoutException(err)
        raise DDGSException(err or "No results found.")

    def text(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a text search."""
        return self._search("text", query, **kwargs)

    def images(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform an image search."""
        return self._search("images", query, **kwargs)

    def news(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a news search."""
        return self._search("news", query, **kwargs)

    def videos(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a video search."""
        return self._search("videos", query, **kwargs)

    def books(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a book search."""
        return self._search("books", query, **kwargs)
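For orientation, here is how the class above is typically driven. A minimal sketch using only the public methods defined in this file; the backend names and result keys follow the signatures and aggregator keys above:

```python
from ddgs import DDGS

# Context-manager use; proxy/timeout/verify are the constructor args above.
with DDGS(timeout=10) as ddgs:
    # Comma-delimited backends are split apart in _get_engines().
    results = ddgs.text("open source metasearch", backend="wikipedia,google", max_results=5)
    for r in results:
        print(r.get("title"), "->", r.get("href"))
```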
ddgs/engines/__init__.py
ADDED
@@ -0,0 +1,94 @@
"""Automatically build the registry of search engines.

This module defines the module-level variable ENGINES, which is a dictionary
of dictionaries. The keys of the outer dictionary are the categories of search
engines, and the keys of the inner dictionaries are the names of the search
engines. The values of the inner dictionaries are the classes of the search
engines.

Search engines are discovered automatically: a class is registered if it is a
subclass of :class:`ddgs.base.BaseSearchEngine`, its name does not start with
"Base", and it does not have a class attribute "disabled" set to True.

The module builds the ENGINES dictionary automatically, so it should not be
imported directly by user code.

Example of resulting dictionary ENGINES:

from .bing import Bing
from .brave import Brave
from .duckduckgo import Duckduckgo
from .duckduckgo_images import DuckduckgoImages
from .duckduckgo_news import DuckduckgoNews
from .duckduckgo_videos import DuckduckgoVideos
from .google import Google
from .mojeek import Mojeek
from .wikipedia import Wikipedia
from .yahoo import Yahoo
from .yandex import Yandex

ENGINES: dict[str, dict[str, type[BaseSearchEngine[Any]]]] = {
    "text": {
        "bing": Bing,
        "brave": Brave,
        "duckduckgo": Duckduckgo,  # bing
        "google": Google,
        "mojeek": Mojeek,
        "yahoo": Yahoo,  # bing
        "yandex": Yandex,
        "wikipedia": Wikipedia,
    },
    "images": {
        "duckduckgo": DuckduckgoImages,
    },
    "news": {
        "duckduckgo": DuckduckgoNews,
    },
    "videos": {
        "duckduckgo": DuckduckgoVideos,
    },
}
"""

import importlib
import inspect
import pkgutil
from collections import defaultdict
from typing import Any

from ddgs.base import BaseSearchEngine

# ENGINES[category][name] = class
ENGINES: dict[str, dict[str, type[BaseSearchEngine[Any]]]] = defaultdict(dict)

package_name = __name__
package = importlib.import_module(package_name)

for finder, modname, _ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
    module_path = finder.path if hasattr(finder, "path") else finder
    module = importlib.import_module(modname)
    for _, cls in inspect.getmembers(module, inspect.isclass):
        # 1) must subclass BaseSearchEngine (but not the base itself)
        if not issubclass(cls, BaseSearchEngine) or cls is BaseSearchEngine:
            continue

        # 2) skip any class whose name starts with "Base"
        if cls.__name__.startswith("Base"):
            continue

        # 3) skip disabled engines
        if getattr(cls, "disabled", False):
            continue

        # 4) ensure they provided name & category
        name = getattr(cls, "name", None)
        category = getattr(cls, "category", None)
        if not isinstance(name, str) or not isinstance(category, str):
            msg = f"{cls.__qualname__} must define class attributes 'name: str' and 'category: str'."
            raise TypeError(msg)

        ENGINES[category][name] = cls

# freeze into normal dicts
ENGINES = {cat: dict(m) for cat, m in ENGINES.items()}
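Given the discovery rules above, adding an engine is a matter of dropping a module with a conforming class into `ddgs/engines/`. A hypothetical minimal sketch (the class name, URL, and xpaths are illustrative placeholders, not part of the library):

```python
# Hypothetical engine, saved as e.g. ddgs/engines/example.py. It is picked up
# automatically because it subclasses BaseSearchEngine, its name does not
# start with "Base", and it is not disabled.
from typing import Any

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult


class Example(BaseSearchEngine[TextResult]):
    """Illustrative engine; URL and xpaths are placeholders."""

    name = "example"
    category = "text"
    provider = "example"

    search_url = "https://search.example.com/search"
    search_method = "GET"

    items_xpath = "//div[@class='result']"
    elements_xpath = {  # noqa: RUF012
        "title": ".//h3//text()",
        "href": ".//a/@href",
        "body": ".//p//text()",
    }

    def build_payload(self, query: str, region: str, safesearch: str,
                      timelimit: str | None, page: int = 1, **kwargs: str) -> dict[str, Any]:
        """Build query parameters for the request."""
        return {"q": query, "page": f"{page}"}
```

After an interpreter restart, `ENGINES["text"]["example"]` would resolve to this class, and `backend="example"` would route to it.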
ddgs/engines/annasarchive.py
ADDED
@@ -0,0 +1,51 @@
"""Anna's Archive search engine implementation."""

from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import BooksResult


class AnnasArchive(BaseSearchEngine[BooksResult]):
    """Anna's Archive search engine."""

    name = "annasarchive"
    category = "books"
    provider = "annasarchive"

    search_url = "https://annas-archive.li/search"
    search_method = "GET"

    items_xpath = "//div[contains(@class, 'record-list-outer')]/div"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//a[contains(@class, 'text-lg')]//text()",
        "author": ".//a[span[contains(@class, 'user')]]//text()",
        "publisher": ".//a[span[contains(@class, 'company')]]//text()",
        "info": ".//div[contains(@class, 'text-gray-800')]/text()",
        "url": "./a/@href",
        "thumbnail": ".//img/@src",
    }

    def build_payload(
        self,
        query: str,
        region: str,  # noqa: ARG002
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        return {"q": query, "page": f"{page}"}

    def pre_process_html(self, html_text: str) -> str:
        """Pre-process the HTML text before parsing it."""
        return html_text.replace("<!--", "").replace("-->", "")

    def post_extract_results(self, results: list[BooksResult]) -> list[BooksResult]:
        """Post-process search results."""
        base_url = self.search_url.split("/search")[0]
        for result in results:
            result.url = f"{base_url}{result.url}"
        return results
ddgs/engines/bing.py
ADDED
@@ -0,0 +1,85 @@
"""Bing search engine implementation."""

import base64
from collections.abc import Mapping
from time import time
from typing import Any, ClassVar
from urllib.parse import parse_qs, urlparse

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult


def unwrap_bing_url(raw_url: str) -> str | None:
    """Decode the Bing-wrapped raw_url to extract the original url."""
    parsed = urlparse(raw_url)
    u_vals = parse_qs(parsed.query).get("u", [])
    if not u_vals:
        return None

    u = u_vals[0]
    if len(u) <= 2:
        return None

    # Drop the first two characters, pad to a multiple of 4, then decode
    b64_part = u[2:]
    padding = "=" * (-len(b64_part) % 4)
    decoded = base64.urlsafe_b64decode(b64_part + padding)
    return decoded.decode()


class Bing(BaseSearchEngine[TextResult]):
    """Bing search engine."""

    disabled = True  # !!!

    name = "bing"
    category = "text"
    provider = "bing"

    search_url = "https://www.bing.com/search"
    search_method = "GET"

    items_xpath = "//li[contains(@class, 'b_algo')]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h2/a//text()",
        "href": ".//h2/a/@href",
        "body": ".//p//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the Bing search request."""
        country, lang = region.lower().split("-")
        payload = {"q": query, "pq": query, "cc": lang}
        cookies = {
            "_EDGE_CD": f"m={lang}-{country}&u={lang}-{country}",
            "_EDGE_S": f"mkt={lang}-{country}&ui={lang}-{country}",
        }
        self.http_client.client.set_cookies("https://www.bing.com", cookies)
        if timelimit:
            d = int(time() // 86400)
            code = f"ez5_{d - 365}_{d}" if timelimit == "y" else "ez" + {"d": "1", "w": "2", "m": "3"}[timelimit]
            payload["filters"] = f'ex1:"{code}"'
        if page > 1:
            payload["first"] = f"{(page - 1) * 10}"
            payload["FORM"] = f"PERE{page - 2 if page > 2 else ''}"
        return payload

    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
        """Post-process search results."""
        post_results = []
        for result in results:
            if result.href.startswith("https://www.bing.com/aclick?"):
                continue
            if result.href.startswith("https://www.bing.com/ck/a?"):
                result.href = unwrap_bing_url(result.href) or result.href
            post_results.append(result)
        return post_results
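To see the redirect unwrapping in isolation, one can round-trip a synthetic `/ck/a?` link; the `a1` prefix and the token below are fabricated for illustration, mimicking the two-character prefix the decoder strips:

```python
import base64

from ddgs.engines.bing import unwrap_bing_url

target = "https://example.com/page?x=1"
# Bing-style wrapping: a two-char prefix plus unpadded urlsafe base64 of the URL.
token = base64.urlsafe_b64encode(target.encode()).decode().rstrip("=")
wrapped = f"https://www.bing.com/ck/a?u=a1{token}&p=other"
assert unwrap_bing_url(wrapped) == target
```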
ddgs/engines/bing_news.py
ADDED
@@ -0,0 +1,86 @@
"""Bing news engine implementation."""

import re
from collections.abc import Mapping
from contextlib import suppress
from datetime import datetime, timedelta, timezone
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import NewsResult

DATE_RE = re.compile(r"\b(\d+)\s*(days|tagen|jours|giorni|dias|días|дн\.|день)?\b", re.IGNORECASE)


def extract_date(pub_date_str: str) -> str:
    """Extract date from string."""
    # Try parsing the date with predefined formats
    date_formats = ["%d.%m.%Y", "%m/%d/%Y", "%d/%m/%Y"]
    for date_format in date_formats:
        with suppress(ValueError):
            return datetime.strptime(pub_date_str, date_format).astimezone(timezone.utc).isoformat()

    # Search for relative date expressions
    match = DATE_RE.search(pub_date_str)
    if match:
        days_ago = int(match.group(1))
        return (datetime.now(timezone.utc) - timedelta(days=days_ago)).replace(microsecond=0).isoformat()

    # Return the original string if no date is found
    return pub_date_str


class BingNews(BaseSearchEngine[NewsResult]):
    """Bing news engine."""

    name = "bing"
    category = "news"
    provider = "bing"

    search_url = "https://www.bing.com/news/infinitescrollajax"
    search_method = "GET"

    items_xpath = "//div[contains(@class, 'newsitem')]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "date": ".//span[@aria-label]//@aria-label",
        "title": "@data-title",
        "body": ".//div[@class='snippet']//text()",
        "url": "@url",
        "image": ".//a[contains(@class, 'image')]//@src",
        "source": "@data-author",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the Bing search request."""
        country, lang = region.lower().split("-")
        payload = {
            "q": query,
            "InfiniteScroll": "1",
            "first": f"{page * 10 + 1}",
            "SFX": f"{page}",
            "cc": country,
            "setlang": lang,
        }
        if timelimit:
            payload["qft"] = {
                "d": 'interval="4"',  # doesn't exist so it's the same as one hour
                "w": 'interval="7"',
                "m": 'interval="9"',
                "y": 'interval="9"',  # doesn't exist so it's the same as month
            }[timelimit]
        return payload

    def post_extract_results(self, results: list[NewsResult]) -> list[NewsResult]:
        """Post-process search results."""
        for result in results:
            result.date = extract_date(result.date)
            result.image = f"https://www.bing.com{result.image.split('&')[0]}" if result.image else ""
        return results
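`extract_date` handles both absolute dates and the multilingual relative forms matched by `DATE_RE`; a quick demonstration (the relative outputs depend on the current date, and the absolute parse is naive, so it is rendered in UTC via the local timezone):

```python
from ddgs.engines.bing_news import extract_date

print(extract_date("31.12.2024"))   # absolute date -> ISO-8601 timestamp in UTC
print(extract_date("3 days"))       # relative -> ISO timestamp three days ago
print(extract_date("vor 2 Tagen"))  # German relative form also matches DATE_RE
print(extract_date("just now"))     # no digits -> returned unchanged
```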
ddgs/engines/brave.py
ADDED
@@ -0,0 +1,47 @@
"""Brave search engine implementation."""

from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult


class Brave(BaseSearchEngine[TextResult]):
    """Brave search engine."""

    name = "brave"
    category = "text"
    provider = "brave"

    search_url = "https://search.brave.com/search"
    search_method = "GET"

    items_xpath = "//div[@data-type='web']"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//div[(contains(@class,'title') or contains(@class,'sitename-container')) and position()=last()]//text()",  # noqa: E501
        "href": ".//a[div[contains(@class, 'title')]]/@href",
        "body": ".//div[contains(@class, 'snippet')]//div[contains(@class, 'content')]//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        payload = {"q": query, "source": "web"}
        country, _lang = region.lower().split("-")
        cookies = {"country": country, "useLocation": "0"}
        if safesearch != "moderate":
            cookies["safesearch"] = "strict" if safesearch == "on" else "off"
        self.http_client.client.set_cookies("https://search.brave.com", cookies)
        if timelimit:
            payload["tf"] = {"d": "pd", "w": "pw", "m": "pm", "y": "py"}[timelimit]
        if page > 1:
            payload["offset"] = f"{page - 1}"
        return payload
ddgs/engines/duckduckgo.py
ADDED
@@ -0,0 +1,56 @@
"""Duckduckgo search engine implementation."""

from collections.abc import Mapping
from typing import Any, ClassVar, TypeVar

from fake_useragent import UserAgent

from ddgs.base import BaseSearchEngine
from ddgs.http_client2 import HttpClient2
from ddgs.results import TextResult

ua = UserAgent()

T = TypeVar("T")


class Duckduckgo(BaseSearchEngine[TextResult]):
    """Duckduckgo search engine."""

    name = "duckduckgo"
    category = "text"
    provider = "bing"

    search_url = "https://html.duckduckgo.com/html/"
    search_method = "POST"

    items_xpath = "//div[contains(@class, 'body')]"
    elements_xpath: ClassVar[Mapping[str, str]] = {"title": ".//h2//text()", "href": "./a/@href", "body": "./a//text()"}

    headers: ClassVar[dict[str, str]] = {"User-Agent": ua.random}

    def __init__(self, proxy: str | None = None, timeout: int | None = None, *, verify: bool = True) -> None:
        """Temporary, delete when HttpClient is fixed."""
        self.http_client = HttpClient2(headers=self.headers, proxy=proxy, timeout=timeout, verify=verify)  # type: ignore[assignment]
        self.results: list[T] = []  # type: ignore[valid-type]

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        payload = {"q": query, "b": "", "l": region}
        if page > 1:
            payload["s"] = f"{10 + (page - 2) * 15}"
        if timelimit:
            payload["df"] = timelimit
        return payload

    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
        """Post-process search results."""
        return [r for r in results if not r.href.startswith("https://duckduckgo.com/y.js?")]
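The `s` offset formula in `build_payload` reflects the endpoint's uneven page sizes; a quick check of the values it produces (the page sizes are inferred from the formula itself, not documented by DuckDuckGo):

```python
# Offsets sent as "s" for pages 2..4: the first page has no offset,
# page 2 starts at 10, and each later page advances by 15.
for page in range(2, 5):
    print(page, 10 + (page - 2) * 15)  # 2 -> 10, 3 -> 25, 4 -> 40
```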
ddgs/engines/duckduckgo_images.py
ADDED
@@ -0,0 +1,85 @@
"""Duckduckgo images search engine implementation."""

import json
from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import ImagesResult
from ddgs.utils import _extract_vqd


class DuckduckgoImages(BaseSearchEngine[ImagesResult]):
    """Duckduckgo images search engine."""

    name = "duckduckgo"
    category = "images"
    provider = "bing"

    search_url = "https://duckduckgo.com/i.js"
    search_method = "GET"
    search_headers: ClassVar[Mapping[str, str]] = {"Referer": "https://duckduckgo.com/", "Sec-Fetch-Mode": "cors"}

    elements_replace: ClassVar[Mapping[str, str]] = {
        "title": "title",
        "image": "image",
        "thumbnail": "thumbnail",
        "url": "url",
        "height": "height",
        "width": "width",
        "source": "source",
    }

    def _get_vqd(self, query: str) -> str:
        """Get vqd value for a search query using DuckDuckGo."""
        resp_content = self.http_client.request("GET", "https://duckduckgo.com", params={"q": query}).content
        return _extract_vqd(resp_content, query)

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        safesearch_base = {"on": "1", "moderate": "1", "off": "-1"}
        timelimit_base = {"d": "Day", "w": "Week", "m": "Month", "y": "Year"}
        timelimit = f"time:{timelimit_base[timelimit]}" if timelimit else ""
        size = kwargs.get("size")
        size = f"size:{size}" if size else ""
        color = kwargs.get("color")
        color = f"color:{color}" if color else ""
        type_image = kwargs.get("type_image")
        type_image = f"type:{type_image}" if type_image else ""
        layout = kwargs.get("layout")
        layout = f"layout:{layout}" if layout else ""
        license_image = kwargs.get("license_image")
        license_image = f"license:{license_image}" if license_image else ""
        payload = {
            "o": "json",
            "q": query,
            "l": region,
            "vqd": self._get_vqd(query),
            "p": safesearch_base[safesearch.lower()],
        }
        if timelimit or size or color or type_image or layout or license_image:
            payload["f"] = f"{timelimit},{size},{color},{type_image},{layout},{license_image}"
        if page > 1:
            payload["s"] = f"{(page - 1) * 100}"
        return payload

    def extract_results(self, html_text: str) -> list[ImagesResult]:
        """Extract search results from the JSON response."""
        json_data = json.loads(html_text)
        items = json_data.get("results", [])
        results = []
        for item in items:
            result = ImagesResult()
            for key, value in self.elements_replace.items():
                data = item.get(key)
                setattr(result, value, data)
            results.append(result)
        return results
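The extra keyword filters above surface through `DDGS.images()` as plain kwargs. A minimal sketch; the filter values here are illustrative and simply get spliced into the `f` parameter as `build_payload` shows:

```python
from ddgs import DDGS

results = DDGS().images(
    "aurora borealis",
    safesearch="off",
    timelimit="m",           # becomes "time:Month" in the f-parameter
    size="Large",            # becomes "size:Large" (illustrative value)
    license_image="Public",  # becomes "license:Public" (illustrative value)
    max_results=10,
)
print(results[0]["image"], results[0]["width"], "x", results[0]["height"])
```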
ddgs/engines/duckduckgo_news.py
ADDED
@@ -0,0 +1,72 @@
"""Duckduckgo news search engine implementation."""

import json
from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import NewsResult
from ddgs.utils import _extract_vqd


class DuckduckgoNews(BaseSearchEngine[NewsResult]):
    """Duckduckgo news search engine."""

    name = "duckduckgo"
    category = "news"
    provider = "bing"

    search_url = "https://duckduckgo.com/news.js"
    search_method = "GET"

    elements_replace: ClassVar[Mapping[str, str]] = {
        "date": "date",
        "title": "title",
        "excerpt": "body",
        "url": "url",
        "image": "image",
        "source": "source",
    }

    def _get_vqd(self, query: str) -> str:
        """Get vqd value for a search query using DuckDuckGo."""
        resp_content = self.http_client.request("GET", "https://duckduckgo.com", params={"q": query}).content
        return _extract_vqd(resp_content, query)

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
        payload = {
            "l": region,
            "o": "json",
            "noamp": "1",
            "q": query,
            "vqd": self._get_vqd(query),
            "p": safesearch_base[safesearch.lower()],
        }
        if timelimit:
            payload["df"] = timelimit
        if page > 1:
            payload["s"] = f"{(page - 1) * 30}"
        return payload

    def extract_results(self, html_text: str) -> list[NewsResult]:
        """Extract search results from the JSON response."""
        json_data = json.loads(html_text)
        items = json_data.get("results", [])
        results = []
        for item in items:
            result = NewsResult()
            for key, value in self.elements_replace.items():
                data = item.get(key)
                setattr(result, value, data)
            results.append(result)
        return results
ddgs/engines/duckduckgo_videos.py
ADDED
@@ -0,0 +1,84 @@
"""Duckduckgo videos search engine implementation."""

import json
from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import VideosResult
from ddgs.utils import _extract_vqd


class DuckduckgoVideos(BaseSearchEngine[VideosResult]):
    """Duckduckgo videos search engine."""

    name = "duckduckgo"
    category = "videos"
    provider = "bing"

    search_url = "https://duckduckgo.com/v.js"
    search_method = "GET"

    elements_replace: ClassVar[Mapping[str, str]] = {
        "content": "content",
        "description": "description",
        "duration": "duration",
        "embed_html": "embed_html",
        "embed_url": "embed_url",
        "image_token": "image_token",
        "images": "images",
        "provider": "provider",
        "published": "published",
        "publisher": "publisher",
        "statistics": "statistics",
        "title": "title",
        "uploader": "uploader",
    }

    def _get_vqd(self, query: str) -> str:
        """Get vqd value for a search query using DuckDuckGo."""
        resp_content = self.http_client.request("GET", "https://duckduckgo.com", params={"q": query}).content
        return _extract_vqd(resp_content, query)

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
        timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
        resolution = kwargs.get("resolution")
        duration = kwargs.get("duration")
        license_videos = kwargs.get("license_videos")
        resolution = f"videoDefinition:{resolution}" if resolution else ""
        duration = f"videoDuration:{duration}" if duration else ""
        license_videos = f"videoLicense:{license_videos}" if license_videos else ""
        payload = {
            "l": region,
            "o": "json",
            "q": query,
            "vqd": self._get_vqd(query),
            "f": f"{timelimit},{resolution},{duration},{license_videos}",
            "p": safesearch_base[safesearch.lower()],
        }
        if page > 1:
            payload["s"] = f"{(page - 1) * 60}"
        return payload

    def extract_results(self, html_text: str) -> list[VideosResult]:
        """Extract search results from the JSON response."""
        json_data = json.loads(html_text)
        items = json_data.get("results", [])
        results = []
        for item in items:
            result = VideosResult()
            for key, value in self.elements_replace.items():
                data = item.get(key)
                setattr(result, value, data)
            results.append(result)
        return results
ddgs/engines/google.py
ADDED
@@ -0,0 +1,95 @@
"""Google search engine implementation."""

from collections.abc import Mapping
from random import SystemRandom
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult

random = SystemRandom()


def get_ua() -> str:
    """Return one random User-Agent string."""
    patterns = [
        "Opera/9.80 (J2ME/MIDP; Opera Mini/{v}/{b}; U; {l}) Presto/{p} Version/{f}",
        "Opera/9.80 (Android; Linux; Opera Mobi/{b}; U; {l}) Presto/{p} Version/{f}",
        "Opera/9.80 (iPhone; Opera Mini/{v}/{b}; U; {l}) Presto/{p} Version/{f}",
        "Opera/9.80 (iPad; Opera Mini/{v}/{b}; U; {l}) Presto/{p} Version/{f}",
    ]
    mini_versions = ["4.0", "5.0.17381", "7.1.32444", "9.80"]
    mobi_builds = ["27", "447", "ADR-1011151731"]
    builds = ["18.678", "24.743", "503"]
    prestos = ["2.6.35", "2.7.60", "2.8.119"]
    finals = ["10.00", "11.10", "12.16"]
    langs = ["en-US", "en-GB", "de-DE", "fr-FR", "es-ES", "ru-RU", "zh-CN"]
    fallback = "Opera/9.80 (iPad; Opera Mini/5.0.17381/503; U; eu) Presto/2.6.35 Version/11.10"

    try:
        p = random.choice(patterns)
        vals = {
            "l": random.choice(langs),
            "p": random.choice(prestos),
            "f": random.choice(finals),
        }
        if "{v}" in p:
            vals["v"] = random.choice(mini_versions)
        if "{b}" in p:
            vals["b"] = random.choice(mobi_builds) if "Opera Mobi" in p else random.choice(builds)
        return p.format(**vals)
    except Exception:  # noqa: BLE001
        return fallback


class Google(BaseSearchEngine[TextResult]):
    """Google search engine."""

    name = "google"
    category = "text"
    provider = "google"

    search_url = "https://www.google.com/search"
    search_method = "GET"
    search_headers: ClassVar[dict[str, str]] = {"User-Agent": get_ua()}

    items_xpath = "//div[div[@data-hveid]//div[h3]]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h3//text()",
        "href": ".//a/@href",
        "body": "./div/div/div[2]//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the Google search request."""
        safesearch_base = {"on": "2", "moderate": "1", "off": "0"}
        start = (page - 1) * 10
        payload = {
            "q": query,
            "filter": safesearch_base[safesearch.lower()],
            "start": str(start),
        }
        country, lang = region.split("-")
        payload["hl"] = f"{lang}-{country.upper()}"  # interface language
        payload["lr"] = f"lang_{lang}"  # restricts to results written in a particular language
        payload["cr"] = f"country{country.upper()}"  # restricts to results originating in a particular country
        if timelimit:
            payload["tbs"] = f"qdr:{timelimit}"
        return payload

    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
        """Post-process search results."""
        post_results = []
        for result in results:
            if result.href.startswith("/url?q="):
                result.href = result.href.split("?q=")[1].split("&")[0]
            post_results.append(result)
        return post_results
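The `/url?q=` handling in `post_extract_results` is plain string surgery; in isolation, with a synthetic redirect link for illustration:

```python
# Synthetic Google redirect link, unwrapped the same way post_extract_results does.
href = "/url?q=https://example.com/article&sa=U&ved=abc123"
if href.startswith("/url?q="):
    href = href.split("?q=")[1].split("&")[0]
print(href)  # https://example.com/article
```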
ddgs/engines/grokipedia.py
ADDED
@@ -0,0 +1,49 @@
"""Grokipedia text search engine."""

import json
import logging
from typing import Any

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult

logger = logging.getLogger(__name__)


class Grokipedia(BaseSearchEngine[TextResult]):
    """Grokipedia text search engine."""

    name = "grokipedia"
    category = "text"
    provider = "grokipedia"
    priority = 1.9

    search_url = "https://grokipedia.com/api/typeahead"
    search_method = "GET"

    def build_payload(
        self,
        query: str,
        region: str,  # noqa: ARG002
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,  # noqa: ARG002
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        payload: dict[str, Any] = {"query": query, "limit": "1"}
        return payload

    def extract_results(self, html_text: str) -> list[TextResult]:
        """Extract search results from the JSON response."""
        json_data = json.loads(html_text)
        items = json_data.get("results", [])
        if not items:
            return []

        result = TextResult()
        result.title = items[0].get("title", "").strip("_")
        body = items[0].get("snippet", "")
        result.body = body.split("\n\n", 1)[1] if "\n\n" in body else body
        result.href = f"https://grokipedia.com/page/{items[0]['slug']}"
        return [result]
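The snippet handling above drops a leading title block when the API returns one; the split logic in isolation, with a fabricated snippet:

```python
# Grokipedia snippets may lead with a heading separated by a blank line;
# extract_results keeps only the prose after the first "\n\n".
snippet = "Python (programming language)\n\nPython is a high-level language..."
body = snippet.split("\n\n", 1)[1] if "\n\n" in snippet else snippet
print(body)  # Python is a high-level language...
```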
ddgs/engines/mojeek.py
ADDED
@@ -0,0 +1,52 @@
"""Mojeek search engine implementation."""

from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult


class Mojeek(BaseSearchEngine[TextResult]):
    """Mojeek search engine."""

    name = "mojeek"
    category = "text"
    provider = "mojeek"

    search_url = "https://www.mojeek.com/search"
    search_method = "GET"

    items_xpath = "//ul[contains(@class, 'results')]/li"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h2//text()",
        "href": ".//h2/a/@href",
        "body": ".//p[@class='s']//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        country, lang = region.lower().split("-")
        cookies = {
            "arc": country,
            "lb": lang,
        }
        self.http_client.client.set_cookies("https://www.mojeek.com", cookies)
        payload = {
            "q": query,
            # "tlen": f"{randint(68, 128)}",  # Title length limit (default=68, max=128) # noqa: ERA001
            # "dlen": f"{randint(160, 512)}",  # Description length limit (default=160, max=512) # noqa: ERA001
        }
        if safesearch == "on":
            payload["safe"] = "1"
        if page > 1:
            payload["s"] = f"{(page - 1) * 10 + 1}"
        return payload
ddgs/engines/wikipedia.py
ADDED
@@ -0,0 +1,66 @@
"""Wikipedia text search engine."""

import json
import logging
from typing import Any
from urllib.parse import quote

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult

logger = logging.getLogger(__name__)


class Wikipedia(BaseSearchEngine[TextResult]):
    """Wikipedia text search engine."""

    name = "wikipedia"
    category = "text"
    provider = "wikipedia"
    priority = 2

    search_url = "https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={query}"
    search_method = "GET"

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,  # noqa: ARG002
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        _country, lang = region.lower().split("-")
        encoded_query = quote(query)
        self.search_url = (
            f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&profile=fuzzy&limit=1&search={encoded_query}"
        )
        payload: dict[str, Any] = {}
        self.lang = lang  # used in extract_results
        return payload

    def extract_results(self, html_text: str) -> list[TextResult]:
        """Extract search results from html text."""
        json_data = json.loads(html_text)
        if not json_data[1]:
            return []

        result = TextResult()
        result.title = json_data[1][0]
        result.href = json_data[3][0]

        # Add body
        encoded_query = quote(result.title)
        resp_data = self.request(
            "GET",
            f"https://{self.lang}.wikipedia.org/w/api.php?action=query&format=json&prop=extracts&titles={encoded_query}&explaintext=0&exintro=0&redirects=1",
        )
        if resp_data:
            json_data = json.loads(resp_data)
            result.body = next(iter(json_data["query"]["pages"].values())).get("extract", "")
            if "may refer to:" in result.body:
                return []

        return [result]
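The indexing in `extract_results` follows the MediaWiki `opensearch` response shape, a four-element array of `[query, titles, descriptions, urls]`. A small illustration with a canned response:

```python
import json

# Canned MediaWiki opensearch response: [query, titles, descriptions, urls].
raw = '["python", ["Python (programming language)"], [""], ["https://en.wikipedia.org/wiki/Python_(programming_language)"]]'
data = json.loads(raw)
title, href = data[1][0], data[3][0]  # same indices the engine reads
print(title, "->", href)
```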
ddgs/engines/yahoo.py ADDED
@@ -0,0 +1,64 @@
+"""Yahoo search engine."""
+
+from collections.abc import Mapping
+from secrets import token_urlsafe
+from typing import Any, ClassVar
+from urllib.parse import unquote_plus
+
+from ddgs.base import BaseSearchEngine
+from ddgs.results import TextResult
+
+
+def extract_url(u: str) -> str:
+    """Sanitize url."""
+    t = u.split("/RU=", 1)[1]
+    return unquote_plus(t.split("/RK=", 1)[0].split("/RS=", 1)[0])
+
+
+class Yahoo(BaseSearchEngine[TextResult]):
+    """Yahoo search engine."""
+
+    name = "yahoo"
+    category = "text"
+    provider = "bing"
+
+    search_url = "https://search.yahoo.com/search"
+    search_method = "GET"
+
+    items_xpath = "//div[contains(@class, 'relsrch')]"
+    elements_xpath: ClassVar[Mapping[str, str]] = {
+        "title": ".//div[contains(@class, 'Title')]//h3//text()",
+        "href": ".//div[contains(@class, 'Title')]//a/@href",
+        "body": ".//div[contains(@class, 'Text')]//text()",
+    }
+
+    def build_payload(
+        self,
+        query: str,
+        region: str,  # noqa: ARG002
+        safesearch: str,  # noqa: ARG002
+        timelimit: str | None,
+        page: int = 1,
+        **kwargs: str,  # noqa: ARG002
+    ) -> dict[str, Any]:
+        """Build a payload for the search request."""
+        self.search_url = (
+            f"https://search.yahoo.com/search;_ylt={token_urlsafe(24 * 3 // 4)};_ylu={token_urlsafe(47 * 3 // 4)}"
+        )
+        payload = {"p": query}
+        if page > 1:
+            payload["b"] = f"{(page - 1) * 7 + 1}"
+        if timelimit:
+            payload["btf"] = timelimit
+        return payload
+
+    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
+        """Post-process search results."""
+        post_results = []
+        for result in results:
+            if result.href.startswith("https://www.bing.com/aclick?"):
+                continue
+            if "/RU=" in result.href:
+                result.href = extract_url(result.href)
+            post_results.append(result)
+        return post_results
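`extract_url` works because Yahoo wraps outbound links in a redirector that embeds the percent-encoded target between `/RU=` and `/RK=`. A quick demo with a hypothetical redirect URL built in that shape:

```python
from urllib.parse import quote_plus, unquote_plus

def extract_url(u: str) -> str:
    """Same logic as the module-level extract_url above."""
    t = u.split("/RU=", 1)[1]
    return unquote_plus(t.split("/RK=", 1)[0].split("/RS=", 1)[0])

# Hypothetical redirect URL in the shape Yahoo produces.
wrapped = "https://r.search.yahoo.com/_ylt=Abc123/RU=" + quote_plus("https://example.com/page") + "/RK=2/RS=xYz-"
print(extract_url(wrapped))  # https://example.com/page
```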
ddgs/engines/yahoo_news.py ADDED
@@ -0,0 +1,104 @@
+"""Yahoo! News search engine."""
+
+import logging
+import re
+from collections.abc import Callable, Mapping
+from datetime import datetime, timedelta, timezone
+from typing import Any, ClassVar
+from urllib.parse import unquote_plus
+
+from ddgs.base import BaseSearchEngine
+from ddgs.results import NewsResult
+
+logger = logging.getLogger(__name__)
+
+DATE_RE = re.compile(r"\b(\d+)\s*(year|month|week|day|hour|minute)s?\b", re.IGNORECASE)
+DATE_UNITS: dict[str, Callable[[int], timedelta]] = {
+    "minute": lambda n: timedelta(minutes=n),
+    "hour": lambda n: timedelta(hours=n),
+    "day": lambda n: timedelta(days=n),
+    "week": lambda n: timedelta(weeks=n),
+    "month": lambda n: timedelta(days=30 * n),
+    "year": lambda n: timedelta(days=365 * n),
+}
+
+
+def extract_date(pub_date_str: str) -> str:
+    """Extract date from string."""
+    now = datetime.now(timezone.utc)
+    m = DATE_RE.search(pub_date_str)
+    if not m:
+        return pub_date_str
+
+    number = int(m.group(1))
+    unit = m.group(2).lower()
+    delta = DATE_UNITS[unit](number)
+    dt = (now - delta).replace(microsecond=0)
+    return dt.isoformat()
+
+
+def extract_url(u: str) -> str:
+    """Sanitize url."""
+    url = u.split("/RU=", 1)[1].split("/RK=", 1)[0].split("?", 1)[0]
+    return unquote_plus(url)
+
+
+def extract_image(u: str) -> str:
+    """Sanitize image url."""
+    idx = u.find("-/")
+    return u[idx + 2 :] if idx != -1 else u
+
+
+def extract_source(s: str) -> str:
+    """Remove ' via Yahoo' from string."""
+    return s.split(" · via Yahoo")[0]
+
+
+class YahooNews(BaseSearchEngine[NewsResult]):
+    """Yahoo news search engine."""
+
+    name = "yahoo"
+    category = "news"
+    provider = "yahoo"
+
+    search_url = "https://news.search.yahoo.com/search"
+    search_method = "GET"
+
+    items_xpath = "//div[@id='web']//li[a]"
+    elements_xpath: ClassVar[Mapping[str, str]] = {
+        "date": ".//span[contains(@class, 'time')]//text()",
+        "title": ".//h4//text()",
+        "body": ".//p//text()",
+        "url": ".//h4/a/@href",
+        "image": "(.//img/@data-src | .//img/@src)[1]",
+        "source": ".//span[contains(@class, 'source')]//text()",
+    }
+
+    def build_payload(
+        self,
+        query: str,
+        region: str,  # noqa: ARG002
+        safesearch: str,  # noqa: ARG002
+        timelimit: str | None,
+        page: int = 1,
+        **kwargs: str,  # noqa: ARG002
+    ) -> dict[str, Any]:
+        """Build a payload for the search request."""
+        payload = {"p": query}
+        if page > 1:
+            payload["b"] = f"{(page - 1) * 10 + 1}"
+        if timelimit:
+            payload["btf"] = timelimit
+        return payload
+
+    def post_extract_results(self, results: list[NewsResult]) -> list[NewsResult]:
+        """Post-process search results."""
+        try:
+            for result in results:
+                result.date = extract_date(result.date)
+                result.url = extract_url(result.url)
+                result.image = extract_image(result.image)
+                result.source = extract_source(result.source)
+        except Exception as ex:  # noqa: BLE001
+            logger.warning("Error post-processing results: %r", ex)
+        return results
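Yahoo News reports publication times as relative strings such as "3 hours ago", which `extract_date` maps onto absolute UTC timestamps. A self-contained rerun of the same regex-plus-timedelta idea (months and years left out for brevity; non-relative strings pass through unchanged):

```python
import re
from datetime import datetime, timedelta, timezone

DATE_RE = re.compile(r"\b(\d+)\s*(year|month|week|day|hour|minute)s?\b", re.IGNORECASE)
KWARGS = {"minute": "minutes", "hour": "hours", "day": "days", "week": "weeks"}

def extract_date(pub_date_str: str) -> str:
    """Same idea as the module's extract_date above (months/years omitted here)."""
    m = DATE_RE.search(pub_date_str)
    if not m or m.group(2).lower() not in KWARGS:
        return pub_date_str  # not a (supported) relative time -> pass through
    number, unit = int(m.group(1)), m.group(2).lower()
    dt = datetime.now(timezone.utc) - timedelta(**{KWARGS[unit]: number})
    return dt.replace(microsecond=0).isoformat()

print(extract_date("3 hours ago"))  # e.g. 2025-06-01T09:00:00+00:00
print(extract_date("2024-05-01"))   # unchanged
```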
ddgs/engines/yandex.py ADDED
@@ -0,0 +1,47 @@
+"""Yandex search engine."""
+
+from collections.abc import Mapping
+from random import SystemRandom
+from typing import Any, ClassVar
+
+from ddgs.base import BaseSearchEngine
+from ddgs.results import TextResult
+
+random = SystemRandom()
+
+
+class Yandex(BaseSearchEngine[TextResult]):
+    """Yandex search engine."""
+
+    name = "yandex"
+    category = "text"
+    provider = "yandex"
+
+    search_url = "https://yandex.com/search/site/"
+    search_method = "GET"
+
+    items_xpath = "//li[contains(@class, 'serp-item')]"
+    elements_xpath: ClassVar[Mapping[str, str]] = {
+        "title": ".//h3//text()",
+        "href": ".//h3//a/@href",
+        "body": ".//div[contains(@class, 'text')]//text()",
+    }
+
+    def build_payload(
+        self,
+        query: str,
+        region: str,  # noqa: ARG002
+        safesearch: str,  # noqa: ARG002
+        timelimit: str | None,  # noqa: ARG002
+        page: int = 1,
+        **kwargs: str,  # noqa: ARG002
+    ) -> dict[str, Any]:
+        """Build a payload for the search request."""
+        payload = {
+            "text": query,
+            "web": "1",
+            "searchid": f"{random.randint(1000000, 9999999)}",
+        }
+        if page > 1:
+            payload["p"] = f"{page - 1}"
+        return payload
ddgs/exceptions.py ADDED
@@ -0,0 +1,13 @@
+"""DDGS exceptions."""
+
+
+class DDGSException(Exception):
+    """Base exception class for ddgs."""
+
+
+class RatelimitException(DDGSException):
+    """Raised for rate limit exceeded errors during API requests."""
+
+
+class TimeoutException(DDGSException):
+    """Raised for timeout errors during API requests."""
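The small hierarchy lets callers catch `DDGSException` for any library error, or the subclasses to treat timeouts and rate limits separately. A minimal usage sketch:

```python
from ddgs import DDGS
from ddgs.exceptions import DDGSException, RatelimitException, TimeoutException

try:
    results = DDGS().text("python")
except TimeoutException:
    results = []  # transient: retry later
except RatelimitException:
    results = []  # back off before retrying
except DDGSException as ex:  # catch-all for any other library error
    raise SystemExit(f"search failed: {ex}")
```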
ddgs/http_client.py ADDED
@@ -0,0 +1,78 @@
+"""HTTP client."""
+
+import logging
+from secrets import choice
+from typing import Any, Final, Literal, get_args
+
+import primp
+
+from .exceptions import DDGSException, TimeoutException
+
+logger = logging.getLogger(__name__)
+
+
+class Response:
+    """HTTP response."""
+
+    __slots__ = ("content", "status_code", "text")
+
+    def __init__(self, status_code: int, content: bytes, text: str) -> None:
+        self.status_code = status_code
+        self.content = content
+        self.text = text
+
+
+class HttpClient:
+    """HTTP client."""
+
+    _impersonates: Final = get_args(Literal[
+        "chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_107",
+        "chrome_108", "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118",
+        "chrome_119", "chrome_120", "chrome_123", "chrome_124", "chrome_126", "chrome_127",
+        "chrome_128", "chrome_129", "chrome_130", "chrome_131", "chrome_133",
+        "safari_15.3", "safari_15.5", "safari_15.6.1", "safari_16", "safari_16.5",
+        "safari_17.0", "safari_17.2.1", "safari_17.4.1", "safari_17.5",
+        "safari_18", "safari_18.2",
+        "edge_101", "edge_122", "edge_127", "edge_131",
+        "firefox_109", "firefox_117", "firefox_128", "firefox_133", "firefox_135",
+    ])  # fmt: skip
+    _impersonates_os: Final = get_args(Literal["macos", "linux", "windows"])
+
+    def __init__(self, proxy: str | None = None, timeout: int | None = 10, *, verify: bool | str = True) -> None:
+        """Initialize the HttpClient object.
+
+        Args:
+            proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols.
+                example: "http://user:pass@example.com:3128". Defaults to None.
+            timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
+            verify (bool | str): True to verify, False to skip, or a str path to a PEM file. Defaults to True.
+
+        """
+        self.client = primp.Client(
+            proxy=proxy,
+            timeout=timeout,
+            impersonate=choice(self._impersonates),
+            impersonate_os=choice(self._impersonates_os),
+            verify=verify if isinstance(verify, bool) else True,
+            ca_cert_file=verify if isinstance(verify, str) else None,
+        )
+
+    def request(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a request to the HTTP client."""
+        try:
+            resp = self.client.request(*args, **kwargs)
+            return Response(status_code=resp.status_code, content=resp.content, text=resp.text)
+        except Exception as ex:
+            if "timed out" in f"{ex}":
+                msg = f"Request timed out: {ex!r}"
+                raise TimeoutException(msg) from ex
+            msg = f"{type(ex).__name__}: {ex!r}"
+            raise DDGSException(msg) from ex
+
+    def get(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a GET request to the HTTP client."""
+        return self.request(*args, method="GET", **kwargs)
+
+    def post(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a POST request to the HTTP client."""
+        return self.request(*args, method="POST", **kwargs)
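Usage is intentionally slim: construct once (each instance picks a random browser and OS fingerprint for primp to impersonate), then call `get`/`post` and read the `Response` wrapper. A minimal sketch:

```python
from ddgs.http_client import HttpClient

client = HttpClient(timeout=5)  # random impersonation chosen at construction
resp = client.get("https://example.com")
print(resp.status_code, len(resp.content))
```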
ddgs/http_client2.py ADDED
@@ -0,0 +1,151 @@
+"""Temporary HTTP client for 'backend=duckduckgo'. Delete when HttpClient is fixed."""
+
+import logging
+import ssl
+from random import SystemRandom
+from types import TracebackType
+from typing import TYPE_CHECKING, Any
+
+import h2
+import httpcore
+import httpx
+
+from .exceptions import DDGSException, TimeoutException
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
+logger = logging.getLogger(__name__)
+random = SystemRandom()
+
+
+class Response:
+    """HTTP response."""
+
+    __slots__ = ("content", "status_code", "text")
+
+    def __init__(self, status_code: int, content: bytes, text: str) -> None:
+        self.status_code = status_code
+        self.content = content
+        self.text = text
+
+
+class HttpClient2:
+    """Temporary HTTP client."""
+
+    def __init__(
+        self,
+        headers: dict[str, str] | None = None,
+        proxy: str | None = None,
+        timeout: int | None = 10,
+        *,
+        verify: bool | str = True,
+    ) -> None:
+        """Initialize the HttpClient object.
+
+        Args:
+            headers (dict, optional): headers for the HTTP client.
+            proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols.
+                example: "http://user:pass@example.com:3128". Defaults to None.
+            timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
+            verify (bool | str): True to verify, False to skip, or a str path to a PEM file. Defaults to True.
+
+        """
+        self.client = httpx.Client(
+            headers=headers,
+            proxy=proxy,
+            timeout=timeout,
+            verify=_get_random_ssl_context(verify=verify) if verify else False,
+            follow_redirects=False,
+            http2=True,
+        )
+
+    def request(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a request to the HTTP client."""
+        with Patch():
+            try:
+                resp = self.client.request(*args, **kwargs)
+                return Response(status_code=resp.status_code, content=resp.content, text=resp.text)
+            except Exception as ex:
+                if "timed out" in f"{ex}":
+                    msg = f"Request timed out: {ex!r}"
+                    raise TimeoutException(msg) from ex
+                msg = f"{type(ex).__name__}: {ex!r}"
+                raise DDGSException(msg) from ex
+
+    def get(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a GET request to the HTTP client."""
+        return self.request(*args, method="GET", **kwargs)
+
+    def post(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a POST request to the HTTP client."""
+        return self.request(*args, method="POST", **kwargs)
+
+
+# SSL
+DEFAULT_CIPHERS = [  # https://developers.cloudflare.com/ssl/reference/cipher-suites/recommendations/
+    "TLS_AES_128_GCM_SHA256", "TLS_AES_256_GCM_SHA384", "TLS_CHACHA20_POLY1305_SHA256",
+    # Modern:
+    "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-CHACHA20-POLY1305", "ECDHE-RSA-AES128-GCM-SHA256",
+    "ECDHE-RSA-CHACHA20-POLY1305", "ECDHE-ECDSA-AES256-GCM-SHA384", "ECDHE-RSA-AES256-GCM-SHA384",
+    # Compatible:
+    "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-CHACHA20-POLY1305", "ECDHE-RSA-AES128-GCM-SHA256",
+    "ECDHE-RSA-CHACHA20-POLY1305", "ECDHE-ECDSA-AES256-GCM-SHA384", "ECDHE-RSA-AES256-GCM-SHA384",
+    "ECDHE-ECDSA-AES128-SHA256", "ECDHE-RSA-AES128-SHA256", "ECDHE-ECDSA-AES256-SHA384", "ECDHE-RSA-AES256-SHA384",
+    # Legacy:
+    "ECDHE-ECDSA-AES128-SHA", "ECDHE-RSA-AES128-SHA", "AES128-GCM-SHA256", "AES128-SHA256", "AES128-SHA",
+    "ECDHE-RSA-AES256-SHA", "AES256-GCM-SHA384", "AES256-SHA256", "AES256-SHA", "DES-CBC3-SHA",
+]  # fmt: skip
+
+
+def _get_random_ssl_context(*, verify: bool | str) -> ssl.SSLContext:
+    ssl_context = ssl.create_default_context(cafile=verify if isinstance(verify, str) else None)
+    shuffled_ciphers = random.sample(DEFAULT_CIPHERS[9:], len(DEFAULT_CIPHERS) - 9)
+    ssl_context.set_ciphers(":".join(DEFAULT_CIPHERS[:9] + shuffled_ciphers))
+    commands: list[None | Callable[[ssl.SSLContext], None]] = [
+        None,
+        lambda context: setattr(context, "maximum_version", ssl.TLSVersion.TLSv1_2),
+        lambda context: setattr(context, "minimum_version", ssl.TLSVersion.TLSv1_3),
+        lambda context: setattr(context, "options", context.options | ssl.OP_NO_TICKET),
+    ]
+    random_command = random.choice(commands)
+    if random_command:
+        random_command(ssl_context)
+    return ssl_context
+
+
+class Patch:
+    """Patch the HTTP2Connection._send_connection_init method."""
+
+    def __enter__(self) -> None:
+        """Enter the context manager."""
+
+        def _send_connection_init(self: httpcore._sync.http2.HTTP2Connection, request: httpcore.Request) -> None:
+            self._h2_state.local_settings = h2.settings.Settings(
+                client=True,
+                initial_values={
+                    h2.settings.SettingCodes.INITIAL_WINDOW_SIZE: random.randint(100, 200),
+                    h2.settings.SettingCodes.HEADER_TABLE_SIZE: random.randint(4000, 5000),
+                    h2.settings.SettingCodes.MAX_FRAME_SIZE: random.randint(16384, 65535),
+                    h2.settings.SettingCodes.MAX_CONCURRENT_STREAMS: random.randint(100, 200),
+                    h2.settings.SettingCodes.MAX_HEADER_LIST_SIZE: random.randint(65500, 66500),
+                    h2.settings.SettingCodes.ENABLE_CONNECT_PROTOCOL: random.randint(0, 1),
+                    h2.settings.SettingCodes.ENABLE_PUSH: random.randint(0, 1),
+                },
+            )
+            self._h2_state.initiate_connection()
+            self._h2_state.increment_flow_control_window(2**24)
+            self._write_outgoing_data(request)
+
+        self.original_send_connection_init = httpcore._sync.http2.HTTP2Connection._send_connection_init
+        httpcore._sync.http2.HTTP2Connection._send_connection_init = _send_connection_init  # type: ignore[method-assign]
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None = None,
+        exc_val: BaseException | None = None,
+        exc_tb: TracebackType | None = None,
+    ) -> None:
+        """Exit the context manager."""
+        httpcore._sync.http2.HTTP2Connection._send_connection_init = self.original_send_connection_init  # type: ignore[method-assign]
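The point of `_get_random_ssl_context` is fingerprint variation: the first nine suites (TLS 1.3 plus the "Modern" block) stay in a fixed order while the rest are shuffled, so each client presents a slightly different ClientHello. A standalone sketch of that shuffle with a toy cipher list (the module above uses Cloudflare's recommended suites):

```python
import ssl
from random import SystemRandom

random = SystemRandom()
# Toy TLS 1.2 cipher list, just to show the mechanism.
CIPHERS = ["ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-GCM-SHA256", "AES256-GCM-SHA384", "AES128-SHA"]

ctx = ssl.create_default_context()
fixed, rest = CIPHERS[:1], CIPHERS[1:]            # keep a fixed prefix, shuffle the tail
shuffled = fixed + random.sample(rest, len(rest))
ctx.set_ciphers(":".join(shuffled))
print(shuffled)  # order varies run to run, and the TLS fingerprint varies with it
```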
ddgs/py.typed ADDED
@@ -0,0 +1 @@
+# Marker file for PEP 561.
ddgs/results.py ADDED
@@ -0,0 +1,148 @@
+"""Result classes."""
+
+from abc import ABC
+from collections import Counter
+from collections.abc import Callable, Mapping
+from dataclasses import dataclass, field
+from typing import Any, ClassVar, Generic, TypeVar
+
+from .utils import _normalize_date, _normalize_text, _normalize_url
+
+T = TypeVar("T")
+
+
+class BaseResult:
+    """Base class for all results. Contains normalization functions."""
+
+    _normalizers: ClassVar[Mapping[str, Callable[[Any], str]]] = {
+        "title": _normalize_text,
+        "body": _normalize_text,
+        "href": _normalize_url,
+        "url": _normalize_url,
+        "thumbnail": _normalize_url,
+        "image": _normalize_url,
+        "date": _normalize_date,
+        "author": _normalize_text,
+        "publisher": _normalize_text,
+        "info": _normalize_text,
+    }
+
+    def __setattr__(self, name: str, value: str) -> None:
+        """Override setattr to apply normalization functions to certain attributes."""
+        if value and (normalizer := self._normalizers.get(name)):
+            value = normalizer(value)
+        object.__setattr__(self, name, value)
+
+
+@dataclass
+class TextResult(BaseResult):
+    """Text search result."""
+
+    title: str = ""
+    href: str = ""
+    body: str = ""
+
+
+@dataclass
+class ImagesResult(BaseResult):
+    """Image search result."""
+
+    title: str = ""
+    image: str = ""
+    thumbnail: str = ""
+    url: str = ""
+    height: str = ""
+    width: str = ""
+    source: str = ""
+
+
+@dataclass
+class NewsResult(BaseResult):
+    """News search result."""
+
+    date: str = ""
+    title: str = ""
+    body: str = ""
+    url: str = ""
+    image: str = ""
+    source: str = ""
+
+
+@dataclass
+class VideosResult(BaseResult):
+    """Video search result."""
+
+    title: str = ""
+    content: str = ""
+    description: str = ""
+    duration: str = ""
+    embed_html: str = ""
+    embed_url: str = ""
+    image_token: str = ""
+    images: dict[str, str] = field(default_factory=dict)
+    provider: str = ""
+    published: str = ""
+    publisher: str = ""
+    statistics: dict[str, str] = field(default_factory=dict)
+    uploader: str = ""
+
+
+@dataclass
+class BooksResult(BaseResult):
+    """Book search result."""
+
+    title: str = ""
+    author: str = ""
+    publisher: str = ""
+    info: str = ""
+    url: str = ""
+    thumbnail: str = ""
+
+
+class ResultsAggregator(ABC, Generic[T]):
+    """Aggregates incoming results.
+
+    Items are deduplicated by `cache_field`. Append just increments a counter;
+    `extract_results` returns items sorted by descending frequency.
+    """
+
+    def __init__(self, cache_fields: set[str]) -> None:
+        if not cache_fields:
+            msg = "At least one cache_field must be provided"
+            raise ValueError(msg)
+        self.cache_fields = set(cache_fields)
+        self._counter: Counter[str] = Counter()
+        self._cache: dict[str, T] = {}
+
+    def _get_key(self, item: T) -> str:
+        for key in item.__dict__:
+            if key in self.cache_fields:
+                return str(item.__dict__[key])
+        msg = f"Item {item!r} has none of the cache fields {self.cache_fields}"
+        raise AttributeError(msg)
+
+    def __len__(self) -> int:
+        """Return the number of items in the cache."""
+        return len(self._cache)
+
+    def append(self, item: T) -> None:
+        """Add an item to the cache.
+
+        Register an occurrence of `item`. The first time we see its key,
+        we store the item; every time, we bump the counter.
+        """
+        key = self._get_key(item)
+        if key not in self._cache or len(item.__dict__.get("body", "")) > len(
+            self._cache[key].__dict__.get("body", ""),
+        ):
+            self._cache[key] = item
+        self._counter[key] += 1
+
+    def extend(self, items: list[T]) -> None:
+        """Add a list of items to the cache."""
+        for item in items:
+            self.append(item)
+
+    def extract_dicts(self) -> list[dict[str, Any]]:
+        """Return a list of items, sorted by descending frequency. Each item is returned as a dict."""
+        return [self._cache[key].__dict__ for key, _ in self._counter.most_common()]
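`ResultsAggregator` declares no abstract methods, so it can be instantiated directly. A minimal sketch of the dedup-and-rank behaviour — duplicate keys bump the frequency counter, and the copy with the longer `body` wins the cache slot:

```python
from ddgs.results import ResultsAggregator, TextResult

agg: ResultsAggregator[TextResult] = ResultsAggregator(cache_fields={"href"})
agg.extend([
    TextResult(title="DDGS", href="https://github.com/deedy5/ddgs", body="short"),
    TextResult(title="DDGS", href="https://github.com/deedy5/ddgs", body="a longer body wins"),
    TextResult(title="Example", href="https://example.com", body="seen once"),
])
print(len(agg))                        # 2 unique keys
print(agg.extract_dicts()[0]["body"])  # "a longer body wins" (its key was seen twice)
```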
ddgs/similarity.py ADDED
@@ -0,0 +1,72 @@
+"""Simple filter ranker."""
+
+import re
+from typing import Final
+
+
+class SimpleFilterRanker:
+    """Simple filter ranker.
+
+    1) Pull any doc with 'wikipedia.org' in its href to the top.
+    2) Bucket the rest according to where query tokens appear:
+       - both title & body/description
+       - title only
+       - body only
+       - neither
+    3) Return wikipedia-top + both + title-only + body-only + neither.
+    """
+
+    _splitter: Final = re.compile(r"\W+")
+
+    def __init__(self, min_token_length: int = 3) -> None:
+        self.min_token_length = min_token_length
+
+    def _extract_tokens(self, query: str) -> set[str]:
+        """Split on non-word characters & filter out short tokens."""
+        return {token for token in self._splitter.split(query.lower()) if len(token) >= self.min_token_length}
+
+    def _has_any_token(self, text: str, tokens: set[str]) -> bool:
+        """Check if any token is a substring of the lower-cased text."""
+        lower_text = text.lower()
+        return any(tok in lower_text for tok in tokens)
+
+    def rank(self, docs: list[dict[str, str]], query: str) -> list[dict[str, str]]:
+        """Rank a list of docs based on a query string."""
+        tokens = self._extract_tokens(query)
+
+        wiki_hits = []
+        both = []
+        title_only = []
+        body_only = []
+        neither = []
+
+        for doc in docs:
+            href = doc.get("href", "")
+            title = doc.get("title", "")
+            # fallback to 'description' if no 'body'
+            body = doc.get("body", doc.get("description", ""))
+
+            # Skip Wikimedia category pages
+            if all(x in title for x in ["Category:", "Wikimedia"]):
+                continue
+
+            # Wikipedia check
+            if "wikipedia.org" in href:
+                wiki_hits.append(doc)
+                continue
+
+            # Title / Body match
+            hit_title = self._has_any_token(title, tokens)
+            hit_body = self._has_any_token(body, tokens)
+
+            if hit_title and hit_body:
+                both.append(doc)
+            elif hit_title:
+                title_only.append(doc)
+            elif hit_body:
+                body_only.append(doc)
+            else:
+                neither.append(doc)
+
+        # final ranking
+        return wiki_hits + both + title_only + body_only + neither
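A minimal sketch of the bucketing in action — the wikipedia.org hit jumps the queue, then the doc matching in both title and body, then the rest:

```python
from ddgs.similarity import SimpleFilterRanker

docs = [
    {"title": "Unrelated", "href": "https://example.com/x", "body": "Nothing here."},
    {"title": "Gray wolf", "href": "https://example.com/wolf", "body": "The wolf is a canine."},
    {"title": "Wolf", "href": "https://en.wikipedia.org/wiki/Wolf", "body": "Canis lupus."},
]
ranked = SimpleFilterRanker().rank(docs, "wolf")
print([d["href"] for d in ranked])
# wikipedia hit first, then the title+body match, then the leftover doc
```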
ddgs/utils.py ADDED
@@ -0,0 +1,70 @@
+"""Utilities."""
+
+import re
+import unicodedata
+from contextlib import suppress
+from datetime import datetime, timezone
+from html import unescape
+from urllib.parse import unquote
+
+from .exceptions import DDGSException
+
+_REGEX_STRIP_TAGS = re.compile("<.*?>")
+
+
+def _extract_vqd(html_bytes: bytes, query: str) -> str:
+    """Extract vqd from html bytes."""
+    for c1, c1_len, c2 in (
+        (b'vqd="', 5, b'"'),
+        (b"vqd=", 4, b"&"),
+        (b"vqd='", 5, b"'"),
+    ):
+        with suppress(ValueError):
+            start = html_bytes.index(c1) + c1_len
+            end = html_bytes.index(c2, start)
+            return html_bytes[start:end].decode()
+
+    msg = f"_extract_vqd() {query=} Could not extract vqd."
+    raise DDGSException(msg)
+
+
+def _normalize_url(url: str) -> str:
+    """Unquote URL and replace spaces with '+'."""
+    return unquote(url).replace(" ", "+") if url else ""
+
+
+def _normalize_text(raw: str) -> str:
+    """Normalize text.
+
+    Strip HTML tags, unescape HTML entities, normalize Unicode,
+    remove "C" category characters, and collapse whitespace.
+    """
+    if not raw:
+        return ""
+
+    # 1. Strip HTML tags
+    text = _REGEX_STRIP_TAGS.sub("", raw)
+
+    # 2. Unescape HTML entities
+    text = unescape(text)
+
+    # 3. Unicode normalization
+    text = unicodedata.normalize("NFC", text)
+
+    # 4. Remove "C" category characters
+    c_to_none = {ord(ch): None for ch in set(text) if unicodedata.category(ch)[0] == "C"}
+    if c_to_none:
+        text = text.translate(c_to_none)
+
+    # 5. Collapse whitespace
+    return " ".join(text.split())
+
+
+def _normalize_date(date: int | str) -> str:
+    """Normalize date from integer to ISO format if applicable."""
+    return datetime.fromtimestamp(date, timezone.utc).isoformat() if isinstance(date, int) else date
+
+
+def _expand_proxy_tb_alias(proxy: str | None) -> str | None:
+    """Expand "tb" to a full proxy URL if applicable."""
+    return "socks5h://127.0.0.1:9150" if proxy == "tb" else proxy
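`_normalize_text` is the workhorse behind the `BaseResult` normalizers in `results.py`. A quick demo: the zero-width space and newlines are category-C characters and get dropped, while `&nbsp;` survives unescaping as U+00A0 and is then collapsed as ordinary whitespace:

```python
from ddgs.utils import _normalize_text

raw = "<b>Caf\u00e9</b>&nbsp;&amp; <i>bar</i>\u200b \n\n menu"
print(_normalize_text(raw))  # Café & bar menu
```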
docker-compose.yml ADDED
@@ -0,0 +1,16 @@
+services:
+  ddgs-api:
+    build: .
+    ports:
+      - "8000:8000"
+    environment:
+      - DDGS_PROXY
+    volumes:
+      - ./logs:/app/logs
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
pyproject.toml ADDED
@@ -0,0 +1,149 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "ddgs"
+description = "Dux Distributed Global Search. A metasearch library that aggregates results from diverse web search services."
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+keywords = ["python", "search", "metasearch"]
+authors = [
+    {name = "deedy5"}
+]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = [
+    "click>=8.1.8",
+    "primp>=0.15.0",
+    "lxml>=4.9.4",
+    "httpx[http2,socks,brotli]>=0.28.1",  # temporarily
+    "fake-useragent>=2.2.0",
+]
+dynamic = ["version"]
+
+[project.urls]  # Optional
+"Homepage" = "https://github.com/deedy5/ddgs"
+
+[project.scripts]
+ddgs = "ddgs.cli:safe_entry_point"
+
+[tool.setuptools.dynamic]
+version = {attr = "ddgs.__version__"}
+
+[tool.setuptools.packages.find]
+include = ["ddgs*"]
+exclude = ["api*"]
+
+[project.optional-dependencies]
+dev = [
+    "mypy>=1.17.1",
+    "pre-commit",
+    "pytest>=8.4.1",
+    "pytest-dependency>=0.6.0",
+    "ruff>=0.13.0",
+
+    # for mypy
+    "lxml-stubs",
+    "types-Pygments",
+    "types-pexpect",
+    "types-PyYAML",
+    "types-ujson"
+]
+api = [
+    "fastapi[standard]>=0.104.0",
+    "fastapi-mcp>=0.4.0",
+]
+
+[tool.ruff]
+line-length = 120
+exclude = ["tests"]
+
+[tool.ruff.lint]
+select = [
+    # Core rules
+    "E",      # pycodestyle errors
+    "W",      # pycodestyle warnings
+    "F",      # pyflakes
+    "I",      # isort
+
+    # Enhanced rules
+    "ERA",    # eradicate
+    "YTT",    # flake8-2020
+    "ANN",    # flake8-annotations
+    "ASYNC",  # flake8-async
+    "S",      # flake8-bandit
+    "BLE",    # flake8-blind-except
+    "FBT",    # flake8-boolean-trap
+    "B",      # flake8-bugbear
+    "A",      # flake8-builtins
+    "COM",    # flake8-commas
+    "C4",     # flake8-comprehensions
+    "DTZ",    # flake8-datetimez
+    "T10",    # flake8-debugger
+    "EM",     # flake8-errmsg
+    "FIX",    # flake8-fixme
+    "FA",     # flake8-future-annotations
+    "INT",    # flake8-gettext
+    "ISC",    # flake8-implicit-str-concat
+    "ICN",    # flake8-import-conventions
+    "LOG",    # flake8-logging
+    "G",      # flake8-logging-format
+    "INP",    # flake8-no-pep420
+    "PIE",    # flake8-pie
+    "T20",    # flake8-print
+    "PYI",    # flake8-pyi
+    "PT",     # flake8-pytest-style
+    "Q",      # flake8-quotes
+    "RSE",    # flake8-raise
+    "RET",    # flake8-return
+    "SLF",    # flake8-self
+    "SIM",    # flake8-simplify
+    "SLOT",   # flake8-slots
+    "TID",    # flake8-tidy-imports
+    "TD",     # flake8-todos
+    "TC",     # flake8-type-checking
+    "ARG",    # flake8-unused-arguments
+    "PTH",    # flake8-use-pathlib
+    "FLY",    # flynt
+    "C90",    # mccabe
+    "N",      # pep8-naming
+    "PERF",   # perflint
+    "PGH",    # pygrep-hooks
+    "PL",     # Pylint
+    "UP",     # pyupgrade
+    "FURB",   # refurb
+    "RUF",    # ruff-specific rules
+    "TRY",    # tryceratops
+
+    # Documentation
+    "D",      # pydocstyle
+]
+ignore = [
+    "COM812",   # Missing trailing comma (handled by formatter)
+    "D107",     # Missing docstring in `__init__`
+    "D203",     # incorrect-blank-line-before-class
+    "D213",     # multi-line-summary-second-line
+    "N818",     # Exception name {name} should be named with an Error suffix
+    "PLR0913",  # Too many arguments to function call
+    "PLR2004",  # Magic value used in comparison
+    "SLF001",   # Private member accessed
+]
+
+[tool.mypy]
+python_version = "3.10"
+strict = true
+exclude = ["build/"]
start_api.py ADDED
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+"""Start the DDGS API server."""
+
+import logging
+import sys
+
+import uvicorn
+from fastapi_mcp import FastApiMCP  # type: ignore[import-untyped]
+
+from api.main import app
+
+logger = logging.getLogger(__name__)
+
+# Add current directory to Python path
+sys.path.insert(0, ".")
+
+# MCP server
+mcp = FastApiMCP(app, name="ddgs-search", description="DDGS (Dux Distributed Global Search) MCP Server")
+mcp.mount_http()
+logger.info("✅ MCP server enabled at /mcp")
+mcp.mount_sse()
+logger.info("✅ MCP server enabled at /sse")
+
+logger.info("🚀 Starting DDGS API server on http://0.0.0.0:8000")
+uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)  # noqa: S104
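Once the server is up, the `/health` endpoint that docker-compose's healthcheck curls can be probed the same way from Python. A minimal sketch using httpx (already a project dependency):

```python
import httpx

# Probe the same endpoint the docker-compose healthcheck hits.
resp = httpx.get("http://localhost:8000/health", timeout=5)
print(resp.status_code, resp.text)
```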
start_api.sh ADDED
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# DDGS API Startup Script
+
+set -e
+
+echo "🚀 Starting DDGS API..."
+
+# Check if virtual environment exists
+if [ ! -d ".venv" ]; then
+    echo "📦 Creating virtual environment..."
+    python3 -m venv .venv
+fi
+
+# Activate virtual environment
+echo "🔧 Activating virtual environment..."
+source .venv/bin/activate
+
+# Install dependencies
+echo "📥 Installing dependencies..."
+pip install -e ".[api]"
+pip install -e .
+
+# Run the API
+echo "🌐 Starting FastAPI server on http://localhost:8000"
+echo "📚 API documentation available at http://localhost:8000/docs"
+echo "🔍 ReDoc documentation available at http://localhost:8000/redoc"
+
+python start_api.py
tests/cli_test.py ADDED
@@ -0,0 +1,97 @@
+import pathlib
+import shutil
+import time
+from pathlib import Path
+
+import pytest
+from click.testing import CliRunner
+
+from ddgs import DDGS, __version__
+from ddgs.cli import _download_results, _save_csv, _save_json, cli
+
+runner = CliRunner()
+TEXT_RESULTS = []
+IMAGES_RESULTS = []
+
+
+@pytest.fixture(autouse=True)
+def pause_between_tests() -> None:
+    time.sleep(2)
+
+
+def test_version_command() -> None:
+    result = runner.invoke(cli, ["version"])
+    assert result.output.strip() == __version__
+
+
+def test_text_command() -> None:
+    result = runner.invoke(cli, ["text", "-q", "zebra"])
+    assert "title" in result.output
+
+
+def test_images_command() -> None:
+    result = runner.invoke(cli, ["images", "-q", "fox"])
+    assert "title" in result.output
+
+
+def test_news_command() -> None:
+    result = runner.invoke(cli, ["news", "-q", "deer"])
+    assert "title" in result.output
+
+
+def test_videos_command() -> None:
+    result = runner.invoke(cli, ["videos", "-q", "pig"])
+    assert "title" in result.output
+
+
+def test_books_command() -> None:
+    result = runner.invoke(cli, ["books", "-q", "bee"])
+    assert "title" in result.output
+
+
+@pytest.mark.dependency()
+def test_get_text() -> None:
+    global TEXT_RESULTS
+    TEXT_RESULTS = DDGS().text("cow", max_results=5)
+    assert TEXT_RESULTS
+
+
+@pytest.mark.dependency()
+def test_get_images() -> None:
+    global IMAGES_RESULTS
+    IMAGES_RESULTS = DDGS().images("horse", max_results=5)
+    assert IMAGES_RESULTS
+
+
+@pytest.mark.dependency(depends=["test_get_text"])
+def test_save_csv(tmp_path: Path) -> None:
+    temp_file = tmp_path / "test_csv.csv"
+    _save_csv(temp_file, TEXT_RESULTS)
+    assert temp_file.exists()
+
+
+@pytest.mark.dependency(depends=["test_get_text"])
+def test_save_json(tmp_path: Path) -> None:
+    temp_file = tmp_path / "test_json.json"
+    _save_json(temp_file, TEXT_RESULTS)
+    assert temp_file.exists()
+
+
+@pytest.mark.dependency(depends=["test_get_text"])
+def test_text_download() -> None:
+    pathname = pathlib.Path("text_downloads")
+    _download_results(f"{test_text_download}", TEXT_RESULTS, function_name="text", pathname=str(pathname))
+    assert pathname.is_dir() and pathname.iterdir()
+    for file in pathname.iterdir():
+        assert file.is_file()
+    shutil.rmtree(str(pathname))
+
+
+@pytest.mark.dependency(depends=["test_get_images"])
+def test_images_download() -> None:
+    pathname = pathlib.Path("images_downloads")
+    _download_results(f"{test_images_download}", IMAGES_RESULTS, function_name="images", pathname=str(pathname))
+    assert pathname.is_dir() and pathname.iterdir()
+    for file in pathname.iterdir():
+        assert file.is_file()
+    shutil.rmtree(str(pathname))
tests/ddgs_test.py ADDED
@@ -0,0 +1,51 @@
+import time
+
+import pytest
+
+from ddgs import DDGS
+
+
+@pytest.fixture(autouse=True)
+def pause_between_tests() -> None:
+    time.sleep(2)
+
+
+def test_context_manager() -> None:
+    with DDGS() as ddgs:
+        results = ddgs.text("python")
+        assert len(results) > 0
+
+
+def test_text_search() -> None:
+    query = "wolf"
+    results = DDGS().text(query)
+    assert isinstance(results, list)
+    assert len(results) > 0
+
+
+def test_images_search() -> None:
+    query = "tiger"
+    results = DDGS().images(query)
+    assert isinstance(results, list)
+    assert len(results) > 0
+
+
+def test_news_search() -> None:
+    query = "rabbit"
+    results = DDGS().news(query)
+    assert isinstance(results, list)
+    assert len(results) > 0
+
+
+def test_videos_search() -> None:
+    query = "monkey"
+    results = DDGS().videos(query)
+    assert isinstance(results, list)
+    assert len(results) > 0
+
+
+def test_books_search() -> None:
+    query = "mouse"
+    results = DDGS().books(query)
+    assert isinstance(results, list)
+    assert len(results) > 0
|