Upload 39 files

- CONTRIBUTING.md +68 -0
- Dockerfile +34 -0
- LICENSE.md +21 -0
- Makefile +40 -0
- api/__init__.py +3 -0
- api/main.py +390 -0
- ddgs/__init__.py +52 -0
- ddgs/base.py +122 -0
- ddgs/cli.py +523 -0
- ddgs/ddgs.py +234 -0
- ddgs/engines/__init__.py +94 -0
- ddgs/engines/annasarchive.py +51 -0
- ddgs/engines/bing.py +85 -0
- ddgs/engines/bing_news.py +86 -0
- ddgs/engines/brave.py +47 -0
- ddgs/engines/duckduckgo.py +56 -0
- ddgs/engines/duckduckgo_images.py +85 -0
- ddgs/engines/duckduckgo_news.py +72 -0
- ddgs/engines/duckduckgo_videos.py +84 -0
- ddgs/engines/google.py +95 -0
- ddgs/engines/grokipedia.py +49 -0
- ddgs/engines/mojeek.py +52 -0
- ddgs/engines/wikipedia.py +66 -0
- ddgs/engines/yahoo.py +64 -0
- ddgs/engines/yahoo_news.py +104 -0
- ddgs/engines/yandex.py +47 -0
- ddgs/exceptions.py +13 -0
- ddgs/http_client.py +78 -0
- ddgs/http_client2.py +151 -0
- ddgs/py.typed +1 -0
- ddgs/results.py +148 -0
- ddgs/similarity.py +72 -0
- ddgs/utils.py +70 -0
- docker-compose.yml +16 -0
- pyproject.toml +149 -0
- start_api.py +25 -0
- start_api.sh +29 -0
- tests/cli_test.py +97 -0
- tests/ddgs_test.py +51 -0
CONTRIBUTING.md
ADDED
@@ -0,0 +1,68 @@
# Contributing

Please open a Discussion, Issue, or email the maintainers to talk over any major changes before submitting a pull request.

## IDE configuration

If you use **VSCode**, install the recommended extensions (press `F1` → *Show Recommended Extensions*):

- `ms-python.python`
- `ms-python.mypy-type-checker`
- `charliermarsh.ruff`
- `usernamehw.errorlens`
- `fill-labs.dependi`

## Development

1. Fork the repository and clone your fork:
   ```sh
   git clone https://github.com/{your_profile}/ddgs
   cd ddgs
   ```

2. Create and activate a virtual environment, then install development dependencies:
   ```sh
   python -m venv .venv
   source .venv/bin/activate  # Windows: .venv\Scripts\activate
   pip install -e .[dev]
   ```
3. Install pre-commit hooks (automates formatting, linting, typing):
   ```sh
   pre-commit install
   ```
   - Hooks run `ruff` and `mypy` automatically on each commit.
   - To run them manually: `pre-commit run --all-files`.

4. Create a feature branch:
   ```sh
   git checkout -b feat/new-feature
   ```
5. Implement your changes.
6. Run tests locally:
   ```sh
   pytest
   ```
7. Commit changes (follow Conventional Commits):
   ```sh
   git add .
   git commit -m "feat: add feature description"
   ```
8. Push your branch to your fork:
   ```sh
   git push origin feat/new-feature
   ```
9. Open a pull request against the upstream repository and reference any related Discussion/Issue.

## Code style

- Formatting and linting are enforced with **ruff**.
- Static typing is checked with **mypy**.

## PR checklist

- Tests pass: `pytest`
- pre-commit checks pass: `pre-commit run --all-files`
- Commit messages follow Conventional Commits
- PR references related Issue/Discussion and describes changes
- Add tests for new behavior where applicable
Dockerfile
ADDED
@@ -0,0 +1,34 @@
# Use Python 3.11 slim image as base
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PIP_NO_CACHE_DIR=1
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
ENV PYTHONPATH=/app

# Install system dependencies including curl for healthcheck
RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y curl \
    && rm -rf /var/lib/apt/lists/*

# Copy application code
COPY . .

# Install Python dependencies (including API dependencies)
RUN pip install --no-cache-dir -e .[api]

# Expose port
EXPOSE 8000

# Create non-root user
RUN useradd --create-home --shell /bin/bash app \
    && chown -R app:app /app
USER app

# Run the application
CMD ["python", "start_api.py"]
LICENSE.md
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 deedy5

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Makefile
ADDED
@@ -0,0 +1,40 @@
PY := .venv/bin/python
PIP := .venv/bin/pip

.PHONY: help setup lint format test all clean

help:
	@echo "Targets:"
	@echo " setup  - create venv and install dependencies"
	@echo " lint   - run ruff check, ruff format and mypy"
	@echo " format - run ruff format and ruff check --fix"
	@echo " test   - run pytest"
	@echo " all    - run setup, lint, format and test"
	@echo " clean  - remove cache, venv and build artifacts"

setup:
	python3 -m venv .venv
	$(PIP) install -e .[dev]

lint:
	$(PY) -m ruff check --fix
	$(PY) -m mypy --install-types --non-interactive .

format:
	$(PY) -m ruff format

test:
	$(PY) -m pytest

all: setup lint format test

clean:
	rm -rf .venv/
	rm -rf .pytest_cache/
	rm -rf .mypy_cache/
	rm -rf .ruff_cache/
	rm -rf build/
	rm -rf dist/
	rm -rf *.egg-info/
	find . -name __pycache__ -exec rm -rf {} +
	rm -f uv.lock
api/__init__.py
ADDED
@@ -0,0 +1,3 @@
"""DDGS API package."""

__version__ = "1.0.0"
api/main.py
ADDED
@@ -0,0 +1,390 @@
"""FastAPI application for DDGS API."""

import logging
from typing import Any

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field

from ddgs import DDGS

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create FastAPI app
app = FastAPI(
    title="DDGS API",
    description="A FastAPI wrapper for the DDGS (Dux Distributed Global Search) library",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Pydantic models for request/response
class TextSearchRequest(BaseModel):
    """Request model for text search operations."""

    query: str = Field(..., description="Search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m, y) or custom date range")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")


class ImagesSearchRequest(BaseModel):
    """Request model for image search operations."""

    query: str = Field(..., description="Image search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m, y) or custom date range")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")
    size: str | None = Field(None, description="Image size (Small, Medium, Large, Wallpaper)")
    color: str | None = Field(
        None,
        description="Image color (Monochrome, Red, Orange, Yellow, Green, Blue, Purple, Pink, Brown, Black, Gray, Teal, White)",  # noqa: E501
    )
    type_image: str | None = Field(None, description="Image type (photo, clipart, gif, transparent, line)")
    layout: str | None = Field(None, description="Image layout (Square, Tall, Wide)")
    license_image: str | None = Field(
        None, description="Image license (any, Public, Share, ShareCommercially, Modify, ModifyCommercially)"
    )


class NewsSearchRequest(BaseModel):
    """Request model for news search operations."""

    query: str = Field(..., description="Search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m, y) or custom date range")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")


class VideosSearchRequest(BaseModel):
    """Request model for video search operations."""

    query: str = Field(..., description="Video search query")
    region: str = Field("us-en", description="Region for search (e.g., us-en, uk-en, ru-ru)")
    safesearch: str = Field("moderate", description="Safe search setting (on, moderate, off)")
    timelimit: str | None = Field(None, description="Time limit (d, w, m) or custom date range")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")
    resolution: str | None = Field(None, description="Video resolution (high, standard)")
    duration: str | None = Field(None, description="Video duration (short, medium, long)")
    license_videos: str | None = Field(None, description="Video license (creativeCommon, youtube)")


class BooksSearchRequest(BaseModel):
    """Request model for book search operations."""

    query: str = Field(..., description="Books search query")
    max_results: int | None = Field(10, description="Maximum number of results to return")
    page: int = Field(1, description="Page number of results")
    backend: str = Field("auto", description="Search backend (auto, or specific engine)")


class SearchResponse(BaseModel):
    """Response model for search operations."""

    results: list[dict[str, Any]]


class HealthResponse(BaseModel):
    """Response model for health check."""

    status: str
    version: str
    service: str


@app.get("/", response_model=HealthResponse)
async def root() -> HealthResponse:
    """Root endpoint with basic service information."""
    return HealthResponse(status="healthy", version="1.0.0", service="DDGS API")


@app.get("/health", response_model=HealthResponse)
async def health_check() -> HealthResponse:
    """Health check endpoint."""
    return HealthResponse(status="healthy", version="1.0.0", service="DDGS API")


@app.post("/search/text", response_model=SearchResponse)
async def search_text(request: TextSearchRequest) -> SearchResponse:
    """Perform a text search."""
    try:
        results = DDGS().text(
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in text search: %s", e)
        raise HTTPException(status_code=500, detail=f"Search failed: {e!s}") from e


@app.get("/search/text", response_model=SearchResponse)
async def search_text_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
) -> SearchResponse:
    """Perform a text search via GET request."""
    try:
        results = DDGS().text(
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in text search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Search failed: {e!s}") from e


@app.post("/search/images", response_model=SearchResponse)
async def search_images(request: ImagesSearchRequest) -> SearchResponse:
    """Perform an image search."""
    try:
        results = DDGS().images(
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
            size=request.size,
            color=request.color,
            type_image=request.type_image,
            layout=request.layout,
            license_image=request.license_image,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in image search: %s", e)
        raise HTTPException(status_code=500, detail=f"Image search failed: {e!s}") from e


@app.get("/search/images", response_model=SearchResponse)
async def search_images_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
    size: str | None = None,
    color: str | None = None,
    type_image: str | None = None,
    layout: str | None = None,
    license_image: str | None = None,
) -> SearchResponse:
    """Perform an image search via GET request."""
    try:
        results = DDGS().images(
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
            size=size,
            color=color,
            type_image=type_image,
            layout=layout,
            license_image=license_image,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in image search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Image search failed: {e!s}") from e


@app.post("/search/news", response_model=SearchResponse)
async def search_news(request: NewsSearchRequest) -> SearchResponse:
    """Perform a news search."""
    try:
        results = DDGS().news(
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in news search: %s", e)
        raise HTTPException(status_code=500, detail=f"News search failed: {e!s}") from e


@app.get("/search/news", response_model=SearchResponse)
async def search_news_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
) -> SearchResponse:
    """Perform a news search via GET request."""
    try:
        results = DDGS().news(
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in news search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"News search failed: {e!s}") from e


@app.post("/search/videos", response_model=SearchResponse)
async def search_videos(request: VideosSearchRequest) -> SearchResponse:
    """Perform a video search."""
    try:
        results = DDGS().videos(
            query=request.query,
            region=request.region,
            safesearch=request.safesearch,
            timelimit=request.timelimit,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
            resolution=request.resolution,
            duration=request.duration,
            license_videos=request.license_videos,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in video search: %s", e)
        raise HTTPException(status_code=500, detail=f"Video search failed: {e!s}") from e


@app.get("/search/videos", response_model=SearchResponse)
async def search_videos_get(
    query: str,
    region: str = "us-en",
    safesearch: str = "moderate",
    timelimit: str | None = None,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
    resolution: str | None = None,
    duration: str | None = None,
    license_videos: str | None = None,
) -> SearchResponse:
    """Perform a video search via GET request."""
    try:
        results = DDGS().videos(
            query=query,
            region=region,
            safesearch=safesearch,
            timelimit=timelimit,
            max_results=max_results,
            page=page,
            backend=backend,
            resolution=resolution,
            duration=duration,
            license_videos=license_videos,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in video search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Video search failed: {e!s}") from e


@app.post("/search/books", response_model=SearchResponse)
async def search_books(request: BooksSearchRequest) -> SearchResponse:
    """Perform a book search."""
    try:
        results = DDGS().books(
            query=request.query,
            max_results=request.max_results,
            page=request.page,
            backend=request.backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in book search: %s", e)
        raise HTTPException(status_code=500, detail=f"Book search failed: {e!s}") from e


@app.get("/search/books", response_model=SearchResponse)
async def search_books_get(
    query: str,
    max_results: int = 10,
    page: int = 1,
    backend: str = "auto",
) -> SearchResponse:
    """Perform a book search via GET request."""
    try:
        results = DDGS().books(
            query=query,
            max_results=max_results,
            page=page,
            backend=backend,
        )

        return SearchResponse(results=results)
    except Exception as e:
        logger.warning("Error in book search (GET): %s", e)
        raise HTTPException(status_code=500, detail=f"Book search failed: {e!s}") from e


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)  # noqa: S104
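For reference, a minimal client sketch for the API above, using only the Python standard library. It assumes the server is running locally (e.g. via `python start_api.py`, so host/port `127.0.0.1:8000` here are assumptions), and the `title`/`href` keys shown are typical of DDGS text-result dicts, not a guaranteed schema:

```py
import json
from urllib.request import Request, urlopen

# Assumed local deployment; adjust host/port to your setup.
payload = {"query": "python metasearch", "region": "us-en", "max_results": 5}
req = Request(
    "http://127.0.0.1:8000/search/text",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urlopen(req) as resp:
    results = json.loads(resp.read())["results"]

for r in results:
    # Key names depend on the backend's result schema.
    print(r.get("title"), "-", r.get("href"))
```

The GET variants accept the same fields as query parameters, e.g. `GET /search/text?query=python&max_results=5`.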
ddgs/__init__.py
ADDED
@@ -0,0 +1,52 @@
"""DDGS | Dux Distributed Global Search.

A metasearch library that aggregates results from diverse web search services.
"""

import importlib
import logging
import threading
from typing import TYPE_CHECKING, Any, cast

__version__ = "9.10.0"
__all__ = ("DDGS",)

if TYPE_CHECKING:
    from .ddgs import DDGS

# A do-nothing logging handler
# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
logging.getLogger("ddgs").addHandler(logging.NullHandler())


class _ProxyMeta(type):
    _lock: threading.Lock = threading.Lock()
    _real_cls: type["DDGS"] | None = None

    @classmethod
    def _load_real(cls) -> type["DDGS"]:
        if cls._real_cls is None:
            with cls._lock:
                if cls._real_cls is None:
                    cls._real_cls = importlib.import_module(".ddgs", package=__name__).DDGS
                    globals()["DDGS"] = cls._real_cls
        return cls._real_cls

    def __call__(cls, *args: Any, **kwargs: Any) -> "DDGS":  # noqa: ANN401
        real = type(cls)._load_real()
        return real(*args, **kwargs)

    def __getattr__(cls, name: str) -> Any:  # noqa: ANN401
        return getattr(type(cls)._load_real(), name)

    def __dir__(cls) -> list[str]:
        base = set(super().__dir__())
        loaded_names = set(dir(type(cls)._load_real()))
        return sorted(base | (loaded_names - base))


class _DDGSProxy(metaclass=_ProxyMeta):
    """Proxy class for lazy-loading the real DDGS implementation."""


DDGS: type[DDGS] = cast("type[DDGS]", _DDGSProxy)  # type: ignore[no-redef]
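The metaclass defers importing the heavy `ddgs.ddgs` module until `DDGS` is first called or an attribute on it is accessed, using double-checked locking so concurrent first calls stay safe. A rough sketch of the intended observable behavior (assuming an installed package and nothing else importing `ddgs.ddgs` first):

```py
import sys

import ddgs

# Importing the package alone should not load the implementation module.
print("ddgs.ddgs" in sys.modules)  # expected: False

d = ddgs.DDGS()  # first call routes through _ProxyMeta.__call__ -> _load_real()
print("ddgs.ddgs" in sys.modules)  # expected: True
print(type(d).__name__)  # the real DDGS class, not the proxy
```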
ddgs/base.py
ADDED
@@ -0,0 +1,122 @@
"""Base class for search engines."""

import logging
from abc import ABC, abstractmethod
from collections.abc import Mapping
from functools import cached_property
from typing import Any, ClassVar, Generic, Literal, TypeVar

from lxml import html
from lxml.etree import HTMLParser as LHTMLParser

from .http_client import HttpClient
from .results import BooksResult, ImagesResult, NewsResult, TextResult, VideosResult

logger = logging.getLogger(__name__)
T = TypeVar("T")


class BaseSearchEngine(ABC, Generic[T]):
    """Abstract base class for all search-engine backends."""

    name: ClassVar[str]  # unique key, e.g. "google"
    category: ClassVar[Literal["text", "images", "videos", "news", "books"]]
    provider: ClassVar[str]  # source of the search results (e.g. "bing" for DuckDuckGo)
    disabled: ClassVar[bool] = False  # if True, the engine is disabled
    priority: ClassVar[float] = 1

    search_url: str
    search_method: ClassVar[str]  # GET or POST
    search_headers: ClassVar[Mapping[str, str]] = {}
    items_xpath: ClassVar[str]
    elements_xpath: ClassVar[Mapping[str, str]]
    elements_replace: ClassVar[Mapping[str, str]]

    def __init__(self, proxy: str | None = None, timeout: int | None = None, *, verify: bool | str = True) -> None:
        self.http_client = HttpClient(proxy=proxy, timeout=timeout, verify=verify)
        self.results: list[T] = []

    @property
    def result_type(self) -> type[T]:
        """Get result type based on category."""
        categories = {
            "text": TextResult,
            "images": ImagesResult,
            "videos": VideosResult,
            "news": NewsResult,
            "books": BooksResult,
        }
        return categories[self.category]

    @abstractmethod
    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int,
        **kwargs: str,
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        raise NotImplementedError

    def request(self, *args: Any, **kwargs: Any) -> str | None:  # noqa: ANN401
        """Make a request to the search engine."""
        resp = self.http_client.request(*args, **kwargs)
        if resp.status_code == 200:
            return resp.text
        return None

    @cached_property
    def parser(self) -> LHTMLParser:
        """Get HTML parser."""
        return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)

    def extract_tree(self, html_text: str) -> html.Element:
        """Extract html tree from html text."""
        return html.fromstring(html_text, parser=self.parser)

    def pre_process_html(self, html_text: str) -> str:
        """Pre-process html_text before extracting results."""
        return html_text

    def extract_results(self, html_text: str) -> list[T]:
        """Extract search results from html text."""
        html_text = self.pre_process_html(html_text)
        tree = self.extract_tree(html_text)
        items = tree.xpath(self.items_xpath)
        results = []
        for item in items:
            result = self.result_type()
            for key, value in self.elements_xpath.items():
                data = " ".join(x.strip() for x in item.xpath(value))
                result.__setattr__(key, data)
            results.append(result)
        return results

    def post_extract_results(self, results: list[T]) -> list[T]:
        """Post-process search results."""
        return results

    def search(
        self,
        query: str,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        page: int = 1,
        **kwargs: str,
    ) -> list[T] | None:
        """Search the engine."""
        payload = self.build_payload(
            query=query, region=region, safesearch=safesearch, timelimit=timelimit, page=page, **kwargs
        )
        if self.search_method == "GET":
            html_text = self.request(self.search_method, self.search_url, params=payload, headers=self.search_headers)
        else:
            html_text = self.request(self.search_method, self.search_url, data=payload, headers=self.search_headers)
        if not html_text:
            return None
        results = self.extract_results(html_text)
        return self.post_extract_results(results)
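To illustrate the contract, here is a hypothetical minimal backend; `ExampleEngine`, its URL, and the XPath expressions are all invented for illustration and do not correspond to any engine in `ddgs/engines/`:

```py
from typing import Any

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult


class ExampleEngine(BaseSearchEngine[TextResult]):
    """Hypothetical text backend sketch (not a real engine)."""

    name = "example"
    category = "text"
    provider = "example"

    search_url = "https://search.example.com/search"  # invented endpoint
    search_method = "GET"
    items_xpath = "//div[@class='result']"  # invented markup
    elements_xpath = {
        "title": ".//a/text()",
        "href": ".//a/@href",
        "body": ".//p[@class='snippet']//text()",
    }

    def build_payload(
        self, query: str, region: str, safesearch: str, timelimit: str | None, page: int, **kwargs: str
    ) -> dict[str, Any]:
        # Map the generic search arguments onto this engine's query parameters.
        return {"q": query, "p": page}
```

Only `build_payload` is abstract; `search()` then drives the request/parse pipeline (`request` → `pre_process_html` → `extract_results` → `post_extract_results`), and subclasses override those hooks as needed.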
ddgs/cli.py
ADDED
|
@@ -0,0 +1,523 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI tool for DDGS."""
|
| 2 |
+
|
| 3 |
+
import csv
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 7 |
+
from datetime import datetime, timezone
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from urllib.parse import unquote
|
| 10 |
+
|
| 11 |
+
import click
|
| 12 |
+
import primp
|
| 13 |
+
|
| 14 |
+
from . import __version__
|
| 15 |
+
from .ddgs import DDGS
|
| 16 |
+
from .utils import _expand_proxy_tb_alias
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
COLORS = {
|
| 21 |
+
0: "black",
|
| 22 |
+
1: "red",
|
| 23 |
+
2: "green",
|
| 24 |
+
3: "yellow",
|
| 25 |
+
4: "blue",
|
| 26 |
+
5: "magenta",
|
| 27 |
+
6: "cyan",
|
| 28 |
+
7: "bright_black",
|
| 29 |
+
8: "bright_red",
|
| 30 |
+
9: "bright_green",
|
| 31 |
+
10: "bright_yellow",
|
| 32 |
+
11: "bright_blue",
|
| 33 |
+
12: "bright_magenta",
|
| 34 |
+
13: "bright_cyan",
|
| 35 |
+
14: "white",
|
| 36 |
+
15: "bright_white",
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _convert_tuple_to_csv(_ctx: click.Context, _param: click.Parameter, value: tuple[str] | None) -> str:
|
| 41 |
+
if value is not None and isinstance(value, tuple):
|
| 42 |
+
return ",".join(value)
|
| 43 |
+
return ""
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _save_data(query: str, data: list[dict[str, str]], function_name: str, filename: str | None) -> None:
|
| 47 |
+
filename, ext = filename.rsplit(".", 1) if filename and filename.endswith((".csv", ".json")) else (None, filename)
|
| 48 |
+
filename = filename if filename else f"{function_name}_{query}_{datetime.now(tz=timezone.utc):%Y%m%d_%H%M%S}"
|
| 49 |
+
if ext == "csv":
|
| 50 |
+
_save_csv(f"{filename}.{ext}", data)
|
| 51 |
+
elif ext == "json":
|
| 52 |
+
_save_json(f"{filename}.{ext}", data)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _save_json(jsonfile: str | Path, data: list[dict[str, str]]) -> None:
|
| 56 |
+
with Path(jsonfile).open("w", encoding="utf-8") as file:
|
| 57 |
+
file.write(json.dumps(data, ensure_ascii=False, indent=2))
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _save_csv(csvfile: str | Path, data: list[dict[str, str]]) -> None:
|
| 61 |
+
with Path(csvfile).open("w", newline="", encoding="utf-8") as file:
|
| 62 |
+
if data:
|
| 63 |
+
headers = data[0].keys()
|
| 64 |
+
writer = csv.DictWriter(file, fieldnames=headers, quoting=csv.QUOTE_MINIMAL)
|
| 65 |
+
writer.writeheader()
|
| 66 |
+
writer.writerows(data)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _print_data(data: list[dict[str, str]], *, no_color: bool = False) -> None:
|
| 70 |
+
if data:
|
| 71 |
+
for i, e in enumerate(data, start=1):
|
| 72 |
+
click.secho(f"{i}.\t {'=' * 78}", bg="black", fg="white")
|
| 73 |
+
for j, (k, v) in enumerate(e.items(), start=1):
|
| 74 |
+
if v:
|
| 75 |
+
width = 300 if k in ("content", "href", "image", "source", "thumbnail", "url") else 78
|
| 76 |
+
title = "language" if k == "detected_language" else k
|
| 77 |
+
text = click.wrap_text(
|
| 78 |
+
f"{v}",
|
| 79 |
+
width=width,
|
| 80 |
+
initial_indent="",
|
| 81 |
+
subsequent_indent=" " * 12,
|
| 82 |
+
preserve_paragraphs=True,
|
| 83 |
+
)
|
| 84 |
+
else:
|
| 85 |
+
title = k
|
| 86 |
+
text = v
|
| 87 |
+
click.secho(f"{title:<12}{text}", bg="black", fg=COLORS[j] if not no_color else "white", overline=True)
|
| 88 |
+
input()
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _sanitize_query(query: str) -> str:
|
| 92 |
+
return (
|
| 93 |
+
query.replace("filetype", "")
|
| 94 |
+
.replace(":", "")
|
| 95 |
+
.replace('"', "'")
|
| 96 |
+
.replace("site", "")
|
| 97 |
+
.replace(" ", "_")
|
| 98 |
+
.replace("/", "_")
|
| 99 |
+
.replace("\\", "_")
|
| 100 |
+
.replace(" ", "")
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _download_file(url: str, dir_path: str, filename: str, proxy: str | None, *, verify: bool) -> None:
|
| 105 |
+
try:
|
| 106 |
+
resp = primp.Client(proxy=proxy, impersonate="random", impersonate_os="random", timeout=10, verify=verify).get(
|
| 107 |
+
url,
|
| 108 |
+
)
|
| 109 |
+
if resp.status_code == 200:
|
| 110 |
+
f = Path(dir_path) / filename[:200]
|
| 111 |
+
with f.open("wb") as file:
|
| 112 |
+
file.write(resp.content)
|
| 113 |
+
except Exception as ex: # noqa: BLE001
|
| 114 |
+
logger.debug("Error download_file url=%s: %r", url, ex)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def _download_results(
|
| 118 |
+
query: str,
|
| 119 |
+
results: list[dict[str, str]],
|
| 120 |
+
function_name: str,
|
| 121 |
+
proxy: str | None = None,
|
| 122 |
+
threads: int | None = None,
|
| 123 |
+
pathname: str | None = None,
|
| 124 |
+
*,
|
| 125 |
+
verify: bool = True,
|
| 126 |
+
) -> None:
|
| 127 |
+
path = pathname if pathname else f"{function_name}_{query}_{datetime.now(tz=timezone.utc):%Y%m%d_%H%M%S}"
|
| 128 |
+
Path(path).mkdir(parents=True, exist_ok=True)
|
| 129 |
+
|
| 130 |
+
threads = 10 if threads is None else threads
|
| 131 |
+
with ThreadPoolExecutor(max_workers=threads) as executor:
|
| 132 |
+
futures = []
|
| 133 |
+
for i, res in enumerate(results, start=1):
|
| 134 |
+
url = res["image"] if function_name == "images" else res["href"]
|
| 135 |
+
filename = unquote(url.split("/")[-1].split("?")[0])
|
| 136 |
+
f = executor.submit(_download_file, url, path, f"{i}_{filename}", proxy, verify=verify)
|
| 137 |
+
futures.append(f)
|
| 138 |
+
|
| 139 |
+
with click.progressbar(
|
| 140 |
+
length=len(futures),
|
| 141 |
+
label="Downloading",
|
| 142 |
+
show_percent=True,
|
| 143 |
+
show_pos=True,
|
| 144 |
+
width=50,
|
| 145 |
+
) as bar:
|
| 146 |
+
for future in as_completed(futures):
|
| 147 |
+
future.result()
|
| 148 |
+
bar.update(1)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@click.group(chain=True)
|
| 152 |
+
def cli() -> None:
|
| 153 |
+
"""DDGS CLI tool."""
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def safe_entry_point() -> None:
|
| 157 |
+
"""Run the CLI tool in try-except block to catch all exceptions."""
|
| 158 |
+
logging.basicConfig(level=logging.WARNING)
|
| 159 |
+
try:
|
| 160 |
+
cli()
|
| 161 |
+
except Exception as ex: # noqa: BLE001
|
| 162 |
+
click.echo(f"{type(ex).__name__}: {ex!r}")
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
@cli.command()
|
| 166 |
+
def version() -> str:
|
| 167 |
+
"""Print and return version."""
|
| 168 |
+
print(__version__) # noqa: T201
|
| 169 |
+
return __version__
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
@cli.command()
|
| 173 |
+
@click.option("-q", "--query", help="text search query")
|
| 174 |
+
@click.option("-k", "--keywords", help="(Deprecated) text search query") # deprecated
|
| 175 |
+
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
|
| 176 |
+
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
|
| 177 |
+
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m", "y"]), help="day, week, month, year")
|
| 178 |
+
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
|
| 179 |
+
@click.option("-p", "--page", default=1, type=int, help="page number of results")
|
| 180 |
+
@click.option(
|
| 181 |
+
"-b",
|
| 182 |
+
"--backend",
|
| 183 |
+
default=["auto"],
|
| 184 |
+
type=click.Choice(
|
| 185 |
+
[
|
| 186 |
+
"auto",
|
| 187 |
+
"all",
|
| 188 |
+
"bing",
|
| 189 |
+
"brave",
|
| 190 |
+
"duckduckgo",
|
| 191 |
+
"google",
|
| 192 |
+
"grokipedia",
|
| 193 |
+
"mojeek",
|
| 194 |
+
"yandex",
|
| 195 |
+
"yahoo",
|
| 196 |
+
"wikipedia",
|
| 197 |
+
],
|
| 198 |
+
),
|
| 199 |
+
multiple=True,
|
| 200 |
+
callback=_convert_tuple_to_csv,
|
| 201 |
+
)
|
| 202 |
+
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
|
| 203 |
+
@click.option("-d", "--download", is_flag=True, default=False, help="download results. -dd to set custom directory")
|
| 204 |
+
@click.option("-dd", "--download-directory", help="Specify custom download directory")
|
| 205 |
+
@click.option("-th", "--threads", default=10, help="download threads, default=10")
|
| 206 |
+
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
|
| 207 |
+
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
|
| 208 |
+
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
|
| 209 |
+
def text(
|
| 210 |
+
query: str,
|
| 211 |
+
keywords: str | None, # deprecated
|
| 212 |
+
region: str,
|
| 213 |
+
safesearch: str,
|
| 214 |
+
timelimit: str | None,
|
| 215 |
+
max_results: int | None,
|
| 216 |
+
page: int,
|
| 217 |
+
backend: str,
|
| 218 |
+
output: str | None,
|
| 219 |
+
download_directory: str | None,
|
| 220 |
+
threads: int,
|
| 221 |
+
proxy: str | None,
|
| 222 |
+
*,
|
| 223 |
+
download: bool,
|
| 224 |
+
verify: bool,
|
| 225 |
+
no_color: bool,
|
| 226 |
+
) -> None:
|
| 227 |
+
"""CLI function to perform a DDGS text metasearch."""
|
| 228 |
+
data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).text(
|
| 229 |
+
query=query,
|
| 230 |
+
keywords=keywords, # deprecated
|
| 231 |
+
region=region,
|
| 232 |
+
safesearch=safesearch,
|
| 233 |
+
timelimit=timelimit,
|
| 234 |
+
max_results=max_results,
|
| 235 |
+
page=page,
|
| 236 |
+
backend=backend,
|
| 237 |
+
)
|
| 238 |
+
query = _sanitize_query(query)
|
| 239 |
+
if output:
|
| 240 |
+
_save_data(query, data, "text", filename=output)
|
| 241 |
+
if download:
|
| 242 |
+
_download_results(
|
| 243 |
+
query,
|
| 244 |
+
data,
|
| 245 |
+
function_name="text",
|
| 246 |
+
proxy=proxy,
|
| 247 |
+
threads=threads,
|
| 248 |
+
verify=verify,
|
| 249 |
+
pathname=download_directory,
|
| 250 |
+
)
|
| 251 |
+
if not output and not download:
|
| 252 |
+
_print_data(data, no_color=no_color)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
@cli.command()
|
| 256 |
+
@click.option("-q", "--query", help="images search query")
|
| 257 |
+
@click.option("-k", "--keywords", help="(Deprecated) images search query") # deprecated
|
| 258 |
+
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
|
| 259 |
+
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
|
| 260 |
+
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m", "y"]))
|
| 261 |
+
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
|
| 262 |
+
@click.option("-p", "--page", default=1, type=int, help="page number of results")
|
| 263 |
+
@click.option(
|
| 264 |
+
"-b",
|
| 265 |
+
"--backend",
|
| 266 |
+
default=["auto"],
|
| 267 |
+
type=click.Choice(["auto", "all", "duckduckgo"]),
|
| 268 |
+
multiple=True,
|
| 269 |
+
callback=_convert_tuple_to_csv,
|
| 270 |
+
)
|
| 271 |
+
@click.option("-size", "--size", type=click.Choice(["Small", "Medium", "Large", "Wallpaper"]))
|
| 272 |
+
@click.option(
|
| 273 |
+
"-c",
|
| 274 |
+
"--color",
|
| 275 |
+
type=click.Choice(
|
| 276 |
+
[
|
| 277 |
+
"color",
|
| 278 |
+
"Monochrome",
|
| 279 |
+
"Red",
|
| 280 |
+
"Orange",
|
| 281 |
+
"Yellow",
|
| 282 |
+
"Green",
|
| 283 |
+
"Blue",
|
| 284 |
+
"Purple",
|
| 285 |
+
"Pink",
|
| 286 |
+
"Brown",
|
| 287 |
+
"Black",
|
| 288 |
+
"Gray",
|
| 289 |
+
"Teal",
|
| 290 |
+
"White",
|
| 291 |
+
],
|
| 292 |
+
),
|
| 293 |
+
)
|
| 294 |
+
@click.option("-type", "--type_image", type=click.Choice(["photo", "clipart", "gif", "transparent", "line"]))
|
| 295 |
+
@click.option("-l", "--layout", type=click.Choice(["Square", "Tall", "Wide"]))
|
| 296 |
+
@click.option(
|
| 297 |
+
"-lic",
|
| 298 |
+
"--license_image",
|
| 299 |
+
type=click.Choice(["any", "Public", "Share", "ShareCommercially", "Modify", "ModifyCommercially"]),
|
| 300 |
+
)
|
| 301 |
+
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
|
| 302 |
+
@click.option("-d", "--download", is_flag=True, default=False, help="download results. -dd to set custom directory")
|
| 303 |
+
@click.option("-dd", "--download-directory", help="Specify custom download directory")
|
| 304 |
+
@click.option("-th", "--threads", default=10, help="download threads, default=10")
|
| 305 |
+
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
|
| 306 |
+
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
|
| 307 |
+
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
|
| 308 |
+
def images(
|
| 309 |
+
query: str,
|
| 310 |
+
keywords: str | None, # deprecated
|
| 311 |
+
region: str,
|
| 312 |
+
safesearch: str,
|
| 313 |
+
timelimit: str | None,
|
| 314 |
+
max_results: int | None,
|
| 315 |
+
page: int,
|
| 316 |
+
backend: str,
|
| 317 |
+
size: str | None,
|
| 318 |
+
color: str | None,
|
| 319 |
+
type_image: str | None,
|
| 320 |
+
layout: str | None,
|
| 321 |
+
license_image: str | None,
|
| 322 |
+
download_directory: str | None,
|
| 323 |
+
threads: int,
|
| 324 |
+
output: str | None,
|
| 325 |
+
proxy: str | None,
|
| 326 |
+
*,
|
| 327 |
+
download: bool,
|
| 328 |
+
verify: bool,
|
| 329 |
+
no_color: bool,
|
| 330 |
+
) -> None:
|
| 331 |
+
"""CLI function to perform a DDGS images metasearch."""
|
| 332 |
+
data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).images(
|
| 333 |
+
query=query,
|
| 334 |
+
keywords=keywords, # deprecated
|
| 335 |
+
region=region,
|
| 336 |
+
safesearch=safesearch,
|
| 337 |
+
timelimit=timelimit,
|
| 338 |
+
max_results=max_results,
|
| 339 |
+
page=page,
|
| 340 |
+
backend=backend,
|
| 341 |
+
size=size,
|
| 342 |
+
color=color,
|
| 343 |
+
type_image=type_image,
|
| 344 |
+
layout=layout,
|
| 345 |
+
license_image=license_image,
|
| 346 |
+
)
|
| 347 |
+
query = _sanitize_query(query)
|
| 348 |
+
if output:
|
| 349 |
+
_save_data(query, data, function_name="images", filename=output)
|
| 350 |
+
if download:
|
| 351 |
+
_download_results(
|
| 352 |
+
query,
|
| 353 |
+
data,
|
| 354 |
+
function_name="images",
|
| 355 |
+
proxy=proxy,
|
| 356 |
+
threads=threads,
|
| 357 |
+
verify=verify,
|
| 358 |
+
pathname=download_directory,
|
| 359 |
+
)
|
| 360 |
+
if not output and not download:
|
| 361 |
+
_print_data(data, no_color=no_color)
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
@cli.command()
|
| 365 |
+
@click.option("-q", "--query", help="videos search query")
|
| 366 |
+
@click.option("-k", "--keywords", help="(Deprecated) videos search query") # deprecated
|
| 367 |
+
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
|
| 368 |
+
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
|
| 369 |
+
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m"]), help="day, week, month")
|
| 370 |
+
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
|
| 371 |
+
@click.option("-p", "--page", default=1, type=int, help="page number of results")
|
| 372 |
+
@click.option(
|
| 373 |
+
"-b",
|
| 374 |
+
"--backend",
|
| 375 |
+
default=["auto"],
|
| 376 |
+
type=click.Choice(["auto", "all", "duckduckgo"]),
|
| 377 |
+
multiple=True,
|
| 378 |
+
callback=_convert_tuple_to_csv,
|
| 379 |
+
)
|
| 380 |
+
@click.option("-res", "--resolution", type=click.Choice(["high", "standart"]))
|
| 381 |
+
@click.option("-d", "--duration", type=click.Choice(["short", "medium", "long"]))
|
| 382 |
+
@click.option("-lic", "--license_videos", type=click.Choice(["creativeCommon", "youtube"]))
|
| 383 |
+
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
|
| 384 |
+
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
|
| 385 |
+
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
|
| 386 |
+
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
|
| 387 |
+
def videos(
|
| 388 |
+
query: str,
|
| 389 |
+
keywords: str | None, # deprecated
|
| 390 |
+
region: str,
|
| 391 |
+
safesearch: str,
|
| 392 |
+
timelimit: str | None,
|
| 393 |
+
max_results: int | None,
|
| 394 |
+
page: int,
|
| 395 |
+
backend: str,
|
| 396 |
+
resolution: str | None,
|
| 397 |
+
duration: str | None,
|
| 398 |
+
license_videos: str | None,
|
| 399 |
+
output: str | None,
|
| 400 |
+
proxy: str | None,
|
| 401 |
+
*,
|
| 402 |
+
verify: bool,
|
| 403 |
+
no_color: bool,
|
| 404 |
+
) -> None:
|
| 405 |
+
"""CLI function to perform a DDGS videos metasearch."""
|
| 406 |
+
data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).videos(
|
| 407 |
+
query=query,
|
| 408 |
+
keywords=keywords, # deprecated
|
| 409 |
+
region=region,
|
| 410 |
+
safesearch=safesearch,
|
| 411 |
+
timelimit=timelimit,
|
| 412 |
+
max_results=max_results,
|
| 413 |
+
page=page,
|
| 414 |
+
backend=backend,
|
| 415 |
+
resolution=resolution,
|
| 416 |
+
duration=duration,
|
| 417 |
+
license_videos=license_videos,
|
| 418 |
+
)
|
| 419 |
+
query = _sanitize_query(query)
|
| 420 |
+
if output:
|
| 421 |
+
_save_data(query, data, function_name="videos", filename=output)
|
| 422 |
+
else:
|
| 423 |
+
_print_data(data, no_color=no_color)
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
@cli.command()
|
| 427 |
+
@click.option("-q", "--query", help="news search query")
|
| 428 |
+
@click.option("-k", "--keywords", help="(Deprecated) news search query") # deprecated
|
| 429 |
+
@click.option("-r", "--region", default="us-en", help="us-en, ru-ru, etc.")
|
| 430 |
+
@click.option("-s", "--safesearch", default="moderate", type=click.Choice(["on", "moderate", "off"]))
|
| 431 |
+
@click.option("-t", "--timelimit", type=click.Choice(["d", "w", "m", "y"]), help="day, week, month, year")
|
| 432 |
+
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
|
| 433 |
+
@click.option("-p", "--page", default=1, type=int, help="page number of results")
|
| 434 |
+
@click.option(
|
| 435 |
+
"-b",
|
| 436 |
+
"--backend",
|
| 437 |
+
default=["auto"],
|
| 438 |
+
type=click.Choice(["auto", "all", "bing", "duckduckgo", "yahoo"]),
|
| 439 |
+
multiple=True,
|
| 440 |
+
callback=_convert_tuple_to_csv,
|
| 441 |
+
)
|
| 442 |
+
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
|
| 443 |
+
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
|
| 444 |
+
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
|
| 445 |
+
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
|
| 446 |
+
def news(
|
| 447 |
+
query: str,
|
| 448 |
+
keywords: str | None, # deprecated
|
| 449 |
+
region: str,
|
| 450 |
+
safesearch: str,
|
| 451 |
+
timelimit: str | None,
|
| 452 |
+
max_results: int | None,
|
| 453 |
+
page: int,
|
| 454 |
+
backend: str,
|
| 455 |
+
output: str | None,
|
| 456 |
+
proxy: str | None,
|
| 457 |
+
*,
|
| 458 |
+
verify: bool,
|
| 459 |
+
no_color: bool,
|
| 460 |
+
) -> None:
|
| 461 |
+
"""CLI function to perform a DDGS news metasearch."""
|
| 462 |
+
data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).news(
|
| 463 |
+
query=query,
|
| 464 |
+
keywords=keywords, # deprecated
|
| 465 |
+
region=region,
|
| 466 |
+
safesearch=safesearch,
|
| 467 |
+
timelimit=timelimit,
|
| 468 |
+
max_results=max_results,
|
| 469 |
+
page=page,
|
| 470 |
+
backend=backend,
|
| 471 |
+
)
|
| 472 |
+
query = _sanitize_query(query)
|
| 473 |
+
if output:
|
| 474 |
+
_save_data(query, data, function_name="news", filename=output)
|
| 475 |
+
else:
|
| 476 |
+
_print_data(data, no_color=no_color)
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
@cli.command()
|
| 480 |
+
@click.option("-q", "--query", help="books search query")
|
| 481 |
+
@click.option("-k", "--keywords", help="(Deprecated) books search query") # deprecated
|
| 482 |
+
@click.option("-m", "--max_results", default=10, type=int, help="maximum number of results")
|
| 483 |
+
@click.option("-p", "--page", default=1, type=int, help="page number of results")
|
| 484 |
+
@click.option(
|
| 485 |
+
"-b",
|
| 486 |
+
"--backend",
|
| 487 |
+
default=["auto"],
|
| 488 |
+
type=click.Choice(["auto", "all", "annasarchive"]),
|
| 489 |
+
multiple=True,
|
| 490 |
+
callback=_convert_tuple_to_csv,
|
| 491 |
+
)
|
| 492 |
+
@click.option("-o", "--output", help="csv, json or filename.csv|json (save the results to a csv or json file)")
|
| 493 |
+
@click.option("-pr", "--proxy", help="the proxy to send requests, example: socks5h://127.0.0.1:9150")
|
| 494 |
+
@click.option("-v", "--verify", default=True, help="verify SSL when making the request")
|
| 495 |
+
@click.option("-nc", "--no-color", is_flag=True, default=False, help="disable color output")
|
| 496 |
+
def books(
|
| 497 |
+
query: str,
|
| 498 |
+
keywords: str | None, # deprecated
|
| 499 |
+
max_results: int | None,
|
| 500 |
+
page: int,
|
| 501 |
+
backend: str,
|
| 502 |
+
output: str | None,
|
| 503 |
+
proxy: str | None,
|
| 504 |
+
*,
|
| 505 |
+
verify: bool,
|
| 506 |
+
no_color: bool,
|
| 507 |
+
) -> None:
|
| 508 |
+
"""CLI function to perform a DDGS books metasearch."""
|
| 509 |
+
data = DDGS(proxy=_expand_proxy_tb_alias(proxy), verify=verify).books(
|
| 510 |
+
query=query,
|
| 511 |
+
keywords=keywords, # deprecated
|
| 512 |
+
max_results=max_results,
|
| 513 |
+
page=page,
|
| 514 |
+
backend=backend,
|
| 515 |
+
)
|
| 516 |
+
if output:
|
| 517 |
+
_save_data(query, data, function_name="books", filename=output)
|
| 518 |
+
else:
|
| 519 |
+
_print_data(data, no_color=no_color)
|
| 520 |
+
|
| 521 |
+
|
| 522 |
+
if __name__ == "__main__":
|
| 523 |
+
safe_entry_point()
|
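Because the commands above are ordinary click commands, they can be exercised in-process without a shell. A minimal sketch using click's test runner, assuming the `cli` group defined earlier in `ddgs/cli.py` and outbound network access (the invocation performs a real metasearch):

```python
# Minimal sketch: drive the `news` command in-process via click's CliRunner.
from click.testing import CliRunner

from ddgs.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["news", "-q", "climate", "-m", "5", "-b", "duckduckgo"])
print(result.exit_code)     # 0 on success
print(result.output[:500])  # first part of the rendered results table
```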
ddgs/ddgs.py
ADDED
@@ -0,0 +1,234 @@
"""DDGS class implementation."""

import logging
import os
from concurrent.futures import ThreadPoolExecutor, wait
from math import ceil
from random import random, shuffle
from types import TracebackType
from typing import Any, ClassVar

from .base import BaseSearchEngine
from .engines import ENGINES
from .exceptions import DDGSException, TimeoutException
from .results import ResultsAggregator
from .similarity import SimpleFilterRanker
from .utils import _expand_proxy_tb_alias

logger = logging.getLogger(__name__)


class DDGS:
    """DDGS | Dux Distributed Global Search.

    A metasearch library that aggregates results from diverse web search services.

    Args:
        proxy: The proxy to use for the search. Defaults to None.
        timeout: The timeout for the search. Defaults to 5.
        verify: bool (True to verify, False to skip) or str path to a PEM file. Defaults to True.

    Attributes:
        threads: The number of threads to use for the search. Defaults to None (automatic).
        _executor: The ThreadPoolExecutor instance.

    Raises:
        DDGSException: If an error occurs during the search.

    Example:
        >>> from ddgs import DDGS
        >>> results = DDGS().text("python")

    """

    threads: ClassVar[int | None] = None
    _executor: ClassVar[ThreadPoolExecutor | None] = None

    def __init__(self, proxy: str | None = None, timeout: int | None = 5, *, verify: bool | str = True) -> None:
        self._proxy = _expand_proxy_tb_alias(proxy) or os.environ.get("DDGS_PROXY")
        self._timeout = timeout
        self._verify = verify
        self._engines_cache: dict[
            type[BaseSearchEngine[Any]], BaseSearchEngine[Any]
        ] = {}  # dict[engine_class, engine_instance]

    def __enter__(self) -> "DDGS":
        """Enter the context manager and return the DDGS instance."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None = None,
        exc_val: BaseException | None = None,
        exc_tb: TracebackType | None = None,
    ) -> None:
        """Exit the context manager."""

    @classmethod
    def get_executor(cls) -> ThreadPoolExecutor:
        """Get a ThreadPoolExecutor instance and cache it."""
        if cls._executor is None:
            cls._executor = ThreadPoolExecutor(max_workers=cls.threads, thread_name_prefix="DDGS")
        return cls._executor

    def _get_engines(
        self,
        category: str,
        backend: str,
    ) -> list[BaseSearchEngine[Any]]:
        """Retrieve a list of search engine instances for a given category and backend.

        Args:
            category: The category of search engines (e.g., 'text', 'images', etc.).
            backend: A single backend or a comma-delimited list of backends. Defaults to "auto".

        Returns:
            A list of initialized search engine instances corresponding to the specified
            category and backend. Instances are cached for reuse.

        """
        if isinstance(backend, list):  # deprecated
            backend = ",".join(backend)
        backend_list = [x.strip() for x in backend.split(",")]
        engine_keys = list(ENGINES[category].keys())
        shuffle(engine_keys)
        if "auto" in backend_list or "all" in backend_list:
            keys = engine_keys
            if category == "text":
                keys = ["wikipedia", "grokipedia"] + [k for k in keys if k not in ("wikipedia", "grokipedia")]
        else:
            keys = backend_list

        try:
            engine_classes = [ENGINES[category][key] for key in keys]
            # Initialize and cache engine instances
            instances = []
            for engine_class in engine_classes:
                # If already cached, use the cached instance
                if engine_class in self._engines_cache:
                    instances.append(self._engines_cache[engine_class])
                # If not cached, create a new instance
                else:
                    engine_instance = engine_class(proxy=self._proxy, timeout=self._timeout, verify=self._verify)
                    self._engines_cache[engine_class] = engine_instance
                    instances.append(engine_instance)

            # Sort by `engine.priority`, breaking ties randomly
            instances.sort(key=lambda e: (e.priority, random()), reverse=True)
        except KeyError as ex:
            logger.warning(
                "%r - backend does not exist or is disabled. Available: %s. Using 'auto'",
                ex,
                ", ".join(sorted(engine_keys)),
            )
            return self._get_engines(category, "auto")
        else:
            return instances

    def _search(  # noqa: C901
        self,
        category: str,
        query: str,
        keywords: str | None = None,  # deprecated
        *,
        region: str = "us-en",
        safesearch: str = "moderate",
        timelimit: str | None = None,
        max_results: int | None = 10,
        page: int = 1,
        backend: str = "auto",
        **kwargs: str,
    ) -> list[dict[str, Any]]:
        """Perform a search across engines in the given category.

        Args:
            category: The category of search engines (e.g., 'text', 'images', etc.).
            query: The search query.
            keywords: Deprecated alias for `query`.
            region: The region to use for the search (e.g., us-en, uk-en, ru-ru, etc.).
            safesearch: The safesearch setting (e.g., on, moderate, off).
            timelimit: The timelimit for the search (e.g., d, w, m, y) or custom date range.
            max_results: The maximum number of results to return. Defaults to 10.
            page: The page of results to return. Defaults to 1.
            backend: A single backend or a comma-delimited list of backends. Defaults to "auto".
            **kwargs: Additional keyword arguments to pass to the search engines.

        Returns:
            A list of dictionaries containing the search results.

        """
        query = keywords or query
        if not query:
            msg = "query is mandatory."
            raise DDGSException(msg)

        engines = self._get_engines(category, backend)
        len_unique_providers = len({engine.provider for engine in engines})
        seen_providers: set[str] = set()

        # Perform search
        results_aggregator: ResultsAggregator[set[str]] = ResultsAggregator({"href", "image", "url", "embed_url"})
        max_workers = min(len_unique_providers, ceil(max_results / 10) + 1) if max_results else len_unique_providers
        executor = self.get_executor()
        futures, err = {}, None
        for i, engine in enumerate(engines, start=1):
            if engine.provider in seen_providers:
                continue
            future = executor.submit(
                engine.search,
                query,
                region=region,
                safesearch=safesearch,
                timelimit=timelimit,
                page=page,
                **kwargs,
            )
            futures[future] = engine

            if len(futures) >= max_workers or i >= max_workers:
                done, not_done = wait(futures, timeout=self._timeout, return_when="FIRST_EXCEPTION")
                for f, f_engine in futures.items():
                    if f in done:
                        try:
                            if r := f.result():
                                results_aggregator.extend(r)
                                seen_providers.add(f_engine.provider)
                        except Exception as ex:  # noqa: BLE001
                            err = ex
                            logger.info("Error in engine %s: %r", f_engine.name, ex)
                futures = {f: futures[f] for f in not_done}

            if max_results and len(results_aggregator) >= max_results:
                break

        results = results_aggregator.extract_dicts()
        # Rank results
        ranker = SimpleFilterRanker()
        results = ranker.rank(results, query)

        if results:
            return results[:max_results] if max_results else results

        if "timed out" in f"{err}":
            raise TimeoutException(err)
        raise DDGSException(err or "No results found.")

    def text(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a text search."""
        return self._search("text", query, **kwargs)

    def images(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform an image search."""
        return self._search("images", query, **kwargs)

    def news(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a news search."""
        return self._search("news", query, **kwargs)

    def videos(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a video search."""
        return self._search("videos", query, **kwargs)

    def books(self, query: str, **kwargs: Any) -> list[dict[str, Any]]:  # noqa: ANN401
        """Perform a book search."""
        return self._search("books", query, **kwargs)
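For orientation, here is how the class above is typically driven. A minimal sketch using only the public methods defined in this file; the backend names and result keys follow the signatures and aggregator keys above:

```python
from ddgs import DDGS

# Context-manager use; proxy/timeout/verify are the constructor args above.
with DDGS(timeout=10) as ddgs:
    # Comma-delimited backends are split apart in _get_engines().
    results = ddgs.text("open source metasearch", backend="wikipedia,google", max_results=5)
    for r in results:
        print(r.get("title"), "->", r.get("href"))
```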
ddgs/engines/__init__.py
ADDED
@@ -0,0 +1,94 @@
"""Automatically build the registry of search engines.

This module defines the module-level variable ENGINES, which is a dictionary
of dictionaries. The keys of the outer dictionary are the categories of search
engines, and the keys of the inner dictionaries are the names of the search
engines. The values of the inner dictionaries are the classes of the search
engines.

Search engines are discovered automatically: a class is registered if it is a
subclass of :class:`ddgs.base.BaseSearchEngine`, its name does not start with
"Base", and it does not have a class attribute "disabled" set to True.

The module builds the ENGINES dictionary automatically, so it should not be
imported directly by user code.

Example of resulting dictionary ENGINES:

from .bing import Bing
from .brave import Brave
from .duckduckgo import Duckduckgo
from .duckduckgo_images import DuckduckgoImages
from .duckduckgo_news import DuckduckgoNews
from .duckduckgo_videos import DuckduckgoVideos
from .google import Google
from .mojeek import Mojeek
from .wikipedia import Wikipedia
from .yahoo import Yahoo
from .yandex import Yandex

ENGINES: dict[str, dict[str, type[BaseSearchEngine[Any]]]] = {
    "text": {
        "bing": Bing,
        "brave": Brave,
        "duckduckgo": Duckduckgo,  # bing
        "google": Google,
        "mojeek": Mojeek,
        "yahoo": Yahoo,  # bing
        "yandex": Yandex,
        "wikipedia": Wikipedia,
    },
    "images": {
        "duckduckgo": DuckduckgoImages,
    },
    "news": {
        "duckduckgo": DuckduckgoNews,
    },
    "videos": {
        "duckduckgo": DuckduckgoVideos,
    },
}
"""

import importlib
import inspect
import pkgutil
from collections import defaultdict
from typing import Any

from ddgs.base import BaseSearchEngine

# ENGINES[category][name] = class
ENGINES: dict[str, dict[str, type[BaseSearchEngine[Any]]]] = defaultdict(dict)

package_name = __name__
package = importlib.import_module(package_name)

for finder, modname, _ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
    module_path = finder.path if hasattr(finder, "path") else finder
    module = importlib.import_module(modname)
    for _, cls in inspect.getmembers(module, inspect.isclass):
        # 1) must subclass BaseSearchEngine (but not the base itself)
        if not issubclass(cls, BaseSearchEngine) or cls is BaseSearchEngine:
            continue

        # 2) skip any class whose name starts with "Base"
        if cls.__name__.startswith("Base"):
            continue

        # 3) skip disabled engines
        if getattr(cls, "disabled", False):
            continue

        # 4) ensure they provided name & category
        name = getattr(cls, "name", None)
        category = getattr(cls, "category", None)
        if not isinstance(name, str) or not isinstance(category, str):
            msg = f"{cls.__qualname__} must define class attributes 'name: str' and 'category: str'."
            raise TypeError(msg)

        ENGINES[category][name] = cls

# freeze into normal dicts
ENGINES = {cat: dict(m) for cat, m in ENGINES.items()}
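Given the discovery rules above, adding an engine is a matter of dropping a module with a conforming class into `ddgs/engines/`. A hypothetical minimal sketch (the class name, URL, and xpaths are illustrative placeholders, not part of the library):

```python
# Hypothetical engine, saved as e.g. ddgs/engines/example.py. It is picked up
# automatically because it subclasses BaseSearchEngine, its name does not
# start with "Base", and it is not disabled.
from typing import Any

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult


class Example(BaseSearchEngine[TextResult]):
    """Illustrative engine; URL and xpaths are placeholders."""

    name = "example"
    category = "text"
    provider = "example"

    search_url = "https://search.example.com/search"
    search_method = "GET"

    items_xpath = "//div[@class='result']"
    elements_xpath = {  # noqa: RUF012
        "title": ".//h3//text()",
        "href": ".//a/@href",
        "body": ".//p//text()",
    }

    def build_payload(self, query: str, region: str, safesearch: str,
                      timelimit: str | None, page: int = 1, **kwargs: str) -> dict[str, Any]:
        """Build query parameters for the request."""
        return {"q": query, "page": f"{page}"}
```

After an interpreter restart, `ENGINES["text"]["example"]` would resolve to this class, and `backend="example"` would route to it.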
ddgs/engines/annasarchive.py
ADDED
@@ -0,0 +1,51 @@
"""Anna's Archive search engine implementation."""

from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import BooksResult


class AnnasArchive(BaseSearchEngine[BooksResult]):
    """Anna's Archive search engine."""

    name = "annasarchive"
    category = "books"
    provider = "annasarchive"

    search_url = "https://annas-archive.li/search"
    search_method = "GET"

    items_xpath = "//div[contains(@class, 'record-list-outer')]/div"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//a[contains(@class, 'text-lg')]//text()",
        "author": ".//a[span[contains(@class, 'user')]]//text()",
        "publisher": ".//a[span[contains(@class, 'company')]]//text()",
        "info": ".//div[contains(@class, 'text-gray-800')]/text()",
        "url": "./a/@href",
        "thumbnail": ".//img/@src",
    }

    def build_payload(
        self,
        query: str,
        region: str,  # noqa: ARG002
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        return {"q": query, "page": f"{page}"}

    def pre_process_html(self, html_text: str) -> str:
        """Pre-process the HTML text before parsing it."""
        return html_text.replace("<!--", "").replace("-->", "")

    def post_extract_results(self, results: list[BooksResult]) -> list[BooksResult]:
        """Post-process search results."""
        base_url = self.search_url.split("/search")[0]
        for result in results:
            result.url = f"{base_url}{result.url}"
        return results
ddgs/engines/bing.py
ADDED
@@ -0,0 +1,85 @@
"""Bing search engine implementation."""

import base64
from collections.abc import Mapping
from time import time
from typing import Any, ClassVar
from urllib.parse import parse_qs, urlparse

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult


def unwrap_bing_url(raw_url: str) -> str | None:
    """Decode the Bing-wrapped raw_url to extract the original url."""
    parsed = urlparse(raw_url)
    u_vals = parse_qs(parsed.query).get("u", [])
    if not u_vals:
        return None

    u = u_vals[0]
    if len(u) <= 2:
        return None

    # Drop the first two characters, pad to a multiple of 4, then decode
    b64_part = u[2:]
    padding = "=" * (-len(b64_part) % 4)
    decoded = base64.urlsafe_b64decode(b64_part + padding)
    return decoded.decode()


class Bing(BaseSearchEngine[TextResult]):
    """Bing search engine."""

    disabled = True  # !!!

    name = "bing"
    category = "text"
    provider = "bing"

    search_url = "https://www.bing.com/search"
    search_method = "GET"

    items_xpath = "//li[contains(@class, 'b_algo')]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h2/a//text()",
        "href": ".//h2/a/@href",
        "body": ".//p//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the Bing search request."""
        country, lang = region.lower().split("-")
        payload = {"q": query, "pq": query, "cc": lang}
        cookies = {
            "_EDGE_CD": f"m={lang}-{country}&u={lang}-{country}",
            "_EDGE_S": f"mkt={lang}-{country}&ui={lang}-{country}",
        }
        self.http_client.client.set_cookies("https://www.bing.com", cookies)
        if timelimit:
            d = int(time() // 86400)
            code = f"ez5_{d - 365}_{d}" if timelimit == "y" else "ez" + {"d": "1", "w": "2", "m": "3"}[timelimit]
            payload["filters"] = f'ex1:"{code}"'
        if page > 1:
            payload["first"] = f"{(page - 1) * 10}"
            payload["FORM"] = f"PERE{page - 2 if page > 2 else ''}"
        return payload

    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
        """Post-process search results."""
        post_results = []
        for result in results:
            if result.href.startswith("https://www.bing.com/aclick?"):
                continue
            if result.href.startswith("https://www.bing.com/ck/a?"):
                result.href = unwrap_bing_url(result.href) or result.href
            post_results.append(result)
        return post_results
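To see the redirect unwrapping in isolation, one can round-trip a synthetic `/ck/a?` link; the `a1` prefix and the token below are fabricated for illustration, mimicking the two-character prefix the decoder strips:

```python
import base64

from ddgs.engines.bing import unwrap_bing_url

target = "https://example.com/page?x=1"
# Bing-style wrapping: a two-char prefix plus unpadded urlsafe base64 of the URL.
token = base64.urlsafe_b64encode(target.encode()).decode().rstrip("=")
wrapped = f"https://www.bing.com/ck/a?u=a1{token}&p=other"
assert unwrap_bing_url(wrapped) == target
```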
ddgs/engines/bing_news.py
ADDED
@@ -0,0 +1,86 @@
"""Bing news engine implementation."""

import re
from collections.abc import Mapping
from contextlib import suppress
from datetime import datetime, timedelta, timezone
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import NewsResult

DATE_RE = re.compile(r"\b(\d+)\s*(days|tagen|jours|giorni|dias|días|дн\.|день)?\b", re.IGNORECASE)


def extract_date(pub_date_str: str) -> str:
    """Extract date from string."""
    # Try parsing the date with predefined formats
    date_formats = ["%d.%m.%Y", "%m/%d/%Y", "%d/%m/%Y"]
    for date_format in date_formats:
        with suppress(ValueError):
            return datetime.strptime(pub_date_str, date_format).astimezone(timezone.utc).isoformat()

    # Search for relative date expressions
    match = DATE_RE.search(pub_date_str)
    if match:
        days_ago = int(match.group(1))
        return (datetime.now(timezone.utc) - timedelta(days=days_ago)).replace(microsecond=0).isoformat()

    # Return the original string if no date is found
    return pub_date_str


class BingNews(BaseSearchEngine[NewsResult]):
    """Bing news engine."""

    name = "bing"
    category = "news"
    provider = "bing"

    search_url = "https://www.bing.com/news/infinitescrollajax"
    search_method = "GET"

    items_xpath = "//div[contains(@class, 'newsitem')]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "date": ".//span[@aria-label]//@aria-label",
        "title": "@data-title",
        "body": ".//div[@class='snippet']//text()",
        "url": "@url",
        "image": ".//a[contains(@class, 'image')]//@src",
        "source": "@data-author",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the Bing search request."""
        country, lang = region.lower().split("-")
        payload = {
            "q": query,
            "InfiniteScroll": "1",
            "first": f"{page * 10 + 1}",
            "SFX": f"{page}",
            "cc": country,
            "setlang": lang,
        }
        if timelimit:
            payload["qft"] = {
                "d": 'interval="4"',  # doesn't exist so it's the same as one hour
                "w": 'interval="7"',
                "m": 'interval="9"',
                "y": 'interval="9"',  # doesn't exist so it's the same as month
            }[timelimit]
        return payload

    def post_extract_results(self, results: list[NewsResult]) -> list[NewsResult]:
        """Post-process search results."""
        for result in results:
            result.date = extract_date(result.date)
            result.image = f"https://www.bing.com{result.image.split('&')[0]}" if result.image else ""
        return results
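`extract_date` handles both absolute dates and the multilingual relative forms matched by `DATE_RE`; a quick demonstration (the relative outputs depend on the current date, and the absolute parse is naive, so it is rendered in UTC via the local timezone):

```python
from ddgs.engines.bing_news import extract_date

print(extract_date("31.12.2024"))   # absolute date -> ISO-8601 timestamp in UTC
print(extract_date("3 days"))       # relative -> ISO timestamp three days ago
print(extract_date("vor 2 Tagen"))  # German relative form also matches DATE_RE
print(extract_date("just now"))     # no digits -> returned unchanged
```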
ddgs/engines/brave.py
ADDED
@@ -0,0 +1,47 @@
"""Brave search engine implementation."""

from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult


class Brave(BaseSearchEngine[TextResult]):
    """Brave search engine."""

    name = "brave"
    category = "text"
    provider = "brave"

    search_url = "https://search.brave.com/search"
    search_method = "GET"

    items_xpath = "//div[@data-type='web']"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//div[(contains(@class,'title') or contains(@class,'sitename-container')) and position()=last()]//text()",  # noqa: E501
        "href": ".//a[div[contains(@class, 'title')]]/@href",
        "body": ".//div[contains(@class, 'snippet')]//div[contains(@class, 'content')]//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        payload = {"q": query, "source": "web"}
        country, _lang = region.lower().split("-")
        cookies = {"country": country, "useLocation": "0"}
        if safesearch != "moderate":
            cookies["safesearch"] = "strict" if safesearch == "on" else "off"
        self.http_client.client.set_cookies("https://search.brave.com", cookies)
        if timelimit:
            payload["tf"] = {"d": "pd", "w": "pw", "m": "pm", "y": "py"}[timelimit]
        if page > 1:
            payload["offset"] = f"{page - 1}"
        return payload
ddgs/engines/duckduckgo.py
ADDED
@@ -0,0 +1,56 @@
"""Duckduckgo search engine implementation."""

from collections.abc import Mapping
from typing import Any, ClassVar, TypeVar

from fake_useragent import UserAgent

from ddgs.base import BaseSearchEngine
from ddgs.http_client2 import HttpClient2
from ddgs.results import TextResult

ua = UserAgent()

T = TypeVar("T")


class Duckduckgo(BaseSearchEngine[TextResult]):
    """Duckduckgo search engine."""

    name = "duckduckgo"
    category = "text"
    provider = "bing"

    search_url = "https://html.duckduckgo.com/html/"
    search_method = "POST"

    items_xpath = "//div[contains(@class, 'body')]"
    elements_xpath: ClassVar[Mapping[str, str]] = {"title": ".//h2//text()", "href": "./a/@href", "body": "./a//text()"}

    headers: ClassVar[dict[str, str]] = {"User-Agent": ua.random}

    def __init__(self, proxy: str | None = None, timeout: int | None = None, *, verify: bool = True) -> None:
        """Temporary, delete when HttpClient is fixed."""
        self.http_client = HttpClient2(headers=self.headers, proxy=proxy, timeout=timeout, verify=verify)  # type: ignore[assignment]
        self.results: list[T] = []  # type: ignore[valid-type]

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        payload = {"q": query, "b": "", "l": region}
        if page > 1:
            payload["s"] = f"{10 + (page - 2) * 15}"
        if timelimit:
            payload["df"] = timelimit
        return payload

    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
        """Post-process search results."""
        return [r for r in results if not r.href.startswith("https://duckduckgo.com/y.js?")]
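The `s` offset formula in `build_payload` reflects the endpoint's uneven page sizes; a quick check of the values it produces (the page sizes are inferred from the formula itself, not documented by DuckDuckGo):

```python
# Offsets sent as "s" for pages 2..4: the first page has no offset,
# page 2 starts at 10, and each later page advances by 15.
for page in range(2, 5):
    print(page, 10 + (page - 2) * 15)  # 2 -> 10, 3 -> 25, 4 -> 40
```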
ddgs/engines/duckduckgo_images.py
ADDED
@@ -0,0 +1,85 @@
"""Duckduckgo images search engine implementation."""

import json
from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import ImagesResult
from ddgs.utils import _extract_vqd


class DuckduckgoImages(BaseSearchEngine[ImagesResult]):
    """Duckduckgo images search engine."""

    name = "duckduckgo"
    category = "images"
    provider = "bing"

    search_url = "https://duckduckgo.com/i.js"
    search_method = "GET"
    search_headers: ClassVar[Mapping[str, str]] = {"Referer": "https://duckduckgo.com/", "Sec-Fetch-Mode": "cors"}

    elements_replace: ClassVar[Mapping[str, str]] = {
        "title": "title",
        "image": "image",
        "thumbnail": "thumbnail",
        "url": "url",
        "height": "height",
        "width": "width",
        "source": "source",
    }

    def _get_vqd(self, query: str) -> str:
        """Get vqd value for a search query using DuckDuckGo."""
        resp_content = self.http_client.request("GET", "https://duckduckgo.com", params={"q": query}).content
        return _extract_vqd(resp_content, query)

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        safesearch_base = {"on": "1", "moderate": "1", "off": "-1"}
        timelimit_base = {"d": "Day", "w": "Week", "m": "Month", "y": "Year"}
        timelimit = f"time:{timelimit_base[timelimit]}" if timelimit else ""
        size = kwargs.get("size")
        size = f"size:{size}" if size else ""
        color = kwargs.get("color")
        color = f"color:{color}" if color else ""
        type_image = kwargs.get("type_image")
        type_image = f"type:{type_image}" if type_image else ""
        layout = kwargs.get("layout")
        layout = f"layout:{layout}" if layout else ""
        license_image = kwargs.get("license_image")
        license_image = f"license:{license_image}" if license_image else ""
        payload = {
            "o": "json",
            "q": query,
            "l": region,
            "vqd": self._get_vqd(query),
            "p": safesearch_base[safesearch.lower()],
        }
        if timelimit or size or color or type_image or layout or license_image:
            payload["f"] = f"{timelimit},{size},{color},{type_image},{layout},{license_image}"
        if page > 1:
            payload["s"] = f"{(page - 1) * 100}"
        return payload

    def extract_results(self, html_text: str) -> list[ImagesResult]:
        """Extract search results from the JSON response."""
        json_data = json.loads(html_text)
        items = json_data.get("results", [])
        results = []
        for item in items:
            result = ImagesResult()
            for key, value in self.elements_replace.items():
                data = item.get(key)
                setattr(result, value, data)
            results.append(result)
        return results
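The extra keyword filters above surface through `DDGS.images()` as plain kwargs. A minimal sketch; the filter values here are illustrative and simply get spliced into the `f` parameter as `build_payload` shows:

```python
from ddgs import DDGS

results = DDGS().images(
    "aurora borealis",
    safesearch="off",
    timelimit="m",           # becomes "time:Month" in the f-parameter
    size="Large",            # becomes "size:Large" (illustrative value)
    license_image="Public",  # becomes "license:Public" (illustrative value)
    max_results=10,
)
print(results[0]["image"], results[0]["width"], "x", results[0]["height"])
```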
ddgs/engines/duckduckgo_news.py
ADDED
@@ -0,0 +1,72 @@
"""Duckduckgo news search engine implementation."""

import json
from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import NewsResult
from ddgs.utils import _extract_vqd


class DuckduckgoNews(BaseSearchEngine[NewsResult]):
    """Duckduckgo news search engine."""

    name = "duckduckgo"
    category = "news"
    provider = "bing"

    search_url = "https://duckduckgo.com/news.js"
    search_method = "GET"

    elements_replace: ClassVar[Mapping[str, str]] = {
        "date": "date",
        "title": "title",
        "excerpt": "body",
        "url": "url",
        "image": "image",
        "source": "source",
    }

    def _get_vqd(self, query: str) -> str:
        """Get vqd value for a search query using DuckDuckGo."""
        resp_content = self.http_client.request("GET", "https://duckduckgo.com", params={"q": query}).content
        return _extract_vqd(resp_content, query)

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
        payload = {
            "l": region,
            "o": "json",
            "noamp": "1",
            "q": query,
            "vqd": self._get_vqd(query),
            "p": safesearch_base[safesearch.lower()],
        }
        if timelimit:
            payload["df"] = timelimit
        if page > 1:
            payload["s"] = f"{(page - 1) * 30}"
        return payload

    def extract_results(self, html_text: str) -> list[NewsResult]:
        """Extract search results from the JSON response."""
        json_data = json.loads(html_text)
        items = json_data.get("results", [])
        results = []
        for item in items:
            result = NewsResult()
            for key, value in self.elements_replace.items():
                data = item.get(key)
                setattr(result, value, data)
            results.append(result)
        return results
ddgs/engines/duckduckgo_videos.py
ADDED
@@ -0,0 +1,84 @@
"""Duckduckgo videos search engine implementation."""

import json
from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import VideosResult
from ddgs.utils import _extract_vqd


class DuckduckgoVideos(BaseSearchEngine[VideosResult]):
    """Duckduckgo videos search engine."""

    name = "duckduckgo"
    category = "videos"
    provider = "bing"

    search_url = "https://duckduckgo.com/v.js"
    search_method = "GET"

    elements_replace: ClassVar[Mapping[str, str]] = {
        "content": "content",
        "description": "description",
        "duration": "duration",
        "embed_html": "embed_html",
        "embed_url": "embed_url",
        "image_token": "image_token",
        "images": "images",
        "provider": "provider",
        "published": "published",
        "publisher": "publisher",
        "statistics": "statistics",
        "title": "title",
        "uploader": "uploader",
    }

    def _get_vqd(self, query: str) -> str:
        """Get vqd value for a search query using DuckDuckGo."""
        resp_content = self.http_client.request("GET", "https://duckduckgo.com", params={"q": query}).content
        return _extract_vqd(resp_content, query)

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
        timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
        resolution = kwargs.get("resolution")
        duration = kwargs.get("duration")
        license_videos = kwargs.get("license_videos")
        resolution = f"videoDefinition:{resolution}" if resolution else ""
        duration = f"videoDuration:{duration}" if duration else ""
        license_videos = f"videoLicense:{license_videos}" if license_videos else ""
        payload = {
            "l": region,
            "o": "json",
            "q": query,
            "vqd": self._get_vqd(query),
            "f": f"{timelimit},{resolution},{duration},{license_videos}",
            "p": safesearch_base[safesearch.lower()],
        }
        if page > 1:
            payload["s"] = f"{(page - 1) * 60}"
        return payload

    def extract_results(self, html_text: str) -> list[VideosResult]:
        """Extract search results from the JSON response."""
        json_data = json.loads(html_text)
        items = json_data.get("results", [])
        results = []
        for item in items:
            result = VideosResult()
            for key, value in self.elements_replace.items():
                data = item.get(key)
                setattr(result, value, data)
            results.append(result)
        return results
ddgs/engines/google.py
ADDED
@@ -0,0 +1,95 @@
"""Google search engine implementation."""

from collections.abc import Mapping
from random import SystemRandom
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult

random = SystemRandom()


def get_ua() -> str:
    """Return one random User-Agent string."""
    patterns = [
        "Opera/9.80 (J2ME/MIDP; Opera Mini/{v}/{b}; U; {l}) Presto/{p} Version/{f}",
        "Opera/9.80 (Android; Linux; Opera Mobi/{b}; U; {l}) Presto/{p} Version/{f}",
        "Opera/9.80 (iPhone; Opera Mini/{v}/{b}; U; {l}) Presto/{p} Version/{f}",
        "Opera/9.80 (iPad; Opera Mini/{v}/{b}; U; {l}) Presto/{p} Version/{f}",
    ]
    mini_versions = ["4.0", "5.0.17381", "7.1.32444", "9.80"]
    mobi_builds = ["27", "447", "ADR-1011151731"]
    builds = ["18.678", "24.743", "503"]
    prestos = ["2.6.35", "2.7.60", "2.8.119"]
    finals = ["10.00", "11.10", "12.16"]
    langs = ["en-US", "en-GB", "de-DE", "fr-FR", "es-ES", "ru-RU", "zh-CN"]
    fallback = "Opera/9.80 (iPad; Opera Mini/5.0.17381/503; U; eu) Presto/2.6.35 Version/11.10"

    try:
        p = random.choice(patterns)
        vals = {
            "l": random.choice(langs),
            "p": random.choice(prestos),
            "f": random.choice(finals),
        }
        if "{v}" in p:
            vals["v"] = random.choice(mini_versions)
        if "{b}" in p:
            vals["b"] = random.choice(mobi_builds) if "Opera Mobi" in p else random.choice(builds)
        return p.format(**vals)
    except Exception:  # noqa: BLE001
        return fallback


class Google(BaseSearchEngine[TextResult]):
    """Google search engine."""

    name = "google"
    category = "text"
    provider = "google"

    search_url = "https://www.google.com/search"
    search_method = "GET"
    search_headers: ClassVar[dict[str, str]] = {"User-Agent": get_ua()}

    items_xpath = "//div[div[@data-hveid]//div[h3]]"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h3//text()",
        "href": ".//a/@href",
        "body": "./div/div/div[2]//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the Google search request."""
        safesearch_base = {"on": "2", "moderate": "1", "off": "0"}
        start = (page - 1) * 10
        payload = {
            "q": query,
            "filter": safesearch_base[safesearch.lower()],
            "start": str(start),
        }
        country, lang = region.split("-")
        payload["hl"] = f"{lang}-{country.upper()}"  # interface language
        payload["lr"] = f"lang_{lang}"  # restricts to results written in a particular language
        payload["cr"] = f"country{country.upper()}"  # restricts to results originating in a particular country
        if timelimit:
            payload["tbs"] = f"qdr:{timelimit}"
        return payload

    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
        """Post-process search results."""
        post_results = []
        for result in results:
            if result.href.startswith("/url?q="):
                result.href = result.href.split("?q=")[1].split("&")[0]
            post_results.append(result)
        return post_results
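The `/url?q=` handling in `post_extract_results` is plain string surgery; in isolation, with a synthetic redirect link for illustration:

```python
# Synthetic Google redirect link, unwrapped the same way post_extract_results does.
href = "/url?q=https://example.com/article&sa=U&ved=abc123"
if href.startswith("/url?q="):
    href = href.split("?q=")[1].split("&")[0]
print(href)  # https://example.com/article
```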
ddgs/engines/grokipedia.py
ADDED
@@ -0,0 +1,49 @@
"""Grokipedia text search engine."""

import json
import logging
from typing import Any

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult

logger = logging.getLogger(__name__)


class Grokipedia(BaseSearchEngine[TextResult]):
    """Grokipedia text search engine."""

    name = "grokipedia"
    category = "text"
    provider = "grokipedia"
    priority = 1.9

    search_url = "https://grokipedia.com/api/typeahead"
    search_method = "GET"

    def build_payload(
        self,
        query: str,
        region: str,  # noqa: ARG002
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,  # noqa: ARG002
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        payload: dict[str, Any] = {"query": query, "limit": "1"}
        return payload

    def extract_results(self, html_text: str) -> list[TextResult]:
        """Extract search results from the JSON response."""
        json_data = json.loads(html_text)
        items = json_data.get("results", [])
        if not items:
            return []

        result = TextResult()
        result.title = items[0].get("title", "").strip("_")
        body = items[0].get("snippet", "")
        result.body = body.split("\n\n", 1)[1] if "\n\n" in body else body
        result.href = f"https://grokipedia.com/page/{items[0]['slug']}"
        return [result]
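The snippet handling above drops a leading title block when the API returns one; the split logic in isolation, with a fabricated snippet:

```python
# Grokipedia snippets may lead with a heading separated by a blank line;
# extract_results keeps only the prose after the first "\n\n".
snippet = "Python (programming language)\n\nPython is a high-level language..."
body = snippet.split("\n\n", 1)[1] if "\n\n" in snippet else snippet
print(body)  # Python is a high-level language...
```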
ddgs/engines/mojeek.py
ADDED
@@ -0,0 +1,52 @@
"""Mojeek search engine implementation."""

from collections.abc import Mapping
from typing import Any, ClassVar

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult


class Mojeek(BaseSearchEngine[TextResult]):
    """Mojeek search engine."""

    name = "mojeek"
    category = "text"
    provider = "mojeek"

    search_url = "https://www.mojeek.com/search"
    search_method = "GET"

    items_xpath = "//ul[contains(@class, 'results')]/li"
    elements_xpath: ClassVar[Mapping[str, str]] = {
        "title": ".//h2//text()",
        "href": ".//h2/a/@href",
        "body": ".//p[@class='s']//text()",
    }

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        country, lang = region.lower().split("-")
        cookies = {
            "arc": country,
            "lb": lang,
        }
        self.http_client.client.set_cookies("https://www.mojeek.com", cookies)
        payload = {
            "q": query,
            # "tlen": f"{randint(68, 128)}",  # Title length limit (default=68, max=128) # noqa: ERA001
            # "dlen": f"{randint(160, 512)}",  # Description length limit (default=160, max=512) # noqa: ERA001
        }
        if safesearch == "on":
            payload["safe"] = "1"
        if page > 1:
            payload["s"] = f"{(page - 1) * 10 + 1}"
        return payload
ddgs/engines/wikipedia.py
ADDED
@@ -0,0 +1,66 @@
"""Wikipedia text search engine."""

import json
import logging
from typing import Any
from urllib.parse import quote

from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult

logger = logging.getLogger(__name__)


class Wikipedia(BaseSearchEngine[TextResult]):
    """Wikipedia text search engine."""

    name = "wikipedia"
    category = "text"
    provider = "wikipedia"
    priority = 2

    search_url = "https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={query}"
    search_method = "GET"

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,  # noqa: ARG002
        timelimit: str | None,  # noqa: ARG002
        page: int = 1,  # noqa: ARG002
        **kwargs: str,  # noqa: ARG002
    ) -> dict[str, Any]:
        """Build a payload for the search request."""
        _country, lang = region.lower().split("-")
        encoded_query = quote(query)
        self.search_url = (
            f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&profile=fuzzy&limit=1&search={encoded_query}"
        )
        payload: dict[str, Any] = {}
        self.lang = lang  # used in extract_results
        return payload

    def extract_results(self, html_text: str) -> list[TextResult]:
        """Extract search results from html text."""
        json_data = json.loads(html_text)
        if not json_data[1]:
            return []

        result = TextResult()
        result.title = json_data[1][0]
        result.href = json_data[3][0]

        # Add body
        encoded_query = quote(result.title)
        resp_data = self.request(
            "GET",
            f"https://{self.lang}.wikipedia.org/w/api.php?action=query&format=json&prop=extracts&titles={encoded_query}&explaintext=0&exintro=0&redirects=1",
        )
        if resp_data:
            json_data = json.loads(resp_data)
            result.body = next(iter(json_data["query"]["pages"].values())).get("extract", "")
            if "may refer to:" in result.body:
                return []

        return [result]
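The indexing in `extract_results` follows the MediaWiki `opensearch` response shape, a four-element array of `[query, titles, descriptions, urls]`. A small illustration with a canned response:

```python
import json

# Canned MediaWiki opensearch response: [query, titles, descriptions, urls].
raw = '["python", ["Python (programming language)"], [""], ["https://en.wikipedia.org/wiki/Python_(programming_language)"]]'
data = json.loads(raw)
title, href = data[1][0], data[3][0]  # same indices the engine reads
print(title, "->", href)
```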
ddgs/engines/yahoo.py ADDED
@@ -0,0 +1,64 @@
+"""Yahoo search engine."""
+
+from collections.abc import Mapping
+from secrets import token_urlsafe
+from typing import Any, ClassVar
+from urllib.parse import unquote_plus
+
+from ddgs.base import BaseSearchEngine
+from ddgs.results import TextResult
+
+
+def extract_url(u: str) -> str:
+    """Sanitize url."""
+    t = u.split("/RU=", 1)[1]
+    return unquote_plus(t.split("/RK=", 1)[0].split("/RS=", 1)[0])
+
+
+class Yahoo(BaseSearchEngine[TextResult]):
+    """Yahoo search engine."""
+
+    name = "yahoo"
+    category = "text"
+    provider = "bing"
+
+    search_url = "https://search.yahoo.com/search"
+    search_method = "GET"
+
+    items_xpath = "//div[contains(@class, 'relsrch')]"
+    elements_xpath: ClassVar[Mapping[str, str]] = {
+        "title": ".//div[contains(@class, 'Title')]//h3//text()",
+        "href": ".//div[contains(@class, 'Title')]//a/@href",
+        "body": ".//div[contains(@class, 'Text')]//text()",
+    }
+
+    def build_payload(
+        self,
+        query: str,
+        region: str,  # noqa: ARG002
+        safesearch: str,  # noqa: ARG002
+        timelimit: str | None,
+        page: int = 1,
+        **kwargs: str,  # noqa: ARG002
+    ) -> dict[str, Any]:
+        """Build a payload for the search request."""
+        self.search_url = (
+            f"https://search.yahoo.com/search;_ylt={token_urlsafe(24 * 3 // 4)};_ylu={token_urlsafe(47 * 3 // 4)}"
+        )
+        payload = {"p": query}
+        if page > 1:
+            payload["b"] = f"{(page - 1) * 7 + 1}"
+        if timelimit:
+            payload["btf"] = timelimit
+        return payload
+
+    def post_extract_results(self, results: list[TextResult]) -> list[TextResult]:
+        """Post-process search results."""
+        post_results = []
+        for result in results:
+            if result.href.startswith("https://www.bing.com/aclick?"):
+                continue
+            if "/RU=" in result.href:
+                result.href = extract_url(result.href)
+            post_results.append(result)
+        return post_results
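`extract_url` works because Yahoo wraps outbound links in a redirector that embeds the percent-encoded target between `/RU=` and `/RK=`. A quick demo with a hypothetical redirect URL built in that shape:

```python
from urllib.parse import quote_plus, unquote_plus

def extract_url(u: str) -> str:
    """Same logic as the module-level extract_url above."""
    t = u.split("/RU=", 1)[1]
    return unquote_plus(t.split("/RK=", 1)[0].split("/RS=", 1)[0])

# Hypothetical redirect URL in the shape Yahoo produces.
wrapped = "https://r.search.yahoo.com/_ylt=Abc123/RU=" + quote_plus("https://example.com/page") + "/RK=2/RS=xYz-"
print(extract_url(wrapped))  # https://example.com/page
```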
ddgs/engines/yahoo_news.py ADDED
@@ -0,0 +1,104 @@
+"""Yahoo! News search engine."""
+
+import logging
+import re
+from collections.abc import Callable, Mapping
+from datetime import datetime, timedelta, timezone
+from typing import Any, ClassVar
+from urllib.parse import unquote_plus
+
+from ddgs.base import BaseSearchEngine
+from ddgs.results import NewsResult
+
+logger = logging.getLogger(__name__)
+
+DATE_RE = re.compile(r"\b(\d+)\s*(year|month|week|day|hour|minute)s?\b", re.IGNORECASE)
+DATE_UNITS: dict[str, Callable[[int], timedelta]] = {
+    "minute": lambda n: timedelta(minutes=n),
+    "hour": lambda n: timedelta(hours=n),
+    "day": lambda n: timedelta(days=n),
+    "week": lambda n: timedelta(weeks=n),
+    "month": lambda n: timedelta(days=30 * n),
+    "year": lambda n: timedelta(days=365 * n),
+}
+
+
+def extract_date(pub_date_str: str) -> str:
+    """Extract date from string."""
+    now = datetime.now(timezone.utc)
+    m = DATE_RE.search(pub_date_str)
+    if not m:
+        return pub_date_str
+
+    number = int(m.group(1))
+    unit = m.group(2).lower()
+    delta = DATE_UNITS[unit](number)
+    dt = (now - delta).replace(microsecond=0)
+    return dt.isoformat()
+
+
+def extract_url(u: str) -> str:
+    """Sanitize url."""
+    url = u.split("/RU=", 1)[1].split("/RK=", 1)[0].split("?", 1)[0]
+    return unquote_plus(url)
+
+
+def extract_image(u: str) -> str:
+    """Sanitize image url."""
+    idx = u.find("-/")
+    return u[idx + 2 :] if idx != -1 else u
+
+
+def extract_source(s: str) -> str:
+    """Remove ' via Yahoo' from string."""
+    return s.split(" · via Yahoo")[0]
+
+
+class YahooNews(BaseSearchEngine[NewsResult]):
+    """Yahoo news search engine."""
+
+    name = "yahoo"
+    category = "news"
+    provider = "yahoo"
+
+    search_url = "https://news.search.yahoo.com/search"
+    search_method = "GET"
+
+    items_xpath = "//div[@id='web']//li[a]"
+    elements_xpath: ClassVar[Mapping[str, str]] = {
+        "date": ".//span[contains(@class, 'time')]//text()",
+        "title": ".//h4//text()",
+        "body": ".//p//text()",
+        "url": ".//h4/a/@href",
+        "image": "(.//img/@data-src | .//img/@src)[1]",
+        "source": ".//span[contains(@class, 'source')]//text()",
+    }
+
+    def build_payload(
+        self,
+        query: str,
+        region: str,  # noqa: ARG002
+        safesearch: str,  # noqa: ARG002
+        timelimit: str | None,
+        page: int = 1,
+        **kwargs: str,  # noqa: ARG002
+    ) -> dict[str, Any]:
+        """Build a payload for the search request."""
+        payload = {"p": query}
+        if page > 1:
+            payload["b"] = f"{(page - 1) * 10 + 1}"
+        if timelimit:
+            payload["btf"] = timelimit
+        return payload
+
+    def post_extract_results(self, results: list[NewsResult]) -> list[NewsResult]:
+        """Post-process search results."""
+        try:
+            for result in results:
+                result.date = extract_date(result.date)
+                result.url = extract_url(result.url)
+                result.image = extract_image(result.image)
+                result.source = extract_source(result.source)
+        except Exception as ex:  # noqa: BLE001
+            logger.warning("Error post-processing results: %r", ex)
+        return results
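Yahoo News reports publication times as relative strings such as "3 hours ago", which `extract_date` maps onto absolute UTC timestamps. A self-contained rerun of the same regex-plus-timedelta idea (months and years left out for brevity; non-relative strings pass through unchanged):

```python
import re
from datetime import datetime, timedelta, timezone

DATE_RE = re.compile(r"\b(\d+)\s*(year|month|week|day|hour|minute)s?\b", re.IGNORECASE)
KWARGS = {"minute": "minutes", "hour": "hours", "day": "days", "week": "weeks"}

def extract_date(pub_date_str: str) -> str:
    """Same idea as the module's extract_date above (months/years omitted here)."""
    m = DATE_RE.search(pub_date_str)
    if not m or m.group(2).lower() not in KWARGS:
        return pub_date_str  # not a (supported) relative time -> pass through
    number, unit = int(m.group(1)), m.group(2).lower()
    dt = datetime.now(timezone.utc) - timedelta(**{KWARGS[unit]: number})
    return dt.replace(microsecond=0).isoformat()

print(extract_date("3 hours ago"))  # e.g. 2025-06-01T09:00:00+00:00
print(extract_date("2024-05-01"))   # unchanged
```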
ddgs/engines/yandex.py ADDED
@@ -0,0 +1,47 @@
+"""Yandex search engine."""
+
+from collections.abc import Mapping
+from random import SystemRandom
+from typing import Any, ClassVar
+
+from ddgs.base import BaseSearchEngine
+from ddgs.results import TextResult
+
+random = SystemRandom()
+
+
+class Yandex(BaseSearchEngine[TextResult]):
+    """Yandex search engine."""
+
+    name = "yandex"
+    category = "text"
+    provider = "yandex"
+
+    search_url = "https://yandex.com/search/site/"
+    search_method = "GET"
+
+    items_xpath = "//li[contains(@class, 'serp-item')]"
+    elements_xpath: ClassVar[Mapping[str, str]] = {
+        "title": ".//h3//text()",
+        "href": ".//h3//a/@href",
+        "body": ".//div[contains(@class, 'text')]//text()",
+    }
+
+    def build_payload(
+        self,
+        query: str,
+        region: str,  # noqa: ARG002
+        safesearch: str,  # noqa: ARG002
+        timelimit: str | None,  # noqa: ARG002
+        page: int = 1,
+        **kwargs: str,  # noqa: ARG002
+    ) -> dict[str, Any]:
+        """Build a payload for the search request."""
+        payload = {
+            "text": query,
+            "web": "1",
+            "searchid": f"{random.randint(1000000, 9999999)}",
+        }
+        if page > 1:
+            payload["p"] = f"{page - 1}"
+        return payload
ddgs/exceptions.py ADDED
@@ -0,0 +1,13 @@
+"""DDGS exceptions."""
+
+
+class DDGSException(Exception):
+    """Base exception class for ddgs."""
+
+
+class RatelimitException(DDGSException):
+    """Raised for rate limit exceeded errors during API requests."""
+
+
+class TimeoutException(DDGSException):
+    """Raised for timeout errors during API requests."""
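The small hierarchy lets callers catch `DDGSException` for any library error, or the subclasses to treat timeouts and rate limits separately. A minimal usage sketch:

```python
from ddgs import DDGS
from ddgs.exceptions import DDGSException, RatelimitException, TimeoutException

try:
    results = DDGS().text("python")
except TimeoutException:
    results = []  # transient: retry later
except RatelimitException:
    results = []  # back off before retrying
except DDGSException as ex:  # catch-all for any other library error
    raise SystemExit(f"search failed: {ex}")
```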
ddgs/http_client.py ADDED
@@ -0,0 +1,78 @@
+"""HTTP client."""
+
+import logging
+from secrets import choice
+from typing import Any, Final, Literal, get_args
+
+import primp
+
+from .exceptions import DDGSException, TimeoutException
+
+logger = logging.getLogger(__name__)
+
+
+class Response:
+    """HTTP response."""
+
+    __slots__ = ("content", "status_code", "text")
+
+    def __init__(self, status_code: int, content: bytes, text: str) -> None:
+        self.status_code = status_code
+        self.content = content
+        self.text = text
+
+
+class HttpClient:
+    """HTTP client."""
+
+    _impersonates: Final = get_args(Literal[
+        "chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_107",
+        "chrome_108", "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118",
+        "chrome_119", "chrome_120", "chrome_123", "chrome_124", "chrome_126", "chrome_127",
+        "chrome_128", "chrome_129", "chrome_130", "chrome_131", "chrome_133",
+        "safari_15.3", "safari_15.5", "safari_15.6.1", "safari_16", "safari_16.5",
+        "safari_17.0", "safari_17.2.1", "safari_17.4.1", "safari_17.5",
+        "safari_18", "safari_18.2",
+        "edge_101", "edge_122", "edge_127", "edge_131",
+        "firefox_109", "firefox_117", "firefox_128", "firefox_133", "firefox_135",
+    ])  # fmt: skip
+    _impersonates_os: Final = get_args(Literal["macos", "linux", "windows"])
+
+    def __init__(self, proxy: str | None = None, timeout: int | None = 10, *, verify: bool | str = True) -> None:
+        """Initialize the HttpClient object.
+
+        Args:
+            proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols.
+                example: "http://user:pass@example.com:3128". Defaults to None.
+            timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
+            verify (bool | str): True to verify, False to skip, or a str path to a PEM file. Defaults to True.
+
+        """
+        self.client = primp.Client(
+            proxy=proxy,
+            timeout=timeout,
+            impersonate=choice(self._impersonates),
+            impersonate_os=choice(self._impersonates_os),
+            verify=verify if isinstance(verify, bool) else True,
+            ca_cert_file=verify if isinstance(verify, str) else None,
+        )
+
+    def request(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a request to the HTTP client."""
+        try:
+            resp = self.client.request(*args, **kwargs)
+            return Response(status_code=resp.status_code, content=resp.content, text=resp.text)
+        except Exception as ex:
+            if "timed out" in f"{ex}":
+                msg = f"Request timed out: {ex!r}"
+                raise TimeoutException(msg) from ex
+            msg = f"{type(ex).__name__}: {ex!r}"
+            raise DDGSException(msg) from ex
+
+    def get(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a GET request to the HTTP client."""
+        return self.request(*args, method="GET", **kwargs)
+
+    def post(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a POST request to the HTTP client."""
+        return self.request(*args, method="POST", **kwargs)
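Usage is intentionally slim: construct once (each instance picks a random browser and OS fingerprint for primp to impersonate), then call `get`/`post` and read the `Response` wrapper. A minimal sketch:

```python
from ddgs.http_client import HttpClient

client = HttpClient(timeout=5)  # random impersonation chosen at construction
resp = client.get("https://example.com")
print(resp.status_code, len(resp.content))
```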
ddgs/http_client2.py ADDED
@@ -0,0 +1,151 @@
+"""Temporary HTTP client for 'backend=duckduckgo'. Delete when HttpClient is fixed."""
+
+import logging
+import ssl
+from random import SystemRandom
+from types import TracebackType
+from typing import TYPE_CHECKING, Any
+
+import h2
+import httpcore
+import httpx
+
+from .exceptions import DDGSException, TimeoutException
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
+logger = logging.getLogger(__name__)
+random = SystemRandom()
+
+
+class Response:
+    """HTTP response."""
+
+    __slots__ = ("content", "status_code", "text")
+
+    def __init__(self, status_code: int, content: bytes, text: str) -> None:
+        self.status_code = status_code
+        self.content = content
+        self.text = text
+
+
+class HttpClient2:
+    """Temporary HTTP client."""
+
+    def __init__(
+        self,
+        headers: dict[str, str] | None = None,
+        proxy: str | None = None,
+        timeout: int | None = 10,
+        *,
+        verify: bool | str = True,
+    ) -> None:
+        """Initialize the HttpClient object.
+
+        Args:
+            headers (dict, optional): headers for the HTTP client.
+            proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols.
+                example: "http://user:pass@example.com:3128". Defaults to None.
+            timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
+            verify (bool | str): True to verify, False to skip, or a str path to a PEM file. Defaults to True.
+
+        """
+        self.client = httpx.Client(
+            headers=headers,
+            proxy=proxy,
+            timeout=timeout,
+            verify=_get_random_ssl_context(verify=verify) if verify else False,
+            follow_redirects=False,
+            http2=True,
+        )
+
+    def request(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a request to the HTTP client."""
+        with Patch():
+            try:
+                resp = self.client.request(*args, **kwargs)
+                return Response(status_code=resp.status_code, content=resp.content, text=resp.text)
+            except Exception as ex:
+                if "timed out" in f"{ex}":
+                    msg = f"Request timed out: {ex!r}"
+                    raise TimeoutException(msg) from ex
+                msg = f"{type(ex).__name__}: {ex!r}"
+                raise DDGSException(msg) from ex
+
+    def get(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a GET request to the HTTP client."""
+        return self.request(*args, method="GET", **kwargs)
+
+    def post(self, *args: Any, **kwargs: Any) -> Response:  # noqa: ANN401
+        """Make a POST request to the HTTP client."""
+        return self.request(*args, method="POST", **kwargs)
+
+
+# SSL
+DEFAULT_CIPHERS = [  # https://developers.cloudflare.com/ssl/reference/cipher-suites/recommendations/
+    "TLS_AES_128_GCM_SHA256", "TLS_AES_256_GCM_SHA384", "TLS_CHACHA20_POLY1305_SHA256",
+    # Modern:
+    "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-CHACHA20-POLY1305", "ECDHE-RSA-AES128-GCM-SHA256",
+    "ECDHE-RSA-CHACHA20-POLY1305", "ECDHE-ECDSA-AES256-GCM-SHA384", "ECDHE-RSA-AES256-GCM-SHA384",
+    # Compatible:
+    "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-CHACHA20-POLY1305", "ECDHE-RSA-AES128-GCM-SHA256",
+    "ECDHE-RSA-CHACHA20-POLY1305", "ECDHE-ECDSA-AES256-GCM-SHA384", "ECDHE-RSA-AES256-GCM-SHA384",
+    "ECDHE-ECDSA-AES128-SHA256", "ECDHE-RSA-AES128-SHA256", "ECDHE-ECDSA-AES256-SHA384", "ECDHE-RSA-AES256-SHA384",
+    # Legacy:
+    "ECDHE-ECDSA-AES128-SHA", "ECDHE-RSA-AES128-SHA", "AES128-GCM-SHA256", "AES128-SHA256", "AES128-SHA",
+    "ECDHE-RSA-AES256-SHA", "AES256-GCM-SHA384", "AES256-SHA256", "AES256-SHA", "DES-CBC3-SHA",
+]  # fmt: skip
+
+
+def _get_random_ssl_context(*, verify: bool | str) -> ssl.SSLContext:
+    ssl_context = ssl.create_default_context(cafile=verify if isinstance(verify, str) else None)
+    shuffled_ciphers = random.sample(DEFAULT_CIPHERS[9:], len(DEFAULT_CIPHERS) - 9)
+    ssl_context.set_ciphers(":".join(DEFAULT_CIPHERS[:9] + shuffled_ciphers))
+    commands: list[None | Callable[[ssl.SSLContext], None]] = [
+        None,
+        lambda context: setattr(context, "maximum_version", ssl.TLSVersion.TLSv1_2),
+        lambda context: setattr(context, "minimum_version", ssl.TLSVersion.TLSv1_3),
+        lambda context: setattr(context, "options", context.options | ssl.OP_NO_TICKET),
+    ]
+    random_command = random.choice(commands)
+    if random_command:
+        random_command(ssl_context)
+    return ssl_context
+
+
+class Patch:
+    """Patch the HTTP2Connection._send_connection_init method."""
+
+    def __enter__(self) -> None:
+        """Enter the context manager."""
+
+        def _send_connection_init(self: httpcore._sync.http2.HTTP2Connection, request: httpcore.Request) -> None:
+            self._h2_state.local_settings = h2.settings.Settings(
+                client=True,
+                initial_values={
+                    h2.settings.SettingCodes.INITIAL_WINDOW_SIZE: random.randint(100, 200),
+                    h2.settings.SettingCodes.HEADER_TABLE_SIZE: random.randint(4000, 5000),
+                    h2.settings.SettingCodes.MAX_FRAME_SIZE: random.randint(16384, 65535),
+                    h2.settings.SettingCodes.MAX_CONCURRENT_STREAMS: random.randint(100, 200),
+                    h2.settings.SettingCodes.MAX_HEADER_LIST_SIZE: random.randint(65500, 66500),
+                    h2.settings.SettingCodes.ENABLE_CONNECT_PROTOCOL: random.randint(0, 1),
+                    h2.settings.SettingCodes.ENABLE_PUSH: random.randint(0, 1),
+                },
+            )
+            self._h2_state.initiate_connection()
+            self._h2_state.increment_flow_control_window(2**24)
+            self._write_outgoing_data(request)
+
+        self.original_send_connection_init = httpcore._sync.http2.HTTP2Connection._send_connection_init
+        httpcore._sync.http2.HTTP2Connection._send_connection_init = _send_connection_init  # type: ignore[method-assign]
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None = None,
+        exc_val: BaseException | None = None,
+        exc_tb: TracebackType | None = None,
+    ) -> None:
+        """Exit the context manager."""
+        httpcore._sync.http2.HTTP2Connection._send_connection_init = self.original_send_connection_init  # type: ignore[method-assign]
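The point of `_get_random_ssl_context` is fingerprint variation: the first nine suites (TLS 1.3 plus the "Modern" block) stay in a fixed order while the rest are shuffled, so each client presents a slightly different ClientHello. A standalone sketch of that shuffle with a toy cipher list (the module above uses Cloudflare's recommended suites):

```python
import ssl
from random import SystemRandom

random = SystemRandom()
# Toy TLS 1.2 cipher list, just to show the mechanism.
CIPHERS = ["ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-GCM-SHA256", "AES256-GCM-SHA384", "AES128-SHA"]

ctx = ssl.create_default_context()
fixed, rest = CIPHERS[:1], CIPHERS[1:]            # keep a fixed prefix, shuffle the tail
shuffled = fixed + random.sample(rest, len(rest))
ctx.set_ciphers(":".join(shuffled))
print(shuffled)  # order varies run to run, and the TLS fingerprint varies with it
```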
ddgs/py.typed ADDED
@@ -0,0 +1 @@
+# Marker file for PEP 561.
ddgs/results.py ADDED
@@ -0,0 +1,148 @@
+"""Result classes."""
+
+from abc import ABC
+from collections import Counter
+from collections.abc import Callable, Mapping
+from dataclasses import dataclass, field
+from typing import Any, ClassVar, Generic, TypeVar
+
+from .utils import _normalize_date, _normalize_text, _normalize_url
+
+T = TypeVar("T")
+
+
+class BaseResult:
+    """Base class for all results. Contains normalization functions."""
+
+    _normalizers: ClassVar[Mapping[str, Callable[[Any], str]]] = {
+        "title": _normalize_text,
+        "body": _normalize_text,
+        "href": _normalize_url,
+        "url": _normalize_url,
+        "thumbnail": _normalize_url,
+        "image": _normalize_url,
+        "date": _normalize_date,
+        "author": _normalize_text,
+        "publisher": _normalize_text,
+        "info": _normalize_text,
+    }
+
+    def __setattr__(self, name: str, value: str) -> None:
+        """Override setattr to apply normalization functions to certain attributes."""
+        if value and (normalizer := self._normalizers.get(name)):
+            value = normalizer(value)
+        object.__setattr__(self, name, value)
+
+
+@dataclass
+class TextResult(BaseResult):
+    """Text search result."""
+
+    title: str = ""
+    href: str = ""
+    body: str = ""
+
+
+@dataclass
+class ImagesResult(BaseResult):
+    """Image search result."""
+
+    title: str = ""
+    image: str = ""
+    thumbnail: str = ""
+    url: str = ""
+    height: str = ""
+    width: str = ""
+    source: str = ""
+
+
+@dataclass
+class NewsResult(BaseResult):
+    """News search result."""
+
+    date: str = ""
+    title: str = ""
+    body: str = ""
+    url: str = ""
+    image: str = ""
+    source: str = ""
+
+
+@dataclass
+class VideosResult(BaseResult):
+    """Video search result."""
+
+    title: str = ""
+    content: str = ""
+    description: str = ""
+    duration: str = ""
+    embed_html: str = ""
+    embed_url: str = ""
+    image_token: str = ""
+    images: dict[str, str] = field(default_factory=dict)
+    provider: str = ""
+    published: str = ""
+    publisher: str = ""
+    statistics: dict[str, str] = field(default_factory=dict)
+    uploader: str = ""
+
+
+@dataclass
+class BooksResult(BaseResult):
+    """Book search result."""
+
+    title: str = ""
+    author: str = ""
+    publisher: str = ""
+    info: str = ""
+    url: str = ""
+    thumbnail: str = ""
+
+
+class ResultsAggregator(ABC, Generic[T]):
+    """Aggregates incoming results.
+
+    Items are deduplicated by `cache_field`. Append just increments a counter;
+    `extract_results` returns items sorted by descending frequency.
+    """
+
+    def __init__(self, cache_fields: set[str]) -> None:
+        if not cache_fields:
+            msg = "At least one cache_field must be provided"
+            raise ValueError(msg)
+        self.cache_fields = set(cache_fields)
+        self._counter: Counter[str] = Counter()
+        self._cache: dict[str, T] = {}
+
+    def _get_key(self, item: T) -> str:
+        for key in item.__dict__:
+            if key in self.cache_fields:
+                return str(item.__dict__[key])
+        msg = f"Item {item!r} has none of the cache fields {self.cache_fields}"
+        raise AttributeError(msg)
+
+    def __len__(self) -> int:
+        """Return the number of items in the cache."""
+        return len(self._cache)
+
+    def append(self, item: T) -> None:
+        """Add an item to the cache.
+
+        Register an occurrence of `item`. The first time we see its key,
+        we store the item; every time, we bump the counter.
+        """
+        key = self._get_key(item)
+        if key not in self._cache or len(item.__dict__.get("body", "")) > len(
+            self._cache[key].__dict__.get("body", ""),
+        ):
+            self._cache[key] = item
+        self._counter[key] += 1
+
+    def extend(self, items: list[T]) -> None:
+        """Add a list of items to the cache."""
+        for item in items:
+            self.append(item)
+
+    def extract_dicts(self) -> list[dict[str, Any]]:
+        """Return a list of items, sorted by descending frequency. Each item is returned as a dict."""
+        return [self._cache[key].__dict__ for key, _ in self._counter.most_common()]
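`ResultsAggregator` declares no abstract methods, so it can be instantiated directly. A minimal sketch of the dedup-and-rank behaviour — duplicate keys bump the frequency counter, and the copy with the longer `body` wins the cache slot:

```python
from ddgs.results import ResultsAggregator, TextResult

agg: ResultsAggregator[TextResult] = ResultsAggregator(cache_fields={"href"})
agg.extend([
    TextResult(title="DDGS", href="https://github.com/deedy5/ddgs", body="short"),
    TextResult(title="DDGS", href="https://github.com/deedy5/ddgs", body="a longer body wins"),
    TextResult(title="Example", href="https://example.com", body="seen once"),
])
print(len(agg))                        # 2 unique keys
print(agg.extract_dicts()[0]["body"])  # "a longer body wins" (its key was seen twice)
```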
ddgs/similarity.py ADDED
@@ -0,0 +1,72 @@
+"""Simple filter ranker."""
+
+import re
+from typing import Final
+
+
+class SimpleFilterRanker:
+    """Simple filter ranker.
+
+    1) Pull any doc with 'wikipedia.org' in its href to the top.
+    2) Bucket the rest according to where query tokens appear:
+       - both title & body/description
+       - title only
+       - body only
+       - neither
+    3) Return wikipedia-top + both + title-only + body-only + neither.
+    """
+
+    _splitter: Final = re.compile(r"\W+")
+
+    def __init__(self, min_token_length: int = 3) -> None:
+        self.min_token_length = min_token_length
+
+    def _extract_tokens(self, query: str) -> set[str]:
+        """Split on non-word characters & filter out short tokens."""
+        return {token for token in self._splitter.split(query.lower()) if len(token) >= self.min_token_length}
+
+    def _has_any_token(self, text: str, tokens: set[str]) -> bool:
+        """Check if any token is a substring of the lower-cased text."""
+        lower_text = text.lower()
+        return any(tok in lower_text for tok in tokens)
+
+    def rank(self, docs: list[dict[str, str]], query: str) -> list[dict[str, str]]:
+        """Rank a list of docs based on a query string."""
+        tokens = self._extract_tokens(query)
+
+        wiki_hits = []
+        both = []
+        title_only = []
+        body_only = []
+        neither = []
+
+        for doc in docs:
+            href = doc.get("href", "")
+            title = doc.get("title", "")
+            # fallback to 'description' if no 'body'
+            body = doc.get("body", doc.get("description", ""))
+
+            # Skip Wikimedia category pages
+            if all(x in title for x in ["Category:", "Wikimedia"]):
+                continue
+
+            # Wikipedia check
+            if "wikipedia.org" in href:
+                wiki_hits.append(doc)
+                continue
+
+            # Title / Body match
+            hit_title = self._has_any_token(title, tokens)
+            hit_body = self._has_any_token(body, tokens)
+
+            if hit_title and hit_body:
+                both.append(doc)
+            elif hit_title:
+                title_only.append(doc)
+            elif hit_body:
+                body_only.append(doc)
+            else:
+                neither.append(doc)
+
+        # final ranking
+        return wiki_hits + both + title_only + body_only + neither
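A minimal sketch of the bucketing in action — the wikipedia.org hit jumps the queue, then the doc matching in both title and body, then the rest:

```python
from ddgs.similarity import SimpleFilterRanker

docs = [
    {"title": "Unrelated", "href": "https://example.com/x", "body": "Nothing here."},
    {"title": "Gray wolf", "href": "https://example.com/wolf", "body": "The wolf is a canine."},
    {"title": "Wolf", "href": "https://en.wikipedia.org/wiki/Wolf", "body": "Canis lupus."},
]
ranked = SimpleFilterRanker().rank(docs, "wolf")
print([d["href"] for d in ranked])
# wikipedia hit first, then the title+body match, then the leftover doc
```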
ddgs/utils.py ADDED
@@ -0,0 +1,70 @@
+"""Utilities."""
+
+import re
+import unicodedata
+from contextlib import suppress
+from datetime import datetime, timezone
+from html import unescape
+from urllib.parse import unquote
+
+from .exceptions import DDGSException
+
+_REGEX_STRIP_TAGS = re.compile("<.*?>")
+
+
+def _extract_vqd(html_bytes: bytes, query: str) -> str:
+    """Extract vqd from html bytes."""
+    for c1, c1_len, c2 in (
+        (b'vqd="', 5, b'"'),
+        (b"vqd=", 4, b"&"),
+        (b"vqd='", 5, b"'"),
+    ):
+        with suppress(ValueError):
+            start = html_bytes.index(c1) + c1_len
+            end = html_bytes.index(c2, start)
+            return html_bytes[start:end].decode()
+
+    msg = f"_extract_vqd() {query=} Could not extract vqd."
+    raise DDGSException(msg)
+
+
+def _normalize_url(url: str) -> str:
+    """Unquote URL and replace spaces with '+'."""
+    return unquote(url).replace(" ", "+") if url else ""
+
+
+def _normalize_text(raw: str) -> str:
+    """Normalize text.
+
+    Strip HTML tags, unescape HTML entities, normalize Unicode,
+    remove "C" category characters, and collapse whitespace.
+    """
+    if not raw:
+        return ""
+
+    # 1. Strip HTML tags
+    text = _REGEX_STRIP_TAGS.sub("", raw)
+
+    # 2. Unescape HTML entities
+    text = unescape(text)
+
+    # 3. Unicode normalization
+    text = unicodedata.normalize("NFC", text)
+
+    # 4. Remove "C" category characters
+    c_to_none = {ord(ch): None for ch in set(text) if unicodedata.category(ch)[0] == "C"}
+    if c_to_none:
+        text = text.translate(c_to_none)
+
+    # 5. Collapse whitespace
+    return " ".join(text.split())
+
+
+def _normalize_date(date: int | str) -> str:
+    """Normalize date from integer to ISO format if applicable."""
+    return datetime.fromtimestamp(date, timezone.utc).isoformat() if isinstance(date, int) else date
+
+
+def _expand_proxy_tb_alias(proxy: str | None) -> str | None:
+    """Expand "tb" to a full proxy URL if applicable."""
+    return "socks5h://127.0.0.1:9150" if proxy == "tb" else proxy
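`_normalize_text` is the workhorse behind the `BaseResult` normalizers in `results.py`. A quick demo: the zero-width space and newlines are category-C characters and get dropped, while `&nbsp;` survives unescaping as U+00A0 and is then collapsed as ordinary whitespace:

```python
from ddgs.utils import _normalize_text

raw = "<b>Caf\u00e9</b>&nbsp;&amp; <i>bar</i>\u200b \n\n menu"
print(_normalize_text(raw))  # Café & bar menu
```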
docker-compose.yml ADDED
@@ -0,0 +1,16 @@
+services:
+  ddgs-api:
+    build: .
+    ports:
+      - "8000:8000"
+    environment:
+      - DDGS_PROXY
+    volumes:
+      - ./logs:/app/logs
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
pyproject.toml ADDED
@@ -0,0 +1,149 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "ddgs"
+description = "Dux Distributed Global Search. A metasearch library that aggregates results from diverse web search services."
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+keywords = ["python", "search", "metasearch"]
+authors = [
+    {name = "deedy5"}
+]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = [
+    "click>=8.1.8",
+    "primp>=0.15.0",
+    "lxml>=4.9.4",
+    "httpx[http2,socks,brotli]>=0.28.1",  # temporarily
+    "fake-useragent>=2.2.0",
+]
+dynamic = ["version"]
+
+[project.urls]  # Optional
+"Homepage" = "https://github.com/deedy5/ddgs"
+
+[project.scripts]
+ddgs = "ddgs.cli:safe_entry_point"
+
+[tool.setuptools.dynamic]
+version = {attr = "ddgs.__version__"}
+
+[tool.setuptools.packages.find]
+include = ["ddgs*"]
+exclude = ["api*"]
+
+[project.optional-dependencies]
+dev = [
+    "mypy>=1.17.1",
+    "pre-commit",
+    "pytest>=8.4.1",
+    "pytest-dependency>=0.6.0",
+    "ruff>=0.13.0",
+
+    # for mypy
+    "lxml-stubs",
+    "types-Pygments",
+    "types-pexpect",
+    "types-PyYAML",
+    "types-ujson"
+]
+api = [
+    "fastapi[standard]>=0.104.0",
+    "fastapi-mcp>=0.4.0",
+]
+
+[tool.ruff]
+line-length = 120
+exclude = ["tests"]
+
+[tool.ruff.lint]
+select = [
+    # Core rules
+    "E",      # pycodestyle errors
+    "W",      # pycodestyle warnings
+    "F",      # pyflakes
+    "I",      # isort
+
+    # Enhanced rules
+    "ERA",    # eradicate
+    "YTT",    # flake8-2020
+    "ANN",    # flake8-annotations
+    "ASYNC",  # flake8-async
+    "S",      # flake8-bandit
+    "BLE",    # flake8-blind-except
+    "FBT",    # flake8-boolean-trap
+    "B",      # flake8-bugbear
+    "A",      # flake8-builtins
+    "COM",    # flake8-commas
+    "C4",     # flake8-comprehensions
+    "DTZ",    # flake8-datetimez
+    "T10",    # flake8-debugger
+    "EM",     # flake8-errmsg
+    "FIX",    # flake8-fixme
+    "FA",     # flake8-future-annotations
+    "INT",    # flake8-gettext
+    "ISC",    # flake8-implicit-str-concat
+    "ICN",    # flake8-import-conventions
+    "LOG",    # flake8-logging
+    "G",      # flake8-logging-format
+    "INP",    # flake8-no-pep420
+    "PIE",    # flake8-pie
+    "T20",    # flake8-print
+    "PYI",    # flake8-pyi
+    "PT",     # flake8-pytest-style
+    "Q",      # flake8-quotes
+    "RSE",    # flake8-raise
+    "RET",    # flake8-return
+    "SLF",    # flake8-self
+    "SIM",    # flake8-simplify
+    "SLOT",   # flake8-slots
+    "TID",    # flake8-tidy-imports
+    "TD",     # flake8-todos
+    "TC",     # flake8-type-checking
+    "ARG",    # flake8-unused-arguments
+    "PTH",    # flake8-use-pathlib
+    "FLY",    # flynt
+    "C90",    # mccabe
+    "N",      # pep8-naming
+    "PERF",   # perflint
+    "PGH",    # pygrep-hooks
+    "PL",     # Pylint
+    "UP",     # pyupgrade
+    "FURB",   # refurb
+    "RUF",    # ruff-specific rules
+    "TRY",    # tryceratops
+
+    # Documentation
+    "D",      # pydocstyle
+]
+ignore = [
+    "COM812",   # Missing trailing comma (handled by formatter)
+    "D107",     # Missing docstring in `__init__`
+    "D203",     # incorrect-blank-line-before-class
+    "D213",     # multi-line-summary-second-line
+    "N818",     # Exception name {name} should be named with an Error suffix
+    "PLR0913",  # Too many arguments to function call
+    "PLR2004",  # Magic value used in comparison
+    "SLF001",   # Private member accessed
+]
+
+[tool.mypy]
+python_version = "3.10"
+strict = true
+exclude = ["build/"]
start_api.py ADDED
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+"""Start the DDGS API server."""
+
+import logging
+import sys
+
+import uvicorn
+from fastapi_mcp import FastApiMCP  # type: ignore[import-untyped]
+
+from api.main import app
+
+logger = logging.getLogger(__name__)
+
+# Add current directory to Python path
+sys.path.insert(0, ".")
+
+# MCP server
+mcp = FastApiMCP(app, name="ddgs-search", description="DDGS (Dux Distributed Global Search) MCP Server")
+mcp.mount_http()
+logger.info("✅ MCP server enabled at /mcp")
+mcp.mount_sse()
+logger.info("✅ MCP server enabled at /sse")
+
+logger.info("🚀 Starting DDGS API server on http://0.0.0.0:8000")
+uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)  # noqa: S104
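Once the server is up, the `/health` endpoint that docker-compose's healthcheck curls can be probed the same way from Python. A minimal sketch using httpx (already a project dependency):

```python
import httpx

# Probe the same endpoint the docker-compose healthcheck hits.
resp = httpx.get("http://localhost:8000/health", timeout=5)
print(resp.status_code, resp.text)
```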
start_api.sh ADDED
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# DDGS API Startup Script
+
+set -e
+
+echo "🚀 Starting DDGS API..."
+
+# Check if virtual environment exists
+if [ ! -d ".venv" ]; then
+    echo "📦 Creating virtual environment..."
+    python3 -m venv .venv
+fi
+
+# Activate virtual environment
+echo "🔧 Activating virtual environment..."
+source .venv/bin/activate
+
+# Install dependencies
+echo "📥 Installing dependencies..."
+pip install -e ".[api]"
+pip install -e .
+
+# Run the API
+echo "🌐 Starting FastAPI server on http://localhost:8000"
+echo "📚 API documentation available at http://localhost:8000/docs"
+echo "🔍 ReDoc documentation available at http://localhost:8000/redoc"
+
+python start_api.py
tests/cli_test.py ADDED
@@ -0,0 +1,97 @@
+import pathlib
+import shutil
+import time
+from pathlib import Path
+
+import pytest
+from click.testing import CliRunner
+
+from ddgs import DDGS, __version__
+from ddgs.cli import _download_results, _save_csv, _save_json, cli
+
+runner = CliRunner()
+TEXT_RESULTS = []
+IMAGES_RESULTS = []
+
+
+@pytest.fixture(autouse=True)
+def pause_between_tests() -> None:
+    time.sleep(2)
+
+
+def test_version_command() -> None:
+    result = runner.invoke(cli, ["version"])
+    assert result.output.strip() == __version__
+
+
+def test_text_command() -> None:
+    result = runner.invoke(cli, ["text", "-q", "zebra"])
+    assert "title" in result.output
+
+
+def test_images_command() -> None:
+    result = runner.invoke(cli, ["images", "-q", "fox"])
+    assert "title" in result.output
+
+
+def test_news_command() -> None:
+    result = runner.invoke(cli, ["news", "-q", "deer"])
+    assert "title" in result.output
+
+
+def test_videos_command() -> None:
+    result = runner.invoke(cli, ["videos", "-q", "pig"])
+    assert "title" in result.output
+
+
+def test_books_command() -> None:
+    result = runner.invoke(cli, ["books", "-q", "bee"])
+    assert "title" in result.output
+
+
+@pytest.mark.dependency()
+def test_get_text() -> None:
+    global TEXT_RESULTS
+    TEXT_RESULTS = DDGS().text("cow", max_results=5)
+    assert TEXT_RESULTS
+
+
+@pytest.mark.dependency()
+def test_get_images() -> None:
+    global IMAGES_RESULTS
+    IMAGES_RESULTS = DDGS().images("horse", max_results=5)
+    assert IMAGES_RESULTS
+
+
+@pytest.mark.dependency(depends=["test_get_text"])
+def test_save_csv(tmp_path: Path) -> None:
+    temp_file = tmp_path / "test_csv.csv"
+    _save_csv(temp_file, TEXT_RESULTS)
+    assert temp_file.exists()
+
+
+@pytest.mark.dependency(depends=["test_get_text"])
+def test_save_json(tmp_path: Path) -> None:
+    temp_file = tmp_path / "test_json.json"
+    _save_json(temp_file, TEXT_RESULTS)
+    assert temp_file.exists()
+
+
+@pytest.mark.dependency(depends=["test_get_text"])
+def test_text_download() -> None:
+    pathname = pathlib.Path("text_downloads")
+    _download_results(f"{test_text_download}", TEXT_RESULTS, function_name="text", pathname=str(pathname))
+    assert pathname.is_dir() and pathname.iterdir()
+    for file in pathname.iterdir():
+        assert file.is_file()
+    shutil.rmtree(str(pathname))
+
+
+@pytest.mark.dependency(depends=["test_get_images"])
+def test_images_download() -> None:
+    pathname = pathlib.Path("images_downloads")
+    _download_results(f"{test_images_download}", IMAGES_RESULTS, function_name="images", pathname=str(pathname))
+    assert pathname.is_dir() and pathname.iterdir()
+    for file in pathname.iterdir():
+        assert file.is_file()
+    shutil.rmtree(str(pathname))
tests/ddgs_test.py ADDED
@@ -0,0 +1,51 @@
+import time
+
+import pytest
+
+from ddgs import DDGS
+
+
+@pytest.fixture(autouse=True)
+def pause_between_tests() -> None:
+    time.sleep(2)
+
+
+def test_context_manager() -> None:
+    with DDGS() as ddgs:
+        results = ddgs.text("python")
+        assert len(results) > 0
+
+
+def test_text_search() -> None:
+    query = "wolf"
+    results = DDGS().text(query)
+    assert isinstance(results, list)
+    assert len(results) > 0
+
+
+def test_images_search() -> None:
+    query = "tiger"
+    results = DDGS().images(query)
+    assert isinstance(results, list)
+    assert len(results) > 0
+
+
+def test_news_search() -> None:
+    query = "rabbit"
+    results = DDGS().news(query)
+    assert isinstance(results, list)
+    assert len(results) > 0
+
+
+def test_videos_search() -> None:
+    query = "monkey"
+    results = DDGS().videos(query)
+    assert isinstance(results, list)
+    assert len(results) > 0
+
+
+def test_books_search() -> None:
+    query = "mouse"
+    results = DDGS().books(query)
+    assert isinstance(results, list)
+    assert len(results) > 0
|