Spaces:

AZILS
/

Word

Sleeping

App Files Files Community

Word / wp-includes /rest-api /endpoints /class-wp-rest-url-details-controller.php

AZILS

Upload 2998 files

51345ad verified about 1 year ago

raw

history blame contribute delete

20.6 kB

	<?php
	/**
	* REST API: WP_REST_URL_Details_Controller class
	*
	* @package WordPress
	* @subpackage REST_API
	* @since 5.9.0
	*/

	/**
	* Controller which provides REST endpoint for retrieving information
	* from a remote site's HTML response.
	*
	* @since 5.9.0
	*
	* @see WP_REST_Controller
	*/
	class WP_REST_URL_Details_Controller extends WP_REST_Controller {

	/**
	* Constructs the controller.
	*
	* @since 5.9.0
	*/
	public function __construct() {
		// Route pieces: register_routes() registers '/' . rest_base under this namespace.
		$this->rest_base = 'url-details';
		$this->namespace = 'wp-block-editor/v1';
	}

	/**
	* Registers the necessary REST API routes.
	*
	* @since 5.9.0
	*/
	public function register_routes() {
		// The single request argument: a required URI string, validated and sanitized by callbacks.
		$url_argument = array(
			'required'          => true,
			'description'       => __( 'The URL to process.' ),
			'validate_callback' => 'wp_http_validate_url',
			'sanitize_callback' => 'sanitize_url',
			'type'              => 'string',
			'format'            => 'uri',
		);

		// Readable endpoint served by parse_url_details() and guarded by permissions_check().
		$read_endpoint = array(
			'methods'             => WP_REST_Server::READABLE,
			'callback'            => array( $this, 'parse_url_details' ),
			'args'                => array(
				'url' => $url_argument,
			),
			'permission_callback' => array( $this, 'permissions_check' ),
			'schema'              => array( $this, 'get_public_item_schema' ),
		);

		$route = '/' . $this->rest_base;

		register_rest_route( $this->namespace, $route, array( $read_endpoint ) );
	}

	/**
	* Retrieves the item's schema, conforming to JSON Schema.
	*
	* @since 5.9.0
	*
	* @return array Item schema data.
	*/
	public function get_item_schema() {
		// Build the schema on first use only; later calls reuse the stored copy.
		if ( ! $this->schema ) {
			// Every property is exposed in the same contexts.
			$contexts = array( 'view', 'edit', 'embed' );

			$title_property = array(
				'description' => sprintf(
					/* translators: %s: HTML title tag. */
					__( 'The contents of the %s element from the URL.' ),
					'<title>'
				),
				'type'        => 'string',
				'context'     => $contexts,
				'readonly'    => true,
			);

			$icon_property = array(
				'description' => sprintf(
					/* translators: %s: HTML link tag. */
					__( 'The favicon image link of the %s element from the URL.' ),
					'<link rel="icon">'
				),
				'type'        => 'string',
				'format'      => 'uri',
				'context'     => $contexts,
				'readonly'    => true,
			);

			$description_property = array(
				'description' => sprintf(
					/* translators: %s: HTML meta tag. */
					__( 'The content of the %s element from the URL.' ),
					'<meta name="description">'
				),
				'type'        => 'string',
				'context'     => $contexts,
				'readonly'    => true,
			);

			$image_property = array(
				'description' => sprintf(
					/* translators: 1: HTML meta tag, 2: HTML meta tag. */
					__( 'The Open Graph image link of the %1$s or %2$s element from the URL.' ),
					'<meta property="og:image">',
					'<meta property="og:image:url">'
				),
				'type'        => 'string',
				'format'      => 'uri',
				'context'     => $contexts,
				'readonly'    => true,
			);

			$this->schema = array(
				'$schema'    => 'http://json-schema.org/draft-04/schema#',
				'title'      => 'url-details',
				'type'       => 'object',
				'properties' => array(
					'title'       => $title_property,
					'icon'        => $icon_property,
					'description' => $description_property,
					'image'       => $image_property,
				),
			);
		}

		return $this->add_additional_fields_schema( $this->schema );
	}

	/**
	* Retrieves the title, icon, description, and image details parsed from the remote URL's HTML response.
	*
	* @since 5.9.0
	*
	* @param WP_REST_Request $request Full details about the request.
	* @return WP_REST_Response|WP_Error The parsed details as a response object. WP_Error if there are errors.
	*/
	public function parse_url_details( $request ) {
		$url = untrailingslashit( $request['url'] );

		if ( empty( $url ) ) {
			return new WP_Error( 'rest_invalid_url', __( 'Invalid URL' ), array( 'status' => 404 ) );
		}

		// Transient per URL.
		$cache_key = $this->build_cache_key_for_url( $url );

		// Attempt to retrieve cached response.
		$cached_response = $this->get_cache( $cache_key );

		if ( ! empty( $cached_response ) ) {
			$remote_url_response = $cached_response;
		} else {
			$remote_url_response = $this->get_remote_url( $url );

			// Exit if we don't have a valid body or it's empty.
			if ( is_wp_error( $remote_url_response ) || empty( $remote_url_response ) ) {
				return $remote_url_response;
			}

			// Cache the valid response.
			$this->set_cache( $cache_key, $remote_url_response );
		}

		// Only the document head is parsed; meta elements are collected once and shared by the getters.
		$html_head     = $this->get_document_head( $remote_url_response );
		$meta_elements = $this->get_meta_with_content_elements( $html_head );

		$data = $this->add_additional_fields_to_object(
			array(
				'title'       => $this->get_title( $html_head ),
				'icon'        => $this->get_icon( $html_head, $url ),
				'description' => $this->get_description( $meta_elements ),
				'image'       => $this->get_image( $meta_elements, $url ),
			),
			$request
		);

		// Wrap the data in a response object.
		$response = rest_ensure_response( $data );

		/**
		 * Filters the URL data for the response.
		 *
		 * @since 5.9.0
		 *
		 * @param WP_REST_Response $response            The response object.
		 * @param string           $url                 The requested URL.
		 * @param WP_REST_Request  $request             Request object.
		 * @param string           $remote_url_response HTTP response body from the remote URL.
		 */
		return apply_filters( 'rest_prepare_url_details', $response, $url, $request, $remote_url_response );
	}

	/**
	* Checks whether a given request has permission to read remote URLs.
	*
	* @since 5.9.0
	*
	* @return WP_Error|bool True if the request has permission, else WP_Error.
	*/
	public function permissions_check() {
		// The generic capability is tried first; post types are consulted only when it is missing.
		$can_process = current_user_can( 'edit_posts' );

		if ( ! $can_process ) {
			$rest_post_types = get_post_types( array( 'show_in_rest' => true ), 'objects' );

			// Being able to edit posts of any REST-enabled post type is sufficient.
			foreach ( $rest_post_types as $post_type ) {
				if ( current_user_can( $post_type->cap->edit_posts ) ) {
					$can_process = true;
					break;
				}
			}
		}

		if ( $can_process ) {
			return true;
		}

		return new WP_Error(
			'rest_cannot_view_url_details',
			__( 'Sorry, you are not allowed to process remote URLs.' ),
			array( 'status' => rest_authorization_required_code() )
		);
	}

	/**
	* Retrieves the response body from a remote URL.
	*
	* @since 5.9.0
	*
	* @param string $url The website URL whose HTML to access.
	* @return string|WP_Error The HTTP response from the remote URL on success.
	* WP_Error if no response or no content.
	*/
	private function get_remote_url( $url ) {
		/*
		 * Use a custom UA string instead of the WP HTTP API default.
		 *
		 * The default contains `WordPress/`, which closely resembles the pingback UA string,
		 * so some web properties that block pingbacks also block requests from this endpoint.
		 * "WP" keeps the WordPress identification while avoiding that misidentification.
		 * Example UA string: `WP-URLDetails/5.9-alpha-51389 (+http://localhost:8888)`.
		 */
		$wp_version = get_bloginfo( 'version' );
		$site_url   = get_bloginfo( 'url' );
		$user_agent = 'WP-URLDetails/' . $wp_version . ' (+' . $site_url . ')';

		$default_args = array(
			'limit_response_size' => 150 * KB_IN_BYTES,
			'user-agent'          => $user_agent,
		);

		/**
		 * Filters the HTTP request args for URL data retrieval.
		 *
		 * Can be used to adjust response size limit and other WP_Http::request() args.
		 *
		 * @since 5.9.0
		 *
		 * @param array  $args Arguments used for the HTTP request.
		 * @param string $url  The attempted URL.
		 */
		$request_args = apply_filters( 'rest_url_details_http_request_args', $default_args, $url );

		$remote_response = wp_safe_remote_get( $url, $request_args );
		$status_code     = wp_remote_retrieve_response_code( $remote_response );

		// Anything but a 200 is reported as not found. Errors are never cached since they may be temporary.
		if ( WP_Http::OK !== $status_code ) {
			return new WP_Error(
				'no_response',
				__( 'URL not found. Response returned a non-200 status code for this URL.' ),
				array( 'status' => WP_Http::NOT_FOUND )
			);
		}

		$body = wp_remote_retrieve_body( $remote_response );

		if ( empty( $body ) ) {
			return new WP_Error(
				'no_content',
				__( 'Unable to retrieve body from response at this URL.' ),
				array( 'status' => WP_Http::NOT_FOUND )
			);
		}

		return $body;
	}

	/**
	* Parses the title tag contents from the provided HTML.
	*
	* @since 5.9.0
	*
	* @param string $html The HTML from the remote website at URL.
	* @return string The title tag contents on success. Empty string if not found.
	*/
	private function get_title( $html ) {
		/*
		 * Lazily capture everything between the opening and closing title tags.
		 * `[^>]*` tolerates attributes on the opening tag and `\s*` tolerates
		 * whitespace inside the closing tag. The `s` flag lets `.` span newlines.
		 */
		$pattern = '#<title[^>]*>(.*?)<\s*/\s*title>#is';
		preg_match( $pattern, $html, $match_title );

		// No title element, or an empty one.
		if ( empty( $match_title[1] ) || ! is_string( $match_title[1] ) ) {
			return '';
		}

		$title = trim( $match_title[1] );

		return $this->prepare_metadata_for_output( $title );
	}

	/**
	* Parses the site icon from the provided HTML.
	*
	* @since 5.9.0
	*
	* @param string $html The HTML from the remote website at URL.
	* @param string $url The target website URL.
	* @return string The icon URI on success. Empty string if not found.
	*/
	private function get_icon( $html, $url ) {
		// Grab the icon's link element: a <link> whose rel is icon, shortcut icon, or icon shortcut.
		$pattern = '#<link\s[^>]*rel=(?:[\"\']??)\s*(?:icon|shortcut icon|icon shortcut)\s*(?:[\"\']??)[^>]*\/?>#isU';
		preg_match( $pattern, $html, $element );
		if ( empty( $element[0] ) || ! is_string( $element[0] ) ) {
			return '';
		}
		$element = trim( $element[0] );

		// Get the icon's href value; the backreference makes the closing quote match the opening one.
		$pattern = '#href=([\"\']??)([^\" >]*?)\\1[^>]*#isU';
		preg_match( $pattern, $element, $icon );
		if ( empty( $icon[2] ) || ! is_string( $icon[2] ) ) {
			return '';
		}
		$icon = trim( $icon[2] );

		// If the icon is a data URL, return it.
		$parsed_icon = parse_url( $icon );
		if ( isset( $parsed_icon['scheme'] ) && 'data' === $parsed_icon['scheme'] ) {
			return $icon;
		}

		// Attempt to convert relative URLs to absolute.
		if ( ! is_string( $url ) || '' === $url ) {
			return $icon;
		}
		$parsed_url = parse_url( $url );
		if ( isset( $parsed_url['scheme'] ) && isset( $parsed_url['host'] ) ) {
			// Resolve against the site root rather than the requested path.
			$root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/';
			$icon     = WP_Http::make_absolute_url( $icon, $root_url );
		}

		return $icon;
	}

	/**
	* Parses the meta description from the provided HTML.
	*
	* @since 5.9.0
	*
	* @param array $meta_elements {
	* A multi-dimensional indexed array on success, else empty array.
	*
	* @type string[] $0 Meta elements with a content attribute.
	* @type string[] $1 Content attribute's opening quotation mark.
	* @type string[] $2 Content attribute's value for each meta element.
	* }
	* @return string The meta description contents on success. Empty string if not found.
	*/
	private function get_description( $meta_elements ) {
		// Bail out if there are no meta elements.
		if ( empty( $meta_elements[0] ) ) {
			return '';
		}

		// The third argument is a regex fragment: accept either name value.
		$description = $this->get_metadata_from_meta_element(
			$meta_elements,
			'name',
			'(?:description|og:description)'
		);

		// Bail out if description not found.
		if ( '' === $description ) {
			return '';
		}

		return $this->prepare_metadata_for_output( $description );
	}

	/**
	* Parses the Open Graph (OG) Image from the provided HTML.
	*
	* See: https://ogp.me/.
	*
	* @since 5.9.0
	*
	* @param array $meta_elements {
	* A multi-dimensional indexed array on success, else empty array.
	*
	* @type string[] $0 Meta elements with a content attribute.
	* @type string[] $1 Content attribute's opening quotation mark.
	* @type string[] $2 Content attribute's value for each meta element.
	* }
	* @param string $url The target website URL.
	* @return string The OG image on success. Empty string if not found.
	*/
	private function get_image( $meta_elements, $url ) {
		// The third argument is a regex fragment: accept either Open Graph property value.
		$image = $this->get_metadata_from_meta_element(
			$meta_elements,
			'property',
			'(?:og:image|og:image:url)'
		);

		// Bail out if image not found.
		if ( '' === $image ) {
			return '';
		}

		// Attempt to convert relative URLs to absolute.
		$parsed_url = parse_url( $url );
		if ( isset( $parsed_url['scheme'] ) && isset( $parsed_url['host'] ) ) {
			// Resolve against the site root rather than the requested path.
			$root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/';
			$image    = WP_Http::make_absolute_url( $image, $root_url );
		}

		return $image;
	}

	/**
	* Prepares the metadata by:
	* - stripping all HTML tags and tag entities.
	* - converting non-tag entities into characters.
	*
	* @since 5.9.0
	*
	* @param string $metadata The metadata content to prepare.
	* @return string The prepared metadata.
	*/
	private function prepare_metadata_for_output( $metadata ) {
		// Decode entities first so that entity-encoded tags are also removed by the strip below.
		$decoded = html_entity_decode( $metadata, ENT_QUOTES, get_bloginfo( 'charset' ) );

		return wp_strip_all_tags( $decoded );
	}

	/**
	* Utility function to build cache key for a given URL.
	*
	* @since 5.9.0
	*
	* @param string $url The URL for which to build a cache key.
	* @return string The cache key.
	*/
	private function build_cache_key_for_url( $url ) {
		// Hash the URL so the key has a fixed length regardless of the URL's size.
		$url_hash = md5( $url );

		return 'g_url_details_response_' . $url_hash;
	}

	/**
	* Utility function to retrieve a value from the cache at a given key.
	*
	* @since 5.9.0
	*
	* @param string $key The cache key.
	* @return mixed The value from the cache.
	*/
	private function get_cache( $key ) {
		// Cached responses are stored as site transients (see set_cache()).
		$cached_value = get_site_transient( $key );

		return $cached_value;
	}

	/**
	* Utility function to cache a given data set at a given cache key.
	*
	* @since 5.9.0
	*
	* @param string $key The cache key under which to store the value.
	* @param string $data The data to be stored at the given cache key.
	* @return bool True when transient set. False if not set.
	*/
	private function set_cache( $key, $data = '' ) {
		/**
		 * Filters the cache expiration.
		 *
		 * Can be used to adjust the time until expiration in seconds for the cache
		 * of the data retrieved for the given URL.
		 *
		 * @since 5.9.0
		 *
		 * @param int $ttl The time until cache expiration in seconds.
		 */
		$expires_in = apply_filters( 'rest_url_details_cache_expiration', HOUR_IN_SECONDS );

		// Stored as a site transient under the given key; defaults to one hour unless filtered.
		$was_stored = set_site_transient( $key, $data, $expires_in );

		return $was_stored;
	}

	/**
	* Retrieves the head element section.
	*
	* @since 5.9.0
	*
	* @param string $html The string of HTML to parse.
	* @return string The `<head>..</head>` section on success. Given `$html` if not found.
	*/
	private function get_document_head( $html ) {
		// Find the opening `<head>` tag.
		$head_start = strpos( $html, '<head' );
		if ( false === $head_start ) {
			// Didn't find it. Return the original HTML.
			return $html;
		}

		// Find the closing `</head>` tag, looking only after the opening tag.
		$head_end = strpos( $html, '</head>', $head_start );
		if ( false === $head_end ) {
			// Didn't find it. Find the opening `<body>` tag.
			$head_end = strpos( $html, '<body', $head_start );

			// Didn't find it. Return the original HTML.
			if ( false === $head_end ) {
				return $html;
			}
		}

		/*
		 * Extract the HTML from the opening tag up to the closing tag, then add the closing tag.
		 * substr() expects a length, not an end offset, so subtract the start position;
		 * otherwise the extract runs past the head whenever markup precedes `<head`.
		 */
		$head_html  = substr( $html, $head_start, $head_end - $head_start );
		$head_html .= '</head>';

		return $head_html;
	}

	/**
	* Gets all the meta tag elements that have a 'content' attribute.
	*
	* @since 5.9.0
	*
	* @param string $html The string of HTML to be parsed.
	* @return array {
	* A multi-dimensional indexed array on success, else empty array.
	*
	* @type string[] $0 Meta elements with a content attribute.
	* @type string[] $1 Content attribute's opening quotation mark.
	* @type string[] $2 Content attribute's value for each meta element.
	* }
	*/
	private function get_meta_with_content_elements( $html ) {
		/*
		 * Strategy: match every meta element that carries a content attribute and capture the
		 * content value. Callers then pick the element they want by its name/property.
		 *
		 * Matching name=description first does not work with a regex:
		 * - The content value may itself contain `>` or `/>`, which would end the match early
		 *   and truncate the value, because a regex cannot tell it is inside quotation marks.
		 * - The name and content attributes may appear in either order, with other attributes
		 *   in between.
		 * - A lookahead is not constrained to one element, so it could pair a `<meta` with a
		 *   name or content that belongs to a different element further down.
		 */

		// Opening of a meta element.
		$element_open = '#<meta\s';

		// Optional attributes before content: anything other than the > symbol.
		$attributes_before = '[^>]*';

		/*
		 * The content attribute and its captured value.
		 *
		 * The opening quotation mark is captured and backreferenced with \1 so the closing
		 * mark is the same kind. That way a value may contain the other kind of quotation
		 * mark, such as an apostrophe, as well as whitespace.
		 */
		$content_attribute = 'content=(["\']??)(.*)\1';

		// Optional attributes after content: anything other than the > symbol.
		$attributes_after = '[^>]*';

		// Closing of the element, as either /> or >, followed by the pattern delimiter.
		$element_close = '\/?>#';

		/*
		 * Flags:
		 * - i: case insensitive.
		 * - s: lets . match newlines (needed for multiline elements).
		 * - U: non-greedy matching.
		 */
		$flags = 'isU';

		$pattern = $element_open . $attributes_before . $content_attribute . $attributes_after . $element_close . $flags;

		preg_match_all( $pattern, $html, $matches );

		return $matches;
	}

	/**
	* Gets the metadata from a target meta element.
	*
	* @since 5.9.0
	*
	* @param array $meta_elements {
	* A multi-dimensional indexed array on success, else empty array.
	*
	* @type string[] $0 Meta elements with a content attribute.
	* @type string[] $1 Content attribute's opening quotation mark.
	* @type string[] $2 Content attribute's value for each meta element.
	* }
	* @param string $attr Attribute that identifies the element with the target metadata.
	* @param string $attr_value The attribute's value that identifies the element with the target metadata.
	* @return string The metadata on success. Empty string if not found.
	*/
	private function get_metadata_from_meta_element( $meta_elements, $attr, $attr_value ) {
		// Bail out if there are no meta elements.
		if ( empty( $meta_elements[0] ) ) {
			return '';
		}

		$metadata = '';
		$pattern  = '#' .
			/*
			 * Target this attribute and value to find the metadata element.
			 *
			 * Allows for (a) no, single, double quotes and (b) optional whitespace around the value.
			 *
			 * Why capture the opening quotation mark, i.e. (["\']), and then backreference,
			 * i.e \1, for the closing quotation mark?
			 * To ensure the closing quotation mark matches the opening one. Why? Attribute values
			 * can contain quotation marks, such as an apostrophe in the content.
			 *
			 * Note: $attr_value is inserted as a regex fragment, not as a literal string.
			 */
			$attr . '=([\"\']??)\s*' . $attr_value . '\s*\1' .

			/*
			 * These are the options:
			 * - i : case insensitive
			 * - s : allows newline characters for the . match (needed for multiline elements)
			 * - U means non-greedy matching
			 */
			'#isU';

		// Find the metadata element.
		foreach ( $meta_elements[0] as $index => $element ) {
			preg_match( $pattern, $element, $match );

			// This is not the metadata element. Skip it.
			if ( empty( $match ) ) {
				continue;
			}

			/*
			 * Found the metadata element.
			 * Get the metadata from its matching content array.
			 */
			if ( isset( $meta_elements[2][ $index ] ) && is_string( $meta_elements[2][ $index ] ) ) {
				$metadata = trim( $meta_elements[2][ $index ] );
			}

			// Only the first matching element is used.
			break;
		}

		return $metadata;
	}
	}