// Source: github-docs-arabic-enhanced / src/shielding/middleware/handle-invalid-query-strings.ts
// Uploaded by AbdulElahGwaith ("Upload folder using huggingface_hub"), commit 88df9e4 (verified).
import type { Response, NextFunction } from 'express'
import statsd from '@/observability/lib/statsd'
import { noCacheControl, defaultCacheControl } from '@/frame/middleware/cache-control'
import { ExtendedRequest } from '@/types'
// Thresholds are exported for the sake of end-to-end tests.

/**
 * Once a request carries at least this many unrecognized query string keys,
 * it is redirected (302) to the same path with those keys removed.
 */
export const MAX_UNFAMILIAR_KEYS_REDIRECT = 3

/**
 * Once a request carries at least this many unrecognized query string keys,
 * it is rejected outright with a 400 response.
 */
export const MAX_UNFAMILIAR_KEYS_BAD_REQUEST = 15

/** Name of the statsd counter incremented whenever this middleware intervenes. */
const STATSD_KEY = 'middleware.handle_invalid_querystrings'
/**
 * Query string keys that are only considered familiar when the request path
 * starts with the given prefix. Each entry maps a path prefix to the list of
 * keys that are acceptable underneath it.
 */
const RECOGNIZED_KEYS_BY_PREFIX: Record<string, string[]> = {
  '/_next/data/': [
    'versionId',
    'productId',
    'restPage',
    'apiVersion',
    'category',
    'subcategory',
  ],
  '/api/search': [
    'query',
    'language',
    'version',
    'page',
    'product',
    'autocomplete',
    'limit',
  ],
  '/api/combined-search': [
    'query',
    'version',
    'size',
    'debug',
  ],
  '/api/anchor-redirect': [
    'hash',
    'path',
  ],
  '/api/webhooks': [
    'category',
    'version',
  ],
  '/api/pageinfo': [
    'pathname',
  ],
}
/**
 * Query string keys that are considered familiar on every path, regardless
 * of which page or endpoint is being requested.
 */
const RECOGNIZED_KEYS_BY_ANY = new Set<string>([
  // Pages that belong to a learning track
  'learn',
  'learnProduct',

  // The platform picker
  'platform',

  // The tool picker
  'tool',

  // apiVersion can appear alongside other keys, e.g. ?apiVersion=XXX&tool=vscode
  'apiVersion',

  // The search results page
  'query',

  // The Search Overlay, which can be opened from any page
  'search-overlay-input',
  'search-overlay-open',
  'search-overlay-ask-ai',

  // Drop-downs on the "Webhook events and payloads" page
  'actionType',

  // Filters for the article grid on landing pages
  'articles-category',
  'articles-filter',
  'articles-page',

  // Legacy domain tracking parameter; no longer processed, but still recognized
  'ghdomain',

  // UTM campaign tracking
  'utm_source',
  'utm_medium',
  'utm_campaign',

  // Experiments
  'feature',

  // Tracks API requests that come from external sources
  'client_name',
])
/**
 * Express middleware that shields the site from junk query strings.
 *
 * Only `GET` and `HEAD` requests are inspected; every other method goes
 * straight to `next()`. For inspected requests the checks run in this order:
 *
 * 1. Any query key containing `[` or `]` → 400.
 * 2. The number of unrecognized keys reaches `MAX_UNFAMILIAR_KEYS_BAD_REQUEST`,
 *    or both `survey-token` and `survey-vote` are present → 400.
 * 3. The number of unrecognized keys reaches `MAX_UNFAMILIAR_KEYS_REDIRECT`,
 *    or one of the known-bad patterns matches → 302 to the same path with the
 *    unrecognized keys removed.
 * 4. Otherwise the request is handed to `next()`.
 *
 * Every 400 or 302 issued here also increments the `STATSD_KEY` counter.
 *
 * @param req - Incoming request; `method`, `query`, `path` and `url` are read.
 * @param res - Response used to send the 400 or 302 when the request is rejected.
 * @param next - Called when the query string is acceptable.
 */
export default function handleInvalidQuerystrings(
  req: ExtendedRequest,
  res: Response,
  next: NextFunction,
) {
  const { method, query, path } = req

  if (method === 'GET' || method === 'HEAD') {
    const originalKeys = Object.keys(query)

    // Check for invalid query string patterns: square brackets are never
    // valid in a key.
    const invalidKeys = originalKeys.filter((key) => key.includes('[') || key.includes(']'))

    if (invalidKeys.length > 0) {
      noCacheControl(res)

      const invalidKey = invalidKeys[0].replace(/\[.*$/, '') // Get the base key name
      // The key comes straight from the request and is echoed back, so send
      // it as plain text. `res.send()` with a string would otherwise default
      // to `text/html` and let the browser interpret markup in the key.
      res.status(400).type('text/plain').send(`Invalid query string key (${invalidKey})`)

      const tags = [
        'response:400',
        'reason:invalid-brackets',
        `url:${req.url}`,
        `path:${req.path}`,
        `keys:${originalKeys.length}`,
      ]
      statsd.increment(STATSD_KEY, 1, tags)

      return
    }

    let keys = originalKeys.filter((key) => !RECOGNIZED_KEYS_BY_ANY.has(key))

    if (keys.length > 0) {
      // Before we judge the number of query strings, strip out all the ones
      // we're familiar with for this particular path.
      for (const [prefix, recognizedKeys] of Object.entries(RECOGNIZED_KEYS_BY_PREFIX)) {
        if (path.startsWith(prefix)) {
          keys = keys.filter((key) => !recognizedKeys.includes(key))
        }
      }
    }

    // If you fill out the Survey form with all the fields and somehow
    // don't attempt to make a POST request, you'll end up with a query
    // string like this.
    const honeypotted = 'survey-token' in query && 'survey-vote' in query

    if (keys.length >= MAX_UNFAMILIAR_KEYS_BAD_REQUEST || honeypotted) {
      noCacheControl(res)

      const message = honeypotted ? 'Honeypotted' : 'Too many unrecognized query string parameters'
      res.status(400).send(message)

      const tags = [
        'response:400',
        `url:${req.url}`,
        `path:${req.path}`,
        `keys:${originalKeys.length}`,
      ]
      statsd.increment(STATSD_KEY, 1, tags)

      return
    }

    // This is a pattern we've observed in production and we're shielding
    // against it happening again. The root home page is hit with a
    // 8 character long query string that has no value.
    const rootHomePage = path.split('/').length === 2
    const badKeylessQuery =
      rootHomePage && keys.length === 1 && keys[0].length === 8 && !query[keys[0]]

    // It's still a mystery why these requests happen but we've seen large
    // number of requests that have a very long URL-encoded query string
    // that starts with 'tool' but doesn't have any value.
    // For example
    //   ?tool%25252525253Dvisualstudio%252525253D%2525252526tool%25252525...
    //   ...3Dvscode%2525253D%25252526tool%2525253Dvscode%25253D%252526tool...
    //   ...%25253Dvimneovim%253D%2526tool%253Djetbrains%3D%26tool%3Djetbrains=&
    // Let's shield against those by removing them.
    const badToolsQuery = keys.some((key) => key.startsWith('tool%') && !query[key])

    if (keys.length >= MAX_UNFAMILIAR_KEYS_REDIRECT || badKeylessQuery || badToolsQuery) {
      if (process.env.NODE_ENV === 'development') {
        console.warn(
          'Redirecting because of a questionable query string, see https://github.com/github/docs/blob/main/src/shielding/README.md',
        )
      }

      defaultCacheControl(res)

      // Rebuild the query string from the keys we keep. Values are appended
      // one at a time because handing the parsed query object directly to
      // `URLSearchParams` would stringify array values (repeated parameters
      // such as `?tool=a&tool=b`) into a single comma-joined value.
      const unfamiliarKeys = new Set(keys)
      const sp = new URLSearchParams()
      for (const [key, value] of Object.entries(query)) {
        if (unfamiliarKeys.has(key)) continue
        const values = Array.isArray(value) ? value : [value]
        for (const single of values) {
          sp.append(key, String(single))
        }
      }

      // The path is taken from the request. A `Location` that starts with
      // `//` (or `/\`, which browsers treat alike) is a protocol-relative
      // URL pointing at another host, so collapse any run of leading slashes.
      let newURL = req.path.replace(/^[/\\]+/, '/')
      if (sp.toString()) newURL += `?${sp}`
      res.redirect(302, newURL)

      const tags = [
        'response:302',
        `url:${req.url}`,
        `path:${req.path}`,
        `keys:${originalKeys.length}`,
      ]
      statsd.increment(STATSD_KEY, 1, tags)

      return
    }
  }

  return next()
}