// AbdulElahGwaith's picture
// Upload folder using huggingface_hub
// 88df9e4 verified
import fs from 'fs/promises'
import type { Response, NextFunction } from 'express'
import sharp from 'sharp'
import type { ExtendedRequest } from '@/types'
import { assetCacheControl, defaultCacheControl } from '@/frame/middleware/cache-control'
import { setFastlySurrogateKey, SURROGATE_ENUMS } from '@/frame/middleware/set-fastly-surrogate-key'
/**
 * Matches the virtual "max width" segment of an asset URL, e.g. `/mw-1000/`,
 * and captures the digits as group 1.
 *
 * Similar to `/cb-1234/` in asset URLs (which is just there to tell the
 * middleware that the image can be aggressively cached), this segment is
 * not part of the actual file-on-disk path: it is observed and then removed
 * from the pathname before the file is looked up on disk.
 *
 * The exact pattern needs to match how it's set in whatever Markdown
 * processing code that might make dynamic asset URLs.
 * So if you change this, make sure you change the code that expects
 * to be able to inject this into the URL.
 */
const maxWidthPathPartRegex = /\/mw-(\d+)\//
/**
 * Allow-list of the widths that may appear in a `/mw-NNN/` URL segment.
 *
 * Why not accept any number? If any integer were allowed, someone could
 * put our backend servers at risk by doing something like:
 *
 *   const makeURL = () => `${BASE}/assets/mw-${Math.floor(Math.random()*1000)}/foo.png`
 *   await Promise.all([...Array(10000).keys()].map(makeURL))
 *
 * That would be lots of distinctly different and valid URLs that the
 * CDN can never really "protect us" on because they're too often distinct.
 *
 * Keep this list as short as the business needs allow; every entry is one
 * more cacheable variant of every image.
 */
const VALID_MAX_WIDTHS: readonly number[] = [1440, 1000]
/**
 * Express middleware that serves dynamically generated assets under
 * `/assets/`.
 *
 * The only dynamic asset today is a WEBP rendition of a PNG on disk: a
 * request for `/assets/.../foo.webp` is answered by reading
 * `assets/.../foo.png` (a relative path, so resolved against the process
 * working directory), optionally downscaling it to the width named by a
 * virtual `/mw-NNN/` path segment, and converting it to WEBP with sharp.
 *
 * Outcomes:
 * - URL does not start with `/assets/`: hands over to `next()`.
 * - Method other than GET/HEAD: 405 plain text.
 * - Any query string present: 302 redirect to the same path without it.
 * - `.webp` path with a `/mw-NNN/` segment that is not allowed: 400.
 * - `.webp` path whose source PNG exists: `image/webp` body, with
 *   `assetCacheControl` applied.
 * - Everything else (including a missing source PNG): 404 plain text.
 *
 * @param req - Incoming request; `url`, `method`, `query` and `path` are read.
 * @param res - Response used for every outcome except the `next()` hand-over.
 * @param next - Called only when the URL is not under `/assets/`.
 * @throws Re-throws any failure while reading or converting the image,
 *   except the source PNG not existing (`ENOENT`), which becomes a 404.
 */
export default async function dynamicAssets(
  req: ExtendedRequest,
  res: Response,
  next: NextFunction,
) {
  if (!req.url.startsWith('/assets/')) return next()

  if (!(req.method === 'GET' || req.method === 'HEAD')) {
    return res.status(405).type('text/plain').send('Method Not Allowed')
  }

  // To protect from possible denial of service, we never allow what
  // we're going to do (the image file operation), if the whole thing
  // won't be aggressively cached.
  // If we didn't do this, someone making 2 requests, ...
  //
  //    > GET /assets/images/site/logo.webp?random=10476583
  //    > GET /assets/images/site/logo.webp?random=20196996
  //
  // ...would be treated as 2 distinct backend requests. Sure, each one
  // would be cached in the CDN, but that's not helping if someone does...
  //
  //    while (true) {
  //      startFetchThread(`/assets/images/site/logo.webp?whatever=${rand()}`)
  //    }
  //
  // So we "force" any deviation of the URL to a redirect to the canonical
  // URL (which, again, is heavily cached).
  if (Object.keys(req.query).length > 0) {
    // Apply the default cache-control to the redirect itself so it isn't
    // re-attempted over and over.
    defaultCacheControl(res)
    // This redirects to the same URL we're currently on, but with the
    // query string part omitted.
    // For example:
    //
    //    > GET /assets/images/site/logo.webp?foo=bar
    //    < 302
    //    < location: /assets/images/site/logo.webp
    //
    return res.redirect(302, req.path)
  }

  // From PNG to WEBP, if the PNG exists
  if (req.path.endsWith('.webp')) {
    const { url, maxWidth, error } = deconstructImageURL(req.path)
    if (error) {
      return res.status(400).type('text/plain').send(error.toString())
    }

    // `url` comes verbatim from the request and is about to be used as a
    // filesystem path. Refuse any `..` segment so a crafted request can't
    // make us read PNGs outside of `assets/`; such requests simply fall
    // through to the plain 404 below.
    if (!hasParentDirectorySegment(url)) {
      try {
        const originalBuffer = await fs.readFile(url.slice(1).replace(/\.webp$/, '.png'))
        const image = sharp(originalBuffer)

        if (maxWidth) {
          const { width } = await image.metadata()
          if (width === undefined) throw new Error('image metadata does not have a width')
          // Only ever shrink; never upscale a smaller original.
          if (width > maxWidth) {
            image.resize({ width: maxWidth })
          }
        }

        // Note that by default, sharp will use a lossy compression.
        // (i.e. `{lossless: false}` in the options)
        // The difference is that a lossless image is slightly crisper
        // but becomes on average 1.8x larger.
        // Given how we serve images, no human would be able to tell the
        // difference simply by looking at the image as it appears as an
        // image tag in the web page.
        // Also given that rendering-for-viewing is the "end of the line"
        // for the image meaning it just ends up being viewed and not
        // resaved as a source file. If we had intention to overwrite all
        // original PNG source files to WEBP, we should consider lossless
        // to preserve as much quality as possible at the source level.
        // The default quality is 80% which, combined with `lossless:false`
        // makes our images 2.8x smaller than the average PNG.
        const buffer = await image.webp({ effort: pickWebpEffort() }).toBuffer()
        assetCacheControl(res)
        return res.type('image/webp').send(buffer)
      } catch (catchError) {
        // Only "the source PNG doesn't exist" is an expected outcome (and
        // becomes the 404 below). Anything else, such as a corrupt image,
        // missing metadata or a permissions problem, is a real failure and
        // must not be disguised as a cacheable 404.
        if (!isFileNotFoundError(catchError)) {
          throw catchError
        }
      }
    }
  }

  // Cache the 404 so it won't be re-attempted over and over
  defaultCacheControl(res)

  // There's a preceding middleware that sets the Surrogate-Key to
  // "manual-purge" based on the URL possibly having the `/cb-xxxxx/`
  // checksum in it. But, if it failed, we don't want that. So
  // undo that if it was set.
  // It's handy too to not overly cache 404s in the CDN because
  // it could be that the next prod deployment fixes the missing image.
  // For example, a PR landed that introduced the *reference* to the image
  // but forgot to check in the new image, then a follow-up PR adds the image.
  setFastlySurrogateKey(res, SURROGATE_ENUMS.DEFAULT)

  // Don't use something like `next(404)` because we don't want a fancy
  // HTML "Page not found" page response. A failed asset lookup is very
  // unlikely to be a typo in the browser address bar or an accidentally
  // broken link, like it might be for a regular HTML page.
  res.status(404).type('text/plain').send('Asset not found')
}

/**
 * Picks the `effort` option for `sharp().webp()` based on `NODE_ENV`.
 *
 * The default in sharp.webp() for effort is 4. It's a sensible balance
 * between time and compression. Lower is faster; higher is slower but
 * yields smaller WEBP files.
 * Given that our App Service containers aren't very strong in terms of
 * CPU, we avoid the highest effort (6), which can be extremely slow. But
 * given how well our CDN protects repeated requests for the same image,
 * we can pay a fairly high cost once and reap it for a very long time.
 *
 * For more information about the effort option, see:
 * https://www.peterbe.com/plog/comparing-different-efforts-with-webp-in-sharp
 */
function pickWebpEffort(): number {
  switch (process.env.NODE_ENV) {
    case 'test':
      // When running tests, we want to make the conversion as fast
      // as possible because the resulting WEBP buffer will most
      // likely never be enjoyed by network or human eyes.
      return 1
    case 'development':
      // If you're doing local development (or review), the network is
      // not precious (localhost:4000) and you have no CDN to cache it
      // for you, so keep it low.
      return 1
    default:
      return 5
  }
}

/** True when `error` is a Node.js system error saying the file does not exist. */
function isFileNotFoundError(error: unknown): boolean {
  return error instanceof Error && (error as NodeJS.ErrnoException).code === 'ENOENT'
}

/** True when any `/`-separated segment of `pathname` is exactly `..`. */
function hasParentDirectorySegment(pathname: string): boolean {
  return pathname.split('/').includes('..')
}
/**
 * Separates the virtual `/mw-NNN/` segment from an asset pathname.
 *
 * @param url - Request pathname, e.g. `/assets/mw-1000/images/foo.webp`.
 * @returns An object with:
 *   - `url`: the pathname with the first `/mw-NNN/` segment collapsed to `/`
 *     when the width is allowed; otherwise the input unchanged.
 *   - `maxWidth`: the parsed width when a segment was present (even if it
 *     was rejected), otherwise `undefined`.
 *   - `error`: set only when a segment was present but its width is not a
 *     positive number listed in `VALID_MAX_WIDTHS`.
 */
function deconstructImageURL(url: string): {
  url: string
  maxWidth: number | undefined
  error: Error | undefined
} {
  const found = maxWidthPathPartRegex.exec(url)
  if (found === null) {
    // No width hint in the pathname; nothing to strip or validate.
    return { url, maxWidth: undefined, error: undefined }
  }

  const [segment, digits] = found
  const maxWidth = parseInt(digits)
  const isAllowed = !isNaN(maxWidth) && maxWidth > 0 && VALID_MAX_WIDTHS.includes(maxWidth)

  if (isAllowed) {
    // Drop the virtual segment so the rest maps onto the file on disk.
    return { url: url.replace(segment, '/'), maxWidth, error: undefined }
  }

  return {
    url,
    maxWidth,
    error: new Error(`width number (${maxWidth}) is not a valid number`),
  }
}