File size: 5,882 Bytes
1cce69a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15a4661
 
 
 
 
 
 
 
 
 
 
 
 
 
c2e58ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5146f3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cce69a
 
 
 
 
5146f3d
 
 
 
 
1cce69a
c2e58ef
1cce69a
8273830
 
 
 
 
 
 
15a4661
8273830
 
15a4661
 
 
 
 
8273830
15a4661
 
 
 
 
 
8273830
1cce69a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5146f3d
 
 
 
 
1cce69a
c2e58ef
1cce69a
8273830
 
 
 
 
 
 
15a4661
8273830
 
15a4661
8273830
15a4661
8273830
1cce69a
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import { NextRequest } from "next/server";

// Same-origin streaming proxy for huggingface.co. The native <video> element
// can't carry an Authorization header, so we proxy through this route, which
// pulls the user's HF access token from the HttpOnly `hf_access_token` cookie
// (set by /api/auth/session after OAuth) and forwards Range requests upstream.
//
// Public datasets work too — the upstream simply ignores the bearer token.
//
// Allowed path prefixes are constrained so this can't be turned into an open
// proxy for arbitrary huggingface.co URLs (e.g. user profile, billing pages).

// Origin every upstream URL is built on; resolveUpstreamUrl also compares the
// parsed URL's origin against this exact string.
const HF_HOST = "https://huggingface.co";
// Name of the cookie whose value is forwarded upstream as the bearer token.
const COOKIE_NAME = "hf_access_token";
// A request is proxied only when its normalized path starts with one of these.
const ALLOWED_PREFIXES = ["datasets/", "buckets/"];

// Next.js route segment config exports.
// NOTE(review): presumably these pin the route to the Node.js runtime and opt
// it out of static caching — confirm against the Next.js documentation.
export const runtime = "nodejs";
export const dynamic = "force-dynamic";

// Request headers copied from the incoming request onto the upstream request.
// A header is copied only when the client sent it with a non-empty value;
// everything not listed here (cookies included) is never forwarded.
const FORWARD_REQUEST_HEADERS = [
  "range",
  "if-modified-since",
  "if-none-match",
  "accept",
  "accept-encoding",
];

// Response headers copied from the upstream response back to the client.
// Every other upstream header is dropped.
// NOTE(review): "content-encoding" is not in this list even though
// "accept-encoding" is forwarded upstream; this presumably relies on fetch()
// handing back an already-decoded body — confirm.
const FORWARD_RESPONSE_HEADERS = [
  "content-type",
  "content-length",
  "content-range",
  "accept-ranges",
  "etag",
  "last-modified",
  "cache-control",
];

// Upstream timeout, in milliseconds.
// Generous enough for first-byte on a multi-GB video over a slow network,
// strict enough that hung connections don't pile up server-side.
const UPSTREAM_TIMEOUT_MS = 30_000;

// Produce the abort signal attached to an upstream fetch. The returned signal
// fires as soon as either source does:
//   - the incoming request's own signal (the client went away, so nobody is
//     reading the bytes we would pull), or
//   - a fresh UPSTREAM_TIMEOUT_MS timer (a stalled huggingface.co connection
//     must eventually give its socket back).
function upstreamSignal(req: NextRequest): AbortSignal {
  const clientGone = req.signal;
  const deadline = AbortSignal.timeout(UPSTREAM_TIMEOUT_MS);
  return AbortSignal.any([clientGone, deadline]);
}

// Assemble the headers sent to huggingface.co for a proxied request.
//
// GET and HEAD both call this so the two verbs forward an identical header
// set; that keeps conditional requests (e.g. an If-None-Match etag check)
// working for HEAD as well as GET.
//
// The result contains:
//   - "authorization: Bearer <token>" when the COOKIE_NAME cookie has a value;
//   - each FORWARD_REQUEST_HEADERS entry the client sent with a non-empty value.
function buildUpstreamHeaders(req: NextRequest): Headers {
  const outgoing = new Headers();

  const accessToken = req.cookies.get(COOKIE_NAME)?.value;
  if (accessToken) {
    outgoing.set("authorization", `Bearer ${accessToken}`);
  }

  FORWARD_REQUEST_HEADERS.forEach((name) => {
    const value = req.headers.get(name);
    if (value) {
      outgoing.set(name, value);
    }
  });

  return outgoing;
}

// Build the upstream huggingface.co URL for a proxied request and validate it.
//
// subPath      - request path below this route, segments joined with "/".
// searchParams - incoming query string; every key/value pair is copied onto
//                the upstream URL, repeated keys included.
//
// Returns the URL to fetch, or null if the request should be rejected.
//
// Attack surfaces / pitfalls this guards against:
// 1. Path traversal — `subPath = "datasets/../api/tokens"` passes a naive
//    startsWith("datasets/") check, but URL normalization resolves it to
//    huggingface.co/api/tokens. The prefix is checked on the *normalized*
//    pathname after construction, so traversal is caught.
// 2. Origin escape — exotic URL syntax could cause new URL() to land on a
//    different host. The parsed origin must equal HF_HOST.
// 3. Path/query confusion — a literal "?" or "#" inside subPath would be
//    re-parsed as the start of a query or fragment and truncate the path, so
//    both are percent-encoded before parsing. The query string is taken from
//    searchParams only.
function resolveUpstreamUrl(
  subPath: string,
  searchParams: URLSearchParams,
): URL | null {
  const pathOnly = subPath.replace(/[?#]/g, (ch) => encodeURIComponent(ch));

  let upstreamUrl: URL;
  try {
    upstreamUrl = new URL(`${HF_HOST}/${pathOnly}`);
  } catch {
    return null;
  }

  if (upstreamUrl.origin !== HF_HOST) return null;

  const normalized = upstreamUrl.pathname.replace(/^\/+/, "");
  if (!ALLOWED_PREFIXES.some((p) => normalized.startsWith(p))) return null;

  // append (not set): set() keeps only the last value of a repeated key, which
  // would silently turn `?a=1&a=2` into `?a=2` on the way upstream.
  for (const [k, v] of searchParams) {
    upstreamUrl.searchParams.append(k, v);
  }
  return upstreamUrl;
}

// GET handler: streams a huggingface.co resource back to the caller.
//
// req - incoming request; its cookie supplies the bearer token and its
//       Range / conditional headers are forwarded upstream.
// ctx - route context; `params.path` holds the catch-all path segments.
//
// Responses:
//   403 - the path is outside the allowed prefixes (or otherwise rejected).
//   504 - upstream produced no response headers within UPSTREAM_TIMEOUT_MS.
//   502 - any other failure while fetching upstream.
//   otherwise the upstream status, a whitelisted subset of its headers, and
//   its body streamed through unchanged.
export async function GET(
  req: NextRequest,
  ctx: { params: Promise<{ path: string[] }> },
) {
  const { path } = await ctx.params;
  const upstreamUrl = resolveUpstreamUrl(
    path.join("/"),
    req.nextUrl.searchParams,
  );
  if (!upstreamUrl) return new Response("Forbidden", { status: 403 });

  const headers = buildUpstreamHeaders(req);

  // The deadline covers only the wait for upstream response *headers*. A
  // signal passed to fetch() stays attached to the response body, so a plain
  // AbortSignal.timeout() would also kill any download still streaming when
  // the timer fires — i.e. every large video on a slow link. The timer is
  // therefore cancelled as soon as fetch() settles; after that only a client
  // disconnect (req.signal) cancels the transfer.
  const deadline = new AbortController();
  let timedOut = false;
  const timer = setTimeout(() => {
    timedOut = true;
    deadline.abort(new DOMException("upstream timed out", "TimeoutError"));
  }, UPSTREAM_TIMEOUT_MS);

  let upstream: Response;
  try {
    upstream = await fetch(upstreamUrl, {
      method: "GET",
      headers,
      redirect: "follow",
      cache: "no-store",
      signal: AbortSignal.any([req.signal, deadline.signal]),
    });
  } catch (err) {
    clearTimeout(timer);
    // Network error reaching huggingface.co, or the upstream timed out, or
    // the client went away. The native <video> turns this into a generic
    // load error with no details, so log server-side and return a useful
    // status the client can surface in devtools.
    const isTimeout =
      timedOut || (err instanceof Error && err.name === "TimeoutError");
    console.error("[proxy] upstream fetch failed", err);
    return new Response(
      isTimeout
        ? "Gateway timeout: upstream took too long"
        : "Bad gateway: upstream fetch failed",
      { status: isTimeout ? 504 : 502 },
    );
  }
  clearTimeout(timer);

  const respHeaders = new Headers();
  for (const h of FORWARD_RESPONSE_HEADERS) {
    const v = upstream.headers.get(h);
    if (v) respHeaders.set(h, v);
  }

  // fetch() hands back a decoded body when upstream used a content-encoding,
  // but the upstream content-length still describes the *encoded* bytes.
  // Forwarding it would announce the wrong size for the body we actually
  // send, so drop it and let the server frame the response itself.
  if (upstream.headers.has("content-encoding")) {
    respHeaders.delete("content-length");
  }

  return new Response(upstream.body, {
    status: upstream.status,
    statusText: upstream.statusText,
    headers: respHeaders,
  });
}

// HEAD handler: same validation, auth and header forwarding as GET, but the
// response never carries a body.
//
// req - incoming request; its cookie supplies the bearer token and its
//       conditional headers are forwarded upstream.
// ctx - route context; `params.path` holds the catch-all path segments.
//
// Responses (all body-less):
//   403 - the path is outside the allowed prefixes (or otherwise rejected).
//   504 - the upstream request hit the timeout.
//   502 - any other failure while fetching upstream.
//   otherwise the upstream status and a whitelisted subset of its headers.
export async function HEAD(
  req: NextRequest,
  ctx: { params: Promise<{ path: string[] }> },
) {
  const { path } = await ctx.params;
  const upstreamUrl = resolveUpstreamUrl(
    path.join("/"),
    req.nextUrl.searchParams,
  );
  if (!upstreamUrl) return new Response(null, { status: 403 });

  const headers = buildUpstreamHeaders(req);

  let upstream: Response;
  try {
    upstream = await fetch(upstreamUrl, {
      method: "HEAD",
      headers,
      redirect: "follow",
      cache: "no-store",
      signal: upstreamSignal(req),
    });
  } catch (err) {
    // Log server-side; the status code is the only detail the client gets.
    const isTimeout = err instanceof Error && err.name === "TimeoutError";
    console.error("[proxy] upstream HEAD failed", err);
    return new Response(null, { status: isTimeout ? 504 : 502 });
  }

  const respHeaders = new Headers();
  for (const h of FORWARD_RESPONSE_HEADERS) {
    const v = upstream.headers.get(h);
    if (v) respHeaders.set(h, v);
  }

  // content-encoding is not forwarded, so clients are told the representation
  // is unencoded. When upstream did encode, its content-length is the encoded
  // size and would misreport the length of what a GET through this route
  // delivers; omit it rather than advertise a wrong number.
  if (upstream.headers.has("content-encoding")) {
    respHeaders.delete("content-length");
  }

  return new Response(null, {
    status: upstream.status,
    statusText: upstream.statusText,
    headers: respHeaders,
  });
}