| // Copyright 2025 The Go Authors. All rights reserved. | |
| // Use of this source code is governed by a BSD-style | |
| // license that can be found in the LICENSE file. | |
| package cgroup | |
| import ( | |
| "internal/bytealg" | |
| "internal/strconv" | |
| ) | |
var (
	// ErrNoCgroup is returned when no applicable cgroup is found:
	// parseCPUCgroup returns it when no line of /proc/self/cgroup yields a
	// CPU cgroup path, and parseCPUMount returns it when no mount point
	// matches.
	ErrNoCgroup error = stringError("not in a cgroup")
	// errMalformedFile is returned by the parsers in this file when their
	// input does not have the expected format.
	errMalformedFile error = stringError("malformed file")
)
// _PATH_MAX is the longest filesystem path, in bytes, that the buffers in
// this package are sized for.
const _PATH_MAX = 4096

const (
	// PathSize is the space required to store the path of a cgroup in the
	// filesystem.
	PathSize = _PATH_MAX

	// escapedPathMax is the largest size of a path as it appears in
	// /proc/self/mountinfo. Escape sequences there are 4 characters long,
	// so a path made up entirely of escaped characters is 4 times larger
	// than the path it encodes.
	escapedPathMax = 4 * _PATH_MAX

	// ParseSize is the space required to parse /proc/self/mountinfo and
	// /proc/self/cgroup. See the scratch space notes in parseCPUMount and
	// parseCPUCgroup.
	ParseSize = 4 * escapedPathMax

	// ScratchSize is the amount of scratch space required for CPULimit.
	//
	// TODO(prattmic): This is shockingly large (68KiB). The size is driven
	// by the (very unlikely) case of extremely long paths that consist
	// mostly of escaped characters. The scratch buffer ends up in .bss in
	// package runtime, so it doesn't contribute to binary size and
	// generally won't be faulted in, but it would still be nice to shrink
	// it. A more complex parser that did not need to keep entire lines in
	// memory could get away with much less. Alternatively, a one-off mmap
	// allocation could back this buffer and only be mapped larger when the
	// extra space is actually needed.
	ScratchSize = PathSize + ParseSize
)
// Version indicates the cgroup version.
type Version int

const (
	// VersionUnknown is the zero Version. parseCPUCgroup returns this
	// value (written as 0) whenever it returns an error.
	VersionUnknown Version = iota
	// V1 identifies a cgroup v1 hierarchy.
	V1
	// V2 identifies the cgroup v2 hierarchy.
	V2
)
| func parseV1Number(buf []byte) (int64, error) { | |
| // Ignore trailing newline. | |
| i := bytealg.IndexByte(buf, '\n') | |
| if i < 0 { | |
| return 0, errMalformedFile | |
| } | |
| buf = buf[:i] | |
| val, err := strconv.ParseInt(string(buf), 10, 64) | |
| if err != nil { | |
| return 0, errMalformedFile | |
| } | |
| return val, nil | |
| } | |
| func parseV2Limit(buf []byte) (float64, bool, error) { | |
| i := bytealg.IndexByte(buf, ' ') | |
| if i < 0 { | |
| return 0, false, errMalformedFile | |
| } | |
| quotaStr := buf[:i] | |
| if bytealg.Compare(quotaStr, []byte("max")) == 0 { | |
| // No limit. | |
| return 0, false, nil | |
| } | |
| periodStr := buf[i+1:] | |
| // Ignore trailing newline, if any. | |
| i = bytealg.IndexByte(periodStr, '\n') | |
| if i < 0 { | |
| return 0, false, errMalformedFile | |
| } | |
| periodStr = periodStr[:i] | |
| quota, err := strconv.ParseInt(string(quotaStr), 10, 64) | |
| if err != nil { | |
| return 0, false, errMalformedFile | |
| } | |
| period, err := strconv.ParseInt(string(periodStr), 10, 64) | |
| if err != nil { | |
| return 0, false, errMalformedFile | |
| } | |
| return float64(quota) / float64(period), true, nil | |
| } | |
| // Finds the path of the current process's CPU cgroup and writes it to out. | |
| // | |
| // fd is a file descriptor for /proc/self/cgroup. | |
| // Returns the number of bytes written and the cgroup version (1 or 2). | |
| func parseCPUCgroup(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, Version, error) { | |
| // The format of each line is | |
| // | |
| // hierarchy-ID:controller-list:cgroup-path | |
| // | |
| // controller-list is comma-separated. | |
| // | |
| // cgroup v2 has hierarchy-ID 0. If a v1 hierarchy contains "cpu", that | |
| // is the CPU controller. Otherwise the v2 hierarchy (if any) is the | |
| // CPU controller. It is not possible to mount the same controller | |
| // simultaneously under both the v1 and the v2 hierarchies. | |
| // | |
| // See man 7 cgroups for more details. | |
| // | |
| // hierarchy-ID and controller-list have relatively small maximum | |
| // sizes, and the path can be up to _PATH_MAX, so we need a bit more | |
| // than 1 _PATH_MAX of scratch space. | |
| l := newLineReader(fd, scratch, read) | |
| // Bytes written to out. | |
| n := 0 | |
| for { | |
| err := l.next() | |
| if err == errIncompleteLine { | |
| // Don't allow incomplete lines. While in theory the | |
| // incomplete line may be for a controller we don't | |
| // care about, in practice all lines should be of | |
| // similar length, so we should just have a buffer big | |
| // enough for any. | |
| return 0, 0, err | |
| } else if err == errEOF { | |
| break | |
| } else if err != nil { | |
| return 0, 0, err | |
| } | |
| line := l.line() | |
| // The format of each line is | |
| // | |
| // hierarchy-ID:controller-list:cgroup-path | |
| // | |
| // controller-list is comma-separated. | |
| // See man 7 cgroups for more details. | |
| i := bytealg.IndexByte(line, ':') | |
| if i < 0 { | |
| return 0, 0, errMalformedFile | |
| } | |
| hierarchy := line[:i] | |
| line = line[i+1:] | |
| i = bytealg.IndexByte(line, ':') | |
| if i < 0 { | |
| return 0, 0, errMalformedFile | |
| } | |
| controllers := line[:i] | |
| line = line[i+1:] | |
| path := line | |
| if len(path) == 0 || path[0] != '/' { | |
| // We rely on this when composing the full path. | |
| return 0, 0, errMalformedFile | |
| } | |
| if len(path) > len(out) { | |
| // Should not be possible. If we really get a very long cgroup path, | |
| // read /proc/self/cgroup will fail with ENAMETOOLONG. | |
| return 0, 0, errPathTooLong | |
| } | |
| if string(hierarchy) == "0" { | |
| // v2 hierarchy. | |
| n = copy(out, path) | |
| // Keep searching, we might find a v1 hierarchy with a | |
| // CPU controller, which takes precedence. | |
| } else { | |
| // v1 hierarchy | |
| if containsCPU(controllers) { | |
| // Found a v1 CPU controller. This must be the | |
| // only one, so we're done. | |
| return copy(out, path), V1, nil | |
| } | |
| } | |
| } | |
| if n == 0 { | |
| // Found nothing. | |
| return 0, 0, ErrNoCgroup | |
| } | |
| // Must be v2, v1 returns above. | |
| return n, V2, nil | |
| } | |
| // Returns true if comma-separated list b contains "cpu". | |
| func containsCPU(b []byte) bool { | |
| for len(b) > 0 { | |
| i := bytealg.IndexByte(b, ',') | |
| if i < 0 { | |
| // Neither cmd/compile nor gccgo allocates for these string conversions. | |
| return string(b) == "cpu" | |
| } | |
| curr := b[:i] | |
| rest := b[i+1:] | |
| if string(curr) == "cpu" { | |
| return true | |
| } | |
| b = rest | |
| } | |
| return false | |
| } | |
// parseCPUMount finds the mount point of the cgroup filesystem that carries
// the CPU controller for the given cgroup and version, and writes the full
// filesystem path of that cgroup to out.
//
// fd is a file descriptor for /proc/self/mountinfo and read is the function
// used to read from it. cgroup is the cgroup path to locate; it is matched
// against the root (4) of each candidate mount. version selects which
// filesystem type is accepted and must be V1 or V2. scratch is the buffer
// handed to the line reader.
//
// Returns the number of bytes written to out. It returns ErrNoCgroup if no
// mount point matches, errMalformedFile if a line cannot be parsed,
// errPathTooLong if the result does not fit in out, and any error reported by
// unescapePath.
func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out, cgroup []byte, version Version, scratch []byte) (int, error) {
	// The format of each line is:
	//
	// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
	// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
	//
	// (1) mount ID: unique identifier of the mount (may be reused after umount)
	// (2) parent ID: ID of parent (or of self for the top of the mount tree)
	// (3) major:minor: value of st_dev for files on filesystem
	// (4) root: root of the mount within the filesystem
	// (5) mount point: mount point relative to the process's root
	// (6) mount options: per mount options
	// (7) optional fields: zero or more fields of the form "tag[:value]"
	// (8) separator: marks the end of the optional fields
	// (9) filesystem type: name of filesystem of the form "type[.subtype]"
	// (10) mount source: filesystem specific information or "none"
	// (11) super options: per super block options
	//
	// See man 5 proc_pid_mountinfo for more details.
	//
	// Note that emitted paths will not contain space, tab, newline, or
	// carriage return. Those are escaped. See Linux show_mountinfo ->
	// show_path. We must unescape before returning.
	//
	// A mount point matches if the filesystem type (9) is cgroup2,
	// or cgroup with "cpu" in the super options (11),
	// and the cgroup is in the root (4). If there are multiple matches,
	// the first one is selected.
	//
	// We return the full cgroup path, which is the mount point (5) +
	// the cgroup parameter without the root (4) prefix.
	//
	// (4), (5), and (10) are up to _PATH_MAX. The remaining fields have a
	// small fixed maximum size, so 4*_PATH_MAX is plenty of scratch space.
	// Note that non-cgroup mounts may have arbitrarily long (11), but we
	// can skip those when parsing.
	l := newLineReader(fd, scratch, read)
	for {
		err := l.next()
		if err == errIncompleteLine {
			// An incomplete line is fine as long as it doesn't
			// impede parsing the fields we need. It shouldn't be
			// possible for any mount to use more than 3*PATH_MAX
			// before (9) because there are two paths and all other
			// earlier fields have bounded options. Only (11) has
			// unbounded options.
			//
			// Fall through and parse whatever part of the line is
			// available.
		} else if err == errEOF {
			break
		} else if err != nil {
			return 0, err
		}
		line := l.line()
		// Skip first three fields.
		for range 3 {
			i := bytealg.IndexByte(line, ' ')
			if i < 0 {
				return 0, errMalformedFile
			}
			line = line[i+1:]
		}
		// (4) root: root of the mount within the filesystem
		i := bytealg.IndexByte(line, ' ')
		if i < 0 {
			return 0, errMalformedFile
		}
		root := line[:i]
		if len(root) == 0 || root[0] != '/' {
			// We rely on this in hasPathPrefix.
			return 0, errMalformedFile
		}
		line = line[i+1:]
		// (5) mount point: mount point relative to the process's root
		i = bytealg.IndexByte(line, ' ')
		if i < 0 {
			return 0, errMalformedFile
		}
		mnt := line[:i]
		line = line[i+1:]
		// Skip ahead past optional fields, delimited by " - ".
		// Each iteration looks at the next space: if it starts the
		// " - " separator we are done, otherwise we step past one field.
		for {
			i = bytealg.IndexByte(line, ' ')
			if i < 0 {
				return 0, errMalformedFile
			}
			if i+3 >= len(line) {
				// No room for the separator plus at least one
				// more byte after it.
				return 0, errMalformedFile
			}
			delim := line[i : i+3]
			if string(delim) == " - " {
				line = line[i+3:]
				break
			}
			line = line[i+1:]
		}
		// (9) filesystem type: name of filesystem of the form "type[.subtype]"
		i = bytealg.IndexByte(line, ' ')
		if i < 0 {
			return 0, errMalformedFile
		}
		ftype := line[:i]
		line = line[i+1:]
		switch version {
		case V1:
			if string(ftype) != "cgroup" {
				continue
			}
			// (10) mount source: filesystem specific information or "none"
			i = bytealg.IndexByte(line, ' ')
			if i < 0 {
				return 0, errMalformedFile
			}
			// Don't care about mount source.
			line = line[i+1:]
			// (11) super options: per super block options
			if !containsCPU(line) {
				continue
			}
		case V2:
			if string(ftype) != "cgroup2" {
				continue
			}
		default:
			// throw is defined outside this file and is presumably
			// not expected to return; the panic then only marks this
			// path as terminating.
			throw("impossible cgroup version")
			panic("unreachable")
		}
		// Check cgroup is in the root.
		// If the cgroup is /sandbox/container, the matching mount point root could be
		// /sandbox/container, /sandbox, or /
		//
		// root is unescaped in place: the unescaped form is never
		// longer than the escaped form, so it fits in the same bytes.
		rootLen, err := unescapePath(root, root)
		if err != nil {
			return 0, err
		}
		root = root[:rootLen]
		if !hasPathPrefix(cgroup, root) {
			continue // not matched, this is not the mount point we're looking for
		}
		// Cut off the root from cgroup, ensure rel starts with '/' or is empty.
		rel := cgroup[rootLen:]
		if rootLen == 1 && len(cgroup) > 1 {
			// root is "/", but cgroup is not. Keep full cgroup path.
			rel = cgroup
		}
		if hasPathPrefix(rel, []byte("/..")) {
			// The cgroup is outside the current cgroup namespace, and this
			// mount point cannot reach that cgroup.
			//
			// e.g. If the process is in cgroup /init, but in a cgroup namespace
			// rooted at /sandbox/container, /proc/self/cgroup will show /../../init.
			// We can reach it if the mount point root is
			// /../.. or /../../init, but not if it is /.. or /
			// While a mount point with root /../../.. should be able to reach
			// the cgroup, we don't know the path to the cgroup within that
			// mount point.
			continue
		}
		// All conditions met, compose the full path.
		// Copy rel to the correct place first, it may overlap with out.
		//
		// NOTE(review): this relies on unescapedLen never returning a
		// negative value, since n is used as a slice index below. Confirm
		// that holds when mnt contains stray backslashes.
		n := unescapedLen(mnt)
		if n+len(rel) > len(out) {
			return 0, errPathTooLong
		}
		copy(out[n:], rel)
		n2, err := unescapePath(out[:n], mnt)
		if err != nil {
			return 0, err
		}
		if n2 != n {
			// unescapedLen and unescapePath disagree about the length.
			throw("wrong unescaped len")
		}
		return n + len(rel), nil
	}
	// Found nothing.
	return 0, ErrNoCgroup
}
| func hasPathPrefix(p, prefix []byte) bool { | |
| i := len(prefix) | |
| if i == 1 { | |
| return true // root contains everything | |
| } | |
| if len(p) < i || !bytealg.Equal(prefix, p[:i]) { | |
| return false | |
| } | |
| return len(p) == i || p[i] == '/' // must match at path boundary | |
| } | |
var (
	// errInvalidEscape is returned by unescapePath when its input contains
	// a truncated or non-octal escape sequence, an escape whose value does
	// not fit in a byte, or when the output buffer fills up before the
	// input is consumed.
	errInvalidEscape error = stringError("invalid path escape sequence")
	// errPathTooLong is returned by parseCPUCgroup and parseCPUMount when
	// the resulting path does not fit in the caller's output buffer.
	errPathTooLong error = stringError("path too long")
)
| func unescapedLen(in []byte) int { | |
| return len(in) - bytealg.Count(in, byte('\\'))*3 | |
| } | |
| // unescapePath copies in to out, unescaping escape sequences generated by | |
| // Linux's show_path. | |
| // | |
| // That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences, | |
| // like '\040' for space. | |
| // | |
| // Caller must ensure that out at least has unescapedLen(in) bytes. | |
| // in and out may alias; in-place unescaping is supported. | |
| // | |
| // Returns the number of bytes written to out. | |
| // | |
| // Also see escapePath in cgroup_linux_test.go. | |
| func unescapePath(out []byte, in []byte) (int, error) { | |
| var outi, ini int | |
| for ini < len(in) { | |
| if outi >= len(out) { | |
| // given that caller already ensured out is long enough, this | |
| // is only possible if there are malformed escape sequences | |
| // we have not parsed yet. | |
| return outi, errInvalidEscape | |
| } | |
| c := in[ini] | |
| if c != '\\' { | |
| out[outi] = c | |
| outi++ | |
| ini++ | |
| continue | |
| } | |
| // Start of escape sequence. | |
| // Escape sequence is always 4 characters: one slash and three | |
| // digits. | |
| if ini+3 >= len(in) { | |
| return outi, errInvalidEscape | |
| } | |
| var outc int | |
| for i := range 3 { | |
| c := in[ini+1+i] | |
| if c < '0' || c > '7' { | |
| return outi, errInvalidEscape | |
| } | |
| outc *= 8 | |
| outc += int(c - '0') | |
| } | |
| if outc > 0xFF { | |
| return outi, errInvalidEscape | |
| } | |
| out[outi] = byte(outc) | |
| outi++ | |
| ini += 4 | |
| } | |
| return outi, nil | |
| } | |