download
raw
6.01 kB
#!/usr/bin/env python3
# Copyright (c) 2026 SandAI. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Check for Chinese characters in source code.
Two modes:
- CI mode : set env vars BASE_REF and HEAD_REF to only check the PR diff.
- Local mode: run without those env vars to scan every tracked file in the repo.
Usage:
python3 .github/workflows/check_chinese_chars.py # scan entire repo
BASE_REF=main HEAD_REF=HEAD python3 ... # scan diff only
"""
import os
import re
import subprocess
import sys
from typing import List, Tuple
# Code-point ranges that the checker reports; joined into one regex
# character class below.
_FLAGGED_RANGES = (
    "\u4e00-\u9fff",  # CJK Unified Ideographs
    "\u3400-\u4dbf",  # CJK Unified Ideographs Extension A
    "\uf900-\ufaff",  # CJK Compatibility Ideographs
    "\u3000-\u303f",  # CJK Symbols and Punctuation
    "\uff01-\uff5e",  # Fullwidth ASCII variants
)

# Matches any single character that falls inside one of the ranges above.
CHINESE_CHAR_PATTERN = re.compile("[" + "".join(_FLAGGED_RANGES) + "]")
# Lower-case file suffixes (including the leading dot) that are never scanned.
BINARY_EXTENSIONS = frozenset(
    (
        # images
        ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".svg", ".webp",
        # audio / video
        ".mp3", ".mp4", ".wav", ".avi", ".mov", ".mkv",
        # archives
        ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z",
        # executables and libraries
        ".bin", ".exe", ".dll", ".so", ".dylib",
        # serialized models / objects
        ".pt", ".pth", ".onnx", ".safetensors", ".pickle", ".pkl",
        # documents
        ".pdf",
        # fonts
        ".woff", ".woff2", ".ttf", ".otf", ".eot",
        # compiled artifacts
        ".pyc", ".o", ".a",
        # profiler reports and array dumps
        ".nsys-rep", ".npz", ".npy",
    )
)
def _is_binary(path: str) -> bool:
    """Return True when the lower-cased suffix of *path* is in BINARY_EXTENSIONS."""
    suffix = os.path.splitext(path.lower())[1]
    return suffix in BINARY_EXTENSIONS
# ---------------------------------------------------------------------------
# CI mode: only check newly added / modified lines in the PR diff
# ---------------------------------------------------------------------------
def _parse_added_lines(diff_text: str) -> List[Tuple[str, int, str]]:
    """Extract every added line from unified-diff text.

    Args:
        diff_text: Output of ``git diff`` in unified format.

    Returns:
        A list of ``(path, new_file_line_number, content)`` tuples, one per
        ``+`` line, where *content* has the leading ``+`` removed.
    """
    added: List[Tuple[str, int, str]] = []
    current_file = None
    line_num = 0
    # False while reading a file's header lines ("index", "---", "+++", ...),
    # True once the first "@@" hunk header of that file has been seen.
    in_hunk = False

    for line in diff_text.split("\n"):
        if line.startswith("diff --git"):
            # NOTE: a path that itself contains " b/" is cut down to the text
            # after the last occurrence.
            parts = line.split(" b/")
            current_file = parts[-1] if len(parts) >= 2 else None
            in_hunk = False
            continue

        if line.startswith("@@"):
            match = re.search(r"\+(\d+)", line)
            if match:
                # Start one below the hunk's first new-file line so the
                # increment on the first "+" line lands on the right number.
                line_num = int(match.group(1)) - 1
            in_hunk = True
            continue

        if not in_hunk:
            # Header lines only. Checking the state instead of the "+++"/"---"
            # prefixes keeps added lines whose own text starts with "++"
            # (seen in the diff as "+++...") from being dropped.
            continue

        if line.startswith("+"):
            line_num += 1
            if current_file:
                added.append((current_file, line_num, line[1:]))
        elif line.startswith(" "):
            # Context line: present in the new file, so it consumes a number.
            line_num += 1
        # "-" lines and "\ No newline at end of file" markers do not exist in
        # the new file and must not advance the counter.

    return added


def _check_diff(base_sha: str, head_sha: str) -> List[Tuple[str, int, str]]:
    """Find Chinese characters on lines added between two git revisions.

    Args:
        base_sha: Any revision understood by ``git rev-parse`` (diff base).
        head_sha: Any revision understood by ``git rev-parse`` (diff head).

    Returns:
        ``(path, line_number, content)`` for each added line in a
        non-binary file that matches CHINESE_CHAR_PATTERN.

    Raises:
        subprocess.CalledProcessError: if a git command exits non-zero.
    """
    base_sha = subprocess.check_output(["git", "rev-parse", base_sha], text=True).strip()
    head_sha = subprocess.check_output(["git", "rev-parse", head_sha], text=True).strip()
    print(f"[CI mode] Checking diff between {base_sha[:8]} and {head_sha[:8]} ...")

    # Decode explicitly as UTF-8 and replace undecodable bytes so that a
    # non-UTF-8 locale or a stray binary byte in the diff cannot abort the run.
    result = subprocess.run(
        ["git", "diff", "-U0", "--diff-filter=ACM", base_sha, head_sha],
        capture_output=True,
        encoding="utf-8",
        errors="replace",
        check=True,
    )

    return [
        (path, line_no, content)
        for path, line_no, content in _parse_added_lines(result.stdout)
        if not _is_binary(path) and CHINESE_CHAR_PATTERN.search(content)
    ]
# ---------------------------------------------------------------------------
# Local mode: scan every tracked file in the repo
# ---------------------------------------------------------------------------
def _check_all_files() -> List[Tuple[str, int, str]]:
    """Scan every git-tracked, non-binary file for Chinese characters.

    Returns:
        ``(path, line_number, line_without_trailing_newline)`` for each line
        that matches CHINESE_CHAR_PATTERN.

    Raises:
        subprocess.CalledProcessError: if ``git ls-files`` exits non-zero.
    """
    print("[Local mode] Scanning all tracked files for Chinese characters ...")

    # "-z" makes git emit NUL-terminated, unquoted paths. Without it, paths
    # with non-ASCII or special characters are printed C-quoted, fail the
    # isfile() check below and are silently left out of the scan.
    raw_listing = subprocess.check_output(["git", "ls-files", "-z"])
    tracked = os.fsdecode(raw_listing).split("\0")

    findings: List[Tuple[str, int, str]] = []
    for filepath in tracked:
        if not filepath or _is_binary(filepath) or not os.path.isfile(filepath):
            continue
        try:
            # Undecodable bytes are dropped so files that are not valid
            # UTF-8 can still be scanned line by line.
            with open(filepath, encoding="utf-8", errors="ignore") as fh:
                for line_num, line in enumerate(fh, start=1):
                    if CHINESE_CHAR_PATTERN.search(line):
                        findings.append((filepath, line_num, line.rstrip("\n")))
        except (OSError, UnicodeDecodeError):
            # Best-effort scan: an unreadable file is skipped, not fatal.
            continue
    return findings
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def _escape_workflow_command(value: str, is_property: bool = False) -> str:
    """Percent-encode characters that are special in a GitHub workflow command.

    Args:
        value: Text to embed in a ``::command key=value::message`` line.
        is_property: True when *value* is a ``key=value`` property, in which
            case ``:`` and ``,`` are escaped as well.
    """
    escaped = value.replace("%", "%25").replace("\r", "%0D").replace("\n", "%0A")
    if is_property:
        escaped = escaped.replace(":", "%3A").replace(",", "%2C")
    return escaped


def _report(findings: List[Tuple[str, int, str]], is_ci: bool) -> None:
    """Print the scan result to stdout.

    Args:
        findings: ``(path, line_number, content)`` tuples to report.
        is_ci: When True, additionally print one ``::error`` workflow
            command per finding.
    """
    if not findings:
        print("\nNo Chinese characters found.")
        return

    print(f"\nFound {len(findings)} line(s) containing Chinese characters:\n")
    for filepath, line_no, content in findings:
        stripped = content.strip()
        print(f" {filepath}:{line_no}: {stripped}")
        if is_ci:
            # Escape the interpolated values so a path containing "," or ":"
            # or content containing "%" cannot corrupt the annotation.
            safe_path = _escape_workflow_command(filepath, is_property=True)
            safe_text = _escape_workflow_command(stripped)
            print(f"::error file={safe_path},line={line_no}::Chinese character detected: {safe_text}")
    print(f"\n{len(findings)} occurrence(s) total. Please remove Chinese characters from your code.")
def main():
    """Run the check in CI or local mode and exit with status 1 on findings.

    CI mode is selected only when both BASE_REF and HEAD_REF are set to
    non-empty values; otherwise every tracked file is scanned.
    """
    base_ref, head_ref = (os.environ.get(name) for name in ("BASE_REF", "HEAD_REF"))
    is_ci = bool(base_ref and head_ref)

    findings = _check_diff(base_ref, head_ref) if is_ci else _check_all_files()
    _report(findings, is_ci)

    if findings:
        sys.exit(1)


if __name__ == "__main__":
    main()

Xet Storage Details

Size:
6.01 kB
·
Xet hash:
f38fbe8b3230939f8003210a969da7f68a5e01b9864351538705488b0d1e50ed

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.