# apply_source_from_webtxt.py — Chart/dataset/label/Vchart/box (commit 9fce90e, "clean init")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import re
import sys
from pathlib import Path
# Strict filename pattern: chart_<digits>_bar.json (case-insensitive).
FILENAME_RE = re.compile(r"^chart_(\d+)_bar\.json$", re.IGNORECASE)


def extract_num(p: Path) -> int:
    """Return the integer embedded in a ``chart_<NNNN>_bar.json`` filename.

    Used as a sort key so files order numerically (2 before 10), not
    lexicographically.  Raises ValueError for non-conforming names.
    """
    if (m := FILENAME_RE.match(p.name)) is None:
        raise ValueError(f"文件名不符合规则: {p.name}")
    return int(m.group(1))
def read_lines_no_empty(txt_path: Path) -> list[str]:
    """Read *txt_path* as UTF-8 text and return its non-empty lines.

    Decodes with ``utf-8-sig`` so a leading BOM is stripped rather than
    leaking into the first line as ``\\ufeff`` (for BOM-less files it decodes
    exactly like plain ``utf-8``).  If the bytes are not valid UTF-8 at all,
    retries with ``errors="replace"`` so a bad byte never aborts the run.

    Returns lines without trailing newline characters; blank/whitespace-only
    lines are filtered out defensively.
    """
    try:
        raw = txt_path.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        raw = txt_path.read_text(encoding="utf-8-sig", errors="replace")
    # splitlines() drops the line terminators; filter blank lines.
    return [line for line in raw.splitlines() if line.strip() != ""]
def main() -> int:
    """CLI entry point: write each web.txt line into its JSON's "Weblink".

    Files named ``chart_<NNNN>_bar.json`` in the target folder are sorted by
    their numeric middle part and paired 1:1 with the non-empty lines of
    web.txt; line N becomes the ``Weblink`` value of file N.

    Exit codes: 0 = success, 1 = no matching files or line/file count
    mismatch, 2 = bad directory or missing web.txt.
    """
    ap = argparse.ArgumentParser(
        description="按文件名中间数字正序,将 web.txt 每行写入对应 JSON 的 Weblink 字段"
    )
    ap.add_argument("dir", help="包含 json 与 web.txt 的文件夹路径")
    ap.add_argument("--txt", default="web.txt", help="文本文件名/路径(默认 web.txt)")
    ap.add_argument("--dry-run", action="store_true", help="只预览不写入")
    ap.add_argument("--backup", action="store_true", help="写入前备份为 .bak(仅第一次)")
    args = ap.parse_args()

    folder = Path(args.dir)
    if not folder.exists() or not folder.is_dir():
        print(f"目录不存在或不是文件夹:{folder}", file=sys.stderr)
        return 2

    # A relative --txt path resolves inside the target folder.
    txt_path = Path(args.txt)
    if not txt_path.is_absolute():
        txt_path = folder / txt_path
    if not txt_path.exists():
        print(f"找不到 web.txt:{txt_path}", file=sys.stderr)
        return 2

    # Collect JSONs strictly matching chart_<digits>_bar.json; the regex
    # re-checks each glob hit (glob alone would accept e.g. chart_x_bar.json).
    candidates = list(folder.glob("chart_*_bar.json"))
    json_files = []
    bad_names = []
    for p in candidates:
        if FILENAME_RE.match(p.name):
            json_files.append(p)
        else:
            bad_names.append(p.name)

    if not json_files:
        print("未找到符合 chart_0001_bar.json 规则的文件。", file=sys.stderr)
        if candidates:
            print("但找到一些类似文件:", file=sys.stderr)
            for n in sorted([c.name for c in candidates])[:20]:
                print(f" - {n}", file=sys.stderr)
        return 1

    # Sort by the numeric middle part (so 2 comes before 10).
    json_files.sort(key=extract_num)

    # Read web.txt (blank lines filtered; no trailing newlines).
    lines = read_lines_no_empty(txt_path)

    # Refuse to guess at pairings when counts disagree.
    if len(lines) != len(json_files):
        print("行数与 JSON 数量不一致,停止执行:", file=sys.stderr)
        print(f" web.txt 有效行数(已过滤空行): {len(lines)}", file=sys.stderr)
        print(f" json 文件数: {len(json_files)}", file=sys.stderr)
        print("\n前几个 json 文件(排序后):", file=sys.stderr)
        for p in json_files[:10]:
            print(f" - {p.name}", file=sys.stderr)
        return 1

    changed = 0
    for i, (p, src_value) in enumerate(zip(json_files, lines), start=1):
        try:
            original_text = p.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            original_text = p.read_text(encoding="utf-8-sig", errors="replace")
        try:
            data = json.loads(original_text)
        except Exception as e:
            # Per-file diagnostics go to stderr, consistent with the other
            # error paths above.
            print(f"错误(JSON 解析失败):{p.name} -> {e}", file=sys.stderr)
            continue
        if not isinstance(data, dict):
            print(f"跳过(JSON 顶层不是对象):{p.name}", file=sys.stderr)
            continue

        old = data.get("Weblink", None)
        data["Weblink"] = src_value
        will_change = (old != src_value)

        if args.dry_run:
            print(f"[DRY {i:04d}] {p.name}: Weblink {old!r} -> {src_value!r}")
            continue

        # Back up the original text once (never overwrite an existing .bak),
        # and only when this write would actually change the file.
        if will_change and args.backup:
            bak = p.with_suffix(p.suffix + ".bak")
            if not bak.exists():
                bak.write_text(original_text, encoding="utf-8")

        if will_change:
            p.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
            changed += 1
            print(f"[OK {i:04d}] {p.name}: Weblink 写入成功")
        else:
            print(f"[NO {i:04d}] {p.name}: Weblink 无变化(已相同)")

    print("\n==== 完成 ====")
    print(f"总文件数: {len(json_files)}")
    print(f"发生写入: {changed}")
    if bad_names:
        print("\n提示:同目录下还有这些文件名不符合严格规则(未处理):")
        for n in sorted(bad_names)[:50]:
            print(f" - {n}")
    return 0
if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit code.
    raise SystemExit(main())