Buckets:

linvest21
/

shft-artifacts

Files

xet

linvest21/shft-artifacts / code /self_healing_finetuning /security /secret_scan.py

linvest21

about 1 month ago

download

raw

2.92 kB

	from __future__ import annotations

	import re
	from pathlib import Path

	# Size cap in bytes: scan_tree skips any file whose on-disk size
	# (st_size) is greater than this value.
	MAX_SCAN_BYTES = 1_000_000

	# Path component names that make scan_tree skip an entry: any path that
	# contains one of these names as a component is not scanned.
	SKIP_DIR_NAMES = {
	    ".git",
	    ".venv",
	    "__pycache__",
	    "site-packages",
	    "shft_workspace",
	    "implementation_products",
	} | {
	    # One "implementation_linvest21fingpt_<asset class>_<role>" name for
	    # every asset-class / role combination (3 x 6 = 18 names).
	    f"implementation_linvest21fingpt_{asset_class}_{role}"
	    for asset_class in ("equity", "fixed_income", "multi_asset")
	    for role in (
	        "researcher",
	        "portfolio_manager",
	        "risk_manager",
	        "performance_manager",
	        "client_portfolio_manager",
	        "chief_investment_officer",
	    )
	}


	# Compiled regexes that scan_text applies to text. None of them contains a
	# capturing group, so re.findall / re.finditer both yield the full match.
	SECRET_PATTERNS = [
	    # "hf_" followed by 30 or more ASCII alphanumerics, between word
	    # boundaries.
	    re.compile(r"\bhf_[A-Za-z0-9]{30,}\b"),
	    # "sk-" followed by 20 or more alphanumerics, "_" or "-"; the
	    # lookbehind rejects an alphanumeric right before it, so words that
	    # merely end in "sk-" (e.g. "task-...") are not reported.
	    re.compile(r"(?<![A-Za-z0-9])sk-[A-Za-z0-9_\-]{20,}"),
	    # "runpod_" (any letter case) followed by 20 or more alphanumerics,
	    # "_" or "-".
	    re.compile(r"runpod_[A-Za-z0-9_\-]{20,}", re.IGNORECASE),
	    # PEM private-key header line: the word BEGIN, an optional key-type
	    # label (RSA, OPENSSH, EC, DSA or ENCRYPTED), then the words PRIVATE
	    # and KEY. The alternation must use a bare "|": a backslash-escaped
	    # pipe inside a raw string matches a literal "|" character, which
	    # would make the pattern unable to match any real key header.
	    re.compile(r"BEGIN (?:(?:RSA|OPENSSH|EC|DSA|ENCRYPTED) )?PRIVATE KEY"),
	]

	# Exact matched strings that scan_text does not report as findings.
	# NOTE(review): these look like documentation placeholders rather than real
	# credentials - confirm. As written, none of them is long enough to be
	# matched by SECRET_PATTERNS (the hf_ pattern needs 30+ alphanumerics, the
	# sk- and runpod_ patterns need 20+ characters after the prefix), so this
	# allowlist currently never suppresses a finding.
	ALLOWLIST = {
	"hf_xxx_real_token_value",
	"sk-xxx_real_openai_key",
	"runpod_xxx_real_api_key",
	}


	def scan_text(text: str) -> list[str]:
	    """Return the regex source of every non-allowlisted secret match in *text*.

	    Every pattern in ``SECRET_PATTERNS`` is applied to the whole string, in
	    list order. For each match whose complete matched text is not in
	    ``ALLOWLIST``, the pattern's source string (``pattern.pattern``) is
	    appended to the result, so a pattern that matches several times appears
	    several times. The matched text itself is not included in the result.

	    Args:
	        text: The text to search.

	    Returns:
	        A list of pattern source strings, one per reported match; empty when
	        nothing is found.
	    """
	    # finditer + group(0) always compares the complete match against the
	    # allowlist. findall would return only the captured group(s) for any
	    # pattern that contains groups, so the allowlist check would be made
	    # against a fragment of the match instead of the matched value.
	    return [
	        pattern.pattern
	        for pattern in SECRET_PATTERNS
	        for match in pattern.finditer(text)
	        if match.group(0) not in ALLOWLIST
	    ]


	def scan_file(path: Path) -> list[str]:
	    """Read *path* as text and return the findings ``scan_text`` reports for it.

	    The file is decoded as UTF-8 with undecodable bytes replaced instead of
	    rejected. Previously a single invalid byte made the whole file return an
	    empty result, so a secret in a file that is not clean UTF-8 was never
	    reported. The replacement only affects the invalid bytes; valid ASCII
	    text around them is decoded unchanged and is still searched.

	    Args:
	        path: File to scan.

	    Returns:
	        The list returned by ``scan_text`` for the file's contents.

	    Raises:
	        OSError: If the file cannot be opened or read.
	    """
	    text = path.read_text(encoding="utf-8", errors="replace")
	    return scan_text(text)


	def scan_tree(root: Path) -> dict[str, list[str]]:
	    """Scan text-like files under *root* and map each flagged file to its findings.

	    Walks ``root.rglob("*")`` and scans regular files whose lower-cased suffix
	    is one of the text-like extensions listed below. An entry is skipped when:

	    * any path component below *root* is in ``SKIP_DIR_NAMES``;
	    * the file is larger than ``MAX_SCAN_BYTES``;
	    * the file cannot be stat'ed or read (``OSError``).

	    Args:
	        root: Directory to walk recursively.

	    Returns:
	        A dict keyed by ``str(path)`` whose values are the non-empty finding
	        lists from ``scan_file``. Files without findings are omitted.
	    """
	    # Built once instead of on every loop iteration.
	    scannable_suffixes = {
	        ".md", ".html", ".yaml", ".yml", ".json", ".py", ".toml", ".txt", ".log",
	    }
	    findings: dict[str, list[str]] = {}
	    for path in root.rglob("*"):
	        # Only components *below* root decide skipping. Checking path.parts
	        # would also test root's own ancestors, so a root located inside a
	        # directory named like a skip entry would produce no results at all.
	        if any(part in SKIP_DIR_NAMES for part in path.relative_to(root).parts):
	            continue
	        if not path.is_file() or path.suffix.lower() not in scannable_suffixes:
	            continue
	        try:
	            if path.stat().st_size > MAX_SCAN_BYTES:
	                continue
	            result = scan_file(path)
	        except OSError:
	            # A file that vanished or cannot be read is skipped, the same way
	            # a failing stat() is, instead of aborting the whole scan.
	            continue
	        if result:
	            findings[str(path)] = result
	    return findings

Xet Storage Details

Size: 2.92 kB
Xet hash: 22b13f442824c3a42ad7bc93900fdb0283e262c31704a3770d7f1af3f8377a96

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.