linvest21's picture
download
raw
2.92 kB
from __future__ import annotations
import re
from pathlib import Path
# Size ceiling in bytes: scan_tree skips any file whose st_size exceeds this.
MAX_SCAN_BYTES = 1_000_000
# Building blocks for the per-agent "implementation_linvest21fingpt_<asset>_<role>"
# names: every asset class is combined with every role below.
_ASSET_CLASSES = ("equity", "fixed_income", "multi_asset")
_ROLES = (
    "researcher",
    "portfolio_manager",
    "risk_manager",
    "performance_manager",
    "client_portfolio_manager",
    "chief_investment_officer",
)

# Path component names that make scan_tree skip a path entirely.
SKIP_DIR_NAMES = {
    ".git",
    ".venv",
    "__pycache__",
    "site-packages",
    "shft_workspace",
    "implementation_products",
} | {
    f"implementation_linvest21fingpt_{asset}_{role}"
    for asset in _ASSET_CLASSES
    for role in _ROLES
}
# Compiled regexes for credential-shaped strings. scan_text reports the
# ``.pattern`` string of each entry once per non-allowlisted match.
SECRET_PATTERNS = [
    # "hf_" followed by 30+ alphanumerics, delimited by word boundaries.
    re.compile(r"\bhf_[A-Za-z0-9]{30,}\b"),
    # "sk-" not preceded by an alphanumeric, then 20+ of [A-Za-z0-9_-].
    re.compile(r"(?<![A-Za-z0-9])sk-[A-Za-z0-9_\-]{20,}"),
    # "runpod_" (case-insensitive), then 20+ of [A-Za-z0-9_-].
    re.compile(r"runpod_[A-Za-z0-9_\-]{20,}", re.IGNORECASE),
    # PEM private-key headers. Real headers read "BEGIN RSA PRIVATE KEY",
    # "BEGIN OPENSSH PRIVATE KEY", "BEGIN EC PRIVATE KEY", etc., so the
    # algorithm prefix must be allowed *in front of* "PRIVATE KEY". The bare
    # "RSA"/"OPENSSH"/"PRIVATE" alternatives keep every string the earlier,
    # narrower pattern matched. Non-capturing groups make findall() return
    # the whole match rather than a fragment.
    re.compile(
        r"BEGIN (?:(?:RSA|OPENSSH|DSA|EC|ENCRYPTED) PRIVATE|RSA|OPENSSH|PRIVATE) KEY"
    ),
]
# Exact match strings that scan_text never reports as findings.
# NOTE(review): as written, none of these can equal a match of the hf_/sk-/
# runpod_ patterns in SECRET_PATTERNS (the suffixes are shorter than the
# required minimum, and the hf_ pattern excludes underscores), so this set
# currently has no effect on results — confirm intent.
ALLOWLIST = {
    "hf_xxx_real_token_value",
    "runpod_xxx_real_api_key",
    "sk-xxx_real_openai_key",
}
def scan_text(text: str) -> list[str]:
    """Return the regex source of every non-allowlisted secret match in *text*.

    Each pattern in ``SECRET_PATTERNS`` is applied to *text*. For every match
    whose full matched text is not in ``ALLOWLIST``, the pattern's source
    string (``pattern.pattern``) is appended to the result, so a pattern that
    matches several times appears several times. The matched secret itself is
    never included in the result.

    Args:
        text: The text to search.

    Returns:
        A list of pattern source strings, one per reported match, in
        ``SECRET_PATTERNS`` order; empty when nothing is found.
    """
    findings: list[str] = []
    for pattern in SECRET_PATTERNS:
        for match in pattern.finditer(text):
            # group(0) is always the whole matched text. findall() would
            # instead yield only the capture group for grouped patterns, so
            # the allowlist would be compared against a fragment.
            if match.group(0) not in ALLOWLIST:
                findings.append(pattern.pattern)
    return findings
def scan_file(path: Path) -> list[str]:
    """Scan one file's contents for secrets.

    The file is decoded as UTF-8 and handed to ``scan_text``. A file that is
    not valid UTF-8 is treated as having no findings. Other I/O errors
    (``OSError``) are not handled here and propagate to the caller.

    Args:
        path: The file to read.

    Returns:
        Whatever ``scan_text`` returns for the decoded contents, or an empty
        list when the file cannot be decoded as UTF-8.
    """
    try:
        with path.open(encoding="utf-8") as handle:
            contents = handle.read()
    except UnicodeDecodeError:
        # Not decodable as UTF-8: report nothing.
        return []
    else:
        return scan_text(contents)
def scan_tree(root: Path) -> dict[str, list[str]]:
    """Recursively scan text-like files under *root* for secrets.

    A path is skipped when any component *below root* is in
    ``SKIP_DIR_NAMES``, when it is not a regular file, when its suffix
    (case-insensitive) is not one of the scannable suffixes, or when it is
    larger than ``MAX_SCAN_BYTES``. Files that cannot be stat'ed or read
    (``OSError``) are skipped as well.

    Args:
        root: Directory to walk.

    Returns:
        A mapping from ``str(path)`` to the non-empty ``scan_file`` result
        for that path. Files with no findings are omitted.
    """
    scannable_suffixes = {
        ".md", ".html", ".yaml", ".yml", ".json", ".py", ".toml", ".txt", ".log",
    }
    findings: dict[str, list[str]] = {}
    for path in root.rglob("*"):
        # Only look at components below root: checking the full path would
        # skip *everything* whenever root itself lives under a directory such
        # as ".venv" or "shft_workspace", producing a silent empty result.
        if any(part in SKIP_DIR_NAMES for part in path.relative_to(root).parts):
            continue
        if not path.is_file() or path.suffix.lower() not in scannable_suffixes:
            continue
        try:
            if path.stat().st_size > MAX_SCAN_BYTES:
                continue
            # A file may vanish or be unreadable between stat() and the
            # read; treat that like a failed stat() rather than aborting
            # the whole walk.
            result = scan_file(path)
        except OSError:
            continue
        if result:
            findings[str(path)] = result
    return findings

Xet Storage Details

Size:
2.92 kB
·
Xet hash:
22b13f442824c3a42ad7bc93900fdb0283e262c31704a3770d7f1af3f8377a96

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.