Buckets:
| from __future__ import annotations | |
| import re | |
| from pathlib import Path | |
# Files whose size in bytes exceeds this limit are skipped by scan_tree.
MAX_SCAN_BYTES = 1_000_000

# The per-role implementation directory names are the full product of these
# asset classes and roles: implementation_linvest21fingpt_<asset class>_<role>.
_ASSET_CLASSES = ("equity", "fixed_income", "multi_asset")
_ROLES = (
    "researcher",
    "portfolio_manager",
    "risk_manager",
    "performance_manager",
    "client_portfolio_manager",
    "chief_investment_officer",
)

# A path is excluded from scanning when any of its components equals one of
# these names.
SKIP_DIR_NAMES = {
    ".git",
    ".venv",
    "__pycache__",
    "site-packages",
    "shft_workspace",
    "implementation_products",
} | {
    f"implementation_linvest21fingpt_{asset_class}_{role}"
    for asset_class in _ASSET_CLASSES
    for role in _ROLES
}
# Compiled regexes; any match in scanned text is reported as a finding.
# None of the patterns uses a capturing group, so findall() and
# match.group(0) both yield the complete matched text.
SECRET_PATTERNS = [
    # "hf_" followed by 30+ alphanumerics (presumably Hugging Face tokens).
    re.compile(r"\bhf_[A-Za-z0-9]{30,}\b"),
    # "sk-" not preceded by an alphanumeric, so words such as "task-..." are
    # not flagged (presumably OpenAI-style API keys).
    re.compile(r"(?<![A-Za-z0-9])sk-[A-Za-z0-9_\-]{20,}"),
    # "runpod_" prefix in any letter case (presumably RunPod API keys).
    re.compile(r"runpod_[A-Za-z0-9_\-]{20,}", re.IGNORECASE),
    # PEM private-key headers. Covers "BEGIN PRIVATE KEY" as well as the typed
    # forms "BEGIN <TYPE> PRIVATE KEY"; the bare "BEGIN <TYPE> KEY" spelling is
    # still accepted so nothing that was flagged before is now missed.
    re.compile(
        r"BEGIN (?:(?:RSA|OPENSSH|DSA|EC|ENCRYPTED)(?: PRIVATE)?|PRIVATE) KEY"
    ),
]

# Exact matched strings that are tolerated and never reported as findings.
# NOTE(review): none of these placeholders appears long enough to be matched
# by the "hf_", "sk-" or "runpod_" patterns above, so the entries may currently
# have no effect -- confirm the intended placeholder values.
ALLOWLIST = {
    "hf_xxx_real_token_value",
    "sk-xxx_real_openai_key",
    "runpod_xxx_real_api_key",
}
def scan_text(
    text: str,
    *,
    patterns: list[re.Pattern[str]] | None = None,
    allowlist: set[str] | None = None,
) -> list[str]:
    """Return the source of each secret pattern matched in *text*.

    One entry is produced per match, so a pattern that matches several times
    appears several times in the result. A match is ignored when its complete
    matched text is a member of the allowlist.

    Args:
        text: The text to scan.
        patterns: Compiled regexes to apply; defaults to ``SECRET_PATTERNS``.
        allowlist: Matched strings to ignore; defaults to ``ALLOWLIST``.

    Returns:
        The ``.pattern`` string of the matching regex for every reported
        match, grouped by pattern in the order the patterns were given.
    """
    active_patterns = SECRET_PATTERNS if patterns is None else patterns
    allowed = ALLOWLIST if allowlist is None else allowlist
    # finditer + group(0) always yields the whole match. findall would return
    # only the captured group(s) for a pattern containing groups, and the
    # allowlist would then be compared against the wrong string.
    return [
        pattern.pattern
        for pattern in active_patterns
        for match in pattern.finditer(text)
        if match.group(0) not in allowed
    ]
def scan_file(path: Path) -> list[str]:
    """Scan one file for secrets and return the findings from scan_text.

    The file is decoded as UTF-8 with undecodable bytes replaced rather than
    rejected. A single stray byte therefore no longer causes the whole file to
    be reported as clean; any ASCII secret elsewhere in it is still detected.

    Args:
        path: The file to read.

    Returns:
        Whatever scan_text returns for the decoded contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    return scan_text(text)
def scan_tree(root: Path) -> dict[str, list[str]]:
    """Recursively scan text-like files under *root* for secrets.

    A path is skipped when any component below *root* is listed in
    SKIP_DIR_NAMES, when its suffix is not one of the scanned text suffixes,
    when it is not a regular file, or when it is larger than MAX_SCAN_BYTES.
    Files that cannot be stat'ed or read are skipped as well.

    Args:
        root: Directory whose contents are scanned.

    Returns:
        A mapping from ``str(path)`` to the non-empty findings scan_file
        returned for that file. Files without findings are omitted.
    """
    scannable_suffixes = {
        ".md",
        ".html",
        ".yaml",
        ".yml",
        ".json",
        ".py",
        ".toml",
        ".txt",
        ".log",
    }
    findings: dict[str, list[str]] = {}
    for path in root.rglob("*"):
        # Only components below root decide whether to skip. Testing the full
        # path would discard everything whenever root itself lives under a
        # directory such as "site-packages" or ".venv".
        if any(part in SKIP_DIR_NAMES for part in path.relative_to(root).parts):
            continue
        if path.suffix.lower() not in scannable_suffixes:
            continue
        try:
            if not path.is_file() or path.stat().st_size > MAX_SCAN_BYTES:
                continue
            result = scan_file(path)
        except OSError:
            # Best effort: a file that vanished or is unreadable must not
            # abort the scan of the rest of the tree.
            continue
        if result:
            findings[str(path)] = result
    return findings
Xet Storage Details
- Size: 2.92 kB
- Xet hash: 22b13f442824c3a42ad7bc93900fdb0283e262c31704a3770d7f1af3f8377a96
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.