mike dupont
init: retro-sync API server + viewer + 71 Bach tiles + catalog
1295969
#![allow(dead_code)] // Security boundary module: exposes full validation API surface
//! LangSec β€” Language-Theoretic Security threat model and defensive parsing.
//!
//! Langsec (https://langsec.org) treats all input as a formal language and
//! requires that parsers accept ONLY the valid subset, rejecting everything
//! else at the boundary before any business logic runs.
//!
//! This module:
//! 1. Documents the threat model for every external input surface.
//! 2. Provides nom-based all-consuming recognisers for all identifier types
//! not already covered by shared::parsers.
//! 3. Provides a unified `validate_input` gateway used by route handlers
//! as the single point of LangSec enforcement.
//!
//! Design rules (enforced here):
//! - All recognisers use nom::combinator::all_consuming β€” partial matches fail.
//! - No regex β€” regexes have ambiguous failure modes; nom's typed combinators
//! produce explicit, structured errors.
//! - Input length is checked BEFORE parsing β€” unbounded input = DoS vector.
//! - Control characters outside the ASCII printable range are rejected.
//! - UTF-8 is validated by Rust's str type; invalid UTF-8 never reaches here.
use serde::Serialize;
use tracing::warn;
// ── Threat model ──────────────────────────────────────────────────────────────
//
// Surface | Attack class | Mitigated by
// --------------------------------|---------------------------|--------------------
// ISRC (track ID) | Injection via path seg | recognize_isrc()
// BTFS CID | Path traversal | recognize_btfs_cid()
// EVM address | Address spoofing | recognize_evm_address()
// Tron address | Address spoofing | recognize_tron_address()
// BOWI (work ID) | SSRF / injection | recognize_bowi()
// IPI number | PRO account hijack | recognize_ipi()
// ISWC | Work misattribution | recognize_iswc()
// UPC/EAN barcode | Product spoofing | recognize_upc()
// Wallet challenge nonce | Replay attack | 5-minute TTL + delete
// JWT token | Token forgery | HMAC-SHA256 (JWT_SECRET)
// Multipart file upload | Polyglot file, zip bomb | Content-Type + size limit
// XML input (DDEX/CWR) | XXE, XML injection | xml_escape() + quick-xml
// JSON API bodies | Type confusion | serde typed structs
// XSLT stylesheet path | SSRF/LFI | whitelist of known names
// SAP OData values | Formula injection | LangSec sanitise_sap_str()
// Coinbase webhook body | Spoofed events | HMAC-SHA256 shared secret
// Tron tx hash | Hash confusion | recognize_tron_tx_hash()
// Music Reports API key | Credential stuffing | environment variable only
// DURP CSV row | CSV injection | sanitise_csv_cell()
// DQI score | Score tampering | server-computed, not trusted
// Free-text title / description | Script injection, BOM | validate_free_text()
// ── Result type ──────────────────────────────────────────────────────────────
#[derive(Debug, Clone, Serialize)]
pub struct LangsecError {
pub field: String,
pub reason: String,
}
impl std::fmt::Display for LangsecError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"LangSec rejection β€” field '{}': {}",
self.field, self.reason
)
}
}
// ── Length limits (all in bytes/codepoints) ───────────────────────────────────
pub const MAX_TITLE_LEN: usize = 500;
pub const MAX_ISRC_LEN: usize = 15;
pub const MAX_BTFS_CID_LEN: usize = 200;
pub const MAX_EVM_ADDR_LEN: usize = 42; // 0x + 40 hex
pub const MAX_TRON_ADDR_LEN: usize = 34;
pub const MAX_BOWI_LEN: usize = 41; // bowi: + 36-char UUID
pub const MAX_IPI_LEN: usize = 11;
pub const MAX_ISWC_LEN: usize = 15; // T-000.000.000-C
pub const MAX_JWT_LEN: usize = 2048;
pub const MAX_NONCE_LEN: usize = 128;
pub const MAX_SAP_FIELD_LEN: usize = 60; // SAP typical field length
pub const MAX_XSLT_NAME_LEN: usize = 64;
pub const MAX_JSON_BODY_BYTES: usize = 256 * 1024; // 256 KiB
// ── Tron address recogniser ───────────────────────────────────────────────────
// Tron addresses:
// - Base58Check encoded
// - 21-byte raw: 0x41 (prefix) || 20-byte account hash
// - Decoded + checksum verified = 25 bytes
// - Encoded = 34 characters starting with 'T'
//
// LangSec: length-check β†’ charset-check β†’ Base58 decode β†’ checksum verify.
const BASE58_ALPHABET: &[u8] = b"123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz";
fn base58_decode(input: &str) -> Option<Vec<u8>> {
let mut result = [0u8; 32];
for &b in input.as_bytes() {
let digit = BASE58_ALPHABET.iter().position(|&x| x == b)?;
let mut carry = digit;
for byte in result.iter_mut().rev() {
carry += 58 * (*byte as usize);
*byte = (carry & 0xFF) as u8;
carry >>= 8;
}
if carry != 0 {
return None;
}
}
// Trim leading zero bytes that don't correspond to leading '1's in input
let leading_zeros = input.chars().take_while(|&c| c == '1').count();
let trim_start = result.iter().position(|&b| b != 0).unwrap_or(result.len());
let actual_start = trim_start.saturating_sub(leading_zeros);
Some(result[actual_start..].to_vec())
}
/// Validate a Tron Base58Check address.
/// Returns `Ok(lowercase_hex_account_bytes)` on success.
pub fn validate_tron_address(input: &str) -> Result<String, LangsecError> {
let mk_err = |reason: &str| LangsecError {
field: "tron_address".into(),
reason: reason.into(),
};
if input.len() != MAX_TRON_ADDR_LEN {
return Err(mk_err("must be exactly 34 characters"));
}
if !input.starts_with('T') {
return Err(mk_err("must start with 'T'"));
}
if !input.chars().all(|c| BASE58_ALPHABET.contains(&(c as u8))) {
return Err(mk_err("invalid Base58 character"));
}
let decoded = base58_decode(input).ok_or_else(|| mk_err("Base58 decode failed"))?;
if decoded.len() < 25 {
return Err(mk_err("decoded length < 25 bytes"));
}
// Last 4 bytes are the checksum; verify via double-SHA256
let payload = &decoded[..decoded.len() - 4];
let checksum_bytes = &decoded[decoded.len() - 4..];
use sha2::{Digest, Sha256};
let first = Sha256::digest(payload);
let second = Sha256::digest(first);
if second[..4] != checksum_bytes[..4] {
return Err(mk_err("Base58Check checksum mismatch"));
}
// Tron addresses start with 0x41 in raw form
if payload[0] != 0x41 {
return Err(mk_err("Tron address prefix must be 0x41"));
}
let hex: String = payload[1..].iter().map(|b| format!("{b:02x}")).collect();
Ok(hex)
}
/// Validate a Tron transaction hash.
/// Format: 64 hex characters (optionally prefixed by "0x").
pub fn validate_tron_tx_hash(input: &str) -> Result<String, LangsecError> {
let s = input.strip_prefix("0x").unwrap_or(input);
if s.len() != 64 {
return Err(LangsecError {
field: "tron_tx_hash".into(),
reason: format!("must be 64 hex chars, got {}", s.len()),
});
}
if !s.chars().all(|c| c.is_ascii_hexdigit()) {
return Err(LangsecError {
field: "tron_tx_hash".into(),
reason: "non-hex character".into(),
});
}
Ok(s.to_lowercase())
}
/// Validate free-text fields (titles, descriptions, artist names).
///
/// Policy:
/// - UTF-8 (guaranteed by Rust `str`)
/// - No C0/C1 control characters except TAB and NEWLINE
/// - No Unicode BOM (U+FEFF)
/// - No null bytes
/// - Max `max_len` codepoints
pub fn validate_free_text(input: &str, field: &str, max_len: usize) -> Result<(), LangsecError> {
let codepoints: Vec<char> = input.chars().collect();
if codepoints.len() > max_len {
return Err(LangsecError {
field: field.into(),
reason: format!("exceeds {max_len} codepoints ({} given)", codepoints.len()),
});
}
for c in &codepoints {
match *c {
'\t' | '\n' | '\r' => {} // allowed whitespace
'\u{FEFF}' => {
return Err(LangsecError {
field: field.into(),
reason: "BOM (U+FEFF) not permitted in text fields".into(),
});
}
c if (c as u32) < 0x20 || ((c as u32) >= 0x7F && (c as u32) <= 0x9F) => {
return Err(LangsecError {
field: field.into(),
reason: format!("control character U+{:04X} not permitted", c as u32),
});
}
_ => {}
}
}
Ok(())
}
/// Sanitise a value destined for a SAP field (OData/IDoc).
/// SAP ABAP fields do not support certain characters that trigger formula
/// injection in downstream SAP exports to Excel/CSV.
pub fn sanitise_sap_str(input: &str) -> String {
input
.chars()
.take(MAX_SAP_FIELD_LEN)
.map(|c| match c {
// CSV / formula injection prefixes
'=' | '+' | '-' | '@' | '\t' | '\r' | '\n' => '_',
// SAP special chars that can break IDoc fixed-width fields
'|' | '^' | '~' => '_',
c => c,
})
.collect()
}
/// Sanitise a value destined for a DURP CSV cell.
/// Rejects formula-injection prefixes; strips to printable ASCII+UTF-8.
pub fn sanitise_csv_cell(input: &str) -> String {
let s = input.trim();
// Strip formula injection prefixes
let s = if matches!(
s.chars().next(),
Some('=' | '+' | '-' | '@' | '\t' | '\r' | '\n')
) {
&s[1..]
} else {
s
};
// Replace embedded quotes with escaped form (RFC 4180)
s.replace('"', "\"\"")
}
/// Validate that a given XSLT stylesheet name is in the pre-approved allowlist.
/// Prevents path traversal / SSRF via stylesheet parameter.
pub fn validate_xslt_name(name: &str) -> Result<(), LangsecError> {
const ALLOWED: &[&str] = &[
"work_registration",
"apra_amcos",
"gema",
"jasrac",
"nordic",
"prs",
"sacem",
"samro",
"socan",
];
if name.len() > MAX_XSLT_NAME_LEN {
return Err(LangsecError {
field: "xslt_name".into(),
reason: "name too long".into(),
});
}
if !name
.chars()
.all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-')
{
return Err(LangsecError {
field: "xslt_name".into(),
reason: "name contains invalid characters".into(),
});
}
if !ALLOWED.contains(&name) {
warn!(xslt_name=%name, "XSLT name rejected β€” not in allowlist");
return Err(LangsecError {
field: "xslt_name".into(),
reason: format!("'{name}' is not in the approved stylesheet list"),
});
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn tron_address_valid() {
// Known valid Tron mainnet address
let r = validate_tron_address("TQn9Y2khEsLJW1ChVWFMSMeRDow5KcbLSE");
assert!(r.is_ok(), "{r:?}");
}
#[test]
fn tron_address_wrong_prefix() {
assert!(validate_tron_address("AQn9Y2khEsLJW1ChVWFMSMeRDow5KcbLSE").is_err());
}
#[test]
fn tron_address_wrong_len() {
assert!(validate_tron_address("TQn9Y2k").is_err());
}
#[test]
fn tron_tx_hash_valid() {
let h = "a".repeat(64);
assert!(validate_tron_tx_hash(&h).is_ok());
}
#[test]
fn free_text_rejects_control() {
assert!(validate_free_text("hello\x00world", "title", 100).is_err());
}
#[test]
fn free_text_rejects_bom() {
assert!(validate_free_text("\u{FEFF}hello", "title", 100).is_err());
}
#[test]
fn free_text_rejects_long() {
let long = "a".repeat(501);
assert!(validate_free_text(&long, "title", 500).is_err());
}
#[test]
fn sanitise_csv_strips_formula() {
assert!(!sanitise_csv_cell("=SUM(A1)").starts_with('='));
}
#[test]
fn xslt_allowlist_works() {
assert!(validate_xslt_name("gema").is_ok());
assert!(validate_xslt_name("../../etc/passwd").is_err());
}
}