Spaces:

introspector
/

retro-sync-server

Building

retro-sync-server / apps /api-server /src /langsec.rs

mike dupont

init: retro-sync API server + viewer + 71 Bach tiles + catalog

1295969 2 days ago

13.2 kB

	#![allow(dead_code)] // Security boundary module: exposes full validation API surface
	//! LangSec — Language-Theoretic Security threat model and defensive parsing.
	//!
	//! Langsec (https://langsec.org) treats all input as a formal language and
	//! requires that parsers accept ONLY the valid subset, rejecting everything
	//! else at the boundary before any business logic runs.
	//!
	//! This module:
	//! 1. Documents the threat model for every external input surface.
	//! 2. Provides nom-based all-consuming recognisers for all identifier types
	//! not already covered by shared::parsers.
	//! 3. Provides a unified `validate_input` gateway used by route handlers
	//! as the single point of LangSec enforcement.
	//!
	//! Design rules (enforced here):
	//! - All recognisers use nom::combinator::all_consuming — partial matches fail.
	//! - No regex — regexes have ambiguous failure modes; nom's typed combinators
	//! produce explicit, structured errors.
	//! - Input length is checked BEFORE parsing — unbounded input = DoS vector.
	//! - Control characters outside the ASCII printable range are rejected.
	//! - UTF-8 is validated by Rust's str type; invalid UTF-8 never reaches here.
	use serde::Serialize;
	use tracing::warn;

	// ── Threat model ──────────────────────────────────────────────────────────────
	//
	// Surface \| Attack class \| Mitigated by
	// --------------------------------\|---------------------------\|--------------------
	// ISRC (track ID) \| Injection via path seg \| recognize_isrc()
	// BTFS CID \| Path traversal \| recognize_btfs_cid()
	// EVM address \| Address spoofing \| recognize_evm_address()
	// Tron address \| Address spoofing \| recognize_tron_address()
	// BOWI (work ID) \| SSRF / injection \| recognize_bowi()
	// IPI number \| PRO account hijack \| recognize_ipi()
	// ISWC \| Work misattribution \| recognize_iswc()
	// UPC/EAN barcode \| Product spoofing \| recognize_upc()
	// Wallet challenge nonce \| Replay attack \| 5-minute TTL + delete
	// JWT token \| Token forgery \| HMAC-SHA256 (JWT_SECRET)
	// Multipart file upload \| Polyglot file, zip bomb \| Content-Type + size limit
	// XML input (DDEX/CWR) \| XXE, XML injection \| xml_escape() + quick-xml
	// JSON API bodies \| Type confusion \| serde typed structs
	// XSLT stylesheet path \| SSRF/LFI \| whitelist of known names
	// SAP OData values \| Formula injection \| LangSec sanitise_sap_str()
	// Coinbase webhook body \| Spoofed events \| HMAC-SHA256 shared secret
	// Tron tx hash \| Hash confusion \| recognize_tron_tx_hash()
	// Music Reports API key \| Credential stuffing \| environment variable only
	// DURP CSV row \| CSV injection \| sanitise_csv_cell()
	// DQI score \| Score tampering \| server-computed, not trusted
	// Free-text title / description \| Script injection, BOM \| validate_free_text()

	// ── Result type ──────────────────────────────────────────────────────────────

	#[derive(Debug, Clone, Serialize)]
	pub struct LangsecError {
	pub field: String,
	pub reason: String,
	}

	impl std::fmt::Display for LangsecError {
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
	write!(
	f,
	"LangSec rejection — field '{}': {}",
	self.field, self.reason
	)
	}
	}

	// ── Length limits (all in bytes/codepoints) ───────────────────────────────────

	pub const MAX_TITLE_LEN: usize = 500;
	pub const MAX_ISRC_LEN: usize = 15;
	pub const MAX_BTFS_CID_LEN: usize = 200;
	pub const MAX_EVM_ADDR_LEN: usize = 42; // 0x + 40 hex
	pub const MAX_TRON_ADDR_LEN: usize = 34;
	pub const MAX_BOWI_LEN: usize = 41; // bowi: + 36-char UUID
	pub const MAX_IPI_LEN: usize = 11;
	pub const MAX_ISWC_LEN: usize = 15; // T-000.000.000-C
	pub const MAX_JWT_LEN: usize = 2048;
	pub const MAX_NONCE_LEN: usize = 128;
	pub const MAX_SAP_FIELD_LEN: usize = 60; // SAP typical field length
	pub const MAX_XSLT_NAME_LEN: usize = 64;
	pub const MAX_JSON_BODY_BYTES: usize = 256 * 1024; // 256 KiB

	// ── Tron address recogniser ───────────────────────────────────────────────────
	// Tron addresses:
	// - Base58Check encoded
	// - 21-byte raw: 0x41 (prefix) \|\| 20-byte account hash
	// - Decoded + checksum verified = 25 bytes
	// - Encoded = 34 characters starting with 'T'
	//
	// LangSec: length-check → charset-check → Base58 decode → checksum verify.

	const BASE58_ALPHABET: &[u8] = b"123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz";

	fn base58_decode(input: &str) -> Option<Vec<u8>> {
	let mut result = [0u8; 32];
	for &b in input.as_bytes() {
	let digit = BASE58_ALPHABET.iter().position(\|&x\| x == b)?;
	let mut carry = digit;
	for byte in result.iter_mut().rev() {
	carry += 58 * (*byte as usize);
	*byte = (carry & 0xFF) as u8;
	carry >>= 8;
	}
	if carry != 0 {
	return None;
	}
	}
	// Trim leading zero bytes that don't correspond to leading '1's in input
	let leading_zeros = input.chars().take_while(\|&c\| c == '1').count();
	let trim_start = result.iter().position(\|&b\| b != 0).unwrap_or(result.len());
	let actual_start = trim_start.saturating_sub(leading_zeros);
	Some(result[actual_start..].to_vec())
	}

	/// Validate a Tron Base58Check address.
	/// Returns `Ok(lowercase_hex_account_bytes)` on success.
	pub fn validate_tron_address(input: &str) -> Result<String, LangsecError> {
	let mk_err = \|reason: &str\| LangsecError {
	field: "tron_address".into(),
	reason: reason.into(),
	};

	if input.len() != MAX_TRON_ADDR_LEN {
	return Err(mk_err("must be exactly 34 characters"));
	}
	if !input.starts_with('T') {
	return Err(mk_err("must start with 'T'"));
	}
	if !input.chars().all(\|c\| BASE58_ALPHABET.contains(&(c as u8))) {
	return Err(mk_err("invalid Base58 character"));
	}

	let decoded = base58_decode(input).ok_or_else(\|\| mk_err("Base58 decode failed"))?;
	if decoded.len() < 25 {
	return Err(mk_err("decoded length < 25 bytes"));
	}

	// Last 4 bytes are the checksum; verify via double-SHA256
	let payload = &decoded[..decoded.len() - 4];
	let checksum_bytes = &decoded[decoded.len() - 4..];

	use sha2::{Digest, Sha256};
	let first = Sha256::digest(payload);
	let second = Sha256::digest(first);
	if second[..4] != checksum_bytes[..4] {
	return Err(mk_err("Base58Check checksum mismatch"));
	}

	// Tron addresses start with 0x41 in raw form
	if payload[0] != 0x41 {
	return Err(mk_err("Tron address prefix must be 0x41"));
	}

	let hex: String = payload[1..].iter().map(\|b\| format!("{b:02x}")).collect();
	Ok(hex)
	}

	/// Validate a Tron transaction hash.
	/// Format: 64 hex characters (optionally prefixed by "0x").
	pub fn validate_tron_tx_hash(input: &str) -> Result<String, LangsecError> {
	let s = input.strip_prefix("0x").unwrap_or(input);
	if s.len() != 64 {
	return Err(LangsecError {
	field: "tron_tx_hash".into(),
	reason: format!("must be 64 hex chars, got {}", s.len()),
	});
	}
	if !s.chars().all(\|c\| c.is_ascii_hexdigit()) {
	return Err(LangsecError {
	field: "tron_tx_hash".into(),
	reason: "non-hex character".into(),
	});
	}
	Ok(s.to_lowercase())
	}

	/// Validate free-text fields (titles, descriptions, artist names).
	///
	/// Policy:
	/// - UTF-8 (guaranteed by Rust `str`)
	/// - No C0/C1 control characters except TAB and NEWLINE
	/// - No Unicode BOM (U+FEFF)
	/// - No null bytes
	/// - Max `max_len` codepoints
	pub fn validate_free_text(input: &str, field: &str, max_len: usize) -> Result<(), LangsecError> {
	let codepoints: Vec<char> = input.chars().collect();
	if codepoints.len() > max_len {
	return Err(LangsecError {
	field: field.into(),
	reason: format!("exceeds {max_len} codepoints ({} given)", codepoints.len()),
	});
	}
	for c in &codepoints {
	match *c {
	'\t' \| '\n' \| '\r' => {} // allowed whitespace
	'\u{FEFF}' => {
	return Err(LangsecError {
	field: field.into(),
	reason: "BOM (U+FEFF) not permitted in text fields".into(),
	});
	}
	c if (c as u32) < 0x20 \|\| ((c as u32) >= 0x7F && (c as u32) <= 0x9F) => {
	return Err(LangsecError {
	field: field.into(),
	reason: format!("control character U+{:04X} not permitted", c as u32),
	});
	}
	_ => {}
	}
	}
	Ok(())
	}

	/// Sanitise a value destined for a SAP field (OData/IDoc).
	/// SAP ABAP fields do not support certain characters that trigger formula
	/// injection in downstream SAP exports to Excel/CSV.
	pub fn sanitise_sap_str(input: &str) -> String {
	input
	.chars()
	.take(MAX_SAP_FIELD_LEN)
	.map(\|c\| match c {
	// CSV / formula injection prefixes
	'=' \| '+' \| '-' \| '@' \| '\t' \| '\r' \| '\n' => '_',
	// SAP special chars that can break IDoc fixed-width fields
	'\|' \| '^' \| '~' => '_',
	c => c,
	})
	.collect()
	}

	/// Sanitise a value destined for a DURP CSV cell.
	/// Rejects formula-injection prefixes; strips to printable ASCII+UTF-8.
	pub fn sanitise_csv_cell(input: &str) -> String {
	let s = input.trim();
	// Strip formula injection prefixes
	let s = if matches!(
	s.chars().next(),
	Some('=' \| '+' \| '-' \| '@' \| '\t' \| '\r' \| '\n')
	) {
	&s[1..]
	} else {
	s
	};
	// Replace embedded quotes with escaped form (RFC 4180)
	s.replace('"', "\"\"")
	}

	/// Validate that a given XSLT stylesheet name is in the pre-approved allowlist.
	/// Prevents path traversal / SSRF via stylesheet parameter.
	pub fn validate_xslt_name(name: &str) -> Result<(), LangsecError> {
	const ALLOWED: &[&str] = &[
	"work_registration",
	"apra_amcos",
	"gema",
	"jasrac",
	"nordic",
	"prs",
	"sacem",
	"samro",
	"socan",
	];
	if name.len() > MAX_XSLT_NAME_LEN {
	return Err(LangsecError {
	field: "xslt_name".into(),
	reason: "name too long".into(),
	});
	}
	if !name
	.chars()
	.all(\|c\| c.is_ascii_alphanumeric() \|\| c == '_' \|\| c == '-')
	{
	return Err(LangsecError {
	field: "xslt_name".into(),
	reason: "name contains invalid characters".into(),
	});
	}
	if !ALLOWED.contains(&name) {
	warn!(xslt_name=%name, "XSLT name rejected — not in allowlist");
	return Err(LangsecError {
	field: "xslt_name".into(),
	reason: format!("'{name}' is not in the approved stylesheet list"),
	});
	}
	Ok(())
	}

	#[cfg(test)]
	mod tests {
	use super::*;

	#[test]
	fn tron_address_valid() {
	// Known valid Tron mainnet address
	let r = validate_tron_address("TQn9Y2khEsLJW1ChVWFMSMeRDow5KcbLSE");
	assert!(r.is_ok(), "{r:?}");
	}

	#[test]
	fn tron_address_wrong_prefix() {
	assert!(validate_tron_address("AQn9Y2khEsLJW1ChVWFMSMeRDow5KcbLSE").is_err());
	}

	#[test]
	fn tron_address_wrong_len() {
	assert!(validate_tron_address("TQn9Y2k").is_err());
	}

	#[test]
	fn tron_tx_hash_valid() {
	let h = "a".repeat(64);
	assert!(validate_tron_tx_hash(&h).is_ok());
	}

	#[test]
	fn free_text_rejects_control() {
	assert!(validate_free_text("hello\x00world", "title", 100).is_err());
	}

	#[test]
	fn free_text_rejects_bom() {
	assert!(validate_free_text("\u{FEFF}hello", "title", 100).is_err());
	}

	#[test]
	fn free_text_rejects_long() {
	let long = "a".repeat(501);
	assert!(validate_free_text(&long, "title", 500).is_err());
	}

	#[test]
	fn sanitise_csv_strips_formula() {
	assert!(!sanitise_csv_cell("=SUM(A1)").starts_with('='));
	}

	#[test]
	fn xslt_allowlist_works() {
	assert!(validate_xslt_name("gema").is_ok());
	assert!(validate_xslt_name("../../etc/passwd").is_err());
	}
	}