File size: 13,153 Bytes
1295969
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
#![allow(dead_code)] // Security boundary module: exposes full validation API surface
//! LangSec — Language-Theoretic Security threat model and defensive parsing.
//!
//! Langsec (https://langsec.org) treats all input as a formal language and
//! requires that parsers accept ONLY the valid subset, rejecting everything
//! else at the boundary before any business logic runs.
//!
//! This module:
//!   1. Documents the threat model for every external input surface.
//!   2. Provides nom-based all-consuming recognisers for all identifier types
//!      not already covered by shared::parsers.
//!   3. Provides a unified `validate_input` gateway used by route handlers
//!      as the single point of LangSec enforcement.
//!
//! Design rules (enforced here):
//!   - All recognisers use nom::combinator::all_consuming — partial matches fail.
//!   - No regex — regexes have ambiguous failure modes; nom's typed combinators
//!     produce explicit, structured errors.
//!   - Input length is checked BEFORE parsing — unbounded input = DoS vector.
//!   - Control characters outside the ASCII printable range are rejected.
//!   - UTF-8 is validated by Rust's str type; invalid UTF-8 never reaches here.
use serde::Serialize;
use tracing::warn;

// ── Threat model ──────────────────────────────────────────────────────────────
//
// Surface                         | Attack class              | Mitigated by
// --------------------------------|---------------------------|--------------------
// ISRC (track ID)                 | Injection via path seg    | recognize_isrc()
// BTFS CID                        | Path traversal            | recognize_btfs_cid()
// EVM address                     | Address spoofing          | recognize_evm_address()
// Tron address                    | Address spoofing          | validate_tron_address()
// BOWI (work ID)                  | SSRF / injection          | recognize_bowi()
// IPI number                      | PRO account hijack        | recognize_ipi()
// ISWC                            | Work misattribution       | recognize_iswc()
// UPC/EAN barcode                 | Product spoofing          | recognize_upc()
// Wallet challenge nonce          | Replay attack             | 5-minute TTL + delete
// JWT token                       | Token forgery             | HMAC-SHA256 (JWT_SECRET)
// Multipart file upload           | Polyglot file, zip bomb   | Content-Type + size limit
// XML input (DDEX/CWR)            | XXE, XML injection        | xml_escape() + quick-xml
// JSON API bodies                 | Type confusion            | serde typed structs
// XSLT stylesheet path            | SSRF/LFI                  | whitelist of known names
// SAP OData values                | Formula injection         | LangSec sanitise_sap_str()
// Coinbase webhook body           | Spoofed events            | HMAC-SHA256 shared secret
// Tron tx hash                    | Hash confusion            | validate_tron_tx_hash()
// Music Reports API key           | Credential stuffing       | environment variable only
// DURP CSV row                    | CSV injection             | sanitise_csv_cell()
// DQI score                       | Score tampering           | server-computed, not trusted
// Free-text title / description   | Script injection, BOM     | validate_free_text()

// ── Result type ──────────────────────────────────────────────────────────────

/// Structured rejection returned by the validators in this module when an
/// input fails a LangSec check. Serialisable so it can be embedded in a
/// response body.
#[derive(Debug, Clone, Serialize)]
pub struct LangsecError {
    /// Name of the input field that was rejected (e.g. `"tron_address"`).
    pub field: String,
    /// Human-readable explanation of why the value was rejected.
    pub reason: String,
}

impl std::fmt::Display for LangsecError {
    /// Renders the rejection as a single line naming the offending field and
    /// the reason, e.g. `LangSec rejection — field 'xslt_name': name too long`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The separator is a real em dash (U+2014); the previous literal held
        // its mis-decoded byte sequence and printed as garbage.
        write!(
            f,
            "LangSec rejection — field '{}': {}",
            self.field, self.reason
        )
    }
}

// ── Length limits (all in bytes/codepoints) ───────────────────────────────────

/// Cap for free-text titles. Not referenced in this file; presumably passed
/// as `max_len` to `validate_free_text` by callers — TODO confirm.
pub const MAX_TITLE_LEN: usize = 500;
/// Cap for ISRC input. NOTE(review): 15 presumably allows for a hyphenated
/// form — verify against the ISRC recogniser, which is not in this file.
pub const MAX_ISRC_LEN: usize = 15;
/// Cap for BTFS content identifiers (not referenced in this file).
pub const MAX_BTFS_CID_LEN: usize = 200;
/// Cap for EVM addresses.
pub const MAX_EVM_ADDR_LEN: usize = 42; // 0x + 40 hex
/// Exact byte length `validate_tron_address` requires of its input.
pub const MAX_TRON_ADDR_LEN: usize = 34;
/// Cap for BOWI work identifiers: the 5-character `bowi:` prefix plus a UUID.
pub const MAX_BOWI_LEN: usize = 41; // bowi: + 36-char UUID
/// Cap for IPI numbers (not referenced in this file).
pub const MAX_IPI_LEN: usize = 11;
/// Cap for ISWC codes in the punctuated form shown alongside.
pub const MAX_ISWC_LEN: usize = 15; // T-000.000.000-C
/// Cap for JWT tokens (not referenced in this file).
pub const MAX_JWT_LEN: usize = 2048;
/// Cap for wallet challenge nonces (not referenced in this file).
pub const MAX_NONCE_LEN: usize = 128;
/// Number of codepoints `sanitise_sap_str` keeps before truncating.
pub const MAX_SAP_FIELD_LEN: usize = 60; // SAP typical field length
/// Maximum byte length `validate_xslt_name` accepts for a stylesheet name.
pub const MAX_XSLT_NAME_LEN: usize = 64;
/// Cap for JSON request bodies, in bytes (not referenced in this file).
pub const MAX_JSON_BODY_BYTES: usize = 256 * 1024; // 256 KiB

// ── Tron address recogniser ───────────────────────────────────────────────────
// Tron addresses:
//   - Base58Check encoded
//   - 21-byte raw: 0x41 (prefix) || 20-byte account hash
//   - Decoded + checksum verified = 25 bytes
//   - Encoded = 34 characters starting with 'T'
//
// LangSec: length-check → charset-check → Base58 decode → checksum verify.

/// Base58 digit set: the ASCII digits and letters with `0`, `O`, `I` and `l`
/// left out. A symbol's index in this table is its digit value.
const BASE58_ALPHABET: &[u8] = b"123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz";

/// Decode a Base58 string into big-endian bytes.
///
/// The value is accumulated in a fixed 32-byte buffer, so the work done is
/// bounded regardless of what the caller passes in. Each leading `'1'` in the
/// input stands for one leading zero byte in the output.
///
/// Returns `None` when:
///   - a byte of `input` is not in [`BASE58_ALPHABET`] (this covers every
///     non-ASCII byte), or
///   - the decoded output, including the zero bytes implied by leading `'1'`s,
///     would not fit in 32 bytes.
fn base58_decode(input: &str) -> Option<Vec<u8>> {
    let mut result = [0u8; 32];
    for &b in input.as_bytes() {
        let digit = BASE58_ALPHABET.iter().position(|&x| x == b)?;
        // result = result * 58 + digit, least-significant byte first.
        let mut carry = digit;
        for byte in result.iter_mut().rev() {
            carry += 58 * usize::from(*byte);
            *byte = (carry & 0xFF) as u8;
            carry >>= 8;
        }
        // Anything still carried out of the top byte means the value needs
        // more than 32 bytes.
        if carry != 0 {
            return None;
        }
    }
    // Drop the unused zero padding at the front of the buffer, but keep one
    // zero byte for every leading '1' of the input.
    let leading_zeros = input.bytes().take_while(|&b| b == b'1').count();
    let trim_start = result.iter().position(|&b| b != 0).unwrap_or(result.len());
    // If the required zero bytes do not fit in front of the value, the output
    // would exceed the buffer. Fail instead of silently returning a shorter
    // (and therefore wrong) byte string.
    let actual_start = trim_start.checked_sub(leading_zeros)?;
    Some(result[actual_start..].to_vec())
}

/// Validate a Tron Base58Check address.
///
/// Checks run in this order, stopping at the first failure:
///   1. the input is exactly `MAX_TRON_ADDR_LEN` bytes long,
///   2. it starts with `'T'`,
///   3. every byte is in `BASE58_ALPHABET`,
///   4. it Base58-decodes to at least 25 bytes,
///   5. the last 4 decoded bytes equal the first 4 bytes of the double
///      SHA-256 of the preceding bytes,
///   6. the first decoded byte is the `0x41` prefix.
///
/// Returns `Ok(lowercase_hex_account_bytes)` on success: the payload bytes
/// after the `0x41` prefix, hex-encoded.
///
/// # Errors
/// Returns a [`LangsecError`] with `field` set to `"tron_address"` and a
/// `reason` naming the failed check.
pub fn validate_tron_address(input: &str) -> Result<String, LangsecError> {
    let mk_err = |reason: &str| LangsecError {
        field: "tron_address".into(),
        reason: reason.into(),
    };

    if input.len() != MAX_TRON_ADDR_LEN {
        return Err(mk_err("must be exactly 34 characters"));
    }
    if !input.starts_with('T') {
        return Err(mk_err("must start with 'T'"));
    }
    // Compare raw bytes. Casting a `char` to `u8` keeps only the low 8 bits,
    // so a non-ASCII codepoint such as U+0141 would masquerade as 'A' and
    // slip past this check.
    if !input.bytes().all(|b| BASE58_ALPHABET.contains(&b)) {
        return Err(mk_err("invalid Base58 character"));
    }

    let decoded = base58_decode(input).ok_or_else(|| mk_err("Base58 decode failed"))?;
    if decoded.len() < 25 {
        return Err(mk_err("decoded length < 25 bytes"));
    }

    // Last 4 bytes are the checksum; verify via double-SHA256
    let (payload, checksum_bytes) = decoded.split_at(decoded.len() - 4);

    use sha2::{Digest, Sha256};
    let first = Sha256::digest(payload);
    let second = Sha256::digest(first);
    if second[..4] != checksum_bytes[..4] {
        return Err(mk_err("Base58Check checksum mismatch"));
    }

    // Tron addresses start with 0x41 in raw form
    if payload[0] != 0x41 {
        return Err(mk_err("Tron address prefix must be 0x41"));
    }

    let hex: String = payload[1..].iter().map(|b| format!("{b:02x}")).collect();
    Ok(hex)
}

/// Check that `input` is a Tron transaction hash and normalise it.
///
/// A single leading `"0x"` is tolerated and removed. What remains must be
/// exactly 64 bytes long and consist solely of ASCII hex digits.
///
/// Returns the 64 hex digits in lowercase, without any prefix.
///
/// # Errors
/// Returns a [`LangsecError`] with `field` set to `"tron_tx_hash"` when the
/// length is wrong (the reason reports the length seen) or when a non-hex
/// character is present.
pub fn validate_tron_tx_hash(input: &str) -> Result<String, LangsecError> {
    let reject = |reason: String| LangsecError {
        field: "tron_tx_hash".into(),
        reason,
    };

    let digits = match input.strip_prefix("0x") {
        Some(rest) => rest,
        None => input,
    };

    let seen = digits.len();
    if seen != 64 {
        return Err(reject(format!("must be 64 hex chars, got {seen}")));
    }
    // Any byte of a multi-byte UTF-8 sequence is >= 0x80 and so is never a
    // hex digit; scanning bytes rejects exactly what scanning chars would.
    if digits.bytes().any(|b| !b.is_ascii_hexdigit()) {
        return Err(reject("non-hex character".into()));
    }
    // Only ASCII remains at this point, so ASCII lowercasing is sufficient.
    Ok(digits.to_ascii_lowercase())
}

/// Validate free-text fields (titles, descriptions, artist names).
///
/// Policy:
///   - UTF-8 (guaranteed by Rust `str`)
///   - Max `max_len` codepoints (counted as `char`s, not bytes)
///   - No C0/C1 control characters (U+0000–U+001F, U+007F–U+009F), which
///     includes NUL; TAB, LF and CR are the only exceptions
///   - No Unicode BOM (U+FEFF)
///
/// The length is checked first; the text is then scanned left to right and
/// the first offending character is the one reported.
///
/// # Errors
/// Returns a [`LangsecError`] whose `field` is the `field` argument and whose
/// `reason` names the violated rule.
pub fn validate_free_text(input: &str, field: &str, max_len: usize) -> Result<(), LangsecError> {
    let reject = |reason: String| LangsecError {
        field: field.into(),
        reason,
    };

    // Count codepoints without materialising them: this is untrusted input
    // and a `Vec<char>` would cost four bytes per codepoint before the limit
    // has even been checked.
    let given = input.chars().count();
    if given > max_len {
        return Err(reject(format!(
            "exceeds {max_len} codepoints ({given} given)"
        )));
    }

    for c in input.chars() {
        if matches!(c, '\t' | '\n' | '\r') {
            continue; // allowed whitespace
        }
        if c == '\u{FEFF}' {
            return Err(reject("BOM (U+FEFF) not permitted in text fields".into()));
        }
        // `char::is_control` is true for exactly the C0 range, DEL and the C1
        // range (general category Cc).
        if c.is_control() {
            return Err(reject(format!(
                "control character U+{:04X} not permitted",
                c as u32
            )));
        }
    }
    Ok(())
}

/// Make a value safe to place in a SAP field (OData/IDoc).
///
/// Keeps at most the first `MAX_SAP_FIELD_LEN` codepoints of `input` and
/// replaces each of the following with `'_'`, wherever it occurs:
///   - `=`, `+`, `-`, `@`, TAB, CR, LF: characters that can trigger formula
///     injection once SAP data is exported to Excel/CSV;
///   - `|`, `^`, `~`: characters that can break IDoc fixed-width fields.
///
/// Every other character is passed through unchanged.
pub fn sanitise_sap_str(input: &str) -> String {
    let mut cleaned = String::new();
    for ch in input.chars().take(MAX_SAP_FIELD_LEN) {
        let needs_masking = matches!(
            ch,
            '=' | '+' | '-' | '@' | '\t' | '\r' | '\n' | '|' | '^' | '~'
        );
        cleaned.push(if needs_masking { '_' } else { ch });
    }
    cleaned
}

/// Sanitise a value destined for a DURP CSV cell.
///
/// Steps:
///   1. Surrounding whitespace is trimmed.
///   2. Every leading formula trigger (`=`, `+`, `-`, `@`) is removed,
///      together with any whitespace interleaved with them, so the result
///      never begins with a trigger. Removing only one character would leave
///      `==SUM(A1)` starting with `=`.
///   3. Embedded double quotes are doubled (RFC 4180 escaping).
///
/// The returned text is NOT wrapped in quotes; the caller is expected to do
/// that when writing the row. No other characters are filtered.
pub fn sanitise_csv_cell(input: &str) -> String {
    let is_formula_trigger = |c: char| matches!(c, '=' | '+' | '-' | '@');
    let body = input
        .trim()
        .trim_start_matches(|c: char| is_formula_trigger(c) || c.is_whitespace());
    // Replace embedded quotes with escaped form (RFC 4180)
    body.replace('"', "\"\"")
}

/// Validate that a given XSLT stylesheet name is in the pre-approved allowlist.
/// Prevents path traversal / SSRF via stylesheet parameter.
///
/// Checks run in this order, stopping at the first failure:
///   1. `name` is at most `MAX_XSLT_NAME_LEN` bytes long,
///   2. `name` contains only ASCII letters, ASCII digits, `_` and `-`,
///   3. `name` is one of the entries in `ALLOWED`.
///
/// Only an allowlist miss is logged. By then the name has passed the length
/// and charset checks, so the logged value is short, plain ASCII.
///
/// # Errors
/// Returns a [`LangsecError`] with `field` set to `"xslt_name"` and a
/// `reason` naming the failed check.
pub fn validate_xslt_name(name: &str) -> Result<(), LangsecError> {
    const ALLOWED: &[&str] = &[
        "work_registration",
        "apra_amcos",
        "gema",
        "jasrac",
        "nordic",
        "prs",
        "sacem",
        "samro",
        "socan",
    ];
    let reject = |reason: String| LangsecError {
        field: "xslt_name".into(),
        reason,
    };

    if name.len() > MAX_XSLT_NAME_LEN {
        return Err(reject("name too long".into()));
    }
    let charset_ok = name
        .chars()
        .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-');
    if !charset_ok {
        return Err(reject("name contains invalid characters".into()));
    }
    if !ALLOWED.contains(&name) {
        // The separator is a real em dash (U+2014); the previous literal held
        // its mis-decoded byte sequence and logged as garbage.
        warn!(xslt_name=%name, "XSLT name rejected — not in allowlist");
        return Err(reject(format!(
            "'{name}' is not in the approved stylesheet list"
        )));
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A known valid Tron mainnet address is accepted; on failure the
    /// rejection is printed.
    #[test]
    fn tron_address_valid() {
        let outcome = validate_tron_address("TQn9Y2khEsLJW1ChVWFMSMeRDow5KcbLSE");
        assert!(outcome.is_ok(), "{outcome:?}");
    }

    /// The same address with its first character changed from `T` to `A` is refused.
    #[test]
    fn tron_address_wrong_prefix() {
        let outcome = validate_tron_address("AQn9Y2khEsLJW1ChVWFMSMeRDow5KcbLSE");
        assert!(matches!(outcome, Err(_)));
    }

    /// An input shorter than the required length is refused.
    #[test]
    fn tron_address_wrong_len() {
        let outcome = validate_tron_address("TQn9Y2k");
        assert!(matches!(outcome, Err(_)));
    }

    /// Sixty-four hex digits with no prefix are accepted.
    #[test]
    fn tron_tx_hash_valid() {
        let hash = "a".repeat(64);
        let outcome = validate_tron_tx_hash(&hash);
        assert!(matches!(outcome, Ok(_)));
    }

    /// An embedded NUL counts as a forbidden control character.
    #[test]
    fn free_text_rejects_control() {
        let outcome = validate_free_text("hello\x00world", "title", 100);
        assert!(matches!(outcome, Err(_)));
    }

    /// A leading byte-order mark is refused.
    #[test]
    fn free_text_rejects_bom() {
        let outcome = validate_free_text("\u{FEFF}hello", "title", 100);
        assert!(matches!(outcome, Err(_)));
    }

    /// One codepoint over the limit is refused.
    #[test]
    fn free_text_rejects_long() {
        let oversized = "a".repeat(501);
        let outcome = validate_free_text(&oversized, "title", 500);
        assert!(matches!(outcome, Err(_)));
    }

    /// The sanitised cell must not begin with `=`.
    #[test]
    fn sanitise_csv_strips_formula() {
        let cell = sanitise_csv_cell("=SUM(A1)");
        assert!(!cell.starts_with('='));
    }

    /// An allowlisted name passes; a path-traversal attempt does not.
    #[test]
    fn xslt_allowlist_works() {
        let approved = validate_xslt_name("gema");
        assert!(matches!(approved, Ok(_)));

        let traversal = validate_xslt_name("../../etc/passwd");
        assert!(matches!(traversal, Err(_)));
    }
}