mike dupont
init: retro-sync API server + viewer + 71 Bach tiles + catalog
1295969
#![allow(dead_code)] // Script detection module: full language validation API exposed
//! Hyperglot — Unicode script and language detection for multilingual metadata.
//!
//! Implements ISO 15924 script code detection using pure-Rust Unicode ranges.
//! Hyperglot (https://hyperglot.rosettatype.com) identifies languages from
//! writing systems; this module provides the same service without spawning
//! an external Python process.
//!
//! LangSec:
//! All inputs are length-bounded (max 4096 codepoints) before scanning.
//! Script detection is done via Unicode block ranges — no regex, no exec().
//!
//! Usage:
//! let result = detect_scripts("Hello мир 日本語");
//! // → result.scripts: Latin ≈38%, Cyrillic ≈23%, CJK ≈23% (the two spaces make up the rest)
use serde::{Deserialize, Serialize};
use tracing::instrument;
/// ISO 15924 script identifier.
///
/// One variant per writing system that `codepoint_to_script` can report, plus
/// the two pseudo-scripts `Common` and `Unknown`. `Hash`/`Eq` are derived
/// because `detect_scripts` uses the variants as `HashMap` keys; the serde
/// derives exist because `ScriptCoverage` embeds a `Script`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Script {
// Alphabets and abjads
Latin,
Cyrillic,
Arabic,
Hebrew,
// Indic scripts
Devanagari,
Bengali,
Gurmukhi,
Gujarati,
Tamil,
Telugu,
Kannada,
Malayalam,
Sinhala,
// Thai, Lao, Tibetan, Myanmar, Khmer
Thai,
Lao,
Tibetan,
Myanmar,
Khmer,
// East Asian scripts
CjkUnified, // Han ideographs (reported as ISO 15924 `Hani`)
Hiragana,
Katakana,
Hangul,
// Further alphabets and syllabaries
Greek,
Georgian,
Armenian,
Ethiopic,
Cherokee,
Canadian, // Unified Canadian Aboriginal Syllabics (`Cans`)
Runic,
Ogham,
// Pseudo-scripts: neither is ever chosen as the primary script
Common, // Digits, punctuation — script-neutral (`Zyyy`)
Unknown, // Code point matched no range in `codepoint_to_script` (`Zzzz`)
}
impl Script {
    /// Paired labels for this script: `(ISO 15924 code, English name)`.
    ///
    /// Both strings live in one exhaustive table, so a newly added variant
    /// cannot receive a code without also receiving a display name.
    fn labels(&self) -> (&'static str, &'static str) {
        match self {
            Self::Latin => ("Latn", "Latin"),
            Self::Cyrillic => ("Cyrl", "Cyrillic"),
            Self::Arabic => ("Arab", "Arabic"),
            Self::Hebrew => ("Hebr", "Hebrew"),
            Self::Devanagari => ("Deva", "Devanagari"),
            Self::Bengali => ("Beng", "Bengali"),
            Self::Gurmukhi => ("Guru", "Gurmukhi"),
            Self::Gujarati => ("Gujr", "Gujarati"),
            Self::Tamil => ("Taml", "Tamil"),
            Self::Telugu => ("Telu", "Telugu"),
            Self::Kannada => ("Knda", "Kannada"),
            Self::Malayalam => ("Mlym", "Malayalam"),
            Self::Sinhala => ("Sinh", "Sinhala"),
            Self::Thai => ("Thai", "Thai"),
            Self::Lao => ("Laoo", "Lao"),
            Self::Tibetan => ("Tibt", "Tibetan"),
            Self::Myanmar => ("Mymr", "Myanmar"),
            Self::Khmer => ("Khmr", "Khmer"),
            Self::CjkUnified => ("Hani", "CJK Unified Ideographs"),
            Self::Hiragana => ("Hira", "Hiragana"),
            Self::Katakana => ("Kana", "Katakana"),
            Self::Hangul => ("Hang", "Hangul"),
            Self::Greek => ("Grek", "Greek"),
            Self::Georgian => ("Geor", "Georgian"),
            Self::Armenian => ("Armn", "Armenian"),
            Self::Ethiopic => ("Ethi", "Ethiopic"),
            Self::Cherokee => ("Cher", "Cherokee"),
            Self::Canadian => ("Cans", "Canadian Aboriginal Syllabics"),
            Self::Runic => ("Runr", "Runic"),
            Self::Ogham => ("Ogam", "Ogham"),
            Self::Common => ("Zyyy", "Common (Neutral)"),
            Self::Unknown => ("Zzzz", "Unknown"),
        }
    }

    /// Four-letter ISO 15924 code of this script (e.g. `"Latn"`).
    pub fn iso_code(&self) -> &'static str {
        self.labels().0
    }

    /// English name of this script, suitable for logs and metadata.
    pub fn display_name(&self) -> &'static str {
        self.labels().1
    }

    /// Whether this script is written right-to-left.
    ///
    /// Only Arabic and Hebrew are treated as right-to-left.
    pub fn is_rtl(&self) -> bool {
        [Self::Arabic, Self::Hebrew].contains(self)
    }
}
/// Map a Unicode code point to a [`Script`] using coarse block ranges.
///
/// The ranges follow Unicode block boundaries, so the result only
/// approximates the Unicode `Script` property (UAX #24): a few script-neutral
/// characters that live inside a script's block are still attributed to that
/// script. Arms are tried top to bottom, so where two ranges overlap the
/// earlier arm wins.
///
/// Returns [`Script::Unknown`] for any code point no range covers.
fn codepoint_to_script(c: char) -> Script {
    match c as u32 {
        // Script-neutral code points that sit inside blocks claimed wholesale
        // further down. This arm must stay first or the block ranges win:
        //   U+00D7 ×, U+00F7 ÷  (inside the Latin-1 letter range)
        //   U+FEFF byte-order mark (inside Arabic Presentation Forms-B)
        0x00D7 | 0x00F7 | 0xFEFF => Script::Common,
        // Basic Latin (A-Z, a-z only) + Latin-1 letters + Latin Extended-A/B,
        // IPA Extensions, Phonetic Extensions, Latin Extended Additional
        // (e.g. Vietnamese), Latin ligatures.
        // NOTE: 0x005B..=0x0060 (`[`, `\`, `]`, `^`, `_`, `` ` ``) are Common, not Latin.
        0x0041..=0x005A
        | 0x0061..=0x007A
        | 0x00C0..=0x024F
        | 0x0250..=0x02AF
        | 0x1D00..=0x1D7F
        | 0x1E00..=0x1EFF
        | 0xFB00..=0xFB06 => Script::Latin,
        // Cyrillic
        0x0400..=0x04FF | 0x0500..=0x052F | 0x2DE0..=0x2DFF | 0xA640..=0xA69F => Script::Cyrillic,
        // Greek
        0x0370..=0x03FF | 0x1F00..=0x1FFF => Script::Greek,
        // Arabic
        0x0600..=0x06FF
        | 0x0750..=0x077F
        | 0xFB50..=0xFDFF
        | 0xFE70..=0xFEFF
        | 0x10E60..=0x10E7F => Script::Arabic,
        // Hebrew
        0x0590..=0x05FF | 0xFB1D..=0xFB4F => Script::Hebrew,
        // Devanagari (Hindi, Sanskrit, Marathi, Nepali…)
        0x0900..=0x097F | 0xA8E0..=0xA8FF => Script::Devanagari,
        // Bengali
        0x0980..=0x09FF => Script::Bengali,
        // Gurmukhi (Punjabi)
        0x0A00..=0x0A7F => Script::Gurmukhi,
        // Gujarati
        0x0A80..=0x0AFF => Script::Gujarati,
        // Tamil
        0x0B80..=0x0BFF => Script::Tamil,
        // Telugu
        0x0C00..=0x0C7F => Script::Telugu,
        // Kannada
        0x0C80..=0x0CFF => Script::Kannada,
        // Malayalam
        0x0D00..=0x0D7F => Script::Malayalam,
        // Sinhala
        0x0D80..=0x0DFF => Script::Sinhala,
        // Thai
        0x0E00..=0x0E7F => Script::Thai,
        // Lao
        0x0E80..=0x0EFF => Script::Lao,
        // Tibetan
        0x0F00..=0x0FFF => Script::Tibetan,
        // Myanmar
        0x1000..=0x109F | 0xA9E0..=0xA9FF | 0xAA60..=0xAA7F => Script::Myanmar,
        // Khmer
        0x1780..=0x17FF | 0x19E0..=0x19FF => Script::Khmer,
        // Georgian
        0x10A0..=0x10FF | 0x2D00..=0x2D2F => Script::Georgian,
        // Armenian
        0x0530..=0x058F | 0xFB13..=0xFB17 => Script::Armenian,
        // Ethiopic
        0x1200..=0x137F | 0x1380..=0x139F | 0x2D80..=0x2DDF | 0xAB01..=0xAB2F => Script::Ethiopic,
        // Hangul (Korean). 0x302E..=0x302F lies inside the CJK punctuation
        // block handled by the Common arm, so this arm has to precede it.
        0x1100..=0x11FF | 0x302E..=0x302F | 0x3131..=0x318F | 0xA960..=0xA97F | 0xAC00..=0xD7FF => {
            Script::Hangul
        }
        // Hiragana
        0x3041..=0x309F | 0x1B001..=0x1B0FF => Script::Hiragana,
        // Katakana
        0x30A0..=0x30FF | 0x31F0..=0x31FF | 0xFF66..=0xFF9F => Script::Katakana,
        // CJK Unified Ideographs (Han)
        0x4E00..=0x9FFF
        | 0x3400..=0x4DBF
        | 0x20000..=0x2A6DF
        | 0x2A700..=0x2CEAF
        | 0xF900..=0xFAFF => Script::CjkUnified,
        // Cherokee
        0x13A0..=0x13FF | 0xAB70..=0xABBF => Script::Cherokee,
        // Unified Canadian Aboriginal Syllabics
        0x1400..=0x167F | 0x18B0..=0x18FF => Script::Canadian,
        // Runic
        0x16A0..=0x16FF => Script::Runic,
        // Ogham
        0x1680..=0x169F => Script::Ogham,
        // Common: control characters, whitespace, digits, punctuation.
        // The first range starts at U+0000 (not U+0021) so that SPACE, TAB and
        // line breaks are reported as Common rather than Unknown.
        0x0000..=0x0040
        | 0x005B..=0x0060
        | 0x007B..=0x00BF
        | 0x2000..=0x206F
        | 0x2100..=0x214F
        | 0x3000..=0x303F
        | 0xFF01..=0xFF0F => Script::Common,
        _ => Script::Unknown,
    }
}
/// Script coverage result: how much of the analysed text one script accounts for.
///
/// `detect_scripts` emits one entry per script that occurs at least once.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScriptCoverage {
/// The detected script.
pub script: Script,
/// ISO 15924 code of `script`, as returned by `Script::iso_code`.
pub iso_code: String,
/// English name of `script`, as returned by `Script::display_name`.
pub display_name: String,
/// Number of analysed code points attributed to this script.
pub codepoint_count: usize,
/// `codepoint_count` as a percentage (0–100) of all analysed code points.
pub coverage_pct: f32,
/// Whether `script` is written right-to-left (see `Script::is_rtl`).
pub is_rtl: bool,
}
/// Result of hyperglot analysis, as produced by `detect_scripts`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HyperglotResult {
/// All scripts found, sorted by coverage descending.
/// Includes the `Common` and `Unknown` pseudo-scripts when they occur.
pub scripts: Vec<ScriptCoverage>,
/// ISO 15924 code of the primary script (highest coverage, excluding
/// Common/Unknown). `None` when no other script occurs in the text.
pub primary_script: Option<String>,
/// True if any RTL script detected.
pub has_rtl: bool,
/// True if at least two scripts other than Common/Unknown each reach
/// 5% coverage (multilingual text).
pub is_multilingual: bool,
/// Total analysed codepoints (after the input length cap is applied).
pub total_codepoints: usize,
}
/// Maximum input length in codepoints (LangSec safety bound).
///
/// `detect_scripts` analyses only the first `MAX_INPUT_CODEPOINTS` characters
/// of its input; anything beyond that is ignored rather than rejected.
const MAX_INPUT_CODEPOINTS: usize = 4096;
/// Detect Unicode scripts in `text`.
///
/// Only the first [`MAX_INPUT_CODEPOINTS`] characters are analysed.
///
/// Returns one [`ScriptCoverage`] per script that occurs, sorted by code point
/// count descending. Scripts with equal counts are ordered by where they first
/// appear in the text, so the result (and therefore `primary_script`) is
/// deterministic for a given input.
///
/// Common (punctuation/digits) and Unknown code points are counted and listed,
/// but are never selected as the primary script and do not count towards
/// `is_multilingual`. Empty input yields an empty result with
/// `primary_script == None`.
#[instrument(skip(text))]
pub fn detect_scripts(text: &str) -> HyperglotResult {
    use std::collections::HashMap;

    // script -> (position of first occurrence, number of occurrences).
    // LangSec: `take` caps the amount of input before any work is done.
    let mut tallies: HashMap<Script, (usize, usize)> = HashMap::new();
    let mut total = 0usize;
    for (position, c) in text.chars().take(MAX_INPUT_CODEPOINTS).enumerate() {
        tallies
            .entry(codepoint_to_script(c))
            .or_insert((position, 0))
            .1 += 1;
        total += 1;
    }

    if total == 0 {
        return HyperglotResult {
            scripts: Vec::new(),
            primary_script: None,
            has_rtl: false,
            is_multilingual: false,
            total_codepoints: 0,
        };
    }

    let mut ranked: Vec<(Script, usize, usize)> = tallies
        .into_iter()
        .map(|(script, (first_seen, count))| (script, first_seen, count))
        .collect();
    // Highest count first. HashMap iteration order is arbitrary, so ties are
    // broken by first occurrence; first-occurrence positions are unique per
    // script, which makes the ordering total and the unstable sort safe.
    ranked.sort_unstable_by(|(_, a_first, a_count), (_, b_first, b_count)| {
        b_count.cmp(a_count).then(a_first.cmp(b_first))
    });

    let scripts: Vec<ScriptCoverage> = ranked
        .into_iter()
        .map(|(script, _first_seen, count)| ScriptCoverage {
            iso_code: script.iso_code().to_string(),
            display_name: script.display_name().to_string(),
            is_rtl: script.is_rtl(),
            codepoint_count: count,
            coverage_pct: (count as f32 / total as f32) * 100.0,
            // Moved last: the fields above still need to borrow `script`.
            script,
        })
        .collect();

    // Common and Unknown carry no language signal.
    let is_real_script =
        |entry: &ScriptCoverage| !matches!(entry.script, Script::Common | Script::Unknown);

    let has_rtl = scripts.iter().any(|entry| entry.is_rtl);

    // Primary = highest-coverage script excluding Common/Unknown.
    let primary_script = scripts
        .iter()
        .find(|&entry| is_real_script(entry))
        .map(|entry| entry.iso_code.clone());

    // Multilingual = 2+ non-common/unknown scripts with ≥5% coverage each.
    let is_multilingual = scripts
        .iter()
        .filter(|&entry| is_real_script(entry) && entry.coverage_pct >= 5.0)
        .count()
        >= 2;

    HyperglotResult {
        scripts,
        primary_script,
        has_rtl,
        is_multilingual,
        total_codepoints: total,
    }
}
/// Validate that a track title's script matches the declared language.
///
/// Returns `true` if the title is plausibly in the declared BCP-47 language:
///
/// * the language is not in the table below (Latin-script or unknown
///   languages are always accepted), or
/// * the title contains no script other than Common/Unknown (empty,
///   digits-only, punctuation-only), or
/// * the title's primary script is one of the scripts expected for the
///   language.
///
/// Only the primary language subtag is inspected, and it is compared
/// case-insensitively (`"JA"`, `"ja-JP"` and `"Ja-jp"` are all Japanese).
/// Script and region subtags are ignored, so e.g. `"sr-Latn"` is still
/// checked against Cyrillic.
pub fn validate_title_language(title: &str, bcp47_lang: &str) -> bool {
    // BCP-47 tags are case-insensitive; normalise before the table lookup so
    // that upper- or mixed-case tags are validated instead of waved through.
    let language = bcp47_lang
        .split('-')
        .next()
        .unwrap_or("")
        .to_ascii_lowercase();

    // Map BCP-47 language prefixes to expected ISO 15924 script codes.
    // This is a best-effort check, not an RFC 5646 full lookup.
    let expected_scripts: &[&str] = match language.as_str() {
        "ja" => &["Hira", "Kana", "Hani"],
        "zh" => &["Hani"],
        "ko" => &["Hang"],
        "ar" => &["Arab"],
        "he" => &["Hebr"],
        "hi" | "mr" | "ne" | "sa" => &["Deva"],
        "ru" | "uk" | "bg" | "sr" | "mk" | "be" => &["Cyrl"],
        "ka" => &["Geor"],
        "hy" => &["Armn"],
        "th" => &["Thai"],
        "lo" => &["Laoo"],
        "my" => &["Mymr"],
        "km" => &["Khmr"],
        "am" | "ti" => &["Ethi"],
        // Latin or unknown → accept without scanning the title.
        _ => return true,
    };

    let detected = detect_scripts(title);
    match detected.primary_script.as_deref() {
        Some(primary) => expected_scripts.contains(&primary),
        // empty / all-common → pass
        None => true,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// ISO code of the primary script detected in `text`, if any.
    fn primary_of(text: &str) -> Option<String> {
        detect_scripts(text).primary_script
    }

    /// Plain ASCII words are reported as Latin.
    #[test]
    fn test_latin_detection() {
        assert_eq!(primary_of("Hello World"), Some(String::from("Latn")));
    }

    /// Russian text is reported as Cyrillic.
    #[test]
    fn test_cyrillic_detection() {
        assert_eq!(primary_of("Привет мир"), Some(String::from("Cyrl")));
    }

    /// Arabic text is reported as Arabic and flags right-to-left content.
    #[test]
    fn test_arabic_detection() {
        let analysis = detect_scripts("مرحبا بالعالم");
        assert_eq!(analysis.primary_script, Some(String::from("Arab")));
        assert!(analysis.has_rtl);
    }

    /// Three scripts in comparable proportions count as multilingual.
    #[test]
    fn test_multilingual() {
        assert!(detect_scripts("Hello Привет مرحبا").is_multilingual);
    }

    /// Japanese text yields at least one of Han, Hiragana or Katakana.
    #[test]
    fn test_cjk_detection() {
        let analysis = detect_scripts("日本語テスト");
        let found_japanese_script = analysis
            .scripts
            .iter()
            .any(|entry| matches!(entry.iso_code.as_str(), "Hani" | "Hira" | "Kana"));
        assert!(found_japanese_script);
    }

    /// Input longer than the cap is truncated, not analysed in full.
    #[test]
    fn test_length_cap() {
        let oversized = "a".repeat(10000);
        assert!(detect_scripts(&oversized).total_codepoints <= 4096);
    }
}