"""Tests for instruction-dataset building and Nepali-language scenario coverage."""

from worlddisasterlm.config import SUPPORTED_LANGUAGES
from worlddisasterlm.data.etl import DisasterETL
from worlddisasterlm.data.processors import build_instruction_dataset
from worlddisasterlm.data.scenario_builder import build_all_scenarios


def _nepali_samples() -> list:
    """Return the scenario samples whose ``language`` is "nepali" (case-insensitive)."""
    return [s for s in build_all_scenarios() if s.language.lower() == "nepali"]


def test_dataset_builder_generates_samples() -> None:
    """The collect -> deduplicate -> normalize pipeline must yield usable samples.

    Checks that at least one sample is produced and that the first sample
    carries a truthy ``instruction``.
    """
    etl = DisasterETL()
    records = etl.normalize(etl.deduplicate(etl.collect_records()))
    samples = build_instruction_dataset(records)
    assert len(samples) > 0
    assert samples[0].instruction


def test_nepali_in_supported_languages() -> None:
    """Nepali must be present in SUPPORTED_LANGUAGES."""
    assert "Nepali" in SUPPORTED_LANGUAGES


def test_nepali_scenario_samples_exist() -> None:
    """At least three Nepali-language training samples must be built from scenarios."""
    nepali_samples = _nepali_samples()
    assert len(nepali_samples) >= 3, (
        f"Expected >=3 Nepali samples, found {len(nepali_samples)}"
    )


def test_nepali_samples_use_devanagari() -> None:
    """Nepali scenario instructions must contain Devanagari Unicode characters."""
    nepali_samples = _nepali_samples()
    # Without this guard the loop below would pass vacuously on an empty list,
    # hiding a regression where no Nepali samples are generated at all.
    assert nepali_samples, "No Nepali samples found; cannot verify Devanagari usage"
    for sample in nepali_samples:
        # U+0900..U+097F is the Devanagari Unicode block.
        assert any("\u0900" <= ch <= "\u097F" for ch in sample.instruction), (
            f"Nepali sample missing Devanagari: {sample.instruction!r}"
        )