WorldDisasterLM-8B / tests /test_dataset_builder.py
drdeveloper88's picture
Upload WorldDisasterLM-8B source code: FastAPI backend, training pipeline, 11-language support
495526b
Raw
History Blame Contribute Delete
1.46 kB
from worlddisasterlm.config import SUPPORTED_LANGUAGES
from worlddisasterlm.data.etl import DisasterETL
from worlddisasterlm.data.processors import build_instruction_dataset
from worlddisasterlm.data.scenario_builder import build_all_scenarios
def test_dataset_builder_generates_samples() -> None:
    """Run the ETL steps end to end and check the instruction dataset is usable.

    Records are collected, deduplicated, then normalized (in that order) before
    being handed to ``build_instruction_dataset``. The result must contain at
    least one sample, and the first sample must have a truthy ``instruction``.
    """
    pipeline = DisasterETL()
    collected = pipeline.collect_records()
    unique_records = pipeline.deduplicate(collected)
    normalized_records = pipeline.normalize(unique_records)

    dataset = build_instruction_dataset(normalized_records)

    assert len(dataset) > 0
    first_sample = dataset[0]
    assert first_sample.instruction
def test_nepali_in_supported_languages() -> None:
    """Check that the entry "Nepali" is a member of SUPPORTED_LANGUAGES."""
    required_language = "Nepali"
    assert required_language in SUPPORTED_LANGUAGES
def test_nepali_scenario_samples_exist() -> None:
    """Require that the scenario builder yields three or more Nepali samples.

    A sample counts as Nepali when its ``language`` attribute, lower-cased,
    equals ``"nepali"``.
    """
    nepali_count = sum(
        1
        for scenario_sample in build_all_scenarios()
        if scenario_sample.language.lower() == "nepali"
    )
    assert nepali_count >= 3, (
        f"Expected >=3 Nepali samples, found {nepali_count}"
    )
def test_nepali_samples_use_devanagari() -> None:
    """Nepali scenario instructions must contain Devanagari Unicode characters.

    A sample is treated as Nepali when its ``language`` attribute, lower-cased,
    equals ``"nepali"``. Each such sample's ``instruction`` must contain at
    least one code point in the range U+0900..U+097F.
    """
    nepali_samples = [
        s for s in build_all_scenarios() if s.language.lower() == "nepali"
    ]

    # Guard against a vacuous pass: with zero Nepali samples the loop below
    # would never execute and the test would succeed without checking anything.
    assert nepali_samples, "No Nepali samples found; cannot verify Devanagari usage"

    for sample in nepali_samples:
        assert any("\u0900" <= ch <= "\u097F" for ch in sample.instruction), (
            f"Nepali sample missing Devanagari: {sample.instruction!r}"
        )