"""Tests for data utilities.""" import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) import pytest import pandas as pd import numpy as np from src.data import extract_metadata, clean_data class TestExtractMetadata: @pytest.fixture def sample_df(self): return pd.DataFrame({ "id": [ "12/2015/TT-BTC#1-Điều 1", "12/2015/TT-BTC#2-Điều 2", "45/2017/NĐ-CP#1-Điều 1", "01/2020/Luật-DN#1-Điều 1", ], "title": ["Điều 1", "Điều 2", "Điều 1", "Điều 1"], "text": ["Nội dung A " * 10, "Nội dung B " * 10, "Nội dung C " * 10, "Nội dung D " * 10], }) def test_extracts_law_id(self, sample_df): result = extract_metadata(sample_df) assert result["law_id"].tolist() == [ "12/2015/TT-BTC", "12/2015/TT-BTC", "45/2017/NĐ-CP", "01/2020/Luật-DN" ] def test_extracts_article_num(self, sample_df): result = extract_metadata(sample_df) assert result["article_num"].tolist() == [1, 2, 1, 1] def test_extracts_law_type(self, sample_df): result = extract_metadata(sample_df) # law_id "12/2015/TT-BTC" → split by "/" → ["12","2015","TT-BTC"] # parts[2] = "TT-BTC", split by "-" → ["TT","BTC"], last = "BTC" assert result["law_type"].tolist() == ["BTC", "BTC", "CP", "DN"] def test_extracts_year(self, sample_df): result = extract_metadata(sample_df) assert result["year"].tolist() == [2015, 2015, 2017, 2020] def test_no_hash_in_id(self): df = pd.DataFrame({ "id": ["some_simple_id"], "title": ["Test"], "text": ["Test content " * 5], }) result = extract_metadata(df) assert result["law_id"].iloc[0] == "some_simple_id" assert result["article_num"].iloc[0] == 0 def test_preserves_original_columns(self, sample_df): result = extract_metadata(sample_df) for col in sample_df.columns: assert col in result.columns def test_year_fallback(self): df = pd.DataFrame({ "id": ["law/invalid_year/type"], "title": ["Test"], "text": ["Test " * 5], }) result = extract_metadata(df) assert result["year"].iloc[0] == 1999 # fallback class TestCleanData: @pytest.fixture def sample_df(self): return pd.DataFrame({ "id": ["a", "b", "c", "d", "e"], "title": ["T1", "T2", "T3", "T4", "T5"], "text": [ "Nội dung đủ dài " * 5, "Ngắn", # too short "Nội dung C " * 5, "Nội dung C " * 5, # duplicate of above "Nội dung E " * 5, ], }) def test_removes_short_texts(self, sample_df): result = clean_data(sample_df, min_text_len=10) assert "b" not in result["id"].values def test_removes_duplicates(self, sample_df): result = clean_data(sample_df, min_text_len=10) # "Nội dung C " * 5 appears twice, one removed assert result["text"].value_counts().iloc[0] == 1 def test_normalizes_unicode(self): df = pd.DataFrame({ "id": ["x"], "title": ["Tiêu đề"], # decomposed "text": ["Nội dung thử nghiệm " * 5], }) result = clean_data(df, min_text_len=10) # NFC normalization assert result["title"].iloc[0] == "Tiêu đề" def test_preserves_count(self, sample_df): result = clean_data(sample_df, min_text_len=10) # 5 rows - 1 short - 1 duplicate = 3 assert len(result) == 3