| """Tests for data utilities.""" |
| import sys |
| from pathlib import Path |
| sys.path.insert(0, str(Path(__file__).parent.parent)) |
|
|
| import pytest |
| import pandas as pd |
| import numpy as np |
| from src.data import extract_metadata, clean_data |
|
|
|
|
class TestExtractMetadata:
    """Tests for ``extract_metadata``.

    The expectations below are derived purely from the ids used in each test:
    the text before ``#`` is expected as ``law_id``, and ``article_num``,
    ``law_type`` and ``year`` are expected as extra columns on the result.
    """

    @pytest.fixture
    def sample_df(self):
        """Return a four-row frame with ids shaped like ``<law id>#<n>-Điều <n>``."""
        return pd.DataFrame({
            "id": [
                "12/2015/TT-BTC#1-Điều 1",
                "12/2015/TT-BTC#2-Điều 2",
                "45/2017/NĐ-CP#1-Điều 1",
                "01/2020/Luật-DN#1-Điều 1",
            ],
            "title": ["Điều 1", "Điều 2", "Điều 1", "Điều 1"],
            "text": [
                "Nội dung A " * 10,
                "Nội dung B " * 10,
                "Nội dung C " * 10,
                "Nội dung D " * 10,
            ],
        })

    def test_extracts_law_id(self, sample_df):
        """``law_id`` is the part of the id that precedes ``#``."""
        result = extract_metadata(sample_df)
        assert result["law_id"].tolist() == [
            "12/2015/TT-BTC",
            "12/2015/TT-BTC",
            "45/2017/NĐ-CP",
            "01/2020/Luật-DN",
        ]

    def test_extracts_article_num(self, sample_df):
        """``article_num`` holds the integer article number of each row."""
        result = extract_metadata(sample_df)
        assert result["article_num"].tolist() == [1, 2, 1, 1]

    def test_extracts_law_type(self, sample_df):
        """``law_type`` matches the last ``-``-separated token of each law id."""
        result = extract_metadata(sample_df)
        assert result["law_type"].tolist() == ["BTC", "BTC", "CP", "DN"]

    def test_extracts_year(self, sample_df):
        """``year`` is the integer found in the second ``/`` segment of the id."""
        result = extract_metadata(sample_df)
        assert result["year"].tolist() == [2015, 2015, 2017, 2020]

    def test_no_hash_in_id(self):
        """An id without ``#`` is kept whole as ``law_id`` with article number 0."""
        df = pd.DataFrame({
            "id": ["some_simple_id"],
            "title": ["Test"],
            "text": ["Test content " * 5],
        })
        result = extract_metadata(df)
        assert result["law_id"].iloc[0] == "some_simple_id"
        assert result["article_num"].iloc[0] == 0

    def test_preserves_original_columns(self, sample_df):
        """Every input column must still be present on the result."""
        # Snapshot the names before the call: if extract_metadata mutated the
        # frame in place, reading sample_df.columns afterwards would no longer
        # describe the input and a dropped column could go unnoticed.
        expected_columns = list(sample_df.columns)
        result = extract_metadata(sample_df)
        missing = [col for col in expected_columns if col not in result.columns]
        assert not missing, f"columns dropped by extract_metadata: {missing}"

    def test_year_fallback(self):
        """A non-numeric year segment is expected to yield the value 1999."""
        df = pd.DataFrame({
            "id": ["law/invalid_year/type"],
            "title": ["Test"],
            "text": ["Test " * 5],
        })
        result = extract_metadata(df)
        assert result["year"].iloc[0] == 1999
|
|
|
|
class TestCleanData:
    """Tests for ``clean_data`` called with ``min_text_len=10``."""

    @pytest.fixture
    def sample_df(self):
        """Return five rows: "b" has a 4-character text, "c" and "d" share one text."""
        return pd.DataFrame({
            "id": ["a", "b", "c", "d", "e"],
            "title": ["T1", "T2", "T3", "T4", "T5"],
            "text": [
                "Nội dung đủ dài " * 5,
                "Ngắn",
                "Nội dung C " * 5,
                "Nội dung C " * 5,
                "Nội dung E " * 5,
            ],
        })

    def test_removes_short_texts(self, sample_df):
        """The row whose text is shorter than ``min_text_len`` is dropped."""
        result = clean_data(sample_df, min_text_len=10)
        assert "b" not in result["id"].values

    def test_removes_duplicates(self, sample_df):
        """Rows with identical text are collapsed to a single row."""
        result = clean_data(sample_df, min_text_len=10)
        # is_unique states the intent directly and, unlike
        # value_counts().iloc[0], does not raise IndexError on an empty frame.
        assert result["text"].is_unique
        # "c" and "d" carry the same text: exactly one of them must survive.
        # This also stops an empty result from passing the check above.
        assert result["id"].isin(["c", "d"]).sum() == 1

    def test_normalizes_unicode(self):
        """The title comes back equal to the expected literal."""
        df = pd.DataFrame({
            "id": ["x"],
            "title": ["Tiêu đề"],
            "text": ["Nội dung thử nghiệm " * 5],
        })
        result = clean_data(df, min_text_len=10)
        # NOTE(review): the input and expected titles appear to be the same
        # literal, so this would also pass if no normalization happened at
        # all. Consider feeding a decomposed (NFD) title once the form that
        # clean_data normalizes to has been confirmed.
        assert result["title"].iloc[0] == "Tiêu đề"

    def test_preserves_count(self, sample_df):
        """Three rows remain: five inputs minus one short text and one duplicate."""
        result = clean_data(sample_df, min_text_len=10)
        assert len(result) == 3
|
|