| """Shared data utilities used by TELEN modules.""" |
| import unicodedata |
| import pandas as pd |
|
|
|
|
def load_raw_data(parquet_path: str) -> pd.DataFrame:
    """Read the raw dataset from a parquet file.

    Args:
        parquet_path: Location of the parquet file, passed unchanged to
            ``pandas.read_parquet``.

    Returns:
        The file contents as a DataFrame, exactly as pandas loads them.
    """
    raw_frame = pd.read_parquet(parquet_path)
    return raw_frame
|
|
|
|
def extract_metadata(df: pd.DataFrame) -> pd.DataFrame:
    """Derive ``law_id``, ``article_num``, ``law_type`` and ``year`` from ``id``.

    The input frame is left untouched; a copy carrying the four extra columns
    is returned. Each id is read as ``<law_id>#<article>[-<suffix>]`` and the
    law id itself as a ``/``-separated path:

    * ``law_id``: the text before the first ``#`` (the whole id when it
      contains no ``#``).
    * ``article_num``: the integer before the first ``-`` of the article part;
      ``0`` when the id has no ``#`` or the article part is not an integer.
    * ``law_type``: the third ``/`` segment of ``law_id``, reduced to the text
      after its last ``-``; ``"unknown"`` when there are fewer than three
      segments.
    * ``year``: the second ``/`` segment parsed as an integer, with values
      below 100 shifted by 1900; ``1999`` when the segment is missing or not
      an integer.

    Args:
        df: Frame with an ``id`` column of strings.

    Returns:
        A copy of ``df`` with the ``law_id``, ``article_num``, ``law_type``
        and ``year`` columns added.

    Raises:
        KeyError: If ``df`` has no ``id`` column.
        TypeError: If an id is not a string (for example a missing value).
    """
    df = df.copy()

    def parse_id(id_str):
        """Split one id into ``(law_id, article_num)``."""
        if "#" not in id_str:
            return id_str, 0
        # Only the first two '#'-separated pieces are meaningful.
        law_id, article_part, *_ = id_str.split("#")
        try:
            article_num = int(article_part.split("-")[0])
        except ValueError:
            # A malformed article part must not abort the whole frame; use the
            # same default as ids without '#', mirroring extract_year below.
            article_num = 0
        return law_id, article_num

    parsed = df["id"].apply(parse_id)
    df["law_id"] = parsed.apply(lambda pair: pair[0])
    df["article_num"] = parsed.apply(lambda pair: pair[1])

    def extract_law_type(law_id):
        """Return the type code held in the third path segment of ``law_id``."""
        parts = law_id.split("/")
        if len(parts) < 3:
            return "unknown"
        # split()[-1] yields the whole segment when it contains no '-'.
        return parts[2].split("-")[-1]

    df["law_type"] = df["law_id"].apply(extract_law_type)

    def extract_year(law_id):
        """Return the year held in the second path segment of ``law_id``."""
        parts = law_id.split("/")
        if len(parts) < 2:
            return 1999
        try:
            year = int(parts[1])
        except ValueError:
            return 1999
        # Values below 100 are treated as two-digit years and shifted by 1900.
        # NOTE(review): this maps e.g. "05" to 1905 - confirm no post-2000
        # two-digit years occur in the data.
        return year if year >= 100 else year + 1900

    df["year"] = df["law_id"].apply(extract_year)
    return df
|
|
|
|
def clean_data(df: pd.DataFrame, min_text_len: int = 10) -> pd.DataFrame:
    """Drop short or missing texts, NFC-normalize text columns, and deduplicate.

    Steps, applied to a copy of ``df``:

    1. Rows whose raw ``text`` is missing or shorter than ``min_text_len``
       are removed.
    2. ``title`` and ``text`` are converted with ``str()`` and normalized to
       Unicode NFC.
    3. Rows whose *normalized* ``text`` is shorter than ``min_text_len`` are
       removed. NFC composes base characters with their combining marks, so a
       decomposed text can shrink below the threshold during step 2.
    4. Rows with duplicate ``text`` are removed, keeping the first occurrence,
       and the index is reset.

    Args:
        df: Frame with ``title`` and ``text`` columns.
        min_text_len: Minimum number of characters a text must have to be kept.

    Returns:
        A new, re-indexed frame; ``df`` itself is not modified.
    """
    def to_nfc(value):
        """Return ``value`` as an NFC-normalized string."""
        return unicodedata.normalize("NFC", str(value))

    df = df.copy()

    # Filter on the raw values first so that missing texts are dropped before
    # str() could turn them into the literal strings "nan" / "None".
    df = df[df["text"].str.len() >= min_text_len].reset_index(drop=True)

    # NOTE(review): str() turns a missing title into the literal "None"/"nan";
    # confirm that this is intended.
    df["title"] = df["title"].apply(to_nfc)
    df["text"] = df["text"].apply(to_nfc)

    # Re-apply the threshold to the normalized text, which may be shorter than
    # the raw text measured above.
    df = df[df["text"].str.len() >= min_text_len]

    # Deduplicate after normalization so that canonically equivalent texts
    # collapse into one row.
    df = df.drop_duplicates(subset=["text"], keep="first").reset_index(drop=True)
    return df
|
|