d4rkk3y
/

Feature Extraction
Vietnamese
embedding
legal
law
vietnamese
telen / src /data.py
d4rkk3y's picture
Duplicate from haidang2405/telen
e40b98b
Raw
History Blame Contribute Delete
1.95 kB
"""Shared data utilities used by TELEN modules."""
import unicodedata
import pandas as pd
def load_raw_data(parquet_path: str) -> pd.DataFrame:
    """Read the parquet file at ``parquet_path`` and return it as a DataFrame."""
    raw_frame = pd.read_parquet(parquet_path)
    return raw_frame
def _parse_law_id_and_article(id_str):
    """Split an id of the form ``<law_id>#<article>[-...]`` into ``(law_id, article_num)``.

    Ids without ``#`` are returned whole with article number 0. An article
    token that is not an integer also yields 0 instead of raising.
    """
    if "#" not in id_str:
        return id_str, 0
    parts = id_str.split("#")
    # Only the text before the first "-" of the article part is the number.
    article_token = parts[1].split("-")[0]
    try:
        article_num = int(article_token)
    except ValueError:
        # Malformed/empty article token: fall back to the "no article" value
        # rather than aborting the whole extraction on one bad row.
        article_num = 0
    return parts[0], article_num


def _law_type_from_law_id(law_id):
    """Return the text after the last ``-`` of the third ``/``-separated field.

    Returns the whole third field when it has no ``-``, and ``"unknown"``
    when the law id has fewer than three fields.
    """
    fields = law_id.split("/")
    if len(fields) < 3:
        return "unknown"
    # split("-")[-1] already yields the whole field when no "-" is present.
    return fields[2].split("-")[-1]


def _year_from_law_id(law_id):
    """Return the second ``/``-separated field as a year.

    Values below 100 are offset by 1900. Returns 1999 when the field is
    missing or is not an integer.
    """
    fields = law_id.split("/")
    if len(fields) >= 2:
        try:
            year = int(fields[1])
        except ValueError:
            return 1999
        return year if year >= 100 else year + 1900
    return 1999


def extract_metadata(df: pd.DataFrame) -> pd.DataFrame:
    """Extract law_id, article_num, law_type, year from id column.

    Args:
        df: Frame with a string ``id`` column. It is not modified.

    Returns:
        A copy of ``df`` with four added columns: ``law_id``,
        ``article_num``, ``law_type`` and ``year``.
    """
    df = df.copy()

    # Parse each id once, then fan the pairs out into two columns.
    pairs = [_parse_law_id_and_article(value) for value in df["id"]]
    df["law_id"] = [law_id for law_id, _ in pairs]
    df["article_num"] = [article_num for _, article_num in pairs]

    df["law_type"] = df["law_id"].apply(_law_type_from_law_id)
    df["year"] = df["law_id"].apply(_year_from_law_id)
    return df
def clean_data(df: pd.DataFrame, min_text_len: int = 10) -> pd.DataFrame:
    """Remove short/empty texts and duplicates.

    ``title`` and ``text`` are converted to ``str`` and NFC-normalized. The
    length threshold is applied to the normalized text, so every returned
    text has at least ``min_text_len`` characters. Duplicate texts are
    dropped, keeping the first occurrence.

    Args:
        df: Frame with ``title`` and ``text`` columns. It is not modified.
        min_text_len: Minimum length of the normalized text for a row to be kept.

    Returns:
        A new frame with a fresh 0..n-1 index.
    """
    def to_nfc(value):
        return unicodedata.normalize("NFC", str(value))

    # Drop rows whose text has no length (e.g. None/NaN) before str() is
    # applied, so they never become the literal string "nan"/"None".
    # Boolean indexing returns a new frame, so the input is left untouched.
    has_length = df["text"].str.len().notna()
    df = df[has_length].reset_index(drop=True)

    df["title"] = df["title"].apply(to_nfc)
    df["text"] = df["text"].apply(to_nfc)

    # Measure length after normalization: decomposed (NFD) input counts each
    # combining mark as a character and would otherwise slip past the filter.
    long_enough = df["text"].str.len() >= min_text_len
    df = df[long_enough]

    df = df.drop_duplicates(subset=["text"], keep="first").reset_index(drop=True)
    return df