|
|
from deduplication import find_near_duplicates |
|
|
from featurizer import custom_featurizer |
|
|
from issues import find_issues |
|
|
from pipeline import make_step, run_pipeline |
|
|
import pandas as pd |
|
|
from tqdm.auto import tqdm |
|
|
|
|
|
bar = tqdm(total=100, leave=True) |
|
|
|
|
|
steps = [ |
|
|
make_step(find_near_duplicates, name="dedup")(progress=bar), |
|
|
make_step(custom_featurizer, name="featurize")( |
|
|
label=None, |
|
|
nan_strategy="impute", |
|
|
on_pipeline_error="drop", |
|
|
progress=bar |
|
|
), |
|
|
make_step(find_issues, name="find_label_issues")(label="HARDSHIP_INDEX", progress=bar) |
|
|
] |
|
|
|
|
|
df = pd.read_csv("./data/Lisette.csv") |
|
|
results = run_pipeline(steps, df=df) |
|
|
|
|
|
bar.close() |
|
|
print(results) |
|
|
|
|
|
|
|
|
|