|
|
import os |
|
|
import pandas as pd |
|
|
from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk |
|
|
from .config import Config |
|
|
|
|
|
class DataProcessor:
    """Load, clean, merge and tokenize Chinese sentiment datasets.

    Combines the Chinese portion of clapAI/MultiLingualSentiment with the
    OpenModels/Chinese-Herbal-Medicine-Sentiment domain dataset, unifies
    their labels to {0: negative, 1: neutral, 2: positive}, tokenizes the
    text, and returns a train/test split (cached on disk).
    """

    def __init__(self, tokenizer):
        # Hugging Face tokenizer; called in tokenize_function with
        # padding/truncation kwargs, so it must follow that callable API.
        self.tokenizer = tokenizer

    def load_clap_data(self):
        """Load the Chinese (zh) portion of clapAI/MultiLingualSentiment.

        Tries the 'zh'-specific config first; if that fails, falls back to
        loading the full dataset and filtering rows by the 'language' column.

        Returns:
            datasets.Dataset: the raw Chinese-language training split.
        """
        print("Loading clapAI/MultiLingualSentiment (zh)...")
        try:
            ds = load_dataset("clapAI/MultiLingualSentiment", "zh", split="train", trust_remote_code=True)
        except Exception:
            # NOTE(review): broad catch kept deliberately — any loading
            # failure (missing config, network hiccup) triggers the fallback.
            print("Warning: Could not load 'zh' specific config, attempting to load generic...")
            ds = load_dataset("clapAI/MultiLingualSentiment", split="train", trust_remote_code=True)
            ds = ds.filter(lambda x: x['language'] == 'zh')
        return ds

    def load_medical_data(self):
        """Load the OpenModels/Chinese-Herbal-Medicine-Sentiment domain dataset.

        Returns:
            datasets.Dataset: the raw training split.
        """
        print("Loading OpenModels/Chinese-Herbal-Medicine-Sentiment...")
        ds = load_dataset("OpenModels/Chinese-Herbal-Medicine-Sentiment", split="train", trust_remote_code=True)
        return ds

    def clean_data(self, examples):
        """Filter predicate: keep an example only if its text is usable.

        Rejects missing/empty text, the boilerplate placeholder review
        ("此用户未填写评价内容" — "this user left no review"), and texts
        shorter than 2 characters after stripping whitespace.

        Args:
            examples: a single example dict with a 'text' field.

        Returns:
            bool: True to keep the example, False to drop it.
        """
        text = examples['text']
        # Guard against None/empty values so the membership test and
        # strip() below cannot raise.
        if not text:
            return False
        if "此用户未填写评价内容" in text:
            return False
        if len(text.strip()) < 2:
            return False
        return True

    def unify_labels(self, example):
        """Normalize heterogeneous labels to ints: 0=Negative, 1=Neutral, 2=Positive.

        Accepts string labels (full words, common abbreviations, or numeric
        strings, case-insensitive) as well as already-numeric labels.

        Returns:
            dict: {'labels': int} in {0, 1, 2}.

        Raises:
            ValueError: if a non-matching string label cannot be cast to int.
        """
        label = example['label']
        if isinstance(label, str):
            label = label.lower()
            # BUG FIX: the abbreviations were previously swapped —
            # 'pos' was mapped to 0 (negative) and 'neg' to 2 (positive).
            if label in ['negative', 'neg', '0']:
                return {'labels': 0}
            elif label in ['neutral', 'neu', '1']:
                return {'labels': 1}
            elif label in ['positive', 'pos', '2']:
                return {'labels': 2}
        # Numeric label (or numeric string that fell through the branches):
        # assume it is already in the 0/1/2 scheme.
        return {'labels': int(label)}

    def tokenize_function(self, examples):
        """Tokenize a batch of examples to fixed-length encodings."""
        return self.tokenizer(
            examples['text'],
            padding="max_length",
            truncation=True,
            max_length=Config.MAX_LENGTH
        )

    def get_processed_dataset(self, cache_dir=None, num_proc=1):
        """Build (or load from cache) the fully processed train/test dataset.

        Pipeline: load both sources -> normalize column names -> clean ->
        align columns -> concatenate -> unify labels -> tokenize -> split.
        The result is persisted under <cache_dir>/processed_dataset so
        subsequent calls return immediately from disk.

        Args:
            cache_dir: directory for the on-disk cache; defaults to
                Config.DATA_DIR.
            num_proc: number of worker processes for filter/map steps.

        Returns:
            datasets.DatasetDict with 'train' and 'test' splits.
        """
        if cache_dir is None:
            cache_dir = Config.DATA_DIR

        processed_path = os.path.join(cache_dir, "processed_dataset")
        if os.path.exists(processed_path):
            print(f"Loading processed dataset from {processed_path}...")
            return load_from_disk(processed_path)

        ds_clap = self.load_clap_data()
        ds_med = self.load_medical_data()

        # Normalize the medical dataset's column names to match the
        # multilingual dataset ('text' / 'label').
        if 'review_text' in ds_med.column_names:
            ds_med = ds_med.rename_column('review_text', 'text')
        if 'sentiment_label' in ds_med.column_names:
            ds_med = ds_med.rename_column('sentiment_label', 'label')

        print("Cleaning datasets...")
        # FIX: num_proc was previously accepted but never used.
        ds_med = ds_med.filter(self.clean_data, num_proc=num_proc)
        ds_clap = ds_clap.filter(self.clean_data, num_proc=num_proc)

        # Keep only the shared schema so concatenation cannot fail on
        # mismatched columns.
        common_cols = ['text', 'label']
        ds_clap = ds_clap.select_columns(common_cols)
        ds_med = ds_med.select_columns(common_cols)

        combined_ds = concatenate_datasets([ds_clap, ds_med])

        combined_ds = combined_ds.map(
            self.unify_labels,
            remove_columns=['label'],
            num_proc=num_proc
        )

        tokenized_ds = combined_ds.map(
            self.tokenize_function,
            batched=True,
            remove_columns=['text'],
            num_proc=num_proc
        )

        split_ds = tokenized_ds.train_test_split(test_size=0.1)

        # FIX: the cache was checked above but never written, so the
        # fast path was dead code. Persist the result for future calls.
        split_ds.save_to_disk(processed_path)

        return split_ds
|
|
|