Commit ·
1918468
1
Parent(s): ea70d21
Change datasets
Browse files — scripts/merge_datasets.py: +20 −111
- utils/word_segmentation_vi.py +1 -1
scripts/merge_datasets.py
CHANGED
|
@@ -1,116 +1,25 @@
|
|
| 1 |
import os
|
| 2 |
-
import glob
|
| 3 |
import pandas as pd
|
| 4 |
-
import argparse
|
| 5 |
-
from tqdm import tqdm
|
| 6 |
-
import shutil
|
| 7 |
|
| 8 |
-
def
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
if preserve_splits:
|
| 25 |
-
combined_data = {'train': [], 'dev': [], 'test': []}
|
| 26 |
-
else:
|
| 27 |
-
combined_data['all'] = []
|
| 28 |
-
|
| 29 |
-
# Process each input directory
|
| 30 |
-
for input_dir in input_dirs:
|
| 31 |
-
print(f"Processing directory: {input_dir}")
|
| 32 |
-
|
| 33 |
-
# Find all CSV files in the directory
|
| 34 |
-
csv_files = glob.glob(os.path.join(input_dir, "*.csv"))
|
| 35 |
-
|
| 36 |
-
for file_path in tqdm(csv_files, desc=f"Processing files in {os.path.basename(input_dir)}"):
|
| 37 |
-
file_name = os.path.basename(file_path)
|
| 38 |
-
|
| 39 |
-
# Read the CSV file
|
| 40 |
-
try:
|
| 41 |
-
df = pd.read_csv(file_path)
|
| 42 |
-
print(f" Reading {file_name}: {len(df)} rows")
|
| 43 |
-
except Exception as e:
|
| 44 |
-
print(f" Error reading {file_name}: {e}")
|
| 45 |
-
continue
|
| 46 |
-
|
| 47 |
-
# Rename 'free_text' column to 'content' if it exists
|
| 48 |
-
if 'free_text' in df.columns:
|
| 49 |
-
df.rename(columns={'free_text': 'content'}, inplace=True)
|
| 50 |
-
|
| 51 |
-
# Check if 'content' column exists
|
| 52 |
-
if 'content' not in df.columns:
|
| 53 |
-
print(f" Warning: 'content' column not found in {file_name}. Skipping.")
|
| 54 |
-
continue
|
| 55 |
-
|
| 56 |
-
# Ensure all required columns exist
|
| 57 |
-
for col in expected_columns:
|
| 58 |
-
if col != 'content' and col not in df.columns:
|
| 59 |
-
df[col] = 0 # Set default value for missing columns
|
| 60 |
-
|
| 61 |
-
# Convert category columns to integer type
|
| 62 |
-
for col in expected_columns:
|
| 63 |
-
if col != 'content' and col in df.columns:
|
| 64 |
-
df[col] = df[col].fillna(0).astype(int)
|
| 65 |
-
|
| 66 |
-
# Drop unnecessary columns
|
| 67 |
-
df = df[expected_columns]
|
| 68 |
-
|
| 69 |
-
# Determine which split this file belongs to
|
| 70 |
-
if preserve_splits:
|
| 71 |
-
if 'train' in file_name.lower():
|
| 72 |
-
combined_data['train'].append(df)
|
| 73 |
-
elif 'dev' in file_name.lower():
|
| 74 |
-
combined_data['dev'].append(df)
|
| 75 |
-
elif 'test' in file_name.lower():
|
| 76 |
-
combined_data['test'].append(df)
|
| 77 |
-
else:
|
| 78 |
-
# If not explicitly marked, add to all splits
|
| 79 |
-
for split in ['train', 'dev', 'test']:
|
| 80 |
-
combined_data[split].append(df)
|
| 81 |
-
else:
|
| 82 |
-
combined_data['all'].append(df)
|
| 83 |
-
|
| 84 |
-
# Combine and save the data
|
| 85 |
-
for split, dfs in combined_data.items():
|
| 86 |
-
if not dfs:
|
| 87 |
-
print(f"No data for {split} split")
|
| 88 |
-
continue
|
| 89 |
-
|
| 90 |
-
combined_df = pd.concat(dfs, ignore_index=True)
|
| 91 |
-
|
| 92 |
-
# Remove duplicates
|
| 93 |
-
combined_df = combined_df.drop_duplicates(subset=['content'])
|
| 94 |
-
|
| 95 |
-
# Save to output directory
|
| 96 |
-
output_file = os.path.join(output_dir, f"{split}.csv" if preserve_splits else "combined.csv")
|
| 97 |
-
combined_df.to_csv(output_file, index=False)
|
| 98 |
-
print(f"Saved {len(combined_df)} rows to {output_file}")
|
| 99 |
-
|
| 100 |
-
def main():
|
| 101 |
-
parser = argparse.ArgumentParser(description="Merge CSV datasets from multiple directories")
|
| 102 |
-
parser.add_argument("--input_dirs", required=True, nargs='+',
|
| 103 |
-
help="List of input directory paths containing CSV files")
|
| 104 |
-
parser.add_argument("--output_dir", required=True,
|
| 105 |
-
help="Output directory path for merged datasets")
|
| 106 |
-
|
| 107 |
-
args = parser.parse_args()
|
| 108 |
-
|
| 109 |
-
merge_datasets(
|
| 110 |
-
args.input_dirs,
|
| 111 |
-
args.output_dir,
|
| 112 |
-
preserve_splits=True
|
| 113 |
-
)
|
| 114 |
|
| 115 |
if __name__ == "__main__":
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
+
def merge_csv_files(directories, output_file, target_name, random_state=None):
    """Merge a same-named CSV from several directories into one shuffled CSV.

    Looks for ``target_name`` inside each directory of ``directories``,
    keeps only the columns shared by every file found, concatenates the
    rows, shuffles them, and writes the result to ``output_file``.

    Args:
        directories: Iterable of directory paths to search.
        output_file: Path of the merged CSV to write.
        target_name: File name looked up in each directory (e.g. "train.csv").
        random_state: Optional seed for the shuffle, for reproducible output.
            Defaults to None (unseeded), matching the previous behavior.
    """
    dfs = []
    for directory in directories:
        csv_path = os.path.join(directory, target_name)
        if os.path.isfile(csv_path):
            dfs.append(pd.read_csv(csv_path))
    if not dfs:
        print(f"No {target_name} found.")
        return
    # Intersect columns while preserving the first file's column order.
    # A plain set intersection would make the output column order depend
    # on hash randomization and vary between runs.
    common_columns = [
        col for col in dfs[0].columns
        if all(col in df.columns for df in dfs[1:])
    ]
    merged = pd.concat([df[common_columns] for df in dfs], ignore_index=True)
    # Shuffle so rows from the different source datasets are interleaved.
    merged = merged.sample(frac=1, random_state=random_state).reset_index(drop=True)
    merged.to_csv(output_file, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
if __name__ == "__main__":
    # Source dataset directories whose per-split CSVs are merged into ../datasets/.
    source_dirs = ["../datasets_vithsd", "../datasets_vihsd_gemini"]
    # (destination path, file name searched for inside each source dir)
    split_files = [
        ("../datasets/train.csv", "train.csv"),
        ("../datasets/dev.csv", "dev.csv"),
        ("../datasets/test.csv", "test.csv"),
    ]
    for destination, split_name in split_files:
        merge_csv_files(source_dirs, destination, split_name)
|
utils/word_segmentation_vi.py
CHANGED
|
@@ -7,7 +7,7 @@ def word_segmentation_vi(text):
|
|
| 7 |
|
| 8 |
if __name__ == "__main__":
|
| 9 |
# Script này để segment các file CSV và TSV trong thư mục datasets cho tiếng Việt (do PhoBERT yêu cầu đầu vào đã được segment theo từ)
|
| 10 |
-
dataset_dir = "../
|
| 11 |
|
| 12 |
csv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.csv')]
|
| 13 |
tsv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.tsv')]
|
|
|
|
| 7 |
|
| 8 |
if __name__ == "__main__":
|
| 9 |
# Script này để segment các file CSV và TSV trong thư mục datasets cho tiếng Việt (do PhoBERT yêu cầu đầu vào đã được segment theo từ)
|
| 10 |
+
dataset_dir = "../datasets_vithsd"
|
| 11 |
|
| 12 |
csv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.csv')]
|
| 13 |
tsv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.tsv')]
|