jesse-tong commited on
Commit
1918468
·
1 Parent(s): ea70d21

Change datasets

Browse files
scripts/merge_datasets.py CHANGED
@@ -1,116 +1,25 @@
1
  import os
2
- import glob
3
  import pandas as pd
4
- import argparse
5
- from tqdm import tqdm
6
- import shutil
7
 
8
- def merge_datasets(input_dirs, output_dir, preserve_splits=False):
9
- """
10
- Merge CSV datasets from multiple directories into one directory
11
-
12
- Args:
13
- input_dirs (list): List of input directory paths
14
- output_dir (str): Output directory path
15
- """
16
- # Create output directory if it doesn't exist
17
- os.makedirs(output_dir, exist_ok=True)
18
-
19
- # Define the expected columns for the format in test.csv
20
- expected_columns = ['content', 'individual', 'groups', 'religion/creed', 'race/ethnicity', 'politics']
21
-
22
- # Dictionary to hold dataframes for each split if preserving splits
23
- combined_data = {}
24
- if preserve_splits:
25
- combined_data = {'train': [], 'dev': [], 'test': []}
26
- else:
27
- combined_data['all'] = []
28
-
29
- # Process each input directory
30
- for input_dir in input_dirs:
31
- print(f"Processing directory: {input_dir}")
32
-
33
- # Find all CSV files in the directory
34
- csv_files = glob.glob(os.path.join(input_dir, "*.csv"))
35
-
36
- for file_path in tqdm(csv_files, desc=f"Processing files in {os.path.basename(input_dir)}"):
37
- file_name = os.path.basename(file_path)
38
-
39
- # Read the CSV file
40
- try:
41
- df = pd.read_csv(file_path)
42
- print(f" Reading {file_name}: {len(df)} rows")
43
- except Exception as e:
44
- print(f" Error reading {file_name}: {e}")
45
- continue
46
-
47
- # Rename 'free_text' column to 'content' if it exists
48
- if 'free_text' in df.columns:
49
- df.rename(columns={'free_text': 'content'}, inplace=True)
50
-
51
- # Check if 'content' column exists
52
- if 'content' not in df.columns:
53
- print(f" Warning: 'content' column not found in {file_name}. Skipping.")
54
- continue
55
-
56
- # Ensure all required columns exist
57
- for col in expected_columns:
58
- if col != 'content' and col not in df.columns:
59
- df[col] = 0 # Set default value for missing columns
60
-
61
- # Convert category columns to integer type
62
- for col in expected_columns:
63
- if col != 'content' and col in df.columns:
64
- df[col] = df[col].fillna(0).astype(int)
65
-
66
- # Drop unnecessary columns
67
- df = df[expected_columns]
68
-
69
- # Determine which split this file belongs to
70
- if preserve_splits:
71
- if 'train' in file_name.lower():
72
- combined_data['train'].append(df)
73
- elif 'dev' in file_name.lower():
74
- combined_data['dev'].append(df)
75
- elif 'test' in file_name.lower():
76
- combined_data['test'].append(df)
77
- else:
78
- # If not explicitly marked, add to all splits
79
- for split in ['train', 'dev', 'test']:
80
- combined_data[split].append(df)
81
- else:
82
- combined_data['all'].append(df)
83
-
84
- # Combine and save the data
85
- for split, dfs in combined_data.items():
86
- if not dfs:
87
- print(f"No data for {split} split")
88
- continue
89
-
90
- combined_df = pd.concat(dfs, ignore_index=True)
91
-
92
- # Remove duplicates
93
- combined_df = combined_df.drop_duplicates(subset=['content'])
94
-
95
- # Save to output directory
96
- output_file = os.path.join(output_dir, f"{split}.csv" if preserve_splits else "combined.csv")
97
- combined_df.to_csv(output_file, index=False)
98
- print(f"Saved {len(combined_df)} rows to {output_file}")
99
-
100
- def main():
101
- parser = argparse.ArgumentParser(description="Merge CSV datasets from multiple directories")
102
- parser.add_argument("--input_dirs", required=True, nargs='+',
103
- help="List of input directory paths containing CSV files")
104
- parser.add_argument("--output_dir", required=True,
105
- help="Output directory path for merged datasets")
106
-
107
- args = parser.parse_args()
108
-
109
- merge_datasets(
110
- args.input_dirs,
111
- args.output_dir,
112
- preserve_splits=True
113
- )
114
 
115
  if __name__ == "__main__":
116
- main()
 
 
 
 
1
  import os
 
2
  import pandas as pd
 
 
 
3
 
4
def merge_csv_files(directories, output_file, target_name, seed=None):
    """Merge same-named CSV files from several directories into one shuffled CSV.

    Looks for ``target_name`` inside each directory of ``directories``, keeps
    only the columns common to every file found (in the column order of the
    first file), concatenates the rows, shuffles them, and writes the result
    to ``output_file``. Directories that lack the file are silently skipped.

    Args:
        directories (list[str]): Directories to search for ``target_name``.
        output_file (str): Path of the merged CSV to write.
        target_name (str): File name to look for in each directory.
        seed (int | None): Optional random seed for a reproducible shuffle;
            ``None`` (the default) keeps the original unseeded behavior.
    """
    frames = []
    for directory in directories:
        csv_path = os.path.join(directory, target_name)
        if os.path.isfile(csv_path):
            frames.append(pd.read_csv(csv_path))
    if not frames:
        print(f"No {target_name} found.")
        return
    # Intersect columns across all frames while preserving the first frame's
    # column order. A plain set intersection would yield a nondeterministic
    # column order in the written file (Python set iteration order).
    common_columns = [
        col for col in frames[0].columns
        if all(col in frame.columns for frame in frames[1:])
    ]
    if not common_columns:
        # Nothing shared between the sources -- writing an empty frame would
        # just produce a useless header-less file, so bail out loudly.
        print(f"No common columns across {target_name} files.")
        return
    merged = pd.concat([frame[common_columns] for frame in frames], ignore_index=True)
    # Shuffle so rows from the different source datasets are interleaved;
    # passing a seed makes the resulting dataset reproducible.
    merged = merged.sample(frac=1, random_state=seed).reset_index(drop=True)
    merged.to_csv(output_file, index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
if __name__ == "__main__":
    # Merge the two Vietnamese hate-speech dataset sources into ../datasets,
    # producing one merged file per split.
    source_dirs = ["../datasets_vithsd", "../datasets_vihsd_gemini"]
    for split in ("train", "dev", "test"):
        merge_csv_files(source_dirs, f"../datasets/{split}.csv", f"{split}.csv")
utils/word_segmentation_vi.py CHANGED
@@ -7,7 +7,7 @@ def word_segmentation_vi(text):
7
 
8
  if __name__ == "__main__":
9
  # Script này để segment các file CSV và TSV trong thư mục datasets cho tiếng Việt (do PhoBERT yêu cầu đầu vào đã được segment theo từ)
10
- dataset_dir = "../datasets"
11
 
12
  csv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.csv')]
13
  tsv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.tsv')]
 
7
 
8
  if __name__ == "__main__":
9
  # Script này để segment các file CSV và TSV trong thư mục datasets cho tiếng Việt (do PhoBERT yêu cầu đầu vào đã được segment theo từ)
10
+ dataset_dir = "../datasets_vithsd"
11
 
12
  csv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.csv')]
13
  tsv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.tsv')]