compendious commited on
Commit
f71ba81
·
1 Parent(s): 830b470

data cleaners

Browse files
.gitignore CHANGED
@@ -14,6 +14,7 @@ temp.py
14
  *.csv
15
  *.json*
16
  /*data*/
 
17
 
18
  # Front end
19
  node_modules
 
14
  *.csv
15
  *.json*
16
  /*data*/
17
+ notes/
18
 
19
  # Front end
20
  node_modules
scripts/clean.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Clean up the raw data files so as to curate specifically-required subsets.

Runs the per-dataset cleaning scripts (MediaSum and DialogSum) as child
processes, in parallel.
"""

import os
import subprocess
import sys
import threading


def run_script(script_path):
    """Run *script_path* as a child Python process from this file's directory.

    Raises CalledProcessError (inside the calling thread) if the child
    exits non-zero, so failures are not silently swallowed.
    """
    # sys.executable guarantees the same interpreter as the parent; a bare
    # "python" may resolve to a different (or missing) binary on PATH.
    # abspath: os.path.dirname(__file__) is '' when invoked from this
    # directory, and cwd='' makes subprocess.run fail.
    subprocess.run(
        [sys.executable, script_path],
        cwd=os.path.dirname(os.path.abspath(__file__)),
        check=True,
    )


if __name__ == "__main__":
    # Run both cleaning scripts in parallel for speed
    t1 = threading.Thread(target=run_script, args=("cleaners/clean_ms.py",))
    t2 = threading.Thread(target=run_script, args=("cleaners/clean_ds.py",))

    t1.start()
    t2.start()

    t1.join()
    t2.join()

    print("All cleaning scripts completed.")
scripts/cleaners/clean_ds.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RANDOMLY Takes 10,000 lines from ../raw_data/raw_dialogsum_train.csv, 1,000 lines from ../raw_data/raw_dialogsum_test.csv, and 700 lines from ../raw_data/raw_dialogsum_val.csv. Then converts each one to JSONL.
3
+
4
+ """
5
+ import random
6
+ import json
7
+ import csv
8
+ import os
9
+
10
def reservoir_sample_csv(file_path, k):
    """Return up to *k* uniformly-random rows (as dicts) from a CSV file.

    Uses Algorithm R reservoir sampling, so at most k rows are ever held
    in memory — the previous version loaded the entire file into a list
    before sampling. Also keeps this function consistent with
    ``reservoir_sample`` in clean_ms.py.

    If the file has fewer than k data rows, all rows are returned.
    """
    reservoir = []
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            if i < k:
                reservoir.append(row)
            else:
                # Replace an existing slot with probability k/(i+1), which
                # keeps every row equally likely to be in the final sample.
                j = random.randint(0, i)
                if j < k:
                    reservoir[j] = row
    return reservoir
19
+
20
def write_jsonl(rows, output_path):
    """Write sampled DialogSum CSV rows to *output_path* as JSON Lines.

    Each row becomes exactly one JSON object on one line. ``indent`` must
    NOT be passed to json.dump here: a pretty-printed object spans several
    lines, which breaks the one-object-per-line JSONL contract (the
    previous ``indent=2`` produced files no JSONL reader could parse).
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for row in rows:
            new_data = {
                "id": row["id"],
                "original_source": "DialogSum",
                "dialogue": row["dialogue"],
                "summary": row["summary"],
                "topic": row["topic"]
            }
            # ensure_ascii=False keeps any non-ASCII dialogue text readable.
            json.dump(new_data, f, ensure_ascii=False)
            f.write('\n')
33
+
34
print("Cleaning DialogSum dataset...")

# (raw input CSV, cleaned JSONL output, sample size) for each split.
splits = [
    ('../raw_data/raw_dialogsum_train.csv',
     '../clean1/ds/dialogsum_train_10k.jsonl', 10000),
    ('../raw_data/raw_dialogsum_test.csv',
     '../clean1/ds/dialogsum_test_1k.jsonl', 1000),
    ('../raw_data/raw_dialogsum_val.csv',
     '../clean1/ds/dialogsum_val_700.jsonl', 700),
]

print("Sampling rows from raw data CSV files...")

# Sample every split first (train, then test, then val) ...
sampled = [(reservoir_sample_csv(src, size), dst) for src, dst, size in splits]

print("Collected Samples. Writing to JSONL files...")

# ... then write them all out in the same order.
for rows, dst in sampled:
    write_jsonl(rows, dst)

print("Done")
scripts/cleaners/clean_ms.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RANDOMLY Takes 10,000 lines from ../raw_data/raw_mediasum_train_data.txt, 1,000 lines from ../raw_data/raw_mediasum_test_data.txt, and 1,000 lines from ../raw_data/raw_mediasum_val_data.txt. Then converts each one to JSONL.
3
+
4
+ """
5
+ import random
6
+ import json
7
+ import os
8
+
9
def reservoir_sample(file_path, k):
    """Uniformly sample up to *k* stripped lines from a text file.

    Classic Algorithm R: the first k lines fill the reservoir; each later
    line i then overwrites a random slot with probability k/(i+1), so every
    line is equally likely to survive. Only k lines are held in memory.
    """
    sample = []
    with open(file_path, 'r', encoding='utf-8') as fh:
        for idx, raw in enumerate(fh):
            line = raw.strip()
            if idx < k:
                # Still filling the reservoir.
                sample.append(line)
                continue
            slot = random.randint(0, idx)
            if slot < k:
                sample[slot] = line
    return sample
20
+
21
def write_jsonl(lines, output_path):
    """Re-serialize sampled MediaSum records to *output_path* as JSON Lines.

    Each input line is itself a JSON record; only a fixed subset of its
    fields is kept (``utt`` is renamed to ``transcript``). ``indent`` must
    NOT be passed to json.dump: a pretty-printed object spans several
    lines, which breaks the one-object-per-line JSONL contract (the
    previous ``indent=2`` produced files no JSONL reader could parse).
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for line in lines:
            data = json.loads(line)
            new_data = {
                "id": data["id"],
                "original_source": "MediaSum",
                "url": data["url"],
                "summary": data["summary"],
                "transcript": data["utt"],
                "speaker": data["speaker"]
            }
            # ensure_ascii=False keeps any non-ASCII transcript text readable.
            json.dump(new_data, f, ensure_ascii=False)
            f.write('\n')
36
+
37
print("Cleaning Mediasum dataset...")

# (raw input file, cleaned JSONL output, sample size) for each split.
jobs = [
    ('../raw_data/raw_mediasum_train_data.txt',
     '../clean1/ms/mediasum_train_10k.jsonl', 10000),
    ('../raw_data/raw_mediasum_test_data.txt',
     '../clean1/ms/mediasum_test_1k.jsonl', 1000),
    ('../raw_data/raw_mediasum_val_data.txt',
     '../clean1/ms/mediasum_val_1k.jsonl', 1000),
]

print("Sampling lines from raw data files...")

# Sample every split first (train, then test, then val) ...
sampled = [(reservoir_sample(src, size), dst) for src, dst, size in jobs]

print("Collected Samples. Writing to JSONL files...")

# ... then write them all out in the same order.
for lines, dst in sampled:
    write_jsonl(lines, dst)

print("Done")
scripts/pull.py CHANGED
@@ -3,17 +3,17 @@ Pulls raw samples of 10k each from the [cited in README] datasets used in this p
3
  In the final version of the training data, a lot of the example outputs are tuned, and they are all merged into a single
4
 
5
  HuggingFace seems to have disabled this functionality.
6
- Currently trying to see how to work around it.
7
  """
8
 
9
  import json
10
  from datasets import load_dataset
11
 
12
  targets = {
13
- "mediasum": ("nbroad/mediasum", None, "train"), # Parquet‑exported version, no loader script needed :contentReference[oaicite:0]{index=0}
14
- "dialogsum": ("knkarthick/dialogsum", None, "train"), # CSV on HF :contentReference[oaicite:1]{index=1}
15
- "squality": ("mattercalm/squality", None, "train"), # assumed generic supported format
16
- "msmarco_corpus": ("Hyukkyu/beir-msmarco", "corpus", "train"), # Parquet migrated version :contentReference[oaicite:2]{index=2}
17
  }
18
 
19
  for name, (repo, config, split) in targets.items():
 
3
  In the final version of the training data, a lot of the example outputs are tuned, and they are all merged into a single
4
 
5
  HuggingFace seems to have disabled this functionality.
6
+ Currently trying to see how to work around it
7
  """
8
 
9
  import json
10
  from datasets import load_dataset
11
 
12
  targets = {
13
+ "mediasum": ("nbroad/mediasum", None, "train"),
14
+ "dialogsum": ("knkarthick/dialogsum", None, "train"),
15
+ "squality": ("mattercalm/squality", None, "train"),
16
+ "msmarco_corpus": ("Hyukkyu/beir-msmarco", "corpus", "train"),
17
  }
18
 
19
  for name, (repo, config, split) in targets.items():