Spaces:
Build error
Build error
Commit ·
f71ba81
1
Parent(s): 830b470
data cleaners
Browse files- .gitignore +1 -0
- scripts/clean.py +22 -0
- scripts/cleaners/clean_ds.py +56 -0
- scripts/cleaners/clean_ms.py +59 -0
- scripts/pull.py +5 -5
.gitignore
CHANGED
|
@@ -14,6 +14,7 @@ temp.py
|
|
| 14 |
*.csv
|
| 15 |
*.json*
|
| 16 |
/*data*/
|
|
|
|
| 17 |
|
| 18 |
# Front end
|
| 19 |
node_modules
|
|
|
|
| 14 |
*.csv
|
| 15 |
*.json*
|
| 16 |
/*data*/
|
| 17 |
+
notes/
|
| 18 |
|
| 19 |
# Front end
|
| 20 |
node_modules
|
scripts/clean.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Clean up the raw data files so as to curate specifically-required subsets.

Runs the two cleaner scripts under scripts/cleaners/ in parallel threads.
"""

import os
import subprocess
import sys
import threading


def run_script(script_path):
    """Run *script_path* with the current interpreter, from this file's directory.

    Parameters
    ----------
    script_path : str
        Path of the script to execute, relative to this file's directory.

    Returns
    -------
    subprocess.CompletedProcess
        Result of the child process, so callers can inspect the exit code.
    """
    # sys.executable (not a bare "python") so the children run under the same
    # interpreter / virtualenv that launched this script.
    # abspath() guards against os.path.dirname(__file__) == "" when this file
    # is invoked from its own directory; cwd="" would make subprocess.run fail.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    return subprocess.run([sys.executable, script_path], cwd=script_dir)


def main():
    """Run both cleaning scripts in parallel for speed."""
    t1 = threading.Thread(target=run_script, args=("cleaners/clean_ms.py",))
    t2 = threading.Thread(target=run_script, args=("cleaners/clean_ds.py",))

    t1.start()
    t2.start()

    t1.join()
    t2.join()

    print("All cleaning scripts completed.")


if __name__ == "__main__":
    main()
|
scripts/cleaners/clean_ds.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Randomly sample 10,000 rows from ../raw_data/raw_dialogsum_train.csv,
1,000 rows from ../raw_data/raw_dialogsum_test.csv and 700 rows from
../raw_data/raw_dialogsum_val.csv, then convert each sample to JSONL.
"""

import csv
import json
import os
import random


def reservoir_sample_csv(file_path, k):
    """Return k randomly chosen rows (as dicts) from the CSV at *file_path*.

    If the file holds k rows or fewer, all rows are returned in file order.
    NOTE: the whole file is read into memory first, so despite the name this
    is plain simple random sampling, not a streaming reservoir.

    Parameters
    ----------
    file_path : str
        Path of the CSV file (must have a header row).
    k : int
        Number of rows to sample.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))
    if len(rows) <= k:
        return rows
    return random.sample(rows, k)


def write_jsonl(rows, output_path):
    """Write the sampled rows to *output_path* in JSON Lines format.

    Each record is re-shaped to the project schema and serialized on exactly
    one line, as the .jsonl format requires.
    """
    out_dir = os.path.dirname(output_path)
    if out_dir:  # dirname is "" for bare filenames; makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for row in rows:
            new_data = {
                "id": row["id"],
                "original_source": "DialogSum",
                "dialogue": row["dialogue"],
                "summary": row["summary"],
                "topic": row["topic"]
            }
            # No indent: pretty-printing would spread one object over several
            # lines and break the one-object-per-line JSONL contract.
            json.dump(new_data, f, ensure_ascii=False)
            f.write('\n')


def main():
    print("Cleaning DialogSum dataset...")

    ta = '../raw_data/raw_dialogsum_train.csv'
    tb = '../raw_data/raw_dialogsum_test.csv'
    vc = '../raw_data/raw_dialogsum_val.csv'

    train_loc = '../clean1/ds/dialogsum_train_10k.jsonl'
    test_loc = '../clean1/ds/dialogsum_test_1k.jsonl'
    val_loc = '../clean1/ds/dialogsum_val_700.jsonl'

    print("Sampling rows from raw data CSV files...")

    train_rows = reservoir_sample_csv(ta, 10000)
    test_rows = reservoir_sample_csv(tb, 1000)
    val_rows = reservoir_sample_csv(vc, 700)

    print("Collected Samples. Writing to JSONL files...")

    write_jsonl(train_rows, train_loc)
    write_jsonl(test_rows, test_loc)
    write_jsonl(val_rows, val_loc)

    print("Done")


if __name__ == "__main__":
    main()
|
scripts/cleaners/clean_ms.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Randomly sample 10,000 lines from ../raw_data/raw_mediasum_train_data.txt,
1,000 lines from ../raw_data/raw_mediasum_test_data.txt and 1,000 lines from
../raw_data/raw_mediasum_val_data.txt, then convert each sample to JSONL.
"""

import json
import os
import random


def reservoir_sample(file_path, k):
    """Reservoir-sample up to k stripped lines from *file_path* (Algorithm R).

    Streams the file line by line, so memory use stays O(k) no matter how
    large the input is. If the file has k lines or fewer, all are returned.

    Parameters
    ----------
    file_path : str
        Path of the text file to sample from.
    k : int
        Reservoir size.
    """
    reservoir = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i < k:
                reservoir.append(line.strip())
            else:
                # Replace a reservoir element with probability k / (i + 1).
                j = random.randint(0, i)
                if j < k:
                    reservoir[j] = line.strip()
    return reservoir


def write_jsonl(lines, output_path):
    """Write the sampled JSON lines to *output_path* in JSON Lines format.

    Each input line is parsed, re-shaped to the project schema (note the
    "utt" -> "transcript" rename), and serialized on exactly one line.
    """
    out_dir = os.path.dirname(output_path)
    if out_dir:  # dirname is "" for bare filenames; makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for line in lines:
            data = json.loads(line)
            new_data = {
                "id": data["id"],
                "original_source": "MediaSum",
                "url": data["url"],
                "summary": data["summary"],
                "transcript": data["utt"],
                "speaker": data["speaker"]
            }
            # No indent: pretty-printing would spread one object over several
            # lines and break the one-object-per-line JSONL contract.
            json.dump(new_data, f, ensure_ascii=False)
            f.write('\n')


def main():
    print("Cleaning Mediasum dataset...")

    ta = '../raw_data/raw_mediasum_train_data.txt'
    tb = '../raw_data/raw_mediasum_test_data.txt'
    vc = '../raw_data/raw_mediasum_val_data.txt'

    train_loc = '../clean1/ms/mediasum_train_10k.jsonl'
    test_loc = '../clean1/ms/mediasum_test_1k.jsonl'
    val_loc = '../clean1/ms/mediasum_val_1k.jsonl'

    print("Sampling lines from raw data files...")

    train_lines = reservoir_sample(ta, 10000)
    test_lines = reservoir_sample(tb, 1000)
    val_lines = reservoir_sample(vc, 1000)

    print("Collected Samples. Writing to JSONL files...")

    write_jsonl(train_lines, train_loc)
    write_jsonl(test_lines, test_loc)
    write_jsonl(val_lines, val_loc)

    print("Done")


if __name__ == "__main__":
    main()
|
scripts/pull.py
CHANGED
|
@@ -3,17 +3,17 @@ Pulls raw samples of 10k each from the [cited in README] datasets used in this p
|
|
| 3 |
In the final version of the training data, a lot of the example outputs are tuned, and they are all merged into a single
|
| 4 |
|
| 5 |
HuggingFace seems to have disabled this functionality.
|
| 6 |
-
Currently trying to see how to work around it
|
| 7 |
"""
|
| 8 |
|
| 9 |
import json
|
| 10 |
from datasets import load_dataset
|
| 11 |
|
| 12 |
targets = {
|
| 13 |
-
"mediasum": ("nbroad/mediasum", None, "train"),
|
| 14 |
-
"dialogsum": ("knkarthick/dialogsum", None, "train"),
|
| 15 |
-
"squality": ("mattercalm/squality", None, "train"),
|
| 16 |
-
"msmarco_corpus": ("Hyukkyu/beir-msmarco", "corpus", "train"),
|
| 17 |
}
|
| 18 |
|
| 19 |
for name, (repo, config, split) in targets.items():
|
|
|
|
| 3 |
In the final version of the training data, a lot of the example outputs are tuned, and they are all merged into a single
|
| 4 |
|
| 5 |
HuggingFace seems to have disabled this functionality.
|
| 6 |
+
Currently trying to see how to work around it
|
| 7 |
"""
|
| 8 |
|
| 9 |
import json
|
| 10 |
from datasets import load_dataset
|
| 11 |
|
| 12 |
targets = {
|
| 13 |
+
"mediasum": ("nbroad/mediasum", None, "train"),
|
| 14 |
+
"dialogsum": ("knkarthick/dialogsum", None, "train"),
|
| 15 |
+
"squality": ("mattercalm/squality", None, "train"),
|
| 16 |
+
"msmarco_corpus": ("Hyukkyu/beir-msmarco", "corpus", "train"),
|
| 17 |
}
|
| 18 |
|
| 19 |
for name, (repo, config, split) in targets.items():
|