LorenzoBioinfo committed on
Commit
f42bc7f
·
1 Parent(s): d0873ef

Save structure

Browse files
Files changed (1) hide show
  1. src/data_preparation.py +13 -4
src/data_preparation.py CHANGED
@@ -73,11 +73,20 @@ def prepare_tweet_eval(tokenizer, output_path):
73
  "label": [2, 0, 1, 2, 0],
74
  }
75
  ds = safe_load_dataset("tweet_eval", "sentiment", fallback_data=fallback_data)
76
- ds =ds["train"].select(range(1000))
77
- ds = ds.map(lambda x: {"text": clean_text(x["text"])})
78
- ds = ds.map(tokenize_function, batched=True)
 
 
 
 
 
 
 
 
 
79
  ds.save_to_disk(output_path)
80
- print(f" Dataset Tweet Eval salvato in {output_path}")
81
 
82
 
83
  def prepare_youtube(tokenizer, output_path):
 
73
  "label": [2, 0, 1, 2, 0],
74
  }
75
  ds = safe_load_dataset("tweet_eval", "sentiment", fallback_data=fallback_data)
76
+ if isinstance(ds, dict) or "train" in ds:
77
+ reduced_splits = {}
78
+ for split in ds.keys():
79
+ reduced_splits[split] = ds[split].select(range(min(1000, len(ds[split]))))
80
+ reduced_splits[split] = reduced_splits[split].map(lambda x: {"text": clean_text(x["text"])})
81
+ reduced_splits[split] = reduced_splits[split].map(tokenize_function, batched=True)
82
+ ds = datasets.DatasetDict(reduced_splits)
83
+ else:
84
+ ds = ds.select(range(min(1000, len(ds))))
85
+ ds = ds.map(lambda x: {"text": clean_text(x["text"])})
86
+ ds = ds.map(tokenize_function, batched=True)
87
+
88
  ds.save_to_disk(output_path)
89
+ print(f"Dataset Tweet Eval salvato in {output_path}")
90
 
91
 
92
  def prepare_youtube(tokenizer, output_path):