roshbeed committed on
Commit
e7e6c82
·
verified ·
1 Parent(s): bf28f93

Upload src/load_dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/load_dataset.py +35 -0
src/load_dataset.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from collections import Counter
3
+
4
def main():
    """Load MS MARCO v1.1 and print a short report for each split.

    For each of the 'train', 'validation' and 'test' splits this prints the
    number of examples, up to three sample queries with a preview of their
    first passage, and the average query/passage length measured in
    whitespace-separated words.

    Each example is accessed as ``example['query']`` (a string) and
    ``example['passages']['passage_text']`` (a sequence of strings).

    Returns:
        None. All output goes to stdout.
    """
    # Load the MS MARCO dataset
    print("Loading MS MARCO dataset...")
    dataset = load_dataset("ms_marco", "v1.1")

    # Print information about each split
    print("\nDataset splits:")
    print("-" * 50)
    for split in ['train', 'validation', 'test']:
        # Resolve the split once instead of re-indexing the dataset dict on
        # every access below.
        split_data = dataset[split]
        num_examples = len(split_data)

        print(f"\n{split.upper()} split:")
        print(f"Number of examples: {num_examples}")

        # Show multiple examples from each split
        print("\nExamples:")
        # Cap at the split size so a split with fewer than 3 rows does not
        # raise IndexError.
        for i in range(min(3, num_examples)):
            example = split_data[i]
            passage_texts = example['passages']['passage_text']
            print(f"\nExample {i+1}:")
            print(f"Query: {example['query']}")
            print(f"Number of passages: {len(passage_texts)}")
            if passage_texts:
                print(f"First passage preview: {passage_texts[0][:200]}...")
            else:
                # An example without passages has nothing to preview.
                print("First passage preview: (no passages)")

        # Calculate some statistics in a single pass over the split, keeping
        # running totals rather than materialising per-item length lists
        # (the passage list alone holds one int per passage in the split).
        query_words = 0
        passage_words = 0
        passage_count = 0
        for ex in split_data:
            query_words += len(ex['query'].split())
            for passage in ex['passages']['passage_text']:
                passage_words += len(passage.split())
                passage_count += 1

        # Guard the divisions so an empty split (or one with no passages)
        # reports 0.00 instead of raising ZeroDivisionError.
        avg_query_len = query_words / num_examples if num_examples else 0.0
        avg_passage_len = passage_words / passage_count if passage_count else 0.0

        print(f"\nStatistics for {split} split:")
        print(f"Average query length: {avg_query_len:.2f} words")
        print(f"Average passage length: {avg_passage_len:.2f} words")
        print(f"Total number of passages: {passage_count}")
33
+
34
+ if __name__ == "__main__":  # run the report only when executed as a script, not on import
35
+ main()