roshbeed committed
Commit 362c0ce · verified · 1 Parent(s): 28939a3

Upload src/tokenize_triples.py with huggingface_hub
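For context, uploads like this one are typically made with the huggingface_hub Python client. A minimal sketch is below; the repo_id is a hypothetical placeholder, since the target repository is not named on this page:

from huggingface_hub import HfApi

api = HfApi()
# Push the local script into the repo under the same path.
api.upload_file(
    path_or_fileobj="src/tokenize_triples.py",
    path_in_repo="src/tokenize_triples.py",
    repo_id="roshbeed/your-repo",  # hypothetical; not taken from this page
    commit_message="Upload src/tokenize_triples.py with huggingface_hub",
)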

Files changed (1)
  1. src/tokenize_triples.py +76 -0
src/tokenize_triples.py ADDED
@@ -0,0 +1,76 @@
+import json
+import pickle
+from tqdm import tqdm
+import numpy as np
+
+def load_tokenizer():
+    """Load the CBOW tokenizer mappings."""
+    with open('tkn_words_to_ids.pkl', 'rb') as f:
+        words_to_ids = pickle.load(f)
+    with open('tkn_ids_to_words.pkl', 'rb') as f:
+        ids_to_words = pickle.load(f)
+    return words_to_ids, ids_to_words
+
+def tokenize_text(text, words_to_ids):
+    """Tokenize text using the CBOW tokenizer."""
+    # Convert to lowercase and split
+    words = text.lower().split()
+    # Convert words to IDs, using 0 for unknown words
+    token_ids = [words_to_ids.get(word, 0) for word in words]
+    return token_ids
+
+def process_triples(input_file, output_file):
+    """Process triples and tokenize queries and documents."""
+    print("Loading tokenizer...")
+    words_to_ids, ids_to_words = load_tokenizer()
+
+    print("Loading triples...")
+    with open(input_file, 'r') as f:
+        data = json.load(f)
+
+    tokenized_data = {
+        'train': [],
+        'validation': [],
+        'test': []
+    }
+
+    for split in ['train', 'validation', 'test']:
+        print(f"\nTokenizing {split} split...")
+        for triple in tqdm(data[split]):
+            query = triple['query']
+            pos_doc = triple['positive_doc']
+            neg_doc = triple['negative_doc']
+
+            # Tokenize query and documents
+            query_tokens = tokenize_text(query, words_to_ids)
+            pos_doc_tokens = tokenize_text(pos_doc, words_to_ids)
+            neg_doc_tokens = tokenize_text(neg_doc, words_to_ids)
+
+            tokenized_data[split].append({
+                'query_tokens': query_tokens,
+                'positive_document_tokens': pos_doc_tokens,
+                'negative_document_tokens': neg_doc_tokens,
+                'query': query,  # Keep original text for reference
+                'positive_document': pos_doc,
+                'negative_document': neg_doc
+            })
+
+    print("Saving tokenized triples...")
+    with open(output_file, 'w') as f:
+        json.dump(tokenized_data, f, indent=2)
+
+    # Print statistics
+    for split in ['train', 'validation', 'test']:
+        print(f"\n{split.upper()} split:")
+        print(f"Number of tokenized triples: {len(tokenized_data[split])}")
+        if tokenized_data[split]:
+            sample = tokenized_data[split][0]
+            print("\nSample tokenized triple:")
+            print("Query tokens length:", len(sample['query_tokens']))
+            print("Positive doc tokens length:", len(sample['positive_document_tokens']))
+            print("Negative doc tokens length:", len(sample['negative_document_tokens']))
+
+if __name__ == "__main__":
+    input_file = "triples_small.json"
+    output_file = "tokenized_triples.json"
+    process_triples(input_file, output_file)
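For reference, a minimal sketch of the input the script expects. The shape is inferred from the keys the code reads (a 'train'/'validation'/'test' split, each a list of triples with 'query', 'positive_doc', and 'negative_doc'); the example values are illustrative, since triples_small.json itself is not shown on this page:

{
  "train": [
    {
      "query": "what is a cbow model",
      "positive_doc": "CBOW predicts a target word from its surrounding context words.",
      "negative_doc": "An unrelated passage used as a negative example."
    }
  ],
  "validation": [],
  "test": []
}

Run as python src/tokenize_triples.py with tkn_words_to_ids.pkl and tkn_ids_to_words.pkl in the working directory (all paths are relative); the output tokenized_triples.json keeps the same split layout, adding the *_tokens lists alongside the original text. Note that unknown words map to ID 0: with a toy vocabulary {'what': 12, 'is': 7}, tokenize_text("What is CBOW?", words_to_ids) returns [12, 7, 0].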