File size: 2,817 Bytes
362c0ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import json
import pickle
from tqdm import tqdm
import numpy as np

def load_tokenizer():
    """Load the CBOW tokenizer mappings from their pickle files.

    Returns:
        tuple: ``(words_to_ids, ids_to_words)`` dictionaries as stored in
        ``tkn_words_to_ids.pkl`` and ``tkn_ids_to_words.pkl``.
    """
    # NOTE(review): pickle.load is unsafe on untrusted files; these are
    # assumed to be locally-produced tokenizer artifacts.
    mappings = []
    for pickle_path in ('tkn_words_to_ids.pkl', 'tkn_ids_to_words.pkl'):
        with open(pickle_path, 'rb') as handle:
            mappings.append(pickle.load(handle))
    return mappings[0], mappings[1]

def tokenize_text(text, words_to_ids):
    """Convert *text* to a list of CBOW token IDs.

    The text is lowercased and split on whitespace; words missing from the
    vocabulary map to ID 0.

    Args:
        text: Raw input string.
        words_to_ids: Mapping from word string to integer token ID.

    Returns:
        list[int]: One token ID per whitespace-separated word.
    """
    token_ids = []
    for word in text.lower().split():
        # Fall back to 0 for out-of-vocabulary words.
        token_ids.append(words_to_ids.get(word, 0))
    return token_ids

def process_triples(input_file, output_file):
    """Tokenize all triples in *input_file* and write the result to *output_file*.

    The input JSON is expected to hold 'train'/'validation'/'test' lists of
    triples, each with 'query', 'positive_doc' and 'negative_doc' strings.
    Missing splits are tolerated and produce empty lists in the output.

    Args:
        input_file: Path to the JSON file of raw triples.
        output_file: Path where the tokenized JSON is written.
    """
    print("Loading tokenizer...")
    # ids_to_words is not needed for encoding; unpacked only to match the
    # loader's return shape.
    words_to_ids, _ids_to_words = load_tokenizer()

    print("Loading triples...")
    # Explicit encoding so behavior does not depend on the platform default.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    splits = ['train', 'validation', 'test']
    tokenized_data = {split: [] for split in splits}

    for split in splits:
        print(f"\nTokenizing {split} split...")
        # data.get(...) instead of data[...]: an input file missing a split
        # should not crash the whole run.
        for triple in tqdm(data.get(split, [])):
            query = triple['query']
            pos_doc = triple['positive_doc']
            neg_doc = triple['negative_doc']

            tokenized_data[split].append({
                'query_tokens': tokenize_text(query, words_to_ids),
                'positive_document_tokens': tokenize_text(pos_doc, words_to_ids),
                'negative_document_tokens': tokenize_text(neg_doc, words_to_ids),
                'query': query,  # Keep original text for reference
                'positive_document': pos_doc,
                'negative_document': neg_doc,
            })

    print("Saving tokenized triples...")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(tokenized_data, f, indent=2)

    # Per-split statistics plus one sample triple for a quick sanity check.
    for split in splits:
        print(f"\n{split.upper()} split:")
        print(f"Number of tokenized triples: {len(tokenized_data[split])}")
        if tokenized_data[split]:
            sample = tokenized_data[split][0]
            print("\nSample tokenized triple:")
            print("Query tokens length:", len(sample['query_tokens']))
            print("Positive doc tokens length:", len(sample['positive_document_tokens']))
            print("Negative doc tokens length:", len(sample['negative_document_tokens']))

if __name__ == "__main__":
    # Default file names for a standalone run of the tokenization pipeline.
    process_triples(
        input_file="triples_small.json",
        output_file="tokenized_triples.json",
    )