Upload src/tokenize_triples.py with huggingface_hub
src/tokenize_triples.py
ADDED  +76 -0
@@ -0,0 +1,76 @@
+import json
+import pickle
+from tqdm import tqdm
+import numpy as np
+
+def load_tokenizer():
+    """Load the CBOW tokenizer mappings."""
+    with open('tkn_words_to_ids.pkl', 'rb') as f:
+        words_to_ids = pickle.load(f)
+    with open('tkn_ids_to_words.pkl', 'rb') as f:
+        ids_to_words = pickle.load(f)
+    return words_to_ids, ids_to_words
+
+def tokenize_text(text, words_to_ids):
+    """Tokenize text using the CBOW tokenizer."""
+    # Convert to lowercase and split
+    words = text.lower().split()
+    # Convert words to IDs, using 0 for unknown words
+    token_ids = [words_to_ids.get(word, 0) for word in words]
+    return token_ids
+
+def process_triples(input_file, output_file):
+    """Process triples and tokenize queries and documents."""
+    print("Loading tokenizer...")
+    words_to_ids, ids_to_words = load_tokenizer()
+
+    print("Loading triples...")
+    with open(input_file, 'r') as f:
+        data = json.load(f)
+
+    tokenized_data = {
+        'train': [],
+        'validation': [],
+        'test': []
+    }
+
+    for split in ['train', 'validation', 'test']:
+        print(f"\nTokenizing {split} split...")
+        for triple in tqdm(data[split]):
+            query = triple['query']
+            pos_doc = triple['positive_doc']
+            neg_doc = triple['negative_doc']
+
+            # Tokenize query and documents
+            query_tokens = tokenize_text(query, words_to_ids)
+            pos_doc_tokens = tokenize_text(pos_doc, words_to_ids)
+            neg_doc_tokens = tokenize_text(neg_doc, words_to_ids)
+
+            tokenized_data[split].append({
+                'query_tokens': query_tokens,
+                'positive_document_tokens': pos_doc_tokens,
+                'negative_document_tokens': neg_doc_tokens,
+                'query': query,  # Keep original text for reference
+                'positive_document': pos_doc,
+                'negative_document': neg_doc
+            })
+
+    print("Saving tokenized triples...")
+    with open(output_file, 'w') as f:
+        json.dump(tokenized_data, f, indent=2)
+
+    # Print statistics
+    for split in ['train', 'validation', 'test']:
+        print(f"\n{split.upper()} split:")
+        print(f"Number of tokenized triples: {len(tokenized_data[split])}")
+        if tokenized_data[split]:
+            sample = tokenized_data[split][0]
+            print("\nSample tokenized triple:")
+            print("Query tokens length:", len(sample['query_tokens']))
+            print("Positive doc tokens length:", len(sample['positive_document_tokens']))
+            print("Negative doc tokens length:", len(sample['negative_document_tokens']))
+
+if __name__ == "__main__":
+    input_file = "triples_small.json"
+    output_file = "tokenized_triples.json"
+    process_triples(input_file, output_file)
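The commit title says the file was pushed "with huggingface_hub". A minimal sketch of how such an upload can be done with the huggingface_hub client is shown below; the repo_id is a placeholder assumption, since the target repository is not visible on this page.

# Sketch only: upload the script with huggingface_hub.
# Assumes you are already authenticated (e.g. via `huggingface-cli login`);
# "your-username/your-repo" is a placeholder, not the actual target repo.
from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="src/tokenize_triples.py",   # local file to push
    path_in_repo="src/tokenize_triples.py",      # destination path in the repo
    repo_id="your-username/your-repo",           # placeholder repo id (assumption)
    commit_message="Upload src/tokenize_triples.py with huggingface_hub",
)

Once the file is in the repo (or checked out locally), the script itself expects triples_small.json plus the tkn_words_to_ids.pkl and tkn_ids_to_words.pkl pickles in the working directory, writes tokenized_triples.json, and can be run with `python src/tokenize_triples.py`.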