roshbeed committed on
Commit
7e73a05
·
verified ·
1 Parent(s): 3e28790

Upload src/create_embeddings.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/create_embeddings.py +96 -0
src/create_embeddings.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import numpy as np
3
+ import pickle
4
+ from tqdm import tqdm
5
+ import torch
6
+ import torch.nn as nn
7
+
8
def load_tokenizer():
    """Load the CBOW tokenizer mappings.

    Reads ``tkn_words_to_ids.pkl`` and ``tkn_ids_to_words.pkl`` from the
    current working directory, in that order.

    Returns:
        A ``(words_to_ids, ids_to_words)`` tuple holding the unpickled
        objects from the two files.
    """
    print("Loading tokenizer...")
    # NOTE: unpickling can execute arbitrary code; only use trusted files.
    mappings = []
    for filename in ('tkn_words_to_ids.pkl', 'tkn_ids_to_words.pkl'):
        with open(filename, 'rb') as handle:
            mappings.append(pickle.load(handle))
    words_to_ids, ids_to_words = mappings
    return words_to_ids, ids_to_words
16
+
17
def load_tokenized_triples():
    """Load the tokenized triples from ``tokenized_triples.json``.

    The file is read from the current working directory.

    Returns:
        The object decoded from the JSON file.
    """
    print("Loading tokenized triples...")
    # Pin the encoding: without it, open() falls back to the platform default,
    # which can mis-decode non-ASCII text in the stored queries/documents.
    with open('tokenized_triples.json', 'r', encoding='utf-8') as f:
        return json.load(f)
23
+
24
def create_embedding_layer(vocab_size, embedding_dim=128):
    """Build an embedding table with Xavier-uniform initialised weights.

    Args:
        vocab_size: Number of rows (one per token id) in the table.
        embedding_dim: Width of each embedding vector. Defaults to 128.

    Returns:
        The freshly initialised ``nn.Embedding`` module.
    """
    layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
    # Overwrite the module's default initialisation with Xavier-uniform values.
    nn.init.xavier_uniform_(layer.weight)
    return layer
30
+
31
def average_pool(tokens, embedding_layer):
    """Create the average-pooled vector for a list of token ids.

    Args:
        tokens: Sequence of integer token ids.
        embedding_layer: The ``nn.Embedding`` module used to look up one
            vector per token id.

    Returns:
        A 1-D NumPy array of length ``embedding_layer.embedding_dim`` holding
        the mean of the token embeddings. An empty ``tokens`` sequence yields
        an all-zero vector.
    """
    # Averaging zero rows would produce NaN, which later serialises to
    # invalid JSON; fall back to a zero vector instead. ``len`` is used rather
    # than truthiness so array-like inputs are handled as well.
    if len(tokens) == 0:
        return torch.zeros(
            embedding_layer.embedding_dim,
            dtype=embedding_layer.weight.dtype,
        ).numpy()

    token_ids = torch.tensor(tokens, dtype=torch.long)
    # Pure inference: skip autograd bookkeeping for the lookup and the mean.
    with torch.no_grad():
        pooled = embedding_layer(token_ids).mean(dim=0)
    return pooled.detach().numpy()
39
+
40
def process_triples(data, embedding_layer):
    """Turn every tokenized triple into average-pooled embedding vectors.

    Args:
        data: Mapping with ``'train'``, ``'validation'`` and ``'test'`` keys,
            each holding an iterable of triple dicts. Every triple provides
            ``query_tokens``, ``positive_document_tokens`` and
            ``negative_document_tokens`` plus the matching original texts
            under ``query``, ``positive_document`` and ``negative_document``.
        embedding_layer: Embedding module passed through to ``average_pool``.

    Returns:
        A dict with the same three split keys; each value is a list of dicts
        containing the three pooled vectors (as plain lists) and the original
        texts.
    """

    def embed_triple(triple):
        # Pool all three token lists first, then assemble the output record.
        query_vec = average_pool(triple['query_tokens'], embedding_layer)
        positive_vec = average_pool(triple['positive_document_tokens'], embedding_layer)
        negative_vec = average_pool(triple['negative_document_tokens'], embedding_layer)
        return {
            'query_vector': query_vec.tolist(),
            'positive_document_vector': positive_vec.tolist(),
            'negative_document_vector': negative_vec.tolist(),
            # The original text is kept alongside the vectors for reference.
            'query': triple['query'],
            'positive_document': triple['positive_document'],
            'negative_document': triple['negative_document'],
        }

    split_names = ('train', 'validation', 'test')
    results = {name: [] for name in split_names}
    for split in split_names:
        print(f"\nProcessing {split} split...")
        results[split].extend(embed_triple(item) for item in tqdm(data[split]))
    return results
66
+
67
def main():
    """Run the pipeline: load inputs, embed the triples, save and summarise.

    Loads the tokenizer mappings and tokenized triples, builds an embedding
    layer sized to the vocabulary, writes the pooled vectors to
    ``triple_embeddings.json`` and prints per-split statistics.
    """
    # Only the word -> id mapping is needed here, to size the vocabulary.
    token_to_id, _ = load_tokenizer()
    triples = load_tokenized_triples()

    embedder = create_embedding_layer(len(token_to_id))
    vectors = process_triples(triples, embedder)

    print("\nSaving processed data...")
    with open('triple_embeddings.json', 'w') as out_file:
        json.dump(vectors, out_file)

    # (label, key) pairs for the per-sample vector length report.
    shape_fields = (
        ("Query vector shape:", 'query_vector'),
        ("Positive doc vector shape:", 'positive_document_vector'),
        ("Negative doc vector shape:", 'negative_document_vector'),
    )
    for split in ('train', 'validation', 'test'):
        print(f"\n{split.upper()} split:")
        entries = vectors[split]
        print(f"Number of processed triples: {len(entries)}")
        if not entries:
            continue
        sample = entries[0]
        print("\nSample vector shapes:")
        for label, key in shape_fields:
            print(label, len(sample[key]))


if __name__ == "__main__":
    main()