surazbhandari committed on
Commit
fe0dabe
·
verified ·
1 Parent(s): fe68020

Sync from GitHub Actions

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. config.json +10 -0
  3. src/inference.py +19 -9
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 30000,
3
+ "d_model": 256,
4
+ "num_heads": 4,
5
+ "num_layers": 4,
6
+ "d_ff": 1024,
7
+ "max_seq_len": 128,
8
+ "pad_token_id": 0,
9
+ "size_name": "mini"
10
+ }
src/inference.py CHANGED
@@ -105,8 +105,21 @@ class EmbeddingModelManager:
105
  # 1. Load config
106
  config_path = model_dir / 'config.json'
107
 
108
- with open(config_path, 'r') as f:
109
- config = json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  # 2. Load tokenizer
112
  tokenizer_path = model_dir / 'tokenizer.json'
@@ -122,7 +135,7 @@ class EmbeddingModelManager:
122
  num_layers=config['num_layers'],
123
  d_ff=config['d_ff'],
124
  max_seq_len=config['max_seq_len'],
125
- pad_token_id=config['pad_token_id']
126
  )
127
 
128
  # Load weights (prefer safetensors)
@@ -155,18 +168,15 @@ class EmbeddingModelManager:
155
  Local directory path containing the downloaded files.
156
  """
157
  try:
158
- from huggingface_hub import hf_hub_download, snapshot_download
159
  except ImportError:
160
  raise ImportError(
161
  "huggingface_hub is required to download models from HuggingFace. "
162
  "Install it with: pip install huggingface_hub"
163
  )
164
 
165
- # Download the full model snapshot
166
- local_dir = snapshot_download(
167
- repo_id=repo_id,
168
- allow_patterns=["config.json", "model.safetensors", "model.pt", "tokenizer.json", "training_info.json"],
169
- )
170
 
171
  return local_dir
172
 
 
105
  # 1. Load config
106
  config_path = model_dir / 'config.json'
107
 
108
+ if config_path.exists():
109
+ with open(config_path, 'r') as f:
110
+ config = json.load(f)
111
+ else:
112
+ # Fallback defaults matching the MiniEmbed-Mini architecture
113
+ print("Warning: config.json not found. Using default MiniEmbed-Mini configuration.")
114
+ config = {
115
+ "vocab_size": 30000,
116
+ "d_model": 256,
117
+ "num_heads": 4,
118
+ "num_layers": 4,
119
+ "d_ff": 1024,
120
+ "max_seq_len": 128,
121
+ "pad_token_id": 0
122
+ }
123
 
124
  # 2. Load tokenizer
125
  tokenizer_path = model_dir / 'tokenizer.json'
 
135
  num_layers=config['num_layers'],
136
  d_ff=config['d_ff'],
137
  max_seq_len=config['max_seq_len'],
138
+ pad_token_id=config.get('pad_token_id', 0)
139
  )
140
 
141
  # Load weights (prefer safetensors)
 
168
  Local directory path containing the downloaded files.
169
  """
170
  try:
171
+ from huggingface_hub import snapshot_download
172
  except ImportError:
173
  raise ImportError(
174
  "huggingface_hub is required to download models from HuggingFace. "
175
  "Install it with: pip install huggingface_hub"
176
  )
177
 
178
+ # Download the full repo (including src/ for inference code)
179
+ local_dir = snapshot_download(repo_id=repo_id)
 
 
 
180
 
181
  return local_dir
182