WildnerveAI committed on
Commit
7602079
·
verified ·
1 Parent(s): 2d637f7

Upload 2 files

Browse files
Files changed (2) hide show
  1. adapter_layer.py +88 -233
  2. dataset.py +3 -7
adapter_layer.py CHANGED
@@ -1,20 +1,20 @@
1
  import os
2
  import sys
 
3
  import torch
 
4
  import logging
 
5
  import traceback
6
- from typing import Dict, Any, Optional, List
7
  import importlib.util
8
- import inspect
9
 
10
  # Directly import the packages that are now installed
11
  try:
12
- import pydantic
13
- import codecarbon
14
  print(f"Successfully using installed dependencies - pydantic: {pydantic.__version__}, codecarbon: {codecarbon.__version__}")
15
  except ImportError as e:
16
  print(f"Error importing dependencies: {e}")
17
- # No mocking anymore - let errors propagate if packages aren't available
18
 
19
  # Import dependency helpers
20
  def is_module_available(module_name):
@@ -27,10 +27,9 @@ def is_module_available(module_name):
27
  logger = logging.getLogger(__name__)
28
 
29
  class WildnerveModelAdapter:
30
- """
31
- Adapter layer that interfaces between HF inference endpoints and the model.
32
- """
33
-
34
  def __init__(self, model_path: str):
35
  self.model_path = model_path
36
  self.tokenizer = None
@@ -39,20 +38,53 @@ class WildnerveModelAdapter:
39
 
40
  # ensure model directory and repo root are first on import path
41
  root = os.getcwd()
42
- for p in (model_path, root):
 
 
 
 
 
 
43
  if p not in sys.path:
44
  sys.path.insert(0, p)
45
 
46
  logger.info(f"Model adapter initialized with path: {model_path}")
47
-
48
- # Initialize components
49
- self._initialize_tokenizer()
50
- self._initialize_model()
51
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def _initialize_tokenizer(self):
53
- """Initialize tokenizer from registry or directly"""
 
 
 
 
 
 
 
 
 
 
54
  try:
55
- # Try to import from service_registry if available
56
  if is_module_available('service_registry'):
57
  from service_registry import registry, TOKENIZER
58
 
@@ -71,47 +103,39 @@ class WildnerveModelAdapter:
71
  except Exception as e:
72
  logger.warning(f"Error initializing original tokenizer: {e}")
73
 
74
- # If we reach here, try the HuggingFace transformers approach
75
  try:
76
- from transformers import AutoTokenizer
77
-
78
- models_to_try = [
79
- "bert-base-uncased",
80
- "distilbert-base-uncased",
81
- "gpt2"
82
- ]
83
-
84
- for model_name in models_to_try:
85
- try:
86
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
87
- logger.info(f"Using transformers AutoTokenizer with {model_name}")
88
- return
89
- except Exception as e:
90
- logger.warning(f"Failed to load {model_name}: {e}")
91
-
92
- except ImportError:
93
- logger.warning("transformers package not available")
94
- raise ImportError("No tokenizer could be initialized")
95
 
96
  def _initialize_model(self):
97
  """Load actual model modules by file path to avoid import issues."""
 
 
98
  try:
99
- # Read config to know which files to try
100
- import json
101
- cfg_path = os.path.join(self.model_path, "config.json")
102
- with open(cfg_path, "r") as f:
103
- cfg = json.load(f)
104
- candidates = cfg.get("SELECTED_MODEL", [])
105
- except Exception:
 
 
 
 
106
  candidates = ["model_Combn.py", "model_Custm.py", "model_PrTr.py"]
107
-
108
- logger.debug(f"Adapter will try files: {candidates}")
109
 
 
110
  for filename in candidates:
111
  fp = os.path.join(self.model_path, filename)
112
- logger.debug(f"Checking existence of {fp}")
113
  if not os.path.isfile(fp):
114
- logger.debug(f"Not found: {filename}")
115
  continue
116
 
117
  name = os.path.splitext(filename)[0]
@@ -120,6 +144,9 @@ class WildnerveModelAdapter:
120
  try:
121
  spec.loader.exec_module(module)
122
  logger.debug(f"Loaded module '{name}' from {filename}")
 
 
 
123
  except Exception as e:
124
  logger.error(f"Failed exec_module for {filename}: {e}", exc_info=True)
125
  continue
@@ -129,10 +156,15 @@ class WildnerveModelAdapter:
129
  if inspect.isclass(getattr(module, c)) and getattr(module, c).__module__ == module.__name__]
130
  logger.debug(f"Classes found in {filename}: {classes}")
131
 
132
- # try primary names
133
  for class_name in ("Wildnerve_tlm01_Hybrid_Model", "Wildnerve_tlm01"):
134
  if hasattr(module, class_name):
135
- self.model = getattr(module, class_name)(**self._build_init_kwargs())
 
 
 
 
 
136
  self.initialized = True
137
  logger.info(f"Instantiated {class_name} from {filename}")
138
  return
@@ -142,7 +174,12 @@ class WildnerveModelAdapter:
142
  obj = getattr(module, cls)
143
  bases = [b.__name__ for b in inspect.getmro(obj)]
144
  if "AbstractModel" in bases:
145
- self.model = obj(**self._build_init_kwargs())
 
 
 
 
 
146
  self.initialized = True
147
  logger.info(f"Instantiated fallback subclass {cls} from {filename}")
148
  return
@@ -215,185 +252,3 @@ class WildnerveModelAdapter:
215
  logger.error(f"Error in generate: {e}")
216
  logger.error(traceback.format_exc())
217
  return f"Error generating response: {str(e)}"
218
-
219
-
220
- # Minimal implementations below - these are only used if absolutely necessary
221
-
222
- class SimpleTokenizer:
223
- """
224
- A minimal tokenizer implementation for fallback purposes.
225
- """
226
- def __init__(self):
227
- self.eos_token_id = 102 # BERT [SEP]
228
- self.pad_token_id = 0 # BERT [PAD]
229
-
230
- # Quick lookup vocabulary (just basic ASCII)
231
- self.vocab = {
232
- "[PAD]": 0,
233
- "[UNK]": 1,
234
- "[CLS]": 2,
235
- "[SEP]": 102,
236
- "[MASK]": 103
237
- }
238
- # Add some basic ASCII
239
- for i in range(97, 123): # a-z
240
- self.vocab[chr(i)] = i + 200
241
- for i in range(65, 91): # A-Z
242
- self.vocab[chr(i)] = i + 300
243
- for i in range(48, 58): # 0-9
244
- self.vocab[chr(i)] = i + 400
245
-
246
- # Reverse vocab for decoding
247
- self.id_to_token = {v: k for k, v in self.vocab.items()}
248
-
249
- def __call__(self, text, return_tensors="pt", truncation=None, padding=None, max_length=None):
250
- """Simple tokenizer implementation"""
251
- if max_length is None:
252
- max_length = 512
253
-
254
- if isinstance(text, list):
255
- # Process batch of texts
256
- tokenized = [self._tokenize(t, max_length) for t in text]
257
- max_len = max(len(t) for t in tokenized)
258
- padded = [t + [self.pad_token_id] * (max_len - len(t)) for t in tokenized]
259
- input_ids = torch.tensor(padded)
260
- else:
261
- # Process single text
262
- tokenized = self._tokenize(text, max_length)
263
- input_ids = torch.tensor([tokenized])
264
-
265
- # Create attention mask (1 for tokens, 0 for padding)
266
- attention_mask = (input_ids != self.pad_token_id).long()
267
-
268
- return {"input_ids": input_ids, "attention_mask": attention_mask}
269
-
270
- def _tokenize(self, text, max_length=512):
271
- """Split text into tokens and convert to IDs"""
272
- # Simple whitespace tokenization
273
- words = text.replace('\n', ' ').split()
274
-
275
- # Truncate if needed
276
- if len(words) > max_length - 2: # Leave room for [CLS] and [SEP]
277
- words = words[:max_length - 2]
278
-
279
- # Convert to IDs
280
- ids = [2] # [CLS]
281
- for word in words:
282
- # Look up in vocab or split into characters if not found
283
- if word in self.vocab:
284
- ids.append(self.vocab[word])
285
- else:
286
- # Character-level fallback
287
- for char in word[:20]: # Limit long words
288
- if char in self.vocab:
289
- ids.append(self.vocab[char])
290
- else:
291
- ids.append(1) # [UNK]
292
- ids.append(102) # [SEP]
293
-
294
- return ids[:max_length]
295
-
296
- def decode(self, token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True):
297
- """Decode token IDs back to text"""
298
- if isinstance(token_ids, torch.Tensor):
299
- token_ids = token_ids.cpu().tolist()
300
-
301
- # Handle list of lists
302
- if isinstance(token_ids[0], list):
303
- return [self.decode(ids) for ids in token_ids]
304
-
305
- # Process single list of ids
306
- text_tokens = []
307
- for token_id in token_ids:
308
- # Skip special tokens if requested
309
- if skip_special_tokens and token_id in (self.pad_token_id, 2, 102, 103):
310
- continue
311
-
312
- # Get token from id
313
- token = self.id_to_token.get(token_id, f"[{token_id}]")
314
- text_tokens.append(token)
315
-
316
- # Join tokens into text
317
- text = " ".join(text_tokens)
318
-
319
- # Clean up spaces around punctuation
320
- if clean_up_tokenization_spaces:
321
- text = text.replace(" .", ".").replace(" ,", ",").replace(" !", "!").replace(" ?", "?")
322
- text = text.replace(" ' ", "'").replace(' " ', '"')
323
-
324
- return text
325
-
326
- # Add compatibility methods for HuggingFace tokenizers
327
- def tokenize(self, text):
328
- """Tokenize text to tokens before conversion to ids"""
329
- return text.split()
330
-
331
- def convert_tokens_to_ids(self, tokens):
332
- """Convert tokens to ids"""
333
- return [self.vocab.get(token, 1) for token in tokens]
334
-
335
- def convert_ids_to_tokens(self, ids):
336
- """Convert ids to tokens"""
337
- return [self.id_to_token.get(id, f"[{id}]") for id in ids]
338
-
339
- def encode(self, text, add_special_tokens=True, **kwargs):
340
- """Encode text to ids"""
341
- tokens = self.tokenize(text)
342
- if add_special_tokens:
343
- tokens = ["[CLS]"] + tokens + ["[SEP]"]
344
- return self.convert_tokens_to_ids(tokens)
345
-
346
-
347
- class SimpleFallbackModel:
348
- """
349
- A minimal model implementation that can generate responses
350
- without requiring complex dependencies.
351
- """
352
- def __init__(self, tokenizer=None):
353
- self.tokenizer = tokenizer or SimpleTokenizer()
354
- self.device = torch.device("cpu")
355
-
356
- # Predefine some response templates
357
- self.responses = {
358
- "greeting": [
359
- "Hello! I'm running in fallback mode. How can I assist you?",
360
- "Hi there! I'm currently operating with limited capabilities.",
361
- "Greetings! I'm in fallback mode but will try to help."
362
- ],
363
- "question": [
364
- "That's an interesting question. In normal operation, I could provide a detailed answer. I'm currently in fallback mode with limited capabilities.",
365
- "Good question. When fully operational, I can provide in-depth answers across many topics.",
366
- "I'd need my full model capabilities to properly answer that question. I'm currently running in fallback mode."
367
- ],
368
- "code": [
369
- "I see you're asking about code. In normal operation, I can write, explain, and debug code in many languages.",
370
- "When fully operational, I can help with programming tasks like writing code, debugging, and explaining algorithms.",
371
- "I'd normally be able to help with this coding task, but I'm currently in fallback mode with limited capabilities."
372
- ],
373
- "default": [
374
- "I appreciate your message. I'm currently operating in fallback mode due to technical issues.",
375
- "Thanks for your input. The regular model is temporarily unavailable. Please try again later.",
376
- "I've received your message but can only provide limited responses in fallback mode."
377
- ]
378
- }
379
-
380
- def generate(self, prompt, **kwargs):
381
- """Generate a simple response based on prompt content"""
382
- # ULTRA-SIMPLIFIED IMPLEMENTATION: No tensor processing at all!
383
- try:
384
- # Just log what type we received for debugging
385
- logger.info(f"SimpleFallbackModel.generate received input of type {type(prompt)}")
386
-
387
- # FIXED: Return a simple string response regardless of input type
388
- # This completely avoids any tensor processing/lower() calls
389
- return """I apologize, but I'm currently operating in fallback mode due to loading issues.
390
-
391
- The system is missing required dependencies (pydantic, codecarbon) needed to load the full model.
392
- The administrator should install these packages to enable full functionality.
393
-
394
- Please try again later when the system has been properly configured."""
395
-
396
- except Exception as e:
397
- # This should never happen now, but just in case
398
- logger.error(f"Error in simple generate (this should be impossible): {e}")
399
- return "System is in emergency fallback mode. Please contact administrator."
 
1
  import os
2
  import sys
3
+ import json
4
  import torch
5
+ import inspect
6
  import logging
7
+ import pydantic
8
  import traceback
9
+ import codecarbon
10
  import importlib.util
11
+ from typing import Dict, Any, Optional, List
12
 
13
  # Directly import the packages that are now installed
14
  try:
 
 
15
  print(f"Successfully using installed dependencies - pydantic: {pydantic.__version__}, codecarbon: {codecarbon.__version__}")
16
  except ImportError as e:
17
  print(f"Error importing dependencies: {e}")
 
18
 
19
  # Import dependency helpers
20
  def is_module_available(module_name):
 
27
  logger = logging.getLogger(__name__)
28
 
29
  class WildnerveModelAdapter:
30
+ """Adapter layer that interfaces between HF inference endpoints and the model."""
31
+ RETRY_COUNT = 5
32
+
 
33
  def __init__(self, model_path: str):
34
  self.model_path = model_path
35
  self.tokenizer = None
 
38
 
39
  # ensure model directory and repo root are first on import path
40
  root = os.getcwd()
41
+ paths = []
42
+ if os.path.isdir(model_path):
43
+ paths.append(model_path)
44
+ else:
45
+ logger.warning(f"Model path not found or not a directory: {model_path}")
46
+ paths.append(root)
47
+ for p in paths:
48
  if p not in sys.path:
49
  sys.path.insert(0, p)
50
 
51
  logger.info(f"Model adapter initialized with path: {model_path}")
52
+
53
+ # Initialize components with retry logic
54
+ for attempt in range(1, self.RETRY_COUNT + 1):
55
+ try:
56
+ self._initialize_tokenizer()
57
+ logger.info("Tokenizer initialized")
58
+ break
59
+ except Exception as e:
60
+ logger.warning(f"Tokenizer init attempt {attempt}/{self.RETRY_COUNT} failed: {e}")
61
+ logger.debug("Tokenizer init stack trace:", exc_info=True)
62
+ if attempt == self.RETRY_COUNT:
63
+ raise
64
+
65
+ for attempt in range(1, self.RETRY_COUNT + 1):
66
+ try:
67
+ self._initialize_model()
68
+ logger.info("Model initialized")
69
+ break
70
+ except Exception as e:
71
+ logger.warning(f"Model init attempt {attempt}/{self.RETRY_COUNT} failed: {e}")
72
+ logger.debug("Model init stack trace:", exc_info=True)
73
+ if attempt == self.RETRY_COUNT:
74
+ raise
75
  def _initialize_tokenizer(self):
76
+ """Initialize tokenizer via our local wrapper first, then fallback."""
77
+ try:
78
+ # primary: use our tokenizer.py
79
+ from tokenizer import TokenizerWrapper
80
+ self.tokenizer = TokenizerWrapper()
81
+ logger.info("Using TokenizerWrapper from tokenizer.py")
82
+ return
83
+ except Exception as e:
84
+ logger.warning(f"TokenizerWrapper init failed: {e}")
85
+
86
+ # Try to import from service_registry if available
87
  try:
 
88
  if is_module_available('service_registry'):
89
  from service_registry import registry, TOKENIZER
90
 
 
103
  except Exception as e:
104
  logger.warning(f"Error initializing original tokenizer: {e}")
105
 
106
+ # Final fallback: use the get_tokenizer() helper from tokenizer.py
107
  try:
108
+ from tokenizer import get_tokenizer
109
+ self.tokenizer = get_tokenizer()
110
+ logger.info("Using get_tokenizer() fallback")
111
+ return
112
+ except Exception as e:
113
+ logger.error(f"No tokenizer could be initialized: {e}")
114
+ raise ImportError("Tokenizer initialization failed")
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  def _initialize_model(self):
117
  """Load actual model modules by file path to avoid import issues."""
118
+ # Read the candidate model files from config.json; any read/parse failure falls back to the default list
119
+ cfg_file = os.path.join(self.model_path, "config.json")
120
  try:
121
+ with open(cfg_file, "r") as f:
122
+ raw = json.load(f)
123
+ candidates = raw.get("SELECTED_MODEL", [])
124
+ if not isinstance(candidates, list):
125
+ logger.warning(f"SELECTED_MODEL not a list, wrapping: {candidates}")
126
+ candidates = [candidates]
127
+ except (FileNotFoundError, json.JSONDecodeError) as e:
128
+ logger.warning(f"Could not read/parse config.json ({e}), using default model list")
129
+ candidates = ["model_Combn.py", "model_Custm.py", "model_PrTr.py"]
130
+ except Exception as e:
131
+ logger.error(f"Unexpected error loading config.json: {e}", exc_info=True)
132
  candidates = ["model_Combn.py", "model_Custm.py", "model_PrTr.py"]
 
 
133
 
134
+ logger.debug(f"Adapter will try files: {candidates}")
135
  for filename in candidates:
136
  fp = os.path.join(self.model_path, filename)
 
137
  if not os.path.isfile(fp):
138
+ logger.debug(f"Not found: {fp}")
139
  continue
140
 
141
  name = os.path.splitext(filename)[0]
 
144
  try:
145
  spec.loader.exec_module(module)
146
  logger.debug(f"Loaded module '{name}' from {filename}")
147
+ except ImportError as e:
148
+ logger.error(f"Missing dependency in {filename}: {e}", exc_info=True)
149
+ continue
150
  except Exception as e:
151
  logger.error(f"Failed exec_module for {filename}: {e}", exc_info=True)
152
  continue
 
156
  if inspect.isclass(getattr(module, c)) and getattr(module, c).__module__ == module.__name__]
157
  logger.debug(f"Classes found in {filename}: {classes}")
158
 
159
+ # Instantiate first matching class
160
  for class_name in ("Wildnerve_tlm01_Hybrid_Model", "Wildnerve_tlm01"):
161
  if hasattr(module, class_name):
162
+ try:
163
+ inst = getattr(module, class_name)(**self._build_init_kwargs())
164
+ except TypeError as e:
165
+ logger.error(f"Instantiation failed for {class_name}: {e}", exc_info=True)
166
+ continue
167
+ self.model = inst
168
  self.initialized = True
169
  logger.info(f"Instantiated {class_name} from {filename}")
170
  return
 
174
  obj = getattr(module, cls)
175
  bases = [b.__name__ for b in inspect.getmro(obj)]
176
  if "AbstractModel" in bases:
177
+ try:
178
+ inst = obj(**self._build_init_kwargs())
179
+ except Exception as e:
180
+ logger.error(f"Fallback instantiation failed for {cls}: {e}", exc_info=True)
181
+ continue
182
+ self.model = inst
183
  self.initialized = True
184
  logger.info(f"Instantiated fallback subclass {cls} from {filename}")
185
  return
 
252
  logger.error(f"Error in generate: {e}")
253
  logger.error(traceback.format_exc())
254
  return f"Error generating response: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset.py CHANGED
@@ -4,18 +4,14 @@ import csv
4
  import json
5
  import torch
6
  import logging
 
 
 
7
  from torch.utils.data import Dataset
8
  from typing import List, Dict, Any, Optional, Union
9
- from functools import wraps
10
- from time import time
11
 
12
  logger = logging.getLogger(__name__)
13
 
14
- # Attempt to import Preprocessor; fall back if missing
15
- try:
16
- from preprocess import Preprocessor
17
- except ImportError:
18
- Preprocessor = None
19
 
20
  def safe_file_operation(func):
21
  """Decorator to safely handle file operations with timeout"""
 
4
  import json
5
  import torch
6
  import logging
7
+ from time import time
8
+ from functools import wraps
9
+ from preprocess import Preprocessor
10
  from torch.utils.data import Dataset
11
  from typing import List, Dict, Any, Optional, Union
 
 
12
 
13
  logger = logging.getLogger(__name__)
14
 
 
 
 
 
 
15
 
16
  def safe_file_operation(func):
17
  """Decorator to safely handle file operations with timeout"""