EvolphTech
/

Wildnerve-tlm01_Hybrid_Model

Text Generation

wildnerve_tlm01

Model card · Files and versions

xet

Community

WildnerveAI committed on May 8, 2025

Commit

7602079

verified ·

1 Parent(s): 2d637f7

Upload 2 files

Browse files

Files changed (2) hide show

adapter_layer.py +88 -233
dataset.py +3 -7

adapter_layer.py CHANGED Viewed

@@ -1,20 +1,20 @@
 import os
 import sys
 import torch
 import logging
 import traceback
-from typing import Dict, Any, Optional, List
 import importlib.util
-import inspect
 # Directly import the packages that are now installed
 try:
-    import pydantic
-    import codecarbon
     print(f"Successfully using installed dependencies - pydantic: {pydantic.__version__}, codecarbon: {codecarbon.__version__}")
 except ImportError as e:
     print(f"Error importing dependencies: {e}")
-    # No mocking anymore - let errors propagate if packages aren't available
 # Import dependency helpers
 def is_module_available(module_name):
@@ -27,10 +27,9 @@ def is_module_available(module_name):
 logger = logging.getLogger(__name__)
 class WildnerveModelAdapter:
-    """
-    Adapter layer that interfaces between HF inference endpoints and the model.
-    """
     def __init__(self, model_path: str):
         self.model_path = model_path
         self.tokenizer = None
@@ -39,20 +38,53 @@ class WildnerveModelAdapter:
         # ensure model directory and repo root are first on import path
         root = os.getcwd()
-        for p in (model_path, root):
             if p not in sys.path:
                 sys.path.insert(0, p)
         logger.info(f"Model adapter initialized with path: {model_path}")
-        # Initialize components
-        self._initialize_tokenizer()
-        self._initialize_model()
     def _initialize_tokenizer(self):
-        """Initialize tokenizer from registry or directly"""
         try:
-            # Try to import from service_registry if available
             if is_module_available('service_registry'):
                 from service_registry import registry, TOKENIZER
@@ -71,47 +103,39 @@ class WildnerveModelAdapter:
         except Exception as e:
             logger.warning(f"Error initializing original tokenizer: {e}")
-        # If we reach here, try the HuggingFace transformers approach
         try:
-            from transformers import AutoTokenizer
-            models_to_try = [
-                "bert-base-uncased",
-                "distilbert-base-uncased",
-                "gpt2"
-            ]
-            for model_name in models_to_try:
-                try:
-                    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-                    logger.info(f"Using transformers AutoTokenizer with {model_name}")
-                    return
-                except Exception as e:
-                    logger.warning(f"Failed to load {model_name}: {e}")
-        except ImportError:
-            logger.warning("transformers package not available")
-            raise ImportError("No tokenizer could be initialized")
     def _initialize_model(self):
         """Load actual model modules by file path to avoid import issues."""
         try:
-            # Read config to know which files to try
-            import json
-            cfg_path = os.path.join(self.model_path, "config.json")
-            with open(cfg_path, "r") as f:
-                cfg = json.load(f)
-            candidates = cfg.get("SELECTED_MODEL", [])
-        except Exception:
             candidates = ["model_Combn.py", "model_Custm.py", "model_PrTr.py"]
-        logger.debug(f"Adapter will try files: {candidates}")
         for filename in candidates:
             fp = os.path.join(self.model_path, filename)
-            logger.debug(f"Checking existence of {fp}")
             if not os.path.isfile(fp):
-                logger.debug(f"Not found: {filename}")
                 continue
             name = os.path.splitext(filename)[0]
@@ -120,6 +144,9 @@ class WildnerveModelAdapter:
             try:
                 spec.loader.exec_module(module)
                 logger.debug(f"Loaded module '{name}' from {filename}")
             except Exception as e:
                 logger.error(f"Failed exec_module for {filename}: {e}", exc_info=True)
                 continue
@@ -129,10 +156,15 @@ class WildnerveModelAdapter:
                        if inspect.isclass(getattr(module, c)) and getattr(module, c).__module__ == module.__name__]
             logger.debug(f"Classes found in {filename}: {classes}")
-            # try primary names
             for class_name in ("Wildnerve_tlm01_Hybrid_Model", "Wildnerve_tlm01"):
                 if hasattr(module, class_name):
-                    self.model = getattr(module, class_name)(**self._build_init_kwargs())
                     self.initialized = True
                     logger.info(f"Instantiated {class_name} from {filename}")
                     return
@@ -142,7 +174,12 @@ class WildnerveModelAdapter:
                 obj = getattr(module, cls)
                 bases = [b.__name__ for b in inspect.getmro(obj)]
                 if "AbstractModel" in bases:
-                    self.model = obj(**self._build_init_kwargs())
                     self.initialized = True
                     logger.info(f"Instantiated fallback subclass {cls} from {filename}")
                     return
@@ -215,185 +252,3 @@ class WildnerveModelAdapter:
             logger.error(f"Error in generate: {e}")
             logger.error(traceback.format_exc())
             return f"Error generating response: {str(e)}"
-# Minimal implementations below - these are only used if absolutely necessary
-class SimpleTokenizer:
-    """
-    A minimal tokenizer implementation for fallback purposes.
-    """
-    def __init__(self):
-        self.eos_token_id = 102  # BERT [SEP]
-        self.pad_token_id = 0    # BERT [PAD]
-        # Quick lookup vocabulary (just basic ASCII)
-        self.vocab = {
-            "[PAD]": 0,
-            "[UNK]": 1,
-            "[CLS]": 2,
-            "[SEP]": 102,
-            "[MASK]": 103
-        }
-        # Add some basic ASCII
-        for i in range(97, 123):  # a-z
-            self.vocab[chr(i)] = i + 200
-        for i in range(65, 91):   # A-Z
-            self.vocab[chr(i)] = i + 300
-        for i in range(48, 58):   # 0-9
-            self.vocab[chr(i)] = i + 400
-        # Reverse vocab for decoding
-        self.id_to_token = {v: k for k, v in self.vocab.items()}
-    def __call__(self, text, return_tensors="pt", truncation=None, padding=None, max_length=None):
-        """Simple tokenizer implementation"""
-        if max_length is None:
-            max_length = 512
-        if isinstance(text, list):
-            # Process batch of texts
-            tokenized = [self._tokenize(t, max_length) for t in text]
-            max_len = max(len(t) for t in tokenized)
-            padded = [t + [self.pad_token_id] * (max_len - len(t)) for t in tokenized]
-            input_ids = torch.tensor(padded)
-        else:
-            # Process single text
-            tokenized = self._tokenize(text, max_length)
-            input_ids = torch.tensor([tokenized])
-        # Create attention mask (1 for tokens, 0 for padding)
-        attention_mask = (input_ids != self.pad_token_id).long()
-        return {"input_ids": input_ids, "attention_mask": attention_mask}
-    def _tokenize(self, text, max_length=512):
-        """Split text into tokens and convert to IDs"""
-        # Simple whitespace tokenization
-        words = text.replace('\n', ' ').split()
-        # Truncate if needed
-        if len(words) > max_length - 2:  # Leave room for [CLS] and [SEP]
-            words = words[:max_length - 2]
-        # Convert to IDs
-        ids = [2]  # [CLS]
-        for word in words:
-            # Look up in vocab or split into characters if not found
-            if word in self.vocab:
-                ids.append(self.vocab[word])
-            else:
-                # Character-level fallback
-                for char in word[:20]:  # Limit long words
-                    if char in self.vocab:
-                        ids.append(self.vocab[char])
-                    else:
-                        ids.append(1)  # [UNK]
-        ids.append(102)  # [SEP]
-        return ids[:max_length]
-    def decode(self, token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True):
-        """Decode token IDs back to text"""
-        if isinstance(token_ids, torch.Tensor):
-            token_ids = token_ids.cpu().tolist()
-        # Handle list of lists
-        if isinstance(token_ids[0], list):
-            return [self.decode(ids) for ids in token_ids]
-        # Process single list of ids
-        text_tokens = []
-        for token_id in token_ids:
-            # Skip special tokens if requested
-            if skip_special_tokens and token_id in (self.pad_token_id, 2, 102, 103):
-                continue
-            # Get token from id
-            token = self.id_to_token.get(token_id, f"[{token_id}]")
-            text_tokens.append(token)
-        # Join tokens into text
-        text = " ".join(text_tokens)
-        # Clean up spaces around punctuation
-        if clean_up_tokenization_spaces:
-            text = text.replace(" .", ".").replace(" ,", ",").replace(" !", "!").replace(" ?", "?")
-            text = text.replace(" ' ", "'").replace(' " ', '"')
-        return text
-    # Add compatibility methods for HuggingFace tokenizers
-    def tokenize(self, text):
-        """Tokenize text to tokens before conversion to ids"""
-        return text.split()
-    def convert_tokens_to_ids(self, tokens):
-        """Convert tokens to ids"""
-        return [self.vocab.get(token, 1) for token in tokens]
-    def convert_ids_to_tokens(self, ids):
-        """Convert ids to tokens"""
-        return [self.id_to_token.get(id, f"[{id}]") for id in ids]
-    def encode(self, text, add_special_tokens=True, **kwargs):
-        """Encode text to ids"""
-        tokens = self.tokenize(text)
-        if add_special_tokens:
-            tokens = ["[CLS]"] + tokens + ["[SEP]"]
-        return self.convert_tokens_to_ids(tokens)
-class SimpleFallbackModel:
-    """
-    A minimal model implementation that can generate responses
-    without requiring complex dependencies.
-    """
-    def __init__(self, tokenizer=None):
-        self.tokenizer = tokenizer or SimpleTokenizer()
-        self.device = torch.device("cpu")
-        # Predefine some response templates
-        self.responses = {
-            "greeting": [
-                "Hello! I'm running in fallback mode. How can I assist you?",
-                "Hi there! I'm currently operating with limited capabilities.",
-                "Greetings! I'm in fallback mode but will try to help."
-            ],
-            "question": [
-                "That's an interesting question. In normal operation, I could provide a detailed answer. I'm currently in fallback mode with limited capabilities.",
-                "Good question. When fully operational, I can provide in-depth answers across many topics.",
-                "I'd need my full model capabilities to properly answer that question. I'm currently running in fallback mode."
-            ],
-            "code": [
-                "I see you're asking about code. In normal operation, I can write, explain, and debug code in many languages.",
-                "When fully operational, I can help with programming tasks like writing code, debugging, and explaining algorithms.",
-                "I'd normally be able to help with this coding task, but I'm currently in fallback mode with limited capabilities."
-            ],
-            "default": [
-                "I appreciate your message. I'm currently operating in fallback mode due to technical issues.",
-                "Thanks for your input. The regular model is temporarily unavailable. Please try again later.",
-                "I've received your message but can only provide limited responses in fallback mode."
-            ]
-        }
-    def generate(self, prompt, **kwargs):
-        """Generate a simple response based on prompt content"""
-        # ULTRA-SIMPLIFIED IMPLEMENTATION: No tensor processing at all!
-        try:
-            # Just log what type we received for debugging
-            logger.info(f"SimpleFallbackModel.generate received input of type {type(prompt)}")
-            # FIXED: Return a simple string response regardless of input type
-            # This completely avoids any tensor processing/lower() calls
-            return """I apologize, but I'm currently operating in fallback mode due to loading issues.
-The system is missing required dependencies (pydantic, codecarbon) needed to load the full model.
-The administrator should install these packages to enable full functionality.
-Please try again later when the system has been properly configured."""
-        except Exception as e:
-            # This should never happen now, but just in case
-            logger.error(f"Error in simple generate (this should be impossible): {e}")
-            return "System is in emergency fallback mode. Please contact administrator."

 import os
 import sys
+import json
 import torch
+import inspect
 import logging
+import pydantic
 import traceback
+import codecarbon
 import importlib.util
+from typing import Dict, Any, Optional, List
 # Directly import the packages that are now installed
 try:
     print(f"Successfully using installed dependencies - pydantic: {pydantic.__version__}, codecarbon: {codecarbon.__version__}")
 except ImportError as e:
     print(f"Error importing dependencies: {e}")
 # Import dependency helpers
 def is_module_available(module_name):
 logger = logging.getLogger(__name__)
 class WildnerveModelAdapter:
+    """Adapter layer that interfaces between HF inference endpoints and the model."""
+    RETRY_COUNT = 5
     def __init__(self, model_path: str):
         self.model_path = model_path
         self.tokenizer = None
         # ensure model directory and repo root are first on import path
         root = os.getcwd()
+        paths = []
+        if os.path.isdir(model_path):
+            paths.append(model_path)
+        else:
+            logger.warning(f"Model path not found or not a directory: {model_path}")
+        paths.append(root)
+        for p in paths:
             if p not in sys.path:
                 sys.path.insert(0, p)
         logger.info(f"Model adapter initialized with path: {model_path}")
+        # Initialize components with retry logic
+        for attempt in range(1, self.RETRY_COUNT + 1):
+            try:
+                self._initialize_tokenizer()
+                logger.info("Tokenizer initialized")
+                break
+            except Exception as e:
+                logger.warning(f"Tokenizer init attempt {attempt}/{self.RETRY_COUNT} failed: {e}")
+                logger.debug("Tokenizer init stack trace:", exc_info=True)
+                if attempt == self.RETRY_COUNT:
+                    raise
+        for attempt in range(1, self.RETRY_COUNT + 1):
+            try:
+                self._initialize_model()
+                logger.info("Model initialized")
+                break
+            except Exception as e:
+                logger.warning(f"Model init attempt {attempt}/{self.RETRY_COUNT} failed: {e}")
+                logger.debug("Model init stack trace:", exc_info=True)
+                if attempt == self.RETRY_COUNT:
+                    raise
     def _initialize_tokenizer(self):
+        """Initialize tokenizer via our local wrapper first, then fallback."""
+        try:
+            # primary: use our tokenizer.py
+            from tokenizer import TokenizerWrapper
+            self.tokenizer = TokenizerWrapper()
+            logger.info("Using TokenizerWrapper from tokenizer.py")
+            return
+        except Exception as e:
+            logger.warning(f"TokenizerWrapper init failed: {e}")
+        # Try to import from service_registry if available
         try:
             if is_module_available('service_registry'):
                 from service_registry import registry, TOKENIZER
         except Exception as e:
             logger.warning(f"Error initializing original tokenizer: {e}")
+        # Final fallback: use your get_tokenizer wrapper
         try:
+            from tokenizer import get_tokenizer
+            self.tokenizer = get_tokenizer()
+            logger.info("Using get_tokenizer() fallback")
+            return
+        except Exception as e:
+            logger.error(f"No tokenizer could be initialized: {e}")
+            raise ImportError("Tokenizer initialization failed")
     def _initialize_model(self):
         """Load actual model modules by file path to avoid import issues."""
+        # Parse config.json more narrowly
+        cfg_file = os.path.join(self.model_path, "config.json")
         try:
+            with open(cfg_file, "r") as f:
+                raw = json.load(f)
+            candidates = raw.get("SELECTED_MODEL", [])
+            if not isinstance(candidates, list):
+                logger.warning(f"SELECTED_MODEL not a list, wrapping: {candidates}")
+                candidates = [candidates]
+        except (FileNotFoundError, json.JSONDecodeError) as e:
+            logger.warning(f"Could not read/parse config.json ({e}), using default model list")
+            candidates = ["model_Combn.py", "model_Custm.py", "model_PrTr.py"]
+        except Exception as e:
+            logger.error(f"Unexpected error loading config.json: {e}", exc_info=True)
             candidates = ["model_Combn.py", "model_Custm.py", "model_PrTr.py"]
+        logger.debug(f"Adapter will try files: {candidates}")
         for filename in candidates:
             fp = os.path.join(self.model_path, filename)
             if not os.path.isfile(fp):
+                logger.debug(f"Not found: {fp}")
                 continue
             name = os.path.splitext(filename)[0]
             try:
                 spec.loader.exec_module(module)
                 logger.debug(f"Loaded module '{name}' from {filename}")
+            except ImportError as e:
+                logger.error(f"Missing dependency in {filename}: {e}", exc_info=True)
+                continue
             except Exception as e:
                 logger.error(f"Failed exec_module for {filename}: {e}", exc_info=True)
                 continue
                        if inspect.isclass(getattr(module, c)) and getattr(module, c).__module__ == module.__name__]
             logger.debug(f"Classes found in {filename}: {classes}")
+            # Instantiate first matching class
             for class_name in ("Wildnerve_tlm01_Hybrid_Model", "Wildnerve_tlm01"):
                 if hasattr(module, class_name):
+                    try:
+                        inst = getattr(module, class_name)(**self._build_init_kwargs())
+                    except TypeError as e:
+                        logger.error(f"Instantiation failed for {class_name}: {e}", exc_info=True)
+                        continue
+                    self.model = inst
                     self.initialized = True
                     logger.info(f"Instantiated {class_name} from {filename}")
                     return
                 obj = getattr(module, cls)
                 bases = [b.__name__ for b in inspect.getmro(obj)]
                 if "AbstractModel" in bases:
+                    try:
+                        inst = obj(**self._build_init_kwargs())
+                    except Exception as e:
+                        logger.error(f"Fallback instantiation failed for {cls}: {e}", exc_info=True)
+                        continue
+                    self.model = inst
                     self.initialized = True
                     logger.info(f"Instantiated fallback subclass {cls} from {filename}")
                     return
             logger.error(f"Error in generate: {e}")
             logger.error(traceback.format_exc())
             return f"Error generating response: {str(e)}"

dataset.py CHANGED Viewed

@@ -4,18 +4,14 @@ import csv
 import json
 import torch
 import logging
 from torch.utils.data import Dataset
 from typing import List, Dict, Any, Optional, Union
-from functools import wraps
-from time import time
 logger = logging.getLogger(__name__)
-# Attempt to import Preprocessor; fall back if missing
-try:
-    from preprocess import Preprocessor
-except ImportError:
-    Preprocessor = None
 def safe_file_operation(func):
     """Decorator to safely handle file operations with timeout"""

 import json
 import torch
 import logging
+from time import time
+from functools import wraps
+from preprocess import Preprocessor
 from torch.utils.data import Dataset
 from typing import List, Dict, Any, Optional, Union
 logger = logging.getLogger(__name__)
 def safe_file_operation(func):
     """Decorator to safely handle file operations with timeout"""