Upload 8 files
Browse files- config.py +49 -45
- dataloader.py +340 -0
- dataset.py +435 -0
- dependency_helpers.py +104 -98
- handler.py +33 -6
- model_Custm.py +6 -5
- service_registry.py +45 -0
- transformer_patches.py +23 -0
config.py
CHANGED
|
@@ -6,13 +6,17 @@ import argparse
|
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Optional, Dict, List, Literal, Any
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
# --- gracefully handle missing pydantic ---
|
| 10 |
try:
|
| 11 |
from pydantic import BaseModel, Field, ValidationError, ConfigDict
|
| 12 |
except ImportError:
|
| 13 |
-
import
|
|
|
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
-
logger.warning("pydantic not available, using dummy
|
| 16 |
class BaseModel:
|
| 17 |
def __init__(self, **kwargs):
|
| 18 |
for k, v in kwargs.items(): setattr(self, k, v)
|
|
@@ -103,82 +107,82 @@ SPECIALIZATIONS = [
|
|
| 103 |
# Define DATASET_PATHS so that each specialization is a string or a list of strings
|
| 104 |
DATASET_PATHS = {
|
| 105 |
"computer": [
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
],
|
| 116 |
|
| 117 |
"cpp": [
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
],
|
| 123 |
|
| 124 |
"java": [
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
],
|
| 130 |
|
| 131 |
"go": [
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
],
|
| 136 |
|
| 137 |
"javascript": [
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
],
|
| 143 |
|
| 144 |
"nim": [
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
],
|
| 151 |
|
| 152 |
"python": [
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
],
|
| 158 |
|
| 159 |
"rust": [
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
],
|
| 165 |
|
| 166 |
"solidity": [
|
| 167 |
-
|
| 168 |
],
|
| 169 |
|
| 170 |
"mathematics": [
|
| 171 |
-
|
| 172 |
-
|
| 173 |
],
|
| 174 |
|
| 175 |
"physics": [
|
| 176 |
-
|
| 177 |
-
|
| 178 |
],
|
| 179 |
|
| 180 |
"other_information": [
|
| 181 |
-
|
| 182 |
]
|
| 183 |
}
|
| 184 |
|
|
|
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Optional, Dict, List, Literal, Any
|
| 8 |
|
| 9 |
+
# Import dependency helpers first (installs fallbacks for missing packages).
import dependency_helpers

# --- gracefully handle missing pydantic ---
try:
    from pydantic import BaseModel, Field, ValidationError, ConfigDict
except ImportError:
    # The import error should normally be handled by dependency_helpers,
    # but keep a self-contained fallback just to be safe.
    logger = logging.getLogger(__name__)
    logger.warning("pydantic not available, using dummy implementation")

    class BaseModel:
        """Minimal stand-in for pydantic.BaseModel: stores kwargs as attributes."""
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    # BUGFIX: the try-import above pulls four names, but the fallback only
    # defined BaseModel -- any later use of Field/ValidationError/ConfigDict
    # raised NameError.  Provide minimal stand-ins for all of them.
    def Field(default=None, **kwargs):
        """Stand-in for pydantic.Field: simply returns the default value."""
        return default

    class ValidationError(Exception):
        """Stand-in raised in place of pydantic.ValidationError."""

    def ConfigDict(**kwargs):
        """Stand-in for pydantic.ConfigDict: returns the options as a dict."""
        return dict(**kwargs)
|
|
|
|
| 107 |
# DATASET_PATHS maps each specialization to the list of JSON files backing
# it.  Every entry is a plain string path so downstream loaders can consume
# it without caring about pathlib objects.
_DATA_ROOT = DATA_DIR / "data"

def _data_files(*names):
    """Resolve JSON file stems under the shared data directory to string paths."""
    return [str(_DATA_ROOT / f"{name}.json") for name in names]

DATASET_PATHS = {
    "computer": _data_files(
        "computer_advanced_debugging",
        "computer_agenticAI",
        "computer_architecture",
        "computer_cloud_security",
        "computer_creativity",
        "computer_crossplatform",
        "computer_cybersecurity",
        "computer_error_handling_examples",
        "computer_gitInstruct",
    ),
    "cpp": _data_files(
        "cpp_advanced_debugging",
        "cpp_blockchain",
        "cpp_mbcppp",
        "cpp_programming",
    ),
    "java": _data_files(
        "java_ai_language_model",
        "java_blockchain",
        "java_mbjp",
        "java_transformer_language_model",
    ),
    "go": _data_files(
        "golang_ai_language_model",
        "golang_mbgp",
        "golang_programming",
    ),
    "javascript": _data_files(
        "javascript_chatbot",
        "javascript_n_Typescript_frontend",
        "javascript_n_Typescript_backend",
        "javascript_programming",
    ),
    "nim": _data_files(
        "nim_ai_language_model",
        "nim_blockchain",
        "nim_chatbot",
        "nim_mbnp",
        "nim_programming",
    ),
    "python": _data_files(
        "python_chatbot_guide",
        "python_mbpp",
        "python_programming",
        "python_transformer_model",
    ),
    "rust": _data_files(
        "rust_ai_language_model",
        "rust_blockchain",
        "rust_mbrp",
        "rust_programming",
    ),
    "solidity": _data_files("solidity_programming"),
    "mathematics": _data_files("mathematics", "mathematics_training"),
    "physics": _data_files("physics_n_engineering", "physics_n_engineering_applied"),
    "other_information": _data_files("other_information"),
}
|
| 188 |
|
dataloader.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Data loader factory and utilities for transformer models.
"""
import json
import logging
import os
import zlib
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset

from config import app_config
from datagrower.Crawl4MyAI import AdvancedWebCrawler
from datagrower.Webconverter import WebConverter
from dataset import DatasetManager
from tokenizer import TokenizerWrapper
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
class TransformerDataset(Dataset):
    """Base dataset for transformer models that handles multiple input formats.

    Records are loaded eagerly from a CSV, JSON, or plain-text file and
    tokenized lazily in ``__getitem__``.
    """

    def __init__(
        self,
        data_path: str,
        tokenizer: "TokenizerWrapper",
        max_length: int = 512,
        format_type: Optional[str] = None
    ):
        """
        Initialize dataset.

        Args:
            data_path: Path to the data file
            tokenizer: Tokenizer to use for encoding
            max_length: Maximum sequence length
            format_type: Format of data file ('csv', 'json', 'txt');
                auto-detected from the file extension when omitted.
        """
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.format_type = format_type or self._detect_format(data_path)

        # Load all records up front; tokenization happens per item.
        self.data = self._load_data()
        logger.info(f"Loaded {len(self.data)} samples from {data_path}")

    def _detect_format(self, path: str) -> str:
        """Detect file format from extension, defaulting to CSV."""
        ext = os.path.splitext(path)[1].lower().lstrip('.')
        if ext in ['csv']:
            return 'csv'
        elif ext in ['json']:
            return 'json'
        elif ext in ['txt', 'text']:
            return 'txt'
        else:
            logger.warning(f"Unknown file extension: {ext}, defaulting to CSV")
            return 'csv'

    def _load_data(self) -> List[Dict[str, Any]]:
        """Load data based on format type.

        Raises:
            FileNotFoundError: If ``data_path`` does not exist.
            ValueError: If ``format_type`` is not supported.
        """
        if not os.path.exists(self.data_path):
            raise FileNotFoundError(f"Data file not found: {self.data_path}")

        try:
            if self.format_type == 'csv':
                return self._load_csv()
            elif self.format_type == 'json':
                return self._load_json()
            elif self.format_type == 'txt':
                return self._load_txt()
            else:
                raise ValueError(f"Unsupported format type: {self.format_type}")
        except Exception as e:
            logger.error(f"Error loading data from {self.data_path}: {e}")
            raise

    def _load_csv(self) -> List[Dict[str, Any]]:
        """Load data from CSV file, coercing columns to 'text'/'label'."""
        df = pd.read_csv(self.data_path)
        if 'text' not in df.columns:
            # Prefer any column whose header mentions text/content.
            text_cols = [col for col in df.columns if 'text' in col.lower() or 'content' in col.lower()]
            if text_cols:
                df = df.rename(columns={text_cols[0]: 'text'})
            else:
                # Fall back to treating the first column as text.
                df = df.rename(columns={df.columns[0]: 'text'})

        if 'label' not in df.columns and len(df.columns) > 1:
            # Use the second column as label if present.
            df = df.rename(columns={df.columns[1]: 'label'})

        return df.to_dict('records')

    def _load_json(self) -> List[Dict[str, Any]]:
        """Load data from JSON file, normalizing several container shapes."""
        with open(self.data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if isinstance(data, list):
            # Already in list format.
            return data
        elif isinstance(data, dict):
            # Known wrapper keys used by our data files.
            if 'data' in data:
                return data['data']
            elif 'examples' in data:
                return data['examples']
            elif 'user_inputs' in data:
                return data['user_inputs']
            else:
                # Flat mapping: one record per key/value pair.
                return [{'text': str(value), 'id': key} for key, value in data.items()]
        else:
            raise ValueError(f"Unsupported JSON data structure: {type(data)}")

    def _load_txt(self) -> List[Dict[str, Any]]:
        """Load data from text file, one sample per non-empty line."""
        with open(self.data_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        return [{'text': line.strip(), 'id': i} for i, line in enumerate(lines) if line.strip()]

    def __len__(self) -> int:
        """Get dataset length."""
        return len(self.data)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Tokenize and return the sample at ``idx``."""
        item = self.data[idx]
        text = item.get('text', '')

        if not text:
            text = " "  # Use space to avoid tokenizer errors

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Squeeze away the batch dimension added by return_tensors="pt".
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        label = item.get('label', 0)
        if isinstance(label, str):
            try:
                label = float(label)
            except ValueError:
                # BUGFIX: hash() on strings is salted per process
                # (PYTHONHASHSEED), so categorical labels would map to
                # different ids on every run.  Use CRC32 for a stable id,
                # still limited to 100 categories.
                label = zlib.crc32(label.encode('utf-8')) % 100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }
|
| 171 |
+
|
| 172 |
+
def prepare_data_loaders_extended(
    data_path: Union[str, Dict[str, str]],
    tokenizer: Any,
    batch_size: int = 16,
    max_length: int = 512,
    val_split: float = 0.1,
    format_type: Optional[str] = None,
    num_workers: int = 0
) -> Dict[str, DataLoader]:
    """
    Create data loaders for training and validation.

    Args:
        data_path: Path to data file or dictionary mapping split to path
        tokenizer: Tokenizer to use for encoding
        batch_size: Batch size
        max_length: Maximum sequence length
        val_split: Validation split ratio when only one path is provided
        format_type: Format of data file
        num_workers: Number of workers for DataLoader

    Returns:
        Dictionary mapping split names to DataLoaders
    """
    def _make_dataset(path):
        # Single place to construct datasets so both branches stay in sync.
        return TransformerDataset(
            data_path=path,
            tokenizer=tokenizer,
            max_length=max_length,
            format_type=format_type,
        )

    def _make_loader(ds, shuffle):
        return DataLoader(
            ds,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
        )

    if isinstance(data_path, dict):
        # One file per split: shuffle only the training split.
        return {
            split_name: _make_loader(_make_dataset(path), shuffle=(split_name == 'train'))
            for split_name, path in data_path.items()
        }

    # Single path: derive a train/validation split from one dataset.
    dataset = _make_dataset(data_path)
    val_size = int(len(dataset) * val_split)
    train_size = len(dataset) - val_size

    if val_size <= 0:
        # Not enough samples for a validation split: train on everything.
        return {'train': _make_loader(dataset, shuffle=True)}

    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size]
    )
    return {
        'train': _make_loader(train_dataset, shuffle=True),
        'validation': _make_loader(val_dataset, shuffle=False),
    }
|
| 256 |
+
|
| 257 |
+
def prepare_data_loaders(
    data_path: str,
    tokenizer: Any,
    batch_size: int = 16,
    val_split: float = 0.1
) -> Tuple[DataLoader, Optional[DataLoader]]:
    """
    Simplified version that returns train and validation loaders directly.

    Args:
        data_path: Path to data file
        tokenizer: Tokenizer to use for encoding
        batch_size: Batch size
        val_split: Validation split ratio

    Returns:
        Tuple of (train_loader, val_loader); val_loader is None when the
        split produced no validation set.
    """
    loaders = prepare_data_loaders_extended(
        data_path=data_path,
        tokenizer=tokenizer,
        batch_size=batch_size,
        val_split=val_split,
    )
    # Missing splits simply come back as None.
    return loaders.get('train'), loaders.get('validation')
|
| 286 |
+
|
| 287 |
+
def load_dataset(
    specialization: str,
    tokenizer: Any = None,
    split: str = 'train'
) -> Dataset:
    """
    Load a dataset for a specific specialization.

    Args:
        specialization: Name of the specialization
        tokenizer: Tokenizer to use (optional; a default TokenizerWrapper
            is created when omitted)
        split: Dataset split to load (currently unused; kept for API
            compatibility)

    Returns:
        Dataset instance
    """
    # Resolve the dataset location from config, falling back to <spec>.csv.
    if hasattr(app_config, 'DATASET_PATHS') and specialization in app_config.DATASET_PATHS:
        data_path = app_config.DATASET_PATHS[specialization]
    else:
        data_path = os.path.join(app_config.BASE_DATA_DIR, f"{specialization}.csv")

    # BUGFIX: DATASET_PATHS values are lists of files; the previous code
    # called str methods on the list and crashed.  Use the first entry.
    if isinstance(data_path, (list, tuple)):
        if not data_path:
            raise ValueError(f"No dataset paths configured for {specialization!r}")
        data_path = data_path[0]

    # Get or create tokenizer.
    if tokenizer is None:
        from tokenizer import TokenizerWrapper
        tokenizer = TokenizerWrapper()

    max_length = app_config.TRANSFORMER_CONFIG.MAX_SEQ_LENGTH

    # Handle URL paths first via crawler + converter.
    if data_path.startswith(("http://", "https://")):
        crawler = AdvancedWebCrawler()
        converter = WebConverter(crawler=crawler)
        raw_entries = converter.get_converted_web_data([data_path])
        # BUGFIX: TransformerDataset has no _process_records method, so the
        # old code raised AttributeError here.  Build the dataset without
        # touching the filesystem and inject the crawled records directly
        # (assumed to be dicts with "text"/"label" keys -- TODO confirm
        # against WebConverter's output).
        dataset = TransformerDataset.__new__(TransformerDataset)
        dataset.data_path = data_path
        dataset.tokenizer = tokenizer
        dataset.max_length = max_length
        dataset.format_type = 'json'
        dataset.data = raw_entries
        return dataset

    # Create dataset from the local file.
    return TransformerDataset(
        data_path=data_path,
        tokenizer=tokenizer,
        max_length=max_length
    )
|
| 330 |
+
|
| 331 |
+
def load_for_specialization(spec: str):
    """Load the dataset(s) configured for *spec* via DatasetManager.

    Args:
        spec: Specialization key into ``DATASET_PATHS``.

    Returns:
        Whatever ``DatasetManager.load_dataset`` produces for the paths.
    """
    # BUGFIX: app_config is accessed with attribute lookups everywhere else
    # in this module (hasattr/attribute access in load_dataset), so calling
    # app_config.get(...) would raise AttributeError on a config object.
    paths = getattr(app_config, "DATASET_PATHS", {}).get(spec, [])
    # Normalize a single path string to a one-element list.
    if isinstance(paths, str):
        paths = [paths]
    manager = DatasetManager()
    return manager.load_dataset(paths, spec)

# Short alias for common use case
get_dataloader = prepare_data_loaders
|
dataset.py
ADDED
|
@@ -0,0 +1,435 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# dataset.py
|
| 2 |
+
import os
|
| 3 |
+
import csv
|
| 4 |
+
import json
|
| 5 |
+
import torch
|
| 6 |
+
import logging
|
| 7 |
+
from preprocess import Preprocessor
|
| 8 |
+
from torch.utils.data import Dataset
|
| 9 |
+
from typing import List, Dict, Any, Optional, Union
|
| 10 |
+
from functools import wraps
|
| 11 |
+
from time import time
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
def safe_file_operation(func):
    """Decorator to safely handle file operations with timeout"""
    # NOTE(review): this decorator assumes it wraps *methods* of an object
    # exposing ``self.file_path`` -- the JSON/CSV handlers below log it.
    # The "timeout" is a post-hoc warning only; the call is never interrupted.
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        start_time = time()
        timeout_seconds = 300  # 5-minute timeout

        try:
            # Try to perform the operation
            result = func(self, *args, **kwargs)

            # Check if operation took too long (warn only, do not fail)
            if time() - start_time > timeout_seconds:
                logger.warning(f"File operation {func.__name__} took more than {timeout_seconds} seconds")

            return result
        except (IOError, OSError) as e:
            logger.error(f"File operation error in {func.__name__}: {str(e)}")
            # Return empty result based on function type: loaders degrade to
            # an empty record list, everything else re-raises.
            if func.__name__.startswith('_load_'):
                return []
            raise
        except json.JSONDecodeError as e:
            # Parse errors are swallowed into an empty result by design.
            logger.error(f"JSON decode error in {self.file_path}: {str(e)}")
            return []
        except csv.Error as e:
            logger.error(f"CSV error in {self.file_path}: {str(e)}")
            return []
        except Exception as e:
            # Anything unexpected is logged and propagated to the caller.
            logger.error(f"Unexpected error in {func.__name__}: {str(e)}")
            raise

    return wrapper
|
| 48 |
+
|
| 49 |
+
class TensorDataset(Dataset):
    """Dataset pairing aligned feature and label tensors.

    Indexing returns the ``(feature, label)`` pair at that position; the
    length is driven by the feature container.

    Args:
        features (Tensor): Feature tensors.
        labels (Tensor): Label tensors.
    """

    def __init__(self, features, labels):
        self.features, self.labels = features, labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
|
| 67 |
+
|
| 68 |
+
class CustomDataset(Dataset):
|
| 69 |
+
"""A dataset that supports loading JSON, CSV, and TXT formats.
|
| 70 |
+
It auto-detects the file type (if not specified) and filters out any
|
| 71 |
+
records that are not dictionaries. If a preprocessor is provided, it
|
| 72 |
+
applies it to each record. Additionally, it can standardize sample keys
|
| 73 |
+
dynamically using a provided header mapping. For example, you can define a
|
| 74 |
+
mapping like:
|
| 75 |
+
mapping = {
|
| 76 |
+
"title": ["Title", "Headline", "Article Title"],
|
| 77 |
+
"content": ["Content", "Body", "Text"],
|
| 78 |
+
}
|
| 79 |
+
so that regardless of the CSV's header names your trainer always sees a
|
| 80 |
+
standardized set of keys."""
|
| 81 |
+
    def __init__(
        self,
        file_path: Optional[str] = None,
        tokenizer = None,
        max_length: Optional[int] = None,
        file_format: Optional[str] = None,
        preprocessor: Optional[Preprocessor] = None,
        header_mapping: Optional[Dict[str, List[str]]] = None,
        data: Optional[List[Dict[str, Any]]] = None,  # Add data parameter
        specialization: Optional[str] = None  # Add specialization parameter
    ):
        """Initialize the dataset from a file or from in-memory records.

        Args:
            file_path (Optional[str]): Path to the dataset file.
            tokenizer: Tokenizer instance to process the text.
            max_length (Optional[int]): Maximum sequence length.
            file_format (Optional[str]): Format of the file ('json', 'csv',
                'txt'); inferred from the extension if not provided.
            preprocessor (Optional[Preprocessor]): Preprocessor applied to
                each sample after loading.
            header_mapping (Optional[Dict[str, List[str]]]): Mapping from
                standardized keys to possible source header names.
                NOTE(review): stored here but not applied in this method.
            data (Optional[List[Dict[str, Any]]]): Direct data input instead
                of loading from file; takes precedence over ``file_path``.
            specialization (Optional[str]): Specialization tag for the dataset.
        """

        self.file_path = file_path
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.preprocessor = preprocessor
        self.header_mapping = header_mapping
        self.specialization = specialization  # Store the specialization

        # Initialize samples either from data or file
        if data is not None:
            self.samples = data
        else:
            # Determine the file format if not specified and file_path is provided
            if file_path is not None:
                if file_format is None:
                    _, ext = os.path.splitext(file_path)
                    ext = ext.lower()
                    if ext in ['.json']:
                        file_format = 'json'
                    elif ext in ['.csv']:
                        file_format = 'csv'
                    elif ext in ['.txt']:
                        file_format = 'txt'
                    else:
                        logger.error(f"Unsupported file extension: {ext}")
                        raise ValueError(f"Unsupported file extension: {ext}")

                # NOTE(review): self.file_format is only assigned on this
                # branch; instances built from ``data`` never define it.
                self.file_format = file_format
                self.samples = self._load_file()
            else:
                self.samples = []

        # Auto-detection: Ensure all loaded samples are dicts.
        initial_sample_count = len(self.samples)
        self.samples = [sample for sample in self.samples if isinstance(sample, dict)]
        if len(self.samples) < initial_sample_count:
            logger.warning(f"Filtered out {initial_sample_count - len(self.samples)} samples that were not dicts.")

        # If a preprocessor is provided, apply preprocessing to each record.
        # Records that fail preprocessing are logged and dropped.
        if self.preprocessor:
            preprocessed_samples = []
            for sample in self.samples:
                try:
                    processed = self.preprocessor.preprocess_record(sample)
                    preprocessed_samples.append(processed)
                except Exception as e:
                    logger.error(f"Error preprocessing record {sample}: {e}")
            self.samples = preprocessed_samples
|
| 149 |
+
|
| 150 |
+
def _load_file(self) -> List[Dict[str, Any]]:
|
| 151 |
+
try:
|
| 152 |
+
if self.file_format == 'json':
|
| 153 |
+
return self._load_json()
|
| 154 |
+
elif self.file_format == 'csv':
|
| 155 |
+
return self._load_csv()
|
| 156 |
+
elif self.file_format == 'txt':
|
| 157 |
+
return self._load_txt()
|
| 158 |
+
else:
|
| 159 |
+
logger.error(f"Unrecognized file format: {self.file_format}")
|
| 160 |
+
raise ValueError(f"Unrecognized file format: {self.file_format}")
|
| 161 |
+
except Exception as e:
|
| 162 |
+
logger.error(f"Error loading file {self.file_path}: {e}")
|
| 163 |
+
raise
|
| 164 |
+
|
| 165 |
+
    @safe_file_operation
    def _load_json(self) -> List[Dict[str, Any]]:
        """Load JSON file with better error handling and validation"""
        try:
            with open(self.file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Validate data structure
            if isinstance(data, list):
                # Keep only dict records; warn about anything filtered out.
                valid_records = [record for record in data if isinstance(record, dict)]
                if len(valid_records) < len(data):
                    logger.warning(f"{len(data) - len(valid_records)} records were not dictionaries in {self.file_path}")
                return valid_records
            elif isinstance(data, dict):
                # Handle single record case
                logger.warning(f"JSON file contains a single dictionary, not a list: {self.file_path}")
                return [data]
            else:
                logger.error(f"JSON file does not contain a list or dictionary: {self.file_path}")
                return []
        except json.JSONDecodeError as e:
            line_col = f"line {e.lineno}, column {e.colno}"
            logger.error(f"JSON decode error at {line_col} in {self.file_path}: {e.msg}")
            # Try to recover partial content
            try:
                with open(self.file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                # Try parsing up to the error
                valid_part = content[:e.pos]
                import re
                # Find complete objects (rough approach)
                # NOTE(review): this pattern only matches objects without
                # nested braces, so records containing sub-objects are lost.
                matches = re.findall(r'\{[^{}]*\}', valid_part)
                if matches:
                    logger.info(f"Recovered {len(matches)} complete records from {self.file_path}")
                    parsed_records = []
                    for match in matches:
                        try:
                            parsed_records.append(json.loads(match))
                        except:
                            # Skip fragments that are not valid JSON alone.
                            pass
                    return parsed_records
            except:
                # Recovery is best-effort only; fall through to empty result.
                pass
            return []
|
| 209 |
+
|
| 210 |
+
@safe_file_operation
def _load_csv(self) -> List[Dict[str, Any]]:
    """Load CSV with better error handling.

    Sniffs the CSV dialect from the first 1 KB, falling back to the
    'excel' dialect when detection fails; non-dict rows are skipped with
    a warning and an empty result is logged.
    """
    samples = []
    try:
        with open(self.file_path, 'r', encoding='utf-8') as csvfile:
            # Try detecting dialect first
            try:
                dialect = csv.Sniffer().sniff(csvfile.read(1024))
                csvfile.seek(0)
                reader = csv.DictReader(csvfile, dialect=dialect)
            except csv.Error:
                # BUG FIX: was a bare ``except:`` that also swallowed
                # KeyboardInterrupt; Sniffer.sniff raises csv.Error when it
                # cannot determine the dialect, so catch exactly that.
                # Fall back to excel dialect
                csvfile.seek(0)
                reader = csv.DictReader(csvfile, dialect='excel')

            for i, row in enumerate(reader):
                if not isinstance(row, dict):
                    logger.warning(f"Row {i} is not a dict: {row} -- skipping.")
                    continue
                samples.append(row)

            if not samples:
                logger.warning(f"No valid rows found in CSV file: {self.file_path}")

    except csv.Error as e:
        logger.error(f"Error reading CSV file {self.file_path}: {e}")
    return samples
|
| 238 |
+
|
| 239 |
+
def _load_txt(self) -> List[Dict[str, Any]]:
    """Load a plain-text file, one record per non-empty line.

    Each line is stripped and wrapped as ``{"text": line}`` so downstream
    code can treat txt input like the JSON/CSV record formats.
    """
    samples = []
    with open(self.file_path, 'r', encoding='utf-8') as txtfile:
        # FIX: the previous version enumerated lines but never used the index.
        for line in txtfile:
            line = line.strip()
            if line:
                # Wrap each line in a dictionary.
                samples.append({"text": line})
    return samples
|
| 248 |
+
|
| 249 |
+
def _standardize_sample(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
| 250 |
+
"""Remaps the sample's keys to a set of standardized keys using self.header_mapping.
|
| 251 |
+
For each standardized key, the first matching header from the sample is used.
|
| 252 |
+
If none is found, a default empty string is assigned."""
|
| 253 |
+
standardized = {}
|
| 254 |
+
for std_field, possible_keys in self.header_mapping.items():
|
| 255 |
+
for key in possible_keys:
|
| 256 |
+
if key in sample:
|
| 257 |
+
standardized[std_field] = sample[key]
|
| 258 |
+
break
|
| 259 |
+
if std_field not in standardized:
|
| 260 |
+
standardized[std_field] = ""
|
| 261 |
+
return standardized
|
| 262 |
+
|
| 263 |
+
def __len__(self) -> int:
    """Return the number of loaded samples (len() protocol, used by DataLoader)."""
    return len(self.samples)
|
| 265 |
+
|
| 266 |
+
def __getitem__(self, index: int) -> Dict[str, Any]:
    """Return one tokenized training example for *index*.

    Builds the text from standardized title/content fields (or a "text"
    field, or all values joined as a last resort), tokenizes it to a fixed
    length, and returns input tensors plus optional metadata.
    """
    sample = self.samples[index]

    # If a header mapping is provided, standardize the sample keys.
    if self.header_mapping is not None:
        sample = self._standardize_sample(sample)

    # Determine the text to tokenize:
    # If standardized keys "title" or "content" exist, combine them.
    if 'title' in sample or 'content' in sample:
        title = sample.get('title', '')
        content = sample.get('content', '')
        # Convert non-string fields to strings
        if not isinstance(title, str):
            title = str(title)
        if not isinstance(content, str):
            content = str(content)
        text = (title + " " + content).strip()
    elif "text" in sample:
        text = sample["text"] if isinstance(sample["text"], str) else str(sample["text"])
    else:
        # Fallback: join all values (cast to str)
        text = " ".join(str(v) for v in sample.values())

    # Tokenize the combined text.
    # NOTE(review): assumes self.tokenizer follows the HuggingFace
    # ``encode_plus`` API and self.max_length is set — confirm at call sites.
    tokenized = self.tokenizer.encode_plus(
        text,
        max_length=self.max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Get specialization from sample or use class default
    specialization = None
    if isinstance(sample, dict) and "specialization" in sample:
        specialization = sample["specialization"]
    elif self.specialization:
        specialization = self.specialization

    # Return a standardized dictionary for training.
    # token_type_ids falls back to a zero tensor when the tokenizer does
    # not emit them (e.g. single-segment tokenizers).
    result = {
        "input_ids": tokenized["input_ids"].squeeze(0),
        "attention_mask": tokenized["attention_mask"].squeeze(0),
        "token_type_ids": tokenized.get("token_type_ids", torch.zeros_like(tokenized["input_ids"])).squeeze(0),
    }

    # Add specialization if available
    if specialization:
        result["specialization"] = specialization

    # Optionally include standardized text fields if needed
    # NOTE(review): ``'title' in locals()`` is a fragile way to detect that
    # the title/content branch ran — an explicit flag would be clearer.
    if 'title' in locals():
        result["title"] = title
    if 'content' in locals():
        result["content"] = content

    return result
|
| 324 |
+
|
| 325 |
+
# dataset.py - Simple dataset module to fix initialization dependency issues
import logging
import os
import json
from typing import Dict, List, Any, Optional, Union

logger = logging.getLogger(__name__)

class DatasetManager:
    """
    Simple dataset manager to provide basic functionality for model_manager
    without requiring external dataset dependencies.

    Datasets are JSON files (a list of dict records) stored under
    ``data_dir`` and cached in memory after the first load.
    """
    def __init__(self, data_dir: Optional[str] = None):
        # Default to a ./data directory next to this file.
        self.data_dir = data_dir or os.path.join(os.path.dirname(__file__), "data")
        # In-memory cache: dataset name -> list of records.
        self.datasets: Dict[str, List[Dict[str, Any]]] = {}
        self._ensure_data_dir()

    def _ensure_data_dir(self):
        """Ensure data directory exists, falling back to /tmp when unwritable."""
        try:
            if not os.path.exists(self.data_dir):
                os.makedirs(self.data_dir, exist_ok=True)
                logger.info(f"Created dataset directory at {self.data_dir}")
        except (PermissionError, OSError) as e:
            logger.warning(f"Could not create data directory: {e}")
            # Fall back to temp directory
            self.data_dir = os.path.join("/tmp", "wildnerve_data")
            os.makedirs(self.data_dir, exist_ok=True)
            logger.info(f"Using fallback data directory at {self.data_dir}")

    def load_dataset(self, name: str) -> List[Dict[str, Any]]:
        """Load dataset by name, serving from the in-memory cache when possible.

        Returns an empty list when the dataset file is missing or unreadable.
        """
        if name in self.datasets:
            return self.datasets[name]

        # Check for dataset file
        filepath = os.path.join(self.data_dir, f"{name}.json")
        if os.path.exists(filepath):
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                self.datasets[name] = data
                return data
            except (OSError, json.JSONDecodeError) as e:
                # BUG FIX: narrowed from ``except Exception`` to the errors
                # file loading can actually raise, so real bugs surface.
                logger.error(f"Error loading dataset {name}: {e}")

        # Return empty dataset if not found
        logger.warning(f"Dataset {name} not found, returning empty dataset")
        return []

    def get_dataset_names(self) -> List[str]:
        """Get list of available datasets (stems of *.json files in data_dir)."""
        try:
            # BUG FIX: ``f.split('.')[0]`` truncated names containing dots
            # (e.g. "corpus.v1.json" -> "corpus"); use os.path.splitext.
            return [os.path.splitext(f)[0] for f in os.listdir(self.data_dir)
                    if f.endswith('.json')]
        except OSError as e:
            logger.error(f"Error listing datasets: {e}")
            return []

    def create_sample_dataset(self, name: str, samples: int = 10) -> List[Dict[str, Any]]:
        """Create a sample dataset for testing, persist it, and return it."""
        data = [
            {
                "id": i,
                "text": f"Sample text {i} for model training",
                "label": i % 2  # Binary label
            }
            for i in range(samples)
        ]

        # Save to file
        filepath = os.path.join(self.data_dir, f"{name}.json")
        try:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
            self.datasets[name] = data
            logger.info(f"Created sample dataset {name} with {samples} samples")
        except OSError as e:
            logger.error(f"Error creating sample dataset: {e}")

        return data

    def _load_and_process_dataset(self, path_or_paths: Union[str, List[str]], specialization: str) -> "TensorDataset":
        # BUG FIX: the return annotation was an unquoted ``TensorDataset``,
        # a name never imported in this module, so it raised NameError at
        # class-definition time and prevented the module from importing at
        # all; quoting it defers evaluation.
        # NOTE(review): this method is an unfinished placeholder (the
        # "existing code" markers below) and currently returns None —
        # confirm intended behavior before relying on it.
        # …existing code up to reading the file…
        import pandas as pd

        # Handle multiple JSON files by concatenation
        if isinstance(path_or_paths, list):
            frames = [pd.read_json(p) for p in path_or_paths]
            data = pd.concat(frames, ignore_index=True)
        else:
            data = pd.read_json(path_or_paths)

        # …existing code that splits into features/labels and returns TensorDataset…
| 420 |
+
|
| 421 |
+
# Create a default dataset manager instance
# NOTE: instantiating at import time touches the filesystem (creates the
# data directory, with a /tmp fallback) — see DatasetManager._ensure_data_dir.
dataset_manager = DatasetManager()

def get_dataset(name: str) -> List[Dict[str, Any]]:
    """Helper function to get a dataset by name.

    Delegates to the module-level ``dataset_manager`` singleton; returns an
    empty list when the dataset does not exist.
    """
    return dataset_manager.load_dataset(name)

# Create some minimal sample data if running as main
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    dm = DatasetManager()
    dm.create_sample_dataset("test_dataset", samples=20)
    print(f"Available datasets: {dm.get_dataset_names()}")
    test_data = dm.load_dataset("test_dataset")
    print(f"Loaded {len(test_data)} samples from test_dataset")
|
dependency_helpers.py
CHANGED
|
@@ -1,118 +1,124 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
This
|
| 4 |
"""
|
| 5 |
-
import importlib
|
| 6 |
-
import logging
|
| 7 |
-
import sys
|
| 8 |
import os
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
Safely import a module without crashing if it's not available.
|
| 16 |
-
|
| 17 |
-
Args:
|
| 18 |
-
module_name: Name of the module to import
|
| 19 |
-
|
| 20 |
-
Returns:
|
| 21 |
-
The imported module or None if import failed
|
| 22 |
-
"""
|
| 23 |
-
try:
|
| 24 |
-
return importlib.import_module(module_name)
|
| 25 |
-
except ImportError as e:
|
| 26 |
-
logger.warning(f"Failed to import {module_name}: {e}")
|
| 27 |
-
return None
|
| 28 |
|
| 29 |
def is_module_available(module_name: str) -> bool:
|
| 30 |
-
"""
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
Args:
|
| 34 |
-
module_name: Name of the module to check
|
| 35 |
-
|
| 36 |
-
Returns:
|
| 37 |
-
True if module is available, False otherwise
|
| 38 |
-
"""
|
| 39 |
-
try:
|
| 40 |
-
importlib.util.find_spec(module_name)
|
| 41 |
-
return True
|
| 42 |
-
except ImportError:
|
| 43 |
-
return False
|
| 44 |
|
| 45 |
-
def
|
| 46 |
-
"""
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
| 50 |
-
dependencies: List of module names to check
|
| 51 |
-
|
| 52 |
-
Returns:
|
| 53 |
-
Dictionary mapping module names to availability (True/False)
|
| 54 |
-
"""
|
| 55 |
-
return {dep: is_module_available(dep) for dep in dependencies}
|
| 56 |
|
| 57 |
-
def
|
| 58 |
-
"""
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
| 72 |
|
| 73 |
-
def
|
| 74 |
-
"""
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
except Exception as e:
|
| 89 |
-
logger.warning(f"Primary function {primary_func.__name__} failed: {e}")
|
| 90 |
-
return fallback_func(*args, **kwargs)
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
try:
|
| 104 |
-
import subprocess
|
| 105 |
-
logger.info(f"Attempting to install {package_name}")
|
| 106 |
-
subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
|
| 107 |
-
return True
|
| 108 |
-
except Exception as e:
|
| 109 |
-
logger.warning(f"Failed to install {package_name}: {e}")
|
| 110 |
-
return False
|
| 111 |
|
| 112 |
-
#
|
| 113 |
-
|
| 114 |
-
DEPENDENCY_STATUS = check_dependencies(CRITICAL_DEPENDENCIES)
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
|
|
|
| 1 |
"""
Dependency helpers to make the model work even if some libraries are missing.
This file provides fallback implementations for missing dependencies.
"""
import os
import logging
import importlib.util
from typing import Any, Dict, Optional, Type, Callable

logger = logging.getLogger(__name__)

# Dictionary to track mock implementations.
# Maps a module name (e.g. "codecarbon") to the mock object that the
# DependencyImportFinder meta-path hook serves instead of the real module.
MOCK_MODULES = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
def is_module_available(module_name: str) -> bool:
    """Return True when *module_name* is importable, without importing it."""
    spec = importlib.util.find_spec(module_name)
    return spec is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
def create_mock_emissions_tracker() -> Type:
    """Build a drop-in stand-in for codecarbon's EmissionsTracker.

    The returned class supports both the context-manager protocol and the
    explicit start()/stop() API; stop() always reports zero emissions.
    """
    class MockEmissionsTracker:
        def __init__(self, *args, **kwargs):
            logger.info("Using mock EmissionsTracker")

        def start(self):
            return self

        def stop(self):
            # Nothing was tracked, so report zero emissions.
            return 0.0

        # Entering the context behaves exactly like start().
        __enter__ = start

        def __exit__(self, *exc_info):
            return None

    return MockEmissionsTracker
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
def create_mock_pydantic_classes() -> Dict[str, Type]:
    """Create mock implementations of pydantic classes.

    Returns a mapping providing stand-ins for BaseModel, Field,
    ValidationError and ConfigDict with the minimal API the codebase uses.
    """
    class MockBaseModel:
        """Mock implementation of pydantic's BaseModel"""

        def __init__(self, **kwargs):
            self.__dict__.update(kwargs)

        def dict(self) -> Dict[str, Any]:
            return {name: value
                    for name, value in self.__dict__.items()
                    if not name.startswith('_')}

        def json(self) -> str:
            import json
            return json.dumps(self.dict())

    class MockValidationError(Exception):
        """Mock implementation of pydantic's ValidationError"""

    def mock_field(*args, **kwargs) -> Any:
        """Mock implementation of pydantic's Field"""
        return kwargs.get('default', None)

    return {
        "BaseModel": MockBaseModel,
        "Field": mock_field,
        "ValidationError": MockValidationError,
        # ConfigDict degrades gracefully to a plain dict.
        "ConfigDict": dict,
    }
|
| 71 |
|
| 72 |
+
def setup_dependency_fallbacks():
    """Setup fallbacks for all required dependencies.

    Populates MOCK_MODULES with stand-in module objects for any missing
    third-party package so the DependencyImportFinder hook can serve them.
    """
    # Handle codecarbon
    if not is_module_available("codecarbon"):
        logger.warning("codecarbon not found, using mock implementation")
        MOCK_MODULES["codecarbon"] = type("MockCodecarbon", (), {
            "EmissionsTracker": create_mock_emissions_tracker()
        })

    # Handle pydantic
    if not is_module_available("pydantic"):
        logger.warning("pydantic not found, using mock implementation")
        mock_classes = create_mock_pydantic_classes()
        MOCK_MODULES["pydantic"] = type("MockPydantic", (), mock_classes)

    # Setup service_registry fallback if needed
    if not is_module_available("service_registry"):
        from types import SimpleNamespace
        registry_obj = SimpleNamespace()
        registry_obj.register = lambda *args, **kwargs: None
        registry_obj.get = lambda *args: None
        registry_obj.has = lambda *args: False

        MOCK_MODULES["service_registry"] = type("MockServiceRegistry", (), {
            "registry": registry_obj,
            # BUG FIX: the real service_registry module defines
            # MODEL = "model" and TOKENIZER = "tokenizer" (lowercase);
            # the mock previously used "MODEL"/"TOKENIZER", so any code
            # keying services on these constants behaved differently
            # under the fallback than with the real module.
            "MODEL": "model",
            "TOKENIZER": "tokenizer"
        })
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
+
# Custom import hook to provide mock implementations
|
| 102 |
+
class DependencyImportFinder:
    """Meta-path finder serving mock modules registered in MOCK_MODULES.

    BUG FIX: the original implemented only the legacy find_module/
    load_module protocol, which was removed from the import system in
    Python 3.12 — the hook silently stopped working there. The modern
    find_spec/create_module/exec_module protocol is added; the legacy
    pair is kept for older interpreters.
    """

    def __init__(self):
        self._mock_modules = MOCK_MODULES

    # --- modern protocol (required on Python 3.12+) ---
    def find_spec(self, fullname, path=None, target=None):
        if fullname not in self._mock_modules:
            return None
        import importlib.util
        return importlib.util.spec_from_loader(fullname, self)

    def create_module(self, spec):
        # Serve the pre-built mock object instead of a fresh module.
        # NOTE(review): mocks are plain classes, not ModuleType — the import
        # machinery tolerates this, but confirm attribute access patterns.
        return self._mock_modules[spec.name]

    def exec_module(self, module):
        # Mock objects are fully initialized already; nothing to execute.
        pass

    # --- legacy protocol (Python < 3.12) ---
    def find_module(self, fullname, path=None):
        if fullname in self._mock_modules:
            return self

    def load_module(self, fullname):
        import sys
        if fullname in sys.modules:
            return sys.modules[fullname]

        module = self._mock_modules[fullname]
        sys.modules[fullname] = module
        return module
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
+
# Initialize the fallbacks
# (populates MOCK_MODULES only for packages that are actually missing,
# so the hook below never shadows a real installed module)
setup_dependency_fallbacks()

# Install the custom import hook
# Inserted at the front of sys.meta_path so mock lookups win; harmless for
# any module not present in MOCK_MODULES (find_module returns None).
import sys
sys.meta_path.insert(0, DependencyImportFinder())
|
handler.py
CHANGED
|
@@ -17,13 +17,33 @@ logging.basicConfig(
|
|
| 17 |
)
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
| 20 |
-
#
|
| 21 |
try:
|
| 22 |
import pydantic
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# Make sure adapter_layer.py is properly located
|
| 29 |
try:
|
|
@@ -43,7 +63,14 @@ try:
|
|
| 43 |
|
| 44 |
except ImportError as e:
|
| 45 |
logger.error(f"Could not import adapter_layer: {e}")
|
| 46 |
-
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
class EndpointHandler:
|
| 49 |
def __init__(self, path=""):
|
|
|
|
| 17 |
)
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
|
| 20 |
+
# Safely check for required packages without crashing
try:
    import pydantic
    print(f"pydantic is available: {pydantic.__version__}")
except ImportError:
    print("pydantic is not available - continuing without it")
    # Create minimal compatibility layer mimicking the pydantic module surface
    class pydantic:
        # BUG FIX: ``__version__`` was a zero-argument staticmethod, but the
        # real pydantic module exposes it as a plain string attribute — code
        # logging ``pydantic.__version__`` would print a function object
        # under the fallback. A string attribute keeps both paths consistent.
        __version__ = "unavailable"

        class BaseModel:
            """Minimal stand-in: stores keyword arguments as attributes."""
            def __init__(self, **kwargs):
                for k, v in kwargs.items():
                    setattr(self, k, v)
|
| 36 |
+
|
| 37 |
+
try:
    from codecarbon import EmissionsTracker
    print(f"codecarbon is available")
except ImportError:
    print("codecarbon is not available - continuing without carbon tracking")
    # Create minimal compatibility class
    class EmissionsTracker:
        """No-op tracker: start() is chainable, stop() reports zero emissions."""

        def __init__(self, *args, **kwargs):
            pass

        def start(self):
            return self

        def stop(self):
            return 0.0
|
| 47 |
|
| 48 |
# Make sure adapter_layer.py is properly located
|
| 49 |
try:
|
|
|
|
| 63 |
|
| 64 |
except ImportError as e:
|
| 65 |
logger.error(f"Could not import adapter_layer: {e}")
|
| 66 |
+
# Don't raise error - provide fallback adapter implementation
|
| 67 |
+
class WildnerveModelAdapter:
|
| 68 |
+
def __init__(self, path=""):
|
| 69 |
+
self.path = path
|
| 70 |
+
logger.info(f"Using fallback WildnerveModelAdapter with path: {path}")
|
| 71 |
+
|
| 72 |
+
def generate(self, text_input, **kwargs):
|
| 73 |
+
return f"Model adapter unavailable. Received input: {text_input[:30]}..."
|
| 74 |
|
| 75 |
class EndpointHandler:
|
| 76 |
def __init__(self, path=""):
|
model_Custm.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# model_Custm.py
|
| 2 |
import os
|
| 3 |
import sys
|
| 4 |
import math
|
|
@@ -8,14 +8,15 @@ import numpy as np
|
|
| 8 |
import torch.nn as nn
|
| 9 |
from typing import Optional, List, Dict, Union
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
# Import the carbon tracker early - before transformers
|
| 12 |
try:
|
| 13 |
from codecarbon import EmissionsTracker
|
| 14 |
except ImportError:
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
def start(self): return self
|
| 18 |
-
def stop(self): return 0.0
|
| 19 |
|
| 20 |
# Apply patches before importing transformers
|
| 21 |
import transformer_patches
|
|
|
|
| 1 |
+
# model_Custm.py - with dependency fallbacks
|
| 2 |
import os
|
| 3 |
import sys
|
| 4 |
import math
|
|
|
|
| 8 |
import torch.nn as nn
|
| 9 |
from typing import Optional, List, Dict, Union
|
| 10 |
|
| 11 |
+
# Import dependency helpers first
# (importing it installs the mock-module meta-path hook as a side effect)
import dependency_helpers

# Import the carbon tracker early - before transformers
try:
    from codecarbon import EmissionsTracker
except ImportError:
    # Use the mock from dependency_helpers
    # (a no-op class: start() chains, stop() reports 0.0 emissions)
    EmissionsTracker = dependency_helpers.create_mock_emissions_tracker()
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Apply patches before importing transformers
|
| 22 |
import transformer_patches
|
service_registry.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Simple service registry for dependency injection
"""
import logging
from typing import Any, Dict, Optional

logger = logging.getLogger(__name__)

# Constants used as keys
MODEL = "model"
TOKENIZER = "tokenizer"

class ServiceRegistry:
    """A simple service registry for dependency management.

    Services live in a plain dict keyed by string; lookups for unknown
    keys log a warning and return None rather than raising.
    """

    def __init__(self):
        # key -> registered service instance
        self._services: Dict[str, Any] = {}

    def register(self, key: str, service: Any, overwrite: bool = False) -> None:
        """Register *service* under *key*; keep the existing entry unless overwrite=True."""
        if not overwrite and key in self._services:
            logger.warning(f"Service with key '{key}' already registered")
            return
        self._services[key] = service
        logger.debug(f"Registered service with key: {key}")

    def get(self, key: str) -> Optional[Any]:
        """Return the service stored under *key*, or None when absent."""
        try:
            return self._services[key]
        except KeyError:
            logger.warning(f"No service registered with key: {key}")
            return None

    def has(self, key: str) -> bool:
        """True when a service is registered under *key*."""
        return key in self._services

    def clear(self) -> None:
        """Drop every registered service."""
        self._services.clear()

# Create singleton instance
registry = ServiceRegistry()
|
transformer_patches.py
CHANGED
|
@@ -211,3 +211,26 @@ if __name__ == "__main__":
|
|
| 211 |
print("\nPatch status:")
|
| 212 |
for patch, status in _patch_status.items():
|
| 213 |
print(f" {'✓' if status else '✗'} {patch}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
print("\nPatch status:")
|
| 212 |
for patch, status in _patch_status.items():
|
| 213 |
print(f" {'✓' if status else '✗'} {patch}")
|
| 214 |
+
|
| 215 |
+
"""
Transformer patches to make the model work better with HuggingFace transformers.
This file applies monkey patches to fix compatibility issues or add functionality.
"""
import logging
from typing import Dict, Any, Optional

logger = logging.getLogger(__name__)

def apply_transformer_patches():
    """Apply monkey patches to the transformers library when it is importable.

    Silently skips (with a warning) when transformers is not installed.
    """
    try:
        import transformers
    except ImportError:
        logger.warning("Transformers library not found, skipping patches")
        return
    logger.info(f"Applying patches to transformers v{transformers.__version__}")
    # No patches needed currently, but you can add them here if needed in future

# Apply patches when imported
apply_transformer_patches()
|