Fola-AI committed
Commit 068e72c · 1 Parent(s): d9a0eb4

Use official N-ATLaS via transformers - no llama-cpp-python

Files changed (4):
  1. .DS_Store +0 -0
  2. Dockerfile +6 -7
  3. models/natlas_model.py +298 -467
  4. requirements.txt +12 -6
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
Dockerfile CHANGED
@@ -1,8 +1,8 @@
 # =============================================================================
-# FarmEyes - HuggingFace Spaces Dockerfile (GPU - Simplified)
+# FarmEyes - HuggingFace Spaces Dockerfile (Transformers Version)
 # =============================================================================
-# Uses PyTorch base image with CUDA pre-installed
-# llama-cpp-python CPU version works fine - still faster than free tier
+# Uses official N-ATLaS model via transformers - NO llama-cpp-python needed!
+# Fast build, official model support, GPU accelerated.
 # =============================================================================

 FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime
@@ -21,17 +21,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libsm6 \
     libxext6 \
     libgl1 \
+    git \
     && rm -rf /var/lib/apt/lists/*

 # Copy requirements
 COPY requirements.txt .

-# Install Python dependencies (torch already in base image)
+# Install Python dependencies
+# Note: torch is already in base image
 RUN pip install --no-cache-dir -r requirements.txt

-# Install llama-cpp-python (CPU version - avoids long compile)
-RUN pip install --no-cache-dir llama-cpp-python
-
 # Copy application code
 COPY . .
models/natlas_model.py CHANGED
@@ -1,19 +1,16 @@
 """
-FarmEyes N-ATLaS Model Integration (Hybrid)
-============================================
-HYBRID APPROACH:
-1. PRIMARY: HuggingFace Inference API (fast, cloud-based)
-2. FALLBACK: Local GGUF model (optional - requires llama-cpp-python)
+FarmEyes N-ATLaS Model Integration (Transformers Version)
+==========================================================
+Uses the official N-ATLaS model via HuggingFace Transformers library.
+NO llama-cpp-python required - faster builds, official model support.

-API Model: NCAIR1/N-ATLaS
-GGUF Model: tosinamuda/N-ATLaS-GGUF (N-ATLaS-GGUF-Q4_K_M.gguf)
-
-HUGGINGFACE SPACES OPTIMIZED:
-- llama-cpp-python is OPTIONAL (avoids build timeout)
-- Works with HuggingFace API only if GGUF not available
-- GPU support when available
+Model: NCAIR1/N-ATLaS (8B parameters, Llama-3 based)
+Size: ~16GB (downloaded at runtime)

 Languages: English, Hausa, Yoruba, Igbo
+
+Powered by Awarri Technologies and the Federal Ministry of
+Communications, Innovation and Digital Economy.
 """

 import os
@@ -22,55 +19,37 @@ from pathlib import Path
 from typing import Optional, Dict, List
 import logging
 import time
+from datetime import datetime

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)


-# =============================================================================
-# CHECK IF LLAMA-CPP-PYTHON IS AVAILABLE
-# =============================================================================
-
-LLAMA_CPP_AVAILABLE = False
-try:
-    from llama_cpp import Llama
-    LLAMA_CPP_AVAILABLE = True
-    logger.info("✅ llama-cpp-python is available - GGUF fallback enabled")
-except ImportError:
-    logger.warning("⚠️ llama-cpp-python not installed - GGUF fallback disabled")
-    logger.warning("   App will use HuggingFace API only for translations")
-
-
 # =============================================================================
 # ENVIRONMENT DETECTION
 # =============================================================================

-# Check if running on HuggingFace Spaces
 IS_HF_SPACES = os.environ.get("SPACE_ID") is not None

 # Check for GPU
 HAS_GPU = False
+GPU_NAME = "None"
 try:
     import torch
     HAS_GPU = torch.cuda.is_available()
     if HAS_GPU:
-        logger.info("🎮 GPU detected - GPU acceleration enabled")
+        GPU_NAME = torch.cuda.get_device_name(0)
+        logger.info(f"🎮 GPU detected: {GPU_NAME}")
+    else:
+        logger.info("🖥️ No GPU detected - using CPU")
 except ImportError:
-    pass
-
-# Set GPU layers based on environment
-if HAS_GPU:
-    DEFAULT_GPU_LAYERS = -1  # Use all GPU layers
-    logger.info("🎮 Using GPU acceleration")
-elif IS_HF_SPACES:
-    DEFAULT_GPU_LAYERS = 0  # CPU only on Spaces free tier
-    logger.info("🤗 Running on HuggingFace Spaces - CPU mode")
+    logger.warning("PyTorch not installed")
+
+if IS_HF_SPACES:
+    logger.info("🤗 Running on HuggingFace Spaces")
 else:
-    DEFAULT_GPU_LAYERS = -1  # Try GPU locally (Apple Silicon MPS)
     logger.info("🖥️ Running locally")

-DEFAULT_THREADS = 4
-

 # =============================================================================
 # LANGUAGE MAPPINGS
@@ -92,291 +71,100 @@ NATIVE_LANGUAGE_NAMES = {


 # =============================================================================
-# HUGGINGFACE INFERENCE API CLIENT (PRIMARY)
+# N-ATLAS MODEL (TRANSFORMERS VERSION)
 # =============================================================================

-class HuggingFaceAPIClient:
+class NATLaSTransformersModel:
     """
-    Client for HuggingFace Serverless Inference API.
-    Primary method - fast cloud-based inference.
-
-    NOTE: This is the MAIN method on HuggingFace Spaces when
-    llama-cpp-python is not installed.
-    """
-
-    MODEL_ID = "NCAIR1/N-ATLaS"
-    API_URL = "https://api-inference.huggingface.co/models/NCAIR1/N-ATLaS"
-
-    def __init__(self, api_token: Optional[str] = None):
-        self.api_token = api_token or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
-        self._is_available = None
-        self._last_check = 0
-        self._check_interval = 300  # 5 minutes
-
-        if self.api_token:
-            logger.info("✅ HuggingFace API token found")
-        else:
-            logger.warning("⚠️ No HF_TOKEN set - translations may not work")
-
-    def is_available(self) -> bool:
-        """Check if API is available."""
-        if not self.api_token:
-            return False
-
-        current_time = time.time()
-        if self._is_available is not None and current_time - self._last_check < self._check_interval:
-            return self._is_available
-
-        try:
-            import requests
-
-            headers = {"Authorization": "Bearer " + self.api_token}
-            response = requests.get(
-                "https://huggingface.co/api/models/" + self.MODEL_ID,
-                headers=headers,
-                timeout=10
-            )
-
-            self._is_available = response.status_code == 200
-            self._last_check = current_time
-
-            if self._is_available:
-                logger.info("✅ HuggingFace API is available")
-            else:
-                logger.warning("⚠️ HuggingFace API unavailable: " + str(response.status_code))
-
-            return self._is_available
-
-        except Exception as e:
-            logger.warning("⚠️ API check failed: " + str(e))
-            self._is_available = False
-            self._last_check = current_time
-            return False
+    N-ATLaS model using HuggingFace Transformers.

-    def generate(
-        self,
-        prompt: str,
-        max_new_tokens: int = 512,
-        temperature: float = 0.7,
-        top_p: float = 0.9
-    ) -> Optional[str]:
-        """Generate text using HuggingFace Inference API."""
-        if not self.api_token:
-            return None
-
-        try:
-            import requests
-
-            headers = {
-                "Authorization": "Bearer " + self.api_token,
-                "Content-Type": "application/json"
-            }
-
-            payload = {
-                "inputs": prompt,
-                "parameters": {
-                    "max_new_tokens": max_new_tokens,
-                    "temperature": temperature,
-                    "top_p": top_p,
-                    "do_sample": True,
-                    "return_full_text": False
-                },
-                "options": {
-                    "wait_for_model": True
-                }
-            }
-
-            logger.info("📡 Calling HuggingFace Inference API...")
-
-            response = requests.post(
-                self.API_URL,
-                headers=headers,
-                json=payload,
-                timeout=120
-            )
-
-            if response.status_code == 200:
-                result = response.json()
-                if isinstance(result, list) and len(result) > 0:
-                    text = result[0].get("generated_text", "")
-                    if text:
-                        logger.info("✅ API generation successful: " + str(len(text)) + " chars")
-                        return text
-                return None
-            else:
-                logger.warning("⚠️ API request failed: " + str(response.status_code))
-                return None
-
-        except Exception as e:
-            logger.error("❌ API call failed: " + str(e))
-            return None
-
-    def translate(self, text: str, target_language: str) -> Optional[str]:
-        """Translate text using the API."""
-        if target_language == "en" or not text:
-            return text
-
-        lang_name = LANGUAGE_NAMES.get(target_language, target_language)
-        prompt = "Translate to " + lang_name + ": " + text
-
-        result = self.generate(prompt, max_new_tokens=len(text) * 3, temperature=0.3)
-
-        if result:
-            result = result.strip()
-            # Clean up prefixes
-            for prefix in [lang_name + ":", "Translation:"]:
-                if result.lower().startswith(prefix.lower()):
-                    result = result[len(prefix):].strip()
-            return result
-
-        return None
-
-    def chat_response(self, message: str, context: Dict, language: str = "en") -> Optional[str]:
-        """Generate chat response using API."""
-        crop = context.get("crop_type", "crop").capitalize()
-        disease = context.get("disease_name", "unknown disease")
-        severity = context.get("severity_level", "unknown")
-        confidence = context.get("confidence", 0)
-        if confidence <= 1:
-            confidence = int(confidence * 100)
-
-        lang_instructions = {
-            "ha": "Respond in Hausa language.",
-            "yo": "Respond in Yoruba language.",
-            "ig": "Respond in Igbo language."
-        }
-        lang_instruction = lang_instructions.get(language, "Respond in English.")
-
-        prompt = (
-            "You are FarmEyes, an AI assistant helping African farmers with crop diseases.\n\n"
-            "Current diagnosis:\n"
-            "- Crop: " + crop + "\n"
-            "- Disease: " + disease + "\n"
-            "- Severity: " + severity + "\n"
-            "- Confidence: " + str(confidence) + "%\n\n"
-            + lang_instruction + "\n\n"
-            "Farmer's question: " + message + "\n\n"
-            "Provide a helpful, practical response about this disease or related farming advice. "
-            "Keep it concise (2-3 paragraphs max)."
-        )
-
-        return self.generate(prompt, max_new_tokens=400, temperature=0.7)
-
-
-# =============================================================================
-# LOCAL GGUF MODEL (FALLBACK - OPTIONAL)
-# =============================================================================
-
-class LocalGGUFModel:
-    """
-    Local GGUF model using llama-cpp-python.
-    FALLBACK: Only works if llama-cpp-python is installed.
+    This is the OFFICIAL way to use N-ATLaS as shown in the model documentation.
+    No llama-cpp-python compilation required!

-    Model: tosinamuda/N-ATLaS-GGUF
-    File: N-ATLaS-GGUF-Q4_K_M.gguf (4.92GB)
+    Model: NCAIR1/N-ATLaS
+    Base: Llama-3 8B
+    Size: ~16GB
     """

-    HF_REPO = "tosinamuda/N-ATLaS-GGUF"
-    MODEL_FILENAME = "N-ATLaS-GGUF-Q4_K_M.gguf"
+    MODEL_ID = "NCAIR1/N-ATLaS"

     def __init__(
         self,
-        model_path: Optional[str] = None,
-        n_ctx: int = 2048,
-        n_gpu_layers: int = DEFAULT_GPU_LAYERS,
-        n_threads: int = DEFAULT_THREADS,
-        n_batch: int = 256,
-        verbose: bool = False
+        model_id: str = MODEL_ID,
+        torch_dtype: str = "float16",
+        device_map: str = "auto",
+        load_on_init: bool = True
     ):
-        self.model_path = model_path
-        self.n_ctx = n_ctx
-        self.n_gpu_layers = n_gpu_layers
-        self.n_threads = n_threads
-        self.n_batch = n_batch
-        self.verbose = verbose
+        self.model_id = model_id
+        self.torch_dtype = torch_dtype
+        self.device_map = device_map

         self._model = None
+        self._tokenizer = None
         self._is_loaded = False

-        logger.info(f"GGUF Config: ctx={n_ctx}, gpu_layers={n_gpu_layers}, threads={n_threads}, batch={n_batch}")
-
-    def download_model(self) -> str:
-        """Download GGUF model from HuggingFace Hub."""
-        try:
-            from huggingface_hub import hf_hub_download
-
-            logger.info("=" * 60)
-            logger.info("📥 DOWNLOADING N-ATLaS GGUF MODEL")
-            logger.info("=" * 60)
-            logger.info(f"   Repository: {self.HF_REPO}")
-            logger.info(f"   File: {self.MODEL_FILENAME}")
-            logger.info(f"   Size: ~4.92 GB")
-            logger.info("   This may take 5-15 minutes on first startup...")
-            logger.info("=" * 60)
-
-            model_path = hf_hub_download(
-                repo_id=self.HF_REPO,
-                filename=self.MODEL_FILENAME,
-                cache_dir=None,
-                resume_download=True
-            )
-
-            logger.info("=" * 60)
-            logger.info("✅ MODEL DOWNLOAD COMPLETE!")
-            logger.info(f"   Path: {model_path}")
-            logger.info("=" * 60)
-
-            return model_path
-
-        except Exception as e:
-            logger.error("=" * 60)
-            logger.error("❌ MODEL DOWNLOAD FAILED!")
-            logger.error(f"   Error: {str(e)}")
-            logger.error("=" * 60)
-            raise
+        logger.info(f"NATLaS Config: model={model_id}, dtype={torch_dtype}, device_map={device_map}")
+
+        if load_on_init:
+            self.load_model()

     def load_model(self) -> bool:
-        """Load GGUF model."""
+        """Load N-ATLaS model using transformers."""
         if self._is_loaded:
             return True

-        # Check if llama-cpp-python is available
-        if not LLAMA_CPP_AVAILABLE:
-            logger.warning("❌ Cannot load GGUF - llama-cpp-python not installed")
-            logger.warning("   App will use HuggingFace API only")
-            return False
-
         try:
-            from llama_cpp import Llama
+            import torch
+            from transformers import AutoTokenizer, AutoModelForCausalLM
+
+            logger.info("=" * 60)
+            logger.info("📥 LOADING N-ATLaS MODEL")
+            logger.info("=" * 60)
+            logger.info(f"   Model: {self.model_id}")
+            logger.info(f"   Size: ~16GB")
+            logger.info("   This may take 5-15 minutes on first load...")
+            logger.info("=" * 60)

-            # Download if not present
-            if self.model_path is None or not Path(self.model_path).exists():
-                logger.info("Model not found locally, downloading...")
-                self.model_path = self.download_model()
+            # Determine torch dtype
+            if self.torch_dtype == "float16":
+                dtype = torch.float16
+            elif self.torch_dtype == "bfloat16":
+                dtype = torch.bfloat16
+            else:
+                dtype = torch.float32

-            logger.info("🔄 Loading GGUF model into memory...")
-            logger.info(f"   Path: {self.model_path}")
-            logger.info(f"   GPU Layers: {self.n_gpu_layers}")
-            logger.info(f"   Context: {self.n_ctx}")
+            # Load tokenizer
+            logger.info("Loading tokenizer...")
+            self._tokenizer = AutoTokenizer.from_pretrained(
+                self.model_id,
+                trust_remote_code=True
+            )

-            self._model = Llama(
-                model_path=self.model_path,
-                n_ctx=self.n_ctx,
-                n_gpu_layers=self.n_gpu_layers,
-                n_threads=self.n_threads,
-                n_batch=self.n_batch,
-                verbose=self.verbose
+            # Load model
+            logger.info("Loading model weights...")
+            self._model = AutoModelForCausalLM.from_pretrained(
+                self.model_id,
+                torch_dtype=dtype,
+                device_map=self.device_map,
+                trust_remote_code=True
             )

             self._is_loaded = True
-            logger.info("✅ GGUF model loaded successfully!")
+
+            logger.info("=" * 60)
+            logger.info("✅ N-ATLaS MODEL LOADED SUCCESSFULLY!")
+            if HAS_GPU:
+                logger.info(f"   Running on GPU: {GPU_NAME}")
+            else:
+                logger.info("   Running on CPU")
+            logger.info("=" * 60)
+
             return True

-        except ImportError:
-            logger.error("❌ llama-cpp-python not installed!")
-            return False
         except Exception as e:
-            logger.error(f"❌ Model load failed: {str(e)}")
+            logger.error(f"❌ Failed to load N-ATLaS model: {e}")
+            logger.error("   Make sure you have accepted the model license at:")
+            logger.error("   https://huggingface.co/NCAIR1/N-ATLaS")
             return False

     def unload_model(self):
@@ -384,87 +172,168 @@ class LocalGGUFModel:
         if self._model is not None:
             del self._model
             self._model = None
-        self._is_loaded = False
-        logger.info("Model unloaded")
+        if self._tokenizer is not None:
+            del self._tokenizer
+            self._tokenizer = None
+        self._is_loaded = False
+
+        # Clear CUDA cache
+        try:
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except:
+            pass
+
+        logger.info("Model unloaded")

     @property
     def is_loaded(self) -> bool:
         return self._is_loaded

+    def _format_messages(self, messages: List[Dict]) -> str:
+        """Format messages using the tokenizer's chat template."""
+        try:
+            current_date = datetime.now().strftime('%d %b %Y')
+            text = self._tokenizer.apply_chat_template(
+                messages,
+                add_generation_prompt=True,
+                tokenize=False,
+                date_string=current_date
+            )
+            return text
+        except Exception as e:
+            # Fallback formatting if chat template fails
+            logger.warning(f"Chat template failed, using fallback: {e}")
+            text = ""
+            for msg in messages:
+                role = msg.get("role", "user")
+                content = msg.get("content", "")
+                if role == "system":
+                    text += f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>"
+                elif role == "user":
+                    text += f"<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>"
+                elif role == "assistant":
+                    text += f"<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>"
+            text += "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            return text
+
     def generate(
         self,
         prompt: str,
-        max_tokens: int = 512,
+        system_prompt: str = None,
+        max_new_tokens: int = 512,
         temperature: float = 0.7,
         top_p: float = 0.9,
-        stop: Optional[List[str]] = None
+        repetition_penalty: float = 1.12
     ) -> Optional[str]:
-        """Generate text using GGUF model with Llama-3 format."""
-        if not LLAMA_CPP_AVAILABLE:
-            logger.warning("GGUF not available - llama-cpp-python not installed")
-            return None
-
+        """Generate text using N-ATLaS model."""
         if not self._is_loaded:
             if not self.load_model():
                 return None

         try:
-            formatted_prompt = (
-                "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
-                "You are a helpful AI assistant for African farmers. You help with crop disease diagnosis, "
-                "treatment advice, and agricultural questions. Respond in the same language the user writes in."
-                "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
-                + prompt +
-                "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
-            )
+            import torch
+
+            # Default system prompt
+            if system_prompt is None:
+                system_prompt = (
+                    "You are a helpful AI assistant for African farmers. "
+                    "You help with crop disease diagnosis, treatment advice, and agricultural questions. "
+                    "Respond in the same language the user writes in."
+                )
+
+            # Format messages
+            messages = [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": prompt}
+            ]
+
+            text = self._format_messages(messages)

-            response = self._model(
-                formatted_prompt,
-                max_tokens=max_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                stop=stop or ["<|eot_id|>", "<|end_of_text|>"],
-                echo=False
+            # Tokenize
+            input_tokens = self._tokenizer(
+                text,
+                return_tensors='pt',
+                add_special_tokens=False
             )

-            text = response["choices"][0]["text"].strip()
+            # Move to device
+            if HAS_GPU:
+                input_tokens = input_tokens.to('cuda')

-            for token in ["<|eot_id|>", "<|end_of_text|>", "<|start_header_id|>", "<|end_header_id|>"]:
-                text = text.replace(token, "")
+            # Generate
+            with torch.no_grad():
+                outputs = self._model.generate(
+                    **input_tokens,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    repetition_penalty=repetition_penalty,
+                    do_sample=True,
+                    use_cache=True,
+                    pad_token_id=self._tokenizer.eos_token_id
+                )

-            text = text.strip()
+            # Decode
+            full_response = self._tokenizer.decode(outputs[0], skip_special_tokens=False)

-            if text:
-                logger.info(f"✅ GGUF generation: {len(text)} chars")
-                return text
+            # Extract assistant response
+            # Look for the last assistant header and get text after it
+            assistant_marker = "<|start_header_id|>assistant<|end_header_id|>"
+            if assistant_marker in full_response:
+                response = full_response.split(assistant_marker)[-1]
             else:
-                logger.warning("⚠️ GGUF returned empty response")
-                return None
+                response = full_response
+
+            # Clean up special tokens
+            for token in ["<|eot_id|>", "<|end_of_text|>", "<|begin_of_text|>",
+                          "<|start_header_id|>", "<|end_header_id|>"]:
+                response = response.replace(token, "")
+
+            response = response.strip()

+            if response:
+                logger.info(f"✅ Generation successful: {len(response)} chars")
+                return response
+            else:
+                logger.warning("⚠️ Empty response generated")
+                return None
+
         except Exception as e:
-            logger.error(f"❌ GGUF generation error: {str(e)}")
+            logger.error(f"❌ Generation error: {e}")
             return None

     def translate(self, text: str, target_language: str) -> Optional[str]:
-        """Translate text using GGUF model."""
-        if not LLAMA_CPP_AVAILABLE:
-            return None
-
+        """Translate text to target language."""
         if target_language == "en" or not text:
             return text

         lang_name = LANGUAGE_NAMES.get(target_language, target_language)
-        prompt = "Translate to " + lang_name + ": " + text
+
+        prompt = f"Translate the following text to {lang_name}. Only provide the translation, nothing else.\n\nText: {text}"
+
+        system_prompt = f"You are a professional translator. Translate text accurately to {lang_name}. Only output the translation."

         result = self.generate(
-            prompt,
-            max_tokens=len(text) * 4,
-            temperature=0.3
+            prompt=prompt,
+            system_prompt=system_prompt,
+            max_new_tokens=len(text) * 4,
+            temperature=0.3,
+            repetition_penalty=1.1
         )

         if result:
             result = result.strip()
-            for prefix in [lang_name + ":", "Translation:", "In " + lang_name + ":"]:
+            # Clean up common prefixes
+            prefixes_to_remove = [
+                f"{lang_name}:",
+                f"{lang_name} translation:",
+                "Translation:",
+                "Here is the translation:",
+                "The translation is:",
+            ]
+            for prefix in prefixes_to_remove:
                 if result.lower().startswith(prefix.lower()):
                     result = result[len(prefix):].strip()
             return result
@@ -473,9 +342,6 @@ class LocalGGUFModel:

     def chat_response(self, message: str, context: Dict, language: str = "en") -> Optional[str]:
         """Generate chat response with diagnosis context."""
-        if not LLAMA_CPP_AVAILABLE:
-            return None
-
         crop = context.get("crop_type", "crop").capitalize()
         disease = context.get("disease_name", "unknown disease")
         severity = context.get("severity_level", "unknown")
@@ -483,27 +349,38 @@ class LocalGGUFModel:
         if confidence <= 1:
             confidence = int(confidence * 100)

+        # Language instruction
         lang_instructions = {
-            "ha": "Respond in Hausa language.",
-            "yo": "Respond in Yoruba language.",
-            "ig": "Respond in Igbo language."
+            "en": "Respond in English.",
+            "ha": "Respond in Hausa language (Yaren Hausa).",
+            "yo": "Respond in Yoruba language (Èdè Yorùbá).",
+            "ig": "Respond in Igbo language (Asụsụ Igbo)."
         }
         lang_instruction = lang_instructions.get(language, "Respond in English.")

+        system_prompt = (
+            "You are FarmEyes, an AI assistant helping African farmers with crop diseases. "
+            "You provide practical, helpful advice about crop diseases and farming. "
+            f"{lang_instruction}"
+        )
+
         prompt = (
-            "You are FarmEyes, an AI assistant helping African farmers with crop diseases.\n\n"
-            "Current diagnosis:\n"
-            "- Crop: " + crop + "\n"
-            "- Disease: " + disease + "\n"
-            "- Severity: " + severity + "\n"
-            "- Confidence: " + str(confidence) + "%\n\n"
-            + lang_instruction + "\n\n"
-            "Farmer's question: " + message + "\n\n"
-            "Provide a helpful, practical response about this disease or related farming advice. "
-            "Keep it concise (2-3 paragraphs max)."
+            f"Current diagnosis information:\n"
+            f"- Crop: {crop}\n"
+            f"- Disease: {disease}\n"
+            f"- Severity: {severity}\n"
+            f"- Confidence: {confidence}%\n\n"
+            f"Farmer's question: {message}\n\n"
+            f"Provide a helpful, practical response about this disease or related farming advice. "
+            f"Keep your response concise (2-3 paragraphs maximum)."
+        )
+
+        return self.generate(
+            prompt=prompt,
+            system_prompt=system_prompt,
+            max_new_tokens=500,
+            temperature=0.7
         )
-
-        return self.generate(prompt, max_tokens=400, temperature=0.7)


 # =============================================================================
@@ -512,162 +389,117 @@ class LocalGGUFModel:

 class NATLaSModel:
     """
-    HYBRID N-ATLaS model.
-
-    Strategy:
-    1. Try HuggingFace Inference API first (if token available)
-    2. Fall back to local GGUF model (if llama-cpp-python installed)
+    N-ATLaS model wrapper.

-    On HuggingFace Spaces (without llama-cpp-python):
-    - Only HuggingFace API is used
-    - Make sure HF_TOKEN secret is set!
+    Uses the official NCAIR1/N-ATLaS model via HuggingFace Transformers.
+    This is the recommended way to use N-ATLaS.
     """

     def __init__(
         self,
-        api_token: Optional[str] = None,
-        prefer_api: bool = True,
-        auto_load_local: bool = True,
-        **local_kwargs
+        api_token: Optional[str] = None,  # Kept for compatibility
+        auto_load: bool = True,
+        **kwargs
     ):
-        self.prefer_api = prefer_api
-
-        # Initialize API client (PRIMARY)
-        self.api_client = HuggingFaceAPIClient(api_token)
+        # Get HF token from environment
+        self.hf_token = api_token or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
+
+        if self.hf_token:
+            logger.info("✅ HuggingFace token found")
+            # Set token for huggingface_hub
+            try:
+                from huggingface_hub import login
+                login(token=self.hf_token, add_to_git_credential=False)
+            except Exception as e:
+                logger.warning(f"Could not set HF token: {e}")
+        else:
+            logger.warning("⚠️ No HF_TOKEN found - model access may fail")

-        # Initialize GGUF model (FALLBACK - optional)
-        self.local_model = LocalGGUFModel(**local_kwargs)
+        # Initialize the transformers model
+        self.model = NATLaSTransformersModel(load_on_init=auto_load)

         # Translation cache
         self._cache: Dict[str, str] = {}

-        # Only try to load GGUF if llama-cpp-python is available
-        if auto_load_local and LLAMA_CPP_AVAILABLE:
-            logger.info("🔄 Pre-loading GGUF model for fallback...")
-            self.local_model.load_model()
-        elif not LLAMA_CPP_AVAILABLE:
-            logger.info("ℹ️ GGUF fallback disabled - using API only")
-
         logger.info("=" * 60)
-        logger.info("✅ NATLaSModel (Hybrid) initialized")
-        logger.info(f"   API token: {'Yes' if self.api_client.api_token else 'No'}")
-        logger.info(f"   GGUF available: {'Yes' if LLAMA_CPP_AVAILABLE else 'No'}")
-        logger.info(f"   GGUF loaded: {'Yes' if self.local_model.is_loaded else 'No'}")
-        logger.info(f"   GPU available: {'Yes' if HAS_GPU else 'No'}")
+        logger.info("✅ NATLaSModel initialized")
+        logger.info(f"   Model loaded: {'Yes' if self.model.is_loaded else 'No'}")
+        logger.info(f"   GPU available: {'Yes - ' + GPU_NAME if HAS_GPU else 'No'}")
+        logger.info(f"   HF Token: {'Yes' if self.hf_token else 'No'}")
         logger.info(f"   Running on: {'HuggingFace Spaces' if IS_HF_SPACES else 'Local'}")
         logger.info("=" * 60)

     @property
     def is_loaded(self) -> bool:
-        return self.api_client.api_token is not None or self.local_model.is_loaded
+        return self.model.is_loaded

     def load_model(self) -> bool:
-        if self.api_client.api_token:
-            return True
-        if LLAMA_CPP_AVAILABLE:
-            return self.local_model.load_model()
-        return False
+        return self.model.load_model()

     def translate(self, text: str, target_language: str, use_cache: bool = True) -> str:
-        """
-        Translate text using hybrid approach.
-        1. Try API first
-        2. Fall back to GGUF (if available)
-        """
+        """Translate text to target language."""
         if target_language == "en" or not text or not text.strip():
             return text

         # Check cache
-        cache_key = target_language + ":" + str(hash(text))
+        cache_key = f"{target_language}:{hash(text)}"
         if use_cache and cache_key in self._cache:
+            logger.info("📦 Using cached translation")
             return self._cache[cache_key]

-        result = None
-
-        # Try API first if preferred and available
-        if self.prefer_api and self.api_client.api_token:
-            logger.info("📡 Trying API translation...")
-            result = self.api_client.translate(text, target_language)
-            if result:
-                logger.info("✅ API translation successful")
-
-        # Fall back to GGUF (only if available)
-        if result is None and LLAMA_CPP_AVAILABLE:
-            logger.info("🔄 Using GGUF for translation (fallback)...")
-            result = self.local_model.translate(text, target_language)
-
-        # If still no result, return original text
-        if result is None:
-            logger.warning("⚠️ Translation failed - returning original text")
-            return text
-
-        # Cache and return
-        if result and result != text and use_cache:
-            self._cache[cache_key] = result
-            if len(self._cache) > 500:
-                keys = list(self._cache.keys())[:100]
-                for k in keys:
-                    del self._cache[k]
+        logger.info(f"🌍 Translating to {LANGUAGE_NAMES.get(target_language, target_language)}...")
+        result = self.model.translate(text, target_language)
+
+        if result and result != text:
+            # Cache the result
+            if use_cache:
+                self._cache[cache_key] = result
+                # Limit cache size
+                if len(self._cache) > 500:
+                    keys = list(self._cache.keys())[:100]
+                    for k in keys:
+                        del self._cache[k]
+            logger.info("✅ Translation successful")
+            return result

-        return result if result else text
+        logger.warning("⚠️ Translation failed - returning original")
+        return text

     def translate_batch(self, texts: List[str], target_language: str) -> List[str]:
         """Translate multiple texts."""
         return [self.translate(text, target_language) for text in texts]

     def generate(self, prompt: str, max_tokens: int = 512, temperature: float = 0.7, **kwargs) -> str:
-        """
-        Generate text using hybrid approach.
-        1. Try API first
-        2. Fall back to GGUF (if available)
-        """
-        result = None
-
-        # Try API first if preferred and available
-        if self.prefer_api and self.api_client.api_token:
-            logger.info("📡 Trying API generation...")
-            result = self.api_client.generate(prompt, max_tokens, temperature)
-            if result:
-                logger.info("✅ API generation successful")
-
-        # Fall back to GGUF (only if available)
-        if result is None and LLAMA_CPP_AVAILABLE:
-            logger.info("🔄 Using GGUF for generation (fallback)...")
-            result = self.local_model.generate(prompt, max_tokens, temperature)
-
+        """Generate text."""
+        result = self.model.generate(
+            prompt=prompt,
+            max_new_tokens=max_tokens,
+            temperature=temperature
+        )
         return result if result else ""

     def chat_response(self, message: str, context: Dict, language: str = "en") -> str:
         """Generate chat response with context."""
-        result = None
-
-        # Try GGUF first for better context handling (if available)
-        if LLAMA_CPP_AVAILABLE and self.local_model.is_loaded:
-            result = self.local_model.chat_response(message, context, language)
-
-        # Fall back to API
-        if result is None and self.api_client.api_token:
-            result = self.api_client.chat_response(message, context, language)
-
-        return result if result else "I'm sorry, I couldn't generate a response. Please try again."
+        result = self.model.chat_response(message, context, language)
+        if result:
+            return result
+        return "I'm sorry, I couldn't generate a response. Please try again."

     def load_local_model(self) -> bool:
-        if LLAMA_CPP_AVAILABLE:
-            return self.local_model.load_model()
-        return False
+        """Compatibility method."""
+        return self.load_model()

     def unload_local_model(self):
-        if LLAMA_CPP_AVAILABLE:
-            self.local_model.unload_model()
+        """Unload model."""
+        self.model.unload_model()

     def get_status(self) -> Dict:
         return {
-            "api_available": self.api_client.is_available() if self.api_client.api_token else False,
-            "api_token_set": bool(self.api_client.api_token),
-            "llama_cpp_available": LLAMA_CPP_AVAILABLE,
-            "local_model_loaded": self.local_model.is_loaded,
+            "model_loaded": self.model.is_loaded,
+            "model_id": self.model.model_id,
             "gpu_available": HAS_GPU,
-            "prefer_api": self.prefer_api,
+            "gpu_name": GPU_NAME if HAS_GPU else None,
+            "hf_token_set": bool(self.hf_token),
             "cache_size": len(self._cache),
             "running_on": "HuggingFace Spaces" if IS_HF_SPACES else "Local"
         }
@@ -694,8 +526,7 @@ def get_natlas_model(
     if _model_instance is None:
         _model_instance = NATLaSModel(
             api_token=api_token,
-            prefer_api=True,
-            auto_load_local=auto_load_local,
+            auto_load=auto_load_local,
             **kwargs
         )
requirements.txt CHANGED
@@ -1,6 +1,7 @@
 # =============================================================================
-# FarmEyes - Requirements (GPU Version)
+# FarmEyes - Requirements (Transformers Version)
 # =============================================================================
+# NO llama-cpp-python needed! Uses official N-ATLaS via transformers.
 # Note: torch/torchvision already in base Docker image
 # =============================================================================

@@ -9,24 +10,29 @@ fastapi>=0.104.0
 uvicorn[standard]>=0.24.0
 python-multipart>=0.0.6

-# AI/ML (torch already in base image)
-ultralytics>=8.0.0
+# AI/ML - Transformers (for N-ATLaS)
 transformers>=4.35.0
+accelerate>=0.25.0
 huggingface-hub>=0.19.0

-# Audio
+# AI/ML - Vision (for YOLOv11)
+ultralytics>=8.0.0
+
+# Audio Processing (for Whisper)
 openai-whisper>=20231117
 soundfile>=0.12.0

-# Image
+# Image Processing
 Pillow>=10.0.0
 opencv-python-headless>=4.8.0

 # HTTP
 requests>=2.31.0

-# Utils
+# Utilities
 numpy>=1.24.0
 scipy>=1.11.0
 pydantic>=2.0.0
 python-dotenv>=1.0.0
+sentencepiece>=0.1.99
+protobuf>=3.20.0
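The new pins matter for the transformers path: accelerate backs the device_map="auto" placement used in models/natlas_model.py, while sentencepiece and protobuf cover tokenizer dependencies that Llama-style checkpoints commonly need. A minimal load sketch these requirements support (mirroring the module's load_model, shortened):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "NCAIR1/N-ATLaS"  # gated model - accept the license on the Hub first

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,  # matches the fp16 default in natlas_model.py
    device_map="auto",          # needs accelerate>=0.25.0
    trust_remote_code=True,
)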