jeanbaptdzd
/

dragon-3b-inference

Model card Files Files and versions

xet

Community

jeanbaptdzd committed on Oct 14, 2025

Commit

cf73c6e

verified ·

1 Parent(s): 0aef7e8

Upload app_config_hf.py with huggingface_hub

Browse files

Files changed (1) hide show

app_config_hf.py +340 -0

app_config_hf.py ADDED Viewed

	@@ -0,0 +1,340 @@

+"""
+HuggingFace Spaces compatible configuration for Dragon-3B model
+No Pydantic dependencies - pure Python dicts
+"""
+import os
+import torch
+import gc
+import logging
+from typing import Dict, Any, Optional
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from huggingface_hub import login
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Global variables for model and tokenizer.
+# They are assigned by load_linguacustodia_model() and reset to these
+# initial values by cleanup_model_memory().
+model: Optional[Any] = None  # result of AutoModelForCausalLM.from_pretrained()
+tokenizer: Optional[Any] = None  # result of AutoTokenizer.from_pretrained()
+pipe: Optional[Any] = None  # "text-generation" pipeline wrapping model + tokenizer
+model_loaded: bool = False  # True only after a load completed without raising
+current_model_name: Optional[str] = None  # display name of the loaded model
+# Updated Dragon configuration based on latest model
+# Performance optimizations enabled:
+# - flash-attn: Memory-efficient attention computation
+# - flash-linear-attention: Gated DeltaNet Triton kernels
+# - causal-conv1d: Short convolution for Gated DeltaNet layer
+# - attn_implementation="flash_attention_2": Uses flash attention when available
+#
+# Static configuration kept as nested plain dicts. get_model_config() returns
+# this very object (not a copy), so a mutation made by one caller is visible
+# to every other caller.
+DRAGON_CONFIG = {
+    # Hub repository id handed to from_pretrained() by load_linguacustodia_model().
+    "model_id": "DragonLLM/Dragon-3B-Base-alpha",
+    # Human-readable name; stored in current_model_name and used in log messages.
+    "display_name": "Dragon-3B-Base-alpha",
+    "architecture": "DragonForCausalLM",
+    # NOTE(review): nothing in the visible code reads this section. All four
+    # *_token_id values are 0 — confirm against the tokenizer shipped with the model.
+    "tokenizer": {
+        "eos_token": "<|endoftext|>",
+        "bos_token": "<|beginoftext|>",
+        "pad_token": "<|pad|>",
+        "unk_token": "<|unk|>",
+        "eos_token_id": 0,
+        "bos_token_id": 0,
+        "pad_token_id": 0,
+        "eot_token_id": 0,
+        "vocab_size": 196736,
+        "model_max_length": 8192
+    },
+    # Of these, the visible code applies only eos_tokens, early_stopping and
+    # min_length (copied onto model.generation_config in run_inference) and
+    # echoes repetition_penalty in the returned generation_params. The other
+    # keys are not read in this file.
+    "generation": {
+        "eos_tokens": [0],
+        "bos_token_id": 0,
+        "temperature": 0.6,
+        "top_p": 0.9,
+        "max_new_tokens": 150,
+        "repetition_penalty": 1.05,
+        "no_repeat_ngram_size": 2,
+        "early_stopping": False,
+        "min_length": 50,
+        "do_sample": True,
+        "use_cache": True,
+        "pad_token_id": 0
+    }
+}
+def get_app_settings() -> Dict[str, Any]:
+    """Get application settings - simple dict."""
+    return {
+        "model_name": "dragon-3b-base-alpha",
+        "hf_token_dragon": os.getenv("HF_TOKEN_DRAGON"),
+        "debug": False
+    }
+def get_model_config(model_name: str) -> Dict[str, Any]:
+    """Get model configuration - simple dict."""
+    return DRAGON_CONFIG
+def cleanup_model_memory():
+    """Clean up model memory."""
+    global model, tokenizer, pipe, model_loaded, current_model_name
+    if model is not None:
+        del model
+        model = None
+    if tokenizer is not None:
+        del tokenizer
+        tokenizer = None
+    if pipe is not None:
+        del pipe
+        pipe = None
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    model_loaded = False
+    current_model_name = None
+    logger.info("✅ Model memory cleaned")
+def load_linguacustodia_model() -> bool:
+    """Load the Dragon model."""
+    global model, tokenizer, pipe, model_loaded, current_model_name
+    if model_loaded and model is not None:
+        logger.info(f"✅ Model '{current_model_name}' already loaded")
+        return True
+    settings = get_app_settings()
+    hf_token_dragon = settings["hf_token_dragon"]
+    model_config = get_model_config(settings["model_name"])
+    model_id = model_config["model_id"]
+    if not hf_token_dragon:
+        logger.error("❌ HF_TOKEN_DRAGON not found in environment")
+        return False
+    try:
+        logger.info(f"🐉 Initializing {model_config['display_name']} model...")
+        login(token=hf_token_dragon, add_to_git_credential=False)
+        logger.info("✅ Authenticated with HuggingFace")
+        logger.info(f"🚀 Loading {model_id} with CUDA support...")
+        # Determine device and dtype for CUDA - use bfloat16 as per model config
+        if torch.cuda.is_available():
+            torch_dtype = torch.bfloat16  # Model config specifies bfloat16
+            device_map = "auto"  # Let accelerate handle device placement
+            logger.info(f"⚡ Using CUDA with {torch.cuda.get_device_name(0)} and bfloat16")
+        else:
+            torch_dtype = torch.float32
+            device_map = None  # Use CPU
+            logger.warning("⚠️ CUDA not available, falling back to CPU with float32")
+        # Check if HF_HOME is set for caching
+        hf_home = os.getenv("HF_HOME")
+        if hf_home:
+            logger.info(f"📁 Using HF_HOME cache: {hf_home}")
+        else:
+            logger.info("📁 Using default HF cache location")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            token=hf_token_dragon,
+            trust_remote_code=True,
+            cache_dir=hf_home if hf_home else None
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            token=hf_token_dragon,
+            dtype=torch_dtype,  # Use dtype instead of torch_dtype
+            device_map=device_map,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            cache_dir=hf_home if hf_home else None,
+            attn_implementation="flash_attention_2" if torch.cuda.is_available() else None  # Use flash attention when available
+        )
+        # Create pipeline with proper device handling
+        if device_map == "auto":
+            # When using device_map="auto", don't specify device in pipeline
+            pipe = pipeline(
+                "text-generation",
+                model=model,
+                tokenizer=tokenizer,
+                dtype=torch_dtype  # Use dtype instead of torch_dtype
+            )
+        else:
+            # For CPU, specify device explicitly
+            pipe = pipeline(
+                "text-generation",
+                model=model,
+                tokenizer=tokenizer,
+                dtype=torch_dtype,  # Use dtype instead of torch_dtype
+                device=-1  # CPU
+            )
+        model_loaded = True
+        current_model_name = model_config["display_name"]
+        device_name = "CUDA" if torch.cuda.is_available() else "CPU"
+        logger.info(f"✅ Dragon model loaded successfully with {device_name}!")
+        return True
+    except Exception as e:
+        logger.error(f"❌ Failed to load model: {e}")
+        cleanup_model_memory()
+        return False
+def run_inference(prompt: str, max_new_tokens: int = 150, temperature: float = 0.6) -> Dict[str, Any]:
+    """Run inference with the loaded model."""
+    global pipe, model, tokenizer, model_loaded, current_model_name
+    if not model_loaded or pipe is None or tokenizer is None:
+        raise RuntimeError("Model not loaded")
+    try:
+        logger.info(f"🧪 Generating inference for: '{prompt[:50]}...'")
+        pipe.max_new_tokens = max_new_tokens
+        pipe.temperature = temperature
+        if hasattr(model, 'generation_config'):
+            settings = get_app_settings()
+            model_config = get_model_config(settings["model_name"])
+            model.generation_config.eos_token_id = model_config["generation"]["eos_tokens"]
+            model.generation_config.early_stopping = model_config["generation"]["early_stopping"]
+            model.generation_config.min_length = model_config["generation"]["min_length"]
+            logger.info(f"🔧 Using model-specific EOS tokens: {model_config['generation']['eos_tokens']}")
+            logger.info("🔧 Applied anti-truncation measures")
+        # Tokenize input to get proper length for attention mask
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
+        input_length = inputs['input_ids'].shape[1]
+        # Ensure inputs are on the same device and dtype as the model
+        if hasattr(model, 'device'):
+            inputs = {k: v.to(model.device) for k, v in inputs.items()}
+        # Ensure model is in eval mode
+        model.eval()
+        # Generate with proper attention mask handling
+        result = pipe(
+            prompt,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            return_full_text=False,
+            use_cache=False,
+            truncation=True,
+            max_length=input_length + max_new_tokens,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id
+        )
+        if result and len(result) > 0:
+            response_text = result[0]['generated_text']
+            tokens_generated = len(tokenizer.encode(response_text))
+        else:
+            raise RuntimeError("No response generated")
+        settings = get_app_settings()
+        model_config = get_model_config(settings["model_name"])
+        generation_params = {
+            "max_new_tokens": max_new_tokens,
+            "temperature": temperature,
+            "eos_token_id": model_config["generation"]["eos_tokens"],
+            "early_stopping": model_config["generation"]["early_stopping"],
+            "min_length": model_config["generation"]["min_length"],
+            "repetition_penalty": model_config["generation"]["repetition_penalty"],
+            "respectful_approach": True,
+            "storage_enabled": True,
+            "model_specific_config": True
+        }
+        logger.info(f"✅ Generated {tokens_generated} tokens with RESPECTFUL official config")
+        return {
+            "response": response_text,
+            "model_used": current_model_name,
+            "success": True,
+            "tokens_generated": tokens_generated,
+            "generation_params": generation_params
+        }
+    except Exception as e:
+        logger.error(f"❌ Inference error: {e}")
+        # If it's a block mask error, try with different parameters
+        if "block_mask" in str(e):
+            logger.warning("🔧 Block mask error detected, trying with adjusted parameters...")
+            try:
+                # Retry with shorter sequence and no cache
+                result = pipe(
+                    prompt,
+                    max_new_tokens=min(max_new_tokens, 100),
+                    temperature=temperature,
+                    return_full_text=False,
+                    use_cache=False,
+                    truncation=True,
+                    max_length=1024
+                )
+                if result and len(result) > 0:
+                    response_text = result[0]['generated_text']
+                    tokens_generated = len(tokenizer.encode(response_text))
+                    logger.info(f"✅ Generated {tokens_generated} tokens (retry)")
+                    return {
+                        "response": response_text,
+                        "model_used": current_model_name,
+                        "success": True,
+                        "tokens_generated": tokens_generated,
+                        "generation_params": {"retry": True, "reason": "block_mask_fix"}
+                    }
+            except Exception as retry_error:
+                logger.error(f"❌ Retry inference error: {retry_error}")
+        return {
+            "response": "",
+            "model_used": current_model_name,
+            "success": False,
+            "tokens_generated": 0,
+            "generation_params": {},
+            "error": str(e)
+        }
+def get_gpu_memory_info() -> Dict[str, Any]:
+    """Get detailed GPU memory usage."""
+    if not torch.cuda.is_available():
+        return {"gpu_available": False}
+    try:
+        # Get current GPU device
+        device = torch.cuda.current_device()
+        gpu_name = torch.cuda.get_device_name(device)
+        # Get total memory
+        total_memory = torch.cuda.get_device_properties(device).total_memory
+        total_memory_gb = total_memory / (1024**3)
+        # Get allocated and reserved memory
+        allocated_memory = torch.cuda.memory_allocated(device)
+        reserved_memory = torch.cuda.memory_reserved(device)
+        allocated_memory_gb = allocated_memory / (1024**3)
+        reserved_memory_gb = reserved_memory / (1024**3)
+        # Calculate free memory (approximate)
+        free_memory_gb = total_memory_gb - allocated_memory_gb
+        return {
+            "gpu_available": True,
+            "gpu_name": gpu_name,
+            "gpu_memory_total": f"{total_memory_gb:.2f} GB",
+            "gpu_memory_allocated": f"{allocated_memory_gb:.2f} GB",
+            "gpu_memory_reserved": f"{reserved_memory_gb:.2f} GB",
+            "gpu_memory_free": f"{free_memory_gb:.2f} GB"
+        }
+    except Exception as e:
+        logger.error(f"Error getting GPU memory info: {e}")
+        return {"gpu_available": False, "error": str(e)}