Solarum Asteridion committed on
Commit 5a1081c · verified · 1 Parent(s): b2d02e9

Update app.py

Files changed (1)
  1. app.py +26 -41
app.py CHANGED
@@ -9,7 +9,6 @@ import psutil
 import os
 from huggingface_hub import login, hf_api
 from typing import List, Dict, Optional
-import numpy as np
 from threading import Lock
 
 class MemoryTracker:
@@ -41,12 +40,12 @@ def setup_huggingface_auth():
 
 class ModelConfig:
     DEFAULT_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
-    SMALLER_MODEL = "Qwen/Qwen2.5-0.5B-Instruct" # Fallback for low-resource systems
-    MAX_LENGTH_CPU = 384
+    SMALLER_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
+    MAX_LENGTH_CPU = 256
     MAX_LENGTH_GPU = 512
     BATCH_SIZE = 1
-    CPU_THREADS = max(1, os.cpu_count() - 1) # Leave one core free for system
-
+    CPU_THREADS = max(1, os.cpu_count() - 1)
+
 class CacheManager:
     def __init__(self, max_size: int = 100):
         self.cache = {}
@@ -60,7 +59,6 @@ class CacheManager:
     def set(self, key: str, value: str):
         with self.lock:
             if len(self.cache) >= self.max_size:
-                # Remove oldest entry
                 self.cache.pop(next(iter(self.cache)))
             self.cache[key] = value
 
@@ -74,27 +72,27 @@ class LocalLLMHandler:
         torch.set_num_threads(ModelConfig.CPU_THREADS)
 
     def optimize_model_settings(self):
-        """Apply various optimizations based on available resources"""
+        """Apply safe optimizations based on available resources"""
         total_memory = psutil.virtual_memory().total / (1024 ** 3) # GB
         logger.info(f"Total system memory: {total_memory:.2f} GB")
 
         if total_memory < 8: # Less than 8GB RAM
             return {
                 "model_name": ModelConfig.SMALLER_MODEL,
-                "use_half_precision": False,
+                "use_float16": False,
                 "max_length": ModelConfig.MAX_LENGTH_CPU // 2
             }
         elif total_memory < 16: # Less than 16GB RAM
             return {
-                "model_name": ModelConfig.DEFAULT_MODEL,
-                "use_half_precision": True,
+                "model_name": ModelConfig.SMALLER_MODEL,
+                "use_float16": False,
                 "max_length": ModelConfig.MAX_LENGTH_CPU
             }
         else: # 16GB+ RAM
             return {
                 "model_name": ModelConfig.DEFAULT_MODEL,
-                "use_half_precision": True,
-                "max_length": ModelConfig.MAX_LENGTH_CPU * 2
+                "use_float16": False,
+                "max_length": ModelConfig.MAX_LENGTH_CPU
             }
 
     def load_model(self, model_name: Optional[str] = None):
@@ -109,7 +107,7 @@ class LocalLLMHandler:
             logger.info(f"Loading model: {model_name}")
             logger.info(f"Current memory usage: {self.memory_tracker.get_memory_usage()}")
 
-            # Initialize tokenizer first to save memory
+            # Load tokenizer with safe settings
            self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 model_max_length=settings["max_length"],
@@ -117,41 +115,32 @@ class LocalLLMHandler:
                 truncation=True
             )
 
-            # Configure model loading
+            # Basic model loading configuration
             model_kwargs = {
-                "device_map": "auto",
                 "low_cpu_mem_usage": True,
             }
 
             if torch.cuda.is_available():
                 logger.info("CUDA available - using GPU configuration")
                 model_kwargs.update({
-                    "torch_dtype": torch.float16,
+                    "device_map": "auto",
+                    "torch_dtype": torch.float16 if settings["use_float16"] else torch.float32
                 })
             else:
-                logger.info("Running in CPU-only mode with optimizations")
-                if settings["use_half_precision"]:
-                    model_kwargs.update({"torch_dtype": torch.float16})
-
-                # Load config first to modify architecture if needed
-                config = AutoConfig.from_pretrained(model_name)
-                config.num_attention_heads = min(config.num_attention_heads, 8)
-                model_kwargs["config"] = config
+                logger.info("Running in CPU-only mode with safe optimizations")
+                model_kwargs.update({
+                    "device_map": "cpu",
+                    "torch_dtype": torch.float32 # Use float32 for CPU stability
+                })
 
-            # Load the model with optimizations
+            # Load the model without trying to modify its architecture
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 **model_kwargs
             )
 
-            if not torch.cuda.is_available():
-                # Additional CPU optimizations
-                self.model.eval() # Set to evaluation mode
-                with torch.no_grad():
-                    # Pre-compile common operations
-                    self.model = torch.jit.optimize_for_inference(
-                        torch.jit.script(self.model)
-                    )
+            # Set to eval mode for inference
+            self.model.eval()
 
             logger.info(f"Model loaded successfully on {self.model.device}")
             logger.info(f"Final memory usage: {self.memory_tracker.get_memory_usage()}")
@@ -162,18 +151,17 @@ class LocalLLMHandler:
             return f"Error loading model: {e}"
 
     def generate_response(self, prompt: str, max_length: Optional[int] = None) -> str:
-        # Check cache first
         cache_key = f"{prompt[:100]}_{max_length}"
         cached_response = self.cache_manager.get(cache_key)
         if cached_response:
             return cached_response
 
         try:
-            with self.generation_lock: # Ensure thread-safe generation
+            with self.generation_lock:
                 settings = self.optimize_model_settings()
                 max_length = max_length or settings["max_length"]
 
-                # Efficient tokenization
+                # Tokenize input
                 inputs = self.tokenizer(
                     prompt,
                     return_tensors="pt",
@@ -182,14 +170,13 @@ class LocalLLMHandler:
                     max_length=max_length
                 ).to(self.model.device)
 
-                # Optimize generation parameters for CPU
+                # Safe generation parameters
                 generation_config = {
                     "max_length": max_length,
                     "num_return_sequences": 1,
                     "temperature": 0.7,
                     "do_sample": True,
                     "pad_token_id": self.tokenizer.eos_token_id,
-                    "num_beams": 1, # Disable beam search for CPU
                     "early_stopping": True,
                     "no_repeat_ngram_size": 3,
                     "length_penalty": 1.0,
@@ -201,10 +188,9 @@
                         "temperature": 0.8,
                         "top_k": 40,
                         "top_p": 0.9,
-                        "repetition_penalty": 1.2
                     })
 
-                with torch.no_grad(): # Disable gradient computation
+                with torch.no_grad():
                     outputs = self.model.generate(
                         inputs["input_ids"],
                         **generation_config
@@ -216,7 +202,6 @@
                     clean_up_tokenization_spaces=True
                 )
 
-                # Cache the response
                 self.cache_manager.set(cache_key, response)
                 return response
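
For context, a minimal standalone sketch (not part of this commit) of the "safe CPU" path the updated load_model() and generate_response() settle on: float32 weights, an explicit CPU device map, low_cpu_mem_usage, eval() before generation, and sampling under torch.no_grad(). The model name and lengths mirror ModelConfig above; everything else is an illustrative assumption rather than the app's exact code.

# Illustrative sketch only -- mirrors the CPU-only configuration introduced in this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # ModelConfig.DEFAULT_MODEL above

tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=256)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    device_map="cpu",            # pin to CPU, as in the updated else-branch
    torch_dtype=torch.float32,   # float32 on CPU for stability
)
model.eval()  # plain eval mode, replacing the removed torch.jit scripting

inputs = tokenizer("Hello!", return_tensors="pt", truncation=True, max_length=256)
with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        max_length=256,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))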