Spaces:

BoostedJonP
/

powell-assistant

Sleeping

BoostedJonP committed on Oct 8, 2025

Commit

9c0c216

1 Parent(s): 6f78921

auto config

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
 from functools import lru_cache
 import logging
@@ -20,6 +20,12 @@ def load_model():
     logger.info(f"Loading model: {MODEL_NAME}")
     try:
         tokenizer = AutoTokenizer.from_pretrained(
             MODEL_NAME,
             trust_remote_code=True,
@@ -31,6 +37,7 @@ def load_model():
             logger.info("CUDA available, loading with GPU optimizations")
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_NAME,
                 trust_remote_code=True,
                 torch_dtype=torch.float16,
                 device_map="auto",
@@ -40,8 +47,12 @@ def load_model():
             )
         else:
             logger.info("CUDA not available, loading with CPU optimizations")
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_NAME,
                 trust_remote_code=True,
                 torch_dtype=torch.float32,
                 attn_implementation="eager",

 import gradio as gr
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
 from functools import lru_cache
 import logging
     logger.info(f"Loading model: {MODEL_NAME}")
     try:
+        config = AutoConfig.from_pretrained(
+            MODEL_NAME,
+            trust_remote_code=True,
+            cache_dir="/tmp/model_cache",
+        )
         tokenizer = AutoTokenizer.from_pretrained(
             MODEL_NAME,
             trust_remote_code=True,
             logger.info("CUDA available, loading with GPU optimizations")
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_NAME,
+                config=config,
                 trust_remote_code=True,
                 torch_dtype=torch.float16,
                 device_map="auto",
             )
         else:
             logger.info("CUDA not available, loading with CPU optimizations")
+            if getattr(config, "quantization_config", None) is not None:
+                logger.info("Disabling quantization settings for CPU execution")
+                config.quantization_config = None
             model = AutoModelForCausalLM.from_pretrained(
                 MODEL_NAME,
+                config=config,
                 trust_remote_code=True,
                 torch_dtype=torch.float32,
                 attn_implementation="eager",