BoostedJonP committed on
Commit
8b45180
·
1 Parent(s): c58fada

fixed cpu running issue

Browse files
Files changed (1) hide show
  1. app.py +42 -11
app.py CHANGED
@@ -25,15 +25,32 @@ def load_model():
25
  trust_remote_code=True,
26
  cache_dir="/tmp/model_cache",
27
  )
28
- model = AutoModelForCausalLM.from_pretrained(
29
- MODEL_NAME,
30
- trust_remote_code=True,
31
- torch_dtype=torch.float16,
32
- device_map="auto",
33
- attn_implementation="eager",
34
- use_cache=True,
35
- cache_dir="/tmp/model_cache",
36
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  logger.info("Model loaded successfully!")
38
  except Exception as e:
39
  logger.error(f"Error loading model: {e}")
@@ -42,7 +59,14 @@ def load_model():
42
  model.generation_config.use_cache = True
43
  model.generation_config.pad_token_id = tokenizer.eos_token_id
44
 
45
- model = torch.compile(model, mode="reduce-overhead")
 
 
 
 
 
 
 
46
  return model, tokenizer
47
 
48
 
@@ -60,6 +84,10 @@ def generate_powell_response(question, max_length=256, num_beams=3, temperature=
60
  "Please ask a question about monetary policy, economics, or Federal Reserve operations."
61
  )
62
 
 
 
 
 
63
  system_prompt = """You are Jerome Powell, the Chairman of the Federal Reserve."""
64
 
65
  prompt = f"System: {system_prompt}\n\nQuestion: {question.strip()}\nAnswer:"
@@ -73,8 +101,11 @@ def generate_powell_response(question, max_length=256, num_beams=3, temperature=
73
  padding=False,
74
  )
75
 
76
- if torch.cuda.is_available():
 
77
  inputs = {k: v.cuda() for k, v in inputs.items()}
 
 
78
 
79
  with torch.no_grad():
80
  generation_config = {
 
25
  trust_remote_code=True,
26
  cache_dir="/tmp/model_cache",
27
  )
28
+
29
+ # Check if CUDA is available, otherwise use CPU-friendly settings
30
+ if torch.cuda.is_available():
31
+ logger.info("CUDA available, loading with GPU optimizations")
32
+ model = AutoModelForCausalLM.from_pretrained(
33
+ MODEL_NAME,
34
+ trust_remote_code=True,
35
+ torch_dtype=torch.float16,
36
+ device_map="auto",
37
+ attn_implementation="eager",
38
+ use_cache=True,
39
+ cache_dir="/tmp/model_cache",
40
+ )
41
+ else:
42
+ logger.info("CUDA not available, loading with CPU optimizations")
43
+ model = AutoModelForCausalLM.from_pretrained(
44
+ MODEL_NAME,
45
+ trust_remote_code=True,
46
+ torch_dtype=torch.float32, # Use float32 for CPU
47
+ device_map="cpu", # Explicitly set to CPU
48
+ attn_implementation="eager",
49
+ use_cache=True,
50
+ cache_dir="/tmp/model_cache",
51
+ low_cpu_mem_usage=True, # Helpful for CPU environments
52
+ )
53
+
54
  logger.info("Model loaded successfully!")
55
  except Exception as e:
56
  logger.error(f"Error loading model: {e}")
 
59
  model.generation_config.use_cache = True
60
  model.generation_config.pad_token_id = tokenizer.eos_token_id
61
 
62
+ # Only compile on GPU, skip compilation on CPU to avoid compatibility issues
63
+ if torch.cuda.is_available():
64
+ try:
65
+ model = torch.compile(model, mode="reduce-overhead")
66
+ logger.info("Model compiled successfully")
67
+ except Exception as e:
68
+ logger.warning(f"Model compilation failed: {e}, continuing without compilation")
69
+
70
  return model, tokenizer
71
 
72
 
 
84
  "Please ask a question about monetary policy, economics, or Federal Reserve operations."
85
  )
86
 
87
+ # Log device information for debugging
88
+ device = next(model.parameters()).device
89
+ logger.info(f"Generating response on device: {device}")
90
+
91
  system_prompt = """You are Jerome Powell, the Chairman of the Federal Reserve."""
92
 
93
  prompt = f"System: {system_prompt}\n\nQuestion: {question.strip()}\nAnswer:"
 
101
  padding=False,
102
  )
103
 
104
+ # Move inputs to the same device as the model
105
+ if torch.cuda.is_available() and next(model.parameters()).is_cuda:
106
  inputs = {k: v.cuda() for k, v in inputs.items()}
107
+ else:
108
+ inputs = {k: v.cpu() for k, v in inputs.items()}
109
 
110
  with torch.no_grad():
111
  generation_config = {