LucianStorm committed
Commit b91dc30 · verified · 1 Parent(s): e2770d0

Update app.py

Files changed (1): app.py (+14 -14)
app.py CHANGED

@@ -6,6 +6,10 @@ import torch
 import uvicorn
 import os
 
+# Set cache directories to /tmp which is writable
+os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'
+os.environ['TORCH_HOME'] = '/tmp/torch_cache'
+
 app = FastAPI(title="TinyLlama Fitness Bot")
 
 app.add_middleware(
@@ -28,23 +32,24 @@ def load_model():
     model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
     # CPU-specific settings
-    torch.set_num_threads(4)  # Limit CPU threads
+    torch.set_num_threads(4)
 
     print("Loading tokenizer...")
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
-        local_files_only=False
+        cache_dir='/tmp/transformers_cache'  # Use /tmp directory
     )
 
     print("Loading model...")
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        torch_dtype=torch.float32,  # Use float32 for CPU
+        torch_dtype=torch.float32,
         low_cpu_mem_usage=True,
-        device_map=None  # Force CPU
+        device_map=None,  # Force CPU
+        cache_dir='/tmp/transformers_cache'  # Use /tmp directory
     )
 
-    model.eval()  # Set to evaluation mode
+    model.eval()
     MODEL_LOADED = True
     print("Model loaded successfully on CPU!")
     return True
@@ -59,7 +64,7 @@ load_model()
 
 class Query(BaseModel):
     prompt: str
-    max_length: int = 100  # Reduced for CPU
+    max_length: int = 100
     temperature: float = 0.7
 
 @app.post("/chat")
@@ -74,18 +79,15 @@ async def chat(query: Query):
         )
 
     try:
-        # Simpler prompt template for efficiency
         formatted_prompt = f"<|user|>{query.prompt}</s><|assistant|>"
 
-        # Tokenize with smaller context
         inputs = tokenizer(
             formatted_prompt,
             return_tensors="pt",
             truncation=True,
-            max_length=256  # Reduced context window for CPU
+            max_length=256
         )
 
-        # Generate with CPU-optimized settings
         with torch.no_grad():
             outputs = model.generate(
                 inputs["input_ids"],
@@ -94,7 +96,7 @@ async def chat(query: Query):
                 top_p=0.9,
                 do_sample=True,
                 pad_token_id=tokenizer.eos_token_id,
-                num_beams=1,  # No beam search for speed
+                num_beams=1,
                 early_stopping=True
             )
 
@@ -124,9 +126,7 @@ def debug_info():
         "model_loaded": MODEL_LOADED,
         "device": "cpu",
         "num_threads": torch.get_num_threads(),
-        "memory_info": {
-            "max_memory": f"{torch.cuda.max_memory_allocated() / 1024**2:.2f}MB" if torch.cuda.is_available() else "CPU only"
-        }
+        "cache_dir": os.environ.get('TRANSFORMERS_CACHE')
     }
 
 if __name__ == "__main__":
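A note for anyone reproducing this fix: transformers resolves its default cache location when the library is first imported, so these environment variables only take effect if they are set beforehand, which is why the commit places them at the very top of app.py. Below is a minimal sketch of that ordering. The /tmp paths mirror the commit; HF_HOME and the '/tmp/hf_home' path are my additions, on the (hedged) understanding that recent transformers releases prefer HF_HOME over the deprecated TRANSFORMERS_CACHE.

    import os

    # Set cache paths BEFORE importing transformers; the default cache
    # location is resolved at import time. Paths mirror the commit.
    os.environ['TRANSFORMERS_CACHE'] = '/tmp/transformers_cache'
    os.environ['HF_HOME'] = '/tmp/hf_home'  # assumption: newer umbrella variable for recent releases
    os.environ['TORCH_HOME'] = '/tmp/torch_cache'

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        cache_dir='/tmp/transformers_cache',  # explicit override, as in the commit
    )
    print(os.listdir('/tmp/transformers_cache'))  # downloaded files should land here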
 
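As a quick smoke test of the updated Space, here is a hedged client-side sketch, not part of the commit: the host and port are assumptions (7860 is the usual default for HF Spaces), while the JSON fields match the Query model in the diff.

    import requests  # assumption: any HTTP client works; requests shown for brevity

    # Fields mirror the Query model: prompt (required), max_length, temperature.
    resp = requests.post(
        "http://localhost:7860/chat",  # assumed host/port
        json={
            "prompt": "Suggest a 20-minute beginner workout.",
            "max_length": 100,
            "temperature": 0.7,
        },
        timeout=120,  # CPU generation can be slow
    )
    resp.raise_for_status()
    print(resp.json())

The debug route touched in the same file now reports the cache directory via os.environ.get('TRANSFORMERS_CACHE'), which is a quick way to confirm the environment variables were actually picked up.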