pradeeparul2 committed on
Commit 6384eb8 · verified · 1 Parent(s): 6388a60

Update app.py

Files changed (1)
  1. app.py +15 -5
app.py CHANGED
@@ -3,22 +3,32 @@ import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 
-# Redirect HF cache to /tmp (ephemeral, unlimited)
+# Redirect cache to /tmp to avoid 50GB storage limit
 os.environ['HF_HOME'] = '/tmp/hf_home'
 os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
 
 model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+# Custom device map: Place most layers on GPU, offload rest to CPU
+device_map = {
+    "transformer": "cuda",  # Main layers on GPU
+    "lm_head": "cpu"        # Output layer to CPU
+}
+
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    load_in_4bit=True,  # Quantization for T4 GPU (~9-10GB VRAM)
-    device_map="auto"
+    load_in_4bit=True,
+    device_map=device_map,
+    llm_int8_enable_fp32_cpu_offload=True,  # Enable CPU offloading
+    torch_dtype=torch.float16,  # Reduce memory with FP16
+    trust_remote_code=True
 )
 
 def chat(message, history):
     messages = [{"role": "user", "content": message}]
     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer(text, return_tensors="pt").to(model.device)
+    inputs = tokenizer(text, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
     with torch.no_grad():
         outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.7)
     response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
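
Note (not part of this commit; a minimal sketch under stated assumptions): in current transformers releases the 4-bit and CPU-offload options shown above are normally configured through a BitsAndBytesConfig passed as quantization_config rather than as bare from_pretrained keyword arguments, and a hand-written device_map only takes effect if its keys match the model's actual top-level module names (for Qwen2-style checkpoints these are typically "model" and "lm_head", not "transformer"). An equivalent load, assuming bitsandbytes and accelerate are installed:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"

# 4-bit quantization with FP16 compute; keep offloaded modules in FP32 on the CPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # let accelerate split layers across GPU and CPU
)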