pradeeparul2 committed · verified
Commit c4f569f · 1 Parent(s): 6384eb8

Update app.py

Files changed (1)
  1. app.py +14 -29
app.py CHANGED
@@ -1,38 +1,23 @@
 import os
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
+from llama_cpp import Llama
 
-# Redirect cache to /tmp to avoid 50GB storage limit
-os.environ['HF_HOME'] = '/tmp/hf_home'
-os.environ['TRANSFORMERS_CACHE'] = '/tmp/hf_cache'
-
-model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-# Custom device map: Place most layers on GPU, offload rest to CPU
-device_map = {
-    "transformer": "cuda",  # Main layers on GPU
-    "lm_head": "cpu"  # Output layer to CPU
-}
-
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    load_in_4bit=True,
-    device_map=device_map,
-    llm_int8_enable_fp32_cpu_offload=True,  # Enable CPU offloading
-    torch_dtype=torch.float16,  # Reduce memory with FP16
-    trust_remote_code=True
+# No cache redirect needed for GGUF
+model_path = "Qwen/Qwen2.5-Coder-14B-Instruct-GGUF"  # Auto-downloads Q4_K_M (~9GB)
+llm = Llama(
+    model_path,
+    n_ctx=4096,  # Adjust for coding tasks
+    n_gpu_layers=99,  # Offload to T4 GPU
+    verbose=False
 )
 
 def chat(message, history):
-    messages = [{"role": "user", "content": message}]
-    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = tokenizer(text, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
-    with torch.no_grad():
-        outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.7)
-    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
-    history.append((message, response))
+    response = llm.create_chat_completion(
+        messages=[{"role": "user", "content": message}],
+        max_tokens=512,
+        temperature=0.7
+    )
+    history.append((message, response["choices"][0]["message"]["content"]))
     return history, ""
 
  demo = gr.ChatInterface(chat)
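A note on the new loading path: llama-cpp-python's `Llama(model_path=...)` expects a path to a local `.gguf` file, so passing the bare repo id `"Qwen/Qwen2.5-Coder-14B-Instruct-GGUF"` does not download anything by itself, despite the in-line comment. If the intent is to fetch the Q4_K_M quant from the Hub at startup, `Llama.from_pretrained` does that via `huggingface_hub`. A minimal sketch, assuming the quant file in that repo matches the glob below (check the repo's file listing):

```python
from llama_cpp import Llama

# Sketch: fetch the GGUF from the Hub, then load it.
# The filename glob is an assumption about how the Q4_K_M file is named.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-14B-Instruct-GGUF",
    filename="*q4_k_m*.gguf",  # assumed pattern for the ~9 GB Q4_K_M quant
    n_ctx=4096,                # same context length as the committed code
    n_gpu_layers=99,           # offload layers to the T4, as in the commit
    verbose=False,
)
```

The downloaded file lands in the `huggingface_hub` cache, so the `/tmp` cache redirect that the old transformers version used may still be worth keeping if the Space's storage is tight.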
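On the handler itself: `gr.ChatInterface` calls the function with `(message, history)` and treats the return value as the assistant's reply, managing the chat history on its own; the `history.append(...)` / `return history, ""` pattern belongs to a hand-built `gr.Chatbot` + `gr.Textbox` layout. A sketch of the handler adjusted to that convention, assuming `llm` is the `Llama` instance from the sketch above (or from the committed code):

```python
import gradio as gr

# Assumes `llm` is the Llama instance created in the loader sketch above.

def chat(message, history):
    # Single-turn call, as in the commit; ChatInterface keeps the visible
    # history itself, so only the reply string is returned.
    response = llm.create_chat_completion(
        messages=[{"role": "user", "content": message}],
        max_tokens=512,
        temperature=0.7,
    )
    return response["choices"][0]["message"]["content"]

demo = gr.ChatInterface(chat)
demo.launch()
```

As in the committed code, earlier turns are not forwarded to the model; doing so would mean mapping `history` into the `messages` list before the `create_chat_completion` call.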