FrederickSundeep committed on
Commit
314bc06
·
1 Parent(s): 24b2d6f

update commit with phi-3 mini 13

Browse files
Files changed (2) hide show
  1. app.py +19 -19
  2. requirements.txt +1 -2
app.py CHANGED
@@ -2,32 +2,33 @@ import gradio as gr
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
3
  import torch
4
  from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlDeviceGetUtilizationRates
5
- from huggingface_hub import spaces
6
 
7
- # 🔐 Required for ZeroGPU to allocate GPU
8
- @spaces.GPU
9
- def trigger_gpu():
10
- print("✅ GPU requested")
11
- return torch.cuda.is_available()
 
 
12
 
13
- trigger_gpu()
14
 
15
- # ✅ GPU Monitoring
16
  def log_gpu_usage():
17
  try:
18
  nvmlInit()
19
  handle = nvmlDeviceGetHandleByIndex(0)
20
  mem = nvmlDeviceGetMemoryInfo(handle)
21
  util = nvmlDeviceGetUtilizationRates(handle)
22
- print(f"[GPU] Memory Used: {mem.used / 1024 ** 2:.1f} MB / {mem.total / 1024 ** 2:.1f} MB")
23
  print(f"[GPU] Utilization: {util.gpu}%")
24
  except Exception as e:
25
  print(f"[GPU Monitor] Error: {e}")
26
 
27
- # 📦 Model Choice (Phi-2 for fast inference)
28
  model_id = "microsoft/phi-2"
29
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
30
- print(f"🔧 Using device: {device}")
31
 
32
  tokenizer = AutoTokenizer.from_pretrained(model_id)
33
  model = AutoModelForCausalLM.from_pretrained(
@@ -37,7 +38,7 @@ model = AutoModelForCausalLM.from_pretrained(
37
 
38
  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device.type == "cuda" else -1)
39
 
40
- # 💬 Chat logic with openai-style messages
41
  def chat_fn(message, history):
42
  history_text = ""
43
  for item in history:
@@ -47,19 +48,19 @@ def chat_fn(message, history):
47
  history_text += f"<|assistant|>\n{item['content']}\n"
48
  prompt = f"{history_text}<|user|>\n{message}\n<|assistant|>\n"
49
 
50
- response = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)[0]['generated_text']
51
- reply = response.split("<|assistant|>")[-1].strip()
52
 
53
  if "```" not in reply and any(w in reply for w in ["def ", "class ", "import "]):
54
  reply = f"```\n{reply}\n```"
55
 
56
- log_gpu_usage() # 🔍 log usage per response
57
  return reply
58
 
59
- # 🖥️ Gradio app
60
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
61
- gr.Markdown("## 🤖 Chat with Phi-2 (Fast Lightweight Model)")
62
- gr.Markdown("Ask questions or generate code. Powered by Microsoft's Phi-2 (2.7B).")
63
 
64
  gr.ChatInterface(
65
  fn=chat_fn,
@@ -71,5 +72,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
71
  ]
72
  )
73
 
74
- # ✅ Launch safely without SSR for Hugging Face Spaces
75
  demo.launch(debug=True, ssr_mode=False)
 
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
3
  import torch
4
  from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlDeviceGetUtilizationRates
 
5
 
6
+ # Manually trigger GPU to keep ZeroGPU alive
7
+ def force_gpu():
8
+ if torch.cuda.is_available():
9
+ print("✅ GPU is available, allocating tensor...")
10
+ _ = torch.randn(1).to("cuda")
11
+ else:
12
+ print("⚠️ GPU not available, using CPU.")
13
 
14
+ force_gpu()
15
 
16
+ # ✅ GPU usage logging
17
  def log_gpu_usage():
18
  try:
19
  nvmlInit()
20
  handle = nvmlDeviceGetHandleByIndex(0)
21
  mem = nvmlDeviceGetMemoryInfo(handle)
22
  util = nvmlDeviceGetUtilizationRates(handle)
23
+ print(f"[GPU] Memory Used: {mem.used / 1024**2:.1f} MB / {mem.total / 1024**2:.1f} MB")
24
  print(f"[GPU] Utilization: {util.gpu}%")
25
  except Exception as e:
26
  print(f"[GPU Monitor] Error: {e}")
27
 
28
+ # Lightweight model for speed
29
  model_id = "microsoft/phi-2"
30
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
31
+ print(f"🚀 Using device: {device}")
32
 
33
  tokenizer = AutoTokenizer.from_pretrained(model_id)
34
  model = AutoModelForCausalLM.from_pretrained(
 
38
 
39
  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device.type == "cuda" else -1)
40
 
41
+ # 💬 Chat function
42
  def chat_fn(message, history):
43
  history_text = ""
44
  for item in history:
 
48
  history_text += f"<|assistant|>\n{item['content']}\n"
49
  prompt = f"{history_text}<|user|>\n{message}\n<|assistant|>\n"
50
 
51
+ result = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)[0]["generated_text"]
52
+ reply = result.split("<|assistant|>")[-1].strip()
53
 
54
  if "```" not in reply and any(w in reply for w in ["def ", "class ", "import "]):
55
  reply = f"```\n{reply}\n```"
56
 
57
+ log_gpu_usage()
58
  return reply
59
 
60
+ # Gradio interface
61
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
62
+ gr.Markdown("## 🤖 Chat with Phi-2")
63
+ gr.Markdown("Fast, privacy-friendly AI assistant powered by Phi-2 (2.7B).")
64
 
65
  gr.ChatInterface(
66
  fn=chat_fn,
 
72
  ]
73
  )
74
 
 
75
  demo.launch(debug=True, ssr_mode=False)
requirements.txt CHANGED
@@ -1,6 +1,5 @@
 
1
  transformers
2
  torch
3
  accelerate
4
- gradio
5
  pynvml
6
- huggingface_hub==0.20.3
 
1
+ gradio
2
  transformers
3
  torch
4
  accelerate
 
5
  pynvml