FrederickSundeep committed on
Commit
b1d3d86
·
1 Parent(s): 314bc06

update commit with phi-3 mini 14

Browse files
Files changed (2) hide show
  1. app.py +18 -31
  2. requirements.txt +0 -1
app.py CHANGED
@@ -1,35 +1,22 @@
1
  import gradio as gr
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
3
  import torch
4
- from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlDeviceGetUtilizationRates
5
 
6
- # ✅ Manually trigger GPU to keep ZeroGPU alive
7
- def force_gpu():
8
  if torch.cuda.is_available():
9
- print("✅ GPU is available, allocating tensor...")
10
- _ = torch.randn(1).to("cuda")
11
  else:
12
- print("⚠️ GPU not available, using CPU.")
 
 
 
13
 
14
- force_gpu()
15
-
16
- # ✅ GPU usage logging
17
- def log_gpu_usage():
18
- try:
19
- nvmlInit()
20
- handle = nvmlDeviceGetHandleByIndex(0)
21
- mem = nvmlDeviceGetMemoryInfo(handle)
22
- util = nvmlDeviceGetUtilizationRates(handle)
23
- print(f"[GPU] Memory Used: {mem.used / 1024**2:.1f} MB / {mem.total / 1024**2:.1f} MB")
24
- print(f"[GPU] Utilization: {util.gpu}%")
25
- except Exception as e:
26
- print(f"[GPU Monitor] Error: {e}")
27
-
28
- # ✅ Lightweight model for speed
29
- model_id = "microsoft/phi-2"
30
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
31
  print(f"🚀 Using device: {device}")
32
 
 
33
  tokenizer = AutoTokenizer.from_pretrained(model_id)
34
  model = AutoModelForCausalLM.from_pretrained(
35
  model_id,
@@ -38,7 +25,7 @@ model = AutoModelForCausalLM.from_pretrained(
38
 
39
  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device.type == "cuda" else -1)
40
 
41
- # 💬 Chat function
42
  def chat_fn(message, history):
43
  history_text = ""
44
  for item in history:
@@ -48,28 +35,28 @@ def chat_fn(message, history):
48
  history_text += f"<|assistant|>\n{item['content']}\n"
49
  prompt = f"{history_text}<|user|>\n{message}\n<|assistant|>\n"
50
 
51
- result = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)[0]["generated_text"]
52
- reply = result.split("<|assistant|>")[-1].strip()
53
 
54
  if "```" not in reply and any(w in reply for w in ["def ", "class ", "import "]):
55
  reply = f"```\n{reply}\n```"
56
 
57
- log_gpu_usage()
58
  return reply
59
 
60
- # Gradio interface
61
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
62
  gr.Markdown("## 🤖 Chat with Phi-2")
63
- gr.Markdown("Fast, privacy-friendly AI assistant powered by Phi-2 (2.7B).")
64
 
65
  gr.ChatInterface(
66
  fn=chat_fn,
67
  chatbot=gr.Chatbot(type="messages"),
68
  examples=[
69
- "What is a function in Python?",
70
- "Write a for loop in JavaScript.",
71
- "Explain how AI models are trained."
72
  ]
73
  )
74
 
 
75
  demo.launch(debug=True, ssr_mode=False)
 
1
  import gradio as gr
2
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
3
  import torch
 
4
 
5
+ # ✅ Force GPU allocation EARLY (ZeroGPU needs this before model load)
6
+ try:
7
  if torch.cuda.is_available():
8
+ print("✅ CUDA is already available")
 
9
  else:
10
+ torch.randn(1).cuda()
11
+ print("✅ Triggered CUDA tensor to force GPU allocation")
12
+ except Exception as e:
13
+ print(f"⚠️ GPU not available or failed to allocate: {e}")
14
 
15
+ # ✅ Load model after GPU trigger
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
  print(f"🚀 Using device: {device}")
18
 
19
+ model_id = "microsoft/phi-2" # Choose phi-2 for performance
20
  tokenizer = AutoTokenizer.from_pretrained(model_id)
21
  model = AutoModelForCausalLM.from_pretrained(
22
  model_id,
 
25
 
26
  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device.type == "cuda" else -1)
27
 
28
+ # 💬 Chat logic
29
  def chat_fn(message, history):
30
  history_text = ""
31
  for item in history:
 
35
  history_text += f"<|assistant|>\n{item['content']}\n"
36
  prompt = f"{history_text}<|user|>\n{message}\n<|assistant|>\n"
37
 
38
+ response = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)[0]["generated_text"]
39
+ reply = response.split("<|assistant|>")[-1].strip()
40
 
41
  if "```" not in reply and any(w in reply for w in ["def ", "class ", "import "]):
42
  reply = f"```\n{reply}\n```"
43
 
 
44
  return reply
45
 
46
+ # 🖥️ Gradio UI
47
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
48
  gr.Markdown("## 🤖 Chat with Phi-2")
49
+ gr.Markdown("Fast AI assistant powered by Microsoft’s Phi-2, optimized for ZeroGPU on Hugging Face Spaces.")
50
 
51
  gr.ChatInterface(
52
  fn=chat_fn,
53
  chatbot=gr.Chatbot(type="messages"),
54
  examples=[
55
+ "What is a Python generator?",
56
+ "Write a for loop in C++",
57
+ "Explain LLM training"
58
  ]
59
  )
60
 
61
+ # Launch without SSR for ZeroGPU
62
  demo.launch(debug=True, ssr_mode=False)
requirements.txt CHANGED
@@ -2,4 +2,3 @@ gradio
2
  transformers
3
  torch
4
  accelerate
5
- pynvml
 
2
  transformers
3
  torch
4
  accelerate