Spaces:

FrederickSundeep
/

ChatMate

Sleeping

App Files Community

FrederickSundeep committed on Jun 26, 2025

Commit

314bc06

1 Parent(s): 24b2d6f

update commit with phi-3 mini 13

Browse files

Files changed (2) hide show

app.py +19 -19
requirements.txt +1 -2

app.py CHANGED Viewed

@@ -2,32 +2,33 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlDeviceGetUtilizationRates
-from huggingface_hub import spaces
-# 🔐 Required for ZeroGPU to allocate GPU
-@spaces.GPU
-def trigger_gpu():
-    print("✅ GPU requested")
-    return torch.cuda.is_available()
-trigger_gpu()
-# ✅ GPU Monitoring
 def log_gpu_usage():
     try:
         nvmlInit()
         handle = nvmlDeviceGetHandleByIndex(0)
         mem = nvmlDeviceGetMemoryInfo(handle)
         util = nvmlDeviceGetUtilizationRates(handle)
-        print(f"[GPU] Memory Used: {mem.used / 1024 ** 2:.1f} MB / {mem.total / 1024 ** 2:.1f} MB")
         print(f"[GPU] Utilization: {util.gpu}%")
     except Exception as e:
         print(f"[GPU Monitor] Error: {e}")
-# 📦 Model Choice (Phi-2 for fast inference)
 model_id = "microsoft/phi-2"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(f"🔧 Using device: {device}")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -37,7 +38,7 @@ model = AutoModelForCausalLM.from_pretrained(
 pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device.type == "cuda" else -1)
-# 💬 Chat logic with openai-style messages
 def chat_fn(message, history):
     history_text = ""
     for item in history:
@@ -47,19 +48,19 @@ def chat_fn(message, history):
             history_text += f"<|assistant|>\n{item['content']}\n"
     prompt = f"{history_text}<|user|>\n{message}\n<|assistant|>\n"
-    response = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)[0]['generated_text']
-    reply = response.split("<|assistant|>")[-1].strip()
     if "```" not in reply and any(w in reply for w in ["def ", "class ", "import "]):
         reply = f"```\n{reply}\n```"
-    log_gpu_usage()  # 🔍 log usage per response
     return reply
-# 🖥️ Gradio app
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🤖 Chat with Phi-2 (Fast Lightweight Model)")
-    gr.Markdown("Ask questions or generate code. Powered by Microsoft's Phi-2 (2.7B).")
     gr.ChatInterface(
         fn=chat_fn,
@@ -71,5 +72,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         ]
     )
-# ✅ Launch safely without SSR for Hugging Face Spaces
 demo.launch(debug=True, ssr_mode=False)

 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlDeviceGetUtilizationRates
+# ✅ Manually trigger GPU to keep ZeroGPU alive
+def force_gpu():
+    if torch.cuda.is_available():
+        print("✅ GPU is available, allocating tensor...")
+        _ = torch.randn(1).to("cuda")
+    else:
+        print("⚠️ GPU not available, using CPU.")
+force_gpu()
+# ✅ GPU usage logging
 def log_gpu_usage():
     try:
         nvmlInit()
         handle = nvmlDeviceGetHandleByIndex(0)
         mem = nvmlDeviceGetMemoryInfo(handle)
         util = nvmlDeviceGetUtilizationRates(handle)
+        print(f"[GPU] Memory Used: {mem.used / 1024**2:.1f} MB / {mem.total / 1024**2:.1f} MB")
         print(f"[GPU] Utilization: {util.gpu}%")
     except Exception as e:
         print(f"[GPU Monitor] Error: {e}")
+# ✅ Lightweight model for speed
 model_id = "microsoft/phi-2"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"🚀 Using device: {device}")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
 pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device.type == "cuda" else -1)
+# 💬 Chat function
 def chat_fn(message, history):
     history_text = ""
     for item in history:
             history_text += f"<|assistant|>\n{item['content']}\n"
     prompt = f"{history_text}<|user|>\n{message}\n<|assistant|>\n"
+    result = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.7)[0]["generated_text"]
+    reply = result.split("<|assistant|>")[-1].strip()
     if "```" not in reply and any(w in reply for w in ["def ", "class ", "import "]):
         reply = f"```\n{reply}\n```"
+    log_gpu_usage()
     return reply
+# Gradio interface
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🤖 Chat with Phi-2")
+    gr.Markdown("Fast, privacy-friendly AI assistant powered by Phi-2 (2.7B).")
     gr.ChatInterface(
         fn=chat_fn,
         ]
     )
 demo.launch(debug=True, ssr_mode=False)

requirements.txt CHANGED Viewed

@@ -1,6 +1,5 @@
 transformers
 torch
 accelerate
-gradio
 pynvml
-huggingface_hub==0.20.3

+gradio
 transformers
 torch
 accelerate
 pynvml