SatyamSinghal committed on
Commit
0567ace
·
verified ·
1 Parent(s): 5a9105e

lazy loading with model wrapping

Browse files
Files changed (1) hide show
  1. app.py +37 -19
app.py CHANGED
@@ -1,35 +1,52 @@
1
  import os
2
  import gradio as gr
3
  import torch
4
- from peft import AutoPeftModelForCausalLM
5
- from transformers import AutoTokenizer, pipeline
6
 
7
  MODEL_ID = "SatyamSinghal/taskmind-1.1b-chat-lora"
8
  HF_TOKEN = os.getenv("HF_TOKEN")
9
 
10
- tokenizer = AutoTokenizer.from_pretrained(
11
- MODEL_ID,
12
- token=HF_TOKEN,
13
- )
14
 
15
- model = AutoPeftModelForCausalLM.from_pretrained(
16
- MODEL_ID,
17
- token=HF_TOKEN,
18
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
19
- low_cpu_mem_usage=True,
20
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- pipe = pipeline(
23
- "text-generation",
24
- model=model,
25
- tokenizer=tokenizer,
26
- )
27
 
28
  def respond(message, history):
 
 
 
 
 
29
  messages = []
30
  for item in history:
31
  messages.append({"role": item["role"], "content": item["content"]})
32
-
33
  messages.append({"role": "user", "content": message})
34
 
35
  result = pipe(
@@ -45,6 +62,7 @@ def respond(message, history):
45
  return generated[-1]["content"]
46
  return str(generated)
47
 
 
48
  demo = gr.ChatInterface(
49
  fn=respond,
50
  type="messages",
@@ -52,7 +70,7 @@ demo = gr.ChatInterface(
52
  description="Chat with the TaskMind LoRA model.",
53
  examples=[
54
  "Who are you?",
55
- "@Agrim fix the growstreams deck ASAP NO Delay",
56
  "done bhai, merged the PR",
57
  "login page 60% ho gaya",
58
  "getting 500 error on registration",
 
1
  import os
2
  import gradio as gr
3
  import torch
 
 
4
 
5
  MODEL_ID = "SatyamSinghal/taskmind-1.1b-chat-lora"
6
  HF_TOKEN = os.getenv("HF_TOKEN")
7
 
8
+ # Lazy globals — loaded on first request, not at startup
9
+ pipe = None
 
 
10
 
def load_model():
    """Populate the module-level `pipe` on first call; later calls are no-ops.

    The heavy peft/transformers imports and the weight download are deferred
    until the first chat request, so the Space process starts quickly.
    """
    global pipe
    if pipe is None:
        # Imported lazily so module import stays cheap at startup.
        from peft import AutoPeftModelForCausalLM
        from transformers import AutoTokenizer, pipeline

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)

        # fp16 only when a CUDA device is present; fp32 is the safe CPU default.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        print("Loading model...")
        model = AutoPeftModelForCausalLM.from_pretrained(
            MODEL_ID,
            token=HF_TOKEN,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )

        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
        print("Model loaded successfully.")
39
 
 
 
 
 
 
40
 
41
  def respond(message, history):
42
+ try:
43
+ load_model()
44
+ except Exception as e:
45
+ return f"❌ Model failed to load: {str(e)}"
46
+
47
  messages = []
48
  for item in history:
49
  messages.append({"role": item["role"], "content": item["content"]})
 
50
  messages.append({"role": "user", "content": message})
51
 
52
  result = pipe(
 
62
  return generated[-1]["content"]
63
  return str(generated)
64
 
65
+
66
  demo = gr.ChatInterface(
67
  fn=respond,
68
  type="messages",
 
70
  description="Chat with the TaskMind LoRA model.",
71
  examples=[
72
  "Who are you?",
73
+ "@Model fix the growstreams deck ASAP NO Delay",
74
  "done bhai, merged the PR",
75
  "login page 60% ho gaya",
76
  "getting 500 error on registration",