SatyamSinghal committed on
Commit
0567ace
·
verified ·
1 Parent(s): 5a9105e

lazy loading with model wrapping

Browse files
Files changed (1) hide show
  1. app.py +37 -19
app.py CHANGED
@@ -1,35 +1,52 @@
1
  import os
2
  import gradio as gr
3
  import torch
4
- from peft import AutoPeftModelForCausalLM
5
- from transformers import AutoTokenizer, pipeline
6
 
7
  MODEL_ID = "SatyamSinghal/taskmind-1.1b-chat-lora"
8
  HF_TOKEN = os.getenv("HF_TOKEN")
9
 
10
- tokenizer = AutoTokenizer.from_pretrained(
11
- MODEL_ID,
12
- token=HF_TOKEN,
13
- )
14
 
15
- model = AutoPeftModelForCausalLM.from_pretrained(
16
- MODEL_ID,
17
- token=HF_TOKEN,
18
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
19
- low_cpu_mem_usage=True,
20
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- pipe = pipeline(
23
- "text-generation",
24
- model=model,
25
- tokenizer=tokenizer,
26
- )
27
 
28
  def respond(message, history):
 
 
 
 
 
29
  messages = []
30
  for item in history:
31
  messages.append({"role": item["role"], "content": item["content"]})
32
-
33
  messages.append({"role": "user", "content": message})
34
 
35
  result = pipe(
@@ -45,6 +62,7 @@ def respond(message, history):
45
  return generated[-1]["content"]
46
  return str(generated)
47
 
 
48
  demo = gr.ChatInterface(
49
  fn=respond,
50
  type="messages",
@@ -52,7 +70,7 @@ demo = gr.ChatInterface(
52
  description="Chat with the TaskMind LoRA model.",
53
  examples=[
54
  "Who are you?",
55
- "@Agrim fix the growstreams deck ASAP NO Delay",
56
  "done bhai, merged the PR",
57
  "login page 60% ho gaya",
58
  "getting 500 error on registration",
 
1
  import os
2
  import gradio as gr
3
  import torch
 
 
4
 
5
  MODEL_ID = "SatyamSinghal/taskmind-1.1b-chat-lora"
6
  HF_TOKEN = os.getenv("HF_TOKEN")
7
 
8
+ # Lazy globals — loaded on first request, not at startup
9
+ pipe = None
 
 
10
 
def load_model():
    """Populate the module-level `pipe` on first call; later calls are no-ops.

    The heavy peft/transformers imports and the weight download are deferred
    until the first chat request, so the Space process starts quickly.
    """
    global pipe
    if pipe is None:
        # Imported lazily so module import stays cheap at startup.
        from peft import AutoPeftModelForCausalLM
        from transformers import AutoTokenizer, pipeline

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)

        # fp16 only when a CUDA device is present; fp32 is the safe CPU default.
        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        print("Loading model...")
        model = AutoPeftModelForCausalLM.from_pretrained(
            MODEL_ID,
            token=HF_TOKEN,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )

        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
        print("Model loaded successfully.")
39
 
 
 
 
 
 
40
 
41
  def respond(message, history):
42
+ try:
43
+ load_model()
44
+ except Exception as e:
45
+ return f"❌ Model failed to load: {str(e)}"
46
+
47
  messages = []
48
  for item in history:
49
  messages.append({"role": item["role"], "content": item["content"]})
 
50
  messages.append({"role": "user", "content": message})
51
 
52
  result = pipe(
 
62
  return generated[-1]["content"]
63
  return str(generated)
64
 
65
+
66
  demo = gr.ChatInterface(
67
  fn=respond,
68
  type="messages",
 
70
  description="Chat with the TaskMind LoRA model.",
71
  examples=[
72
  "Who are you?",
73
+ "@Model fix the growstreams deck ASAP NO Delay",
74
  "done bhai, merged the PR",
75
  "login page 60% ho gaya",
76
  "getting 500 error on registration",