ninjals committed on
Commit
5c22f83
·
1 Parent(s): 465426d

Add lazy LLM loading to fix ZeroGPU startup

Browse files
Files changed (1) hide show
  1. model.py +6 -4
model.py CHANGED
@@ -2,7 +2,7 @@ import torch
2
  import numpy as np
3
  import pandas as pd
4
  from sentence_transformers import SentenceTransformer, util
5
- from transformers import AutoTokenizer, AutoModelForCausalLM
6
  import os
7
 
8
  # Load saved embeddings
@@ -15,17 +15,19 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
15
  print(f"[INFO] Using device: {device}")
16
 
17
  # Load embedding model
18
- embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
19
 
20
  # Lazy-load the LLM model
21
  llm_model = None
22
  tokenizer = None
23
- HF_TOKEN = os.getenv("HF_TOKEN")
24
- model_id = "google/gemma-2-2b-it"
25
 
26
  def load_llm():
27
  global llm_model, tokenizer
28
  if llm_model is None or tokenizer is None:
 
 
 
29
  print("[INFO] Loading LLM model:", model_id)
30
  tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
31
  llm_model = AutoModelForCausalLM.from_pretrained(
 
2
  import numpy as np
3
  import pandas as pd
4
  from sentence_transformers import SentenceTransformer, util
5
+
6
  import os
7
 
8
  # Load saved embeddings
 
15
  print(f"[INFO] Using device: {device}")
16
 
17
  # Load embedding model
18
+ embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)
19
 
20
  # Lazy-load the LLM model
21
  llm_model = None
22
  tokenizer = None
23
+
 
24
 
25
  def load_llm():
26
  global llm_model, tokenizer
27
  if llm_model is None or tokenizer is None:
28
+ from transformers import AutoTokenizer, AutoModelForCausalLM
29
+ HF_TOKEN = os.getenv("HF_TOKEN")
30
+ model_id = "google/gemma-2-2b-it"
31
  print("[INFO] Loading LLM model:", model_id)
32
  tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
33
  llm_model = AutoModelForCausalLM.from_pretrained(