johneze commited on
Commit
e06e3a1
·
verified ·
1 Parent(s): 107f585

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +13 -6
app.py CHANGED
@@ -10,15 +10,22 @@ import re
10
  import spaces
11
  import gradio as gr
12
  import torch
 
13
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
14
 
15
  MODEL_ID = "johneze/Llama-3.1-8B-Instruct-chichewa-text2sql"
16
 
 
 
 
 
 
 
 
17
  # Tokenizer is tiny — safe to load at startup without a GPU
18
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
19
 
20
- # Model is loaded lazily on the FIRST call to generate_sql, where the GPU
21
- # context (@spaces.GPU) is already active and CUDA is available.
22
  _pipe = None
23
 
24
 
@@ -33,7 +40,7 @@ def extract_sql(text: str) -> str:
33
  return sql.strip() + ";"
34
 
35
 
36
- @spaces.GPU(duration=120)
37
  def generate_sql(question: str, language: str = "ny") -> str:
38
  """
39
  Generate SQL from a Chichewa or English question.
@@ -42,9 +49,9 @@ def generate_sql(question: str, language: str = "ny") -> str:
42
  """
43
  global _pipe
44
  if _pipe is None:
45
- # First call: GPU is now available — load the 4-bit quantized model
46
  model = AutoModelForCausalLM.from_pretrained(
47
- MODEL_ID,
48
  dtype=torch.bfloat16,
49
  device_map="auto",
50
  )
 
10
  import spaces
11
  import gradio as gr
12
  import torch
13
+ from huggingface_hub import snapshot_download
14
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
15
 
16
  MODEL_ID = "johneze/Llama-3.1-8B-Instruct-chichewa-text2sql"
17
 
18
+ # Pre-download all model files to disk at startup (no GPU required).
19
+ # When @spaces.GPU activates, from_pretrained reads from the local cache
20
+ # instead of downloading — slashing first-call latency significantly.
21
+ print("Downloading model weights to cache …")
22
+ _model_cache = snapshot_download(repo_id=MODEL_ID)
23
+ print(f"Model cached at: {_model_cache}")
24
+
25
  # Tokenizer is tiny — safe to load at startup without a GPU
26
+ tokenizer = AutoTokenizer.from_pretrained(_model_cache)
27
 
28
+ # Model is loaded lazily on the FIRST call inside @spaces.GPU where CUDA is live.
 
29
  _pipe = None
30
 
31
 
 
40
  return sql.strip() + ";"
41
 
42
 
43
+ @spaces.GPU(duration=300)
44
  def generate_sql(question: str, language: str = "ny") -> str:
45
  """
46
  Generate SQL from a Chichewa or English question.
 
49
  """
50
  global _pipe
51
  if _pipe is None:
52
+ # Weights already on disk — this only loads into VRAM (~30-60s)
53
  model = AutoModelForCausalLM.from_pretrained(
54
+ _model_cache,
55
  dtype=torch.bfloat16,
56
  device_map="auto",
57
  )