johneze commited on
Commit
e06e3a1
·
verified ·
1 Parent(s): 107f585

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +13 -6
app.py CHANGED
@@ -10,15 +10,22 @@ import re
10
  import spaces
11
  import gradio as gr
12
  import torch
 
13
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
14
 
15
  MODEL_ID = "johneze/Llama-3.1-8B-Instruct-chichewa-text2sql"
16
 
 
 
 
 
 
 
 
17
  # Tokenizer is tiny — safe to load at startup without a GPU
18
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
19
 
20
- # Model is loaded lazily on the FIRST call to generate_sql, where the GPU
21
- # context (@spaces.GPU) is already active and CUDA is available.
22
  _pipe = None
23
 
24
 
@@ -33,7 +40,7 @@ def extract_sql(text: str) -> str:
33
  return sql.strip() + ";"
34
 
35
 
36
- @spaces.GPU(duration=120)
37
  def generate_sql(question: str, language: str = "ny") -> str:
38
  """
39
  Generate SQL from a Chichewa or English question.
@@ -42,9 +49,9 @@ def generate_sql(question: str, language: str = "ny") -> str:
42
  """
43
  global _pipe
44
  if _pipe is None:
45
- # First call: GPU is now available — load the 4-bit quantized model
46
  model = AutoModelForCausalLM.from_pretrained(
47
- MODEL_ID,
48
  dtype=torch.bfloat16,
49
  device_map="auto",
50
  )
 
10
  import spaces
11
  import gradio as gr
12
  import torch
13
+ from huggingface_hub import snapshot_download
14
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
15
 
16
  MODEL_ID = "johneze/Llama-3.1-8B-Instruct-chichewa-text2sql"
17
 
18
+ # Pre-download all model files to disk at startup (no GPU required).
19
+ # When @spaces.GPU activates, from_pretrained reads from the local cache
20
+ # instead of downloading — slashing first-call latency significantly.
21
+ print("Downloading model weights to cache …")
22
+ _model_cache = snapshot_download(repo_id=MODEL_ID)
23
+ print(f"Model cached at: {_model_cache}")
24
+
25
  # Tokenizer is tiny — safe to load at startup without a GPU
26
+ tokenizer = AutoTokenizer.from_pretrained(_model_cache)
27
 
28
+ # Model is loaded lazily on the FIRST call inside @spaces.GPU where CUDA is live.
 
29
  _pipe = None
30
 
31
 
 
40
  return sql.strip() + ";"
41
 
42
 
43
+ @spaces.GPU(duration=300)
44
  def generate_sql(question: str, language: str = "ny") -> str:
45
  """
46
  Generate SQL from a Chichewa or English question.
 
49
  """
50
  global _pipe
51
  if _pipe is None:
52
+ # Weights already on disk — this only loads into VRAM (~30-60s)
53
  model = AutoModelForCausalLM.from_pretrained(
54
+ _model_cache,
55
  dtype=torch.bfloat16,
56
  device_map="auto",
57
  )