Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -10,15 +10,22 @@ import re
|
|
| 10 |
import spaces
|
| 11 |
import gradio as gr
|
| 12 |
import torch
|
|
|
|
| 13 |
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
| 14 |
|
| 15 |
MODEL_ID = "johneze/Llama-3.1-8B-Instruct-chichewa-text2sql"
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# Tokenizer is tiny — safe to load at startup without a GPU
|
| 18 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
| 19 |
|
| 20 |
-
# Model is loaded lazily on the FIRST call, when the GPU
|
| 21 |
-
# context (@spaces.GPU) is already active and CUDA is available.
|
| 22 |
_pipe = None
|
| 23 |
|
| 24 |
|
|
@@ -33,7 +40,7 @@ def extract_sql(text: str) -> str:
|
|
| 33 |
return sql.strip() + ";"
|
| 34 |
|
| 35 |
|
| 36 |
-
@spaces.GPU(duration=
|
| 37 |
def generate_sql(question: str, language: str = "ny") -> str:
|
| 38 |
"""
|
| 39 |
Generate SQL from a Chichewa or English question.
|
|
@@ -42,9 +49,9 @@ def generate_sql(question: str, language: str = "ny") -> str:
|
|
| 42 |
"""
|
| 43 |
global _pipe
|
| 44 |
if _pipe is None:
|
| 45 |
-
#
|
| 46 |
model = AutoModelForCausalLM.from_pretrained(
|
| 47 |
-
|
| 48 |
dtype=torch.bfloat16,
|
| 49 |
device_map="auto",
|
| 50 |
)
|
|
|
|
| 10 |
import spaces
|
| 11 |
import gradio as gr
|
| 12 |
import torch
|
| 13 |
+
from huggingface_hub import snapshot_download
|
| 14 |
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
| 15 |
|
| 16 |
MODEL_ID = "johneze/Llama-3.1-8B-Instruct-chichewa-text2sql"
|
| 17 |
|
| 18 |
+
# Pre-download all model files to disk at startup (no GPU required).
|
| 19 |
+
# When @spaces.GPU activates, from_pretrained reads from the local cache
|
| 20 |
+
# instead of downloading — slashing first-call latency significantly.
|
| 21 |
+
print("Downloading model weights to cache …")
|
| 22 |
+
_model_cache = snapshot_download(repo_id=MODEL_ID)
|
| 23 |
+
print(f"Model cached at: {_model_cache}")
|
| 24 |
+
|
| 25 |
# Tokenizer is tiny — safe to load at startup without a GPU
|
| 26 |
+
tokenizer = AutoTokenizer.from_pretrained(_model_cache)
|
| 27 |
|
| 28 |
+
# Model is loaded lazily on the FIRST call inside @spaces.GPU where CUDA is live.
|
|
|
|
| 29 |
_pipe = None
|
| 30 |
|
| 31 |
|
|
|
|
| 40 |
return sql.strip() + ";"
|
| 41 |
|
| 42 |
|
| 43 |
+
@spaces.GPU(duration=300)
|
| 44 |
def generate_sql(question: str, language: str = "ny") -> str:
|
| 45 |
"""
|
| 46 |
Generate SQL from a Chichewa or English question.
|
|
|
|
| 49 |
"""
|
| 50 |
global _pipe
|
| 51 |
if _pipe is None:
|
| 52 |
+
# Weights already on disk — this only loads into VRAM (~30-60s)
|
| 53 |
model = AutoModelForCausalLM.from_pretrained(
|
| 54 |
+
_model_cache,
|
| 55 |
dtype=torch.bfloat16,
|
| 56 |
device_map="auto",
|
| 57 |
)
|