Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -14,27 +14,36 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 DESCRIPTION = """\
 # L-MChat
 
-This Space demonstrates
+This Space demonstrates L-MChat, a pair of chat-optimized language models:
 
--
--
+- Fast-Model: `Artples/L-MChat-Small`
+- Quality-Model: `Artples/L-MChat-7b`
 
-By default the
-lower latency at the cost of quality.
-
-Use the *System prompt* field to steer the assistant’s behavior (for example:
-“Act as a helpful programming tutor”). The sliders allow you to configure the
-generation parameters.
+By default the Quality-Model is used. You can switch to the Fast-Model if you prefer lower latency over maximum quality.
 """
 
 if not torch.cuda.is_available():
-    DESCRIPTION += "\n\n<p>Running on CPU
+    DESCRIPTION += "\n\n<p>Running on CPU – this demo is intended for GPU and may be extremely slow.</p>"
 
 model_dict = {
     "Fast-Model": "Artples/L-MChat-Small",
     "Quality-Model": "Artples/L-MChat-7b",
 }
 
+_model_cache: dict[str, AutoModelForCausalLM] = {}
+_tokenizer_cache: dict[str, AutoTokenizer] = {}
+
+
+def get_model_and_tokenizer(model_id: str):
+    """Lazy-load and cache model and tokenizer per model id."""
+    if model_id not in _model_cache:
+        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.use_default_system_prompt = False
+        _model_cache[model_id] = model
+        _tokenizer_cache[model_id] = tokenizer
+    return _model_cache[model_id], _tokenizer_cache[model_id]
+
 
 @spaces.GPU(enable_queue=True, duration=90)
 def generate(
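The per-id caches added above mean each checkpoint is fetched with `from_pretrained` once per process; every later call to `get_model_and_tokenizer` is a dictionary lookup, so switching between Fast-Model and Quality-Model in the UI does not reload weights. A minimal sketch of the same cache-on-first-use pattern in isolation (`fake_load` is a hypothetical stand-in for the expensive `from_pretrained` call, not part of the app):

_cache: dict[str, object] = {}

def fake_load(model_id: str) -> object:
    print(f"loading {model_id} ...")  # expensive work runs only once per id
    return object()

def get_cached(model_id: str) -> object:
    if model_id not in _cache:
        _cache[model_id] = fake_load(model_id)
    return _cache[model_id]

get_cached("Artples/L-MChat-Small")  # prints "loading ..."
get_cached("Artples/L-MChat-Small")  # second call is a cache hit, no print

A `functools.lru_cache`-decorated loader would express the same idea in one line; the explicit dicts used in the diff keep the model and tokenizer addressable as separate objects.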
@@ -49,11 +58,9 @@ def generate(
     repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
     model_id = model_dict[model_choice]
-    model =
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.use_default_system_prompt = False
+    model, tokenizer = get_model_and_tokenizer(model_id)
 
-    conversation: list[dict] = []
+    conversation: list[dict[str, str]] = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
 
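`conversation` follows the role/content message schema that `tokenizer.apply_chat_template` consumes. The rest of `generate` is outside this hunk; the sketch below is a hedged reconstruction of how such a function typically turns the chat state into input ids (`build_inputs`, the `(user, assistant)` pair shape of the history, and the left-trimming policy are assumptions, not taken from this file):

def build_inputs(tokenizer, message, chat_history, system_prompt, max_len=4096):
    """Hypothetical helper: assemble messages and tokenize via the chat template."""
    conversation: list[dict[str, str]] = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user_msg, assistant_msg in chat_history:  # assumed (user, assistant) pairs
        conversation.append({"role": "user", "content": user_msg})
        conversation.append({"role": "assistant", "content": assistant_msg})
    conversation.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    if input_ids.shape[1] > max_len:  # keep the most recent turns if too long
        input_ids = input_ids[:, -max_len:]
    return input_ids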
@@ -161,7 +168,19 @@ chat_interface = gr.ChatInterface(
     ],
 )
 
-
+# Load external CSS from styles.css and inject it as an HTML <style> block
+custom_css = ""
+css_path = "styles.css"
+if os.path.exists(css_path):
+    try:
+        with open(css_path, encoding="utf-8") as f:
+            custom_css = f"<style>{f.read()}</style>"
+    except Exception:
+        custom_css = ""
+
+with gr.Blocks() as demo:
+    if custom_css:
+        gr.HTML(custom_css)
     gr.Markdown(DESCRIPTION)
     chat_interface.render()
 
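The new CSS block reads `styles.css` once at startup and injects it through `gr.HTML`, so a missing or unreadable stylesheet degrades to Gradio's default styling instead of crashing the Space. Gradio can also take the stylesheet directly; a sketch of that alternative using the documented `css` parameter of `gr.Blocks` (assuming the same `styles.css` sits next to `app.py`):

import gradio as gr

# Alternative to the manual <style> injection: hand the stylesheet text to
# gr.Blocks itself. Same fallback behavior when the file is absent.
css = ""
try:
    with open("styles.css", encoding="utf-8") as f:
        css = f.read()
except OSError:  # file missing or unreadable: fall back to Gradio defaults
    pass

with gr.Blocks(css=css) as demo:
    gr.Markdown("demo placeholder")  # in app.py: gr.Markdown(DESCRIPTION) and chat_interface.render()

# demo.launch() would serve the app as usual.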