Artples committed · verified
Commit d77a5d3 · 1 Parent(s): 5e419a8

Update app.py

Files changed (1):
  1. app.py +34 -15
app.py CHANGED
@@ -14,27 +14,36 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 DESCRIPTION = """\
 # L-MChat
 
-This Space demonstrates **L-MChat**, a pair of chat-optimized language models:
+This Space demonstrates L-MChat, a pair of chat-optimized language models:
 
-- **Fast-Model**: `Artples/L-MChat-Small`
-- **Quality-Model**: `Artples/L-MChat-7b`
+- Fast-Model: `Artples/L-MChat-Small`
+- Quality-Model: `Artples/L-MChat-7b`
 
-By default the **Quality-Model** is selected. You can switch to the Fast-Model if you want
-lower latency at the cost of quality.
-
-Use the *System prompt* field to steer the assistant’s behavior (for example:
-“Act as a helpful programming tutor”). The sliders allow you to configure the
-generation parameters.
+By default the Quality-Model is used. You can switch to the Fast-Model if you prefer lower latency over maximum quality.
 """
 
 if not torch.cuda.is_available():
-    DESCRIPTION += "\n\n<p>Running on CPU! This demo does not work on CPU.</p>"
+    DESCRIPTION += "\n\n<p>Running on CPU! This demo is intended for GPU and may be extremely slow.</p>"
 
 model_dict = {
     "Fast-Model": "Artples/L-MChat-Small",
     "Quality-Model": "Artples/L-MChat-7b",
 }
 
+_model_cache: dict[str, AutoModelForCausalLM] = {}
+_tokenizer_cache: dict[str, AutoTokenizer] = {}
+
+
+def get_model_and_tokenizer(model_id: str):
+    """Lazy-load and cache model and tokenizer per model id."""
+    if model_id not in _model_cache:
+        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.use_default_system_prompt = False
+        _model_cache[model_id] = model
+        _tokenizer_cache[model_id] = tokenizer
+    return _model_cache[model_id], _tokenizer_cache[model_id]
+
 
 @spaces.GPU(enable_queue=True, duration=90)
 def generate(
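The new `get_model_and_tokenizer` helper makes model loading lazy and memoized: each checkpoint is instantiated once and reused across requests. The same behavior could be had with `functools.lru_cache` instead of the two module-level dicts; a minimal sketch (not part of this commit, `load_model_and_tokenizer` is an illustrative name):

```python
# Hypothetical alternative to the commit's dict-based cache.
from functools import lru_cache

from transformers import AutoModelForCausalLM, AutoTokenizer


@lru_cache(maxsize=2)  # model_dict has two entries, so two slots suffice
def load_model_and_tokenizer(model_id: str):
    # First call per model id loads from the Hub; later calls hit the cache.
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False
    return model, tokenizer
```

Either way, only the first request per model pays the load cost, which matters under `@spaces.GPU(duration=90)`: reloading a 7B checkpoint on every call could eat most of the 90-second GPU window.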
@@ -49,11 +58,9 @@ def generate(
     repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
     model_id = model_dict[model_choice]
-    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.use_default_system_prompt = False
+    model, tokenizer = get_model_and_tokenizer(model_id)
 
-    conversation: list[dict] = []
+    conversation: list[dict[str, str]] = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
 
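The rest of `generate` is unchanged and not shown in this diff. For orientation, the usual streaming pattern in Spaces built this way (a sketch under that assumption; `stream_chat` and its arguments are illustrative, not from this repo) looks like:

```python
import os
from threading import Thread

from transformers import TextIteratorStreamer

# Same env-driven limit as at the top of app.py (see the first hunk header).
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))


def stream_chat(model, tokenizer, conversation, max_new_tokens=1024):
    # Render the chat history with the model's chat template.
    input_ids = tokenizer.apply_chat_template(
        conversation, return_tensors="pt", add_generation_prompt=True
    )
    # Clip overly long prompts to the configured limit.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    input_ids = input_ids.to(model.device)

    # model.generate() runs on a background thread; the streamer yields
    # decoded text chunks as they are produced.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=input_ids, streamer=streamer, max_new_tokens=max_new_tokens
        ),
    ).start()

    outputs = []
    for chunk in streamer:
        outputs.append(chunk)
        yield "".join(outputs)  # gr.ChatInterface expects the accumulated text
```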
@@ -161,7 +168,19 @@ chat_interface = gr.ChatInterface(
     ],
 )
 
-with gr.Blocks(css="styles.css") as demo:
+# Load external CSS from styles.css and inject it as an HTML <style> block
+custom_css = ""
+css_path = "styles.css"
+if os.path.exists(css_path):
+    try:
+        with open(css_path, encoding="utf-8") as f:
+            custom_css = f"<style>{f.read()}</style>"
+    except Exception:
+        custom_css = ""
+
+with gr.Blocks() as demo:
+    if custom_css:
+        gr.HTML(custom_css)
     gr.Markdown(DESCRIPTION)
     chat_interface.render()
 
 
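Previously the stylesheet path was passed straight to `gr.Blocks(css="styles.css")`; the commit instead reads the file and injects it with `gr.HTML`. Since `Blocks(css=...)` also accepts raw CSS text, an alternative sketch (assuming `styles.css` sits next to `app.py`; `DESCRIPTION` and `chat_interface` as defined earlier in the file) would be:

```python
# Hypothetical alternative, not from the commit: pass the stylesheet
# contents to Blocks directly instead of injecting a <style> tag.
import os

import gradio as gr

css = ""
if os.path.exists("styles.css"):
    with open("styles.css", encoding="utf-8") as f:
        css = f.read()

with gr.Blocks(css=css or None) as demo:
    gr.Markdown(DESCRIPTION)
    chat_interface.render()
```

The `gr.HTML` route taken by the commit has the advantage of failing soft when the file is missing or unreadable, matching its `try`/`except` guard.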