Invescoz committed on
Commit
502b633
·
verified ·
1 Parent(s): 7f1f452

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +158 -24
app.py CHANGED
@@ -1,35 +1,169 @@
 
 
 
1
  import gradio as gr
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
 
4
 
5
- MODEL_NAME = "Rapnss/VIA-01" # your uploaded model repo on HF
6
 
7
- # Load model & tokenizer
8
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
9
- model = AutoModelForCausalLM.from_pretrained(
10
- MODEL_NAME,
11
- torch_dtype=torch.float16,
12
- device_map="auto"
13
- )
 
14
 
15
- def chat_fn(prompt, history=[]):
16
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
17
- outputs = model.generate(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  **inputs,
19
- max_new_tokens=512,
20
- temperature=0.7,
21
- top_p=0.9,
 
 
 
 
 
 
 
22
  )
 
 
 
 
 
23
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
24
- return response
 
 
25
 
26
- demo = gr.Interface(
27
- fn=chat_fn,
28
- inputs=gr.Textbox(lines=2, placeholder="Ask VIA-01 something..."),
29
- outputs=gr.Textbox(label="VIA-01 Response"),
30
- title="Rapnss VIA-01",
31
- description="Lightweight Reasoning + Code Model, by Rapnss"
32
- )
 
 
 
 
33
 
34
  if __name__ == "__main__":
35
- demo.launch()
 
# app.py
import time
import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteriaList, StoppingCriteria

MODEL_NAME = "Rapnss/VIA-01"  # your HF repo

# Tunable knobs for loading and generation.
DEFAULT_MAX_NEW_TOKENS = 64   # keep low to meet latency targets
MAX_PROMPT_TOKENS = 512       # truncate long prompts
TEMPERATURE = 0.3
TOP_P = 0.9
DO_SAMPLE = False             # deterministic and usually faster than sampling
NUM_BEAMS = 1                 # beam=1 is fastest
WARMUP_PROMPT = "Hello."      # used to warm model after loading
# Load tokenizer and model. Preferred path: 4-bit quantized load on GPU via
# bitsandbytes; fallback: fp16 on GPU, then fp32 on CPU.
print("Loading tokenizer & model...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

model = None
device = "cpu"
try:
    # If CUDA is available and bitsandbytes exists, load 4-bit
    if torch.cuda.is_available():
        device = "cuda"
        print("CUDA available — attempting 4-bit load with bitsandbytes...")
        # Fix: `load_in_4bit` and bare `bnb_4bit_*` kwargs passed directly to
        # from_pretrained are deprecated/ignored by current transformers —
        # they must be wrapped in a BitsAndBytesConfig. Also drop the
        # conflicting `torch_dtype=float16` on the quantized path; the
        # compute dtype is set through the config instead.
        from transformers import BitsAndBytesConfig

        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=quant_config,
            device_map="auto",
            trust_remote_code=True,  # some user repos need it
        )
    else:
        raise RuntimeError("CUDA not available; load fallback")
except Exception as e:
    print("4-bit load failed or not available:", e)
    print("Falling back to fp16/cpu (best-effort).")
    # fallback: try fp16 on GPU or float32 on CPU
    if torch.cuda.is_available():
        device = "cuda"
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
    else:
        device = "cpu"
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,
            device_map={"": "cpu"},
            trust_remote_code=True,
        )

# Inference only: disable dropout / training-mode layers.
model.eval()

# Optional: try torch.compile for small speedups (PyTorch 2.x only, may increase startup)
try:
    if torch.__version__.startswith("2"):
        print("Attempting torch.compile(model) for runtime speedups...")
        model = torch.compile(model)
except Exception as e:
    print("torch.compile not used:", e)

print(f"Model loaded on {device}")
# Utility: tokenize a prompt and move tensors onto the active device.
def prepare_inputs(prompt_text):
    """Tokenize *prompt_text*, truncated to MAX_PROMPT_TOKENS, as PyTorch tensors.

    Returns the tokenizer's encoding dict; tensors are moved to the GPU when
    the module-level `device` is "cuda", otherwise left on CPU.
    """
    encoded = tokenizer(
        prompt_text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_PROMPT_TOKENS,
        padding=False,
    )
    if device != "cuda":
        return encoded
    return {name: tensor.cuda() for name, tensor in encoded.items()}
# Custom stopping criterion (currently unused — see commented-out
# `stopping_criteria` in chat_fn's gen_kwargs).
class StopOnDoubleNewline(StoppingCriteria):
    # NOTE(review): despite the name, this does NOT detect two consecutive
    # newlines — it stops as soon as either of the last two generated token
    # ids equals the tokenizer's EOS id. Rename or rewrite if true
    # double-newline stopping is wanted.
    def __call__(self, input_ids, scores, **kwargs):
        if input_ids.shape[-1] < 2:
            return False
        last_two = (input_ids[0, -2].item(), input_ids[0, -1].item())
        return tokenizer.eos_token_id in last_two

stop_criteria = StoppingCriteriaList([StopOnDoubleNewline()])
# Warm-up: run one tiny greedy generation so kernels/caches are primed,
# reducing latency of the first real request.
def warm_up_model():
    """Best-effort warm-up generation; failures are logged, never raised."""
    try:
        warm_inputs = prepare_inputs(WARMUP_PROMPT)
        with torch.inference_mode():
            model.generate(
                **warm_inputs,
                max_new_tokens=8,
                do_sample=False,
                use_cache=True,
            )
        print("Warmup complete.")
    except Exception as e:
        print("Warmup failed:", e)

# Warm up once at startup to reduce first-request latency
warm_up_model()
# The actual chat function used by Gradio
def chat_fn(prompt: str, max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, temperature: float = TEMPERATURE):
    """Generate a response for *prompt* and append a latency footer.

    Args:
        prompt: user prompt; blank input short-circuits with a hint message.
        max_new_tokens: generation budget, clamped to [1, 256].
        temperature: > 0 enables sampling at that temperature (with TOP_P);
            0 gives deterministic greedy decoding.

    Returns:
        The decoded model output followed by a debug line with latency,
        token budget, and device.
    """
    t0 = time.time()
    prompt = prompt.strip()
    if not prompt:
        return "Please enter a prompt."

    # safety: clamp max_new_tokens to avoid huge generations
    max_new_tokens = int(max(1, min(max_new_tokens, 256)))

    inputs = prepare_inputs(prompt)

    # Generation arguments tuned for speed
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=NUM_BEAMS,
        eos_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.sep_token_id,
        pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
        use_cache=True,
        # stopping_criteria=stop_criteria,  # enable if you want custom stopping
    )
    # Fix: the original always passed temperature/top_p with do_sample=False,
    # so the UI temperature slider had no effect and transformers warned about
    # unused generation flags (likewise early_stopping with num_beams=1).
    # Sample only when a positive temperature is requested.
    if float(temperature) > 0:
        gen_kwargs.update(
            do_sample=True,
            temperature=float(temperature),
            top_p=float(TOP_P),
        )
    else:
        gen_kwargs["do_sample"] = False
    if NUM_BEAMS > 1:
        gen_kwargs["early_stopping"] = True  # only meaningful for beam search

    # Inference context to reduce overhead
    with torch.inference_mode():
        outputs = model.generate(**gen_kwargs)

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    latency = time.time() - t0
    # Return response and latency for debugging
    return f"{response}\n\n---\nLatency: {latency:.2f}s (max_new_tokens={max_new_tokens}, device={device})"
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Rapnss VIA-01")
    with gr.Row():
        prompt_box = gr.Textbox(lines=3, placeholder="Ask VIA-01 something...", label="Prompt")
    with gr.Row():
        tokens_slider = gr.Slider(16, 256, value=DEFAULT_MAX_NEW_TOKENS, step=16, label="Max new tokens")
        temp_slider = gr.Slider(0.0, 1.0, value=TEMPERATURE, step=0.05, label="Temperature")
    response_box = gr.Textbox(label="VIA-01 Response", lines=12)
    generate_btn = gr.Button("Generate")
    # Slider order must match chat_fn's (prompt, max_new_tokens, temperature).
    generate_btn.click(fn=chat_fn, inputs=[prompt_box, tokens_slider, temp_slider], outputs=response_box)
 
if __name__ == "__main__":
    # Bind on all interfaces; honor the platform-provided PORT, default 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(share=False, server_name="0.0.0.0", server_port=listen_port)