Spaces:

GamerC0der
/

gradio

Runtime error

GamerC0der committed 16 days ago

Commit

e102131

verified ·

1 Parent(s): 0a6e604

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,25 +3,24 @@ from flask import Flask, request, jsonify
 from llama_cpp import Llama
 app = Flask(__name__)
 GRADIO_PORT = 7860
 draft_model = Llama.from_pretrained(
     repo_id="QuantFactory/SmolLM2-135M-Instruct-GGUF",
-    filename="SmolLM2-135M-Instruct.Q4_0.gguf",
     n_ctx=2048,
     n_threads=2,
-    flash_attn=True,
     verbose=False
 )
 main_model = Llama.from_pretrained(
     repo_id="QuantFactory/SmolLM2-360M-Instruct-GGUF",
-    filename="SmolLM2-360M-Instruct.Q4_0.gguf",
     n_ctx=2048,
     n_threads=2,
-    flash_attn=True,
-    draft_model=draft_model,
     verbose=False
 )
@@ -30,22 +29,26 @@ def chat_completions():
     data = request.json or {}
     if 'messages' not in data:
         return jsonify({"error": "Missing messages array"}), 400
     start_time = time.time()
     response = main_model.create_chat_completion(
         messages=data.get('messages', []),
         temperature=0.7,
         max_tokens=data.get('max_tokens', 512),
-        stream=False
     )
-    generation_time = time.time() - start_time
     tps = response['usage']['completion_tokens'] / generation_time if generation_time > 0 else 0
     response['system_performance'] = {
         "tokens_per_second": round(tps, 2),
         "generation_time_sec": round(generation_time, 2),
-        "acceleration_technique": "Lossless Speculative Decoding"
     }
     return jsonify(response)
 if __name__ == '__main__':

 from llama_cpp import Llama
 app = Flask(__name__)
 GRADIO_PORT = 7860
 draft_model = Llama.from_pretrained(
     repo_id="QuantFactory/SmolLM2-135M-Instruct-GGUF",
+    filename="*SmolLM2-135M-Instruct.Q4_0.gguf",
     n_ctx=2048,
+    n_batch=512,
     n_threads=2,
     verbose=False
 )
 main_model = Llama.from_pretrained(
     repo_id="QuantFactory/SmolLM2-360M-Instruct-GGUF",
+    filename="*SmolLM2-360M-Instruct.Q4_0.gguf",
     n_ctx=2048,
+    n_batch=512,
     n_threads=2,
+    draft_model=draft_model,
     verbose=False
 )
     data = request.json or {}
     if 'messages' not in data:
         return jsonify({"error": "Missing messages array"}), 400
     start_time = time.time()
     response = main_model.create_chat_completion(
         messages=data.get('messages', []),
         temperature=0.7,
         max_tokens=data.get('max_tokens', 512),
+        stream=False,
+        cache_prompt=True
     )
+    generation_time = time.time() - start_time
     tps = response['usage']['completion_tokens'] / generation_time if generation_time > 0 else 0
     response['system_performance'] = {
         "tokens_per_second": round(tps, 2),
         "generation_time_sec": round(generation_time, 2),
+        "acceleration_technique": "Lossless Speculative Decoding + Prompt Caching"
     }
     return jsonify(response)
 if __name__ == '__main__':