Sachin5112 committed on
Commit
5942ab0
·
verified ·
1 Parent(s): d963bbd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -23
app.py CHANGED
@@ -1,30 +1,40 @@
1
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Fetch the quantized GGUF weights from the Hugging Face Hub.
gguf_path = hf_hub_download(
    repo_id="LocoreMind/LocoOperator-4B-GGUF",
    filename="LocoOperator-4B.Q4_K_M.gguf"
)

# Load the model for local CPU inference.
llm = Llama(
    model_path=gguf_path,
    n_ctx=4096,
    n_threads=4
)


def chat(message):
    """Run a raw text completion on *message* and return the generated text."""
    completion = llm(message, max_tokens=300)
    return completion["choices"][0]["text"]


# Minimal text-in / text-out web UI around the model.
demo = gr.Interface(
    fn=chat,
    inputs="text",
    outputs="text",
    title="LocoOperator Coding AI"
)

demo.launch()
 
 
1
import os

# 1. Faster downloads via hf_transfer. This MUST be set before huggingface_hub
#    is imported (llama_cpp imports it), because the flag is read into
#    huggingface_hub.constants at import time — setting it after the import
#    below had no effect in the original ordering.
#    NOTE(review): requires the `hf_transfer` package to be installed,
#    otherwise downloads fail when the flag is enabled — confirm.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from llama_cpp import Llama
from flask import Flask, request, jsonify

app = Flask(__name__)

# 2. Load the most powerful model that fits in 16GB RAM
#    (Qwen2.5-Coder-7B-Instruct-GGUF).
#    We use the Q4_K_M quantization (the "gold standard" for CPU).
print("Loading model... this takes a minute on CPU...")
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
    filename="*q4_k_m.gguf",  # glob matched against files in the repo
    n_ctx=2048,               # context window
    n_threads=2,              # HF Free tier has 2 vCPUs
    verbose=False
)
20
 
21
@app.route("/chat", methods=["POST"])
def chat():
    """POST /chat with JSON {"prompt": "..."}; returns the assistant message.

    Returns the llama.cpp chat-completion "message" object as JSON, or a
    400 error when the body is missing/not JSON or has no "prompt".
    """
    # request.json raises (or yields None) when the body is absent or not
    # valid JSON, which made data.get() crash with a 500 in the original.
    # get_json(silent=True) returns None instead; treat that as empty input.
    data = request.get_json(silent=True) or {}
    prompt = data.get("prompt", "")
    if not prompt:
        return jsonify({"error": "missing 'prompt'"}), 400

    output = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful coding assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=512,
        temperature=0.7
    )

    return jsonify(output["choices"][0]["message"])
 
 
 
 
 
 
38
 
39
if __name__ == "__main__":
    # Bind on all interfaces; port 7860 is presumably the port the hosting
    # platform (HF Spaces) expects — confirm against the Space config.
    app.run(host="0.0.0.0", port=7860)