Sachin5112 committed on
Commit
5942ab0
·
verified ·
1 Parent(s): d963bbd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -23
app.py CHANGED
@@ -1,30 +1,40 @@
1
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Fetch the quantized GGUF weights from the Hugging Face Hub.
gguf_path = hf_hub_download(
    repo_id="LocoreMind/LocoOperator-4B-GGUF",
    filename="LocoOperator-4B.Q4_K_M.gguf"
)

# Load the model for local CPU inference.
llm = Llama(
    model_path=gguf_path,
    n_ctx=4096,
    n_threads=4
)


def chat(message):
    """Run a raw text completion on *message* and return the generated text."""
    completion = llm(message, max_tokens=300)
    return completion["choices"][0]["text"]


# Minimal text-in / text-out web UI around the model.
demo = gr.Interface(
    fn=chat,
    inputs="text",
    outputs="text",
    title="LocoOperator Coding AI"
)

demo.launch()
 
 
1
import os

# 1. Faster downloads via hf_transfer. This MUST be set before huggingface_hub
#    is imported (llama_cpp imports it), because the flag is read into
#    huggingface_hub.constants at import time — setting it after the import
#    below had no effect in the original ordering.
#    NOTE(review): requires the `hf_transfer` package to be installed,
#    otherwise downloads fail when the flag is enabled — confirm.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from llama_cpp import Llama
from flask import Flask, request, jsonify

app = Flask(__name__)

# 2. Load the most powerful model that fits in 16GB RAM
#    (Qwen2.5-Coder-7B-Instruct-GGUF).
#    We use the Q4_K_M quantization (the "gold standard" for CPU).
print("Loading model... this takes a minute on CPU...")
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
    filename="*q4_k_m.gguf",  # glob matched against files in the repo
    n_ctx=2048,               # context window
    n_threads=2,              # HF Free tier has 2 vCPUs
    verbose=False
)
20
 
21
@app.route("/chat", methods=["POST"])
def chat():
    """POST /chat with JSON {"prompt": "..."}; returns the assistant message.

    Returns the llama.cpp chat-completion "message" object as JSON, or a
    400 error when the body is missing/not JSON or has no "prompt".
    """
    # request.json raises (or yields None) when the body is absent or not
    # valid JSON, which made data.get() crash with a 500 in the original.
    # get_json(silent=True) returns None instead; treat that as empty input.
    data = request.get_json(silent=True) or {}
    prompt = data.get("prompt", "")
    if not prompt:
        return jsonify({"error": "missing 'prompt'"}), 400

    output = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful coding assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=512,
        temperature=0.7
    )

    return jsonify(output["choices"][0]["message"])
 
 
 
 
 
 
38
 
39
if __name__ == "__main__":
    # Bind on all interfaces; port 7860 is presumably the port the hosting
    # platform (HF Spaces) expects — confirm against the Space config.
    app.run(host="0.0.0.0", port=7860)