|
|
import flask |
|
|
from flask import request, jsonify |
|
|
from llama_cpp import Llama |
|
|
import os |
|
|
from huggingface_hub import hf_hub_download |
|
|
|
|
|
|
|
|
app = flask.Flask(__name__)

# HuggingFace repo and file that hold the quantized GGUF weights.
REPO_ID = "dexcommunity/indexQ4"
GGUF_FILENAME = "indexq4.gguf"

# Runtime knobs, overridable via environment so deployments can tune them
# without a code change (defaults preserve the original behavior).
N_CTX = int(os.environ.get("LLM_N_CTX", "2048"))
N_THREADS = int(os.environ.get("LLM_N_THREADS", "4"))

print(f"🔄 Downloading GGUF model from {REPO_ID}...")

try:
    # hf_hub_download caches locally, so repeated startups reuse the file.
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=GGUF_FILENAME,
        repo_type="model",
    )
    print(f"✅ Model downloaded to: {model_path}")
except Exception as e:
    # Surface the failure reason, then re-raise so the service does not
    # start without a model.
    print(f"❌ Download failed: {e}")
    print("💡 Make sure your GGUF file is uploaded to HuggingFace!")
    raise

print("🔄 Loading GGUF model with llama.cpp...")

llm = Llama(
    model_path=model_path,
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    n_batch=512,
    verbose=False,
    n_gpu_layers=0,  # CPU-only inference
)

print("✅ GGUF Model loaded successfully!")
print(f"📊 Model: {GGUF_FILENAME}")
# Interpolate the actual settings so this message cannot drift from the
# configuration above (the original hard-coded "2048" and "4").
print(f"🔧 Context: {N_CTX} tokens, Threads: {N_THREADS}")
|
|
|
|
|
@app.route('/chat', methods=['POST'])
def chat():
    """Generate one chat completion for the posted message.

    Expects JSON: {"message": "<user text>"}.
    Returns JSON: {"reply": str, "tokens_used": int} on success,
    400 for missing/invalid input, 500 on inference failure.
    """
    try:
        # silent=True: malformed or absent JSON yields None instead of
        # raising, so bad requests get a clean 400 rather than an
        # AttributeError-driven 500 (the original's behavior).
        data = request.get_json(silent=True)
        if not data:
            return jsonify({"error": "No message sent"}), 400

        msg = data.get("message", "")
        if not msg:
            return jsonify({"error": "No message sent"}), 400

        # Gemma-style chat template; presumably what this GGUF model was
        # trained with — confirm against the model card.
        prompt = f"""<start_of_turn>user
{msg}<end_of_turn>
<start_of_turn>model
"""

        response = llm(
            prompt,
            max_tokens=256,
            temperature=0.7,
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1,
            # Stop at turn markers so the model can't run on into a new turn.
            stop=["<end_of_turn>", "<start_of_turn>"],
            echo=False,  # return only the completion, not the prompt
        )

        reply = response['choices'][0]['text'].strip()

        return jsonify({
            "reply": reply,
            "tokens_used": response['usage']['completion_tokens'],
        })

    except Exception as e:
        # Route-boundary handler: log the full traceback server-side,
        # return only the message to the client.
        import traceback
        error_details = traceback.format_exc()
        print(f"❌ Error: {error_details}")
        return jsonify({"error": str(e)}), 500
|
|
|
|
|
@app.route('/health', methods=['GET'])
def health():
    """Report service liveness plus basic model/backend metadata."""
    payload = {
        "status": "healthy",
        "model": GGUF_FILENAME,
        "backend": "llama.cpp (GGUF)",
        "device": "CPU",
    }
    return jsonify(payload)
|
|
|
|
|
if __name__ == "__main__":
    # Listen on all interfaces; threaded so /health stays responsive
    # while /chat is busy running inference.
    server_options = dict(host='0.0.0.0', port=7860, debug=False, threaded=True)
    app.run(**server_options)