# indexdio / app.py
# (Uploaded to HuggingFace Space by ghosthets, commit 1856082 "Update app.py".)
import flask
from flask import request, jsonify
from llama_cpp import Llama
import os
from huggingface_hub import hf_hub_download
# Flask application instance.
app = flask.Flask(__name__)

# GGUF model location on the HuggingFace Hub.
REPO_ID = "dexcommunity/indexQ4"
GGUF_FILENAME = "indexq4.gguf"

print(f"🔄 Downloading GGUF model from {REPO_ID}...")

# Fetch the GGUF weights from the Hub (cached locally after the first run).
try:
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=GGUF_FILENAME,
        repo_type="model",
    )
    print(f"✅ Model downloaded to: {model_path}")
except Exception as e:
    # Surface a helpful hint, then re-raise so the Space fails loudly.
    print(f"❌ Download failed: {e}")
    print("💡 Make sure your GGUF file is uploaded to HuggingFace!")
    raise

print(f"🔄 Loading GGUF model with llama.cpp...")

# Load the GGUF weights with llama-cpp-python, CPU-only settings sized for
# a HuggingFace free-tier container.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,        # Context window
    n_threads=4,       # CPU threads (HF Free gives 2-4 cores)
    n_batch=512,       # Batch size for processing
    n_gpu_layers=0,    # CPU only (HF Free doesn't have GPU)
    verbose=False,
)

print(f"✅ GGUF Model loaded successfully!")
print(f"📊 Model: {GGUF_FILENAME}")
print(f"🔧 Context: 2048 tokens, Threads: 4")
@app.route('/chat', methods=['POST'])
def chat():
    """Generate a chat completion for a JSON body ``{"message": "..."}``.

    Returns:
        200 with ``{"reply": str, "tokens_used": int}`` on success,
        400 when the message is missing or the body is not valid JSON,
        500 with ``{"error": str}`` on any generation failure.
    """
    try:
        # silent=True makes get_json() return None (instead of raising) on a
        # missing/malformed JSON body, so bad client input yields the intended
        # 400 below rather than an AttributeError-driven 500.
        data = request.get_json(silent=True) or {}
        msg = data.get("message", "")
        if not msg:
            return jsonify({"error": "No message sent"}), 400

        # Gemma 2B chat template
        prompt = f"""<start_of_turn>user
{msg}<end_of_turn>
<start_of_turn>model
"""
        # Generate response with GGUF model
        response = llm(
            prompt,
            max_tokens=256,     # Max response length
            temperature=0.7,
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1,
            stop=["<end_of_turn>", "<start_of_turn>"],  # Stop sequences
            echo=False,         # Don't include prompt in output
        )

        # Extract generated text (llama.cpp returns an OpenAI-style dict).
        reply = response['choices'][0]['text'].strip()
        return jsonify({
            "reply": reply,
            "tokens_used": response['usage']['completion_tokens'],
        })
    except Exception as e:
        # Log the full traceback server-side; return only the message to the client.
        import traceback
        error_details = traceback.format_exc()
        print(f"❌ Error: {error_details}")
        return jsonify({"error": str(e)}), 500
@app.route('/health', methods=['GET'])
def health():
    """Liveness probe: report model name, backend, and device."""
    status = {
        "status": "healthy",
        "model": GGUF_FILENAME,
        "backend": "llama.cpp (GGUF)",
        "device": "CPU",
    }
    return jsonify(status)
if __name__ == "__main__":
    # Bind on all interfaces; 7860 is the conventional HuggingFace Spaces port.
    app.run(host='0.0.0.0', port=7860, debug=False, threaded=True)