"""Flask inference server for a GGUF model running on llama.cpp (CPU-only).

Downloads the quantized model from the HuggingFace Hub at startup, loads it
with llama-cpp-python, and exposes two endpoints:

* POST /chat   -- {"message": "..."} -> {"reply": "...", "tokens_used": N}
* GET  /health -- liveness/metadata probe
"""

import os
import traceback

import flask
from flask import request, jsonify
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Initialize Flask
app = flask.Flask(__name__)

# GGUF Model Configuration
REPO_ID = "dexcommunity/indexQ4"
GGUF_FILENAME = "indexq4.gguf"

print(f"🔄 Downloading GGUF model from {REPO_ID}...")

# Download the GGUF file from the HuggingFace Hub. Fail fast (re-raise) if
# the repo/file is missing so the container doesn't start without a model.
try:
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=GGUF_FILENAME,
        repo_type="model",
    )
    print(f"✅ Model downloaded to: {model_path}")
except Exception as e:
    print(f"❌ Download failed: {e}")
    print("💡 Make sure your GGUF file is uploaded to HuggingFace!")
    raise

print("🔄 Loading GGUF model with llama.cpp...")

# Load the GGUF model with llama-cpp-python.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,        # Context window
    n_threads=4,       # CPU threads (HF Free gives 2-4 cores)
    n_batch=512,       # Batch size for prompt processing
    verbose=False,
    n_gpu_layers=0,    # CPU only (HF Free doesn't have a GPU)
)

print("✅ GGUF Model loaded successfully!")
print(f"📊 Model: {GGUF_FILENAME}")
print("🔧 Context: 2048 tokens, Threads: 4")


@app.route('/chat', methods=['POST'])
def chat():
    """Generate a single-turn chat completion for the posted message.

    Expects JSON: {"message": "<user text>"}.
    Returns JSON: {"reply": "...", "tokens_used": <int>} on success,
    {"error": "..."} with status 400 (no message) or 500 (inference failure).
    """
    try:
        data = request.get_json()
        msg = data.get("message", "")

        if not msg:
            return jsonify({"error": "No message sent"}), 400

        # Gemma 2B chat template. FIX: the original had its special tokens
        # stripped (likely by HTML sanitization) — Gemma requires explicit
        # turn markers or the model sees a malformed prompt.
        prompt = (
            f"<start_of_turn>user\n{msg}<end_of_turn>\n"
            f"<start_of_turn>model\n"
        )

        # Generate response with the GGUF model.
        response = llm(
            prompt,
            max_tokens=256,          # Max response length
            temperature=0.7,
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1,
            # FIX: stop sequences were empty strings in the original
            # (stripped tokens); use Gemma's end-of-turn / EOS markers.
            stop=["<end_of_turn>", "<eos>"],
            echo=False,              # Don't include the prompt in the output
        )

        # Extract generated text.
        reply = response['choices'][0]['text'].strip()

        return jsonify({
            "reply": reply,
            "tokens_used": response['usage']['completion_tokens'],
        })

    except Exception as e:
        error_details = traceback.format_exc()
        print(f"❌ Error: {error_details}")
        return jsonify({"error": str(e)}), 500


@app.route('/health', methods=['GET'])
def health():
    """Health check endpoint: reports model name and backend."""
    return jsonify({
        "status": "healthy",
        "model": GGUF_FILENAME,
        "backend": "llama.cpp (GGUF)",
        "device": "CPU",
    })


if __name__ == "__main__":
    # threaded=True lets Flask overlap requests while llama.cpp holds the GIL
    # only during native compute.
    app.run(host='0.0.0.0', port=7860, debug=False, threaded=True)