File size: 2,642 Bytes
1856082
 
e3dfbc8
1856082
e3dfbc8
 
1856082
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e3dfbc8
1856082
 
 
 
 
 
 
e3dfbc8
1856082
 
 
 
 
 
 
 
e3dfbc8
 
1856082
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e3dfbc8
1856082
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import flask
from flask import request, jsonify
from llama_cpp import Llama
import os
from huggingface_hub import hf_hub_download

# Initialize Flask
app = flask.Flask(__name__)

# GGUF Model Configuration
REPO_ID = "dexcommunity/indexQ4"  # HuggingFace Hub repo holding the quantized model
GGUF_FILENAME = "indexq4.gguf"    # exact file name inside that repo

# llama.cpp runtime settings — named once so the Llama() call and the
# startup log below cannot drift out of sync.
N_CTX = 2048     # context window (tokens)
N_THREADS = 4    # CPU threads (HF Free gives 2-4 cores)
N_BATCH = 512    # batch size for prompt processing

print(f"🔄 Downloading GGUF model from {REPO_ID}...")

# Download GGUF file from HuggingFace Hub; fail fast (re-raise) with a
# hint so a missing upload is obvious in the startup log.
try:
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=GGUF_FILENAME,
        repo_type="model"
    )
    print(f"✅ Model downloaded to: {model_path}")
except Exception as e:
    print(f"❌ Download failed: {e}")
    print("💡 Make sure your GGUF file is uploaded to HuggingFace!")
    raise

print("🔄 Loading GGUF model with llama.cpp...")

# Load GGUF model with llama-cpp-python (CPU only — HF Free has no GPU)
llm = Llama(
    model_path=model_path,
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    n_batch=N_BATCH,
    verbose=False,
    n_gpu_layers=0
)

print("✅ GGUF Model loaded successfully!")
print(f"📊 Model: {GGUF_FILENAME}")
print(f"🔧 Context: {N_CTX} tokens, Threads: {N_THREADS}")

@app.route('/chat', methods=['POST'])
def chat():
    """Generate a chat completion for the posted message.

    Expects JSON: ``{"message": "<user text>"}``.
    Returns JSON ``{"reply": str, "tokens_used": int}`` on success,
    400 for a missing/invalid payload or empty message, 500 on
    generation errors.
    """
    try:
        # silent=True makes get_json() return None instead of raising on
        # a missing/malformed JSON body, so bad requests get a clean 400
        # rather than falling through to the 500 handler below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Invalid or missing JSON body"}), 400

        msg = data.get("message", "")

        if not msg:
            return jsonify({"error": "No message sent"}), 400

        # Gemma 2B chat template
        prompt = f"""<start_of_turn>user
{msg}<end_of_turn>
<start_of_turn>model
"""

        # Generate response with GGUF model
        response = llm(
            prompt,
            max_tokens=256,  # Max response length
            temperature=0.7,
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1,
            stop=["<end_of_turn>", "<start_of_turn>"],  # Stop sequences
            echo=False  # Don't include prompt in output
        )

        # Extract generated text (llama-cpp returns an OpenAI-style dict)
        reply = response['choices'][0]['text'].strip()

        return jsonify({
            "reply": reply,
            "tokens_used": response['usage']['completion_tokens']
        })

    except Exception as e:
        # Broad catch is deliberate: this is the top-level request
        # boundary — log the full traceback, return a JSON 500.
        import traceback
        error_details = traceback.format_exc()
        print(f"❌ Error: {error_details}")
        return jsonify({"error": str(e)}), 500

@app.route('/health', methods=['GET'])
def health():
    """Liveness probe: report service status plus model/backend metadata."""
    payload = {
        "status": "healthy",
        "model": GGUF_FILENAME,
        "backend": "llama.cpp (GGUF)",
        "device": "CPU",
    }
    return jsonify(payload)

if __name__ == "__main__":
    # Bind on all interfaces so the server is reachable from outside the
    # container; threaded=True lets Flask handle concurrent requests.
    # Port 7860 — presumably the hosting platform's expected port (the
    # comments above mention HF Free); confirm against the deployment config.
    app.run(host='0.0.0.0', port=7860, debug=False, threaded=True)