"""Flask inference server for a GGUF model running on llama.cpp (CPU-only).

Downloads the quantized model from the HuggingFace Hub at startup, loads it
with llama-cpp-python, and exposes two endpoints:

* POST /chat   -- {"message": "..."} -> {"reply": "...", "tokens_used": N}
* GET  /health -- liveness/metadata probe
"""

import os
import traceback

import flask
from flask import request, jsonify
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Initialize Flask
app = flask.Flask(__name__)

# GGUF Model Configuration
REPO_ID = "dexcommunity/indexQ4"
GGUF_FILENAME = "indexq4.gguf"

print(f"🔄 Downloading GGUF model from {REPO_ID}...")

# Download the GGUF file from the HuggingFace Hub. Fail fast (re-raise) if
# the repo/file is missing so the container doesn't start without a model.
try:
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=GGUF_FILENAME,
        repo_type="model",
    )
    print(f"✅ Model downloaded to: {model_path}")
except Exception as e:
    print(f"❌ Download failed: {e}")
    print("💡 Make sure your GGUF file is uploaded to HuggingFace!")
    raise

print("🔄 Loading GGUF model with llama.cpp...")

# Load the GGUF model with llama-cpp-python.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,        # Context window
    n_threads=4,       # CPU threads (HF Free gives 2-4 cores)
    n_batch=512,       # Batch size for prompt processing
    verbose=False,
    n_gpu_layers=0,    # CPU only (HF Free doesn't have a GPU)
)

print("✅ GGUF Model loaded successfully!")
print(f"📊 Model: {GGUF_FILENAME}")
print("🔧 Context: 2048 tokens, Threads: 4")


@app.route('/chat', methods=['POST'])
def chat():
    """Generate a single-turn chat completion for the posted message.

    Expects JSON: {"message": "<user text>"}.
    Returns JSON: {"reply": "...", "tokens_used": <int>} on success,
    {"error": "..."} with status 400 (no message) or 500 (inference failure).
    """
    try:
        data = request.get_json()
        msg = data.get("message", "")

        if not msg:
            return jsonify({"error": "No message sent"}), 400

        # Gemma 2B chat template. FIX: the original had its special tokens
        # stripped (likely by HTML sanitization) — Gemma requires explicit
        # turn markers or the model sees a malformed prompt.
        prompt = (
            f"<start_of_turn>user\n{msg}<end_of_turn>\n"
            f"<start_of_turn>model\n"
        )

        # Generate response with the GGUF model.
        response = llm(
            prompt,
            max_tokens=256,          # Max response length
            temperature=0.7,
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1,
            # FIX: stop sequences were empty strings in the original
            # (stripped tokens); use Gemma's end-of-turn / EOS markers.
            stop=["<end_of_turn>", "<eos>"],
            echo=False,              # Don't include the prompt in the output
        )

        # Extract generated text.
        reply = response['choices'][0]['text'].strip()

        return jsonify({
            "reply": reply,
            "tokens_used": response['usage']['completion_tokens'],
        })

    except Exception as e:
        error_details = traceback.format_exc()
        print(f"❌ Error: {error_details}")
        return jsonify({"error": str(e)}), 500


@app.route('/health', methods=['GET'])
def health():
    """Health check endpoint: reports model name and backend."""
    return jsonify({
        "status": "healthy",
        "model": GGUF_FILENAME,
        "backend": "llama.cpp (GGUF)",
        "device": "CPU",
    })


if __name__ == "__main__":
    # threaded=True lets Flask overlap requests while llama.cpp holds the GIL
    # only during native compute.
    app.run(host='0.0.0.0', port=7860, debug=False, threaded=True)