import os
import uuid

import requests
from flask import Flask, request, jsonify, render_template_string
from llama_cpp import Llama

app = Flask(__name__)

# === CONFIGURATION ===
MODEL_URL = "https://huggingface.co/CooLLaMACEO/CooLLaMA-Gemma2/resolve/main/gemma-2-2b-it.q3_k_m.gguf"
MODEL_PATH = "model.gguf"

# === DOWNLOAD MODEL (Run once) ===
if not os.path.exists(MODEL_PATH):
    print("Downloading GGUF model... this may take a few minutes.")
    with requests.get(MODEL_URL, stream=True) as r:
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("Download complete!")

# === INITIALIZE LLM ===
# n_ctx is the context window (memory). 2048 is a good balance for free tier RAM.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,
    n_threads=4  # Optimized for HF free tier CPU
)

# === HTML TEMPLATE ===
# (The full chat UI markup was truncated in the source; this minimal placeholder
# keeps only the recoverable text: page title, header, and version badge.)
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>CooLLaMA AI</title>
</head>
<body>
    <h1>🦙 CooLLaMA</h1>
    <span>v3.5 GGUF</span>
</body>
</html>
""" @app.route("/") def index(): return render_template_string(HTML_TEMPLATE) @app.route("/new_chat", methods=["POST"]) def new_chat(): return jsonify({"chat_id": str(uuid.uuid4())[:8].upper()}) @app.route("/chat", methods=["POST"]) def chat(): data = request.json history = data.get("history", []) # Simple Prompt Formatting for Gemma-2 user_input = history[-1]["content"] if history else "" prompt = f"user\n{user_input}\nmodel\n" # Generate output = llm( prompt, max_tokens=512, stop=["", "user"], echo=False ) response_text = output["choices"][0]["text"].strip() return jsonify({"response": response_text}) if __name__ == "__main__": # HF Spaces listen on port 7860 by default app.run(host="0.0.0.0", port=7860)