import os import uuid import requests from flask import Flask, request, jsonify, render_template_string from llama_cpp import Llama app = Flask(__name__) # === CONFIGURATION === MODEL_URL = "https://huggingface.co/CooLLaMACEO/CooLLaMA-Gemma2/resolve/main/gemma-2-2b-it.q3_k_m.gguf" MODEL_PATH = "model.gguf" # === DOWNLOAD MODEL (Run once) === if not os.path.exists(MODEL_PATH): print("Downloading GGUF model... this may take a few minutes.") with requests.get(MODEL_URL, stream=True) as r: r.raise_for_status() with open(MODEL_PATH, "wb") as f: for chunk in r.iter_content(chunk_size=8192): f.write(chunk) print("Download complete!") # === INITIALIZE LLM === # n_ctx is the context window (memory). 2048 is a good balance for free tier RAM. llm = Llama( model_path=MODEL_PATH, n_ctx=2048, n_threads=4 # Optimized for HF free tier CPU ) # === HTML TEMPLATE === HTML_TEMPLATE = """