from flask import Flask, request, Response, jsonify, render_template
from llama_cpp import Llama

app = Flask(__name__)
# Load the model from the Hugging Face Hub
print("Loading model...")
llm = Llama.from_pretrained(
    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
    filename="google_gemma-3-1b-it-IQ4_XS.gguf",
    n_ctx=2048
)
print("Model loaded!")
@app.route("/")
def home():
    print("Serving index.html")
    return render_template("index.html")
def generate_response(user_input):
    """Generator function to stream model output token by token."""
    try:
        response = llm.create_chat_completion(
            messages=[{"role": "user", "content": user_input}],
            stream=True  # Enable streaming
        )
        for chunk in response:
            if "choices" in chunk and len(chunk["choices"]) > 0:
                token = chunk["choices"][0]["delta"].get("content", "")
                if token:
                    print(f"Token: {token}", flush=True)  # Debugging
                    yield token
    except Exception as e:
        print(f"Error generating response: {e}")
        yield "[Error occurred]"
@app.route("/chat", methods=["POST"])  # Endpoint path assumed from the function name
def chat():
    user_input = request.json.get("message", "")
    if not user_input:
        return jsonify({"error": "Empty input"}), 400
    return Response(generate_response(user_input), content_type="text/plain")
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860, debug=True)
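To see the token-by-token streaming from outside the browser, you can consume the `/chat` endpoint with a short client. This is a minimal sketch, assuming the server above is running locally on port 7860 and that the third-party `requests` package is installed:

```python
# Minimal client sketch for the streaming /chat endpoint.
# Assumes the server runs at http://localhost:7860 (see app.run above).
import requests

with requests.post(
    "http://localhost:7860/chat",
    json={"message": "Hello, who are you?"},
    stream=True,  # Read the response incrementally as tokens arrive
) as resp:
    resp.raise_for_status()
    # iter_content with chunk_size=None yields data as soon as it is received,
    # so tokens print as the model generates them.
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
```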