"""Flask web app that serves text generation from a PEFT-adapted causal LM.

On import, the script loads a 4-bit quantized base model, its tokenizer and a
PEFT adapter, then exposes three routes:

* ``GET /``          -- prompt form (``index.html``)
* ``POST /generate`` -- runs generation and redirects to ``/result``
* ``GET /result``    -- displays the generated text (``result.html``)
"""

import os

import torch
from flask import Flask, render_template, request, redirect, url_for
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# --- Load Model & Tokenizer ---
base_model_name = "unsloth/llama-3.2-3b-bnb-4bit"
adapter_model_name = "aismaanly/ai_synthetic"

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print("Loading PEFT adapter...")
model = PeftModel.from_pretrained(model, adapter_model_name)
# Merge the adapter into the base model and drop the PEFT wrapper so that
# the request handlers work with a plain transformers model.
model = model.merge_and_unload()
print("Model ready!")

# --- Flask App ---
app = Flask(__name__)


@app.route("/", methods=["GET"])
def index():
    """Render the prompt entry page."""
    return render_template("index.html")


@app.route("/generate", methods=["POST"])
def generate():
    """Generate a continuation for the submitted prompt.

    Reads the ``prompt`` form field, generates up to 100 new tokens and
    redirects to the ``result`` view with the decoded text passed as the
    ``generated_text`` query parameter.

    A request without a ``prompt`` field is rejected by Flask with a
    400 response. A blank (empty or whitespace-only) prompt is sent back to
    the index page instead of being fed to the model.
    """
    prompt = request.form["prompt"]
    if not prompt.strip():
        # Nothing meaningful to condition on; return the user to the form.
        return redirect(url_for("index"))

    # Move the encoded inputs to the device the model lives on.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=100)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # NOTE: the text travels in the redirect URL's query string, so very
    # long prompts/outputs may run into URL length limits of clients/proxies.
    return redirect(url_for("result", generated_text=text))


@app.route("/result")
def result():
    """Render the result page for the ``generated_text`` query parameter.

    Falls back to an empty string when the parameter is absent.
    """
    generated_text = request.args.get("generated_text", "")
    return render_template("result.html", generated_text=generated_text)


if __name__ == "__main__":
    # Port is taken from the PORT environment variable, defaulting to 7860.
    port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=port)