from flask import Flask, request, jsonify
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

app = Flask(__name__)

print("🚀 Loading Dolphin-Phi-2 (uncensored)...")
model_name = "cognitivecomputations/dolphin-2_6-phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # ✅ Changed to float32 for CPU
    device_map="cpu",           # ✅ Explicitly use CPU
    low_cpu_mem_usage=True,
    trust_remote_code=True
)
print("✅ Model loaded!")


# OpenAI-style chat completions endpoint
@app.route('/v1/chat/completions', methods=['POST'])
def generate():
    try:
        data = request.json
        messages = data.get('messages', [])
        max_tokens = data.get('max_tokens', 300)
        temperature = data.get('temperature', 0.8)

        # Keep only the last system and user message from the conversation
        system_msg = ""
        user_msg = ""
        for msg in messages:
            if msg['role'] == 'system':
                system_msg = msg['content']
            elif msg['role'] == 'user':
                user_msg = msg['content']

        # ChatML prompt format used by the Dolphin fine-tunes
        prompt = (
            f"<|im_start|>system\n{system_msg}<|im_end|>\n"
            f"<|im_start|>user\n{user_msg}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )

        inputs = tokenizer(prompt, return_tensors="pt")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode, then strip the prompt and ChatML markers from the generated text
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response_text = full_response.split("<|im_start|>assistant")[-1].replace("<|im_end|>", "").strip()

        return jsonify({
            "choices": [{
                "message": {
                    "role": "assistant",
                    "content": response_text
                }
            }]
        })
    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return jsonify({"error": str(e)}), 500


@app.route('/health', methods=['GET'])
def health():
    return jsonify({"status": "ok", "model": "dolphin-phi-2"})


@app.route('/', methods=['GET'])
def home():
    return jsonify({
        "message": "Uncensored LLM API",
        "model": "dolphin-phi-2-2.7b",
        "endpoints": {
            "chat": "/v1/chat/completions (POST)",
            "health": "/health (GET)"
        }
    })


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
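
# --- Example request (a minimal sketch, not part of the server) ---
# Assumes the server above is running locally on port 7860; the URL path and
# payload shape simply mirror the /v1/chat/completions route defined above.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/v1/chat/completions",
#       json={
#           "messages": [
#               {"role": "system", "content": "You are a helpful assistant."},
#               {"role": "user", "content": "Say hello in one sentence."},
#           ],
#           "max_tokens": 100,
#           "temperature": 0.8,
#       },
#       timeout=300,  # CPU generation can be slow, so allow a generous timeout
#   )
#   print(resp.json()["choices"][0]["message"]["content"])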