"""FastAPI service exposing Qwen 2.5 0.5B Instruct as an Arabic-language chat/analysis API.

The model is loaded once at module import time; endpoints reuse the global
``pipe``. Designed to run on a Hugging Face Space (port 7860), CPU or GPU.
"""

from fastapi import FastAPI, Request
from transformers import pipeline
import torch
import uvicorn
import os

app = FastAPI(title="Qwen 0.5B AI Chat API")

# Check for GPU (even though free Space uses CPU).
# bfloat16 only on CUDA; CPU inference stays in float32 for compatibility.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32

print(f"Loading Qwen 2.5 0.5B model on {device}...")
try:
    pipe = pipeline(
        "text-generation",
        model="Qwen/Qwen2.5-0.5B-Instruct",
        torch_dtype=dtype,
        device_map="auto",
    )
    print("Model loaded successfully!")
except Exception as e:
    # Keep the app up even if the model fails to load; endpoints report the error.
    print(f"Error loading model: {e}")
    pipe = None


@app.get("/")
def read_root():
    """Health-check endpoint: reports API status, model name, and device."""
    return {
        "status": "Chat API is active",
        "model": "Qwen 0.5B",
        "device": device,
    }


@app.post("/analyze")
async def chat_endpoint(request: Request):
    """Generate an Arabic chat response for a user prompt.

    Accepts either a JSON body ``{"prompt": "..."}`` or, as a fallback,
    form data with a ``prompt`` field. Returns ``{"analysis": ..., "success": True}``
    on success, or ``{"error": ...}`` on failure.
    """
    if pipe is None:
        return {"error": "Model not loaded properly. Check logs."}

    prompt = ""
    try:
        # Primary: accept a JSON payload.
        data = await request.json()
        prompt = data.get("prompt", "")
    except Exception:
        # Fallback: try to read form data in case the client sent a form.
        try:
            form = await request.form()
            prompt = form.get("prompt", "")
        except Exception:
            # Neither JSON nor form data was readable; handled by the
            # empty-prompt check below. (Bare ``except:`` would also have
            # swallowed SystemExit/KeyboardInterrupt.)
            pass

    if not prompt:
        return {"error": "لا يوجد نص في الرسالة."}

    # Set the personality and language for the model.
    system_prompt = "أنت مساعد ذكاء اصطناعي طبيب وودود. أجب باللغة العربية بوضوح وإيجاز."
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    try:
        # Run inference using the chat template directly (the pipeline applies
        # the model's chat template when given a list of role/content dicts).
        output = pipe(
            messages,
            max_new_tokens=400,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
        # The output includes the system, user, and assistant messages;
        # the assistant's reply is the last message in the generated list.
        result = output[0]["generated_text"][-1]["content"]
        return {
            "analysis": result,
            "success": True,
        }
    except Exception as e:
        return {"error": f"Failed to generate response: {str(e)}"}


if __name__ == "__main__":
    # 7860 is the port Hugging Face Spaces expects the app to listen on.
    uvicorn.run(app, host="0.0.0.0", port=7860)