Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Zephyr-7B Backend für HF Spaces | |
| Frontend + Backend in EINEM Container (kein Vite-Drama!) | |
| """ | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.staticfiles import StaticFiles | |
| from fastapi.responses import FileResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel | |
| import torch | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| BitsAndBytesConfig, | |
| pipeline | |
| ) | |
| import logging | |
| import time | |
| from pathlib import Path | |
| import os | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| app = FastAPI(title="Zephyr-7B - HF Spaces") | |
| # CORS | |
| app.add_middleware( | |
| CORSMiddleware, | |
| allow_origins=["*"], | |
| allow_credentials=True, | |
| allow_methods=["*"], | |
| allow_headers=["*"], | |
| ) | |
| # ========== MODEL LOADING ========== | |
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" | |
| USE_QUANTIZATION = True | |
| def select_model(): | |
| """Auto-select model based on available GPU memory""" | |
| # Qwen 1.5B ist klein und schnell - nehmen wir immer das! | |
| return "Qwen/Qwen2.5-1.5B-Instruct" | |
| MODEL_NAME = os.getenv("MODEL_NAME", select_model()) | |
| logger.info(f"📌 Using model: {MODEL_NAME}") | |
| def load_model_optimized(): | |
| """Load with quantization""" | |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) | |
| if USE_QUANTIZATION and DEVICE == "cuda": | |
| try: | |
| bnb_config = BitsAndBytesConfig( | |
| load_in_8bit=True, | |
| bnb_8bit_compute_dtype=torch.float16, | |
| bnb_8bit_use_double_quant=True, | |
| ) | |
| model = AutoModelForCausalLM.from_pretrained( | |
| MODEL_NAME, | |
| quantization_config=bnb_config, | |
| device_map="auto", | |
| ) | |
| logger.info("✅ Model loaded with 8-bit quantization") | |
| except Exception as e: | |
| logger.warning(f"⚠️ 8-bit failed: {e}, trying standard") | |
| model = AutoModelForCausalLM.from_pretrained( | |
| MODEL_NAME, | |
| device_map="auto" if DEVICE == "cuda" else None, | |
| torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32, | |
| ) | |
| else: | |
| model = AutoModelForCausalLM.from_pretrained( | |
| MODEL_NAME, | |
| device_map="auto" if DEVICE == "cuda" else None, | |
| torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32, | |
| ) | |
| return tokenizer, model | |
| try: | |
| logger.info(f"⏳ Loading {MODEL_NAME}...") | |
| tokenizer, model = load_model_optimized() | |
| pipe = pipeline( | |
| "text-generation", | |
| model=model, | |
| tokenizer=tokenizer, | |
| device=0 if DEVICE == "cuda" else -1, | |
| ) | |
| logger.info("✅ Model ready!") | |
| except Exception as e: | |
| logger.error(f"❌ Model loading failed: {e}") | |
| raise | |
| # ========== API ENDPOINTS ========== | |
| class GenerateRequest(BaseModel): | |
| prompt: str | |
| system_prompt: str = None | |
| max_tokens: int = 1024 # 512 | |
| temperature: float = 0.7 | |
| top_p: float = 0.9 | |
| async def generate(request: GenerateRequest): | |
| """Generate text response""" | |
| try: | |
| start = time.time() | |
| # Qwen prompt format: <|im_start|>role\ncontent\n<|im_end|> | |
| messages = [] | |
| if request.system_prompt: | |
| messages.append({"role": "system", "content": request.system_prompt}) | |
| messages.append({"role": "user", "content": request.prompt}) | |
| outputs = pipe( | |
| messages, | |
| max_new_tokens=request.max_tokens, | |
| temperature=request.temperature, | |
| top_p=request.top_p, | |
| do_sample=True, | |
| return_full_text=False, | |
| ) | |
| response_text = outputs[0]["generated_text"].strip() | |
| elapsed = time.time() - start | |
| return { | |
| "response": response_text, | |
| "tokens": len(tokenizer.encode(response_text)), | |
| "time_seconds": round(elapsed, 2), | |
| "model": MODEL_NAME, | |
| } | |
| except Exception as e: | |
| logger.error(f"Generation error: {e}") | |
| raise HTTPException(status_code=500, detail=str(e)) | |
| async def health(): | |
| """Health check""" | |
| return { | |
| "status": "ok", | |
| "model": MODEL_NAME, | |
| "device": DEVICE, | |
| } | |
| async def info(): | |
| """Model info""" | |
| gpu_memory = None | |
| if torch.cuda.is_available(): | |
| gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9 | |
| return { | |
| "model": MODEL_NAME, | |
| "device": DEVICE, | |
| "gpu_memory_gb": gpu_memory, | |
| "quantization": USE_QUANTIZATION, | |
| } | |
| # ========== STATIC FILES & FRONTEND ========== | |
| async def serve_frontend(): | |
| """Serve main page""" | |
| return FileResponse("frontend.html", media_type="text/html") | |
| async def fallback(full_path: str): | |
| """Fallback for SPA routing""" | |
| file_path = Path(full_path) | |
| # Check if it's a static file | |
| if file_path.exists(): | |
| return FileResponse(file_path) | |
| # Otherwise serve frontend (SPA routing) | |
| return FileResponse("frontend.html", media_type="text/html") | |
| if __name__ == "__main__": | |
| import uvicorn | |
| uvicorn.run(app, host="0.0.0.0", port=7860) | |