import logging

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# GGUF build of Qwen 2.5 3B Instruct (2-bit quantized) on the Hugging Face Hub.
MODEL_REPO = "Qwen/Qwen2.5-3B-Instruct-GGUF"
MODEL_FILE = "qwen2.5-3b-instruct-q2_k.gguf"

logger.info(f"Downloading {MODEL_FILE} from the Hugging Face Hub...")
try:
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE,
    )
    logger.info(f"Model downloaded successfully to {model_path}.")
except Exception as e:
    logger.error(f"Failed to download the model: {e}")
    raise

logger.info("Loading the GGUF model...")
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=4096,       # context window size in tokens
        n_gpu_layers=0,   # 0 = CPU-only inference; increase to offload layers to a GPU
        verbose=True,
    )
    logger.info("Model loaded successfully.")
except Exception as e:
    logger.error(f"Error while loading the model: {e}")
    raise
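
# Optional smoke test, a minimal sketch: a single non-streaming completion to
# verify the model loaded. create_chat_completion is the same llama-cpp-python
# call the streaming endpoint below uses, just without stream=True.
#
#   out = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "Say hi"}],
#       max_tokens=8,
#   )
#   print(out["choices"][0]["message"]["content"])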


app = FastAPI(
    title="Qwen 2.5 API (Streaming - No RAG)",
    description="Streams responses from the Qwen 2.5 model only.",
)

# Browsers reject a wildcard ("*") Access-Control-Allow-Origin on credentialed
# requests, so only explicit origins are listed while allow_credentials=True.
origins = [
    "https://deede.tr",
    "http://deede.tr",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ChatRequest(BaseModel):
    prompt: str


def stream_generator(messages):
    """
    Yield tokens from the LLM as they are produced.

    A plain (sync) generator is used deliberately: llama-cpp's stream is a
    blocking iterator, and StreamingResponse runs sync generators in a
    threadpool, keeping the event loop responsive.
    """
    try:
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=1024,
            temperature=0.7,
            stream=True,
        )

        for chunk in stream:
            content = chunk["choices"][0]["delta"].get("content")
            if content:
                yield content

    except Exception as e:
        logger.error(f"LLM stream error: {e}")
        yield f" [LLM STREAM ERROR: {e}]"
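
# Quick local sanity check (hypothetical, not part of the API): the generator
# can be consumed directly, bypassing FastAPI entirely.
#
#   for token in stream_generator([{"role": "user", "content": "Hi"}]):
#       print(token, end="", flush=True)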


@app.get("/")
def read_root():
    return {"status": "Streaming API running (No RAG)", "model_repo": MODEL_REPO}

@app.post("/api/chat")
async def chat(request: ChatRequest):
    user_prompt = request.prompt
    logger.info(f"Incoming prompt: {user_prompt}")

    messages = [
        {
            "role": "system",
            "content": (
                "You are Qwen, an AI assistant created by Alibaba Cloud. "
                "Answer the questions you are asked in a helpful way."
            ),
        },
        {
            "role": "user",
            "content": user_prompt,
        },
    ]

    # Stream raw text chunks to the client as they are generated.
    return StreamingResponse(
        stream_generator(messages),
        media_type="text/plain",
    )
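
# Example client call, assuming the server runs locally on port 7860 as below.
# The -N flag disables curl's output buffering so tokens appear as they stream:
#
#   curl -N -X POST http://localhost:7860/api/chat \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Hello"}'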

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)