| from fastapi import FastAPI |
| from fastapi.responses import PlainTextResponse |
| from pydantic import BaseModel |
| from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer |
| from deep_translator import GoogleTranslator |
| import torch |
| import uvicorn |
| import threading |
| import time |
| from collections import OrderedDict |
|
|
# --- Web application ---------------------------------------------------------
app = FastAPI()

# --- Tunables ----------------------------------------------------------------
# Upper bound on the number of entries kept in `db`; the /ask route evicts the
# oldest entry once this is exceeded.
MAX_HISTORY = 40
# Number of background worker threads started when this module is loaded.
NUM_WORKERS = 3

# --- Shared state ------------------------------------------------------------
# Insertion-ordered map: request message -> {"status": ..., "reply": ...}.
db = OrderedDict()
# Messages waiting to be processed; workers take items from the front.
queue = []

# --- Model -------------------------------------------------------------------
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
|
|
# Pydantic model with a single string field. None of the routes visible in
# this file reference it. Documented with comments rather than a docstring
# because pydantic would copy a class docstring into the generated schema.
class Message(BaseModel):
    message: str  # the text payload
|
|
|
|
| |
def split_text(text: str, max_len: int = 100) -> str:
    """Hard-wrap *text* into chunks of at most *max_len* characters.

    The text is cut at fixed offsets; word boundaries and newlines already
    present in *text* are not taken into account. The chunks are joined with
    a newline character.

    Args:
        text: The string to wrap. An empty string yields an empty string.
        max_len: Maximum length of each chunk; must be at least 1.

    Returns:
        The chunks joined by newline characters.

    Raises:
        ValueError: If *max_len* is less than 1.
    """
    if max_len < 1:
        # A negative step makes range() empty, which would silently discard
        # the whole text; reject it the same way range() rejects a zero step.
        raise ValueError("max_len must be at least 1")

    return "\n".join(
        text[start:start + max_len] for start in range(0, len(text), max_len)
    )
|
|
|
|
| |
def clean_output(text: str) -> str:
    """Strip chat-template residue from generated text.

    Removes the ``<|im_start|>`` / ``<|im_end|>`` markers and the phrase
    ``You are Qwen`` wherever they occur. The role labels ``system``,
    ``user`` and ``assistant`` are removed only when they act as a template
    header: directly after ``<|im_start|>`` and followed by a newline, or
    alone on a line. The same words inside ordinary sentences are left
    untouched.

    Args:
        text: Raw decoded model output.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    role_labels = ("system", "user", "assistant")
    markers = ("<|im_start|>", "<|im_end|>", "You are Qwen")

    # Remove "<|im_start|>role\n" headers as a unit first, so a header glued
    # to preceding text does not leave a dangling role word behind.
    for role in role_labels:
        text = text.replace("<|im_start|>" + role + "\n", "\n")

    for marker in markers:
        text = text.replace(marker, "")

    # Blank out role labels only when they stand alone on a line; a plain
    # substring replace would corrupt words such as "username".
    lines = [
        "" if line.strip() in role_labels else line
        for line in text.split("\n")
    ]
    return "\n".join(lines).strip()
|
|
|
|
| |
def generate_ai_stream(message: str) -> str:
    """Generate a reply for *message*, publishing progress into ``db``.

    Builds a chat prompt (fixed system instruction plus the user message),
    runs ``model.generate`` on a helper thread and consumes the output
    through a ``TextIteratorStreamer``. While text arrives, the entry for
    *message* in ``db`` (if it still exists) gets its ``"reply"`` refreshed
    with the wrapped partial text. Once generation ends, the text is cleaned,
    passed through ``GoogleTranslator`` (target ``ru``) on a best-effort
    basis, wrapped, suffixed with ``" full generated"`` and stored with
    status ``"done"``.

    Args:
        message: The user message; also the key of the entry in ``db``.

    Returns:
        The final wrapped reply text, including the ``" full generated"``
        suffix.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "Ты умный и точный ассистент. "
                "Отвечай логично,кратко и понятно. "
                "отвечай ВСЕГДА на русском."
            ),
        },
        {
            "role": "user",
            "content": message,
        },
    ]

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # skip_prompt=True: without it the streamer also yields the decoded
    # prompt, so the system instruction and the user's own message would be
    # echoed at the start of every reply.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    gen_kwargs = dict(
        **inputs,
        max_new_tokens=400,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        streamer=streamer,
        eos_token_id=tokenizer.eos_token_id,
    )

    # generate() blocks until completion, so it runs on its own thread while
    # this thread drains the streamer.
    # NOTE(review): if generate() raises, the streamer may never receive its
    # end signal and this loop would wait indefinitely — consider a timeout.
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for text in streamer:
        partial += text
        # Single lookup: the entry can be evicted by the /ask route between a
        # membership test and a subscript, which would raise KeyError here.
        entry = db.get(message)
        if entry is not None:
            entry["reply"] = split_text(partial)

    # The stream is exhausted, so generation is finishing; reap the thread.
    thread.join()

    raw = clean_output(partial)

    try:
        translated = GoogleTranslator(
            source='auto',
            target='ru'
        ).translate(raw)
    except Exception:
        # Translation is best-effort: fall back to the untranslated text.
        translated = raw

    if not translated:
        # Guard against an empty/None translator result so that wrapping
        # below always receives a string.
        translated = raw

    final_text = split_text(translated) + " full generated"

    entry = db.get(message)
    if entry is not None:
        entry["reply"] = final_text
        entry["status"] = "done"

    return final_text
|
|
|
|
| |
def worker() -> None:
    """Process queued messages forever.

    Takes the oldest message from ``queue`` and runs ``generate_ai_stream``
    on it. Messages whose ``db`` entry is already ``"done"``, or whose entry
    no longer exists (the generated reply would have nowhere to be stored),
    are skipped. When the queue is empty the loop sleeps for 10 ms before
    polling again. This function never returns.
    """
    # Local import: the module's import block does not provide logging.
    import logging

    log = logging.getLogger(__name__)

    while True:
        # EAFP instead of "if queue: queue.pop(0)": several workers share the
        # list, so another worker may empty it between the check and the pop.
        try:
            message = queue.pop(0)
        except IndexError:
            time.sleep(0.01)
            continue

        entry = db.get(message)
        if entry is None or entry["status"] == "done":
            continue

        try:
            generate_ai_stream(message)
        except Exception:
            # Thread boundary: log and keep serving; an uncaught exception
            # would terminate this worker for the lifetime of the process.
            # NOTE(review): the entry stays "pending" after a failure.
            log.exception("generation failed for a queued message")
|
|
|
|
| |
# Launch the background workers. They are daemon threads, so they do not
# keep the interpreter alive on shutdown.
for _ in range(NUM_WORKERS):
    _thread = threading.Thread(target=worker, daemon=True)
    _thread.start()
|
|
|
|
# GET / — plain-text banner confirming that the server is up.
# (Documented with comments: FastAPI would publish a docstring in OpenAPI.)
@app.get("/")
async def root():
    banner = "AI server running (Qwen2.5 1.5B Instruct)"
    return PlainTextResponse(banner)
|
|
|
|
# GET /ask?message=... — submit a message for generation.
#   "cached"   -> a finished reply for this exact message is already stored
#   "accepted" -> the message was queued now, or is already known and pending
# (Documented with comments: FastAPI would publish a docstring in OpenAPI.)
@app.get("/ask")
async def ask(message: str):
    entry = db.get(message)

    if entry is not None and entry["status"] == "done":
        return PlainTextResponse("cached")

    if entry is None:
        # First time this message is seen: register it and hand it to the
        # workers.
        db[message] = {
            "status": "pending",
            "reply": "",
        }
        queue.append(message)

        # Keep the history bounded by dropping the oldest entry.
        if len(db) > MAX_HISTORY:
            db.popitem(last=False)

    return PlainTextResponse("accepted")
|
|
|
|
# GET /get?message=... — fetch the reply stored for a message.
#   "not found"   -> the message is not in `db`
#   "thinking..." -> the entry is pending and has no partial text yet
#   otherwise     -> the stored reply text (partial while pending)
# (Documented with comments: FastAPI would publish a docstring in OpenAPI.)
@app.get("/get")
async def get(message: str):
    entry = db.get(message)
    if entry is None:
        return PlainTextResponse("not found")

    # Read status before reply, in that order, since workers update both.
    status = entry["status"]
    reply = entry["reply"]

    if status == "pending" and not reply:
        return PlainTextResponse("thinking...")

    return PlainTextResponse(reply)
|
|
|
|
# Script entry point: serve the app with uvicorn on all interfaces, port 7860.
if __name__ == "__main__":
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )