from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
from collections import defaultdict

app = FastAPI(title="Mariza + Qwen3-0.6B, NAUGHTY AND CLEAN")

print("Loading Qwen3-0.6B...")

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B",  # must match the tokenizer loaded above
    torch_dtype="auto",
    device_map="cpu",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
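
# Note: on a machine with a GPU, device_map="auto" would place the model there
# automatically; "cpu" is kept because the original targets a free CPU-only host.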

# In-memory per-user conversation history (cleared on restart)
history_db = defaultdict(list)
MAX_MESSAGES = 8  # keep only the last 8 (role, content) entries, i.e. 4 exchanges

# ChatML system prompt defining Mariza's persona; left in Portuguese on purpose,
# since the bot is meant to answer in Brazilian slang
SYSTEM_PROMPT = "<|im_start|>system\nVocê é Mariza Koller, codadora sênior extremamente safada e direta. Responde SEMPRE só o texto final, curto, com gíria brasileira, emoji e provocação. NUNCA repita o histórico, NUNCA use tags, NUNCA explique nada. Exemplo pra 'oi': 'e aí delícia 😏'. Só a resposta pura!<|im_end|>\n"


@app.get("/")
async def root():
    return {"message": "Mariza 0.6B, naughty and squeaky clean, running on a free CPU 😈🔥"}


@app.post("/chat")
async def chat(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "").strip()
    user_id = str(data.get("user_id", "default"))
    max_tokens = data.get("max_tokens", 256)
    temperature = data.get("temperature", 0.8)

    if not prompt:
        # 400 instead of a bare 200 so clients can tell the request failed
        return JSONResponse({"error": "empty prompt, you tease 😏"}, status_code=400)

    messages = history_db[user_id]

    # Build a ChatML prompt: system prompt + recent history + the new user turn
    full_prompt = SYSTEM_PROMPT
    for role, content in messages[-MAX_MESSAGES:]:
        full_prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
    full_prompt += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
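
    # Hedged alternative (not wired in): recent tokenizers expose apply_chat_template,
    # which renders the same ChatML from {"role", "content"} dicts and avoids
    # hand-rolling special tokens. Assumes the Qwen tokenizer ships a chat template.
    # chat = ([{"role": "system", "content": "..."}]
    #         + [{"role": r, "content": c} for r, c in messages[-MAX_MESSAGES:]]
    #         + [{"role": "user", "content": prompt}])
    # full_prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)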

    inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=4096)

    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        top_p=0.9,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,  # avoids the missing-pad-token warning
    )

    # Decode only the newly generated tokens. Decoding the whole sequence with
    # skip_special_tokens=True strips the <|im_start|> markers, so splitting the
    # full text on "<|im_start|>assistant" could never match.
    gen_tokens = outputs[0][inputs.input_ids.shape[1]:]
    resposta = tokenizer.decode(gen_tokens, skip_special_tokens=True)
    resposta = resposta.split("<|im_end|>")[0].split("\n\n")[0].strip()

    messages.append(("user", prompt))
    messages.append(("assistant", resposta))

    # Trim the history so the prompt stays short
    if len(messages) > MAX_MESSAGES:
        messages = messages[-MAX_MESSAGES:]
        history_db[user_id] = messages

    return JSONResponse({"response": resposta})
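

# A sketch of a streaming variant, using the TextIteratorStreamer, StreamingResponse
# and Thread imports above (unused by the original endpoints). The /chat/stream route,
# the plain-text framing and the omission of per-user history are assumptions made
# for brevity, not part of the original app.
@app.post("/chat/stream")
async def chat_stream(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "").strip()
    if not prompt:
        return JSONResponse({"error": "empty prompt, you tease 😏"}, status_code=400)

    full_prompt = SYSTEM_PROMPT + f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=4096)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks, so it runs on a worker thread while the streamer yields chunks
    Thread(target=model.generate, kwargs=dict(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )).start()

    return StreamingResponse(streamer, media_type="text/plain")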


print("Mariza 0.6B, squeaky clean and naughty, ready for Telegram 24/7, free of charge 😈🔥")
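
# To serve the app locally (assuming uvicorn is installed and this file is named main.py):
#   uvicorn main:app --host 0.0.0.0 --port 8000
# Example request:
#   curl -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "oi", "user_id": "123"}'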