# Tatar_language / app.py: Gradio chat demo for Qwen2.5-7B with a Tatar LoRA adapter
import os

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Configuration via environment variables, with defaults for local runs.
BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen2.5-7B")
ADAPTER_REPO = os.getenv("ADAPTER_REPO", "your-username/tt-qwen25-7b-tt-lora")
LOAD_IN_4BIT = os.getenv("LOAD_IN_4BIT", "true").lower() == "true"
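
# Load tokenizer and base model, trying progressively lighter quantization:
# 4-bit NF4 first, then 8-bit, then plain FP16 (which may offload part of a
# 7B model to CPU if the GPU is too small), before attaching the LoRA adapter.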
def load_model():
    tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
    if tok.pad_token_id is None:
        # Qwen tokenizers ship without a pad token; reuse EOS for padding.
        tok.pad_token = tok.eos_token

    base = None
    if LOAD_IN_4BIT:
        try:
            bnb_cfg = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,  # float16 for the Spaces GPU
            )
            base = AutoModelForCausalLM.from_pretrained(
                BASE_MODEL, quantization_config=bnb_cfg, device_map="auto"
            )
            print("Loaded base in 4-bit NF4")
        except Exception as e:
            print("[warn] 4-bit failed:", e)
    if base is None:
        try:
            bnb8 = BitsAndBytesConfig(load_in_8bit=True)
            base = AutoModelForCausalLM.from_pretrained(
                BASE_MODEL, quantization_config=bnb8, device_map="auto"
            )
            print("Loaded base in 8-bit")
        except Exception as e:
            print("[warn] 8-bit failed:", e)
            base = AutoModelForCausalLM.from_pretrained(
                BASE_MODEL, torch_dtype=torch.float16, device_map="auto"
            )
            print("Loaded base in FP16 (may offload to CPU)")
    base.config.pad_token_id = tok.pad_token_id

    # Attach the LoRA adapter on top of the (possibly quantized) base model.
    model = PeftModel.from_pretrained(
        base, ADAPTER_REPO, is_trainable=False, torch_dtype=torch.float16
    )
    # Casting with .to(dtype=...) raises on bitsandbytes-quantized models,
    # so only cast when the base was actually loaded in plain FP16.
    if not getattr(base, "is_loaded_in_4bit", False) and not getattr(base, "is_loaded_in_8bit", False):
        model = model.to(dtype=torch.float16)
    model.eval()
    return tok, model

tok, model = load_model()
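
# Build model inputs in one of two prompt formats: the tokenizer's built-in
# Qwen chat template, or raw <|system|>/<|user|>/<|assistant|> tags, which
# presumably match the format the LoRA adapter saw during SFT.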
def format_prompt(user, system, mode):
    if mode == "Qwen chat":
        msgs = [{"role": "system", "content": system}, {"role": "user", "content": user}]
        input_ids = tok.apply_chat_template(msgs, add_generation_prompt=True, return_tensors="pt")
        attn = torch.ones_like(input_ids)
        return {"input_ids": input_ids.to(model.device), "attention_mask": attn.to(model.device)}
    else:
        prompt = f"<|system|> {system}\n<|user|> {user}\n<|assistant|>"
        enc = tok(prompt, return_tensors="pt")
        return {
            "input_ids": enc["input_ids"].to(model.device),
            "attention_mask": enc["attention_mask"].to(model.device),
        }
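
# Chat callback for gr.ChatInterface. Note that `history` is accepted but not
# folded into the prompt, so every turn is answered independently.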
@torch.inference_mode()
def respond(message, history, system_prompt, mode, temperature, top_p, rep_penalty, max_new_tokens):
    inputs = format_prompt(message, system_prompt, mode)
    # Autocast only applies on GPU; disable it when no CUDA device is available.
    with torch.autocast("cuda", dtype=torch.float16, enabled=torch.cuda.is_available()):
        out = model.generate(
            **inputs,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=rep_penalty,
            max_new_tokens=max_new_tokens,
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
            no_repeat_ngram_size=4,
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    gen_only = out[0][inputs["input_ids"].shape[1]:]
    text = tok.decode(gen_only, skip_special_tokens=True)
    return text
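
# Gradio UI: the system prompt, prompt-format switch, and sampling controls
# are wired into the chat interface as additional inputs.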
with gr.Blocks() as demo:
    # Header text (Tatar/Russian): "Tatar chat demo (Qwen2.5-7B + LoRA)" /
    # "Beta version. The model is trained to answer in Tatar. If it switches
    # to Russian/English, that is a bug; please send us examples."
    gr.Markdown("## Татарча чат-демо (Qwen2.5-7B + LoRA)")
    gr.Markdown("Бета-версия. Модель обучена отвечать **по-татарски**. Если переключаться на русский/английский — это ошибка; сообщите нам примеры.")
    with gr.Row():
        # Default system prompt (Tatar): "Answer only in Tatar. Keep your
        # answers short and neutral."
        system_prompt = gr.Textbox(
            value="Син бары тик татарча гына җавап бир. Җавапларың кыска һәм нейтраль булсын.",
            label="System prompt",
        )
        mode = gr.Radio(choices=["SFT tags", "Qwen chat"], value="SFT tags", label="Формат промпта")  # label: "Prompt format"
    with gr.Row():
        temperature = gr.Slider(0.1, 1.2, value=0.7, step=0.05, label="temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")
        rep_penalty = gr.Slider(1.0, 1.4, value=1.15, step=0.05, label="repetition_penalty")
        max_new_tokens = gr.Slider(16, 512, value=200, step=8, label="max_new_tokens")
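
    # Note: undo_btn/retry_btn/clear_btn and queue(concurrency_count=...) follow
    # the Gradio 3.x API; newer Gradio releases renamed concurrency_count to
    # default_concurrency_limit and dropped the *_btn keyword arguments.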
    gr.ChatInterface(
        fn=respond,
        additional_inputs=[system_prompt, mode, temperature, top_p, rep_penalty, max_new_tokens],
        title=None, undo_btn=None, retry_btn=None, clear_btn="Clear",
    )

demo.queue(concurrency_count=1).launch()