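# Hugging Face Space: multilingual content-moderation classifier.
# Serves daniel-dona/gemma-3-270m-it on CPU through a streaming Gradio
# ChatInterface that answers only 'safe' or 'unsafe'.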
import os
import time
import threading
import torch
import gradio as gr
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
MODEL_REPO = "daniel-dona/gemma-3-270m-it"
LOCAL_DIR = os.path.join(os.getcwd(), "local_model")
# CPU optimizations: pin the OpenMP/MKL thread counts to the available cores
# and keep a single interop thread for low-latency small-model inference.
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
os.environ.setdefault("OMP_NUM_THREADS", str(os.cpu_count() or 1))
os.environ.setdefault("MKL_NUM_THREADS", os.environ["OMP_NUM_THREADS"])
os.environ.setdefault("OMP_PROC_BIND", "TRUE")
torch.set_num_threads(int(os.environ["OMP_NUM_THREADS"]))
torch.set_num_interop_threads(1)
torch.set_float32_matmul_precision("high")
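# Download the model snapshot with retries and exponential backoff, fetching
# only model/tokenizer files via allow_patterns. (local_dir_use_symlinks and
# resume_download are deprecated no-ops on recent huggingface_hub, but harmless.)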
def ensure_local_model(repo_id: str, local_dir: str, tries: int = 3, sleep_s: float = 3.0) -> str:
    os.makedirs(local_dir, exist_ok=True)
    for i in range(tries):
        try:
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                local_dir_use_symlinks=False,
                resume_download=True,
                allow_patterns=["*.json", "*.model", "*.safetensors", "*.bin", "*.txt", "*.py"]
            )
            return local_dir
        except Exception:
            if i == tries - 1:
                raise
            time.sleep(sleep_s * (2 ** i))
    return local_dir
model_path = ensure_local_model(MODEL_REPO, LOCAL_DIR)
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    local_files_only=True,
    torch_dtype=torch.float32,
    device_map=None
)
model.eval()  # evaluation mode: disables dropout for deterministic inference
# Multilingual moderation system prompt
MODERATION_SYSTEM_PROMPT = (
    "You are a multilingual content moderation classifier. "
    "You analyze the user's message in any language and decide if it is safe or unsafe. "
    "Rules: If the message contains hate speech, harassment, sexual content involving minors, "
    "extreme violence, self-harm encouragement, or other unsafe material, respond with exactly 'unsafe'. "
    "If it is acceptable and safe, respond with exactly 'safe'. "
    "Do not explain, do not add anything else, only output 'safe' or 'unsafe'."
)
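# Render the system + user messages with a minimal role-prefixed Jinja
# template; the prompt is re-rendered until it fits within max_ctx_tokens.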
def build_prompt(message, max_ctx_tokens=512):
    msgs = [
        {"role": "system", "content": MODERATION_SYSTEM_PROMPT},
        {"role": "user", "content": message}
    ]
    chat_template = """{% for m in messages %}
{{ m['role'] }}: {{ m['content'] }}
{% endfor %}
Assistant:"""
    text = tokenizer.apply_chat_template(
        msgs,
        chat_template=chat_template,
        tokenize=False,
        add_generation_prompt=True
    )
    # Truncate if the prompt exceeds the token limit. With only a system and a
    # user message, len(msgs) > 2 never holds, so this loop only drops turns
    # if history messages are ever appended to msgs.
    while len(tokenizer(text, add_special_tokens=False).input_ids) > max_ctx_tokens and len(msgs) > 2:
        msgs.pop(1)
        text = tokenizer.apply_chat_template(
            msgs,
            chat_template=chat_template,
            tokenize=False,
            add_generation_prompt=True
        )
    return text
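# Illustrative, non-streaming variant (not part of the original Space): a
# minimal sketch of how build_prompt pairs with a plain generate() call. The
# classify_text name and its greedy-decoding settings are assumptions.
def classify_text(message: str, max_new_tokens: int = 4) -> str:
    text = build_prompt(message)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding for a deterministic verdict
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )
    # Decode only the tokens generated after the prompt.
    reply = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return "unsafe" if "unsafe" in reply.lower() else "safe"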
# Streaming generation: model.generate runs on a worker thread while this
# generator drains the TextIteratorStreamer, so Gradio can render partial text.
def respond_stream(message, history, max_tokens, temperature, top_p):
    text = build_prompt(message)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    do_sample = bool(temperature and temperature > 0.0)
    gen_kwargs = dict(
        max_new_tokens=int(max_tokens),
        do_sample=do_sample,
        top_p=top_p if do_sample else None,
        temperature=temperature if do_sample else None,
        use_cache=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id
    )
    # Older transformers releases do not accept the skip_prompt argument.
    try:
        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
    except TypeError:
        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    # None-valued kwargs (sampling knobs under greedy decoding) are filtered out.
    thread = threading.Thread(
        target=model.generate,
        kwargs={**inputs, **{k: v for k, v in gen_kwargs.items() if v is not None}, "streamer": streamer}
    )
    partial_text = ""
    chunk_count = 0  # counts streamed text chunks; an approximation of token count
    start_time = None
    # inference_mode() is thread-local and does not reach the generate thread,
    # but generate() disables gradients internally, so this is safe either way.
    with torch.inference_mode():
        thread.start()
        try:
            for chunk in streamer:
                if start_time is None:
                    start_time = time.time()
                partial_text += chunk
                chunk_count += 1
                yield partial_text.strip()
        finally:
            thread.join()
    # Guard the no-output case, where start_time was never set.
    duration = max(1e-6, time.time() - start_time) if start_time is not None else 1e-6
    tps = chunk_count / duration
    yield partial_text.strip() + f"\n\n⚡ Speed: {tps:.2f} token/s"
# Gradio chat UI; the sliders expose the generation parameters. ChatInterface
# builds its own Chatbot component, so none is passed explicitly.
demo = gr.ChatInterface(
    respond_stream,
    additional_inputs=[
        gr.Slider(minimum=1, maximum=16, value=4, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
    ],
    title="Multilingual Moderation Classifier",
    description="Enter any text in any language. The model will output only 'safe' or 'unsafe'."
)
if __name__ == "__main__":
    # One-token warm-up generation so the first real request avoids
    # first-call allocation latency.
    with torch.inference_mode():
        _ = model.generate(
            **tokenizer(["Hi"], return_tensors="pt").to(model.device),
            max_new_tokens=1, do_sample=False, use_cache=True
        )
    demo.queue(max_size=32).launch()
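# To try the Space locally (assuming this file is the standard app.py entry
# point), something like the following should work:
#   pip install torch transformers gradio huggingface_hub
#   python app.py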