Spaces:

UcsTurkey
/

test-oncu

Paused

App Files Files Community

ciyidogan committed on May 30, 2025

Commit

8a39f61

verified ·

1 Parent(s): e68dc63

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -7

app.py CHANGED Viewed

@@ -18,6 +18,48 @@ def log(message):
     timestamp = datetime.now().strftime("%H:%M:%S")
     print(f"[{timestamp}] {message}", flush=True)
 # === Global model değişkenleri
 tokenizer = None
 model = None
@@ -34,7 +76,7 @@ async def lifespan(app: FastAPI):
         tokenizer.pad_token = tokenizer.eos_token
         quant_config = BitsAndBytesConfig(
-            load_in_8bit=True,  # ✅ 8-bit quantization (modern BitsAndBytesConfig)
             llm_int8_threshold=6.0
         )
@@ -58,6 +100,7 @@ app = FastAPI(lifespan=lifespan)
 class UserInputRequest(BaseModel):
     user_input: str
     system_prompt: str
 @app.post("/generate")
 def generate(req: UserInputRequest):
@@ -65,17 +108,24 @@ def generate(req: UserInputRequest):
         overall_start = time.time()
         log(f"💬 Kullanıcı isteği alındı: {req.user_input}")
         # === Apply chat template
         t0 = time.time()
-        messages = [
-            {"role": "system", "content": req.system_prompt},
-            {"role": "user", "content": req.user_input}
-        ]
-        chat_template_str = tokenizer.apply_chat_template(
             messages,
             add_generation_prompt=True,
             return_tensors=None
         )
         t1 = time.time()
         log(f"⏱️ apply_chat_template süresi: {t1 - t0:.2f} saniye")
@@ -93,7 +143,7 @@ def generate(req: UserInputRequest):
         input_len = input_ids.shape[-1]
         total_ctx = model.config.max_position_embeddings if hasattr(model.config, 'max_position_embeddings') else 4096
-        max_new_tokens = min(512, max(1, total_ctx - input_len))
         log(f"ℹ️ Input uzunluğu: {input_len}, max_new_tokens ayarlandı: {max_new_tokens}")

     timestamp = datetime.now().strftime("%H:%M:%S")
     print(f"[{timestamp}] {message}", flush=True)
+# === Helper fonksiyonlar
def trim_history(messages, max_blocks=20):
    """Return at most the ``max_blocks`` most recent messages.

    The oldest messages are dropped first. With the default of 20 this keeps,
    for example, the last 10 user + 10 assistant messages.

    Args:
        messages: History messages ordered oldest first. ``None`` or an empty
            sequence yields an empty list.
        max_blocks: Maximum number of messages to keep.

    Returns:
        The trailing ``max_blocks`` messages, or an empty list when there is
        no history or ``max_blocks`` is zero or negative.
    """
    if not messages or max_blocks <= 0:
        # Guard needed because messages[-0:] is messages[0:], i.e. the whole
        # list, which would silently disable the limit.
        return []
    return messages[-max_blocks:]
def enforce_token_budget(tokenizer, system_prompt, history_messages, user_input, total_ctx=4096, max_new_tokens=300):
    """Drop the oldest history messages until the prompt fits the token budget.

    The budget is ``total_ctx - max_new_tokens``. Messages are removed whole,
    oldest first; a message is never partially truncated.

    Token counts are taken by calling ``tokenizer`` on the system prompt, the
    user input and each message's ``content`` separately. Any extra tokens a
    chat template adds around the messages are not included in the estimate.

    Args:
        tokenizer: Callable returning a mapping with an ``'input_ids'`` entry.
        system_prompt: System prompt text; always kept.
        history_messages: List of dicts with ``'role'`` and ``'content'``
            keys, ordered oldest first.
        user_input: Current user message text; always kept.
        total_ctx: Total context size in tokens.
        max_new_tokens: Tokens reserved for generation.

    Returns:
        ``history_messages`` itself when everything already fits, otherwise a
        new list containing the most recent messages that fit. The list may be
        empty; if the system prompt and user input alone exceed the budget the
        function still returns an empty history and only logs the total.
    """
    def count_tokens(text):
        return len(tokenizer(text)['input_ids'])

    system_tokens = count_tokens(system_prompt)
    user_tokens = count_tokens(user_input)
    # Tokenize every history message exactly once; the trimming loop below
    # works from these cached counts instead of re-tokenizing per iteration.
    history_counts = [count_tokens(m['content']) for m in history_messages]
    history_tokens = sum(history_counts)
    log(f"ℹ️ Token hesaplama -> System: {system_tokens}, History: {history_tokens}, User: {user_tokens}")

    available_budget = total_ctx - max_new_tokens
    fixed_tokens = system_tokens + user_tokens
    total_input_tokens = fixed_tokens + history_tokens
    if total_input_tokens <= available_budget:
        log(f"✅ Token bütçesi uygun (toplam {total_input_tokens}/{available_budget})")
        return history_messages

    # Advance an index past the oldest messages rather than pop(0)-ing a copy.
    dropped = 0
    remaining_tokens = history_tokens
    while dropped < len(history_counts) and fixed_tokens + remaining_tokens > available_budget:
        removed = history_messages[dropped]
        removed_tokens = history_counts[dropped]
        remaining_tokens -= removed_tokens
        dropped += 1
        log(f"⚠️ Token bütçesi aşıldı, en eski {removed['role']} mesajı ({removed_tokens} token) atıldı.")

    final_tokens = fixed_tokens + remaining_tokens
    log(f"✅ Budanmış token toplamı: {final_tokens}/{available_budget}")
    return list(history_messages[dropped:])
 # === Global model değişkenleri
 tokenizer = None
 model = None
         tokenizer.pad_token = tokenizer.eos_token
         quant_config = BitsAndBytesConfig(
+            load_in_8bit=True,
             llm_int8_threshold=6.0
         )
 class UserInputRequest(BaseModel):
     user_input: str
     system_prompt: str
+    history: list  # [{"role": "user"/"assistant", "content": "..."}, ...]
 @app.post("/generate")
 def generate(req: UserInputRequest):
         overall_start = time.time()
         log(f"💬 Kullanıcı isteği alındı: {req.user_input}")
+        # === History budama
+        trimmed_history = trim_history(req.history, max_blocks=20)
+        trimmed_history = enforce_token_budget(tokenizer, req.system_prompt, trimmed_history, req.user_input, total_ctx=4096, max_new_tokens=300)
         # === Apply chat template
         t0 = time.time()
+        messages = [{"role": "system", "content": req.system_prompt}] + trimmed_history + [{"role": "user", "content": req.user_input}]
+        chat_template_raw = tokenizer.apply_chat_template(
             messages,
             add_generation_prompt=True,
             return_tensors=None
         )
+        if chat_template_raw is None:
+            chat_template_str = ""
+        elif isinstance(chat_template_raw, str):
+            chat_template_str = chat_template_raw
+        else:
+            chat_template_str = str(chat_template_raw)
         t1 = time.time()
         log(f"⏱️ apply_chat_template süresi: {t1 - t0:.2f} saniye")
         input_len = input_ids.shape[-1]
         total_ctx = model.config.max_position_embeddings if hasattr(model.config, 'max_position_embeddings') else 4096
+        max_new_tokens = min(300, max(1, total_ctx - input_len))
         log(f"ℹ️ Input uzunluğu: {input_len}, max_new_tokens ayarlandı: {max_new_tokens}")