Update app.py

app.py CHANGED
@@ -1,11 +1,10 @@
 # -*- coding: utf-8 -*-
-# app.py — 어느 MZ 친구의 느린 DM방 (Blossom 8B, …)
+# app.py — 어느 MZ 친구의 느린 DM방 (Blossom 8B GGUF, llama.cpp, Gradio)

 import os
 import re
 import random
 import difflib
-import torch
 from datetime import datetime

 try:
@@ -14,20 +13,25 @@ except Exception:
     ZoneInfo = None

 import gradio as gr
-from transformers import …
-from …
+from transformers import AutoTokenizer
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama

 # =========================================================
-# Base model / …
+# Base model / tokenizer / GGUF path settings
 # =========================================================

+# base model (used for the tokenizer)
 BASE_MODEL_PATH = "MLP-KTLim/llama-3-Korean-Bllossom-8B"

-# …
-# …
+# Hugging Face repo that hosts the merged GGUF model
+# (e.g. blossom_v2.Q4_K_M.gguf in Jay1121/blossom_v2)
 MODEL_DIR_DEFAULT = "Jay1121/blossom_v2"  # repo id
 MODEL_DIR = os.environ.get("MODEL_DIR", MODEL_DIR_DEFAULT)

+GGUF_REPO_ID = os.environ.get("GGUF_REPO_ID", MODEL_DIR)
+GGUF_FILENAME = os.environ.get("GGUF_FILENAME", "blossom_v2.Q4_K_M.gguf")
+
 # =========================================================
 # Environment variables / defaults
 # =========================================================
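The new configuration chain resolves entirely through environment variables, so the Space can be re-pointed without a code change. A minimal sketch of how the defaults above resolve (the repo and filename below are hypothetical overrides, not values from this commit):

import os

# Hypothetical overrides, for illustration only: select another GGUF repo/quant.
os.environ["MODEL_DIR"] = "someuser/other-gguf-repo"
os.environ["GGUF_FILENAME"] = "model.Q5_K_M.gguf"

# Mirrors the module-level defaults above: GGUF_REPO_ID falls back to MODEL_DIR.
repo_id = os.environ.get("GGUF_REPO_ID", os.environ.get("MODEL_DIR", "Jay1121/blossom_v2"))
filename = os.environ.get("GGUF_FILENAME", "blossom_v2.Q4_K_M.gguf")
print(repo_id, filename)  # → someuser/other-gguf-repo model.Q5_K_M.gguf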
@@ -44,11 +48,6 @@ STRICT_MODE = os.environ.get("STRICT_MODE", "0") == "1"  # default OFF
 SAFETY_ON = os.environ.get("SAFETY_ON", "0") == "1"  # default OFF
 BAN_JAMO = os.environ.get("BAN_JAMO", "1") == "1"

-# ⚠ Spaces often have no GPU, so 4bit defaults to ON,
-# but _get_bnb_config() below checks torch.cuda and disables it automatically
-USE_FA = os.environ.get("USE_FLASH_ATTN", "1") == "1"
-USE_4BIT = os.environ.get("USE_4BIT", "1") == "1"  # ✅ 4bit by default (applied only when a GPU is present)
-
 STYLE_MODE = os.environ.get("STYLE_MODE", "auto")  # auto | deadpan | neutral

 WHITELIST_JAMO = set(
@@ -68,205 +67,37 @@ DEFAULT_PROFANITY = {
 }

 # =========================================================
-# Loader
-# =========================================================
-
-def _pick_attn_impl():
-    return "flash_attention_2" if USE_FA and torch.cuda.is_available() else "sdpa"
-
-
-def _is_peft_adapter(model_dir: str) -> bool:
-    return os.path.exists(os.path.join(model_dir, "adapter_config.json"))
-
-
-def _has_full_model(model_dir: str) -> bool:
-    names = ["pytorch_model.bin", "model.safetensors", "consolidated.safetensors"]
-    has_weight = any(os.path.exists(os.path.join(model_dir, n)) for n in names)
-    has_cfg = os.path.exists(os.path.join(model_dir, "config.json"))
-    return has_weight and has_cfg
-
-
-def _has_tokenizer_files(path: str) -> bool:
-    if not path:
-        return False
-    return any(
-        os.path.exists(os.path.join(path, n))
-        for n in ["tokenizer.model", "tokenizer.json", "vocab.json", "merges.txt"]
-    )
-
-
-def _load_tokenizer_pref_local(local_dir: str, fallback_dir: str):
-    def _try(path, fast):
-        return AutoTokenizer.from_pretrained(
-            path, trust_remote_code=True, use_fast=fast
-        )
-
-    # 1) prefer a local tokenizer.model
-    if local_dir and os.path.exists(os.path.join(local_dir, "tokenizer.model")):
-        try:
-            tok = _try(local_dir, False)
-            if tok.pad_token is None:
-                tok.pad_token = tok.eos_token
-            print(f"🔤 토크나이저 OK: {local_dir} (slow, tokenizer.model)")
-            return tok
-        except Exception as e:
-            print(f"⚠️ local slow 실패: {e}")
-
-    # 2) local tokenizer.json
-    if local_dir and os.path.exists(os.path.join(local_dir, "tokenizer.json")):
-        try:
-            tok = _try(local_dir, True)
-            if tok.pad_token is None:
-                tok.pad_token = tok.eos_token
-            print(f"🔤 토크나이저 OK: {local_dir} (fast, tokenizer.json)")
-            return tok
-        except Exception as e:
-            print(f"⚠️ local fast 실패: {e}")
-
-    # 3) fallback (base model)
-    for fast in (True, False):
-        try:
-            tok = _try(fallback_dir, fast)
-            if tok.pad_token is None:
-                tok.pad_token = tok.eos_token
-            print(f"🔤 토크나이저 OK: {fallback_dir} (fast={fast})")
-            return tok
-        except Exception as e:
-            print(f"⚠️ fallback (fast={fast}) 실패: {e}")
-
-    raise RuntimeError("토크나이저 로드에 모두 실패했습니다.")
-
-# =========================================================
-# Model loading (4bit supported)
+# GGUF loader (llama.cpp)
 # =========================================================

-def …
+def load_model_for_chat(model_repo: str):
     """
-    …
+    Load the GGUF model with llama.cpp.
+    - model_repo: Hugging Face repo id (e.g. 'Jay1121/blossom_v2')
+    - can be overridden via the GGUF_REPO_ID / GGUF_FILENAME env vars
     """
-    …
-        load_in_4bit=True,
-        bnb_4bit_use_double_quant=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=compute_dtype,
+    repo_id = os.environ.get("GGUF_REPO_ID", model_repo)
+    filename = os.environ.get("GGUF_FILENAME", GGUF_FILENAME)
+
+    print(f"📥 GGUF 다운로드: {repo_id}/{filename}")
+    model_path = hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
     )

-    …
-    else:
-        print(f"▶ 로컬 폴더 없음 → HF Hub에서 '{model_dir}' 로드 시도")
-        is_adapter = False
-        is_full = False
-
-    attn_impl = _pick_attn_impl()
-    bnb_config = _get_bnb_config()
-
-    # pick the tokenizer path
-    if tokenizer_dir:
-        tk_dir = tokenizer_dir
-    elif os.path.isdir(model_dir) and _has_tokenizer_files(model_dir):
-        tk_dir = model_dir
-    else:
-        tk_dir = BASE_MODEL_PATH
-
-    print(f"🔎 토크나이저 경로 선택: {tk_dir}")
-    tok = _load_tokenizer_pref_local(tk_dir, BASE_MODEL_PATH)
-
-    # 1) PEFT adapter only (meaningful only for a local folder)
-    if is_adapter and not is_full:
-        print("📦 감지: PEFT LoRA 어댑터 → 베이스(Bllossom) 로드 후 어댑터 적용")
-        try:
-            base = AutoModelForCausalLM.from_pretrained(
-                BASE_MODEL_PATH,
-                torch_dtype=torch.float16,
-                device_map="auto",
-                trust_remote_code=True,
-                attn_implementation=attn_impl,
-            )
-        except Exception as e:
-            if attn_impl == "flash_attention_2":
-                print(f"⚠️ flash-attn 실패 → SDPA로 전환: {e}")
-                base = AutoModelForCausalLM.from_pretrained(
-                    BASE_MODEL_PATH,
-                    torch_dtype=torch.float16,
-                    device_map="auto",
-                    trust_remote_code=True,
-                    attn_implementation="sdpa",
-                )
-            else:
-                raise
-
-        model = PeftModel.from_pretrained(base, model_dir, offload_folder="offload")
-        try:
-            model = model.merge_and_unload()
-            print("✅ 어댑터 병합(merge_and_unload) 완료")
-        except Exception as e:
-            print(f"ℹ️ 병합 스킵: {e}")
-
-        model.eval()
-        print("✅ 모델 로드 완료!")
-        return model, tok
-
-    # 2) merged full model or HF Hub model (4bit possible)
-    print("📦 감지: 병합된 '완전체' 모델 또는 HF Hub 모델 → from_pretrained 로 로드")
-    try:
-        if bnb_config is not None:
-            model = AutoModelForCausalLM.from_pretrained(
-                model_dir,
-                device_map="auto",
-                trust_remote_code=True,
-                attn_implementation=attn_impl,
-                quantization_config=bnb_config,
-            )
-        else:
-            model = AutoModelForCausalLM.from_pretrained(
-                model_dir,
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                device_map="auto",
-                trust_remote_code=True,
-                attn_implementation=attn_impl,
-            )
-    except Exception as e:
-        if attn_impl == "flash_attention_2":
-            print(f"⚠️ flash-attn 실패 → SDPA로 전환: {e}")
-            if bnb_config is not None:
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_dir,
-                    device_map="auto",
-                    trust_remote_code=True,
-                    attn_implementation="sdpa",
-                    quantization_config=bnb_config,
-                )
-            else:
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_dir,
-                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                    device_map="auto",
-                    trust_remote_code=True,
-                    attn_implementation="sdpa",
-                )
-        else:
-            raise
-
-    model.eval()
-    print("✅ 모델 로드 완료!")
-    return model, tok
+    n_threads = int(os.environ.get("N_THREADS", str(os.cpu_count() or 4)))
+    n_ctx = int(os.environ.get("N_CTX", "2048"))
+
+    print(f"🧠 llama.cpp 초기화 (n_threads={n_threads}, n_ctx={n_ctx})")
+    llm = Llama(
+        model_path=model_path,
+        n_ctx=n_ctx,
+        n_threads=n_threads,
+        logits_all=False,
+        seed=0,
+    )
+    print("✅ GGUF 모델 로드 완료!")
+    return llm

 # =========================================================
 # Dictionary / profanity
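A quick way to smoke-test the new loader outside the Space, assuming llama-cpp-python and huggingface_hub are installed and the default repo/file above exist (a sketch, not part of the commit):

# Downloads the GGUF and runs one completion. llama_cpp.Llama.__call__
# returns an OpenAI-style dict: {"choices": [{"text": ...}], ...}.
llm = load_model_for_chat("Jay1121/blossom_v2")
out = llm(
    "User: 안녕\nAssistant:",
    max_tokens=16,
    temperature=0.6,
    stop=["User:"],
)
print(out["choices"][0]["text"])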
@@ -301,28 +132,37 @@ RE_LAUGH = re.compile(r"(ㅋ|ㅎ|ㅠ|ㅜ)\1{2,}")
 RE_EN = re.compile(r"[A-Za-z]+")
 RE_WORDS = re.compile(r"[가-힣]{2,}")

-def …
-    …
-    return …
+def _is_jamo(ch: str) -> bool:
+    code = ord(ch)
+    return (0x1100 <= code <= 0x11FF) or (0x3130 <= code <= 0x318F)
+
+
+def _strip_jamo(text: str) -> str:
+    if not BAN_JAMO:
+        return text
+    out_chars = []
+    for ch in text:
+        if _is_jamo(ch) and (ch not in WHITELIST_JAMO):
+            continue
+        out_chars.append(ch)
+    return "".join(out_chars)


 def clean_text(txt: str):
+    # 1) shorten runs like ㅋㅋㅋㅋ / ㅠㅠㅠ
     if not KEEP_REPEATS:
         txt = RE_LAUGH.sub(lambda m: m.group(1) * 2, txt)
+    # 2) strip English letters
     txt = RE_EN.sub("", txt)
+    # 3) cut off any prompt template that leaked into the output
     cut = txt.split("### User:")[0]
-    …
+    txt = cut.strip()
+    # 4) remove meta words
+    for banned in META_BANS:
+        txt = txt.replace(banned, "")
+    # 5) strip jamo (whitelist excluded)
+    txt = _strip_jamo(txt)
+    return txt.strip()


 def count_oov(txt: str, dictionary, allowlist):
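What the rewritten clean_text pipeline does to a raw generation, illustratively; the exact output depends on WHITELIST_JAMO and META_BANS, which are defined elsewhere in this file:

# Assuming the definitions above are in scope:
raw = "ㅋㅋㅋㅋㅋ okay 별 일 없음### User: 뭐해"
print(clean_text(raw))
# RE_LAUGH collapses ㅋㅋㅋㅋㅋ to ㅋㅋ, RE_EN drops "okay", everything after
# "### User:" is cut, and stray jamo is stripped unless whitelisted
# (ㅋ itself is presumably in WHITELIST_JAMO, or the laughs would vanish too).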
@@ -412,49 +252,37 @@ def postprocess_deadpan(reply: str):
     return reply.strip()

 # =========================================================
-# Decoding
+# Decoding (via llama.cpp)
 # =========================================================

-def decode_once(model, …
-    """…
+def decode_once(model, prompt: str, *, deadpan: bool = False) -> str:
+    """One decoding pass with llama.cpp."""
     if deadpan:
-        …
-        top_p=0.85,
-        max_new_tokens=48,
-    )
+        temperature = 0.25
+        top_p = 0.85
+        max_tokens = 48
     elif STRICT_MODE:
-        …
-        top_p=0.88,
-        max_new_tokens=56,
-    )
+        temperature = 0.35
+        top_p = 0.88
+        max_tokens = 56
     else:
-        …
-        eos_token_id=tok.eos_token_id,
-        pad_token_id=tok.pad_token_id,
-        bad_words_ids=bad_words_ids,
-        **cfg,
-    )
-    gen = tok.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+        temperature = 0.6
+        top_p = 0.9
+        max_tokens = 64
+
+    # llama_cpp.Llama.__call__
+    out = model(
+        prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        stop=["</s>", "User:", "Assistant:", "### User:"],
+    )
+    gen = out["choices"][0]["text"]
     return clean_text(gen)

 # =========================================================
-# System prompt
+# System prompt (⚠ keep the example lines verbatim)
 # =========================================================

 SYSTEM_PROMPT = (
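The three sampling tiers map directly onto llama.cpp arguments. A usage sketch, with llm being the object returned by load_model_for_chat:

prompt = "User: 심심하다\nAssistant:"
reply_deadpan = decode_once(llm, prompt, deadpan=True)  # temperature 0.25, ≤ 48 tokens
reply_default = decode_once(llm, prompt)                # temperature 0.6, ≤ 64 tokens
print(reply_deadpan)
print(reply_default)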
@@ -468,7 +296,7 @@ SYSTEM_PROMPT = (
     "User: 무슨 일 해?\n"
     "Assistant: 별 건 안해.. 그냥 먹고 살려고\n"
     "User: 심심하다\n"
-    "Assistant: …
+    "Assistant: 심심해? 개부럽누..\n"
     "--- 여기까지 예시 ---\n\n"
 )
@@ -476,11 +304,20 @@
 # Global initialization
 # =========================================================

-print("🚀 …
-model …
+print("🚀 모델 로드 중 (GGUF + llama.cpp)...")
+model = load_model_for_chat(MODEL_DIR)
+
+print("🔤 토크나이저 로드 중...")
+tokenizer = AutoTokenizer.from_pretrained(
+    BASE_MODEL_PATH,
+    trust_remote_code=True,
+    use_fast=True,
+)
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+
 dictionary = load_dictionary()
 profanity = load_profanity()
-bad_words_ids = build_bad_words_ids(tokenizer)
 print("✅ 초기화 완료")

 # =========================================================
@@ -497,6 +334,7 @@ def chat_fn(user_input, history):
         messages.append({"role": "assistant", "content": b})
     messages.append({"role": "user", "content": user_input})

+    # reuse the original chat_template as-is (tokenizer only)
     prompt = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
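After this change the tokenizer is kept only to render the Llama-3 chat template into a plain string for llama.cpp. A sketch of what it produces; whether add_generation_prompt=True is passed on the line this hunk elides is an assumption, as is the system message placement:

messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": "심심하다"},
]
# tokenize=False returns the formatted prompt string instead of token ids,
# e.g. "<|begin_of_text|><|start_header_id|>system<|end_header_id|>..." for Llama-3.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # assumption: appends the assistant header
)
print(prompt[:120])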
@@ -504,7 +342,7 @@ def chat_fn(user_input, history):
     )

     deadpan = should_deadpan(user_input)
-    reply = decode_once(model, …
+    reply = decode_once(model, prompt, deadpan=deadpan)

     oov_cnt, _ = count_oov(reply, dictionary, profanity)
     if OOV_STRIP and oov_cnt > 0:
@@ -538,7 +376,7 @@ demo = gr.ChatInterface(
     fn=chat_fn,
     title="어느 MZ 친구의 느린 DM방",
     description=(
-        "Blossom 8B + 카카오톡 말투 LoRA를 얹은, 어떤 MZ의 말투를 따라하는 한국어 친구 챗봇입니다.\n"
+        "Blossom 8B GGUF + 카카오톡 말투 LoRA를 얹은, 어떤 MZ의 말투를 따라하는 한국어 친구 챗봇입니다.\n"
         "(⚠️ 개 느림주의: 대답 늦어도 서운해하지 말 것)"
     ),
     examples=[